runner: add test for unicode token processing

2026-01-29 17:53:16 -05:00 · 2025-05-14 11:29:11 -07:00
130 changed files with 1753 additions and 7078 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -51,8 +51,6 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/include
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cpu)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cpu/amx)

-add_compile_definitions(NDEBUG)
-
 set(GGML_CPU ON)
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src)
 set_property(TARGET ggml PROPERTY EXCLUDE_FROM_ALL TRUE)
--- a/README.md
+++ b/README.md
@@ -405,8 +405,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Writeopia](https://github.com/Writeopia/Writeopia) (Text editor with integration with Ollama)
 - [AppFlowy](https://github.com/AppFlowy-IO/AppFlowy) (AI collaborative workspace with Ollama, cross-platform and self-hostable)
 - [Lumina](https://github.com/cushydigit/lumina.git) (A lightweight, minimal React.js frontend for interacting with Ollama servers)
- [Tiny Notepad](https://pypi.org/project/tiny-notepad) (A lightweight, notepad-like interface to chat with ollama available on PyPI)
- [macLlama (macOS native)](https://github.com/hellotunamayo/macLlama) (A native macOS GUI application for interacting with Ollama models, featuring a chat interface.) 

 ### Cloud

@@ -450,7 +448,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [orbiton](https://github.com/xyproto/orbiton) Configuration-free text editor and IDE with support for tab completion with Ollama.
 - [orca-cli](https://github.com/molbal/orca-cli) Ollama Registry CLI Application - Browse, pull, and download models from Ollama Registry in your terminal.
 - [GGUF-to-Ollama](https://github.com/jonathanhecl/gguf-to-ollama) - Importing GGUF to Ollama made easy (multiplatform)
- [AWS-Strands-With-Ollama](https://github.com/rapidarchitect/ollama_strands) - AWS Strands Agents with Ollama Examples

 ### Apple Vision Pro

@@ -587,7 +584,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Simple-Discord-AI](https://github.com/zyphixor/simple-discord-ai)
 - [LLM Telegram Bot](https://github.com/innightwolfsleep/llm_telegram_bot) (telegram bot, primary for RP. Oobabooga-like buttons, [A1111](https://github.com/AUTOMATIC1111/stable-diffusion-webui) API integration e.t.c)
 - [mcp-llm](https://github.com/sammcj/mcp-llm) (MCP Server to allow LLMs to call other LLMs)
- [SimpleOllamaUnity](https://github.com/HardCodeDev777/SimpleOllamaUnity) (Unity Engine extension for communicating with Ollama in a few lines of code. Also works at runtime)
 - [UnityCodeLama](https://github.com/HardCodeDev777/UnityCodeLama) (Unity Edtior tool to analyze scripts via Ollama)

 ### Supported backends
--- a/api/client.go
+++ b/api/client.go
@@ -24,10 +24,7 @@ import (
 	"net/http"
 	"net/url"
 	"runtime"
-	"strconv"
-	"time"

-	"github.com/ollama/ollama/auth"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/version"
@@ -79,14 +76,6 @@ func NewClient(base *url.URL, http *http.Client) *Client {
 	}
 }

-func getAuthorizationToken(ctx context.Context, challenge string) (string, error) {
-	token, err := auth.Sign(ctx, []byte(challenge))
-	if err != nil {
-		return "", err
-	}
-	return token, nil
-}
-
 func (c *Client) do(ctx context.Context, method, path string, reqData, respData any) error {
 	var reqBody io.Reader
 	var data []byte
@@ -108,21 +97,6 @@ func (c *Client) do(ctx context.Context, method, path string, reqData, respData
 	}

 	requestURL := c.base.JoinPath(path)
-
-	var token string
-	if envconfig.UseAuth() || c.base.Hostname() == "ollama.com" {
-		now := strconv.FormatInt(time.Now().Unix(), 10)
-		chal := fmt.Sprintf("%s,%s?ts=%s", method, path, now)
-		token, err = getAuthorizationToken(ctx, chal)
-		if err != nil {
-			return err
-		}
-
-		q := requestURL.Query()
-		q.Set("ts", now)
-		requestURL.RawQuery = q.Encode()
-	}
-
 	request, err := http.NewRequestWithContext(ctx, method, requestURL.String(), reqBody)
 	if err != nil {
 		return err
@@ -132,10 +106,6 @@ func (c *Client) do(ctx context.Context, method, path string, reqData, respData
 	request.Header.Set("Accept", "application/json")
 	request.Header.Set("User-Agent", fmt.Sprintf("ollama/%s (%s %s) Go/%s", version.Version, runtime.GOARCH, runtime.GOOS, runtime.Version()))

-	if token != "" {
-		request.Header.Set("Authorization", token)
-	}
-
 	respObj, err := c.http.Do(request)
 	if err != nil {
 		return err
@@ -173,22 +143,6 @@ func (c *Client) stream(ctx context.Context, method, path string, data any, fn f
 	}

 	requestURL := c.base.JoinPath(path)
-
-	var token string
-	if envconfig.UseAuth() || c.base.Hostname() == "ollama.com" {
-		var err error
-		now := strconv.FormatInt(time.Now().Unix(), 10)
-		chal := fmt.Sprintf("%s,%s?ts=%s", method, path, now)
-		token, err = getAuthorizationToken(ctx, chal)
-		if err != nil {
-			return err
-		}
-
-		q := requestURL.Query()
-		q.Set("ts", now)
-		requestURL.RawQuery = q.Encode()
-	}
-
 	request, err := http.NewRequestWithContext(ctx, method, requestURL.String(), buf)
 	if err != nil {
 		return err
@@ -198,10 +152,6 @@ func (c *Client) stream(ctx context.Context, method, path string, data any, fn f
 	request.Header.Set("Accept", "application/x-ndjson")
 	request.Header.Set("User-Agent", fmt.Sprintf("ollama/%s (%s %s) Go/%s", version.Version, runtime.GOARCH, runtime.GOOS, runtime.Version()))

-	if token != "" {
-		request.Header.Set("Authorization", token)
-	}
-
 	response, err := c.http.Do(request)
 	if err != nil {
 		return err
--- a/api/types.go
+++ b/api/types.go
@@ -83,12 +83,6 @@ type GenerateRequest struct {
 	// Options lists model-specific options. For example, temperature can be
 	// set through this field, if the model supports it.
 	Options map[string]any `json:"options"`
-
-	// Think controls whether thinking/reasoning models will think before
-	// responding. Needs to be a pointer so we can distinguish between false
-	// (request that thinking _not_ be used) and unset (use the old behavior
-	// before this option was introduced)
-	Think *bool `json:"think,omitempty"`
 }

 // ChatRequest describes a request sent by [Client.Chat].
@@ -114,10 +108,6 @@ type ChatRequest struct {

 	// Options lists model-specific options.
 	Options map[string]any `json:"options"`
-
-	// Think controls whether thinking/reasoning models will think before
-	// responding
-	Think *bool `json:"think,omitempty"`
 }

 type Tools []Tool
@@ -136,11 +126,8 @@ func (t Tool) String() string {
 // role ("system", "user", or "assistant"), the content and an optional list
 // of images.
 type Message struct {
-	Role    string `json:"role"`
-	Content string `json:"content"`
-	// Thinking contains the text that was inside thinking tags in the
-	// original model output when ChatRequest.Think is enabled.
-	Thinking  string      `json:"thinking,omitempty"`
+	Role      string      `json:"role"`
+	Content   string      `json:"content"`
 	Images    []ImageData `json:"images,omitempty"`
 	ToolCalls []ToolCall  `json:"tool_calls,omitempty"`
 }
@@ -457,13 +444,12 @@ type ProcessResponse struct {

 // ListModelResponse is a single model description in [ListResponse].
 type ListModelResponse struct {
-	Name         string             `json:"name"`
-	Model        string             `json:"model"`
-	ModifiedAt   time.Time          `json:"modified_at"`
-	Size         int64              `json:"size"`
-	Digest       string             `json:"digest"`
-	Capabilities []model.Capability `json:"capabilities,omitempty"`
-	Details      ModelDetails       `json:"details,omitempty"`
+	Name       string       `json:"name"`
+	Model      string       `json:"model"`
+	ModifiedAt time.Time    `json:"modified_at"`
+	Size       int64        `json:"size"`
+	Digest     string       `json:"digest"`
+	Details    ModelDetails `json:"details,omitempty"`
 }

 // ProcessModelResponse is a single model description in [ProcessResponse].
@@ -492,10 +478,6 @@ type GenerateResponse struct {
 	// Response is the textual response itself.
 	Response string `json:"response"`

-	// Thinking contains the text that was inside thinking tags in the
-	// original model output when ChatRequest.Think is enabled.
-	Thinking string `json:"thinking,omitempty"`
-
 	// Done specifies if the response is complete.
 	Done bool `json:"done"`

--- a/api/types_test.go
+++ b/api/types_test.go
@@ -372,50 +372,3 @@ func TestPropertyType_MarshalJSON(t *testing.T) {
 		})
 	}
 }
-
-func TestThinking_UnmarshalJSON(t *testing.T) {
-	trueVal := true
-	falseVal := false
-
-	tests := []struct {
-		name             string
-		input            string
-		expectedThinking *bool
-		expectedError    bool
-	}{
-		{
-			name:             "true",
-			input:            `{ "think": true }`,
-			expectedThinking: &trueVal,
-		},
-		{
-			name:             "false",
-			input:            `{ "think": false }`,
-			expectedThinking: &falseVal,
-		},
-		{
-			name:             "unset",
-			input:            `{ }`,
-			expectedThinking: nil,
-		},
-		{
-			name:             "invalid",
-			input:            `{ "think": "true" }`,
-			expectedThinking: nil,
-			expectedError:    true,
-		},
-	}
-
-	for _, test := range tests {
-		t.Run(test.name, func(t *testing.T) {
-			var req GenerateRequest
-			err := json.Unmarshal([]byte(test.input), &req)
-			if test.expectedError {
-				require.Error(t, err)
-			} else {
-				require.NoError(t, err)
-				assert.Equal(t, test.expectedThinking, req.Think)
-			}
-		})
-	}
-}
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -39,7 +39,6 @@ import (
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/parser"
 	"github.com/ollama/ollama/progress"
-	"github.com/ollama/ollama/readline"
 	"github.com/ollama/ollama/runner"
 	"github.com/ollama/ollama/server"
 	"github.com/ollama/ollama/types/model"
@@ -47,23 +46,6 @@ import (
 	"github.com/ollama/ollama/version"
 )

-// ensureThinkingSupport emits a warning if the model does not advertise thinking support
-func ensureThinkingSupport(ctx context.Context, client *api.Client, name string) {
-	if name == "" {
-		return
-	}
-	resp, err := client.Show(ctx, &api.ShowRequest{Model: name})
-	if err != nil {
-		return
-	}
-	for _, cap := range resp.Capabilities {
-		if cap == model.CapabilityThinking {
-			return
-		}
-	}
-	fmt.Fprintf(os.Stderr, "warning: model %q does not support thinking output\n", name)
-}
-
 var errModelfileNotFound = errors.New("specified Modelfile wasn't found")

 func getModelfileName(cmd *cobra.Command) (string, error) {
@@ -283,9 +265,6 @@ func loadOrUnloadModel(cmd *cobra.Command, opts *runOptions) error {
 	req := &api.GenerateRequest{
 		Model:     opts.Model,
 		KeepAlive: opts.KeepAlive,
-
-		// pass Think here so we fail before getting to the chat prompt if the model doesn't support it
-		Think: opts.Think,
 	}

 	return client.Generate(cmd.Context(), req, func(api.GenerateResponse) error { return nil })
@@ -320,22 +299,6 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 	}
 	opts.Format = format

-	thinkFlag := cmd.Flags().Lookup("think")
-	if thinkFlag.Changed {
-		think, err := cmd.Flags().GetBool("think")
-		if err != nil {
-			return err
-		}
-		opts.Think = &think
-	} else {
-		opts.Think = nil
-	}
-	hidethinking, err := cmd.Flags().GetBool("hidethinking")
-	if err != nil {
-		return err
-	}
-	opts.HideThinking = hidethinking
-
 	keepAlive, err := cmd.Flags().GetString("keepalive")
 	if err != nil {
 		return err
@@ -399,11 +362,6 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 		return err
 	}

-	opts.Think, err = inferThinkingOption(&info.Capabilities, &opts, thinkFlag.Changed)
-	if err != nil {
-		return err
-	}
-
 	opts.MultiModal = slices.Contains(info.Capabilities, model.CapabilityVision)

 	// TODO: remove the projector info and vision info checks below,
@@ -789,38 +747,11 @@ func showInfo(resp *api.ShowResponse, verbose bool, w io.Writer) error {
 				case float64:
 					v = fmt.Sprintf("%g", vData)
 				case []any:
-					targetWidth := 10 // Small width where we are displaying the data in a column
-
-					var itemsToShow int
-					totalWidth := 1 // Start with 1 for opening bracket
-
-					// Find how many we can fit
-					for i := range vData {
-						itemStr := fmt.Sprintf("%v", vData[i])
-						width := runewidth.StringWidth(itemStr)
-
-						// Add separator width (", ") for all items except the first
-						if i > 0 {
-							width += 2
-						}
-
-						// Check if adding this item would exceed our width limit
-						if totalWidth+width > targetWidth && i > 0 {
-							break
-						}
-
-						totalWidth += width
-						itemsToShow++
-					}
-
-					// Format the output
-					if itemsToShow < len(vData) {
-						v = fmt.Sprintf("%v", vData[:itemsToShow])
-						v = strings.TrimSuffix(v, "]")
-						v += fmt.Sprintf(" ...+%d more]", len(vData)-itemsToShow)
-					} else {
-						v = fmt.Sprintf("%v", vData)
+					n := 3
+					if len(vData) < n {
+						n = len(vData)
 					}
+					v = fmt.Sprintf("%v", vData[:n])
 				default:
 					v = fmt.Sprintf("%T", vData)
 				}
@@ -841,19 +772,10 @@ func showInfo(resp *api.ShowResponse, verbose bool, w io.Writer) error {

 	head := func(s string, n int) (rows [][]string) {
 		scanner := bufio.NewScanner(strings.NewReader(s))
-		count := 0
-		for scanner.Scan() {
-			text := strings.TrimSpace(scanner.Text())
-			if text == "" {
-				continue
+		for scanner.Scan() && (len(rows) < n || n < 0) {
+			if text := scanner.Text(); text != "" {
+				rows = append(rows, []string{"", strings.TrimSpace(text)})
 			}
-			count++
-			if n < 0 || count <= n {
-				rows = append(rows, []string{"", text})
-			}
-		}
-		if n >= 0 && count > n {
-			rows = append(rows, []string{"", "..."})
 		}
 		return
 	}
@@ -965,19 +887,17 @@ func PullHandler(cmd *cobra.Command, args []string) error {
 type generateContextKey string

 type runOptions struct {
-	Model        string
-	ParentModel  string
-	Prompt       string
-	Messages     []api.Message
-	WordWrap     bool
-	Format       string
-	System       string
-	Images       []api.ImageData
-	Options      map[string]any
-	MultiModal   bool
-	KeepAlive    *api.Duration
-	Think        *bool
-	HideThinking bool
+	Model       string
+	ParentModel string
+	Prompt      string
+	Messages    []api.Message
+	WordWrap    bool
+	Format      string
+	System      string
+	Images      []api.ImageData
+	Options     map[string]any
+	MultiModal  bool
+	KeepAlive   *api.Duration
 }

 type displayResponseState struct {
@@ -1033,26 +953,6 @@ func displayResponse(content string, wordWrap bool, state *displayResponseState)
 	}
 }

-func thinkingOutputOpeningText(plainText bool) string {
-	text := "Thinking...\n"
-
-	if plainText {
-		return text
-	}
-
-	return readline.ColorGrey + readline.ColorBold + text + readline.ColorDefault + readline.ColorGrey
-}
-
-func thinkingOutputClosingText(plainText bool) string {
-	text := "...done thinking.\n\n"
-
-	if plainText {
-		return text
-	}
-
-	return readline.ColorGrey + readline.ColorBold + text + readline.ColorDefault
-}
-
 func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
 	client, err := api.ClientFromEnvironment()
 	if err != nil {
@@ -1080,34 +980,14 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
 	var latest api.ChatResponse
 	var fullResponse strings.Builder
 	var role string
-	var thinkTagOpened bool = false
-	var thinkTagClosed bool = false

 	fn := func(response api.ChatResponse) error {
-		if response.Message.Content != "" || !opts.HideThinking {
-			p.StopAndClear()
-		}
+		p.StopAndClear()

 		latest = response

 		role = response.Message.Role
-		if response.Message.Thinking != "" && !opts.HideThinking {
-			if !thinkTagOpened {
-				fmt.Print(thinkingOutputOpeningText(false))
-				thinkTagOpened = true
-			}
-			displayResponse(response.Message.Thinking, opts.WordWrap, state)
-		}
-
 		content := response.Message.Content
-		if thinkTagOpened && !thinkTagClosed && content != "" {
-			fmt.Print(thinkingOutputClosingText(false))
-			thinkTagClosed = true
-		}
-		// purposefully not putting thinking blocks in the response, which would
-		// only be needed if we later added tool calling to the cli (they get
-		// filtered out anyway since current models don't expect them unless you're
-		// about to finish some tool calls)
 		fullResponse.WriteString(content)

 		displayResponse(content, opts.WordWrap, state)
@@ -1124,7 +1004,6 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
 		Messages: opts.Messages,
 		Format:   json.RawMessage(opts.Format),
 		Options:  opts.Options,
-		Think:    opts.Think,
 	}

 	if opts.KeepAlive != nil {
@@ -1186,32 +1065,13 @@ func generate(cmd *cobra.Command, opts runOptions) error {
 	}()

 	var state *displayResponseState = &displayResponseState{}
-	var thinkTagOpened bool = false
-	var thinkTagClosed bool = false
-
-	plainText := !term.IsTerminal(int(os.Stdout.Fd()))

 	fn := func(response api.GenerateResponse) error {
+		p.StopAndClear()
+
 		latest = response
 		content := response.Response

-		if response.Response != "" || !opts.HideThinking {
-			p.StopAndClear()
-		}
-
-		if response.Thinking != "" && !opts.HideThinking {
-			if !thinkTagOpened {
-				fmt.Print(thinkingOutputOpeningText(plainText))
-				thinkTagOpened = true
-			}
-			displayResponse(response.Thinking, opts.WordWrap, state)
-		}
-
-		if thinkTagOpened && !thinkTagClosed && content != "" {
-			fmt.Print(thinkingOutputClosingText(plainText))
-			thinkTagClosed = true
-		}
-
 		displayResponse(content, opts.WordWrap, state)

 		return nil
@@ -1237,7 +1097,6 @@ func generate(cmd *cobra.Command, opts runOptions) error {
 		System:    opts.System,
 		Options:   opts.Options,
 		KeepAlive: opts.KeepAlive,
-		Think:     opts.Think,
 	}

 	if err := client.Generate(ctx, &request, fn); err != nil {
@@ -1341,11 +1200,11 @@ func checkServerHeartbeat(cmd *cobra.Command, _ []string) error {
 		return err
 	}
 	if err := client.Heartbeat(cmd.Context()); err != nil {
-		if !(strings.Contains(err.Error(), " refused") || strings.Contains(err.Error(), "could not connect")) {
+		if !strings.Contains(err.Error(), " refused") {
 			return err
 		}
 		if err := startApp(cmd.Context(), client); err != nil {
-			return fmt.Errorf("ollama server not responding - %w", err)
+			return errors.New("could not connect to ollama app, is it running?")
 		}
 	}
 	return nil
@@ -1423,7 +1282,7 @@ func NewCLI() *cobra.Command {
 	}

 	createCmd.Flags().StringP("file", "f", "", "Name of the Modelfile (default \"Modelfile\"")
-	createCmd.Flags().StringP("quantize", "q", "", "Quantize model to this level (e.g. q4_K_M)")
+	createCmd.Flags().StringP("quantize", "q", "", "Quantize model to this level (e.g. q4_0)")

 	showCmd := &cobra.Command{
 		Use:     "show MODEL",
@@ -1453,8 +1312,6 @@ func NewCLI() *cobra.Command {
 	runCmd.Flags().Bool("insecure", false, "Use an insecure registry")
 	runCmd.Flags().Bool("nowordwrap", false, "Don't wrap words to the next line automatically")
 	runCmd.Flags().String("format", "", "Response format (e.g. json)")
-	runCmd.Flags().Bool("think", false, "Whether to use thinking mode for supported models")
-	runCmd.Flags().Bool("hidethinking", false, "Hide thinking output (if provided)")

 	stopCmd := &cobra.Command{
 		Use:     "stop MODEL",
@@ -1506,6 +1363,7 @@ func NewCLI() *cobra.Command {
 		PreRunE: checkServerHeartbeat,
 		RunE:    ListRunningHandler,
 	}
+
 	copyCmd := &cobra.Command{
 		Use:     "cp SOURCE DESTINATION",
 		Short:   "Copy a model",
@@ -1594,45 +1452,3 @@ func NewCLI() *cobra.Command {

 	return rootCmd
 }
-
-// If the user has explicitly set thinking options, either through the CLI or
-// through the `/set think` or `set nothink` interactive options, then we
-// respect them. Otherwise, we check model capabilities to see if the model
-// supports thinking. If the model does support thinking, we enable it.
-// Otherwise, we unset the thinking option (which is different than setting it
-// to false).
-//
-// If capabilities are not provided, we fetch them from the server.
-func inferThinkingOption(caps *[]model.Capability, runOpts *runOptions, explicitlySetByUser bool) (*bool, error) {
-	if explicitlySetByUser {
-		return runOpts.Think, nil
-	}
-
-	if caps == nil {
-		client, err := api.ClientFromEnvironment()
-		if err != nil {
-			return nil, err
-		}
-		ret, err := client.Show(context.Background(), &api.ShowRequest{
-			Model: runOpts.Model,
-		})
-		if err != nil {
-			return nil, err
-		}
-		caps = &ret.Capabilities
-	}
-
-	thinkingSupported := false
-	for _, cap := range *caps {
-		if cap == model.CapabilityThinking {
-			thinkingSupported = true
-		}
-	}
-
-	if thinkingSupported {
-		thinking := true
-		return &thinking, nil
-	}
-
-	return nil, nil
-}
--- a/cmd/cmd_test.go
+++ b/cmd/cmd_test.go
@@ -225,7 +225,6 @@ Weigh anchor!
  System
    You are a pirate!    
    Ahoy, matey!         
-    ...                  

 `
 		if diff := cmp.Diff(expect, b.String()); diff != "" {
--- a/cmd/interactive.go
+++ b/cmd/interactive.go
@@ -62,8 +62,6 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 		fmt.Fprintln(os.Stderr, "  /set noformat          Disable formatting")
 		fmt.Fprintln(os.Stderr, "  /set verbose           Show LLM stats")
 		fmt.Fprintln(os.Stderr, "  /set quiet             Disable LLM stats")
-		fmt.Fprintln(os.Stderr, "  /set think             Enable thinking")
-		fmt.Fprintln(os.Stderr, "  /set nothink           Disable thinking")
 		fmt.Fprintln(os.Stderr, "")
 	}

@@ -130,7 +128,6 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {

 	var sb strings.Builder
 	var multiline MultilineState
-	var thinkExplicitlySet bool = opts.Think != nil

 	for {
 		line, err := scanner.Readline()
@@ -198,19 +195,11 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 			opts.Model = args[1]
 			opts.Messages = []api.Message{}
 			fmt.Printf("Loading model '%s'\n", opts.Model)
-			opts.Think, err = inferThinkingOption(nil, &opts, thinkExplicitlySet)
-			if err != nil {
-				return err
-			}
 			if err := loadOrUnloadModel(cmd, &opts); err != nil {
 				if strings.Contains(err.Error(), "not found") {
 					fmt.Printf("error: %v\n", err)
 					continue
 				}
-				if strings.Contains(err.Error(), "does not support thinking") {
-					fmt.Printf("error: %v\n", err)
-					continue
-				}
 				return err
 			}
 			continue
@@ -271,22 +260,6 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 						return err
 					}
 					fmt.Println("Set 'quiet' mode.")
-				case "think":
-					think := true
-					opts.Think = &think
-					thinkExplicitlySet = true
-					if client, err := api.ClientFromEnvironment(); err == nil {
-						ensureThinkingSupport(cmd.Context(), client, opts.Model)
-					}
-					fmt.Println("Set 'think' mode.")
-				case "nothink":
-					think := false
-					opts.Think = &think
-					thinkExplicitlySet = true
-					if client, err := api.ClientFromEnvironment(); err == nil {
-						ensureThinkingSupport(cmd.Context(), client, opts.Model)
-					}
-					fmt.Println("Set 'nothink' mode.")
 				case "format":
 					if len(args) < 3 || args[2] != "json" {
 						fmt.Println("Invalid or missing format. For 'json' mode use '/set format json'")
@@ -475,11 +448,6 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {

 			assistant, err := chat(cmd, opts)
 			if err != nil {
-				if strings.Contains(err.Error(), "does not support thinking") {
-					fmt.Printf("error: %v\n", err)
-					sb.Reset()
-					continue
-				}
 				return err
 			}
 			if assistant != nil {
--- a/cmd/start_darwin.go
+++ b/cmd/start_darwin.go
@@ -23,7 +23,7 @@ func startApp(ctx context.Context, client *api.Client) error {
 		return errors.New("could not find ollama app")
 	}
 	path := strings.Split(link, "Ollama.app")
-	if err := exec.Command("/usr/bin/open", "-j", "-a", path[0]+"Ollama.app").Run(); err != nil {
+	if err := exec.Command("/usr/bin/open", "-a", path[0]+"Ollama.app").Run(); err != nil {
 		return err
 	}
 	return waitForServer(ctx, client)
--- a/cmd/start_windows.go
+++ b/cmd/start_windows.go
@@ -4,27 +4,17 @@ import (
 	"context"
 	"errors"
 	"fmt"
-	"log/slog"
 	"os"
 	"os/exec"
-	"path"
 	"path/filepath"
 	"strings"
 	"syscall"
-	"unsafe"

 	"github.com/ollama/ollama/api"
-	"golang.org/x/sys/windows"
-)
-
-const (
-	Installer = "OllamaSetup.exe"
 )

 func startApp(ctx context.Context, client *api.Client) error {
-	if len(isProcRunning(Installer)) > 0 {
-		return fmt.Errorf("upgrade in progress...")
-	}
+	// log.Printf("XXX Attempting to find and start ollama app")
 	AppName := "ollama app.exe"
 	exe, err := os.Executable()
 	if err != nil {
@@ -45,11 +35,14 @@ func startApp(ctx context.Context, client *api.Client) error {
 			}
 		}
 	}
+	// log.Printf("XXX attempting to start app %s", appExe)

 	cmd_path := "c:\\Windows\\system32\\cmd.exe"
-	cmd := exec.Command(cmd_path, "/c", appExe, "hidden")
+	cmd := exec.Command(cmd_path, "/c", appExe)
+	// TODO: the CreationFlags/HideWindow settings below aren't working - a command window still pops up for some reason
 	cmd.SysProcAttr = &syscall.SysProcAttr{CreationFlags: 0x08000000, HideWindow: true}

+	// TODO: wiring up Stdin/Stdout/Stderr below didn't help hide the command window either
 	cmd.Stdin = strings.NewReader("")
 	cmd.Stdout = os.Stdout
 	cmd.Stderr = os.Stderr
@@ -63,50 +56,3 @@ func startApp(ctx context.Context, client *api.Client) error {
 	}
 	return waitForServer(ctx, client)
 }
-
-func isProcRunning(procName string) []uint32 {
-	pids := make([]uint32, 2048)
-	var ret uint32
-	if err := windows.EnumProcesses(pids, &ret); err != nil || ret == 0 {
-		slog.Debug("failed to check for running installers", "error", err)
-		return nil
-	}
-	if ret > uint32(len(pids)) {
-		pids = make([]uint32, ret+10)
-		if err := windows.EnumProcesses(pids, &ret); err != nil || ret == 0 {
-			slog.Debug("failed to check for running installers", "error", err)
-			return nil
-		}
-	}
-	if ret < uint32(len(pids)) {
-		pids = pids[:ret]
-	}
-	var matches []uint32
-	for _, pid := range pids {
-		if pid == 0 {
-			continue
-		}
-		hProcess, err := windows.OpenProcess(windows.PROCESS_QUERY_INFORMATION|windows.PROCESS_VM_READ, false, pid)
-		if err != nil {
-			continue
-		}
-		defer windows.CloseHandle(hProcess)
-		var module windows.Handle
-		var cbNeeded uint32
-		cb := (uint32)(unsafe.Sizeof(module))
-		if err := windows.EnumProcessModules(hProcess, &module, cb, &cbNeeded); err != nil {
-			continue
-		}
-		var sz uint32 = 1024 * 8
-		moduleName := make([]uint16, sz)
-		cb = uint32(len(moduleName)) * (uint32)(unsafe.Sizeof(uint16(0)))
-		if err := windows.GetModuleBaseName(hProcess, module, &moduleName[0], cb); err != nil && err != syscall.ERROR_INSUFFICIENT_BUFFER {
-			continue
-		}
-		exeFile := path.Base(strings.ToLower(syscall.UTF16ToString(moduleName)))
-		if strings.EqualFold(exeFile, procName) {
-			matches = append(matches, pid)
-		}
-	}
-	return matches
-}
--- a/cmd/warn_thinking_test.go
+++ b/cmd/warn_thinking_test.go
@@ -1,63 +0,0 @@
-package cmd
-
-import (
-	"encoding/json"
-	"io"
-	"net/http"
-	"net/http/httptest"
-	"os"
-	"strings"
-	"testing"
-
-	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/types/model"
-)
-
-// Test that a warning is printed when thinking is requested but not supported.
-func TestWarnMissingThinking(t *testing.T) {
-	cases := []struct {
-		capabilities []model.Capability
-		expectWarn   bool
-	}{
-		{capabilities: []model.Capability{model.CapabilityThinking}, expectWarn: false},
-		{capabilities: []model.Capability{}, expectWarn: true},
-	}
-
-	for _, tc := range cases {
-		srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-			if r.URL.Path != "/api/show" || r.Method != http.MethodPost {
-				t.Fatalf("unexpected request to %s %s", r.URL.Path, r.Method)
-			}
-			var req api.ShowRequest
-			if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
-				t.Fatalf("decode request: %v", err)
-			}
-			resp := api.ShowResponse{Capabilities: tc.capabilities}
-			if err := json.NewEncoder(w).Encode(resp); err != nil {
-				t.Fatalf("encode response: %v", err)
-			}
-		}))
-		defer srv.Close()
-
-		t.Setenv("OLLAMA_HOST", srv.URL)
-		client, err := api.ClientFromEnvironment()
-		if err != nil {
-			t.Fatal(err)
-		}
-		oldStderr := os.Stderr
-		r, w, _ := os.Pipe()
-		os.Stderr = w
-		ensureThinkingSupport(t.Context(), client, "m")
-		w.Close()
-		os.Stderr = oldStderr
-		out, _ := io.ReadAll(r)
-
-		warned := strings.Contains(string(out), "warning:")
-		if tc.expectWarn && !warned {
-			t.Errorf("expected warning, got none")
-		}
-		if !tc.expectWarn && warned {
-			t.Errorf("did not expect warning, got: %s", string(out))
-		}
-	}
-}
--- a/convert/convert.go
+++ b/convert/convert.go
@@ -53,11 +53,8 @@ func (ModelParameters) KV(t *Tokenizer) ggml.KV {
 	}

 	for _, sv := range t.SpecialVocabulary {
-		kv[fmt.Sprintf("tokenizer.ggml.add_%s_token", sv.Key())] = sv.AddToken
 		kv[fmt.Sprintf("tokenizer.ggml.%s_token_id", sv.Key())] = uint32(sv.ID)
-		if len(sv.IDs) > 0 {
-			kv[fmt.Sprintf("tokenizer.ggml.%s_token_ids", sv.Key())] = sv.IDs
-		}
+		kv[fmt.Sprintf("tokenizer.ggml.add_%s_token", sv.Key())] = sv.AddToken
 	}

 	return kv
--- a/convert/convert_llama.go
+++ b/convert/convert_llama.go
@@ -139,8 +139,7 @@ func (p *llamaModel) Tensors(ts []Tensor) []*ggml.Tensor {
 	}

 	for _, t := range ts {
-		if strings.HasSuffix(t.Name(), "attn_q.weight") || strings.HasSuffix(t.Name(), "attn_k.weight") ||
-			strings.HasSuffix(t.Name(), "attn_q_proj.weight") || strings.HasSuffix(t.Name(), "attn_k_proj.weight") {
+		if strings.HasSuffix(t.Name(), "attn_q.weight") || strings.HasSuffix(t.Name(), "attn_k.weight") {
 			if !p.skipRepack {
 				t.SetRepacker(p.repack)
 			}
@@ -182,9 +181,9 @@ func (p *llamaModel) repack(name string, data []float32, shape []uint64) ([]floa
 	}

 	var heads uint32
-	if strings.HasSuffix(name, "attn_q.weight") || strings.HasSuffix(name, "attn_q_proj.weight") {
+	if strings.HasSuffix(name, "attn_q.weight") {
 		heads = p.NumAttentionHeads
-	} else if strings.HasSuffix(name, "attn_k.weight") || strings.HasSuffix(name, "attn_k_proj.weight") {
+	} else if strings.HasSuffix(name, "attn_k.weight") {
 		heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
 	} else {
 		return nil, fmt.Errorf("unknown tensor for repack: %s", name)
--- a/convert/convert_mllama.go
+++ b/convert/convert_mllama.go
@@ -94,9 +94,7 @@ func (m *mllamaModel) Tensors(ts []Tensor) []*ggml.Tensor {
 	var out []*ggml.Tensor
 	var text []Tensor
 	for _, t := range ts {
-		if !strings.HasPrefix(t.Name(), "v.") && !strings.HasPrefix(t.Name(), "mm.") {
-			text = append(text, t)
-		} else if t.Name() == "v.position_embd.gate" {
+		if t.Name() == "v.position_embd.gate" {
 			for _, name := range []string{"v.position_embd.gate", "v.tile_position_embd.gate"} {
 				tt := t.Clone()
 				tt.SetRepacker(m.repack(name))
@@ -107,21 +105,23 @@ func (m *mllamaModel) Tensors(ts []Tensor) []*ggml.Tensor {
 					WriterTo: tt,
 				})
 			}
-		} else {
-			if t.Name() == "v.pre_tile_position_embd.gate" || t.Name() == "v.post_tile_position_embd.gate" {
-				t.SetRepacker(m.repack(t.Name()))
-			} else if strings.HasSuffix(t.Name(), "attn_q.weight") || strings.HasSuffix(t.Name(), "attn_k.weight") {
-				t.SetRepacker(m.repack(t.Name()))
-			} else if strings.HasSuffix(t.Name(), "attn_gate") || strings.HasSuffix(t.Name(), "ffn_gate") {
-				t.SetRepacker(m.repack(t.Name()))
-			}
-
+		} else if t.Name() == "v.pre_tile_position_embd.gate" || t.Name() == "v.post_tile_position_embd.gate" {
+			t.SetRepacker(m.repack(t.Name()))
 			out = append(out, &ggml.Tensor{
 				Name:     t.Name(),
 				Kind:     t.Kind(),
 				Shape:    t.Shape(),
 				WriterTo: t,
 			})
+		} else if strings.HasPrefix(t.Name(), "v.") || strings.HasPrefix(t.Name(), "mm.") {
+			out = append(out, &ggml.Tensor{
+				Name:     t.Name(),
+				Kind:     t.Kind(),
+				Shape:    t.Shape(),
+				WriterTo: t,
+			})
+		} else {
+			text = append(text, t)
 		}
 	}

@@ -137,35 +137,16 @@ func (m *mllamaModel) repack(name string) Repacker {

 		var t tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))

-		if strings.HasSuffix(name, "attn_q.weight") || strings.HasSuffix(name, "attn_k.weight") {
-			heads := m.VisionModel.AttentionHeads
-			if err := t.Reshape(append([]int{int(heads), 2, dims[0] / int(heads) / 2}, dims[1:]...)...); err != nil {
-				return nil, err
-			}
+		t, err = tensor.Tanh(t)
+		if err != nil {
+			return nil, err
+		}

-			if err := t.T(0, 2, 1, 3); err != nil {
-				return nil, err
-			}
-
-			if err := t.Reshape(dims...); err != nil {
-				return nil, err
-			}
-
-			if err := t.Transpose(); err != nil {
-				return nil, err
-			}
-		} else {
-			t, err = tensor.Tanh(t)
+		if name == "v.position_embd.gate" {
+			t, err = tensor.Sub(float32(1), t)
 			if err != nil {
 				return nil, err
 			}
-
-			if name == "v.position_embd.gate" {
-				t, err = tensor.Sub(float32(1), t)
-				if err != nil {
-					return nil, err
-				}
-			}
 		}

 		t = tensor.Materialize(t)
--- a/convert/convert_test.go
+++ b/convert/convert_test.go
@@ -47,7 +47,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, ggml.KV, ggml.Tensors) {
 	}
 	t.Cleanup(func() { r.Close() })

-	m, err := ggml.Decode(r, -1)
+	m, _, err := ggml.Decode(r, -1)
 	if err != nil {
 		t.Fatal(err)
 	}
@@ -332,7 +332,7 @@ func TestConvertAdapter(t *testing.T) {
 			}
 			defer r.Close()

-			m, err := ggml.Decode(r, -1)
+			m, _, err := ggml.Decode(r, -1)
 			if err != nil {
 				t.Fatal(err)
 			}
--- a/convert/tokenizer.go
+++ b/convert/tokenizer.go
@@ -110,7 +110,6 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
 	}

 	if f, err := fsys.Open("tokenizer_config.json"); errors.Is(err, os.ErrNotExist) {
-		// noop
 	} else if err != nil {
 		return nil, err
 	} else {
@@ -172,34 +171,6 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
 		}
 	}

-	if f, err := fsys.Open("generation_config.json"); errors.Is(err, os.ErrNotExist) {
-	} else if err != nil {
-		return nil, err
-	} else {
-		defer f.Close()
-
-		var p map[string]json.RawMessage
-		if err := json.NewDecoder(f).Decode(&p); err != nil {
-			return nil, err
-		}
-
-		for _, st := range specialTokenTypes {
-			if bts, ok := p[fmt.Sprintf("%s_token_id", st)]; ok {
-				var ids []int32
-				if err := json.Unmarshal(bts, &ids); err != nil {
-					// value is not a list so the existing ID is used
-					continue
-				}
-
-				if i := slices.IndexFunc(t.SpecialVocabulary, func(sv *SpecialVocabulary) bool {
-					return sv.Type == st
-				}); i >= 0 {
-					t.SpecialVocabulary[i].IDs = ids
-				}
-			}
-		}
-	}
-
 	return t, nil
 }

@@ -309,9 +280,6 @@ type SpecialVocabulary struct {
 	ID       int
 	Content  string
 	AddToken bool
-
-	// IDs is populated by generation_config.json
-	IDs []int32
 }

 func (sv SpecialVocabulary) Key() string {
--- a/convert/tokenizer_test.go
+++ b/convert/tokenizer_test.go
@@ -247,67 +247,6 @@ func TestParseTokenizer(t *testing.T) {
 				Pre: "default",
 			},
 		},
-		{
-			name: "generation config eos token ids",
-			fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
-				"tokenizer.json": strings.NewReader(`{
-					"added_tokens": [
-						{
-							"id": 0,
-							"content": "<bos>",
-							"special": true
-						},
-						{
-							"id": 1,
-							"content": "<eos>",
-							"special": true
-						},
-						{
-							"id": 2,
-							"content": "<eot>",
-							"special": true
-						},
-						{
-							"id": 3,
-							"content": "<eom>",
-							"special": true
-						}
-					],
-					"model": {
-						"vocab": {
-							"<bos>": 0,
-							"<eos>": 1,
-							"<eot>": 2,
-							"<eom>": 3
-						}
-					}
-				}`),
-				"tokenizer_config.json": strings.NewReader(`{
-					"add_bos_token": true,
-					"add_eos_token": false,
-					"bos_token": "<bos>",
-					"eos_token": "<eos>"
-				}`),
-				"generation_config.json": strings.NewReader(`{
-					"bos_token_id": 0,
-					"eos_token_id": [1, 2, 3]
-				}`),
-			}),
-			specialTokenTypes: []string{"pad", "eos", "bos", "unk"},
-			want: &Tokenizer{
-				Vocabulary: &Vocabulary{
-					Model:  "gpt2",
-					Tokens: []string{"<bos>", "<eos>", "<eot>", "<eom>"},
-					Scores: []float32{0, 1, 2, 3},
-					Types:  []int32{3, 3, 3, 3},
-				},
-				SpecialVocabulary: []*SpecialVocabulary{
-					{Type: "eos", Content: "<eos>", ID: 1, IDs: []int32{1, 2, 3}, AddToken: false},
-					{Type: "bos", Content: "<bos>", ID: 0, AddToken: true},
-				},
-				Pre: "default",
-			},
-		},
 	}

 	for _, tt := range cases {
--- a/docs/api.md
+++ b/docs/api.md
@@ -43,7 +43,6 @@ Generate a response for a given prompt with a provided model. This is a streamin
 - `prompt`: the prompt to generate a response for
 - `suffix`: the text after the model response
 - `images`: (optional) a list of base64-encoded images (for multimodal models such as `llava`)
- `think`: (for thinking models) should the model think before responding?

 Advanced parameters (optional):

@@ -491,13 +490,11 @@ Generate the next message in a chat with a provided model. This is a streaming e
 - `model`: (required) the [model name](#model-names)
 - `messages`: the messages of the chat, this can be used to keep a chat memory
 - `tools`: list of tools in JSON for the model to use if supported
- `think`: (for thinking models) should the model think before responding?

 The `message` object has the following fields:

 - `role`: the role of the message, either `system`, `user`, `assistant`, or `tool`
 - `content`: the content of the message
- `thinking`: (for thinking models) the model's thinking process
 - `images` (optional): a list of images to include in the message (for multimodal models such as `llava`)
 - `tool_calls` (optional): a list of tools in JSON that the model wants to use

@@ -1157,15 +1154,11 @@ A single JSON object will be returned.
 {
  "models": [
    {
-
-      "model": "codellama:13b",
-      "modified_at": "2023-11-04T14:56:49.277302595-07:00",
-      "size": 7365960935,
-      "digest": "9f438cb9cd581fc025612d27f7c1a6669ff83a8bb0ed86c94fcf4c5440555697",
-      "capabilities": [
-        "completion"
-      ],
-
+      "name": "deepseek-r1:latest",
+      "model": "deepseek-r1:latest",
+      "modified_at": "2025-05-10T08:06:48.639712648-07:00",
+      "size": 4683075271,
+      "digest": "0a8c266910232fd3291e71e5ba1e058cc5af9d411192cf88b6d30e92b6e73163",
      "details": {
        "parent_model": "",
        "format": "gguf",
@@ -1178,16 +1171,11 @@ A single JSON object will be returned.
      }
    },
    {
-
-      "model": "llama4:latest",
-      "modified_at": "2023-12-07T09:32:18.757212583-08:00",
-      "size": 3825819519,
-      "digest": "fe938a131f40e6f6d40083c9f0f430a515233eb2edaa6d72eb85c50d64f2300e",
-      "capabilities": [
-        "completion",
-        "vision"
-      ],
-
+      "name": "llama3.2:latest",
+      "model": "llama3.2:latest",
+      "modified_at": "2025-05-04T17:37:44.706015396-07:00",
+      "size": 2019393189,
+      "digest": "a80c4f17acd55265feec403c7aef86be0c25983ab279d83f3bcd3abbcb5b8b72",
      "details": {
        "parent_model": "",
        "format": "gguf",
--- a/docs/development.md
+++ b/docs/development.md
@@ -118,7 +118,7 @@ To run tests, use `go test`:
 go test ./...
 ```

-> NOTE: In rare cirumstances, you may need to change a package using the new
+> NOTE: In rare circumstances, you may need to change a package using the new
 > "synctest" package in go1.24.
 >
 > If you do not have the "synctest" package enabled, you will not see build or
--- a/docs/import.md
+++ b/docs/import.md
@@ -132,12 +132,22 @@ success

 ### Supported Quantizations

+- `q4_0`
+- `q4_1`
+- `q5_0`
+- `q5_1`
 - `q8_0`

 #### K-means Quantizations

+- `q3_K_S`
+- `q3_K_M`
+- `q3_K_L`
 - `q4_K_S`
 - `q4_K_M`
+- `q5_K_S`
+- `q5_K_M`
+- `q6_K`


 ## Sharing your model on ollama.com
--- a/envconfig/config.go
+++ b/envconfig/config.go
@@ -183,8 +183,6 @@ var (
 	NewEngine = Bool("OLLAMA_NEW_ENGINE")
 	// ContextLength sets the default context length
 	ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 4096)
-	// Auth enables authentication between the Ollama client and server
-	UseAuth = Bool("OLLAMA_AUTH")
 )

 func String(s string) func() string {
--- a/fs/ggml/ggml.go
+++ b/fs/ggml/ggml.go
@@ -6,6 +6,7 @@ import (
 	"fmt"
 	"io"
 	"log/slog"
+	"math"
 	"slices"
 	"strings"

@@ -15,7 +16,6 @@ import (
 type GGML struct {
 	container
 	model
-	Length int64
 }

 type model interface {
@@ -387,12 +387,12 @@ func DetectContentType(b []byte) string {
 //
 // It collects array values for arrays with a size less than or equal to
 // maxArraySize. If the maxArraySize is negative, all arrays are collected.
-func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, error) {
+func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
 	rs = bufioutil.NewBufferedSeeker(rs, 32<<10)

 	var magic uint32
 	if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
-		return nil, err
+		return nil, 0, err
 	}

 	var c container
@@ -402,25 +402,24 @@ func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, error) {
 	case FILE_MAGIC_GGUF_BE:
 		c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize}
 	default:
-		return nil, errors.New("invalid file magic")
+		return nil, 0, errors.New("invalid file magic")
 	}

 	model, err := c.Decode(rs)
 	if err != nil {
-		return nil, err
+		return nil, 0, err
 	}

 	offset, err := rs.Seek(0, io.SeekCurrent)
 	if err != nil {
-		return nil, err
+		return nil, 0, err
 	}

 	// final model type
 	return &GGML{
 		container: c,
 		model:     model,
-		Length:    offset,
-	}, nil
+	}, offset, nil
 }

 func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string) (kv []uint64, partialOffload, fullOffload uint64) {
@@ -654,15 +653,24 @@ func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
 			numPatches*numPatches*headCount)
 	case "qwen25vl":
 		maxPixels := uint64(llm.KV().Uint("vision.max_pixels", 28*28*1280))
+		mergeSize := uint64(llm.KV().Uint("vision.spatial_merge_size", 2))
+		temporalPatchSize := uint64(2)

-		numPatches := maxPixels / (patchSize * patchSize)
+		// Calculate max possible patches based on max_pixels
+		maxHeight := uint64(math.Sqrt(float64(maxPixels)))
+		maxWidth := maxPixels / maxHeight
+		maxGridHeight := maxHeight / patchSize
+		maxGridWidth := maxWidth / patchSize
+		// Account for merged patches (mergeSize x mergeSize grid, 2x2 by default)
+		numPatches := (maxGridHeight * maxGridWidth) / (mergeSize * mergeSize)

+		// Calculate graph size based on typical operations in ProcessImage and createPatches
 		graphSize = 4 * (maxPixels*numChannels + // Original image storage
 			// Normalized pixels
 			maxPixels*numChannels +
-			// Patches storage (numPatches * channels * patchSize^2)
-			numPatches*numChannels*patchSize*patchSize +
-			// Self-attention calculations
+			// Patches storage (numPatches * channels * temporalPatchSize * patchSize^2)
+			numPatches*numChannels*temporalPatchSize*patchSize*patchSize +
+			// Self-attention calculations (similar to other architectures)
 			numPatches*numPatches*headCount +
 			// Additional buffer for processing
 			embeddingLength*numPatches)
--- a/fs/ggml/gguf_test.go
+++ b/fs/ggml/gguf_test.go
@@ -35,7 +35,7 @@ func TestWriteGGUF(t *testing.T) {
 	}
 	defer r.Close()

-	ff, err := Decode(r, 0)
+	ff, _, err := Decode(r, 0)
 	if err != nil {
 		t.Fatal(err)
 	}
--- a/fs/gguf/gguf.go
+++ b/fs/gguf/gguf.go
@@ -1,350 +0,0 @@
-package gguf
-
-import (
-	"bytes"
-	"cmp"
-	"encoding/binary"
-	"errors"
-	"fmt"
-	"io"
-	"iter"
-	"os"
-	"slices"
-	"strings"
-)
-
-const (
-	typeUint8 uint32 = iota
-	typeInt8
-	typeUint16
-	typeInt16
-	typeUint32
-	typeInt32
-	typeFloat32
-	typeBool
-	typeString
-	typeArray
-	typeUint64
-	typeInt64
-	typeFloat64
-)
-
-var ErrUnsupported = errors.New("unsupported")
-
-type File struct {
-	Magic   [4]byte
-	Version uint32
-
-	keyValues *lazy[KeyValue]
-	tensors   *lazy[TensorInfo]
-	offset    int64
-
-	file   *os.File
-	reader *readSeeker
-	bts    []byte
-}
-
-func Open(path string) (f *File, err error) {
-	f = &File{bts: make([]byte, 4096)}
-	f.file, err = os.Open(path)
-	if err != nil {
-		return nil, err
-	}
-
-	f.reader = newReadSeeker(f.file, 32<<10)
-
-	if err := binary.Read(f.reader, binary.LittleEndian, &f.Magic); err != nil {
-		return nil, err
-	}
-
-	if bytes.Equal(f.Magic[:], []byte("gguf")) {
-		return nil, fmt.Errorf("%w file type %v", ErrUnsupported, f.Magic)
-	}
-
-	if err := binary.Read(f.reader, binary.LittleEndian, &f.Version); err != nil {
-		return nil, err
-	}
-
-	if f.Version != 3 {
-		return nil, fmt.Errorf("%w version %v", ErrUnsupported, f.Version)
-	}
-
-	f.tensors, err = newLazy(f, f.readTensor)
-	if err != nil {
-		return nil, err
-	}
-
-	f.tensors.doneFunc = func() error {
-		offset, err := f.reader.Seek(0, io.SeekCurrent)
-		if err != nil {
-			return err
-		}
-
-		alignment := cmp.Or(f.KeyValue("general.alignment").Int(), 32)
-		f.offset = offset + (alignment-offset%alignment)%alignment
-		return nil
-	}
-
-	f.keyValues, err = newLazy(f, f.readKeyValue)
-	if err != nil {
-		return nil, err
-	}
-
-	return f, nil
-}
-
-func (f *File) readTensor() (TensorInfo, error) {
-	name, err := readString(f)
-	if err != nil {
-		return TensorInfo{}, err
-	}
-
-	dims, err := read[uint32](f)
-	if err != nil {
-		return TensorInfo{}, err
-	}
-
-	shape := make([]uint64, dims)
-	for i := range dims {
-		shape[i], err = read[uint64](f)
-		if err != nil {
-			return TensorInfo{}, err
-		}
-	}
-
-	type_, err := read[uint32](f)
-	if err != nil {
-		return TensorInfo{}, err
-	}
-
-	offset, err := read[uint64](f)
-	if err != nil {
-		return TensorInfo{}, err
-	}
-
-	return TensorInfo{
-		Name:   name,
-		Offset: offset,
-		Shape:  shape,
-		Type:   TensorType(type_),
-	}, nil
-}
-
-func (f *File) readKeyValue() (KeyValue, error) {
-	key, err := readString(f)
-	if err != nil {
-		return KeyValue{}, err
-	}
-
-	t, err := read[uint32](f)
-	if err != nil {
-		return KeyValue{}, err
-	}
-
-	value, err := func() (any, error) {
-		switch t {
-		case typeUint8:
-			return read[uint8](f)
-		case typeInt8:
-			return read[int8](f)
-		case typeUint16:
-			return read[uint16](f)
-		case typeInt16:
-			return read[int16](f)
-		case typeUint32:
-			return read[uint32](f)
-		case typeInt32:
-			return read[int32](f)
-		case typeUint64:
-			return read[uint64](f)
-		case typeInt64:
-			return read[int64](f)
-		case typeFloat32:
-			return read[float32](f)
-		case typeFloat64:
-			return read[float64](f)
-		case typeBool:
-			return read[bool](f)
-		case typeString:
-			return readString(f)
-		case typeArray:
-			return readArray(f)
-		default:
-			return nil, fmt.Errorf("%w type %d", ErrUnsupported, t)
-		}
-	}()
-	if err != nil {
-		return KeyValue{}, err
-	}
-
-	return KeyValue{
-		Key:   key,
-		Value: Value{value},
-	}, nil
-}
-
-func read[T any](f *File) (t T, err error) {
-	err = binary.Read(f.reader, binary.LittleEndian, &t)
-	return t, err
-}
-
-func readString(f *File) (string, error) {
-	n, err := read[uint64](f)
-	if err != nil {
-		return "", err
-	}
-
-	if int(n) > len(f.bts) {
-		f.bts = make([]byte, n)
-	}
-
-	bts := f.bts[:n]
-	if _, err := io.ReadFull(f.reader, bts); err != nil {
-		return "", err
-	}
-	defer clear(bts)
-
-	return string(bts), nil
-}
-
-func readArray(f *File) (any, error) {
-	t, err := read[uint32](f)
-	if err != nil {
-		return nil, err
-	}
-
-	n, err := read[uint64](f)
-	if err != nil {
-		return nil, err
-	}
-
-	switch t {
-	case typeUint8:
-		return readArrayData[uint8](f, n)
-	case typeInt8:
-		return readArrayData[int8](f, n)
-	case typeUint16:
-		return readArrayData[uint16](f, n)
-	case typeInt16:
-		return readArrayData[int16](f, n)
-	case typeUint32:
-		return readArrayData[uint32](f, n)
-	case typeInt32:
-		return readArrayData[int32](f, n)
-	case typeUint64:
-		return readArrayData[uint64](f, n)
-	case typeInt64:
-		return readArrayData[int64](f, n)
-	case typeFloat32:
-		return readArrayData[float32](f, n)
-	case typeFloat64:
-		return readArrayData[float64](f, n)
-	case typeBool:
-		return readArrayData[bool](f, n)
-	case typeString:
-		return readArrayString(f, n)
-	default:
-		return nil, fmt.Errorf("%w type %d", ErrUnsupported, t)
-	}
-}
-
-func readArrayData[T any](f *File, n uint64) (s []T, err error) {
-	s = make([]T, n)
-	for i := range n {
-		e, err := read[T](f)
-		if err != nil {
-			return nil, err
-		}
-
-		s[i] = e
-	}
-
-	return s, nil
-}
-
-func readArrayString(f *File, n uint64) (s []string, err error) {
-	s = make([]string, n)
-	for i := range n {
-		e, err := readString(f)
-		if err != nil {
-			return nil, err
-		}
-
-		s[i] = e
-	}
-
-	return s, nil
-}
-
-func (f *File) Close() error {
-	f.keyValues.stop()
-	f.tensors.stop()
-	return f.file.Close()
-}
-
-func (f *File) KeyValue(key string) KeyValue {
-	if !strings.HasPrefix(key, "general.") && !strings.HasPrefix(key, "tokenizer.") {
-		key = f.KeyValue("general.architecture").String() + "." + key
-	}
-
-	if index := slices.IndexFunc(f.keyValues.values, func(kv KeyValue) bool {
-		return kv.Key == key
-	}); index >= 0 {
-		return f.keyValues.values[index]
-	}
-
-	for keyValue, ok := f.keyValues.next(); ok; keyValue, ok = f.keyValues.next() {
-		if keyValue.Key == key {
-			return keyValue
-		}
-	}
-
-	return KeyValue{}
-}
-
-func (f *File) NumKeyValues() int {
-	return int(f.keyValues.count)
-}
-
-func (f *File) KeyValues() iter.Seq2[int, KeyValue] {
-	return f.keyValues.All()
-}
-
-func (f *File) TensorInfo(name string) TensorInfo {
-	if index := slices.IndexFunc(f.tensors.values, func(t TensorInfo) bool {
-		return t.Name == name
-	}); index >= 0 {
-		return f.tensors.values[index]
-	}
-
-	// fast-forward through key values if we haven't already
-	_ = f.keyValues.rest()
-	for tensor, ok := f.tensors.next(); ok; tensor, ok = f.tensors.next() {
-		if tensor.Name == name {
-			return tensor
-		}
-	}
-
-	return TensorInfo{}
-}
-
-func (f *File) NumTensors() int {
-	return int(f.tensors.count)
-}
-
-func (f *File) TensorInfos() iter.Seq2[int, TensorInfo] {
-	// fast forward through key values if we haven't already
-	f.keyValues.rest()
-	return f.tensors.All()
-}
-
-func (f *File) TensorReader(name string) (TensorInfo, io.Reader, error) {
-	t := f.TensorInfo(name)
-	if t.NumBytes() == 0 {
-		return TensorInfo{}, nil, fmt.Errorf("tensor %s not found", name)
-	}
-
-	// fast forward through tensor info if we haven't already
-	_ = f.tensors.rest()
-	return t, io.NewSectionReader(f.file, f.offset+int64(t.Offset), t.NumBytes()), nil
-}
--- a/fs/gguf/gguf_test.go
+++ b/fs/gguf/gguf_test.go
@@ -1,320 +0,0 @@
-package gguf
-
-import (
-	"encoding/binary"
-	"fmt"
-	"os"
-	"path/filepath"
-	"slices"
-	"testing"
-)
-
-func TestRead(t *testing.T) {
-	// Setup
-	tempDir := t.TempDir()
-	tempFile := filepath.Join(tempDir, "test.gguf")
-
-	if err := createTestGGUFFile(tempFile, map[string]any{
-		"general.architecture": "llama",
-		"general.alignment":    int64(32),
-	}, []testTensorInfo{
-		{Name: "token_embd.weight", Shape: []uint64{1000, 512}, Type: 1}, // F16
-		{Name: "output.weight", Shape: []uint64{512, 1000}, Type: 1},     // F16
-	}); err != nil {
-		t.Fatal(err)
-	}
-
-	f, err := Open(tempFile)
-	if err != nil {
-		t.Fatal(err)
-	}
-	defer f.Close()
-
-	// Test
-	if got := f.NumKeyValues(); got != 2 {
-		t.Errorf("NumKeyValues() = %d, want %d", got, 2)
-	}
-	if got := f.NumTensors(); got != 2 {
-		t.Errorf("NumTensors() = %d, want %d", got, 2)
-	}
-	archKV := f.KeyValue("general.architecture")
-	if archKV.Key == "" {
-		t.Error("KeyValue(\"general.architecture\") not found")
-	}
-	if got := archKV.String(); got != "llama" {
-		t.Errorf("KeyValue(\"general.architecture\").String() = %q, want %q", got, "llama")
-	}
-	alignKV := f.KeyValue("general.alignment")
-	if alignKV.Key == "" {
-		t.Error("KeyValue(\"general.alignment\") not found")
-	}
-	if got := alignKV.Int(); got != 32 {
-		t.Errorf("KeyValue(\"general.alignment\").Int() = %d, want %d", got, 32)
-	}
-	expectedTensorNames := []string{"token_embd.weight", "output.weight"}
-	var gotTensorNames []string
-	for _, tensor := range f.TensorInfos() {
-		gotTensorNames = append(gotTensorNames, tensor.Name)
-	}
-	if !slices.Equal(gotTensorNames, expectedTensorNames) {
-		t.Errorf("tensor names = %v, want %v", gotTensorNames, expectedTensorNames)
-	}
-	tokenTensor := f.TensorInfo("token_embd.weight")
-	if tokenTensor.Name != "token_embd.weight" {
-		t.Error("TensorInfo(\"token_embd.weight\") not found")
-	}
-	if len(tokenTensor.Shape) == 0 {
-		t.Error("TensorInfo(\"token_embd.weight\") has empty shape")
-	}
-	outputTensor := f.TensorInfo("output.weight")
-	if outputTensor.Name != "output.weight" {
-		t.Error("TensorInfo(\"output.weight\") not found")
-	}
-	if len(outputTensor.Shape) == 0 {
-		t.Error("TensorInfo(\"output.weight\") has empty shape")
-	}
-	var gotKeyCount int
-	for _, kv := range f.KeyValues() {
-		gotKeyCount++
-		if kv.Key == "" {
-			t.Error("found key value with empty key")
-		}
-	}
-	if gotKeyCount != 2 {
-		t.Errorf("iterated key count = %d, want %d", gotKeyCount, 2)
-	}
-	tensorInfo, reader, err := f.TensorReader("token_embd.weight")
-	if err != nil {
-		t.Errorf("TensorReader(\"token_embd.weight\") error: %v", err)
-	}
-	if tensorInfo.Name != "token_embd.weight" {
-		t.Errorf("TensorReader returned wrong tensor: %q", tensorInfo.Name)
-	}
-	if reader == nil {
-		t.Error("TensorReader returned nil reader")
-	}
-}
-
-func BenchmarkRead(b *testing.B) {
-	// Create benchmark test file
-	tempDir := b.TempDir()
-	tempFile := filepath.Join(tempDir, "benchmark.gguf")
-
-	if err := createTestGGUFFile(tempFile, map[string]any{
-		"general.architecture": "llama",
-		"general.alignment":    int64(32),
-	}, []testTensorInfo{
-		{Name: "token_embd.weight", Shape: []uint64{1000, 512}, Type: 1}, // F16
-		{Name: "output.weight", Shape: []uint64{512, 1000}, Type: 1},     // F16
-	}); err != nil {
-		b.Fatal(err)
-	}
-
-	// Get file info for reporting
-	info, err := os.Stat(tempFile)
-	if err != nil {
-		b.Fatal(err)
-	}
-	b.Logf("Benchmark file size: %d bytes", info.Size())
-
-	b.ReportAllocs()
-
-	for b.Loop() {
-		f, err := Open(tempFile)
-		if err != nil {
-			b.Fatal(err)
-		}
-
-		// Access some data to ensure it's actually being read
-		_ = f.KeyValue("general.architecture").String()
-		_ = f.KeyValue("general.alignment").Int()
-		_ = f.NumTensors()
-		_ = f.NumKeyValues()
-
-		// Iterate through some tensors
-		count := 0
-		for _, tensor := range f.TensorInfos() {
-			_ = tensor.Name
-			count++
-			if count >= 2 {
-				break
-			}
-		}
-
-		f.Close()
-	}
-}
-
-// Helper function to create test GGUF files
-func createTestGGUFFile(path string, keyValues map[string]any, tensors []testTensorInfo) error {
-	file, err := os.Create(path)
-	if err != nil {
-		return err
-	}
-	defer file.Close()
-
-	// Write GGUF magic
-	if _, err := file.Write([]byte("GGUF")); err != nil {
-		return err
-	}
-
-	// Write version
-	if err := binary.Write(file, binary.LittleEndian, uint32(3)); err != nil {
-		return err
-	}
-
-	// Write tensor count
-	if err := binary.Write(file, binary.LittleEndian, uint64(len(tensors))); err != nil {
-		return err
-	}
-
-	// Write metadata count
-	if err := binary.Write(file, binary.LittleEndian, uint64(len(keyValues))); err != nil {
-		return err
-	}
-
-	// Write metadata
-	for key, value := range keyValues {
-		if err := writeKeyValue(file, key, value); err != nil {
-			return err
-		}
-	}
-
-	// Write tensor info
-	for _, tensor := range tensors {
-		if err := writeTensorInfo(file, tensor); err != nil {
-			return err
-		}
-	}
-
-	// Write some dummy tensor data
-	dummyData := make([]byte, 1024)
-	file.Write(dummyData)
-
-	return nil
-}
-
-type testTensorInfo struct {
-	Name  string
-	Shape []uint64
-	Type  uint32
-}
-
-func writeKeyValue(file *os.File, key string, value any) error {
-	// Write key length and key
-	if err := binary.Write(file, binary.LittleEndian, uint64(len(key))); err != nil {
-		return err
-	}
-	if _, err := file.Write([]byte(key)); err != nil {
-		return err
-	}
-
-	// Write value based on type
-	switch v := value.(type) {
-	case string:
-		if err := binary.Write(file, binary.LittleEndian, typeString); err != nil {
-			return err
-		}
-		if err := binary.Write(file, binary.LittleEndian, uint64(len(v))); err != nil {
-			return err
-		}
-		_, err := file.Write([]byte(v))
-		return err
-	case int64:
-		if err := binary.Write(file, binary.LittleEndian, typeInt64); err != nil {
-			return err
-		}
-		return binary.Write(file, binary.LittleEndian, v)
-	case bool:
-		if err := binary.Write(file, binary.LittleEndian, typeBool); err != nil {
-			return err
-		}
-		return binary.Write(file, binary.LittleEndian, v)
-	case float64:
-		if err := binary.Write(file, binary.LittleEndian, typeFloat64); err != nil {
-			return err
-		}
-		return binary.Write(file, binary.LittleEndian, v)
-	case []string:
-		if err := binary.Write(file, binary.LittleEndian, typeArray); err != nil {
-			return err
-		}
-		if err := binary.Write(file, binary.LittleEndian, typeString); err != nil {
-			return err
-		}
-		if err := binary.Write(file, binary.LittleEndian, uint64(len(v))); err != nil {
-			return err
-		}
-		for _, s := range v {
-			if err := binary.Write(file, binary.LittleEndian, uint64(len(s))); err != nil {
-				return err
-			}
-			if _, err := file.Write([]byte(s)); err != nil {
-				return err
-			}
-		}
-		return nil
-	case []int64:
-		if err := binary.Write(file, binary.LittleEndian, typeArray); err != nil {
-			return err
-		}
-		if err := binary.Write(file, binary.LittleEndian, typeInt64); err != nil {
-			return err
-		}
-		if err := binary.Write(file, binary.LittleEndian, uint64(len(v))); err != nil {
-			return err
-		}
-		for _, i := range v {
-			if err := binary.Write(file, binary.LittleEndian, i); err != nil {
-				return err
-			}
-		}
-		return nil
-	case []float64:
-		if err := binary.Write(file, binary.LittleEndian, typeArray); err != nil {
-			return err
-		}
-		if err := binary.Write(file, binary.LittleEndian, typeFloat64); err != nil {
-			return err
-		}
-		if err := binary.Write(file, binary.LittleEndian, uint64(len(v))); err != nil {
-			return err
-		}
-		for _, f := range v {
-			if err := binary.Write(file, binary.LittleEndian, f); err != nil {
-				return err
-			}
-		}
-		return nil
-	default:
-		return fmt.Errorf("unsupported value type: %T", value)
-	}
-}
-
-func writeTensorInfo(file *os.File, tensor testTensorInfo) error {
-	// Write tensor name
-	if err := binary.Write(file, binary.LittleEndian, uint64(len(tensor.Name))); err != nil {
-		return err
-	}
-	if _, err := file.Write([]byte(tensor.Name)); err != nil {
-		return err
-	}
-
-	// Write dimensions
-	if err := binary.Write(file, binary.LittleEndian, uint32(len(tensor.Shape))); err != nil {
-		return err
-	}
-	for _, dim := range tensor.Shape {
-		if err := binary.Write(file, binary.LittleEndian, dim); err != nil {
-			return err
-		}
-	}
-
-	// Write type
-	if err := binary.Write(file, binary.LittleEndian, tensor.Type); err != nil {
-		return err
-	}
-
-	// Write offset (dummy value)
-	return binary.Write(file, binary.LittleEndian, uint64(0))
-}
--- a/fs/gguf/keyvalue.go
+++ b/fs/gguf/keyvalue.go
@@ -1,102 +0,0 @@
-package gguf
-
-import (
-	"reflect"
-	"slices"
-)
-
-type KeyValue struct {
-	Key string
-	Value
-}
-
-type Value struct {
-	value any
-}
-
-func value[T any](v Value, kinds ...reflect.Kind) (t T) {
-	vv := reflect.ValueOf(v.value)
-	if slices.Contains(kinds, vv.Kind()) {
-		t = vv.Convert(reflect.TypeOf(t)).Interface().(T)
-	}
-	return
-}
-
-func values[T any](v Value, kinds ...reflect.Kind) (ts []T) {
-	switch vv := reflect.ValueOf(v.value); vv.Kind() {
-	case reflect.Slice:
-		if slices.Contains(kinds, vv.Type().Elem().Kind()) {
-			ts = make([]T, vv.Len())
-			for i := range vv.Len() {
-				ts[i] = vv.Index(i).Convert(reflect.TypeOf(ts[i])).Interface().(T)
-			}
-		}
-	}
-	return
-}
-
-// Int returns Value as a signed integer. If it is not a signed integer, it returns 0.
-func (v Value) Int() int64 {
-	return value[int64](v, reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64)
-}
-
-// Ints returns Value as a signed integer slice. If it is not a signed integer slice, it returns nil.
-func (v Value) Ints() (i64s []int64) {
-	return values[int64](v, reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64)
-}
-
-// Uint converts an unsigned integer value to uint64. If the value is not a unsigned integer, it returns 0.
-func (v Value) Uint() uint64 {
-	return value[uint64](v, reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64)
-}
-
-// Uints returns Value as a unsigned integer slice. If it is not a unsigned integer slice, it returns nil.
-func (v Value) Uints() (u64s []uint64) {
-	return values[uint64](v, reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64)
-}
-
-// Float returns Value as a float. If it is not a float, it returns 0.
-func (v Value) Float() float64 {
-	return value[float64](v, reflect.Float32, reflect.Float64)
-}
-
-// Floats returns Value as a float slice. If it is not a float slice, it returns nil.
-func (v Value) Floats() (f64s []float64) {
-	return values[float64](v, reflect.Float32, reflect.Float64)
-}
-
-// Bool returns Value as a boolean. If it is not a boolean, it returns false.
-func (v Value) Bool() bool {
-	return value[bool](v, reflect.Bool)
-}
-
-// Bools returns Value as a boolean slice. If it is not a boolean slice, it returns nil.
-func (v Value) Bools() (bools []bool) {
-	return values[bool](v, reflect.Bool)
-}
-
-// String returns Value as a string. If it is not a string, it returns an empty string.
-func (v Value) String() string {
-	return value[string](v, reflect.String)
-}
-
-// Strings returns Value as a string slice. If it is not a string slice, it returns nil.
-func (v Value) Strings() (strings []string) {
-	return values[string](v, reflect.String)
-}
-
-// IsNil checks if the Value is nil. It returns true if the value is nil or if it is a nil pointer, interface, slice, map, channel, or function.
-func (v Value) IsNil() bool {
-	if v.value == nil {
-		return true
-	}
-
-	// Check for nil pointers, interfaces, slices, maps, channels, and functions
-	rv := reflect.ValueOf(v.value)
-	switch rv.Kind() {
-	case reflect.Ptr, reflect.Interface, reflect.Slice, reflect.Map, reflect.Chan, reflect.Func:
-		return rv.IsNil()
-	}
-
-	return false
-}
--- a/fs/gguf/keyvalue_test.go
+++ b/fs/gguf/keyvalue_test.go
@@ -1,208 +0,0 @@
-package gguf
-
-import (
-	"testing"
-
-	"github.com/google/go-cmp/cmp"
-)
-
-func split(name string, values map[string][]any) (matched []any, unmatched []any) {
-	for key, value := range values {
-		if key == name {
-			matched = value
-		} else {
-			unmatched = append(unmatched, value...)
-		}
-	}
-	return
-}
-
-func TestValue(t *testing.T) {
-	values := map[string][]any{
-		"int64":   {int(42), int8(42), int16(42), int32(42), int64(42)},
-		"uint64":  {uint(42), uint8(42), uint16(42), uint32(42), uint64(42)},
-		"float64": {float32(42), float64(42)},
-		"string":  {"42", "hello"},
-		"bool":    {true, false},
-	}
-
-	t.Run("int64", func(t *testing.T) {
-		matched, unmatched := split("int64", values)
-		for _, v := range matched {
-			kv := KeyValue{"key", Value{v}}
-			if i64 := kv.Int(); i64 != 42 {
-				t.Errorf("expected 42, got %d", i64)
-			}
-		}
-
-		for _, v := range unmatched {
-			kv := KeyValue{"key", Value{v}}
-			if i64 := kv.Int(); i64 != 0 {
-				t.Errorf("expected 42, got %d", i64)
-			}
-		}
-	})
-
-	t.Run("uint64", func(t *testing.T) {
-		matched, unmatched := split("uint64", values)
-		for _, v := range matched {
-			kv := KeyValue{"key", Value{v}}
-			if u64 := kv.Uint(); u64 != 42 {
-				t.Errorf("expected 42, got %d", u64)
-			}
-		}
-
-		for _, v := range unmatched {
-			kv := KeyValue{"key", Value{v}}
-			if u64 := kv.Uint(); u64 != 0 {
-				t.Errorf("expected 42, got %d", u64)
-			}
-		}
-	})
-
-	t.Run("float64", func(t *testing.T) {
-		matched, unmatched := split("float64", values)
-		for _, v := range matched {
-			kv := KeyValue{"key", Value{v}}
-			if f64 := kv.Float(); f64 != 42 {
-				t.Errorf("expected 42, got %f", f64)
-			}
-		}
-
-		for _, v := range unmatched {
-			kv := KeyValue{"key", Value{v}}
-			if f64 := kv.Float(); f64 != 0 {
-				t.Errorf("expected 42, got %f", f64)
-			}
-		}
-	})
-
-	t.Run("string", func(t *testing.T) {
-		matched, unmatched := split("string", values)
-		for _, v := range matched {
-			kv := KeyValue{"key", Value{v}}
-			if s := kv.String(); s != v {
-				t.Errorf("expected 42, got %s", s)
-			}
-		}
-
-		for _, v := range unmatched {
-			kv := KeyValue{"key", Value{v}}
-			if s := kv.String(); s != "" {
-				t.Errorf("expected 42, got %s", s)
-			}
-		}
-	})
-
-	t.Run("bool", func(t *testing.T) {
-		matched, unmatched := split("bool", values)
-		for _, v := range matched {
-			kv := KeyValue{"key", Value{v}}
-			if b := kv.Bool(); b != v {
-				t.Errorf("expected true, got %v", b)
-			}
-		}
-
-		for _, v := range unmatched {
-			kv := KeyValue{"key", Value{v}}
-			if b := kv.Bool(); b != false {
-				t.Errorf("expected false, got %v", b)
-			}
-		}
-	})
-}
-
-func TestValues(t *testing.T) {
-	values := map[string][]any{
-		"int64s":   {[]int{42}, []int8{42}, []int16{42}, []int32{42}, []int64{42}},
-		"uint64s":  {[]uint{42}, []uint8{42}, []uint16{42}, []uint32{42}, []uint64{42}},
-		"float64s": {[]float32{42}, []float64{42}},
-		"strings":  {[]string{"42"}, []string{"hello"}},
-		"bools":    {[]bool{true}, []bool{false}},
-	}
-
-	t.Run("int64s", func(t *testing.T) {
-		matched, unmatched := split("int64s", values)
-		for _, v := range matched {
-			kv := KeyValue{"key", Value{v}}
-			if diff := cmp.Diff(kv.Ints(), []int64{42}); diff != "" {
-				t.Errorf("diff: %s", diff)
-			}
-		}
-
-		for _, v := range unmatched {
-			kv := KeyValue{"key", Value{v}}
-			if i64s := kv.Ints(); i64s != nil {
-				t.Errorf("expected nil, got %v", i64s)
-			}
-		}
-	})
-
-	t.Run("uint64s", func(t *testing.T) {
-		matched, unmatched := split("uint64s", values)
-		for _, v := range matched {
-			kv := KeyValue{"key", Value{v}}
-			if diff := cmp.Diff(kv.Uints(), []uint64{42}); diff != "" {
-				t.Errorf("diff: %s", diff)
-			}
-		}
-
-		for _, v := range unmatched {
-			kv := KeyValue{"key", Value{v}}
-			if u64s := kv.Uints(); u64s != nil {
-				t.Errorf("expected nil, got %v", u64s)
-			}
-		}
-	})
-
-	t.Run("float64s", func(t *testing.T) {
-		matched, unmatched := split("float64s", values)
-		for _, v := range matched {
-			kv := KeyValue{"key", Value{v}}
-			if diff := cmp.Diff(kv.Floats(), []float64{42}); diff != "" {
-				t.Errorf("diff: %s", diff)
-			}
-		}
-
-		for _, v := range unmatched {
-			kv := KeyValue{"key", Value{v}}
-			if f64s := kv.Floats(); f64s != nil {
-				t.Errorf("expected nil, got %v", f64s)
-			}
-		}
-	})
-
-	t.Run("strings", func(t *testing.T) {
-		matched, unmatched := split("strings", values)
-		for _, v := range matched {
-			kv := KeyValue{"key", Value{v}}
-			if diff := cmp.Diff(kv.Strings(), v); diff != "" {
-				t.Errorf("diff: %s", diff)
-			}
-		}
-
-		for _, v := range unmatched {
-			kv := KeyValue{"key", Value{v}}
-			if s := kv.Strings(); s != nil {
-				t.Errorf("expected nil, got %v", s)
-			}
-		}
-	})
-
-	t.Run("bools", func(t *testing.T) {
-		matched, unmatched := split("bools", values)
-		for _, v := range matched {
-			kv := KeyValue{"key", Value{v}}
-			if diff := cmp.Diff(kv.Bools(), v); diff != "" {
-				t.Errorf("diff: %s", diff)
-			}
-		}
-
-		for _, v := range unmatched {
-			kv := KeyValue{"key", Value{v}}
-			if b := kv.Bools(); b != nil {
-				t.Errorf("expected nil, got %v", b)
-			}
-		}
-	})
-}
--- a/fs/gguf/lazy.go
+++ b/fs/gguf/lazy.go
@@ -1,88 +0,0 @@
-package gguf
-
-import (
-	"encoding/binary"
-	"iter"
-	"log/slog"
-)
-
-type lazy[T any] struct {
-	count  uint64
-	next   func() (T, bool)
-	stop   func()
-	values []T
-
-	doneFunc func() error
-}
-
-func newLazy[T any](f *File, fn func() (T, error)) (*lazy[T], error) {
-	it := lazy[T]{}
-	if err := binary.Read(f.reader, binary.LittleEndian, &it.count); err != nil {
-		return nil, err
-	}
-
-	it.values = make([]T, 0)
-	it.next, it.stop = iter.Pull(func(yield func(T) bool) {
-		for i := range it.count {
-			t, err := fn()
-			if err != nil {
-				slog.Error("error reading tensor", "index", i, "error", err)
-				return
-			}
-
-			it.values = append(it.values, t)
-			if !yield(t) {
-				break
-			}
-		}
-
-		if it.doneFunc != nil {
-			it.doneFunc()
-		}
-	})
-
-	return &it, nil
-}
-
-func (g *lazy[T]) Values() iter.Seq[T] {
-	return func(yield func(T) bool) {
-		for _, v := range g.All() {
-			if !yield(v) {
-				break
-			}
-		}
-	}
-}
-
-func (g *lazy[T]) All() iter.Seq2[int, T] {
-	return func(yield func(int, T) bool) {
-		for i := range int(g.count) {
-			if i < len(g.values) {
-				if !yield(i, g.values[i]) {
-					break
-				}
-			} else {
-				t, ok := g.next()
-				if !ok {
-					break
-				}
-
-				if !yield(i, t) {
-					break
-				}
-			}
-		}
-	}
-}
-
-func (g *lazy[T]) rest() (collected bool) {
-	for {
-		_, ok := g.next()
-		collected = collected || ok
-		if !ok {
-			break
-		}
-	}
-
-	return collected
-}
--- a/fs/gguf/reader.go
+++ b/fs/gguf/reader.go
@@ -1,34 +0,0 @@
-package gguf
-
-import (
-	"bufio"
-	"io"
-)
-
-type readSeeker struct {
-	rs io.ReadSeeker
-	br *bufio.Reader
-}
-
-func newReadSeeker(rs io.ReadSeeker, size int) *readSeeker {
-	return &readSeeker{
-		rs: rs,
-		br: bufio.NewReaderSize(rs, size),
-	}
-}
-
-func (b *readSeeker) Read(p []byte) (int, error) {
-	return b.br.Read(p)
-}
-
-func (b *readSeeker) Seek(offset int64, whence int) (int64, error) {
-	if whence == io.SeekCurrent {
-		offset -= int64(b.br.Buffered())
-	}
-	n, err := b.rs.Seek(offset, whence)
-	if err != nil {
-		return 0, err
-	}
-	b.br.Reset(b.rs)
-	return n, nil
-}
--- a/fs/gguf/tensor.go
+++ b/fs/gguf/tensor.go
@@ -1,284 +0,0 @@
-package gguf
-
-import (
-	"log/slog"
-	"strings"
-)
-
-type TensorInfo struct {
-	Name   string
-	Offset uint64
-	Shape  []uint64
-	Type   TensorType
-}
-
-func (t TensorInfo) NumValues() int64 {
-	var numItems int64 = 1
-	for _, dim := range t.Shape {
-		numItems *= int64(dim)
-	}
-	return numItems
-}
-
-// NumBytes returns the number of bytes in the tensor.
-func (t TensorInfo) NumBytes() int64 {
-	return int64(float64(t.NumValues()) * t.Type.NumBytes())
-}
-
-func (t TensorInfo) LogValue() slog.Value {
-	return slog.GroupValue(
-		slog.String("name", t.Name),
-		slog.Int64("offset", int64(t.Offset)),
-		slog.Any("shape", t.Shape),
-		slog.Int64("num_values", t.NumValues()),
-		slog.Int64("num_bytes", t.NumBytes()),
-		slog.Any("type", t.Type),
-	)
-}
-
-type TensorType uint32
-
-const (
-	TensorTypeF32 TensorType = iota
-	TensorTypeF16
-	TensorTypeQ4_0
-	TensorTypeQ4_1
-
-	// unexported // unused in gguf
-	tensorTypeQ4_2
-	tensorTypeQ4_3
-
-	TensorTypeQ5_0
-	TensorTypeQ5_1
-	TensorTypeQ8_0
-	TensorTypeQ8_1
-	TensorTypeQ2_K
-	TensorTypeQ3_K
-	TensorTypeQ4_K
-	TensorTypeQ5_K
-	TensorTypeQ6_K
-	TensorTypeQ8_K
-
-	// unexported // unquantizable by ollama
-	tensorTypeIQ2_XXS
-	tensorTypeIQ2_XS
-	tensorTypeIQ3_XXS
-	tensorTypeIQ1_S
-	tensorTypeIQ4_NL
-	tensorTypeIQ3_S
-	tensorTypeIQ2_S
-	tensorTypeIQ4_XS
-
-	TensorTypeI8
-	TensorTypeI16
-	TensorTypeI32
-	TensorTypeI64
-	TensorTypeF64
-
-	// unexported // unquantizable by ollama
-	tensorTypeIQ1_M
-
-	TensorTypeBF16
-
-	// unexported // unused in gguf
-	tensorTypeQ4_0_4_4
-	tensorTypeQ4_0_4_8
-	tensorTypeQ4_0_8_8
-
-	// unexported // unquantizable by ollama
-	tensorTypeTQ1_0
-	tensorTypeTQ2_0
-
-	// unexported // unused in gguf
-	tensorTypeIQ4_NL_4_4
-	tensorTypeIQ4_NL_4_8
-	tensorTypeIQ4_NL_8_8
-)
-
-func (t TensorType) NumBytes() float64 {
-	return float64(t.typeSize()) / float64(t.blockSize())
-}
-
-func (t TensorType) typeSize() int64 {
-	switch t {
-	case TensorTypeF32:
-		return 4
-	case TensorTypeF16:
-		return 2
-	case TensorTypeQ4_0:
-		return 2 + t.blockSize()/2
-	case TensorTypeQ4_1:
-		return 2 + 2 + t.blockSize()/2
-	case TensorTypeQ5_0:
-		return 2 + 4 + t.blockSize()/2
-	case TensorTypeQ5_1:
-		return 2 + 2 + 4 + t.blockSize()/2
-	case TensorTypeQ8_0:
-		return 2 + t.blockSize()
-	case TensorTypeQ8_1:
-		return 2 + 2 + t.blockSize()
-	case TensorTypeQ2_K:
-		return t.blockSize()/16 + t.blockSize()/4 + 2 + 2
-	case TensorTypeQ3_K:
-		return t.blockSize()/8 + t.blockSize()/4 + 12 + 2
-	case TensorTypeQ4_K:
-		return 2 + 2 + 12 + t.blockSize()/2
-	case TensorTypeQ5_K:
-		return 2 + 2 + 12 + t.blockSize()/8 + t.blockSize()/2
-	case TensorTypeQ6_K:
-		return t.blockSize()/2 + t.blockSize()/4 + t.blockSize()/16 + 2
-	case TensorTypeQ8_K:
-		return 4 + t.blockSize() + 2*t.blockSize()/16
-	case tensorTypeIQ2_XXS:
-		return 2 + 2*t.blockSize()/8
-	case tensorTypeIQ2_XS:
-		return 2 + 2*t.blockSize()/8 + t.blockSize()/32
-	case tensorTypeIQ3_XXS:
-		return 2 + t.blockSize()/4 + t.blockSize()/8
-	case tensorTypeIQ1_S:
-		return 2 + t.blockSize()/8 + t.blockSize()/16
-	case tensorTypeIQ4_NL:
-		return 2 + t.blockSize()/2
-	case tensorTypeIQ3_S:
-		return 2 + t.blockSize()/4 + t.blockSize()/8 + t.blockSize()/32 + 4
-	case tensorTypeIQ2_S:
-		return 2 + t.blockSize()/4 + t.blockSize()/16
-	case tensorTypeIQ4_XS:
-		return 2 + 2 + t.blockSize()/2 + t.blockSize()/64
-	case TensorTypeI8:
-		return 1
-	case TensorTypeI16:
-		return 2
-	case TensorTypeI32:
-		return 4
-	case TensorTypeI64:
-		return 8
-	case TensorTypeF64:
-		return 8
-	case tensorTypeIQ1_M:
-		return t.blockSize()/8 + t.blockSize()/16 + t.blockSize()/32
-	case TensorTypeBF16:
-		return 2
-	default:
-		return 0
-	}
-}
-
-func (t TensorType) blockSize() int64 {
-	switch t {
-	case TensorTypeF32,
-		TensorTypeF16,
-		TensorTypeI8,
-		TensorTypeI16,
-		TensorTypeI32,
-		TensorTypeI64,
-		TensorTypeF64,
-		TensorTypeBF16:
-		return 1
-	case TensorTypeQ4_0,
-		TensorTypeQ4_1,
-		TensorTypeQ5_0,
-		TensorTypeQ5_1,
-		TensorTypeQ8_0,
-		TensorTypeQ8_1,
-		tensorTypeIQ4_NL:
-		return 32
-	default:
-		return 256
-	}
-}
-
-func (t TensorType) String() string {
-	switch t {
-	case TensorTypeF32:
-		return "f32"
-	case TensorTypeF16:
-		return "f16"
-	case TensorTypeQ4_0:
-		return "q4_0"
-	case TensorTypeQ4_1:
-		return "q4_1"
-	case tensorTypeQ4_2:
-		return "q4_2"
-	case tensorTypeQ4_3:
-		return "q4_3"
-	case TensorTypeQ5_0:
-		return "q5_0"
-	case TensorTypeQ5_1:
-		return "q5_1"
-	case TensorTypeQ8_0:
-		return "q8_0"
-	case TensorTypeQ8_1:
-		return "q8_1"
-	case TensorTypeQ2_K:
-		return "q2_k"
-	case TensorTypeQ3_K:
-		return "q3_k"
-	case TensorTypeQ4_K:
-		return "q4_k"
-	case TensorTypeQ5_K:
-		return "q5_k"
-	case TensorTypeQ6_K:
-		return "q6_k"
-	case TensorTypeQ8_K:
-		return "q8_k"
-	case tensorTypeIQ2_XXS:
-		return "iq2_xxs"
-	case tensorTypeIQ2_XS:
-		return "iq2_xs"
-	case tensorTypeIQ3_XXS:
-		return "iq3_xxs"
-	case tensorTypeIQ1_S:
-		return "iq1_s"
-	case tensorTypeIQ4_NL:
-		return "iq4_nl"
-	case tensorTypeIQ3_S:
-		return "iq3_s"
-	case tensorTypeIQ2_S:
-		return "iq2_s"
-	case tensorTypeIQ4_XS:
-		return "iq4_xs"
-	case TensorTypeI8:
-		return "i8"
-	case TensorTypeI16:
-		return "i16"
-	case TensorTypeI32:
-		return "i32"
-	case TensorTypeI64:
-		return "i64"
-	case TensorTypeF64:
-		return "f64"
-	case tensorTypeIQ1_M:
-		return "iq1_m"
-	case TensorTypeBF16:
-		return "bf16"
-	case tensorTypeQ4_0_4_4:
-		return "q4_0_4_4"
-	case tensorTypeQ4_0_4_8:
-		return "q4_0_4_8"
-	case tensorTypeQ4_0_8_8:
-		return "q4_0_8_8"
-	case tensorTypeTQ1_0:
-		return "tq1_0"
-	case tensorTypeTQ2_0:
-		return "tq2_0"
-	case tensorTypeIQ4_NL_4_4:
-		return "iq4_nl_4_4"
-	case tensorTypeIQ4_NL_4_8:
-		return "iq4_nl_4_8"
-	case tensorTypeIQ4_NL_8_8:
-		return "iq4_nl_8_8"
-	default:
-		return "unknown"
-	}
-}
-
-func (t TensorType) LogValue() slog.Value {
-	return slog.GroupValue(
-		slog.Uint64("value", uint64(t)),
-		slog.String("name", strings.ToUpper(t.String())),
-		slog.Int64("size", t.typeSize()),
-		slog.Int64("block_size", t.blockSize()),
-		slog.Float64("num_bytes", t.NumBytes()),
-	)
-}
--- a/integration/llm_image_test.go
+++ b/integration/llm_image_test.go
@@ -19,7 +19,7 @@ func TestVisionModels(t *testing.T) {
 	}
 	testCases := []testCase{
 		{
-			model: "qwen2.5vl",
+			model: "llava:7b",
 		},
 		{
 			model: "llama3.2-vision",
@@ -60,7 +60,6 @@ func TestVisionModels(t *testing.T) {
 }

 func TestIntegrationSplitBatch(t *testing.T) {
-	skipUnderMinVRAM(t, 6)
 	image, err := base64.StdEncoding.DecodeString(imageEncoding)
 	require.NoError(t, err)
 	req := api.GenerateRequest{
--- a/integration/testdata/embed.json
+++ b/integration/testdata/embed.json
--- a/kvcache/causal.go
+++ b/kvcache/causal.go
@@ -30,11 +30,6 @@ type Causal struct {

 	// ** current forward pass **

-	// curReserve indicates that this forward pass is only for
-	// memory reservation and we should not update our metadata
-	// based on it.
-	curReserve bool
-
 	// the active layer for Get and Put
 	curLayer int

@@ -164,13 +159,12 @@ func (c *Causal) Close() {
 }

 func (c *Causal) StartForward(ctx ml.Context, batch input.Batch, reserve bool) error {
-	c.curReserve = reserve
 	c.curBatchSize = len(batch.Positions)
 	c.curSequences = batch.Sequences
 	c.curPositions = batch.Positions
 	c.opts.Except = nil

-	if !c.curReserve {
+	if !reserve {
 		c.updateSlidingWindow()

 		var err error
@@ -217,9 +211,10 @@ func (c *Causal) StartForward(ctx ml.Context, batch input.Batch, reserve bool) e
 		c.curCellRange.max = len(c.cells) - 1
 	}

-	c.curMask = c.buildMask(ctx)
+	var err error
+	c.curMask, err = c.buildMask(ctx)

-	return nil
+	return err
 }

 func newRange() cellRange {
@@ -302,7 +297,7 @@ func roundUp(length, pad int) int {
 // Builds a mask of history x batch indicating whether for each token in the batch the
 // token in the history should apply. This is based on both the sequence and causality (the
 // position of the history is not ahead of the token in the batch).
-func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {
+func (c *Causal) buildMask(ctx ml.Context) (ml.Tensor, error) {
 	// Align and pad the two dimensions as required by the backend
 	batchSize := roundUp(c.curBatchSize, c.config.MaskBatchPadding)

@@ -310,11 +305,6 @@ func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {
 	c.curCellRange.max = roundUp(c.curCellRange.max+1, c.config.CachePadding) - 1

 	length := c.curCellRange.max - c.curCellRange.min + 1
-
-	if c.curReserve {
-		return ctx.Input().Empty(c.config.MaskDType, length, batchSize)
-	}
-
 	mask := make([]float32, batchSize*length)

 	for i := range c.curBatchSize {
@@ -335,7 +325,10 @@ func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {
 		mask[i] = float32(math.Inf(-1))
 	}

-	maskTensor := ctx.Input().FromFloatSlice(mask, length, batchSize)
+	maskTensor, err := ctx.Input().FromFloatSlice(mask, length, batchSize)
+	if err != nil {
+		return nil, err
+	}

 	if c.config.MaskDType != ml.DTypeF32 {
 		out := ctx.Input().Empty(c.config.MaskDType, maskTensor.Shape()...)
@@ -343,7 +336,7 @@ func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {
 		maskTensor = out
 	}

-	return maskTensor
+	return maskTensor, nil
 }

 func (c *Causal) moveCells(ctx ml.Context, src, dst, length int) {
@@ -498,7 +491,12 @@ func (c *Causal) SetCausal(ctx ml.Context, opts CausalOptions) {
 	if !slices.Equal(c.opts.Except, opts.Except) {
 		c.opts = opts
 		if ctx != nil {
-			c.curMask = c.buildMask(ctx)
+			var err error
+			c.curMask, err = c.buildMask(ctx)
+			if err != nil {
+				// This error should never occur because we have previously built a mask with the same shape
+				panic(fmt.Errorf("SetCausal: %w", err))
+			}
 		}
 	}
 }
@@ -654,7 +652,10 @@ func (c *Causal) shift(seq int, beginIndex, offset int32) error {
 		}
 	}

-	kShift := ctx.Input().FromIntSlice(offsets, len(offsets))
+	kShift, err := ctx.Input().FromIntSlice(offsets, len(offsets))
+	if err != nil {
+		return err
+	}

 	for i, key := range c.keys {
 		if key == nil {
--- a/kvcache/causal_test.go
+++ b/kvcache/causal_test.go
@@ -344,7 +344,7 @@ func testCache(t *testing.T, backend ml.Backend, cache Cache, tests []testCase)
 			}

 			cache.SetLayer(0)
-			tensor := context.FromFloatSlice(test.in, test.inShape...)
+			tensor, _ := context.FromFloatSlice(test.in, test.inShape...)
 			cache.Put(context, tensor, tensor)

 			out, _, mask := cache.Get(context)
@@ -386,7 +386,7 @@ func TestCanResume(t *testing.T) {
 	}

 	cache.SetLayer(0)
-	tensor := context.FromFloatSlice([]float32{1, 2, 3, 4}, 1, 1, 4)
+	tensor, _ := context.FromFloatSlice([]float32{1, 2, 3, 4}, 1, 1, 4)
 	cache.Put(context, tensor, tensor)

 	// with window size 4, nothing has slid out of the window yet
@@ -413,7 +413,7 @@ func TestCanResume(t *testing.T) {
 	}

 	cache.SetLayer(0)
-	tensor = context.FromFloatSlice([]float32{5, 6}, 1, 1, 2)
+	tensor, _ = context.FromFloatSlice([]float32{5, 6}, 1, 1, 2)
 	cache.Put(context, tensor, tensor)

 	// only the latest position has overlapping windows
@@ -470,24 +470,24 @@ func (c *testContext) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
 	return c.Empty(dtype, shape...)
 }

-func (c *testContext) FromFloatSlice(s []float32, shape ...int) ml.Tensor {
+func (c *testContext) FromFloatSlice(s []float32, shape ...int) (ml.Tensor, error) {
 	t := c.Empty(ml.DTypeF32, shape...).(*testTensor)

 	copy(t.data, s)

-	return t
+	return t, nil
 }

-func (c *testContext) FromIntSlice(s []int32, shape ...int) ml.Tensor {
+func (c *testContext) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
 	f := make([]float32, len(s))
 	for i := range f {
 		f[i] = float32(s[i])
 	}

-	out := c.FromFloatSlice(f, shape...)
+	out, _ := c.FromFloatSlice(f, shape...)
 	out.(*testTensor).dtype = ml.DTypeI32

-	return out
+	return out, nil
 }

 func (c *testContext) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
@@ -496,7 +496,7 @@ func (c *testContext) Arange(start, stop, step float32, dtype ml.DType) ml.Tenso
 		s = append(s, i)
 	}

-	out := c.FromFloatSlice(s, len(s))
+	out, _ := c.FromFloatSlice(s, len(s))
 	out.(*testTensor).dtype = dtype
 	return out
 }
@@ -508,7 +508,7 @@ func (c *testContext) Forward(...ml.Tensor) ml.Context { return c }

 func (c *testContext) Compute(...ml.Tensor) {}

-func (c *testContext) Reserve() {}
+func (c *testContext) Reserve() error { return nil }

 func (c *testContext) MaxGraphNodes() int {
 	return 10
--- a/llama/llama.go
+++ b/llama/llama.go
@@ -544,7 +544,7 @@ func NewSamplingContext(model *Model, params SamplingParams) (*SamplingContext,
 	cparams.penalty_last_n = C.int32_t(params.RepeatLastN)
 	cparams.penalty_repeat = C.float(params.PenaltyRepeat)
 	cparams.penalty_freq = C.float(params.PenaltyFreq)
-	cparams.penalty_present = C.float(params.PenaltyPresent)
+	cparams.penalty_present = C.float(params.PenaltyFreq)
 	cparams.seed = C.uint32_t(params.Seed)

 	grammar := C.CString(params.Grammar)
@@ -580,7 +580,7 @@ func SchemaToGrammar(schema []byte) []byte {
 	defer C.free(unsafe.Pointer(cStr))

 	// Allocate buffer for grammar based on schema length but with upper bound
-	maxLen := max(32768, min(1024*1024, len(schema)*4))
+	maxLen := min(1024*1024, len(schema)*4)
 	buf := make([]byte, maxLen)

 	// Call C function to convert schema to grammar
@@ -602,7 +602,7 @@ type Grammar struct {
 	mu sync.Mutex
 }

-func NewGrammar(grammar string, vocabIds []uint32, vocabValues []string, eogTokens []int32) *Grammar {
+func NewGrammar(grammar string, vocabIds []uint32, vocabValues []string, eogTokens []uint32) *Grammar {
 	cGrammar := C.CString(grammar)
 	defer C.free(unsafe.Pointer(cGrammar))

@@ -622,7 +622,7 @@ func NewGrammar(grammar string, vocabIds []uint32, vocabValues []string, eogToke
 		cEogTokens[i] = C.uint32_t(token)
 	}

-	g := C.grammar_init(cGrammar, unsafe.SliceData(cTokens), C.size_t(len(cTokens)), unsafe.SliceData(cPieces), unsafe.SliceData(cEogTokens), C.size_t(len(cEogTokens)))
+	g := C.grammar_init(cGrammar, (*C.uint32_t)(unsafe.Pointer(&cTokens[0])), C.size_t(len(cTokens)), (**C.char)(unsafe.Pointer(&cPieces[0])), (*C.uint32_t)(unsafe.Pointer(&cEogTokens[0])), C.size_t(len(cEogTokens)))
 	if g == nil {
 		return nil
 	}
--- a/llama/patches/0016-graph-memory-reporting-on-failure.patch
+++ b/llama/patches/0016-graph-memory-reporting-on-failure.patch
@@ -1,156 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Jesse Gross <jesse@ollama.com>
-Date: Fri, 18 Apr 2025 15:58:19 -0700
-Subject: [PATCH] graph memory reporting on failure
-
----
- ggml/include/ggml-alloc.h   |  6 ++++++
- ggml/include/ggml-backend.h |  6 ++++++
- ggml/src/ggml-alloc.c       | 38 +++++++++++++++++++++++++++++++++----
- ggml/src/ggml-backend.cpp   | 10 ++++++++++
- 4 files changed, 56 insertions(+), 4 deletions(-)
-
-diff --git a/ggml/include/ggml-alloc.h b/ggml/include/ggml-alloc.h
-index 2cb150fd..781b1e10 100644
---- a/ggml/include/ggml-alloc.h
-+++ b/ggml/include/ggml-alloc.h
-@@ -66,6 +66,12 @@ GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph
- 
- GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);
- 
-+struct ggml_allocr_buffer_status {
-+    size_t size;
-+    bool allocated;
-+};
-+GGML_API struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id);
-+
- // Utils
- // Create a buffer and allocate all the tensors in a ggml_context
- GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
-diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
-index 778927f6..74e46716 100644
---- a/ggml/include/ggml-backend.h
-+++ b/ggml/include/ggml-backend.h
-@@ -304,6 +304,12 @@ extern "C" {
- 
-     GGML_API size_t               ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
- 
-+    struct ggml_backend_buffer_status {
-+        size_t size;
-+        bool allocated;
-+    };
-+    GGML_API struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
-+
-     GGML_API void                 ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
-     GGML_API ggml_backend_t       ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
- 
-diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
-index 5fd379f6..04812990 100644
---- a/ggml/src/ggml-alloc.c
-+++ b/ggml/src/ggml-alloc.c
-@@ -364,6 +364,7 @@ struct node_alloc {
- struct ggml_gallocr {
-     ggml_backend_buffer_type_t * bufts; // [n_buffers]
-     ggml_backend_buffer_t * buffers; // [n_buffers]
-+    size_t *buffer_sizes; // [n_buffers]
-     struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
-     int n_buffers;
- 
-@@ -387,6 +388,9 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
-     galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
-     GGML_ASSERT(galloc->buffers != NULL);
- 
-+    galloc->buffer_sizes = calloc(n_bufs, sizeof(size_t));
-+    GGML_ASSERT(galloc->buffer_sizes != NULL);
-+
-     galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
-     GGML_ASSERT(galloc->buf_tallocs != NULL);
- 
-@@ -453,6 +457,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
-     ggml_hash_set_free(&galloc->hash_set);
-     free(galloc->hash_values);
-     free(galloc->bufts);
-+    free(galloc->buffer_sizes);
-     free(galloc->buffers);
-     free(galloc->buf_tallocs);
-     free(galloc->node_allocs);
-@@ -748,6 +753,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
-         }
-     }
- 
-+    bool success = true;
-+
-     // reallocate buffers if needed
-     for (int i = 0; i < galloc->n_buffers; i++) {
-         // if the buffer type is used multiple times, we reuse the same buffer
-@@ -769,15 +776,20 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
- 
-             ggml_backend_buffer_free(galloc->buffers[i]);
-             galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
--            if (galloc->buffers[i] == NULL) {
-+            if (galloc->buffers[i]) {
-+                galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
-+                ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
-+            } else {
-                 GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
--               return false;
-+                galloc->buffer_sizes[i] = new_size;
-+                success = false;
-             }
--            ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
-+        } else {
-+            galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
-         }
-     }
- 
--    return true;
-+    return success;
- }
- 
- bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
-@@ -934,6 +946,24 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
-     return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
- }
- 
-+struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
-+    GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers);
-+
-+    for (int i = 0; i < buffer_id; i++) {
-+        if (galloc->buf_tallocs[i] == galloc->buf_tallocs[buffer_id]) {
-+            // This buffer is the same as a previous one due to the same buffer type being used multiple times
-+            // (See above.) However, we need a different check because multiple buffers might be NULL in our
-+            // case and we still want to know the attempted size.
-+
-+            struct ggml_allocr_buffer_status status = {0, true};
-+            return status;
-+        }
-+    }
-+
-+    struct ggml_allocr_buffer_status status = {galloc->buffer_sizes[buffer_id], galloc->buffers[buffer_id] != NULL};
-+    return status;
-+}
-+
- // utils
- 
- static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
-diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
-index 0ce73a99..be335e8c 100644
---- a/ggml/src/ggml-backend.cpp
-+++ b/ggml/src/ggml-backend.cpp
-@@ -1629,6 +1629,16 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
-     return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
- }
- 
-+struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
-+    int backend_index = ggml_backend_sched_backend_id(sched, backend);
-+    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
-+
-+    struct ggml_allocr_buffer_status allocr_status = ggml_gallocr_get_attempted_buffer_size(sched->galloc, backend_index);
-+    struct ggml_backend_buffer_status status = {allocr_status.size, allocr_status.allocated};
-+
-+    return status;
-+}
-+
- void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
-     int backend_index = ggml_backend_sched_backend_id(sched, backend);
-     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
--- a/llama/patches/0017-ggml-Export-GPU-UUIDs.patch
+++ b/llama/patches/0017-ggml-Export-GPU-UUIDs.patch
@@ -1,102 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Jesse Gross <jesse@ollama.com>
-Date: Thu, 24 Apr 2025 14:48:51 -0700
-Subject: [PATCH] ggml: Export GPU UUIDs
-
-This enables matching up devices and information reported by the backend
-with tools (e.g. nvidia-smi) and system management libraries (e.g. nvml).
----
- ggml/include/ggml-backend.h      |  1 +
- ggml/src/ggml-cuda/ggml-cuda.cu  | 33 ++++++++++++++++++++++++++++++++
- ggml/src/ggml-metal/ggml-metal.m |  1 +
- 3 files changed, 35 insertions(+)
-
-diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
-index 74e46716..a880df33 100644
---- a/ggml/include/ggml-backend.h
-+++ b/ggml/include/ggml-backend.h
-@@ -152,6 +152,7 @@ extern "C" {
-     struct ggml_backend_dev_props {
-         const char * name;
-         const char * description;
-+        const char * uuid;
-         size_t memory_free;
-         size_t memory_total;
-         enum ggml_backend_dev_type type;
-diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index cb0d8528..4c829153 100644
---- a/ggml/src/ggml-cuda/ggml-cuda.cu
-+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
-@@ -2884,6 +2884,7 @@ struct ggml_backend_cuda_device_context {
-     int device;
-     std::string name;
-     std::string description;
-+    std::string uuid;
- };
- 
- static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
-@@ -2896,6 +2897,11 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t
-     return ctx->description.c_str();
- }
- 
-+static const char * ggml_backend_cuda_device_get_uuid(ggml_backend_dev_t dev) {
-+    ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
-+    return ctx->uuid.c_str();
-+}
-+
- static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
-     ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
-     ggml_cuda_set_device(ctx->device);
-@@ -2910,6 +2916,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
- static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
-     props->name        = ggml_backend_cuda_device_get_name(dev);
-     props->description = ggml_backend_cuda_device_get_description(dev);
-+    props->uuid        = ggml_backend_cuda_device_get_uuid(dev);
-     props->type        = ggml_backend_cuda_device_get_type(dev);
-     ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
- 
-@@ -3458,6 +3465,32 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
-                 CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
-                 dev_ctx->description = prop.name;
- 
-+                #if !defined(GGML_USE_HIP)
-+                char uuid[64];
-+                snprintf(uuid, sizeof(uuid),
-+                    "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
-+                    (unsigned char)prop.uuid.bytes[0],
-+                    (unsigned char)prop.uuid.bytes[1],
-+                    (unsigned char)prop.uuid.bytes[2],
-+                    (unsigned char)prop.uuid.bytes[3],
-+                    (unsigned char)prop.uuid.bytes[4],
-+                    (unsigned char)prop.uuid.bytes[5],
-+                    (unsigned char)prop.uuid.bytes[6],
-+                    (unsigned char)prop.uuid.bytes[7],
-+                    (unsigned char)prop.uuid.bytes[8],
-+                    (unsigned char)prop.uuid.bytes[9],
-+                    (unsigned char)prop.uuid.bytes[10],
-+                    (unsigned char)prop.uuid.bytes[11],
-+                    (unsigned char)prop.uuid.bytes[12],
-+                    (unsigned char)prop.uuid.bytes[13],
-+                    (unsigned char)prop.uuid.bytes[14],
-+                    (unsigned char)prop.uuid.bytes[15]
-+                  );
-+                dev_ctx->uuid = uuid;
-+                #else
-+                dev_ctx->uuid = "GPU-" + std::string(prop.uuid.bytes, 16);
-+                #endif
-+
-                 ggml_backend_dev_t dev = new ggml_backend_device {
-                     /* .iface   = */ ggml_backend_cuda_device_interface,
-                     /* .reg     = */ &reg,
-diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
-index 1b56f858..ee4f2dcb 100644
---- a/ggml/src/ggml-metal/ggml-metal.m
-+++ b/ggml/src/ggml-metal/ggml-metal.m
-@@ -5703,6 +5703,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
- static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
-     props->name        = ggml_backend_metal_device_get_name(dev);
-     props->description = ggml_backend_metal_device_get_description(dev);
-+    props->uuid        = "0";
-     props->type        = ggml_backend_metal_device_get_type(dev);
-     ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
-     props->caps = (struct ggml_backend_dev_caps) {
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -1,9 +1,12 @@
 package llm

 import (
+	"cmp"
 	"fmt"
 	"log/slog"
+	"maps"
 	"os"
+	"slices"
 	"strconv"
 	"strings"

@@ -82,11 +85,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 	var graphOffload uint64

 	// Projectors loaded into GPU0 only
-	var llamaEngineProjectorWeights uint64
-
-	// Projectors loaded with output layer
-	var ollamaEngineProjectorWeights uint64
-	var ollamaEngineProjectorGraph uint64
+	var projectorWeights uint64
+	var projectorGraph uint64

 	// Conditional output size on GPU 0
 	var memoryLayerOutput uint64
@@ -111,23 +111,21 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 	slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList)

 	for _, projector := range projectors {
-		llamaEngineProjectorWeights += projectorMemoryRequirements(projector)
+		weight := projectorMemoryRequirements(projector)
+		projectorWeights += weight

 		// multimodal models require at least 2048 context
 		opts.NumCtx = max(opts.NumCtx, 2048)
 	}
-	if llamaEngineProjectorWeights == 0 {
-		ollamaEngineProjectorWeights, ollamaEngineProjectorGraph = f.VisionGraphSize()
-		opts.NumCtx = max(opts.NumCtx, 2048)
+	if projectorWeights == 0 && projectorGraph == 0 {
+		projectorWeights, projectorGraph = f.VisionGraphSize()
 	}

 	layers := f.Tensors().GroupLayers()
-	// add one layer worth of memory as a buffer
-	if blk0, ok := layers["blk.0"]; ok {
-		layerSize = blk0.Size()
-	} else {
-		slog.Warn("model missing blk.0 layer size")
-	}
+	// add one layer (choosing the largest layer) worth of memory as a buffer
+	layerSize = slices.MaxFunc(slices.Collect(maps.Values(layers)), func(a, b ggml.Layer) int {
+		return cmp.Compare(a.Size(), b.Size())
+	}).Size()

 	var kvct string
 	if envconfig.FlashAttention() &&
@@ -165,7 +163,6 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 		graphFullOffload = graphPartialOffload
 	}

-	// Output layer handled at the end if we have space
 	if layer, ok := layers["output_norm"]; ok {
 		memoryLayerOutput += layer.Size()
 	}
@@ -175,7 +172,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 		memoryLayerOutput += layer.Size()
 	}

-	gpuZeroOverhead := llamaEngineProjectorWeights
+	// Output layer handled at the end if we have space
+	gpuZeroOverhead := projectorWeights + projectorGraph

 	// Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
 	var layerCount int
@@ -218,8 +216,6 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 	if len(gpusWithSpace) > 0 {
 		gpuZeroID = gpusWithSpace[0].i
 		gpuAllocations[gpuZeroID] += gpuZeroOverhead
-	} else {
-		overflow += gpuZeroOverhead
 	}

 	// For all the layers, find where they can fit on the GPU(s)
@@ -260,24 +256,21 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 	}

 	// Determine if we need to consider output then find where it fits
-	memoryLastLayer := memoryLayerOutput + ollamaEngineProjectorWeights + ollamaEngineProjectorGraph
-	if memoryLastLayer > 0 {
-		if opts.NumGPU < 0 || layerCount < opts.NumGPU {
-			for j := len(gpusWithSpace); j > 0; j-- {
-				g := gpusWithSpace[layerCount%j]
-				used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
-				if g.g.FreeMemory > overhead+used+memoryLastLayer {
-					gpuAllocations[g.i] += memoryLastLayer
-					layerCounts[g.i]++
-					layerCount++
-					break
-				}
+	if memoryLayerOutput > 0 && (opts.NumGPU < 0 || layerCount < opts.NumGPU) {
+		for j := len(gpusWithSpace); j > 0; j-- {
+			g := gpusWithSpace[layerCount%j]
+			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
+			if g.g.FreeMemory > overhead+used+memoryLayerOutput {
+				gpuAllocations[g.i] += memoryLayerOutput
+				layerCounts[g.i]++
+				layerCount++
+				break
 			}
 		}

 		if layerCount < int(f.KV().BlockCount())+1 {
 			fullyLoaded = false
-			overflow += memoryLastLayer
+			overflow += memoryLayerOutput
 		}
 	}

@@ -335,8 +328,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 		memoryLayerOutput:   memoryLayerOutput,
 		graphFullOffload:    graphFullOffload,
 		graphPartialOffload: graphPartialOffload,
-		projectorWeights:    llamaEngineProjectorWeights + ollamaEngineProjectorWeights,
-		projectorGraph:      ollamaEngineProjectorGraph,
+		projectorWeights:    projectorWeights,
+		projectorGraph:      projectorGraph,
 	}

 	if gpus[0].Library == "cpu" {
@@ -422,7 +415,7 @@ func projectorMemoryRequirements(filename string) (weights uint64) {
 	}
 	defer file.Close()

-	ggml, err := ggml.Decode(file, 1024)
+	ggml, _, err := ggml.Decode(file, 1024)
 	if err != nil {
 		return 0
 	}
--- a/llm/server.go
+++ b/llm/server.go
@@ -121,7 +121,7 @@ func LoadModel(model string, maxArraySize int) (*ggml.GGML, error) {
 	}
 	defer f.Close()

-	ggml, err := ggml.Decode(f, maxArraySize)
+	ggml, _, err := ggml.Decode(f, maxArraySize)
 	return ggml, err
 }

@@ -797,8 +797,7 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu

 	res, err := http.DefaultClient.Do(serverReq)
 	if err != nil {
-		slog.Error("post predict", "error", err)
-		return errors.New("model runner has unexpectedly stopped, this may be due to resource limitations or an internal error, check ollama server logs for details")
+		return fmt.Errorf("POST predict: %v", err)
 	}
 	defer res.Body.Close()

--- a/ml/backend.go
+++ b/ml/backend.go
@@ -5,8 +5,8 @@ import (
 	"context"
 	"encoding/binary"
 	"fmt"
-	"log/slog"
 	"math"
+	"os"
 	"slices"
 	"strconv"
 	"strings"
@@ -15,11 +15,6 @@ import (
 )

 type Backend interface {
-	Load(ctx context.Context, progress func(float32)) error
-
-	// BackendMemory returns the memory allocations that were made for this model
-	BackendMemory() BackendMemory
-
 	Config() fs.Config
 	Get(name string) Tensor
 	NewContext() Context
@@ -57,6 +52,10 @@ type CacheConfig struct {

 // BackendParams controls how the backend loads and executes models
 type BackendParams struct {
+	// Progress is a callback function that reports model loading completion
+	// as a fraction of the total bytes to load
+	Progress func(float32)
+
 	// NumThreads sets the number of threads to use if running on the CPU
 	NumThreads int

@@ -73,130 +72,9 @@ type BackendParams struct {
 	FlashAttention bool
 }

-// ErrNoMem is returned when panicing due to insufficient memory. It includes
-// the attempted memory allocation.
-type ErrNoMem struct {
-	BackendMemory
-}
+var backends = make(map[string]func(context.Context, *os.File, BackendParams) (Backend, error))

-func (e ErrNoMem) Error() string {
-	return fmt.Sprintf("insufficient memory - required allocations: %+v", e.BackendMemory)
-}
-
-type AllocationStatus int
-
-const (
-	// Unallocated memory - have not yet attempted to allocate
-	Unallocated AllocationStatus = iota
-
-	// Failed memory - tried to allocate the memory and did not succeed
-	Failed
-
-	// Allocated memory = tried and succeeded to allocate memory
-	Allocated
-)
-
-// Memory is the size of an allocation and whether it was successful.
-type Memory struct {
-	Size   uint64
-	Status AllocationStatus
-}
-
-func (m Memory) String() string {
-	s := fmt.Sprint(m.Size)
-
-	switch m.Status {
-	case Unallocated:
-		s += "U"
-	case Failed:
-		s += "F"
-	case Allocated:
-		s += "A"
-	}
-
-	return s
-}
-
-// DeviceMemory provides a breakdown of the memory needed
-// per device, such as a CPU or GPU.
-type DeviceMemory struct {
-	// Name is the name of the device as labeled by the backend. It
-	// may not be persistent across instances of the runner.
-	Name string
-
-	// UUID is a unique persistent identifier for the device for matching
-	// with system management libraries
-	UUID string
-
-	// Weights is the per-layer memory needed for the model weights.
-	Weights []Memory
-
-	// Cache is the per-layer memory needed for the KV cache.
-	Cache []Memory
-
-	// Graph is the size of the compute graph. It is not per-layer.
-	Graph Memory
-}
-
-func memoryPresent(mem []Memory) bool {
-	return slices.ContainsFunc(mem, func(m Memory) bool { return m.Size != 0 })
-}
-
-func (m DeviceMemory) LogValue() slog.Value {
-	var attrs []slog.Attr
-	if memoryPresent(m.Weights) {
-		attrs = append(attrs, slog.Any("Weights", m.Weights))
-	}
-
-	if memoryPresent(m.Cache) {
-		attrs = append(attrs, slog.Any("Cache", m.Cache))
-	}
-
-	if m.Graph.Size != 0 {
-		attrs = append(attrs, slog.Any("Graph", m.Graph))
-	}
-
-	if len(attrs) > 0 && m.UUID != "" {
-		attrs = append([]slog.Attr{slog.String("UUID", m.UUID)}, attrs...)
-	}
-
-	return slog.GroupValue(attrs...)
-}
-
-// BackendMemory provides the amount of memory required to load the model
-// per device based on the BackendParams. In some cases, not all required
-// allocations will be known at this point. However, the size of the most recent
-// allocation is guaranteed to be provided so that if it failed, the caller can
-// accommodate that to make forward progress.
-type BackendMemory struct {
-	// InputsWeights are always located on the CPU and cannot be moved
-	InputWeights Memory
-
-	// CPU model components are located in system memory. This does not
-	// include unified memory allocated through the GPU.
-	CPU DeviceMemory
-
-	// GPU model components are located on one or more GPUs.
-	GPUs []DeviceMemory
-}
-
-func (m BackendMemory) LogValue() slog.Value {
-	var attrs []slog.Attr
-	if m.InputWeights.Size != 0 {
-		attrs = append(attrs, slog.Any("InputWeights", m.InputWeights))
-	}
-
-	attrs = append(attrs, slog.Any(m.CPU.Name, m.CPU))
-	for _, g := range m.GPUs {
-		attrs = append(attrs, slog.Any(g.Name, g))
-	}
-
-	return slog.GroupValue(attrs...)
-}
-
-var backends = make(map[string]func(string, BackendParams) (Backend, error))
-
-func RegisterBackend(name string, f func(string, BackendParams) (Backend, error)) {
+func RegisterBackend(name string, f func(context.Context, *os.File, BackendParams) (Backend, error)) {
 	if _, ok := backends[name]; ok {
 		panic("backend: backend already registered")
 	}
@@ -204,9 +82,9 @@ func RegisterBackend(name string, f func(string, BackendParams) (Backend, error)
 	backends[name] = f
 }

-func NewBackend(modelPath string, params BackendParams) (Backend, error) {
+func NewBackend(ctx context.Context, f *os.File, params BackendParams) (Backend, error) {
 	if backend, ok := backends["ggml"]; ok {
-		return backend(modelPath, params)
+		return backend(ctx, f, params)
 	}

 	return nil, fmt.Errorf("unsupported backend")
@@ -215,8 +93,8 @@ func NewBackend(modelPath string, params BackendParams) (Backend, error) {
 type Context interface {
 	Empty(dtype DType, shape ...int) Tensor
 	Zeros(dtype DType, shape ...int) Tensor
-	FromFloatSlice(s []float32, shape ...int) Tensor
-	FromIntSlice(s []int32, shape ...int) Tensor
+	FromFloatSlice(s []float32, shape ...int) (Tensor, error)
+	FromIntSlice(s []int32, shape ...int) (Tensor, error)

 	// Arange creates a 1D tensor with values within an interval (start, stop] increased by step.
 	Arange(start, stop, step float32, dtype DType) Tensor
@@ -228,7 +106,7 @@ type Context interface {
 	// graph, simply preallocates memory. Typically called with a
 	// worst case graph to ensure all resources are available for
 	// for future inference.
-	Reserve()
+	Reserve() error

 	MaxGraphNodes() int
 	Close()
@@ -241,6 +119,21 @@ type Context interface {
 	Layer(int) Context
 }

+// RopeOptions contains optional parameters for the RoPE function
+type RopeOptions struct {
+	OriginalContextLen uint32
+}
+
+// RopeOption defines a function that modifies RopeOptions
+type RopeOption func(*RopeOptions)
+
+// WithContextLen sets a custom original context length
+func WithContextLen(len uint32) RopeOption {
+	return func(opts *RopeOptions) {
+		opts.OriginalContextLen = len
+	}
+}
+
 type Tensor interface {
 	Dim(n int) int
 	Stride(n int) int
@@ -254,8 +147,6 @@ type Tensor interface {
 	Neg(ctx Context) Tensor
 	Add(ctx Context, t2 Tensor) Tensor
 	Mul(ctx Context, t2 Tensor) Tensor
-	Div(ctx Context, t2 Tensor) Tensor
-
 	Mulmat(ctx Context, t2 Tensor) Tensor
 	MulmatFullPrec(ctx Context, t2 Tensor) Tensor
 	MulmatID(ctx Context, t2, ids Tensor) Tensor
@@ -264,11 +155,11 @@ type Tensor interface {
 	LayerNorm(ctx Context, weight, bias Tensor, eps float32) Tensor
 	RMSNorm(ctx Context, weight Tensor, eps float32) Tensor
 	Scale(ctx Context, s float64) Tensor
-	SumRows(ctx Context) Tensor

 	AvgPool2D(ctx Context, k, s int, p float32) Tensor
 	Conv2D(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor

+	RoPE(ctx Context, positionIDs, ropeFactors Tensor, dim, ropeType uint32, base, scale float32, options ...RopeOption) Tensor
 	IM2Col(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor

 	Sin(ctx Context) Tensor
--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@@ -10,6 +10,7 @@ import "C"

 import (
 	"context"
+	"errors"
 	"fmt"
 	"io"
 	"log/slog"
@@ -29,7 +30,6 @@ import (
 	"github.com/ollama/ollama/logutil"
 	"github.com/ollama/ollama/ml"
 	ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
-	"github.com/ollama/ollama/ml/nn/rope"
 	"golang.org/x/sync/errgroup"
 )

@@ -44,15 +44,8 @@ func devices() []*C.struct_ggml_backend_device {
 }

 type Backend struct {
-	// modelPath is the location of the model data
-	modelPath string
-
 	meta *fsggml.GGML

-	// tensorLoadTargets maps from the name of the tensor in the file
-	// to the name that is used by the model definition
-	tensorLoadTargets map[string][]string
-
 	sched         *C.struct_ggml_backend_sched
 	schedBackends []*C.struct_ggml_backend
 	schedBufts    []*C.struct_ggml_backend_buffer_type
@@ -65,26 +58,14 @@ type Backend struct {
 	// layers is the backend used for repeating layers
 	layers map[int]*C.struct_ggml_backend_buffer_type

-	// requiredMemory is the cumulative memory allocations needed by the backend
-	requiredMemory *ml.BackendMemory
-
-	// btDeviceMemory maps from a buffer type to the memory allocations associated with that device
-	btDeviceMemory map[*C.struct_ggml_backend_buffer_type]*ml.DeviceMemory
-
 	flashAttention bool

 	// maxGraphNodes is the maximum allowed number of graph nodes in this scheduler
 	maxGraphNodes int
 }

-func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
-	r, err := os.Open(modelPath)
-	if err != nil {
-		return nil, err
-	}
-	defer r.Close()
-
-	meta, err := fsggml.Decode(r, -1)
+func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend, error) {
+	meta, n, err := fsggml.Decode(r, -1)
 	if err != nil {
 		return nil, err
 	}
@@ -99,9 +80,6 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 		"num_key_values", len(meta.KV()),
 	)

-	var requiredMemory ml.BackendMemory
-	btDeviceMemory := make(map[*C.struct_ggml_backend_buffer_type]*ml.DeviceMemory)
-
 	type deviceBufferType struct {
 		d   *C.struct_ggml_backend_device
 		bts []*C.struct_ggml_backend_buffer_type
@@ -122,8 +100,6 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 		}
 	}

-	blocks := int(meta.KV().BlockCount())
-
 	// create list of buffer types for the cpu
 	cpuDeviceBufferType := deviceBufferType{d: C.ggml_backend_dev_by_type(C.GGML_BACKEND_DEVICE_TYPE_CPU)}
 	for _, d := range append(accels, append(gpus, cpus...)...) {
@@ -131,33 +107,17 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 		case C.GGML_BACKEND_DEVICE_TYPE_CPU,
 			C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
 			cpuDeviceBufferType.bts = append(cpuDeviceBufferType.bts, C.ggml_backend_dev_buffer_type(d))
-			btDeviceMemory[C.ggml_backend_dev_buffer_type(d)] = &requiredMemory.CPU
 		}
 	}

-	requiredMemory.CPU.Name = C.GoString(C.ggml_backend_dev_name(cpuDeviceBufferType.d))
-	var props C.struct_ggml_backend_dev_props
-	C.ggml_backend_dev_get_props(cpuDeviceBufferType.d, &props)
-	requiredMemory.CPU.UUID = C.GoString(props.uuid)
-	requiredMemory.CPU.Weights = make([]ml.Memory, blocks+1)
-	requiredMemory.CPU.Cache = make([]ml.Memory, blocks+1)
-
 	// create list of buffer types for each gpu
 	var gpuDeviceBufferTypes []deviceBufferType
-	requiredMemory.GPUs = make([]ml.DeviceMemory, len(gpus))
-	for i, d := range gpus {
+	for _, d := range gpus {
 		bt := C.ggml_backend_dev_buffer_type(d)
 		gpuDeviceBufferTypes = append(gpuDeviceBufferTypes, deviceBufferType{
 			d:   d,
 			bts: append([]*C.struct_ggml_backend_buffer_type{bt}, cpuDeviceBufferType.bts...),
 		})
-		btDeviceMemory[bt] = &requiredMemory.GPUs[i]
-		requiredMemory.GPUs[i].Name = C.GoString(C.ggml_backend_dev_name(d))
-		var props C.struct_ggml_backend_dev_props
-		C.ggml_backend_dev_get_props(d, &props)
-		requiredMemory.GPUs[i].UUID = C.GoString(props.uuid)
-		requiredMemory.GPUs[i].Weights = make([]ml.Memory, blocks+1)
-		requiredMemory.GPUs[i].Cache = make([]ml.Memory, blocks+1)
 	}

 	useDefaultSplit := true
@@ -196,6 +156,8 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 	// inputs always use cpu
 	input := cpuDeviceBufferType

+	blocks := int(meta.KV().BlockCount())
+
 	// define a range of gpu layers. anything outside of this range is assigned to the cpu
 	gpuRangeStart := max(0, blocks-params.NumGPULayers)
 	gpuRangeStop := min(gpuRangeStart+params.NumGPULayers, blocks+1)
@@ -236,7 +198,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {

 	// contexts are shared by tensors of the same buffer type
 	ctxs := make(map[*C.struct_ggml_backend_buffer_type]*C.struct_ggml_context)
-	createTensor := func(t tensor, bts []*C.struct_ggml_backend_buffer_type, layer int) *C.struct_ggml_tensor {
+	createTensor := func(t tensor, bts []*C.struct_ggml_backend_buffer_type) *C.struct_ggml_tensor {
 		for _, bt := range bts {
 			if _, ok := ctxs[bt]; !ok {
 				ctxs[bt] = C.ggml_init(C.struct_ggml_init_params{
@@ -262,16 +224,6 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 			C.ggml_set_name(tt, cname)

 			slog.Log(context.TODO(), logutil.LevelTrace, "created tensor", "name", name, "shape", t.source.Shape, "dtype", t.source.Kind, "buffer_type", C.GoString(C.ggml_backend_buft_name(bt)))
-
-			size := pad(C.ggml_backend_buft_get_alloc_size(bt, tt), C.ggml_backend_buft_get_alignment(bt))
-			if layer == -1 {
-				// Assume that InputWeights can be allocated - they're always in system memory and can't be moved in any case
-				requiredMemory.InputWeights.Status = ml.Allocated
-				requiredMemory.InputWeights.Size += uint64(size)
-			} else {
-				btDeviceMemory[bt].Weights[layer].Size += uint64(size)
-			}
-
 			//nolint:staticcheck // TODO: check if buffer type supports this tensor
 			return tt
 		}
@@ -293,22 +245,22 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 	for _, t := range meta.Tensors().Items() {
 		switch {
 		case contains(t.Name, "position_embd", "token_embd", "token_norm_embd", "token_types"):
-			createTensor(tensor{source: t}, input.bts, -1)
+			createTensor(tensor{source: t}, input.bts)
 			if _, ok := meta.Tensors().GroupLayers()["output"]; !ok && t.Name == "token_embd.weight" {
-				createTensor(tensor{source: t, target: "output.weight"}, output.bts, blocks)
+				createTensor(tensor{source: t, target: "output.weight"}, output.bts)
 			}
 		case contains(t.Name, "cls", "output", "output_norm"):
-			createTensor(tensor{source: t}, output.bts, blocks)
+			createTensor(tensor{source: t}, output.bts)
 		case strings.HasPrefix(t.Name, "v.") || strings.HasPrefix(t.Name, "mm."):
 			// TODO: assign vision tensors to the gpu if possible
-			createTensor(tensor{source: t}, output.bts, blocks)
+			createTensor(tensor{source: t}, output.bts)
 		case contains(t.Name, "rope_freqs", "rope_factors_long", "rope_factors_short"):
 			// these tensors should be repeated per layer
 			for i, layer := range layers {
 				createTensor(tensor{
 					source: t,
 					target: "blk." + strconv.Itoa(i) + "." + t.Name,
-				}, layer.bts, i)
+				}, layer.bts)
 			}
 		default:
 			layerIndex := -1
@@ -319,10 +271,10 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 			}

 			if layerIndex >= 0 {
-				createTensor(tensor{source: t}, layers[layerIndex].bts, layerIndex)
+				createTensor(tensor{source: t}, layers[layerIndex].bts)
 			} else {
 				// load all other tensors on the cpu
-				createTensor(tensor{source: t}, input.bts, -1)
+				createTensor(tensor{source: t}, input.bts)
 			}
 		}
 	}
@@ -335,18 +287,8 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 		}

 		b := C.ggml_backend_alloc_ctx_tensors_from_buft(c, bt)
-		for i := range btDeviceMemory[bt].Weights {
-			if btDeviceMemory[bt].Weights[i].Size != 0 {
-				if b != nil {
-					btDeviceMemory[bt].Weights[i].Status = ml.Allocated
-				} else {
-					btDeviceMemory[bt].Weights[i].Status = ml.Failed
-				}
-			}
-		}
-
 		if b == nil {
-			panic(ml.ErrNoMem{BackendMemory: requiredMemory})
+			return nil, fmt.Errorf("unable to allocate memory from device %v for model weights", C.GoString(C.ggml_backend_buft_name(bt)))
 		}

 		C.ggml_backend_buffer_set_usage(b, C.GGML_BACKEND_BUFFER_USAGE_WEIGHTS)
@@ -365,6 +307,73 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 		}
 	}

+	var doneBytes atomic.Uint64
+	totalBytes := uint64(n) - meta.Tensors().Offset
+
+	g, ctx := errgroup.WithContext(ctx)
+	g.SetLimit(runtime.GOMAXPROCS(0))
+	for _, t := range meta.Tensors().Items() {
+		t := t
+		g.Go(func() error {
+			tts := make([]*C.struct_ggml_tensor, max(1, len(targets[t.Name])))
+			for i := range tts {
+				target := targets[t.Name][i]
+				if target == "" {
+					target = t.Name
+				}
+
+				tt, ok := tensors[target]
+				if !ok {
+					return fmt.Errorf("unassigned tensor: %s", t.Name)
+				}
+
+				tts[i] = tt
+			}
+
+			// Create a new FD for each goroutine so that each FD is read sequentially, rather than
+			// seeking around within an FD shared between all goroutines.
+			file, err := os.Open(r.Name())
+			if err != nil {
+				slog.Warn("file open error", "file", r.Name(), "error", err)
+				return err
+			}
+			defer file.Close()
+			sr := io.NewSectionReader(file, int64(meta.Tensors().Offset+t.Offset), int64(t.Size()))
+			bts := make([]byte, 128*format.KibiByte)
+
+			var s uint64
+			for s < t.Size() {
+				// Stop if either the parent context has been canceled or if any of the other tensors returned an error
+				if err := ctx.Err(); err != nil {
+					return err
+				}
+
+				n, err := io.ReadFull(sr, bts[:min(len(bts), int(t.Size()-s))])
+				if err != nil {
+					slog.Warn("file read error", "file", r.Name(), "error", err)
+					return err
+				}
+
+				for _, tt := range tts {
+					C.ggml_backend_tensor_set(tt, unsafe.Pointer(&bts[0]), C.size_t(s), C.size_t(n))
+				}
+
+				s += uint64(n)
+
+				if params.Progress != nil {
+					done := doneBytes.Add(uint64(n))
+					params.Progress(float32(done) / float32(totalBytes))
+				}
+			}
+
+			return nil
+		})
+	}
+
+	if err := g.Wait(); err != nil {
+		return nil, err
+	}
+
 	// map devices to backend buffer types so new tensors can be assigned to the correct device
 	deviceBufferTypes := make(map[*C.struct_ggml_backend_device]*C.struct_ggml_backend_buffer_type)

@@ -388,11 +397,9 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {

 	maxGraphNodes := max(8192, len(meta.Tensors().Items())*5)
 	return &Backend{
-		modelPath:         modelPath,
-		flashAttention:    params.FlashAttention,
-		meta:              meta,
-		tensorLoadTargets: targets,
-		tensors:           tensors,
+		flashAttention: params.FlashAttention,
+		meta:           meta,
+		tensors:        tensors,
 		sched: C.ggml_backend_sched_new(
 			(*C.ggml_backend_t)(unsafe.Pointer(&schedBackends[0])),
 			(*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&schedBufts[0])),
@@ -411,9 +418,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 			}
 			return m
 		}(),
-		requiredMemory: &requiredMemory,
-		btDeviceMemory: btDeviceMemory,
-		maxGraphNodes:  maxGraphNodes,
+		maxGraphNodes: maxGraphNodes,
 	}, nil
 }

@@ -421,81 +426,6 @@ func init() {
 	ml.RegisterBackend("ggml", New)
 }

-func (b *Backend) Load(ctx context.Context, progress func(float32)) error {
-	var doneBytes atomic.Uint64
-	totalBytes := uint64(b.meta.Length) - b.meta.Tensors().Offset
-
-	g, ctx := errgroup.WithContext(ctx)
-	g.SetLimit(runtime.GOMAXPROCS(0))
-	for _, t := range b.meta.Tensors().Items() {
-		t := t
-		g.Go(func() error {
-			tts := make([]*C.struct_ggml_tensor, max(1, len(b.tensorLoadTargets[t.Name])))
-			for i := range tts {
-				target := b.tensorLoadTargets[t.Name][i]
-				if target == "" {
-					target = t.Name
-				}
-
-				tt, ok := b.tensors[target]
-				if !ok {
-					return fmt.Errorf("unassigned tensor: %s", t.Name)
-				}
-
-				tts[i] = tt
-			}
-
-			// Create a new FD for each goroutine so that each FD is read sequentially, rather than
-			// seeking around within an FD shared between all goroutines.
-			file, err := os.Open(b.modelPath)
-			if err != nil {
-				slog.Warn("file open error", "file", b.modelPath, "error", err)
-				return err
-			}
-			defer file.Close()
-			sr := io.NewSectionReader(file, int64(b.meta.Tensors().Offset+t.Offset), int64(t.Size()))
-			bts := make([]byte, 128*format.KibiByte)
-
-			var s uint64
-			for s < t.Size() {
-				// Stop if either the parent context has been canceled or if any of the other tensors returned an error
-				if err := ctx.Err(); err != nil {
-					return err
-				}
-
-				n, err := io.ReadFull(sr, bts[:min(len(bts), int(t.Size()-s))])
-				if err != nil {
-					slog.Warn("file read error", "file", b.modelPath, "error", err)
-					return err
-				}
-
-				for _, tt := range tts {
-					C.ggml_backend_tensor_set(tt, unsafe.Pointer(&bts[0]), C.size_t(s), C.size_t(n))
-				}
-
-				s += uint64(n)
-
-				if progress != nil {
-					done := doneBytes.Add(uint64(n))
-					progress(float32(done) / float32(totalBytes))
-				}
-			}
-
-			return nil
-		})
-	}
-
-	if err := g.Wait(); err != nil {
-		return err
-	}
-
-	return nil
-}
-
-func (b *Backend) BackendMemory() ml.BackendMemory {
-	return *b.requiredMemory
-}
-
 func (b *Backend) Config() fs.Config {
 	return b.meta.KV()
 }
@@ -527,7 +457,6 @@ func (b *Backend) NewContextSize(n int) ml.Context {
 			no_alloc: true,
 		}),
 		allocatedBuffers: &allocatedBuffers,
-		layer:            -1,
 	}
 }

@@ -554,9 +483,6 @@ type Context struct {

 	// maxGraphNodes is the maximum allowed number of graph nodes in this context
 	maxGraphNodes int
-
-	// layer is the graph layer that this context is allocating for - assumed to be cache
-	layer int
 }

 func (c *Context) Input() ml.Context {
@@ -567,7 +493,6 @@ func (c *Context) Input() ml.Context {
 			buft:             c.b.input,
 			allocatedBuffers: c.allocatedBuffers,
 			maxGraphNodes:    c.maxGraphNodes,
-			layer:            -1,
 		}
 	}

@@ -582,7 +507,6 @@ func (c *Context) Layer(i int) ml.Context {
 			buft:             buft,
 			allocatedBuffers: c.allocatedBuffers,
 			maxGraphNodes:    c.maxGraphNodes,
-			layer:            i,
 		}
 	}

@@ -620,34 +544,22 @@ func (c *Context) Compute(tensors ...ml.Tensor) {
 	}
 }

-func (c *Context) Reserve() {
-	reserved := C.ggml_backend_sched_reserve(c.b.sched, c.graph)
+func (c *Context) Reserve() error {
+	if !C.ggml_backend_sched_reserve(c.b.sched, c.graph) {
+		C.ggml_backend_sched_reset(c.b.sched)
+		return errors.New("failed to reserve graph")
+	}

 	slog.Debug("compute graph", "nodes", C.ggml_graph_n_nodes(c.graph), "splits", C.ggml_backend_sched_get_n_splits(c.b.sched))
-
-	// Reserve may get called multiple times for different graphs - we just want the last run, which will contain the max allocations
-	for _, bt := range c.b.schedBufts {
-		c.b.btDeviceMemory[bt].Graph = ml.Memory{}
-	}
-
 	for i := range c.b.schedBackends {
-		bufferStatus := C.ggml_backend_sched_get_attempted_buffer_size(c.b.sched, c.b.schedBackends[i])
-
-		graph := &c.b.btDeviceMemory[c.b.schedBufts[i]].Graph
-		graph.Size += uint64(bufferStatus.size)
-		if bufferStatus.allocated && graph.Status != ml.Failed {
-			graph.Status = ml.Allocated
-		} else {
-			graph.Status = ml.Failed
-		}
-
+		size := C.ggml_backend_sched_get_buffer_size(c.b.sched, c.b.schedBackends[i])
 		slog.Info("compute graph", "backend", C.GoString(C.ggml_backend_name(c.b.schedBackends[i])), "buffer_type", C.GoString(C.ggml_backend_buft_name(c.b.schedBufts[i])),
-			"size", format.HumanBytes2(uint64(bufferStatus.size)))
+			"size", format.HumanBytes2(uint64(size)))
 	}

-	if !reserved {
-		panic(ml.ErrNoMem{BackendMemory: *c.b.requiredMemory})
-	}
+	C.ggml_backend_sched_reset(c.b.sched)
+
+	return nil
 }

 func (c *Context) MaxGraphNodes() int {
@@ -667,7 +579,7 @@ func pad(length, pad C.size_t) C.size_t {
 	return ((length + pad - 1) / pad) * pad
 }

-func (c *Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {
+func (c *Context) newTensor(dtype ml.DType, shape []int) (ml.Tensor, error) {
 	if c.buft == nil {
 		panic("set Input or Layer before creating tensors")
 	}
@@ -690,7 +602,7 @@ func (c *Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {

 	if len(shape) < 1 || shape[0] == 0 {
 		var shape C.int64_t = 0
-		return &Tensor{b: c.b, t: C.ggml_new_tensor(c.ctx, cdtype, 1, &shape)}
+		return &Tensor{b: c.b, t: C.ggml_new_tensor(c.ctx, cdtype, 1, &shape)}, nil
 	} else if len(shape) > 4 {
 		panic("unsupported number of dimensions")
 	}
@@ -703,43 +615,40 @@ func (c *Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {

 	t := C.ggml_new_tensor(c.ctx, cdtype, C.int(len(shape)), shapeToGGML(shape))
 	size := pad(C.ggml_backend_buft_get_alloc_size(c.buft, t), C.ggml_backend_buft_get_alignment(c.buft))
-
 	b := C.ggml_backend_buft_alloc_buffer(c.buft, size)
-	if c.layer >= 0 {
-		cache := &c.b.btDeviceMemory[c.buft].Cache[c.layer]
-
-		cache.Size += uint64(size)
-		if b != nil {
-			cache.Status = ml.Allocated
-		} else {
-			cache.Status = ml.Failed
-		}
-	}
-
 	if b == nil {
-		panic(ml.ErrNoMem{BackendMemory: *c.b.requiredMemory})
+		return nil, fmt.Errorf("unable to allocate %v from device %v for new tensor", format.HumanBytes2(uint64(size)), C.GoString(C.ggml_backend_buft_name(c.buft)))
 	}
-
 	*c.allocatedBuffers = append(*c.allocatedBuffers, b)
+
 	C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
-	return &Tensor{b: c.b, t: t}
+	return &Tensor{b: c.b, t: t}, nil
 }

 func (c *Context) Empty(dtype ml.DType, shape ...int) ml.Tensor {
-	return c.newTensor(dtype, shape)
+	t, err := c.newTensor(dtype, shape)
+	if err != nil {
+		panic(err)
+	}
+
+	return t
 }

 func (c *Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
-	t := c.newTensor(dtype, shape)
+	t, err := c.newTensor(dtype, shape)
+	if err != nil {
+		panic(err)
+	}
+
 	C.ggml_set_zero(t.(*Tensor).t)
 	return t
 }

-func checkShape[S ~[]E, E any](s S, shape ...int) {
+func checkShape[S ~[]E, E any](s S, shape ...int) error {
 	n := len(s)

 	if n == 0 {
-		return
+		return nil
 	}

 	for _, v := range shape {
@@ -747,32 +656,44 @@ func checkShape[S ~[]E, E any](s S, shape ...int) {
 	}

 	if n != 1 {
-		panic(fmt.Errorf("invalid shape: %v", shape))
+		return fmt.Errorf("invalid shape: %v", shape)
 	}
+
+	return nil
 }

-func (c *Context) FromFloatSlice(s []float32, shape ...int) ml.Tensor {
-	checkShape(s, shape...)
+func (c *Context) FromFloatSlice(s []float32, shape ...int) (ml.Tensor, error) {
+	if err := checkShape(s, shape...); err != nil {
+		return nil, err
+	}

-	t := c.newTensor(ml.DTypeF32, shape)
+	t, err := c.newTensor(ml.DTypeF32, shape)
+	if err != nil {
+		return nil, err
+	}

 	if len(s) > 0 {
 		C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
 	}

-	return t
+	return t, nil
 }

-func (c *Context) FromIntSlice(s []int32, shape ...int) ml.Tensor {
-	checkShape(s, shape...)
+func (c *Context) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
+	if err := checkShape(s, shape...); err != nil {
+		return nil, err
+	}

-	t := c.newTensor(ml.DTypeI32, shape)
+	t, err := c.newTensor(ml.DTypeI32, shape)
+	if err != nil {
+		return nil, err
+	}

 	if len(s) > 0 {
 		C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
 	}

-	return t
+	return t, nil
 }

 func (c Context) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
@@ -790,7 +711,12 @@ func (c Context) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
 			arange = append(arange, int32(i))
 		}

-		return c.Input().FromIntSlice(arange, len(arange))
+		t, err := c.Input().FromIntSlice(arange, len(arange))
+		if err != nil {
+			panic(err)
+		}
+
+		return t
 	default:
 		panic("unsupported dtype for arange")
 	}
@@ -941,13 +867,6 @@ func (t *Tensor) Mul(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
 	}
 }

-func (t *Tensor) Div(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
-	return &Tensor{
-		b: t.b,
-		t: C.ggml_div(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
-	}
-}
-
 func (t *Tensor) Mulmat(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
 	return &Tensor{
 		b: t.b,
@@ -996,8 +915,6 @@ func (t *Tensor) RMSNorm(ctx ml.Context, w ml.Tensor, eps float32) ml.Tensor {
 func (t *Tensor) Pad(ctx ml.Context, shape ...int) ml.Tensor {
 	if len(shape) != 4 {
 		panic("expected 4 dimensions")
-	} else if shape[3] != 0 {
-		panic("cuda does not support 4d tensors")
 	}

 	return &Tensor{
@@ -1065,13 +982,6 @@ func (t *Tensor) Scale(ctx ml.Context, s float64) ml.Tensor {
 	}
 }

-func (t *Tensor) SumRows(ctx ml.Context) ml.Tensor {
-	return &Tensor{
-		b: t.b,
-		t: C.ggml_sum_rows(ctx.(*Context).ctx, t.t),
-	}
-}
-
 func (t *Tensor) Softmax(ctx ml.Context) ml.Tensor {
 	return &Tensor{
 		b: t.b,
@@ -1143,15 +1053,28 @@ func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
 	}
 }

-func (t *Tensor) RoPE(ctx ml.Context, positions ml.Tensor, ropeDim int, ropeBase, ropeScale float32, options ...func(*rope.Options)) ml.Tensor {
+const (
+	ropeTypeNorm   C.int = 0
+	ropeTypeNeox   C.int = 2
+	ropeTypeMrope  C.int = 8
+	ropeTypeVision C.int = 24
+)
+
+func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim, ropeType uint32, ropeBase, ropeScale float32, options ...ml.RopeOption) ml.Tensor {
 	// Default options
-	opts := &rope.Options{OriginalContextLength: 131072, Factors: &Tensor{}}
+	opts := &ml.RopeOptions{
+		OriginalContextLen: 131072,
+	}

 	// Apply any provided options
 	for _, option := range options {
 		option(opts)
 	}

+	if ropeFactors == nil {
+		ropeFactors = &Tensor{b: t.b}
+	}
+
 	dequant := t.t
 	if C.ggml_is_quantized(t.t._type) {
 		dequant = C.ggml_cast(ctx.(*Context).ctx, t.t, C.GGML_TYPE_F32)
@@ -1162,11 +1085,11 @@ func (t *Tensor) RoPE(ctx ml.Context, positions ml.Tensor, ropeDim int, ropeBase
 		t: C.ggml_rope_ext(
 			ctx.(*Context).ctx,
 			dequant,
-			positions.(*Tensor).t,
-			opts.Factors.(*Tensor).t,
+			positionIDs.(*Tensor).t,
+			ropeFactors.(*Tensor).t,
 			C.int(ropeDim),
-			C.int(opts.Type),
-			C.int(opts.OriginalContextLength),
+			C.int(ropeType),
+			C.int(opts.OriginalContextLen),
 			C.float(ropeBase),
 			C.float(ropeScale),
 			C.float(0.0),
--- a/ml/backend/ggml/ggml/include/ggml-alloc.h
+++ b/ml/backend/ggml/ggml/include/ggml-alloc.h
@@ -66,12 +66,6 @@ GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph

 GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);

-struct ggml_allocr_buffer_status {
-    size_t size;
-    bool allocated;
-};
-GGML_API struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id);
-
 // Utils
 // Create a buffer and allocate all the tensors in a ggml_context
 GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
--- a/ml/backend/ggml/ggml/include/ggml-backend.h
+++ b/ml/backend/ggml/ggml/include/ggml-backend.h
@@ -152,7 +152,6 @@ extern "C" {
    struct ggml_backend_dev_props {
        const char * name;
        const char * description;
-        const char * uuid;
        size_t memory_free;
        size_t memory_total;
        enum ggml_backend_dev_type type;
@@ -305,12 +304,6 @@ extern "C" {

    GGML_API size_t               ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);

-    struct ggml_backend_buffer_status {
-        size_t size;
-        bool allocated;
-    };
-    GGML_API struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
-
    GGML_API void                 ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
    GGML_API ggml_backend_t       ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);

--- a/ml/backend/ggml/ggml/src/ggml-alloc.c
+++ b/ml/backend/ggml/ggml/src/ggml-alloc.c
@@ -364,7 +364,6 @@ struct node_alloc {
 struct ggml_gallocr {
    ggml_backend_buffer_type_t * bufts; // [n_buffers]
    ggml_backend_buffer_t * buffers; // [n_buffers]
-    size_t *buffer_sizes; // [n_buffers]
    struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
    int n_buffers;

@@ -388,9 +387,6 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
    galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
    GGML_ASSERT(galloc->buffers != NULL);

-    galloc->buffer_sizes = calloc(n_bufs, sizeof(size_t));
-    GGML_ASSERT(galloc->buffer_sizes != NULL);
-
    galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
    GGML_ASSERT(galloc->buf_tallocs != NULL);

@@ -457,7 +453,6 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
    ggml_hash_set_free(&galloc->hash_set);
    free(galloc->hash_values);
    free(galloc->bufts);
-    free(galloc->buffer_sizes);
    free(galloc->buffers);
    free(galloc->buf_tallocs);
    free(galloc->node_allocs);
@@ -753,8 +748,6 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
        }
    }

-    bool success = true;
-
    // reallocate buffers if needed
    for (int i = 0; i < galloc->n_buffers; i++) {
        // if the buffer type is used multiple times, we reuse the same buffer
@@ -776,20 +769,15 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c

            ggml_backend_buffer_free(galloc->buffers[i]);
            galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
-            if (galloc->buffers[i]) {
-                galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
-                ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
-            } else {
+            if (galloc->buffers[i] == NULL) {
                GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
-                galloc->buffer_sizes[i] = new_size;
-                success = false;
+                return false;
            }
-        } else {
-            galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
+            ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
        }
    }

-    return success;
+    return true;
 }

 bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
@@ -946,24 +934,6 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
    return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
 }

-struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
-    GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers);
-
-    for (int i = 0; i < buffer_id; i++) {
-        if (galloc->buf_tallocs[i] == galloc->buf_tallocs[buffer_id]) {
-            // This buffer is the same as a previous one due to the same buffer type being used multiple times
-            // (See above.) However, we need a different check because multiple buffers might be NULL in our
-            // case and we still want to know the attempted size.
-
-            struct ggml_allocr_buffer_status status = {0, true};
-            return status;
-        }
-    }
-
-    struct ggml_allocr_buffer_status status = {galloc->buffer_sizes[buffer_id], galloc->buffers[buffer_id] != NULL};
-    return status;
-}
-
 // utils

 static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
--- a/ml/backend/ggml/ggml/src/ggml-backend.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-backend.cpp
@@ -1629,16 +1629,6 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
    return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
 }

-struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
-    int backend_index = ggml_backend_sched_backend_id(sched, backend);
-    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
-
-    struct ggml_allocr_buffer_status allocr_status = ggml_gallocr_get_attempted_buffer_size(sched->galloc, backend_index);
-    struct ggml_backend_buffer_status status = {allocr_status.size, allocr_status.allocated};
-
-    return status;
-}
-
 void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
    int backend_index = ggml_backend_sched_backend_id(sched, backend);
    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
--- a/ml/backend/ggml/ggml/src/ggml-cpu/cpu.go
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/cpu.go
@@ -3,7 +3,7 @@ package cpu
 // #cgo CFLAGS: -O3 -Wno-implicit-function-declaration
 // #cgo CXXFLAGS: -std=c++17
 // #cgo CPPFLAGS: -I${SRCDIR}/amx -I${SRCDIR}/llamafile -I${SRCDIR}/.. -I${SRCDIR}/../../include
-// #cgo CPPFLAGS: -DNDEBUG -DGGML_USE_LLAMAFILE
+// #cgo CPPFLAGS: -DGGML_USE_LLAMAFILE
 // #cgo linux CPPFLAGS: -D_GNU_SOURCE
 // #cgo darwin,arm64 CPPFLAGS: -DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
 // #cgo darwin,arm64 LDFLAGS: -framework Accelerate
--- a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2884,7 +2884,6 @@ struct ggml_backend_cuda_device_context {
    int device;
    std::string name;
    std::string description;
-    std::string uuid;
 };

 static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -2897,11 +2896,6 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t
    return ctx->description.c_str();
 }

-static const char * ggml_backend_cuda_device_get_uuid(ggml_backend_dev_t dev) {
-    ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
-    return ctx->uuid.c_str();
-}
-
 static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
    ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
    ggml_cuda_set_device(ctx->device);
@@ -2916,7 +2910,6 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
 static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
    props->name        = ggml_backend_cuda_device_get_name(dev);
    props->description = ggml_backend_cuda_device_get_description(dev);
-    props->uuid        = ggml_backend_cuda_device_get_uuid(dev);
    props->type        = ggml_backend_cuda_device_get_type(dev);
    ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);

@@ -3465,32 +3458,6 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
                CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
                dev_ctx->description = prop.name;

-                #if !defined(GGML_USE_HIP)
-                char uuid[64];
-                snprintf(uuid, sizeof(uuid),
-                    "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
-                    (unsigned char)prop.uuid.bytes[0],
-                    (unsigned char)prop.uuid.bytes[1],
-                    (unsigned char)prop.uuid.bytes[2],
-                    (unsigned char)prop.uuid.bytes[3],
-                    (unsigned char)prop.uuid.bytes[4],
-                    (unsigned char)prop.uuid.bytes[5],
-                    (unsigned char)prop.uuid.bytes[6],
-                    (unsigned char)prop.uuid.bytes[7],
-                    (unsigned char)prop.uuid.bytes[8],
-                    (unsigned char)prop.uuid.bytes[9],
-                    (unsigned char)prop.uuid.bytes[10],
-                    (unsigned char)prop.uuid.bytes[11],
-                    (unsigned char)prop.uuid.bytes[12],
-                    (unsigned char)prop.uuid.bytes[13],
-                    (unsigned char)prop.uuid.bytes[14],
-                    (unsigned char)prop.uuid.bytes[15]
-                  );
-                dev_ctx->uuid = uuid;
-                #else
-                dev_ctx->uuid = "GPU-" + std::string(prop.uuid.bytes, 16);
-                #endif
-
                ggml_backend_dev_t dev = new ggml_backend_device {
                    /* .iface   = */ ggml_backend_cuda_device_interface,
                    /* .reg     = */ &reg,
--- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m
+++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m
@@ -5703,7 +5703,6 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
 static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
    props->name        = ggml_backend_metal_device_get_name(dev);
    props->description = ggml_backend_metal_device_get_description(dev);
-    props->uuid        = "0";
    props->type        = ggml_backend_metal_device_get_type(dev);
    ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
    props->caps = (struct ggml_backend_dev_caps) {
--- a/ml/backend/ggml/ggml/src/ggml-metal/metal.go
+++ b/ml/backend/ggml/ggml/src/ggml-metal/metal.go
@@ -4,6 +4,6 @@ package metal

 //go:generate sh -c "{ echo // Code generated by 'go generate'. DO NOT EDIT.; sed -e '/__embed_ggml-common.h__/r ../ggml-common.h' -e '/__embed_ggml-common.h__/d' -e '/#include \"ggml-metal-impl.h\"/r ggml-metal-impl.h' -e '/#include \"ggml-metal-impl.h\"/d' ggml-metal.metal; } >ggml-metal-embed.metal"

-// #cgo CPPFLAGS: -DGGML_METAL_NDEBUG -DGGML_METAL_EMBED_LIBRARY -I.. -I../../include
+// #cgo CPPFLAGS: -DGGML_METAL_EMBED_LIBRARY -I.. -I../../include
 // #cgo LDFLAGS: -framework Metal -framework MetalKit
 import "C"
--- a/ml/nn/fast/rope.go
+++ b/ml/nn/fast/rope.go
@@ -1,21 +0,0 @@
-// fast provides implementations of fast (fused) operations for increased performance.
-package fast
-
-import (
-	"github.com/ollama/ollama/ml"
-	"github.com/ollama/ollama/ml/nn/rope"
-)
-
-// fastRoPE is an interface for tensors that support fast rotary positional embedding.
-type fastRoPE interface {
-	RoPE(ctx ml.Context, positionIDs ml.Tensor, dim int, base, scale float32, options ...func(*rope.Options)) ml.Tensor
-}
-
-// RoPE applies rotary positional embedding to tensor `t`.
-func RoPE(ctx ml.Context, t, positions ml.Tensor, dim int, base, scale float32, options ...func(*rope.Options)) ml.Tensor {
-	if t, ok := t.(fastRoPE); ok {
-		return t.RoPE(ctx, positions, dim, base, scale, options...)
-	}
-
-	panic("RoPE not implemented for this tensor type")
-}
--- a/ml/nn/rope/rope.go
+++ b/ml/nn/rope/rope.go
@@ -1,33 +0,0 @@
-package rope
-
-import "github.com/ollama/ollama/ml"
-
-// Options contains optional parameters for RoPE function
-type Options struct {
-	OriginalContextLength int
-	Type                  int
-	Factors               ml.Tensor
-}
-
-// WithOriginalContextLength sets a custom context length
-func WithOriginalContextLength(n int) func(*Options) {
-	return func(opts *Options) {
-		opts.OriginalContextLength = n
-	}
-}
-
-// WithType sets RoPE type to NeoX
-func WithTypeNeoX() func(*Options) {
-	return func(opts *Options) {
-		opts.Type = 2
-	}
-}
-
-// WithFactors sets custom rope factors
-func WithFactors(factors ml.Tensor) func(*Options) {
-	return func(opts *Options) {
-		if factors != nil {
-			opts.Factors = factors
-		}
-	}
-}
--- a/model/input/input.go
+++ b/model/input/input.go
@@ -2,30 +2,16 @@ package input

 import "github.com/ollama/ollama/ml"

-// Multimodal is a multimodal embedding or a component of one.
-// For example, it could be a row of an image that can be processed
-// independently.
-type Multimodal struct {
-	// Tensor is the embedding data. Implementations may chose what to
-	// store here or it may be nil if not needed. However, any ml.Tensor
-	// objects must be stored here and not in Data.
-	Tensor ml.Tensor
-
-	// Data is implementation-specific opaque data, such as metadata on how
-	// to layout Tensor. It may be nil if not needed. It may also store larger
-	// objects such as complete images if they are to be processed later.
-	Data any
-}
-
 // Input represents one token in the input stream
 type Input struct {
 	// Token is a single element of text.
 	Token int32

-	// Multimodal is represents a non-text element such as an
-	// image (or part of one if the image can be processed in pieces).
-	// It may be used either together with Token or on its own.
-	Multimodal []Multimodal
+	// Multimodal is opaque data representing a non-text
+	// element such as an image (or part of one if the image
+	// can be processed in pieces). It may be used either
+	// together with Token or on its own.
+	Multimodal any

 	// MultimodalHash is a unique representation of the data
 	// stored in Multimodal, used for caching and comparing
@@ -46,7 +32,7 @@ type Input struct {
 // Positions slice.
 type MultimodalIndex struct {
 	Index      int
-	Multimodal []Multimodal
+	Multimodal any
 }

 // Batch contains the inputs for a model forward pass
--- a/model/model.go
+++ b/model/model.go
@@ -40,13 +40,12 @@ type MultimodalProcessor interface {
 	// EncodeMultimodal processes a single input (such as an image) and
 	// generates an output (typically an embedding) that can be used by the model.
 	//
-	// The return value is one or more tensors, each with optional model-specific
-	// opaque metadata. Typically, the tensors might be views into an embedding
-	// with each view representing a chunk of data that can be processed independently
-	// in different batches.
+	// The return value is most typically an ml.Tensor; however, different
+	// types are possible, such as an object containing a tensor plus
+	// additional metadata, a slice of tensors, or even just the original input.
 	//
 	// The result may be cached by the runner.
-	EncodeMultimodal(ml.Context, []byte) ([]input.Multimodal, error)
+	EncodeMultimodal(ml.Context, []byte) (any, error)

 	// PostTokenize is called after tokenization to allow the model to edit the
 	// input stream to correctly arrange multimodal elements.
@@ -98,8 +97,14 @@ func Register(name string, f func(fs.Config) (Model, error)) {
 }

 // New initializes a new model instance with the provided configuration based on the metadata in the model file
-func New(modelPath string, params ml.BackendParams) (Model, error) {
-	b, err := ml.NewBackend(modelPath, params)
+func New(ctx context.Context, modelPath string, params ml.BackendParams) (Model, error) {
+	r, err := os.Open(modelPath)
+	if err != nil {
+		return nil, err
+	}
+	defer r.Close()
+
+	b, err := ml.NewBackend(ctx, r, params)
 	if err != nil {
 		return nil, err
 	}
@@ -128,7 +133,7 @@ func NewTextProcessor(s string) (TextProcessor, error) {
 		return nil, err
 	}
 	defer r.Close()
-	meta, err := fsggml.Decode(r, -1)
+	meta, _, err := fsggml.Decode(r, -1)
 	if err != nil {
 		return nil, err
 	}
@@ -287,7 +292,11 @@ func Forward(ctx ml.Context, m Model, inputs []int32, batch input.Batch) (ml.Ten
 		return nil, errors.New("batch size cannot be less than 1")
 	}

-	batch.Inputs = ctx.Input().FromIntSlice(inputs, len(inputs))
+	var err error
+	batch.Inputs, err = ctx.Input().FromIntSlice(inputs, len(inputs))
+	if err != nil {
+		return nil, err
+	}

 	cache := m.Config().Cache
 	if cache != nil {
--- a/model/models/gemma2/model.go
+++ b/model/models/gemma2/model.go
@@ -7,8 +7,6 @@ import (
 	"github.com/ollama/ollama/kvcache"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/ml/nn"
-	"github.com/ollama/ollama/ml/nn/fast"
-	"github.com/ollama/ollama/ml/nn/rope"
 	"github.com/ollama/ollama/model"
 	"github.com/ollama/ollama/model/input"
 )
@@ -45,13 +43,10 @@ func New(c fs.Config) (model.Model, error) {
 				Values: c.Strings("tokenizer.ggml.tokens"),
 				Scores: c.Floats("tokenizer.ggml.scores"),
 				Types:  c.Ints("tokenizer.ggml.token_type"),
-				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
-				BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
-				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
-				EOS: append(
-					[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
-					c.Ints("tokenizer.ggml.eos_token_ids")...,
-				),
+				BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id")),
+				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
+				// TODO: set EOT to EOS otherwise 0 will stop generation
+				EOT: int32(c.Uint("tokenizer.ggml.eos_token_id")),
 			},
 		),
 		Layers: make([]Layer, c.Uint("block_count")),
@@ -85,10 +80,11 @@ type SelfAttention struct {

 func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
 	batchSize := hiddenState.Dim(1)
+	ropeType := uint32(2)

 	q := sa.Query.Forward(ctx, hiddenState)
 	q = q.Reshape(ctx, opts.attnKeyLen, opts.numHeads, batchSize)
-	q = fast.RoPE(ctx, q, positionIDs, opts.attnKeyLen, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
+	q = q.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, opts.ropeBase, opts.ropeScale)

 	if opts.largeModelScaling {
 		q = q.Scale(ctx, 1.0/math.Sqrt(float64(opts.hiddenSize/opts.numHeads)))
@@ -98,7 +94,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten

 	k := sa.Key.Forward(ctx, hiddenState)
 	k = k.Reshape(ctx, opts.attnKeyLen, opts.numKVHeads, batchSize)
-	k = fast.RoPE(ctx, k, positionIDs, opts.attnKeyLen, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
+	k = k.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, opts.ropeBase, opts.ropeScale)

 	v := sa.Value.Forward(ctx, hiddenState)
 	v = v.Reshape(ctx, opts.attnValLen, opts.numKVHeads, batchSize)
@@ -128,7 +124,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
 }

 func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
-	return fast.RoPE(ctx, key, shift, m.Options.attnKeyLen, m.Options.ropeBase, m.Options.ropeScale, rope.WithTypeNeoX()), nil
+	return key.RoPE(ctx, shift, nil, uint32(m.Options.attnKeyLen), uint32(2), m.Options.ropeBase, m.Options.ropeScale), nil
 }

 type MLP struct {
@@ -175,8 +171,15 @@ func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Ten
 }

 func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
-	positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
-	outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
+	positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
+	if err != nil {
+		return nil, err
+	}
+
+	outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
+	if err != nil {
+		return nil, err
+	}

 	hiddenState := m.TokenEmbedding.Forward(ctx, batch.Inputs)
 	hiddenState = hiddenState.Scale(ctx, math.Sqrt(float64(m.Options.hiddenSize)))
--- a/model/models/gemma3/model.go
+++ b/model/models/gemma3/model.go
@@ -60,16 +60,12 @@ func New(c fs.Config) (model.Model, error) {
 				Values: c.Strings("tokenizer.ggml.tokens"),
 				Scores: c.Floats("tokenizer.ggml.scores"),
 				Types:  c.Ints("tokenizer.ggml.token_type"),
+				BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id")),
 				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
-				BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
+				EOS:    int32(1),
 				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
-				EOS: append(
-					[]int32{
-						int32(c.Uint("tokenizer.ggml.eos_token_id")),
-						int32(c.Uint("tokenizer.ggml.eot_token_id", 106)),
-					},
-					c.Ints("tokenizer.ggml.eos_token_ids")...,
-				),
+				EOT:    int32(106),
+				AddEOT: c.Bool("tokenizer.ggml.add_eot_token", false),
 			},
 		),
 		ImageProcessor: newImageProcessor(c),
@@ -86,7 +82,7 @@ func New(c fs.Config) (model.Model, error) {
 	return &m, nil
 }

-func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
+func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
 	if len(m.VisionModel.Layers) == 0 {
 		return nil, model.ErrNoVisionModel
 	}
@@ -101,30 +97,33 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input
 		return nil, err
 	}

-	pixelValues := ctx.Input().FromFloatSlice(f32s,
+	pixelValues, err := ctx.Input().FromFloatSlice(f32s,
 		m.ImageProcessor.imageSize,
 		m.ImageProcessor.imageSize,
 		m.ImageProcessor.numChannels,
 	)
+	if err != nil {
+		return nil, err
+	}

 	visionOutputs := m.VisionModel.Forward(ctx, pixelValues)
 	visionOutputs = m.MultiModalProjector.Forward(ctx, visionOutputs, m.imageSize, m.patchSize, m.VisionModel.eps)
-	return []input.Multimodal{{Tensor: visionOutputs}}, nil
+	return visionOutputs, nil
 }

 func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
 	var result []input.Input

 	for _, inp := range inputs {
-		if len(inp.Multimodal) == 0 {
+		if inp.Multimodal == nil {
 			result = append(result, inp)
 		} else {
-			inputMultimodal := inp.Multimodal[0].Tensor
+			inputMultimodal := inp.Multimodal.(ml.Tensor)

 			result = append(result,
-				input.Input{Token: 108, SameBatch: inputMultimodal.Dim(1) + 3}, // "\n\n"
-				input.Input{Token: 255999},                                     // "<start_of_image>""
-				input.Input{Multimodal: []input.Multimodal{{Tensor: inputMultimodal}}, MultimodalHash: inp.MultimodalHash}, // image data is on the first placeholder
+				input.Input{Token: 108, SameBatch: inputMultimodal.Dim(1) + 3},               // "\n\n"
+				input.Input{Token: 255999},                                                   // "<start_of_image>""
+				input.Input{Multimodal: inputMultimodal, MultimodalHash: inp.MultimodalHash}, // image data is on the first placeholder
 			)

 			// add image token placeholders
@@ -141,8 +140,15 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
 }

 func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
-	positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
-	outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
+	positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
+	if err != nil {
+		return nil, err
+	}
+
+	outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
+	if err != nil {
+		return nil, err
+	}

 	return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache), nil
 }
--- a/model/models/gemma3/model_text.go
+++ b/model/models/gemma3/model_text.go
@@ -7,8 +7,6 @@ import (
 	"github.com/ollama/ollama/kvcache"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/ml/nn"
-	"github.com/ollama/ollama/ml/nn/fast"
-	"github.com/ollama/ollama/ml/nn/rope"
 	"github.com/ollama/ollama/model/input"
 )

@@ -75,6 +73,7 @@ type TextSelfAttention struct {

 func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *TextConfig) ml.Tensor {
 	batchSize := hiddenState.Dim(1)
+	ropeType := uint32(2)

 	ropeBase := opts.ropeLocalBase
 	if (layer+1)%gemmaGlobalCacheCount == 0 {
@@ -84,7 +83,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, pos
 	q := sa.Query.Forward(ctx, hiddenState)
 	q = q.Reshape(ctx, opts.attnKeyLen, opts.numHeads, batchSize)
 	q = sa.QueryNorm.Forward(ctx, q, opts.eps)
-	q = fast.RoPE(ctx, q, positionIDs, opts.attnKeyLen, ropeBase, opts.ropeScale, rope.WithTypeNeoX())
+	q = q.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, ropeBase, opts.ropeScale)

 	if opts.largeModelScaling {
 		q = q.Scale(ctx, 1.0/math.Sqrt(float64(opts.hiddenSize/opts.numHeads)))
@@ -95,7 +94,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, pos
 	k := sa.Key.Forward(ctx, hiddenState)
 	k = k.Reshape(ctx, opts.attnKeyLen, opts.numKVHeads, batchSize)
 	k = sa.KeyNorm.Forward(ctx, k, opts.eps)
-	k = fast.RoPE(ctx, k, positionIDs, opts.attnKeyLen, ropeBase, opts.ropeScale, rope.WithTypeNeoX())
+	k = k.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, ropeBase, opts.ropeScale)

 	v := sa.Value.Forward(ctx, hiddenState)
 	v = v.Reshape(ctx, opts.attnValLen, opts.numKVHeads, batchSize)
@@ -113,7 +112,7 @@ func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.T
 		ropeBase = m.TextConfig.ropeGlobalBase
 	}

-	return fast.RoPE(ctx, key, shift, m.TextConfig.attnKeyLen, ropeBase, m.TextConfig.ropeScale, rope.WithTypeNeoX()), nil
+	return key.RoPE(ctx, shift, nil, uint32(m.TextConfig.attnKeyLen), uint32(2), ropeBase, m.TextConfig.ropeScale), nil
 }

 type TextMLP struct {
@@ -166,7 +165,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
 	// set image embeddings
 	var except []int
 	for _, image := range batch.Multimodal {
-		visionOutputs := image.Multimodal[0].Tensor
+		visionOutputs := image.Multimodal.(ml.Tensor)
 		ctx.Forward(visionOutputs.Copy(ctx, hiddenState.View(ctx, image.Index*hiddenState.Stride(1), visionOutputs.Dim(0)*visionOutputs.Dim(1))))

 		for i := range visionOutputs.Dim(1) {
--- a/model/models/llama/model.go
+++ b/model/models/llama/model.go
@@ -1,23 +1,22 @@
 package llama

 import (
-	"cmp"
+	"fmt"
 	"math"
+	"strings"

 	"github.com/ollama/ollama/fs"
 	"github.com/ollama/ollama/kvcache"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/ml/nn"
-	"github.com/ollama/ollama/ml/nn/fast"
-	"github.com/ollama/ollama/ml/nn/rope"
 	"github.com/ollama/ollama/model"
 	"github.com/ollama/ollama/model/input"
 )

 type Options struct {
 	hiddenSize, numHeads, numKVHeads int
-	headDim, ropeDim                 int
 	eps, ropeBase, ropeScale         float32
+	ropeDim                          uint32
 }

 type Model struct {
@@ -33,6 +32,10 @@ type Model struct {
 }

 func New(c fs.Config) (model.Model, error) {
+	if !strings.EqualFold(c.String("tokenizer.ggml.model"), "gpt2") {
+		return nil, fmt.Errorf("tokenizer %s not yet supported", c.String("tokenizer.ggml.model"))
+	}
+
 	m := Model{
 		BytePairEncoding: model.NewBytePairEncoding(
 			c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
@@ -40,13 +43,13 @@ func New(c fs.Config) (model.Model, error) {
 				Values: c.Strings("tokenizer.ggml.tokens"),
 				Types:  c.Ints("tokenizer.ggml.token_type"),
 				Merges: c.Strings("tokenizer.ggml.merges"),
+				BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id")),
 				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
-				BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
+				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
 				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
-				EOS: append(
-					[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
-					c.Ints("tokenizer.ggml.eos_token_ids")...,
-				),
+				// TODO: set EOT to EOS otherwise 0 will stop generation
+				EOT:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
+				AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
 			},
 		),
 		Layers: make([]Layer, c.Uint("block_count")),
@@ -54,11 +57,10 @@ func New(c fs.Config) (model.Model, error) {
 			hiddenSize: int(c.Uint("embedding_length")),
 			numHeads:   int(c.Uint("attention.head_count")),
 			numKVHeads: int(c.Uint("attention.head_count_kv")),
-			headDim:    int(c.Uint("attention.key_length")),
-			ropeDim:    int(c.Uint("rope.dimension_count")),
 			eps:        c.Float("attention.layer_norm_rms_epsilon"),
 			ropeBase:   c.Float("rope.freq_base"),
 			ropeScale:  c.Float("rope.freq_scale", 1),
+			ropeDim:    c.Uint("rope.dimension_count"),
 		},
 	}

@@ -75,31 +77,31 @@ type SelfAttention struct {
 	RopeFactors ml.Tensor  `gguf:"rope_freqs.weight"`
 }

-func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
+func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
 	batchSize := hiddenState.Dim(1)
-	headDim := cmp.Or(opts.headDim, opts.hiddenSize/opts.numHeads)
-	ropeDim := cmp.Or(opts.ropeDim, headDim)
+	headDim := opts.hiddenSize / opts.numHeads
+	ropeType := uint32(0)

-	query := sa.Query.Forward(ctx, hiddenState)
-	query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
+	q := sa.Query.Forward(ctx, hiddenState)
+	q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
+	q = q.RoPE(ctx, positionIDs, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)

-	key := sa.Key.Forward(ctx, hiddenState)
-	key = key.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
+	k := sa.Key.Forward(ctx, hiddenState)
+	k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
+	k = k.RoPE(ctx, positionIDs, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)

-	value := sa.Value.Forward(ctx, hiddenState)
-	value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
+	v := sa.Value.Forward(ctx, hiddenState)
+	v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)

-	query = fast.RoPE(ctx, query, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
-	key = fast.RoPE(ctx, key, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
+	scaleFactor := 1.0 / math.Sqrt(float64(headDim))
+	kqv := nn.Attention(ctx, q, k, v, scaleFactor, cache)
+	kqv = kqv.Reshape(ctx, opts.hiddenSize, batchSize)

-	attention := nn.Attention(ctx, query, key, value, 1.0/math.Sqrt(float64(headDim)), cache)
-	attention = attention.Reshape(ctx, headDim*opts.numHeads, batchSize)
-	return sa.Output.Forward(ctx, attention)
+	return sa.Output.Forward(ctx, kqv)
 }

 func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
-	ropeDim := cmp.Or(m.ropeDim, m.hiddenSize/m.numHeads)
-	return fast.RoPE(ctx, key, shift, ropeDim, m.ropeBase, m.ropeScale, rope.WithFactors(m.Layers[layer].SelfAttention.RopeFactors)), nil
+	return key.RoPE(ctx, shift, m.Layers[layer].SelfAttention.RopeFactors, m.ropeDim, uint32(0), m.ropeBase, m.ropeScale), nil
 }

 type MLP struct {
@@ -120,11 +122,11 @@ type Layer struct {
 	MLP           *MLP
 }

-func (l *Layer) Forward(ctx ml.Context, hiddenState, positions, outputs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
+func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
 	residual := hiddenState

 	hiddenState = l.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
-	hiddenState = l.SelfAttention.Forward(ctx, hiddenState, positions, cache, opts)
+	hiddenState = l.SelfAttention.Forward(ctx, hiddenState, positionIDs, cache, opts)

 	// In the final layer (outputs != nil), optimize by pruning to just the token positions
 	// we need logits for.
@@ -142,19 +144,27 @@ func (l *Layer) Forward(ctx ml.Context, hiddenState, positions, outputs ml.Tenso
 }

 func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
-	positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
+	positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
+	if err != nil {
+		return nil, err
+	}
+
+	outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
+	if err != nil {
+		return nil, err
+	}

 	hiddenState := m.TokenEmbedding.Forward(ctx, batch.Inputs)

 	for i, layer := range m.Layers {
 		m.Cache.SetLayer(i)

-		var outputs ml.Tensor
+		var lastLayerOutputs ml.Tensor
 		if i == len(m.Layers)-1 {
-			outputs = ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
+			lastLayerOutputs = outputs
 		}

-		hiddenState = layer.Forward(ctx, hiddenState, positions, outputs, m.Cache, m.Options)
+		hiddenState = layer.Forward(ctx, hiddenState, positions, lastLayerOutputs, m.Cache, m.Options)
 	}

 	hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
--- a/model/models/llama4/model.go
+++ b/model/models/llama4/model.go
@@ -4,6 +4,7 @@ import (
 	"bytes"
 	"image"
 	"slices"
+	"sync"

 	"github.com/ollama/ollama/fs"
 	"github.com/ollama/ollama/kvcache"
@@ -40,13 +41,13 @@ func New(c fs.Config) (model.Model, error) {
 				Values: c.Strings("tokenizer.ggml.tokens"),
 				Types:  c.Ints("tokenizer.ggml.token_type"),
 				Merges: c.Strings("tokenizer.ggml.merges"),
+				BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id")),
 				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
-				BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
+				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
 				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
-				EOS: append(
-					[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
-					c.Ints("tokenizer.ggml.eos_token_ids")...,
-				),
+				// TODO: set EOT to EOS otherwise 0 will stop generation
+				EOT:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
+				AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
 			},
 		),
 		ImageProcessor: newImageProcessor(c),
@@ -62,7 +63,7 @@ func New(c fs.Config) (model.Model, error) {
 	return &m, nil
 }

-func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
+func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
 	if len(m.VisionModel.Layers) < 1 {
 		return nil, model.ErrNoVisionModel
 	}
@@ -77,7 +78,10 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input
 		return nil, err
 	}

-	tilesLocal := ctx.Input().FromFloatSlice(pixelsLocal, size.X, size.Y, m.numChannels)
+	tilesLocal, err := ctx.Input().FromFloatSlice(pixelsLocal, size.X, size.Y, m.numChannels)
+	if err != nil {
+		return nil, err
+	}

 	ratioW, ratioH := size.X/m.imageSize, size.Y/m.imageSize

@@ -88,86 +92,81 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input
 	pixelValues := tilesLocal

 	if len(pixelsGlobal) > 0 {
-		tilesGlobal := ctx.Input().FromFloatSlice(pixelsGlobal, m.imageSize, m.imageSize, m.numChannels)
+		tilesGlobal, err := ctx.Input().FromFloatSlice(pixelsGlobal, m.imageSize, m.imageSize, m.numChannels)
+		if err != nil {
+			return nil, err
+		}
+
 		pixelValues = pixelValues.Concat(ctx, tilesGlobal, 3)
 	}

 	visionOutputs := m.VisionModel.Forward(ctx, pixelValues)
 	visionOutputs = visionOutputs.Reshape(ctx, visionOutputs.Dim(0), visionOutputs.Dim(1)*visionOutputs.Dim(2)*visionOutputs.Dim(3))
 	projectedOutputs := m.Projector.Forward(ctx, visionOutputs)
-
-	var multimodal []input.Multimodal
-	aspectRatio := image.Point{ratioW, ratioH}
-
-	var offset int
-	patchesPerChunk := projectedOutputs.Dim(1)
-	if aspectRatio.Y*aspectRatio.X > 1 {
-		patchesPerChunk = projectedOutputs.Dim(1) / (aspectRatio.X*aspectRatio.Y + 1)
-
-		for range aspectRatio.Y {
-			for x := range aspectRatio.X {
-				view := projectedOutputs.View(ctx, projectedOutputs.Stride(1)*offset,
-					projectedOutputs.Dim(0), projectedOutputs.Stride(1),
-					patchesPerChunk)
-				var separator separator
-				if x < aspectRatio.X-1 {
-					separator.x = true // <|tile_x_separator|>
-				} else {
-					separator.y = true // <|tile_y_separator|>
-				}
-				multimodal = append(multimodal, input.Multimodal{Tensor: view, Data: &separator})
-				offset += patchesPerChunk
-			}
-		}
-	}
-
-	view := projectedOutputs.View(ctx, projectedOutputs.Stride(1)*offset,
-		projectedOutputs.Dim(0), projectedOutputs.Stride(1),
-		patchesPerChunk)
-	multimodal = append(multimodal, input.Multimodal{Tensor: view, Data: &separator{}})
-
-	return multimodal, nil
+	return &chunks{Model: m, Tensor: projectedOutputs, aspectRatio: image.Point{ratioW, ratioH}}, nil
 }

-type separator struct {
-	x bool
-	y bool
+type chunks struct {
+	*Model
+	ml.Tensor
+	aspectRatio image.Point
+
+	dataOnce sync.Once
+	data     []float32
+}
+
+type chunk struct {
+	*chunks
+	s, n int
+}
+
+func (r *chunk) floats() []float32 {
+	r.dataOnce.Do(func() {
+		temp := r.Backend().NewContext()
+		defer temp.Close()
+		temp.Forward(r.Tensor).Compute(r.Tensor)
+		r.data = r.Floats()
+	})
+
+	return r.data[r.s*r.Dim(0) : (r.s+r.n)*r.Dim(0)]
 }

 func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
 	var result []input.Input
 	for _, inp := range inputs {
-		if len(inp.Multimodal) == 0 {
+		if inp.Multimodal == nil {
 			result = append(result, inp)
 			continue
 		}

+		t := inp.Multimodal.(*chunks)
 		var imageInputs []input.Input
 		imageInputs = append(imageInputs, input.Input{Token: 200080}) // <|image_start|>

-		for i, mm := range inp.Multimodal {
-			patchesPerChunk := mm.Tensor.Dim(1)
+		var offset int
+		patchesPerChunk := t.Dim(1)
+		if t.aspectRatio.Y*t.aspectRatio.X > 1 {
+			patchesPerChunk = t.Dim(1) / (t.aspectRatio.X*t.aspectRatio.Y + 1)

-			if i < len(inp.Multimodal)-1 {
-				separator := mm.Data.(*separator)
-
-				imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: []input.Multimodal{{Tensor: mm.Tensor}}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|>
-				imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...)
-
-				if separator.x {
-					imageInputs = append(imageInputs, input.Input{Token: 200084}) // <|tile_x_separator|>
+			for range t.aspectRatio.Y {
+				for x := range t.aspectRatio.X {
+					imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: &chunk{t, offset, patchesPerChunk}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|>
+					imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...)
+					if x < t.aspectRatio.X-1 {
+						imageInputs = append(imageInputs, input.Input{Token: 200084}) // <|tile_x_separator|>
+					}
+					offset += patchesPerChunk
 				}
-				if separator.y {
-					imageInputs = append(imageInputs, input.Input{Token: 200085}) // <|tile_y_separator|>
-				}
-			} else {
-				imageInputs = append(imageInputs, input.Input{Token: 200090})                                                                                                                      // <|image|>
-				imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: []input.Multimodal{{Tensor: mm.Tensor}}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|>
-				imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...)
-				imageInputs = append(imageInputs, input.Input{Token: 200080}) // <|image_end|>
+
+				imageInputs = append(imageInputs, input.Input{Token: 200085}) // <|tile_y_separator|>
 			}
 		}

+		imageInputs = append(imageInputs, input.Input{Token: 200090})                                                                                                                 // <|image|>
+		imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: &chunk{t, offset, patchesPerChunk}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|>
+		imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...)
+		imageInputs = append(imageInputs, input.Input{Token: 200080}) // <|image_end|>
+
 		result = append(result, imageInputs...)
 	}

@@ -175,8 +174,15 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
 }

 func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
-	positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
-	outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
+	positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
+	if err != nil {
+		return nil, err
+	}
+
+	outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
+	if err != nil {
+		return nil, err
+	}

 	return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache), nil
 }
--- a/model/models/llama4/model_text.go
+++ b/model/models/llama4/model_text.go
@@ -8,8 +8,6 @@ import (
 	"github.com/ollama/ollama/kvcache"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/ml/nn"
-	"github.com/ollama/ollama/ml/nn/fast"
-	"github.com/ollama/ollama/ml/nn/rope"
 	"github.com/ollama/ollama/model/input"
 )

@@ -33,8 +31,8 @@ func (sa *TextAttention) Forward(ctx ml.Context, hiddenStates, positions, attent
 	value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)

 	if useRope {
-		query = fast.RoPE(ctx, query, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
-		key = fast.RoPE(ctx, key, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
+		query = query.RoPE(ctx, positions, sa.RopeFactors, uint32(opts.ropeDim), uint32(0), opts.ropeBase, opts.ropeScale)
+		key = key.RoPE(ctx, positions, sa.RopeFactors, uint32(opts.ropeDim), uint32(0), opts.ropeBase, opts.ropeScale)
 	}

 	if opts.useQKNorm {
@@ -82,7 +80,7 @@ func (e *TextExperts) Forward(ctx ml.Context, hiddenStates, routerLogits ml.Tens

 	nextStates := downStates.View(ctx, 0, hiddenStates.Dim(0), downStates.Stride(2), hiddenStates.Dim(2))
 	for i := 1; i < opts.numExpertsUsed; i++ {
-		nextStates = nextStates.Add(ctx, downStates.View(ctx, i*downStates.Stride(1), hiddenStates.Dim(0), downStates.Stride(2), hiddenStates.Dim(2)))
+		nextStates = nextStates.Add(ctx, downStates.View(ctx, i*downStates.Stride(1), hiddenStates.Dim(0), downStates.Stride(2), hiddenStates.Dim(2)))
 	}

 	return nextStates
@@ -212,7 +210,12 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
 	hiddenStates := m.TokenEmbedding.Forward(ctx, inputs).Duplicate(ctx)

 	for _, mi := range batch.Multimodal {
-		img := mi.Multimodal[0].Tensor
+		f32s := mi.Multimodal.(*chunk).floats()
+		img, err := ctx.Input().FromFloatSlice(f32s, len(f32s)/m.hiddenSize, m.hiddenSize)
+		if err != nil {
+			panic(err)
+		}
+
 		ctx.Forward(img.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), img.Dim(0)*img.Dim(1))))
 	}

@@ -223,7 +226,11 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
 			scales[i] = float32(math.Log(math.Floor(((float64(p)+1.0)/float64(m.attentionFloorScale))+1.0))*m.attentionScale + 1.0)
 		}

-		attentionScales = ctx.Input().FromFloatSlice(scales, 1, 1, len(scales))
+		var err error
+		attentionScales, err = ctx.Input().FromFloatSlice(scales, 1, 1, len(scales))
+		if err != nil {
+			panic(err)
+		}
 	}

 	for i, layer := range m.Layers {
@@ -248,5 +255,5 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
 }

 func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
-	return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale, rope.WithFactors(m.Layers[layer].Attention.RopeFactors)), nil
+	return key.RoPE(ctx, shift, m.Layers[layer].Attention.RopeFactors, uint32(m.ropeDim), uint32(0), m.ropeBase, m.ropeScale), nil
 }
--- a/model/models/llama4/model_vision.go
+++ b/model/models/llama4/model_vision.go
@@ -245,7 +245,10 @@ func (m *VisionModel) rotaryEmbedding(ctx ml.Context) (ml.Tensor, ml.Tensor) {
 		}
 	}

-	ropeFreqs := ctx.Input().FromFloatSlice(freqs, freqDim/2, numPatches, 2)
+	ropeFreqs, err := ctx.Input().FromFloatSlice(freqs, freqDim/2, numPatches, 2)
+	if err != nil {
+		panic(err)
+	}

 	ropeFreqs = ropeFreqs.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
 	ropeFreqs = ropeFreqs.Reshape(ctx, freqDim, 1, numPatches)
--- a/model/models/mistral3/model.go
+++ b/model/models/mistral3/model.go
@@ -4,6 +4,7 @@ import (
 	"bytes"
 	"image"
 	"slices"
+	"sync"

 	"github.com/ollama/ollama/fs"
 	"github.com/ollama/ollama/kvcache"
@@ -31,26 +32,31 @@ var _ model.MultimodalProcessor = (*Model)(nil)
 var _ model.TextProcessor = (*Model)(nil)

 func New(c fs.Config) (model.Model, error) {
+	textModel, err := NewTextModel(c)
+	if err != nil {
+		return nil, err
+	}
+
 	m := &Model{
+		TextModel:           textModel,
+		VisionModel:         newVisionModel(c),
+		ImageProcessor:      newImageProcessor(c),
+		MultiModalProjector: newMultiModalProjector(c),
 		BytePairEncoding: model.NewBytePairEncoding(
 			c.String("tokenizer.ggml.pretokenizer", `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
 			&model.Vocabulary{
 				Values: c.Strings("tokenizer.ggml.tokens"),
 				Types:  c.Ints("tokenizer.ggml.token_type"),
 				Merges: c.Strings("tokenizer.ggml.merges"),
+				BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id", 1)),
 				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
-				BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
+				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id", 2)),
 				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
-				EOS: append(
-					[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
-					c.Ints("tokenizer.ggml.eos_token_ids")...,
-				),
+				// TODO: set EOT to EOS otherwise 0 will stop generation
+				EOT:    int32(c.Uint("tokenizer.ggml.eos_token_id", 2)),
+				AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
 			},
 		),
-		TextModel:           newTextModel(c),
-		VisionModel:         newVisionModel(c),
-		ImageProcessor:      newImageProcessor(c),
-		MultiModalProjector: newMultiModalProjector(c),
 	}

 	m.Cache = kvcache.NewCausalCache(m.TextModel.Shift)
@@ -99,7 +105,7 @@ func newMultiModalProjector(c fs.Config) *MultiModalProjector {
 	}
 }

-func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
+func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
 	if len(m.VisionModel.Layers) == 0 {
 		return nil, model.ErrNoVisionModel
 	}
@@ -114,20 +120,46 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input
 		return nil, err
 	}

-	pixelValues := ctx.Input().FromFloatSlice(f32s, size.X, size.Y, m.ImageProcessor.numChannels)
+	pixelValues, err := ctx.Input().FromFloatSlice(f32s, size.X, size.Y, m.ImageProcessor.numChannels)
+	if err != nil {
+		return nil, err
+	}

 	visionOutputs := m.VisionModel.Forward(ctx, pixelValues)
 	features, size := m.MultiModalProjector.Forward(ctx, visionOutputs, size)

 	// split into patches to be sent to the text transformer
-	rows := make([]input.Multimodal, size.Y)
+	parent := imageFeatures{tensor: features}
+	rows := make([]*imageRow, size.Y)
 	for i := range rows {
-		rows[i].Tensor = features.View(ctx, features.Stride(1)*size.X*i, features.Dim(0), features.Stride(1), size.X)
+		rows[i] = &imageRow{parent: &parent, s: i, shape: []int{features.Dim(0), size.X}}
 	}

 	return rows, nil
 }

+type imageFeatures struct {
+	tensor ml.Tensor
+
+	dataOnce sync.Once
+	data     []float32
+}
+
+type imageRow struct {
+	parent *imageFeatures
+	s      int
+	shape  []int
+}
+
+func (r *imageRow) data() []float32 {
+	n := 1
+	for _, s := range r.shape {
+		n *= s
+	}
+
+	return r.parent.data[r.s*n : (r.s+1)*n]
+}
+
 // PostTokenize arranges Mistral 3's inputs for the forward pass
 // In Mistral 3 and Pixtral, the input patches are arranged as follows:
 // [IMG]...[IMG][IMG_BREAK][IMG]...[IMG][IMG_BREAK][IMG]...[IMG][IMG_END]
@@ -136,14 +168,15 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input
 func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
 	var result []input.Input
 	for _, inp := range inputs {
-		if len(inp.Multimodal) == 0 {
+		if inp.Multimodal == nil {
 			result = append(result, inp)
 		} else {
-			for i, row := range inp.Multimodal {
+			inputMultimodal := inp.Multimodal.([]*imageRow)
+			for i, row := range inputMultimodal {
 				// [IMG]
-				result = append(result, input.Input{Token: 10, Multimodal: []input.Multimodal{{Tensor: row.Tensor}}, MultimodalHash: inp.MultimodalHash, SameBatch: row.Tensor.Dim(1)})
-				result = append(result, slices.Repeat([]input.Input{{Token: 10}}, row.Tensor.Dim(1)-1)...)
-				if i == len(inp.Multimodal)-1 {
+				result = append(result, input.Input{Token: 10, Multimodal: row, MultimodalHash: inp.MultimodalHash, SameBatch: row.shape[1]})
+				result = append(result, slices.Repeat([]input.Input{{Token: 10}}, row.shape[1]-1)...)
+				if i == len(inputMultimodal)-1 {
 					// [IMG_END]
 					result = append(result, input.Input{Token: 13})
 				} else {
@@ -158,8 +191,15 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
 }

 func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
-	positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
-	outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
+	positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
+	if err != nil {
+		return nil, err
+	}
+
+	outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
+	if err != nil {
+		return nil, err
+	}

 	return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache), nil
 }
--- a/model/models/mistral3/model_text.go
+++ b/model/models/mistral3/model_text.go
@@ -1,24 +1,27 @@
 package mistral3

 import (
-	"cmp"
+	"fmt"
 	"math"
+	"strings"

 	"github.com/ollama/ollama/fs"
 	"github.com/ollama/ollama/kvcache"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/ml/nn"
-	"github.com/ollama/ollama/ml/nn/fast"
+	"github.com/ollama/ollama/model"
 	"github.com/ollama/ollama/model/input"
 )

 type TextOptions struct {
-	hiddenSize, numHeads, numKVHeads int
-	headDim, ropeDim                 int
-	eps, ropeBase, ropeScale         float32
+	hiddenSize, numHeads, numKVHeads, headDim int
+	eps, ropeBase, ropeScale                  float32
+	ropeDim                                   uint32
 }

 type TextModel struct {
+	model.Base
+
 	TokenEmbedding *nn.Embedding `gguf:"token_embd"`
 	Layers         []Layer       `gguf:"blk"`
 	OutputNorm     *nn.RMSNorm   `gguf:"output_norm"`
@@ -36,15 +39,19 @@ type SelfAttention struct {

 func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
 	batchSize := hiddenState.Dim(1)
-	headDim := cmp.Or(opts.headDim, opts.hiddenSize/opts.numHeads)
+	ropeType := uint32(0)
+	headDim := opts.headDim
+	if headDim == 0 {
+		headDim = opts.hiddenSize / opts.numHeads
+	}

 	q := sa.Query.Forward(ctx, hiddenState)
 	q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
-	q = fast.RoPE(ctx, q, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale)
+	q = q.RoPE(ctx, positionIDs, nil, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)

 	k := sa.Key.Forward(ctx, hiddenState)
 	k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
-	k = fast.RoPE(ctx, k, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale)
+	k = k.RoPE(ctx, positionIDs, nil, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)

 	v := sa.Value.Forward(ctx, hiddenState)
 	v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
@@ -55,7 +62,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
 }

 func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
-	return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale), nil
+	return key.RoPE(ctx, shift, nil, m.ropeDim, uint32(0), m.ropeBase, m.ropeScale), nil
 }

 type MLP struct {
@@ -102,7 +109,20 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor

 	// image embeddings
 	for _, image := range batch.Multimodal {
-		imageFeature := image.Multimodal[0].Tensor
+		row := image.Multimodal.(*imageRow)
+		row.parent.dataOnce.Do(func() {
+			// use a new, throwaway context so the image tensor is not added to the graph
+			temp := m.Backend().NewContext()
+			temp.Forward(row.parent.tensor).Compute(row.parent.tensor)
+			row.parent.data = row.parent.tensor.Floats()
+			temp.Close()
+		})
+
+		imageFeature, err := ctx.Input().FromFloatSlice(row.data(), row.shape...)
+		if err != nil {
+			panic(err)
+		}
+
 		ctx.Forward(imageFeature.Copy(ctx, hiddenState.View(ctx, image.Index*hiddenState.Stride(1), imageFeature.Dim(0)*imageFeature.Dim(1))))
 	}

@@ -121,18 +141,24 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
 	return m.Output.Forward(ctx, hiddenState)
 }

-func newTextModel(c fs.Config) *TextModel {
-	return &TextModel{
+func NewTextModel(c fs.Config) (*TextModel, error) {
+	if !strings.EqualFold(c.String("tokenizer.ggml.model"), "gpt2") {
+		return nil, fmt.Errorf("tokenizer %s not yet supported", c.String("tokenizer.ggml.model"))
+	}
+
+	textModel := &TextModel{
 		Layers: make([]Layer, c.Uint("block_count")),
 		TextOptions: &TextOptions{
 			hiddenSize: int(c.Uint("embedding_length")),
 			numHeads:   int(c.Uint("attention.head_count")),
 			numKVHeads: int(c.Uint("attention.head_count_kv")),
 			headDim:    int(c.Uint("attention.key_length")),
-			ropeDim:    int(c.Uint("rope.dimension_count")),
 			eps:        c.Float("attention.layer_norm_rms_epsilon"),
 			ropeBase:   c.Float("rope.freq_base"),
 			ropeScale:  c.Float("rope.freq_scale", 1),
+			ropeDim:    c.Uint("rope.dimension_count"),
 		},
 	}
+
+	return textModel, nil
 }
--- a/model/models/mistral3/model_vision.go
+++ b/model/models/mistral3/model_vision.go
@@ -110,8 +110,15 @@ func (m *VisionModel) positionalEmbedding(ctx ml.Context, positionIDs ml.Tensor)
 		}
 	}

-	h := ctx.Input().FromFloatSlice(frequenciesHeight, maxPatchesPerSide, frequencies/2)
-	w := ctx.Input().FromFloatSlice(frequenciesWidth, maxPatchesPerSide, frequencies/2)
+	h, err := ctx.Input().FromFloatSlice(frequenciesHeight, maxPatchesPerSide, frequencies/2)
+	if err != nil {
+		panic(err)
+	}
+
+	w, err := ctx.Input().FromFloatSlice(frequenciesWidth, maxPatchesPerSide, frequencies/2)
+	if err != nil {
+		panic(err)
+	}

 	h = h.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
 	w = w.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
@@ -144,7 +151,10 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor {
 		}
 	}

-	positionIDs := ctx.Input().FromIntSlice(positions, len(positions))
+	positionIDs, err := ctx.Input().FromIntSlice(positions, len(positions))
+	if err != nil {
+		panic(err)
+	}

 	positionEmbedding := m.positionalEmbedding(ctx, positionIDs)
 	cos, sin := positionEmbedding.Cos(ctx), positionEmbedding.Sin(ctx)
@@ -160,7 +170,7 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor {

 func newVisionModel(c fs.Config) *VisionModel {
 	return &VisionModel{
-		Layers: make([]VisionEncoderLayer, c.Uint("vision.block_count")),
+		Layers: make([]VisionEncoderLayer, c.Uint("vision.block_count", 24)),
 		VisionModelOptions: &VisionModelOptions{
 			hiddenSize:       int(c.Uint("vision.embedding_length", 1024)),
 			numHeads:         int(c.Uint("vision.attention.head_count", 16)),
--- a/model/models/mllama/model.go
+++ b/model/models/mllama/model.go
@@ -3,7 +3,6 @@ package mllama
 import (
 	"bytes"
 	"image"
-	"slices"

 	"github.com/ollama/ollama/fs"
 	"github.com/ollama/ollama/kvcache"
@@ -38,13 +37,13 @@ func New(c fs.Config) (model.Model, error) {
 				Values: c.Strings("tokenizer.ggml.tokens"),
 				Types:  c.Ints("tokenizer.ggml.token_type"),
 				Merges: c.Strings("tokenizer.ggml.merges"),
+				BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id")),
 				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
-				BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
+				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
 				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
-				EOS: append(
-					[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
-					c.Ints("tokenizer.ggml.eos_token_ids")...,
-				),
+				// TODO: set EOT to EOS otherwise 0 will stop generation
+				EOT:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
+				AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
 			},
 		),
 		ImageProcessor: newImageProcessor(c),
@@ -59,7 +58,7 @@ func New(c fs.Config) (model.Model, error) {
 	return &m, nil
 }

-func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
+func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
 	if len(m.VisionModel.Transformer.Layers) == 0 || len(m.GlobalTransformer.Layers) == 0 {
 		return nil, model.ErrNoVisionModel
 	}
@@ -74,20 +73,21 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input
 		return nil, err
 	}

-	if ratio.numTiles() < m.maxNumTiles {
-		// Pad tiles to maxNumTiles
-		f32s = slices.Grow(f32s, m.imageSize*m.imageSize*m.numChannels*m.maxNumTiles)
-		f32s = f32s[:m.imageSize*m.imageSize*m.numChannels*m.maxNumTiles]
+	pixelValues, err := ctx.Input().FromFloatSlice(f32s, m.imageSize, m.imageSize, m.numChannels, ratio.numTiles())
+	if err != nil {
+		return nil, err
 	}

-	pixelValues := ctx.Input().FromFloatSlice(f32s, m.imageSize, m.imageSize, m.numChannels, m.maxNumTiles)
-	aspectRatio := ctx.Input().FromIntSlice([]int32{int32(ratio.rank)}, 1)
+	pixelValues = pixelValues.Pad(ctx, 0, 0, 0, m.ImageProcessor.maxNumTiles-ratio.numTiles())
+
+	aspectRatio, err := ctx.Input().FromIntSlice([]int32{int32(ratio.rank)}, 1)
+	if err != nil {
+		return nil, err
+	}

 	positionIDs := ctx.Arange(0, 1601, 1, ml.DTypeI32)
 	crossAttentionStates := m.VisionModel.Forward(ctx, pixelValues, positionIDs, aspectRatio)
-	projectedOutputs := m.Projector.Forward(ctx, crossAttentionStates)
-
-	return []input.Multimodal{{Tensor: projectedOutputs}}, nil
+	return m.Projector.Forward(ctx, crossAttentionStates), nil
 }

 func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
@@ -103,11 +103,18 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
 func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
 	var crossAttentionStates ml.Tensor
 	if len(batch.Multimodal) > 0 {
-		crossAttentionStates = batch.Multimodal[len(batch.Multimodal)-1].Multimodal[0].Tensor
+		crossAttentionStates = batch.Multimodal[len(batch.Multimodal)-1].Multimodal.(ml.Tensor)
 	}

-	positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
-	outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
+	positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
+	if err != nil {
+		return nil, err
+	}
+
+	outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
+	if err != nil {
+		return nil, err
+	}

 	// TODO: attention mask, cross attention mask
 	return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, crossAttentionStates, nil, m.Cache.(*kvcache.WrapperCache)), nil
--- a/model/models/mllama/model_text.go
+++ b/model/models/mllama/model_text.go
@@ -8,8 +8,6 @@ import (
 	"github.com/ollama/ollama/kvcache"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/ml/nn"
-	"github.com/ollama/ollama/ml/nn/fast"
-	"github.com/ollama/ollama/ml/nn/rope"
 )

 type TextSelfAttention struct {
@@ -23,14 +21,15 @@ type TextSelfAttention struct {
 func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
 	batchSize := hiddenState.Dim(1)
 	headDim := opts.hiddenSize / opts.numHeads
+	ropeType := uint32(0)

 	query := sa.Query.Forward(ctx, hiddenState)
 	query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
-	query = fast.RoPE(ctx, query, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
+	query = query.RoPE(ctx, positions, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)

 	key := sa.Key.Forward(ctx, hiddenState)
 	key = key.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
-	key = fast.RoPE(ctx, key, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
+	key = key.RoPE(ctx, positions, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)

 	value := sa.Value.Forward(ctx, hiddenState)
 	value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
@@ -45,7 +44,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.T
 func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
 	// This will only get called for layers in the cache, which are just the self attention layers
 	if sa, ok := m.Transformer.Layers[layer].(*TextSelfAttentionDecoderLayer); ok {
-		return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale, rope.WithFactors(sa.SelfAttention.RopeFactors)), nil
+		return key.RoPE(ctx, shift, sa.SelfAttention.RopeFactors, m.ropeDim, uint32(0), m.ropeBase, m.ropeScale), nil
 	}

 	return key, nil
@@ -200,8 +199,8 @@ func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs,

 type TextModelOptions struct {
 	hiddenSize, numHeads, numKVHeads int
-	ropeDim                          int
 	eps, ropeBase, ropeScale         float32
+	ropeDim                          uint32

 	crossAttentionLayers []int32
 }
@@ -241,10 +240,10 @@ func newTextModel(c fs.Config) *TextModel {
 			hiddenSize:           int(c.Uint("embedding_length")),
 			numHeads:             int(c.Uint("attention.head_count")),
 			numKVHeads:           int(c.Uint("attention.head_count_kv")),
-			ropeDim:              int(c.Uint("rope.dimension_count")),
 			eps:                  c.Float("attention.layer_norm_rms_epsilon"),
 			ropeBase:             c.Float("rope.freq_base"),
 			ropeScale:            c.Float("rope.freq_scale", 1),
+			ropeDim:              c.Uint("rope.dimension_count"),
 			crossAttentionLayers: c.Ints("attention.cross_attention_layers"),
 		},
 	}
--- a/model/models/mllama/model_vision.go
+++ b/model/models/mllama/model_vision.go
@@ -16,6 +16,8 @@ type VisionSelfAttention struct {
 	Key    *nn.Linear `gguf:"attn_k"`
 	Value  *nn.Linear `gguf:"attn_v"`
 	Output *nn.Linear `gguf:"attn_output"`
+
+	Gate ml.Tensor `gguf:"attn_gate"`
 }

 func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
@@ -23,16 +25,27 @@ func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState ml.Tensor, op

 	query := sa.Query.Forward(ctx, hiddenState)
 	query = query.Reshape(ctx, headDim, opts.numHeads, query.Dim(1), batchSize)
+	query = query.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)

 	key := sa.Key.Forward(ctx, hiddenState)
 	key = key.Reshape(ctx, headDim, opts.numHeads, key.Dim(1), batchSize)
+	key = key.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)

 	value := sa.Value.Forward(ctx, hiddenState)
 	value = value.Reshape(ctx, headDim, opts.numHeads, value.Dim(1), batchSize)
+	value = value.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)

-	attention := nn.Attention(ctx, query, key, value, 1./math.Sqrt(float64(headDim)), nil)
+	scores := key.Mulmat(ctx, query)
+	scores = scores.Scale(ctx, 1.0/math.Sqrt(float64(headDim)))
+	scores = scores.Softmax(ctx)
+
+	attention := value.Mulmat(ctx, scores)
+	attention = attention.Reshape(ctx, headDim, attention.Dim(1), opts.numHeads, batchSize)
+	attention = attention.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
 	attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), batchSize)
-	return sa.Output.Forward(ctx, attention)
+
+	hiddenState = sa.Output.Forward(ctx, attention)
+	return hiddenState
 }

 type VisionMLP struct {
@@ -63,18 +76,21 @@ func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState ml.Tensor, opts
 	// self attention
 	hiddenState = e.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
 	hiddenState = e.SelfAttention.Forward(ctx, hiddenState, opts)
+
 	if e.AttentionGate != nil {
 		hiddenState = hiddenState.Mul(ctx, e.AttentionGate)
 	}
 	hiddenState = hiddenState.Add(ctx, residual)
 	residual = hiddenState

+	// feed forward
 	hiddenState = e.MLPNorm.Forward(ctx, hiddenState, opts.eps)
 	hiddenState = e.MLP.Forward(ctx, hiddenState, opts)
+	hiddenState = hiddenState.Add(ctx, residual)
 	if e.MLPGate != nil {
 		hiddenState = hiddenState.Mul(ctx, e.MLPGate)
 	}
-	hiddenState = hiddenState.Add(ctx, residual)
+
 	return hiddenState
 }

--- a/model/models/models.go
+++ b/model/models/models.go
@@ -7,7 +7,5 @@ import (
 	_ "github.com/ollama/ollama/model/models/llama4"
 	_ "github.com/ollama/ollama/model/models/mistral3"
 	_ "github.com/ollama/ollama/model/models/mllama"
-	_ "github.com/ollama/ollama/model/models/qwen2"
 	_ "github.com/ollama/ollama/model/models/qwen25vl"
-	_ "github.com/ollama/ollama/model/models/qwen3"
 )
--- a/model/models/qwen2/model.go
+++ b/model/models/qwen2/model.go
@@ -1,164 +0,0 @@
-package qwen2
-
-import (
-	"cmp"
-	"math"
-
-	"github.com/ollama/ollama/fs"
-	"github.com/ollama/ollama/kvcache"
-	"github.com/ollama/ollama/ml"
-	"github.com/ollama/ollama/ml/nn"
-	"github.com/ollama/ollama/ml/nn/fast"
-	"github.com/ollama/ollama/ml/nn/rope"
-	"github.com/ollama/ollama/model"
-	"github.com/ollama/ollama/model/input"
-)
-
-type Options struct {
-	hiddenSize, numHeads, numKVHeads int
-	headDim, ropeDim                 int
-	eps, ropeBase, ropeScale         float32
-}
-
-type Attention struct {
-	Query  *nn.Linear `gguf:"attn_q"`
-	Key    *nn.Linear `gguf:"attn_k"`
-	Value  *nn.Linear `gguf:"attn_v"`
-	Output *nn.Linear `gguf:"attn_output"`
-}
-
-func (attn Attention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
-	batchSize := hiddenStates.Dim(1)
-	headDim := cmp.Or(opts.headDim, opts.hiddenSize/opts.numHeads)
-	ropeDim := cmp.Or(opts.ropeDim, headDim)
-
-	query := attn.Query.Forward(ctx, hiddenStates)
-	query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
-
-	key := attn.Key.Forward(ctx, hiddenStates)
-	key = key.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
-
-	value := attn.Value.Forward(ctx, hiddenStates)
-	value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
-
-	query = fast.RoPE(ctx, query, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
-	key = fast.RoPE(ctx, key, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
-
-	attention := nn.Attention(ctx, query, key, value, 1.0/math.Sqrt(float64(headDim)), cache)
-	attention = attention.Reshape(ctx, headDim*opts.numHeads, batchSize)
-
-	return attn.Output.Forward(ctx, attention)
-}
-
-type MLP struct {
-	Gate *nn.Linear `gguf:"ffn_gate"`
-	Up   *nn.Linear `gguf:"ffn_up"`
-	Down *nn.Linear `gguf:"ffn_down"`
-}
-
-func (mlp MLP) Forward(ctx ml.Context, hiddenStates ml.Tensor) ml.Tensor {
-	hiddenStates = mlp.Gate.Forward(ctx, hiddenStates).SILU(ctx).Mul(ctx, mlp.Up.Forward(ctx, hiddenStates))
-	return mlp.Down.Forward(ctx, hiddenStates)
-}
-
-type DecoderLayer struct {
-	AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
-	Attention     *Attention
-	MLPNorm       *nn.RMSNorm `gguf:"ffn_norm"`
-	MLP           *MLP
-}
-
-func (d DecoderLayer) Forward(ctx ml.Context, hiddenStates, positions, outputs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
-	residual := hiddenStates
-
-	hiddenStates = d.AttentionNorm.Forward(ctx, hiddenStates, opts.eps)
-	hiddenStates = d.Attention.Forward(ctx, hiddenStates, positions, cache, opts)
-	if outputs != nil {
-		hiddenStates = hiddenStates.Rows(ctx, outputs)
-		residual = residual.Rows(ctx, outputs)
-	}
-
-	hiddenStates = hiddenStates.Add(ctx, residual)
-	residual = hiddenStates
-
-	hiddenStates = d.MLPNorm.Forward(ctx, hiddenStates, opts.eps)
-	hiddenStates = d.MLP.Forward(ctx, hiddenStates)
-	return hiddenStates.Add(ctx, residual)
-}
-
-type Model struct {
-	model.Base
-	model.BytePairEncoding
-
-	TokenEmbedding *nn.Embedding  `gguf:"token_embd"`
-	Layers         []DecoderLayer `gguf:"blk"`
-	OutputNorm     *nn.RMSNorm    `gguf:"output_norm"`
-	Output         *nn.Linear     `gguf:"output,alt:token_embd"`
-
-	Options
-}
-
-// Forward implements model.Model.
-func (m Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
-	positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
-
-	hiddenStates := m.TokenEmbedding.Forward(ctx, batch.Inputs)
-
-	for i, layer := range m.Layers {
-		m.Cache.SetLayer(i)
-
-		var outputs ml.Tensor
-		if i == len(m.Layers)-1 {
-			outputs = ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
-		}
-
-		hiddenStates = layer.Forward(ctx, hiddenStates, positions, outputs, m.Cache, &m.Options)
-	}
-
-	hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, m.eps)
-	hiddenStates = m.Output.Forward(ctx, hiddenStates)
-	return hiddenStates, nil
-}
-
-func (m Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
-	ropeDim := cmp.Or(m.ropeDim, m.hiddenSize/m.numHeads)
-	return fast.RoPE(ctx, key, shift, ropeDim, m.ropeBase, m.ropeScale, rope.WithTypeNeoX()), nil
-}
-
-func New(c fs.Config) (model.Model, error) {
-	m := Model{
-		Layers: make([]DecoderLayer, c.Uint("block_count")),
-		BytePairEncoding: model.NewBytePairEncoding(
-			c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
-			&model.Vocabulary{
-				Values: c.Strings("tokenizer.ggml.tokens"),
-				Types:  c.Ints("tokenizer.ggml.token_type"),
-				Merges: c.Strings("tokenizer.ggml.merges"),
-				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
-				BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
-				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
-				EOS: append(
-					[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
-					c.Ints("tokenizer.ggml.eos_token_ids")...,
-				),
-			},
-		),
-		Options: Options{
-			hiddenSize: int(c.Uint("embedding_length")),
-			numHeads:   int(c.Uint("attention.head_count")),
-			numKVHeads: int(c.Uint("attention.head_count_kv")),
-			headDim:    int(c.Uint("attention.key_length")),
-			ropeDim:    int(c.Uint("rope.dimension_count")),
-			ropeBase:   c.Float("rope.freq_base"),
-			ropeScale:  c.Float("rope.freq_scale", 1),
-			eps:        c.Float("attention.layer_norm_rms_epsilon"),
-		},
-	}
-
-	m.Cache = kvcache.NewCausalCache(m.Shift)
-	return &m, nil
-}
-
-func init() {
-	model.Register("qwen2", New)
-}
--- a/model/models/qwen25vl/model.go
+++ b/model/models/qwen25vl/model.go
@@ -5,6 +5,7 @@ import (
 	"fmt"
 	"image"
 	"slices"
+	"sync"

 	"github.com/ollama/ollama/fs"
 	"github.com/ollama/ollama/kvcache"
@@ -34,13 +35,12 @@ func New(c fs.Config) (model.Model, error) {
 				Values: c.Strings("tokenizer.ggml.tokens"),
 				Types:  c.Ints("tokenizer.ggml.token_type"),
 				Merges: c.Strings("tokenizer.ggml.merges"),
-				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
-				BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
+				BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id")),
+				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", false),
+				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
 				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
-				EOS: append(
-					[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
-					c.Ints("tokenizer.ggml.eos_token_ids")...,
-				),
+				EOT:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
+				AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
 			},
 		),
 		TextModel:      NewTextModel(c),
@@ -69,12 +69,15 @@ func (m *Model) PixelValues(ctx ml.Context, multimodalData []byte) (ml.Tensor, *
 		m.ImageProcessor.patchSize * m.ImageProcessor.patchSize
 	numPatches := grid.Temporal * grid.Height * grid.Width

-	pixelValues := ctx.Input().FromFloatSlice(f32s, patchDim, numPatches)
+	pixelValues, err := ctx.Input().FromFloatSlice(f32s, patchDim, numPatches)
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to create tensor from image: %w", err)
+	}

 	return pixelValues, grid, nil
 }

-func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
+func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
 	if len(m.VisionModel.Layers) == 0 {
 		return nil, model.ErrNoVisionModel
 	}
@@ -85,7 +88,31 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input
 	}

 	visionOutputs := m.VisionModel.Forward(ctx, pixels, grid)
-	return []input.Multimodal{{Tensor: visionOutputs}}, nil
+	return &chunks{Model: m, Tensor: visionOutputs}, nil
+}
+
+type chunks struct {
+	*Model
+	ml.Tensor
+
+	dataOnce sync.Once
+	data     []float32
+}
+
+type chunk struct {
+	*chunks
+	s, n int
+}
+
+func (r *chunk) floats() []float32 {
+	r.dataOnce.Do(func() {
+		temp := r.Backend().NewContext()
+		defer temp.Close()
+		temp.Forward(r.Tensor).Compute(r.Tensor)
+		r.data = r.Floats()
+	})
+
+	return r.data[r.s*r.Dim(0) : (r.s+r.n)*r.Dim(0)]
 }

 // PostTokenize arranges Qwen-2.5-VL's inputs for the forward pass
@@ -115,15 +142,18 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
 				result = append(result, input.Input{Token: pre[i]})
 			}

-			patchesPerChunk := inp.Multimodal[0].Tensor.Dim(1)
+			// This is an image token with multimodal data
+			chunksData := inp.Multimodal.(*chunks)
+			patchesPerChunk := chunksData.Dim(1)

 			// First add the vision start token
-			result = append(result, input.Input{Token: visionStartToken})
+			result = append(result, input.Input{Token: visionStartToken, SameBatch: patchesPerChunk + 2})

 			// Add the image token with the multimodal tensor data at the first position
+			// Create a chunk with proper s and n values
 			result = append(result, input.Input{
 				Token:          imageToken,
-				Multimodal:     inp.Multimodal,
+				Multimodal:     &chunk{chunks: chunksData, s: 0, n: patchesPerChunk},
 				MultimodalHash: inp.MultimodalHash,
 				SameBatch:      patchesPerChunk,
 			})
@@ -139,8 +169,15 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
 }

 func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
-	positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
-	outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
+	positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
+	if err != nil {
+		return nil, err
+	}
+
+	outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
+	if err != nil {
+		return nil, err
+	}

 	return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache)
 }
--- a/model/models/qwen25vl/model_text.go
+++ b/model/models/qwen25vl/model_text.go
@@ -7,15 +7,13 @@ import (
 	"github.com/ollama/ollama/kvcache"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/ml/nn"
-	"github.com/ollama/ollama/ml/nn/fast"
-	"github.com/ollama/ollama/ml/nn/rope"
 	"github.com/ollama/ollama/model/input"
 )

 type TextOptions struct {
-	hiddenSize, numHeads, numKVHeads int
-	ropeDim, originalContextLength   int
-	eps, ropeBase, ropeScale         float32
+	ctxLen, hiddenSize, numHeads, numKVHeads int
+	eps, ropeBase, ropeScale                 float32
+	ropeDim, defaultContextLen               uint32
 }

 type TextModel struct {
@@ -31,14 +29,15 @@ func NewTextModel(c fs.Config) *TextModel {
 	m := TextModel{
 		Layers: make([]Layer, c.Uint("block_count")),
 		TextOptions: &TextOptions{
-			hiddenSize:            int(c.Uint("embedding_length")),
-			numHeads:              int(c.Uint("attention.head_count")),
-			numKVHeads:            int(c.Uint("attention.head_count_kv")),
-			ropeDim:               int(c.Uint("rope.dimension_count", 128)),
-			originalContextLength: int(c.Uint("context_length", 128000)),
-			eps:                   c.Float("attention.layer_norm_rms_epsilon"),
-			ropeBase:              c.Float("rope.freq_base"),
-			ropeScale:             c.Float("rope.freq_scale", 1),
+			ctxLen:            int(c.Uint("context_length")),
+			hiddenSize:        int(c.Uint("embedding_length")),
+			numHeads:          int(c.Uint("attention.head_count")),
+			numKVHeads:        int(c.Uint("attention.head_count_kv")),
+			eps:               c.Float("attention.layer_norm_rms_epsilon"),
+			ropeBase:          c.Float("rope.freq_base"),
+			ropeScale:         c.Float("rope.freq_scale", 1),
+			ropeDim:           c.Uint("rope.dimension_count", 128),
+			defaultContextLen: c.Uint("context_length", 128000),
 		},
 	}

@@ -60,11 +59,11 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten

 	q := sa.Query.Forward(ctx, hiddenState)
 	q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
-	q = fast.RoPE(ctx, q, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithOriginalContextLength(opts.originalContextLength), rope.WithTypeNeoX())
+	q = q.RoPE(ctx, positionIDs, nil, opts.ropeDim, 2, opts.ropeBase, opts.ropeScale, ml.WithContextLen(opts.defaultContextLen))

 	k := sa.Key.Forward(ctx, hiddenState)
 	k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
-	k = fast.RoPE(ctx, k, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithOriginalContextLength(opts.originalContextLength), rope.WithTypeNeoX())
+	k = k.RoPE(ctx, positionIDs, nil, opts.ropeDim, 2, opts.ropeBase, opts.ropeScale, ml.WithContextLen(opts.defaultContextLen))

 	v := sa.Value.Forward(ctx, hiddenState)
 	v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
@@ -78,7 +77,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten

 // Shift applies rotary position embeddings to the key tensor for causal attention caching
 func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
-	return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale, rope.WithOriginalContextLength(m.originalContextLength), rope.WithTypeNeoX()), nil
+	return key.RoPE(ctx, shift, nil, m.ropeDim, 2, m.ropeBase, m.ropeScale, ml.WithContextLen(m.defaultContextLen)), nil
 }

 // MLP implements the feed-forward network component with SwiGLU activation
@@ -130,7 +129,12 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
 	hiddenStates := m.TokenEmbedding.Forward(ctx, inputs).Duplicate(ctx)

 	for _, mi := range batch.Multimodal {
-		img := mi.Multimodal[0].Tensor
+		f32s := mi.Multimodal.(*chunk).floats()
+		img, err := ctx.Input().FromFloatSlice(f32s, len(f32s)/m.hiddenSize, m.hiddenSize)
+		if err != nil {
+			panic(err)
+		}
+
 		ctx.Forward(img.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), img.Dim(0)*img.Dim(1))))
 	}

--- a/model/models/qwen25vl/model_vision.go
+++ b/model/models/qwen25vl/model_vision.go
@@ -1,6 +1,7 @@
 package qwen25vl

 import (
+	"fmt"
 	"math"
 	"slices"

@@ -43,8 +44,10 @@ func blockDiagonalMask(ctx ml.Context, seqLength int, bounds []int, numHeads int
 		}
 	}

-	mask := ctx.Input().FromFloatSlice(flat, seqLength, seqLength)
-
+	mask, err := ctx.Input().FromFloatSlice(flat, seqLength, seqLength)
+	if err != nil {
+		panic(err)
+	}
 	// Reshape to match [seqLength, seqLength, 1] for broadcasting
 	mask = mask.Reshape(ctx, seqLength, seqLength, 1)

@@ -300,7 +303,10 @@ func (m *VisionModel) WindowIndex(ctx ml.Context, grid *Grid) (ml.Tensor, []int)
 		}
 	}

-	t := ctx.Input().FromIntSlice(index, len(index))
+	t, err := ctx.Input().FromIntSlice(index, len(index))
+	if err != nil {
+		panic(err)
+	}

 	return t, bounds
 }
@@ -320,7 +326,10 @@ func (m *VisionModel) PositionalEmbedding(ctx ml.Context, grid *Grid) ml.Tensor
 			freqVals[i*freq+j] = float32(i) / float32(math.Pow(theta, float64(j*2)/float64(dim)))
 		}
 	}
-	freqs := ctx.Input().FromFloatSlice(freqVals, freq, maxGridSize)
+	freqs, err := ctx.Input().FromFloatSlice(freqVals, freq, maxGridSize)
+	if err != nil {
+		panic(fmt.Errorf("failed to create tensor from frequencies: %w", err))
+	}

 	// Create position coordinates (y,x pairs) for the grid
 	// In PyTorch: Equivalent to generating position ids with torch.arange()
@@ -330,7 +339,10 @@ func (m *VisionModel) PositionalEmbedding(ctx ml.Context, grid *Grid) ml.Tensor
 			coords = append(coords, int32(y), int32(x))
 		}
 	}
-	pos := ctx.Input().FromIntSlice(coords, 2, grid.Width, grid.Height)
+	pos, err := ctx.Input().FromIntSlice(coords, 2, grid.Width, grid.Height)
+	if err != nil {
+		panic(fmt.Errorf("failed to create tensor from positions: %w", err))
+	}

 	// Reshape and permute positions to match spatial merging pattern
 	pos = pos.Reshape(ctx, 2, grid.Width, merge, grid.Height/merge)
--- a/model/models/qwen3/model.go
+++ b/model/models/qwen3/model.go
@@ -1,233 +0,0 @@
-package qwen3
-
-import (
-	"cmp"
-	"math"
-
-	"github.com/ollama/ollama/fs"
-	"github.com/ollama/ollama/kvcache"
-	"github.com/ollama/ollama/ml"
-	"github.com/ollama/ollama/ml/nn"
-	"github.com/ollama/ollama/ml/nn/fast"
-	"github.com/ollama/ollama/ml/nn/rope"
-	"github.com/ollama/ollama/model"
-	"github.com/ollama/ollama/model/input"
-)
-
-type Options struct {
-	hiddenSize, numHeads, numKVHeads int
-	eps                              float32
-	ropeBase, ropeScale              float32
-
-	keyLength, valueLength int
-
-	numExperts, numExpertsUsed int
-	normTopKProb               bool
-}
-
-func (o Options) headDim() int {
-	return cmp.Or(o.keyLength, o.valueLength, o.hiddenSize/o.numHeads)
-}
-
-type Attention struct {
-	QueryNorm *nn.RMSNorm `gguf:"attn_q_norm"`
-	Query     *nn.Linear  `gguf:"attn_q"`
-	KeyNorm   *nn.RMSNorm `gguf:"attn_k_norm"`
-	Key       *nn.Linear  `gguf:"attn_k"`
-	Value     *nn.Linear  `gguf:"attn_v"`
-	Output    *nn.Linear  `gguf:"attn_output"`
-}
-
-func (sa *Attention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
-	batchSize := hiddenStates.Dim(1)
-
-	query := sa.Query.Forward(ctx, hiddenStates)
-	key := sa.Key.Forward(ctx, hiddenStates)
-	value := sa.Value.Forward(ctx, hiddenStates)
-
-	query = query.Reshape(ctx, opts.headDim(), opts.numHeads, batchSize)
-	key = key.Reshape(ctx, opts.headDim(), opts.numKVHeads, batchSize)
-	value = value.Reshape(ctx, opts.headDim(), opts.numKVHeads, batchSize)
-
-	query = sa.QueryNorm.Forward(ctx, query, opts.eps)
-	key = sa.KeyNorm.Forward(ctx, key, opts.eps)
-
-	query = fast.RoPE(ctx, query, positions, opts.headDim(), opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
-	key = fast.RoPE(ctx, key, positions, opts.headDim(), opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
-
-	attention := nn.Attention(ctx, query, key, value, 1./math.Sqrt(float64(opts.headDim())), cache)
-	attention = attention.Reshape(ctx, attention.Dim(0)*attention.Dim(1), batchSize)
-	return sa.Output.Forward(ctx, attention)
-}
-
-type MLP interface {
-	Forward(ml.Context, ml.Tensor, *Options) ml.Tensor
-}
-
-type sparse struct {
-	Router *nn.Linear `gguf:"ffn_gate_inp"`
-	Gate   ml.Tensor  `gguf:"ffn_gate_exps.weight"`
-	Up     ml.Tensor  `gguf:"ffn_up_exps.weight"`
-	Down   ml.Tensor  `gguf:"ffn_down_exps.weight"`
-}
-
-func (mlp *sparse) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *Options) ml.Tensor {
-	hiddenDim, sequenceLength, batchSize := hiddenStates.Dim(0), hiddenStates.Dim(1), hiddenStates.Dim(2)
-	hiddenStates = hiddenStates.Reshape(ctx, hiddenDim, sequenceLength*batchSize)
-	routerLogits := mlp.Router.Forward(ctx, hiddenStates)
-
-	routingWeights := routerLogits.Softmax(ctx)
-	selectedExperts := routingWeights.TopK(ctx, opts.numExpertsUsed)
-	routingWeights = routingWeights.Reshape(ctx, 1, opts.numExperts, hiddenStates.Dim(1)).Rows(ctx, selectedExperts)
-	if opts.normTopKProb {
-		routingWeights = routingWeights.Reshape(ctx, opts.numExpertsUsed, hiddenStates.Dim(1))
-		routingWeights = routingWeights.Div(ctx, routingWeights.SumRows(ctx))
-		routingWeights = routingWeights.Reshape(ctx, 1, opts.numExpertsUsed, hiddenStates.Dim(1))
-	}
-
-	hiddenStates = hiddenStates.Reshape(ctx, hiddenStates.Dim(0), 1, hiddenStates.Dim(1))
-
-	upStates := mlp.Up.MulmatID(ctx, hiddenStates, selectedExperts)
-
-	hiddenStates = mlp.Gate.MulmatID(ctx, hiddenStates, selectedExperts)
-	hiddenStates = hiddenStates.SILU(ctx)
-	hiddenStates = hiddenStates.Mul(ctx, upStates)
-
-	experts := mlp.Down.MulmatID(ctx, hiddenStates, selectedExperts)
-	experts = experts.Mul(ctx, routingWeights)
-
-	nextStates := experts.View(ctx, 0, experts.Dim(0), experts.Stride(2), experts.Dim(2))
-	for i := 1; i < opts.numExpertsUsed; i++ {
-		nextStates = nextStates.Add(ctx, experts.View(ctx, i*experts.Stride(1), experts.Dim(0), experts.Stride(2), experts.Dim(2)))
-	}
-
-	return nextStates
-}
-
-type dense struct {
-	Gate *nn.Linear `gguf:"ffn_gate"`
-	Up   *nn.Linear `gguf:"ffn_up"`
-	Down *nn.Linear `gguf:"ffn_down"`
-}
-
-func (mlp *dense) Forward(ctx ml.Context, hiddenStates ml.Tensor, _ *Options) ml.Tensor {
-	hiddenStates = mlp.Gate.Forward(ctx, hiddenStates).SILU(ctx).Mul(ctx, mlp.Up.Forward(ctx, hiddenStates))
-	return mlp.Down.Forward(ctx, hiddenStates)
-}
-
-type Layer struct {
-	AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
-	*Attention
-
-	MLPNorm *nn.RMSNorm `gguf:"ffn_norm"`
-	MLP
-}
-
-func (d *Layer) Forward(ctx ml.Context, hiddenStates, positions, outputs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
-	residual := hiddenStates
-	hiddenStates = d.AttentionNorm.Forward(ctx, hiddenStates, opts.eps)
-	hiddenStates = d.Attention.Forward(ctx, hiddenStates, positions, cache, opts)
-
-	if outputs != nil {
-		hiddenStates = hiddenStates.Rows(ctx, outputs)
-		residual = residual.Rows(ctx, outputs)
-	}
-
-	hiddenStates = hiddenStates.Add(ctx, residual)
-
-	residual = hiddenStates
-	hiddenStates = d.MLPNorm.Forward(ctx, hiddenStates, opts.eps)
-	hiddenStates = d.MLP.Forward(ctx, hiddenStates, opts)
-	return hiddenStates.Add(ctx, residual)
-}
-
-type Model struct {
-	model.Base
-	model.BytePairEncoding
-
-	TokenEmbedding *nn.Embedding `gguf:"token_embd"`
-	OutputNorm     *nn.RMSNorm   `gguf:"output_norm"`
-	Output         *nn.Linear    `gguf:"output,alt:token_embd"`
-
-	Layers []Layer `gguf:"blk"`
-
-	*Options
-}
-
-// Forward implements model.Model.
-func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
-	positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
-
-	hiddenStates := m.TokenEmbedding.Forward(ctx, batch.Inputs)
-
-	for i, layer := range m.Layers {
-		m.Cache.SetLayer(i)
-
-		var outputs ml.Tensor
-		if i == len(m.Layers)-1 {
-			outputs = ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
-		}
-
-		hiddenStates = layer.Forward(ctx, hiddenStates, positions, outputs, m.Cache, m.Options)
-	}
-
-	hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, m.eps)
-	return m.Output.Forward(ctx, hiddenStates), nil
-}
-
-func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
-	return fast.RoPE(ctx, key, shift, m.headDim(), m.ropeBase, m.ropeScale, rope.WithTypeNeoX()), nil
-}
-
-var _ model.Model = (*Model)(nil)
-
-func New(c fs.Config) (model.Model, error) {
-	layers := make([]Layer, c.Uint("block_count"))
-	for i := range layers {
-		if c.String("general.architecture") == "qwen3moe" {
-			layers[i].MLP = &sparse{}
-		} else {
-			layers[i].MLP = &dense{}
-		}
-	}
-
-	m := Model{
-		BytePairEncoding: model.NewBytePairEncoding(
-			`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
-			&model.Vocabulary{
-				Values: c.Strings("tokenizer.ggml.tokens"),
-				Types:  c.Ints("tokenizer.ggml.token_type"),
-				Merges: c.Strings("tokenizer.ggml.merges"),
-				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
-				BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
-				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
-				EOS: append(
-					[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
-					c.Ints("tokenizer.ggml.eos_token_ids")...,
-				),
-			},
-		),
-		Layers: layers,
-		Options: &Options{
-			hiddenSize:     int(c.Uint("embedding_length")),
-			numHeads:       int(c.Uint("attention.head_count")),
-			numKVHeads:     int(c.Uint("attention.head_count_kv")),
-			keyLength:      int(c.Uint("attention.key_length")),
-			valueLength:    int(c.Uint("attention.value_length")),
-			eps:            c.Float("attention.layer_norm_rms_epsilon"),
-			ropeBase:       c.Float("rope.freq_base"),
-			ropeScale:      c.Float("rope.freq_scale", 1),
-			numExperts:     int(c.Uint("expert_count")),
-			numExpertsUsed: int(c.Uint("expert_used_count")),
-			normTopKProb:   c.Bool("norm_top_k_prob", true),
-		},
-	}
-
-	m.Cache = kvcache.NewCausalCache(m.Shift)
-	return &m, nil
-}
-
-func init() {
-	model.Register("qwen3", New)
-	model.Register("qwen3moe", New)
-}
--- a/model/bytepairencoding.go
+++ b/model/bytepairencoding.go
@@ -3,16 +3,118 @@ package model
 import (
 	"cmp"
 	"context"
-	"fmt"
 	"iter"
 	"log/slog"
+	"slices"
 	"strings"
+	"sync"

 	"github.com/dlclark/regexp2"
 	heap "github.com/emirpasic/gods/v2/trees/binaryheap"
 	"github.com/ollama/ollama/logutil"
 )

+type Special int32
+
+const (
+	SpecialBOS Special = iota
+	SpecialEOS
+)
+
+const (
+	TOKEN_TYPE_NORMAL = iota + 1
+	TOKEN_TYPE_UNKNOWN
+	TOKEN_TYPE_CONTROL
+	TOKEN_TYPE_USER_DEFINED
+	TOKEN_TYPE_UNUSED
+	TOKEN_TYPE_BYTE
+)
+
+type TextProcessor interface {
+	Encode(s string, addSpecial bool) ([]int32, error)
+	Decode([]int32) (string, error)
+	Is(int32, Special) bool
+	Vocabulary() *Vocabulary
+}
+
+type Vocabulary struct {
+	Values []string
+	Types  []int32
+	Scores []float32
+	Merges []string
+
+	BOS, EOS, EOT          int32
+	AddBOS, AddEOS, AddEOT bool
+
+	specialOnce sync.Once
+	special     []string
+
+	valuesOnce sync.Once
+	values     map[string]int32
+
+	mergeOnce sync.Once
+	merge     map[string]int32
+}
+
+func (v *Vocabulary) Is(id int32, special Special) bool {
+	switch special {
+	case SpecialBOS:
+		return id == v.BOS
+	case SpecialEOS:
+		return id == v.EOS || id == v.EOT
+	default:
+		return false
+	}
+}
+
+func (v *Vocabulary) Encode(s string) int32 {
+	v.valuesOnce.Do(func() {
+		v.values = make(map[string]int32, len(v.Values))
+		for i, value := range v.Values {
+			v.values[value] = int32(i)
+		}
+	})
+
+	if id, ok := v.values[s]; ok {
+		return id
+	}
+
+	return -1
+}
+
+func (v *Vocabulary) Decode(id int32) string {
+	return v.Values[id]
+}
+
+func (v *Vocabulary) SpecialVocabulary() []string {
+	v.specialOnce.Do(func() {
+		for i := range v.Values {
+			if slices.Contains([]int{105, 106}, i) {
+				v.special = append(v.special, v.Values[i])
+			} else if v.Types[i] == TOKEN_TYPE_CONTROL {
+				v.special = append(v.special, v.Values[i])
+			}
+		}
+	})
+
+	return v.special
+}
+
+func (v *Vocabulary) Merge(left, right string) int {
+	v.mergeOnce.Do(func() {
+		v.merge = make(map[string]int32, len(v.Merges))
+		for i, merge := range v.Merges {
+			v.merge[merge] = int32(i)
+		}
+	})
+
+	if id, ok := v.merge[left+" "+right]; ok {
+		return int(id)
+	}
+
+	return -1
+}
+
 type BytePairEncoding struct {
 	pre   *regexp2.Regexp
 	vocab *Vocabulary
@@ -202,23 +304,30 @@ func (bpe BytePairEncoding) Encode(s string, addSpecial bool) ([]int32, error) {
 		}
 	}

-	slog.Log(context.TODO(), logutil.LevelTrace, "encoded", "string", s, "ids", ids)
-
 	if addSpecial && len(ids) > 0 {
-		ids = bpe.vocab.addSpecials(ids)
+		if bpe.vocab.AddBOS {
+			if ids[0] == bpe.vocab.BOS {
+				slog.Warn("adding bos token to prompt which already has it", "id", bpe.vocab.BOS)
+			}
+
+			slog.Debug("adding bos token to prompt", "id", bpe.vocab.BOS)
+			ids = append([]int32{bpe.vocab.BOS}, ids...)
+		}
+
+		if bpe.vocab.AddEOS {
+			if ids[len(ids)-1] == bpe.vocab.EOS {
+				slog.Warn("adding eos token to prompt which already has it", "id", bpe.vocab.EOS)
+			}
+
+			slog.Debug("adding eos token to prompt", "id", bpe.vocab.EOS)
+			ids = append(ids, bpe.vocab.EOS)
+		}
 	}

+	slog.Log(context.TODO(), logutil.LevelTrace, "encoded", "ids", ids)
 	return ids, nil
 }

-type lazyIdsString struct {
-	ids []int32
-}
-
-func (l lazyIdsString) LogValue() slog.Value {
-	return slog.AnyValue(fmt.Sprint(l.ids))
-}
-
 func (bpe BytePairEncoding) Decode(ids []int32) (string, error) {
 	var sb strings.Builder
 	for _, id := range ids {
@@ -243,6 +352,6 @@ func (bpe BytePairEncoding) Decode(ids []int32) (string, error) {
 		}
 	}

-	slog.Log(context.TODO(), logutil.LevelTrace, "decoded", "string", sb.String(), "from", lazyIdsString{ids: ids})
+	slog.Log(context.TODO(), logutil.LevelTrace, "decoded", "string", sb.String())
 	return sb.String(), nil
 }
--- a/model/process_text_spm.go
+++ b/model/process_text_spm.go
@@ -182,12 +182,27 @@ func (spm SentencePieceModel) Encode(s string, addSpecial bool) ([]int32, error)
 		}
 	}

-	slog.Log(context.TODO(), logutil.LevelTrace, "encoded", "string", s, "ids", ids)
-
 	if addSpecial && len(ids) > 0 {
-		ids = spm.vocab.addSpecials(ids)
+		if spm.vocab.AddBOS {
+			if ids[0] == spm.vocab.BOS {
+				slog.Warn("adding bos token to prompt which already has it", "id", spm.vocab.BOS)
+			}
+
+			slog.Debug("adding bos token to prompt", "id", spm.vocab.BOS)
+			ids = append([]int32{spm.vocab.BOS}, ids...)
+		}
+
+		if spm.vocab.AddEOS {
+			if ids[len(ids)-1] == spm.vocab.EOS {
+				slog.Warn("adding eos token to prompt which already has it", "id", spm.vocab.EOS)
+			}
+
+			slog.Debug("adding eos token to prompt", "id", spm.vocab.EOS)
+			ids = append(ids, spm.vocab.EOS)
+		}
 	}

+	slog.Log(context.TODO(), logutil.LevelTrace, "encoded", "ids", ids)
 	return ids, nil
 }

@@ -246,6 +261,6 @@ func (spm SentencePieceModel) Decode(ids []int32) (string, error) {
 		}
 	}

-	slog.Log(context.TODO(), logutil.LevelTrace, "decoded", "ids", ids, "string", sb.String())
+	slog.Log(context.TODO(), logutil.LevelTrace, "decoded", "string", sb.String())
 	return sb.String(), nil
 }
--- a/model/process_text_spm_test.go
+++ b/model/process_text_spm_test.go
--- a/model/bytepairencoding_test.go
+++ b/model/bytepairencoding_test.go
--- a/model/textprocessor.go
+++ b/model/textprocessor.go
@@ -1,17 +0,0 @@
-package model
-
-const (
-	TOKEN_TYPE_NORMAL = iota + 1
-	TOKEN_TYPE_UNKNOWN
-	TOKEN_TYPE_CONTROL
-	TOKEN_TYPE_USER_DEFINED
-	TOKEN_TYPE_UNUSED
-	TOKEN_TYPE_BYTE
-)
-
-type TextProcessor interface {
-	Encode(s string, addSpecial bool) ([]int32, error)
-	Decode([]int32) (string, error)
-	Is(int32, Special) bool
-	Vocabulary() *Vocabulary
-}
--- a/model/vocabulary.go
+++ b/model/vocabulary.go
@@ -1,112 +0,0 @@
-package model
-
-import (
-	"log/slog"
-	"slices"
-	"sync"
-)
-
-type Special int32
-
-const (
-	SpecialBOS Special = iota
-	SpecialEOS
-)
-
-type Vocabulary struct {
-	Values []string
-	Types  []int32
-	Scores []float32
-	Merges []string
-
-	BOS, EOS       []int32
-	AddBOS, AddEOS bool
-
-	specialOnce sync.Once
-	special     []string
-
-	valuesOnce sync.Once
-	values     map[string]int32
-
-	mergeOnce sync.Once
-	merge     map[string]int32
-}
-
-func (v *Vocabulary) Is(id int32, special Special) bool {
-	switch special {
-	case SpecialBOS:
-		return slices.Contains(v.BOS, id)
-	case SpecialEOS:
-		return slices.Contains(v.EOS, id)
-	default:
-		return false
-	}
-}
-
-func (v *Vocabulary) addSpecials(ids []int32) []int32 {
-	if v.AddBOS && len(v.BOS) > 0 {
-		if slices.Contains(v.BOS, ids[0]) {
-			slog.Warn("adding bos token to prompt which already has it", "id", v.BOS)
-		}
-
-		slog.Debug("adding bos token to prompt", "id", v.BOS)
-		ids = append([]int32{v.BOS[0]}, ids...)
-	}
-
-	if v.AddEOS && len(v.EOS) > 0 {
-		if slices.Contains(v.BOS, ids[len(ids)-1]) {
-			slog.Warn("adding eos token to prompt which already has it", "id", v.EOS)
-		}
-
-		slog.Debug("adding eos token to prompt", "id", v.EOS)
-		ids = append(ids, v.EOS[0])
-	}
-
-	return ids
-}
-
-func (v *Vocabulary) Encode(s string) int32 {
-	v.valuesOnce.Do(func() {
-		v.values = make(map[string]int32, len(v.Values))
-		for i, value := range v.Values {
-			v.values[value] = int32(i)
-		}
-	})
-
-	if id, ok := v.values[s]; ok {
-		return id
-	}
-
-	return -1
-}
-
-func (v *Vocabulary) Decode(id int32) string {
-	return v.Values[id]
-}
-
-func (v *Vocabulary) SpecialVocabulary() []string {
-	v.specialOnce.Do(func() {
-		for i := range v.Values {
-			if v.Types[i] == TOKEN_TYPE_CONTROL {
-				v.special = append(v.special, v.Values[i])
-			}
-		}
-	})
-
-	return v.special
-}
-
-func (v *Vocabulary) Merge(left, right string) int {
-	v.mergeOnce.Do(func() {
-		v.merge = make(map[string]int32, len(v.Merges))
-		for i, merge := range v.Merges {
-			v.merge[merge] = int32(i)
-		}
-	})
-
-	if id, ok := v.merge[left+" "+right]; ok {
-		return int(id)
-	}
-
-	return -1
-}
--- a/readline/types.go
+++ b/readline/types.go
@@ -61,8 +61,6 @@ const (
 	ColorGrey    = Esc + "[38;5;245m"
 	ColorDefault = Esc + "[0m"

-	ColorBold = Esc + "[1m"
-
 	StartBracketedPaste = Esc + "[?2004h"
 	EndBracketedPaste   = Esc + "[?2004l"
 )
--- a/runner/llamarunner/cache.go
+++ b/runner/llamarunner/cache.go
@@ -104,8 +104,8 @@ func (c *InputCache) LoadCacheSlot(prompt []input, cachePrompt bool) (*InputCach
 	slog.Debug("loading cache slot", "id", slot.Id, "cache", len(slot.Inputs), "prompt", len(prompt),
 		"used", numPast, "remaining", len(prompt)-numPast)

-	slot.Inputs = prompt[:numPast]
 	prompt = prompt[numPast:]
+	slot.Inputs = slot.Inputs[:numPast]

 	return slot, prompt, nil
 }
--- a/runner/ollamarunner/cache.go
+++ b/runner/ollamarunner/cache.go
@@ -136,8 +136,8 @@ func (c *InputCache) LoadCacheSlot(prompt []input.Input) (*InputCacheSlot, []inp
 	slog.Debug("loading cache slot", "id", slot.Id, "cache", len(slot.Inputs), "prompt", len(prompt),
 		"used", numPast, "remaining", int32(len(prompt))-numPast)

-	slot.Inputs = prompt[:numPast]
 	prompt = prompt[numPast:]
+	slot.Inputs = slot.Inputs[:numPast]

 	return slot, prompt, nil
 }
--- a/runner/ollamarunner/cache_test.go
+++ b/runner/ollamarunner/cache_test.go
@@ -3,6 +3,7 @@ package ollamarunner
 import (
 	"errors"
 	"fmt"
+	"image"
 	"testing"
 	"time"

@@ -11,6 +12,10 @@ import (
 )

 func TestCountCommon(t *testing.T) {
+	imgA := image.NewRGBA(image.Rect(0, 0, 100, 100))
+	imgB := image.NewRGBA(image.Rect(0, 0, 50, 50))
+	imgC := image.NewRGBA(image.Rect(50, 50, 100, 100))
+
 	tests := []struct {
 		name     string
 		t1       []input.Input
@@ -31,20 +36,20 @@ func TestCountCommon(t *testing.T) {
 		},
 		{
 			name:     "Image Prefix",
-			t1:       []input.Input{{MultimodalHash: 1}},
-			t2:       []input.Input{{MultimodalHash: 1}, {MultimodalHash: 2}, {MultimodalHash: 3}},
+			t1:       []input.Input{{Multimodal: imgA, MultimodalHash: 1}},
+			t2:       []input.Input{{Multimodal: imgA, MultimodalHash: 1}, {Multimodal: imgB, MultimodalHash: 2}, {Multimodal: imgC, MultimodalHash: 3}},
 			expected: 1,
 		},
 		{
 			name:     "Mixed",
-			t1:       []input.Input{{Token: 1}, {MultimodalHash: 1}},
-			t2:       []input.Input{{Token: 1}, {MultimodalHash: 1}, {Token: 5}},
+			t1:       []input.Input{{Token: 1}, {Multimodal: imgA, MultimodalHash: 1}},
+			t2:       []input.Input{{Token: 1}, {Multimodal: imgA, MultimodalHash: 1}, {Token: 5}},
 			expected: 2,
 		},
 		{
 			name:     "Mixed, Same Length",
-			t1:       []input.Input{{Token: 1}, {MultimodalHash: 1}},
-			t2:       []input.Input{{Token: 1}, {MultimodalHash: 2}},
+			t1:       []input.Input{{Token: 1}, {Multimodal: imgA, MultimodalHash: 1}},
+			t2:       []input.Input{{Token: 1}, {Multimodal: imgB, MultimodalHash: 2}},
 			expected: 1,
 		},
 		{
--- a/runner/ollamarunner/multimodal.go
+++ b/runner/ollamarunner/multimodal.go
@@ -1,113 +0,0 @@
-package ollamarunner
-
-import (
-	"errors"
-
-	"github.com/ollama/ollama/ml"
-	"github.com/ollama/ollama/model/input"
-)
-
-// Tensors can't be used across multiple compute graphs. This is a problem
-// if a single embedding is split across batches using views since all of
-// the views will have the same source tensor. We also don't want to
-// recompute the entire embedding for each batch.
-//
-// To avoid this, we compute all of the tensors for the embedding on the
-// first use and then store the result in system memory. When we need
-// additional tensors, we recreate them from the stored data.
-
-// multimodalEntry represents the embeddings of a single object (such
-// as an image).
-type multimodalEntry struct {
-	// mm is the original set of tensors created by EncodeMultimodal
-	mm []input.Multimodal
-
-	// data is the computed result of mm. Nil if not yet computed
-	data [][]float32
-}
-
-// multimodalStore maps from an individual tensor (of which there
-// may be many in a single multimodal object) to its parent embedding
-type multimodalStore map[ml.Tensor]*multimodalEntry
-
-func newMultimodalStore() multimodalStore {
-	return make(multimodalStore)
-}
-
-// addMultimodal stores an embedding for later use in a compute graph
-func (m multimodalStore) addMultimodal(embedding []input.Multimodal) {
-	entry := &multimodalEntry{mm: embedding}
-
-	for _, e := range embedding {
-		if e.Tensor != nil {
-			m[e.Tensor] = entry
-		}
-	}
-}
-
-// getMultimodal takes a source set of tensors (which may contain a whole or
-// parts of one or more images) and returns the equivalent that can be used in
-// the current context
-func (m multimodalStore) getMultimodal(backend ml.Backend, ctx ml.Context, in []input.Multimodal, reserve bool) ([]input.Multimodal, error) {
-	out := make([]input.Multimodal, len(in))
-	for i := range out {
-		if in[i].Tensor != nil {
-			var err error
-			out[i].Tensor, err = m.getTensor(backend, ctx, in[i].Tensor, reserve)
-			if err != nil {
-				return nil, err
-			}
-		}
-
-		out[i].Data = in[i].Data
-	}
-
-	return out, nil
-}
-
-func (m multimodalStore) getTensor(backend ml.Backend, ctx ml.Context, in ml.Tensor, reserve bool) (ml.Tensor, error) {
-	entry := m[in]
-
-	if entry.data == nil {
-		computeCtx := backend.NewContext()
-		defer computeCtx.Close()
-
-		var tensors []ml.Tensor
-		for _, t := range entry.mm {
-			if t.Tensor != nil {
-				tensors = append(tensors, t.Tensor)
-			}
-		}
-
-		if len(tensors) == 0 {
-			return nil, nil
-		}
-
-		computeCtx.Forward(tensors...)
-		entry.data = make([][]float32, len(entry.mm))
-
-		if !reserve {
-			computeCtx.Compute(tensors...)
-
-			for i, t := range entry.mm {
-				if t.Tensor != nil {
-					entry.data[i] = t.Tensor.Floats()
-				}
-			}
-		} else {
-			computeCtx.Reserve()
-		}
-	}
-
-	for i, t := range entry.mm {
-		if in == t.Tensor {
-			if !reserve {
-				return ctx.Input().FromFloatSlice(entry.data[i], t.Tensor.Shape()...), nil
-			} else {
-				return ctx.Input().Empty(t.Tensor.DType(), t.Tensor.Shape()...), nil
-			}
-		}
-	}
-
-	return nil, errors.New("multimodal tensor not found")
-}
--- a/runner/ollamarunner/runner.go
+++ b/runner/ollamarunner/runner.go
@@ -1,14 +1,12 @@
 package ollamarunner

 import (
-	"bytes"
 	"context"
 	"encoding/json"
 	"errors"
 	"flag"
 	"fmt"
 	"hash/maphash"
-	"image"
 	"log"
 	"log/slog"
 	"net"
@@ -22,7 +20,6 @@ import (
 	"time"
 	"unicode/utf8"

-	"golang.org/x/image/bmp"
 	"golang.org/x/sync/semaphore"

 	"github.com/ollama/ollama/api"
@@ -43,9 +40,6 @@ type Sequence struct {
 	// multimodal embeddings
 	ctxs []ml.Context

-	// mmStore holds multimodal embeddings to mange memory and enable splitting across batches
-	mmStore multimodalStore
-
 	// batch index
 	iBatch int

@@ -107,7 +101,7 @@ func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSe

 	startTime := time.Now()

-	inputs, ctxs, mmStore, err := s.inputs(prompt, images)
+	inputs, ctxs, err := s.inputs(prompt, images)
 	if err != nil {
 		return nil, fmt.Errorf("failed to process inputs: %w", err)
 	} else if len(inputs) == 0 {
@@ -162,7 +156,6 @@ func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSe

 	return &Sequence{
 		ctxs:                ctxs,
-		mmStore:             mmStore,
 		inputs:              inputs,
 		numPromptInputs:     len(inputs),
 		startProcessingTime: startTime,
@@ -181,10 +174,9 @@ func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSe
 // inputs processes the prompt and images into a list of inputs
 // by splitting the prompt on [img-<n>] tags, tokenizing text and
 // decoding images
-func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, []ml.Context, multimodalStore, error) {
+func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, []ml.Context, error) {
 	var inputs []input.Input
 	var ctxs []ml.Context
-	var mmStore multimodalStore

 	var parts []string
 	var matches [][]string
@@ -195,7 +187,6 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, [
 		re := regexp.MustCompile(`\[img-(\d+)\]`)
 		parts = re.Split(prompt, -1)
 		matches = re.FindAllStringSubmatch(prompt, -1)
-		mmStore = newMultimodalStore()
 	} else {
 		parts = []string{prompt}
 	}
@@ -205,7 +196,7 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, [
 		// text - tokenize
 		tokens, err := s.model.(model.TextProcessor).Encode(part, i == 0)
 		if err != nil {
-			return nil, nil, nil, err
+			return nil, nil, err
 		}

 		for _, t := range tokens {
@@ -225,7 +216,7 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, [
 			}

 			if imageIndex < 0 {
-				return nil, nil, nil, fmt.Errorf("invalid image index: %d", n)
+				return nil, nil, fmt.Errorf("invalid image index: %d", n)
 			}

 			ctx := s.model.Backend().NewContext()
@@ -233,15 +224,13 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, [
 			ctxs = append(ctxs, ctx)
 			imageEmbeddings, err := multimodalProcessor.EncodeMultimodal(ctx, images[imageIndex].Data)
 			if err != nil {
-				return nil, nil, nil, err
+				return nil, nil, err
 			}

 			s.multimodalHash.Reset()
 			_, _ = s.multimodalHash.Write(images[imageIndex].Data)
 			imageHash := s.multimodalHash.Sum64()

-			mmStore.addMultimodal(imageEmbeddings)
-
 			inputs = append(inputs, input.Input{Multimodal: imageEmbeddings, MultimodalHash: imageHash})
 			postTokenize = true
 		}
@@ -251,11 +240,11 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, [
 		var err error
 		inputs, err = multimodalProcessor.PostTokenize(inputs)
 		if err != nil {
-			return nil, nil, nil, err
+			return nil, nil, err
 		}
 	}

-	return inputs, ctxs, mmStore, nil
+	return inputs, ctxs, nil
 }

 type Server struct {
@@ -374,9 +363,6 @@ func (s *Server) processBatch() error {
 	}
 	defer s.mu.Unlock()

-	ctx := s.model.Backend().NewContext()
-	defer ctx.Close()
-
 	var batchInputs []int32
 	var batch input.Batch

@@ -447,11 +433,7 @@ func (s *Server) processBatch() error {

 			batchInputs = append(batchInputs, inp.Token)
 			if inp.Multimodal != nil {
-				mm, err := seq.mmStore.getMultimodal(s.model.Backend(), ctx, inp.Multimodal, false)
-				if err != nil {
-					return err
-				}
-				batch.Multimodal = append(batch.Multimodal, input.MultimodalIndex{Index: len(batchInputs) - 1, Multimodal: mm})
+				batch.Multimodal = append(batch.Multimodal, input.MultimodalIndex{Index: len(batchInputs) - 1, Multimodal: inp.Multimodal})
 			}

 			batch.Positions = append(batch.Positions, int32(len(seq.cache.Inputs)+len(seq.pendingInputs)))
@@ -477,6 +459,9 @@ func (s *Server) processBatch() error {
 		return nil
 	}

+	ctx := s.model.Backend().NewContext()
+	defer ctx.Close()
+
 	modelOutput, err := model.Forward(ctx, s.model, batchInputs, batch)
 	if err != nil {
 		return fmt.Errorf("failed to decode batch: %w", err)
@@ -735,71 +720,12 @@ func (s *Server) reserveWorstCaseGraph() error {
 	ctx := s.model.Backend().NewContext()
 	defer ctx.Close()

-	var err error
-	inputs := make([]input.Input, s.batchSize)
-	mmStore := newMultimodalStore()
-
-	// Multimodal strategy:
-	// - Encode a 2048x2048 image. This assumes that a single image of this
-	//   size is sufficient to trigger the worst case. This is currently true
-	//   because for existing models, only a single image fits in a batch.
-	// - Add the embedding to a full batch of tokens - this is necessary because
-	//   the model may be looking for non-image data, such as <image> tags.
-	// - Run PostTokenize to execute any transformations between generated
-	//   embeddings and what the forward pass expects.
-	// - The result may now be larger than a batch (images may not fit in a
-	//   single batch), so trim based on what will fit and must be grouped together.
-	// - Fill out the rest of the space with text tokens.
-	if multimodalProcessor, ok := s.model.(model.MultimodalProcessor); ok {
-		mmCtx := s.model.Backend().NewContext()
-		defer mmCtx.Close()
-
-		img := image.NewGray(image.Rect(0, 0, 2048, 2048))
-		var buf bytes.Buffer
-		bmp.Encode(&buf, img)
-
-		if inputs[0].Multimodal, err = multimodalProcessor.EncodeMultimodal(mmCtx, buf.Bytes()); err == nil {
-			mmStore.addMultimodal(inputs[0].Multimodal)
-
-			inputs, err = multimodalProcessor.PostTokenize(inputs)
-			if err != nil {
-				return err
-			}
-
-			for i, inp := range inputs {
-				minBatch := 1 + inp.SameBatch
-				if minBatch > s.batchSize {
-					inputs = inputs[i:min(i+minBatch, len(inputs))]
-					break
-				} else if i+minBatch > s.batchSize {
-					inputs = inputs[:i]
-					break
-				}
-			}
-
-			if len(inputs) < s.batchSize {
-				newInputs := make([]input.Input, s.batchSize)
-				copy(newInputs, inputs)
-				inputs = newInputs
-			}
-		}
-	}
-
 	var batch input.Batch

-	batchInputs := make([]int32, len(inputs))
+	inputs := make([]int32, s.batchSize)
 	batch.Positions = make([]int32, len(inputs))
 	batch.Sequences = make([]int, len(inputs))
-	for i, inp := range inputs {
-		batchInputs[i] = inp.Token
-		if inp.Multimodal != nil {
-			mm, err := mmStore.getMultimodal(s.model.Backend(), ctx, inp.Multimodal, true)
-			if err != nil {
-				return err
-			}
-			batch.Multimodal = append(batch.Multimodal, input.MultimodalIndex{Index: i, Multimodal: mm})
-		}
-
+	for i := range inputs {
 		batch.Positions[i] = int32(i)
 	}

@@ -808,7 +734,11 @@ func (s *Server) reserveWorstCaseGraph() error {
 		batch.Outputs[i] = int32(i)
 	}

-	batch.Inputs = ctx.Input().FromIntSlice(batchInputs, len(batchInputs))
+	var err error
+	batch.Inputs, err = ctx.Input().FromIntSlice(inputs, len(inputs))
+	if err != nil {
+		return err
+	}

 	cache := s.model.Config().Cache
 	if cache != nil {
@@ -823,12 +753,16 @@ func (s *Server) reserveWorstCaseGraph() error {
 		return err
 	}

-	ctx.Forward(t).Reserve()
+	err = ctx.Forward(t).Reserve()
+	if err != nil {
+		return err
+	}

 	return nil
 }

-func (s *Server) initModel(
+func (s *Server) loadModel(
+	ctx context.Context,
 	mpath string,
 	params ml.BackendParams,
 	lpath multiLPath,
@@ -836,21 +770,21 @@ func (s *Server) initModel(
 	kvCacheType string,
 	kvSize int,
 	multiUserCache bool,
-) error {
+) {
 	var err error
-	s.model, err = model.New(mpath, params)
+	s.model, err = model.New(ctx, mpath, params)
 	if err != nil {
-		return err
+		panic(err)
 	}

 	// TODO(jessegross): LoRA loading
 	if lpath.String() != "" {
-		return errors.New("loras are not yet implemented")
+		panic("loras are not yet implemented")
 	}

 	s.cache, err = NewInputCache(s.model, kvCacheType, int32(kvSize), parallel, s.batchSize, multiUserCache)
 	if err != nil {
-		return err
+		panic(err)
 	}

 	if !s.cache.enabled && parallel > 1 {
@@ -862,30 +796,7 @@ func (s *Server) initModel(
 	s.seqs = make([]*Sequence, s.parallel)
 	s.seqsSem = semaphore.NewWeighted(int64(s.parallel))

-	return s.reserveWorstCaseGraph()
-}
-
-func (s *Server) load(
-	ctx context.Context,
-	mpath string,
-	params ml.BackendParams,
-	lpath multiLPath,
-	parallel int,
-	kvCacheType string,
-	kvSize int,
-	multiUserCache bool,
-) {
-	err := s.initModel(mpath, params, lpath, parallel, kvCacheType, kvSize, multiUserCache)
-	if err != nil {
-		panic(err)
-	}
-
-	slog.Debug("memory", "allocated", s.model.Backend().BackendMemory())
-
-	err = s.model.Backend().Load(ctx,
-		func(progress float32) {
-			s.progress = progress
-		})
+	err = s.reserveWorstCaseGraph()
 	if err != nil {
 		panic(err)
 	}
@@ -929,14 +840,9 @@ func Execute(args []string) error {
 		status:    llm.ServerStatusLoadingModel,
 	}

-	server.cond = sync.NewCond(&server.mu)
-	server.ready.Add(1)
-
-	ctx, cancel := context.WithCancel(context.Background())
-	defer cancel()
-
 	// TODO(jessegross): Parameters that need to be implemented:
 	//	no-mmap
+	//	mlock

 	var tensorSplitFloats []float32
 	if *tensorSplit != "" {
@@ -949,6 +855,9 @@ func Execute(args []string) error {
 	}

 	params := ml.BackendParams{
+		Progress: func(progress float32) {
+			server.progress = progress
+		},
 		NumThreads:     *threads,
 		NumGPULayers:   *numGPULayers,
 		MainGPU:        *mainGPU,
@@ -956,7 +865,14 @@ func Execute(args []string) error {
 		FlashAttention: *flashAttention,
 	}

-	go server.load(ctx, *mpath, params, lpaths, *parallel, *kvCacheType, *kvSize, *multiUserCache)
+	server.ready.Add(1)
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	go server.loadModel(ctx, *mpath, params, lpaths, *parallel, *kvCacheType, *kvSize, *multiUserCache)
+
+	server.cond = sync.NewCond(&server.mu)
+
 	go server.run(ctx)

 	addr := "127.0.0.1:" + strconv.Itoa(*port)
--- a/runner/ollamarunner/runner_test.go
+++ b/runner/ollamarunner/runner_test.go
@@ -0,0 +1,218 @@
+package ollamarunner
+
+import (
+	"context"
+	"sync"
+	"testing"
+
+	"github.com/ollama/ollama/fs"
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/model"
+	"github.com/ollama/ollama/model/input"
+	"github.com/ollama/ollama/sample"
+	"golang.org/x/sync/semaphore"
+)
+
+// testBackend implements ml.Backend with minimal functionality required for tests.
+type testBackend struct{}
+
+func (b *testBackend) Config() fs.Config             { return testConfig{} }
+func (b *testBackend) Get(string) ml.Tensor          { return nil }
+func (b *testBackend) NewContext() ml.Context        { return &testContext{} }
+func (b *testBackend) NewContextSize(int) ml.Context { return &testContext{} }
+
+// testConfig is a stub implementation of fs.Config used by testBackend.
+type testConfig struct{}
+
+func (testConfig) Architecture() string                  { return "" }
+func (testConfig) String(string, ...string) string       { return "" }
+func (testConfig) Uint(string, ...uint32) uint32         { return 0 }
+func (testConfig) Float(string, ...float32) float32      { return 0 }
+func (testConfig) Bool(string, ...bool) bool             { return false }
+func (testConfig) Strings(string, ...[]string) []string  { return nil }
+func (testConfig) Ints(string, ...[]int32) []int32       { return nil }
+func (testConfig) Floats(string, ...[]float32) []float32 { return nil }
+
+type testContext struct{}
+
+func (c *testContext) Empty(dtype ml.DType, shape ...int) ml.Tensor {
+	sz := 1
+	for _, s := range shape {
+		sz *= s
+	}
+	return &testTensor{dtype: dtype, data: make([]float32, sz), shape: shape}
+}
+func (c *testContext) Zeros(dtype ml.DType, shape ...int) ml.Tensor { return c.Empty(dtype, shape...) }
+func (c *testContext) FromFloatSlice(s []float32, shape ...int) (ml.Tensor, error) {
+	t := c.Empty(ml.DTypeF32, shape...).(*testTensor)
+	copy(t.data, s)
+	return t, nil
+}
+func (c *testContext) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
+	f := make([]float32, len(s))
+	for i, v := range s {
+		f[i] = float32(v)
+	}
+	out, _ := c.FromFloatSlice(f, shape...)
+	out.(*testTensor).dtype = ml.DTypeI32
+	return out, nil
+}
+func (c *testContext) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
+	return c.Empty(dtype, int((stop-start)/step))
+}
+func (c *testContext) Forward(...ml.Tensor) ml.Context { return c }
+func (c *testContext) Compute(...ml.Tensor)            {}
+func (c *testContext) Reserve() error                  { return nil }
+func (c *testContext) MaxGraphNodes() int              { return 0 }
+func (c *testContext) Close()                          {}
+func (c *testContext) Input() ml.Context               { return c }
+func (c *testContext) Layer(int) ml.Context            { return c }
+
+type testTensor struct {
+	ml.Tensor
+	dtype ml.DType
+	data  []float32
+	shape []int
+}
+
+func (t *testTensor) Dim(n int) int    { return t.shape[n] }
+func (t *testTensor) Stride(n int) int { return 0 }
+func (t *testTensor) Shape() []int     { return t.shape }
+func (t *testTensor) DType() ml.DType  { return t.dtype }
+func (t *testTensor) Bytes() []byte    { return nil }
+func (t *testTensor) Floats() []float32 {
+	out := make([]float32, len(t.data))
+	copy(out, t.data)
+	return out
+}
+func (t *testTensor) Neg(ctx ml.Context) ml.Tensor { return nil }
+func (t *testTensor) Add(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
+	out, _ := ctx.(*testContext).FromFloatSlice(nil, len(t.data))
+	return out
+}
+func (t *testTensor) Mul(ctx ml.Context, t2 ml.Tensor) ml.Tensor            { return nil }
+func (t *testTensor) Mulmat(ctx ml.Context, t2 ml.Tensor) ml.Tensor         { return nil }
+func (t *testTensor) MulmatFullPrec(ctx ml.Context, t2 ml.Tensor) ml.Tensor { return nil }
+func (t *testTensor) MulmatID(ctx ml.Context, t2, ids ml.Tensor) ml.Tensor  { return nil }
+func (t *testTensor) Softmax(ctx ml.Context) ml.Tensor                      { return nil }
+func (t *testTensor) LayerNorm(ctx ml.Context, w, b ml.Tensor, e float32) ml.Tensor {
+	return nil
+}
+func (t *testTensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
+	return ctx.(*testContext).Empty(t.dtype, shape...)
+}
+func (t *testTensor) Copy(ctx ml.Context, dest ml.Tensor) ml.Tensor {
+	copy(dest.(*testTensor).data, t.data)
+	return nil
+}
+
+// fakeModel implements model.Model and model.TextProcessor.
+type fakeModel struct {
+	model.Base
+	decode  map[int32]string
+	logits  [][]float32
+	call    int
+	backend ml.Backend
+}
+
+// Forward returns the scripted logits for this call as a 1-D tensor. Calls
+// beyond the end of the script keep returning the last row.
+func (f *fakeModel) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
+	// Clamp the call counter to the final scripted step.
+	idx := min(f.call, len(f.logits)-1)
+	f.call++
+	return ctx.FromFloatSlice(f.logits[idx], len(f.logits[idx]))
+}
+
+func (f *fakeModel) Backend() ml.Backend {
+	if f.backend == nil {
+		f.backend = &testBackend{}
+	}
+	return f.backend
+}
+
+func (f *fakeModel) Encode(string, bool) ([]int32, error) { return nil, nil }
+func (f *fakeModel) Decode(ids []int32) (string, error) {
+	var s string
+	for _, id := range ids {
+		s += f.decode[id]
+	}
+	return s, nil
+}
+func (f *fakeModel) Is(id int32, sp model.Special) bool { return false }
+func (f *fakeModel) Vocabulary() *model.Vocabulary      { return &model.Vocabulary{} }
+
+var _ model.Model = (*fakeModel)(nil)
+var _ model.TextProcessor = (*fakeModel)(nil)
+
+func TestProcessBatchUnicode(t *testing.T) {
+	tests := []struct {
+		name   string
+		decode map[int32]string
+		logits [][]float32
+		want   string
+	}{
+		{
+			name:   "emoji",
+			decode: map[int32]string{0: "A", 1: "😀", 2: "👍", 3: "!"},
+			logits: [][]float32{{10, 0, 0, 0}, {0, 10, 0, 0}, {0, 0, 10, 0}, {0, 0, 0, 10}},
+			want:   "A😀👍!",
+		},
+		{
+			name:   "ascii",
+			decode: map[int32]string{0: "H", 1: "e", 2: "y"},
+			logits: [][]float32{{10, 0, 0}, {0, 10, 0}, {0, 0, 10}},
+			want:   "Hey",
+		},
+		{
+			name:   "multibyte",
+			decode: map[int32]string{0: "世", 1: "界", 2: "😊"},
+			logits: [][]float32{{10, 0, 0}, {0, 10, 0}, {0, 0, 10}},
+			want:   "世界😊",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			m := &fakeModel{decode: tt.decode, logits: tt.logits}
+
+			s := &Server{model: m, batchSize: 1, parallel: 1}
+			s.cache = &InputCache{enabled: true, slots: []InputCacheSlot{{Id: 0}}, numCtx: 10}
+			s.seqs = make([]*Sequence, 1)
+			s.seqsSem = semaphore.NewWeighted(1)
+			if err := s.seqsSem.Acquire(context.Background(), 1); err != nil {
+				t.Fatal(err)
+			}
+			s.cond = sync.NewCond(&s.mu)
+
+			seq := &Sequence{
+				inputs:     []input.Input{{Token: 0}},
+				cache:      &s.cache.slots[0],
+				responses:  make(chan string, 10),
+				quit:       make(chan bool, 1),
+				numPredict: len(tt.logits),
+				sampler:    sample.NewSampler(0, 0, 0, 0, 0, nil),
+				embedding:  make(chan []float32, 1),
+			}
+			s.seqs[0] = seq
+
+			for {
+				if err := s.processBatch(); err != nil {
+					t.Fatal(err)
+				}
+				if s.seqs[0] == nil {
+					break
+				}
+			}
+
+			var result string
+			for r := range seq.responses {
+				result += r
+			}
+
+			if result != tt.want {
+				t.Fatalf("got %q want %q", result, tt.want)
+			}
+		})
+	}
+}
--- a/sample/samplers.go
+++ b/sample/samplers.go
@@ -176,7 +176,7 @@ func NewGrammarSampler(model model.TextProcessor, grammarStr string) (*GrammarSa
 		vocabIds[i] = uint32(i)
 	}

-	grammar := llama.NewGrammar(grammarStr, vocabIds, pieces, model.Vocabulary().EOS)
+	grammar := llama.NewGrammar(grammarStr, vocabIds, pieces, []uint32{uint32(model.Vocabulary().EOS), uint32(model.Vocabulary().EOT)})
 	if grammar == nil {
 		return nil, errors.New("sample: failed to initialize grammar")
 	}
--- a/server/create.go
+++ b/server/create.go
@@ -295,7 +295,7 @@ func convertFromSafetensors(files map[string]string, baseLayers []*layerGGML, is
 	}
 	defer bin.Close()

-	f, err := ggml.Decode(bin, -1)
+	f, _, err := ggml.Decode(bin, -1)
 	if err != nil {
 		return nil, err
 	}
@@ -430,7 +430,7 @@ func quantizeLayer(layer *layerGGML, quantizeType string, fn func(resp api.Progr
 	fnWrap := func(n uint64) {
 		done := doneBytes.Add(n)
 		progress := float32(done) / float32(totalBytes)
-		fn(api.ProgressResponse{Status: fmt.Sprintf("quantizing %s model to %s", ft, quantizeType), Digest: "0000000000000000000", Total: layer.Size, Completed: int64(progress * float32(layer.Size))})
+		fn(api.ProgressResponse{Status: fmt.Sprintf("quantizing %s model to %s", ft, quantizeType), Digest: "0", Total: layer.Size, Completed: int64(progress * float32(layer.Size))})
 	}
 	ftype, err := ggml.ParseFileType(quantizeType)
 	if err != nil {
@@ -467,7 +467,7 @@ func quantizeLayer(layer *layerGGML, quantizeType string, fn func(resp api.Progr
 		return nil, err
 	}

-	f, err := ggml.Decode(temp, 1024)
+	f, _, err := ggml.Decode(temp, 1024)
 	if err != nil {
 		slog.Error(fmt.Sprintf("error decoding ggml: %s\n", err))
 		return nil, err
@@ -501,26 +501,47 @@ func ggufLayers(digest string, fn func(resp api.ProgressResponse)) ([]*layerGGML
 		return nil, errOnlyGGUFSupported
 	}

-	f, err := ggml.Decode(blob, -1)
+	stat, err := blob.Stat()
 	if err != nil {
 		return nil, err
 	}

-	mediatype := "application/vnd.ollama.image.model"
-	if f.KV().Kind() == "adapter" {
-		mediatype = "application/vnd.ollama.image.adapter"
-	} else if (f.KV().Uint("block_count") == 0 && f.KV().Uint("vision.block_count") > 0) || f.KV().Kind() == "projector" {
-		// if a model has vision.block_count but not block_count, it is a standalone vision model
-		mediatype = "application/vnd.ollama.image.projector"
-	}
+	var offset int64
+	for offset < stat.Size() {
+		f, n, err := ggml.Decode(blob, 1024)
+		if errors.Is(err, io.EOF) {
+			break
+		} else if err != nil {
+			return nil, err
+		}

-	layer, err := NewLayerFromLayer(digest, mediatype, blob.Name())
-	if err != nil {
-		slog.Debug("could not create new layer from layer", "error", err)
-		return nil, err
-	}
+		mediatype := "application/vnd.ollama.image.model"
+		if f.KV().Kind() == "adapter" {
+			mediatype = "application/vnd.ollama.image.adapter"
+		} else if _, ok := f.KV()[fmt.Sprintf("%s.vision.block_count", f.KV().Architecture())]; ok || f.KV().Kind() == "projector" {
+			mediatype = "application/vnd.ollama.image.projector"
+		}

-	layers = append(layers, &layerGGML{layer, f})
+		var layer Layer
+		if digest != "" && n == stat.Size() && offset == 0 {
+			layer, err = NewLayerFromLayer(digest, mediatype, blob.Name())
+			if err != nil {
+				slog.Debug("could not create new layer from layer", "error", err)
+				return nil, err
+			}
+		}
+
+		// Fallback to creating layer from file copy (either NewLayerFromLayer failed, or digest empty/n != stat.Size())
+		if layer.Digest == "" {
+			layer, err = NewLayer(io.NewSectionReader(blob, offset, n), mediatype)
+			if err != nil {
+				return nil, err
+			}
+		}
+
+		layers = append(layers, &layerGGML{layer, f})
+		offset = n
+	}

 	return detectChatTemplate(layers)
 }
--- a/server/download.go
+++ b/server/download.go
@@ -464,10 +464,6 @@ type downloadOpts struct {

 // downloadBlob downloads a blob from the registry and stores it in the blobs directory
 func downloadBlob(ctx context.Context, opts downloadOpts) (cacheHit bool, _ error) {
-	if opts.digest == "" {
-		return false, fmt.Errorf(("%s: %s"), opts.mp.GetNamespaceRepository(), "digest is is empty")
-	}
-
 	fp, err := GetBlobsPath(opts.digest)
 	if err != nil {
 		return false, err
--- a/server/images.go
+++ b/server/images.go
@@ -23,10 +23,9 @@ import (

 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/envconfig"
-	"github.com/ollama/ollama/fs/gguf"
+	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/parser"
 	"github.com/ollama/ollama/template"
-	"github.com/ollama/ollama/thinking"
 	"github.com/ollama/ollama/types/model"
 	"github.com/ollama/ollama/version"
 )
@@ -38,7 +37,6 @@ var (
 	errCapabilityInsert     = errors.New("insert")
 	errCapabilityVision     = errors.New("vision")
 	errCapabilityEmbedding  = errors.New("embedding")
-	errCapabilityThinking   = errors.New("thinking")
 	errInsecureProtocol     = errors.New("insecure protocol http")
 )

@@ -73,20 +71,22 @@ func (m *Model) Capabilities() []model.Capability {
 	capabilities := []model.Capability{}

 	// Check for completion capability
-	f, err := gguf.Open(m.ModelPath)
+	r, err := os.Open(m.ModelPath)
 	if err == nil {
-		defer f.Close()
+		defer r.Close()

-		embedding := f.KeyValue("pooling_type")
-		if !embedding.Value.IsNil() {
-			capabilities = append(capabilities, model.CapabilityEmbedding)
+		f, _, err := ggml.Decode(r, 1024)
+		if err == nil {
+			if _, ok := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]; ok {
+				capabilities = append(capabilities, model.CapabilityEmbedding)
+			} else {
+				capabilities = append(capabilities, model.CapabilityCompletion)
+			}
+			if _, ok := f.KV()[fmt.Sprintf("%s.vision.block_count", f.KV().Architecture())]; ok {
+				capabilities = append(capabilities, model.CapabilityVision)
+			}
 		} else {
-			// If no embedding is specified, we assume the model supports completion
-			capabilities = append(capabilities, model.CapabilityCompletion)
-		}
-		vision := f.KeyValue("vision.block_count")
-		if !vision.Value.IsNil() {
-			capabilities = append(capabilities, model.CapabilityVision)
+			slog.Error("couldn't decode ggml", "error", err)
 		}
 	} else {
 		slog.Error("couldn't open model file", "error", err)
@@ -111,12 +111,6 @@ func (m *Model) Capabilities() []model.Capability {
 		capabilities = append(capabilities, model.CapabilityVision)
 	}

-	// Check for thinking capability
-	openingTag, closingTag := thinking.InferTags(m.Template.Template)
-	if openingTag != "" && closingTag != "" {
-		capabilities = append(capabilities, model.CapabilityThinking)
-	}
-
 	return capabilities
 }

@@ -133,7 +127,6 @@ func (m *Model) CheckCapabilities(want ...model.Capability) error {
 		model.CapabilityInsert:     errCapabilityInsert,
 		model.CapabilityVision:     errCapabilityVision,
 		model.CapabilityEmbedding:  errCapabilityEmbedding,
-		model.CapabilityThinking:   errCapabilityThinking,
 	}

 	for _, cap := range want {
@@ -148,19 +141,11 @@ func (m *Model) CheckCapabilities(want ...model.Capability) error {
 		}
 	}

-	var err error
 	if len(errs) > 0 {
-		err = fmt.Errorf("%w %w", errCapabilities, errors.Join(errs...))
+		return fmt.Errorf("%w %w", errCapabilities, errors.Join(errs...))
 	}

-	if slices.Contains(errs, errCapabilityThinking) {
-		if m.Config.ModelFamily == "qwen3" || model.ParseName(m.Name).Model == "deepseek-r1" {
-			// append a message to the existing error
-			return fmt.Errorf("%w. Pull the model again to get the latest version with full thinking support", err)
-		}
-	}
-
-	return err
+	return nil
 }

 func (m *Model) String() string {
--- a/server/images_test.go
+++ b/server/images_test.go
@@ -1,8 +1,9 @@
 package server

 import (
+	"bytes"
 	"encoding/binary"
-	"fmt"
+	"errors"
 	"os"
 	"path/filepath"
 	"strings"
@@ -12,200 +13,81 @@ import (
 	"github.com/ollama/ollama/types/model"
 )

-// GGUF type constants (matching gguf package)
-const (
-	typeUint8   = uint32(0)
-	typeInt8    = uint32(1)
-	typeUint16  = uint32(2)
-	typeInt16   = uint32(3)
-	typeUint32  = uint32(4)
-	typeInt32   = uint32(5)
-	typeFloat32 = uint32(6)
-	typeBool    = uint32(7)
-	typeString  = uint32(8)
-	typeArray   = uint32(9)
-	typeUint64  = uint32(10)
-	typeInt64   = uint32(11)
-	typeFloat64 = uint32(12)
+// Constants for GGUF magic bytes and version
+var (
+	ggufMagic = []byte{0x47, 0x47, 0x55, 0x46} // "GGUF"
+	ggufVer   = uint32(3)                      // Version 3
 )

-type testTensorInfo struct {
-	Name  string
-	Shape []uint64
-	Type  uint32
-}
+// Helper function to create mock GGUF data
+func createMockGGUFData(architecture string, vision bool) []byte {
+	var buf bytes.Buffer

-// Helper function to create test GGUF files (matching gguf package approach)
-func createTestGGUFFile(path string, keyValues map[string]any, tensors []testTensorInfo) error {
-	file, err := os.Create(path)
-	if err != nil {
-		return err
+	// Write GGUF header
+	buf.Write(ggufMagic)
+	binary.Write(&buf, binary.LittleEndian, ggufVer)
+
+	// Write tensor count (0 for our test)
+	var numTensors uint64 = 0
+	binary.Write(&buf, binary.LittleEndian, numTensors)
+
+	// Calculate number of metadata entries
+	numMetaEntries := uint64(1) // architecture entry
+	if vision {
+		numMetaEntries++
 	}
-	defer file.Close()
+	// Add embedding entry if architecture is "bert"
+	if architecture == "bert" {
+		numMetaEntries++
+	}
+	binary.Write(&buf, binary.LittleEndian, numMetaEntries)

-	// Write GGUF magic
-	if _, err := file.Write([]byte("GGUF")); err != nil {
-		return err
+	// Write architecture metadata
+	archKey := "general.architecture"
+	keyLen := uint64(len(archKey))
+	binary.Write(&buf, binary.LittleEndian, keyLen)
+	buf.WriteString(archKey)
+
+	// String type (8)
+	var strType uint32 = 8
+	binary.Write(&buf, binary.LittleEndian, strType)
+
+	// String length
+	strLen := uint64(len(architecture))
+	binary.Write(&buf, binary.LittleEndian, strLen)
+	buf.WriteString(architecture)
+
+	if vision {
+		visionKey := architecture + ".vision.block_count"
+		keyLen = uint64(len(visionKey))
+		binary.Write(&buf, binary.LittleEndian, keyLen)
+		buf.WriteString(visionKey)
+
+		// uint32 type (4)
+		var uint32Type uint32 = 4
+		binary.Write(&buf, binary.LittleEndian, uint32Type)
+
+		// uint32 value (1)
+		var countVal uint32 = 1
+		binary.Write(&buf, binary.LittleEndian, countVal)
+	}
+	// Write embedding metadata if architecture is "bert"
+	if architecture == "bert" {
+		poolKey := architecture + ".pooling_type"
+		keyLen = uint64(len(poolKey))
+		binary.Write(&buf, binary.LittleEndian, keyLen)
+		buf.WriteString(poolKey)
+
+		// uint32 type (4)
+		var uint32Type uint32 = 4
+		binary.Write(&buf, binary.LittleEndian, uint32Type)
+
+		// uint32 value (1)
+		var poolingVal uint32 = 1
+		binary.Write(&buf, binary.LittleEndian, poolingVal)
 	}

-	// Write version
-	if err := binary.Write(file, binary.LittleEndian, uint32(3)); err != nil {
-		return err
-	}
-
-	// Write tensor count
-	if err := binary.Write(file, binary.LittleEndian, uint64(len(tensors))); err != nil {
-		return err
-	}
-
-	// Write metadata count
-	if err := binary.Write(file, binary.LittleEndian, uint64(len(keyValues))); err != nil {
-		return err
-	}
-
-	// Write metadata
-	for key, value := range keyValues {
-		if err := writeKeyValue(file, key, value); err != nil {
-			return err
-		}
-	}
-
-	// Write tensor info
-	for _, tensor := range tensors {
-		if err := writeTensorInfo(file, tensor); err != nil {
-			return err
-		}
-	}
-
-	// Write some dummy tensor data
-	dummyData := make([]byte, 1024)
-	file.Write(dummyData)
-
-	return nil
-}
-
-func writeKeyValue(file *os.File, key string, value any) error {
-	// Write key length and key
-	if err := binary.Write(file, binary.LittleEndian, uint64(len(key))); err != nil {
-		return err
-	}
-	if _, err := file.Write([]byte(key)); err != nil {
-		return err
-	}
-
-	// Write value based on type
-	switch v := value.(type) {
-	case string:
-		if err := binary.Write(file, binary.LittleEndian, uint32(typeString)); err != nil {
-			return err
-		}
-		if err := binary.Write(file, binary.LittleEndian, uint64(len(v))); err != nil {
-			return err
-		}
-		_, err := file.Write([]byte(v))
-		return err
-	case int64:
-		if err := binary.Write(file, binary.LittleEndian, typeInt64); err != nil {
-			return err
-		}
-		return binary.Write(file, binary.LittleEndian, v)
-	case uint32:
-		if err := binary.Write(file, binary.LittleEndian, typeUint32); err != nil {
-			return err
-		}
-		return binary.Write(file, binary.LittleEndian, v)
-	case bool:
-		if err := binary.Write(file, binary.LittleEndian, typeBool); err != nil {
-			return err
-		}
-		return binary.Write(file, binary.LittleEndian, v)
-	case float64:
-		if err := binary.Write(file, binary.LittleEndian, uint32(typeFloat64)); err != nil {
-			return err
-		}
-		return binary.Write(file, binary.LittleEndian, v)
-	case []string:
-		if err := binary.Write(file, binary.LittleEndian, uint32(typeArray)); err != nil {
-			return err
-		}
-		if err := binary.Write(file, binary.LittleEndian, typeString); err != nil {
-			return err
-		}
-		if err := binary.Write(file, binary.LittleEndian, uint64(len(v))); err != nil {
-			return err
-		}
-		for _, s := range v {
-			if err := binary.Write(file, binary.LittleEndian, uint64(len(s))); err != nil {
-				return err
-			}
-			if _, err := file.Write([]byte(s)); err != nil {
-				return err
-			}
-		}
-		return nil
-	case []int64:
-		if err := binary.Write(file, binary.LittleEndian, uint32(typeArray)); err != nil {
-			return err
-		}
-		if err := binary.Write(file, binary.LittleEndian, typeInt64); err != nil {
-			return err
-		}
-		if err := binary.Write(file, binary.LittleEndian, uint64(len(v))); err != nil {
-			return err
-		}
-		for _, i := range v {
-			if err := binary.Write(file, binary.LittleEndian, i); err != nil {
-				return err
-			}
-		}
-		return nil
-	case []float64:
-		if err := binary.Write(file, binary.LittleEndian, typeArray); err != nil {
-			return err
-		}
-		if err := binary.Write(file, binary.LittleEndian, typeFloat64); err != nil {
-			return err
-		}
-		if err := binary.Write(file, binary.LittleEndian, uint64(len(v))); err != nil {
-			return err
-		}
-		for _, f := range v {
-			if err := binary.Write(file, binary.LittleEndian, f); err != nil {
-				return err
-			}
-		}
-		return nil
-	default:
-		return fmt.Errorf("unsupported value type: %T", value)
-	}
-}
-
-func writeTensorInfo(file *os.File, tensor testTensorInfo) error {
-	// Write tensor name
-	if err := binary.Write(file, binary.LittleEndian, uint64(len(tensor.Name))); err != nil {
-		return err
-	}
-	if _, err := file.Write([]byte(tensor.Name)); err != nil {
-		return err
-	}
-
-	// Write dimensions
-	if err := binary.Write(file, binary.LittleEndian, uint32(len(tensor.Shape))); err != nil {
-		return err
-	}
-	for _, dim := range tensor.Shape {
-		if err := binary.Write(file, binary.LittleEndian, dim); err != nil {
-			return err
-		}
-	}
-
-	// Write type
-	if err := binary.Write(file, binary.LittleEndian, tensor.Type); err != nil {
-		return err
-	}
-
-	// Write offset (dummy value)
-	return binary.Write(file, binary.LittleEndian, uint64(0))
+	return buf.Bytes()
 }

 func TestModelCapabilities(t *testing.T) {
@@ -219,38 +101,13 @@ func TestModelCapabilities(t *testing.T) {
 	// Create a simple model file for tests that don't depend on GGUF content
 	simpleModelPath := filepath.Join(tempDir, "simple_model.bin")

-	// Create completion model (llama architecture without vision)
-	if err := createTestGGUFFile(completionModelPath, map[string]any{
-		"general.architecture": "llama",
-	}, []testTensorInfo{
-		{Name: "token_embd.weight", Shape: []uint64{1000, 512}, Type: 1}, // F16
-	}); err != nil {
-		t.Fatalf("Failed to create completion model file: %v", err)
-	}
-
-	// Create vision model (llama architecture with vision block count)
-	if err := createTestGGUFFile(visionModelPath, map[string]any{
-		"general.architecture":     "llama",
-		"llama.vision.block_count": uint32(1),
-	}, []testTensorInfo{
-		{Name: "token_embd.weight", Shape: []uint64{1000, 512}, Type: 1}, // F16
-	}); err != nil {
-		t.Fatalf("Failed to create vision model file: %v", err)
-	}
-
-	// Create embedding model (bert architecture with pooling type)
-	if err := createTestGGUFFile(embeddingModelPath, map[string]any{
-		"general.architecture": "bert",
-		"bert.pooling_type":    uint32(1),
-	}, []testTensorInfo{
-		{Name: "token_embd.weight", Shape: []uint64{1000, 512}, Type: 1}, // F16
-	}); err != nil {
-		t.Fatalf("Failed to create embedding model file: %v", err)
-	}
-
-	// Create simple model file for tests that don't depend on GGUF content
-	if err := os.WriteFile(simpleModelPath, []byte("dummy model data"), 0o644); err != nil {
-		t.Fatalf("Failed to create simple model file: %v", err)
+	if err := errors.Join(
+		os.WriteFile(completionModelPath, createMockGGUFData("llama", false), 0o644),
+		os.WriteFile(visionModelPath, createMockGGUFData("llama", true), 0o644),
+		os.WriteFile(embeddingModelPath, createMockGGUFData("bert", false), 0o644),
+		os.WriteFile(simpleModelPath, []byte("dummy model data"), 0o644),
+	); err != nil {
+		t.Fatalf("Failed to create model files: %v", err)
 	}

 	toolsInsertTemplate, err := template.Parse("{{ .prompt }}{{ if .tools }}{{ .tools }}{{ end }}{{ if .suffix }}{{ .suffix }}{{ end }}")
@@ -374,29 +231,12 @@ func TestModelCheckCapabilities(t *testing.T) {
 	simpleModelPath := filepath.Join(tempDir, "model.bin")
 	embeddingModelPath := filepath.Join(tempDir, "embedding_model.bin")

-	// Create vision model (llama architecture with vision block count)
-	if err := createTestGGUFFile(visionModelPath, map[string]any{
-		"general.architecture":     "llama",
-		"llama.vision.block_count": uint32(1),
-	}, []testTensorInfo{
-		{Name: "token_embd.weight", Shape: []uint64{1000, 512}, Type: 1}, // F16
-	}); err != nil {
-		t.Fatalf("Failed to create vision model file: %v", err)
-	}
-
-	// Create embedding model (bert architecture with pooling type)
-	if err := createTestGGUFFile(embeddingModelPath, map[string]any{
-		"general.architecture": "bert",
-		"bert.pooling_type":    uint32(1),
-	}, []testTensorInfo{
-		{Name: "token_embd.weight", Shape: []uint64{1000, 512}, Type: 1}, // F16
-	}); err != nil {
-		t.Fatalf("Failed to create embedding model file: %v", err)
-	}
-
-	// Create simple model file for tests that don't depend on GGUF content
-	if err := os.WriteFile(simpleModelPath, []byte("dummy model data"), 0o644); err != nil {
-		t.Fatalf("Failed to create simple model file: %v", err)
+	if err := errors.Join(
+		os.WriteFile(simpleModelPath, []byte("dummy model data"), 0o644),
+		os.WriteFile(visionModelPath, createMockGGUFData("llama", true), 0o644),
+		os.WriteFile(embeddingModelPath, createMockGGUFData("bert", false), 0o644),
+	); err != nil {
+		t.Fatalf("Failed to create model files: %v", err)
 	}

 	toolsInsertTemplate, err := template.Parse("{{ .prompt }}{{ if .tools }}{{ .tools }}{{ end }}{{ if .suffix }}{{ .suffix }}{{ end }}")
--- a/server/model.go
+++ b/server/model.go
@@ -10,6 +10,9 @@ import (
 	"log/slog"
 	"net/http"
 	"os"
+	"slices"
+	"strings"
+	"text/template/parse"

 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/fs/ggml"
@@ -61,7 +64,7 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe
 			}
 			defer blob.Close()

-			f, err := ggml.Decode(blob, -1)
+			f, _, err := ggml.Decode(blob, -1)
 			if err != nil {
 				return nil, err
 			}
@@ -125,3 +128,124 @@ func detectContentType(r io.Reader) (string, error) {

 	return "unknown", nil
 }
+
+// parseObjects returns, in input order, every JSON object that can be decoded from s.
+func parseObjects(s string) []map[string]any {
+	var found []map[string]any
+	syntaxErr, typeErr := new(json.SyntaxError), new(json.UnmarshalTypeError)
+	for pos := 0; pos < len(s); {
+		var candidate map[string]any
+		dec := json.NewDecoder(strings.NewReader(s[pos:]))
+		switch err := dec.Decode(&candidate); {
+		case errors.Is(err, io.EOF), errors.Is(err, io.ErrUnexpectedEOF):
+			return found // nothing left to read, or the input stops mid-value
+		case errors.As(err, &syntaxErr):
+			pos += int(syntaxErr.Offset) // skip past syntax errors
+		case errors.As(err, &typeErr):
+			pos += int(typeErr.Offset) // skip past values of an unmarshalable type
+		case err != nil:
+			return nil
+		default:
+			found = append(found, candidate)
+			pos += int(dec.InputOffset())
+		}
+	}
+	return found
+}
+
+// parseToolCalls extracts tool calls from s by matching the JSON keys the template renders for
+// .ToolCalls. It only works when s carries the calls as JSON; false means nothing was extracted.
+func (m *Model) parseToolCalls(s string) ([]api.ToolCall, bool) {
+	// build a subtree from the template's range node whose pipeline references ToolCalls
+	tmpl := m.Template.Subtree(func(n parse.Node) bool {
+		if t, ok := n.(*parse.RangeNode); ok {
+			return slices.Contains(template.Identifiers(t.Pipe), "ToolCalls")
+		}
+
+		return false
+	})
+
+	if tmpl == nil {
+		return nil, false
+	}
+
+	var b bytes.Buffer // receives the subtree rendered with a single placeholder tool call
+	if err := tmpl.Execute(&b, map[string][]api.ToolCall{
+		"ToolCalls": {
+			{
+				Function: api.ToolCallFunction{
+					Name: "@@name@@",
+					Arguments: api.ToolCallFunctionArguments{
+						"@@argument@@": 1,
+					},
+				},
+			},
+		},
+	}); err != nil {
+		return nil, false
+	}
+
+	templateObjects := parseObjects(b.String())
+	if len(templateObjects) == 0 {
+		return nil, false
+	}
+
+	// find the template's keys: a string value marks the name field, an object value the arguments
+	var name, arguments string
+	for k, v := range templateObjects[0] {
+		switch v.(type) {
+		case string:
+			name = k
+		case map[string]any:
+			arguments = k
+		}
+	}
+
+	if name == "" || arguments == "" {
+		return nil, false
+	}
+
+	responseObjects := parseObjects(s)
+	if len(responseObjects) == 0 {
+		return nil, false
+	}
+
+	// collect walks maps and arrays recursively, gathering every object found at any depth
+	var collect func(any) []map[string]any
+	collect = func(obj any) (all []map[string]any) {
+		switch o := obj.(type) {
+		case map[string]any:
+			all = append(all, o)
+			for _, v := range o {
+				all = append(all, collect(v)...)
+			}
+		case []any:
+			for _, v := range o {
+				all = append(all, collect(v)...)
+			}
+		}
+
+		return all
+	}
+
+	var objs []map[string]any
+	for _, p := range responseObjects {
+		objs = append(objs, collect(p)...)
+	}
+
+	var toolCalls []api.ToolCall
+	for _, kv := range objs {
+		n, nok := kv[name].(string)
+		a, aok := kv[arguments].(map[string]any)
+		if nok && aok { // keep only objects carrying both template keys with the expected types
+			toolCalls = append(toolCalls, api.ToolCall{
+				Function: api.ToolCallFunction{
+					Name:      n,
+					Arguments: a,
+				},
+			})
+		}
+	}
+
+	return toolCalls, len(toolCalls) > 0
+}
--- a/server/model_test.go
+++ b/server/model_test.go
@@ -0,0 +1,179 @@
+package server
+
+import (
+	"bytes"
+	"encoding/json"
+	"fmt"
+	"os"
+	"path/filepath"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+
+	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/template"
+)
+
+// readFile loads base/name and returns its contents in a buffer, failing t on any read error.
+func readFile(t *testing.T, base, name string) *bytes.Buffer {
+	t.Helper()
+	path := filepath.Join(base, name)
+	data, err := os.ReadFile(path)
+	if err != nil {
+		t.Fatal(err)
+	}
+	return bytes.NewBuffer(data)
+}
+
+func TestExecuteWithTools(t *testing.T) {
+	p := filepath.Join("testdata", "tools")
+	cases := []struct {
+		model  string // fixture name: <model>.gotmpl is parsed, <model>.out is the expected rendering
+		output string // model response handed to parseToolCalls
+		ok     bool   // whether parseToolCalls is expected to report tool calls
+	}{
+		{"mistral", `[TOOL_CALLS]  [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`, true},
+		{"mistral", `[TOOL_CALLS]  [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]
+
+The temperature in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.`, true},
+		{"mistral", `[TOOL_CALLS]  [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"To }]`, false},
+		{"mistral", `I'm not aware of that information. However, I can suggest searching for the weather using the "get_current_weather" function:
+
+		[{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`, true},
+		{"mistral", " The weather in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.", false},
+		{"command-r-plus", "Action: ```json" + `
+[
+    {
+        "tool_name": "get_current_weather",
+        "parameters": {
+            "format": "fahrenheit",
+            "location": "San Francisco, CA"
+        }
+    },
+    {
+        "tool_name": "get_current_weather",
+        "parameters": {
+            "format": "celsius",
+            "location": "Toronto, Canada"
+        }
+    }
+]
+` + "```", true},
+		{"command-r-plus", " The weather in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.", false},
+		{"firefunction", ` functools[{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`, true},
+		{"firefunction", " The weather in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.", false},
+		{"llama3-groq-tool-use", `<tool_call>
+{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}}
+{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}
+</tool_call>`, true},
+		{"xlam", `{"tool_calls": [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]}`, true},
+		{"nemotron", `<toolcall>{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]} </toolcall>`, true},
+	}
+
+	var tools []api.Tool
+	if err := json.Unmarshal(readFile(t, p, "tools.json").Bytes(), &tools); err != nil {
+		t.Fatal(err)
+	}
+
+	var messages []api.Message
+	if err := json.Unmarshal(readFile(t, p, "messages.json").Bytes(), &messages); err != nil {
+		t.Fatal(err)
+	}
+
+	calls := []api.ToolCall{ // expected parse result for every case whose ok is true
+		{
+			Function: api.ToolCallFunction{
+				Name: "get_current_weather",
+				Arguments: api.ToolCallFunctionArguments{
+					"format":   "fahrenheit",
+					"location": "San Francisco, CA",
+				},
+			},
+		},
+		{
+			Function: api.ToolCallFunction{
+				Name: "get_current_weather",
+				Arguments: api.ToolCallFunctionArguments{
+					"format":   "celsius",
+					"location": "Toronto, Canada",
+				},
+			},
+		},
+	}
+	// each case renders its template against the fixtures, then parses tt.output for tool calls
+	for _, tt := range cases {
+		t.Run(tt.model, func(t *testing.T) {
+			tmpl, err := template.Parse(readFile(t, p, fmt.Sprintf("%s.gotmpl", tt.model)).String())
+			if err != nil {
+				t.Fatal(err)
+			}
+
+			t.Run("template", func(t *testing.T) {
+				var actual bytes.Buffer
+				if err := tmpl.Execute(&actual, template.Values{Tools: tools, Messages: messages}); err != nil {
+					t.Fatal(err)
+				}
+
+				if diff := cmp.Diff(actual.String(), readFile(t, p, fmt.Sprintf("%s.out", tt.model)).String()); diff != "" {
+					t.Errorf("mismatch (-got +want):\n%s", diff)
+				}
+			})
+
+			t.Run("parse", func(t *testing.T) {
+				m := &Model{Template: tmpl}
+				actual, ok := m.parseToolCalls(tt.output)
+				if ok != tt.ok {
+					t.Fatalf("expected %t, got %t", tt.ok, ok)
+				}
+
+				if tt.ok {
+					if diff := cmp.Diff(actual, calls); diff != "" {
+						t.Errorf("mismatch (-got +want):\n%s", diff)
+					}
+				}
+			})
+		})
+	}
+}
+
+// TestParseObjects checks which JSON objects parseObjects recovers from a range of inputs.
+func TestParseObjects(t *testing.T) {
+	cases := []struct {
+		input string
+		want  []map[string]any
+	}{
+		{
+			input: `[{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`,
+			want: []map[string]any{
+				{"name": "get_current_weather", "arguments": map[string]any{"format": "fahrenheit", "location": "San Francisco, CA"}},
+				{"name": "get_current_weather", "arguments": map[string]any{"format": "celsius", "location": "Toronto, Canada"}},
+			},
+		},
+		{
+			input: `<toolcall>{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} </toolcall>`,
+			want: []map[string]any{
+				{"name": "get_current_weather", "arguments": map[string]any{"format": "fahrenheit", "location": "San Francisco, CA"}},
+			},
+		},
+		{
+			input: `<toolcall>{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} </toolcall> <toolcall>{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, ON"}} </toolcall>`,
+			want: []map[string]any{
+				{"name": "get_current_weather", "arguments": map[string]any{"format": "fahrenheit", "location": "San Francisco, CA"}},
+				{"name": "get_current_weather", "arguments": map[string]any{"format": "celsius", "location": "Toronto, ON"}},
+			},
+		},
+		{
+			// truncated object: nothing can be decoded
+			input: `{"name": "get_current_weather", "arguments": `,
+			want:  nil,
+		},
+	}
+
+	for _, tt := range cases {
+		t.Run(tt.input, func(t *testing.T) {
+			if diff := cmp.Diff(parseObjects(tt.input), tt.want); diff != "" {
+				t.Errorf("mismatch (-got +want):\n%s", diff)
+			}
+		})
+	}
+}
--- a/server/modelpath.go
+++ b/server/modelpath.go
@@ -116,7 +116,7 @@ func (mp ModelPath) BaseURL() *url.URL {
 func GetManifestPath() (string, error) {
 	path := filepath.Join(envconfig.Models(), "manifests")
 	if err := os.MkdirAll(path, 0o755); err != nil {
-		return "", fmt.Errorf("%w: ensure path elements are traversable", err)
+		return "", err
 	}

 	return path, nil
@@ -139,7 +139,7 @@ func GetBlobsPath(digest string) (string, error) {
 	}

 	if err := os.MkdirAll(dirPath, 0o755); err != nil {
-		return "", fmt.Errorf("%w: ensure path elements are traversable", err)
+		return "", err
 	}

 	return path, nil
--- a/server/prompt.go
+++ b/server/prompt.go
@@ -19,7 +19,7 @@ type tokenizeFunc func(context.Context, string) ([]int, error)
 // chatPrompt accepts a list of messages and returns the prompt and images that should be used for the next chat turn.
 // chatPrompt truncates any messages that exceed the context window of the model, making sure to always include 1) the
 // latest message and 2) system messages
-func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.Options, msgs []api.Message, tools []api.Tool, think *bool) (prompt string, images []llm.ImageData, _ error) {
+func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.Options, msgs []api.Message, tools []api.Tool) (prompt string, images []llm.ImageData, _ error) {
 	var system []api.Message

 	// TODO: Ideally we would compute this from the projector metadata but some pieces are implementation dependent
@@ -41,12 +41,8 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
 			}
 		}

-		thinkVal := false
-		if think != nil {
-			thinkVal = *think
-		}
 		var b bytes.Buffer
-		if err := m.Template.Execute(&b, template.Values{Messages: append(system, msgs[i:]...), Tools: tools, Think: thinkVal, IsThinkSet: think != nil}); err != nil {
+		if err := m.Template.Execute(&b, template.Values{Messages: append(system, msgs[i:]...), Tools: tools}); err != nil {
 			return "", nil, err
 		}

@@ -100,11 +96,7 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.

 	// truncate any messages that do not fit into the context window
 	var b bytes.Buffer
-	thinkVal := false
-	if think != nil {
-		thinkVal = *think
-	}
-	if err := m.Template.Execute(&b, template.Values{Messages: append(system, msgs[currMsgIdx:]...), Tools: tools, Think: thinkVal, IsThinkSet: think != nil}); err != nil {
+	if err := m.Template.Execute(&b, template.Values{Messages: append(system, msgs[currMsgIdx:]...), Tools: tools}); err != nil {
 		return "", nil, err
 	}

--- a/server/prompt_test.go
+++ b/server/prompt_test.go
@@ -208,8 +208,7 @@ func TestChatPrompt(t *testing.T) {
 		t.Run(tt.name, func(t *testing.T) {
 			model := tt.model
 			opts := api.Options{Runner: api.Runner{NumCtx: tt.limit}}
-			think := false
-			prompt, images, err := chatPrompt(t.Context(), &model, mockRunner{}.Tokenize, &opts, tt.msgs, nil, &think)
+			prompt, images, err := chatPrompt(t.Context(), &model, mockRunner{}.Tokenize, &opts, tt.msgs, nil)
 			if tt.error == nil && err != nil {
 				t.Fatal(err)
 			} else if tt.error != nil && err != tt.error {
--- a/server/quantization.go
+++ b/server/quantization.go
@@ -120,30 +120,14 @@ func getTensorNewType(kv fsggml.KV, qs *quantizeState, newType fsggml.TensorType

 	if newType.IsQuantized() {
 		nx := shape[0]
+		ny := uint64(1)
+		if len(shape) > 1 {
+			ny = shape[1]
+		}
 		qk_k := newType.BlockSize()
-
-		// Check if first dimension is divisible by block size
 		if nx%qk_k != 0 {
-			// Store the original type for logging
-			originalType := newType
-
-			// Select appropriate fallback based on original type
-			switch newType {
-			case fsggml.TensorTypeQ4_K:
-				newType = fsggml.TensorTypeQ5_0
-			case fsggml.TensorTypeQ5_K:
-				newType = fsggml.TensorTypeQ5_1
-			case fsggml.TensorTypeQ6_K:
-				newType = fsggml.TensorTypeQ8_0
-			}
-
-			// Final check - if still incompatible, fall back to F16
-			if nx%newType.BlockSize() != 0 {
-				newType = fsggml.TensorTypeF16
-			}
-
-			slog.Warn(fmt.Sprintf("tensor cols %d are not divisible by %d, required for %s - using fallback quantization %s",
-				nx, qk_k, originalType.String(), newType.String()))
+			slog.Warn(fmt.Sprintf("tensor cols %d x %d are not divisible by %d, required for %s.  Falling back to quantization %s", nx, ny, qk_k, newType.String(), fsggml.TensorTypeF16.String()))
+			newType = fsggml.TensorTypeF16
 		}
 	}
 	return newType
--- a/server/quantization_test.go
+++ b/server/quantization_test.go
@@ -271,7 +271,7 @@ func TestQuantizeModel(t *testing.T) {
 				t.Fatal(err.Error())
 			}
 			defer fp.Close()
-			meta, err := fsggml.Decode(fp, -1)
+			meta, _, err := fsggml.Decode(fp, -1)
 			if err != nil {
 				t.Fatal(err.Error())
 			}
@@ -303,7 +303,7 @@ func TestQuantizeModel(t *testing.T) {
 				t.Fatalf("failed to load the quantized model %s: %s", tmp.Name(), err)
 			}
 			defer fpNew.Close()
-			newMeta, err := fsggml.Decode(fpNew, -1)
+			newMeta, _, err := fsggml.Decode(fpNew, -1)
 			if err != nil {
 				t.Fatalf("failed to load the quantized model %s: %s", tmp.Name(), err)
 			}
--- a/server/routes.go
+++ b/server/routes.go
@@ -17,6 +17,7 @@ import (
 	"net/netip"
 	"os"
 	"os/signal"
+	"regexp"
 	"slices"
 	"strings"
 	"syscall"
@@ -37,8 +38,6 @@ import (
 	"github.com/ollama/ollama/server/internal/client/ollama"
 	"github.com/ollama/ollama/server/internal/registry"
 	"github.com/ollama/ollama/template"
-	"github.com/ollama/ollama/thinking"
-	"github.com/ollama/ollama/tools"
 	"github.com/ollama/ollama/types/errtypes"
 	"github.com/ollama/ollama/types/model"
 	"github.com/ollama/ollama/version"
@@ -186,13 +185,6 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 	if req.Suffix != "" {
 		caps = append(caps, model.CapabilityInsert)
 	}
-	if req.Think != nil && *req.Think {
-		caps = append(caps, model.CapabilityThinking)
-		// TODO(drifkin): consider adding a warning if it's false and the model
-		// doesn't support thinking. It's not strictly required, but it can be a
-		// hint that the user is on an older qwen3/r1 model that doesn't have an
-		// updated template supporting thinking
-	}

 	r, m, opts, err := s.scheduleRunner(c.Request.Context(), name.String(), caps, req.Options, req.KeepAlive)
 	if errors.Is(err, errCapabilityCompletion) {
@@ -261,9 +253,6 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 			values.Messages = append(msgs, api.Message{Role: "user", Content: req.Prompt})
 		}

-		values.Think = req.Think != nil && *req.Think
-		values.IsThinkSet = req.Think != nil
-
 		var b bytes.Buffer
 		if req.Context != nil {
 			slog.Warn("the context field is deprecated and will be removed in a future version of Ollama")
@@ -283,15 +272,6 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 		prompt = b.String()
 	}

-	var thinkingState *thinking.Parser
-	openingTag, closingTag := thinking.InferTags(m.Template.Template)
-	if req.Think != nil && *req.Think && openingTag != "" && closingTag != "" {
-		thinkingState = &thinking.Parser{
-			OpeningTag: openingTag,
-			ClosingTag: closingTag,
-		}
-	}
-
 	ch := make(chan any)
 	go func() {
 		// TODO (jmorganca): avoid building the response twice both here and below
@@ -316,12 +296,6 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 				},
 			}

-			if thinkingState != nil {
-				thinking, content := thinkingState.AddContent(cr.Content)
-				res.Thinking = thinking
-				res.Response = content
-			}
-
 			if _, err := sb.WriteString(cr.Content); err != nil {
 				ch <- gin.H{"error": err.Error()}
 			}
@@ -349,13 +323,11 @@ func (s *Server) GenerateHandler(c *gin.Context) {

 	if req.Stream != nil && !*req.Stream {
 		var r api.GenerateResponse
-		var sbThinking strings.Builder
-		var sbContent strings.Builder
+		var sb strings.Builder
 		for rr := range ch {
 			switch t := rr.(type) {
 			case api.GenerateResponse:
-				sbThinking.WriteString(t.Thinking)
-				sbContent.WriteString(t.Response)
+				sb.WriteString(t.Response)
 				r = t
 			case gin.H:
 				msg, ok := t["error"].(string)
@@ -371,9 +343,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 			}
 		}

-		r.Thinking = sbThinking.String()
-		r.Response = sbContent.String()
-
+		r.Response = sb.String()
 		c.JSON(http.StatusOK, r)
 		return
 	}
@@ -929,7 +899,8 @@ func (s *Server) ListHandler(c *gin.Context) {
 			}
 		}

-		r := api.ListModelResponse{
+		// tag should never be masked
+		models = append(models, api.ListModelResponse{
 			Model:      n.DisplayShortest(),
 			Name:       n.DisplayShortest(),
 			Size:       m.Size(),
@@ -942,16 +913,7 @@ func (s *Server) ListHandler(c *gin.Context) {
 				ParameterSize:     cf.ModelType,
 				QuantizationLevel: cf.FileType,
 			},
-		}
-
-		model, err := GetModel(n.String())
-		if err != nil {
-			slog.Warn("bad model details", "name", n, "error", err)
-		} else {
-			r.Capabilities = model.Capabilities()
-		}
-
-		models = append(models, r)
+		})
 	}

 	slices.SortStableFunc(models, func(i, j api.ListModelResponse) int {
@@ -1473,9 +1435,6 @@ func (s *Server) ChatHandler(c *gin.Context) {
 	if len(req.Tools) > 0 {
 		caps = append(caps, model.CapabilityTools)
 	}
-	if req.Think != nil && *req.Think {
-		caps = append(caps, model.CapabilityThinking)
-	}

 	name := model.ParseName(req.Model)
 	if !name.IsValid() {
@@ -1516,36 +1475,18 @@ func (s *Server) ChatHandler(c *gin.Context) {
 	}
 	msgs = filterThinkTags(msgs, m)

-	prompt, images, err := chatPrompt(c.Request.Context(), m, r.Tokenize, opts, msgs, req.Tools, req.Think)
+	prompt, images, err := chatPrompt(c.Request.Context(), m, r.Tokenize, opts, msgs, req.Tools)
 	if err != nil {
 		slog.Error("chat prompt error", "error", err)
 		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		return
 	}

-	var thinkingState *thinking.Parser
-	openingTag, closingTag := thinking.InferTags(m.Template.Template)
-	if req.Think != nil && *req.Think && openingTag != "" && closingTag != "" {
-		thinkingState = &thinking.Parser{
-			OpeningTag: openingTag,
-			ClosingTag: closingTag,
-		}
-	}
-
-	var toolParser *tools.Parser
-	if len(req.Tools) > 0 {
-		toolParser, err = tools.NewParser(m.Template.Template)
-		if err != nil {
-			slog.Error("failed to create tool parser", "error", err)
-			c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
-			return
-		}
-	}
-
 	ch := make(chan any)
 	go func() {
 		defer close(ch)
-
+		var sb strings.Builder
+		var toolCallIndex int = 0
 		if err := r.Completion(c.Request.Context(), llm.CompletionRequest{
 			Prompt:  prompt,
 			Images:  images,
@@ -1565,40 +1506,43 @@ func (s *Server) ChatHandler(c *gin.Context) {
 				},
 			}

-			if thinkingState != nil {
-				thinkingContent, remainingContent := thinkingState.AddContent(res.Message.Content)
-				if thinkingContent == "" && remainingContent == "" && !r.Done {
-					// need to accumulate more to decide what to send
-					return
-				}
-				res.Message.Content = remainingContent
-				res.Message.Thinking = thinkingContent
-			}
-
 			if r.Done {
 				res.DoneReason = r.DoneReason.String()
 				res.TotalDuration = time.Since(checkpointStart)
 				res.LoadDuration = checkpointLoaded.Sub(checkpointStart)
 			}

-			if len(req.Tools) > 0 {
-				toolCalls, content := toolParser.Add(res.Message.Content)
-				if len(content) > 0 {
-					res.Message.Content = content
-				} else if len(toolCalls) > 0 {
-					res.Message.ToolCalls = toolCalls
-					res.Message.Content = ""
-				} else if res.Message.Thinking != "" {
-					// don't return
-				} else {
-					if r.Done {
-						ch <- res
-					}
-					return
-				}
+			// TODO: tool call checking and filtering should be moved outside of this callback once streaming
+			// is reworked; for now this is a simple change that avoids reworking the streaming logic of this
+			// (and other) handlers
+			if req.Stream != nil && !*req.Stream || len(req.Tools) == 0 {
+				ch <- res
+				return
 			}

-			ch <- res
+			// Streaming tool calls:
+			// Buffer content until it parses as tool calls; toolCallIndex counts the tool calls sent downstream
+			// so that the buffered content is only sent on the last chunk when no tool calls were detected
+			sb.WriteString(r.Content)
+			if toolCalls, ok := m.parseToolCalls(sb.String()); ok {
+				res.Message.ToolCalls = toolCalls
+				for i := range toolCalls {
+					toolCalls[i].Function.Index = toolCallIndex
+					toolCallIndex++
+				}
+				res.Message.Content = ""
+				sb.Reset()
+				ch <- res
+				return
+			}
+
+			if r.Done {
+				// Send any remaining content if no tool calls were detected
+				if toolCallIndex == 0 {
+					res.Message.Content = sb.String()
+				}
+				ch <- res
+			}
 		}); err != nil {
 			ch <- gin.H{"error": err.Error()}
 		}
@@ -1606,18 +1550,12 @@ func (s *Server) ChatHandler(c *gin.Context) {

 	if req.Stream != nil && !*req.Stream {
 		var resp api.ChatResponse
-		var toolCalls []api.ToolCall
-		var sbThinking strings.Builder
-		var sbContent strings.Builder
+		var sb strings.Builder
 		for rr := range ch {
 			switch t := rr.(type) {
 			case api.ChatResponse:
-				sbThinking.WriteString(t.Message.Thinking)
-				sbContent.WriteString(t.Message.Content)
+				sb.WriteString(t.Message.Content)
 				resp = t
-				if len(req.Tools) > 0 {
-					toolCalls = append(toolCalls, t.Message.ToolCalls...)
-				}
 			case gin.H:
 				msg, ok := t["error"].(string)
 				if !ok {
@@ -1632,11 +1570,13 @@ func (s *Server) ChatHandler(c *gin.Context) {
 			}
 		}

-		resp.Message.Content = sbContent.String()
-		resp.Message.Thinking = sbThinking.String()
+		resp.Message.Content = sb.String()

-		if len(toolCalls) > 0 {
-			resp.Message.ToolCalls = toolCalls
+		if len(req.Tools) > 0 {
+			if toolCalls, ok := m.parseToolCalls(sb.String()); ok {
+				resp.Message.ToolCalls = toolCalls
+				resp.Message.Content = ""
+			}
 		}

 		c.JSON(http.StatusOK, resp)
@@ -1661,6 +1601,8 @@ func handleScheduleError(c *gin.Context, name string, err error) {
 	}
 }

+var thinkTagRegexp = regexp.MustCompile(`<think>(?s).*?</think>(\n)*`) // matches a <think>...</think> span (lazily, across newlines) plus any newlines that follow it
+
 func filterThinkTags(msgs []api.Message, m *Model) []api.Message {
 	if m.Config.ModelFamily == "qwen3" || model.ParseName(m.Name).Model == "deepseek-r1" {
 		finalUserIndex := -1
@@ -1672,17 +1614,7 @@ func filterThinkTags(msgs []api.Message, m *Model) []api.Message {

 		for i, msg := range msgs {
 			if msg.Role == "assistant" && i < finalUserIndex {
-				// TODO(drifkin): this is from before we added proper thinking support.
-				// However, even if thinking is not enabled (and therefore we shouldn't
-				// change the user output), we should probably perform this filtering
-				// for all thinking models (not just qwen3 & deepseek-r1) since it tends
-				// to save tokens and improve quality.
-				thinkingState := &thinking.Parser{
-					OpeningTag: "<think>",
-					ClosingTag: "</think>",
-				}
-				_, content := thinkingState.AddContent(msg.Content)
-				msgs[i].Content = content
+				msgs[i].Content = thinkTagRegexp.ReplaceAllString(msg.Content, "")
 			}
 		}
 	}
--- a/server/routes_generate_test.go
+++ b/server/routes_generate_test.go
@@ -143,25 +143,6 @@ func TestGenerateChat(t *testing.T) {
 		}
 	})

-	t.Run("missing thinking capability", func(t *testing.T) {
-		think := true
-		w := createRequest(t, s.ChatHandler, api.ChatRequest{
-			Model: "test",
-			Messages: []api.Message{
-				{Role: "user", Content: "Hello!"},
-			},
-			Think: &think,
-		})
-
-		if w.Code != http.StatusBadRequest {
-			t.Errorf("expected status 400, got %d", w.Code)
-		}
-
-		if diff := cmp.Diff(w.Body.String(), `{"error":"registry.ollama.ai/library/test:latest does not support thinking"}`); diff != "" {
-			t.Errorf("mismatch (-got +want):\n%s", diff)
-		}
-	})
-
 	t.Run("missing model", func(t *testing.T) {
 		w := createRequest(t, s.ChatHandler, api.ChatRequest{})
 		if w.Code != http.StatusBadRequest {
--- a/server/sched.go
+++ b/server/sched.go
@@ -387,17 +387,6 @@ func (s *Scheduler) processCompleted(ctx context.Context) {
 				s.loadedMu.Unlock()
 				runner.refMu.Unlock()
 				slog.Debug("duplicate expired event, ignoring", "runner", runner)
-			} else if runner.pid != runnerToUnload.pid {
-				// If the pids do not match, we likely had multiple load
-				// failures for the same model in quick succession due to
-				// request context canceled and are draining the queue of
-				// events. Ensure the orphaned runner is properly shut down, but
-				// do not delete the mismatched loaded runner, or wait for VRAM
-				// convergence.
-				slog.Debug("orphaned runner shutting down", "orphan", runner, "loaded", runnerToUnload)
-				runner.unload()
-				s.loadedMu.Unlock()
-				runner.refMu.Unlock()
 			} else {
 				slog.Debug("starting background wait for VRAM recovery", "runner", runner)
 				finished := runner.waitForVRAMRecovery()
--- a/Show More
+++ b/Show More