runner: enable returning more info from runner processing

Currently we return only the text predicted from the LLM. This was nice in that it was simple, but there may be other info we want to know from the processing. This change adds the ability to return more information from the runner than just the text predicted.
tools: loosen tool parsing to allow for more formats (#11030 )
2026-01-08 07:30:53 -05:00 · 2025-06-13 16:26:57 -07:00 · 2025-06-12 14:18:54 -07:00 · 2025-06-12 11:04:11 -07:00 · 2025-06-11 12:10:54 -07:00 · 2025-06-11 12:10:35 -07:00
148 changed files with 8390 additions and 3435 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -51,6 +51,8 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/include
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cpu)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cpu/amx)

+add_compile_definitions(NDEBUG)
+
 set(GGML_CPU ON)
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src)
 set_property(TARGET ggml PROPERTY EXCLUDE_FROM_ALL TRUE)
--- a/README.md
+++ b/README.md
@@ -40,10 +40,10 @@ The official [Ollama Docker image](https://hub.docker.com/r/ollama/ollama) `olla

 ## Quickstart

-To run and chat with [Llama 3.2](https://ollama.com/library/llama3.2):
+To run and chat with [Gemma 3](https://ollama.com/library/gemma3):

 ```shell
-ollama run llama3.2
+ollama run gemma3
 ```

 ## Model library
@@ -405,6 +405,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Writeopia](https://github.com/Writeopia/Writeopia) (Text editor with integration with Ollama)
 - [AppFlowy](https://github.com/AppFlowy-IO/AppFlowy) (AI collaborative workspace with Ollama, cross-platform and self-hostable)
 - [Lumina](https://github.com/cushydigit/lumina.git) (A lightweight, minimal React.js frontend for interacting with Ollama servers)
+- [Tiny Notepad](https://pypi.org/project/tiny-notepad) (A lightweight, notepad-like interface to chat with ollama available on PyPI)
+- [macLlama (macOS native)](https://github.com/hellotunamayo/macLlama) (A native macOS GUI application for interacting with Ollama models, featuring a chat interface.) 

 ### Cloud

@@ -448,6 +450,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [orbiton](https://github.com/xyproto/orbiton) Configuration-free text editor and IDE with support for tab completion with Ollama.
 - [orca-cli](https://github.com/molbal/orca-cli) Ollama Registry CLI Application - Browse, pull, and download models from Ollama Registry in your terminal.
 - [GGUF-to-Ollama](https://github.com/jonathanhecl/gguf-to-ollama) - Importing GGUF to Ollama made easy (multiplatform)
+- [AWS-Strands-With-Ollama](https://github.com/rapidarchitect/ollama_strands) - AWS Strands Agents with Ollama Examples
+- [ollama-multirun](https://github.com/attogram/ollama-multirun) - A bash shell script to run a single prompt against any or all of your locally installed ollama models, saving the output and performance statistics as easily navigable web pages. ([Demo](https://attogram.github.io/ai_test_zone/))

 ### Apple Vision Pro

@@ -584,6 +588,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Simple-Discord-AI](https://github.com/zyphixor/simple-discord-ai)
 - [LLM Telegram Bot](https://github.com/innightwolfsleep/llm_telegram_bot) (telegram bot, primary for RP. Oobabooga-like buttons, [A1111](https://github.com/AUTOMATIC1111/stable-diffusion-webui) API integration e.t.c)
 - [mcp-llm](https://github.com/sammcj/mcp-llm) (MCP Server to allow LLMs to call other LLMs)
+- [SimpleOllamaUnity](https://github.com/HardCodeDev777/SimpleOllamaUnity) (Unity Engine extension for communicating with Ollama in a few lines of code. Also works at runtime)
 - [UnityCodeLama](https://github.com/HardCodeDev777/UnityCodeLama) (Unity Edtior tool to analyze scripts via Ollama)

 ### Supported backends
--- a/api/client.go
+++ b/api/client.go
@@ -24,7 +24,10 @@ import (
 	"net/http"
 	"net/url"
 	"runtime"
+	"strconv"
+	"time"

+	"github.com/ollama/ollama/auth"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/version"
@@ -76,6 +79,14 @@ func NewClient(base *url.URL, http *http.Client) *Client {
 	}
 }

+func getAuthorizationToken(ctx context.Context, challenge string) (string, error) {
+	token, err := auth.Sign(ctx, []byte(challenge))
+	if err != nil {
+		return "", err
+	}
+	return token, nil
+}
+
 func (c *Client) do(ctx context.Context, method, path string, reqData, respData any) error {
 	var reqBody io.Reader
 	var data []byte
@@ -97,6 +108,21 @@ func (c *Client) do(ctx context.Context, method, path string, reqData, respData
 	}

 	requestURL := c.base.JoinPath(path)
+
+	var token string
+	if envconfig.UseAuth() || c.base.Hostname() == "ollama.com" {
+		now := strconv.FormatInt(time.Now().Unix(), 10)
+		chal := fmt.Sprintf("%s,%s?ts=%s", method, path, now)
+		token, err = getAuthorizationToken(ctx, chal)
+		if err != nil {
+			return err
+		}
+
+		q := requestURL.Query()
+		q.Set("ts", now)
+		requestURL.RawQuery = q.Encode()
+	}
+
 	request, err := http.NewRequestWithContext(ctx, method, requestURL.String(), reqBody)
 	if err != nil {
 		return err
@@ -106,6 +132,10 @@ func (c *Client) do(ctx context.Context, method, path string, reqData, respData
 	request.Header.Set("Accept", "application/json")
 	request.Header.Set("User-Agent", fmt.Sprintf("ollama/%s (%s %s) Go/%s", version.Version, runtime.GOARCH, runtime.GOOS, runtime.Version()))

+	if token != "" {
+		request.Header.Set("Authorization", token)
+	}
+
 	respObj, err := c.http.Do(request)
 	if err != nil {
 		return err
@@ -143,6 +173,22 @@ func (c *Client) stream(ctx context.Context, method, path string, data any, fn f
 	}

 	requestURL := c.base.JoinPath(path)
+
+	var token string
+	if envconfig.UseAuth() || c.base.Hostname() == "ollama.com" {
+		var err error
+		now := strconv.FormatInt(time.Now().Unix(), 10)
+		chal := fmt.Sprintf("%s,%s?ts=%s", method, path, now)
+		token, err = getAuthorizationToken(ctx, chal)
+		if err != nil {
+			return err
+		}
+
+		q := requestURL.Query()
+		q.Set("ts", now)
+		requestURL.RawQuery = q.Encode()
+	}
+
 	request, err := http.NewRequestWithContext(ctx, method, requestURL.String(), buf)
 	if err != nil {
 		return err
@@ -152,6 +198,10 @@ func (c *Client) stream(ctx context.Context, method, path string, data any, fn f
 	request.Header.Set("Accept", "application/x-ndjson")
 	request.Header.Set("User-Agent", fmt.Sprintf("ollama/%s (%s %s) Go/%s", version.Version, runtime.GOARCH, runtime.GOOS, runtime.Version()))

+	if token != "" {
+		request.Header.Set("Authorization", token)
+	}
+
 	response, err := c.http.Do(request)
 	if err != nil {
 		return err
--- a/api/types.go
+++ b/api/types.go
@@ -83,6 +83,12 @@ type GenerateRequest struct {
 	// Options lists model-specific options. For example, temperature can be
 	// set through this field, if the model supports it.
 	Options map[string]any `json:"options"`
+
+	// Think controls whether thinking/reasoning models will think before
+	// responding. Needs to be a pointer so we can distinguish between false
+	// (request that thinking _not_ be used) and unset (use the old behavior
+	// before this option was introduced)
+	Think *bool `json:"think,omitempty"`
 }

 // ChatRequest describes a request sent by [Client.Chat].
@@ -108,6 +114,10 @@ type ChatRequest struct {

 	// Options lists model-specific options.
 	Options map[string]any `json:"options"`
+
+	// Think controls whether thinking/reasoning models will think before
+	// responding
+	Think *bool `json:"think,omitempty"`
 }

 type Tools []Tool
@@ -126,8 +136,11 @@ func (t Tool) String() string {
 // role ("system", "user", or "assistant"), the content and an optional list
 // of images.
 type Message struct {
-	Role      string      `json:"role"`
-	Content   string      `json:"content"`
+	Role    string `json:"role"`
+	Content string `json:"content"`
+	// Thinking contains the text that was inside thinking tags in the
+	// original model output when ChatRequest.Think is enabled.
+	Thinking  string      `json:"thinking,omitempty"`
 	Images    []ImageData `json:"images,omitempty"`
 	ToolCalls []ToolCall  `json:"tool_calls,omitempty"`
 }
@@ -478,6 +491,10 @@ type GenerateResponse struct {
 	// Response is the textual response itself.
 	Response string `json:"response"`

+	// Thinking contains the text that was inside thinking tags in the
+	// original model output when ChatRequest.Think is enabled.
+	Thinking string `json:"thinking,omitempty"`
+
 	// Done specifies if the response is complete.
 	Done bool `json:"done"`

--- a/api/types_test.go
+++ b/api/types_test.go
@@ -372,3 +372,50 @@ func TestPropertyType_MarshalJSON(t *testing.T) {
 		})
 	}
 }
+
+func TestThinking_UnmarshalJSON(t *testing.T) {
+	trueVal := true
+	falseVal := false
+
+	tests := []struct {
+		name             string
+		input            string
+		expectedThinking *bool
+		expectedError    bool
+	}{
+		{
+			name:             "true",
+			input:            `{ "think": true }`,
+			expectedThinking: &trueVal,
+		},
+		{
+			name:             "false",
+			input:            `{ "think": false }`,
+			expectedThinking: &falseVal,
+		},
+		{
+			name:             "unset",
+			input:            `{ }`,
+			expectedThinking: nil,
+		},
+		{
+			name:             "invalid",
+			input:            `{ "think": "true" }`,
+			expectedThinking: nil,
+			expectedError:    true,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			var req GenerateRequest
+			err := json.Unmarshal([]byte(test.input), &req)
+			if test.expectedError {
+				require.Error(t, err)
+			} else {
+				require.NoError(t, err)
+				assert.Equal(t, test.expectedThinking, req.Think)
+			}
+		})
+	}
+}
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -39,6 +39,7 @@ import (
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/parser"
 	"github.com/ollama/ollama/progress"
+	"github.com/ollama/ollama/readline"
 	"github.com/ollama/ollama/runner"
 	"github.com/ollama/ollama/server"
 	"github.com/ollama/ollama/types/model"
@@ -46,6 +47,23 @@ import (
 	"github.com/ollama/ollama/version"
 )

+// ensureThinkingSupport emits a warning if the model does not advertise thinking support
+func ensureThinkingSupport(ctx context.Context, client *api.Client, name string) {
+	if name == "" {
+		return
+	}
+	resp, err := client.Show(ctx, &api.ShowRequest{Model: name})
+	if err != nil {
+		return
+	}
+	for _, cap := range resp.Capabilities {
+		if cap == model.CapabilityThinking {
+			return
+		}
+	}
+	fmt.Fprintf(os.Stderr, "warning: model %q does not support thinking output\n", name)
+}
+
 var errModelfileNotFound = errors.New("specified Modelfile wasn't found")

 func getModelfileName(cmd *cobra.Command) (string, error) {
@@ -265,6 +283,9 @@ func loadOrUnloadModel(cmd *cobra.Command, opts *runOptions) error {
 	req := &api.GenerateRequest{
 		Model:     opts.Model,
 		KeepAlive: opts.KeepAlive,
+
+		// pass Think here so we fail before getting to the chat prompt if the model doesn't support it
+		Think: opts.Think,
 	}

 	return client.Generate(cmd.Context(), req, func(api.GenerateResponse) error { return nil })
@@ -299,6 +320,22 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 	}
 	opts.Format = format

+	thinkFlag := cmd.Flags().Lookup("think")
+	if thinkFlag.Changed {
+		think, err := cmd.Flags().GetBool("think")
+		if err != nil {
+			return err
+		}
+		opts.Think = &think
+	} else {
+		opts.Think = nil
+	}
+	hidethinking, err := cmd.Flags().GetBool("hidethinking")
+	if err != nil {
+		return err
+	}
+	opts.HideThinking = hidethinking
+
 	keepAlive, err := cmd.Flags().GetString("keepalive")
 	if err != nil {
 		return err
@@ -362,6 +399,11 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 		return err
 	}

+	opts.Think, err = inferThinkingOption(&info.Capabilities, &opts, thinkFlag.Changed)
+	if err != nil {
+		return err
+	}
+
 	opts.MultiModal = slices.Contains(info.Capabilities, model.CapabilityVision)

 	// TODO: remove the projector info and vision info checks below,
@@ -747,11 +789,38 @@ func showInfo(resp *api.ShowResponse, verbose bool, w io.Writer) error {
 				case float64:
 					v = fmt.Sprintf("%g", vData)
 				case []any:
-					n := 3
-					if len(vData) < n {
-						n = len(vData)
+					targetWidth := 10 // Small width where we are displaying the data in a column
+
+					var itemsToShow int
+					totalWidth := 1 // Start with 1 for opening bracket
+
+					// Find how many we can fit
+					for i := range vData {
+						itemStr := fmt.Sprintf("%v", vData[i])
+						width := runewidth.StringWidth(itemStr)
+
+						// Add separator width (", ") for all items except the first
+						if i > 0 {
+							width += 2
+						}
+
+						// Check if adding this item would exceed our width limit
+						if totalWidth+width > targetWidth && i > 0 {
+							break
+						}
+
+						totalWidth += width
+						itemsToShow++
+					}
+
+					// Format the output
+					if itemsToShow < len(vData) {
+						v = fmt.Sprintf("%v", vData[:itemsToShow])
+						v = strings.TrimSuffix(v, "]")
+						v += fmt.Sprintf(" ...+%d more]", len(vData)-itemsToShow)
+					} else {
+						v = fmt.Sprintf("%v", vData)
 					}
-					v = fmt.Sprintf("%v", vData[:n])
 				default:
 					v = fmt.Sprintf("%T", vData)
 				}
@@ -772,10 +841,19 @@ func showInfo(resp *api.ShowResponse, verbose bool, w io.Writer) error {

 	head := func(s string, n int) (rows [][]string) {
 		scanner := bufio.NewScanner(strings.NewReader(s))
-		for scanner.Scan() && (len(rows) < n || n < 0) {
-			if text := scanner.Text(); text != "" {
-				rows = append(rows, []string{"", strings.TrimSpace(text)})
+		count := 0
+		for scanner.Scan() {
+			text := strings.TrimSpace(scanner.Text())
+			if text == "" {
+				continue
 			}
+			count++
+			if n < 0 || count <= n {
+				rows = append(rows, []string{"", text})
+			}
+		}
+		if n >= 0 && count > n {
+			rows = append(rows, []string{"", "..."})
 		}
 		return
 	}
@@ -887,17 +965,19 @@ func PullHandler(cmd *cobra.Command, args []string) error {
 type generateContextKey string

 type runOptions struct {
-	Model       string
-	ParentModel string
-	Prompt      string
-	Messages    []api.Message
-	WordWrap    bool
-	Format      string
-	System      string
-	Images      []api.ImageData
-	Options     map[string]any
-	MultiModal  bool
-	KeepAlive   *api.Duration
+	Model        string
+	ParentModel  string
+	Prompt       string
+	Messages     []api.Message
+	WordWrap     bool
+	Format       string
+	System       string
+	Images       []api.ImageData
+	Options      map[string]any
+	MultiModal   bool
+	KeepAlive    *api.Duration
+	Think        *bool
+	HideThinking bool
 }

 type displayResponseState struct {
@@ -953,6 +1033,26 @@ func displayResponse(content string, wordWrap bool, state *displayResponseState)
 	}
 }

+func thinkingOutputOpeningText(plainText bool) string {
+	text := "Thinking...\n"
+
+	if plainText {
+		return text
+	}
+
+	return readline.ColorGrey + readline.ColorBold + text + readline.ColorDefault + readline.ColorGrey
+}
+
+func thinkingOutputClosingText(plainText bool) string {
+	text := "...done thinking.\n\n"
+
+	if plainText {
+		return text
+	}
+
+	return readline.ColorGrey + readline.ColorBold + text + readline.ColorDefault
+}
+
 func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
 	client, err := api.ClientFromEnvironment()
 	if err != nil {
@@ -980,14 +1080,34 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
 	var latest api.ChatResponse
 	var fullResponse strings.Builder
 	var role string
+	var thinkTagOpened bool = false
+	var thinkTagClosed bool = false

 	fn := func(response api.ChatResponse) error {
-		p.StopAndClear()
+		if response.Message.Content != "" || !opts.HideThinking {
+			p.StopAndClear()
+		}

 		latest = response

 		role = response.Message.Role
+		if response.Message.Thinking != "" && !opts.HideThinking {
+			if !thinkTagOpened {
+				fmt.Print(thinkingOutputOpeningText(false))
+				thinkTagOpened = true
+			}
+			displayResponse(response.Message.Thinking, opts.WordWrap, state)
+		}
+
 		content := response.Message.Content
+		if thinkTagOpened && !thinkTagClosed && content != "" {
+			fmt.Print(thinkingOutputClosingText(false))
+			thinkTagClosed = true
+		}
+		// purposefully not putting thinking blocks in the response, which would
+		// only be needed if we later added tool calling to the cli (they get
+		// filtered out anyway since current models don't expect them unless you're
+		// about to finish some tool calls)
 		fullResponse.WriteString(content)

 		displayResponse(content, opts.WordWrap, state)
@@ -1004,6 +1124,7 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
 		Messages: opts.Messages,
 		Format:   json.RawMessage(opts.Format),
 		Options:  opts.Options,
+		Think:    opts.Think,
 	}

 	if opts.KeepAlive != nil {
@@ -1065,13 +1186,32 @@ func generate(cmd *cobra.Command, opts runOptions) error {
 	}()

 	var state *displayResponseState = &displayResponseState{}
+	var thinkTagOpened bool = false
+	var thinkTagClosed bool = false
+
+	plainText := !term.IsTerminal(int(os.Stdout.Fd()))

 	fn := func(response api.GenerateResponse) error {
-		p.StopAndClear()
-
 		latest = response
 		content := response.Response

+		if response.Response != "" || !opts.HideThinking {
+			p.StopAndClear()
+		}
+
+		if response.Thinking != "" && !opts.HideThinking {
+			if !thinkTagOpened {
+				fmt.Print(thinkingOutputOpeningText(plainText))
+				thinkTagOpened = true
+			}
+			displayResponse(response.Thinking, opts.WordWrap, state)
+		}
+
+		if thinkTagOpened && !thinkTagClosed && content != "" {
+			fmt.Print(thinkingOutputClosingText(plainText))
+			thinkTagClosed = true
+		}
+
 		displayResponse(content, opts.WordWrap, state)

 		return nil
@@ -1097,6 +1237,7 @@ func generate(cmd *cobra.Command, opts runOptions) error {
 		System:    opts.System,
 		Options:   opts.Options,
 		KeepAlive: opts.KeepAlive,
+		Think:     opts.Think,
 	}

 	if err := client.Generate(ctx, &request, fn); err != nil {
@@ -1200,11 +1341,11 @@ func checkServerHeartbeat(cmd *cobra.Command, _ []string) error {
 		return err
 	}
 	if err := client.Heartbeat(cmd.Context()); err != nil {
-		if !strings.Contains(err.Error(), " refused") {
+		if !(strings.Contains(err.Error(), " refused") || strings.Contains(err.Error(), "could not connect")) {
 			return err
 		}
 		if err := startApp(cmd.Context(), client); err != nil {
-			return errors.New("could not connect to ollama app, is it running?")
+			return fmt.Errorf("ollama server not responding - %w", err)
 		}
 	}
 	return nil
@@ -1282,7 +1423,7 @@ func NewCLI() *cobra.Command {
 	}

 	createCmd.Flags().StringP("file", "f", "", "Name of the Modelfile (default \"Modelfile\"")
-	createCmd.Flags().StringP("quantize", "q", "", "Quantize model to this level (e.g. q4_0)")
+	createCmd.Flags().StringP("quantize", "q", "", "Quantize model to this level (e.g. q4_K_M)")

 	showCmd := &cobra.Command{
 		Use:     "show MODEL",
@@ -1312,6 +1453,8 @@ func NewCLI() *cobra.Command {
 	runCmd.Flags().Bool("insecure", false, "Use an insecure registry")
 	runCmd.Flags().Bool("nowordwrap", false, "Don't wrap words to the next line automatically")
 	runCmd.Flags().String("format", "", "Response format (e.g. json)")
+	runCmd.Flags().Bool("think", false, "Whether to use thinking mode for supported models")
+	runCmd.Flags().Bool("hidethinking", false, "Hide thinking output (if provided)")

 	stopCmd := &cobra.Command{
 		Use:     "stop MODEL",
@@ -1363,7 +1506,6 @@ func NewCLI() *cobra.Command {
 		PreRunE: checkServerHeartbeat,
 		RunE:    ListRunningHandler,
 	}
-
 	copyCmd := &cobra.Command{
 		Use:     "cp SOURCE DESTINATION",
 		Short:   "Copy a model",
@@ -1452,3 +1594,45 @@ func NewCLI() *cobra.Command {

 	return rootCmd
 }
+
+// If the user has explicitly set thinking options, either through the CLI or
+// through the `/set think` or `set nothink` interactive options, then we
+// respect them. Otherwise, we check model capabilities to see if the model
+// supports thinking. If the model does support thinking, we enable it.
+// Otherwise, we unset the thinking option (which is different than setting it
+// to false).
+//
+// If capabilities are not provided, we fetch them from the server.
+func inferThinkingOption(caps *[]model.Capability, runOpts *runOptions, explicitlySetByUser bool) (*bool, error) {
+	if explicitlySetByUser {
+		return runOpts.Think, nil
+	}
+
+	if caps == nil {
+		client, err := api.ClientFromEnvironment()
+		if err != nil {
+			return nil, err
+		}
+		ret, err := client.Show(context.Background(), &api.ShowRequest{
+			Model: runOpts.Model,
+		})
+		if err != nil {
+			return nil, err
+		}
+		caps = &ret.Capabilities
+	}
+
+	thinkingSupported := false
+	for _, cap := range *caps {
+		if cap == model.CapabilityThinking {
+			thinkingSupported = true
+		}
+	}
+
+	if thinkingSupported {
+		thinking := true
+		return &thinking, nil
+	}
+
+	return nil, nil
+}
--- a/cmd/cmd_test.go
+++ b/cmd/cmd_test.go
@@ -225,6 +225,7 @@ Weigh anchor!
  System
    You are a pirate!    
    Ahoy, matey!         
+    ...                  

 `
 		if diff := cmp.Diff(expect, b.String()); diff != "" {
--- a/cmd/interactive.go
+++ b/cmd/interactive.go
@@ -62,6 +62,8 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 		fmt.Fprintln(os.Stderr, "  /set noformat          Disable formatting")
 		fmt.Fprintln(os.Stderr, "  /set verbose           Show LLM stats")
 		fmt.Fprintln(os.Stderr, "  /set quiet             Disable LLM stats")
+		fmt.Fprintln(os.Stderr, "  /set think             Enable thinking")
+		fmt.Fprintln(os.Stderr, "  /set nothink           Disable thinking")
 		fmt.Fprintln(os.Stderr, "")
 	}

@@ -128,6 +130,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {

 	var sb strings.Builder
 	var multiline MultilineState
+	var thinkExplicitlySet bool = opts.Think != nil

 	for {
 		line, err := scanner.Readline()
@@ -195,11 +198,19 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 			opts.Model = args[1]
 			opts.Messages = []api.Message{}
 			fmt.Printf("Loading model '%s'\n", opts.Model)
+			opts.Think, err = inferThinkingOption(nil, &opts, thinkExplicitlySet)
+			if err != nil {
+				return err
+			}
 			if err := loadOrUnloadModel(cmd, &opts); err != nil {
 				if strings.Contains(err.Error(), "not found") {
 					fmt.Printf("error: %v\n", err)
 					continue
 				}
+				if strings.Contains(err.Error(), "does not support thinking") {
+					fmt.Printf("error: %v\n", err)
+					continue
+				}
 				return err
 			}
 			continue
@@ -260,6 +271,22 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 						return err
 					}
 					fmt.Println("Set 'quiet' mode.")
+				case "think":
+					think := true
+					opts.Think = &think
+					thinkExplicitlySet = true
+					if client, err := api.ClientFromEnvironment(); err == nil {
+						ensureThinkingSupport(cmd.Context(), client, opts.Model)
+					}
+					fmt.Println("Set 'think' mode.")
+				case "nothink":
+					think := false
+					opts.Think = &think
+					thinkExplicitlySet = true
+					if client, err := api.ClientFromEnvironment(); err == nil {
+						ensureThinkingSupport(cmd.Context(), client, opts.Model)
+					}
+					fmt.Println("Set 'nothink' mode.")
 				case "format":
 					if len(args) < 3 || args[2] != "json" {
 						fmt.Println("Invalid or missing format. For 'json' mode use '/set format json'")
@@ -448,6 +475,11 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {

 			assistant, err := chat(cmd, opts)
 			if err != nil {
+				if strings.Contains(err.Error(), "does not support thinking") {
+					fmt.Printf("error: %v\n", err)
+					sb.Reset()
+					continue
+				}
 				return err
 			}
 			if assistant != nil {
--- a/cmd/start_darwin.go
+++ b/cmd/start_darwin.go
@@ -5,7 +5,7 @@ import (
 	"errors"
 	"os"
 	"os/exec"
-	"strings"
+	"regexp"

 	"github.com/ollama/ollama/api"
 )
@@ -19,11 +19,12 @@ func startApp(ctx context.Context, client *api.Client) error {
 	if err != nil {
 		return err
 	}
-	if !strings.Contains(link, "Ollama.app") {
+	r := regexp.MustCompile(`^.*/Ollama\s?\d*.app`)
+	m := r.FindStringSubmatch(link)
+	if len(m) != 1 {
 		return errors.New("could not find ollama app")
 	}
-	path := strings.Split(link, "Ollama.app")
-	if err := exec.Command("/usr/bin/open", "-a", path[0]+"Ollama.app").Run(); err != nil {
+	if err := exec.Command("/usr/bin/open", "-j", "-a", m[0], "--args", "--fast-startup").Run(); err != nil {
 		return err
 	}
 	return waitForServer(ctx, client)
--- a/cmd/start_windows.go
+++ b/cmd/start_windows.go
@@ -4,17 +4,27 @@ import (
 	"context"
 	"errors"
 	"fmt"
+	"log/slog"
 	"os"
 	"os/exec"
+	"path"
 	"path/filepath"
 	"strings"
 	"syscall"
+	"unsafe"

 	"github.com/ollama/ollama/api"
+	"golang.org/x/sys/windows"
+)
+
+const (
+	Installer = "OllamaSetup.exe"
 )

 func startApp(ctx context.Context, client *api.Client) error {
-	// log.Printf("XXX Attempting to find and start ollama app")
+	if len(isProcRunning(Installer)) > 0 {
+		return fmt.Errorf("upgrade in progress...")
+	}
 	AppName := "ollama app.exe"
 	exe, err := os.Executable()
 	if err != nil {
@@ -35,14 +45,11 @@ func startApp(ctx context.Context, client *api.Client) error {
 			}
 		}
 	}
-	// log.Printf("XXX attempting to start app %s", appExe)

 	cmd_path := "c:\\Windows\\system32\\cmd.exe"
-	cmd := exec.Command(cmd_path, "/c", appExe)
-	// TODO - these hide flags aren't working - still pops up a command window for some reason
+	cmd := exec.Command(cmd_path, "/c", appExe, "--hide", "--fast-startup")
 	cmd.SysProcAttr = &syscall.SysProcAttr{CreationFlags: 0x08000000, HideWindow: true}

-	// TODO this didn't help either...
 	cmd.Stdin = strings.NewReader("")
 	cmd.Stdout = os.Stdout
 	cmd.Stderr = os.Stderr
@@ -56,3 +63,50 @@ func startApp(ctx context.Context, client *api.Client) error {
 	}
 	return waitForServer(ctx, client)
 }
+
+func isProcRunning(procName string) []uint32 {
+	pids := make([]uint32, 2048)
+	var ret uint32
+	if err := windows.EnumProcesses(pids, &ret); err != nil || ret == 0 {
+		slog.Debug("failed to check for running installers", "error", err)
+		return nil
+	}
+	if ret > uint32(len(pids)) {
+		pids = make([]uint32, ret+10)
+		if err := windows.EnumProcesses(pids, &ret); err != nil || ret == 0 {
+			slog.Debug("failed to check for running installers", "error", err)
+			return nil
+		}
+	}
+	if ret < uint32(len(pids)) {
+		pids = pids[:ret]
+	}
+	var matches []uint32
+	for _, pid := range pids {
+		if pid == 0 {
+			continue
+		}
+		hProcess, err := windows.OpenProcess(windows.PROCESS_QUERY_INFORMATION|windows.PROCESS_VM_READ, false, pid)
+		if err != nil {
+			continue
+		}
+		defer windows.CloseHandle(hProcess)
+		var module windows.Handle
+		var cbNeeded uint32
+		cb := (uint32)(unsafe.Sizeof(module))
+		if err := windows.EnumProcessModules(hProcess, &module, cb, &cbNeeded); err != nil {
+			continue
+		}
+		var sz uint32 = 1024 * 8
+		moduleName := make([]uint16, sz)
+		cb = uint32(len(moduleName)) * (uint32)(unsafe.Sizeof(uint16(0)))
+		if err := windows.GetModuleBaseName(hProcess, module, &moduleName[0], cb); err != nil && err != syscall.ERROR_INSUFFICIENT_BUFFER {
+			continue
+		}
+		exeFile := path.Base(strings.ToLower(syscall.UTF16ToString(moduleName)))
+		if strings.EqualFold(exeFile, procName) {
+			matches = append(matches, pid)
+		}
+	}
+	return matches
+}
--- a/cmd/warn_thinking_test.go
+++ b/cmd/warn_thinking_test.go
@@ -0,0 +1,63 @@
+package cmd
+
+import (
+	"encoding/json"
+	"io"
+	"net/http"
+	"net/http/httptest"
+	"os"
+	"strings"
+	"testing"
+
+	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/types/model"
+)
+
+// Test that a warning is printed when thinking is requested but not supported.
+func TestWarnMissingThinking(t *testing.T) {
+	cases := []struct {
+		capabilities []model.Capability
+		expectWarn   bool
+	}{
+		{capabilities: []model.Capability{model.CapabilityThinking}, expectWarn: false},
+		{capabilities: []model.Capability{}, expectWarn: true},
+	}
+
+	for _, tc := range cases {
+		srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			if r.URL.Path != "/api/show" || r.Method != http.MethodPost {
+				t.Fatalf("unexpected request to %s %s", r.URL.Path, r.Method)
+			}
+			var req api.ShowRequest
+			if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
+				t.Fatalf("decode request: %v", err)
+			}
+			resp := api.ShowResponse{Capabilities: tc.capabilities}
+			if err := json.NewEncoder(w).Encode(resp); err != nil {
+				t.Fatalf("encode response: %v", err)
+			}
+		}))
+		defer srv.Close()
+
+		t.Setenv("OLLAMA_HOST", srv.URL)
+		client, err := api.ClientFromEnvironment()
+		if err != nil {
+			t.Fatal(err)
+		}
+		oldStderr := os.Stderr
+		r, w, _ := os.Pipe()
+		os.Stderr = w
+		ensureThinkingSupport(t.Context(), client, "m")
+		w.Close()
+		os.Stderr = oldStderr
+		out, _ := io.ReadAll(r)
+
+		warned := strings.Contains(string(out), "warning:")
+		if tc.expectWarn && !warned {
+			t.Errorf("expected warning, got none")
+		}
+		if !tc.expectWarn && warned {
+			t.Errorf("did not expect warning, got: %s", string(out))
+		}
+	}
+}
--- a/convert/convert.go
+++ b/convert/convert.go
@@ -53,8 +53,11 @@ func (ModelParameters) KV(t *Tokenizer) ggml.KV {
 	}

 	for _, sv := range t.SpecialVocabulary {
-		kv[fmt.Sprintf("tokenizer.ggml.%s_token_id", sv.Key())] = uint32(sv.ID)
 		kv[fmt.Sprintf("tokenizer.ggml.add_%s_token", sv.Key())] = sv.AddToken
+		kv[fmt.Sprintf("tokenizer.ggml.%s_token_id", sv.Key())] = uint32(sv.ID)
+		if len(sv.IDs) > 0 {
+			kv[fmt.Sprintf("tokenizer.ggml.%s_token_ids", sv.Key())] = sv.IDs
+		}
 	}

 	return kv
@@ -191,6 +194,8 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
 		conv = &phi3Model{}
 	case "Qwen2ForCausalLM":
 		conv = &qwen2Model{}
+	case "Qwen2_5_VLForConditionalGeneration":
+		conv = &qwen25VLModel{}
 	case "BertModel":
 		conv = &bertModel{}
 	case "CohereForCausalLM":
--- a/convert/convert_llama.go
+++ b/convert/convert_llama.go
@@ -139,7 +139,8 @@ func (p *llamaModel) Tensors(ts []Tensor) []*ggml.Tensor {
 	}

 	for _, t := range ts {
-		if strings.HasSuffix(t.Name(), "attn_q.weight") || strings.HasSuffix(t.Name(), "attn_k.weight") {
+		if strings.HasSuffix(t.Name(), "attn_q.weight") || strings.HasSuffix(t.Name(), "attn_k.weight") ||
+			strings.HasSuffix(t.Name(), "attn_q_proj.weight") || strings.HasSuffix(t.Name(), "attn_k_proj.weight") {
 			if !p.skipRepack {
 				t.SetRepacker(p.repack)
 			}
@@ -181,9 +182,9 @@ func (p *llamaModel) repack(name string, data []float32, shape []uint64) ([]floa
 	}

 	var heads uint32
-	if strings.HasSuffix(name, "attn_q.weight") {
+	if strings.HasSuffix(name, "attn_q.weight") || strings.HasSuffix(name, "attn_q_proj.weight") {
 		heads = p.NumAttentionHeads
-	} else if strings.HasSuffix(name, "attn_k.weight") {
+	} else if strings.HasSuffix(name, "attn_k.weight") || strings.HasSuffix(name, "attn_k_proj.weight") {
 		heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
 	} else {
 		return nil, fmt.Errorf("unknown tensor for repack: %s", name)
--- a/convert/convert_mllama.go
+++ b/convert/convert_mllama.go
@@ -94,7 +94,9 @@ func (m *mllamaModel) Tensors(ts []Tensor) []*ggml.Tensor {
 	var out []*ggml.Tensor
 	var text []Tensor
 	for _, t := range ts {
-		if t.Name() == "v.position_embd.gate" {
+		if !strings.HasPrefix(t.Name(), "v.") && !strings.HasPrefix(t.Name(), "mm.") {
+			text = append(text, t)
+		} else if t.Name() == "v.position_embd.gate" {
 			for _, name := range []string{"v.position_embd.gate", "v.tile_position_embd.gate"} {
 				tt := t.Clone()
 				tt.SetRepacker(m.repack(name))
@@ -105,23 +107,21 @@ func (m *mllamaModel) Tensors(ts []Tensor) []*ggml.Tensor {
 					WriterTo: tt,
 				})
 			}
-		} else if t.Name() == "v.pre_tile_position_embd.gate" || t.Name() == "v.post_tile_position_embd.gate" {
-			t.SetRepacker(m.repack(t.Name()))
-			out = append(out, &ggml.Tensor{
-				Name:     t.Name(),
-				Kind:     t.Kind(),
-				Shape:    t.Shape(),
-				WriterTo: t,
-			})
-		} else if strings.HasPrefix(t.Name(), "v.") || strings.HasPrefix(t.Name(), "mm.") {
-			out = append(out, &ggml.Tensor{
-				Name:     t.Name(),
-				Kind:     t.Kind(),
-				Shape:    t.Shape(),
-				WriterTo: t,
-			})
 		} else {
-			text = append(text, t)
+			if t.Name() == "v.pre_tile_position_embd.gate" || t.Name() == "v.post_tile_position_embd.gate" {
+				t.SetRepacker(m.repack(t.Name()))
+			} else if strings.HasSuffix(t.Name(), "attn_q.weight") || strings.HasSuffix(t.Name(), "attn_k.weight") {
+				t.SetRepacker(m.repack(t.Name()))
+			} else if strings.HasSuffix(t.Name(), "attn_gate") || strings.HasSuffix(t.Name(), "ffn_gate") {
+				t.SetRepacker(m.repack(t.Name()))
+			}
+
+			out = append(out, &ggml.Tensor{
+				Name:     t.Name(),
+				Kind:     t.Kind(),
+				Shape:    t.Shape(),
+				WriterTo: t,
+			})
 		}
 	}

@@ -137,16 +137,35 @@ func (m *mllamaModel) repack(name string) Repacker {

 		var t tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))

-		t, err = tensor.Tanh(t)
-		if err != nil {
-			return nil, err
-		}
+		if strings.HasSuffix(name, "attn_q.weight") || strings.HasSuffix(name, "attn_k.weight") {
+			heads := m.VisionModel.AttentionHeads
+			if err := t.Reshape(append([]int{int(heads), 2, dims[0] / int(heads) / 2}, dims[1:]...)...); err != nil {
+				return nil, err
+			}

-		if name == "v.position_embd.gate" {
-			t, err = tensor.Sub(float32(1), t)
+			if err := t.T(0, 2, 1, 3); err != nil {
+				return nil, err
+			}
+
+			if err := t.Reshape(dims...); err != nil {
+				return nil, err
+			}
+
+			if err := t.Transpose(); err != nil {
+				return nil, err
+			}
+		} else {
+			t, err = tensor.Tanh(t)
 			if err != nil {
 				return nil, err
 			}
+
+			if name == "v.position_embd.gate" {
+				t, err = tensor.Sub(float32(1), t)
+				if err != nil {
+					return nil, err
+				}
+			}
 		}

 		t = tensor.Materialize(t)
--- a/convert/convert_qwen2.go
+++ b/convert/convert_qwen2.go
@@ -15,6 +15,7 @@ type qwen2Model struct {
 		Type                          string     `json:"type"`
 		Factor                        ropeFactor `json:"factor"`
 		OriginalMaxPositionEmbeddings uint32     `json:"original_max_position_embeddings"`
+		MropeSection                  []int32    `json:"mrope_section"`
 	} `json:"rope_scaling"`
 	RMSNormEPS float32 `json:"rms_norm_eps"`
 }
@@ -39,6 +40,8 @@ func (q *qwen2Model) KV(t *Tokenizer) ggml.KV {
 	case "yarn":
 		kv["qwen2.rope.scaling.type"] = q.RopeScaling.Type
 		kv["qwen2.rope.scaling.factor"] = q.RopeScaling.Factor
+	case "mrope", "default":
+		kv["qwen2.rope.mrope_section"] = q.RopeScaling.MropeSection
 	default:
 		panic("unknown rope scaling type")
 	}
--- a/convert/convert_qwen25vl.go
+++ b/convert/convert_qwen25vl.go
@@ -0,0 +1,102 @@
+package convert
+
+import (
+	"cmp"
+	"slices"
+	"strings"
+
+	"github.com/ollama/ollama/fs/ggml"
+)
+
+type qwen25VLModel struct {
+	qwen2Model
+
+	VisionModel struct {
+		Depth               uint32  `json:"depth"`
+		HiddenSize          uint32  `json:"hidden_size"`
+		NumHeads            uint32  `json:"num_heads"`
+		InChannels          uint32  `json:"in_chans"`
+		PatchSize           uint32  `json:"patch_size"`
+		SpatialMergeSize    uint32  `json:"spatial_merge_size"`
+		SpatialPatchSize    uint32  `json:"spatial_patch_size"`
+		WindowSize          uint32  `json:"window_size"`
+		RMSNormEps          float32 `json:"layer_norm_epsilon"`
+		RopeTheta           float32 `json:"rope_theta"`
+		FullAttentionBlocks []int32 `json:"fullatt_block_indexes"`
+		TemporalPatchSize   uint32  `json:"temporal_patch_size"`
+	} `json:"vision_config"`
+}
+
+var _ ModelConverter = (*qwen25VLModel)(nil)
+
+func (q *qwen25VLModel) KV(t *Tokenizer) ggml.KV {
+	kv := q.ModelParameters.KV(t)
+	kv["general.architecture"] = "qwen25vl"
+
+	for k, v := range q.qwen2Model.KV(t) {
+		if strings.HasPrefix(k, "qwen2.") {
+			kv[strings.Replace(k, "qwen2.", "qwen25vl.", 1)] = v
+		}
+	}
+
+	if q.VisionModel.FullAttentionBlocks == nil {
+		kv["qwen25vl.vision.fullatt_block_indexes"] = []int32{7, 15, 23, 31}
+	}
+
+	kv["qwen25vl.vision.block_count"] = cmp.Or(q.VisionModel.Depth, 32)
+	kv["qwen25vl.vision.embedding_length"] = q.VisionModel.HiddenSize
+	kv["qwen25vl.vision.attention.head_count"] = cmp.Or(q.VisionModel.NumHeads, 16)
+	kv["qwen25vl.vision.num_channels"] = q.VisionModel.InChannels
+	kv["qwen25vl.vision.patch_size"] = cmp.Or(q.VisionModel.PatchSize, 14)
+	kv["qwen25vl.vision.spatial_merge_size"] = cmp.Or(q.VisionModel.SpatialMergeSize, 2)
+	kv["qwen25vl.vision.spatial_patch_size"] = q.VisionModel.SpatialPatchSize
+	kv["qwen25vl.vision.window_size"] = cmp.Or(q.VisionModel.WindowSize, 112)
+	kv["qwen25vl.vision.attention.layer_norm_epsilon"] = cmp.Or(q.VisionModel.RMSNormEps, 1e-6)
+	kv["qwen25vl.vision.rope.freq_base"] = cmp.Or(q.VisionModel.RopeTheta, 1e4)
+	kv["qwen25vl.vision.fullatt_block_indexes"] = q.VisionModel.FullAttentionBlocks
+	kv["qwen25vl.vision.temporal_patch_size"] = cmp.Or(q.VisionModel.TemporalPatchSize, 2)
+
+	return kv
+}
+
+func (q *qwen25VLModel) Tensors(ts []Tensor) []*ggml.Tensor {
+	var out []*ggml.Tensor
+
+	for _, t := range ts {
+		if strings.Contains(t.Name(), "patch_embed.proj") {
+			for t := range splitDim(t, 2,
+				split{Replacer: strings.NewReplacer("patch_embed.proj", "patch_embd_0")},
+				split{Replacer: strings.NewReplacer("patch_embed.proj", "patch_embd_1")},
+			) {
+				t.Shape = slices.DeleteFunc(t.Shape, func(i uint64) bool { return i == 1 })
+				out = append(out, t)
+			}
+		} else if strings.Contains(t.Name(), "attn.qkv") {
+			out = append(out, slices.Collect(splitDim(t, 0,
+				split{Replacer: strings.NewReplacer("attn.qkv", "attn_q")},
+				split{Replacer: strings.NewReplacer("attn.qkv", "attn_k")},
+				split{Replacer: strings.NewReplacer("attn.qkv", "attn_v")},
+			))...)
+		} else {
+			out = append(out, &ggml.Tensor{
+				Name:     t.Name(),
+				Kind:     t.Kind(),
+				Shape:    t.Shape(),
+				WriterTo: t,
+			})
+		}
+	}
+
+	return out
+}
+
+func (p *qwen25VLModel) Replacements() []string {
+	return append(
+		p.qwen2Model.Replacements(),
+		"visual", "v",
+		"blocks", "blk",
+		"attn.proj", "attn_out",
+		"norm1", "ln1",
+		"norm2", "ln2",
+	)
+}
--- a/convert/convert_test.go
+++ b/convert/convert_test.go
@@ -47,7 +47,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, ggml.KV, ggml.Tensors) {
 	}
 	t.Cleanup(func() { r.Close() })

-	m, _, err := ggml.Decode(r, -1)
+	m, err := ggml.Decode(r, -1)
 	if err != nil {
 		t.Fatal(err)
 	}
@@ -332,7 +332,7 @@ func TestConvertAdapter(t *testing.T) {
 			}
 			defer r.Close()

-			m, _, err := ggml.Decode(r, -1)
+			m, err := ggml.Decode(r, -1)
 			if err != nil {
 				t.Fatal(err)
 			}
--- a/convert/tensor.go
+++ b/convert/tensor.go
@@ -0,0 +1,76 @@
+package convert
+
+import (
+	"cmp"
+	"iter"
+	"slices"
+	"strings"
+
+	"github.com/pdevine/tensor"
+	"github.com/pdevine/tensor/native"
+
+	"github.com/ollama/ollama/fs/ggml"
+)
+
+type split struct {
+	*strings.Replacer
+	dim int
+
+	// fn is an optional function to apply to the tensor after slicing
+	fn func(tensor.Tensor) (tensor.Tensor, error)
+}
+
+// splitDim splits a tensor along a specified dimension into multiple tensors. The dimension
+// is split evenly based on the number of replacers provided unless a specific count is given.
+func splitDim(t Tensor, dim int, splits ...split) iter.Seq[*ggml.Tensor] {
+	return func(yield func(*ggml.Tensor) bool) {
+		var offset int
+		for _, split := range splits {
+			t := t.Clone()
+			shape := slices.Clone(t.Shape())
+			shape[dim] = cmp.Or(uint64(split.dim), shape[dim]/uint64(len(splits)))
+
+			slice := slices.Repeat([]tensor.Slice{nil}, len(shape))
+			slice[dim] = tensor.S(offset, offset+int(shape[dim]))
+			offset += int(shape[dim])
+
+			t.SetRepacker(func(_ string, data []float32, shape []uint64) ([]float32, error) {
+				dims := make([]int, len(shape))
+				for i := range shape {
+					dims[i] = int(shape[i])
+				}
+
+				var tt tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
+				tt, err := tt.Slice(slice...)
+				if err != nil {
+					return nil, err
+				}
+
+				tt = tensor.Materialize(tt)
+
+				if split.fn != nil {
+					tt, err = split.fn(tt)
+					if err != nil {
+						return nil, err
+					}
+				}
+
+				// flatten tensor so it can be written as a vector
+				if err := tt.Reshape(tt.Shape().TotalSize()); err != nil {
+					return nil, err
+				}
+
+				return native.VectorF32(tt.(*tensor.Dense))
+			})
+
+			if !yield(&ggml.Tensor{
+				Name:     split.Replace(t.Name()),
+				Kind:     t.Kind(),
+				Shape:    shape,
+				WriterTo: t,
+			}) {
+				break
+			}
+		}
+	}
+}
--- a/convert/tensor_test.go
+++ b/convert/tensor_test.go
@@ -0,0 +1,304 @@
+package convert
+
+import (
+	"bytes"
+	"encoding/binary"
+	"io"
+	"iter"
+	"slices"
+	"strings"
+	"testing"
+
+	"github.com/pdevine/tensor"
+)
+
+type fakeTensor struct {
+	name  string
+	shape []uint64
+	data  []float32
+
+	repacker Repacker
+}
+
+func (f fakeTensor) Name() string {
+	return f.name
+}
+
+func (f fakeTensor) Shape() []uint64 {
+	return f.shape
+}
+
+func (f fakeTensor) Kind() uint32 {
+	return 0
+}
+
+func (f *fakeTensor) SetRepacker(fn Repacker) {
+	f.repacker = fn
+}
+
+func (f fakeTensor) Clone() Tensor {
+	return &fakeTensor{
+		name:     f.name,
+		shape:    slices.Clone(f.shape),
+		data:     slices.Clone(f.data),
+		repacker: f.repacker,
+	}
+}
+
+func (f fakeTensor) WriteTo(w io.Writer) (n int64, err error) {
+	data := f.data
+	if f.repacker != nil {
+		data, err = f.repacker(f.name, data, f.shape)
+		if err != nil {
+			return 0, err
+		}
+	}
+
+	if err := binary.Write(w, binary.LittleEndian, data); err != nil {
+		return 0, err
+	}
+
+	return int64(len(data) * 4), nil
+}
+
+func mul(shape []uint64) int {
+	n := 1
+	for _, dim := range shape {
+		n *= int(dim)
+	}
+	return n
+}
+
+func TestSplitDim(t *testing.T) {
+	r := fakeTensor{
+		name:  "a.b",
+		shape: []uint64{3, 4},
+		data:  []float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11},
+	}
+
+	t.Run("no split", func(t *testing.T) {
+		for tt := range splitDim(&r, 0, split{Replacer: strings.NewReplacer("a", "x")}) {
+			if tt.Name != "x.b" {
+				t.Fatalf("expected name 'x', got '%s'", tt.Name)
+			}
+
+			if !slices.Equal(tt.Shape, []uint64{3, 4}) {
+				t.Fatalf("expected shape [3, 4], got %v", tt.Shape)
+			}
+
+			var b bytes.Buffer
+			if _, err := tt.WriteTo(&b); err != nil {
+				t.Fatal(err)
+			}
+
+			f32s := make([]float32, mul(tt.Shape))
+			if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+				t.Fatal(err)
+			}
+
+			if !slices.Equal(f32s, []float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}) {
+				t.Fatalf("expected data [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], got %v", f32s)
+			}
+		}
+	})
+
+	t.Run("even split", func(t *testing.T) {
+		next, stop := iter.Pull(splitDim(&r, 1,
+			split{Replacer: strings.NewReplacer("a", "x")},
+			split{Replacer: strings.NewReplacer("b", "y")},
+		))
+		defer stop()
+
+		{
+			tt, ok := next()
+			if !ok {
+				t.Fatal("expected at least one split")
+			}
+
+			if tt.Name != "x.b" {
+				t.Fatal("expected name 'x.b', got", tt.Name)
+			}
+
+			if !slices.Equal(tt.Shape, []uint64{3, 2}) {
+				t.Fatal("expected shape [3, 2], got", tt.Shape)
+			}
+
+			var b bytes.Buffer
+			if _, err := tt.WriteTo(&b); err != nil {
+				t.Fatal(err)
+			}
+
+			f32s := make([]float32, mul(tt.Shape))
+			if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+				t.Fatal(err)
+			}
+
+			if !slices.Equal(f32s, []float32{0, 1, 4, 5, 8, 9}) {
+				t.Fatal("expected data [0, 1, 4, 5, 8, 9], got", f32s)
+			}
+		}
+
+		{
+			tt, ok := next()
+			if !ok {
+				t.Fatal("expected at least one split")
+			}
+
+			if tt.Name != "a.y" {
+				t.Fatal("expected name 'a.y', got", tt.Name)
+			}
+
+			if !slices.Equal(tt.Shape, []uint64{3, 2}) {
+				t.Fatal("expected shape [3, 2], got", tt.Shape)
+			}
+
+			var b bytes.Buffer
+			if _, err := tt.WriteTo(&b); err != nil {
+				t.Fatal(err)
+			}
+
+			f32s := make([]float32, mul(tt.Shape))
+			if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+				t.Fatal(err)
+			}
+
+			if !slices.Equal(f32s, []float32{2, 3, 6, 7, 10, 11}) {
+				t.Fatal("expected data [2, 3, 6, 7, 10, 11], got", f32s)
+			}
+		}
+	})
+
+	t.Run("uneven split", func(t *testing.T) {
+		next, stop := iter.Pull(splitDim(&r, 0,
+			split{Replacer: strings.NewReplacer("a", "x"), dim: 2},
+			split{Replacer: strings.NewReplacer("b", "y"), dim: 1},
+		))
+		defer stop()
+
+		{
+			tt, ok := next()
+			if !ok {
+				t.Fatal("expected at least one split")
+			}
+
+			if tt.Name != "x.b" {
+				t.Fatal("expected name 'x.b', got", tt.Name)
+			}
+
+			if !slices.Equal(tt.Shape, []uint64{2, 4}) {
+				t.Fatal("expected shape [2, 4], got", tt.Shape)
+			}
+
+			var b bytes.Buffer
+			if _, err := tt.WriteTo(&b); err != nil {
+				t.Fatal(err)
+			}
+
+			f32s := make([]float32, mul(tt.Shape))
+			if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+				t.Fatal(err)
+			}
+
+			if !slices.Equal(f32s, []float32{0, 1, 2, 3, 4, 5, 6, 7}) {
+				t.Fatal("expected data [0, 1, 2, 3, 4, 5, 6, 7], got", f32s)
+			}
+		}
+
+		{
+			tt, ok := next()
+			if !ok {
+				t.Fatal("expected at least one split")
+			}
+
+			if tt.Name != "a.y" {
+				t.Fatal("expected name 'a.y', got", tt.Name)
+			}
+
+			if !slices.Equal(tt.Shape, []uint64{1, 4}) {
+				t.Fatal("expected shape [1, 4], got", tt.Shape)
+			}
+
+			var b bytes.Buffer
+			if _, err := tt.WriteTo(&b); err != nil {
+				t.Fatal(err)
+			}
+
+			f32s := make([]float32, mul(tt.Shape))
+			if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+				t.Fatal(err)
+			}
+
+			if !slices.Equal(f32s, []float32{8, 9, 10, 11}) {
+				t.Fatal("expected data [8, 9, 10, 11], got", f32s)
+			}
+		}
+	})
+
+	t.Run("split with transpose", func(t *testing.T) {
+		next, stop := iter.Pull(splitDim(&r, 1,
+			split{Replacer: strings.NewReplacer("a", "x")},
+			split{Replacer: strings.NewReplacer("b", "y"), fn: func(tt tensor.Tensor) (tensor.Tensor, error) {
+				return tensor.Transpose(tt, 1, 0)
+			}},
+		))
+		defer stop()
+
+		{
+			tt, ok := next()
+			if !ok {
+				t.Fatal("expected at least one split")
+			}
+
+			if tt.Name != "x.b" {
+				t.Fatal("expected name 'x.b', got", tt.Name)
+			}
+
+			if !slices.Equal(tt.Shape, []uint64{3, 2}) {
+				t.Fatal("expected shape [3, 2], got", tt.Shape)
+			}
+
+			var b bytes.Buffer
+			if _, err := tt.WriteTo(&b); err != nil {
+				t.Fatal(err)
+			}
+
+			f32s := make([]float32, mul(tt.Shape))
+			if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+				t.Fatal(err)
+			}
+
+			if !slices.Equal(f32s, []float32{0, 1, 4, 5, 8, 9}) {
+				t.Fatal("expected data [0, 1, 4, 5, 8, 9], got", f32s)
+			}
+		}
+
+		{
+			tt, ok := next()
+			if !ok {
+				t.Fatal("expected at least one split")
+			}
+
+			if tt.Name != "a.y" {
+				t.Fatal("expected name 'a.y', got", tt.Name)
+			}
+
+			if !slices.Equal(tt.Shape, []uint64{3, 2}) {
+				t.Fatal("expected shape [3, 2], got", tt.Shape)
+			}
+
+			var b bytes.Buffer
+			if _, err := tt.WriteTo(&b); err != nil {
+				t.Fatal(err)
+			}
+
+			f32s := make([]float32, mul(tt.Shape))
+			if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
+				t.Fatal(err)
+			}
+
+			if !slices.Equal(f32s, []float32{2, 6, 10, 3, 7, 11}) {
+				t.Fatal("expected data [2, 6, 10, 3, 7, 11], got", f32s)
+			}
+		}
+	})
+}
--- a/convert/tokenizer.go
+++ b/convert/tokenizer.go
@@ -110,6 +110,7 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
 	}

 	if f, err := fsys.Open("tokenizer_config.json"); errors.Is(err, os.ErrNotExist) {
+		// noop
 	} else if err != nil {
 		return nil, err
 	} else {
@@ -171,6 +172,34 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
 		}
 	}

+	if f, err := fsys.Open("generation_config.json"); errors.Is(err, os.ErrNotExist) {
+	} else if err != nil {
+		return nil, err
+	} else {
+		defer f.Close()
+
+		var p map[string]json.RawMessage
+		if err := json.NewDecoder(f).Decode(&p); err != nil {
+			return nil, err
+		}
+
+		for _, st := range specialTokenTypes {
+			if bts, ok := p[fmt.Sprintf("%s_token_id", st)]; ok {
+				var ids []int32
+				if err := json.Unmarshal(bts, &ids); err != nil {
+					// value is not a list so the existing ID is used
+					continue
+				}
+
+				if i := slices.IndexFunc(t.SpecialVocabulary, func(sv *SpecialVocabulary) bool {
+					return sv.Type == st
+				}); i >= 0 {
+					t.SpecialVocabulary[i].IDs = ids
+				}
+			}
+		}
+	}
+
 	return t, nil
 }

@@ -280,6 +309,9 @@ type SpecialVocabulary struct {
 	ID       int
 	Content  string
 	AddToken bool
+
+	// IDs is populated by generation_config.json
+	IDs []int32
 }

 func (sv SpecialVocabulary) Key() string {
--- a/convert/tokenizer_test.go
+++ b/convert/tokenizer_test.go
@@ -247,6 +247,67 @@ func TestParseTokenizer(t *testing.T) {
 				Pre: "default",
 			},
 		},
+		{
+			name: "generation config eos token ids",
+			fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
+				"tokenizer.json": strings.NewReader(`{
+					"added_tokens": [
+						{
+							"id": 0,
+							"content": "<bos>",
+							"special": true
+						},
+						{
+							"id": 1,
+							"content": "<eos>",
+							"special": true
+						},
+						{
+							"id": 2,
+							"content": "<eot>",
+							"special": true
+						},
+						{
+							"id": 3,
+							"content": "<eom>",
+							"special": true
+						}
+					],
+					"model": {
+						"vocab": {
+							"<bos>": 0,
+							"<eos>": 1,
+							"<eot>": 2,
+							"<eom>": 3
+						}
+					}
+				}`),
+				"tokenizer_config.json": strings.NewReader(`{
+					"add_bos_token": true,
+					"add_eos_token": false,
+					"bos_token": "<bos>",
+					"eos_token": "<eos>"
+				}`),
+				"generation_config.json": strings.NewReader(`{
+					"bos_token_id": 0,
+					"eos_token_id": [1, 2, 3]
+				}`),
+			}),
+			specialTokenTypes: []string{"pad", "eos", "bos", "unk"},
+			want: &Tokenizer{
+				Vocabulary: &Vocabulary{
+					Model:  "gpt2",
+					Tokens: []string{"<bos>", "<eos>", "<eot>", "<eom>"},
+					Scores: []float32{0, 1, 2, 3},
+					Types:  []int32{3, 3, 3, 3},
+				},
+				SpecialVocabulary: []*SpecialVocabulary{
+					{Type: "eos", Content: "<eos>", ID: 1, IDs: []int32{1, 2, 3}, AddToken: false},
+					{Type: "bos", Content: "<bos>", ID: 0, AddToken: true},
+				},
+				Pre: "default",
+			},
+		},
 	}

 	for _, tt := range cases {
--- a/docs/api.md
+++ b/docs/api.md
@@ -43,6 +43,7 @@ Generate a response for a given prompt with a provided model. This is a streamin
 - `prompt`: the prompt to generate a response for
 - `suffix`: the text after the model response
 - `images`: (optional) a list of base64-encoded images (for multimodal models such as `llava`)
+- `think`: (for thinking models) should the model think before responding?

 Advanced parameters (optional):

@@ -490,11 +491,13 @@ Generate the next message in a chat with a provided model. This is a streaming e
 - `model`: (required) the [model name](#model-names)
 - `messages`: the messages of the chat, this can be used to keep a chat memory
 - `tools`: list of tools in JSON for the model to use if supported
+- `think`: (for thinking models) should the model think before responding?

 The `message` object has the following fields:

 - `role`: the role of the message, either `system`, `user`, `assistant`, or `tool`
 - `content`: the content of the message
+- `thinking`: (for thinking models) the model's thinking process
 - `images` (optional): a list of images to include in the message (for multimodal models such as `llava`)
 - `tool_calls` (optional): a list of tools in JSON that the model wants to use

--- a/docs/development.md
+++ b/docs/development.md
@@ -118,7 +118,7 @@ To run tests, use `go test`:
 go test ./...
 ```

-> NOTE: In rare cirumstances, you may nedd to change a package using the new
+> NOTE: In rare cirumstances, you may need to change a package using the new
 > "synctest" package in go1.24.
 >
 > If you do not have the "synctest" package enabled, you will not see build or
--- a/docs/import.md
+++ b/docs/import.md
@@ -132,22 +132,12 @@ success

 ### Supported Quantizations

- `q4_0`
- `q4_1`
- `q5_0`
- `q5_1`
 - `q8_0`

 #### K-means Quantizations

- `q3_K_S`
- `q3_K_M`
- `q3_K_L`
 - `q4_K_S`
 - `q4_K_M`
- `q5_K_S`
- `q5_K_M`
- `q6_K`


 ## Sharing your model on ollama.com
--- a/docs/linux.md
+++ b/docs/linux.md
@@ -112,8 +112,8 @@ sudo systemctl status ollama
 > While AMD has contributed the `amdgpu` driver upstream to the official linux
 > kernel source, the version is older and may not support all ROCm features. We
 > recommend you install the latest driver from
-> https://www.amd.com/en/support/linux-drivers for best support of your Radeon
-> GPU.
+> [AMD](https://www.amd.com/en/support/download/linux-drivers.html) for best support
+> of your Radeon GPU.

 ## Customizing

--- a/envconfig/config.go
+++ b/envconfig/config.go
@@ -183,6 +183,8 @@ var (
 	NewEngine = Bool("OLLAMA_NEW_ENGINE")
 	// ContextLength sets the default context length
 	ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 4096)
+	// Auth enables authentication between the Ollama client and server
+	UseAuth = Bool("OLLAMA_AUTH")
 )

 func String(s string) func() string {
--- a/fs/ggml/ggml.go
+++ b/fs/ggml/ggml.go
@@ -15,6 +15,7 @@ import (
 type GGML struct {
 	container
 	model
+	Length int64
 }

 type model interface {
@@ -126,6 +127,7 @@ func (kv KV) OllamaEngineRequired() bool {
 		"mistral3",
 		"llama4",
 		"mllama",
+		"qwen25vl",
 	}, kv.Architecture())
 }

@@ -385,12 +387,12 @@ func DetectContentType(b []byte) string {
 //
 // It collects array values for arrays with a size less than or equal to
 // maxArraySize. If the maxArraySize is negative, all arrays are collected.
-func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
+func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, error) {
 	rs = bufioutil.NewBufferedSeeker(rs, 32<<10)

 	var magic uint32
 	if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
-		return nil, 0, err
+		return nil, err
 	}

 	var c container
@@ -400,24 +402,25 @@ func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
 	case FILE_MAGIC_GGUF_BE:
 		c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize}
 	default:
-		return nil, 0, errors.New("invalid file magic")
+		return nil, errors.New("invalid file magic")
 	}

 	model, err := c.Decode(rs)
 	if err != nil {
-		return nil, 0, err
+		return nil, err
 	}

 	offset, err := rs.Seek(0, io.SeekCurrent)
 	if err != nil {
-		return nil, 0, err
+		return nil, err
 	}

 	// final model type
 	return &GGML{
 		container: c,
 		model:     model,
-	}, offset, nil
+		Length:    offset,
+	}, nil
 }

 func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string) (kv []uint64, partialOffload, fullOffload uint64) {
@@ -649,6 +652,20 @@ func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
 		graphSize = 4 * (imageSize*imageSize*numChannels +
 			embeddingLength*patchSize +
 			numPatches*numPatches*headCount)
+	case "qwen25vl":
+		maxPixels := uint64(llm.KV().Uint("vision.max_pixels", 28*28*1280))
+
+		numPatches := maxPixels / (patchSize * patchSize)
+
+		graphSize = 4 * (maxPixels*numChannels + // Original image storage
+			// Normalized pixels
+			maxPixels*numChannels +
+			// Patches storage (numPatches * channels * patchSize^2)
+			numPatches*numChannels*patchSize*patchSize +
+			// Self-attention calculations
+			numPatches*numPatches*headCount +
+			// Additional buffer for processing
+			embeddingLength*numPatches)
 	case "llama4":
 		// vision graph is computed independently in the same schedule
 		// and is negligible compared to the worst case text graph
--- a/fs/ggml/gguf_test.go
+++ b/fs/ggml/gguf_test.go
@@ -35,7 +35,7 @@ func TestWriteGGUF(t *testing.T) {
 	}
 	defer r.Close()

-	ff, _, err := Decode(r, 0)
+	ff, err := Decode(r, 0)
 	if err != nil {
 		t.Fatal(err)
 	}
--- a/fs/gguf/gguf.go
+++ b/fs/gguf/gguf.go
@@ -0,0 +1,347 @@
+package gguf
+
+import (
+	"bytes"
+	"cmp"
+	"encoding/binary"
+	"errors"
+	"fmt"
+	"io"
+	"iter"
+	"os"
+	"slices"
+	"strings"
+)
+
+const (
+	typeUint8 uint32 = iota
+	typeInt8
+	typeUint16
+	typeInt16
+	typeUint32
+	typeInt32
+	typeFloat32
+	typeBool
+	typeString
+	typeArray
+	typeUint64
+	typeInt64
+	typeFloat64
+)
+
+var ErrUnsupported = errors.New("unsupported")
+
+type File struct {
+	Magic   [4]byte
+	Version uint32
+
+	keyValues *lazy[KeyValue]
+	tensors   *lazy[TensorInfo]
+	offset    int64
+
+	file   *os.File
+	reader *bufferedReader
+	bts    []byte
+}
+
+func Open(path string) (f *File, err error) {
+	f = &File{bts: make([]byte, 4096)}
+	f.file, err = os.Open(path)
+	if err != nil {
+		return nil, err
+	}
+
+	f.reader = newBufferedReader(f.file, 32<<10)
+
+	if err := binary.Read(f.reader, binary.LittleEndian, &f.Magic); err != nil {
+		return nil, err
+	}
+
+	if bytes.Equal(f.Magic[:], []byte("gguf")) {
+		return nil, fmt.Errorf("%w file type %v", ErrUnsupported, f.Magic)
+	}
+
+	if err := binary.Read(f.reader, binary.LittleEndian, &f.Version); err != nil {
+		return nil, err
+	}
+
+	if f.Version != 3 {
+		return nil, fmt.Errorf("%w version %v", ErrUnsupported, f.Version)
+	}
+
+	f.tensors, err = newLazy(f, f.readTensor)
+	if err != nil {
+		return nil, err
+	}
+
+	f.tensors.successFunc = func() error {
+		offset := f.reader.offset
+
+		alignment := cmp.Or(f.KeyValue("general.alignment").Int(), 32)
+		f.offset = offset + (alignment-offset%alignment)%alignment
+		return nil
+	}
+
+	f.keyValues, err = newLazy(f, f.readKeyValue)
+	if err != nil {
+		return nil, err
+	}
+
+	return f, nil
+}
+
+func (f *File) readTensor() (TensorInfo, error) {
+	name, err := readString(f)
+	if err != nil {
+		return TensorInfo{}, err
+	}
+
+	dims, err := read[uint32](f)
+	if err != nil {
+		return TensorInfo{}, err
+	}
+
+	shape := make([]uint64, dims)
+	for i := range dims {
+		shape[i], err = read[uint64](f)
+		if err != nil {
+			return TensorInfo{}, err
+		}
+	}
+
+	type_, err := read[uint32](f)
+	if err != nil {
+		return TensorInfo{}, err
+	}
+
+	offset, err := read[uint64](f)
+	if err != nil {
+		return TensorInfo{}, err
+	}
+
+	return TensorInfo{
+		Name:   name,
+		Offset: offset,
+		Shape:  shape,
+		Type:   TensorType(type_),
+	}, nil
+}
+
+func (f *File) readKeyValue() (KeyValue, error) {
+	key, err := readString(f)
+	if err != nil {
+		return KeyValue{}, err
+	}
+
+	t, err := read[uint32](f)
+	if err != nil {
+		return KeyValue{}, err
+	}
+
+	value, err := func() (any, error) {
+		switch t {
+		case typeUint8:
+			return read[uint8](f)
+		case typeInt8:
+			return read[int8](f)
+		case typeUint16:
+			return read[uint16](f)
+		case typeInt16:
+			return read[int16](f)
+		case typeUint32:
+			return read[uint32](f)
+		case typeInt32:
+			return read[int32](f)
+		case typeUint64:
+			return read[uint64](f)
+		case typeInt64:
+			return read[int64](f)
+		case typeFloat32:
+			return read[float32](f)
+		case typeFloat64:
+			return read[float64](f)
+		case typeBool:
+			return read[bool](f)
+		case typeString:
+			return readString(f)
+		case typeArray:
+			return readArray(f)
+		default:
+			return nil, fmt.Errorf("%w type %d", ErrUnsupported, t)
+		}
+	}()
+	if err != nil {
+		return KeyValue{}, err
+	}
+
+	return KeyValue{
+		Key:   key,
+		Value: Value{value},
+	}, nil
+}
+
+func read[T any](f *File) (t T, err error) {
+	err = binary.Read(f.reader, binary.LittleEndian, &t)
+	return t, err
+}
+
+func readString(f *File) (string, error) {
+	n, err := read[uint64](f)
+	if err != nil {
+		return "", err
+	}
+
+	if int(n) > len(f.bts) {
+		f.bts = make([]byte, n)
+	}
+
+	bts := f.bts[:n]
+	if _, err := io.ReadFull(f.reader, bts); err != nil {
+		return "", err
+	}
+	defer clear(bts)
+
+	return string(bts), nil
+}
+
+func readArray(f *File) (any, error) {
+	t, err := read[uint32](f)
+	if err != nil {
+		return nil, err
+	}
+
+	n, err := read[uint64](f)
+	if err != nil {
+		return nil, err
+	}
+
+	switch t {
+	case typeUint8:
+		return readArrayData[uint8](f, n)
+	case typeInt8:
+		return readArrayData[int8](f, n)
+	case typeUint16:
+		return readArrayData[uint16](f, n)
+	case typeInt16:
+		return readArrayData[int16](f, n)
+	case typeUint32:
+		return readArrayData[uint32](f, n)
+	case typeInt32:
+		return readArrayData[int32](f, n)
+	case typeUint64:
+		return readArrayData[uint64](f, n)
+	case typeInt64:
+		return readArrayData[int64](f, n)
+	case typeFloat32:
+		return readArrayData[float32](f, n)
+	case typeFloat64:
+		return readArrayData[float64](f, n)
+	case typeBool:
+		return readArrayData[bool](f, n)
+	case typeString:
+		return readArrayString(f, n)
+	default:
+		return nil, fmt.Errorf("%w type %d", ErrUnsupported, t)
+	}
+}
+
+func readArrayData[T any](f *File, n uint64) (s []T, err error) {
+	s = make([]T, n)
+	for i := range n {
+		e, err := read[T](f)
+		if err != nil {
+			return nil, err
+		}
+
+		s[i] = e
+	}
+
+	return s, nil
+}
+
+func readArrayString(f *File, n uint64) (s []string, err error) {
+	s = make([]string, n)
+	for i := range n {
+		e, err := readString(f)
+		if err != nil {
+			return nil, err
+		}
+
+		s[i] = e
+	}
+
+	return s, nil
+}
+
+func (f *File) Close() error {
+	f.keyValues.stop()
+	f.tensors.stop()
+	return f.file.Close()
+}
+
+func (f *File) KeyValue(key string) KeyValue {
+	if !strings.HasPrefix(key, "general.") && !strings.HasPrefix(key, "tokenizer.") {
+		key = f.KeyValue("general.architecture").String() + "." + key
+	}
+
+	if index := slices.IndexFunc(f.keyValues.values, func(kv KeyValue) bool {
+		return kv.Key == key
+	}); index >= 0 {
+		return f.keyValues.values[index]
+	}
+
+	for keyValue, ok := f.keyValues.next(); ok; keyValue, ok = f.keyValues.next() {
+		if keyValue.Key == key {
+			return keyValue
+		}
+	}
+
+	return KeyValue{}
+}
+
+func (f *File) NumKeyValues() int {
+	return int(f.keyValues.count)
+}
+
+func (f *File) KeyValues() iter.Seq2[int, KeyValue] {
+	return f.keyValues.All()
+}
+
+func (f *File) TensorInfo(name string) TensorInfo {
+	if index := slices.IndexFunc(f.tensors.values, func(t TensorInfo) bool {
+		return t.Name == name
+	}); index >= 0 {
+		return f.tensors.values[index]
+	}
+
+	// fast-forward through key values if we haven't already
+	_ = f.keyValues.rest()
+	for tensor, ok := f.tensors.next(); ok; tensor, ok = f.tensors.next() {
+		if tensor.Name == name {
+			return tensor
+		}
+	}
+
+	return TensorInfo{}
+}
+
+func (f *File) NumTensors() int {
+	return int(f.tensors.count)
+}
+
+func (f *File) TensorInfos() iter.Seq2[int, TensorInfo] {
+	// fast forward through key values if we haven't already
+	f.keyValues.rest()
+	return f.tensors.All()
+}
+
+func (f *File) TensorReader(name string) (TensorInfo, io.Reader, error) {
+	t := f.TensorInfo(name)
+	if t.NumBytes() == 0 {
+		return TensorInfo{}, nil, fmt.Errorf("tensor %s not found", name)
+	}
+
+	// fast forward through tensor info if we haven't already
+	_ = f.tensors.rest()
+	return t, io.NewSectionReader(f.file, f.offset+int64(t.Offset), t.NumBytes()), nil
+}
--- a/fs/gguf/gguf_test.go
+++ b/fs/gguf/gguf_test.go
@@ -0,0 +1,249 @@
+package gguf_test
+
+import (
+	"bytes"
+	"os"
+	"strconv"
+	"strings"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+	"github.com/google/go-cmp/cmp/cmpopts"
+	"github.com/ollama/ollama/fs/ggml"
+	"github.com/ollama/ollama/fs/gguf"
+)
+
+func createBinFile(tb testing.TB) string {
+	tb.Helper()
+	f, err := os.CreateTemp(tb.TempDir(), "")
+	if err != nil {
+		tb.Fatal(err)
+	}
+	defer f.Close()
+
+	kv := ggml.KV{
+		"general.architecture":                   "llama",
+		"llama.block_count":                      uint32(8),
+		"llama.embedding_length":                 uint32(3),
+		"llama.attention.head_count":             uint32(2),
+		"llama.attention.head_count_kv":          uint32(2),
+		"llama.attention.key_length":             uint32(3),
+		"llama.rope.dimension_count":             uint32(4),
+		"llama.rope.freq_base":                   float32(10000.0),
+		"llama.rope.freq_scale":                  float32(1.0),
+		"llama.attention.layer_norm_rms_epsilon": float32(1e-6),
+		"tokenizer.ggml.eos_token_id":            uint32(0),
+		"tokenizer.ggml.eos_token_ids":           []int32{1, 2, 3},
+		"tokenizer.ggml.tokens":                  []string{"hello", "world"},
+		"tokenizer.ggml.scores":                  []float32{0, 1},
+	}
+
+	tensors := []*ggml.Tensor{
+		{
+			Name:     "token_embd.weight",
+			Kind:     0,
+			Shape:    []uint64{2, 3},
+			WriterTo: bytes.NewBuffer(make([]byte, 4*2*3)),
+		},
+		{
+			Name:     "output.weight",
+			Kind:     0,
+			Shape:    []uint64{3, 2},
+			WriterTo: bytes.NewBuffer(make([]byte, 4*3*2)),
+		},
+	}
+
+	for i := range 8 {
+		tensors = append(tensors, &ggml.Tensor{
+			Name:     "blk." + strconv.Itoa(i) + ".attn_q.weight",
+			Kind:     0,
+			Shape:    []uint64{3, 3},
+			WriterTo: bytes.NewBuffer(make([]byte, 4*3*3)),
+		}, &ggml.Tensor{
+			Name:     "blk." + strconv.Itoa(i) + ".attn_k.weight",
+			Kind:     0,
+			Shape:    []uint64{3, 3},
+			WriterTo: bytes.NewBuffer(make([]byte, 4*3*3)),
+		}, &ggml.Tensor{
+			Name:     "blk." + strconv.Itoa(i) + ".attn_v.weight",
+			Kind:     0,
+			Shape:    []uint64{3, 3},
+			WriterTo: bytes.NewBuffer(make([]byte, 4*3*3)),
+		}, &ggml.Tensor{
+			Name:     "blk." + strconv.Itoa(i) + ".attn_output.weight",
+			Kind:     0,
+			Shape:    []uint64{3, 3},
+			WriterTo: bytes.NewBuffer(make([]byte, 4*3*3)),
+		})
+	}
+
+	if err := ggml.WriteGGUF(f, kv, tensors); err != nil {
+		tb.Fatal(err)
+	}
+
+	return f.Name()
+}
+
+func TestRead(t *testing.T) {
+	f, err := gguf.Open(createBinFile(t))
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer f.Close()
+
+	if got := f.KeyValue("does.not.exist").Valid(); got {
+		t.Errorf(`KeyValue("does.not.exist").Exists() = %v, want false`, got)
+	}
+
+	if got := f.KeyValue("general.architecture").String(); got != "llama" {
+		t.Errorf(`KeyValue("general.architecture").String() = %q, want %q`, got, "llama")
+	}
+
+	if got := f.TensorInfo("token_embd.weight"); got.Name != "token_embd.weight" {
+		t.Errorf(`TensorInfo("token_embd.weight").Name = %q, want %q`, got.Name, "token_embd.weight")
+	} else if diff := cmp.Diff(got.Shape, []uint64{2, 3}); diff != "" {
+		t.Errorf(`TensorInfo("token_embd.weight").Shape mismatch (-got +want):\n%s`, diff)
+	} else if got.Type != gguf.TensorTypeF32 {
+		t.Errorf(`TensorInfo("token_embd.weight").Type = %d, want %d`, got.Type, gguf.TensorTypeF32)
+	}
+
+	if got := f.KeyValue("block_count").Uint(); got != 8 {
+		t.Errorf(`KeyValue("block_count").Uint() = %d, want %d`, got, 8)
+	}
+
+	if diff := cmp.Diff(f.KeyValue("tokenizer.ggml.tokens").Strings(), []string{"hello", "world"}); diff != "" {
+		t.Errorf("KeyValue(\"tokenizer.ggml.tokens\").Strings() mismatch (-got +want):\n%s", diff)
+	}
+
+	if diff := cmp.Diff(f.KeyValue("tokenizer.ggml.scores").Floats(), []float64{0, 1}); diff != "" {
+		t.Errorf("KeyValue(\"tokenizer.ggml.scores\").Ints() mismatch (-got +want):\n%s", diff)
+	}
+
+	var kvs []string
+	for _, kv := range f.KeyValues() {
+		if !kv.Valid() {
+			t.Error("found invalid key-value pair:", kv)
+		}
+
+		kvs = append(kvs, kv.Key)
+	}
+
+	if len(kvs) != f.NumKeyValues() {
+		t.Errorf("iterated key count = %d, want %d", len(kvs), f.NumKeyValues())
+	}
+
+	if diff := cmp.Diff(kvs, []string{
+		"general.architecture",
+		"llama.block_count",
+		"llama.embedding_length",
+		"llama.attention.head_count",
+		"llama.attention.head_count_kv",
+		"llama.attention.key_length",
+		"llama.rope.dimension_count",
+		"llama.rope.freq_base",
+		"llama.rope.freq_scale",
+		"llama.attention.layer_norm_rms_epsilon",
+		"tokenizer.ggml.eos_token_id",
+		"tokenizer.ggml.eos_token_ids",
+		"tokenizer.ggml.tokens",
+		"tokenizer.ggml.scores",
+	}, cmpopts.SortSlices(strings.Compare)); diff != "" {
+		t.Errorf("KeyValues() mismatch (-got +want):\n%s", diff)
+	}
+
+	var tis []string
+	for _, ti := range f.TensorInfos() {
+		if !ti.Valid() {
+			t.Error("found invalid tensor info:", ti)
+		}
+
+		tis = append(tis, ti.Name)
+	}
+
+	if len(tis) != f.NumTensors() {
+		t.Errorf("iterated tensor count = %d, want %d", len(tis), f.NumTensors())
+	}
+
+	if diff := cmp.Diff(tis, []string{
+		"token_embd.weight",
+		"output.weight",
+		"blk.0.attn_q.weight",
+		"blk.0.attn_k.weight",
+		"blk.0.attn_v.weight",
+		"blk.0.attn_output.weight",
+		"blk.1.attn_q.weight",
+		"blk.1.attn_k.weight",
+		"blk.1.attn_v.weight",
+		"blk.1.attn_output.weight",
+		"blk.2.attn_q.weight",
+		"blk.2.attn_k.weight",
+		"blk.2.attn_v.weight",
+		"blk.2.attn_output.weight",
+		"blk.3.attn_q.weight",
+		"blk.3.attn_k.weight",
+		"blk.3.attn_v.weight",
+		"blk.3.attn_output.weight",
+		"blk.4.attn_q.weight",
+		"blk.4.attn_k.weight",
+		"blk.4.attn_v.weight",
+		"blk.4.attn_output.weight",
+		"blk.5.attn_q.weight",
+		"blk.5.attn_k.weight",
+		"blk.5.attn_v.weight",
+		"blk.5.attn_output.weight",
+		"blk.6.attn_q.weight",
+		"blk.6.attn_k.weight",
+		"blk.6.attn_v.weight",
+		"blk.6.attn_output.weight",
+		"blk.7.attn_q.weight",
+		"blk.7.attn_k.weight",
+		"blk.7.attn_v.weight",
+		"blk.7.attn_output.weight",
+	}, cmpopts.SortSlices(strings.Compare)); diff != "" {
+		t.Errorf("TensorInfos() mismatch (-got +want):\n%s", diff)
+	}
+
+	ti, r, err := f.TensorReader("output.weight")
+	if err != nil {
+		t.Fatalf(`TensorReader("output.weight") error: %v`, err)
+	}
+
+	if ti.Name != "output.weight" {
+		t.Errorf(`TensorReader("output.weight").Name = %q, want %q`, ti.Name, "output.weight")
+	} else if diff := cmp.Diff(ti.Shape, []uint64{3, 2}); diff != "" {
+		t.Errorf(`TensorReader("output.weight").Shape mismatch (-got +want):\n%s`, diff)
+	} else if ti.Type != gguf.TensorTypeF32 {
+		t.Errorf(`TensorReader("output.weight").Type = %d, want %d`, ti.Type, gguf.TensorTypeF32)
+	}
+
+	var b bytes.Buffer
+	if _, err := b.ReadFrom(r); err != nil {
+		t.Fatalf(`ReadFrom TensorReader("output.weight") error: %v`, err)
+	}
+
+	if b.Len() != int(ti.NumBytes()) {
+		t.Errorf(`ReadFrom TensorReader("output.weight") length = %d, want %d`, b.Len(), ti.NumBytes())
+	}
+}
+
+func BenchmarkRead(b *testing.B) {
+	b.ReportAllocs()
+
+	p := createBinFile(b)
+	for b.Loop() {
+		f, err := gguf.Open(p)
+		if err != nil {
+			b.Fatal(err)
+		}
+
+		if got := f.KeyValue("general.architecture").String(); got != "llama" {
+			b.Errorf("got = %q, want %q", got, "llama")
+		}
+
+		// Iterate through some tensors
+		for range f.TensorInfos() {
+		}
+
+		f.Close()
+	}
+}
--- a/fs/gguf/keyvalue.go
+++ b/fs/gguf/keyvalue.go
@@ -0,0 +1,90 @@
+package gguf
+
+import (
+	"reflect"
+	"slices"
+)
+
+type KeyValue struct {
+	Key string
+	Value
+}
+
+func (kv KeyValue) Valid() bool {
+	return kv.Key != "" && kv.Value.value != nil
+}
+
+type Value struct {
+	value any
+}
+
+func value[T any](v Value, kinds ...reflect.Kind) (t T) {
+	vv := reflect.ValueOf(v.value)
+	if slices.Contains(kinds, vv.Kind()) {
+		t = vv.Convert(reflect.TypeOf(t)).Interface().(T)
+	}
+	return
+}
+
+func values[T any](v Value, kinds ...reflect.Kind) (ts []T) {
+	switch vv := reflect.ValueOf(v.value); vv.Kind() {
+	case reflect.Slice:
+		if slices.Contains(kinds, vv.Type().Elem().Kind()) {
+			ts = make([]T, vv.Len())
+			for i := range vv.Len() {
+				ts[i] = vv.Index(i).Convert(reflect.TypeOf(ts[i])).Interface().(T)
+			}
+		}
+	}
+	return
+}
+
+// Int returns Value as a signed integer. If it is not a signed integer, it returns 0.
+func (v Value) Int() int64 {
+	return value[int64](v, reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64)
+}
+
+// Ints returns Value as a signed integer slice. If it is not a signed integer slice, it returns nil.
+func (v Value) Ints() (i64s []int64) {
+	return values[int64](v, reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64)
+}
+
+// Uint converts an unsigned integer value to uint64. If the value is not a unsigned integer, it returns 0.
+func (v Value) Uint() uint64 {
+	return value[uint64](v, reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64)
+}
+
+// Uints returns Value as a unsigned integer slice. If it is not a unsigned integer slice, it returns nil.
+func (v Value) Uints() (u64s []uint64) {
+	return values[uint64](v, reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64)
+}
+
+// Float returns Value as a float. If it is not a float, it returns 0.
+func (v Value) Float() float64 {
+	return value[float64](v, reflect.Float32, reflect.Float64)
+}
+
+// Floats returns Value as a float slice. If it is not a float slice, it returns nil.
+func (v Value) Floats() (f64s []float64) {
+	return values[float64](v, reflect.Float32, reflect.Float64)
+}
+
+// Bool returns Value as a boolean. If it is not a boolean, it returns false.
+func (v Value) Bool() bool {
+	return value[bool](v, reflect.Bool)
+}
+
+// Bools returns Value as a boolean slice. If it is not a boolean slice, it returns nil.
+func (v Value) Bools() (bools []bool) {
+	return values[bool](v, reflect.Bool)
+}
+
+// String returns Value as a string. If it is not a string, it returns an empty string.
+func (v Value) String() string {
+	return value[string](v, reflect.String)
+}
+
+// Strings returns Value as a string slice. If it is not a string slice, it returns nil.
+func (v Value) Strings() (strings []string) {
+	return values[string](v, reflect.String)
+}
--- a/fs/gguf/keyvalue_test.go
+++ b/fs/gguf/keyvalue_test.go
@@ -0,0 +1,208 @@
+package gguf
+
+import (
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+)
+
+func split(name string, values map[string][]any) (matched []any, unmatched []any) {
+	for key, value := range values {
+		if key == name {
+			matched = value
+		} else {
+			unmatched = append(unmatched, value...)
+		}
+	}
+	return
+}
+
+func TestValue(t *testing.T) {
+	values := map[string][]any{
+		"int64":   {int(42), int8(42), int16(42), int32(42), int64(42)},
+		"uint64":  {uint(42), uint8(42), uint16(42), uint32(42), uint64(42)},
+		"float64": {float32(42), float64(42)},
+		"string":  {"42", "hello"},
+		"bool":    {true, false},
+	}
+
+	t.Run("int64", func(t *testing.T) {
+		matched, unmatched := split("int64", values)
+		for _, v := range matched {
+			kv := KeyValue{"key", Value{v}}
+			if i64 := kv.Int(); i64 != 42 {
+				t.Errorf("expected 42, got %d", i64)
+			}
+		}
+
+		for _, v := range unmatched {
+			kv := KeyValue{"key", Value{v}}
+			if i64 := kv.Int(); i64 != 0 {
+				t.Errorf("expected 42, got %d", i64)
+			}
+		}
+	})
+
+	t.Run("uint64", func(t *testing.T) {
+		matched, unmatched := split("uint64", values)
+		for _, v := range matched {
+			kv := KeyValue{"key", Value{v}}
+			if u64 := kv.Uint(); u64 != 42 {
+				t.Errorf("expected 42, got %d", u64)
+			}
+		}
+
+		for _, v := range unmatched {
+			kv := KeyValue{"key", Value{v}}
+			if u64 := kv.Uint(); u64 != 0 {
+				t.Errorf("expected 42, got %d", u64)
+			}
+		}
+	})
+
+	t.Run("float64", func(t *testing.T) {
+		matched, unmatched := split("float64", values)
+		for _, v := range matched {
+			kv := KeyValue{"key", Value{v}}
+			if f64 := kv.Float(); f64 != 42 {
+				t.Errorf("expected 42, got %f", f64)
+			}
+		}
+
+		for _, v := range unmatched {
+			kv := KeyValue{"key", Value{v}}
+			if f64 := kv.Float(); f64 != 0 {
+				t.Errorf("expected 42, got %f", f64)
+			}
+		}
+	})
+
+	t.Run("string", func(t *testing.T) {
+		matched, unmatched := split("string", values)
+		for _, v := range matched {
+			kv := KeyValue{"key", Value{v}}
+			if s := kv.String(); s != v {
+				t.Errorf("expected 42, got %s", s)
+			}
+		}
+
+		for _, v := range unmatched {
+			kv := KeyValue{"key", Value{v}}
+			if s := kv.String(); s != "" {
+				t.Errorf("expected 42, got %s", s)
+			}
+		}
+	})
+
+	t.Run("bool", func(t *testing.T) {
+		matched, unmatched := split("bool", values)
+		for _, v := range matched {
+			kv := KeyValue{"key", Value{v}}
+			if b := kv.Bool(); b != v {
+				t.Errorf("expected true, got %v", b)
+			}
+		}
+
+		for _, v := range unmatched {
+			kv := KeyValue{"key", Value{v}}
+			if b := kv.Bool(); b != false {
+				t.Errorf("expected false, got %v", b)
+			}
+		}
+	})
+}
+
+func TestValues(t *testing.T) {
+	values := map[string][]any{
+		"int64s":   {[]int{42}, []int8{42}, []int16{42}, []int32{42}, []int64{42}},
+		"uint64s":  {[]uint{42}, []uint8{42}, []uint16{42}, []uint32{42}, []uint64{42}},
+		"float64s": {[]float32{42}, []float64{42}},
+		"strings":  {[]string{"42"}, []string{"hello"}},
+		"bools":    {[]bool{true}, []bool{false}},
+	}
+
+	t.Run("int64s", func(t *testing.T) {
+		matched, unmatched := split("int64s", values)
+		for _, v := range matched {
+			kv := KeyValue{"key", Value{v}}
+			if diff := cmp.Diff(kv.Ints(), []int64{42}); diff != "" {
+				t.Errorf("diff: %s", diff)
+			}
+		}
+
+		for _, v := range unmatched {
+			kv := KeyValue{"key", Value{v}}
+			if i64s := kv.Ints(); i64s != nil {
+				t.Errorf("expected nil, got %v", i64s)
+			}
+		}
+	})
+
+	t.Run("uint64s", func(t *testing.T) {
+		matched, unmatched := split("uint64s", values)
+		for _, v := range matched {
+			kv := KeyValue{"key", Value{v}}
+			if diff := cmp.Diff(kv.Uints(), []uint64{42}); diff != "" {
+				t.Errorf("diff: %s", diff)
+			}
+		}
+
+		for _, v := range unmatched {
+			kv := KeyValue{"key", Value{v}}
+			if u64s := kv.Uints(); u64s != nil {
+				t.Errorf("expected nil, got %v", u64s)
+			}
+		}
+	})
+
+	t.Run("float64s", func(t *testing.T) {
+		matched, unmatched := split("float64s", values)
+		for _, v := range matched {
+			kv := KeyValue{"key", Value{v}}
+			if diff := cmp.Diff(kv.Floats(), []float64{42}); diff != "" {
+				t.Errorf("diff: %s", diff)
+			}
+		}
+
+		for _, v := range unmatched {
+			kv := KeyValue{"key", Value{v}}
+			if f64s := kv.Floats(); f64s != nil {
+				t.Errorf("expected nil, got %v", f64s)
+			}
+		}
+	})
+
+	t.Run("strings", func(t *testing.T) {
+		matched, unmatched := split("strings", values)
+		for _, v := range matched {
+			kv := KeyValue{"key", Value{v}}
+			if diff := cmp.Diff(kv.Strings(), v); diff != "" {
+				t.Errorf("diff: %s", diff)
+			}
+		}
+
+		for _, v := range unmatched {
+			kv := KeyValue{"key", Value{v}}
+			if s := kv.Strings(); s != nil {
+				t.Errorf("expected nil, got %v", s)
+			}
+		}
+	})
+
+	t.Run("bools", func(t *testing.T) {
+		matched, unmatched := split("bools", values)
+		for _, v := range matched {
+			kv := KeyValue{"key", Value{v}}
+			if diff := cmp.Diff(kv.Bools(), v); diff != "" {
+				t.Errorf("diff: %s", diff)
+			}
+		}
+
+		for _, v := range unmatched {
+			kv := KeyValue{"key", Value{v}}
+			if b := kv.Bools(); b != nil {
+				t.Errorf("expected nil, got %v", b)
+			}
+		}
+	})
+}
--- a/fs/gguf/lazy.go
+++ b/fs/gguf/lazy.go
@@ -0,0 +1,89 @@
+package gguf
+
+import (
+	"encoding/binary"
+	"iter"
+	"log/slog"
+)
+
+type lazy[T any] struct {
+	count  uint64
+	next   func() (T, bool)
+	stop   func()
+	values []T
+
+	// successFunc is called when all values have been successfully read.
+	successFunc func() error
+}
+
+func newLazy[T any](f *File, fn func() (T, error)) (*lazy[T], error) {
+	it := lazy[T]{}
+	if err := binary.Read(f.reader, binary.LittleEndian, &it.count); err != nil {
+		return nil, err
+	}
+
+	it.values = make([]T, 0)
+	it.next, it.stop = iter.Pull(func(yield func(T) bool) {
+		for i := range it.count {
+			t, err := fn()
+			if err != nil {
+				slog.Error("error reading tensor", "index", i, "error", err)
+				return
+			}
+
+			it.values = append(it.values, t)
+			if !yield(t) {
+				break
+			}
+		}
+
+		if it.successFunc != nil {
+			it.successFunc()
+		}
+	})
+
+	return &it, nil
+}
+
+func (g *lazy[T]) Values() iter.Seq[T] {
+	return func(yield func(T) bool) {
+		for _, v := range g.All() {
+			if !yield(v) {
+				break
+			}
+		}
+	}
+}
+
+func (g *lazy[T]) All() iter.Seq2[int, T] {
+	return func(yield func(int, T) bool) {
+		for i := range int(g.count) {
+			if i < len(g.values) {
+				if !yield(i, g.values[i]) {
+					break
+				}
+			} else {
+				t, ok := g.next()
+				if !ok {
+					break
+				}
+
+				if !yield(i, t) {
+					break
+				}
+			}
+		}
+	}
+}
+
+func (g *lazy[T]) rest() (collected bool) {
+	for {
+		_, ok := g.next()
+		collected = collected || ok
+		if !ok {
+			break
+		}
+	}
+
+	return collected
+}
--- a/fs/gguf/reader.go
+++ b/fs/gguf/reader.go
@@ -0,0 +1,23 @@
+package gguf
+
+import (
+	"bufio"
+	"io"
+)
+
+type bufferedReader struct {
+	offset int64
+	*bufio.Reader
+}
+
+func newBufferedReader(rs io.ReadSeeker, size int) *bufferedReader {
+	return &bufferedReader{
+		Reader: bufio.NewReaderSize(rs, size),
+	}
+}
+
+func (rs *bufferedReader) Read(p []byte) (n int, err error) {
+	n, err = rs.Reader.Read(p)
+	rs.offset += int64(n)
+	return n, err
+}
--- a/fs/gguf/tensor.go
+++ b/fs/gguf/tensor.go
@@ -0,0 +1,288 @@
+package gguf
+
+import (
+	"log/slog"
+	"strings"
+)
+
+type TensorInfo struct {
+	Name   string
+	Offset uint64
+	Shape  []uint64
+	Type   TensorType
+}
+
+func (ti TensorInfo) Valid() bool {
+	return ti.Name != "" && ti.NumBytes() > 0
+}
+
+func (ti TensorInfo) NumValues() int64 {
+	var numItems int64 = 1
+	for _, dim := range ti.Shape {
+		numItems *= int64(dim)
+	}
+	return numItems
+}
+
+// NumBytes returns the number of bytes in the tensor.
+func (ti TensorInfo) NumBytes() int64 {
+	return int64(float64(ti.NumValues()) * ti.Type.NumBytes())
+}
+
+func (ti TensorInfo) LogValue() slog.Value {
+	return slog.GroupValue(
+		slog.String("name", ti.Name),
+		slog.Int64("offset", int64(ti.Offset)),
+		slog.Any("shape", ti.Shape),
+		slog.Int64("num_values", ti.NumValues()),
+		slog.Int64("num_bytes", ti.NumBytes()),
+		slog.Any("type", ti.Type),
+	)
+}
+
+type TensorType uint32
+
+const (
+	TensorTypeF32 TensorType = iota
+	TensorTypeF16
+	TensorTypeQ4_0
+	TensorTypeQ4_1
+
+	// unexported // unused in gguf
+	tensorTypeQ4_2
+	tensorTypeQ4_3
+
+	TensorTypeQ5_0
+	TensorTypeQ5_1
+	TensorTypeQ8_0
+	TensorTypeQ8_1
+	TensorTypeQ2_K
+	TensorTypeQ3_K
+	TensorTypeQ4_K
+	TensorTypeQ5_K
+	TensorTypeQ6_K
+	TensorTypeQ8_K
+
+	// unexported // unquantizable by ollama
+	tensorTypeIQ2_XXS
+	tensorTypeIQ2_XS
+	tensorTypeIQ3_XXS
+	tensorTypeIQ1_S
+	tensorTypeIQ4_NL
+	tensorTypeIQ3_S
+	tensorTypeIQ2_S
+	tensorTypeIQ4_XS
+
+	TensorTypeI8
+	TensorTypeI16
+	TensorTypeI32
+	TensorTypeI64
+	TensorTypeF64
+
+	// unexported // unquantizable by ollama
+	tensorTypeIQ1_M
+
+	TensorTypeBF16
+
+	// unexported // unused in gguf
+	tensorTypeQ4_0_4_4
+	tensorTypeQ4_0_4_8
+	tensorTypeQ4_0_8_8
+
+	// unexported // unquantizable by ollama
+	tensorTypeTQ1_0
+	tensorTypeTQ2_0
+
+	// unexported // unused in gguf
+	tensorTypeIQ4_NL_4_4
+	tensorTypeIQ4_NL_4_8
+	tensorTypeIQ4_NL_8_8
+)
+
+func (tt TensorType) NumBytes() float64 {
+	return float64(tt.typeSize()) / float64(tt.blockSize())
+}
+
+func (tt TensorType) typeSize() int64 {
+	switch tt {
+	case TensorTypeF32:
+		return 4
+	case TensorTypeF16:
+		return 2
+	case TensorTypeQ4_0:
+		return 2 + tt.blockSize()/2
+	case TensorTypeQ4_1:
+		return 2 + 2 + tt.blockSize()/2
+	case TensorTypeQ5_0:
+		return 2 + 4 + tt.blockSize()/2
+	case TensorTypeQ5_1:
+		return 2 + 2 + 4 + tt.blockSize()/2
+	case TensorTypeQ8_0:
+		return 2 + tt.blockSize()
+	case TensorTypeQ8_1:
+		return 2 + 2 + tt.blockSize()
+	case TensorTypeQ2_K:
+		return tt.blockSize()/16 + tt.blockSize()/4 + 2 + 2
+	case TensorTypeQ3_K:
+		return tt.blockSize()/8 + tt.blockSize()/4 + 12 + 2
+	case TensorTypeQ4_K:
+		return 2 + 2 + 12 + tt.blockSize()/2
+	case TensorTypeQ5_K:
+		return 2 + 2 + 12 + tt.blockSize()/8 + tt.blockSize()/2
+	case TensorTypeQ6_K:
+		return tt.blockSize()/2 + tt.blockSize()/4 + tt.blockSize()/16 + 2
+	case TensorTypeQ8_K:
+		return 4 + tt.blockSize() + 2*tt.blockSize()/16
+	case tensorTypeIQ2_XXS:
+		return 2 + 2*tt.blockSize()/8
+	case tensorTypeIQ2_XS:
+		return 2 + 2*tt.blockSize()/8 + tt.blockSize()/32
+	case tensorTypeIQ3_XXS:
+		return 2 + tt.blockSize()/4 + tt.blockSize()/8
+	case tensorTypeIQ1_S:
+		return 2 + tt.blockSize()/8 + tt.blockSize()/16
+	case tensorTypeIQ4_NL:
+		return 2 + tt.blockSize()/2
+	case tensorTypeIQ3_S:
+		return 2 + tt.blockSize()/4 + tt.blockSize()/8 + tt.blockSize()/32 + 4
+	case tensorTypeIQ2_S:
+		return 2 + tt.blockSize()/4 + tt.blockSize()/16
+	case tensorTypeIQ4_XS:
+		return 2 + 2 + tt.blockSize()/2 + tt.blockSize()/64
+	case TensorTypeI8:
+		return 1
+	case TensorTypeI16:
+		return 2
+	case TensorTypeI32:
+		return 4
+	case TensorTypeI64:
+		return 8
+	case TensorTypeF64:
+		return 8
+	case tensorTypeIQ1_M:
+		return tt.blockSize()/8 + tt.blockSize()/16 + tt.blockSize()/32
+	case TensorTypeBF16:
+		return 2
+	default:
+		return 0
+	}
+}
+
+func (tt TensorType) blockSize() int64 {
+	switch tt {
+	case TensorTypeF32,
+		TensorTypeF16,
+		TensorTypeI8,
+		TensorTypeI16,
+		TensorTypeI32,
+		TensorTypeI64,
+		TensorTypeF64,
+		TensorTypeBF16:
+		return 1
+	case TensorTypeQ4_0,
+		TensorTypeQ4_1,
+		TensorTypeQ5_0,
+		TensorTypeQ5_1,
+		TensorTypeQ8_0,
+		TensorTypeQ8_1,
+		tensorTypeIQ4_NL:
+		return 32
+	default:
+		return 256
+	}
+}
+
+func (tt TensorType) String() string {
+	switch tt {
+	case TensorTypeF32:
+		return "f32"
+	case TensorTypeF16:
+		return "f16"
+	case TensorTypeQ4_0:
+		return "q4_0"
+	case TensorTypeQ4_1:
+		return "q4_1"
+	case tensorTypeQ4_2:
+		return "q4_2"
+	case tensorTypeQ4_3:
+		return "q4_3"
+	case TensorTypeQ5_0:
+		return "q5_0"
+	case TensorTypeQ5_1:
+		return "q5_1"
+	case TensorTypeQ8_0:
+		return "q8_0"
+	case TensorTypeQ8_1:
+		return "q8_1"
+	case TensorTypeQ2_K:
+		return "q2_k"
+	case TensorTypeQ3_K:
+		return "q3_k"
+	case TensorTypeQ4_K:
+		return "q4_k"
+	case TensorTypeQ5_K:
+		return "q5_k"
+	case TensorTypeQ6_K:
+		return "q6_k"
+	case TensorTypeQ8_K:
+		return "q8_k"
+	case tensorTypeIQ2_XXS:
+		return "iq2_xxs"
+	case tensorTypeIQ2_XS:
+		return "iq2_xs"
+	case tensorTypeIQ3_XXS:
+		return "iq3_xxs"
+	case tensorTypeIQ1_S:
+		return "iq1_s"
+	case tensorTypeIQ4_NL:
+		return "iq4_nl"
+	case tensorTypeIQ3_S:
+		return "iq3_s"
+	case tensorTypeIQ2_S:
+		return "iq2_s"
+	case tensorTypeIQ4_XS:
+		return "iq4_xs"
+	case TensorTypeI8:
+		return "i8"
+	case TensorTypeI16:
+		return "i16"
+	case TensorTypeI32:
+		return "i32"
+	case TensorTypeI64:
+		return "i64"
+	case TensorTypeF64:
+		return "f64"
+	case tensorTypeIQ1_M:
+		return "iq1_m"
+	case TensorTypeBF16:
+		return "bf16"
+	case tensorTypeQ4_0_4_4:
+		return "q4_0_4_4"
+	case tensorTypeQ4_0_4_8:
+		return "q4_0_4_8"
+	case tensorTypeQ4_0_8_8:
+		return "q4_0_8_8"
+	case tensorTypeTQ1_0:
+		return "tq1_0"
+	case tensorTypeTQ2_0:
+		return "tq2_0"
+	case tensorTypeIQ4_NL_4_4:
+		return "iq4_nl_4_4"
+	case tensorTypeIQ4_NL_4_8:
+		return "iq4_nl_4_8"
+	case tensorTypeIQ4_NL_8_8:
+		return "iq4_nl_8_8"
+	default:
+		return "unknown"
+	}
+}
+
+func (tt TensorType) LogValue() slog.Value {
+	return slog.GroupValue(
+		slog.Uint64("value", uint64(tt)),
+		slog.String("name", strings.ToUpper(tt.String())),
+		slog.Int64("size", tt.typeSize()),
+		slog.Int64("block_size", tt.blockSize()),
+		slog.Float64("num_bytes", tt.NumBytes()),
+	)
+}
--- a/go.mod
+++ b/go.mod
@@ -19,8 +19,7 @@ require (
 	github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1
 	github.com/dlclark/regexp2 v1.11.4
 	github.com/emirpasic/gods/v2 v2.0.0-alpha
-	github.com/go-json-experiment/json v0.0.0-20250417205406-170dfdcf87d1
-	github.com/google/go-cmp v0.6.0
+	github.com/google/go-cmp v0.7.0
 	github.com/mattn/go-runewidth v0.0.14
 	github.com/nlpodyssey/gopickle v0.3.0
 	github.com/pdevine/tensor v0.0.0-20240510204454-f88f4562727c
--- a/go.sum
+++ b/go.sum
@@ -69,8 +69,6 @@ github.com/go-fonts/latin-modern v0.2.0/go.mod h1:rQVLdDMK+mK1xscDwsqM5J8U2jrRa3
 github.com/go-fonts/liberation v0.1.1/go.mod h1:K6qoJYypsmfVjWg8KOVDQhLc8UDgIK2HYqyqAO9z7GY=
 github.com/go-fonts/stix v0.1.0/go.mod h1:w/c1f0ldAUlJmLBvlbkvVXLAD+tAMqobIIQpmnUIzUY=
 github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9AVAgeJqvqgH9Q5CA+iKCZ2gyEVpxRU=
-github.com/go-json-experiment/json v0.0.0-20250417205406-170dfdcf87d1 h1:+VexzzkMLb1tnvpuQdGT/DicIRW7MN8ozsXqBMgp0Hk=
-github.com/go-json-experiment/json v0.0.0-20250417205406-170dfdcf87d1/go.mod h1:TiCD2a1pcmjd7YnhGH0f/zKNcCD06B029pHhzV23c2M=
 github.com/go-latex/latex v0.0.0-20210118124228-b3d85cf34e07/go.mod h1:CO1AlKB2CSIqUrmQPqA0gdRIlnLEY0gK5JGjh37zN5U=
 github.com/go-playground/assert/v2 v2.2.0 h1:JvknZsQTYeFEAhQwI4qEt9cyV5ONwRHC+lYKSsYSR8s=
 github.com/go-playground/assert/v2 v2.2.0/go.mod h1:VDjEfimB/XKnb+ZQfWdccd7VUvScMdVu0Titje2rxJ4=
@@ -114,8 +112,8 @@ github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/
 github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
 github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
 github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
-github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
-github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
+github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
+github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
 github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
 github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
 github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
--- a/integration/llm_image_test.go
+++ b/integration/llm_image_test.go
@@ -19,7 +19,7 @@ func TestVisionModels(t *testing.T) {
 	}
 	testCases := []testCase{
 		{
-			model: "llava:7b",
+			model: "qwen2.5vl",
 		},
 		{
 			model: "llama3.2-vision",
@@ -60,6 +60,7 @@ func TestVisionModels(t *testing.T) {
 }

 func TestIntegrationSplitBatch(t *testing.T) {
+	skipUnderMinVRAM(t, 6)
 	image, err := base64.StdEncoding.DecodeString(imageEncoding)
 	require.NoError(t, err)
 	req := api.GenerateRequest{
--- a/integration/testdata/embed.json
+++ b/integration/testdata/embed.json
--- a/kvcache/causal.go
+++ b/kvcache/causal.go
@@ -30,6 +30,11 @@ type Causal struct {

 	// ** current forward pass **

+	// curReserve indicates that this forward pass is only for
+	// memory reservation and we should not update our metadata
+	// based on it.
+	curReserve bool
+
 	// the active layer for Get and Put
 	curLayer int

@@ -159,12 +164,13 @@ func (c *Causal) Close() {
 }

 func (c *Causal) StartForward(ctx ml.Context, batch input.Batch, reserve bool) error {
+	c.curReserve = reserve
 	c.curBatchSize = len(batch.Positions)
 	c.curSequences = batch.Sequences
 	c.curPositions = batch.Positions
 	c.opts.Except = nil

-	if !reserve {
+	if !c.curReserve {
 		c.updateSlidingWindow()

 		var err error
@@ -211,10 +217,9 @@ func (c *Causal) StartForward(ctx ml.Context, batch input.Batch, reserve bool) e
 		c.curCellRange.max = len(c.cells) - 1
 	}

-	var err error
-	c.curMask, err = c.buildMask(ctx)
+	c.curMask = c.buildMask(ctx)

-	return err
+	return nil
 }

 func newRange() cellRange {
@@ -297,7 +302,7 @@ func roundUp(length, pad int) int {
 // Builds a mask of history x batch indicating whether for each token in the batch the
 // token in the history should apply. This is based on both the sequence and causality (the
 // position of the history is not ahead of the token in the batch).
-func (c *Causal) buildMask(ctx ml.Context) (ml.Tensor, error) {
+func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {
 	// Align and pad the two dimensions as required by the backend
 	batchSize := roundUp(c.curBatchSize, c.config.MaskBatchPadding)

@@ -305,6 +310,11 @@ func (c *Causal) buildMask(ctx ml.Context) (ml.Tensor, error) {
 	c.curCellRange.max = roundUp(c.curCellRange.max+1, c.config.CachePadding) - 1

 	length := c.curCellRange.max - c.curCellRange.min + 1
+
+	if c.curReserve {
+		return ctx.Input().Empty(c.config.MaskDType, length, batchSize)
+	}
+
 	mask := make([]float32, batchSize*length)

 	for i := range c.curBatchSize {
@@ -325,10 +335,7 @@ func (c *Causal) buildMask(ctx ml.Context) (ml.Tensor, error) {
 		mask[i] = float32(math.Inf(-1))
 	}

-	maskTensor, err := ctx.Input().FromFloatSlice(mask, length, batchSize)
-	if err != nil {
-		return nil, err
-	}
+	maskTensor := ctx.Input().FromFloatSlice(mask, length, batchSize)

 	if c.config.MaskDType != ml.DTypeF32 {
 		out := ctx.Input().Empty(c.config.MaskDType, maskTensor.Shape()...)
@@ -336,7 +343,7 @@ func (c *Causal) buildMask(ctx ml.Context) (ml.Tensor, error) {
 		maskTensor = out
 	}

-	return maskTensor, nil
+	return maskTensor
 }

 func (c *Causal) moveCells(ctx ml.Context, src, dst, length int) {
@@ -491,12 +498,7 @@ func (c *Causal) SetCausal(ctx ml.Context, opts CausalOptions) {
 	if !slices.Equal(c.opts.Except, opts.Except) {
 		c.opts = opts
 		if ctx != nil {
-			var err error
-			c.curMask, err = c.buildMask(ctx)
-			if err != nil {
-				// This error should never occur because we have previously built a mask with the same shape
-				panic(fmt.Errorf("SetCausal: %w", err))
-			}
+			c.curMask = c.buildMask(ctx)
 		}
 	}
 }
@@ -652,10 +654,7 @@ func (c *Causal) shift(seq int, beginIndex, offset int32) error {
 		}
 	}

-	kShift, err := ctx.Input().FromIntSlice(offsets, len(offsets))
-	if err != nil {
-		return err
-	}
+	kShift := ctx.Input().FromIntSlice(offsets, len(offsets))

 	for i, key := range c.keys {
 		if key == nil {
--- a/kvcache/causal_test.go
+++ b/kvcache/causal_test.go
@@ -344,7 +344,7 @@ func testCache(t *testing.T, backend ml.Backend, cache Cache, tests []testCase)
 			}

 			cache.SetLayer(0)
-			tensor, _ := context.FromFloatSlice(test.in, test.inShape...)
+			tensor := context.FromFloatSlice(test.in, test.inShape...)
 			cache.Put(context, tensor, tensor)

 			out, _, mask := cache.Get(context)
@@ -386,7 +386,7 @@ func TestCanResume(t *testing.T) {
 	}

 	cache.SetLayer(0)
-	tensor, _ := context.FromFloatSlice([]float32{1, 2, 3, 4}, 1, 1, 4)
+	tensor := context.FromFloatSlice([]float32{1, 2, 3, 4}, 1, 1, 4)
 	cache.Put(context, tensor, tensor)

 	// with window size 4, nothing has slid out of the window yet
@@ -413,7 +413,7 @@ func TestCanResume(t *testing.T) {
 	}

 	cache.SetLayer(0)
-	tensor, _ = context.FromFloatSlice([]float32{5, 6}, 1, 1, 2)
+	tensor = context.FromFloatSlice([]float32{5, 6}, 1, 1, 2)
 	cache.Put(context, tensor, tensor)

 	// only the latest position has overlapping windows
@@ -470,24 +470,24 @@ func (c *testContext) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
 	return c.Empty(dtype, shape...)
 }

-func (c *testContext) FromFloatSlice(s []float32, shape ...int) (ml.Tensor, error) {
+func (c *testContext) FromFloatSlice(s []float32, shape ...int) ml.Tensor {
 	t := c.Empty(ml.DTypeF32, shape...).(*testTensor)

 	copy(t.data, s)

-	return t, nil
+	return t
 }

-func (c *testContext) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
+func (c *testContext) FromIntSlice(s []int32, shape ...int) ml.Tensor {
 	f := make([]float32, len(s))
 	for i := range f {
 		f[i] = float32(s[i])
 	}

-	out, _ := c.FromFloatSlice(f, shape...)
+	out := c.FromFloatSlice(f, shape...)
 	out.(*testTensor).dtype = ml.DTypeI32

-	return out, nil
+	return out
 }

 func (c *testContext) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
@@ -496,7 +496,7 @@ func (c *testContext) Arange(start, stop, step float32, dtype ml.DType) ml.Tenso
 		s = append(s, i)
 	}

-	out, _ := c.FromFloatSlice(s, len(s))
+	out := c.FromFloatSlice(s, len(s))
 	out.(*testTensor).dtype = dtype
 	return out
 }
@@ -508,7 +508,7 @@ func (c *testContext) Forward(...ml.Tensor) ml.Context { return c }

 func (c *testContext) Compute(...ml.Tensor) {}

-func (c *testContext) Reserve() error { return nil }
+func (c *testContext) Reserve() {}

 func (c *testContext) MaxGraphNodes() int {
 	return 10
--- a/llama/llama.go
+++ b/llama/llama.go
@@ -544,7 +544,7 @@ func NewSamplingContext(model *Model, params SamplingParams) (*SamplingContext,
 	cparams.penalty_last_n = C.int32_t(params.RepeatLastN)
 	cparams.penalty_repeat = C.float(params.PenaltyRepeat)
 	cparams.penalty_freq = C.float(params.PenaltyFreq)
-	cparams.penalty_present = C.float(params.PenaltyFreq)
+	cparams.penalty_present = C.float(params.PenaltyPresent)
 	cparams.seed = C.uint32_t(params.Seed)

 	grammar := C.CString(params.Grammar)
@@ -580,7 +580,7 @@ func SchemaToGrammar(schema []byte) []byte {
 	defer C.free(unsafe.Pointer(cStr))

 	// Allocate buffer for grammar based on schema length but with upper bound
-	maxLen := min(1024*1024, len(schema)*4)
+	maxLen := max(32768, min(1024*1024, len(schema)*4))
 	buf := make([]byte, maxLen)

 	// Call C function to convert schema to grammar
@@ -602,7 +602,7 @@ type Grammar struct {
 	mu sync.Mutex
 }

-func NewGrammar(grammar string, vocabIds []uint32, vocabValues []string, eogTokens []uint32) *Grammar {
+func NewGrammar(grammar string, vocabIds []uint32, vocabValues []string, eogTokens []int32) *Grammar {
 	cGrammar := C.CString(grammar)
 	defer C.free(unsafe.Pointer(cGrammar))

@@ -622,7 +622,7 @@ func NewGrammar(grammar string, vocabIds []uint32, vocabValues []string, eogToke
 		cEogTokens[i] = C.uint32_t(token)
 	}

-	g := C.grammar_init(cGrammar, (*C.uint32_t)(unsafe.Pointer(&cTokens[0])), C.size_t(len(cTokens)), (**C.char)(unsafe.Pointer(&cPieces[0])), (*C.uint32_t)(unsafe.Pointer(&cEogTokens[0])), C.size_t(len(cEogTokens)))
+	g := C.grammar_init(cGrammar, unsafe.SliceData(cTokens), C.size_t(len(cTokens)), unsafe.SliceData(cPieces), unsafe.SliceData(cEogTokens), C.size_t(len(cEogTokens)))
 	if g == nil {
 		return nil
 	}
--- a/llama/patches/0015-add-argsort-and-cuda-copy-for-i32.patch
+++ b/llama/patches/0015-add-argsort-and-cuda-copy-for-i32.patch
@@ -0,0 +1,277 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Michael Yang <git@mxy.ng>
+Date: Thu, 1 May 2025 13:45:12 -0700
+Subject: [PATCH] add argsort and cuda copy for i32
+
+---
+ ggml/src/ggml-cpu/ops.cpp     |  43 ++++++++++++++
+ ggml/src/ggml-cuda/argsort.cu | 102 +++++++++++++++++++++++++++++++++-
+ ggml/src/ggml-cuda/cpy.cu     |  49 ++++++++++++++++
+ 3 files changed, 192 insertions(+), 2 deletions(-)
+
+diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
+index becdae07..7a44b6cf 100644
+--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
+@@ -6890,6 +6890,45 @@ static void ggml_compute_forward_argsort_f32(
+     }
+ }
+ 
+static void ggml_compute_forward_argsort_i32(
+    const ggml_compute_params * params,
+    ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    GGML_ASSERT(nb0 == sizeof(int32_t));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t nr = ggml_nrows(src0);
+
+    ggml_sort_order order = (ggml_sort_order) ggml_get_op_params_i32(dst, 0);
+
+    for (int64_t i = ith; i < nr; i += nth) {
+        int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1);
+        const int32_t * src_data = (int32_t *)((char *) src0->data + i*nb01);
+
+        for (int64_t j = 0; j < ne0; j++) {
+            dst_data[j] = j;
+        }
+
+        // C doesn't have a functional sort, so we do a bubble sort instead
+        for (int64_t j = 0; j < ne0; j++) {
+            for (int64_t k = j + 1; k < ne0; k++) {
+                if ((order == GGML_SORT_ORDER_ASC  && src_data[dst_data[j]] > src_data[dst_data[k]]) ||
+                    (order == GGML_SORT_ORDER_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) {
+                    int32_t tmp = dst_data[j];
+                    dst_data[j] = dst_data[k];
+                    dst_data[k] = tmp;
+                }
+            }
+        }
+    }
+}
+
+ void ggml_compute_forward_argsort(
+     const ggml_compute_params * params,
+     ggml_tensor * dst) {
+@@ -6901,6 +6940,10 @@ void ggml_compute_forward_argsort(
+             {
+                 ggml_compute_forward_argsort_f32(params, dst);
+             } break;
+        case GGML_TYPE_I32:
+            {
+                ggml_compute_forward_argsort_i32(params, dst);
+            } break;
+         default:
+             {
+                 GGML_ABORT("fatal error");
+diff --git a/ggml/src/ggml-cuda/argsort.cu b/ggml/src/ggml-cuda/argsort.cu
+index 607ded85..53b02634 100644
+--- a/ggml/src/ggml-cuda/argsort.cu
+++ b/ggml/src/ggml-cuda/argsort.cu
+@@ -85,13 +85,107 @@ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, co
+     }
+ }
+ 
+
+template<ggml_sort_order order>
+static __global__ void k_argsort_i32_i32(const int32_t * x, int * dst, const int ncols, const int ncols_pad) {
+    extern __shared__ int shared_mem[];
+    int * indices = shared_mem;
+
+    const int tid = threadIdx.x;
+    const int row = blockIdx.y;
+
+    // Initialize all indices, handling the case where threads < ncols_pad
+    for (int i = tid; i < ncols_pad; i += blockDim.x) {
+        indices[i] = i < ncols ? i : 0; // Use 0 for padding indices
+    }
+    __syncthreads();
+
+    // Bitonic sort
+    for (int k = 2; k <= ncols_pad; k *= 2) {
+        for (int j = k/2; j > 0; j /= 2) {
+            for (int i = tid; i < ncols_pad; i += blockDim.x) {
+                const int ij = i ^ j;
+                if (ij > i) {
+                    // Only compare values within the actual data range
+                    if (i < ncols && ij < ncols) {
+                        if ((i & k) == 0) {
+                            if (order == GGML_SORT_ORDER_ASC) {
+                                if (x[row * ncols + indices[i]] > x[row * ncols + indices[ij]]) {
+                                    int tmp = indices[i];
+                                    indices[i] = indices[ij];
+                                    indices[ij] = tmp;
+                                }
+                            } else {
+                                if (x[row * ncols + indices[i]] < x[row * ncols + indices[ij]]) {
+                                    int tmp = indices[i];
+                                    indices[i] = indices[ij];
+                                    indices[ij] = tmp;
+                                }
+                            }
+                        } else {
+                            if (order == GGML_SORT_ORDER_ASC) {
+                                if (x[row * ncols + indices[i]] < x[row * ncols + indices[ij]]) {
+                                    int tmp = indices[i];
+                                    indices[i] = indices[ij];
+                                    indices[ij] = tmp;
+                                }
+                            } else {
+                                if (x[row * ncols + indices[i]] > x[row * ncols + indices[ij]]) {
+                                    int tmp = indices[i];
+                                    indices[i] = indices[ij];
+                                    indices[ij] = tmp;
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+            __syncthreads();
+        }
+    }
+
+    // Write sorted indices to output, only threads handling valid data
+    for (int i = tid; i < ncols; i += blockDim.x) {
+        dst[row * ncols + i] = indices[i];
+    }
+}
+
+static void argsort_i32_i32_cuda(const int32_t * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
+    // Bitonic sort requires ncols to be power of 2
+    const int ncols_pad = next_power_of_2(ncols);
+
+    // Ensure thread count doesn't exceed maximum (typically 1024)
+    const int max_threads = 1024;  // This is the typical max for most GPUs
+    const int threads_per_block = ncols_pad > max_threads ? max_threads : ncols_pad;
+
+    const dim3 block_dims(threads_per_block, 1, 1);
+    const dim3 block_nums(1, nrows, 1);
+    const size_t shared_mem = ncols_pad * sizeof(int);
+
+    // Check if shared memory size is within limits
+    const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb;
+
+    // Instead of logging an error, use GGML_ASSERT with a descriptive message
+    GGML_ASSERT(shared_mem <= max_shared_mem && "argsort: required shared memory exceeds device limit");
+
+    // Launch kernels with the updated thread configuration
+    if (order == GGML_SORT_ORDER_ASC) {
+        k_argsort_i32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
+    } else if (order == GGML_SORT_ORDER_DESC) {
+        k_argsort_i32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
+    } else {
+        GGML_ABORT("fatal error");
+    }
+}
+
+
+ void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+     const ggml_tensor * src0 = dst->src[0];
+     const float * src0_d = (const float *)src0->data;
+     float * dst_d = (float *)dst->data;
+     cudaStream_t stream = ctx.stream();
+ 
+-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_I32);
+     GGML_ASSERT( dst->type == GGML_TYPE_I32);
+     GGML_ASSERT(ggml_is_contiguous(src0));
+ 
+@@ -100,5 +194,9 @@ void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+ 
+     enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
+ 
+-    argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream);
+    if (src0->type == GGML_TYPE_I32) {
+        argsort_i32_i32_cuda((const int32_t *)src0_d, (int *)dst_d, ncols, nrows, order, stream);
+    } else {
+        argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream);
+    }
+ }
+diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
+index 2d46176e..47383486 100644
+--- a/ggml/src/ggml-cuda/cpy.cu
+++ b/ggml/src/ggml-cuda/cpy.cu
+@@ -38,6 +38,13 @@ static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) {
+     *dsti = *xi;
+ }
+ 
+static __device__ void cpy_1_i32_i32(const char * cxi, char * cdsti) {
+    const int32_t * xi = (const int32_t *) cxi;
+    int32_t * dsti = (int32_t *) cdsti;
+
+    *dsti = *xi;
+}
+
+ template <cpy_kernel_t cpy_1>
+ static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const int ne,
+                                    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+@@ -68,6 +75,44 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const in
+     cpy_1(cx + x_offset, cdst + dst_offset);
+ }
+ 
+// First, add this template function after the other template functions
+template <cpy_kernel_t cpy_1>
+static __global__ void cpy_i32_i32(const char * cx, char * cdst, const int ne,
+                                 const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+                                 const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                                 const int nb12, const int nb13) {
+    const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= ne) {
+        return;
+    }
+
+    const int64_t i03 = i/(ne00 * ne01 * ne02);
+    const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
+    const int64_t i01 = (i - i03*ne00*ne01*ne02  -  i02*ne01*ne00) / ne00;
+    const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
+    const int64_t x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
+
+    const int64_t i13 = i/(ne10 * ne11 * ne12);
+    const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
+    const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
+    const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
+    const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13;
+
+    cpy_1(cx + x_offset, cdst + dst_offset);
+}
+
+// Then modify the ggml_cpy_i32_i32_cuda function to use the new template
+static void ggml_cpy_i32_i32_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int graph_cpynode_index) {
+
+    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+    cpy_i32_i32<cpy_1_i32_i32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+}
+
+ static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
+     const float * xi = (const float *) cxi;
+     block_q8_0 * dsti = (block_q8_0 *) cdsti;
+@@ -631,6 +676,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
+         ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
+         ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+    } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
+        ggml_cpy_i32_i32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+     } else {
+         GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
+                 ggml_type_name(src0->type), ggml_type_name(src1->type));
+@@ -686,6 +733,8 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
+         return (void*) cpy_f32_f16<cpy_1_f32_f16>;
+     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
+         return (void*) cpy_f32_f16<cpy_1_f16_f32>;
+    } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
+        return (void*) cpy_i32_i32<cpy_1_i32_i32>;
+     } else {
+         GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
+                 ggml_type_name(src0->type), ggml_type_name(src1->type));
--- a/llama/patches/0016-graph-memory-reporting-on-failure.patch
+++ b/llama/patches/0016-graph-memory-reporting-on-failure.patch
@@ -0,0 +1,156 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Jesse Gross <jesse@ollama.com>
+Date: Fri, 18 Apr 2025 15:58:19 -0700
+Subject: [PATCH] graph memory reporting on failure
+
+---
+ ggml/include/ggml-alloc.h   |  6 ++++++
+ ggml/include/ggml-backend.h |  6 ++++++
+ ggml/src/ggml-alloc.c       | 38 +++++++++++++++++++++++++++++++++----
+ ggml/src/ggml-backend.cpp   | 10 ++++++++++
+ 4 files changed, 56 insertions(+), 4 deletions(-)
+
+diff --git a/ggml/include/ggml-alloc.h b/ggml/include/ggml-alloc.h
+index 2cb150fd..781b1e10 100644
+--- a/ggml/include/ggml-alloc.h
+++ b/ggml/include/ggml-alloc.h
+@@ -66,6 +66,12 @@ GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph
+ 
+ GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);
+ 
+struct ggml_allocr_buffer_status {
+    size_t size;
+    bool allocated;
+};
+GGML_API struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id);
+
+ // Utils
+ // Create a buffer and allocate all the tensors in a ggml_context
+ GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
+diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
+index 778927f6..74e46716 100644
+--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
+@@ -304,6 +304,12 @@ extern "C" {
+ 
+     GGML_API size_t               ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
+ 
+    struct ggml_backend_buffer_status {
+        size_t size;
+        bool allocated;
+    };
+    GGML_API struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
+
+     GGML_API void                 ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
+     GGML_API ggml_backend_t       ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
+ 
+diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
+index 5fd379f6..04812990 100644
+--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
+@@ -364,6 +364,7 @@ struct node_alloc {
+ struct ggml_gallocr {
+     ggml_backend_buffer_type_t * bufts; // [n_buffers]
+     ggml_backend_buffer_t * buffers; // [n_buffers]
+    size_t *buffer_sizes; // [n_buffers]
+     struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
+     int n_buffers;
+ 
+@@ -387,6 +388,9 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
+     galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
+     GGML_ASSERT(galloc->buffers != NULL);
+ 
+    galloc->buffer_sizes = calloc(n_bufs, sizeof(size_t));
+    GGML_ASSERT(galloc->buffer_sizes != NULL);
+
+     galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
+     GGML_ASSERT(galloc->buf_tallocs != NULL);
+ 
+@@ -453,6 +457,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
+     ggml_hash_set_free(&galloc->hash_set);
+     free(galloc->hash_values);
+     free(galloc->bufts);
+    free(galloc->buffer_sizes);
+     free(galloc->buffers);
+     free(galloc->buf_tallocs);
+     free(galloc->node_allocs);
+@@ -748,6 +753,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
+         }
+     }
+ 
+    bool success = true;
+
+     // reallocate buffers if needed
+     for (int i = 0; i < galloc->n_buffers; i++) {
+         // if the buffer type is used multiple times, we reuse the same buffer
+@@ -769,15 +776,20 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
+ 
+             ggml_backend_buffer_free(galloc->buffers[i]);
+             galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
+-            if (galloc->buffers[i] == NULL) {
+            if (galloc->buffers[i]) {
+                galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
+                ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
+            } else {
+                 GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
+-                return false;
+                galloc->buffer_sizes[i] = new_size;
+                success = false;
+             }
+-            ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
+        } else {
+            galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
+         }
+     }
+ 
+-    return true;
+    return success;
+ }
+ 
+ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
+@@ -934,6 +946,24 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
+     return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
+ }
+ 
+struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
+    GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers);
+
+    for (int i = 0; i < buffer_id; i++) {
+        if (galloc->buf_tallocs[i] == galloc->buf_tallocs[buffer_id]) {
+            // This buffer is the same as a previous one due to the same buffer type being used multiple times
+            // (See above.) However, we need a different check because multiple buffers might be NULL in our
+            // case and we still want to know the attempted size.
+
+            struct ggml_allocr_buffer_status status = {0, true};
+            return status;
+        }
+    }
+
+    struct ggml_allocr_buffer_status status = {galloc->buffer_sizes[buffer_id], galloc->buffers[buffer_id] != NULL};
+    return status;
+}
+
+ // utils
+ 
+ static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
+diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
+index 0ce73a99..be335e8c 100644
+--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
+@@ -1629,6 +1629,16 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
+     return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
+ }
+ 
+struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
+    int backend_index = ggml_backend_sched_backend_id(sched, backend);
+    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
+
+    struct ggml_allocr_buffer_status allocr_status = ggml_gallocr_get_attempted_buffer_size(sched->galloc, backend_index);
+    struct ggml_backend_buffer_status status = {allocr_status.size, allocr_status.allocated};
+
+    return status;
+}
+
+ void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
+     int backend_index = ggml_backend_sched_backend_id(sched, backend);
+     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
--- a/llama/patches/0017-ggml-Export-GPU-UUIDs.patch
+++ b/llama/patches/0017-ggml-Export-GPU-UUIDs.patch
@@ -0,0 +1,102 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Jesse Gross <jesse@ollama.com>
+Date: Thu, 24 Apr 2025 14:48:51 -0700
+Subject: [PATCH] ggml: Export GPU UUIDs
+
+This enables matching up devices and information reported by the backend
+with tools (e.g. nvidia-smi) and system management libraries (e.g. nvml).
+---
+ ggml/include/ggml-backend.h      |  1 +
+ ggml/src/ggml-cuda/ggml-cuda.cu  | 33 ++++++++++++++++++++++++++++++++
+ ggml/src/ggml-metal/ggml-metal.m |  1 +
+ 3 files changed, 35 insertions(+)
+
+diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
+index 74e46716..a880df33 100644
+--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
+@@ -152,6 +152,7 @@ extern "C" {
+     struct ggml_backend_dev_props {
+         const char * name;
+         const char * description;
+        const char * uuid;
+         size_t memory_free;
+         size_t memory_total;
+         enum ggml_backend_dev_type type;
+diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
+index cb0d8528..4c829153 100644
+--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
+@@ -2884,6 +2884,7 @@ struct ggml_backend_cuda_device_context {
+     int device;
+     std::string name;
+     std::string description;
+    std::string uuid;
+ };
+ 
+ static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
+@@ -2896,6 +2897,11 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t
+     return ctx->description.c_str();
+ }
+ 
+static const char * ggml_backend_cuda_device_get_uuid(ggml_backend_dev_t dev) {
+    ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
+    return ctx->uuid.c_str();
+}
+
+ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
+     ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
+     ggml_cuda_set_device(ctx->device);
+@@ -2910,6 +2916,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
+ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
+     props->name        = ggml_backend_cuda_device_get_name(dev);
+     props->description = ggml_backend_cuda_device_get_description(dev);
+    props->uuid        = ggml_backend_cuda_device_get_uuid(dev);
+     props->type        = ggml_backend_cuda_device_get_type(dev);
+     ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
+ 
+@@ -3458,6 +3465,32 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
+                 CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
+                 dev_ctx->description = prop.name;
+ 
+                #if !defined(GGML_USE_HIP)
+                char uuid[64];
+                snprintf(uuid, sizeof(uuid),
+                    "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
+                    (unsigned char)prop.uuid.bytes[0],
+                    (unsigned char)prop.uuid.bytes[1],
+                    (unsigned char)prop.uuid.bytes[2],
+                    (unsigned char)prop.uuid.bytes[3],
+                    (unsigned char)prop.uuid.bytes[4],
+                    (unsigned char)prop.uuid.bytes[5],
+                    (unsigned char)prop.uuid.bytes[6],
+                    (unsigned char)prop.uuid.bytes[7],
+                    (unsigned char)prop.uuid.bytes[8],
+                    (unsigned char)prop.uuid.bytes[9],
+                    (unsigned char)prop.uuid.bytes[10],
+                    (unsigned char)prop.uuid.bytes[11],
+                    (unsigned char)prop.uuid.bytes[12],
+                    (unsigned char)prop.uuid.bytes[13],
+                    (unsigned char)prop.uuid.bytes[14],
+                    (unsigned char)prop.uuid.bytes[15]
+                  );
+                dev_ctx->uuid = uuid;
+                #else
+                dev_ctx->uuid = "GPU-" + std::string(prop.uuid.bytes, 16);
+                #endif
+
+                 ggml_backend_dev_t dev = new ggml_backend_device {
+                     /* .iface   = */ ggml_backend_cuda_device_interface,
+                     /* .reg     = */ &reg,
+diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
+index 1b56f858..ee4f2dcb 100644
+--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
+@@ -5703,6 +5703,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
+ static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
+     props->name        = ggml_backend_metal_device_get_name(dev);
+     props->description = ggml_backend_metal_device_get_description(dev);
+    props->uuid        = "0";
+     props->type        = ggml_backend_metal_device_get_type(dev);
+     ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
+     props->caps = (struct ggml_backend_dev_caps) {
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -1,12 +1,9 @@
 package llm

 import (
-	"cmp"
 	"fmt"
 	"log/slog"
-	"maps"
 	"os"
-	"slices"
 	"strconv"
 	"strings"

@@ -85,8 +82,11 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 	var graphOffload uint64

 	// Projectors loaded into GPU0 only
-	var projectorWeights uint64
-	var projectorGraph uint64
+	var llamaEngineProjectorWeights uint64
+
+	// Projectors loaded with output layer
+	var ollamaEngineProjectorWeights uint64
+	var ollamaEngineProjectorGraph uint64

 	// Conditional output size on GPU 0
 	var memoryLayerOutput uint64
@@ -111,21 +111,23 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 	slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList)

 	for _, projector := range projectors {
-		weight := projectorMemoryRequirements(projector)
-		projectorWeights += weight
+		llamaEngineProjectorWeights += projectorMemoryRequirements(projector)

 		// multimodal models require at least 2048 context
 		opts.NumCtx = max(opts.NumCtx, 2048)
 	}
-	if projectorWeights == 0 && projectorGraph == 0 {
-		projectorWeights, projectorGraph = f.VisionGraphSize()
+	if llamaEngineProjectorWeights == 0 {
+		ollamaEngineProjectorWeights, ollamaEngineProjectorGraph = f.VisionGraphSize()
+		opts.NumCtx = max(opts.NumCtx, 2048)
 	}

 	layers := f.Tensors().GroupLayers()
-	// add one layer (chosing the max layer) worth of memory as a buffer
-	layerSize = slices.MaxFunc(slices.Collect(maps.Values(layers)), func(a, b ggml.Layer) int {
-		return cmp.Compare(a.Size(), b.Size())
-	}).Size()
+	// add one layer worth of memory as a buffer
+	if blk0, ok := layers["blk.0"]; ok {
+		layerSize = blk0.Size()
+	} else {
+		slog.Warn("model missing blk.0 layer size")
+	}

 	var kvct string
 	if envconfig.FlashAttention() &&
@@ -163,6 +165,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 		graphFullOffload = graphPartialOffload
 	}

+	// Output layer handled at the end if we have space
 	if layer, ok := layers["output_norm"]; ok {
 		memoryLayerOutput += layer.Size()
 	}
@@ -172,8 +175,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 		memoryLayerOutput += layer.Size()
 	}

-	// Output layer handled at the end if we have space
-	gpuZeroOverhead := projectorWeights + projectorGraph
+	gpuZeroOverhead := llamaEngineProjectorWeights

 	// Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
 	var layerCount int
@@ -216,6 +218,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 	if len(gpusWithSpace) > 0 {
 		gpuZeroID = gpusWithSpace[0].i
 		gpuAllocations[gpuZeroID] += gpuZeroOverhead
+	} else {
+		overflow += gpuZeroOverhead
 	}

 	// For all the layers, find where they can fit on the GPU(s)
@@ -256,21 +260,24 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 	}

 	// Determine if we need to consider output then find where it fits
-	if memoryLayerOutput > 0 && (opts.NumGPU < 0 || layerCount < opts.NumGPU) {
-		for j := len(gpusWithSpace); j > 0; j-- {
-			g := gpusWithSpace[layerCount%j]
-			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
-			if g.g.FreeMemory > overhead+used+memoryLayerOutput {
-				gpuAllocations[g.i] += memoryLayerOutput
-				layerCounts[g.i]++
-				layerCount++
-				break
+	memoryLastLayer := memoryLayerOutput + ollamaEngineProjectorWeights + ollamaEngineProjectorGraph
+	if memoryLastLayer > 0 {
+		if opts.NumGPU < 0 || layerCount < opts.NumGPU {
+			for j := len(gpusWithSpace); j > 0; j-- {
+				g := gpusWithSpace[layerCount%j]
+				used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
+				if g.g.FreeMemory > overhead+used+memoryLastLayer {
+					gpuAllocations[g.i] += memoryLastLayer
+					layerCounts[g.i]++
+					layerCount++
+					break
+				}
 			}
 		}

 		if layerCount < int(f.KV().BlockCount())+1 {
 			fullyLoaded = false
-			overflow += memoryLayerOutput
+			overflow += memoryLastLayer
 		}
 	}

@@ -328,8 +335,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 		memoryLayerOutput:   memoryLayerOutput,
 		graphFullOffload:    graphFullOffload,
 		graphPartialOffload: graphPartialOffload,
-		projectorWeights:    projectorWeights,
-		projectorGraph:      projectorGraph,
+		projectorWeights:    llamaEngineProjectorWeights + ollamaEngineProjectorWeights,
+		projectorGraph:      ollamaEngineProjectorGraph,
 	}

 	if gpus[0].Library == "cpu" {
@@ -415,7 +422,7 @@ func projectorMemoryRequirements(filename string) (weights uint64) {
 	}
 	defer file.Close()

-	ggml, _, err := ggml.Decode(file, 1024)
+	ggml, err := ggml.Decode(file, 1024)
 	if err != nil {
 		return 0
 	}
--- a/llm/server.go
+++ b/llm/server.go
@@ -22,6 +22,7 @@ import (
 	"strings"
 	"sync"
 	"time"
+	"unicode/utf8"

 	"golang.org/x/sync/semaphore"

@@ -121,7 +122,7 @@ func LoadModel(model string, maxArraySize int) (*ggml.GGML, error) {
 	}
 	defer f.Close()

-	ggml, _, err := ggml.Decode(f, maxArraySize)
+	ggml, err := ggml.Decode(f, maxArraySize)
 	return ggml, err
 }

@@ -725,10 +726,68 @@ type CompletionResponse struct {
 	EvalDuration       time.Duration `json:"eval_duration"`
 }

+// unicodeBufferHandler wraps a completion response callback to handle partial UTF-8 sequences.
+// This function creates a stateful closure that is NOT safe for concurrent use.
+// Each completion request should create its own handler instance.
+func unicodeBufferHandler(fn func(CompletionResponse)) func(CompletionResponse) {
+	var pendingUTF8 string
+
+	return func(resp CompletionResponse) {
+		if resp.Content == "" && !resp.Done {
+			// No content to process, just pass through
+			fn(resp)
+			return
+		}
+
+		// Combine any pending UTF-8 with current content
+		combinedContent := pendingUTF8 + resp.Content
+		pendingUTF8 = ""
+
+		// Check if combined content is valid UTF-8
+		if utf8.ValidString(combinedContent) {
+			// Valid UTF-8, send it
+			resp.Content = combinedContent
+			fn(resp)
+		} else {
+			// Invalid UTF-8
+			if resp.Done {
+				// This is the final response, trim incomplete UTF-8
+				trimmedContent := combinedContent
+				for !utf8.ValidString(trimmedContent) && len(trimmedContent) > 0 {
+					trimmedContent = trimmedContent[:len(trimmedContent)-1]
+				}
+				resp.Content = trimmedContent
+				fn(resp)
+			} else {
+				// Not final response, split valid and invalid parts
+				validPrefix := combinedContent
+				for !utf8.ValidString(validPrefix) && len(validPrefix) > 0 {
+					validPrefix = validPrefix[:len(validPrefix)-1]
+				}
+
+				if len(validPrefix) > 0 {
+					// Send valid prefix
+					resp.Content = validPrefix
+					fn(resp)
+					// Buffer the remainder
+					pendingUTF8 = combinedContent[len(validPrefix):]
+				} else {
+					// No valid prefix, buffer everything
+					pendingUTF8 = combinedContent
+					// Don't send this response
+				}
+			}
+		}
+	}
+}
+
 func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error {
 	slog.Debug("completion request", "images", len(req.Images), "prompt", len(req.Prompt), "format", string(req.Format))
 	slog.Log(ctx, logutil.LevelTrace, "completion request", "prompt", req.Prompt)

+	// Wrap the callback with unicode buffer handling
+	unicodeFn := unicodeBufferHandler(fn)
+
 	if len(req.Format) > 0 {
 		switch string(req.Format) {
 		case `null`, `""`:
@@ -797,7 +856,8 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu

 	res, err := http.DefaultClient.Do(serverReq)
 	if err != nil {
-		return fmt.Errorf("POST predict: %v", err)
+		slog.Error("post predict", "error", err)
+		return errors.New("model runner has unexpectedly stopped, this may be due to resource limitations or an internal error, check ollama server logs for details")
 	}
 	defer res.Body.Close()

@@ -853,13 +913,13 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
 			}

 			if c.Content != "" {
-				fn(CompletionResponse{
+				unicodeFn(CompletionResponse{
 					Content: c.Content,
 				})
 			}

 			if c.Done {
-				fn(c)
+				unicodeFn(c)
 				return nil
 			}
 		}
--- a/llm/server_test.go
+++ b/llm/server_test.go
@@ -70,3 +70,152 @@ func TestLLMServerCompletionFormat(t *testing.T) {
 	}, nil)
 	checkValid(err)
 }
+
+func TestUnicodeBufferHandler(t *testing.T) {
+	tests := []struct {
+		name              string
+		inputResponses    []CompletionResponse
+		expectedResponses []CompletionResponse
+		description       string
+	}{
+		{
+			name: "complete_unicode",
+			inputResponses: []CompletionResponse{
+				{Content: "Hello", Done: false},
+				{Content: " world", Done: false},
+				{Content: "!", Done: true},
+			},
+			expectedResponses: []CompletionResponse{
+				{Content: "Hello", Done: false},
+				{Content: " world", Done: false},
+				{Content: "!", Done: true},
+			},
+			description: "All responses with valid unicode should pass through unchanged",
+		},
+		{
+			name: "incomplete_unicode_at_end_with_done",
+			inputResponses: []CompletionResponse{
+				{Content: "Hello", Done: false},
+				{Content: string([]byte{0xF0, 0x9F}), Done: true}, // Incomplete emoji with Done=true
+			},
+			expectedResponses: []CompletionResponse{
+				{Content: "Hello", Done: false},
+				{Content: "", Done: true}, // Content is trimmed but response is still sent with Done=true
+			},
+			description: "When Done=true, incomplete Unicode at the end should be trimmed",
+		},
+		{
+			name: "split_unicode_across_responses",
+			inputResponses: []CompletionResponse{
+				{Content: "Hello " + string([]byte{0xF0, 0x9F}), Done: false}, // First part of 😀
+				{Content: string([]byte{0x98, 0x80}) + " world!", Done: true}, // Second part of 😀 and more text
+			},
+			expectedResponses: []CompletionResponse{
+				{Content: "Hello ", Done: false},  // Incomplete Unicode trimmed
+				{Content: "😀 world!", Done: true}, // Complete emoji in second response
+			},
+			description: "Unicode split across responses should be handled correctly",
+		},
+		{
+			name: "incomplete_unicode_buffered",
+			inputResponses: []CompletionResponse{
+				{Content: "Test " + string([]byte{0xF0, 0x9F}), Done: false}, // Incomplete emoji
+				{Content: string([]byte{0x98, 0x80}), Done: false},           // Complete the emoji
+				{Content: " done", Done: true},
+			},
+			expectedResponses: []CompletionResponse{
+				{Content: "Test ", Done: false}, // First part without incomplete unicode
+				{Content: "😀", Done: false},     // Complete emoji
+				{Content: " done", Done: true},
+			},
+			description: "Incomplete unicode should be buffered and combined with next response",
+		},
+		{
+			name: "empty_response_with_done",
+			inputResponses: []CompletionResponse{
+				{Content: "Complete response", Done: false},
+				{Content: "", Done: true}, // Empty response with Done=true
+			},
+			expectedResponses: []CompletionResponse{
+				{Content: "Complete response", Done: false},
+				{Content: "", Done: true}, // Should still be sent because Done=true
+			},
+			description: "Empty final response with Done=true should still be sent",
+		},
+		{
+			name: "done_reason_preserved",
+			inputResponses: []CompletionResponse{
+				{Content: "Response", Done: false},
+				{Content: " complete", Done: true, DoneReason: DoneReasonStop},
+			},
+			expectedResponses: []CompletionResponse{
+				{Content: "Response", Done: false},
+				{Content: " complete", Done: true, DoneReason: DoneReasonStop},
+			},
+			description: "DoneReason should be preserved in the final response",
+		},
+		{
+			name: "only_incomplete_unicode_not_done",
+			inputResponses: []CompletionResponse{
+				{Content: string([]byte{0xF0, 0x9F}), Done: false}, // Only incomplete unicode
+			},
+			expectedResponses: []CompletionResponse{
+				// No response expected - should be buffered
+			},
+			description: "Response with only incomplete unicode should be buffered if not done",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			var actualResponses []CompletionResponse
+
+			// Create a callback that collects responses
+			callback := func(resp CompletionResponse) {
+				actualResponses = append(actualResponses, resp)
+			}
+
+			// Create the unicode buffer handler
+			handler := unicodeBufferHandler(callback)
+
+			// Send all input responses through the handler
+			for _, resp := range tt.inputResponses {
+				handler(resp)
+			}
+
+			// Verify the number of responses
+			if len(actualResponses) != len(tt.expectedResponses) {
+				t.Fatalf("%s: got %d responses, want %d responses",
+					tt.description, len(actualResponses), len(tt.expectedResponses))
+			}
+
+			// Verify each response matches the expected one
+			for i, expected := range tt.expectedResponses {
+				if i >= len(actualResponses) {
+					t.Fatalf("%s: missing response at index %d", tt.description, i)
+					continue
+				}
+
+				actual := actualResponses[i]
+
+				// Verify content
+				if actual.Content != expected.Content {
+					t.Errorf("%s: response[%d].Content = %q, want %q",
+						tt.description, i, actual.Content, expected.Content)
+				}
+
+				// Verify Done flag
+				if actual.Done != expected.Done {
+					t.Errorf("%s: response[%d].Done = %v, want %v",
+						tt.description, i, actual.Done, expected.Done)
+				}
+
+				// Verify DoneReason if specified
+				if actual.DoneReason != expected.DoneReason {
+					t.Errorf("%s: response[%d].DoneReason = %v, want %v",
+						tt.description, i, actual.DoneReason, expected.DoneReason)
+				}
+			}
+		})
+	}
+}
--- a/ml/backend.go
+++ b/ml/backend.go
@@ -5,8 +5,8 @@ import (
 	"context"
 	"encoding/binary"
 	"fmt"
+	"log/slog"
 	"math"
-	"os"
 	"slices"
 	"strconv"
 	"strings"
@@ -15,6 +15,11 @@ import (
 )

 type Backend interface {
+	Load(ctx context.Context, progress func(float32)) error
+
+	// BackendMemory returns the memory allocations that were made for this model
+	BackendMemory() BackendMemory
+
 	Config() fs.Config
 	Get(name string) Tensor
 	NewContext() Context
@@ -52,10 +57,6 @@ type CacheConfig struct {

 // BackendParams controls how the backend loads and executes models
 type BackendParams struct {
-	// Progress is a callback function that allows reporting percentage completion
-	// of model loading
-	Progress func(float32)
-
 	// NumThreads sets the number of threads to use if running on the CPU
 	NumThreads int

@@ -72,9 +73,130 @@ type BackendParams struct {
 	FlashAttention bool
 }

-var backends = make(map[string]func(context.Context, *os.File, BackendParams) (Backend, error))
+// ErrNoMem is returned when panicing due to insufficient memory. It includes
+// the attempted memory allocation.
+type ErrNoMem struct {
+	BackendMemory
+}

-func RegisterBackend(name string, f func(context.Context, *os.File, BackendParams) (Backend, error)) {
+func (e ErrNoMem) Error() string {
+	return fmt.Sprintf("insufficient memory - required allocations: %+v", e.BackendMemory)
+}
+
+type AllocationStatus int
+
+const (
+	// Unallocated memory - have not yet attempted to allocate
+	Unallocated AllocationStatus = iota
+
+	// Failed memory - tried to allocate the memory and did not succeed
+	Failed
+
+	// Allocated memory = tried and succeeded to allocate memory
+	Allocated
+)
+
+// Memory is the size of an allocation and whether it was successful.
+type Memory struct {
+	Size   uint64
+	Status AllocationStatus
+}
+
+func (m Memory) String() string {
+	s := fmt.Sprint(m.Size)
+
+	switch m.Status {
+	case Unallocated:
+		s += "U"
+	case Failed:
+		s += "F"
+	case Allocated:
+		s += "A"
+	}
+
+	return s
+}
+
+// DeviceMemory provides a breakdown of the memory needed
+// per device, such as a CPU or GPU.
+type DeviceMemory struct {
+	// Name is the name of the device as labeled by the backend. It
+	// may not be persistent across instances of the runner.
+	Name string
+
+	// UUID is a unique persistent identifier for the device for matching
+	// with system management libraries
+	UUID string
+
+	// Weights is the per-layer memory needed for the model weights.
+	Weights []Memory
+
+	// Cache is the per-layer memory needed for the KV cache.
+	Cache []Memory
+
+	// Graph is the size of the compute graph. It is not per-layer.
+	Graph Memory
+}
+
+func memoryPresent(mem []Memory) bool {
+	return slices.ContainsFunc(mem, func(m Memory) bool { return m.Size != 0 })
+}
+
+func (m DeviceMemory) LogValue() slog.Value {
+	var attrs []slog.Attr
+	if memoryPresent(m.Weights) {
+		attrs = append(attrs, slog.Any("Weights", m.Weights))
+	}
+
+	if memoryPresent(m.Cache) {
+		attrs = append(attrs, slog.Any("Cache", m.Cache))
+	}
+
+	if m.Graph.Size != 0 {
+		attrs = append(attrs, slog.Any("Graph", m.Graph))
+	}
+
+	if len(attrs) > 0 && m.UUID != "" {
+		attrs = append([]slog.Attr{slog.String("UUID", m.UUID)}, attrs...)
+	}
+
+	return slog.GroupValue(attrs...)
+}
+
+// BackendMemory provides the amount of memory required to load the model
+// per device based on the BackendParams. In some cases, not all required
+// allocations will be known at this point. However, the size of the most recent
+// allocation is guaranteed to be provided so that if it failed, the caller can
+// accommodate that to make forward progress.
+type BackendMemory struct {
+	// InputsWeights are always located on the CPU and cannot be moved
+	InputWeights Memory
+
+	// CPU model components are located in system memory. This does not
+	// include unified memory allocated through the GPU.
+	CPU DeviceMemory
+
+	// GPU model components are located on one or more GPUs.
+	GPUs []DeviceMemory
+}
+
+func (m BackendMemory) LogValue() slog.Value {
+	var attrs []slog.Attr
+	if m.InputWeights.Size != 0 {
+		attrs = append(attrs, slog.Any("InputWeights", m.InputWeights))
+	}
+
+	attrs = append(attrs, slog.Any(m.CPU.Name, m.CPU))
+	for _, g := range m.GPUs {
+		attrs = append(attrs, slog.Any(g.Name, g))
+	}
+
+	return slog.GroupValue(attrs...)
+}
+
+var backends = make(map[string]func(string, BackendParams) (Backend, error))
+
+func RegisterBackend(name string, f func(string, BackendParams) (Backend, error)) {
 	if _, ok := backends[name]; ok {
 		panic("backend: backend already registered")
 	}
@@ -82,9 +204,9 @@ func RegisterBackend(name string, f func(context.Context, *os.File, BackendParam
 	backends[name] = f
 }

-func NewBackend(ctx context.Context, f *os.File, params BackendParams) (Backend, error) {
+func NewBackend(modelPath string, params BackendParams) (Backend, error) {
 	if backend, ok := backends["ggml"]; ok {
-		return backend(ctx, f, params)
+		return backend(modelPath, params)
 	}

 	return nil, fmt.Errorf("unsupported backend")
@@ -93,8 +215,8 @@ func NewBackend(ctx context.Context, f *os.File, params BackendParams) (Backend,
 type Context interface {
 	Empty(dtype DType, shape ...int) Tensor
 	Zeros(dtype DType, shape ...int) Tensor
-	FromFloatSlice(s []float32, shape ...int) (Tensor, error)
-	FromIntSlice(s []int32, shape ...int) (Tensor, error)
+	FromFloatSlice(s []float32, shape ...int) Tensor
+	FromIntSlice(s []int32, shape ...int) Tensor

 	// Arange creates a 1D tensor with values within an interval (start, stop] increased by step.
 	Arange(start, stop, step float32, dtype DType) Tensor
@@ -106,7 +228,7 @@ type Context interface {
 	// graph, simply preallocates memory. Typically called with a
 	// worst case graph to ensure all resources are available for
 	// for future inference.
-	Reserve() error
+	Reserve()

 	MaxGraphNodes() int
 	Close()
@@ -132,6 +254,8 @@ type Tensor interface {
 	Neg(ctx Context) Tensor
 	Add(ctx Context, t2 Tensor) Tensor
 	Mul(ctx Context, t2 Tensor) Tensor
+	Div(ctx Context, t2 Tensor) Tensor
+
 	Mulmat(ctx Context, t2 Tensor) Tensor
 	MulmatFullPrec(ctx Context, t2 Tensor) Tensor
 	MulmatID(ctx Context, t2, ids Tensor) Tensor
@@ -140,11 +264,11 @@ type Tensor interface {
 	LayerNorm(ctx Context, weight, bias Tensor, eps float32) Tensor
 	RMSNorm(ctx Context, weight Tensor, eps float32) Tensor
 	Scale(ctx Context, s float64) Tensor
+	SumRows(ctx Context) Tensor

 	AvgPool2D(ctx Context, k, s int, p float32) Tensor
 	Conv2D(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor

-	RoPE(ctx Context, positionIDs, ropeFactors Tensor, dim, ropeType uint32, base, scale float32) Tensor
 	IM2Col(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor

 	Sin(ctx Context) Tensor
@@ -172,6 +296,7 @@ type Tensor interface {
 	Duplicate(ctx Context) Tensor

 	TopK(ctx Context, k int) Tensor
+	Argsort(ctx Context) Tensor
 }

 // ScaledDotProductAttention implements a fused attention
--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@@ -10,7 +10,6 @@ import "C"

 import (
 	"context"
-	"errors"
 	"fmt"
 	"io"
 	"log/slog"
@@ -30,6 +29,7 @@ import (
 	"github.com/ollama/ollama/logutil"
 	"github.com/ollama/ollama/ml"
 	ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
+	"github.com/ollama/ollama/ml/nn/rope"
 	"golang.org/x/sync/errgroup"
 )

@@ -44,8 +44,15 @@ func devices() []*C.struct_ggml_backend_device {
 }

 type Backend struct {
+	// modelPath is the location of the model data
+	modelPath string
+
 	meta *fsggml.GGML

+	// tensorLoadTargets maps from the name of the tensor in the file
+	// to the name that is used by the model definition
+	tensorLoadTargets map[string][]string
+
 	sched         *C.struct_ggml_backend_sched
 	schedBackends []*C.struct_ggml_backend
 	schedBufts    []*C.struct_ggml_backend_buffer_type
@@ -58,14 +65,26 @@ type Backend struct {
 	// layers is the backend used for repeating layers
 	layers map[int]*C.struct_ggml_backend_buffer_type

+	// requiredMemory is the cumulative memory allocations needed by the backend
+	requiredMemory *ml.BackendMemory
+
+	// btDeviceMemory maps from a buffer type to the memory allocations associated with that device
+	btDeviceMemory map[*C.struct_ggml_backend_buffer_type]*ml.DeviceMemory
+
 	flashAttention bool

 	// maxGraphNodes is the maximum allowed number of graph nodes in this scheduler
 	maxGraphNodes int
 }

-func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend, error) {
-	meta, n, err := fsggml.Decode(r, -1)
+func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
+	r, err := os.Open(modelPath)
+	if err != nil {
+		return nil, err
+	}
+	defer r.Close()
+
+	meta, err := fsggml.Decode(r, -1)
 	if err != nil {
 		return nil, err
 	}
@@ -80,6 +99,9 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
 		"num_key_values", len(meta.KV()),
 	)

+	var requiredMemory ml.BackendMemory
+	btDeviceMemory := make(map[*C.struct_ggml_backend_buffer_type]*ml.DeviceMemory)
+
 	type deviceBufferType struct {
 		d   *C.struct_ggml_backend_device
 		bts []*C.struct_ggml_backend_buffer_type
@@ -100,6 +122,8 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
 		}
 	}

+	blocks := int(meta.KV().BlockCount())
+
 	// create list of buffer types for the cpu
 	cpuDeviceBufferType := deviceBufferType{d: C.ggml_backend_dev_by_type(C.GGML_BACKEND_DEVICE_TYPE_CPU)}
 	for _, d := range append(accels, append(gpus, cpus...)...) {
@@ -107,17 +131,33 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
 		case C.GGML_BACKEND_DEVICE_TYPE_CPU,
 			C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
 			cpuDeviceBufferType.bts = append(cpuDeviceBufferType.bts, C.ggml_backend_dev_buffer_type(d))
+			btDeviceMemory[C.ggml_backend_dev_buffer_type(d)] = &requiredMemory.CPU
 		}
 	}

+	requiredMemory.CPU.Name = C.GoString(C.ggml_backend_dev_name(cpuDeviceBufferType.d))
+	var props C.struct_ggml_backend_dev_props
+	C.ggml_backend_dev_get_props(cpuDeviceBufferType.d, &props)
+	requiredMemory.CPU.UUID = C.GoString(props.uuid)
+	requiredMemory.CPU.Weights = make([]ml.Memory, blocks+1)
+	requiredMemory.CPU.Cache = make([]ml.Memory, blocks+1)
+
 	// create list of buffer types for each gpu
 	var gpuDeviceBufferTypes []deviceBufferType
-	for _, d := range gpus {
+	requiredMemory.GPUs = make([]ml.DeviceMemory, len(gpus))
+	for i, d := range gpus {
 		bt := C.ggml_backend_dev_buffer_type(d)
 		gpuDeviceBufferTypes = append(gpuDeviceBufferTypes, deviceBufferType{
 			d:   d,
 			bts: append([]*C.struct_ggml_backend_buffer_type{bt}, cpuDeviceBufferType.bts...),
 		})
+		btDeviceMemory[bt] = &requiredMemory.GPUs[i]
+		requiredMemory.GPUs[i].Name = C.GoString(C.ggml_backend_dev_name(d))
+		var props C.struct_ggml_backend_dev_props
+		C.ggml_backend_dev_get_props(d, &props)
+		requiredMemory.GPUs[i].UUID = C.GoString(props.uuid)
+		requiredMemory.GPUs[i].Weights = make([]ml.Memory, blocks+1)
+		requiredMemory.GPUs[i].Cache = make([]ml.Memory, blocks+1)
 	}

 	useDefaultSplit := true
@@ -156,8 +196,6 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
 	// inputs always use cpu
 	input := cpuDeviceBufferType

-	blocks := int(meta.KV().BlockCount())
-
 	// define a range of gpu layers. anything outside of this range is assigned to the cpu
 	gpuRangeStart := max(0, blocks-params.NumGPULayers)
 	gpuRangeStop := min(gpuRangeStart+params.NumGPULayers, blocks+1)
@@ -198,7 +236,7 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,

 	// contexts are shared by tensors of the same buffer type
 	ctxs := make(map[*C.struct_ggml_backend_buffer_type]*C.struct_ggml_context)
-	createTensor := func(t tensor, bts []*C.struct_ggml_backend_buffer_type) *C.struct_ggml_tensor {
+	createTensor := func(t tensor, bts []*C.struct_ggml_backend_buffer_type, layer int) *C.struct_ggml_tensor {
 		for _, bt := range bts {
 			if _, ok := ctxs[bt]; !ok {
 				ctxs[bt] = C.ggml_init(C.struct_ggml_init_params{
@@ -224,6 +262,16 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
 			C.ggml_set_name(tt, cname)

 			slog.Log(context.TODO(), logutil.LevelTrace, "created tensor", "name", name, "shape", t.source.Shape, "dtype", t.source.Kind, "buffer_type", C.GoString(C.ggml_backend_buft_name(bt)))
+
+			size := pad(C.ggml_backend_buft_get_alloc_size(bt, tt), C.ggml_backend_buft_get_alignment(bt))
+			if layer == -1 {
+				// Assume that InputWeights can be allocated - they're always in system memory and can't be moved in any case
+				requiredMemory.InputWeights.Status = ml.Allocated
+				requiredMemory.InputWeights.Size += uint64(size)
+			} else {
+				btDeviceMemory[bt].Weights[layer].Size += uint64(size)
+			}
+
 			//nolint:staticcheck // TODO: check if buffer type supports this tensor
 			return tt
 		}
@@ -245,22 +293,22 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
 	for _, t := range meta.Tensors().Items() {
 		switch {
 		case contains(t.Name, "position_embd", "token_embd", "token_norm_embd", "token_types"):
-			createTensor(tensor{source: t}, input.bts)
+			createTensor(tensor{source: t}, input.bts, -1)
 			if _, ok := meta.Tensors().GroupLayers()["output"]; !ok && t.Name == "token_embd.weight" {
-				createTensor(tensor{source: t, target: "output.weight"}, output.bts)
+				createTensor(tensor{source: t, target: "output.weight"}, output.bts, blocks)
 			}
 		case contains(t.Name, "cls", "output", "output_norm"):
-			createTensor(tensor{source: t}, output.bts)
+			createTensor(tensor{source: t}, output.bts, blocks)
 		case strings.HasPrefix(t.Name, "v.") || strings.HasPrefix(t.Name, "mm."):
 			// TODO: assign vision tensors to the gpu if possible
-			createTensor(tensor{source: t}, output.bts)
+			createTensor(tensor{source: t}, output.bts, blocks)
 		case contains(t.Name, "rope_freqs", "rope_factors_long", "rope_factors_short"):
 			// these tensors should be repeated per layer
 			for i, layer := range layers {
 				createTensor(tensor{
 					source: t,
 					target: "blk." + strconv.Itoa(i) + "." + t.Name,
-				}, layer.bts)
+				}, layer.bts, i)
 			}
 		default:
 			layerIndex := -1
@@ -271,10 +319,10 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
 			}

 			if layerIndex >= 0 {
-				createTensor(tensor{source: t}, layers[layerIndex].bts)
+				createTensor(tensor{source: t}, layers[layerIndex].bts, layerIndex)
 			} else {
 				// load all other tensors on the cpu
-				createTensor(tensor{source: t}, input.bts)
+				createTensor(tensor{source: t}, input.bts, -1)
 			}
 		}
 	}
@@ -287,8 +335,18 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
 		}

 		b := C.ggml_backend_alloc_ctx_tensors_from_buft(c, bt)
+		for i := range btDeviceMemory[bt].Weights {
+			if btDeviceMemory[bt].Weights[i].Size != 0 {
+				if b != nil {
+					btDeviceMemory[bt].Weights[i].Status = ml.Allocated
+				} else {
+					btDeviceMemory[bt].Weights[i].Status = ml.Failed
+				}
+			}
+		}
+
 		if b == nil {
-			return nil, fmt.Errorf("unable to allocate memory from device %v for model weights", C.GoString(C.ggml_backend_buft_name(bt)))
+			panic(ml.ErrNoMem{BackendMemory: requiredMemory})
 		}

 		C.ggml_backend_buffer_set_usage(b, C.GGML_BACKEND_BUFFER_USAGE_WEIGHTS)
@@ -307,73 +365,6 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
 		}
 	}

-	var doneBytes atomic.Uint64
-	totalBytes := uint64(n) - meta.Tensors().Offset
-
-	g, ctx := errgroup.WithContext(ctx)
-	g.SetLimit(runtime.GOMAXPROCS(0))
-	for _, t := range meta.Tensors().Items() {
-		t := t
-		g.Go(func() error {
-			tts := make([]*C.struct_ggml_tensor, max(1, len(targets[t.Name])))
-			for i := range tts {
-				target := targets[t.Name][i]
-				if target == "" {
-					target = t.Name
-				}
-
-				tt, ok := tensors[target]
-				if !ok {
-					return fmt.Errorf("unassigned tensor: %s", t.Name)
-				}
-
-				tts[i] = tt
-			}
-
-			// Create a new FD for each goroutine so that each FD is read sequentially, rather than
-			// seeking around within an FD shared between all goroutines.
-			file, err := os.Open(r.Name())
-			if err != nil {
-				slog.Warn("file open error", "file", r.Name(), "error", err)
-				return err
-			}
-			defer file.Close()
-			sr := io.NewSectionReader(file, int64(meta.Tensors().Offset+t.Offset), int64(t.Size()))
-			bts := make([]byte, 128*format.KibiByte)
-
-			var s uint64
-			for s < t.Size() {
-				// Stop if either the parent context has been canceled or if any of the other tensors returned an error
-				if err := ctx.Err(); err != nil {
-					return err
-				}
-
-				n, err := io.ReadFull(sr, bts[:min(len(bts), int(t.Size()-s))])
-				if err != nil {
-					slog.Warn("file read error", "file", r.Name(), "error", err)
-					return err
-				}
-
-				for _, tt := range tts {
-					C.ggml_backend_tensor_set(tt, unsafe.Pointer(&bts[0]), C.size_t(s), C.size_t(n))
-				}
-
-				s += uint64(n)
-
-				if params.Progress != nil {
-					done := doneBytes.Add(uint64(n))
-					params.Progress(float32(done) / float32(totalBytes))
-				}
-			}
-
-			return nil
-		})
-	}
-
-	if err := g.Wait(); err != nil {
-		return nil, err
-	}
-
 	// map devices to backend buffer types so new tensors can be assigned to the correct device
 	deviceBufferTypes := make(map[*C.struct_ggml_backend_device]*C.struct_ggml_backend_buffer_type)

@@ -397,9 +388,11 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,

 	maxGraphNodes := max(8192, len(meta.Tensors().Items())*5)
 	return &Backend{
-		flashAttention: params.FlashAttention,
-		meta:           meta,
-		tensors:        tensors,
+		modelPath:         modelPath,
+		flashAttention:    params.FlashAttention,
+		meta:              meta,
+		tensorLoadTargets: targets,
+		tensors:           tensors,
 		sched: C.ggml_backend_sched_new(
 			(*C.ggml_backend_t)(unsafe.Pointer(&schedBackends[0])),
 			(*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&schedBufts[0])),
@@ -418,7 +411,9 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
 			}
 			return m
 		}(),
-		maxGraphNodes: maxGraphNodes,
+		requiredMemory: &requiredMemory,
+		btDeviceMemory: btDeviceMemory,
+		maxGraphNodes:  maxGraphNodes,
 	}, nil
 }

@@ -426,6 +421,81 @@ func init() {
 	ml.RegisterBackend("ggml", New)
 }

+func (b *Backend) Load(ctx context.Context, progress func(float32)) error {
+	var doneBytes atomic.Uint64
+	totalBytes := uint64(b.meta.Length) - b.meta.Tensors().Offset
+
+	g, ctx := errgroup.WithContext(ctx)
+	g.SetLimit(runtime.GOMAXPROCS(0))
+	for _, t := range b.meta.Tensors().Items() {
+		t := t
+		g.Go(func() error {
+			tts := make([]*C.struct_ggml_tensor, max(1, len(b.tensorLoadTargets[t.Name])))
+			for i := range tts {
+				target := b.tensorLoadTargets[t.Name][i]
+				if target == "" {
+					target = t.Name
+				}
+
+				tt, ok := b.tensors[target]
+				if !ok {
+					return fmt.Errorf("unassigned tensor: %s", t.Name)
+				}
+
+				tts[i] = tt
+			}
+
+			// Create a new FD for each goroutine so that each FD is read sequentially, rather than
+			// seeking around within an FD shared between all goroutines.
+			file, err := os.Open(b.modelPath)
+			if err != nil {
+				slog.Warn("file open error", "file", b.modelPath, "error", err)
+				return err
+			}
+			defer file.Close()
+			sr := io.NewSectionReader(file, int64(b.meta.Tensors().Offset+t.Offset), int64(t.Size()))
+			bts := make([]byte, 128*format.KibiByte)
+
+			var s uint64
+			for s < t.Size() {
+				// Stop if either the parent context has been canceled or if any of the other tensors returned an error
+				if err := ctx.Err(); err != nil {
+					return err
+				}
+
+				n, err := io.ReadFull(sr, bts[:min(len(bts), int(t.Size()-s))])
+				if err != nil {
+					slog.Warn("file read error", "file", b.modelPath, "error", err)
+					return err
+				}
+
+				for _, tt := range tts {
+					C.ggml_backend_tensor_set(tt, unsafe.Pointer(&bts[0]), C.size_t(s), C.size_t(n))
+				}
+
+				s += uint64(n)
+
+				if progress != nil {
+					done := doneBytes.Add(uint64(n))
+					progress(float32(done) / float32(totalBytes))
+				}
+			}
+
+			return nil
+		})
+	}
+
+	if err := g.Wait(); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+func (b *Backend) BackendMemory() ml.BackendMemory {
+	return *b.requiredMemory
+}
+
 func (b *Backend) Config() fs.Config {
 	return b.meta.KV()
 }
@@ -457,6 +527,7 @@ func (b *Backend) NewContextSize(n int) ml.Context {
 			no_alloc: true,
 		}),
 		allocatedBuffers: &allocatedBuffers,
+		layer:            -1,
 	}
 }

@@ -483,6 +554,9 @@ type Context struct {

 	// maxGraphNodes is the maximum allowed number of graph nodes in this context
 	maxGraphNodes int
+
+	// layer is the graph layer that this context is allocating for - assumed to be cache
+	layer int
 }

 func (c *Context) Input() ml.Context {
@@ -493,6 +567,7 @@ func (c *Context) Input() ml.Context {
 			buft:             c.b.input,
 			allocatedBuffers: c.allocatedBuffers,
 			maxGraphNodes:    c.maxGraphNodes,
+			layer:            -1,
 		}
 	}

@@ -507,6 +582,7 @@ func (c *Context) Layer(i int) ml.Context {
 			buft:             buft,
 			allocatedBuffers: c.allocatedBuffers,
 			maxGraphNodes:    c.maxGraphNodes,
+			layer:            i,
 		}
 	}

@@ -544,22 +620,34 @@ func (c *Context) Compute(tensors ...ml.Tensor) {
 	}
 }

-func (c *Context) Reserve() error {
-	if !C.ggml_backend_sched_reserve(c.b.sched, c.graph) {
-		C.ggml_backend_sched_reset(c.b.sched)
-		return errors.New("failed to reserve graph")
-	}
+func (c *Context) Reserve() {
+	reserved := C.ggml_backend_sched_reserve(c.b.sched, c.graph)

 	slog.Debug("compute graph", "nodes", C.ggml_graph_n_nodes(c.graph), "splits", C.ggml_backend_sched_get_n_splits(c.b.sched))
-	for i := range c.b.schedBackends {
-		size := C.ggml_backend_sched_get_buffer_size(c.b.sched, c.b.schedBackends[i])
-		slog.Info("compute graph", "backend", C.GoString(C.ggml_backend_name(c.b.schedBackends[i])), "buffer_type", C.GoString(C.ggml_backend_buft_name(c.b.schedBufts[i])),
-			"size", format.HumanBytes2(uint64(size)))
+
+	// Reserve may get called multiple times for different graphs - we just want the last run, which will contain the max allocations
+	for _, bt := range c.b.schedBufts {
+		c.b.btDeviceMemory[bt].Graph = ml.Memory{}
 	}

-	C.ggml_backend_sched_reset(c.b.sched)
+	for i := range c.b.schedBackends {
+		bufferStatus := C.ggml_backend_sched_get_attempted_buffer_size(c.b.sched, c.b.schedBackends[i])

-	return nil
+		graph := &c.b.btDeviceMemory[c.b.schedBufts[i]].Graph
+		graph.Size += uint64(bufferStatus.size)
+		if bufferStatus.allocated && graph.Status != ml.Failed {
+			graph.Status = ml.Allocated
+		} else {
+			graph.Status = ml.Failed
+		}
+
+		slog.Info("compute graph", "backend", C.GoString(C.ggml_backend_name(c.b.schedBackends[i])), "buffer_type", C.GoString(C.ggml_backend_buft_name(c.b.schedBufts[i])),
+			"size", format.HumanBytes2(uint64(bufferStatus.size)))
+	}
+
+	if !reserved {
+		panic(ml.ErrNoMem{BackendMemory: *c.b.requiredMemory})
+	}
 }

 func (c *Context) MaxGraphNodes() int {
@@ -579,7 +667,7 @@ func pad(length, pad C.size_t) C.size_t {
 	return ((length + pad - 1) / pad) * pad
 }

-func (c *Context) newTensor(dtype ml.DType, shape []int) (ml.Tensor, error) {
+func (c *Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {
 	if c.buft == nil {
 		panic("set Input or Layer before creating tensors")
 	}
@@ -602,7 +690,7 @@ func (c *Context) newTensor(dtype ml.DType, shape []int) (ml.Tensor, error) {

 	if len(shape) < 1 || shape[0] == 0 {
 		var shape C.int64_t = 0
-		return &Tensor{b: c.b, t: C.ggml_new_tensor(c.ctx, cdtype, 1, &shape)}, nil
+		return &Tensor{b: c.b, t: C.ggml_new_tensor(c.ctx, cdtype, 1, &shape)}
 	} else if len(shape) > 4 {
 		panic("unsupported number of dimensions")
 	}
@@ -615,40 +703,43 @@ func (c *Context) newTensor(dtype ml.DType, shape []int) (ml.Tensor, error) {

 	t := C.ggml_new_tensor(c.ctx, cdtype, C.int(len(shape)), shapeToGGML(shape))
 	size := pad(C.ggml_backend_buft_get_alloc_size(c.buft, t), C.ggml_backend_buft_get_alignment(c.buft))
-	b := C.ggml_backend_buft_alloc_buffer(c.buft, size)
-	if b == nil {
-		return nil, fmt.Errorf("unable to allocate %v from device %v for new tensor", format.HumanBytes2(uint64(size)), C.GoString(C.ggml_backend_buft_name(c.buft)))
-	}
-	*c.allocatedBuffers = append(*c.allocatedBuffers, b)

+	b := C.ggml_backend_buft_alloc_buffer(c.buft, size)
+	if c.layer >= 0 {
+		cache := &c.b.btDeviceMemory[c.buft].Cache[c.layer]
+
+		cache.Size += uint64(size)
+		if b != nil {
+			cache.Status = ml.Allocated
+		} else {
+			cache.Status = ml.Failed
+		}
+	}
+
+	if b == nil {
+		panic(ml.ErrNoMem{BackendMemory: *c.b.requiredMemory})
+	}
+
+	*c.allocatedBuffers = append(*c.allocatedBuffers, b)
 	C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
-	return &Tensor{b: c.b, t: t}, nil
+	return &Tensor{b: c.b, t: t}
 }

 func (c *Context) Empty(dtype ml.DType, shape ...int) ml.Tensor {
-	t, err := c.newTensor(dtype, shape)
-	if err != nil {
-		panic(err)
-	}
-
-	return t
+	return c.newTensor(dtype, shape)
 }

 func (c *Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
-	t, err := c.newTensor(dtype, shape)
-	if err != nil {
-		panic(err)
-	}
-
+	t := c.newTensor(dtype, shape)
 	C.ggml_set_zero(t.(*Tensor).t)
 	return t
 }

-func checkShape[S ~[]E, E any](s S, shape ...int) error {
+func checkShape[S ~[]E, E any](s S, shape ...int) {
 	n := len(s)

 	if n == 0 {
-		return nil
+		return
 	}

 	for _, v := range shape {
@@ -656,44 +747,32 @@ func checkShape[S ~[]E, E any](s S, shape ...int) error {
 	}

 	if n != 1 {
-		return fmt.Errorf("invalid shape: %v", shape)
+		panic(fmt.Errorf("invalid shape: %v", shape))
 	}
-
-	return nil
 }

-func (c *Context) FromFloatSlice(s []float32, shape ...int) (ml.Tensor, error) {
-	if err := checkShape(s, shape...); err != nil {
-		return nil, err
-	}
+func (c *Context) FromFloatSlice(s []float32, shape ...int) ml.Tensor {
+	checkShape(s, shape...)

-	t, err := c.newTensor(ml.DTypeF32, shape)
-	if err != nil {
-		return nil, err
-	}
+	t := c.newTensor(ml.DTypeF32, shape)

 	if len(s) > 0 {
 		C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
 	}

-	return t, nil
+	return t
 }

-func (c *Context) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
-	if err := checkShape(s, shape...); err != nil {
-		return nil, err
-	}
+func (c *Context) FromIntSlice(s []int32, shape ...int) ml.Tensor {
+	checkShape(s, shape...)

-	t, err := c.newTensor(ml.DTypeI32, shape)
-	if err != nil {
-		return nil, err
-	}
+	t := c.newTensor(ml.DTypeI32, shape)

 	if len(s) > 0 {
 		C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
 	}

-	return t, nil
+	return t
 }

 func (c Context) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
@@ -711,12 +790,7 @@ func (c Context) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
 			arange = append(arange, int32(i))
 		}

-		t, err := c.Input().FromIntSlice(arange, len(arange))
-		if err != nil {
-			panic(err)
-		}
-
-		return t
+		return c.Input().FromIntSlice(arange, len(arange))
 	default:
 		panic("unsupported dtype for arange")
 	}
@@ -867,6 +941,13 @@ func (t *Tensor) Mul(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
 	}
 }

+func (t *Tensor) Div(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
+	return &Tensor{
+		b: t.b,
+		t: C.ggml_div(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
+	}
+}
+
 func (t *Tensor) Mulmat(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
 	return &Tensor{
 		b: t.b,
@@ -915,6 +996,8 @@ func (t *Tensor) RMSNorm(ctx ml.Context, w ml.Tensor, eps float32) ml.Tensor {
 func (t *Tensor) Pad(ctx ml.Context, shape ...int) ml.Tensor {
 	if len(shape) != 4 {
 		panic("expected 4 dimensions")
+	} else if shape[3] != 0 {
+		panic("cuda does not support 4d tensors")
 	}

 	return &Tensor{
@@ -982,6 +1065,13 @@ func (t *Tensor) Scale(ctx ml.Context, s float64) ml.Tensor {
 	}
 }

+func (t *Tensor) SumRows(ctx ml.Context) ml.Tensor {
+	return &Tensor{
+		b: t.b,
+		t: C.ggml_sum_rows(ctx.(*Context).ctx, t.t),
+	}
+}
+
 func (t *Tensor) Softmax(ctx ml.Context) ml.Tensor {
 	return &Tensor{
 		b: t.b,
@@ -1053,16 +1143,13 @@ func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
 	}
 }

-const (
-	ropeTypeNorm   C.int = 0
-	ropeTypeNeox   C.int = 2
-	ropeTypeMrope  C.int = 8
-	ropeTypeVision C.int = 24
-)
+func (t *Tensor) RoPE(ctx ml.Context, positions ml.Tensor, ropeDim int, ropeBase, ropeScale float32, options ...func(*rope.Options)) ml.Tensor {
+	// Default options
+	opts := &rope.Options{OriginalContextLength: 131072, Factors: &Tensor{}}

-func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim, ropeType uint32, ropeBase, ropeScale float32) ml.Tensor {
-	if ropeFactors == nil {
-		ropeFactors = &Tensor{b: t.b}
+	// Apply any provided options
+	for _, option := range options {
+		option(opts)
 	}

 	dequant := t.t
@@ -1073,16 +1160,19 @@ func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDi
 	return &Tensor{
 		b: t.b,
 		t: C.ggml_rope_ext(
-			ctx.(*Context).ctx, dequant, positionIDs.(*Tensor).t, ropeFactors.(*Tensor).t,
+			ctx.(*Context).ctx,
+			dequant,
+			positions.(*Tensor).t,
+			opts.Factors.(*Tensor).t,
 			C.int(ropeDim),
-			C.int(ropeType),
-			131072, // YaRN n_ctx_train
+			C.int(opts.Type),
+			C.int(opts.OriginalContextLength),
 			C.float(ropeBase),
 			C.float(ropeScale),
-			0.,  // YaRN ext_factor
-			1.,  // YaRN attn_factor
-			32., // YaRN beta_fast
-			1.,  // YaRN beta_slow
+			C.float(0.0),
+			C.float(1.0),
+			C.float(32.0),
+			C.float(1.0),
 		),
 	}
 }
@@ -1176,3 +1266,10 @@ func (t *Tensor) TopK(ctx ml.Context, k int) ml.Tensor {
 		t: C.ggml_top_k(ctx.(*Context).ctx, t.t, C.int(k)),
 	}
 }
+
+func (t *Tensor) Argsort(ctx ml.Context) ml.Tensor {
+	return &Tensor{
+		b: t.b,
+		t: C.ggml_argsort(ctx.(*Context).ctx, t.t, C.GGML_SORT_ORDER_ASC),
+	}
+}
--- a/ml/backend/ggml/ggml/include/ggml-alloc.h
+++ b/ml/backend/ggml/ggml/include/ggml-alloc.h
@@ -66,6 +66,12 @@ GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph

 GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);

+struct ggml_allocr_buffer_status {
+    size_t size;
+    bool allocated;
+};
+GGML_API struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id);
+
 // Utils
 // Create a buffer and allocate all the tensors in a ggml_context
 GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
--- a/ml/backend/ggml/ggml/include/ggml-backend.h
+++ b/ml/backend/ggml/ggml/include/ggml-backend.h
@@ -152,6 +152,7 @@ extern "C" {
    struct ggml_backend_dev_props {
        const char * name;
        const char * description;
+        const char * uuid;
        size_t memory_free;
        size_t memory_total;
        enum ggml_backend_dev_type type;
@@ -304,6 +305,12 @@ extern "C" {

    GGML_API size_t               ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);

+    struct ggml_backend_buffer_status {
+        size_t size;
+        bool allocated;
+    };
+    GGML_API struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
+
    GGML_API void                 ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
    GGML_API ggml_backend_t       ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);

--- a/ml/backend/ggml/ggml/src/ggml-alloc.c
+++ b/ml/backend/ggml/ggml/src/ggml-alloc.c
@@ -364,6 +364,7 @@ struct node_alloc {
 struct ggml_gallocr {
    ggml_backend_buffer_type_t * bufts; // [n_buffers]
    ggml_backend_buffer_t * buffers; // [n_buffers]
+    size_t *buffer_sizes; // [n_buffers]
    struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
    int n_buffers;

@@ -387,6 +388,9 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
    galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
    GGML_ASSERT(galloc->buffers != NULL);

+    galloc->buffer_sizes = calloc(n_bufs, sizeof(size_t));
+    GGML_ASSERT(galloc->buffer_sizes != NULL);
+
    galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
    GGML_ASSERT(galloc->buf_tallocs != NULL);

@@ -453,6 +457,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
    ggml_hash_set_free(&galloc->hash_set);
    free(galloc->hash_values);
    free(galloc->bufts);
+    free(galloc->buffer_sizes);
    free(galloc->buffers);
    free(galloc->buf_tallocs);
    free(galloc->node_allocs);
@@ -748,6 +753,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
        }
    }

+    bool success = true;
+
    // reallocate buffers if needed
    for (int i = 0; i < galloc->n_buffers; i++) {
        // if the buffer type is used multiple times, we reuse the same buffer
@@ -769,15 +776,20 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c

            ggml_backend_buffer_free(galloc->buffers[i]);
            galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
-            if (galloc->buffers[i] == NULL) {
+            if (galloc->buffers[i]) {
+                galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
+                ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
+            } else {
                GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
-                return false;
+                galloc->buffer_sizes[i] = new_size;
+                success = false;
            }
-            ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
+        } else {
+            galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
        }
    }

-    return true;
+    return success;
 }

 bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
@@ -934,6 +946,24 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
    return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
 }

+struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
+    GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers);
+
+    for (int i = 0; i < buffer_id; i++) {
+        if (galloc->buf_tallocs[i] == galloc->buf_tallocs[buffer_id]) {
+            // This buffer is the same as a previous one due to the same buffer type being used multiple times
+            // (See above.) However, we need a different check because multiple buffers might be NULL in our
+            // case and we still want to know the attempted size.
+
+            struct ggml_allocr_buffer_status status = {0, true};
+            return status;
+        }
+    }
+
+    struct ggml_allocr_buffer_status status = {galloc->buffer_sizes[buffer_id], galloc->buffers[buffer_id] != NULL};
+    return status;
+}
+
 // utils

 static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
--- a/ml/backend/ggml/ggml/src/ggml-backend.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-backend.cpp
@@ -1629,6 +1629,16 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
    return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
 }

+struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
+    int backend_index = ggml_backend_sched_backend_id(sched, backend);
+    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
+
+    struct ggml_allocr_buffer_status allocr_status = ggml_gallocr_get_attempted_buffer_size(sched->galloc, backend_index);
+    struct ggml_backend_buffer_status status = {allocr_status.size, allocr_status.allocated};
+
+    return status;
+}
+
 void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
    int backend_index = ggml_backend_sched_backend_id(sched, backend);
    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
--- a/ml/backend/ggml/ggml/src/ggml-cpu/cpu.go
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/cpu.go
@@ -3,7 +3,7 @@ package cpu
 // #cgo CFLAGS: -O3 -Wno-implicit-function-declaration
 // #cgo CXXFLAGS: -std=c++17
 // #cgo CPPFLAGS: -I${SRCDIR}/amx -I${SRCDIR}/llamafile -I${SRCDIR}/.. -I${SRCDIR}/../../include
-// #cgo CPPFLAGS: -DGGML_USE_LLAMAFILE
+// #cgo CPPFLAGS: -DNDEBUG -DGGML_USE_LLAMAFILE
 // #cgo linux CPPFLAGS: -D_GNU_SOURCE
 // #cgo darwin,arm64 CPPFLAGS: -DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
 // #cgo darwin,arm64 LDFLAGS: -framework Accelerate
--- a/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp
@@ -6822,6 +6822,45 @@ static void ggml_compute_forward_argsort_f32(
    }
 }

+static void ggml_compute_forward_argsort_i32(
+    const ggml_compute_params * params,
+    ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    GGML_ASSERT(nb0 == sizeof(int32_t));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t nr = ggml_nrows(src0);
+
+    ggml_sort_order order = (ggml_sort_order) ggml_get_op_params_i32(dst, 0);
+
+    for (int64_t i = ith; i < nr; i += nth) {
+        int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1);
+        const int32_t * src_data = (int32_t *)((char *) src0->data + i*nb01);
+
+        for (int64_t j = 0; j < ne0; j++) {
+            dst_data[j] = j;
+        }
+
+        // C doesn't have a functional sort, so we do a bubble sort instead
+        for (int64_t j = 0; j < ne0; j++) {
+            for (int64_t k = j + 1; k < ne0; k++) {
+                if ((order == GGML_SORT_ORDER_ASC  && src_data[dst_data[j]] > src_data[dst_data[k]]) ||
+                    (order == GGML_SORT_ORDER_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) {
+                    int32_t tmp = dst_data[j];
+                    dst_data[j] = dst_data[k];
+                    dst_data[k] = tmp;
+                }
+            }
+        }
+    }
+}
+
 void ggml_compute_forward_argsort(
    const ggml_compute_params * params,
    ggml_tensor * dst) {
@@ -6833,6 +6872,10 @@ void ggml_compute_forward_argsort(
            {
                ggml_compute_forward_argsort_f32(params, dst);
            } break;
+        case GGML_TYPE_I32:
+            {
+                ggml_compute_forward_argsort_i32(params, dst);
+            } break;
        default:
            {
                GGML_ABORT("fatal error");
--- a/ml/backend/ggml/ggml/src/ggml-cuda/argsort.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/argsort.cu
@@ -85,13 +85,107 @@ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, co
    }
 }

+
+template<ggml_sort_order order>
+static __global__ void k_argsort_i32_i32(const int32_t * x, int * dst, const int ncols, const int ncols_pad) {
+    extern __shared__ int shared_mem[];
+    int * indices = shared_mem;
+
+    const int tid = threadIdx.x;
+    const int row = blockIdx.y;
+
+    // Initialize all indices, handling the case where threads < ncols_pad
+    for (int i = tid; i < ncols_pad; i += blockDim.x) {
+        indices[i] = i < ncols ? i : 0; // Use 0 for padding indices
+    }
+    __syncthreads();
+
+    // Bitonic sort
+    for (int k = 2; k <= ncols_pad; k *= 2) {
+        for (int j = k/2; j > 0; j /= 2) {
+            for (int i = tid; i < ncols_pad; i += blockDim.x) {
+                const int ij = i ^ j;
+                if (ij > i) {
+                    // Only compare values within the actual data range
+                    if (i < ncols && ij < ncols) {
+                        if ((i & k) == 0) {
+                            if (order == GGML_SORT_ORDER_ASC) {
+                                if (x[row * ncols + indices[i]] > x[row * ncols + indices[ij]]) {
+                                    int tmp = indices[i];
+                                    indices[i] = indices[ij];
+                                    indices[ij] = tmp;
+                                }
+                            } else {
+                                if (x[row * ncols + indices[i]] < x[row * ncols + indices[ij]]) {
+                                    int tmp = indices[i];
+                                    indices[i] = indices[ij];
+                                    indices[ij] = tmp;
+                                }
+                            }
+                        } else {
+                            if (order == GGML_SORT_ORDER_ASC) {
+                                if (x[row * ncols + indices[i]] < x[row * ncols + indices[ij]]) {
+                                    int tmp = indices[i];
+                                    indices[i] = indices[ij];
+                                    indices[ij] = tmp;
+                                }
+                            } else {
+                                if (x[row * ncols + indices[i]] > x[row * ncols + indices[ij]]) {
+                                    int tmp = indices[i];
+                                    indices[i] = indices[ij];
+                                    indices[ij] = tmp;
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+            __syncthreads();
+        }
+    }
+
+    // Write sorted indices to output, only threads handling valid data
+    for (int i = tid; i < ncols; i += blockDim.x) {
+        dst[row * ncols + i] = indices[i];
+    }
+}
+
+static void argsort_i32_i32_cuda(const int32_t * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
+    // Bitonic sort requires ncols to be power of 2
+    const int ncols_pad = next_power_of_2(ncols);
+
+    // Ensure thread count doesn't exceed maximum (typically 1024)
+    const int max_threads = 1024;  // This is the typical max for most GPUs
+    const int threads_per_block = ncols_pad > max_threads ? max_threads : ncols_pad;
+
+    const dim3 block_dims(threads_per_block, 1, 1);
+    const dim3 block_nums(1, nrows, 1);
+    const size_t shared_mem = ncols_pad * sizeof(int);
+
+    // Check if shared memory size is within limits
+    const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb;
+
+    // Instead of logging an error, use GGML_ASSERT with a descriptive message
+    GGML_ASSERT(shared_mem <= max_shared_mem && "argsort: required shared memory exceeds device limit");
+
+    // Launch kernels with the updated thread configuration
+    if (order == GGML_SORT_ORDER_ASC) {
+        k_argsort_i32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
+    } else if (order == GGML_SORT_ORDER_DESC) {
+        k_argsort_i32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
+    } else {
+        GGML_ABORT("fatal error");
+    }
+}
+
+
 void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const float * src0_d = (const float *)src0->data;
    float * dst_d = (float *)dst->data;
    cudaStream_t stream = ctx.stream();

-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_I32);
    GGML_ASSERT( dst->type == GGML_TYPE_I32);
    GGML_ASSERT(ggml_is_contiguous(src0));

@@ -100,5 +194,9 @@ void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

    enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];

-    argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream);
+    if (src0->type == GGML_TYPE_I32) {
+        argsort_i32_i32_cuda((const int32_t *)src0_d, (int *)dst_d, ncols, nrows, order, stream);
+    } else {
+        argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream);
+    }
 }
--- a/ml/backend/ggml/ggml/src/ggml-cuda/cpy.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/cpy.cu
@@ -38,6 +38,13 @@ static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) {
    *dsti = *xi;
 }

+static __device__ void cpy_1_i32_i32(const char * cxi, char * cdsti) {
+    const int32_t * xi = (const int32_t *) cxi;
+    int32_t * dsti = (int32_t *) cdsti;
+
+    *dsti = *xi;
+}
+
 template <cpy_kernel_t cpy_1>
 static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const int ne,
                                   const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
@@ -68,6 +75,44 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const in
    cpy_1(cx + x_offset, cdst + dst_offset);
 }

+// First, add this template function after the other template functions
+template <cpy_kernel_t cpy_1>
+static __global__ void cpy_i32_i32(const char * cx, char * cdst, const int ne,
+                                 const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+                                 const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                                 const int nb12, const int nb13) {
+    const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= ne) {
+        return;
+    }
+
+    const int64_t i03 = i/(ne00 * ne01 * ne02);
+    const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
+    const int64_t i01 = (i - i03*ne00*ne01*ne02  -  i02*ne01*ne00) / ne00;
+    const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
+    const int64_t x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
+
+    const int64_t i13 = i/(ne10 * ne11 * ne12);
+    const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
+    const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
+    const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
+    const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13;
+
+    cpy_1(cx + x_offset, cdst + dst_offset);
+}
+
+// Then modify the ggml_cpy_i32_i32_cuda function to use the new template
+static void ggml_cpy_i32_i32_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int graph_cpynode_index) {
+
+    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+    cpy_i32_i32<cpy_1_i32_i32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+}
+
 static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
    const float * xi = (const float *) cxi;
    block_q8_0 * dsti = (block_q8_0 *) cdsti;
@@ -633,6 +678,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
        ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
        ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+    } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
+        ggml_cpy_i32_i32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
    } else {
        GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
                ggml_type_name(src0->type), ggml_type_name(src1->type));
@@ -688,6 +735,8 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
        return (void*) cpy_f32_f16<cpy_1_f32_f16>;
    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
        return (void*) cpy_f32_f16<cpy_1_f16_f32>;
+    } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
+        return (void*) cpy_i32_i32<cpy_1_i32_i32>;
    } else {
        GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
                ggml_type_name(src0->type), ggml_type_name(src1->type));
--- a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2884,6 +2884,7 @@ struct ggml_backend_cuda_device_context {
    int device;
    std::string name;
    std::string description;
+    std::string uuid;
 };

 static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -2896,6 +2897,11 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t
    return ctx->description.c_str();
 }

+static const char * ggml_backend_cuda_device_get_uuid(ggml_backend_dev_t dev) {
+    ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
+    return ctx->uuid.c_str();
+}
+
 static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
    ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
    ggml_cuda_set_device(ctx->device);
@@ -2910,6 +2916,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
 static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
    props->name        = ggml_backend_cuda_device_get_name(dev);
    props->description = ggml_backend_cuda_device_get_description(dev);
+    props->uuid        = ggml_backend_cuda_device_get_uuid(dev);
    props->type        = ggml_backend_cuda_device_get_type(dev);
    ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);

@@ -3458,6 +3465,32 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
                CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
                dev_ctx->description = prop.name;

+                #if !defined(GGML_USE_HIP)
+                char uuid[64];
+                snprintf(uuid, sizeof(uuid),
+                    "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
+                    (unsigned char)prop.uuid.bytes[0],
+                    (unsigned char)prop.uuid.bytes[1],
+                    (unsigned char)prop.uuid.bytes[2],
+                    (unsigned char)prop.uuid.bytes[3],
+                    (unsigned char)prop.uuid.bytes[4],
+                    (unsigned char)prop.uuid.bytes[5],
+                    (unsigned char)prop.uuid.bytes[6],
+                    (unsigned char)prop.uuid.bytes[7],
+                    (unsigned char)prop.uuid.bytes[8],
+                    (unsigned char)prop.uuid.bytes[9],
+                    (unsigned char)prop.uuid.bytes[10],
+                    (unsigned char)prop.uuid.bytes[11],
+                    (unsigned char)prop.uuid.bytes[12],
+                    (unsigned char)prop.uuid.bytes[13],
+                    (unsigned char)prop.uuid.bytes[14],
+                    (unsigned char)prop.uuid.bytes[15]
+                  );
+                dev_ctx->uuid = uuid;
+                #else
+                dev_ctx->uuid = "GPU-" + std::string(prop.uuid.bytes, 16);
+                #endif
+
                ggml_backend_dev_t dev = new ggml_backend_device {
                    /* .iface   = */ ggml_backend_cuda_device_interface,
                    /* .reg     = */ &reg,
--- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m
+++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m
@@ -5703,6 +5703,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
 static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
    props->name        = ggml_backend_metal_device_get_name(dev);
    props->description = ggml_backend_metal_device_get_description(dev);
+    props->uuid        = "0";
    props->type        = ggml_backend_metal_device_get_type(dev);
    ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
    props->caps = (struct ggml_backend_dev_caps) {
--- a/ml/backend/ggml/ggml/src/ggml-metal/metal.go
+++ b/ml/backend/ggml/ggml/src/ggml-metal/metal.go
@@ -4,6 +4,6 @@ package metal

 //go:generate sh -c "{ echo // Code generated by 'go generate'. DO NOT EDIT.; sed -e '/__embed_ggml-common.h__/r ../ggml-common.h' -e '/__embed_ggml-common.h__/d' -e '/#include \"ggml-metal-impl.h\"/r ggml-metal-impl.h' -e '/#include \"ggml-metal-impl.h\"/d' ggml-metal.metal; } >ggml-metal-embed.metal"

-// #cgo CPPFLAGS: -DGGML_METAL_EMBED_LIBRARY -I.. -I../../include
+// #cgo CPPFLAGS: -DGGML_METAL_NDEBUG -DGGML_METAL_EMBED_LIBRARY -I.. -I../../include
 // #cgo LDFLAGS: -framework Metal -framework MetalKit
 import "C"
--- a/ml/nn/fast/rope.go
+++ b/ml/nn/fast/rope.go
@@ -0,0 +1,21 @@
+// fast provides implementations of fast (fused) operations for increased performance.
+package fast
+
+import (
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/ml/nn/rope"
+)
+
+// fastRoPE is an interface for tensors that support fast rotary positional embedding.
+type fastRoPE interface {
+	RoPE(ctx ml.Context, positionIDs ml.Tensor, dim int, base, scale float32, options ...func(*rope.Options)) ml.Tensor
+}
+
+// RoPE applies rotary positional embedding to tensor `t`.
+func RoPE(ctx ml.Context, t, positions ml.Tensor, dim int, base, scale float32, options ...func(*rope.Options)) ml.Tensor {
+	if t, ok := t.(fastRoPE); ok {
+		return t.RoPE(ctx, positions, dim, base, scale, options...)
+	}
+
+	panic("RoPE not implemented for this tensor type")
+}
--- a/ml/nn/rope/rope.go
+++ b/ml/nn/rope/rope.go
@@ -0,0 +1,33 @@
+package rope
+
+import "github.com/ollama/ollama/ml"
+
+// Options contains optional parameters for RoPE function
+type Options struct {
+	OriginalContextLength int
+	Type                  int
+	Factors               ml.Tensor
+}
+
+// WithOriginalContextLength sets a custom context length
+func WithOriginalContextLength(n int) func(*Options) {
+	return func(opts *Options) {
+		opts.OriginalContextLength = n
+	}
+}
+
+// WithType sets RoPE type to NeoX
+func WithTypeNeoX() func(*Options) {
+	return func(opts *Options) {
+		opts.Type = 2
+	}
+}
+
+// WithFactors sets custom rope factors
+func WithFactors(factors ml.Tensor) func(*Options) {
+	return func(opts *Options) {
+		if factors != nil {
+			opts.Factors = factors
+		}
+	}
+}
--- a/model/bytepairencoding.go
+++ b/model/bytepairencoding.go
@@ -3,118 +3,16 @@ package model
 import (
 	"cmp"
 	"context"
+	"fmt"
 	"iter"
 	"log/slog"
-	"slices"
 	"strings"
-	"sync"

 	"github.com/dlclark/regexp2"
 	heap "github.com/emirpasic/gods/v2/trees/binaryheap"
 	"github.com/ollama/ollama/logutil"
 )

-type Special int32
-
-const (
-	SpecialBOS Special = iota
-	SpecialEOS
-)
-
-const (
-	TOKEN_TYPE_NORMAL = iota + 1
-	TOKEN_TYPE_UNKNOWN
-	TOKEN_TYPE_CONTROL
-	TOKEN_TYPE_USER_DEFINED
-	TOKEN_TYPE_UNUSED
-	TOKEN_TYPE_BYTE
-)
-
-type TextProcessor interface {
-	Encode(s string, addSpecial bool) ([]int32, error)
-	Decode([]int32) (string, error)
-	Is(int32, Special) bool
-	Vocabulary() *Vocabulary
-}
-
-type Vocabulary struct {
-	Values []string
-	Types  []int32
-	Scores []float32
-	Merges []string
-
-	BOS, EOS, EOT          int32
-	AddBOS, AddEOS, AddEOT bool
-
-	specialOnce sync.Once
-	special     []string
-
-	valuesOnce sync.Once
-	values     map[string]int32
-
-	mergeOnce sync.Once
-	merge     map[string]int32
-}
-
-func (v *Vocabulary) Is(id int32, special Special) bool {
-	switch special {
-	case SpecialBOS:
-		return id == v.BOS
-	case SpecialEOS:
-		return id == v.EOS || id == v.EOT
-	default:
-		return false
-	}
-}
-
-func (v *Vocabulary) Encode(s string) int32 {
-	v.valuesOnce.Do(func() {
-		v.values = make(map[string]int32, len(v.Values))
-		for i, value := range v.Values {
-			v.values[value] = int32(i)
-		}
-	})
-
-	if id, ok := v.values[s]; ok {
-		return id
-	}
-
-	return -1
-}
-
-func (v *Vocabulary) Decode(id int32) string {
-	return v.Values[id]
-}
-
-func (v *Vocabulary) SpecialVocabulary() []string {
-	v.specialOnce.Do(func() {
-		for i := range v.Values {
-			if slices.Contains([]int{105, 106}, i) {
-				v.special = append(v.special, v.Values[i])
-			} else if v.Types[i] == TOKEN_TYPE_CONTROL {
-				v.special = append(v.special, v.Values[i])
-			}
-		}
-	})
-
-	return v.special
-}
-
-func (v *Vocabulary) Merge(left, right string) int {
-	v.mergeOnce.Do(func() {
-		v.merge = make(map[string]int32, len(v.Merges))
-		for i, merge := range v.Merges {
-			v.merge[merge] = int32(i)
-		}
-	})
-
-	if id, ok := v.merge[left+" "+right]; ok {
-		return int(id)
-	}
-
-	return -1
-}
-
 type BytePairEncoding struct {
 	pre   *regexp2.Regexp
 	vocab *Vocabulary
@@ -304,30 +202,23 @@ func (bpe BytePairEncoding) Encode(s string, addSpecial bool) ([]int32, error) {
 		}
 	}

+	slog.Log(context.TODO(), logutil.LevelTrace, "encoded", "string", s, "ids", ids)
+
 	if addSpecial && len(ids) > 0 {
-		if bpe.vocab.AddBOS {
-			if ids[0] == bpe.vocab.BOS {
-				slog.Warn("adding bos token to prompt which already has it", "id", bpe.vocab.BOS)
-			}
-
-			slog.Debug("adding bos token to prompt", "id", bpe.vocab.BOS)
-			ids = append([]int32{bpe.vocab.BOS}, ids...)
-		}
-
-		if bpe.vocab.AddEOS {
-			if ids[len(ids)-1] == bpe.vocab.EOS {
-				slog.Warn("adding eos token to prompt which already has it", "id", bpe.vocab.EOS)
-			}
-
-			slog.Debug("adding eos token to prompt", "id", bpe.vocab.EOS)
-			ids = append(ids, bpe.vocab.EOS)
-		}
+		ids = bpe.vocab.addSpecials(ids)
 	}

-	slog.Log(context.TODO(), logutil.LevelTrace, "encoded", "ids", ids)
 	return ids, nil
 }

+type lazyIdsString struct {
+	ids []int32
+}
+
+func (l lazyIdsString) LogValue() slog.Value {
+	return slog.AnyValue(fmt.Sprint(l.ids))
+}
+
 func (bpe BytePairEncoding) Decode(ids []int32) (string, error) {
 	var sb strings.Builder
 	for _, id := range ids {
@@ -352,6 +243,6 @@ func (bpe BytePairEncoding) Decode(ids []int32) (string, error) {
 		}
 	}

-	slog.Log(context.TODO(), logutil.LevelTrace, "decoded", "string", sb.String())
+	slog.Log(context.TODO(), logutil.LevelTrace, "decoded", "string", sb.String(), "from", lazyIdsString{ids: ids})
 	return sb.String(), nil
 }
--- a/model/bytepairencoding_test.go
+++ b/model/bytepairencoding_test.go
--- a/model/input/input.go
+++ b/model/input/input.go
@@ -2,16 +2,30 @@ package input

 import "github.com/ollama/ollama/ml"

+// Multimodal is a multimodal embedding or a component of one.
+// For example, it could be a row of an image that can be processed
+// independently.
+type Multimodal struct {
+	// Tensor is the embedding data. Implementations may chose what to
+	// store here or it may be nil if not needed. However, any ml.Tensor
+	// objects must be stored here and not in Data.
+	Tensor ml.Tensor
+
+	// Data is implementation-specific opaque data, such as metadata on how
+	// to layout Tensor. It may be nil if not needed. It may also store larger
+	// objects such as complete images if they are to be processed later.
+	Data any
+}
+
 // Input represents one token in the input stream
 type Input struct {
 	// Token is a single element of text.
 	Token int32

-	// Multimodal is opaque data representing a non-text
-	// element such as an image (or part of one if the image
-	// can be processed in pieces). It may be either together
-	// with Token or on its own.
-	Multimodal any
+	// Multimodal is represents a non-text element such as an
+	// image (or part of one if the image can be processed in pieces).
+	// It may be used either together with Token or on its own.
+	Multimodal []Multimodal

 	// MultimodalHash is a unique representation of the data
 	// stored in Multimodal, used for caching and comparing
@@ -32,7 +46,7 @@ type Input struct {
 // Positions slice.
 type MultimodalIndex struct {
 	Index      int
-	Multimodal any
+	Multimodal []Multimodal
 }

 // Batch contains the inputs for a model forward pass
--- a/model/model.go
+++ b/model/model.go
@@ -40,12 +40,13 @@ type MultimodalProcessor interface {
 	// EncodeMultimodal processes a single input (such as an image) and
 	// generates an output (typically an embedding) that can be used by the model.
 	//
-	// The return value is most typically an ml.Tensor, however, different
-	// type are possible, such as an object containing a tensor plus
-	// additional metadata, a slice of tensors or even just the original input.
+	// The return value is one or more tensors, each with optional model-specific
+	// opaque metadata. Typically, the tensors might be views into an embedding
+	// with each view representing a chunk of data that can be processed independently
+	// in different batches.
 	//
 	// The result may be cached by the runner.
-	EncodeMultimodal(ml.Context, []byte) (any, error)
+	EncodeMultimodal(ml.Context, []byte) ([]input.Multimodal, error)

 	// PostTokenize is called after tokenization to allow the model to edit the
 	// input stream to correctly arrange multimodal elements.
@@ -97,14 +98,8 @@ func Register(name string, f func(fs.Config) (Model, error)) {
 }

 // New initializes a new model instance with the provided configuration based on the metadata in the model file
-func New(ctx context.Context, modelPath string, params ml.BackendParams) (Model, error) {
-	r, err := os.Open(modelPath)
-	if err != nil {
-		return nil, err
-	}
-	defer r.Close()
-
-	b, err := ml.NewBackend(ctx, r, params)
+func New(modelPath string, params ml.BackendParams) (Model, error) {
+	b, err := ml.NewBackend(modelPath, params)
 	if err != nil {
 		return nil, err
 	}
@@ -133,7 +128,7 @@ func NewTextProcessor(s string) (TextProcessor, error) {
 		return nil, err
 	}
 	defer r.Close()
-	meta, _, err := fsggml.Decode(r, -1)
+	meta, err := fsggml.Decode(r, -1)
 	if err != nil {
 		return nil, err
 	}
@@ -292,11 +287,7 @@ func Forward(ctx ml.Context, m Model, inputs []int32, batch input.Batch) (ml.Ten
 		return nil, errors.New("batch size cannot be less than 1")
 	}

-	var err error
-	batch.Inputs, err = ctx.Input().FromIntSlice(inputs, len(inputs))
-	if err != nil {
-		return nil, err
-	}
+	batch.Inputs = ctx.Input().FromIntSlice(inputs, len(inputs))

 	cache := m.Config().Cache
 	if cache != nil {
--- a/model/models/gemma2/model.go
+++ b/model/models/gemma2/model.go
@@ -7,6 +7,8 @@ import (
 	"github.com/ollama/ollama/kvcache"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/ml/nn"
+	"github.com/ollama/ollama/ml/nn/fast"
+	"github.com/ollama/ollama/ml/nn/rope"
 	"github.com/ollama/ollama/model"
 	"github.com/ollama/ollama/model/input"
 )
@@ -43,10 +45,13 @@ func New(c fs.Config) (model.Model, error) {
 				Values: c.Strings("tokenizer.ggml.tokens"),
 				Scores: c.Floats("tokenizer.ggml.scores"),
 				Types:  c.Ints("tokenizer.ggml.token_type"),
-				BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id")),
-				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
-				// TODO: set EOT to EOS otherwise 0 will stop generation
-				EOT: int32(c.Uint("tokenizer.ggml.eos_token_id")),
+				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
+				BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
+				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
+				EOS: append(
+					[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
+					c.Ints("tokenizer.ggml.eos_token_ids")...,
+				),
 			},
 		),
 		Layers: make([]Layer, c.Uint("block_count")),
@@ -80,11 +85,10 @@ type SelfAttention struct {

 func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
 	batchSize := hiddenState.Dim(1)
-	ropeType := uint32(2)

 	q := sa.Query.Forward(ctx, hiddenState)
 	q = q.Reshape(ctx, opts.attnKeyLen, opts.numHeads, batchSize)
-	q = q.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, opts.ropeBase, opts.ropeScale)
+	q = fast.RoPE(ctx, q, positionIDs, opts.attnKeyLen, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())

 	if opts.largeModelScaling {
 		q = q.Scale(ctx, 1.0/math.Sqrt(float64(opts.hiddenSize/opts.numHeads)))
@@ -94,7 +98,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten

 	k := sa.Key.Forward(ctx, hiddenState)
 	k = k.Reshape(ctx, opts.attnKeyLen, opts.numKVHeads, batchSize)
-	k = k.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, opts.ropeBase, opts.ropeScale)
+	k = fast.RoPE(ctx, k, positionIDs, opts.attnKeyLen, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())

 	v := sa.Value.Forward(ctx, hiddenState)
 	v = v.Reshape(ctx, opts.attnValLen, opts.numKVHeads, batchSize)
@@ -124,7 +128,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
 }

 func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
-	return key.RoPE(ctx, shift, nil, uint32(m.Options.attnKeyLen), uint32(2), m.Options.ropeBase, m.Options.ropeScale), nil
+	return fast.RoPE(ctx, key, shift, m.Options.attnKeyLen, m.Options.ropeBase, m.Options.ropeScale, rope.WithTypeNeoX()), nil
 }

 type MLP struct {
@@ -171,15 +175,8 @@ func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Ten
 }

 func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
-	positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
-	if err != nil {
-		return nil, err
-	}
-
-	outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
-	if err != nil {
-		return nil, err
-	}
+	positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
+	outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))

 	hiddenState := m.TokenEmbedding.Forward(ctx, batch.Inputs)
 	hiddenState = hiddenState.Scale(ctx, math.Sqrt(float64(m.Options.hiddenSize)))
--- a/model/models/gemma3/model.go
+++ b/model/models/gemma3/model.go
@@ -60,12 +60,16 @@ func New(c fs.Config) (model.Model, error) {
 				Values: c.Strings("tokenizer.ggml.tokens"),
 				Scores: c.Floats("tokenizer.ggml.scores"),
 				Types:  c.Ints("tokenizer.ggml.token_type"),
-				BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id")),
 				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
-				EOS:    int32(1),
+				BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
 				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
-				EOT:    int32(106),
-				AddEOT: c.Bool("tokenizer.ggml.add_eot_token", false),
+				EOS: append(
+					[]int32{
+						int32(c.Uint("tokenizer.ggml.eos_token_id")),
+						int32(c.Uint("tokenizer.ggml.eot_token_id", 106)),
+					},
+					c.Ints("tokenizer.ggml.eos_token_ids")...,
+				),
 			},
 		),
 		ImageProcessor: newImageProcessor(c),
@@ -82,7 +86,7 @@ func New(c fs.Config) (model.Model, error) {
 	return &m, nil
 }

-func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
+func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
 	if len(m.VisionModel.Layers) == 0 {
 		return nil, model.ErrNoVisionModel
 	}
@@ -97,33 +101,30 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
 		return nil, err
 	}

-	pixelValues, err := ctx.Input().FromFloatSlice(f32s,
+	pixelValues := ctx.Input().FromFloatSlice(f32s,
 		m.ImageProcessor.imageSize,
 		m.ImageProcessor.imageSize,
 		m.ImageProcessor.numChannels,
 	)
-	if err != nil {
-		return nil, err
-	}

 	visionOutputs := m.VisionModel.Forward(ctx, pixelValues)
 	visionOutputs = m.MultiModalProjector.Forward(ctx, visionOutputs, m.imageSize, m.patchSize, m.VisionModel.eps)
-	return visionOutputs, nil
+	return []input.Multimodal{{Tensor: visionOutputs}}, nil
 }

 func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
 	var result []input.Input

 	for _, inp := range inputs {
-		if inp.Multimodal == nil {
+		if len(inp.Multimodal) == 0 {
 			result = append(result, inp)
 		} else {
-			inputMultimodal := inp.Multimodal.(ml.Tensor)
+			inputMultimodal := inp.Multimodal[0].Tensor

 			result = append(result,
-				input.Input{Token: 108, SameBatch: inputMultimodal.Dim(1) + 3},               // "\n\n"
-				input.Input{Token: 255999},                                                   // "<start_of_image>""
-				input.Input{Multimodal: inputMultimodal, MultimodalHash: inp.MultimodalHash}, // image data is on the first placeholder
+				input.Input{Token: 108, SameBatch: inputMultimodal.Dim(1) + 3}, // "\n\n"
+				input.Input{Token: 255999},                                     // "<start_of_image>""
+				input.Input{Multimodal: []input.Multimodal{{Tensor: inputMultimodal}}, MultimodalHash: inp.MultimodalHash}, // image data is on the first placeholder
 			)

 			// add image token placeholders
@@ -140,15 +141,8 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
 }

 func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
-	positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
-	if err != nil {
-		return nil, err
-	}
-
-	outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
-	if err != nil {
-		return nil, err
-	}
+	positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
+	outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))

 	return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache), nil
 }
--- a/model/models/gemma3/model_text.go
+++ b/model/models/gemma3/model_text.go
@@ -7,6 +7,8 @@ import (
 	"github.com/ollama/ollama/kvcache"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/ml/nn"
+	"github.com/ollama/ollama/ml/nn/fast"
+	"github.com/ollama/ollama/ml/nn/rope"
 	"github.com/ollama/ollama/model/input"
 )

@@ -73,7 +75,6 @@ type TextSelfAttention struct {

 func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *TextConfig) ml.Tensor {
 	batchSize := hiddenState.Dim(1)
-	ropeType := uint32(2)

 	ropeBase := opts.ropeLocalBase
 	if (layer+1)%gemmaGlobalCacheCount == 0 {
@@ -83,7 +84,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, pos
 	q := sa.Query.Forward(ctx, hiddenState)
 	q = q.Reshape(ctx, opts.attnKeyLen, opts.numHeads, batchSize)
 	q = sa.QueryNorm.Forward(ctx, q, opts.eps)
-	q = q.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, ropeBase, opts.ropeScale)
+	q = fast.RoPE(ctx, q, positionIDs, opts.attnKeyLen, ropeBase, opts.ropeScale, rope.WithTypeNeoX())

 	if opts.largeModelScaling {
 		q = q.Scale(ctx, 1.0/math.Sqrt(float64(opts.hiddenSize/opts.numHeads)))
@@ -94,7 +95,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, pos
 	k := sa.Key.Forward(ctx, hiddenState)
 	k = k.Reshape(ctx, opts.attnKeyLen, opts.numKVHeads, batchSize)
 	k = sa.KeyNorm.Forward(ctx, k, opts.eps)
-	k = k.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, ropeBase, opts.ropeScale)
+	k = fast.RoPE(ctx, k, positionIDs, opts.attnKeyLen, ropeBase, opts.ropeScale, rope.WithTypeNeoX())

 	v := sa.Value.Forward(ctx, hiddenState)
 	v = v.Reshape(ctx, opts.attnValLen, opts.numKVHeads, batchSize)
@@ -112,7 +113,7 @@ func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.T
 		ropeBase = m.TextConfig.ropeGlobalBase
 	}

-	return key.RoPE(ctx, shift, nil, uint32(m.TextConfig.attnKeyLen), uint32(2), ropeBase, m.TextConfig.ropeScale), nil
+	return fast.RoPE(ctx, key, shift, m.TextConfig.attnKeyLen, ropeBase, m.TextConfig.ropeScale, rope.WithTypeNeoX()), nil
 }

 type TextMLP struct {
@@ -165,7 +166,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
 	// set image embeddings
 	var except []int
 	for _, image := range batch.Multimodal {
-		visionOutputs := image.Multimodal.(ml.Tensor)
+		visionOutputs := image.Multimodal[0].Tensor
 		ctx.Forward(visionOutputs.Copy(ctx, hiddenState.View(ctx, image.Index*hiddenState.Stride(1), visionOutputs.Dim(0)*visionOutputs.Dim(1))))

 		for i := range visionOutputs.Dim(1) {
--- a/model/models/llama/model.go
+++ b/model/models/llama/model.go
@@ -1,22 +1,23 @@
 package llama

 import (
-	"fmt"
+	"cmp"
 	"math"
-	"strings"

 	"github.com/ollama/ollama/fs"
 	"github.com/ollama/ollama/kvcache"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/ml/nn"
+	"github.com/ollama/ollama/ml/nn/fast"
+	"github.com/ollama/ollama/ml/nn/rope"
 	"github.com/ollama/ollama/model"
 	"github.com/ollama/ollama/model/input"
 )

 type Options struct {
 	hiddenSize, numHeads, numKVHeads int
+	headDim, ropeDim                 int
 	eps, ropeBase, ropeScale         float32
-	ropeDim                          uint32
 }

 type Model struct {
@@ -32,10 +33,6 @@ type Model struct {
 }

 func New(c fs.Config) (model.Model, error) {
-	if !strings.EqualFold(c.String("tokenizer.ggml.model"), "gpt2") {
-		return nil, fmt.Errorf("tokenizer %s not yet supported", c.String("tokenizer.ggml.model"))
-	}
-
 	m := Model{
 		BytePairEncoding: model.NewBytePairEncoding(
 			c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
@@ -43,13 +40,13 @@ func New(c fs.Config) (model.Model, error) {
 				Values: c.Strings("tokenizer.ggml.tokens"),
 				Types:  c.Ints("tokenizer.ggml.token_type"),
 				Merges: c.Strings("tokenizer.ggml.merges"),
-				BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id")),
 				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
-				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
+				BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
 				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
-				// TODO: set EOT to EOS otherwise 0 will stop generation
-				EOT:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
-				AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
+				EOS: append(
+					[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
+					c.Ints("tokenizer.ggml.eos_token_ids")...,
+				),
 			},
 		),
 		Layers: make([]Layer, c.Uint("block_count")),
@@ -57,10 +54,11 @@ func New(c fs.Config) (model.Model, error) {
 			hiddenSize: int(c.Uint("embedding_length")),
 			numHeads:   int(c.Uint("attention.head_count")),
 			numKVHeads: int(c.Uint("attention.head_count_kv")),
+			headDim:    int(c.Uint("attention.key_length")),
+			ropeDim:    int(c.Uint("rope.dimension_count")),
 			eps:        c.Float("attention.layer_norm_rms_epsilon"),
 			ropeBase:   c.Float("rope.freq_base"),
 			ropeScale:  c.Float("rope.freq_scale", 1),
-			ropeDim:    c.Uint("rope.dimension_count"),
 		},
 	}

@@ -77,31 +75,31 @@ type SelfAttention struct {
 	RopeFactors ml.Tensor  `gguf:"rope_freqs.weight"`
 }

-func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
+func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
 	batchSize := hiddenState.Dim(1)
-	headDim := opts.hiddenSize / opts.numHeads
-	ropeType := uint32(0)
+	headDim := cmp.Or(opts.headDim, opts.hiddenSize/opts.numHeads)
+	ropeDim := cmp.Or(opts.ropeDim, headDim)

-	q := sa.Query.Forward(ctx, hiddenState)
-	q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
-	q = q.RoPE(ctx, positionIDs, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
+	query := sa.Query.Forward(ctx, hiddenState)
+	query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)

-	k := sa.Key.Forward(ctx, hiddenState)
-	k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
-	k = k.RoPE(ctx, positionIDs, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
+	key := sa.Key.Forward(ctx, hiddenState)
+	key = key.Reshape(ctx, headDim, opts.numKVHeads, batchSize)

-	v := sa.Value.Forward(ctx, hiddenState)
-	v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
+	value := sa.Value.Forward(ctx, hiddenState)
+	value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)

-	scaleFactor := 1.0 / math.Sqrt(float64(headDim))
-	kqv := nn.Attention(ctx, q, k, v, scaleFactor, cache)
-	kqv = kqv.Reshape(ctx, opts.hiddenSize, batchSize)
+	query = fast.RoPE(ctx, query, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
+	key = fast.RoPE(ctx, key, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))

-	return sa.Output.Forward(ctx, kqv)
+	attention := nn.Attention(ctx, query, key, value, 1.0/math.Sqrt(float64(headDim)), cache)
+	attention = attention.Reshape(ctx, headDim*opts.numHeads, batchSize)
+	return sa.Output.Forward(ctx, attention)
 }

 func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
-	return key.RoPE(ctx, shift, m.Layers[layer].SelfAttention.RopeFactors, uint32(0), m.ropeDim, m.ropeBase, m.ropeScale), nil
+	ropeDim := cmp.Or(m.ropeDim, m.hiddenSize/m.numHeads)
+	return fast.RoPE(ctx, key, shift, ropeDim, m.ropeBase, m.ropeScale, rope.WithFactors(m.Layers[layer].SelfAttention.RopeFactors)), nil
 }

 type MLP struct {
@@ -122,11 +120,11 @@ type Layer struct {
 	MLP           *MLP
 }

-func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
+func (l *Layer) Forward(ctx ml.Context, hiddenState, positions, outputs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
 	residual := hiddenState

 	hiddenState = l.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
-	hiddenState = l.SelfAttention.Forward(ctx, hiddenState, positionIDs, cache, opts)
+	hiddenState = l.SelfAttention.Forward(ctx, hiddenState, positions, cache, opts)

 	// In the final layer (outputs != nil), optimize by pruning to just the token positions
 	// we need logits for.
@@ -144,27 +142,19 @@ func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Ten
 }

 func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
-	positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
-	if err != nil {
-		return nil, err
-	}
-
-	outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
-	if err != nil {
-		return nil, err
-	}
+	positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))

 	hiddenState := m.TokenEmbedding.Forward(ctx, batch.Inputs)

 	for i, layer := range m.Layers {
 		m.Cache.SetLayer(i)

-		var lastLayerOutputs ml.Tensor
+		var outputs ml.Tensor
 		if i == len(m.Layers)-1 {
-			lastLayerOutputs = outputs
+			outputs = ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
 		}

-		hiddenState = layer.Forward(ctx, hiddenState, positions, lastLayerOutputs, m.Cache, m.Options)
+		hiddenState = layer.Forward(ctx, hiddenState, positions, outputs, m.Cache, m.Options)
 	}

 	hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
--- a/model/models/llama4/model.go
+++ b/model/models/llama4/model.go
@@ -4,7 +4,6 @@ import (
 	"bytes"
 	"image"
 	"slices"
-	"sync"

 	"github.com/ollama/ollama/fs"
 	"github.com/ollama/ollama/kvcache"
@@ -41,13 +40,13 @@ func New(c fs.Config) (model.Model, error) {
 				Values: c.Strings("tokenizer.ggml.tokens"),
 				Types:  c.Ints("tokenizer.ggml.token_type"),
 				Merges: c.Strings("tokenizer.ggml.merges"),
-				BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id")),
 				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
-				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
+				BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
 				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
-				// TODO: set EOT to EOS otherwise 0 will stop generation
-				EOT:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
-				AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
+				EOS: append(
+					[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
+					c.Ints("tokenizer.ggml.eos_token_ids")...,
+				),
 			},
 		),
 		ImageProcessor: newImageProcessor(c),
@@ -63,7 +62,7 @@ func New(c fs.Config) (model.Model, error) {
 	return &m, nil
 }

-func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
+func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
 	if len(m.VisionModel.Layers) < 1 {
 		return nil, model.ErrNoVisionModel
 	}
@@ -78,10 +77,7 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
 		return nil, err
 	}

-	tilesLocal, err := ctx.Input().FromFloatSlice(pixelsLocal, size.X, size.Y, m.numChannels)
-	if err != nil {
-		return nil, err
-	}
+	tilesLocal := ctx.Input().FromFloatSlice(pixelsLocal, size.X, size.Y, m.numChannels)

 	ratioW, ratioH := size.X/m.imageSize, size.Y/m.imageSize

@@ -92,81 +88,86 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
 	pixelValues := tilesLocal

 	if len(pixelsGlobal) > 0 {
-		tilesGlobal, err := ctx.Input().FromFloatSlice(pixelsGlobal, m.imageSize, m.imageSize, m.numChannels)
-		if err != nil {
-			return nil, err
-		}
-
+		tilesGlobal := ctx.Input().FromFloatSlice(pixelsGlobal, m.imageSize, m.imageSize, m.numChannels)
 		pixelValues = pixelValues.Concat(ctx, tilesGlobal, 3)
 	}

 	visionOutputs := m.VisionModel.Forward(ctx, pixelValues)
 	visionOutputs = visionOutputs.Reshape(ctx, visionOutputs.Dim(0), visionOutputs.Dim(1)*visionOutputs.Dim(2)*visionOutputs.Dim(3))
 	projectedOutputs := m.Projector.Forward(ctx, visionOutputs)
-	return &chunks{Model: m, Tensor: projectedOutputs, aspectRatio: image.Point{ratioW, ratioH}}, nil
+
+	var multimodal []input.Multimodal
+	aspectRatio := image.Point{ratioW, ratioH}
+
+	var offset int
+	patchesPerChunk := projectedOutputs.Dim(1)
+	if aspectRatio.Y*aspectRatio.X > 1 {
+		patchesPerChunk = projectedOutputs.Dim(1) / (aspectRatio.X*aspectRatio.Y + 1)
+
+		for range aspectRatio.Y {
+			for x := range aspectRatio.X {
+				view := projectedOutputs.View(ctx, projectedOutputs.Stride(1)*offset,
+					projectedOutputs.Dim(0), projectedOutputs.Stride(1),
+					patchesPerChunk)
+				var separator separator
+				if x < aspectRatio.X-1 {
+					separator.x = true // <|tile_x_separator|>
+				} else {
+					separator.y = true // <|tile_y_separator|>
+				}
+				multimodal = append(multimodal, input.Multimodal{Tensor: view, Data: &separator})
+				offset += patchesPerChunk
+			}
+		}
+	}
+
+	view := projectedOutputs.View(ctx, projectedOutputs.Stride(1)*offset,
+		projectedOutputs.Dim(0), projectedOutputs.Stride(1),
+		patchesPerChunk)
+	multimodal = append(multimodal, input.Multimodal{Tensor: view, Data: &separator{}})
+
+	return multimodal, nil
 }

-type chunks struct {
-	*Model
-	ml.Tensor
-	aspectRatio image.Point
-
-	dataOnce sync.Once
-	data     []float32
-}
-
-type chunk struct {
-	*chunks
-	s, n int
-}
-
-func (r *chunk) floats() []float32 {
-	r.dataOnce.Do(func() {
-		temp := r.Backend().NewContext()
-		defer temp.Close()
-		temp.Forward(r.Tensor).Compute(r.Tensor)
-		r.data = r.Floats()
-	})
-
-	return r.data[r.s*r.Dim(0) : (r.s+r.n)*r.Dim(0)]
+type separator struct {
+	x bool
+	y bool
 }

 func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
 	var result []input.Input
 	for _, inp := range inputs {
-		if inp.Multimodal == nil {
+		if len(inp.Multimodal) == 0 {
 			result = append(result, inp)
 			continue
 		}

-		t := inp.Multimodal.(*chunks)
 		var imageInputs []input.Input
 		imageInputs = append(imageInputs, input.Input{Token: 200080}) // <|image_start|>

-		var offset int
-		patchesPerChunk := t.Dim(1)
-		if t.aspectRatio.Y*t.aspectRatio.X > 1 {
-			patchesPerChunk = t.Dim(1) / (t.aspectRatio.X*t.aspectRatio.Y + 1)
+		for i, mm := range inp.Multimodal {
+			patchesPerChunk := mm.Tensor.Dim(1)

-			for range t.aspectRatio.Y {
-				for x := range t.aspectRatio.X {
-					imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: &chunk{t, offset, patchesPerChunk}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|>
-					imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...)
-					if x < t.aspectRatio.X-1 {
-						imageInputs = append(imageInputs, input.Input{Token: 200084}) // <|tile_x_separator|>
-					}
-					offset += patchesPerChunk
+			if i < len(inp.Multimodal)-1 {
+				separator := mm.Data.(*separator)
+
+				imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: []input.Multimodal{{Tensor: mm.Tensor}}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|>
+				imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...)
+
+				if separator.x {
+					imageInputs = append(imageInputs, input.Input{Token: 200084}) // <|tile_x_separator|>
 				}
-
-				imageInputs = append(imageInputs, input.Input{Token: 200085}) // <|tile_y_separator|>
+				if separator.y {
+					imageInputs = append(imageInputs, input.Input{Token: 200085}) // <|tile_y_separator|>
+				}
+			} else {
+				imageInputs = append(imageInputs, input.Input{Token: 200090})                                                                                                                      // <|image|>
+				imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: []input.Multimodal{{Tensor: mm.Tensor}}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|>
+				imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...)
+				imageInputs = append(imageInputs, input.Input{Token: 200080}) // <|image_end|>
 			}
 		}

-		imageInputs = append(imageInputs, input.Input{Token: 200090})                                                                                                                 // <|image|>
-		imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: &chunk{t, offset, patchesPerChunk}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|>
-		imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...)
-		imageInputs = append(imageInputs, input.Input{Token: 200080}) // <|image_end|>
-
 		result = append(result, imageInputs...)
 	}

@@ -174,15 +175,8 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
 }

 func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
-	positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
-	if err != nil {
-		return nil, err
-	}
-
-	outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
-	if err != nil {
-		return nil, err
-	}
+	positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
+	outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))

 	return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache), nil
 }
--- a/model/models/llama4/model_text.go
+++ b/model/models/llama4/model_text.go
@@ -8,6 +8,8 @@ import (
 	"github.com/ollama/ollama/kvcache"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/ml/nn"
+	"github.com/ollama/ollama/ml/nn/fast"
+	"github.com/ollama/ollama/ml/nn/rope"
 	"github.com/ollama/ollama/model/input"
 )

@@ -31,8 +33,8 @@ func (sa *TextAttention) Forward(ctx ml.Context, hiddenStates, positions, attent
 	value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)

 	if useRope {
-		query = query.RoPE(ctx, positions, sa.RopeFactors, uint32(opts.ropeDim), uint32(0), opts.ropeBase, opts.ropeScale)
-		key = key.RoPE(ctx, positions, sa.RopeFactors, uint32(opts.ropeDim), uint32(0), opts.ropeBase, opts.ropeScale)
+		query = fast.RoPE(ctx, query, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
+		key = fast.RoPE(ctx, key, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
 	}

 	if opts.useQKNorm {
@@ -61,9 +63,9 @@ func (mlp *TextMLP) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *TextOp
 }

 type TextExperts struct {
-	Gate ml.Tensor `gguf:"ffn_gate_exps.weight"`
-	Up   ml.Tensor `gguf:"ffn_up_exps.weight"`
-	Down ml.Tensor `gguf:"ffn_down_exps.weight"`
+	Gate *nn.Linear `gguf:"ffn_gate_exps"`
+	Up   *nn.Linear `gguf:"ffn_up_exps"`
+	Down *nn.Linear `gguf:"ffn_down_exps"`
 }

 func (e *TextExperts) Forward(ctx ml.Context, hiddenStates, routerLogits ml.Tensor, opts *TextOptions) ml.Tensor {
@@ -74,13 +76,13 @@ func (e *TextExperts) Forward(ctx ml.Context, hiddenStates, routerLogits ml.Tens
 	hiddenStates = hiddenStates.Repeat(ctx, 1, opts.numExpertsUsed)
 	hiddenStates = hiddenStates.Mul(ctx, scores)

-	upStates := e.Up.MulmatID(ctx, hiddenStates, experts)
-	gateStates := e.Gate.MulmatID(ctx, hiddenStates, experts)
-	downStates := e.Down.MulmatID(ctx, upStates.Mul(ctx, gateStates.SILU(ctx)), experts)
+	upStates := e.Up.Weight.MulmatID(ctx, hiddenStates, experts)
+	gateStates := e.Gate.Weight.MulmatID(ctx, hiddenStates, experts)
+	downStates := e.Down.Weight.MulmatID(ctx, upStates.Mul(ctx, gateStates.SILU(ctx)), experts)

 	nextStates := downStates.View(ctx, 0, hiddenStates.Dim(0), downStates.Stride(2), hiddenStates.Dim(2))
 	for i := 1; i < opts.numExpertsUsed; i++ {
-		nextStates.Add(ctx, downStates.View(ctx, i*downStates.Stride(1), hiddenStates.Dim(0), downStates.Stride(2), hiddenStates.Dim(2)))
+		nextStates = nextStates.Add(ctx, downStates.View(ctx, i*downStates.Stride(1), hiddenStates.Dim(0), downStates.Stride(2), hiddenStates.Dim(2)))
 	}

 	return nextStates
@@ -210,12 +212,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
 	hiddenStates := m.TokenEmbedding.Forward(ctx, inputs).Duplicate(ctx)

 	for _, mi := range batch.Multimodal {
-		f32s := mi.Multimodal.(*chunk).floats()
-		img, err := ctx.Input().FromFloatSlice(f32s, len(f32s)/m.hiddenSize, m.hiddenSize)
-		if err != nil {
-			panic(err)
-		}
-
+		img := mi.Multimodal[0].Tensor
 		ctx.Forward(img.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), img.Dim(0)*img.Dim(1))))
 	}

@@ -226,11 +223,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
 			scales[i] = float32(math.Log(math.Floor(((float64(p)+1.0)/float64(m.attentionFloorScale))+1.0))*m.attentionScale + 1.0)
 		}

-		var err error
-		attentionScales, err = ctx.Input().FromFloatSlice(scales, 1, 1, len(scales))
-		if err != nil {
-			panic(err)
-		}
+		attentionScales = ctx.Input().FromFloatSlice(scales, 1, 1, len(scales))
 	}

 	for i, layer := range m.Layers {
@@ -255,5 +248,5 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
 }

 func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
-	return key.RoPE(ctx, shift, m.Layers[layer].Attention.RopeFactors, uint32(0), uint32(m.ropeDim), m.ropeBase, m.ropeScale), nil
+	return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale, rope.WithFactors(m.Layers[layer].Attention.RopeFactors)), nil
 }
--- a/model/models/llama4/model_vision.go
+++ b/model/models/llama4/model_vision.go
@@ -245,10 +245,7 @@ func (m *VisionModel) rotaryEmbedding(ctx ml.Context) (ml.Tensor, ml.Tensor) {
 		}
 	}

-	ropeFreqs, err := ctx.Input().FromFloatSlice(freqs, freqDim/2, numPatches, 2)
-	if err != nil {
-		panic(err)
-	}
+	ropeFreqs := ctx.Input().FromFloatSlice(freqs, freqDim/2, numPatches, 2)

 	ropeFreqs = ropeFreqs.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
 	ropeFreqs = ropeFreqs.Reshape(ctx, freqDim, 1, numPatches)
--- a/model/models/mistral3/model.go
+++ b/model/models/mistral3/model.go
@@ -4,7 +4,6 @@ import (
 	"bytes"
 	"image"
 	"slices"
-	"sync"

 	"github.com/ollama/ollama/fs"
 	"github.com/ollama/ollama/kvcache"
@@ -32,31 +31,26 @@ var _ model.MultimodalProcessor = (*Model)(nil)
 var _ model.TextProcessor = (*Model)(nil)

 func New(c fs.Config) (model.Model, error) {
-	textModel, err := NewTextModel(c)
-	if err != nil {
-		return nil, err
-	}
-
 	m := &Model{
-		TextModel:           textModel,
-		VisionModel:         newVisionModel(c),
-		ImageProcessor:      newImageProcessor(c),
-		MultiModalProjector: newMultiModalProjector(c),
 		BytePairEncoding: model.NewBytePairEncoding(
 			c.String("tokenizer.ggml.pretokenizer", `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
 			&model.Vocabulary{
 				Values: c.Strings("tokenizer.ggml.tokens"),
 				Types:  c.Ints("tokenizer.ggml.token_type"),
 				Merges: c.Strings("tokenizer.ggml.merges"),
-				BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id", 1)),
 				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
-				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id", 2)),
+				BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
 				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
-				// TODO: set EOT to EOS otherwise 0 will stop generation
-				EOT:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
-				AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
+				EOS: append(
+					[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
+					c.Ints("tokenizer.ggml.eos_token_ids")...,
+				),
 			},
 		),
+		TextModel:           newTextModel(c),
+		VisionModel:         newVisionModel(c),
+		ImageProcessor:      newImageProcessor(c),
+		MultiModalProjector: newMultiModalProjector(c),
 	}

 	m.Cache = kvcache.NewCausalCache(m.TextModel.Shift)
@@ -105,7 +99,7 @@ func newMultiModalProjector(c fs.Config) *MultiModalProjector {
 	}
 }

-func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
+func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
 	if len(m.VisionModel.Layers) == 0 {
 		return nil, model.ErrNoVisionModel
 	}
@@ -120,46 +114,20 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
 		return nil, err
 	}

-	pixelValues, err := ctx.Input().FromFloatSlice(f32s, size.X, size.Y, m.ImageProcessor.numChannels)
-	if err != nil {
-		return nil, err
-	}
+	pixelValues := ctx.Input().FromFloatSlice(f32s, size.X, size.Y, m.ImageProcessor.numChannels)

 	visionOutputs := m.VisionModel.Forward(ctx, pixelValues)
 	features, size := m.MultiModalProjector.Forward(ctx, visionOutputs, size)

 	// split into patches to be sent to the text transformer
-	parent := imageFeatures{tensor: features}
-	rows := make([]*imageRow, size.Y)
+	rows := make([]input.Multimodal, size.Y)
 	for i := range rows {
-		rows[i] = &imageRow{parent: &parent, s: i, shape: []int{features.Dim(0), size.X}}
+		rows[i].Tensor = features.View(ctx, features.Stride(1)*size.X*i, features.Dim(0), features.Stride(1), size.X)
 	}

 	return rows, nil
 }

-type imageFeatures struct {
-	tensor ml.Tensor
-
-	dataOnce sync.Once
-	data     []float32
-}
-
-type imageRow struct {
-	parent *imageFeatures
-	s      int
-	shape  []int
-}
-
-func (r *imageRow) data() []float32 {
-	n := 1
-	for _, s := range r.shape {
-		n *= s
-	}
-
-	return r.parent.data[r.s*n : (r.s+1)*n]
-}
-
 // PostTokenize arranges Mistral 3's inputs for the forward pass
 // In Mistral 3 and Pixtral, the input patches are arranged as follows:
 // [IMG]...[IMG][IMG_BREAK][IMG]...[IMG][IMG_BREAK][IMG]...[IMG][IMG_END]
@@ -168,15 +136,14 @@ func (r *imageRow) data() []float32 {
 func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
 	var result []input.Input
 	for _, inp := range inputs {
-		if inp.Multimodal == nil {
+		if len(inp.Multimodal) == 0 {
 			result = append(result, inp)
 		} else {
-			inputMultimodal := inp.Multimodal.([]*imageRow)
-			for i, row := range inputMultimodal {
+			for i, row := range inp.Multimodal {
 				// [IMG]
-				result = append(result, input.Input{Token: 10, Multimodal: row, MultimodalHash: inp.MultimodalHash, SameBatch: row.shape[1]})
-				result = append(result, slices.Repeat([]input.Input{{Token: 10}}, row.shape[1]-1)...)
-				if i == len(inputMultimodal)-1 {
+				result = append(result, input.Input{Token: 10, Multimodal: []input.Multimodal{{Tensor: row.Tensor}}, MultimodalHash: inp.MultimodalHash, SameBatch: row.Tensor.Dim(1)})
+				result = append(result, slices.Repeat([]input.Input{{Token: 10}}, row.Tensor.Dim(1)-1)...)
+				if i == len(inp.Multimodal)-1 {
 					// [IMG_END]
 					result = append(result, input.Input{Token: 13})
 				} else {
@@ -191,15 +158,8 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
 }

 func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
-	positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
-	if err != nil {
-		return nil, err
-	}
-
-	outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
-	if err != nil {
-		return nil, err
-	}
+	positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
+	outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))

 	return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache), nil
 }
--- a/model/models/mistral3/model_text.go
+++ b/model/models/mistral3/model_text.go
@@ -1,27 +1,24 @@
 package mistral3

 import (
-	"fmt"
+	"cmp"
 	"math"
-	"strings"

 	"github.com/ollama/ollama/fs"
 	"github.com/ollama/ollama/kvcache"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/ml/nn"
-	"github.com/ollama/ollama/model"
+	"github.com/ollama/ollama/ml/nn/fast"
 	"github.com/ollama/ollama/model/input"
 )

 type TextOptions struct {
-	hiddenSize, numHeads, numKVHeads, headDim int
-	eps, ropeBase, ropeScale                  float32
-	ropeDim                                   uint32
+	hiddenSize, numHeads, numKVHeads int
+	headDim, ropeDim                 int
+	eps, ropeBase, ropeScale         float32
 }

 type TextModel struct {
-	model.Base
-
 	TokenEmbedding *nn.Embedding `gguf:"token_embd"`
 	Layers         []Layer       `gguf:"blk"`
 	OutputNorm     *nn.RMSNorm   `gguf:"output_norm"`
@@ -39,19 +36,15 @@ type SelfAttention struct {

 func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
 	batchSize := hiddenState.Dim(1)
-	ropeType := uint32(0)
-	headDim := opts.headDim
-	if headDim == 0 {
-		headDim = opts.hiddenSize / opts.numHeads
-	}
+	headDim := cmp.Or(opts.headDim, opts.hiddenSize/opts.numHeads)

 	q := sa.Query.Forward(ctx, hiddenState)
 	q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
-	q = q.RoPE(ctx, positionIDs, nil, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
+	q = fast.RoPE(ctx, q, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale)

 	k := sa.Key.Forward(ctx, hiddenState)
 	k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
-	k = k.RoPE(ctx, positionIDs, nil, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
+	k = fast.RoPE(ctx, k, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale)

 	v := sa.Value.Forward(ctx, hiddenState)
 	v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
@@ -62,7 +55,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
 }

 func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
-	return key.RoPE(ctx, shift, nil, uint32(0), m.ropeDim, m.ropeBase, m.ropeScale), nil
+	return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale), nil
 }

 type MLP struct {
@@ -109,20 +102,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor

 	// image embeddings
 	for _, image := range batch.Multimodal {
-		row := image.Multimodal.(*imageRow)
-		row.parent.dataOnce.Do(func() {
-			// use a new, throwaway context so the image tensor is not added to the graph
-			temp := m.Backend().NewContext()
-			temp.Forward(row.parent.tensor).Compute(row.parent.tensor)
-			row.parent.data = row.parent.tensor.Floats()
-			temp.Close()
-		})
-
-		imageFeature, err := ctx.Input().FromFloatSlice(row.data(), row.shape...)
-		if err != nil {
-			panic(err)
-		}
-
+		imageFeature := image.Multimodal[0].Tensor
 		ctx.Forward(imageFeature.Copy(ctx, hiddenState.View(ctx, image.Index*hiddenState.Stride(1), imageFeature.Dim(0)*imageFeature.Dim(1))))
 	}

@@ -141,24 +121,18 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
 	return m.Output.Forward(ctx, hiddenState)
 }

-func NewTextModel(c fs.Config) (*TextModel, error) {
-	if !strings.EqualFold(c.String("tokenizer.ggml.model"), "gpt2") {
-		return nil, fmt.Errorf("tokenizer %s not yet supported", c.String("tokenizer.ggml.model"))
-	}
-
-	textModel := &TextModel{
+func newTextModel(c fs.Config) *TextModel {
+	return &TextModel{
 		Layers: make([]Layer, c.Uint("block_count")),
 		TextOptions: &TextOptions{
 			hiddenSize: int(c.Uint("embedding_length")),
 			numHeads:   int(c.Uint("attention.head_count")),
 			numKVHeads: int(c.Uint("attention.head_count_kv")),
 			headDim:    int(c.Uint("attention.key_length")),
+			ropeDim:    int(c.Uint("rope.dimension_count")),
 			eps:        c.Float("attention.layer_norm_rms_epsilon"),
 			ropeBase:   c.Float("rope.freq_base"),
 			ropeScale:  c.Float("rope.freq_scale", 1),
-			ropeDim:    c.Uint("rope.dimension_count"),
 		},
 	}
-
-	return textModel, nil
 }
--- a/model/models/mistral3/model_vision.go
+++ b/model/models/mistral3/model_vision.go
@@ -110,15 +110,8 @@ func (m *VisionModel) positionalEmbedding(ctx ml.Context, positionIDs ml.Tensor)
 		}
 	}

-	h, err := ctx.Input().FromFloatSlice(frequenciesHeight, maxPatchesPerSide, frequencies/2)
-	if err != nil {
-		panic(err)
-	}
-
-	w, err := ctx.Input().FromFloatSlice(frequenciesWidth, maxPatchesPerSide, frequencies/2)
-	if err != nil {
-		panic(err)
-	}
+	h := ctx.Input().FromFloatSlice(frequenciesHeight, maxPatchesPerSide, frequencies/2)
+	w := ctx.Input().FromFloatSlice(frequenciesWidth, maxPatchesPerSide, frequencies/2)

 	h = h.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
 	w = w.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
@@ -151,10 +144,7 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor {
 		}
 	}

-	positionIDs, err := ctx.Input().FromIntSlice(positions, len(positions))
-	if err != nil {
-		panic(err)
-	}
+	positionIDs := ctx.Input().FromIntSlice(positions, len(positions))

 	positionEmbedding := m.positionalEmbedding(ctx, positionIDs)
 	cos, sin := positionEmbedding.Cos(ctx), positionEmbedding.Sin(ctx)
@@ -170,7 +160,7 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor {

 func newVisionModel(c fs.Config) *VisionModel {
 	return &VisionModel{
-		Layers: make([]VisionEncoderLayer, c.Uint("vision.block_count", 24)),
+		Layers: make([]VisionEncoderLayer, c.Uint("vision.block_count")),
 		VisionModelOptions: &VisionModelOptions{
 			hiddenSize:       int(c.Uint("vision.embedding_length", 1024)),
 			numHeads:         int(c.Uint("vision.attention.head_count", 16)),
--- a/model/models/mllama/model.go
+++ b/model/models/mllama/model.go
@@ -3,6 +3,7 @@ package mllama
 import (
 	"bytes"
 	"image"
+	"slices"

 	"github.com/ollama/ollama/fs"
 	"github.com/ollama/ollama/kvcache"
@@ -37,13 +38,13 @@ func New(c fs.Config) (model.Model, error) {
 				Values: c.Strings("tokenizer.ggml.tokens"),
 				Types:  c.Ints("tokenizer.ggml.token_type"),
 				Merges: c.Strings("tokenizer.ggml.merges"),
-				BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id")),
 				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
-				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
+				BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
 				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
-				// TODO: set EOT to EOS otherwise 0 will stop generation
-				EOT:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
-				AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
+				EOS: append(
+					[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
+					c.Ints("tokenizer.ggml.eos_token_ids")...,
+				),
 			},
 		),
 		ImageProcessor: newImageProcessor(c),
@@ -58,7 +59,7 @@ func New(c fs.Config) (model.Model, error) {
 	return &m, nil
 }

-func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
+func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
 	if len(m.VisionModel.Transformer.Layers) == 0 || len(m.GlobalTransformer.Layers) == 0 {
 		return nil, model.ErrNoVisionModel
 	}
@@ -73,21 +74,20 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
 		return nil, err
 	}

-	pixelValues, err := ctx.Input().FromFloatSlice(f32s, m.imageSize, m.imageSize, m.numChannels, ratio.numTiles())
-	if err != nil {
-		return nil, err
+	if ratio.numTiles() < m.maxNumTiles {
+		// Pad tiles to maxNumTiles
+		f32s = slices.Grow(f32s, m.imageSize*m.imageSize*m.numChannels*m.maxNumTiles)
+		f32s = f32s[:m.imageSize*m.imageSize*m.numChannels*m.maxNumTiles]
 	}

-	pixelValues = pixelValues.Pad(ctx, 0, 0, 0, m.ImageProcessor.maxNumTiles-ratio.numTiles())
-
-	aspectRatio, err := ctx.Input().FromIntSlice([]int32{int32(ratio.rank)}, 1)
-	if err != nil {
-		return nil, err
-	}
+	pixelValues := ctx.Input().FromFloatSlice(f32s, m.imageSize, m.imageSize, m.numChannels, m.maxNumTiles)
+	aspectRatio := ctx.Input().FromIntSlice([]int32{int32(ratio.rank)}, 1)

 	positionIDs := ctx.Arange(0, 1601, 1, ml.DTypeI32)
 	crossAttentionStates := m.VisionModel.Forward(ctx, pixelValues, positionIDs, aspectRatio)
-	return m.Projector.Forward(ctx, crossAttentionStates), nil
+	projectedOutputs := m.Projector.Forward(ctx, crossAttentionStates)
+
+	return []input.Multimodal{{Tensor: projectedOutputs}}, nil
 }

 func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
@@ -103,18 +103,11 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
 func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
 	var crossAttentionStates ml.Tensor
 	if len(batch.Multimodal) > 0 {
-		crossAttentionStates = batch.Multimodal[len(batch.Multimodal)-1].Multimodal.(ml.Tensor)
+		crossAttentionStates = batch.Multimodal[len(batch.Multimodal)-1].Multimodal[0].Tensor
 	}

-	positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
-	if err != nil {
-		return nil, err
-	}
-
-	outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
-	if err != nil {
-		return nil, err
-	}
+	positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
+	outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))

 	// TODO: attention mask, cross attention mask
 	return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, crossAttentionStates, nil, m.Cache.(*kvcache.WrapperCache)), nil
--- a/model/models/mllama/model_text.go
+++ b/model/models/mllama/model_text.go
@@ -8,6 +8,8 @@ import (
 	"github.com/ollama/ollama/kvcache"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/ml/nn"
+	"github.com/ollama/ollama/ml/nn/fast"
+	"github.com/ollama/ollama/ml/nn/rope"
 )

 type TextSelfAttention struct {
@@ -21,15 +23,14 @@ type TextSelfAttention struct {
 func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
 	batchSize := hiddenState.Dim(1)
 	headDim := opts.hiddenSize / opts.numHeads
-	ropeType := uint32(0)

 	query := sa.Query.Forward(ctx, hiddenState)
 	query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
-	query = query.RoPE(ctx, positions, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
+	query = fast.RoPE(ctx, query, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))

 	key := sa.Key.Forward(ctx, hiddenState)
 	key = key.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
-	key = key.RoPE(ctx, positions, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
+	key = fast.RoPE(ctx, key, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))

 	value := sa.Value.Forward(ctx, hiddenState)
 	value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
@@ -44,7 +45,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.T
 func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
 	// This will only get called for layers in the cache, which are just the self attention layers
 	if sa, ok := m.Transformer.Layers[layer].(*TextSelfAttentionDecoderLayer); ok {
-		return key.RoPE(ctx, shift, sa.SelfAttention.RopeFactors, m.ropeDim, uint32(0), m.ropeBase, m.ropeScale), nil
+		return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale, rope.WithFactors(sa.SelfAttention.RopeFactors)), nil
 	}

 	return key, nil
@@ -199,8 +200,8 @@ func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs,

 type TextModelOptions struct {
 	hiddenSize, numHeads, numKVHeads int
+	ropeDim                          int
 	eps, ropeBase, ropeScale         float32
-	ropeDim                          uint32

 	crossAttentionLayers []int32
 }
@@ -240,10 +241,10 @@ func newTextModel(c fs.Config) *TextModel {
 			hiddenSize:           int(c.Uint("embedding_length")),
 			numHeads:             int(c.Uint("attention.head_count")),
 			numKVHeads:           int(c.Uint("attention.head_count_kv")),
+			ropeDim:              int(c.Uint("rope.dimension_count")),
 			eps:                  c.Float("attention.layer_norm_rms_epsilon"),
 			ropeBase:             c.Float("rope.freq_base"),
 			ropeScale:            c.Float("rope.freq_scale", 1),
-			ropeDim:              c.Uint("rope.dimension_count"),
 			crossAttentionLayers: c.Ints("attention.cross_attention_layers"),
 		},
 	}
--- a/model/models/mllama/model_vision.go
+++ b/model/models/mllama/model_vision.go
@@ -16,8 +16,6 @@ type VisionSelfAttention struct {
 	Key    *nn.Linear `gguf:"attn_k"`
 	Value  *nn.Linear `gguf:"attn_v"`
 	Output *nn.Linear `gguf:"attn_output"`
-
-	Gate ml.Tensor `gguf:"attn_gate"`
 }

 func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
@@ -25,27 +23,16 @@ func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState ml.Tensor, op

 	query := sa.Query.Forward(ctx, hiddenState)
 	query = query.Reshape(ctx, headDim, opts.numHeads, query.Dim(1), batchSize)
-	query = query.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)

 	key := sa.Key.Forward(ctx, hiddenState)
 	key = key.Reshape(ctx, headDim, opts.numHeads, key.Dim(1), batchSize)
-	key = key.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)

 	value := sa.Value.Forward(ctx, hiddenState)
 	value = value.Reshape(ctx, headDim, opts.numHeads, value.Dim(1), batchSize)
-	value = value.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)

-	scores := key.Mulmat(ctx, query)
-	scores = scores.Scale(ctx, 1.0/math.Sqrt(float64(headDim)))
-	scores = scores.Softmax(ctx)
-
-	attention := value.Mulmat(ctx, scores)
-	attention = attention.Reshape(ctx, headDim, attention.Dim(1), opts.numHeads, batchSize)
-	attention = attention.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
+	attention := nn.Attention(ctx, query, key, value, 1./math.Sqrt(float64(headDim)), nil)
 	attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), batchSize)
-
-	hiddenState = sa.Output.Forward(ctx, attention)
-	return hiddenState
+	return sa.Output.Forward(ctx, attention)
 }

 type VisionMLP struct {
@@ -76,21 +63,18 @@ func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState ml.Tensor, opts
 	// self attention
 	hiddenState = e.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
 	hiddenState = e.SelfAttention.Forward(ctx, hiddenState, opts)
-
 	if e.AttentionGate != nil {
 		hiddenState = hiddenState.Mul(ctx, e.AttentionGate)
 	}
 	hiddenState = hiddenState.Add(ctx, residual)
 	residual = hiddenState

-	// feed forward
 	hiddenState = e.MLPNorm.Forward(ctx, hiddenState, opts.eps)
 	hiddenState = e.MLP.Forward(ctx, hiddenState, opts)
-	hiddenState = hiddenState.Add(ctx, residual)
 	if e.MLPGate != nil {
 		hiddenState = hiddenState.Mul(ctx, e.MLPGate)
 	}
-
+	hiddenState = hiddenState.Add(ctx, residual)
 	return hiddenState
 }

--- a/model/models/models.go
+++ b/model/models/models.go
@@ -7,4 +7,7 @@ import (
 	_ "github.com/ollama/ollama/model/models/llama4"
 	_ "github.com/ollama/ollama/model/models/mistral3"
 	_ "github.com/ollama/ollama/model/models/mllama"
+	_ "github.com/ollama/ollama/model/models/qwen2"
+	_ "github.com/ollama/ollama/model/models/qwen25vl"
+	_ "github.com/ollama/ollama/model/models/qwen3"
 )
--- a/model/models/qwen2/model.go
+++ b/model/models/qwen2/model.go
@@ -0,0 +1,164 @@
+package qwen2
+
+import (
+	"cmp"
+	"math"
+
+	"github.com/ollama/ollama/fs"
+	"github.com/ollama/ollama/kvcache"
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/ml/nn"
+	"github.com/ollama/ollama/ml/nn/fast"
+	"github.com/ollama/ollama/ml/nn/rope"
+	"github.com/ollama/ollama/model"
+	"github.com/ollama/ollama/model/input"
+)
+
+type Options struct {
+	hiddenSize, numHeads, numKVHeads int
+	headDim, ropeDim                 int
+	eps, ropeBase, ropeScale         float32
+}
+
+type Attention struct {
+	Query  *nn.Linear `gguf:"attn_q"`
+	Key    *nn.Linear `gguf:"attn_k"`
+	Value  *nn.Linear `gguf:"attn_v"`
+	Output *nn.Linear `gguf:"attn_output"`
+}
+
+func (attn Attention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
+	batchSize := hiddenStates.Dim(1)
+	headDim := cmp.Or(opts.headDim, opts.hiddenSize/opts.numHeads)
+	ropeDim := cmp.Or(opts.ropeDim, headDim)
+
+	query := attn.Query.Forward(ctx, hiddenStates)
+	query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
+
+	key := attn.Key.Forward(ctx, hiddenStates)
+	key = key.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
+
+	value := attn.Value.Forward(ctx, hiddenStates)
+	value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
+
+	query = fast.RoPE(ctx, query, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
+	key = fast.RoPE(ctx, key, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
+
+	attention := nn.Attention(ctx, query, key, value, 1.0/math.Sqrt(float64(headDim)), cache)
+	attention = attention.Reshape(ctx, headDim*opts.numHeads, batchSize)
+
+	return attn.Output.Forward(ctx, attention)
+}
+
+type MLP struct {
+	Gate *nn.Linear `gguf:"ffn_gate"`
+	Up   *nn.Linear `gguf:"ffn_up"`
+	Down *nn.Linear `gguf:"ffn_down"`
+}
+
+func (mlp MLP) Forward(ctx ml.Context, hiddenStates ml.Tensor) ml.Tensor {
+	hiddenStates = mlp.Gate.Forward(ctx, hiddenStates).SILU(ctx).Mul(ctx, mlp.Up.Forward(ctx, hiddenStates))
+	return mlp.Down.Forward(ctx, hiddenStates)
+}
+
+type DecoderLayer struct {
+	AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
+	Attention     *Attention
+	MLPNorm       *nn.RMSNorm `gguf:"ffn_norm"`
+	MLP           *MLP
+}
+
+func (d DecoderLayer) Forward(ctx ml.Context, hiddenStates, positions, outputs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
+	residual := hiddenStates
+
+	hiddenStates = d.AttentionNorm.Forward(ctx, hiddenStates, opts.eps)
+	hiddenStates = d.Attention.Forward(ctx, hiddenStates, positions, cache, opts)
+	if outputs != nil {
+		hiddenStates = hiddenStates.Rows(ctx, outputs)
+		residual = residual.Rows(ctx, outputs)
+	}
+
+	hiddenStates = hiddenStates.Add(ctx, residual)
+	residual = hiddenStates
+
+	hiddenStates = d.MLPNorm.Forward(ctx, hiddenStates, opts.eps)
+	hiddenStates = d.MLP.Forward(ctx, hiddenStates)
+	return hiddenStates.Add(ctx, residual)
+}
+
+type Model struct {
+	model.Base
+	model.BytePairEncoding
+
+	TokenEmbedding *nn.Embedding  `gguf:"token_embd"`
+	Layers         []DecoderLayer `gguf:"blk"`
+	OutputNorm     *nn.RMSNorm    `gguf:"output_norm"`
+	Output         *nn.Linear     `gguf:"output,alt:token_embd"`
+
+	Options
+}
+
+// Forward implements model.Model.
+func (m Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
+	positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
+
+	hiddenStates := m.TokenEmbedding.Forward(ctx, batch.Inputs)
+
+	for i, layer := range m.Layers {
+		m.Cache.SetLayer(i)
+
+		var outputs ml.Tensor
+		if i == len(m.Layers)-1 {
+			outputs = ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
+		}
+
+		hiddenStates = layer.Forward(ctx, hiddenStates, positions, outputs, m.Cache, &m.Options)
+	}
+
+	hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, m.eps)
+	hiddenStates = m.Output.Forward(ctx, hiddenStates)
+	return hiddenStates, nil
+}
+
+func (m Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
+	ropeDim := cmp.Or(m.ropeDim, m.hiddenSize/m.numHeads)
+	return fast.RoPE(ctx, key, shift, ropeDim, m.ropeBase, m.ropeScale, rope.WithTypeNeoX()), nil
+}
+
+func New(c fs.Config) (model.Model, error) {
+	m := Model{
+		Layers: make([]DecoderLayer, c.Uint("block_count")),
+		BytePairEncoding: model.NewBytePairEncoding(
+			c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
+			&model.Vocabulary{
+				Values: c.Strings("tokenizer.ggml.tokens"),
+				Types:  c.Ints("tokenizer.ggml.token_type"),
+				Merges: c.Strings("tokenizer.ggml.merges"),
+				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
+				BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
+				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
+				EOS: append(
+					[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
+					c.Ints("tokenizer.ggml.eos_token_ids")...,
+				),
+			},
+		),
+		Options: Options{
+			hiddenSize: int(c.Uint("embedding_length")),
+			numHeads:   int(c.Uint("attention.head_count")),
+			numKVHeads: int(c.Uint("attention.head_count_kv")),
+			headDim:    int(c.Uint("attention.key_length")),
+			ropeDim:    int(c.Uint("rope.dimension_count")),
+			ropeBase:   c.Float("rope.freq_base"),
+			ropeScale:  c.Float("rope.freq_scale", 1),
+			eps:        c.Float("attention.layer_norm_rms_epsilon"),
+		},
+	}
+
+	m.Cache = kvcache.NewCausalCache(m.Shift)
+	return &m, nil
+}
+
+func init() {
+	model.Register("qwen2", New)
+}
--- a/model/models/qwen25vl/model.go
+++ b/model/models/qwen25vl/model.go
@@ -0,0 +1,150 @@
+package qwen25vl
+
+import (
+	"bytes"
+	"fmt"
+	"image"
+	"slices"
+
+	"github.com/ollama/ollama/fs"
+	"github.com/ollama/ollama/kvcache"
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/model"
+	"github.com/ollama/ollama/model/input"
+)
+
+type Model struct {
+	model.Base
+	model.BytePairEncoding
+
+	*TextModel
+	*VisionModel `gguf:"v,vision"`
+
+	ImageProcessor
+}
+
+// Implement MultimodalProcessor interface
+var _ model.MultimodalProcessor = (*Model)(nil)
+
+func New(c fs.Config) (model.Model, error) {
+	m := &Model{
+		BytePairEncoding: model.NewBytePairEncoding(
+			c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
+			&model.Vocabulary{
+				Values: c.Strings("tokenizer.ggml.tokens"),
+				Types:  c.Ints("tokenizer.ggml.token_type"),
+				Merges: c.Strings("tokenizer.ggml.merges"),
+				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
+				BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
+				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
+				EOS: append(
+					[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
+					c.Ints("tokenizer.ggml.eos_token_ids")...,
+				),
+			},
+		),
+		TextModel:      NewTextModel(c),
+		VisionModel:    newVisionModel(c),
+		ImageProcessor: newImageProcessor(c),
+	}
+
+	m.Cache = kvcache.NewCausalCache(m.TextModel.Shift)
+
+	return m, nil
+}
+
+func (m *Model) PixelValues(ctx ml.Context, multimodalData []byte) (ml.Tensor, *Grid, error) {
+	image, _, err := image.Decode(bytes.NewReader(multimodalData))
+	if err != nil {
+		return nil, nil, err
+	}
+
+	f32s, grid, err := m.ImageProcessor.ProcessImage(image)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	// Calculate tensor dimensions
+	patchDim := m.ImageProcessor.numChannels * m.ImageProcessor.temporalPatchSize *
+		m.ImageProcessor.patchSize * m.ImageProcessor.patchSize
+	numPatches := grid.Temporal * grid.Height * grid.Width
+
+	pixelValues := ctx.Input().FromFloatSlice(f32s, patchDim, numPatches)
+
+	return pixelValues, grid, nil
+}
+
+func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
+	if len(m.VisionModel.Layers) == 0 {
+		return nil, model.ErrNoVisionModel
+	}
+
+	pixels, grid, err := m.PixelValues(ctx, multimodalData)
+	if err != nil {
+		return nil, err
+	}
+
+	visionOutputs := m.VisionModel.Forward(ctx, pixels, grid)
+	return []input.Multimodal{{Tensor: visionOutputs}}, nil
+}
+
+// PostTokenize arranges Qwen-2.5-VL's inputs for the forward pass
+func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
+	var result []input.Input
+
+	var (
+		imageToken       int32 = 151655
+		visionStartToken int32 = 151652
+		visionEndToken   int32 = 151653
+	)
+
+	nImg := 0
+	for _, inp := range inputs {
+		if inp.Multimodal == nil {
+			// If not a multimodal input, add it to the result unchanged
+			result = append(result, inp)
+		} else {
+			// Adding the 'Picture' prefix is a hack, at the time of writing there is no way to prefix
+			// the image tokens with a prompt, so we add a prefix here
+			nImg++
+			pre, err := m.Encode(fmt.Sprintf(" Picture %d: ", nImg), true)
+			if err != nil {
+				return nil, fmt.Errorf("failed to encode image prompt: %w", err)
+			}
+			for i := range pre {
+				result = append(result, input.Input{Token: pre[i]})
+			}
+
+			patchesPerChunk := inp.Multimodal[0].Tensor.Dim(1)
+
+			// First add the vision start token
+			result = append(result, input.Input{Token: visionStartToken})
+
+			// Add the image token with the multimodal tensor data at the first position
+			result = append(result, input.Input{
+				Token:          imageToken,
+				Multimodal:     inp.Multimodal,
+				MultimodalHash: inp.MultimodalHash,
+				SameBatch:      patchesPerChunk,
+			})
+
+			// Add the placeholder tokens for the remaining positions (tokensPerGrid-1)
+			result = append(result, slices.Repeat([]input.Input{{Token: imageToken}}, patchesPerChunk-1)...)
+
+			result = append(result, input.Input{Token: visionEndToken})
+		}
+	}
+
+	return result, nil
+}
+
+func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
+	positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
+	outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
+
+	return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache)
+}
+
+func init() {
+	model.Register("qwen25vl", New)
+}
--- a/model/models/qwen25vl/model_text.go
+++ b/model/models/qwen25vl/model_text.go
@@ -0,0 +1,151 @@
+package qwen25vl
+
+import (
+	"math"
+
+	"github.com/ollama/ollama/fs"
+	"github.com/ollama/ollama/kvcache"
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/ml/nn"
+	"github.com/ollama/ollama/ml/nn/fast"
+	"github.com/ollama/ollama/ml/nn/rope"
+	"github.com/ollama/ollama/model/input"
+)
+
+type TextOptions struct {
+	hiddenSize, numHeads, numKVHeads int
+	ropeDim, originalContextLength   int
+	eps, ropeBase, ropeScale         float32
+}
+
+type TextModel struct {
+	TokenEmbedding *nn.Embedding `gguf:"token_embd"`
+	Layers         []Layer       `gguf:"blk"`
+	OutputNorm     *nn.RMSNorm   `gguf:"output_norm"`
+	Output         *nn.Linear    `gguf:"output,alt:token_embd"`
+
+	*TextOptions
+}
+
+func NewTextModel(c fs.Config) *TextModel {
+	m := TextModel{
+		Layers: make([]Layer, c.Uint("block_count")),
+		TextOptions: &TextOptions{
+			hiddenSize:            int(c.Uint("embedding_length")),
+			numHeads:              int(c.Uint("attention.head_count")),
+			numKVHeads:            int(c.Uint("attention.head_count_kv")),
+			ropeDim:               int(c.Uint("rope.dimension_count", 128)),
+			originalContextLength: int(c.Uint("context_length", 128000)),
+			eps:                   c.Float("attention.layer_norm_rms_epsilon"),
+			ropeBase:              c.Float("rope.freq_base"),
+			ropeScale:             c.Float("rope.freq_scale", 1),
+		},
+	}
+
+	return &m
+}
+
+// SelfAttention implements the multi-head self-attention mechanism
+// with separate projections for query, key, value and output transformations
+type SelfAttention struct {
+	Query  *nn.Linear `gguf:"attn_q"`
+	Key    *nn.Linear `gguf:"attn_k"`
+	Value  *nn.Linear `gguf:"attn_v"`
+	Output *nn.Linear `gguf:"attn_output"`
+}
+
+func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
+	batchSize := hiddenState.Dim(1)
+	headDim := opts.hiddenSize / opts.numHeads
+
+	q := sa.Query.Forward(ctx, hiddenState)
+	q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
+	q = fast.RoPE(ctx, q, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithOriginalContextLength(opts.originalContextLength), rope.WithTypeNeoX())
+
+	k := sa.Key.Forward(ctx, hiddenState)
+	k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
+	k = fast.RoPE(ctx, k, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithOriginalContextLength(opts.originalContextLength), rope.WithTypeNeoX())
+
+	v := sa.Value.Forward(ctx, hiddenState)
+	v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
+
+	scaleFactor := 1.0 / math.Sqrt(float64(headDim))
+	kqv := nn.Attention(ctx, q, k, v, scaleFactor, cache)
+	kqv = kqv.Reshape(ctx, opts.hiddenSize, batchSize)
+
+	return sa.Output.Forward(ctx, kqv)
+}
+
+// Shift applies rotary position embeddings to the key tensor for causal attention caching
+func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
+	return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale, rope.WithOriginalContextLength(m.originalContextLength), rope.WithTypeNeoX()), nil
+}
+
+// MLP implements the feed-forward network component with SwiGLU activation
+type MLP struct {
+	Up   *nn.Linear `gguf:"ffn_up"`
+	Down *nn.Linear `gguf:"ffn_down"`
+	Gate *nn.Linear `gguf:"ffn_gate"`
+}
+
+func (mlp *MLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *TextOptions) ml.Tensor {
+	// Apply SwiGLU activation gating
+	hiddenState = mlp.Gate.Forward(ctx, hiddenState).SILU(ctx).Mul(ctx, mlp.Up.Forward(ctx, hiddenState))
+	// Project back to hidden dimension
+	return mlp.Down.Forward(ctx, hiddenState)
+}
+
+// Layer represents a single transformer layer combining self-attention and feed-forward components
+type Layer struct {
+	AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
+	SelfAttention *SelfAttention
+	MLPNorm       *nn.RMSNorm `gguf:"ffn_norm"`
+	MLP           *MLP
+}
+
+func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
+	// Self-attention branch with residual connection
+	residual := hiddenState
+
+	hiddenState = l.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
+	hiddenState = l.SelfAttention.Forward(ctx, hiddenState, positionIDs, cache, opts)
+
+	// In the final layer (outputs != nil), optimize by pruning to just the token positions
+	// we need logits for.
+	if outputs != nil {
+		hiddenState = hiddenState.Rows(ctx, outputs)
+		residual = residual.Rows(ctx, outputs)
+	}
+
+	hiddenState = hiddenState.Add(ctx, residual)
+	// Feed-forward branch with residual connection
+	residual = hiddenState
+	hiddenState = l.MLPNorm.Forward(ctx, hiddenState, opts.eps)
+	hiddenState = l.MLP.Forward(ctx, hiddenState, opts)
+	return hiddenState.Add(ctx, residual)
+}
+
+func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor, batch input.Batch, cache kvcache.Cache) (ml.Tensor, error) {
+	// Initial token embedding
+	hiddenStates := m.TokenEmbedding.Forward(ctx, inputs).Duplicate(ctx)
+
+	for _, mi := range batch.Multimodal {
+		img := mi.Multimodal[0].Tensor
+		ctx.Forward(img.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), img.Dim(0)*img.Dim(1))))
+	}
+
+	// Process through transformer layers
+	for i, layer := range m.Layers {
+		cache.SetLayer(i)
+
+		var lastLayerOutputs ml.Tensor
+		if i == len(m.Layers)-1 {
+			lastLayerOutputs = outputs
+		}
+
+		hiddenStates = layer.Forward(ctx, hiddenStates, positions, lastLayerOutputs, cache, m.TextOptions)
+	}
+
+	hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, m.eps)
+	return m.Output.Forward(ctx, hiddenStates), nil
+}
--- a/model/models/qwen25vl/model_vision.go
+++ b/model/models/qwen25vl/model_vision.go
@@ -0,0 +1,379 @@
+package qwen25vl
+
+import (
+	"math"
+	"slices"
+
+	"github.com/ollama/ollama/fs"
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/ml/nn"
+)
+
+// We only support batch size of 1
+var batchSize int = 1
+
+func rotateHalf(ctx ml.Context, t ml.Tensor) ml.Tensor {
+	x1 := t.View(ctx, 0, t.Dim(0)/2, t.Stride(1), t.Dim(1), t.Stride(2), t.Dim(2), t.Stride(3), t.Dim(3))
+	x2 := t.View(ctx, t.Stride(0)*t.Dim(0)/2, t.Dim(0)/2, t.Stride(1), t.Dim(1), t.Stride(2), t.Dim(2), t.Stride(3), t.Dim(3)).Contiguous(ctx)
+	return x2.Neg(ctx).Concat(ctx, x1, 0)
+}
+
+func applyRotaryPositionalEmbedding(ctx ml.Context, t, cos, sin ml.Tensor) ml.Tensor {
+	return t.Mul(ctx, cos).Add(ctx, rotateHalf(ctx, t).Mul(ctx, sin))
+}
+
+func blockDiagonalMask(ctx ml.Context, seqLength int, bounds []int, numHeads int) ml.Tensor {
+	// Create a flat slice for the mask (all -inf initially to block all attention)
+	flat := make([]float32, seqLength*seqLength)
+	for i := range flat {
+		flat[i] = float32(math.Inf(-1)) // Negative infinity to block attention
+	}
+
+	// Fill in the mask with zeros for tokens that CAN attend to each other
+	for i := 1; i < len(bounds); i++ {
+		start := bounds[i-1]
+		end := bounds[i]
+
+		// Enable attention within this sequence block by setting values to 0
+		for row := start; row < end; row++ {
+			for col := start; col < end; col++ {
+				idx := row*seqLength + col
+				flat[idx] = 0.0 // 0 allows attention, -inf blocks it
+			}
+		}
+	}
+
+	mask := ctx.Input().FromFloatSlice(flat, seqLength, seqLength)
+
+	// Reshape to match [seqLength, seqLength, 1] for broadcasting
+	mask = mask.Reshape(ctx, seqLength, seqLength, 1)
+
+	return mask
+}
+
+type VisionSelfAttention struct {
+	Query  *nn.Linear `gguf:"attn_q"`
+	Key    *nn.Linear `gguf:"attn_k"`
+	Value  *nn.Linear `gguf:"attn_v"`
+	Output *nn.Linear `gguf:"attn_out"`
+}
+
+func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenStates, cos, sin, mask ml.Tensor, opts *VisionModelOptions) ml.Tensor {
+	query := sa.Query.Forward(ctx, hiddenStates)
+	key := sa.Key.Forward(ctx, hiddenStates)
+	value := sa.Value.Forward(ctx, hiddenStates)
+
+	query = query.Reshape(ctx, opts.headDim, opts.numHeads, query.Dim(1), batchSize)
+	key = key.Reshape(ctx, opts.headDim, opts.numHeads, key.Dim(1), batchSize)
+	value = value.Reshape(ctx, opts.headDim, opts.numHeads, value.Dim(1), batchSize)
+
+	query = applyRotaryPositionalEmbedding(ctx, query, cos, sin)
+	key = applyRotaryPositionalEmbedding(ctx, key, cos, sin)
+
+	// Scale factor for scaled dot-product attention
+	scale := 1.0 / math.Sqrt(float64(opts.headDim))
+
+	// Scaled dot-product attention
+	query = query.Permute(ctx, 0, 2, 1, 3)
+	key = key.Permute(ctx, 0, 2, 1, 3)
+	value = value.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
+	kq := key.MulmatFullPrec(ctx, query)
+	kq = kq.Scale(ctx, scale)
+	if mask != nil {
+		kq = kq.Add(ctx, mask)
+	}
+	kq = kq.Softmax(ctx)
+	kqv := value.Mulmat(ctx, kq)
+	attention := kqv.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
+	attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), batchSize)
+
+	return sa.Output.Forward(ctx, attention)
+}
+
+// VisionMLP implements the multi-layer perceptron
+type VisionMLP struct {
+	Gate *nn.Linear `gguf:"ffn_gate"`
+	Up   *nn.Linear `gguf:"ffn_up"`
+	Down *nn.Linear `gguf:"ffn_down"`
+}
+
+func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *VisionModelOptions) ml.Tensor {
+	// Using activation as specified in config (likely GELU or SiLU/Swish)
+	gateOutput := mlp.Gate.Forward(ctx, hiddenStates)
+	upOutput := mlp.Up.Forward(ctx, hiddenStates)
+	hiddenStates = gateOutput.SILU(ctx).Mul(ctx, upOutput)
+
+	return mlp.Down.Forward(ctx, hiddenStates)
+}
+
+type VisionEncoderLayer struct {
+	Norm1         *nn.RMSNorm `gguf:"ln1"`
+	SelfAttention *VisionSelfAttention
+	Norm2         *nn.RMSNorm `gguf:"ln2"`
+	MLP           *VisionMLP
+}
+
+func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenStates, cos, sin, mask ml.Tensor, opts *VisionModelOptions) ml.Tensor {
+	residual := hiddenStates
+	hiddenStates = e.Norm1.Forward(ctx, hiddenStates, opts.eps)
+	hiddenStates = e.SelfAttention.Forward(ctx, hiddenStates, cos, sin, mask, opts)
+	hiddenStates = hiddenStates.Add(ctx, residual)
+
+	residual = hiddenStates
+	hiddenStates = e.Norm2.Forward(ctx, hiddenStates, opts.eps)
+	hiddenStates = e.MLP.Forward(ctx, hiddenStates, opts)
+	return hiddenStates.Add(ctx, residual)
+}
+
+// VisionModelOptions contains configuration options
+type VisionModelOptions struct {
+	hiddenSize        int
+	numHeads          int
+	headDim           int
+	patchSize         int
+	numChannels       int
+	eps               float32
+	ropeTheta         float32
+	spatialMergeSize  int
+	windowSize        int
+	fullAttnBlocks    []int32
+	temporalPatchSize int
+}
+
+type PatchEmbedding struct {
+	PatchConv0 *nn.Conv2D `gguf:"patch_embd_0"`
+	PatchConv1 *nn.Conv2D `gguf:"patch_embd_1"`
+}
+
+func (pe *PatchEmbedding) Forward(ctx ml.Context, pixelValues ml.Tensor, opts *VisionModelOptions) ml.Tensor {
+	numPatches := pixelValues.Shape()[1]
+
+	// Reshape the input tensor to match the expected dimensions
+	pixelValues = pixelValues.Reshape(ctx, opts.patchSize*opts.patchSize, opts.temporalPatchSize, opts.numChannels, numPatches)
+
+	// Permute the tensor to bring the temporal dimension to the front
+	pixelValues = pixelValues.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
+
+	// Split the tensor into parts for the temporal convolutions
+	in0 := pixelValues.View(ctx, 0, 1, pixelValues.Stride(1), pixelValues.Dim(1), pixelValues.Stride(2), pixelValues.Dim(2), pixelValues.Stride(3), pixelValues.Dim(3)).Contiguous(ctx)
+	in0 = in0.Reshape(ctx, opts.patchSize, opts.patchSize, opts.numChannels, numPatches)
+	in1 := pixelValues.View(ctx, pixelValues.Stride(0), 1, pixelValues.Stride(1), pixelValues.Dim(1), pixelValues.Stride(2), pixelValues.Dim(2), pixelValues.Stride(3), pixelValues.Dim(3)).Contiguous(ctx)
+	in1 = in1.Reshape(ctx, opts.patchSize, opts.patchSize, opts.numChannels, numPatches)
+
+	s0, s1 := opts.patchSize, opts.patchSize // Use full stride
+	p0, p1 := 0, 0                           // padding
+	d0, d1 := 1, 1                           // dilation
+	out0 := pe.PatchConv0.Forward(ctx, in0, s0, s1, p0, p1, d0, d1)
+	out1 := pe.PatchConv1.Forward(ctx, in1, s0, s1, p0, p1, d0, d1)
+
+	// Add the outputs from the two temporal convolutions
+	out := out0.Add(ctx, out1)
+
+	// Reshape the output tensor to match the expected dimensions
+	return out.Reshape(ctx, opts.hiddenSize, numPatches)
+}
+
+// VisionPatchMerger implements patch merging for the Qwen vision model
+type VisionPatchMerger struct {
+	LNQ  *nn.RMSNorm `gguf:"ln_q"`
+	MLP0 *nn.Linear  `gguf:"mlp.0"`
+	MLP2 *nn.Linear  `gguf:"mlp.2"`
+}
+
+// Forward computes patch merging for the vision model
+func (pm *VisionPatchMerger) Forward(ctx ml.Context, visionOutputs ml.Tensor, opts *VisionModelOptions) ml.Tensor {
+	normalized := pm.LNQ.Forward(ctx, visionOutputs, opts.eps)
+
+	hiddenSize := visionOutputs.Dim(0) * (opts.spatialMergeSize * opts.spatialMergeSize)
+
+	// Reshape the normalized output to view the hidden size dimension
+	reshaped := normalized.Reshape(ctx, hiddenSize, normalized.Dim(1)/(opts.spatialMergeSize*opts.spatialMergeSize), batchSize)
+	hidden := pm.MLP0.Forward(ctx, reshaped)
+	activated := hidden.GELU(ctx)
+
+	output := pm.MLP2.Forward(ctx, activated)
+
+	return output
+}
+
+// VisionModel implements the Qwen vision model
+type VisionModel struct {
+	PatchEmbedding *PatchEmbedding
+	Layers         []VisionEncoderLayer `gguf:"blk"`
+	PatchMerger    *VisionPatchMerger   `gguf:"merger"`
+
+	*VisionModelOptions
+}
+
+// Forward computes the vision model for an input tensor
+func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor, grid *Grid) ml.Tensor {
+	// Extract patch embeddings
+	hiddenStates := m.PatchEmbedding.Forward(ctx, pixelValues, m.VisionModelOptions)
+
+	positionEmbedding := m.PositionalEmbedding(ctx, grid)
+
+	windowIndex, bounds := m.WindowIndex(ctx, grid)
+
+	spatialMergeUnit := m.spatialMergeSize * m.spatialMergeSize
+
+	hiddenStates = hiddenStates.Reshape(ctx, hiddenStates.Dim(0)*spatialMergeUnit, hiddenStates.Dim(1)/spatialMergeUnit)
+	hiddenStates = hiddenStates.Rows(ctx, windowIndex)
+	hiddenStates = hiddenStates.Reshape(ctx, hiddenStates.Dim(0)/spatialMergeUnit, hiddenStates.Dim(1)*spatialMergeUnit)
+
+	positionEmbedding = positionEmbedding.Reshape(ctx, positionEmbedding.Dim(0)*spatialMergeUnit, positionEmbedding.Dim(1)/spatialMergeUnit)
+	positionEmbedding = positionEmbedding.Rows(ctx, windowIndex)
+	positionEmbedding = positionEmbedding.Reshape(ctx, positionEmbedding.Dim(0)/spatialMergeUnit, positionEmbedding.Dim(1)*spatialMergeUnit)
+	positionEmbedding = positionEmbedding.Concat(ctx, positionEmbedding, 0)
+
+	cos, sin := positionEmbedding.Cos(ctx), positionEmbedding.Sin(ctx)
+	cos = cos.Reshape(ctx, cos.Dim(0), 1, cos.Dim(1))
+	sin = sin.Reshape(ctx, sin.Dim(0), 1, sin.Dim(1))
+
+	mask := blockDiagonalMask(ctx, hiddenStates.Dim(1), bounds, m.VisionModelOptions.numHeads)
+	// Apply encoder layers
+	for i, layer := range m.Layers {
+		if slices.Contains(m.fullAttnBlocks, int32(i)) {
+			hiddenStates = layer.Forward(ctx, hiddenStates, cos, sin, nil, m.VisionModelOptions)
+		} else {
+			hiddenStates = layer.Forward(
+				ctx,
+				hiddenStates,
+				cos,
+				sin,
+				mask,
+				m.VisionModelOptions,
+			)
+		}
+	}
+
+	hiddenStates = m.PatchMerger.Forward(ctx, hiddenStates, m.VisionModelOptions)
+	reverseWindowIndex := windowIndex.Argsort(ctx)
+	return hiddenStates.Rows(ctx, reverseWindowIndex)
+}
+
+// WindowIndex divides the grid into windows and returns:
+//  1. A tensor containing flattened indices of all grid points organized by windows
+//  2. A slice of boundaries that mark where each window's data begins and ends
+//     in the flattened representation, scaled by spatialMergeSize squared
+//
+// The boundaries slice always starts with 0 and contains cumulative ending
+// positions for each window, allowing downstream processing to identify
+// window boundaries in the tensor data.
+func (m *VisionModel) WindowIndex(ctx ml.Context, grid *Grid) (ml.Tensor, []int) {
+	vitMergerWindowSize := m.windowSize / m.spatialMergeSize / m.patchSize
+
+	llmGridH := grid.Height / m.spatialMergeSize
+	llmGridW := grid.Width / m.spatialMergeSize
+
+	// Calculate window parameters
+	numWindowsH := int(math.Ceil(float64(llmGridH) / float64(vitMergerWindowSize)))
+	numWindowsW := int(math.Ceil(float64(llmGridW) / float64(vitMergerWindowSize)))
+
+	// Initialize index_new slice
+	var index []int32
+
+	// Initialize bounds with the first element as 0
+	bounds := []int{0}
+	totalSeqLen := 0
+
+	// Process each window without padding
+	for wh := range numWindowsH {
+		for ww := range numWindowsW {
+			// Calculate window boundaries
+			hStart := wh * vitMergerWindowSize
+			wStart := ww * vitMergerWindowSize
+			hEnd := min(hStart+vitMergerWindowSize, llmGridH)
+			wEnd := min(wStart+vitMergerWindowSize, llmGridW)
+
+			// Calculate sequence length for this window
+			seqLen := (hEnd - hStart) * (wEnd - wStart)
+
+			// Collect indices for this window
+			for h := hStart; h < hEnd; h++ {
+				for w := wStart; w < wEnd; w++ {
+					index = append(index, int32(h*llmGridW+w))
+				}
+			}
+
+			totalSeqLen += seqLen
+			bounds = append(bounds, totalSeqLen*(m.spatialMergeSize*m.spatialMergeSize)+bounds[0])
+		}
+	}
+
+	t := ctx.Input().FromIntSlice(index, len(index))
+
+	return t, bounds
+}
+
+// PositionalEmbedding generates rotary position embeddings for attention mechanisms
+func (m *VisionModel) PositionalEmbedding(ctx ml.Context, grid *Grid) ml.Tensor {
+	dim := m.headDim / 2
+	freq := dim / 2
+	theta := float64(m.ropeTheta)
+	merge := m.spatialMergeSize
+
+	// Create frequency patterns for position encoding
+	maxGridSize := max(grid.Height, grid.Width)
+	freqVals := make([]float32, freq*maxGridSize)
+	for i := range maxGridSize {
+		for j := range freq {
+			freqVals[i*freq+j] = float32(i) / float32(math.Pow(theta, float64(j*2)/float64(dim)))
+		}
+	}
+	freqs := ctx.Input().FromFloatSlice(freqVals, freq, maxGridSize)
+
+	// Create position coordinates (y,x pairs) for the grid
+	// In PyTorch: Equivalent to generating position ids with torch.arange()
+	coords := make([]int32, 0, grid.Height*grid.Width*2)
+	for y := range grid.Height {
+		for x := range grid.Width {
+			coords = append(coords, int32(y), int32(x))
+		}
+	}
+	pos := ctx.Input().FromIntSlice(coords, 2, grid.Width, grid.Height)
+
+	// Reshape and permute positions to match spatial merging pattern
+	pos = pos.Reshape(ctx, 2, grid.Width, merge, grid.Height/merge)
+	pos = pos.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
+	pos = pos.Reshape(ctx, 2, merge, merge, grid.Width/merge*grid.Height/merge)
+	pos = pos.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
+	pos = pos.Reshape(ctx, 2*merge*merge*grid.Width/merge*grid.Height/merge)
+
+	// Use position indices to look up corresponding frequency values
+	positionalEmbedding := freqs.Rows(ctx, pos)
+	positionalEmbedding = positionalEmbedding.Reshape(ctx, positionalEmbedding.Dim(0)*2, positionalEmbedding.Dim(1)/2)
+	return positionalEmbedding
+}
+
+// newVisionModel creates a new instance of the Qwen vision model
+func newVisionModel(c fs.Config) *VisionModel {
+	patchSize := int(c.Uint("vision.patch_size", 14))
+	hiddenSize := int(c.Uint("vision.embedding_length", 1280))
+	numHeads := int(c.Uint("vision.attention.head_count", 16))
+	numChannels := int(c.Uint("vision.num_channels", 3))
+	eps := c.Float("vision.attention.layer_norm_epsilon", 1e-6)
+	ropeTheta := c.Float("vision.rope.freq_base", 10000.0)
+	spatialMergeSize := int(c.Uint("vision.spatial_merge_size", 2))
+	windowSize := int(c.Uint("vision.window_size", 112))
+	fullAttnBlocks := c.Ints("qwen25vl.vision.fullatt_block_indexes", []int32{7, 15, 23, 31})
+	temporalPatchSize := int(c.Uint("vision.temporal_patch_size", 2))
+
+	model := &VisionModel{
+		Layers: make([]VisionEncoderLayer, c.Uint("vision.block_count", 32)),
+		VisionModelOptions: &VisionModelOptions{
+			hiddenSize:        hiddenSize,
+			numHeads:          numHeads,
+			headDim:           hiddenSize / numHeads,
+			patchSize:         patchSize,
+			numChannels:       numChannels,
+			eps:               eps,
+			ropeTheta:         ropeTheta,
+			spatialMergeSize:  spatialMergeSize,
+			windowSize:        windowSize,
+			temporalPatchSize: temporalPatchSize,
+			fullAttnBlocks:    fullAttnBlocks,
+		},
+	}
+
+	return model
+}
--- a/model/models/qwen25vl/process_image.go
+++ b/model/models/qwen25vl/process_image.go
@@ -0,0 +1,184 @@
+package qwen25vl
+
+import (
+	"fmt"
+	"image"
+	"math"
+
+	"github.com/ollama/ollama/fs"
+	"github.com/ollama/ollama/model/imageproc"
+)
+
+// ImageProcessor contains configuration for the Qwen 2.5 VL image processing
+type ImageProcessor struct {
+	numChannels       int
+	patchSize         int
+	temporalPatchSize int
+	mergeSize         int
+	minPixels         int
+	maxPixels         int
+	factor            int
+	rescaleFactor     float32
+	imageMean         []float32
+	imageStd          []float32
+}
+
+// newImageProcessor creates a new image processor with default values
+func newImageProcessor(c fs.Config) ImageProcessor {
+	patchSize := int(c.Uint("vision.patch_size", 14))
+	mergeSize := int(c.Uint("vision.spatial_merge_size", 2))
+
+	return ImageProcessor{
+		numChannels:       int(c.Uint("vision.num_channels", 3)), // not set
+		patchSize:         patchSize,
+		temporalPatchSize: 2,
+		mergeSize:         mergeSize,
+		minPixels:         56 * 56,
+		maxPixels:         int(c.Uint("vision.max_pixels", 28*28*1280)), // 1MP limit
+		factor:            patchSize * mergeSize,
+		rescaleFactor:     1.0 / 255.0,
+		imageMean:         imageproc.ClipDefaultMean[:],
+		imageStd:          imageproc.ClipDefaultSTD[:],
+	}
+}
+
+// SmartResize implements the smart resize algorithm
+func (p *ImageProcessor) SmartResize(height, width int) (int, int) {
+	factor := p.factor
+
+	if height < factor || width < factor {
+		panic(fmt.Sprintf("height:%d or width:%d must be larger than factor:%d", height, width, factor))
+	} else if aspectRatio := max(height, width) / min(height, width); aspectRatio > 200 {
+		panic(fmt.Sprintf("absolute aspect ratio must be smaller than 200, got %v", aspectRatio))
+	}
+
+	round := func(x float64) int { return int(math.RoundToEven(x)) }
+
+	hBar := round(float64(height)/float64(factor)) * factor
+	wBar := round(float64(width)/float64(factor)) * factor
+
+	if hBar*wBar > p.maxPixels {
+		beta := math.Sqrt(float64(height*width) / float64(p.maxPixels))
+
+		hBar = int(math.Floor(float64(height)/beta/float64(factor))) * factor
+		wBar = int(math.Floor(float64(width)/beta/float64(factor))) * factor
+	} else if hBar*wBar < p.minPixels {
+		beta := math.Sqrt(float64(p.minPixels) / float64(height*width))
+
+		hBar = int(math.Ceil(float64(height)*beta/float64(factor))) * factor
+		wBar = int(math.Ceil(float64(width)*beta/float64(factor))) * factor
+	}
+
+	return hBar, wBar
+}
+
+type Grid struct {
+	Height   int
+	Width    int
+	Temporal int
+}
+
+func (p *ImageProcessor) ProcessImage(img image.Image) ([]float32, *Grid, error) {
+	origWidth := img.Bounds().Dx()
+	origHeight := img.Bounds().Dy()
+
+	// Calculate smart resize dimensions
+	resizedHeight, resizedWidth := p.SmartResize(origHeight, origWidth)
+
+	// Resize image using existing functions
+	resizedImg := imageproc.Resize(img, image.Point{X: resizedWidth, Y: resizedHeight}, imageproc.ResizeBilinear)
+
+	normalizedPixels := imageproc.Normalize(
+		resizedImg,
+		[3]float32{p.imageMean[0], p.imageMean[1], p.imageMean[2]},
+		[3]float32{p.imageStd[0], p.imageStd[1], p.imageStd[2]},
+		true, // rescale
+		true, // channelFirst
+	)
+
+	// Calculate grid dimensions
+	grid := &Grid{
+		Height:   resizedHeight / p.patchSize,
+		Width:    resizedWidth / p.patchSize,
+		Temporal: 1, // For single images, temporal dimension is 1
+	}
+
+	patches, err := p.createPatches(normalizedPixels, resizedHeight, resizedWidth, grid)
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to create patches: %v", err)
+	}
+
+	// Return patches and grid dimensions
+	return patches, grid, nil
+}
+
+func (p *ImageProcessor) createPatches(pixels []float32, height, width int, grid *Grid) ([]float32, error) {
+	channels := p.numChannels
+	patchSize := p.patchSize
+	mergeSize := p.mergeSize
+	temporalPatchSize := p.temporalPatchSize
+
+	// Calculate output dimensions
+	numPatches := grid.Temporal * grid.Height * grid.Width
+	patchDim := channels * temporalPatchSize * patchSize * patchSize
+
+	result := make([]float32, numPatches*patchDim)
+	patchIndex := 0
+
+	// Single temporal frame handling (copies to all frames)
+	for range grid.Temporal {
+		for h := 0; h < grid.Height; h += mergeSize {
+			for w := 0; w < grid.Width; w += mergeSize {
+				// Handle the 2x2 merged patches
+				for mh := range mergeSize {
+					for mw := range mergeSize {
+						baseOffset := patchIndex * patchDim
+
+						// Extract patch data for first temporal frame
+						for c := range channels {
+							channelOffset := baseOffset + (c * temporalPatchSize * patchSize * patchSize)
+
+							for py := range patchSize {
+								for px := range patchSize {
+									// Calculate source pixel coordinates
+									y := (h+mh)*patchSize + py
+									x := (w+mw)*patchSize + px
+
+									// Source index in input tensor (CHW format)
+									srcIdx := c*height*width + y*width + x
+
+									// Destination index in first temporal frame
+									dstIdx := channelOffset + (py * patchSize) + px
+
+									if srcIdx < len(pixels) && dstIdx < len(result) {
+										result[dstIdx] = pixels[srcIdx]
+									}
+								}
+							}
+						}
+
+						// Copy first temporal frame to all other frames
+						if temporalPatchSize > 1 {
+							for c := range channels {
+								channelOffset := baseOffset + (c * temporalPatchSize * patchSize * patchSize)
+								firstFrameOffset := channelOffset
+								frameSize := patchSize * patchSize
+
+								// Copy first frame to all other frames
+								for tp := 1; tp < temporalPatchSize; tp++ {
+									currentFrameOffset := channelOffset + (tp * frameSize)
+									copy(result[currentFrameOffset:currentFrameOffset+frameSize],
+										result[firstFrameOffset:firstFrameOffset+frameSize])
+								}
+							}
+						}
+
+						patchIndex++
+					}
+				}
+			}
+		}
+	}
+
+	return result, nil
+}
--- a/model/models/qwen3/model.go
+++ b/model/models/qwen3/model.go
@@ -0,0 +1,233 @@
+package qwen3
+
+import (
+	"cmp"
+	"math"
+
+	"github.com/ollama/ollama/fs"
+	"github.com/ollama/ollama/kvcache"
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/ml/nn"
+	"github.com/ollama/ollama/ml/nn/fast"
+	"github.com/ollama/ollama/ml/nn/rope"
+	"github.com/ollama/ollama/model"
+	"github.com/ollama/ollama/model/input"
+)
+
+type Options struct {
+	hiddenSize, numHeads, numKVHeads int
+	eps                              float32
+	ropeBase, ropeScale              float32
+
+	keyLength, valueLength int
+
+	numExperts, numExpertsUsed int
+	normTopKProb               bool
+}
+
+func (o Options) headDim() int {
+	return cmp.Or(o.keyLength, o.valueLength, o.hiddenSize/o.numHeads)
+}
+
+type Attention struct {
+	QueryNorm *nn.RMSNorm `gguf:"attn_q_norm"`
+	Query     *nn.Linear  `gguf:"attn_q"`
+	KeyNorm   *nn.RMSNorm `gguf:"attn_k_norm"`
+	Key       *nn.Linear  `gguf:"attn_k"`
+	Value     *nn.Linear  `gguf:"attn_v"`
+	Output    *nn.Linear  `gguf:"attn_output"`
+}
+
+func (sa *Attention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
+	batchSize := hiddenStates.Dim(1)
+
+	query := sa.Query.Forward(ctx, hiddenStates)
+	key := sa.Key.Forward(ctx, hiddenStates)
+	value := sa.Value.Forward(ctx, hiddenStates)
+
+	query = query.Reshape(ctx, opts.headDim(), opts.numHeads, batchSize)
+	key = key.Reshape(ctx, opts.headDim(), opts.numKVHeads, batchSize)
+	value = value.Reshape(ctx, opts.headDim(), opts.numKVHeads, batchSize)
+
+	query = sa.QueryNorm.Forward(ctx, query, opts.eps)
+	key = sa.KeyNorm.Forward(ctx, key, opts.eps)
+
+	query = fast.RoPE(ctx, query, positions, opts.headDim(), opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
+	key = fast.RoPE(ctx, key, positions, opts.headDim(), opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
+
+	attention := nn.Attention(ctx, query, key, value, 1./math.Sqrt(float64(opts.headDim())), cache)
+	attention = attention.Reshape(ctx, attention.Dim(0)*attention.Dim(1), batchSize)
+	return sa.Output.Forward(ctx, attention)
+}
+
+type MLP interface {
+	Forward(ml.Context, ml.Tensor, *Options) ml.Tensor
+}
+
+type sparse struct {
+	Router *nn.Linear `gguf:"ffn_gate_inp"`
+	Gate   *nn.Linear `gguf:"ffn_gate_exps"`
+	Up     *nn.Linear `gguf:"ffn_up_exps"`
+	Down   *nn.Linear `gguf:"ffn_down_exps"`
+}
+
+func (mlp *sparse) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *Options) ml.Tensor {
+	hiddenDim, sequenceLength, batchSize := hiddenStates.Dim(0), hiddenStates.Dim(1), hiddenStates.Dim(2)
+	hiddenStates = hiddenStates.Reshape(ctx, hiddenDim, sequenceLength*batchSize)
+	routerLogits := mlp.Router.Forward(ctx, hiddenStates)
+
+	routingWeights := routerLogits.Softmax(ctx)
+	selectedExperts := routingWeights.TopK(ctx, opts.numExpertsUsed)
+	routingWeights = routingWeights.Reshape(ctx, 1, opts.numExperts, hiddenStates.Dim(1)).Rows(ctx, selectedExperts)
+	if opts.normTopKProb {
+		routingWeights = routingWeights.Reshape(ctx, opts.numExpertsUsed, hiddenStates.Dim(1))
+		routingWeights = routingWeights.Div(ctx, routingWeights.SumRows(ctx))
+		routingWeights = routingWeights.Reshape(ctx, 1, opts.numExpertsUsed, hiddenStates.Dim(1))
+	}
+
+	hiddenStates = hiddenStates.Reshape(ctx, hiddenStates.Dim(0), 1, hiddenStates.Dim(1))
+
+	upStates := mlp.Up.Weight.MulmatID(ctx, hiddenStates, selectedExperts)
+
+	hiddenStates = mlp.Gate.Weight.MulmatID(ctx, hiddenStates, selectedExperts)
+	hiddenStates = hiddenStates.SILU(ctx)
+	hiddenStates = hiddenStates.Mul(ctx, upStates)
+
+	experts := mlp.Down.Weight.MulmatID(ctx, hiddenStates, selectedExperts)
+	experts = experts.Mul(ctx, routingWeights)
+
+	nextStates := experts.View(ctx, 0, experts.Dim(0), experts.Stride(2), experts.Dim(2))
+	for i := 1; i < opts.numExpertsUsed; i++ {
+		nextStates = nextStates.Add(ctx, experts.View(ctx, i*experts.Stride(1), experts.Dim(0), experts.Stride(2), experts.Dim(2)))
+	}
+
+	return nextStates
+}
+
+type dense struct {
+	Gate *nn.Linear `gguf:"ffn_gate"`
+	Up   *nn.Linear `gguf:"ffn_up"`
+	Down *nn.Linear `gguf:"ffn_down"`
+}
+
+func (mlp *dense) Forward(ctx ml.Context, hiddenStates ml.Tensor, _ *Options) ml.Tensor {
+	hiddenStates = mlp.Gate.Forward(ctx, hiddenStates).SILU(ctx).Mul(ctx, mlp.Up.Forward(ctx, hiddenStates))
+	return mlp.Down.Forward(ctx, hiddenStates)
+}
+
+type Layer struct {
+	AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
+	*Attention
+
+	MLPNorm *nn.RMSNorm `gguf:"ffn_norm"`
+	MLP
+}
+
+func (d *Layer) Forward(ctx ml.Context, hiddenStates, positions, outputs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
+	residual := hiddenStates
+	hiddenStates = d.AttentionNorm.Forward(ctx, hiddenStates, opts.eps)
+	hiddenStates = d.Attention.Forward(ctx, hiddenStates, positions, cache, opts)
+
+	if outputs != nil {
+		hiddenStates = hiddenStates.Rows(ctx, outputs)
+		residual = residual.Rows(ctx, outputs)
+	}
+
+	hiddenStates = hiddenStates.Add(ctx, residual)
+
+	residual = hiddenStates
+	hiddenStates = d.MLPNorm.Forward(ctx, hiddenStates, opts.eps)
+	hiddenStates = d.MLP.Forward(ctx, hiddenStates, opts)
+	return hiddenStates.Add(ctx, residual)
+}
+
+type Model struct {
+	model.Base
+	model.BytePairEncoding
+
+	TokenEmbedding *nn.Embedding `gguf:"token_embd"`
+	OutputNorm     *nn.RMSNorm   `gguf:"output_norm"`
+	Output         *nn.Linear    `gguf:"output,alt:token_embd"`
+
+	Layers []Layer `gguf:"blk"`
+
+	*Options
+}
+
+// Forward implements model.Model.
+func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
+	positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
+
+	hiddenStates := m.TokenEmbedding.Forward(ctx, batch.Inputs)
+
+	for i, layer := range m.Layers {
+		m.Cache.SetLayer(i)
+
+		var outputs ml.Tensor
+		if i == len(m.Layers)-1 {
+			outputs = ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
+		}
+
+		hiddenStates = layer.Forward(ctx, hiddenStates, positions, outputs, m.Cache, m.Options)
+	}
+
+	hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, m.eps)
+	return m.Output.Forward(ctx, hiddenStates), nil
+}
+
+func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
+	return fast.RoPE(ctx, key, shift, m.headDim(), m.ropeBase, m.ropeScale, rope.WithTypeNeoX()), nil
+}
+
+var _ model.Model = (*Model)(nil)
+
+func New(c fs.Config) (model.Model, error) {
+	layers := make([]Layer, c.Uint("block_count"))
+	for i := range layers {
+		if c.String("general.architecture") == "qwen3moe" {
+			layers[i].MLP = &sparse{}
+		} else {
+			layers[i].MLP = &dense{}
+		}
+	}
+
+	m := Model{
+		BytePairEncoding: model.NewBytePairEncoding(
+			`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
+			&model.Vocabulary{
+				Values: c.Strings("tokenizer.ggml.tokens"),
+				Types:  c.Ints("tokenizer.ggml.token_type"),
+				Merges: c.Strings("tokenizer.ggml.merges"),
+				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
+				BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
+				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
+				EOS: append(
+					[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
+					c.Ints("tokenizer.ggml.eos_token_ids")...,
+				),
+			},
+		),
+		Layers: layers,
+		Options: &Options{
+			hiddenSize:     int(c.Uint("embedding_length")),
+			numHeads:       int(c.Uint("attention.head_count")),
+			numKVHeads:     int(c.Uint("attention.head_count_kv")),
+			keyLength:      int(c.Uint("attention.key_length")),
+			valueLength:    int(c.Uint("attention.value_length")),
+			eps:            c.Float("attention.layer_norm_rms_epsilon"),
+			ropeBase:       c.Float("rope.freq_base"),
+			ropeScale:      c.Float("rope.freq_scale", 1),
+			numExperts:     int(c.Uint("expert_count")),
+			numExpertsUsed: int(c.Uint("expert_used_count")),
+			normTopKProb:   c.Bool("norm_top_k_prob", true),
+		},
+	}
+
+	m.Cache = kvcache.NewCausalCache(m.Shift)
+	return &m, nil
+}
+
+func init() {
+	model.Register("qwen3", New)
+	model.Register("qwen3moe", New)
+}
--- a/model/process_text_spm.go
+++ b/model/process_text_spm.go
@@ -182,27 +182,12 @@ func (spm SentencePieceModel) Encode(s string, addSpecial bool) ([]int32, error)
 		}
 	}

+	slog.Log(context.TODO(), logutil.LevelTrace, "encoded", "string", s, "ids", ids)
+
 	if addSpecial && len(ids) > 0 {
-		if spm.vocab.AddBOS {
-			if ids[0] == spm.vocab.BOS {
-				slog.Warn("adding bos token to prompt which already has it", "id", spm.vocab.BOS)
-			}
-
-			slog.Debug("adding bos token to prompt", "id", spm.vocab.BOS)
-			ids = append([]int32{spm.vocab.BOS}, ids...)
-		}
-
-		if spm.vocab.AddEOS {
-			if ids[len(ids)-1] == spm.vocab.EOS {
-				slog.Warn("adding eos token to prompt which already has it", "id", spm.vocab.EOS)
-			}
-
-			slog.Debug("adding eos token to prompt", "id", spm.vocab.EOS)
-			ids = append(ids, spm.vocab.EOS)
-		}
+		ids = spm.vocab.addSpecials(ids)
 	}

-	slog.Log(context.TODO(), logutil.LevelTrace, "encoded", "ids", ids)
 	return ids, nil
 }

@@ -261,6 +246,6 @@ func (spm SentencePieceModel) Decode(ids []int32) (string, error) {
 		}
 	}

-	slog.Log(context.TODO(), logutil.LevelTrace, "decoded", "string", sb.String())
+	slog.Log(context.TODO(), logutil.LevelTrace, "decoded", "ids", ids, "string", sb.String())
 	return sb.String(), nil
 }
--- a/model/process_text_spm_test.go
+++ b/model/process_text_spm_test.go
--- a/model/textprocessor.go
+++ b/model/textprocessor.go
@@ -0,0 +1,17 @@
+package model
+
+const (
+	TOKEN_TYPE_NORMAL = iota + 1
+	TOKEN_TYPE_UNKNOWN
+	TOKEN_TYPE_CONTROL
+	TOKEN_TYPE_USER_DEFINED
+	TOKEN_TYPE_UNUSED
+	TOKEN_TYPE_BYTE
+)
+
+type TextProcessor interface {
+	Encode(s string, addSpecial bool) ([]int32, error)
+	Decode([]int32) (string, error)
+	Is(int32, Special) bool
+	Vocabulary() *Vocabulary
+}
--- a/model/vocabulary.go
+++ b/model/vocabulary.go
@@ -0,0 +1,112 @@
+package model
+
+import (
+	"log/slog"
+	"slices"
+	"sync"
+)
+
+type Special int32
+
+const (
+	SpecialBOS Special = iota
+	SpecialEOS
+)
+
+type Vocabulary struct {
+	Values []string
+	Types  []int32
+	Scores []float32
+	Merges []string
+
+	BOS, EOS       []int32
+	AddBOS, AddEOS bool
+
+	specialOnce sync.Once
+	special     []string
+
+	valuesOnce sync.Once
+	values     map[string]int32
+
+	mergeOnce sync.Once
+	merge     map[string]int32
+}
+
+func (v *Vocabulary) Is(id int32, special Special) bool {
+	switch special {
+	case SpecialBOS:
+		return slices.Contains(v.BOS, id)
+	case SpecialEOS:
+		return slices.Contains(v.EOS, id)
+	default:
+		return false
+	}
+}
+
+func (v *Vocabulary) addSpecials(ids []int32) []int32 {
+	if v.AddBOS && len(v.BOS) > 0 {
+		if slices.Contains(v.BOS, ids[0]) {
+			slog.Warn("adding bos token to prompt which already has it", "id", v.BOS)
+		}
+
+		slog.Debug("adding bos token to prompt", "id", v.BOS)
+		ids = append([]int32{v.BOS[0]}, ids...)
+	}
+
+	if v.AddEOS && len(v.EOS) > 0 {
+		if slices.Contains(v.BOS, ids[len(ids)-1]) {
+			slog.Warn("adding eos token to prompt which already has it", "id", v.EOS)
+		}
+
+		slog.Debug("adding eos token to prompt", "id", v.EOS)
+		ids = append(ids, v.EOS[0])
+	}
+
+	return ids
+}
+
+func (v *Vocabulary) Encode(s string) int32 {
+	v.valuesOnce.Do(func() {
+		v.values = make(map[string]int32, len(v.Values))
+		for i, value := range v.Values {
+			v.values[value] = int32(i)
+		}
+	})
+
+	if id, ok := v.values[s]; ok {
+		return id
+	}
+
+	return -1
+}
+
+func (v *Vocabulary) Decode(id int32) string {
+	return v.Values[id]
+}
+
+func (v *Vocabulary) SpecialVocabulary() []string {
+	v.specialOnce.Do(func() {
+		for i := range v.Values {
+			if v.Types[i] == TOKEN_TYPE_CONTROL {
+				v.special = append(v.special, v.Values[i])
+			}
+		}
+	})
+
+	return v.special
+}
+
+func (v *Vocabulary) Merge(left, right string) int {
+	v.mergeOnce.Do(func() {
+		v.merge = make(map[string]int32, len(v.Merges))
+		for i, merge := range v.Merges {
+			v.merge[merge] = int32(i)
+		}
+	})
+
+	if id, ok := v.merge[left+" "+right]; ok {
+		return int(id)
+	}
+
+	return -1
+}
--- a/parser/parser.go
+++ b/parser/parser.go
@@ -292,13 +292,18 @@ func filesForModel(path string) ([]string, error) {
 	}
 	files = append(files, js...)

-	if tks, _ := glob(filepath.Join(path, "tokenizer.model"), "application/octet-stream"); len(tks) > 0 {
-		// add tokenizer.model if it exists, tokenizer.json is automatically picked up by the previous glob
-		// tokenizer.model might be a unresolved git lfs reference; error if it is
-		files = append(files, tks...)
-	} else if tks, _ := glob(filepath.Join(path, "**/tokenizer.model"), "text/plain"); len(tks) > 0 {
-		// some times tokenizer.model is in a subdirectory (e.g. meta-llama/Meta-Llama-3-8B)
-		files = append(files, tks...)
+	// only include tokenizer.model is tokenizer.json is not present
+	if !slices.ContainsFunc(files, func(s string) bool {
+		return slices.Contains(strings.Split(s, string(os.PathSeparator)), "tokenizer.json")
+	}) {
+		if tks, _ := glob(filepath.Join(path, "tokenizer.model"), "application/octet-stream"); len(tks) > 0 {
+			// add tokenizer.model if it exists, tokenizer.json is automatically picked up by the previous glob
+			// tokenizer.model might be a unresolved git lfs reference; error if it is
+			files = append(files, tks...)
+		} else if tks, _ := glob(filepath.Join(path, "**/tokenizer.model"), "text/plain"); len(tks) > 0 {
+			// some times tokenizer.model is in a subdirectory (e.g. meta-llama/Meta-Llama-3-8B)
+			files = append(files, tks...)
+		}
 	}

 	return files, nil
--- a/readline/types.go
+++ b/readline/types.go
@@ -61,6 +61,8 @@ const (
 	ColorGrey    = Esc + "[38;5;245m"
 	ColorDefault = Esc + "[0m"

+	ColorBold = Esc + "[1m"
+
 	StartBracketedPaste = Esc + "[?2004h"
 	EndBracketedPaste   = Esc + "[?2004l"
 )
--- a/runner/common/stop.go
+++ b/runner/common/stop.go
@@ -2,6 +2,8 @@ package common

 import (
 	"strings"
+
+	"github.com/ollama/ollama/llm"
 )

 func FindStop(sequence string, stops []string) (bool, string) {
@@ -29,68 +31,41 @@ func ContainsStopSuffix(sequence string, stops []string) bool {
 // truncateStop removes the provided stop string from pieces,
 // returning the partial pieces with stop removed, including truncating
 // the last piece if required (and signalling if this was the case)
-func TruncateStop(pieces []string, stop string) ([]string, bool) {
-	joined := strings.Join(pieces, "")
-
-	index := strings.Index(joined, stop)
-	if index == -1 {
-		return pieces, false
+func TruncateStop(resps []llm.CompletionResponse, stop string) ([]llm.CompletionResponse, bool) {
+	var sequence string
+	for _, resp := range resps {
+		sequence += resp.Content
 	}

-	joined = joined[:index]
-
-	// Split truncated string back into pieces of original lengths
-	lengths := make([]int, len(pieces))
-	for i, piece := range pieces {
-		lengths[i] = len(piece)
+	idx := strings.Index(sequence, stop)
+	if idx < 0 {
+		return resps, false
 	}

-	var result []string
-	tokenTruncated := false
-	start := 0
-	for _, length := range lengths {
-		if start >= len(joined) {
+	truncated := sequence[:idx]
+	if len(truncated) == 0 {
+		return nil, true
+	}
+
+	result := make([]llm.CompletionResponse, 0, len(resps))
+
+	// Track position in truncated sequence
+	pos := 0
+	truncationHappened := false
+	for _, resp := range resps {
+		if pos >= len(truncated) {
 			break
 		}

-		end := start + length
-		if end > len(joined) {
-			end = len(joined)
-			tokenTruncated = true
+		chunk := truncated[pos:min(pos+len(resp.Content), len(truncated))]
+		if len(chunk) < len(resp.Content) {
+			truncationHappened = true
 		}
-		result = append(result, joined[start:end])
-		start = end
+		if len(chunk) > 0 {
+			result = append(result, llm.CompletionResponse{Content: chunk})
+		}
+		pos += len(resp.Content)
 	}

-	return result, tokenTruncated
-}
-
-func IncompleteUnicode(token string) bool {
-	incomplete := false
-
-	// check if there is incomplete UTF-8 character at the end
-	for i := 1; i < 5 && i <= len(token); i++ {
-		c := token[len(token)-i]
-
-		if (c & 0xc0) == 0x80 {
-			// continuation byte: 10xxxxxx
-			continue
-		}
-
-		if (c & 0xe0) == 0xc0 {
-			// 2-byte character: 110xxxxx ...
-			incomplete = i < 2
-		} else if (c & 0xf0) == 0xe0 {
-			// 3-byte character: 1110xxxx ...
-			incomplete = i < 3
-		} else if (c & 0xf8) == 0xf0 {
-			// 4-byte character: 11110xxx ...
-			incomplete = i < 4
-		}
-
-		// else 1-byte character or invalid byte
-		break
-	}
-
-	return incomplete
+	return result, truncationHappened
 }
--- a/runner/common/stop_test.go
+++ b/runner/common/stop_test.go
@@ -1,51 +1,84 @@
 package common

 import (
+	"fmt"
 	"reflect"
 	"testing"
+
+	"github.com/ollama/ollama/llm"
 )

 func TestTruncateStop(t *testing.T) {
 	tests := []struct {
 		name          string
-		pieces        []string
+		pieces        []llm.CompletionResponse
 		stop          string
-		expected      []string
+		expected      []llm.CompletionResponse
 		expectedTrunc bool
 	}{
 		{
-			name:          "Single word",
-			pieces:        []string{"hello", "world"},
-			stop:          "world",
-			expected:      []string{"hello"},
+			name: "Single word",
+			pieces: []llm.CompletionResponse{
+				{Content: "Hello"},
+				{Content: "world"},
+			},
+			stop: "world",
+			expected: []llm.CompletionResponse{
+				{Content: "Hello"},
+			},
 			expectedTrunc: false,
 		},
 		{
-			name:          "Partial",
-			pieces:        []string{"hello", "wor"},
-			stop:          "or",
-			expected:      []string{"hello", "w"},
+			name: "Partial",
+			pieces: []llm.CompletionResponse{
+				{Content: "Hello"},
+				{Content: " wor"},
+			},
+			stop: "or",
+			expected: []llm.CompletionResponse{
+				{Content: "Hello"},
+				{Content: " w"},
+			},
 			expectedTrunc: true,
 		},
 		{
-			name:          "Suffix",
-			pieces:        []string{"Hello", " there", "!"},
-			stop:          "!",
-			expected:      []string{"Hello", " there"},
+			name: "Suffix",
+			pieces: []llm.CompletionResponse{
+				{Content: "Hello"},
+				{Content: " there"},
+				{Content: "!"},
+			},
+			stop: "!",
+			expected: []llm.CompletionResponse{
+				{Content: "Hello"},
+				{Content: " there"},
+			},
 			expectedTrunc: false,
 		},
 		{
-			name:          "Suffix partial",
-			pieces:        []string{"Hello", " the", "re!"},
-			stop:          "there!",
-			expected:      []string{"Hello", " "},
+			name: "Suffix partial",
+			pieces: []llm.CompletionResponse{
+				{Content: "Hello"},
+				{Content: " the"},
+				{Content: "re!"},
+			},
+			stop: "there!",
+			expected: []llm.CompletionResponse{
+				{Content: "Hello"},
+				{Content: " "},
+			},
 			expectedTrunc: true,
 		},
 		{
-			name:          "Middle",
-			pieces:        []string{"hello", " wor"},
-			stop:          "llo w",
-			expected:      []string{"he"},
+			name: "Middle",
+			pieces: []llm.CompletionResponse{
+				{Content: "Hello"},
+				{Content: " wo"},
+			},
+			stop: "llo w",
+			expected: []llm.CompletionResponse{
+				{Content: "He"},
+			},
 			expectedTrunc: true,
 		},
 	}
@@ -54,76 +87,23 @@ func TestTruncateStop(t *testing.T) {
 		t.Run(tt.name, func(t *testing.T) {
 			result, resultTrunc := TruncateStop(tt.pieces, tt.stop)
 			if !reflect.DeepEqual(result, tt.expected) || resultTrunc != tt.expectedTrunc {
-				t.Errorf("truncateStop(%v, %s): have %v (%v); want %v (%v)", tt.pieces, tt.stop, result, resultTrunc, tt.expected, tt.expectedTrunc)
+				t.Errorf("truncateStop(%v, %v):\n%shave truncated %v\nwant truncated %v",
+					tt.pieces, tt.stop, formatContentDiff(result, tt.expected), resultTrunc, tt.expectedTrunc)
 			}
 		})
 	}
 }

-func TestIncompleteUnicode(t *testing.T) {
-	tests := []struct {
-		name     string
-		input    string
-		expected bool
-	}{
-		{
-			name:     "Basic",
-			input:    "hi",
-			expected: false,
-		},
-		{
-			name:     "Two byte",
-			input:    "hi" + string([]byte{0xc2, 0xa3}),
-			expected: false,
-		},
-		{
-			name:     "Two byte - missing last",
-			input:    "hi" + string([]byte{0xc2}),
-			expected: true,
-		},
-		{
-			name:     "Three byte",
-			input:    "hi" + string([]byte{0xe0, 0xA0, 0x80}),
-			expected: false,
-		},
-		{
-			name:     "Three byte - missing last",
-			input:    "hi" + string([]byte{0xe0, 0xA0}),
-			expected: true,
-		},
-		{
-			name:     "Three byte - missing last 2",
-			input:    "hi" + string([]byte{0xe0}),
-			expected: true,
-		},
-		{
-			name:     "Four byte",
-			input:    "hi" + string([]byte{0xf0, 0x92, 0x8a, 0xb7}),
-			expected: false,
-		},
-		{
-			name:     "Four byte - missing last",
-			input:    "hi" + string([]byte{0xf0, 0x92, 0x8a}),
-			expected: true,
-		},
-		{
-			name:     "Four byte - missing last 2",
-			input:    "hi" + string([]byte{0xf0, 0x92}),
-			expected: true,
-		},
-		{
-			name:     "Four byte - missing last 3",
-			input:    "hi" + string([]byte{0xf0}),
-			expected: true,
-		},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			result := IncompleteUnicode(tt.input)
-			if result != tt.expected {
-				t.Errorf("incompleteUnicode(%s): have %v; want %v", tt.input, result, tt.expected)
-			}
-		})
+func formatContentDiff(result, expected []llm.CompletionResponse) string {
+	var s string
+	for i := 0; i < len(result) || i < len(expected); i++ {
+		if i < len(result) && i < len(expected) && result[i].Content != expected[i].Content {
+			s += fmt.Sprintf("[%d] %q vs %q\n", i, result[i].Content, expected[i].Content)
+		} else if i < len(result) && i >= len(expected) {
+			s += fmt.Sprintf("[%d] extra %q\n", i, result[i].Content)
+		} else if i >= len(result) && i < len(expected) {
+			s += fmt.Sprintf("[%d] missing %q\n", i, expected[i].Content)
+		}
 	}
+	return s
 }
--- a/runner/llamarunner/cache.go
+++ b/runner/llamarunner/cache.go
@@ -104,8 +104,8 @@ func (c *InputCache) LoadCacheSlot(prompt []input, cachePrompt bool) (*InputCach
 	slog.Debug("loading cache slot", "id", slot.Id, "cache", len(slot.Inputs), "prompt", len(prompt),
 		"used", numPast, "remaining", len(prompt)-numPast)

+	slot.Inputs = prompt[:numPast]
 	prompt = prompt[numPast:]
-	slot.Inputs = slot.Inputs[:numPast]

 	return slot, prompt, nil
 }
--- a/runner/llamarunner/runner.go
+++ b/runner/llamarunner/runner.go
@@ -17,7 +17,6 @@ import (
 	"strings"
 	"sync"
 	"time"
-	"unicode/utf8"

 	"golang.org/x/sync/semaphore"

@@ -52,13 +51,13 @@ type Sequence struct {
 	pendingInputs []input

 	// tokens that have been generated but not returned yet (e.g. for stop sequences)
-	pendingResponses []string
+	pendingResponses []llm.CompletionResponse

 	// input cache being used by this sequence
 	cache *InputCacheSlot

 	// channel to send responses over
-	responses chan string
+	responses chan llm.CompletionResponse

 	// channel to stop decoding (such as if the remote connection is closed)
 	quit chan bool
@@ -89,6 +88,19 @@ type Sequence struct {
 	numPromptInputs     int
 }

+func (seq *Sequence) send(resp llm.CompletionResponse) bool {
+	if len(resp.Content) > 0 || resp.Done {
+		select {
+		case seq.responses <- resp:
+			// Successfully sent
+			return true
+		case <-seq.quit:
+			return false
+		}
+	}
+	return true
+}
+
 type NewSequenceParams struct {
 	numPredict     int
 	stop           []string
@@ -147,8 +159,8 @@ func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSe
 		numPromptInputs:     len(inputs),
 		startProcessingTime: startTime,
 		numPredict:          params.numPredict,
-		pendingResponses:    make([]string, 0),
-		responses:           make(chan string, 100),
+		pendingResponses:    make([]llm.CompletionResponse, 0),
+		responses:           make(chan llm.CompletionResponse, 100),
 		quit:                make(chan bool, 1),
 		embedding:           make(chan []float32, 1),
 		samplingCtx:         sc,
@@ -272,36 +284,15 @@ func (s *Server) allNil() bool {
 	return true
 }

-func flushPending(seq *Sequence) bool {
-	joined := strings.Join(seq.pendingResponses, "")
-	seq.pendingResponses = []string{}
-
-	// Check if there are any partial UTF-8 characters remaining.
-	// We already check and queue as we are generating but some may
-	// still make it here:
-	// - Sequence is ending, e.g. generation limit has been hit
-	// - Invalid characters in the middle of a string
-	// This is a stricter check to ensure we never output invalid Unicode.
-	for !utf8.ValidString(joined) {
-		joined = joined[:len(joined)-1]
-	}
-
-	if len(joined) == 0 {
-		return true
-	}
-
-	select {
-	case seq.responses <- joined:
-		return true
-	case <-seq.quit:
-		return false
-	}
-}
-
 func (s *Server) removeSequence(seqIndex int, reason llm.DoneReason) {
 	seq := s.seqs[seqIndex]

-	flushPending(seq)
+	// Send any remaining pending responses
+	for _, resp := range seq.pendingResponses {
+		seq.send(resp)
+	}
+	seq.pendingResponses = []llm.CompletionResponse{}
+
 	seq.doneReason = reason
 	close(seq.responses)
 	close(seq.embedding)
@@ -490,8 +481,11 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)

 		seq.inputs = []input{{token: token}}

-		seq.pendingResponses = append(seq.pendingResponses, piece)
-		sequence := strings.Join(seq.pendingResponses, "")
+		seq.pendingResponses = append(seq.pendingResponses, llm.CompletionResponse{Content: piece})
+		sequence := ""
+		for _, r := range seq.pendingResponses {
+			sequence += r.Content
+		}

 		if ok, stop := common.FindStop(sequence, seq.stop); ok {
 			slog.Debug("hit stop token", "pending", seq.pendingResponses, "stop", stop)
@@ -523,13 +517,13 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 			continue
 		}

-		if common.IncompleteUnicode(sequence) {
-			continue
-		}
-
-		if !flushPending(seq) {
-			s.removeSequence(i, llm.DoneReasonConnectionClosed)
+		for _, resp := range seq.pendingResponses {
+			if !seq.send(resp) {
+				s.removeSequence(i, llm.DoneReasonConnectionClosed)
+				break
+			}
 		}
+		seq.pendingResponses = []llm.CompletionResponse{}
 	}

 	return nil
@@ -627,9 +621,7 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 			return
 		case content, ok := <-seq.responses:
 			if ok {
-				if err := json.NewEncoder(w).Encode(&llm.CompletionResponse{
-					Content: content,
-				}); err != nil {
+				if err := json.NewEncoder(w).Encode(&content); err != nil {
 					http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
 					close(seq.quit)
 					return
--- a/runner/ollamarunner/cache.go
+++ b/runner/ollamarunner/cache.go
@@ -136,8 +136,8 @@ func (c *InputCache) LoadCacheSlot(prompt []input.Input) (*InputCacheSlot, []inp
 	slog.Debug("loading cache slot", "id", slot.Id, "cache", len(slot.Inputs), "prompt", len(prompt),
 		"used", numPast, "remaining", int32(len(prompt))-numPast)

+	slot.Inputs = prompt[:numPast]
 	prompt = prompt[numPast:]
-	slot.Inputs = slot.Inputs[:numPast]

 	return slot, prompt, nil
 }
--- a/runner/ollamarunner/cache_test.go
+++ b/runner/ollamarunner/cache_test.go
@@ -3,7 +3,6 @@ package ollamarunner
 import (
 	"errors"
 	"fmt"
-	"image"
 	"testing"
 	"time"

@@ -12,10 +11,6 @@ import (
 )

 func TestCountCommon(t *testing.T) {
-	imgA := image.NewRGBA(image.Rect(0, 0, 100, 100))
-	imgB := image.NewRGBA(image.Rect(0, 0, 50, 50))
-	imgC := image.NewRGBA(image.Rect(50, 50, 100, 100))
-
 	tests := []struct {
 		name     string
 		t1       []input.Input
@@ -36,20 +31,20 @@ func TestCountCommon(t *testing.T) {
 		},
 		{
 			name:     "Image Prefix",
-			t1:       []input.Input{{Multimodal: imgA, MultimodalHash: 1}},
-			t2:       []input.Input{{Multimodal: imgA, MultimodalHash: 1}, {Multimodal: imgB, MultimodalHash: 2}, {Multimodal: imgC, MultimodalHash: 3}},
+			t1:       []input.Input{{MultimodalHash: 1}},
+			t2:       []input.Input{{MultimodalHash: 1}, {MultimodalHash: 2}, {MultimodalHash: 3}},
 			expected: 1,
 		},
 		{
 			name:     "Mixed",
-			t1:       []input.Input{{Token: 1}, {Multimodal: imgA, MultimodalHash: 1}},
-			t2:       []input.Input{{Token: 1}, {Multimodal: imgA, MultimodalHash: 1}, {Token: 5}},
+			t1:       []input.Input{{Token: 1}, {MultimodalHash: 1}},
+			t2:       []input.Input{{Token: 1}, {MultimodalHash: 1}, {Token: 5}},
 			expected: 2,
 		},
 		{
 			name:     "Mixed, Same Length",
-			t1:       []input.Input{{Token: 1}, {Multimodal: imgA, MultimodalHash: 1}},
-			t2:       []input.Input{{Token: 1}, {Multimodal: imgB, MultimodalHash: 2}},
+			t1:       []input.Input{{Token: 1}, {MultimodalHash: 1}},
+			t2:       []input.Input{{Token: 1}, {MultimodalHash: 2}},
 			expected: 1,
 		},
 		{
--- a/runner/ollamarunner/multimodal.go
+++ b/runner/ollamarunner/multimodal.go
@@ -0,0 +1,113 @@
+package ollamarunner
+
+import (
+	"errors"
+
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/model/input"
+)
+
+// Tensors can't be used across multiple compute graphs. This is a problem
+// if a single embedding is split across batches using views since all of
+// the views will have the same source tensor. We also don't want to
+// recompute the entire embedding for each batch.
+//
+// To avoid this, we compute all of the tensors for the embedding on the
+// first use and then store the result in system memory. When we need
+// additional tensors, we recreate them from the stored data.
+
+// multimodalEntry represents the embeddings of a single object (such
+// as an image).
+type multimodalEntry struct {
+	// mm is the original set of tensors created by EncodeMultimodal
+	mm []input.Multimodal
+
+	// data is the computed result of mm. Nil if not yet computed
+	data [][]float32
+}
+
+// multimodalStore maps from an individual tensor (of which there
+// may be many in a single multimodal object) to its parent embedding
+type multimodalStore map[ml.Tensor]*multimodalEntry
+
+func newMultimodalStore() multimodalStore {
+	return make(multimodalStore)
+}
+
+// addMultimodal stores an embedding for later use in a compute graph
+func (m multimodalStore) addMultimodal(embedding []input.Multimodal) {
+	entry := &multimodalEntry{mm: embedding}
+
+	for _, e := range embedding {
+		if e.Tensor != nil {
+			m[e.Tensor] = entry
+		}
+	}
+}
+
+// getMultimodal takes a source set of tensors (which may contain a whole or
+// parts of one or more images) and returns the equivalent that can be used in
+// the current context
+func (m multimodalStore) getMultimodal(backend ml.Backend, ctx ml.Context, in []input.Multimodal, reserve bool) ([]input.Multimodal, error) {
+	out := make([]input.Multimodal, len(in))
+	for i := range out {
+		if in[i].Tensor != nil {
+			var err error
+			out[i].Tensor, err = m.getTensor(backend, ctx, in[i].Tensor, reserve)
+			if err != nil {
+				return nil, err
+			}
+		}
+
+		out[i].Data = in[i].Data
+	}
+
+	return out, nil
+}
+
+func (m multimodalStore) getTensor(backend ml.Backend, ctx ml.Context, in ml.Tensor, reserve bool) (ml.Tensor, error) {
+	entry := m[in]
+
+	if entry.data == nil {
+		computeCtx := backend.NewContext()
+		defer computeCtx.Close()
+
+		var tensors []ml.Tensor
+		for _, t := range entry.mm {
+			if t.Tensor != nil {
+				tensors = append(tensors, t.Tensor)
+			}
+		}
+
+		if len(tensors) == 0 {
+			return nil, nil
+		}
+
+		computeCtx.Forward(tensors...)
+		entry.data = make([][]float32, len(entry.mm))
+
+		if !reserve {
+			computeCtx.Compute(tensors...)
+
+			for i, t := range entry.mm {
+				if t.Tensor != nil {
+					entry.data[i] = t.Tensor.Floats()
+				}
+			}
+		} else {
+			computeCtx.Reserve()
+		}
+	}
+
+	for i, t := range entry.mm {
+		if in == t.Tensor {
+			if !reserve {
+				return ctx.Input().FromFloatSlice(entry.data[i], t.Tensor.Shape()...), nil
+			} else {
+				return ctx.Input().Empty(t.Tensor.DType(), t.Tensor.Shape()...), nil
+			}
+		}
+	}
+
+	return nil, errors.New("multimodal tensor not found")
+}
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Bruce MacDonald	d5eae8248d	runner: enable returning more info from runner processing Currently we return only the text predicted from the LLM. This was nice in that it was simple, but there may be other info we want to know from the processing. This change adds the ability to return more information from the runner than just the text predicted.	2025-06-13 16:26:57 -07:00
Jeffrey Morgan	9f8a18ec05	tools: loosen tool parsing to allow for more formats (#11030 )	2025-06-12 14:18:54 -07:00
Michael Yang	6b04cad7e8	feat: incremental gguf parser (#10822 ) * incremental gguf parser * gguf: update test to not rely on gguf on disc * re-use existing create gguf * read capabilities from gguf kv * kv exists * update tests * s/doneFunc/successFunc/g * new buffered reader --------- Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>	2025-06-12 11:04:11 -07:00
Michael Yang	45f56355d5	feat: uneven splits (#11048 ) The current splitDim function only operates on tensors that are split evenly which isn't always the case, e.g. a QKV tensor. This change allows the function to be used for arbitrary splits	2025-06-11 12:10:54 -07:00
Michael Yang	0dabb4ef6a	skip tokenizer.model if possible (#11050 ) if tokenizer.json is already copied, skip tokenizer.model	2025-06-11 12:10:35 -07:00
Michael Yang	2e77aa1ae7	use nn.Linear in place of ml.Tensor (#11049 ) while nn.Linear.Forward isn't applicable for sparse MLP, it's still a nice container for the tensors	2025-06-11 12:10:15 -07:00
Attogram Project	deaabe292d	readme: add ollama-multirun to community integrations (#11038 )	2025-06-10 14:14:51 -07:00
Jeffrey Morgan	af21a5ac39	readme: update quickstart link text to Gemma 3	2025-06-10 09:34:23 -07:00
Jeffrey Morgan	f63d7f68eb	readme: update quickstart example to Gemma 3	2025-06-10 09:33:54 -07:00
Daniel Hiltgen	82ad1dbc07	mac: handle "keep" named apps (#11031 ) When a user elects to keep the existing app, the new Ollama is named `Ollama 2.app` This fixes the app startup flow to handle this naming pattern.	2025-06-09 16:29:57 -07:00
Daniel Hiltgen	feeabdadd2	spawn desktop quickly (#11011 ) Give the desktop app a hint to start fast.	2025-06-08 09:34:52 -07:00
Krzysztof Jeziorny	fc0309615e	docs: update link to AMD drivers in linux.md (#10973 )	2025-06-06 23:30:04 -04:00
Jeffrey Morgan	09d308d6b6	Revert "server: add model capabilities to the list endpoint (#10174 )" (#11004 ) This reverts commit `0943001193`.	2025-06-06 23:29:14 -04:00
Daniel Hiltgen	a8ed68bd93	launch app hidden (#10962 ) When starting the app in the background, start it hidden.	2025-06-06 14:06:29 -07:00
Daniel Hiltgen	2ae65ae471	win: handle more than 2048 processes (#10997 ) Fix an array out of bounds crash	2025-06-06 14:06:09 -07:00
Devon Rifkin	a3b6886b7d	move thinking logic into its own package (#10990 ) move thinking logic into its own package	2025-06-06 12:02:20 -07:00
Hunter Wittenborn	c6a6d7294d	docs: fix typo in development.md (#10998 )	2025-06-06 12:07:29 -04:00
Devon Rifkin	2cf007c9d1	Merge pull request #10987 from ollama/drifkin/export-thinking-parser export ThinkingParser	2025-06-05 12:19:14 -07:00
Devon Rifkin	0683efa637	export ThinkingParser	2025-06-05 10:22:32 -07:00
JasonHonKL	0943001193	server: add model capabilities to the list endpoint (#10174 )	2025-06-04 11:39:48 -07:00
HardCodeDev	5c42800fca	readme: add SimpleOllamaUnity to community integrations (#10817 )	2025-05-30 19:50:16 -07:00
Parth Sareen	65f10c2823	tools: resiliency upgrade to name and arg extraction from template (#10917 )	2025-05-30 15:18:09 -07:00
Jesse Gross	aaa7818000	ggml: Export GPU UUIDs This enables matching up devices and information reported by the backend with system management libraries such as nvml to get accurate free memory reporting.	2025-05-29 14:01:26 -07:00
Jesse Gross	f15ffc4320	llm: Make "POST predict" error message more informative "POST predict" basically means that the runner has crashed, which can have many reasons. However, many people think this is a specific error and either report only this message or group together unrelated bugs. This replaces it with a more friendly and helpful message.	2025-05-29 09:41:19 -07:00
Devon Rifkin	5f57b0ef42	add thinking support to the api and cli (#10584 ) - Both `/api/generate` and `/api/chat` now accept a `"think"` option that allows specifying whether thinking mode should be on or not - Templates get passed this new option so, e.g., qwen3's template can put `/think` or `/no_think` in the system prompt depending on the value of the setting - Models' thinking support is inferred by inspecting model templates. The prefix and suffix the parser uses to identify thinking support is also automatically inferred from templates - Thinking control & parsing is opt-in via the API to prevent breaking existing API consumers. If the `"think"` option is not specified, the behavior is unchanged from previous versions of ollama - Add parsing for thinking blocks in both streaming/non-streaming mode in both `/generate` and `/chat` - Update the CLI to make use of these changes. Users can pass `--think` or `--think=false` to control thinking, or during an interactive session they can use the commands `/set think` or `/set nothink` - A `--hidethinking` option has also been added to the CLI. This makes it easy to use thinking in scripting scenarios like `ollama run qwen3 --think --hidethinking "my question here"` where you just want to see the answer but still want the benefits of thinking models	2025-05-28 19:38:52 -07:00
Patrick Devine	aa25aff10d	client: add request signing to the client (#10881 ) If OLLAMA_AUTH is set, sign each request w/ a timestamp and pass the signature in the token header	2025-05-27 16:50:57 -07:00
Jesse Gross	ea79003180	kvcache: Skip computing causal mask for worst case graph reservation Computing an attention mask for a large context and max batch is expensive - over 100ms. Models like Gemma3 that have multiple types of caches and custom attention masks need to do this 4 times, so this adds approximately 500ms to startup time when using 128k context When we are reserving the worst case graph, we don't need the mask, only its shape, so we can skip this.	2025-05-27 14:25:15 -07:00
Kyle Steere	9239a254e0	server: abort download on empty digest Signed-off-by: Kyle Steere <kyle.steere@chainguard.dev>	2025-05-27 11:28:48 -07:00
Parth Sareen	066d0f4746	tools: relax JSON parse constraints for tool calling (#10872 )	2025-05-26 18:59:06 -07:00
Parth Sareen	aea6fb9b58	tools: remove newline stripping (#10869 )	2025-05-26 17:16:00 -07:00
RAPID ARCHITECT	012cf65340	readme: add AWS Strands Agents SDK example to community integrations (#10865 )	2025-05-26 12:05:03 -07:00
Min Yoo	a45231af47	readme: Add macLlama to community integrations (#10790 ) This commit updates the README to include macLlama within the community integrations section. macLlama is a native macOS application built for lightweight and efficient LLM interaction. Key features include: * Lightweight & Native: Designed to be resource-friendly and perform optimally on macOS. * Chat-like Interface: Provides a user-friendly, conversational interface. * Multiple Window Support: Allows users to manage multiple conversations simultaneously. The primary goal of macLlama is to offer a simple and easy-to-run LLM experience on macOS.	2025-05-24 13:18:32 -07:00
Daniel Hiltgen	2307fc2bcd	tests: drop llama3.2-vision embedding tests (#10837 )	2025-05-24 13:17:53 -07:00
frob	6623898198	docs: remove unsupported quantizations (#10842 )	2025-05-24 13:17:26 -07:00
frob	eda472df1b	server: add hint to the error message when model path access fails (#10843 )	2025-05-24 13:17:04 -07:00
Jesse Gross	f18e0cb550	ml: Improve slog formatting for BackendMemory	2025-05-23 20:08:23 -07:00
Parth Sareen	e8b981fa5d	tools: refactor tool call parsing and enable streaming (#10415 )	2025-05-23 14:19:31 -07:00
Parth Sareen	884d26093c	llama: add minimum memory for grammar (#10820 )	2025-05-22 18:53:31 -07:00
Jesse Gross	1f371ea92f	ml: Panic rather than return error on tensor allocation failure FromFloatSlice and FromIntSlice return an error if the shape doesn't match the passed data or if memory can't be allocated. Since these are inputs, the memory being allocated is system memory rather than VRAM. In many cases, the caller can't really handle the error and panics. Empty and Zeros directly panic if they can't allocate memory. This makes things consistent by panicing for the first two cases, removing a fair amount of error handling code. This is also consistent with how Go typically handles these situations.	2025-05-22 14:38:09 -07:00
Jesse Gross	73d6a82cce	ollamarunner: Memory usage reporting This provides granular information about the backend memory allocations required by the runner: - Per backend - Per layer - Weights, cache and graph - Allocation status This can be used for debugging and validating memory estimates.	2025-05-22 14:38:09 -07:00
Jesse Gross	6db8a3771c	ggml: Report graph memory for failed allocations GGML has a function to report the allocated size of a backend buffer. However, this returns 0 if we tried to allocate a buffer and it failed. For memory management purposes, it's important to know how much we were trying to allocate. This extends the API to report attempted sizes for all buffers and whether it succeeeded.	2025-05-22 14:38:09 -07:00
Daniel Hiltgen	d950ff12c0	sched: fix runner leak during reloading unload (#10819 ) When the same model is being reloaded rapidly with client connections being canceled before the model finishes loading, the queued unload event could cause a leak of runners by deleting a different runner from the loaded list.	2025-05-22 14:31:36 -07:00
Michael Yang	adff143bcd	fix: mllama quality (#10807 ) * fix mllama convert - transform attn_gate and ffn_gate - swap attention heads for vision models * fix mllama the mlp gate which was applied in the wrong place	2025-05-22 11:30:49 -07:00
Bruce MacDonald	fbe6ae285a	server: improve tensor quantization fallback logic (#10806 ) Fall back to alternative quantization types when a tensor's dimensions aren't divisible by the block size required for the original desired quantization type. If retried quantization types fail, the system ultimately falls back to F16 (half-precision floating point) which has a block size of 1 and can handle any tensor dimension.	2025-05-22 10:48:08 -07:00
Daniel Hiltgen	fdd4d479a3	integration: add qwen2.5-vl (#10815 ) Replace the older llava model with qwen2.5 for vision tests Skip split-batch test on small VRAM systems to avoid excessive test time	2025-05-22 09:12:32 -07:00
Michael Yang	61aeaf7e81	remove support for multiple ggufs in a single file (#10722 ) * remove support for multiple ggufs in a single file this was an attempt to make it easier to import multimodal models into ollama. this was rarely used and error prone so remove it * fix: create fused model from blob	2025-05-21 13:55:31 -07:00
Daniel Hiltgen	7359b02707	win: detect background upgrade in progress (#10785 ) Give the user a helpful error instead of showing connection refused errors.	2025-05-21 10:46:56 -07:00
Michael Yang	c890011322	feat: port qwen2 model (#10782 )	2025-05-21 10:21:24 -07:00
Michael Yang	e0ed984cde	feat: qwen3 dense and sparse models (#10708 ) * feat: qwen3 dense * feat: qwen3moe * fix llama4 moe	2025-05-21 10:21:07 -07:00
Michael Yang	139f84cf21	fix cmakelists (#10804 ) this fixes an issue introduced in #10788	2025-05-21 09:52:52 -07:00
Michael Yang	375839ea2d	chore: disable debug in binary libraries (#10788 )	2025-05-21 09:39:38 -07:00
Michael Yang	69b2fe9282	fix: qwen25vl assign samebatch in multimodal input (#10789 ) setting samebatch on the vision start token is problematic because it will be shared with other inputs that also use images. this will cause the input to be cached and the runner will not see SameBatch. SameBatch will also be incorrect since it may be for a different image. assigning samebatch to the input tokens resolves this by ensure it's assigned correctly to inputs corresponding to the image. not setting same batch correctly may cause panics during inference since images are no longer guaranteed to be in the same batch.	2025-05-21 09:39:20 -07:00
Michael Yang	9ed8bf14cb	ml: add more rope options (#10775 )	2025-05-20 15:51:08 -07:00
DarkCaster	e6a800ca11	llama: fix incorrect initialization of C.struct_common_sampler_cparams.penalty_present (#10779 )	2025-05-20 10:41:15 -07:00
Michael Yang	ff180c3466	fix llama and mistral3 models (#10774 ) * fix llama model * fix mistral3.1 model do not set default vision layers	2025-05-19 15:06:35 -07:00
Jesse Gross	3fe74fba42	llm: Use first layer as memory buffer in estimation This is a partial revert of `0478d44` "Fixed over vram allcation dure to small initial layer sizes." Previously we used the size of the first layer as an extra reserved amount of space to buffer our memory estimates. The above commit changed this to use the largest layer. However, this had performance impacts on more models than the original commit was trying to fix. There is just a heuristic without an ideal solution so this goes back to the historic behavior. Fixes: #10765, #10756, #10752, #10726	2025-05-19 14:03:34 -07:00
Daniel Hiltgen	1a0cfd080a	avoid kv truncation during create (#10761 )	2025-05-19 13:54:54 -07:00
Jesse Gross	94ab428e3f	ggml: Seperate tensor load from backend creation Currently, when the backend is created, the tensors are loaded at the same time, which is a slow operation. This separates them to be two steps: - Create backend, including enumerating tensors and memory allocation - Loading tensor data This allows more flexibility in managing model loading.	2025-05-19 09:54:22 -07:00
Jesse Gross	d755577473	llm: Estimate projector memory correctly for Ollama engine The Llama engine always places vision projectors on the first GPU if one exists. However, the Ollama engine groups it with the output layer, which means the projector is only offloaded if all other layers are offloaded. The memory estimation code always assumes the former layout - this changes it to use the correct layout based on the engine. This addresses two impacts of the current behavior: - In multi-GPU setups, we can crash with OOM errors when we try to allocate memory on a full GPU while another still has space. - If the vision projector is large, it may prevent us from offloading anything when we could have fit some of the text layers.	2025-05-19 09:52:48 -07:00
Jesse Gross	a2cc8571c5	llm: Consistently track unassigned model data In some cases, if we fail to assign a piece of the model to a GPU then we lose track of this data. Although it doesn't change the memory allocation, it does affect the total size of the model reported by tools such as ollama ps (and also the percent offloaded). This makes it look like setting num_gpu isn't reflected in ollama ps, which isn't true but the offloading percent may appear to not change. Spreading the model across more GPUs will continue to impact the reported total size of the model.	2025-05-19 09:52:48 -07:00
Ronald Wilson	7edfdd2f5f	readme: add TinyNotepad to community integrations (#10763 ) This PR adds Tiny Notepad, a lightweight, notepad-like interface to chat with local LLMs via Ollama. - It’s designed as a simple, distraction-free alternative. - The app supports basic note-taking, timestamped logs, and model parameter controls. - Built with Tkinter, it runs entirely offline and available via PyPI. Aims to provide a lightweight easy to run and install interface for ollama.	2025-05-18 12:43:22 -07:00
Michael Yang	333e360422	model: handle multiple eos tokens (#10577 ) * get eos_token_id from generation_config.json * refactor * include both ids and strings in trace * comments * remove special case for gemma3 special vocab (#10743)	2025-05-16 13:40:23 -07:00
Daniel Hiltgen	27da2cddc5	Fix lingering Q4_0 help reference (#10720 )	2025-05-15 16:33:23 -07:00
Bruce MacDonald	feb8923ada	cmd: add ellipses to truncated show metadata (#10717 ) When a piece of information has been truncated in the show output an ellipses to indicate that more data has not been displayed	2025-05-15 15:45:52 -07:00
Jesse Gross	fe623c2cf4	ollamarunner: Multi-modal worst case graph We currently preallocate compute graph memory for the worst case batch of text tokens. This adds support for doing the same for images. Note that image models are more complicated than text models in how they process their inputs so there may be cases where this approach isn't completely generic for all models. It covers all currently supported models though.	2025-05-15 13:46:20 -07:00
Jesse Gross	3c14461d5d	ollamarunner: Separate text and multimodal graphs For some multimodal models (such as gemma3), we create a single graph that generates the image embedding and then use this in the text model. The embedding tensor is completely opaque to the runner. However, this doesn't work if we need to use the embedding in multiple batches. This can arise if the embedding is larger than the batch size. In these cases (as with llama4), we would like to create views that are more appropriately sized. However, if we do this then the original source tensor is used in multiple graphs, which isn't allowed. To avoid that problem, models with this pattern compute the embedding tensor on first use and recreate the individual views. There is no longer a single vision and text graph. This codifies the pattern of separating vision and text graphs. The logic of computing tensors on demand is moved to the runner, so models no longer have to worry about this. It also gives the runner visibility into the multimodal tensors, which is important for memory management.	2025-05-15 13:46:20 -07:00
Jesse Gross	499ae7311f	ollamarunner: Base cached tokens on current prompt When we restore a sequence from the cache, we split the prompt into the already used tokens (stored in the cache) and new tokens that need to be processed. Currently, the references to the used tokens are coming from the stored previous sequence. However, even though we know that the used tokens are semantically equivalent to the prefix of the prompt, tokens can contain pointers which are no longer valid. As a result, it is better to get the used tokens from the prompt, which has currently valid pointers. This doesn't currently have any impact because it isn't possible to reuse the pointers (which are tensors) anyways. However, it becomes an issue once we can.	2025-05-15 13:46:20 -07:00
Michael Yang	ef202789fa	fix pixel values padding (#10718 ) * panic if trying to pad 4d * fix pixel values padding	2025-05-15 13:44:44 -07:00
Michael Yang	55760195e6	fix mllama conversion (#10716 ) cross attention Q and K projections needs to have their heads swapped, similar to non-cross attention Q and K tensors	2025-05-15 12:15:01 -07:00
Bruce MacDonald	bd68d3ae50	ggml: update qwen25vl vision size estimate (#10711 )	2025-05-14 16:42:30 -07:00
Daniel Hiltgen	ff80718e9c	fix crash in old clients with quantization progress (#10710 ) Older clients assumed the digest was at least 19 characters long so increase the size of the dummy digest to avoid array out of bounds crashes.	2025-05-14 14:54:18 -07:00
Bruce MacDonald	0aa8b371dd	model: add Qwen2.5-VL support (#10385 )	2025-05-13 20:58:02 -07:00