logging: add a new custom logger and trace method

This change addresses excessive debug-level logging in the SPM tokenizer by adding a trace level to slog.
Merge pull request #9741 from ollama/mxyng/visionless
2026-02-08 14:43:05 -05:00 · 2025-03-13 16:10:59 -07:00 · 2025-03-13 15:03:25 -07:00 · 2025-03-13 14:24:27 -07:00 · 2025-03-13 13:59:19 -07:00 · 2025-03-13 13:12:33 -07:00
47 changed files with 2162 additions and 339 deletions
--- a/README.md
+++ b/README.md
@@ -54,6 +54,10 @@ Here are some example models that can be downloaded:

 | Model              | Parameters | Size  | Download                         |
 | ------------------ | ---------- | ----- | -------------------------------- |
+| Gemma 3            | 1B         | 815MB | `ollama run gemma3:1b`           |
+| Gemma 3            | 4B         | 3.3GB | `ollama run gemma3`              |
+| Gemma 3            | 12B        | 8.1GB | `ollama run gemma3:12b`          |
+| Gemma 3            | 27B        | 17GB  | `ollama run gemma3:27b`          |
 | QwQ                | 32B        | 20GB  | `ollama run qwq`                 |
 | DeepSeek-R1        | 7B         | 4.7GB | `ollama run deepseek-r1`         |
 | DeepSeek-R1        | 671B       | 404GB | `ollama run deepseek-r1:671b`    |
@@ -66,9 +70,6 @@ Here are some example models that can be downloaded:
 | Llama 3.1          | 405B       | 231GB | `ollama run llama3.1:405b`       |
 | Phi 4              | 14B        | 9.1GB | `ollama run phi4`                |
 | Phi 4 Mini         | 3.8B       | 2.5GB | `ollama run phi4-mini`           |
-| Gemma 2            | 2B         | 1.6GB | `ollama run gemma2:2b`           |
-| Gemma 2            | 9B         | 5.5GB | `ollama run gemma2`              |
-| Gemma 2            | 27B        | 16GB  | `ollama run gemma2:27b`          |
 | Mistral            | 7B         | 4.1GB | `ollama run mistral`             |
 | Moondream 2        | 1.4B       | 829MB | `ollama run moondream`           |
 | Neural Chat        | 7B         | 4.1GB | `ollama run neural-chat`         |
@@ -571,6 +572,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [llama.cpp](https://github.com/ggerganov/llama.cpp) project founded by Georgi Gerganov.

 ### Observability
+- [Opik](https://www.comet.com/docs/opik/cookbook/ollama) is an open-source platform to debug, evaluate, and monitor your LLM applications, RAG systems, and agentic workflows with comprehensive tracing, automated evaluations, and production-ready dashboards. Opik supports native integration with Ollama.
 - [Lunary](https://lunary.ai/docs/integrations/ollama) is the leading open-source LLM observability platform. It provides a variety of enterprise-grade features such as real-time analytics, prompt templates management, PII masking, and comprehensive agent tracing.
 - [OpenLIT](https://github.com/openlit/openlit) is an OpenTelemetry-native tool for monitoring Ollama Applications & GPUs using traces and metrics.
 - [HoneyHive](https://docs.honeyhive.ai/integrations/ollama) is an AI observability and evaluation platform for AI agents. Use HoneyHive to evaluate agent performance, interrogate failures, and monitor quality in production.
--- a/api/types.go
+++ b/api/types.go
@@ -349,6 +349,7 @@ type ShowResponse struct {
 	Messages      []Message      `json:"messages,omitempty"`
 	ModelInfo     map[string]any `json:"model_info,omitempty"`
 	ProjectorInfo map[string]any `json:"projector_info,omitempty"`
+	Tensors       []Tensor       `json:"tensors,omitempty"`
 	ModifiedAt    time.Time      `json:"modified_at,omitempty"`
 }

@@ -467,6 +468,13 @@ type ModelDetails struct {
 	QuantizationLevel string   `json:"quantization_level"`
 }

+// Tensor describes the metadata for a given tensor.
+type Tensor struct {
+	Name  string   `json:"name"`
+	Type  string   `json:"type"`
+	Shape []uint64 `json:"shape"`
+}
+
 func (m *Metrics) Summary() {
 	if m.TotalDuration > 0 {
 		fmt.Fprintf(os.Stderr, "total duration:       %v\n", m.TotalDuration)
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -18,6 +18,7 @@ import (
 	"os/signal"
 	"path/filepath"
 	"runtime"
+	"sort"
 	"strconv"
 	"strings"
 	"sync/atomic"
@@ -568,8 +569,9 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
 	parameters, errParams := cmd.Flags().GetBool("parameters")
 	system, errSystem := cmd.Flags().GetBool("system")
 	template, errTemplate := cmd.Flags().GetBool("template")
+	verbose, errVerbose := cmd.Flags().GetBool("verbose")

-	for _, boolErr := range []error{errLicense, errModelfile, errParams, errSystem, errTemplate} {
+	for _, boolErr := range []error{errLicense, errModelfile, errParams, errSystem, errTemplate, errVerbose} {
 		if boolErr != nil {
 			return errors.New("error retrieving flags")
 		}
@@ -607,7 +609,7 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
 		return errors.New("only one of '--license', '--modelfile', '--parameters', '--system', or '--template' can be specified")
 	}

-	req := api.ShowRequest{Name: args[0]}
+	req := api.ShowRequest{Name: args[0], Verbose: verbose}
 	resp, err := client.Show(cmd.Context(), &req)
 	if err != nil {
 		return err
@@ -630,10 +632,10 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
 		return nil
 	}

-	return showInfo(resp, os.Stdout)
+	return showInfo(resp, verbose, os.Stdout)
 }

-func showInfo(resp *api.ShowResponse, w io.Writer) error {
+func showInfo(resp *api.ShowResponse, verbose bool, w io.Writer) error {
 	tableRender := func(header string, rows func() [][]string) {
 		fmt.Fprintln(w, " ", header)
 		table := tablewriter.NewWriter(w)
@@ -690,6 +692,45 @@ func showInfo(resp *api.ShowResponse, w io.Writer) error {
 		})
 	}

+	if resp.ModelInfo != nil && verbose {
+		tableRender("Metadata", func() (rows [][]string) {
+			keys := make([]string, 0, len(resp.ModelInfo))
+			for k := range resp.ModelInfo {
+				keys = append(keys, k)
+			}
+			sort.Strings(keys)
+
+			for _, k := range keys {
+				var v string
+				switch vData := resp.ModelInfo[k].(type) {
+				case string:
+					v = vData
+				case float64:
+					v = fmt.Sprintf("%g", vData)
+				case []any:
+					n := 3
+					if len(vData) < n {
+						n = len(vData)
+					}
+					v = fmt.Sprintf("%v", vData[:n])
+				default:
+					v = fmt.Sprintf("%T", vData)
+				}
+				rows = append(rows, []string{"", k, v})
+			}
+			return
+		})
+	}
+
+	if len(resp.Tensors) > 0 && verbose {
+		tableRender("Tensors", func() (rows [][]string) {
+			for _, t := range resp.Tensors {
+				rows = append(rows, []string{"", t.Name, t.Type, fmt.Sprint(t.Shape)})
+			}
+			return
+		})
+	}
+
 	head := func(s string, n int) (rows [][]string) {
 		scanner := bufio.NewScanner(strings.NewReader(s))
 		for scanner.Scan() && (len(rows) < n || n < 0) {
@@ -1196,6 +1237,7 @@ func NewCLI() *cobra.Command {
 	showCmd.Flags().Bool("parameters", false, "Show parameters of a model")
 	showCmd.Flags().Bool("template", false, "Show template of a model")
 	showCmd.Flags().Bool("system", false, "Show system message of a model")
+	showCmd.Flags().BoolP("verbose", "v", false, "Show detailed model information")

 	runCmd := &cobra.Command{
 		Use:     "run MODEL [PROMPT]",
--- a/cmd/cmd_test.go
+++ b/cmd/cmd_test.go
@@ -27,7 +27,7 @@ func TestShowInfo(t *testing.T) {
 				ParameterSize:     "7B",
 				QuantizationLevel: "FP16",
 			},
-		}, &b); err != nil {
+		}, false, &b); err != nil {
 			t.Fatal(err)
 		}

@@ -57,7 +57,7 @@ func TestShowInfo(t *testing.T) {
 				ParameterSize:     "7B",
 				QuantizationLevel: "FP16",
 			},
-		}, &b); err != nil {
+		}, false, &b); err != nil {
 			t.Fatal(err)
 		}

@@ -68,6 +68,56 @@ func TestShowInfo(t *testing.T) {
    embedding length    0       
    quantization        FP16    

+`
+		if diff := cmp.Diff(expect, b.String()); diff != "" {
+			t.Errorf("unexpected output (-want +got):\n%s", diff)
+		}
+	})
+
+	t.Run("verbose model", func(t *testing.T) {
+		var b bytes.Buffer
+		if err := showInfo(&api.ShowResponse{
+			Details: api.ModelDetails{
+				Family:            "test",
+				ParameterSize:     "8B",
+				QuantizationLevel: "FP16",
+			},
+			Parameters: `
+			stop up`,
+			ModelInfo: map[string]any{
+				"general.architecture":    "test",
+				"general.parameter_count": float64(8_000_000_000),
+				"test.context_length":     float64(1000),
+				"test.embedding_length":   float64(11434),
+			},
+			Tensors: []api.Tensor{
+				{Name: "blk.0.attn_k.weight", Type: "BF16", Shape: []uint64{42, 3117}},
+				{Name: "blk.0.attn_q.weight", Type: "FP16", Shape: []uint64{3117, 42}},
+			},
+		}, true, &b); err != nil {
+			t.Fatal(err)
+		}
+
+		expect := `  Model
+    architecture        test     
+    parameters          8B       
+    context length      1000     
+    embedding length    11434    
+    quantization        FP16     
+
+  Parameters
+    stop    up    
+
+  Metadata
+    general.architecture       test     
+    general.parameter_count    8e+09    
+    test.context_length        1000     
+    test.embedding_length      11434    
+
+  Tensors
+    blk.0.attn_k.weight    BF16    [42 3117]    
+    blk.0.attn_q.weight    FP16    [3117 42]    
+
 `
 		if diff := cmp.Diff(expect, b.String()); diff != "" {
 			t.Errorf("unexpected output (-want +got):\n%s", diff)
@@ -89,7 +139,7 @@ func TestShowInfo(t *testing.T) {
 			stop you
 			stop up
 			temperature 99`,
-		}, &b); err != nil {
+		}, false, &b); err != nil {
 			t.Fatal(err)
 		}

@@ -126,7 +176,7 @@ func TestShowInfo(t *testing.T) {
 				"clip.vision.embedding_length": float64(0),
 				"clip.vision.projection_dim":   float64(0),
 			},
-		}, &b); err != nil {
+		}, false, &b); err != nil {
 			t.Fatal(err)
 		}

@@ -159,7 +209,7 @@ func TestShowInfo(t *testing.T) {
 Ahoy, matey!
 Weigh anchor!
 			`,
-		}, &b); err != nil {
+		}, false, &b); err != nil {
 			t.Fatal(err)
 		}

@@ -188,7 +238,7 @@ Weigh anchor!
 				QuantizationLevel: "FP16",
 			},
 			License: license,
-		}, &b); err != nil {
+		}, false, &b); err != nil {
 			t.Fatal(err)
 		}

--- a/cmd/interactive.go
+++ b/cmd/interactive.go
@@ -195,6 +195,10 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 			opts.Messages = []api.Message{}
 			fmt.Printf("Loading model '%s'\n", opts.Model)
 			if err := loadOrUnloadModel(cmd, &opts); err != nil {
+				if strings.Contains(err.Error(), "not found") {
+					fmt.Printf("error: %v\n", err)
+					continue
+				}
 				return err
 			}
 			continue
@@ -343,7 +347,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {

 				switch args[1] {
 				case "info":
-					_ = showInfo(resp, os.Stderr)
+					_ = showInfo(resp, false, os.Stderr)
 				case "license":
 					if resp.License == "" {
 						fmt.Println("No license was specified for this model.")
--- a/convert/convert.go
+++ b/convert/convert.go
@@ -13,8 +13,13 @@ import (
 )

 type ModelParameters struct {
-	Architectures []string `json:"architectures"`
-	VocabSize     uint32   `json:"vocab_size"`
+	Architectures []string       `json:"architectures"`
+	VocabSize     uint32         `json:"vocab_size"`
+	TextModel     TextParameters `json:"text_config"`
+}
+
+type TextParameters struct {
+	VocabSize uint32 `json:"vocab_size"`
 }

 type AdapterParameters struct {
@@ -185,6 +190,8 @@ func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
 		conv = &gemmaModel{}
 	case "Gemma2ForCausalLM":
 		conv = &gemma2Model{}
+	case "Gemma3ForCausalLM", "Gemma3ForConditionalGeneration":
+		conv = &gemma3Model{Architecture: p.Architectures[0]}
 	case "Phi3ForCausalLM":
 		conv = &phi3Model{}
 	case "Qwen2ForCausalLM":
@@ -213,7 +220,14 @@ func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
 	}

 	vocabSize := int(p.VocabSize)
+	if vocabSize == 0 {
+		tVocabSize := int(p.TextModel.VocabSize)
+		vocabSize = tVocabSize
+	}
+
 	switch {
+	case vocabSize == 0:
+		slog.Warn("vocabulary size was not explicitly set by the model", "default size", len(t.Vocabulary.Tokens))
 	case vocabSize > len(t.Vocabulary.Tokens):
 		slog.Warn("vocabulary is smaller than expected, padding with dummy tokens", "expect", vocabSize, "actual", len(t.Vocabulary.Tokens))
 		for i := range vocabSize - len(t.Vocabulary.Tokens) {
--- a/convert/convert_gemma.go
+++ b/convert/convert_gemma.go
@@ -45,7 +45,7 @@ func (p *gemmaModel) KV(t *Tokenizer) ggml.KV {
 func (p *gemmaModel) Tensors(ts []Tensor) []ggml.Tensor {
 	var out []ggml.Tensor
 	for _, t := range ts {
-		if strings.HasSuffix(t.Name(), "_norm.weight") {
+		if !strings.HasPrefix(t.Name(), "v.") && strings.HasSuffix(t.Name(), "_norm.weight") {
 			t.SetRepacker(p.addOne)
 		}

--- a/convert/convert_gemma3.go
+++ b/convert/convert_gemma3.go
@@ -0,0 +1,142 @@
+package convert
+
+import (
+	"cmp"
+
+	"github.com/ollama/ollama/fs/ggml"
+)
+
+type gemma3Model struct {
+	gemmaModel
+	Architecture string
+	TextModel    struct {
+		HeadDim          uint32 `json:"head_dim"`
+		HiddenSize       uint32 `json:"hidden_size"`
+		HiddenLayers     uint32 `json:"num_hidden_layers"`
+		IntermediateSize uint32 `json:"intermediate_size"`
+		SlidingWindow    uint32 `json:"sliding_window"`
+	} `json:"text_config"`
+	VisionModel struct {
+		NumAttentionHeads uint32  `json:"num_attention_heads"` // attention.head_count 16
+		LayerNormEpsilon  float32 `json:"layer_norm_eps"`      // attention.layer_norm_epsilon 1e-05
+		NumHiddenLayers   uint32  `json:"num_hidden_layers"`   // block_count 32
+		HiddenSize        uint32  `json:"hidden_size"`         // embedding_length 1280
+		IntermediateSize  uint32  `json:"intermediate_size"`   // feed_forward_length 5120
+		ImageSize         uint32  `json:"image_size"`          // image_size 560
+		NumChannels       uint32  `json:"num_channels"`        // num_channels 3
+		PatchSize         uint32  `json:"patch_size"`          // patch_size 14
+	} `json:"vision_config"`
+	MaxPositionEmbeddings    uint32  `json:"max_position_embeddings"`
+	NumAttentionHeads        uint32  `json:"num_attention_heads"`
+	NumKeyValueHeads         uint32  `json:"num_key_value_heads"`
+	RMSNormEPS               float32 `json:"rms_norm_eps"`
+	HeadDim                  uint32  `json:"head_dim"`
+	FinalLogitSoftcap        float32 `json:"final_logit_softcapping"`
+	RopeLocalTheta           float32 `json:"rope_local_base_freq"`
+	RopeGlobalTheta          float32 `json:"rope_global_base_freq"`
+	SlidingWindow            uint32  `json:"sliding_window"`
+	MultiModalTokensPerImage uint32  `json:"mm_tokens_per_image"`
+}
+
+const (
+	gemma4BLayerCount  = 34
+	gemma12BLayerCount = 48
+	gemma27BLayerCount = 62
+)
+
+func (p *gemma3Model) KV(t *Tokenizer) ggml.KV {
+	kv := p.ModelParameters.KV(t)
+	kv["general.architecture"] = "gemma3"
+
+	numBlocks := cmp.Or(p.HiddenLayers, p.TextModel.HiddenLayers)
+	kv["gemma3.block_count"] = numBlocks
+
+	var (
+		numHeads   uint32
+		numKVHeads uint32
+	)
+
+	switch numBlocks {
+	case gemma4BLayerCount:
+		numHeads = 8
+		numKVHeads = 4
+	case gemma12BLayerCount:
+		numHeads = 16
+		numKVHeads = 8
+	case gemma27BLayerCount:
+		numHeads = 32
+		numKVHeads = 16
+	default:
+		numHeads = p.NumAttentionHeads
+		numKVHeads = p.NumKeyValueHeads
+	}
+
+	kv["gemma3.attention.head_count"] = numHeads
+	kv["gemma3.attention.head_count_kv"] = numKVHeads
+
+	switch p.Architecture {
+	case "Gemma3ForCausalLM":
+		kv["gemma3.context_length"] = p.MaxPositionEmbeddings
+		kv["gemma3.attention.layer_norm_rms_epsilon"] = p.RMSNormEPS
+		kv["gemma3.attention.key_length"] = p.HeadDim
+		kv["gemma3.attention.value_length"] = p.HeadDim
+		kv["gemma3.attention.sliding_window"] = p.SlidingWindow
+		kv["gemma3.final_logit_softcapping"] = cmp.Or(p.FinalLogitSoftcap, 30)
+		kv["gemma3.rope.local.freq_base"] = cmp.Or(p.RopeLocalTheta, 10000.0)
+		kv["gemma3.rope.global.freq_base"] = cmp.Or(p.RopeGlobalTheta, 1000000.0)
+		kv["gemma3.embedding_length"] = p.HiddenSize
+		kv["gemma3.feed_forward_length"] = p.IntermediateSize
+	default:
+		kv["gemma3.context_length"] = cmp.Or(p.MaxPositionEmbeddings, 131072)
+		kv["gemma3.embedding_length"] = p.TextModel.HiddenSize
+		kv["gemma3.feed_forward_length"] = p.TextModel.IntermediateSize
+		kv["gemma3.attention.sliding_window"] = p.TextModel.SlidingWindow
+		kv["gemma3.vision.block_count"] = p.VisionModel.NumHiddenLayers
+		kv["gemma3.vision.embedding_length"] = p.VisionModel.HiddenSize
+		kv["gemma3.vision.feed_forward_length"] = p.VisionModel.IntermediateSize
+		kv["gemma3.vision.image_size"] = p.VisionModel.ImageSize
+		kv["gemma3.vision.patch_size"] = p.VisionModel.PatchSize
+		kv["gemma3.vision.num_channels"] = cmp.Or(p.VisionModel.NumChannels, 3)
+		kv["gemma3.vision.attention.head_count"] = p.VisionModel.NumAttentionHeads
+		kv["gemma3.vision.attention.layer_norm_epsilon"] = cmp.Or(p.VisionModel.LayerNormEpsilon, 1e-6)
+		kv["gemma3.attention.key_length"] = cmp.Or(p.TextModel.HeadDim, 256)
+		kv["gemma3.attention.value_length"] = cmp.Or(p.TextModel.HeadDim, 256)
+	}
+
+	if p.MultiModalTokensPerImage > 0 {
+		kv["gemma3.mm.tokens_per_image"] = p.MultiModalTokensPerImage
+	}
+
+	return kv
+}
+
+func (p *gemma3Model) Replacements() []string {
+	return []string{
+		"lm_head", "output",
+		"model.embed_tokens", "token_embd",
+		"model.norm", "output_norm",
+		"vision_tower.vision_model.embeddings", "v",
+		"vision_tower.vision_model", "v",
+		"vision_model.vision_model.embeddings", "v",
+		"vision_model.vision_model", "v",
+		"language_model.", "",
+		"model.layers", "blk",
+		"encoder.layers", "blk",
+		"input_layernorm", "attn_norm",
+		"self_attn.q_proj", "attn_q",
+		"self_attn.q_norm", "attn_q_norm",
+		"self_attn.k_proj", "attn_k",
+		"self_attn.k_norm", "attn_k_norm",
+		"self_attn.v_proj", "attn_v",
+		"self_attn.o_proj", "attn_output",
+		"self_attn.out_proj", "attn_output",
+		"mlp.gate_proj", "ffn_gate",
+		"mlp.down_proj", "ffn_down",
+		"mlp.up_proj", "ffn_up",
+		"post_attention_layernorm", "post_attention_norm",
+		"pre_feedforward_layernorm", "ffn_norm",
+		"post_feedforward_layernorm", "post_ffw_norm",
+		"input_projection_weight", "input_projection.weight",
+		"multi_modal_projector", "mm",
+	}
+}
--- a/convert/tokenizer_spm.go
+++ b/convert/tokenizer_spm.go
@@ -6,7 +6,9 @@ import (
 	"errors"
 	"fmt"
 	"io/fs"
+	"log/slog"
 	"os"
+	"reflect"
 	"slices"

 	"google.golang.org/protobuf/proto"
@@ -15,6 +17,8 @@ import (
 )

 func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) {
+	slog.Debug("using spm vocabulary")
+
 	ast, err := parseAdditionalSpecialTokens(fsys)
 	if err != nil {
 		return nil, err
@@ -43,10 +47,19 @@ func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) {
 			v.Types = append(v.Types, int32(t))
 		default:
 			tt := int32(sentencepiece.ModelProto_SentencePiece_NORMAL)
-			if slices.Contains(ast, piece.GetPiece()) {
+
+			// temporary fix to handle gemma3 broken configs
+			if slices.Contains([]string{"<end_of_turn>", "<start_of_turn>"}, piece.GetPiece()) {
 				tt = int32(sentencepiece.ModelProto_SentencePiece_CONTROL)
 			}

+			for _, t := range ast {
+				if t.Content == piece.GetPiece() {
+					tt = int32(sentencepiece.ModelProto_SentencePiece_CONTROL)
+					break
+				}
+			}
+
 			v.Types = append(v.Types, tt)
 		}
 	}
@@ -78,10 +91,16 @@ func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) {
 		return cmp.Compare(i.id, j.id)
 	})

-	n := len(v.Tokens)
-	for i, t := range ts {
-		if t.id != i+n {
-			return nil, fmt.Errorf("invalid token id: %d", t.id)
+	for _, t := range ts {
+		if t.id < len(v.Tokens) {
+			if v.Tokens[t.id] == t.content {
+				slog.Warn("tokenizer", "duplicate token", t.content, "id", t.id)
+				continue
+			}
+			return nil, fmt.Errorf("token mismatch: %s != %s at pos [%d]", t.content, v.Tokens[t.id], t.id)
+		}
+		if t.id != len(v.Tokens) {
+			return nil, fmt.Errorf("invalid token id: [%d] as pos [%d]", t.id, len(v.Tokens))
 		}

 		v.Tokens = append(v.Tokens, t.content)
@@ -92,7 +111,15 @@ func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) {
 	return &v, nil
 }

-func parseAdditionalSpecialTokens(fsys fs.FS) ([]string, error) {
+type specialToken struct {
+	Content    string `json:"content"`
+	Lstrip     bool   `json:"lstrip"`
+	Normalized bool   `json:"normalized"`
+	Rstrip     bool   `json:"rstrip"`
+	SingleWord bool   `json:"single_word"`
+}
+
+func parseAdditionalSpecialTokens(fsys fs.FS) ([]specialToken, error) {
 	f, err := fsys.Open("special_tokens_map.json")
 	if errors.Is(err, os.ErrNotExist) {
 		return nil, nil
@@ -102,12 +129,43 @@ func parseAdditionalSpecialTokens(fsys fs.FS) ([]string, error) {
 	defer f.Close()

 	var m struct {
-		AdditionalSpecialTokens []string `json:"additional_special_tokens"`
+		AdditionalSpecialTokens any `json:"additional_special_tokens"`
 	}

 	if err := json.NewDecoder(f).Decode(&m); err != nil {
 		return nil, err
 	}

-	return m.AdditionalSpecialTokens, nil
+	var ast []specialToken
+
+	switch st := m.AdditionalSpecialTokens.(type) {
+	case []string:
+		for _, s := range st {
+			ast = append(ast, specialToken{Content: s})
+		}
+	case []any:
+		for _, s := range st {
+			// marshal and unmarshal the object to get the special token
+			tMap := s.(map[string]any)
+			data, err := json.Marshal(tMap)
+			if err != nil {
+				return nil, err
+			}
+
+			var token specialToken
+			err = json.Unmarshal(data, &token)
+			if err != nil {
+				return nil, err
+			}
+
+			ast = append(ast, token)
+		}
+
+	default:
+		slog.Warn("special token", "unknown token", reflect.TypeOf(st))
+	}
+
+	slog.Debug("spm tokenizer", "additional tokens", ast)
+
+	return ast, nil
 }
--- a/docs/linux.md
+++ b/docs/linux.md
@@ -75,7 +75,7 @@ RestartSec=3
 Environment="PATH=$PATH"

 [Install]
-WantedBy=default.target
+WantedBy=multi-user.target
 ```

 Then start the service:
--- a/fs/ggml/ggml.go
+++ b/fs/ggml/ggml.go
@@ -124,6 +124,19 @@ func (kv KV) Uints(key string, defaultValue ...[]uint32) []uint32 {
 	return s
 }

+func (kv KV) Floats(key string, defaultValue ...[]float32) []float32 {
+	r := keyValue(kv, key, &array{})
+	s := make([]float32, r.size)
+	for i := range r.size {
+		s[i] = float32(r.values[i].(float32))
+	}
+	return s
+}
+
+func (kv KV) OllamaEngineRequired() bool {
+	return kv.Architecture() == "gemma3"
+}
+
 func keyValue[T string | uint32 | uint64 | float32 | *array | bool](kv KV, key string, defaultValue ...T) T {
 	if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
 		key = kv.Architecture() + "." + key
@@ -314,6 +327,10 @@ func (t Tensor) Size() uint64 {
 	return t.parameters() * t.typeSize() / t.blockSize()
 }

+func (t Tensor) Type() string {
+	return fileType(t.Kind).String()
+}
+
 type container interface {
 	Name() string
 	Decode(io.ReadSeeker) (model, error)
@@ -476,7 +493,7 @@ func (f GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partialO
 			// vocab graph
 			4*batch*(embedding+vocab)+embedding*vocab*105/128,
 		)
-	case "gemma", "gemma2":
+	case "gemma", "gemma2", "gemma3":
 		fullOffload = max(
 			4*batch*(embedding+vocab),
 			4*batch*(2+context+context*heads+2*embedding+2*embeddingHeadsK*heads),
--- a/kvcache/causal.go
+++ b/kvcache/causal.go
@@ -21,9 +21,10 @@ type shiftFn func(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, e
 type Causal struct {
 	DType      ml.DType
 	Capacity   int32
-	causal     bool
 	windowSize int32

+	opts CausalOptions
+
 	// config controls mostly backend-specific optimizations
 	config *ml.CacheConfig

@@ -79,7 +80,6 @@ type cellRange struct {

 func NewCausalCache(shift shiftFn) *Causal {
 	return &Causal{
-		causal:     true,
 		windowSize: math.MaxInt32,
 		shiftFn:    shift,
 		ctxs:       make(map[int]ml.Context),
@@ -90,7 +90,6 @@ func NewCausalCache(shift shiftFn) *Causal {

 func NewSWACache(windowSize int32, shift shiftFn) *Causal {
 	return &Causal{
-		causal:     true,
 		windowSize: windowSize,
 		shiftFn:    shift,
 		ctxs:       make(map[int]ml.Context),
@@ -145,6 +144,7 @@ func (c *Causal) StartForward(ctx ml.Context, opts input.Options) error {
 	c.curBatchSize = len(opts.Positions)
 	c.curSequences = opts.Sequences
 	c.curPositions = opts.Positions
+	c.opts.Except = nil

 	var err error
 	c.curLoc, err = c.findStartLoc()
@@ -235,9 +235,10 @@ func (c *Causal) buildMask(ctx ml.Context) (ml.Tensor, error) {
 	mask := make([]float32, batchSize*length)

 	for i := range c.curBatchSize {
+		enabled := !slices.Contains(c.opts.Except, i)
 		for j := c.curCellRange.min; j <= c.curCellRange.max; j++ {
 			if !slices.Contains(c.cells[j].sequences, c.curSequences[i]) ||
-				(c.causal && c.cells[j].pos > c.curPositions[i]) ||
+				(enabled && c.cells[j].pos > c.curPositions[i]) ||
 				c.cells[j].pos < c.curPositions[i]-c.windowSize {
 				mask[i*length+(j-c.curCellRange.min)] = float32(math.Inf(-1))
 			}
@@ -404,15 +405,16 @@ func (c *Causal) SetLayer(layer int) {
 	c.curLayer = layer
 }

-// SetCausal enables or disables causal mask generation for subsequent calls to Get.
-// This state carries over to future forward passes. The default value is true.
-//
-// ctx may be set to nil if this is called from outside of a forward pass, for
-// example, when initializing the cache.
-func (c *Causal) SetCausal(ctx ml.Context, causal bool) {
-	if c.causal != causal {
-		c.causal = causal
+type CausalOptions struct {
+	// Except lists the indices in the current batch for which the causal mask is not generated
+	Except []int
+}

+// SetCausal disables causal mask generation for a particular range of indices in
+// the current batch for subsequent calls to Get. The state resets for the next forward pass.
+func (c *Causal) SetCausal(ctx ml.Context, opts CausalOptions) {
+	if !slices.Equal(c.opts.Except, opts.Except) {
+		c.opts = opts
 		if ctx != nil {
 			var err error
 			c.curMask, err = c.buildMask(ctx)
--- a/kvcache/causal_test.go
+++ b/kvcache/causal_test.go
@@ -441,11 +441,19 @@ func (t *testTensor) Scale(ctx ml.Context, s float64) ml.Tensor {
 	panic("not implemented")
 }

+func (t *testTensor) AvgPool1D(ctx ml.Context, k, s, p int) ml.Tensor {
+	panic("not implemented")
+}
+
+func (t *testTensor) AvgPool2D(ctx ml.Context, k, s int, p float32) ml.Tensor {
+	panic("not implemented")
+}
+
 func (t *testTensor) Conv2D(ctx ml.Context, weight ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor {
 	panic("not implemented")
 }

-func (t *testTensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, dim uint32, base, scale float32) ml.Tensor {
+func (t *testTensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, dim, ropeType uint32, base, scale float32) ml.Tensor {
 	panic("not implemented")
 }

@@ -495,6 +503,10 @@ func (t *testTensor) Contiguous(ctx ml.Context) ml.Tensor {
 	panic("not implemented")
 }

+func (t *testTensor) Set(ctx ml.Context, t2 ml.Tensor, offset int, strides ...int) ml.Tensor {
+	panic("not implemented")
+}
+
 func (t *testTensor) Pad(ctx ml.Context, shape ...int) ml.Tensor {
 	panic("not implemented")
 }
--- a/llama/patches/0020-ollama-debug-tensor.patch
+++ b/llama/patches/0020-ollama-debug-tensor.patch
@@ -0,0 +1,33 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Michael Yang <mxyng@pm.me>
+Date: Sun, 9 Mar 2025 14:44:16 -0700
+Subject: [PATCH] ollama debug tensor
+
+---
+ ggml/src/ggml-cpu/ggml-cpu.c | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
+index 2f606d82..ec60e8fc 100644
+--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
+@@ -11,6 +11,8 @@
+ #include "ggml-threading.h"
+ #include "ggml.h"
+ 
+#include "ollama-debug.h"
+
+ #if defined(_MSC_VER) || defined(__MINGW32__)
+ #include <malloc.h> // using malloc.h with MSC/MINGW
+ #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
+@@ -14103,6 +14105,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
+ 
+         ggml_compute_forward(&params, node);
+ 
+#ifdef OLLAMA_DEBUG
+        ollama_debug(node, true);
+#endif
+
+         if (state->ith == 0 && cplan->abort_callback &&
+                 cplan->abort_callback(cplan->abort_callback_data)) {
+             atomic_store_explicit(&tp->abort, node_n + 1, memory_order_relaxed);
--- a/llm/server.go
+++ b/llm/server.go
@@ -271,7 +271,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a

 	var llamaModel *llama.Model
 	var textProcessor model.TextProcessor
-	if envconfig.NewEngine() {
+	if envconfig.NewEngine() || f.KV().OllamaEngineRequired() {
 		textProcessor, err = model.NewTextProcessor(modelPath)
 		if err != nil {
 			// To prepare for opt-out mode, instead of treating this as an error, we fallback to the old runner
--- a/logging/log.go
+++ b/logging/log.go
@@ -0,0 +1,40 @@
+package logging
+
+import (
+	"context"
+	"log/slog"
+	"os"
+)
+
+const LevelTrace slog.Level = slog.LevelDebug - 4
+
+type Logger struct {
+	logger *slog.Logger
+}
+
+func NewLogger() *Logger {
+	handler := slog.NewTextHandler(os.Stdout, nil)
+	return &Logger{
+		logger: slog.New(handler),
+	}
+}
+
+func (l *Logger) Trace(msg string, args ...any) {
+	l.logger.Log(context.Background(), LevelTrace, msg, args...)
+}
+
+func (l *Logger) Debug(msg string, args ...any) {
+	l.logger.Debug(msg, args...)
+}
+
+func (l *Logger) Info(msg string, args ...any) {
+	l.logger.Info(msg, args...)
+}
+
+func (l *Logger) Warn(msg string, args ...any) {
+	l.logger.Warn(msg, args...)
+}
+
+func (l *Logger) Error(msg string, args ...any) {
+	l.logger.Error(msg, args...)
+}
--- a/ml/backend.go
+++ b/ml/backend.go
@@ -5,6 +5,7 @@ import (
 	"encoding/binary"
 	"fmt"
 	"os"
+	"slices"
 	"strconv"
 	"strings"
 )
@@ -18,6 +19,7 @@ type Config interface {

 	Strings(string, ...[]string) []string
 	Uints(string, ...[]uint32) []uint32
+	Floats(string, ...[]float32) []float32
 }

 type Backend interface {
@@ -133,8 +135,10 @@ type Tensor interface {
 	RMSNorm(ctx Context, weight Tensor, eps float32) Tensor
 	Scale(ctx Context, s float64) Tensor

+	AvgPool2D(ctx Context, k, s int, p float32) Tensor
 	Conv2D(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
-	RoPE(ctx Context, positionIDs, ropeFactors Tensor, dim uint32, base, scale float32) Tensor
+
+	RoPE(ctx Context, positionIDs, ropeFactors Tensor, dim, ropeType uint32, base, scale float32) Tensor

 	Tanh(ctx Context) Tensor
 	GELU(ctx Context) Tensor
@@ -144,6 +148,7 @@ type Tensor interface {
 	View(ctx Context, offset int, shape ...int) Tensor
 	Permute(ctx Context, shape ...int) Tensor
 	Contiguous(ctx Context) Tensor
+	Set(ctx Context, t2 Tensor, offset int, strides ...int) Tensor

 	Pad(ctx Context, shape ...int) Tensor
 	Unpad(ctx Context, shape ...int) Tensor
@@ -241,16 +246,17 @@ func dump[S ~[]E, E number](ctx Context, t Tensor, items int, fn func(E) string)
 	}

 	shape := t.Shape()
+	slices.Reverse(shape)

 	var sb strings.Builder
 	var f func([]int, int)
 	f = func(dims []int, stride int) {
 		prefix := strings.Repeat(" ", len(shape)-len(dims)+1)
-		fmt.Fprint(&sb, "[")
-		defer func() { fmt.Fprint(&sb, "]") }()
+		sb.WriteString("[")
+		defer func() { sb.WriteString("]") }()
 		for i := 0; i < dims[0]; i++ {
 			if i >= items && i < dims[0]-items {
-				fmt.Fprint(&sb, "..., ")
+				sb.WriteString("..., ")
 				// skip to next printable element
 				skip := dims[0] - 2*items
 				if len(dims) > 1 {
@@ -265,9 +271,14 @@ func dump[S ~[]E, E number](ctx Context, t Tensor, items int, fn func(E) string)
 					fmt.Fprint(&sb, ",", strings.Repeat("\n", len(dims)-1), prefix)
 				}
 			} else {
-				fmt.Fprint(&sb, fn(s[stride+i]))
+				text := fn(s[stride+i])
+				if len(text) > 0 && text[0] != '-' {
+					sb.WriteString(" ")
+				}
+
+				sb.WriteString(text)
 				if i < dims[0]-1 {
-					fmt.Fprint(&sb, ", ")
+					sb.WriteString(", ")
 				}
 			}
 		}
--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@@ -240,11 +240,22 @@ func New(r *os.File, params ml.BackendParams) (ml.Backend, error) {
 		switch {
 		case contains(t.Name, "position_embd", "token_embd", "token_norm_embd", "token_types"):
 			createTensor(tensor{source: t}, input.bts)
+			if _, ok := meta.Tensors().GroupLayers()["output"]; !ok && t.Name == "token_embd.weight" {
+				createTensor(tensor{source: t, target: "output.weight"}, output.bts)
+			}
 		case contains(t.Name, "cls", "output", "output_norm"):
 			createTensor(tensor{source: t}, output.bts)
 		case strings.HasPrefix(t.Name, "v.") || strings.HasPrefix(t.Name, "mm."):
 			// TODO: assign vision tensors to the gpu if possible
-			createTensor(tensor{source: t}, input.bts)
+			createTensor(tensor{source: t}, output.bts)
+		case contains(t.Name, "rope_freqs", "rope_factors_long", "rope_factors_short"):
+			// these tensors should be repeated per layer
+			for i, layer := range layers {
+				createTensor(tensor{
+					source: t,
+					target: "blk." + strconv.Itoa(i) + "." + t.Name,
+				}, layer.bts)
+			}
 		default:
 			layerIndex := -1
 			if fields := strings.FieldsFunc(t.Name, func(r rune) bool { return !unicode.IsNumber(r) }); len(fields) > 0 {
@@ -256,14 +267,8 @@ func New(r *os.File, params ml.BackendParams) (ml.Backend, error) {
 			if layerIndex >= 0 {
 				createTensor(tensor{source: t}, layers[layerIndex].bts)
 			} else {
-				// this is a repeating tensor that doesn't explicitly associated with a layer so
-				// duplicate it for each layer
-				for i, layer := range layers {
-					createTensor(tensor{
-						source: t,
-						target: "blk." + strconv.Itoa(i) + "." + t.Name,
-					}, layer.bts)
-				}
+				// load all other tensors on the cpu
+				createTensor(tensor{source: t}, input.bts)
 			}
 		}
 	}
@@ -352,7 +357,7 @@ func New(r *os.File, params ml.BackendParams) (ml.Backend, error) {

 		if C.ggml_backend_is_cpu(b) {
 			// set number of threads for cpu backend
-			C.ggml_backend_cpu_set_n_threads(b, C.int(params.NumThreads))
+			C.ggml_backend_cpu_set_n_threads(b, C.int(Threads(params.NumThreads)))
 		}
 	}

@@ -893,10 +898,13 @@ func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
 }

 const (
-	ropeTypeNorm C.int = iota
+	ropeTypeNorm   C.int = 0
+	ropeTypeNeox   C.int = 2
+	ropeTypeMrope  C.int = 8
+	ropeTypeVision C.int = 24
 )

-func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim uint32, ropeBase, ropeScale float32) ml.Tensor {
+func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim, ropeType uint32, ropeBase, ropeScale float32) ml.Tensor {
 	if ropeFactors == nil {
 		ropeFactors = &Tensor{b: t.b}
 	}
@@ -911,8 +919,8 @@ func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDi
 		t: C.ggml_rope_ext(
 			ctx.(*Context).ctx, dequant, positionIDs.(*Tensor).t, ropeFactors.(*Tensor).t,
 			C.int(ropeDim),
-			131072,       // YaRN n_ctx_train
-			ropeTypeNorm, // ROPE_TYPE_NORM
+			C.int(ropeType),
+			131072, // YaRN n_ctx_train
 			C.float(ropeBase),
 			C.float(ropeScale),
 			0.,  // YaRN ext_factor
@@ -944,6 +952,27 @@ func (t *Tensor) Conv2D(ctx ml.Context, t2 ml.Tensor, s0, s1, p0, p1, d0, d1 int
 	}
 }

+func (t *Tensor) AvgPool2D(ctx ml.Context, k, s int, p float32) ml.Tensor {
+	return &Tensor{
+		b: t.b,
+		t: C.ggml_pool_2d(ctx.(*Context).ctx, t.t, C.GGML_OP_POOL_AVG, C.int(k), C.int(k), C.int(s), C.int(s), C.float(p), C.float(p)),
+	}
+}
+
+// Set returns a new Tensor wrapping ggml_set_1d(t, t2, offset) when no strides
+// are given, or ggml_set_2d(t, t2, offset, strides[0]) when exactly one is
+// given. Passing more than one stride panics.
+func (t *Tensor) Set(ctx ml.Context, t2 ml.Tensor, offset int, strides ...int) ml.Tensor {
+	switch len(strides) {
+	case 0:
+		return &Tensor{b: t.b, t: C.ggml_set_1d(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.size_t(offset))}
+	case 1:
+		return &Tensor{b: t.b, t: C.ggml_set_2d(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.size_t(offset), C.size_t(strides[0]))}
+	default:
+		panic("unsupported number of dimensions")
+	}
+}
+
 func (t *Tensor) ScaledDotProductAttention(ctx ml.Context, key, value, mask ml.Tensor, scale float64) ml.Tensor {
 	var kqMask *C.struct_ggml_tensor
 	if mask != nil {
--- a/ml/backend/ggml/ggml/include/ollama-debug.h
+++ b/ml/backend/ggml/ggml/include/ollama-debug.h
@@ -0,0 +1,11 @@
+#include "ggml.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void ollama_debug(const struct ggml_tensor *tensor, bool verbose);
+
+#ifdef __cplusplus
+}
+#endif
--- a/ml/backend/ggml/ggml/src/ggml-cpu/cpu_debug.go
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/cpu_debug.go
@@ -0,0 +1,6 @@
+//go:build debug
+
+package cpu
+
+// #cgo CPPFLAGS: -DOLLAMA_DEBUG
+import "C"
--- a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c
@@ -11,6 +11,8 @@
 #include "ggml-threading.h"
 #include "ggml.h"

+#include "ollama-debug.h"
+
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
 #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
@@ -14103,6 +14105,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {

        ggml_compute_forward(&params, node);

+#ifdef OLLAMA_DEBUG
+        ollama_debug(node, true);
+#endif
+
        if (state->ith == 0 && cplan->abort_callback &&
                cplan->abort_callback(cplan->abort_callback_data)) {
            atomic_store_explicit(&tp->abort, node_n + 1, memory_order_relaxed);
--- a/ml/backend/ggml/ggml/src/ollama-debug.c
+++ b/ml/backend/ggml/ggml/src/ollama-debug.c
@@ -0,0 +1,116 @@
+#include <string.h>
+#include <inttypes.h>
+
+#include "ollama-debug.h"
+
+// mul returns the product of the first ndims entries of dims, accumulated in an int.
+static int mul(int64_t *dims, int ndims) {
+    int result = 1;
+    for (int i = 0; i < ndims; i++) {
+        result *= dims[i];
+    }
+    return result;
+}
+
+static void repeat(char c, int n) {
+    for (int i = 0; i < n; i++) {
+        fprintf(stderr, "%c", c);
+    }
+}
+
+static void print_tensor(const void *tensor, void (*cb)(const void *, int),
+                         int shape,
+                         int64_t *dims, int ndims, int stride,
+                         int nitems, int pad) {
+    fprintf(stderr, "[");
+    for (int i = 0; i < dims[0]; i++) {
+        if (i >= nitems && i < dims[0] - nitems) {
+            fprintf(stderr, "... (%" PRIi64 " more), ", dims[0] - 2 * nitems);
+            int skip = dims[0] - 2 * nitems;
+            if (ndims > 1) {
+                stride += mul(dims + 1, ndims - 1) * skip;
+                repeat('\n', ndims - 1);
+                repeat(' ', shape - ndims + 1 + pad);
+            }
+            i += skip - 1;
+        } else if (ndims > 1) {
+            print_tensor(tensor, cb, shape, dims + 1, ndims - 1, stride,
+                         nitems, pad);
+            stride += mul(dims + 1, ndims - 1);
+            if (i < dims[0] - 1) {
+                fprintf(stderr, ", ");
+                repeat('\n', ndims - 1);
+                repeat(' ', shape - ndims + 1 + pad);
+            }
+        } else {
+            cb(tensor, stride + i);
+            if (i < dims[0] - 1) {
+                fprintf(stderr, ", ");
+            }
+        }
+    }
+    fprintf(stderr, "]");
+}
+
+static void print_tensor_f16(const void *tensor, int i) {
+    float value = ggml_fp16_to_fp32(((const ggml_fp16_t *)tensor)[i]);
+    fprintf(stderr, "%s%f", value < 0 ? "" : " ", value);
+}
+
+static void print_tensor_f32(const void *tensor, int i) {
+    float value = ((const float *)tensor)[i];
+    fprintf(stderr, "%s%f", value < 0 ? "" : " ", value);
+}
+
+static void print_tensor_i32(const void *tensor, int i) {
+    int32_t value = ((const int32_t *)tensor)[i];
+    fprintf(stderr, "%s%d", value < 0 ? "" : " ", value);
+}
+
+// Print a one-line summary of tensor (prefix, name, op, type, ne[0..3]) to stderr.
+// When verbose, follow it with the F16/F32/I32 contents, padded by indent spaces
+// and elided to 3 leading/trailing items per dimension.
+static void ollama_debug_tensor(const struct ggml_tensor *tensor, bool verbose, const char *prefix, int indent) {
+    fprintf(stderr, "%s%s %s (%s): [%" PRIi64 " %" PRIi64 " %" PRIi64 " %" PRIi64 "]\n", prefix, tensor->name,
+            ggml_op_name(tensor->op), ggml_type_name(tensor->type), tensor->ne[0],
+            tensor->ne[1], tensor->ne[2], tensor->ne[3]);
+
+    if (!verbose) {
+        return;
+    }
+
+    repeat(' ', indent);
+    void (*print_item)(const void *, int);
+    switch (tensor->type) {
+    case GGML_TYPE_F16:
+        print_item = print_tensor_f16;
+        break;
+    case GGML_TYPE_F32:
+        print_item = print_tensor_f32;
+        break;
+    case GGML_TYPE_I32:
+        print_item = print_tensor_i32;
+        break;
+    default:
+        fprintf(stderr, "<unsupported type>\n");
+        return;
+    }
+
+    print_tensor(ggml_get_data(tensor), print_item, ggml_n_dims(tensor),
+                 (int64_t *)tensor->ne, ggml_n_dims(tensor), 0, 3, indent);
+    fprintf(stderr, "\n");
+}
+
+// Print tensor to stderr, then each source operand up to the first NULL slot.
+void ollama_debug(const struct ggml_tensor *tensor, bool verbose) {
+    ollama_debug_tensor(tensor, verbose, ">>> ", 4);
+
+    for (int i = 0; i < GGML_MAX_SRC && tensor->src[i] != NULL; ++i) {
+        // snprintf always NUL-terminates within sizeof(src), so a truncated
+        // label needs no manual fix-up (nor a signed/unsigned comparison).
+        char src[8];
+        snprintf(src, sizeof(src), " src%d ", i);
+
+        ollama_debug_tensor(tensor->src[i], verbose, src, 4);
+    }
+}
--- a/ml/backend/ggml/threads.go
+++ b/ml/backend/ggml/threads.go
@@ -0,0 +1,7 @@
+//go:build !debug
+
+package ggml
+
+func Threads(n int) int {
+	return n
+}
--- a/ml/backend/ggml/threads_debug.go
+++ b/ml/backend/ggml/threads_debug.go
@@ -0,0 +1,7 @@
+//go:build debug
+
+package ggml
+
+func Threads(_ int) int {
+	return 1
+}
--- a/model/model.go
+++ b/model/model.go
@@ -22,6 +22,8 @@ import (
 	"github.com/ollama/ollama/model/input"
 )

+var ErrNoVisionModel = errors.New("this model is missing data required for image input")
+
 // Model implements a specific model architecture, defining the forward pass and any model-specific configuration
 type Model interface {
 	Forward(ml.Context, input.Options) (ml.Tensor, error)
--- a/model/models/gemma2/model.go
+++ b/model/models/gemma2/model.go
@@ -0,0 +1,220 @@
+package gemma2
+
+import (
+	"math"
+
+	"github.com/ollama/ollama/kvcache"
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/ml/nn"
+	"github.com/ollama/ollama/model"
+	"github.com/ollama/ollama/model/input"
+)
+
+type Options struct {
+	hiddenSize, numHeads, numKVHeads int
+	attnKeyLen, attnValLen           int
+	eps, ropeBase, ropeScale         float32
+	attnLogitSoftcap                 float32
+	finalLogitSoftcap                float32
+	largeModelScaling                bool
+}
+
+type Model struct {
+	model.Base
+	model.SentencePieceModel
+
+	TokenEmbedding *nn.Embedding `gguf:"token_embd"`
+	Layers         []Layer       `gguf:"blk"`
+	OutputNorm     *nn.RMSNorm   `gguf:"output_norm"`
+	Output         *nn.Linear    `gguf:"output,alt:token_embd"` // just set to token_embd?
+
+	*Options
+}
+
+const (
+	gemma27BLayerCount = 46
+)
+
+func New(c ml.Config) (model.Model, error) {
+	m := Model{
+		SentencePieceModel: model.NewSentencePieceModel(
+			c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
+			&model.Vocabulary{
+				Values: c.Strings("tokenizer.ggml.tokens"),
+				Scores: c.Floats("tokenizer.ggml.scores"),
+				Types:  c.Uints("tokenizer.ggml.token_type"),
+				BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id")),
+				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
+			},
+		),
+		Layers: make([]Layer, c.Uint("block_count")),
+		Options: &Options{
+			hiddenSize:        int(c.Uint("embedding_length")),
+			numHeads:          int(c.Uint("attention.head_count")),
+			numKVHeads:        int(c.Uint("attention.head_count_kv")),
+			attnKeyLen:        int(c.Uint("attention.key_length")),
+			attnValLen:        int(c.Uint("attention.value_length")),
+			eps:               c.Float("attention.layer_norm_rms_epsilon"),
+			ropeBase:          c.Float("rope.freq_base", 10000.0),
+			ropeScale:         c.Float("rope.freq_scale", 1.0),
+			attnLogitSoftcap:  c.Float("attn_logit_softcapping"),
+			finalLogitSoftcap: c.Float("final_logit_softcapping"),
+		},
+	}
+
+	slidingWindowLen := int32(c.Uint("attention.sliding_window"))
+	m.Cache = kvcache.NewWrapperCache(kvcache.NewSWACache(slidingWindowLen, m.Shift), kvcache.NewCausalCache(m.Shift))
+	m.Cache.SetConfig(ml.CacheConfig{})
+
+	return &m, nil
+}
+
+type SelfAttention struct {
+	Query  *nn.Linear `gguf:"attn_q"`
+	Key    *nn.Linear `gguf:"attn_k"`
+	Value  *nn.Linear `gguf:"attn_v"`
+	Output *nn.Linear `gguf:"attn_output"`
+}
+
+// Forward computes self-attention for hiddenState. Queries and keys are rotated
+// with RoPE, keys and values are stored in and read back from cache, and the
+// attention logits are soft-capped with tanh before the mask and softmax.
+func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
+	batchSize := hiddenState.Dim(1)
+	ropeType := uint32(2) // same value as ropeTypeNeox in ml/backend/ggml
+	q := sa.Query.Forward(ctx, hiddenState)
+	q = q.Reshape(ctx, opts.attnKeyLen, opts.numHeads, batchSize)
+	q = q.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, opts.ropeBase, opts.ropeScale)
+
+	if opts.largeModelScaling {
+		q = q.Scale(ctx, 1.0/math.Sqrt(float64(opts.hiddenSize/opts.numHeads)))
+	} else {
+		q = q.Scale(ctx, 1.0/math.Sqrt(float64(opts.attnKeyLen)))
+	}
+
+	k := sa.Key.Forward(ctx, hiddenState)
+	k = k.Reshape(ctx, opts.attnKeyLen, opts.numKVHeads, batchSize)
+	k = k.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, opts.ropeBase, opts.ropeScale)
+	v := sa.Value.Forward(ctx, hiddenState)
+	v = v.Reshape(ctx, opts.attnValLen, opts.numKVHeads, batchSize)
+
+	cache.Put(ctx, k, v)
+	k, v, mask := cache.Get(ctx)
+
+	q = q.Permute(ctx, 0, 2, 1, 3)
+	k = k.Permute(ctx, 0, 2, 1, 3)
+	v = v.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
+	kq := k.Mulmat(ctx, q)
+
+	// logit softcap: attnLogitSoftcap * tanh(kq / attnLogitSoftcap)
+	// NOTE(review): New reads attn_logit_softcapping without a default; if that can yield 0 this divides by zero — confirm
+	kq = kq.Scale(ctx, 1.0/float64(opts.attnLogitSoftcap))
+	kq = kq.Tanh(ctx)
+	kq = kq.Scale(ctx, float64(opts.attnLogitSoftcap))
+
+	kq = kq.Add(ctx, mask)
+	kq = kq.Softmax(ctx)
+	kqv := v.Mulmat(ctx, kq)
+	kqv = kqv.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
+	kqv = kqv.Reshape(ctx, opts.attnValLen*opts.numHeads, batchSize)
+
+	return sa.Output.Forward(ctx, kqv)
+}
+
+func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
+	return key.RoPE(ctx, shift, nil, uint32(m.Options.attnKeyLen), uint32(2), m.Options.ropeBase, m.Options.ropeScale), nil
+}
+
+type MLP struct {
+	Up   *nn.Linear `gguf:"ffn_up"`
+	Down *nn.Linear `gguf:"ffn_down"`
+	Gate *nn.Linear `gguf:"ffn_gate"`
+}
+
+func (mlp *MLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *Options) ml.Tensor {
+	hiddenState = mlp.Gate.Forward(ctx, hiddenState).GELU(ctx).Mul(ctx, mlp.Up.Forward(ctx, hiddenState))
+	return mlp.Down.Forward(ctx, hiddenState)
+}
+
+type Layer struct {
+	AttentionNorm     *nn.RMSNorm `gguf:"attn_norm"`
+	SelfAttention     *SelfAttention
+	PostAttentionNorm *nn.RMSNorm `gguf:"post_attention_norm"`
+	MLPNorm           *nn.RMSNorm `gguf:"ffn_norm"`
+	MLP               *MLP
+	PostMLPNorm       *nn.RMSNorm `gguf:"post_ffw_norm"`
+}
+
+func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
+	residual := hiddenState
+
+	hiddenState = l.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
+	hiddenState = l.SelfAttention.Forward(ctx, hiddenState, positionIDs, cache, opts)
+	hiddenState = l.PostAttentionNorm.Forward(ctx, hiddenState, opts.eps)
+
+	// In the final layer (outputs != nil), optimize by pruning to just the token positions
+	// we need logits for.
+	if outputs != nil {
+		hiddenState = hiddenState.Rows(ctx, outputs)
+		residual = residual.Rows(ctx, outputs)
+	}
+
+	hiddenState = hiddenState.Add(ctx, residual)
+	residual = hiddenState
+
+	hiddenState = l.MLPNorm.Forward(ctx, hiddenState, opts.eps)
+	hiddenState = l.MLP.Forward(ctx, hiddenState, opts)
+	hiddenState = l.PostMLPNorm.Forward(ctx, hiddenState, opts.eps)
+	return hiddenState.Add(ctx, residual)
+}
+
+// Forward runs the text model over opts.Inputs and returns soft-capped logits
+// for the batch positions listed in opts.Outputs.
+func (m *Model) Forward(ctx ml.Context, opts input.Options) (ml.Tensor, error) {
+	inputs, err := ctx.Input().FromIntSlice(opts.Inputs, len(opts.Inputs))
+	if err != nil {
+		return nil, err
+	}
+	positions, err := ctx.Input().FromIntSlice(opts.Positions, len(opts.Positions))
+	if err != nil {
+		return nil, err
+	}
+	outputs, err := ctx.Output().FromIntSlice(opts.Outputs, len(opts.Outputs))
+	if err != nil {
+		return nil, err
+	}
+
+	hiddenState := m.TokenEmbedding.Forward(ctx, inputs)
+	hiddenState = hiddenState.Scale(ctx, math.Sqrt(float64(m.Options.hiddenSize)))
+
+	if len(m.Layers) == gemma27BLayerCount {
+		m.Options.largeModelScaling = true
+	}
+
+	for i, layer := range m.Layers {
+		m.Cache.SetLayer(i)
+		m.Cache.(*kvcache.WrapperCache).SetLayerType(i % 2) // alternate between the wrapper's two caches
+
+		var lastLayerOutputs ml.Tensor
+		if i == len(m.Layers)-1 {
+			lastLayerOutputs = outputs
+		}
+
+		hiddenState = layer.Forward(ctx, hiddenState, positions, lastLayerOutputs, m.Cache, m.Options)
+	}
+
+	// The last layer already reduced hiddenState to the rows named by outputs,
+	// so the logits must not be indexed with outputs a second time.
+	hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
+	hiddenState = m.Output.Forward(ctx, hiddenState)
+
+	// final logit softcap
+	hiddenState = hiddenState.Scale(ctx, 1.0/float64(m.Options.finalLogitSoftcap))
+	hiddenState = hiddenState.Tanh(ctx)
+	hiddenState = hiddenState.Scale(ctx, float64(m.Options.finalLogitSoftcap))
+	return hiddenState, nil
+}
+
+func init() {
+	model.Register("gemma2", New)
+}
--- a/model/models/gemma3/model.go
+++ b/model/models/gemma3/model.go
@@ -0,0 +1,177 @@
+package gemma3
+
+import (
+	"bytes"
+	"encoding/binary"
+	"hash/fnv"
+	"image"
+	"math"
+
+	"github.com/ollama/ollama/kvcache"
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/ml/nn"
+	"github.com/ollama/ollama/model"
+	"github.com/ollama/ollama/model/input"
+)
+
+type Model struct {
+	model.Base
+	model.SentencePieceModel
+
+	*VisionModel `gguf:"v,vision"`
+	*TextModel
+
+	*MultiModalProjector `gguf:"mm"`
+
+	ImageProcessor
+}
+
+var _ model.MultimodalProcessor = (*Model)(nil)
+
+type MultiModalProjector struct {
+	SoftEmbNorm     *nn.RMSNorm `gguf:"mm_soft_emb_norm"`
+	InputProjection *nn.Linear  `gguf:"mm_input_projection"`
+
+	tokensPerImage int
+}
+
+// Forward average-pools the vision outputs over the patch grid, normalizes them
+// with SoftEmbNorm and multiplies them by the permuted InputProjection weight.
+func (p *MultiModalProjector) Forward(ctx ml.Context, visionOutputs ml.Tensor, imageSize, patchSize int, eps float32) ml.Tensor {
+	l := visionOutputs.Dim(0)
+	visionOutputs = visionOutputs.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
+	patchesPerImage := imageSize / patchSize
+	visionOutputs = visionOutputs.Reshape(ctx, patchesPerImage, patchesPerImage, l)
+	// kernel == stride, sized so the patch grid shrinks to about tokensPerImage cells
+	kernelSize := patchesPerImage / int(math.Sqrt(float64(p.tokensPerImage)))
+	visionOutputs = visionOutputs.AvgPool2D(ctx, kernelSize, kernelSize, 0)
+	visionOutputs = visionOutputs.Reshape(ctx, visionOutputs.Dim(0)*visionOutputs.Dim(1), l)
+	visionOutputs = visionOutputs.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
+	visionOutputs = p.SoftEmbNorm.Forward(ctx, visionOutputs, eps)
+	// TODO: inputProjection must be transposed since they're incompatible with visionOutputs
+	visionOutputs = p.InputProjection.Weight.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx).Mulmat(ctx, visionOutputs)
+	return visionOutputs
+}
+
+func New(c ml.Config) (model.Model, error) {
+	m := Model{
+		SentencePieceModel: model.NewSentencePieceModel(
+			c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
+			&model.Vocabulary{
+				Values: c.Strings("tokenizer.ggml.tokens"),
+				Scores: c.Floats("tokenizer.ggml.scores"),
+				Types:  c.Uints("tokenizer.ggml.token_type"),
+				BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id")),
+				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
+				EOS:    int32(1),
+				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
+				EOT:    int32(106),
+				AddEOT: c.Bool("tokenizer.ggml.add_eot_token", false),
+			},
+		),
+		ImageProcessor: newImageProcessor(c),
+		VisionModel:    newVisionModel(c),
+		TextModel:      newTextModel(c),
+		MultiModalProjector: &MultiModalProjector{
+			tokensPerImage: int(c.Uint("mm_tokens_per_image", 256)),
+		},
+	}
+
+	slidingWindowLen := int32(c.Uint("attention.sliding_window"))
+	m.Cache = kvcache.NewWrapperCache(kvcache.NewSWACache(slidingWindowLen, m.Shift), kvcache.NewCausalCache(m.Shift))
+
+	return &m, nil
+}
+
+// EncodeMultimodal decodes multimodalData as an image, preprocesses it and runs
+// it through the vision model and projector. It returns model.ErrNoVisionModel
+// when the vision model has no layers.
+func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
+	if len(m.VisionModel.Layers) == 0 {
+		return nil, model.ErrNoVisionModel
+	}
+
+	img, _, err := image.Decode(bytes.NewReader(multimodalData))
+	if err != nil {
+		return nil, err
+	}
+
+	pixels, err := m.ImageProcessor.ProcessImage(img)
+	if err != nil {
+		return nil, err
+	}
+
+	// shape the pixels as imageSize x imageSize x numChannels
+	size, channels := m.ImageProcessor.imageSize, m.ImageProcessor.numChannels
+	pixelValues, err := ctx.Input().FromFloatSlice(pixels, size, size, channels)
+	if err != nil {
+		return nil, err
+	}
+
+	embeddings := m.VisionModel.Forward(ctx, pixelValues)
+	return m.MultiModalProjector.Forward(ctx, embeddings, m.imageSize, m.patchSize, m.VisionModel.eps), nil
+}
+
+type imageToken struct {
+	embedding ml.Tensor
+	index     int
+}
+
+// PostTokenize replaces every multimodal input with "\n\n", <start_of_image>, one
+// input per index along the embedding's dimension 1, <end_of_image> and "\n\n".
+func (m *Model) PostTokenize(ctx ml.Context, inputs []input.Input) ([]input.Input, error) {
+	var result []input.Input
+	fnvHash := fnv.New64a()
+
+	for _, inp := range inputs {
+		if inp.Multimodal == nil {
+			result = append(result, inp)
+		} else {
+			imageInputs := []input.Input{
+				{Token: 108},    // "\n\n"
+				{Token: 255999}, // "<start_of_image>"
+			}
+			result = append(result, imageInputs...)
+			// add image embeddings, each with a hash derived from the image hash and its index
+			inputMultimodal := inp.Multimodal.(ml.Tensor)
+			for i := range inputMultimodal.Dim(1) {
+				fnvHash.Reset()
+				binary.Write(fnvHash, binary.NativeEndian, inp.MultimodalHash)
+				// hash the full index: byte(i) would repeat after 256 embeddings
+				binary.Write(fnvHash, binary.NativeEndian, int64(i))
+				imageToken := imageToken{embedding: inputMultimodal, index: i}
+				result = append(result, input.Input{Multimodal: imageToken, MultimodalHash: fnvHash.Sum64()})
+			}
+
+			result = append(result,
+				input.Input{Token: 256000}, // <end_of_image>
+				input.Input{Token: 108},    // "\n\n"
+			)
+		}
+	}
+
+	return result, nil
+}
+
+func (m *Model) Forward(ctx ml.Context, opts input.Options) (ml.Tensor, error) {
+	inputs, err := ctx.Input().FromIntSlice(opts.Inputs, len(opts.Inputs))
+	if err != nil {
+		return nil, err
+	}
+
+	positions, err := ctx.Input().FromIntSlice(opts.Positions, len(opts.Positions))
+	if err != nil {
+		return nil, err
+	}
+
+	outputs, err := ctx.Output().FromIntSlice(opts.Outputs, len(opts.Outputs))
+	if err != nil {
+		return nil, err
+	}
+
+	return m.TextModel.Forward(ctx, inputs, positions, outputs, opts, m.Cache), nil
+}
+
+func init() {
+	model.Register("gemma3", New)
+}
--- a/model/models/gemma3/model_text.go
+++ b/model/models/gemma3/model_text.go
@@ -0,0 +1,247 @@
+package gemma3
+
+import (
+	"math"
+
+	"github.com/ollama/ollama/kvcache"
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/ml/nn"
+	"github.com/ollama/ollama/model"
+	"github.com/ollama/ollama/model/input"
+)
+
+type TextOptions struct {
+	hiddenSize, numHeads, numKVHeads int
+	attnKeyLen, attnValLen           int
+	eps, ropeScale                   float32
+	ropeLocalBase, ropeGlobalBase    float32
+	largeModelScaling                bool
+}
+
+type TextModel struct {
+	model.Base
+	model.SentencePieceModel
+
+	TokenEmbedding *nn.Embedding `gguf:"token_embd"`
+	Layers         []TextLayer   `gguf:"blk"`
+	OutputNorm     *nn.RMSNorm   `gguf:"output_norm"`
+	Output         *nn.Linear    `gguf:"output,alt:token_embd"`
+
+	*TextOptions
+}
+
+const (
+	gemmaGlobalCacheCount = 6
+	gemma27BLayerCount    = 62
+)
+
+const (
+	cacheTypeSWA = iota
+	cacheTypeCausal
+)
+
+func newTextModel(c ml.Config) *TextModel {
+	numBlocks := int(c.Uint("block_count"))
+
+	m := TextModel{
+		SentencePieceModel: model.NewSentencePieceModel(
+			c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
+			&model.Vocabulary{
+				Values: c.Strings("tokenizer.ggml.tokens"),
+				Scores: c.Floats("tokenizer.ggml.scores"),
+				Types:  c.Uints("tokenizer.ggml.token_type"),
+				BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id")),
+				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
+			},
+		),
+		Layers: make([]TextLayer, numBlocks),
+		TextOptions: &TextOptions{
+			hiddenSize:     int(c.Uint("embedding_length")),
+			numHeads:       int(c.Uint("attention.head_count")),
+			numKVHeads:     int(c.Uint("attention.head_count_kv")),
+			attnKeyLen:     int(c.Uint("attention.key_length", 256)),
+			attnValLen:     int(c.Uint("attention.value_length", 256)),
+			eps:            c.Float("attention.layer_norm_rms_epsilon", 1e-06),
+			ropeLocalBase:  c.Float("rope.local.freq_base", 10000.0),
+			ropeGlobalBase: c.Float("rope.global.freq_base", 1000000.0),
+			ropeScale:      c.Float("rope.freq_scale", 1.0),
+		},
+	}
+
+	if numBlocks == gemma27BLayerCount {
+		m.largeModelScaling = true
+	}
+
+	return &m
+}
+
+type TextSelfAttention struct {
+	Query     *nn.Linear  `gguf:"attn_q"`
+	QueryNorm *nn.RMSNorm `gguf:"attn_q_norm"`
+	Key       *nn.Linear  `gguf:"attn_k"`
+	KeyNorm   *nn.RMSNorm `gguf:"attn_k_norm"`
+	Value     *nn.Linear  `gguf:"attn_v"`
+	Output    *nn.Linear  `gguf:"attn_output"`
+}
+
+func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
+	batchSize := hiddenState.Dim(1)
+	ropeType := uint32(2)
+
+	ropeBase := opts.ropeLocalBase
+	if (layer+1)%gemmaGlobalCacheCount == 0 {
+		ropeBase = opts.ropeGlobalBase
+	}
+
+	q := sa.Query.Forward(ctx, hiddenState)
+	q = q.Reshape(ctx, opts.attnKeyLen, opts.numHeads, batchSize)
+	q = sa.QueryNorm.Forward(ctx, q, opts.eps)
+	q = q.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, ropeBase, opts.ropeScale)
+
+	if opts.largeModelScaling {
+		q = q.Scale(ctx, 1.0/math.Sqrt(float64(opts.hiddenSize/opts.numHeads)))
+	} else {
+		q = q.Scale(ctx, 1.0/math.Sqrt(float64(opts.attnKeyLen)))
+	}
+
+	k := sa.Key.Forward(ctx, hiddenState)
+	k = k.Reshape(ctx, opts.attnKeyLen, opts.numKVHeads, batchSize)
+	k = sa.KeyNorm.Forward(ctx, k, opts.eps)
+	k = k.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, ropeBase, opts.ropeScale)
+
+	v := sa.Value.Forward(ctx, hiddenState)
+	v = v.Reshape(ctx, opts.attnValLen, opts.numKVHeads, batchSize)
+
+	scaleFactor := 1.0
+	kqv := nn.Attention(ctx, q, k, v, scaleFactor, cache)
+	kqv = kqv.Reshape(ctx, opts.attnValLen*opts.numHeads, batchSize)
+
+	return sa.Output.Forward(ctx, kqv)
+}
+
+func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
+	ropeBase := m.TextOptions.ropeLocalBase
+	if (layer+1)%gemmaGlobalCacheCount == 0 {
+		ropeBase = m.TextOptions.ropeGlobalBase
+	}
+
+	return key.RoPE(ctx, shift, nil, uint32(m.TextOptions.attnKeyLen), uint32(2), ropeBase, m.TextOptions.ropeScale), nil
+}
+
+type TextMLP struct {
+	Up   *nn.Linear `gguf:"ffn_up"`
+	Down *nn.Linear `gguf:"ffn_down"`
+	Gate *nn.Linear `gguf:"ffn_gate"`
+}
+
+func (mlp *TextMLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *TextOptions) ml.Tensor {
+	hiddenState = mlp.Gate.Forward(ctx, hiddenState).GELU(ctx).Mul(ctx, mlp.Up.Forward(ctx, hiddenState))
+	return mlp.Down.Forward(ctx, hiddenState)
+}
+
+type TextLayer struct {
+	AttentionNorm     *nn.RMSNorm `gguf:"attn_norm"`
+	SelfAttention     *TextSelfAttention
+	PostAttentionNorm *nn.RMSNorm `gguf:"post_attention_norm"`
+	MLPNorm           *nn.RMSNorm `gguf:"ffn_norm"`
+	MLP               *TextMLP
+	PostMLPNorm       *nn.RMSNorm `gguf:"post_ffw_norm"`
+}
+
+func (l *TextLayer) Forward(ctx ml.Context, layer int, hiddenState, positionIDs, outputs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
+	residual := hiddenState
+
+	hiddenState = l.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
+	hiddenState = l.SelfAttention.Forward(ctx, layer, hiddenState, positionIDs, cache, opts)
+	hiddenState = l.PostAttentionNorm.Forward(ctx, hiddenState, opts.eps)
+
+	// In the final layer (outputs != nil), optimize by pruning to just the token positions
+	// we need logits for.
+	if outputs != nil {
+		hiddenState = hiddenState.Rows(ctx, outputs)
+		residual = residual.Rows(ctx, outputs)
+	}
+
+	hiddenState = hiddenState.Add(ctx, residual)
+	residual = hiddenState
+
+	hiddenState = l.MLPNorm.Forward(ctx, hiddenState, opts.eps)
+	hiddenState = l.MLP.Forward(ctx, hiddenState, opts)
+	hiddenState = l.PostMLPNorm.Forward(ctx, hiddenState, opts.eps)
+	return hiddenState.Add(ctx, residual)
+}
+
+// setImageEmbeddings schedules copies between image embeddings and hiddenState
+// (presumably embedding into hiddenState — confirm Copy's direction), merging
+// entries contiguous in both tensors into one run. It returns every image.Index.
+func setImageEmbeddings(ctx ml.Context, hiddenState ml.Tensor, multimodal []input.MultimodalIndex) []int {
+	var embedding ml.Tensor
+	var src, dst, length int
+	var except []int
+
+	for _, image := range multimodal {
+		imageToken := image.Multimodal.(imageToken)
+		imageSrc := imageToken.index
+		imageDst := image.Index
+		if embedding == nil {
+			embedding = imageToken.embedding
+			src = imageSrc
+			dst = imageDst
+			length = 1
+		} else if embedding == imageToken.embedding && imageSrc+1 == src && imageDst+1 == dst {
+			src = imageSrc
+			dst = imageDst
+			length++
+		} else if embedding == imageToken.embedding && src+length == imageSrc && dst+length == imageDst {
+			length++
+		} else {
+			// the entry does not extend the current run: emit the run, then start a new one
+			visionOutputs := embedding.View(ctx, src*embedding.Stride(1), length*embedding.Dim(0))
+			ctx.Forward(visionOutputs.Copy(ctx, hiddenState.View(ctx, dst*hiddenState.Stride(1), length*hiddenState.Dim(0))))
+			embedding = imageToken.embedding
+			src = imageSrc
+			dst = imageDst
+			length = 1
+		}
+		except = append(except, imageDst)
+	}
+	// emit the last pending run, if any entry was seen
+	if embedding != nil {
+		visionOutputs := embedding.View(ctx, src*embedding.Stride(1), length*embedding.Dim(0))
+		ctx.Forward(visionOutputs.Copy(ctx, hiddenState.View(ctx, dst*hiddenState.Stride(1), length*hiddenState.Dim(0))))
+	}
+	return except
+}
+
+// Forward embeds inputs, applies any image embeddings from opts.Multimodal and
+// runs every layer, then returns the output projection of the normalized state.
+func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor, opts input.Options, cache kvcache.Cache) ml.Tensor {
+	hiddenState := m.TokenEmbedding.Forward(ctx, inputs)
+	hiddenState = hiddenState.Scale(ctx, math.Sqrt(float64(m.TextOptions.hiddenSize)))
+	except := setImageEmbeddings(ctx, hiddenState, opts.Multimodal)
+
+	for i, layer := range m.Layers {
+		// every gemmaGlobalCacheCount-th layer uses the causal (global) kv cache;
+		// all other layers use the sliding window (local) kv cache
+		cacheType := cacheTypeSWA
+		if (i+1)%gemmaGlobalCacheCount == 0 {
+			cacheType = cacheTypeCausal
+		}
+		cache.SetLayer(i)
+		wc := cache.(*kvcache.WrapperCache)
+		wc.SetLayerType(cacheType)
+		// when the selected cache is a *kvcache.Causal, pass the image positions as Except
+		if causal, ok := wc.UnderlyingCache().(*kvcache.Causal); ok {
+			causal.SetCausal(ctx, kvcache.CausalOptions{Except: except})
+		}
+
+		var lastLayerOutputs ml.Tensor
+		if i == len(m.Layers)-1 {
+			lastLayerOutputs = outputs
+		}
+		hiddenState = layer.Forward(ctx, i, hiddenState, positions, lastLayerOutputs, cache, m.TextOptions)
+	}
+
+	hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
+	return m.Output.Forward(ctx, hiddenState)
+}
--- a/model/models/gemma3/model_vision.go
+++ b/model/models/gemma3/model_vision.go
@@ -0,0 +1,127 @@
+package gemma3
+
+import (
+	"math"
+
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/ml/nn"
+)
+
+// batchSize is the trailing batch dimension used when VisionSelfAttention
+// reshapes its query, key, value and attention tensors. It is initialised to 1.
+var batchSize int = 1
+
+// VisionSelfAttention holds the query, key, value and output projections of
+// one vision encoder attention block.
+type VisionSelfAttention struct {
+	Query  *nn.Linear `gguf:"attn_q"`
+	Key    *nn.Linear `gguf:"attn_k"`
+	Value  *nn.Linear `gguf:"attn_v"`
+	Output *nn.Linear `gguf:"attn_output"`
+}
+
+// Forward applies multi-head self-attention to hiddenState. The query, key
+// and value projections are reshaped into opts.numHeads heads of
+// opts.hiddenSize/opts.numHeads elements, attended with a 1/sqrt(headDim)
+// scale (nil is passed as the final nn.Attention argument), reshaped back to
+// opts.hiddenSize and sent through the output projection.
+func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
+	headDim := opts.hiddenSize / opts.numHeads
+	scale := 1.0 / math.Sqrt(float64(headDim))
+
+	q := sa.Query.Forward(ctx, hiddenState)
+	k := sa.Key.Forward(ctx, hiddenState)
+	v := sa.Value.Forward(ctx, hiddenState)
+
+	// split the projected dimension into (headDim, numHeads)
+	q = q.Reshape(ctx, headDim, opts.numHeads, q.Dim(1), batchSize)
+	k = k.Reshape(ctx, headDim, opts.numHeads, k.Dim(1), batchSize)
+	v = v.Reshape(ctx, headDim, opts.numHeads, v.Dim(1), batchSize)
+
+	out := nn.Attention(ctx, q, k, v, scale, nil)
+	out = out.Reshape(ctx, opts.hiddenSize, out.Dim(2), batchSize)
+
+	return sa.Output.Forward(ctx, out)
+}
+
+// VisionMLP holds the two linear layers of the vision encoder feed-forward
+// block.
+type VisionMLP struct {
+	FC1 *nn.Linear `gguf:"fc1"`
+	FC2 *nn.Linear `gguf:"fc2"`
+}
+
+// Forward runs the feed-forward block: FC1, GELU, then FC2. opts is unused.
+func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
+	activated := mlp.FC1.Forward(ctx, hiddenState).GELU(ctx)
+	return mlp.FC2.Forward(ctx, activated)
+}
+
+// VisionEncoderLayer is one vision encoder layer: a layer norm followed by
+// self-attention, then a second layer norm followed by the MLP.
+type VisionEncoderLayer struct {
+	LayerNorm1    *nn.LayerNorm `gguf:"layer_norm1"`
+	SelfAttention *VisionSelfAttention
+
+	LayerNorm2 *nn.LayerNorm `gguf:"layer_norm2"`
+	MLP        *VisionMLP    `gguf:"mlp"`
+}
+
+// Forward applies the layer to hiddenState. Both sub-blocks normalise their
+// input with opts.eps first and add their input back onto their output.
+func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
+	// self attention with residual connection
+	attn := e.LayerNorm1.Forward(ctx, hiddenState, opts.eps)
+	attn = e.SelfAttention.Forward(ctx, attn, opts)
+	attn = attn.Add(ctx, hiddenState)
+
+	// feed forward with residual connection
+	ff := e.LayerNorm2.Forward(ctx, attn, opts.eps)
+	ff = e.MLP.Forward(ctx, ff, opts)
+	return ff.Add(ctx, attn)
+}
+
+// VisionModelOptions carries the vision encoder hyperparameters that
+// newVisionModel reads from the "vision.*" configuration keys.
+type VisionModelOptions struct {
+	hiddenSize, numHeads int
+	imageSize, patchSize int
+	eps                  float32
+}
+
+// VisionModel is the vision encoder: a patch embedding convolution, a
+// position embedding, a stack of encoder layers and a final layer norm.
+type VisionModel struct {
+	PatchEmbedding    *nn.Conv2D    `gguf:"patch_embedding"`
+	PositionEmbedding *nn.Embedding `gguf:"position_embedding"`
+	PostLayerNorm     *nn.LayerNorm `gguf:"post_layernorm"`
+
+	Layers []VisionEncoderLayer `gguf:"blk"`
+
+	*VisionModelOptions
+}
+
+// Forward encodes pixelValues into one hidden vector per image patch. The
+// patch embedding output is reshaped to (numPatches, hiddenSize) and permuted,
+// position embeddings for ids 0..numPatches-1 are added, and the result goes
+// through every encoder layer and the final layer norm.
+//
+// It panics if the position id tensor cannot be created.
+func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor {
+	patchesPerSide := m.imageSize / m.patchSize
+	numPatches := patchesPerSide * patchesPerSide
+
+	// NOTE(review): the trailing arguments are presumably stride, padding and
+	// dilation for each axis; confirm against nn.Conv2D
+	hiddenState := m.PatchEmbedding.Forward(ctx, pixelValues, m.patchSize, m.patchSize, 0, 0, 1, 1)
+	hiddenState = hiddenState.Reshape(ctx, numPatches, m.hiddenSize)
+	hiddenState = hiddenState.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
+
+	ids := make([]int32, numPatches)
+	for i := range ids {
+		ids[i] = int32(i)
+	}
+
+	positionIDs, err := ctx.Input().FromIntSlice(ids, len(ids))
+	if err != nil {
+		panic(err)
+	}
+
+	hiddenState = hiddenState.Add(ctx, m.PositionEmbedding.Forward(ctx, positionIDs))
+
+	for i := range m.Layers {
+		hiddenState = m.Layers[i].Forward(ctx, hiddenState, m.VisionModelOptions)
+	}
+
+	return m.PostLayerNorm.Forward(ctx, hiddenState, m.eps)
+}
+
+// newVisionModel allocates a VisionModel with "vision.block_count" empty
+// encoder layers and options read from the "vision.*" configuration keys.
+func newVisionModel(c ml.Config) *VisionModel {
+	layers := make([]VisionEncoderLayer, c.Uint("vision.block_count"))
+
+	opts := &VisionModelOptions{
+		hiddenSize: int(c.Uint("vision.embedding_length")),
+		numHeads:   int(c.Uint("vision.attention.head_count")),
+
+		imageSize: int(c.Uint("vision.image_size")),
+		patchSize: int(c.Uint("vision.patch_size")),
+
+		eps: c.Float("vision.attention.layer_norm_epsilon"),
+	}
+
+	return &VisionModel{Layers: layers, VisionModelOptions: opts}
+}
--- a/model/models/gemma3/process_image.go
+++ b/model/models/gemma3/process_image.go
@@ -0,0 +1,58 @@
+package gemma3
+
+import (
+	"image"
+
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/model/imageproc"
+)
+
+// ImageProcessor converts images into the float32 pixel buffer produced by
+// ProcessImage. numChannels is read from the configuration but is not
+// referenced elsewhere in this file.
+type ImageProcessor struct {
+	imageSize, patchSize, numChannels int
+}
+
+// newImageProcessor builds an ImageProcessor from the "vision.image_size",
+// "vision.patch_size" and "vision.num_channels" configuration keys.
+func newImageProcessor(c ml.Config) ImageProcessor {
+	var p ImageProcessor
+	p.imageSize = int(c.Uint("vision.image_size"))
+	p.patchSize = int(c.Uint("vision.patch_size"))
+	p.numChannels = int(c.Uint("vision.num_channels"))
+	return p
+}
+
+// pack converts img into a planar float32 buffer: all red values, then all
+// green values, then all blue values, each plane in row-major order over
+// img.Bounds(). Every channel returned by RGBA() is shifted down to 8 bits,
+// divided by 255 and normalised as (v - mean[c]) / std[c]. Alpha is ignored.
+//
+// The buffer is allocated once at its final size and each plane is written in
+// place, instead of growing three slices and copying them into a fourth. An
+// image with empty bounds yields nil.
+func (p *ImageProcessor) pack(img image.Image, mean, std [3]float32) []float32 {
+	bounds := img.Bounds()
+	if bounds.Empty() {
+		return nil
+	}
+
+	planeSize := bounds.Dx() * bounds.Dy()
+	pixelVals := make([]float32, 3*planeSize)
+	rVals := pixelVals[:planeSize]
+	gVals := pixelVals[planeSize : 2*planeSize]
+	bVals := pixelVals[2*planeSize:]
+
+	i := 0
+	for y := bounds.Min.Y; y < bounds.Max.Y; y++ {
+		for x := bounds.Min.X; x < bounds.Max.X; x++ {
+			r, g, b, _ := img.At(x, y).RGBA()
+
+			rVals[i] = (float32(r>>8)/255.0 - mean[0]) / std[0]
+			gVals[i] = (float32(g>>8)/255.0 - mean[1]) / std[1]
+			bVals[i] = (float32(b>>8)/255.0 - mean[2]) / std[2]
+			i++
+		}
+	}
+
+	return pixelVals
+}
+
+// ProcessImage passes img through imageproc.Composite, resizes the result to
+// imageSize x imageSize with bilinear interpolation and packs it into planar
+// float32 data normalised with the ImageNet standard mean and deviation. The
+// returned error is always nil.
+func (p ImageProcessor) ProcessImage(img image.Image) ([]float32, error) {
+	size := image.Point{X: p.imageSize, Y: p.imageSize}
+	resized := imageproc.Resize(imageproc.Composite(img), size, imageproc.ResizeBilinear)
+
+	return p.pack(resized, imageproc.ImageNetStandardMean, imageproc.ImageNetStandardSTD), nil
+}
--- a/model/models/llama/model.go
+++ b/model/models/llama/model.go
@@ -76,14 +76,15 @@ type SelfAttention struct {
 func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
 	batchSize := hiddenState.Dim(1)
 	headDim := opts.hiddenSize / opts.numHeads
+	ropeType := uint32(0)

 	q := sa.Query.Forward(ctx, hiddenState)
 	q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
-	q = q.RoPE(ctx, positionIDs, sa.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)
+	q = q.RoPE(ctx, positionIDs, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)

 	k := sa.Key.Forward(ctx, hiddenState)
 	k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
-	k = k.RoPE(ctx, positionIDs, sa.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)
+	k = k.RoPE(ctx, positionIDs, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)

 	v := sa.Value.Forward(ctx, hiddenState)
 	v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
@@ -96,7 +97,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
 }

 func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
-	return key.RoPE(ctx, shift, m.Layers[layer].SelfAttention.RopeFactors, m.ropeDim, m.ropeBase, m.ropeScale), nil
+	return key.RoPE(ctx, shift, m.Layers[layer].SelfAttention.RopeFactors, uint32(0), m.ropeDim, m.ropeBase, m.ropeScale), nil
 }

 type MLP struct {
--- a/model/models/mllama/model.go
+++ b/model/models/mllama/model.go
@@ -63,6 +63,10 @@ func New(c ml.Config) (model.Model, error) {
 }

 func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
+	if len(m.VisionModel.Transformer.Layers) == 0 || len(m.GlobalTransformer.Layers) == 0 {
+		return nil, model.ErrNoVisionModel
+	}
+
 	image, _, err := image.Decode(bytes.NewReader(multimodalData))
 	if err != nil {
 		return nil, err
--- a/model/models/mllama/model_text.go
+++ b/model/models/mllama/model_text.go
@@ -20,14 +20,15 @@ type TextSelfAttention struct {
 func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions, _ ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
 	batchSize := hiddenState.Dim(1)
 	headDim := opts.hiddenSize / opts.numHeads
+	ropeType := uint32(0)

 	query := sa.Query.Forward(ctx, hiddenState)
 	query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
-	query = query.RoPE(ctx, positions, sa.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)
+	query = query.RoPE(ctx, positions, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)

 	key := sa.Key.Forward(ctx, hiddenState)
 	key = key.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
-	key = key.RoPE(ctx, positions, sa.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)
+	key = key.RoPE(ctx, positions, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)

 	value := sa.Value.Forward(ctx, hiddenState)
 	value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
@@ -40,8 +41,9 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions, _ m
 }

 func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
+	// This will only get called for layers in the cache, which are just the self attention layers
 	if sa, ok := m.Transformer.Layers[layer].(*TextSelfAttentionDecoderLayer); ok {
-		return key.RoPE(ctx, shift, sa.SelfAttention.RopeFactors, m.ropeDim, m.ropeBase, m.ropeScale), nil
+		return key.RoPE(ctx, shift, sa.SelfAttention.RopeFactors, m.ropeDim, uint32(0), m.ropeBase, m.ropeScale), nil
 	}

 	return key, nil
--- a/model/models/mllama/process_image.go
+++ b/model/models/mllama/process_image.go
@@ -144,8 +144,6 @@ func (p *ImageProcessor) splitToTiles(img image.Image, numTilesSize image.Point)
 	return images
 }

-// remove the "alpha" channel by drawing over a prefilled image
-//
 // remove the "alpha" channel by drawing over a prefilled image
 //
 //nolint:unused
--- a/model/models/models.go
+++ b/model/models/models.go
@@ -1,6 +1,8 @@
 package models

 import (
+	_ "github.com/ollama/ollama/model/models/gemma2"
+	_ "github.com/ollama/ollama/model/models/gemma3"
 	_ "github.com/ollama/ollama/model/models/llama"
 	_ "github.com/ollama/ollama/model/models/mllama"
 )
--- a/model/process_text.go
+++ b/model/process_text.go
@@ -4,6 +4,7 @@ import (
 	"cmp"
 	"iter"
 	"log/slog"
+	"slices"
 	"strings"
 	"sync"

@@ -18,6 +19,15 @@ const (
 	SpecialEOS
 )

+// Token types as stored in Vocabulary.Types. Values start at 1, so
+// TOKEN_TYPE_CONTROL is 3.
+const (
+	TOKEN_TYPE_NORMAL = iota + 1
+	TOKEN_TYPE_UNKNOWN
+	TOKEN_TYPE_CONTROL
+	TOKEN_TYPE_USER_DEFINED
+	TOKEN_TYPE_UNUSED
+	TOKEN_TYPE_BYTE
+)
+
 type TextProcessor interface {
 	Encode(s string, addSpecial bool) ([]int32, error)
 	Decode([]int32) (string, error)
@@ -27,11 +37,11 @@ type TextProcessor interface {
 type Vocabulary struct {
 	Values []string
 	Types  []uint32
-	Scores []uint32
+	Scores []float32
 	Merges []string

-	BOS, EOS       int32
-	AddBOS, AddEOS bool
+	BOS, EOS, EOT          int32
+	AddBOS, AddEOS, AddEOT bool

 	specialOnce sync.Once
 	special     []string
@@ -48,7 +58,7 @@ func (v *Vocabulary) Is(id int32, special Special) bool {
 	case SpecialBOS:
 		return id == v.BOS
 	case SpecialEOS:
-		return id == v.EOS
+		return id == v.EOS || id == v.EOT
 	default:
 		return false
 	}
@@ -76,7 +86,9 @@ func (v *Vocabulary) Decode(id int32) string {
 func (v *Vocabulary) SpecialVocabulary() []string {
 	v.specialOnce.Do(func() {
 		for i := range v.Values {
-			if v.Types[i] == 3 {
+			if slices.Contains([]int{105, 106}, i) {
+				v.special = append(v.special, v.Values[i])
+			} else if v.Types[i] == TOKEN_TYPE_CONTROL {
 				v.special = append(v.special, v.Values[i])
 			}
 		}
--- a/model/process_text_spm.go
+++ b/model/process_text_spm.go
@@ -0,0 +1,249 @@
+package model
+
+import (
+	"iter"
+	"strings"
+
+	"github.com/dlclark/regexp2"
+	queue "github.com/emirpasic/gods/v2/queues/priorityqueue"
+
+	"github.com/ollama/ollama/logging"
+)
+
+// spmWhitespaceSep is the marker that stands in for a space inside tokens.
+const spmWhitespaceSep = "▁"
+
+// log is the package logger; the tokenizer uses its Trace level for
+// per-split output.
+var log = logging.NewLogger()
+
+// replaceWhitespaceBySeperator replaces every space in s with
+// spmWhitespaceSep. Other whitespace characters are left untouched.
+func replaceWhitespaceBySeperator(s string) string {
+	return strings.ReplaceAll(s, " ", spmWhitespaceSep)
+}
+
+// SentencePieceModel is a TextProcessor that tokenizes text with a
+// pre-tokenizer pattern and score-ordered merges over a Vocabulary.
+type SentencePieceModel struct {
+	maxTokenLen int
+	pre         *regexp2.Regexp
+	vocab       *Vocabulary
+}
+
+var _ TextProcessor = (*SentencePieceModel)(nil)
+
+// NewSentencePieceModel builds a SentencePieceModel from a pre-tokenizer
+// pattern and a vocabulary. pre is compiled with regexp2.MustCompile, so an
+// invalid pattern panics. maxTokenLen is the longest value, in bytes, among
+// tokens of type normal, user defined or unused.
+func NewSentencePieceModel(pre string, vocab *Vocabulary) SentencePieceModel {
+	// Log a short preview only. Each slice is clamped on its own so that a
+	// vocabulary with fewer than five values, scores or types does not panic.
+	log.Debug("Tokens", "num tokens", len(vocab.Values),
+		"vals", vocab.Values[:min(5, len(vocab.Values))],
+		"scores", vocab.Scores[:min(5, len(vocab.Scores))],
+		"types", vocab.Types[:min(5, len(vocab.Types))])
+
+	counter := map[int]int{}
+	var maxTokenLen int
+	for i, typ := range vocab.Types {
+		switch typ {
+		case TOKEN_TYPE_NORMAL, TOKEN_TYPE_USER_DEFINED, TOKEN_TYPE_UNUSED:
+			maxTokenLen = max(maxTokenLen, len(vocab.Values[i]))
+		}
+
+		// every type is counted, including ones not listed above
+		counter[int(typ)]++
+	}
+
+	log.Debug("Token counts", "normal", counter[TOKEN_TYPE_NORMAL], "unknown", counter[TOKEN_TYPE_UNKNOWN], "control", counter[TOKEN_TYPE_CONTROL],
+		"user defined", counter[TOKEN_TYPE_USER_DEFINED], "unused", counter[TOKEN_TYPE_UNUSED], "byte", counter[TOKEN_TYPE_BYTE],
+		"max token len", maxTokenLen)
+
+	return SentencePieceModel{
+		maxTokenLen: maxTokenLen,
+		pre:         regexp2.MustCompile(pre, regexp2.Unicode|regexp2.RE2),
+		vocab:       vocab,
+	}
+}
+
+// Is reports whether id is the given special token, as decided by the
+// underlying vocabulary.
+func (spm SentencePieceModel) Is(id int32, special Special) bool {
+	return spm.vocab.Is(id, special)
+}
+
+// split returns an iterator over the successive matches of the pre-tokenizer
+// pattern in s. Errors returned by the regexp engine are discarded; iteration
+// stops at the first nil match.
+func (spm *SentencePieceModel) split(s string) iter.Seq[string] {
+	return func(yield func(string) bool) {
+		for m, _ := spm.pre.FindStringMatch(s); m != nil; m, _ = spm.pre.FindNextMatch(m) {
+			if !yield(m.String()) {
+				break
+			}
+		}
+	}
+}
+
+// Encode tokenizes s into vocabulary ids.
+//
+// The input is first cut around every entry of the special vocabulary; each
+// special token maps directly to its id. The remaining text is split with the
+// pre-tokenizer pattern and spaces are replaced by spmWhitespaceSep. A split
+// that is itself a vocabulary entry is emitted as is. Otherwise it is broken
+// into runes and adjacent pieces are merged, in the order given by the
+// priority queue comparator (higher score first, then lower left index), for
+// as long as the merged text exists in the vocabulary. Pieces that are still
+// missing from the vocabulary afterwards are logged and dropped.
+//
+// When addSpecial is true and at least one id was produced, BOS and EOS are
+// added according to vocab.AddBOS and vocab.AddEOS. The returned error is
+// always nil.
+func (spm SentencePieceModel) Encode(s string, addSpecial bool) ([]int32, error) {
+	fragments := []fragment{{value: s}}
+	for _, special := range spm.vocab.SpecialVocabulary() {
+		// TODO: process special tokens concurrently
+		id := spm.vocab.Encode(special)
+		for i := 0; i < len(fragments); i++ {
+			frag := fragments[i]
+			// fragments that already carry ids are special tokens matched earlier
+			if len(frag.ids) > 0 {
+				continue
+			}
+
+			var middle []fragment
+			switch i := strings.Index(frag.value, special); {
+			case i < 0:
+				middle = append(middle, frag)
+			case i > 0:
+				middle = append(middle, fragment{value: frag.value[:i]})
+				fallthrough
+			default:
+				middle = append(middle, fragment{value: special, ids: []int32{id}})
+				if rest := frag.value[i+len(special):]; rest != "" {
+					middle = append(middle, fragment{value: rest})
+				}
+			}
+
+			// the remainder lands after index i and is examined by a later iteration
+			fragments = append(fragments[:i], append(middle, fragments[i+1:]...)...)
+		}
+	}
+	log.Trace("fragments", "frags", fragments)
+
+	var ids []int32
+	for _, frag := range fragments {
+		if len(frag.ids) > 0 {
+			ids = append(ids, frag.ids...)
+			continue
+		}
+
+		for split := range spm.split(frag.value) {
+			split = replaceWhitespaceBySeperator(split)
+
+			// fast path: the whole split is a single token
+			if id := spm.vocab.Encode(split); id >= 0 {
+				ids = append(ids, id)
+				continue
+			}
+
+			runes := []rune(split)
+			pq := queue.NewWith(func(a, b any) int {
+				priA := a.(*candidate)
+				priB := b.(*candidate)
+				if priA.score > priB.score || (priA.score == priB.score && priA.a < priB.a) {
+					return -1
+				}
+				return 1
+			})
+
+			// one piece per rune, linked to its neighbours through p and n
+			merges := make([]merge, len(runes))
+			for r := range runes {
+				merges[r] = merge{
+					p:     r - 1,
+					n:     r + 1,
+					runes: []rune{runes[r]},
+				}
+			}
+
+			log.Trace("tokenizer", "merges", merges)
+
+			// pairwise returns a candidate for merging pieces a and b, or nil
+			// when either index is out of range or the merged text is not a token
+			pairwise := func(a, b int) *candidate {
+				if a < 0 || b >= len(runes) {
+					return nil
+				}
+
+				left, right := string(merges[a].runes), string(merges[b].runes)
+				if id := spm.vocab.Encode(left + right); id >= 0 {
+					return &candidate{
+						a:     a,
+						b:     b,
+						score: spm.vocab.Scores[id],
+					}
+				}
+				return nil
+			}
+
+			for i := range len(runes) - 1 {
+				if pair := pairwise(i, i+1); pair != nil {
+					pq.Enqueue(pair)
+				}
+			}
+
+			pqv := pq.Values()
+			for _, v := range pqv {
+				e := v.(*candidate)
+				log.Trace("candidate", "candidate", e)
+			}
+
+			for !pq.Empty() {
+				v, _ := pq.Dequeue()
+				pair := v.(*candidate)
+				left, right := merges[pair.a], merges[pair.b]
+
+				log.Trace("pair", "left", left, "right", right)
+				// stale candidate: one side was emptied by an earlier merge
+				if len(left.runes) == 0 || len(right.runes) == 0 {
+					continue
+				}
+
+				// either side may have grown since the candidate was queued
+				if id := spm.vocab.Encode(string(left.runes) + string(right.runes)); id < 0 {
+					continue
+				}
+
+				merges[pair.a].runes = append(left.runes, right.runes...)
+				merges[pair.b].runes = nil
+				merges[pair.a].n = right.n
+				if right.n < len(merges) {
+					merges[right.n].p = pair.a
+				}
+
+				if pair := pairwise(merges[pair.a].p, pair.a); pair != nil {
+					pq.Enqueue(pair)
+				}
+
+				if pair := pairwise(pair.a, merges[pair.a].n); pair != nil {
+					pq.Enqueue(pair)
+				}
+			}
+
+			log.Trace("merges", "merges", merges)
+
+			for _, merge := range merges {
+				if len(merge.runes) > 0 {
+					if id := spm.vocab.Encode(string(merge.runes)); id >= 0 {
+						ids = append(ids, id)
+					} else {
+						log.Error("missing token", "token", string(merge.runes))
+					}
+				}
+			}
+		}
+	}
+
+	if addSpecial && len(ids) > 0 {
+		if spm.vocab.AddBOS {
+			// a duplicate is only warned about; the token is still added
+			if ids[0] == spm.vocab.BOS {
+				log.Warn("adding bos token to prompt which already has it", "id", spm.vocab.BOS)
+			}
+
+			log.Debug("adding bos token to prompt", "id", spm.vocab.BOS)
+			ids = append([]int32{spm.vocab.BOS}, ids...)
+		}
+
+		if spm.vocab.AddEOS {
+			if ids[len(ids)-1] == spm.vocab.EOS {
+				log.Warn("adding eos token to prompt which already has it", "id", spm.vocab.EOS)
+			}
+
+			log.Debug("adding eos token to prompt", "id", spm.vocab.EOS)
+			ids = append(ids, spm.vocab.EOS)
+		}
+	}
+
+	return ids, nil
+}
+
+// candidate is a possible merge of the pieces at indices a and b, carrying the
+// vocabulary score of the merged token.
+type candidate struct {
+	a, b  int
+	score float32
+}
+
+// Decode concatenates the vocabulary text of every id, turning each
+// spmWhitespaceSep back into a space.
+func (spm SentencePieceModel) Decode(ids []int32) (string, error) {
+	var sb strings.Builder
+	for _, id := range ids {
+		piece := strings.ReplaceAll(spm.vocab.Decode(id), spmWhitespaceSep, " ")
+		if _, err := sb.WriteString(piece); err != nil {
+			return "", err
+		}
+	}
+
+	text := sb.String()
+	log.Debug("decoded", "ids", ids, "text", text)
+	return text, nil
+}
--- a/model/process_text_spm_test.go
+++ b/model/process_text_spm_test.go
@@ -0,0 +1,118 @@
+package model
+
+import (
+	"log/slog"
+	"os"
+	"path/filepath"
+	"slices"
+	"testing"
+
+	"google.golang.org/protobuf/proto"
+
+	"github.com/ollama/ollama/convert/sentencepiece"
+)
+
+// loadSentencePieceVocab reads testdata/gemma2/tokenizer.model, converts its
+// pieces into a Vocabulary and returns a SentencePieceModel built from it.
+// Pieces whose type is not UNKNOWN, CONTROL, UNUSED or BYTE are recorded as
+// NORMAL. Any read or unmarshal failure aborts the test.
+func loadSentencePieceVocab(t *testing.T) SentencePieceModel {
+	t.Helper()
+
+	bts, err := os.ReadFile(filepath.Join("testdata", "gemma2", "tokenizer.model"))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	var spm sentencepiece.ModelProto
+	if err := proto.Unmarshal(bts, &spm); err != nil {
+		t.Fatal(err)
+	}
+
+	preTokenizer := `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`
+
+	var v Vocabulary
+
+	for _, piece := range spm.GetPieces() {
+		v.Values = append(v.Values, piece.GetPiece())
+		v.Scores = append(v.Scores, piece.GetScore())
+		// note: t shadows the *testing.T inside this switch
+		switch t := piece.GetType(); t {
+		case sentencepiece.ModelProto_SentencePiece_UNKNOWN,
+			sentencepiece.ModelProto_SentencePiece_CONTROL,
+			sentencepiece.ModelProto_SentencePiece_UNUSED,
+			sentencepiece.ModelProto_SentencePiece_BYTE:
+			v.Types = append(v.Types, uint32(t))
+		default:
+			tt := uint32(sentencepiece.ModelProto_SentencePiece_NORMAL)
+			// todo parse the special tokens file
+			//   - this will roundtrip correctly but the <start_of_turn> and
+			//     <end_of_turn> tokens aren't processed
+			v.Types = append(v.Types, tt)
+		}
+	}
+
+	return NewSentencePieceModel(preTokenizer, &v)
+}
+
+// TestSentencePieceEncode checks that Encode followed by Decode reproduces a
+// range of inputs, and that <bos> and <eos> each encode to a single id. It
+// replaces the default slog logger with a debug-level text handler on stdout.
+func TestSentencePieceEncode(t *testing.T) {
+	logger := slog.New(slog.NewTextHandler(os.Stdout, &slog.HandlerOptions{Level: slog.LevelDebug}))
+	slog.SetDefault(logger)
+
+	tokenizer := loadSentencePieceVocab(t)
+
+	// every case must survive an Encode/Decode round trip unchanged
+	t.Run("basic roundtrip", func(t *testing.T) {
+		t.Parallel()
+
+		cases := []string{
+			"hello",
+			"hello ",
+			"hello  ",
+			" hello",
+			" hello ",
+			" hello  ",
+			"hello world",
+			"请考试我的软件！12345",
+			"你好",
+			"Hello 你好 world!",
+			"Special characters: !@#$%^&*()_+-=[]{}|;':\",./<>?",
+			"Multilingual: 你好 こんにちは Привет Hola مرحبا",
+			"Numbers and symbols: 123456789 +- */",
+			"Special tokens: <bos> text <eos>",
+			"Code snippets: func main() { fmt.Println(\"Hello World\") }",
+			"Long text: " + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. " +
+				"Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. " +
+				"Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris.",
+		}
+
+		for _, want := range cases {
+			ids, err := tokenizer.Encode(want, true)
+			if err != nil {
+				t.Fatal(err)
+			}
+
+			if got, err := tokenizer.Decode(ids); err != nil {
+				t.Fatal(err)
+			} else if got != want {
+				t.Errorf("got %q, want %q [%#v]", got, want, ids)
+			}
+		}
+	})
+
+	// a special token on its own must encode to exactly its id
+	t.Run("special tokens", func(t *testing.T) {
+		type candidate struct {
+			token string
+			ids   []int32
+		}
+
+		cases := []candidate{
+			{"<bos>", []int32{2}},
+			{"<eos>", []int32{1}},
+		}
+
+		for _, want := range cases {
+			ids, err := tokenizer.Encode(want.token, true)
+			if err != nil {
+				t.Fatal(err)
+			}
+			if !slices.Equal(ids, want.ids) {
+				t.Errorf("got %#v, want %#v", ids, want.ids)
+			}
+		}
+	})
+}
--- a/model/testdata/gemma2/tokenizer.model
+++ b/model/testdata/gemma2/tokenizer.model
--- a/readline/readline.go
+++ b/readline/readline.go
@@ -116,19 +116,9 @@ func (i *Instance) Readline() (string, error) {

 			switch r {
 			case KeyUp:
-				if i.History.Pos > 0 {
-					if i.History.Pos == i.History.Size() {
-						currentLineBuf = []rune(buf.String())
-					}
-					buf.Replace([]rune(i.History.Prev()))
-				}
+				i.historyPrev(buf, &currentLineBuf)
 			case KeyDown:
-				if i.History.Pos < i.History.Size() {
-					buf.Replace([]rune(i.History.Next()))
-					if i.History.Pos == i.History.Size() {
-						buf.Replace(currentLineBuf)
-					}
-				}
+				i.historyNext(buf, &currentLineBuf)
 			case KeyLeft:
 				buf.MoveLeft()
 			case KeyRight:
@@ -185,6 +175,10 @@ func (i *Instance) Readline() (string, error) {
 			esc = true
 		case CharInterrupt:
 			return "", ErrInterrupt
+		case CharPrev:
+			i.historyPrev(buf, &currentLineBuf)
+		case CharNext:
+			i.historyNext(buf, &currentLineBuf)
 		case CharLineStart:
 			buf.MoveToStart()
 		case CharLineEnd:
@@ -246,6 +240,24 @@ func (i *Instance) HistoryDisable() {
 	i.History.Enabled = false
 }

+// historyPrev replaces buf with the previous history entry, if there is one.
+// When stepping back from the newest position the current contents of buf are
+// saved into currentLineBuf first.
+func (i *Instance) historyPrev(buf *Buffer, currentLineBuf *[]rune) {
+	if i.History.Pos <= 0 {
+		return
+	}
+
+	if i.History.Pos == i.History.Size() {
+		*currentLineBuf = []rune(buf.String())
+	}
+	buf.Replace([]rune(i.History.Prev()))
+}
+
+// historyNext replaces buf with the next history entry, if there is one. Once
+// the position has moved past the last entry, buf is restored from
+// currentLineBuf.
+func (i *Instance) historyNext(buf *Buffer, currentLineBuf *[]rune) {
+	if i.History.Pos >= i.History.Size() {
+		return
+	}
+
+	buf.Replace([]rune(i.History.Next()))
+	if i.History.Pos == i.History.Size() {
+		buf.Replace(*currentLineBuf)
+	}
+}
+
 func NewTerminal() (*Terminal, error) {
 	fd := os.Stdin.Fd()
 	termios, err := SetRawMode(fd)
--- a/runner/ollamarunner/runner.go
+++ b/runner/ollamarunner/runner.go
@@ -691,65 +691,6 @@ type EmbeddingResponse struct {
 	Embedding []float32 `json:"embedding"`
 }

-func (s *Server) embeddings(w http.ResponseWriter, r *http.Request) {
-	var req EmbeddingRequest
-	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
-		http.Error(w, fmt.Sprintf("bad request: %s", err), http.StatusBadRequest)
-		return
-	}
-
-	w.Header().Set("Content-Type", "application/json")
-
-	slog.Debug("embedding request", "content", req.Content)
-
-	seq, err := s.NewSequence(req.Content, nil, NewSequenceParams{embedding: true})
-	if err != nil {
-		http.Error(w, fmt.Sprintf("Failed to create new sequence: %v", err), http.StatusInternalServerError)
-		return
-	}
-
-	// Ensure there is a place to put the sequence, released when removed from s.seqs
-	if err := s.seqsSem.Acquire(r.Context(), 1); err != nil {
-		if errors.Is(err, context.Canceled) {
-			slog.Info("aborting embeddings request due to client closing the connection")
-		} else {
-			slog.Error("Failed to acquire semaphore", "error", err)
-		}
-		return
-	}
-
-	s.mu.Lock()
-	found := false
-	for i, sq := range s.seqs {
-		if sq == nil {
-			seq.cache, seq.inputs, err = s.cache.LoadCacheSlot(seq.inputs, req.CachePrompt)
-			if err != nil {
-				s.mu.Unlock()
-				http.Error(w, fmt.Sprintf("Failed to load cache: %v", err), http.StatusInternalServerError)
-				return
-			}
-			s.seqs[i] = seq
-			s.cond.Signal()
-			found = true
-			break
-		}
-	}
-	s.mu.Unlock()
-
-	if !found {
-		http.Error(w, "could not find an available sequence", http.StatusInternalServerError)
-		return
-	}
-
-	embedding := <-seq.embedding
-
-	if err := json.NewEncoder(w).Encode(&EmbeddingResponse{
-		Embedding: embedding,
-	}); err != nil {
-		http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
-	}
-}
-
 type HealthResponse struct {
 	Status   string  `json:"status"`
 	Progress float32 `json:"progress"`
@@ -927,9 +868,13 @@ func Execute(args []string) error {
 	defer listener.Close()

 	mux := http.NewServeMux()
-	mux.HandleFunc("/embedding", server.embeddings)
-	mux.HandleFunc("/completion", server.completion)
-	mux.HandleFunc("/health", server.health)
+	// TODO: support embeddings
+	mux.HandleFunc("POST /embedding", func(w http.ResponseWriter, r *http.Request) {
+		http.Error(w, "this model does not support embeddings", http.StatusNotImplemented)
+	})
+
+	mux.HandleFunc("POST /completion", server.completion)
+	mux.HandleFunc("GET /health", server.health)

 	httpServer := http.Server{
 		Handler: mux,
--- a/sample/samplers.go
+++ b/sample/samplers.go
@@ -84,14 +84,12 @@ func (s *Sampler) sample(tokens []token) (token, error) {
 		return greedy(tokens), nil
 	}

-	if s.topK > 0 {
-		tokens = topK(tokens, s.topK)
-	} else {
-		sortLogits(tokens)
-	}
+	// topK also sorts the tokens in descending order of logits
+	tokens = topK(tokens, s.topK)

 	tokens = temperature(tokens, s.temperature)
 	tokens = softmax(tokens)
+
 	tokens = topP(tokens, s.topP)
 	tokens = minP(tokens, s.minP)

--- a/sample/transforms.go
+++ b/sample/transforms.go
@@ -1,17 +1,58 @@
 package sample

 import (
+	"container/heap"
 	"math"
 	"slices"
 )

+// tokenHeap is a min-heap of tokens ordered by value. It implements
+// heap.Interface so that topK can track the k largest tokens seen so far.
+type tokenHeap []token
+
+func (h tokenHeap) Len() int { return len(h) }
+
+func (h tokenHeap) Less(i, j int) bool { return h[i].value < h[j].value }
+
+func (h tokenHeap) Swap(i, j int) { h[i], h[j] = h[j], h[i] }
+
+// Push appends x, which must be a token, to the end of the underlying slice.
+func (h *tokenHeap) Push(x any) {
+	*h = append(*h, x.(token))
+}
+
+// Pop removes and returns the last element of the underlying slice.
+func (h *tokenHeap) Pop() any {
+	s := *h
+	last := len(s) - 1
+	x := s[last]
+	*h = s[:last]
+	return x
+}
+
+// temperature divides every logit in ts by temp, in place, and returns ts.
+// temp is raised to at least 1e-7 first so that a zero or negative
+// temperature cannot produce a division by zero.
+func temperature(ts []token, temp float32) []token {
+	temp = max(temp, 1e-7)
+	for i := range ts {
+		ts[i].value /= temp
+	}
+	return ts
+}
+
+// softmax applies normalization to the logits
 func softmax(ts []token) []token {
+	// Find max logit for numerical stability
+	maxLogit := float32(math.Inf(-1))
+	for _, t := range ts {
+		if t.value > maxLogit {
+			maxLogit = t.value
+		}
+	}
+
+	// Compute exp(x - max)
 	var sum float32
 	for i, v := range ts {
-		ts[i].value = float32(math.Exp(float64(v.value)))
+		ts[i].value = float32(math.Exp(float64(v.value - maxLogit)))
 		sum += ts[i].value
 	}

+	// exp(x - max) / sum(exp(x - max))
 	for i := range ts {
 		ts[i].value /= sum
 	}
@@ -19,83 +60,42 @@ func softmax(ts []token) []token {
 	return ts
 }

-func temperature(ti []token, t float32) []token {
-	if t == 1 {
-		return ti
-	}
-
-	temp := max(t, 1e-7)
-	maxLogit := float32(math.Inf(-1))
-	for _, token := range ti {
-		if token.value > maxLogit {
-			maxLogit = token.value
-		}
-	}
-
-	// subtracting max logit to avoid under/overflow
-	for i := range ti {
-		ti[i].value = (ti[i].value - maxLogit) / temp
-	}
-
-	return ti
-}
-
-// siftDown maintains a min-heap property by recursively moving larger elements down the heap.
-//
-// The heap is represented as an array where for any node at index i:
-// - Left child is at index 2i + 1
-// - Right child is at index 2i + 2
-// - Parent is at index (i-1)/2
-//
-// The function compares a node with its children and:
-// 1. Finds the smallest value between the node and its children
-// 2. If the node is not the smallest, swaps it with its smallest child
-// 3. Continues this process down the affected path until the min-heap property is restored
-func siftDown(data []token, start, end int) {
-	root := start
-	for {
-		child := 2*root + 1
-		if child >= end {
-			break
-		}
-		// Find smaller child (we want min heap)
-		if child+1 < end && data[child+1].value < data[child].value {
-			child++
-		}
-		// Exit if root is already smaller than children
-		if data[root].value <= data[child].value {
-			break
-		}
-		// Swap with smaller child and continue
-		data[root], data[child] = data[child], data[root]
-		root = child
-	}
-}
-
 // topK limits the number of tokens considered to the k highest logits
+// and returns them sorted in descending order of value. When k <= 0 or
+// k >= len(ts), ts itself is sorted in place and returned in full; otherwise
+// a new slice of length k is returned.
 func topK(ts []token, k int) []token {
-	if k >= len(ts) {
+	if k >= len(ts) || k <= 0 {
+		slices.SortFunc(ts, func(a, b token) int {
+			switch {
+			case a.value < b.value:
+				return 1
+			case a.value > b.value:
+				return -1
+			default:
+				return 0
+			}
+		})
 		return ts
 	}
-	// Heapify + siftDown - O(nlog(k))
-	// Build min-heap of first k elements
-	heap := ts[:k]
-	for i := k/2 - 1; i >= 0; i-- {
-		siftDown(heap, i, k)
-	}

-	// Process remaining elements - if larger than heap root, replace root
+	// Initialize min-heap with first k elements
+	h := make(tokenHeap, k)
+	copy(h, ts[:k])
+	heap.Init(&h)
+
+	// Process remaining elements - if larger than heap root, replace root
 	for i := k; i < len(ts); i++ {
-		if ts[i].value > heap[0].value {
-			heap[0] = ts[i]
-			siftDown(heap, 0, k)
+		if ts[i].value > h[0].value {
+			// overwrite the smallest kept token and restore heap order in a
+			// single sift instead of a Pop followed by a Push
+			h[0] = ts[i]
+			heap.Fix(&h, 0)
 		}
 	}

-	slices.Reverse(heap)
+	// Convert heap to sorted slice in descending order: the smallest token
+	// is popped first and stored last
+	result := make([]token, len(h))
+	for i := k - 1; i >= 0; i-- {
+		result[i] = heap.Pop(&h).(token)
+	}

-	ts = heap
-	return ts
+	return result
 }

 // topP limits tokens to those with cumulative probability p
@@ -143,61 +143,3 @@ func minP(ts []token, p float32) []token {
 	ts = validTokens
 	return ts
 }
-
-// TODO(parthsareen): possibly replace with simpler implementation https://github.com/ollama/ollama/issues/9584
-// Conting sort implementation to sort tokens by logits
-func sortLogits(tokens []token) {
-	if len(tokens) <= 1 {
-		return
-	}
-
-	// Find max/min in a single pass
-	minLogit, maxLogit := tokens[0].value, tokens[0].value
-	for _, t := range tokens[1:] {
-		if t.value < minLogit {
-			minLogit = t.value
-		} else if t.value > maxLogit {
-			maxLogit = t.value
-		}
-	}
-
-	// Calculate scaling to map to uint32 range
-	logitRange := maxLogit - minLogit
-	if logitRange < 1e-6 {
-		return // All values effectively equal
-	}
-
-	// Count frequencies directly from tokens
-	const maxInt = (1 << 24) - 1 // Use 24 bits for good granularity
-	var counts [256]int          // For first byte
-
-	// First pass: count frequencies
-	for _, t := range tokens {
-		// Map to [0, maxInt] range
-		score := min(uint32((t.value-minLogit)*float32(maxInt)/logitRange), maxInt)
-		counts[score>>16]++
-	}
-
-	// Calculate offsets
-	var offset int
-	for i := range counts {
-		count := counts[i]
-		counts[i] = offset
-		offset += count
-	}
-
-	// Second pass: place elements in correct position
-	output := make([]token, len(tokens))
-	// Track current positions
-	countsCopy := counts
-
-	for i, t := range tokens {
-		score := min(uint32((t.value-minLogit)*float32(maxInt)/logitRange), maxInt)
-
-		pos := countsCopy[score>>16]
-		countsCopy[score>>16]++
-		output[len(tokens)-1-pos] = tokens[i]
-	}
-
-	copy(tokens, output)
-}
--- a/sample/transforms_test.go
+++ b/sample/transforms_test.go
@@ -6,86 +6,155 @@ import (
 	"testing"
 )

-// Helper to convert float64 slice to logit slice
-func toTokens(values []float64) []token {
+// Helper to convert float32 slice to logit slice
+func toTokens(values []float32) []token {
 	tokens := make([]token, len(values))
 	for i, v := range values {
 		tokens[i] = token{
 			id:    int32(i),
-			value: float32(v),
+			value: v,
 		}
 	}
 	return tokens
 }

 // Helper to compare logit slices
-func compareLogits(t *testing.T, name string, want []float64, got []token) {
+func compareLogits(t *testing.T, name string, want []float32, got []token) {
 	t.Helper()
 	if len(want) != len(got) {
 		t.Errorf("%s: length mismatch: want %d, got %d", name, len(want), len(got))
 		return
 	}
 	for i := range want {
-		if math.Abs(float64(got[i].value)-want[i]) > 1e-6 {
+		if math.Abs(float64(got[i].value-want[i])) > 1e-6 {
 			t.Errorf("%s: index %d: want %f, got %f", name, i, want[i], got[i].value)
 		}
 	}
 }

 func TestTemperature(t *testing.T) {
-	input := []float64{2, -1, 4, -3, 1, -2, 0}
-	want := []float64{-4, -10, 0, -14, -6, -12, -8} // (logit - max logit) / temp
-
+	input := []float32{1.0, 4.0, -2.0, 0.0}
 	got := temperature(toTokens(input), 0.5)
-	compareLogits(t, "Temperature", want, got)
+	want := []float32{2.0, 8.0, -4.0, 0.0}
+	compareLogits(t, "temperature(0.5)", want, got)
+
+	got = temperature(toTokens(input), 1.0)
+	want = []float32{1.0, 4.0, -2.0, 0.0}
+	compareLogits(t, "temperature(1)", want, got)
+
+	got = temperature(toTokens(input), 0.0)
+	want = []float32{1e7, 4e7, -2e7, 0.0}
+	compareLogits(t, "temperature(0)", want, got)
 }

 func TestSoftmax(t *testing.T) {
-	input := []float64{-3, -2, -1, 0, 1, 2, 4}
-	got := softmax(toTokens(input))
-
-	// Check probabilities sum to 1
-	var sum float32
-	for _, token := range got {
-		sum += token.value
-	}
-	if math.Abs(float64(sum)-1.0) > 1e-6 {
-		t.Errorf("probabilities don't sum to 1: got %f", sum)
+	tests := []struct {
+		name     string
+		input    []float32
+		expected []float32
+	}{
+		{
+			name:     "correctness softmax",
+			input:    []float32{1, -2, 3, 0},
+			expected: []float32{0.113550, 0.005653, 0.839024, 0.041773},
+		},
+		{
+			name:  "normal distribution",
+			input: []float32{0.026986899, 0.043722924, 0.036774673, 0.27755088, 0.0046718004, 0.08582123, 0.20409796, 0.00412893, 0.15720603, 0.045046154, 0.0030491839, 0.01681367},
+		},
+		{
+			name:  "single value",
+			input: []float32{1.0},
+		},
+		{
+			name:  "identical values",
+			input: []float32{0.9, 0.9, 0.9},
+		},
+		{
+			name:  "large values",
+			input: []float32{1000.0, 2000.0, 3000.0},
+		},
+		{
+			name:  "small values",
+			input: []float32{1e-6, 2e-6, 3e-6},
+		},
+		{
+			name:  "negative values",
+			input: []float32{-1.0, -2.0, -3.0},
+		},
+		{
+			name:  "mixed values",
+			input: []float32{-100.0, 0.0, 100.0},
+		},
 	}

-	// Check relative ordering is preserved
-	for i := 1; i < len(got); i++ {
-		if got[i].value < got[i-1].value {
-			t.Errorf("probability ordering not preserved at index %d", i)
-		}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got := softmax(toTokens(tt.input))
+
+			if tt.expected != nil {
+				compareLogits(t, tt.name, tt.expected, got)
+				return
+			}
+
+			// Check probabilities sum to 1
+			var sum float32
+			for _, token := range got {
+				sum += token.value
+				if token.value < 0 || token.value > 1 {
+					t.Errorf("probability out of range [0,1]: got %f", token.value)
+				}
+			}
+			if math.Abs(float64(sum-1.0)) > 1e-6 {
+				t.Errorf("probabilities don't sum to 1: got %f", sum)
+			}
+		})
 	}
 }

 func TestTopK(t *testing.T) {
-	input := []float64{-3, -2, -1, 0, 1, 2, 4}
+	input := []float32{0.026986899, 0.043722924, 0.036774673, 0.27755088, 0.0046718004, 0.08582123, 0.20409796, 0.00412893, 0.15720603, 0.045046154, 0.0030491839, 0.01681367}

-	// Test k=3
-	got := topK(toTokens(input), 3)
-	if len(got) != 3 {
-		t.Errorf("topK(3): wrong length: want 3, got %d", len(got))
+	// Test k=5
+	got := topK(toTokens(input), 5)
+	if len(got) != 5 {
+		t.Errorf("topK(5): wrong length: want 5, got %d", len(got))
 	}
-	// Should keep highest 3 values: 4, 2, 1
-	want := []float64{4, 2, 1}
+	// Should keep the highest 5 values in descending order
+	want := []float32{0.27755088, 0.20409796, 0.15720603, 0.08582123, 0.045046154}
-	compareLogits(t, "topK(3)", want, got)
+	compareLogits(t, "topK(5)", want, got)

-	// Test k > len
-	got = topK(toTokens(input), 10)
-	compareLogits(t, "topK(10)", input, got)
+	got = topK(toTokens(input), 20)
+	if len(got) != len(input) {
+		t.Errorf("topK(20): wrong length: want %d, got %d", len(input), len(got))
+	}
+
+	// Test k=-1: every token is kept, sorted in descending order
+	input = []float32{0.026986899, 0.043722924, 0.036774673, 0.27755088, 0.0046718004, 0.08582123, 0.20409796, 0.00412893, 0.15720603, 0.045046154, 0.0030491839, 0.01681367}
+	want = []float32{0.27755088, 0.20409796, 0.15720603, 0.08582123, 0.045046154, 0.043722924, 0.036774673, 0.026986899, 0.01681367, 0.0046718004, 0.00412893, 0.0030491839}
+	got = topK(toTokens(input), -1)
+	if len(got) != len(input) {
+		t.Errorf("topK(-1): wrong length: want %d, got %d", len(input), len(got))
+	}
+	compareLogits(t, "topK(-1)", want, got)
+
+	// Test k=0: every token is kept, sorted in descending order
+	input = []float32{0.026986899, 0.043722924, 0.036774673, 0.27755088, 0.0046718004, 0.08582123, 0.20409796, 0.00412893, 0.15720603, 0.045046154, 0.0030491839, 0.01681367}
+	want = []float32{0.27755088, 0.20409796, 0.15720603, 0.08582123, 0.045046154, 0.043722924, 0.036774673, 0.026986899, 0.01681367, 0.0046718004, 0.00412893, 0.0030491839}
+	got = topK(toTokens(input), 0)
+	if len(got) != len(input) {
+		t.Errorf("topK(0): wrong length: want %d, got %d", len(input), len(got))
+	}
+	compareLogits(t, "topK(0)", want, got)
 }

 func TestTopP(t *testing.T) {
-	input := []float64{-3, -2, -1, 0, 1, 2, 4}
+	input := []float32{-3, -2, -1, 0, 1, 2, 4}
 	tokens := toTokens(input)

 	// First apply temperature and softmax to get probabilities
-	tokens = temperature(tokens, 1)
 	tokens = softmax(tokens)
-	sortLogits(tokens)
+	tokens = topK(tokens, 20)

 	// Then apply topP
 	got := topP(tokens, 0.95)
@@ -98,11 +167,10 @@ func TestTopP(t *testing.T) {
 }

 func TestMinP(t *testing.T) {
-	input := []float64{-3, -2, -1, 0, 1, 2, 4, 3}
+	input := []float32{-3, -2, -1, 0, 1, 2, 4, 3}
 	tokens := toTokens(input)

 	// First apply temperature and softmax
-	tokens = temperature(tokens, 1)
 	tokens = softmax(tokens)

 	// Then apply minP
@@ -115,10 +183,10 @@ func TestMinP(t *testing.T) {
 }

 func TestSortLogits(t *testing.T) {
-	input := []float64{3, 1, 4, 2, -1, 0, -2}
+	input := []float32{0.026986899, 0.043722924, 0.036774673, 0.27755088, 0.0046718004, 0.08582123, 0.20409796, 0.00412893, 0.15720603, 0.045046154, 0.0030491839, 0.01681367}
 	tokens := toTokens(input)

-	sortLogits(tokens)
+	tokens = topK(tokens, 20)

 	for i := 1; i < len(tokens); i++ {
 		if tokens[i].value > tokens[i-1].value {
@@ -127,7 +195,7 @@ func TestSortLogits(t *testing.T) {
 		}
 	}

-	want := []float64{4, 3, 2, 1, 0, -1, -2}
+	want := []float32{0.27755088, 0.20409796, 0.15720603, 0.08582123, 0.045046154, 0.043722924, 0.036774673, 0.026986899, 0.01681367, 0.0046718004, 0.00412893, 0.0030491839}
 	compareLogits(t, "sortLogits", want, tokens)
 }

@@ -151,6 +219,14 @@ func BenchmarkTransforms(b *testing.B) {
 		}
 	})

+	b.Run("Softmax", func(b *testing.B) {
+		b.ResetTimer()
+		for b.Loop() {
+			copy(tokensCopy, tokens)
+			softmax(tokensCopy)
+		}
+	})
+
 	b.Run("TopK", func(b *testing.B) {
 		b.ResetTimer()
 		for b.Loop() {
@@ -179,7 +255,7 @@ func BenchmarkTransforms(b *testing.B) {
 		b.ResetTimer()
 		for b.Loop() {
 			copy(tokensCopy, tokens)
-			sortLogits(tokensCopy)
+			topK(tokensCopy, 200000)
 		}
 	})
 }
--- a/scripts/build_windows.ps1
+++ b/scripts/build_windows.ps1
@@ -80,13 +80,14 @@ function checkEnv() {


 function buildOllama() {
+    mkdir -Force -path "${script:DIST_DIR}\"
    if ($script:ARCH -ne "arm64") {
        Remove-Item -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}"
        New-Item "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" -ItemType Directory -ea 0

        & cmake --fresh --preset CPU --install-prefix $script:DIST_DIR
        if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-        & cmake --build --preset CPU --parallel $script:JOBS
+        & cmake --build --preset CPU  --config Release --parallel $script:JOBS
        if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
        & cmake --install build --component CPU --strip
        if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
@@ -101,7 +102,7 @@ function buildOllama() {
            # to avoid 2022 (or newer) from being used as the default
            & cmake --fresh --preset "CUDA 11" -G "Visual Studio 16 2019" --install-prefix $script:DIST_DIR
            if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-            & cmake --build --preset "CUDA 11" --parallel $script:JOBS
+            & cmake --build --preset "CUDA 11"  --config Release --parallel $script:JOBS
            if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
            & cmake --install build --component "CUDA" --strip
            if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
@@ -112,7 +113,7 @@ function buildOllama() {
            write-host "Building CUDA v12 backend libraries"
            & cmake --fresh --preset "CUDA 12" --install-prefix $script:DIST_DIR
            if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-            & cmake --build --preset "CUDA 12" --parallel $script:JOBS
+            & cmake --build --preset "CUDA 12"  --config Release --parallel $script:JOBS
            if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
            & cmake --install build --component "CUDA" --strip
            if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
@@ -131,7 +132,7 @@ function buildOllama() {
            $env:HIPCXX=""
            $env:HIP_PLATFORM=""
            $env:CMAKE_PREFIX_PATH=""
-            & cmake --build --preset "ROCm" --parallel $script:JOBS
+            & cmake --build --preset "ROCm"  --config Release --parallel $script:JOBS
            if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
            & cmake --install build --component "HIP" --strip
            if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
--- a/server/prompt.go
+++ b/server/prompt.go
@@ -26,6 +26,7 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
 	var system []api.Message

 	isMllama := checkMllamaModelFamily(m)
+	isGemma3 := checkGemma3ModelFamily(m)

 	var imageNumTokens int
 	// TODO: Ideally we would compute this from the projector metadata but some pieces are implementation dependent
@@ -40,7 +41,7 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
 	n := len(msgs) - 1
 	// in reverse, find all messages that fit into context window
 	for i := n; i >= 0; i-- {
-		if isMllama && len(msgs[i].Images) > 1 {
+		if (isMllama || isGemma3) && len(msgs[i].Images) > 1 {
 			return "", nil, errTooManyImages
 		}

@@ -157,3 +158,12 @@ func checkMllamaModelFamily(m *Model) bool {
 	}
 	return false
 }
+
+func checkGemma3ModelFamily(m *Model) bool {
+	for _, arch := range m.Config.ModelFamilies {
+		if arch == "gemma3" {
+			return true
+		}
+	}
+	return false
+}
--- a/server/routes.go
+++ b/server/routes.go
@@ -435,7 +435,7 @@ func (s *Server) EmbedHandler(c *gin.Context) {
 		return
 	}

-	kvData, err := getKVData(m.ModelPath, false)
+	kvData, _, err := getModelData(m.ModelPath, false)
 	if err != nil {
 		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		return
@@ -483,8 +483,7 @@ func (s *Server) EmbedHandler(c *gin.Context) {
 	}

 	if err := g.Wait(); err != nil {
-		slog.Error("embedding generation failed", "error", err)
-		c.JSON(http.StatusInternalServerError, gin.H{"error": fmt.Errorf("failed to generate embeddings: %v", err)})
+		c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": strings.TrimSpace(err.Error())})
 		return
 	}

@@ -545,8 +544,7 @@ func (s *Server) EmbeddingsHandler(c *gin.Context) {

 	embedding, err := r.Embedding(c.Request.Context(), req.Prompt)
 	if err != nil {
-		slog.Info(fmt.Sprintf("embedding generation failed: %v", err))
-		c.JSON(http.StatusInternalServerError, gin.H{"error": fmt.Errorf("failed to generate embedding: %v", err)})
+		c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": strings.TrimSpace(err.Error())})
 		return
 	}

@@ -850,16 +848,23 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
 	fmt.Fprint(&sb, m.String())
 	resp.Modelfile = sb.String()

-	kvData, err := getKVData(m.ModelPath, req.Verbose)
+	kvData, tensors, err := getModelData(m.ModelPath, req.Verbose)
 	if err != nil {
 		return nil, err
 	}
+
 	delete(kvData, "general.name")
 	delete(kvData, "tokenizer.chat_template")
 	resp.ModelInfo = kvData

+	tensorData := make([]api.Tensor, len(tensors.Items()))
+	for cnt, t := range tensors.Items() {
+		tensorData[cnt] = api.Tensor{Name: t.Name, Type: t.Type(), Shape: t.Shape}
+	}
+	resp.Tensors = tensorData
+
 	if len(m.ProjectorPaths) > 0 {
-		projectorData, err := getKVData(m.ProjectorPaths[0], req.Verbose)
+		projectorData, _, err := getModelData(m.ProjectorPaths[0], req.Verbose)
 		if err != nil {
 			return nil, err
 		}
@@ -869,17 +874,17 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
 	return resp, nil
 }

-func getKVData(digest string, verbose bool) (ggml.KV, error) {
+func getModelData(digest string, verbose bool) (ggml.KV, ggml.Tensors, error) {
 	maxArraySize := 0
 	if verbose {
 		maxArraySize = -1
 	}
-	kvData, err := llm.LoadModel(digest, maxArraySize)
+	data, err := llm.LoadModel(digest, maxArraySize)
 	if err != nil {
-		return nil, err
+		return nil, ggml.Tensors{}, err
 	}

-	kv := kvData.KV()
+	kv := data.KV()

 	if !verbose {
 		for k := range kv {
@@ -889,7 +894,7 @@ func getKVData(digest string, verbose bool) (ggml.KV, error) {
 		}
 	}

-	return kv, nil
+	return kv, data.Tensors(), nil
 }

 func (s *Server) ListHandler(c *gin.Context) {
Author	SHA1	Message	Date
Patrick Devine	73a1e99f8a	logging: add a new customer logger and trace method This change addresses over logging with debug in the SPM tokenizer by adding a trace level to slog.	2025-03-13 16:10:59 -07:00
Michael Yang	543240fb5f	Merge pull request #9741 from ollama/mxyng/visionless fix: error if image requested without vision model	2025-03-13 15:03:25 -07:00
Patrick Devine	4bed739259	add verbose mode to the show command (#9640 ) Add metadata and tensor information to the show command to be able to see more information about a model. This outputs the same data as shown on the model details page on ollama.com	2025-03-13 14:24:27 -07:00
Patrick Devine	80c7ce381b	fix: change default context size for gemma3 (#9744 )	2025-03-13 13:59:19 -07:00
Michael Yang	ccfd41c4f0	Merge pull request #9742 from ollama/mxyng/engine-error-embeddings fix: error on models that don't support embeddings	2025-03-13 13:12:33 -07:00
Michael Yang	3e102b7dad	Update model/model.go Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>	2025-03-13 13:11:52 -07:00
Michael Yang	ec46f3286c	engine: error on embeddings; not currently implemented	2025-03-13 11:40:55 -07:00
Michael Yang	5e2e0b46b1	fix: error if image requested without vision model	2025-03-13 10:52:09 -07:00
Michael Yang	45a13b1dec	Merge pull request #9688 from Shane-XB-Qian/debug_mistype_lld ollama-debug.c: correct mistype	2025-03-13 10:12:44 -07:00
Parth Sareen	5c0b663969	sample: separate softmax and temperature transforms (#9732 )	2025-03-13 09:53:27 -07:00
shane.xb.qian	30d7a59ba8	ollama-debug.c: change 'ld' to 'PRIi64' * macOS has different definition per info from @mxyng	2025-03-13 17:10:37 +08:00
ParthSareen	4aeb67ef4c	sample: do all sorting in topK	2025-03-12 11:59:17 -07:00
ParthSareen	3ba91634c1	sample: simplify top_k=0 sorting	2025-03-12 11:59:17 -07:00
ParthSareen	1b7433b71e	sample: use container/heap for top_k	2025-03-12 11:59:17 -07:00
Bruce MacDonald	a70820daa0	models/gemma3: remove final logit softcap (#9692 ) Softcap isn't in the whitepaper/implementation for the language model so we should remove it. There is no discernible difference in output with it removed.	2025-03-12 10:17:57 -07:00
Shane-XB-Qian	6b45b1d6b4	cli: adding support ctrl-n/p like general cli (#9136 ) Signed-off-by: shane.xb.qian <shane.qian@foxmail.com>	2025-03-12 08:51:56 -07:00
shane.xb.qian	85ab552028	ollama-debug.c: correct mistype Signed-off-by: shane.xb.qian <shane.qian@foxmail.com>	2025-03-12 22:32:30 +08:00
frob	b3af953a55	cli: don't exit for invalid model during /load. (#9576 ) Co-authored-by: Richard Lyons <frob@cloudstaff.com>	2025-03-11 23:42:53 -07:00
Michael	ad4e0bf3be	Adding Gemma 3 to readme (#9671 )	2025-03-12 07:39:25 +01:00
Michael Yang	aee28501b5	Merge pull request #9661 from ollama/gemma engine: add gemma support	2025-03-11 15:07:50 -07:00
jmorganca	83f0ec8269	all: address linter errors	2025-03-11 14:49:20 -07:00
jmorganca	c6b6938b3a	kvcache: fix tests by adding AvgPool2D stub	2025-03-11 14:49:20 -07:00
jmorganca	fb4664fcec	model: add more spm tokenizer tests	2025-03-11 14:49:20 -07:00
jmorganca	20e3593863	model: validate left and right pairs before merging them	2025-03-11 14:49:20 -07:00
Michael Yang	63a394068c	use 2d pooling	2025-03-11 14:49:20 -07:00
Daniel Hiltgen	ab39e08eb9	llm: auto detect models that require Ollama Engine (#1 )	2025-03-11 14:49:20 -07:00
jmorganca	11bfa62796	add trailing \n\n after <end_of_image> to match reference implementation	2025-03-11 14:49:20 -07:00
jmorganca	f63e62e546	reduce kernel size, add TODO for loading from config	2025-03-11 14:49:20 -07:00
jmorganca	65b0f329d1	Revert "Allow models to force a new batch" This reverts commit c7eae586b899083acebcd9b3847b89ea78c2850c.	2025-03-11 14:49:20 -07:00
Jesse Gross	06007c0a18	Allow models to force a new batch This is useful for a few things: - Work around bugs, such as having 2 images in one batch - Keep the image in a single batch for fully connected attention - Improve performance by not evaluating embeddings multiple times	2025-03-11 14:49:20 -07:00
Jesse Gross	a8e83a7654	Disable causal attention based on batch index Currently we are using positions, which are relative to a sequence and may not be unique.	2025-03-11 14:49:20 -07:00
Jesse Gross	475005504e	Restrict Gemma to a single image per request	2025-03-11 14:49:20 -07:00
Jesse Gross	2c40c4d35e	Fix follow up images and images split across batches	2025-03-11 14:49:19 -07:00
Michael Yang	e95278932b	use non-causal mask only for image positions	2025-03-11 14:49:19 -07:00
Michael Yang	9d2a20a763	use non-causal mask for inputs with images	2025-03-11 14:49:19 -07:00
Patrick Devine	2e54d72fc3	fix gemma3 1b conversion	2025-03-11 14:49:19 -07:00
Michael Yang	6b32a2d549	compat with upstream gguf	2025-03-11 14:49:19 -07:00
Michael Yang	c5cbe4fc2a	fallback to cpu	2025-03-11 14:49:19 -07:00
Michael Yang	f888912870	fix vision encoder	2025-03-11 14:49:19 -07:00
Michael Yang	9e4642e9b3	ollama debug tensor	2025-03-11 14:49:19 -07:00
Michael Yang	6b0486c216	duplicate token_embd to output	2025-03-11 14:49:19 -07:00
Michael Yang	d368c039f0	skip repacking vision tensors	2025-03-11 14:49:19 -07:00
Patrick Devine	9b54267e69	fix configs	2025-03-11 14:49:19 -07:00
Michael Yang	46bb0169c4	update model	2025-03-11 14:49:19 -07:00
Michael Yang	8934324b72	use fast attention	2025-03-11 14:49:18 -07:00
Jesse Gross	0e886595bf	Fix tests and drift from main	2025-03-11 14:49:18 -07:00
Patrick Devine	c62861f4fa	fix conversion	2025-03-11 14:49:18 -07:00
Michael Yang	0df1800436	set non-causal attention	2025-03-11 14:49:18 -07:00
Patrick Devine	631fecc6d9	temporary work around for converting spm	2025-03-11 14:49:18 -07:00
Jesse Gross	4346c2409d	fix drift from main	2025-03-11 14:49:18 -07:00
Michael Yang	4b037a97dc	add gemma vision encoder	2025-03-11 14:49:17 -07:00
Patrick Devine	5f74d1fd47	gemma2 impl	2025-03-11 14:35:08 -07:00
Daniel Hiltgen	4dcf80167a	Build release for windows with local script (#9636 )	2025-03-11 08:34:20 -07:00
Michael Yang	26a26998fb	Merge pull request #9590 from ollama/mxyng/dump-pad fix: pad tensor item if ge zero	2025-03-10 16:34:55 -07:00
Michael Yang	9926eae015	fix: pad tensor item if ge zero this produces a nicer output since both positive and negative values produces the same width	2025-03-10 16:18:12 -07:00
Vincent Koc	8585b7b151	docs: add opik to observability integrations (#9626 )	2025-03-10 16:15:10 -07:00
Parth Sareen	7e34f4fbfa	sample: add numerical stability to temperature/softmax transform (#9631 )	2025-03-10 14:43:53 -07:00
Michael Yang	fe776293f7	Merge pull request #9569 from dwt/patch-1 Better WantedBy declaration	2025-03-10 14:09:37 -07:00
‮rekcäH nitraM‮	25248f4bd5	Better WantedBy declaration The problem with default.target is that it always points to the target that is currently started. So if you boot into single user mode or the rescue mode still Ollama tries to start. I noticed this because either tried (and failed) to start all the time during a system update, where Ollama definitely is not wanted.	2025-03-07 10:26:31 +01:00