tmp

next bert
2026-01-22 22:40:07 -05:00 · 2025-02-11 22:35:00 -08:00 · 2025-02-11 22:34:09 -08:00 · 2025-02-11 16:06:55 -08:00 · 2025-02-11 15:46:25 -08:00 · 2025-02-11 15:46:25 -08:00
68 changed files with 857 additions and 3879 deletions
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -329,9 +329,7 @@ jobs:
          done
        working-directory: dist/${{ matrix.os }}-${{ matrix.arch }}
      - run: |
-          for ARCHIVE in dist/${{ matrix.os }}-${{ matrix.arch }}/*.tar.in; do
-            tar c -C dist/${{ matrix.os }}-${{ matrix.arch }} -T $ARCHIVE --owner 0 --group 0 | pigz -9vc >$(basename ${ARCHIVE//.*/}.tgz);
-          done
+          for ARCHIVE in dist/${{ matrix.os }}-${{ matrix.arch }}/*.tar.in; do tar c -C dist/${{ matrix.os }}-${{ matrix.arch }} -T $ARCHIVE | pigz -9vc >$(basename ${ARCHIVE//.*/}.tgz); done
      - uses: actions/upload-artifact@v4
        with:
          name: dist-${{ matrix.os }}-${{ matrix.arch }}-${{ matrix.target }}
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -104,10 +104,6 @@ if(CMAKE_HIP_COMPILER)
    if(AMDGPU_TARGETS)
        add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-hip)

-        if (WIN32)
-            target_compile_definitions(ggml-hip PRIVATE GGML_CUDA_NO_PEER_COPY=1)
-        endif()
-
        set(OLLAMA_HIP_INSTALL_DIR ${OLLAMA_INSTALL_DIR}/rocm)
        install(TARGETS ggml-hip
            RUNTIME_DEPENDENCIES
--- a/README.md
+++ b/README.md
@@ -380,8 +380,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Chipper](https://github.com/TilmanGriesel/chipper) AI interface for tinkerers (Ollama, Haystack RAG, Python)
 - [ChibiChat](https://github.com/CosmicEventHorizon/ChibiChat) (Kotlin-based Android app to chat with Ollama and Koboldcpp API endpoints)
 - [LocalLLM](https://github.com/qusaismael/localllm) (Minimal Web-App to run ollama models on it with a GUI)
- [Ollamazing](https://github.com/buiducnhat/ollamazing) (Web extension to run Ollama models)
- [OpenDeepResearcher-via-searxng](https://github.com/benhaotang/OpenDeepResearcher-via-searxng) (A Deep Research equivent endpoint with Ollama support for running locally)

 ### Cloud

@@ -439,10 +437,9 @@ See the [API documentation](./docs/api.md) for all endpoints.

 - [Pacman](https://archlinux.org/packages/extra/x86_64/ollama/)
 - [Gentoo](https://github.com/gentoo/guru/tree/master/app-misc/ollama)
- [Homebrew](https://formulae.brew.sh/formula/ollama)
 - [Helm Chart](https://artifacthub.io/packages/helm/ollama-helm/ollama)
 - [Guix channel](https://codeberg.org/tusharhero/ollama-guix)
- [Nix package](https://search.nixos.org/packages?show=ollama&from=0&size=50&sort=relevance&type=packages&query=ollama)
+- [Nix package](https://search.nixos.org/packages?channel=24.05&show=ollama&from=0&size=50&sort=relevance&type=packages&query=ollama)
 - [Flox](https://flox.dev/blog/ollama-part-one)

 ### Libraries
@@ -497,7 +494,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [multi-llm-ts](https://github.com/nbonamy/multi-llm-ts) (A Typescript/JavaScript library allowing access to different LLM in unified API)
 - [LlmTornado](https://github.com/lofcz/llmtornado) (C# library providing a unified interface for major FOSS & Commercial inference APIs)
 - [Ollama for Zig](https://github.com/dravenk/ollama-zig)
- [Abso](https://github.com/lunary-ai/abso) (OpenAI-compatible TypeScript SDK for any LLM provider)
+- [Abso](https://github.com/lunary-ai/abso/blob/main/README.md#ollama) (OpenAI-compatible TypeScript SDK for any LLM provider)

 ### Mobile

@@ -549,7 +546,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Alfred Ollama](https://github.com/zeitlings/alfred-ollama) (Alfred Workflow)
 - [TextLLaMA](https://github.com/adarshM84/TextLLaMA) A Chrome Extension that helps you write emails, correct grammar, and translate into any language
 - [Simple-Discord-AI](https://github.com/zyphixor/simple-discord-ai)
- [LLM Telegram Bot](https://github.com/innightwolfsleep/llm_telegram_bot) (telegram bot, primary for RP. Oobabooga-like buttons, [A1111](https://github.com/AUTOMATIC1111/stable-diffusion-webui) API integration e.t.c)

 ### Supported backends

--- a/cache/cache.go
+++ b/cache/cache.go
@@ -0,0 +1,63 @@
+package cache
+
+import (
+	"github.com/ollama/ollama/ml"
+)
+
+type Options struct {
+	Position int
+}
+
+type Cache interface {
+	Sub(i int) Cache
+	Put(ctx ml.Context, key, value ml.Tensor, opts Options) (ml.Tensor, ml.Tensor)
+}
+
+type Simple struct {
+	DType    ml.DType
+	Capacity int
+
+	keys, values []ml.Tensor
+}
+
+func (c *Simple) Sub(i int) Cache {
+	if i >= len(c.keys) {
+		c.keys = append(c.keys, make([]ml.Tensor, i-len(c.keys)+1)...)
+		c.values = append(c.values, make([]ml.Tensor, i-len(c.values)+1)...)
+	}
+
+	return &Simple{
+		keys:     c.keys[i : i+1],
+		values:   c.values[i : i+1],
+		Capacity: c.Capacity,
+		DType:    c.DType,
+	}
+}
+
+func (c *Simple) Put(ctx ml.Context, key, value ml.Tensor, opts Options) (ml.Tensor, ml.Tensor) {
+	if c.keys[0] == nil || c.values[0] == nil {
+		c.keys[0] = ctx.Zeros(c.DType, int(key.Dim(0)*key.Dim(1))*c.Capacity)
+		c.values[0] = ctx.Zeros(c.DType, int(value.Dim(0)*value.Dim(1))*c.Capacity)
+	}
+
+	ctx.Forward(key.Copy(ctx, c.keys[0].View(ctx, int(key.Stride(2))*opts.Position, int(key.Dim(0)*key.Dim(1)*key.Dim(2)))))
+	ctx.Forward(value.Copy(ctx, c.values[0].View(ctx, int(value.Stride(2))*opts.Position, int(value.Dim(0)*value.Dim(1)*value.Dim(2)))))
+
+	n := min(c.Capacity, int(key.Dim(2))+opts.Position)
+
+	key = c.keys[0].View(ctx, 0,
+		int(key.Dim(0)), int(key.Stride(1)),
+		int(key.Dim(1)), int(key.Stride(2)),
+		n,
+	)
+
+	value = c.values[0].View(ctx, 0,
+		int(value.Dim(0)), int(value.Stride(1)),
+		int(value.Dim(1)), int(value.Stride(2)),
+		n,
+	)
+
+	// TODO shift context if necessary
+
+	return key, value
+}
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -35,9 +35,9 @@ import (
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/llama"
+	"github.com/ollama/ollama/llama/runner"
 	"github.com/ollama/ollama/parser"
 	"github.com/ollama/ollama/progress"
-	"github.com/ollama/ollama/runner"
 	"github.com/ollama/ollama/server"
 	"github.com/ollama/ollama/types/model"
 	"github.com/ollama/ollama/version"
@@ -338,10 +338,7 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 		return err
 	}

-	// TODO(jessegross): We should either find another way to know if this is
-	// a vision model or remove the logic. Also consider that other modalities will
-	// need different behavior anyways.
-	opts.MultiModal = len(info.ProjectorInfo) != 0 || envconfig.NewEngine()
+	opts.MultiModal = len(info.ProjectorInfo) != 0
 	opts.ParentModel = info.Details.ParentModel

 	if interactive {
--- a/cmd/runner/main.go
+++ b/cmd/runner/main.go
@@ -4,7 +4,7 @@ import (
 	"fmt"
 	"os"

-	"github.com/ollama/ollama/runner"
+	"github.com/ollama/ollama/llama/runner"
 )

 func main() {
--- a/discover/path.go
+++ b/discover/path.go
@@ -19,8 +19,9 @@ var LibOllamaPath string = func() string {
 		return ""
 	}

-	if eval, err := filepath.EvalSymlinks(exe); err == nil {
-		exe = eval
+	exe, err = filepath.EvalSymlinks(exe)
+	if err != nil {
+		return ""
 	}

 	var libPath string
--- a/docs/gpu.md
+++ b/docs/gpu.md
@@ -7,7 +7,7 @@ Check your compute compatibility to see if your card is supported:

 | Compute Capability | Family              | Cards                                                                                                       |
 | ------------------ | ------------------- | ----------------------------------------------------------------------------------------------------------- |
-| 9.0                | NVIDIA              | `H200` `H100`                                                                                               |
+| 9.0                | NVIDIA              | `H100`                                                                                                      |
 | 8.9                | GeForce RTX 40xx    | `RTX 4090` `RTX 4080 SUPER` `RTX 4080` `RTX 4070 Ti SUPER` `RTX 4070 Ti` `RTX 4070 SUPER` `RTX 4070` `RTX 4060 Ti` `RTX 4060`  |
 |                    | NVIDIA Professional | `L4` `L40` `RTX 6000`                                                                                       |
 | 8.6                | GeForce RTX 30xx    | `RTX 3090 Ti` `RTX 3090` `RTX 3080 Ti` `RTX 3080` `RTX 3070 Ti` `RTX 3070` `RTX 3060 Ti` `RTX 3060` `RTX 3050 Ti` `RTX 3050`   |
--- a/docs/windows.md
+++ b/docs/windows.md
@@ -55,7 +55,7 @@ Here's a quick example showing API access from `powershell`
 ## Troubleshooting

 Ollama on Windows stores files in a few different locations.  You can view them in
-the explorer window by hitting `<Ctrl>+R` and type in:
+the explorer window by hitting `<cmd>+R` and type in:
 - `explorer %LOCALAPPDATA%\Ollama` contains logs, and downloaded updates
    - *app.log* contains most resent logs from the GUI application
    - *server.log* contains the most recent server logs
--- a/envconfig/config.go
+++ b/envconfig/config.go
@@ -165,8 +165,6 @@ var (
 	IntelGPU = Bool("OLLAMA_INTEL_GPU")
 	// MultiUserCache optimizes prompt caching for multi-user scenarios
 	MultiUserCache = Bool("OLLAMA_MULTIUSER_CACHE")
-	// Enable the new Ollama engine
-	NewEngine = Bool("OLLAMA_NEW_ENGINE")
 )

 func String(s string) func() string {
@@ -252,7 +250,6 @@ func AsMap() map[string]EnvVar {
 		"OLLAMA_ORIGINS":           {"OLLAMA_ORIGINS", Origins(), "A comma separated list of allowed origins"},
 		"OLLAMA_SCHED_SPREAD":      {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
 		"OLLAMA_MULTIUSER_CACHE":   {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"},
-		"OLLAMA_NEW_ENGINE":        {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"},

 		// Informational
 		"HTTP_PROXY":  {"HTTP_PROXY", String("HTTP_PROXY")(), "HTTP proxy"},
--- a/format/format_test.go
+++ b/format/format_test.go
@@ -12,9 +12,6 @@ func TestHumanNumber(t *testing.T) {

 	testCases := []testCase{
 		{0, "0"},
-		{999, "999"},
-		{1000, "1K"},
-		{1001, "1K"},
 		{1000000, "1M"},
 		{125000000, "125M"},
 		{500500000, "500.50M"},
--- a/fs/ggml/ggml.go
+++ b/fs/ggml/ggml.go
@@ -120,15 +120,6 @@ func (kv KV) Uints(key string, defaultValue ...[]uint32) []uint32 {
 	return s
 }

-func (kv KV) Floats(key string, defaultValue ...[]float32) []float32 {
-	r := keyValue(kv, key, &array{})
-	s := make([]float32, r.size)
-	for i := range r.size {
-		s[i] = float32(r.values[i].(float32))
-	}
-	return s
-}
-
 func keyValue[T string | uint32 | uint64 | float32 | *array](kv KV, key string, defaultValue ...T) T {
 	if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
 		key = kv.Architecture() + "." + key
@@ -162,17 +153,19 @@ func (s Tensors) Items(prefix ...string) []*Tensor {
 	return items
 }

-func (ts Tensors) GroupLayers() map[string]Layer {
+func (ts Tensors) Layers() map[string]Layer {
 	layers := make(map[string]Layer)
 	for _, t := range ts.items {
 		parts := strings.Split(t.Name, ".")
-		if index := slices.IndexFunc(parts, func(s string) bool { return s == "blk" || s == "mm" }); index != -1 {
-			if len(parts) > index+2 {
-				// blk and mm should have a number after them, join it
-				parts = append(
-					[]string{strings.Join(parts[:index+2], ".")},
-					parts[index+2:]...)
-			}
+		if i := slices.Index(parts, "blk"); i > 0 {
+			parts = append([]string{
+				strings.Join(parts[:i], "."),
+				strings.Join(parts[i:i+2], "."),
+			}, parts[i+2:]...)
+		} else if i == 0 {
+			parts = append([]string{
+				strings.Join(parts[i:i+2], "."),
+			}, parts[i+2:]...)
 		}

 		if _, ok := layers[parts[0]]; !ok {
@@ -384,22 +377,22 @@ func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
 	}, offset, nil
 }

-func (f GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partialOffload, fullOffload uint64) {
-	embedding := f.KV().EmbeddingLength()
-	heads := f.KV().HeadCount()
-	headsKV := f.KV().HeadCountKV()
-	vocab := uint64(f.KV()["tokenizer.ggml.tokens"].(*array).size)
+func (llm GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partialOffload, fullOffload uint64) {
+	embedding := llm.KV().EmbeddingLength()
+	heads := llm.KV().HeadCount()
+	headsKV := llm.KV().HeadCountKV()
+	vocab := uint64(llm.KV()["tokenizer.ggml.tokens"].(*array).size)

-	embeddingHeads := f.KV().EmbeddingHeadCount()
-	embeddingHeadsK := f.KV().EmbeddingHeadCountK()
-	embeddingHeadsV := f.KV().EmbeddingHeadCountV()
+	embeddingHeads := llm.KV().EmbeddingHeadCount()
+	embeddingHeadsK := llm.KV().EmbeddingHeadCountK()
+	embeddingHeadsV := llm.KV().EmbeddingHeadCountV()

-	layers := f.Tensors().GroupLayers()
+	layers := llm.Tensors().Layers()

 	bytesPerElement := kvCacheBytesPerElement(kvCacheType)
-	kv = uint64(float64(context*f.KV().BlockCount()*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
+	kv = uint64(float64(context*llm.KV().BlockCount()*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)

-	switch f.KV().Architecture() {
+	switch llm.KV().Architecture() {
 	case "llama":
 		fullOffload = max(
 			4*batch*(1+4*embedding+context*(1+heads)),
@@ -414,7 +407,7 @@ func (f GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partialO

 		if ffnGateExpsWeight, ok := layers["blk.0"]["ffn_gate_exps.weight"]; ok {
 			// mixtral 8x22b
-			ff := uint64(f.KV()["llama.feed_forward_length"].(uint32))
+			ff := uint64(llm.KV()["llama.feed_forward_length"].(uint32))
 			partialOffload = max(
 				3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV),
 				4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch),
@@ -431,11 +424,11 @@ func (f GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partialO
 	case "mllama":
 		var visionTokens, tiles uint64 = 1601, 4

-		if crossAttentionLayers, ok := f.KV()["mllama.attention.cross_attention_layers"].(*array); ok {
+		if crossAttentionLayers, ok := llm.KV()["mllama.attention.cross_attention_layers"].(*array); ok {
 			kv = headsKV *
 				(embeddingHeadsK + embeddingHeadsV) * // one for K, one for V
 				(2* // sizeof(float16)
-					(f.KV().BlockCount()-uint64(crossAttentionLayers.size))* // num non-cross attention layers
+					(llm.KV().BlockCount()-uint64(crossAttentionLayers.size))* // num non-cross attention layers
 					context +
 					4* // sizeof(float32)
 						uint64(crossAttentionLayers.size)* // num cross attention layers
@@ -450,7 +443,7 @@ func (f GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partialO
 		)

 		var ropeFreqsCount uint64
-		if ropeFreqs, ok := f.Tensors().GroupLayers()["rope_freqs"]; ok {
+		if ropeFreqs, ok := llm.Tensors().Layers()["rope_freqs"]; ok {
 			if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok {
 				ropeFreqsCount = ropeFreqsWeights.parameters()
 			}
@@ -554,20 +547,20 @@ func (f GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partialO
 }

 // SupportsKVCacheType checks if the requested cache type is supported
-func (f GGML) SupportsKVCacheType(cacheType string) bool {
+func (llm GGML) SupportsKVCacheType(cacheType string) bool {
 	return slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType)
 }

 // SupportsFlashAttention checks if the model supports flash attention
-func (f GGML) SupportsFlashAttention() bool {
-	_, isEmbedding := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]
+func (llm GGML) SupportsFlashAttention() bool {
+	_, isEmbedding := llm.KV()[fmt.Sprintf("%s.pooling_type", llm.KV().Architecture())]
 	if isEmbedding {
 		return false
 	}

 	// Check head counts match and are non-zero
-	headCountK := f.KV().EmbeddingHeadCountK()
-	headCountV := f.KV().EmbeddingHeadCountV()
+	headCountK := llm.KV().EmbeddingHeadCountK()
+	headCountV := llm.KV().EmbeddingHeadCountV()
 	return headCountK != 0 && headCountV != 0 && headCountK == headCountV
 }

--- a/fs/ggml/ggml_test.go
+++ b/fs/ggml/ggml_test.go
@@ -1,159 +0,0 @@
-package ggml
-
-import (
-	"maps"
-	"slices"
-	"strings"
-	"testing"
-
-	"github.com/google/go-cmp/cmp"
-)
-
-func TestTensorLayers(t *testing.T) {
-	tensors := make(map[string]*Tensor)
-	for _, name := range []string{
-		"token_embd.weight",
-		"blk.0.attn_k.weight",
-		"blk.0.attn_output.weight",
-		"blk.0.attn_q.weight",
-		"blk.0.attn_v.weight",
-		"blk.0.attn_norm.weight",
-		"blk.0.ffn_down.weight",
-		"blk.0.ffn_gate.weight",
-		"blk.0.ffn_up.weight",
-		"blk.0.ffn_norm.weight",
-		"output_norm.weight",
-		"mm.0.bias",
-		"mm.0.weight",
-		"v.blk.0.attn_k.weight",
-		"v.blk.0.attn_output.weight",
-		"v.blk.0.attn_q.weight",
-		"v.blk.0.attn_v.weight",
-		"v.blk.0.attn_norm.weight",
-		"v.blk.0.ffn_down.weight",
-		"v.blk.0.ffn_gate.weight",
-		"v.blk.0.ffn_up.weight",
-		"v.blk.0.ffn_norm.weight",
-		"v.patch_embd.weight",
-		"v.position_embd.gate",
-		"v.position_embd.weight",
-	} {
-		tensors[name] = &Tensor{Name: name}
-	}
-
-	cases := []struct {
-		name  string
-		items []*Tensor
-		want  map[string]Layer
-	}{
-		{
-			name: "text",
-			items: slices.Collect(func(yield func(*Tensor) bool) {
-				for k, v := range tensors {
-					if !strings.HasPrefix(k, "mm.") && !strings.HasPrefix(k, "v.") {
-						if !yield(v) {
-							return
-						}
-					}
-				}
-			}),
-			want: map[string]Layer{
-				"blk.0": {
-					"attn_k.weight":      tensors["blk.0.attn_k.weight"],
-					"attn_q.weight":      tensors["blk.0.attn_q.weight"],
-					"attn_v.weight":      tensors["blk.0.attn_v.weight"],
-					"attn_output.weight": tensors["blk.0.attn_output.weight"],
-					"attn_norm.weight":   tensors["blk.0.attn_norm.weight"],
-					"ffn_down.weight":    tensors["blk.0.ffn_down.weight"],
-					"ffn_gate.weight":    tensors["blk.0.ffn_gate.weight"],
-					"ffn_up.weight":      tensors["blk.0.ffn_up.weight"],
-					"ffn_norm.weight":    tensors["blk.0.ffn_norm.weight"],
-				},
-				"token_embd":  {"weight": tensors["token_embd.weight"]},
-				"output_norm": {"weight": tensors["output_norm.weight"]},
-			},
-		},
-		{
-			name: "vision",
-			items: slices.Collect(func(yield func(*Tensor) bool) {
-				for k, v := range tensors {
-					if strings.HasPrefix(k, "mm.") || strings.HasPrefix(k, "v.") {
-						if !yield(v) {
-							return
-						}
-					}
-				}
-			}),
-			want: map[string]Layer{
-				"mm.0": {
-					"bias":   tensors["mm.0.bias"],
-					"weight": tensors["mm.0.weight"],
-				},
-				"v.blk.0": {
-					"attn_k.weight":      tensors["v.blk.0.attn_k.weight"],
-					"attn_q.weight":      tensors["v.blk.0.attn_q.weight"],
-					"attn_v.weight":      tensors["v.blk.0.attn_v.weight"],
-					"attn_output.weight": tensors["v.blk.0.attn_output.weight"],
-					"attn_norm.weight":   tensors["v.blk.0.attn_norm.weight"],
-					"ffn_down.weight":    tensors["v.blk.0.ffn_down.weight"],
-					"ffn_gate.weight":    tensors["v.blk.0.ffn_gate.weight"],
-					"ffn_up.weight":      tensors["v.blk.0.ffn_up.weight"],
-					"ffn_norm.weight":    tensors["v.blk.0.ffn_norm.weight"],
-				},
-				"v": {
-					"patch_embd.weight":    tensors["v.patch_embd.weight"],
-					"position_embd.gate":   tensors["v.position_embd.gate"],
-					"position_embd.weight": tensors["v.position_embd.weight"],
-				},
-			},
-		},
-		{
-			name:  "vision and text",
-			items: slices.Collect(maps.Values(tensors)),
-			want: map[string]Layer{
-				"blk.0": {
-					"attn_k.weight":      tensors["blk.0.attn_k.weight"],
-					"attn_q.weight":      tensors["blk.0.attn_q.weight"],
-					"attn_v.weight":      tensors["blk.0.attn_v.weight"],
-					"attn_output.weight": tensors["blk.0.attn_output.weight"],
-					"attn_norm.weight":   tensors["blk.0.attn_norm.weight"],
-					"ffn_down.weight":    tensors["blk.0.ffn_down.weight"],
-					"ffn_gate.weight":    tensors["blk.0.ffn_gate.weight"],
-					"ffn_up.weight":      tensors["blk.0.ffn_up.weight"],
-					"ffn_norm.weight":    tensors["blk.0.ffn_norm.weight"],
-				},
-				"token_embd":  {"weight": tensors["token_embd.weight"]},
-				"output_norm": {"weight": tensors["output_norm.weight"]},
-				"mm.0": {
-					"bias":   tensors["mm.0.bias"],
-					"weight": tensors["mm.0.weight"],
-				},
-				"v.blk.0": {
-					"attn_k.weight":      tensors["v.blk.0.attn_k.weight"],
-					"attn_q.weight":      tensors["v.blk.0.attn_q.weight"],
-					"attn_v.weight":      tensors["v.blk.0.attn_v.weight"],
-					"attn_output.weight": tensors["v.blk.0.attn_output.weight"],
-					"attn_norm.weight":   tensors["v.blk.0.attn_norm.weight"],
-					"ffn_down.weight":    tensors["v.blk.0.ffn_down.weight"],
-					"ffn_gate.weight":    tensors["v.blk.0.ffn_gate.weight"],
-					"ffn_up.weight":      tensors["v.blk.0.ffn_up.weight"],
-					"ffn_norm.weight":    tensors["v.blk.0.ffn_norm.weight"],
-				},
-				"v": {
-					"patch_embd.weight":    tensors["v.patch_embd.weight"],
-					"position_embd.gate":   tensors["v.position_embd.gate"],
-					"position_embd.weight": tensors["v.position_embd.weight"],
-				},
-			},
-		},
-	}
-
-	for _, tt := range cases {
-		t.Run(tt.name, func(t *testing.T) {
-			got := Tensors{items: tt.items}.GroupLayers()
-			if diff := cmp.Diff(got, tt.want); diff != "" {
-				t.Errorf("unexpected layers (-got +want):\n%s", diff)
-			}
-		})
-	}
-}
--- a/fs/ggml/type.go
+++ b/fs/ggml/type.go
@@ -32,10 +32,9 @@ const (
 	fileTypeIQ1_S
 	fileTypeIQ4_NL
 	fileTypeIQ3_S
-	fileTypeIQ3_M
 	fileTypeIQ2_S
-	fileTypeIQ2_M
 	fileTypeIQ4_XS
+	fileTypeIQ2_M
 	fileTypeIQ1_M
 	fileTypeBF16

@@ -94,14 +93,12 @@ func ParseFileType(s string) (fileType, error) {
 		return fileTypeIQ4_NL, nil
 	case "IQ3_S":
 		return fileTypeIQ3_S, nil
-	case "IQ3_M":
-		return fileTypeIQ3_M, nil
 	case "IQ2_S":
 		return fileTypeIQ2_S, nil
-	case "IQ2_M":
-		return fileTypeIQ2_M, nil
 	case "IQ4_XS":
 		return fileTypeIQ4_XS, nil
+	case "IQ2_M":
+		return fileTypeIQ2_M, nil
 	case "IQ1_M":
 		return fileTypeIQ1_M, nil
 	case "BF16":
@@ -163,8 +160,6 @@ func (t fileType) String() string {
 		return "IQ4_NL"
 	case fileTypeIQ3_S:
 		return "IQ3_S"
-	case fileTypeIQ3_M:
-		return "IQ3_M"
 	case fileTypeIQ2_S:
 		return "IQ2_S"
 	case fileTypeIQ4_XS:
--- a/go.mod
+++ b/go.mod
@@ -18,7 +18,6 @@ require (
 	github.com/agnivade/levenshtein v1.1.1
 	github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1
 	github.com/dlclark/regexp2 v1.11.4
-	github.com/emirpasic/gods v1.18.1
 	github.com/emirpasic/gods/v2 v2.0.0-alpha
 	github.com/google/go-cmp v0.6.0
 	github.com/mattn/go-runewidth v0.0.14
--- a/go.sum
+++ b/go.sum
@@ -44,8 +44,6 @@ github.com/dgryski/trifles v0.0.0-20200323201526-dd97f9abfb48 h1:fRzb/w+pyskVMQ+
 github.com/dgryski/trifles v0.0.0-20200323201526-dd97f9abfb48/go.mod h1:if7Fbed8SFyPtHLHbg49SI7NAdJiC5WIA09pe59rfAA=
 github.com/dlclark/regexp2 v1.11.4 h1:rPYF9/LECdNymJufQKmri9gV604RvvABwgOA8un7yAo=
 github.com/dlclark/regexp2 v1.11.4/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8=
-github.com/emirpasic/gods v1.18.1 h1:FXtiHYKDGKCW2KzwZKx0iC0PQmdlorYgdFG9jPXJ1Bc=
-github.com/emirpasic/gods v1.18.1/go.mod h1:8tpGGwCnJ5H4r6BWwaV6OrWmMoPhUl5jm/FMNAnJvWQ=
 github.com/emirpasic/gods/v2 v2.0.0-alpha h1:dwFlh8pBg1VMOXWGipNMRt8v96dKAIvBehtCt6OtunU=
 github.com/emirpasic/gods/v2 v2.0.0-alpha/go.mod h1:W0y4M2dtBB9U5z3YlghmpuUhiaZT2h6yoeE+C1sCp6A=
 github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
--- a/kvcache/cache.go
+++ b/kvcache/cache.go
@@ -1,54 +0,0 @@
-package kvcache
-
-import (
-	"errors"
-
-	"github.com/ollama/ollama/ml"
-)
-
-var (
-	ErrKvCacheFull  = errors.New("could not find a kv cache slot")
-	ErrNotSupported = errors.New("model does not support operation")
-)
-
-type Cache interface {
-	// ** used by model implementations **
-
-	// SetLayer sets the active layer of the cache
-	SetLayer(layer int)
-
-	// Get returns the history of key and value tensors plus a mask
-	//
-	// The shape of the tensors is documented in the specific
-	// cache implementation used.
-	Get(ctx ml.Context) (ml.Tensor, ml.Tensor, ml.Tensor)
-
-	// Put stores a batch of key and value in the cache
-	//
-	// The shape of the tensors is documented in the specific
-	// cache implementation used.
-	Put(ctx ml.Context, key, value ml.Tensor)
-
-	// ** cache management **
-
-	// Init sets up runtime parameters
-	Init(backend ml.Backend, dtype ml.DType, capacity int32)
-
-	// Close closes the cache and frees resources associated with it
-	Close()
-
-	// StartForward is called before the start of the model's forward pass.
-	// For each token in the coming batch, there must be a corresponding
-	// entry in positions and seqs.
-	StartForward(ctx ml.Context, positions []int32, seqs []int) error
-
-	// CopyPrefix copies tokens in the range [0, len) from srcSeq to dstSeq
-	CopyPrefix(srcSeq, dstSeq int, len int32)
-
-	// Remove deletes tokens in the range [beginIndex, endIndex) from seq. Set
-	// endIndex to math.MaxInt32 to remove everything starting at beginIndex.
-	//
-	// If an error occurs, the entire context for the sequence should be
-	// removed by calling Remove(seq, 0, math.MaxInt32)
-	Remove(seq int, beginIndex, endIndex int32) error
-}
--- a/kvcache/causal.go
+++ b/kvcache/causal.go
@@ -1,455 +0,0 @@
-package kvcache
-
-import (
-	"errors"
-	"fmt"
-	"log/slog"
-	"math"
-	"slices"
-
-	"github.com/ollama/ollama/ml"
-)
-
-type shiftFn func(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error)
-
-// Causal cache stores K and V tensors according to their position in the
-// sequence. Returns the history and a mask for attending to past tokens
-//
-// The tensors are of shape embed dim, kv heads, batch size
-// The mask is of shape history size, batch size
-type Causal struct {
-	DType      ml.DType
-	Capacity   int32
-	windowSize int32
-
-	// ** current forward pass **
-
-	// the active layer for Get and Put
-	curLayer int
-
-	// starting location for data storage for this batch
-	curLoc int
-
-	// size of the current batch
-	curBatchSize int
-
-	// mask of the cache as used by this batch
-	curMask ml.Tensor
-
-	// locations in the cache that are needed for this batch
-	curCellRange cellRange
-
-	// ** cache metadata **
-
-	// for each possible location in the cache, stores the position and set of sequences
-	// that reference the data there
-	cells []cacheCell
-
-	// maps from sequence to the range of locations where it is stored in the cache
-	cellRanges map[int]cellRange
-
-	// ** cache data storage **
-
-	shiftFn      shiftFn
-	backend      ml.Backend
-	cacheCtx     ml.Context
-	keys, values []ml.Tensor
-}
-
-type cacheCell struct {
-	pos       int32
-	sequences []int
-}
-
-type cellRange struct {
-	min int
-	max int
-}
-
-func NewCausalCache(shift shiftFn) *Causal {
-	return &Causal{windowSize: math.MaxInt32, shiftFn: shift}
-}
-
-func NewSWACache(windowSize int32, shift shiftFn) *Causal {
-	return &Causal{windowSize: windowSize, shiftFn: shift}
-}
-
-func (c *Causal) Init(backend ml.Backend, dtype ml.DType, capacity int32) {
-	c.DType = dtype
-	c.Capacity = capacity
-	c.cells = make([]cacheCell, capacity)
-	c.cellRanges = make(map[int]cellRange)
-	c.backend = backend
-	c.cacheCtx = backend.NewContext()
-}
-
-func (c *Causal) Close() {
-	c.cacheCtx.Close()
-}
-
-func (c *Causal) StartForward(ctx ml.Context, positions []int32, seqs []int) error {
-	c.curBatchSize = len(positions)
-
-	var err error
-	c.curLoc, err = c.findStartLoc()
-	if errors.Is(err, ErrKvCacheFull) {
-		c.defrag()
-		c.curLoc, err = c.findStartLoc()
-	}
-	if err != nil {
-		return err
-	}
-
-	c.curCellRange = newRange()
-	for i, pos := range positions {
-		seq := seqs[i]
-
-		c.cells[c.curLoc+i] = cacheCell{pos: pos, sequences: []int{seq}}
-
-		seqRange, ok := c.cellRanges[seq]
-		if !ok {
-			seqRange = newRange()
-		}
-
-		if c.curLoc+i > seqRange.max {
-			seqRange.max = c.curLoc + i
-		}
-		if seqRange.max > c.curCellRange.max {
-			c.curCellRange.max = seqRange.max
-		}
-
-		if c.curLoc+i < seqRange.min {
-			seqRange.min = c.curLoc + i
-		}
-		if seqRange.min < c.curCellRange.min {
-			c.curCellRange.min = seqRange.min
-		}
-		c.cellRanges[seq] = seqRange
-	}
-
-	c.curMask, err = c.buildMask(ctx, positions, seqs)
-
-	return err
-}
-
-func newRange() cellRange {
-	return cellRange{
-		min: math.MaxInt,
-		max: 0,
-	}
-}
-
-// Find the first contiguous block of at least curBatchSize
-func (c *Causal) findStartLoc() (int, error) {
-	var start, count int
-	for i := range c.cells {
-		if len(c.cells[i].sequences) == 0 {
-			count++
-			if count >= c.curBatchSize {
-				return start, nil
-			}
-		} else {
-			start = i + 1
-			count = 0
-		}
-	}
-
-	return 0, fmt.Errorf("%w (length: %v)", ErrKvCacheFull, c.Capacity)
-}
-
-// Builds a mask of history x batch indicating whether for each token in the batch the
-// token in the history should apply. This is based on both the sequence and causality (the
-// position of the history is not ahead of the token in the batch).
-func (c *Causal) buildMask(ctx ml.Context, positions []int32, seqs []int) (ml.Tensor, error) {
-	// TODO(jessegross): This does not do padding, which is required for flash attention
-	len := c.curCellRange.max - c.curCellRange.min + 1
-	mask := make([]float32, c.curBatchSize*len)
-
-	for i := range c.curBatchSize {
-		for j := c.curCellRange.min; j <= c.curCellRange.max; j++ {
-			if !slices.Contains(c.cells[j].sequences, seqs[i]) || c.cells[j].pos > positions[i] ||
-				c.cells[j].pos < positions[i]-c.windowSize {
-				mask[i*len+(j-c.curCellRange.min)] = float32(math.Inf(-1))
-			}
-		}
-	}
-
-	return ctx.FromFloatSlice(mask, len, c.curBatchSize)
-}
-
-func moveCell(ctx ml.Context, objs []ml.Tensor, src, dst, len int) {
-	for _, obj := range objs {
-		if obj == nil {
-			continue
-		}
-
-		srcView := obj.View(ctx, obj.Stride(2)*src, obj.Dim(0)*obj.Dim(1)*len)
-		dstView := obj.View(ctx, obj.Stride(2)*dst, obj.Dim(0)*obj.Dim(1)*len)
-
-		ctx.Forward(srcView.Copy(ctx, dstView))
-	}
-}
-
-func (c *Causal) defrag() {
-	slog.Debug("defragmenting kv cache")
-
-	// Defrag strategy:
-	// - Search for empty holes at the beginning of the cache,
-	//   filling them with active data starting at the end
-	// - If there are contiguous elements that need to be moved,
-	//   combine them into a single operation by holding new moves
-	//   until we see that the next one is non-contiguous
-	// - Fill up the context with the maximum number of operations it
-	//   can hold then compute that and continue with a new context
-	//
-	// We could try to optimize placement by grouping blocks from
-	// the same sequences together but most likely the next forward
-	// pass will disrupt this anyways, so the real world benefit
-	// seems limited as this time.
-
-	ctx := c.backend.NewContext()
-
-	// For every move, 6 tensors are required per layer (2 views and a
-	// copy for each of k and v).
-	layers := 0
-	for _, key := range c.keys {
-		if key == nil {
-			continue
-		}
-		layers++
-	}
-
-	maxMoves := ctx.MaxTensors() / (6 * layers)
-	moves := 0
-
-	var pendingSrc, pendingDst, pendingLen int
-	src := len(c.cells) - 1
-
-	for dst := 0; dst < src; dst++ {
-		if len(c.cells[dst].sequences) == 0 {
-			for ; src > dst; src-- {
-				if len(c.cells[src].sequences) != 0 {
-					c.cells[dst] = c.cells[src]
-					c.cells[src] = cacheCell{}
-
-					if pendingLen > 0 {
-						if src == pendingSrc-pendingLen && dst == pendingDst+pendingLen {
-							pendingSrc = src
-							pendingLen++
-							break
-						} else {
-							moveCell(ctx, c.keys, pendingSrc, pendingDst, pendingLen)
-							moveCell(ctx, c.values, pendingSrc, pendingDst, pendingLen)
-							moves++
-						}
-					}
-
-					pendingSrc = src
-					pendingDst = dst
-					pendingLen = 1
-
-					break
-				}
-			}
-		}
-
-		if moves >= maxMoves {
-			ctx.Compute()
-			ctx.Close()
-			ctx = c.backend.NewContext()
-
-			moves = 0
-		}
-	}
-
-	if pendingLen > 0 {
-		moveCell(ctx, c.keys, pendingSrc, pendingDst, pendingLen)
-		moveCell(ctx, c.values, pendingSrc, pendingDst, pendingLen)
-		moves++
-	}
-
-	if moves > 0 {
-		ctx.Compute()
-	}
-	ctx.Close()
-
-	// Reset range metadata
-	for seq := range c.cellRanges {
-		seqRange := newRange()
-
-		for i, cell := range c.cells {
-			if slices.Contains(cell.sequences, seq) {
-				if i < seqRange.min {
-					seqRange.min = i
-				}
-				if i > seqRange.max {
-					seqRange.max = i
-				}
-			}
-		}
-
-		c.cellRanges[seq] = seqRange
-	}
-}
-
-func (c *Causal) SetLayer(layer int) {
-	if layer >= len(c.keys) {
-		c.keys = append(c.keys, make([]ml.Tensor, layer-len(c.keys)+1)...)
-		c.values = append(c.values, make([]ml.Tensor, layer-len(c.values)+1)...)
-	}
-
-	c.curLayer = layer
-}
-
-func (c *Causal) Get(ctx ml.Context) (ml.Tensor, ml.Tensor, ml.Tensor) {
-	key := c.keys[c.curLayer]
-	value := c.values[c.curLayer]
-
-	key = key.View(ctx, key.Stride(2)*c.curCellRange.min,
-		key.Dim(0), key.Stride(1),
-		key.Dim(1), key.Stride(2),
-		c.curMask.Dim(0),
-	)
-
-	value = value.View(ctx, key.Stride(2)*c.curCellRange.min,
-		value.Dim(0), value.Stride(1),
-		value.Dim(1), value.Stride(2),
-		c.curMask.Dim(0),
-	)
-
-	return key, value, c.curMask
-}
-
-func (c *Causal) Put(ctx ml.Context, key, value ml.Tensor) {
-	if c.curBatchSize != key.Dim(2) {
-		panic(fmt.Errorf("inconsistent batch sizes (layer: %v, batch size: %v layer batch size: %v)", c.curLayer, c.curBatchSize, key.Dim(2)))
-	}
-
-	if c.keys[c.curLayer] == nil || c.values[c.curLayer] == nil {
-		c.keys[c.curLayer] = c.cacheCtx.Zeros(c.DType, key.Dim(0), key.Dim(1), int(c.Capacity))
-		c.values[c.curLayer] = c.cacheCtx.Zeros(c.DType, value.Dim(0), value.Dim(1), int(c.Capacity))
-	}
-
-	ctx.Forward(key.Copy(ctx, c.keys[c.curLayer].View(ctx, c.keys[c.curLayer].Stride(2)*c.curLoc, key.Dim(0)*key.Dim(1)*key.Dim(2))))
-	ctx.Forward(value.Copy(ctx, c.values[c.curLayer].View(ctx, c.values[c.curLayer].Stride(2)*c.curLoc, value.Dim(0)*value.Dim(1)*value.Dim(2))))
-}
-
-func (c *Causal) CopyPrefix(srcSeq, dstSeq int, len int32) {
-	seqRange := newRange()
-
-	for i := range c.cells {
-		// Remove the contents of dstSeq so that we only have the copied prefix, metadata will be reset at the end
-		if slices.Contains(c.cells[i].sequences, dstSeq) {
-			c.cells[i].sequences = slices.DeleteFunc(c.cells[i].sequences, func(s int) bool { return s == dstSeq })
-		}
-
-		if slices.Contains(c.cells[i].sequences, srcSeq) && c.cells[i].pos < len {
-			c.cells[i].sequences = append(c.cells[i].sequences, dstSeq)
-			if i < seqRange.min {
-				seqRange.min = i
-			}
-			if i > seqRange.max {
-				seqRange.max = i
-			}
-		}
-	}
-
-	c.cellRanges[dstSeq] = seqRange
-}
-
-func (c *Causal) shift(seq int, beginIndex, offset int32) error {
-	if c.shiftFn == nil {
-		return ErrNotSupported
-	}
-
-	ctx := c.backend.NewContext()
-	defer ctx.Close()
-
-	seqRange := c.cellRanges[seq]
-	size := seqRange.max - seqRange.min + 1
-
-	offsets := make([]int32, size)
-	for i := range offsets {
-		cell := c.cells[seqRange.min+i]
-
-		if slices.Contains(cell.sequences, seq) && cell.pos >= beginIndex {
-			offsets[i] = offset
-		}
-	}
-
-	kShift, err := ctx.FromIntSlice(offsets, len(offsets))
-	if err != nil {
-		return err
-	}
-
-	for i, key := range c.keys {
-		if key == nil {
-			continue
-		}
-
-		key = key.View(ctx, key.Stride(2)*seqRange.min,
-			key.Dim(0), key.Stride(1),
-			key.Dim(1), key.Stride(2),
-			size,
-		)
-
-		roped, err := c.shiftFn(ctx, i, key, kShift)
-		if err != nil {
-			return err
-		}
-
-		ctx.Forward(roped.Copy(ctx, key))
-	}
-
-	ctx.Compute()
-
-	return nil
-}
-
-func (c *Causal) Remove(seq int, beginIndex, endIndex int32) error {
-	var offset int32
-	if endIndex != math.MaxInt32 {
-		offset = beginIndex - endIndex
-	}
-
-	seqRange := newRange()
-
-	for i := range c.cells {
-		if slices.Contains(c.cells[i].sequences, seq) {
-			if c.cells[i].pos >= beginIndex && c.cells[i].pos < endIndex {
-				c.cells[i].sequences = slices.DeleteFunc(c.cells[i].sequences, func(s int) bool { return s == seq })
-			} else {
-				if c.cells[i].pos >= endIndex {
-					if slices.ContainsFunc(c.cells[i].sequences, func(s int) bool { return s != seq }) {
-						// TODO(jessegross): Need to be careful about data shared between sequences
-						return errors.New("shifting on cells shared by multiple sequences not yet implemented")
-					}
-
-					c.cells[i].pos += offset
-				}
-				if i < seqRange.min {
-					seqRange.min = i
-				}
-				if i > seqRange.max {
-					seqRange.max = i
-				}
-			}
-		}
-	}
-
-	if seqRange == newRange() {
-		delete(c.cellRanges, seq)
-		return nil
-	}
-
-	c.cellRanges[seq] = seqRange
-
-	if endIndex != math.MaxInt32 {
-		err := c.shift(seq, endIndex+offset, offset)
-		if err != nil {
-			return err
-		}
-	}
-
-	return nil
-}
--- a/kvcache/causal_test.go
+++ b/kvcache/causal_test.go
@@ -1,510 +0,0 @@
-package kvcache
-
-import (
-	"math"
-	"slices"
-	"testing"
-
-	"github.com/ollama/ollama/ml"
-)
-
-type testCase struct {
-	name          string
-	in            []float32
-	inShape       []int
-	seqs          []int
-	pos           []int32
-	expected      []float32
-	expectedShape []int
-	expectedMask  []float32
-}
-
-func TestStore(t *testing.T) {
-	backend := &testBackend{}
-	cache := NewCausalCache(nil)
-	defer cache.Close()
-
-	cache.Init(backend, ml.DTypeF16, 16)
-
-	tests := []testCase{
-		{
-			name:          "FirstBatch",
-			in:            []float32{111, 211, 121, 221, 131, 231, 112, 212, 122, 222, 132, 232, 113, 213, 123, 223, 133, 233, 114, 214, 124, 224, 134, 234},
-			inShape:       []int{2, 3, 4},
-			seqs:          []int{0, 0, 0, 0},
-			pos:           []int32{0, 1, 2, 3},
-			expected:      []float32{111, 211, 121, 221, 131, 231, 112, 212, 122, 222, 132, 232, 113, 213, 123, 223, 133, 233, 114, 214, 124, 224, 134, 234},
-			expectedShape: []int{2, 3, 4},
-			expectedMask:  []float32{0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, float32(math.Inf(-1)), 0, 0, 0, 0},
-		},
-		{
-			name:          "SecondBatch",
-			in:            []float32{115, 215, 125, 225, 135, 235},
-			inShape:       []int{2, 3, 1},
-			seqs:          []int{0},
-			pos:           []int32{4},
-			expected:      []float32{111, 211, 121, 221, 131, 231, 112, 212, 122, 222, 132, 232, 113, 213, 123, 223, 133, 233, 114, 214, 124, 224, 134, 234, 115, 215, 125, 225, 135, 235},
-			expectedShape: []int{2, 3, 5},
-			expectedMask:  []float32{0, 0, 0, 0, 0},
-		},
-	}
-
-	testCache(t, backend, cache, tests)
-}
-
-func TestSWA(t *testing.T) {
-	backend := &testBackend{}
-	cache := NewSWACache(1, nil)
-	defer cache.Close()
-
-	cache.Init(backend, ml.DTypeF32, 16)
-
-	tests := []testCase{
-		{
-			name:          "SlidingWindow",
-			in:            []float32{1, 2, 3, 4},
-			inShape:       []int{1, 1, 4},
-			seqs:          []int{0, 0, 0, 0},
-			pos:           []int32{0, 1, 2, 3},
-			expected:      []float32{1, 2, 3, 4},
-			expectedShape: []int{1, 1, 4},
-			expectedMask:  []float32{0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0},
-		},
-	}
-
-	testCache(t, backend, cache, tests)
-}
-
-func TestSequences(t *testing.T) {
-	backend := &testBackend{}
-	cache := NewCausalCache(nil)
-	defer cache.Close()
-
-	cache.Init(backend, ml.DTypeF16, 16)
-
-	tests := []testCase{
-		{
-			name:          "FirstBatch",
-			in:            []float32{1, 2, 3, 4},
-			inShape:       []int{1, 1, 4},
-			seqs:          []int{0, 0, 1, 1},
-			pos:           []int32{0, 1, 0, 1},
-			expected:      []float32{1, 2, 3, 4},
-			expectedShape: []int{1, 1, 4},
-			expectedMask:  []float32{0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0},
-		},
-		{
-			name:          "SecondBatch",
-			in:            []float32{5, 6},
-			inShape:       []int{1, 1, 2},
-			seqs:          []int{0, 1},
-			pos:           []int32{2, 2},
-			expected:      []float32{1, 2, 3, 4, 5, 6},
-			expectedShape: []int{1, 1, 6},
-			expectedMask:  []float32{0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, float32(math.Inf(-1)), 0},
-		},
-	}
-
-	testCache(t, backend, cache, tests)
-}
-
-func TestRemove(t *testing.T) {
-	backend := &testBackend{}
-	cache := NewCausalCache(func(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
-		return key.Add(ctx, shift), nil
-	})
-	defer cache.Close()
-
-	cache.Init(backend, ml.DTypeF16, 16)
-
-	tests := []testCase{
-		{
-			name:          "FirstBatch",
-			in:            []float32{1, 2, 3, 4},
-			inShape:       []int{1, 1, 4},
-			seqs:          []int{0, 0, 1, 1},
-			pos:           []int32{0, 1, 0, 1},
-			expected:      []float32{1, 2, 3, 4},
-			expectedShape: []int{1, 1, 4},
-			expectedMask:  []float32{0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0},
-		},
-	}
-
-	testCache(t, backend, cache, tests)
-
-	err := cache.Remove(0, 1, math.MaxInt32)
-	if err != nil {
-		panic(err)
-	}
-
-	tests = []testCase{
-		{
-			name:          "RemoveEnd",
-			in:            []float32{5, 6},
-			inShape:       []int{1, 1, 2},
-			seqs:          []int{0, 1},
-			pos:           []int32{1, 2},
-			expected:      []float32{1, 2, 3, 4, 5, 6},
-			expectedShape: []int{1, 1, 6},
-			expectedMask:  []float32{0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, float32(math.Inf(-1)), 0},
-		},
-	}
-
-	testCache(t, backend, cache, tests)
-
-	err = cache.Remove(0, 0, 1)
-	if err != nil {
-		panic(err)
-	}
-
-	tests = []testCase{
-		{
-			name:          "RemoveMiddle",
-			in:            []float32{7, 8},
-			inShape:       []int{1, 1, 2},
-			seqs:          []int{0, 0},
-			pos:           []int32{1, 2},
-			expected:      []float32{7, 8, 3, 4, 4},
-			expectedShape: []int{1, 1, 5},
-			expectedMask:  []float32{0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), 0},
-		},
-	}
-
-	testCache(t, backend, cache, tests)
-}
-
-func TestDefrag(t *testing.T) {
-	backend := &testBackend{}
-	cache := NewCausalCache(func(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
-		return key.Add(ctx, shift), nil
-	})
-	defer cache.Close()
-
-	cache.Init(backend, ml.DTypeF16, 16)
-
-	tests := []testCase{
-		{
-			name:          "FirstBatch",
-			in:            []float32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
-			inShape:       []int{1, 1, 16},
-			seqs:          []int{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
-			pos:           []int32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
-			expected:      []float32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
-			expectedShape: []int{1, 1, 16},
-			expectedMask:  []float32{0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, 0, 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, 0, 0, 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 
float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, 0, 0, 0, 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, 0, 0, 0, 0, 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, float32(math.Inf(-1)), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
-		},
-	}
-
-	testCache(t, backend, cache, tests)
-
-	err := cache.Remove(0, 2, 4)
-	if err != nil {
-		panic(err)
-	}
-
-	err = cache.Remove(0, 13, math.MaxInt32)
-	if err != nil {
-		panic(err)
-	}
-
-	tests = []testCase{
-		{
-			name:          "Defrag",
-			in:            []float32{17, 18, 19},
-			inShape:       []int{1, 1, 3},
-			seqs:          []int{0, 0, 0},
-			pos:           []int32{16, 17, 18},
-			expected:      []float32{1, 2, 12, 13, 3, 4, 5, 6, 7, 8, 9, 10, 11, 17, 18, 19},
-			expectedShape: []int{1, 1, 16},
-			expectedMask:  []float32{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, float32(math.Inf(-1)), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
-		},
-	}
-
-	testCache(t, backend, cache, tests)
-}
-
-func TestCopy(t *testing.T) {
-	backend := &testBackend{}
-	cache := NewCausalCache(func(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) { return key, nil })
-	defer cache.Close()
-
-	cache.Init(backend, ml.DTypeF16, 16)
-
-	tests := []testCase{
-		{
-			name:          "FirstBatch",
-			in:            []float32{1, 2, 3, 4},
-			inShape:       []int{1, 1, 4},
-			seqs:          []int{0, 0, 0, 0},
-			pos:           []int32{0, 1, 2, 3},
-			expected:      []float32{1, 2, 3, 4},
-			expectedShape: []int{1, 1, 4},
-			expectedMask:  []float32{0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, float32(math.Inf(-1)), 0, 0, 0, 0},
-		},
-	}
-
-	testCache(t, backend, cache, tests)
-
-	cache.CopyPrefix(0, 1, 2)
-
-	tests = []testCase{
-		{
-			name:          "Copy",
-			in:            []float32{5, 6},
-			inShape:       []int{1, 1, 2},
-			seqs:          []int{1, 1},
-			pos:           []int32{3, 4},
-			expected:      []float32{1, 2, 3, 4, 5, 6},
-			expectedShape: []int{1, 1, 6},
-			expectedMask:  []float32{0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), 0, float32(math.Inf(-1)), 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0},
-		},
-	}
-
-	testCache(t, backend, cache, tests)
-}
-
-func testCache(t *testing.T, backend ml.Backend, cache Cache, tests []testCase) {
-	for _, test := range tests {
-		t.Run(test.name, func(t *testing.T) {
-			context := backend.NewContext()
-			defer context.Close()
-
-			err := cache.StartForward(context, test.pos, test.seqs)
-			if err != nil {
-				panic(err)
-			}
-
-			cache.SetLayer(0)
-			tensor, _ := context.FromFloatSlice(test.in, test.inShape...)
-			cache.Put(context, tensor, tensor)
-
-			out, _, mask := cache.Get(context)
-
-			context.Forward(out)
-			context.Forward(mask)
-			context.Compute(out, mask)
-
-			if !slices.Equal(out.Floats(), test.expected) || !slices.Equal(out.Shape(), test.expectedShape) || !slices.Equal(mask.Floats(), test.expectedMask) {
-				t.Errorf("TestCache: have %v (shape %v); want %v (shape %v); mask: have %v (shape %v) want %v", out.Floats(), out.Shape(), test.expected, test.expectedShape, mask.Floats(), mask.Shape(), test.expectedMask)
-			}
-		})
-	}
-}
-
-type testBackend struct{}
-
-func (b *testBackend) Config() ml.Config {
-	panic("not implemented")
-}
-
-func (b *testBackend) Get(name string) ml.Tensor {
-	panic("not implemented")
-}
-
-func (b *testBackend) NewContext() ml.Context {
-	return &testContext{}
-}
-
-func (b *testBackend) SystemInfo() string {
-	return "not implemented"
-}
-
-type testContext struct{}
-
-func (c *testContext) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
-	total := 0
-
-	if len(shape) > 0 {
-		total = 1
-		for _, s := range shape {
-			total *= s
-		}
-	}
-
-	return &testTensor{dtype: dtype, elementSize: 4, data: make([]float32, total), shape: shape}
-}
-
-func (c *testContext) FromFloatSlice(s []float32, shape ...int) (ml.Tensor, error) {
-	t := c.Zeros(ml.DTypeF32, shape...).(*testTensor)
-
-	copy(t.data, s)
-
-	return t, nil
-}
-
-func (c *testContext) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
-	f := make([]float32, len(s))
-	for i := range f {
-		f[i] = float32(s[i])
-	}
-
-	out, _ := c.FromFloatSlice(f, shape...)
-	out.(*testTensor).dtype = ml.DTypeI32
-
-	return out, nil
-}
-
-func (c *testContext) Forward(ml.Tensor) {}
-
-func (c *testContext) Compute(...ml.Tensor) {}
-
-func (c *testContext) MaxTensors() int {
-	return 10
-}
-
-func (c *testContext) Close() {}
-
-type testTensor struct {
-	dtype       ml.DType
-	elementSize int
-	data        []float32
-	shape       []int
-}
-
-func (t *testTensor) Dim(n int) int {
-	return t.shape[n]
-}
-
-func (t *testTensor) Stride(n int) int {
-	stride := t.elementSize
-	for i := range n {
-		stride *= t.shape[i]
-	}
-
-	return stride
-}
-
-func (t *testTensor) Shape() []int {
-	return t.shape
-}
-
-func (t *testTensor) DType() ml.DType {
-	return t.dtype
-}
-
-func (t *testTensor) Bytes() []byte {
-	panic("not implemented")
-}
-
-func (t *testTensor) Floats() []float32 {
-	out := make([]float32, len(t.data))
-	copy(out, t.data)
-	return out
-}
-
-func (t *testTensor) Add(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
-	out := ctx.Zeros(t.DType(), t.Shape()...).(*testTensor)
-
-	for i := range out.data {
-		out.data[i] = t.data[i] + t2.(*testTensor).data[i]
-	}
-
-	return out
-}
-
-func (t *testTensor) Mul(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
-	panic("not implemented")
-}
-
-func (t *testTensor) Mulmat(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
-	panic("not implemented")
-}
-
-func (t *testTensor) MulmatFullPrec(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
-	panic("not implemented")
-}
-
-func (t *testTensor) Softmax(ctx ml.Context) ml.Tensor {
-	panic("not implemented")
-}
-
-func (t *testTensor) LayerNorm(ctx ml.Context, weight, bias ml.Tensor, eps float32) ml.Tensor {
-	panic("not implemented")
-}
-
-func (t *testTensor) RMSNorm(ctx ml.Context, weight ml.Tensor, eps float32) ml.Tensor {
-	panic("not implemented")
-}
-
-func (t *testTensor) Scale(ctx ml.Context, s float64) ml.Tensor {
-	panic("not implemented")
-}
-
-func (t *testTensor) Conv2D(ctx ml.Context, weight ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor {
-	panic("not implemented")
-}
-
-func (t *testTensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, dim, ropeType uint32, base, scale float32) ml.Tensor {
-	panic("not implemented")
-}
-
-func (t *testTensor) Tanh(ctx ml.Context) ml.Tensor {
-	panic("not implemented")
-}
-
-func (t *testTensor) GELU(ctx ml.Context) ml.Tensor {
-	panic("not implemented")
-}
-
-func (t *testTensor) SILU(ctx ml.Context) ml.Tensor {
-	panic("not implemented")
-}
-
-func (t *testTensor) Reshape(ctx ml.Context, shape ...int) ml.Tensor {
-	panic("not implemented")
-}
-
-func (t *testTensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
-	offset /= t.elementSize
-
-	var s []int
-
-	switch len(shape) {
-	case 1:
-		s = []int{shape[0]}
-	case 5:
-		s = []int{shape[0], shape[2], shape[4]}
-	default:
-		panic("unsupported number of dimensions")
-	}
-
-	context := &testContext{}
-
-	view := context.Zeros(t.dtype, s...).(*testTensor)
-	view.data = t.data[offset : offset+len(view.data)]
-
-	return view
-}
-
-func (t *testTensor) Permute(ctx ml.Context, shape ...int) ml.Tensor {
-	panic("not implemented")
-}
-
-func (t *testTensor) Contiguous(ctx ml.Context) ml.Tensor {
-	panic("not implemented")
-}
-
-func (t *testTensor) Pad(ctx ml.Context, shape ...int) ml.Tensor {
-	panic("not implemented")
-}
-
-func (t *testTensor) Unpad(ctx ml.Context, shape ...int) ml.Tensor {
-	panic("not implemented")
-}
-
-func (t *testTensor) Stack(ctx ml.Context, dim int, s ...ml.Tensor) ml.Tensor {
-	panic("not implemented")
-}
-
-func (t *testTensor) Concat(ctx ml.Context, t2 ml.Tensor, dim int) ml.Tensor {
-	panic("not implemented")
-}
-
-func (t *testTensor) Rows(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
-	panic("not implemented")
-}
-
-func (t *testTensor) Copy(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
-	copy(t2.(*testTensor).data, t.data)
-	return nil
-}
--- a/kvcache/encoder.go
+++ b/kvcache/encoder.go
@@ -1,97 +0,0 @@
-package kvcache
-
-import (
-	"github.com/ollama/ollama/ml"
-)
-
-// Encoder cache stores K and V tensors that are position independent
-//
-// The tensors can be of any shape and will be returned as they were stored
-// The mask is currently always nil
-//
-// Not currently safe for multiple sequences
-type EncoderCache struct {
-	// ** current forward pass **
-
-	// the active layer for Get and Put
-	curLayer int
-
-	// if something is stored during this pass, this
-	// will be the position (but there is no guarantee
-	// anything will be stored)
-	curPos int32
-
-	// ** cache metadata **
-
-	// was something stored in the cache?
-	encoderCached bool
-
-	// position of the cached data
-	encoderPos int32
-
-	// ** cache data storage **
-
-	cacheCtx     ml.Context
-	keys, values []ml.Tensor
-}
-
-func NewEncoderCache() *EncoderCache {
-	return &EncoderCache{}
-}
-
-func (c *EncoderCache) Init(backend ml.Backend, dtype ml.DType, capacity int32) {
-	c.cacheCtx = backend.NewContext()
-}
-
-func (c *EncoderCache) Close() {
-	c.cacheCtx.Close()
-}
-
-func (c *EncoderCache) StartForward(ctx ml.Context, positions []int32, seqs []int) error {
-	// The image is always in the first position
-	c.curPos = positions[0]
-
-	return nil
-}
-
-func (c *EncoderCache) SetLayer(layer int) {
-	if layer >= len(c.keys) {
-		c.keys = append(c.keys, make([]ml.Tensor, layer-len(c.keys)+1)...)
-		c.values = append(c.values, make([]ml.Tensor, layer-len(c.values)+1)...)
-	}
-
-	c.curLayer = layer
-}
-
-func (c *EncoderCache) EncoderCached() bool {
-	return c.encoderCached
-}
-
-func (c *EncoderCache) Get(ctx ml.Context) (ml.Tensor, ml.Tensor, ml.Tensor) {
-	return c.keys[c.curLayer], c.values[c.curLayer], nil
-}
-
-func (c *EncoderCache) Put(ctx ml.Context, key, value ml.Tensor) {
-	c.encoderPos = c.curPos
-	c.encoderCached = true
-
-	if c.keys[c.curLayer] == nil || c.values[c.curLayer] == nil {
-		c.keys[c.curLayer] = c.cacheCtx.Zeros(key.DType(), key.Shape()...)
-		c.values[c.curLayer] = c.cacheCtx.Zeros(value.DType(), value.Shape()...)
-	}
-
-	ctx.Forward(key.Copy(ctx, c.keys[c.curLayer]))
-	ctx.Forward(value.Copy(ctx, c.values[c.curLayer]))
-}
-
-func (c *EncoderCache) CopyPrefix(srcSeq, dstSeq int, len int32) {
-	panic("encoder cache does not support multiple sequences")
-}
-
-func (c *EncoderCache) Remove(seq int, beginIndex, endIndex int32) error {
-	if c.encoderPos >= beginIndex && c.encoderPos < endIndex {
-		c.encoderCached = false
-	}
-
-	return nil
-}
--- a/kvcache/wrapper.go
+++ b/kvcache/wrapper.go
@@ -1,93 +0,0 @@
-package kvcache
-
-import (
-	"math"
-
-	"github.com/ollama/ollama/ml"
-)
-
-// Wrapper cache is a container for multiple types of caches,
-// such as for the encoding and decoding portions of a model.
-type WrapperCache struct {
-	// caches we are wrapping
-	caches []Cache
-
-	// cache to be used for this layer
-	curType int
-}
-
-func NewWrapperCache(caches ...Cache) *WrapperCache {
-	return &WrapperCache{
-		caches: caches,
-	}
-}
-
-func (c *WrapperCache) Init(backend ml.Backend, dtype ml.DType, capacity int32) {
-	for _, cache := range c.caches {
-		cache.Init(backend, dtype, capacity)
-	}
-}
-
-func (c *WrapperCache) Close() {
-	for _, cache := range c.caches {
-		cache.Close()
-	}
-}
-
-func (c *WrapperCache) StartForward(ctx ml.Context, positions []int32, seqs []int) error {
-	for i, cache := range c.caches {
-		err := cache.StartForward(ctx, positions, seqs)
-		if err != nil {
-			// unwind on error - Remove with endIndex set to math.MaxInt32 does not fail
-			for j := i - 1; j >= 0; j-- {
-				for k := range positions {
-					_ = c.caches[j].Remove(seqs[k], positions[k], math.MaxInt32)
-				}
-			}
-			return err
-		}
-	}
-
-	c.curType = 0
-	return nil
-}
-
-func (c *WrapperCache) SetLayer(layer int) {
-	for _, cache := range c.caches {
-		cache.SetLayer(layer)
-	}
-}
-
-func (c *WrapperCache) SetLayerType(layerType int) {
-	c.curType = layerType
-}
-
-func (c *WrapperCache) UnderlyingCache() Cache {
-	return c.caches[c.curType]
-}
-
-func (c *WrapperCache) Get(ctx ml.Context) (ml.Tensor, ml.Tensor, ml.Tensor) {
-	return c.caches[c.curType].Get(ctx)
-}
-
-func (c *WrapperCache) Put(ctx ml.Context, key, value ml.Tensor) {
-	c.caches[c.curType].Put(ctx, key, value)
-}
-
-func (c *WrapperCache) CopyPrefix(srcSeq, dstSeq int, len int32) {
-	for _, cache := range c.caches {
-		cache.CopyPrefix(srcSeq, dstSeq, len)
-	}
-}
-
-func (c *WrapperCache) Remove(seq int, beginIndex, endIndex int32) error {
-	// If the one of these fails, the caller is supposed to retry with endIndex set to math.MaxInt32, which should not fail
-	for _, cache := range c.caches {
-		err := cache.Remove(seq, beginIndex, endIndex)
-		if err != nil {
-			return err
-		}
-	}
-
-	return nil
-}
--- a/llama/patches/0014-sort-devices-by-score.patch
+++ b/llama/patches/0014-sort-devices-by-score.patch
@@ -8,7 +8,7 @@ Subject: [PATCH] sort devices by score
 1 file changed, 13 insertions(+), 8 deletions(-)

 diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
-index 899d16f2..135f7df0 100644
+index 899d16f2..ac5cda07 100644
 --- a/ggml/src/ggml-backend-reg.cpp
 +++ b/ggml/src/ggml-backend-reg.cpp
@@ -150,7 +150,7 @@ struct ggml_backend_reg_entry {
@@ -29,7 +29,7 @@ index 899d16f2..135f7df0 100644
         if (!reg) {
             return;
         }
-@@ -206,15 +206,20 @@ struct ggml_backend_registry {
+@@ -206,15 +206,15 @@ struct ggml_backend_registry {
 #endif
         backends.push_back({ reg, std::move(handle) });
         for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
@@ -45,15 +45,10 @@ index 899d16f2..135f7df0 100644
 #endif
 -        devices.push_back(device);
 +        devices.push_back({device, score});
-+        std::stable_sort(devices.begin(), devices.end(),
-+            [](const auto & a, const auto & b) {
-+                return a.second > b.second;
-+            }
-+        );
     }
 
     ggml_backend_reg_t load_backend(const std::wstring & path, bool silent) {
-@@ -257,7 +262,7 @@ struct ggml_backend_registry {
+@@ -257,7 +257,7 @@ struct ggml_backend_registry {
 
         GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), utf16_to_utf8(path).c_str());
 
@@ -62,7 +57,7 @@ index 899d16f2..135f7df0 100644
 
         return reg;
     }
-@@ -280,7 +285,7 @@ struct ggml_backend_registry {
+@@ -280,7 +280,7 @@ struct ggml_backend_registry {
         // remove devices
         devices.erase(
             std::remove_if(devices.begin(), devices.end(),
@@ -71,12 +66,17 @@ index 899d16f2..135f7df0 100644
             devices.end());
 
         // remove backend
-@@ -338,7 +343,7 @@ size_t ggml_backend_dev_count() {
+@@ -338,7 +338,12 @@ size_t ggml_backend_dev_count() {
 
 ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
     GGML_ASSERT(index < ggml_backend_dev_count());
 -    return get_reg().devices[index];
-+    return get_reg().devices[index].first;
+    auto devices = get_reg().devices;
+    if (!std::is_heap(devices.begin(), devices.end())) {
+        std::make_heap(devices.begin(), devices.end(), [](const auto & a, const auto & b) { return a.second < b.second; });
+    }
+
+    return devices[index].first;
 }
 
 ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) {
--- a/llama/patches/0016-add-ollama-debug.patch
+++ b/llama/patches/0016-add-ollama-debug.patch
@@ -0,0 +1,33 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Michael Yang <mxyng@pm.me>
+Date: Fri, 7 Feb 2025 16:51:55 -0800
+Subject: [PATCH] add ollama debug
+
+---
+ ggml/src/ggml-cpu/ggml-cpu.c | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
+index b307d554..a23bbe98 100644
+--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
+@@ -13,6 +13,8 @@
+ #include "amx/amx.h"
+ #include "ggml.h"
+ 
+#include "ollama-debug.h"
+
+ #if defined(_MSC_VER) || defined(__MINGW32__)
+ #include <malloc.h> // using malloc.h with MSC/MINGW
+ #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
+@@ -13645,6 +13647,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
+ 
+         ggml_compute_forward(&params, node);
+ 
+#ifdef OLLAMA_DEBUG
+        ollama_debug(node, false);
+#endif
+
+         if (state->ith == 0 && cplan->abort_callback &&
+                 cplan->abort_callback(cplan->abort_callback_data)) {
+             tp->abort = true;
--- a/llama/patches/0017-try-catch-backend-load.patch
+++ b/llama/patches/0017-try-catch-backend-load.patch
@@ -8,7 +8,7 @@ Subject: [PATCH] try/catch backend load
 1 file changed, 23 insertions(+), 22 deletions(-)

 diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
-index 135f7df0..84b21dd8 100644
+index ac5cda07..374c3b21 100644
 --- a/ggml/src/ggml-backend-reg.cpp
 +++ b/ggml/src/ggml-backend-reg.cpp
@@ -512,32 +512,33 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
--- a/llama/runner/README.md
+++ b/llama/runner/README.md
--- a/runner/llamarunner/cache.go
+++ b/runner/llamarunner/cache.go
@@ -1,4 +1,4 @@
-package llamarunner
+package runner

 import (
 	"errors"
--- a/runner/llamarunner/cache_test.go
+++ b/runner/llamarunner/cache_test.go
@@ -1,4 +1,4 @@
-package llamarunner
+package runner

 import (
 	"testing"
--- a/runner/llamarunner/image.go
+++ b/runner/llamarunner/image.go
@@ -1,4 +1,4 @@
-package llamarunner
+package runner

 import (
 	"errors"
--- a/runner/llamarunner/image_test.go
+++ b/runner/llamarunner/image_test.go
@@ -1,4 +1,4 @@
-package llamarunner
+package runner

 import (
 	"reflect"
--- a/runner/llamarunner/runner.go
+++ b/runner/llamarunner/runner.go
@@ -1,4 +1,4 @@
-package llamarunner
+package runner

 import (
 	"context"
@@ -24,7 +24,6 @@ import (

 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/llama"
-	"github.com/ollama/ollama/runner/common"
 )

 // input is an element of the prompt to process, either
@@ -499,12 +498,12 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 		seq.pendingResponses = append(seq.pendingResponses, piece)
 		sequence := strings.Join(seq.pendingResponses, "")

-		if ok, stop := common.FindStop(sequence, seq.stop); ok {
+		if ok, stop := findStop(sequence, seq.stop); ok {
 			slog.Debug("hit stop token", "pending", seq.pendingResponses, "stop", stop)

 			var tokenTruncated bool
 			origLen := len(seq.pendingResponses)
-			seq.pendingResponses, tokenTruncated = common.TruncateStop(seq.pendingResponses, stop)
+			seq.pendingResponses, tokenTruncated = truncateStop(seq.pendingResponses, stop)
 			newLen := len(seq.pendingResponses)

 			// Update the cache based on the tokens that will be returned:
@@ -525,11 +524,11 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 			continue
 		}

-		if common.ContainsStopSuffix(sequence, seq.stop) {
+		if containsStopSuffix(sequence, seq.stop) {
 			continue
 		}

-		if common.IncompleteUnicode(sequence) {
+		if incompleteUnicode(sequence) {
 			continue
 		}

@@ -845,6 +844,8 @@ func (s *Server) loadModel(
 	threads int,
 	multiUserCache bool,
 ) {
+	llama.BackendInit()
+
 	var err error
 	s.model, err = llama.LoadModelFromFile(mpath, params)
 	if err != nil {
@@ -884,6 +885,9 @@ func (s *Server) loadModel(
 }

 func Execute(args []string) error {
+	if args[0] == "runner" {
+		args = args[1:]
+	}
 	fs := flag.NewFlagSet("runner", flag.ExitOnError)
 	mpath := fs.String("model", "", "Path to model binary file")
 	ppath := fs.String("mmproj", "", "Path to projector binary file")
@@ -930,8 +934,6 @@ func Execute(args []string) error {
 	})
 	slog.SetDefault(slog.New(handler))
 	slog.Info("starting go runner")
-
-	llama.BackendInit()
 	slog.Info("system", "info", llama.PrintSystemInfo(), "threads", *threads)

 	server := &Server{
--- a/runner/common/stop.go
+++ b/runner/common/stop.go
@@ -1,10 +1,10 @@
-package common
+package runner

 import (
 	"strings"
 )

-func FindStop(sequence string, stops []string) (bool, string) {
+func findStop(sequence string, stops []string) (bool, string) {
 	for _, stop := range stops {
 		if strings.Contains(sequence, stop) {
 			return true, stop
@@ -14,7 +14,7 @@ func FindStop(sequence string, stops []string) (bool, string) {
 	return false, ""
 }

-func ContainsStopSuffix(sequence string, stops []string) bool {
+func containsStopSuffix(sequence string, stops []string) bool {
 	for _, stop := range stops {
 		for i := 1; i <= len(stop); i++ {
 			if strings.HasSuffix(sequence, stop[:i]) {
@@ -29,7 +29,7 @@ func ContainsStopSuffix(sequence string, stops []string) bool {
 // truncateStop removes the provided stop string from pieces,
 // returning the partial pieces with stop removed, including truncating
 // the last piece if required (and signalling if this was the case)
-func TruncateStop(pieces []string, stop string) ([]string, bool) {
+func truncateStop(pieces []string, stop string) ([]string, bool) {
 	joined := strings.Join(pieces, "")

 	index := strings.Index(joined, stop)
@@ -65,7 +65,7 @@ func TruncateStop(pieces []string, stop string) ([]string, bool) {
 	return result, tokenTruncated
 }

-func IncompleteUnicode(token string) bool {
+func incompleteUnicode(token string) bool {
 	incomplete := false

 	// check if there is incomplete UTF-8 character at the end
--- a/runner/common/stop_test.go
+++ b/runner/common/stop_test.go
@@ -1,4 +1,4 @@
-package common
+package runner

 import (
 	"reflect"
@@ -52,7 +52,7 @@ func TestTruncateStop(t *testing.T) {

 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
-			result, resultTrunc := TruncateStop(tt.pieces, tt.stop)
+			result, resultTrunc := truncateStop(tt.pieces, tt.stop)
 			if !reflect.DeepEqual(result, tt.expected) || resultTrunc != tt.expectedTrunc {
 				t.Errorf("truncateStop(%v, %s): have %v (%v); want %v (%v)", tt.pieces, tt.stop, result, resultTrunc, tt.expected, tt.expectedTrunc)
 			}
@@ -120,7 +120,7 @@ func TestIncompleteUnicode(t *testing.T) {

 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
-			result := IncompleteUnicode(tt.input)
+			result := incompleteUnicode(tt.input)
 			if result != tt.expected {
 				t.Errorf("incompleteUnicode(%s): have %v; want %v", tt.input, result, tt.expected)
 			}
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -116,7 +116,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 		opts.NumCtx = max(opts.NumCtx, 2048)
 	}

-	layers := f.Tensors().GroupLayers()
+	layers := f.Tensors().Layers()
 	// add one layer worth of memory as a buffer
 	if blk0, ok := layers["blk.0"]; ok {
 		layerSize = blk0.Size()
@@ -410,7 +410,7 @@ func projectorMemoryRequirements(filename string) (weights, graphSize uint64) {
 		return 0, 0
 	}

-	for _, layer := range ggml.Tensors().GroupLayers() {
+	for _, layer := range ggml.Tensors().Layers() {
 		weights += layer.Size()
 	}

@@ -431,7 +431,7 @@ func projectorMemoryRequirements(filename string) (weights, graphSize uint64) {
 		headCount := kv("attention.head_count")

 		numPatches := (imageSize / kv("patch_size")) * (imageSize / kv("patch_size"))
-		if _, ok := ggml.Tensors().GroupLayers()["v"]["class_embd"]; ok {
+		if _, ok := ggml.Tensors().Layers()["v"]["class_embd"]; ok {
 			numPatches++
 		}

--- a/llm/server.go
+++ b/llm/server.go
@@ -275,9 +275,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapt
 			port = rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
 		}
 		finalParams := []string{"runner"}
-		if envconfig.NewEngine() {
-			finalParams = append(finalParams, "--ollama-engine")
-		}
 		finalParams = append(finalParams, params...)
 		finalParams = append(finalParams, "--port", strconv.Itoa(port))

@@ -320,8 +317,9 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapt
 			return nil, fmt.Errorf("unable to lookup executable path: %w", err)
 		}

-		if eval, err := filepath.EvalSymlinks(exe); err == nil {
-			exe = eval
+		exe, err = filepath.EvalSymlinks(exe)
+		if err != nil {
+			return nil, fmt.Errorf("unable to evaluate symlinks for executable path: %w", err)
 		}

 		// TODO - once fully switched to the Go runner, load the model here for tokenize/detokenize cgo access
--- a/ml/backend.go
+++ b/ml/backend.go
@@ -17,14 +17,12 @@ type Config interface {

 	Strings(string, ...[]string) []string
 	Uints(string, ...[]uint32) []uint32
-	Floats(string, ...[]float32) []float32
 }

 type Backend interface {
 	Config() Config
 	Get(name string) Tensor
 	NewContext() Context
-	SystemInfo() string
 }

 var backends = make(map[string]func(*os.File) (Backend, error))
@@ -51,16 +49,15 @@ type Context interface {
 	FromIntSlice(s []int32, shape ...int) (Tensor, error)

 	Forward(Tensor)
-	Compute(...Tensor)
-	MaxTensors() int
-	Close()
+	Compute(Tensor) Tensor
+	Close() error
 }

 type Tensor interface {
-	Dim(n int) int
-	Stride(n int) int
+	Dim(n int) int64
+	Stride(n int) int64

-	Shape() []int
+	Shape() []int64
 	DType() DType

 	Bytes() []byte
@@ -69,7 +66,6 @@ type Tensor interface {
 	Add(ctx Context, t2 Tensor) Tensor
 	Mul(ctx Context, t2 Tensor) Tensor
 	Mulmat(ctx Context, t2 Tensor) Tensor
-	MulmatFullPrec(ctx Context, t2 Tensor) Tensor

 	Softmax(ctx Context) Tensor
 	LayerNorm(ctx Context, weight, bias Tensor, eps float32) Tensor
@@ -77,19 +73,19 @@ type Tensor interface {
 	Scale(ctx Context, s float64) Tensor

 	Conv2D(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
-	RoPE(ctx Context, positionIDs, ropeFactors Tensor, dim, ropeType uint32, base, scale float32) Tensor
+	RoPE(ctx Context, positionIDs, ropeFactors Tensor, dim uint32, base, scale float32) Tensor

 	Tanh(ctx Context) Tensor
 	GELU(ctx Context) Tensor
 	SILU(ctx Context) Tensor

-	Reshape(ctx Context, shape ...int) Tensor
+	Reshape(ctx Context, shape ...int64) Tensor
 	View(ctx Context, offset int, shape ...int) Tensor
 	Permute(ctx Context, shape ...int) Tensor
 	Contiguous(ctx Context) Tensor

-	Pad(ctx Context, shape ...int) Tensor
-	Unpad(ctx Context, shape ...int) Tensor
+	Pad(ctx Context, shape ...int64) Tensor
+	Unpad(ctx Context, shape ...int64) Tensor

 	Stack(ctx Context, dim int, s ...Tensor) Tensor
 	Concat(ctx Context, t2 Tensor, dim int) Tensor
@@ -115,13 +111,13 @@ func mul[T number](s ...T) T {

 type DumpOptions struct {
 	// Items is the number of elements to print at the beginning and end of each dimension.
-	Items int
+	Items int64

 	// Precision is the number of decimal places to print. Applies to float32 and float64.
 	Precision int
 }

-func Dump(ctx Context, t Tensor, opts ...DumpOptions) string {
+func Dump(t Tensor, opts ...DumpOptions) string {
 	if len(opts) < 1 {
 		opts = append(opts, DumpOptions{
 			Items:     3,
@@ -131,17 +127,11 @@ func Dump(ctx Context, t Tensor, opts ...DumpOptions) string {

 	switch t.DType() {
 	case DTypeF32:
-		return dump[[]float32](ctx, t, opts[0].Items, func(f float32) string {
-			return strconv.FormatFloat(float64(f), 'f', opts[0].Precision, 32)
-		})
-	case DTypeF16:
-		f32 := ctx.Zeros(DTypeF32, t.Shape()...)
-		f32 = t.Copy(ctx, f32)
-		return dump[[]float32](ctx, f32, opts[0].Items, func(f float32) string {
+		return dump[[]float32](t, opts[0].Items, func(f float32) string {
 			return strconv.FormatFloat(float64(f), 'f', opts[0].Precision, 32)
 		})
 	case DTypeI32:
-		return dump[[]int32](ctx, t, opts[0].Items, func(i int32) string {
+		return dump[[]int32](t, opts[0].Items, func(i int32) string {
 			return strconv.FormatInt(int64(i), 10)
 		})
 	default:
@@ -149,10 +139,10 @@ func Dump(ctx Context, t Tensor, opts ...DumpOptions) string {
 	}
 }

-func dump[S ~[]E, E number](ctx Context, t Tensor, items int, fn func(E) string) string {
-	if t.Bytes() == nil {
-		ctx.Forward(t)
-		ctx.Compute(t)
+func dump[S ~[]E, E number](t Tensor, items int64, fn func(E) string) string {
+	bts := t.Bytes()
+	if bts == nil {
+		return "<nil>"
 	}

 	s := make(S, mul(t.Shape()...))
@@ -163,12 +153,12 @@ func dump[S ~[]E, E number](ctx Context, t Tensor, items int, fn func(E) string)
 	shape := t.Shape()

 	var sb strings.Builder
-	var f func([]int, int)
-	f = func(dims []int, stride int) {
+	var f func([]int64, int64)
+	f = func(dims []int64, stride int64) {
 		prefix := strings.Repeat(" ", len(shape)-len(dims)+1)
 		fmt.Fprint(&sb, "[")
 		defer func() { fmt.Fprint(&sb, "]") }()
-		for i := 0; i < dims[0]; i++ {
+		for i := int64(0); i < dims[0]; i++ {
 			if i >= items && i < dims[0]-items {
 				fmt.Fprint(&sb, "..., ")
 				// skip to next printable element
@@ -200,8 +190,7 @@ func dump[S ~[]E, E number](ctx Context, t Tensor, items int, fn func(E) string)
 type DType int

 const (
-	DTypeOther DType = iota
-	DTypeF32
-	DTypeF16
+	DTypeF32 DType = iota
 	DTypeI32
+	DTypeOther
 )
--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@@ -1,30 +1,16 @@
 package ggml

-/*
-#cgo CPPFLAGS: -I${SRCDIR}/ggml/include
-#include <stdlib.h>
-#include <stdint.h>
-#include "ggml.h"
-#include "ggml-cpu.h"
-#include "ggml-backend.h"
-static struct ggml_backend_feature * getBackendFeatures(void *fp, ggml_backend_reg_t reg) {return ((ggml_backend_get_features_t)(fp))(reg);}
-static struct ggml_backend_feature * getNextBackendFeatures(struct ggml_backend_feature * feature) { return &feature[1];}
-
-typedef enum {COMP_UNKNOWN,COMP_GCC,COMP_CLANG} COMPILER;
-COMPILER inline get_compiler() {
-#if defined(__clang__)
-	return COMP_CLANG;
-#elif defined(__GNUC__)
-	return COMP_GCC;
-#else
-	return UNKNOWN_COMPILER;
-#endif
-}
-
-*/
+// #cgo CPPFLAGS: -I${SRCDIR}/ggml/include
+// #include <stdlib.h>
+// #include <stdint.h>
+// #include "ggml.h"
+// #include "ggml-cpu.h"
+// #include "ggml-backend.h"
 import "C"

 import (
+	"bytes"
+	"encoding/binary"
 	"fmt"
 	"io"
 	"log/slog"
@@ -113,15 +99,17 @@ func New(r *os.File) (ml.Backend, error) {
 				}),
 				backend: C.ggml_backend_dev_init(d.d, nil),
 			})
-		case C.GGML_BACKEND_DEVICE_TYPE_GPU:
-			slog.Info("gpu", "device", d)
-			gpus = append(gpus, Context{
-				ctx: C.ggml_init(C.struct_ggml_init_params{
-					mem_size: C.size_t(int(C.ggml_tensor_overhead()) * (len(meta.Tensors().Items()) + 1 + int(meta.KV().BlockCount())*2)),
-					no_alloc: true,
-				}),
-				backend: C.ggml_backend_dev_init(d.d, nil),
-			})
+
+			C.ggml_backend_cpu_set_n_threads(cpus[len(cpus)-1].backend, C.int(1))
+		// case C.GGML_BACKEND_DEVICE_TYPE_GPU:
+		// 	slog.Info("gpu", "device", d)
+		// 	gpus = append(gpus, Context{
+		// 		ctx: C.ggml_init(C.struct_ggml_init_params{
+		// 			mem_size: C.size_t(int(C.ggml_tensor_overhead()) * (len(meta.Tensors().Items()) + 1 + int(meta.KV().BlockCount())*2)),
+		// 			no_alloc: true,
+		// 		}),
+		// 		backend: C.ggml_backend_dev_init(d.d, nil),
+		// 	})
 		}
 	}

@@ -212,9 +200,10 @@ func (b *Backend) Get(name string) ml.Tensor {

 func (b *Backend) NewContext() ml.Context {
 	nodes := max(8192, len(b.meta.Tensors().Items())*5)
+	bts := make([]byte, C.size_t(nodes)*C.ggml_tensor_overhead()+C.ggml_graph_overhead_custom(C.size_t(nodes), false))
 	c := C.ggml_init(C.struct_ggml_init_params{
-		mem_buffer: nil,
-		mem_size:   C.size_t(nodes)*C.ggml_tensor_overhead() + C.ggml_graph_overhead_custom(C.size_t(nodes), false),
+		mem_buffer: unsafe.Pointer(&bts[0]),
+		mem_size:   C.size_t(len(bts)),
 		no_alloc:   true,
 	})

@@ -256,35 +245,15 @@ func (c *Context) Forward(t ml.Tensor) {
 	C.ggml_build_forward_expand(c.graph, t.(*Tensor).t)
 }

-func (c *Context) Compute(tensors ...ml.Tensor) {
+// Compute adds t to the graph, runs the graph on the scheduler and copies
+// the resulting data of t into t's byte buffer so it can be read through
+// Bytes and Floats. It returns t.
+func (c *Context) Compute(t ml.Tensor) ml.Tensor {
+	c.Forward(t)
 	C.ggml_backend_sched_graph_compute_async(c.sched, c.graph)

-	needSync := true
-	sync := func() {
-		if needSync {
-			C.ggml_backend_sched_synchronize(c.sched)
-			needSync = false
-		}
-	}
+	// The graph was launched asynchronously; wait for it to finish before
+	// reading the result, otherwise the copy below can observe stale data.
+	C.ggml_backend_sched_synchronize(c.sched)

-	for _, t := range tensors {
-		if C.ggml_nbytes(t.(*Tensor).t) > 0 {
-			t.(*Tensor).sync = sync
-		}
-	}
-}
-
-func (c *Context) MaxTensors() int {
-	return c.nodes
-}
-
-func shapeToGGML(shape []int) *C.int64_t {
-	sh := make([]C.int64_t, len(shape))
-	for i, s := range shape {
-		sh[i] = (C.int64_t)(s)
-	}
-
-	return &sh[0]
+	tt := t.(*Tensor)
+	// Skip the copy for empty tensors: taking &bytes[0] of a zero-length
+	// slice would panic.
+	if n := C.ggml_nbytes(tt.t); n > 0 {
+		tt.bytes = make([]byte, n)
+		C.ggml_backend_tensor_get(tt.t, unsafe.Pointer(&tt.bytes[0]), 0, n)
+	}
+	return t
 }

 func (c Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
@@ -301,11 +270,9 @@ func (c Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
 	var t *C.struct_ggml_tensor
 	switch dtype {
 	case ml.DTypeF32:
-		t = C.ggml_new_tensor(c.ctx, C.GGML_TYPE_F32, C.int(len(shape)), shapeToGGML(shape))
-	case ml.DTypeF16:
-		t = C.ggml_new_tensor(c.ctx, C.GGML_TYPE_F16, C.int(len(shape)), shapeToGGML(shape))
+		t = C.ggml_new_tensor(c.ctx, C.GGML_TYPE_F32, C.int(len(shape)), (*C.int64_t)(unsafe.Pointer(&shape[0])))
 	case ml.DTypeI32:
-		t = C.ggml_new_tensor(c.ctx, C.GGML_TYPE_I32, C.int(len(shape)), shapeToGGML(shape))
+		t = C.ggml_new_tensor(c.ctx, C.GGML_TYPE_I32, C.int(len(shape)), (*C.int64_t)(unsafe.Pointer(&shape[0])))
 	default:
 		panic("unsupported dtype")
 	}
@@ -318,13 +285,6 @@ func (c Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor {

 func fromSlice[S ~[]E, E float32 | int32](ctx Context, s S, shape []int, dtype uint32) (ml.Tensor, error) {
 	n := len(s)
-
-	if n == 0 {
-		var shape C.int64_t = 0
-		t := C.ggml_new_tensor(ctx.ctx, dtype, 1, &shape)
-		return &Tensor{t: t}, nil
-	}
-
 	for _, v := range shape {
 		n /= v
 	}
@@ -333,7 +293,7 @@ func fromSlice[S ~[]E, E float32 | int32](ctx Context, s S, shape []int, dtype u
 		return nil, fmt.Errorf("invalid shape %v for %d elements", shape, len(s))
 	}

-	t := C.ggml_new_tensor(ctx.ctx, dtype, C.int(len(shape)), shapeToGGML(shape))
+	t := C.ggml_new_tensor(ctx.ctx, dtype, C.int(len(shape)), (*C.int64_t)(unsafe.Pointer(&shape[0])))
 	b := C.ggml_backend_alloc_buffer(ctx.backend, C.ggml_nbytes(t))
 	C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
 	C.ggml_backend_tensor_set(t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t))
@@ -348,16 +308,15 @@ func (c Context) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
 	return fromSlice(c, s, shape, C.GGML_TYPE_I32)
 }

-func (c *Context) Close() {
-	if c != nil {
-		C.ggml_backend_sched_free(c.sched)
-		C.ggml_free(c.ctx)
-	}
+func (c *Context) Close() error {
+	C.ggml_backend_sched_free(c.sched)
+	C.ggml_free(c.ctx)
+	return nil
 }

 type Tensor struct {
-	t    *C.struct_ggml_tensor
-	sync func()
+	t     *C.struct_ggml_tensor
+	bytes []byte
 }

 func (t *Tensor) LogValue() slog.Value {
@@ -368,16 +327,16 @@ func (t *Tensor) LogValue() slog.Value {
 	)
 }

-func (t *Tensor) Dim(n int) int {
-	return int(t.t.ne[n])
+func (t *Tensor) Dim(n int) int64 {
+	return int64(t.t.ne[n])
 }

-func (t *Tensor) Stride(n int) int {
-	return int(t.t.nb[n])
+func (t *Tensor) Stride(n int) int64 {
+	return int64(t.t.nb[n])
 }

-func (t *Tensor) Shape() []int {
-	shape := make([]int, C.ggml_n_dims(t.t))
+func (t *Tensor) Shape() []int64 {
+	shape := make([]int64, C.ggml_n_dims(t.t))
 	for i := range shape {
 		shape[i] = t.Dim(i)
 	}
@@ -385,23 +344,19 @@ func (t *Tensor) Shape() []int {
 	return shape
 }

-func (t *Tensor) Bytes() (data []byte) {
-	if t.sync != nil {
-		data = make([]byte, C.ggml_nbytes(t.t))
-
-		t.sync()
-		C.ggml_backend_tensor_get(t.t, unsafe.Pointer(&data[0]), 0, C.ggml_nbytes(t.t))
+func (t *Tensor) Bytes() []byte {
+	if t.bytes == nil {
+		cbytes := C.ggml_get_data(t.t)
+		t.bytes = C.GoBytes(unsafe.Pointer(cbytes), C.int(C.ggml_nbytes(t.t)))
 	}

-	return
+	return t.bytes
 }

-func (t *Tensor) Floats() (data []float32) {
-	if t.sync != nil {
-		data = make([]float32, C.ggml_nelements(t.t))
-
-		t.sync()
-		C.ggml_backend_tensor_get(t.t, unsafe.Pointer(&data[0]), 0, C.ggml_nbytes(t.t))
+func (t *Tensor) Floats() (f32s []float32) {
+	if t.bytes != nil {
+		f32s = make([]float32, C.ggml_nelements(t.t))
+		_ = binary.Read(bytes.NewReader(t.bytes), binary.LittleEndian, f32s)
 	}

 	return
@@ -411,8 +366,6 @@ func (t *Tensor) DType() ml.DType {
 	switch t.t._type {
 	case C.GGML_TYPE_F32:
 		return ml.DTypeF32
-	case C.GGML_TYPE_F16:
-		return ml.DTypeF16
 	case C.GGML_TYPE_I32:
 		return ml.DTypeI32
 	default:
@@ -458,15 +411,6 @@ func (t *Tensor) Mulmat(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
 	}
 }

-func (t *Tensor) MulmatFullPrec(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
-	mul := C.ggml_mul_mat(ctx.(*Context).ctx, t.t, t2.(*Tensor).t)
-	C.ggml_mul_mat_set_prec(mul, C.GGML_PREC_F32)
-
-	return &Tensor{
-		t: mul,
-	}
-}
-
 func (t *Tensor) LayerNorm(ctx ml.Context, w, b ml.Tensor, eps float32) ml.Tensor {
 	tt := (&Tensor{t: C.ggml_norm(ctx.(*Context).ctx, t.t, C.float(eps))}).Mul(ctx, w)
 	if b != nil {
@@ -480,7 +424,7 @@ func (t *Tensor) RMSNorm(ctx ml.Context, w ml.Tensor, eps float32) ml.Tensor {
 	return (&Tensor{t: C.ggml_norm(ctx.(*Context).ctx, t.t, C.float(eps))}).Mul(ctx, w)
 }

-func (t *Tensor) Pad(ctx ml.Context, shape ...int) ml.Tensor {
+func (t *Tensor) Pad(ctx ml.Context, shape ...int64) ml.Tensor {
 	if len(shape) != 4 {
 		panic("expected 4 dimensions")
 	}
@@ -512,7 +456,7 @@ func (t *Tensor) Copy(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
 	}
 }

-func (t *Tensor) Reshape(ctx ml.Context, shape ...int) ml.Tensor {
+func (t *Tensor) Reshape(ctx ml.Context, shape ...int64) ml.Tensor {
 	switch len(shape) {
 	case 1:
 		return &Tensor{
@@ -553,7 +497,7 @@ func (t *Tensor) Tanh(ctx ml.Context) ml.Tensor {
 	}
 }

-func (t *Tensor) Unpad(ctx ml.Context, shape ...int) ml.Tensor {
+func (t *Tensor) Unpad(ctx ml.Context, shape ...int64) ml.Tensor {
 	if len(shape) != 4 {
 		panic("expected 4 dimensions")
 	}
@@ -596,28 +540,20 @@ func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
 }

 const (
-	ropeTypeNorm   C.int = 0
-	ropeTypeNeox   C.int = 2
-	ropeTypeMrope  C.int = 8
-	ropeTypeVision C.int = 24
+	ropeTypeNorm C.int = iota
 )

-func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim, ropeType uint32, ropeBase, ropeScale float32) ml.Tensor {
+func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim uint32, ropeBase, ropeScale float32) ml.Tensor {
 	if ropeFactors == nil {
 		ropeFactors = &Tensor{}
 	}

-	dequant := t.t
-	if C.ggml_is_quantized(t.t._type) {
-		dequant = C.ggml_cast(ctx.(*Context).ctx, t.t, C.GGML_TYPE_F32)
-	}
-
 	return &Tensor{
 		t: C.ggml_rope_ext(
-			ctx.(*Context).ctx, dequant, positionIDs.(*Tensor).t, ropeFactors.(*Tensor).t,
+			ctx.(*Context).ctx, t.t, positionIDs.(*Tensor).t, ropeFactors.(*Tensor).t,
 			C.int(ropeDim),
-			C.int(ropeType),
-			131072, // YaRN n_ctx_train
+			131072,       // YaRN n_ctx_train
+			ropeTypeNorm, // ROPE_TYPE_NORM
 			C.float(ropeBase),
 			C.float(ropeScale),
 			0.,  // YaRN ext_factor
@@ -645,34 +581,3 @@ func (t *Tensor) Conv2D(ctx ml.Context, t2 ml.Tensor, s0, s1, p0, p1, d0, d1 int
 		t: C.ggml_conv_2d(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.int(s0), C.int(s1), C.int(p0), C.int(p1), C.int(d0), C.int(d1)),
 	}
 }
-
-func (b *Backend) SystemInfo() string {
-	var compiler string
-	switch C.get_compiler() {
-	case C.COMP_UNKNOWN:
-		compiler = "cgo(unknown_compiler)"
-	case C.COMP_GCC:
-		compiler = "cgo(gcc)"
-	case C.COMP_CLANG:
-		compiler = "cgo(clang)"
-	}
-
-	var s string
-	for i := range C.ggml_backend_reg_count() {
-		reg := C.ggml_backend_reg_get(i)
-		fName := C.CString("ggml_backend_get_features")
-		defer C.free(unsafe.Pointer(fName))
-		get_features_fn := C.ggml_backend_reg_get_proc_address(reg, fName)
-		if get_features_fn != nil {
-			s += C.GoString(C.ggml_backend_reg_name(reg))
-			s += " : "
-			for features := C.getBackendFeatures(get_features_fn, reg); features.name != nil; features = C.getNextBackendFeatures(features) {
-				s += C.GoString(features.name)
-				s += " = "
-				s += C.GoString(features.value)
-				s += " | "
-			}
-		}
-	}
-	return s + compiler
-}
--- a/ml/backend/ggml/ggml/.rsync-filter
+++ b/ml/backend/ggml/ggml/.rsync-filter
@@ -1,5 +1,6 @@
 protect *.go
 protect *-embed.*
+protect ollama-debug.*
 include include/
 include src/
 include src/CMakeLists.txt
--- a/ml/backend/ggml/ggml/include/ollama-debug.h
+++ b/ml/backend/ggml/ggml/include/ollama-debug.h
@@ -0,0 +1,11 @@
+// Debug hook for dumping ggml tensors to stderr.
+#pragma once
+
+#include <stdbool.h>
+
+#include "ggml.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// ollama_debug writes a one-line summary (name, op, type, dimensions) of
+// tensor and of its first two sources to stderr. When verbose is true the
+// element values of F16, F32 and I32 tensors are printed as well.
+void ollama_debug(const struct ggml_tensor *tensor, bool verbose);
+
+#ifdef __cplusplus
+}
+#endif
--- a/ml/backend/ggml/ggml/src/debug.go
+++ b/ml/backend/ggml/ggml/src/debug.go
@@ -0,0 +1,6 @@
+//go:build debug
+
+package ggml
+
+// #cgo CPPFLAGS: -DOLLAMA_DEBUG
+import "C"
--- a/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp
@@ -215,11 +215,6 @@ struct ggml_backend_registry {
        GGML_LOG_DEBUG("%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device));
 #endif
        devices.push_back({device, score});
-        std::stable_sort(devices.begin(), devices.end(),
-            [](const auto & a, const auto & b) {
-                return a.second > b.second;
-            }
-        );
    }

    ggml_backend_reg_t load_backend(const std::wstring & path, bool silent) {
@@ -343,7 +338,12 @@ size_t ggml_backend_dev_count() {

 ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
     GGML_ASSERT(index < ggml_backend_dev_count());
-    return get_reg().devices[index].first;
+    // Return the index-th device in descending score order. A heap only
+    // guarantees the position of the best element, so the (device, score)
+    // pairs are fully sorted; the sort is stable so devices with equal
+    // scores keep their registration order. The sort runs on a copy so the
+    // registry itself is not mutated by a lookup.
+    auto devices = get_reg().devices;
+    std::stable_sort(devices.begin(), devices.end(),
+        [](const auto & a, const auto & b) { return a.second > b.second; });
+
+    return devices[index].first;
 }

 ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) {
--- a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c
@@ -13,6 +13,8 @@
 #include "amx/amx.h"
 #include "ggml.h"

+#include "ollama-debug.h"
+
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
 #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
@@ -13645,6 +13647,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {

        ggml_compute_forward(&params, node);

+        ollama_debug(node, true);
+
        if (state->ith == 0 && cplan->abort_callback &&
                cplan->abort_callback(cplan->abort_callback_data)) {
            tp->abort = true;
--- a/ml/backend/ggml/ggml/src/ollama-debug.c
+++ b/ml/backend/ggml/ggml/src/ollama-debug.c
@@ -0,0 +1,110 @@
+#include <string.h>
+
+#include "ollama-debug.h"
+
+// mul returns the product of the first ndims entries of dims. The running
+// product is kept in an int; a non-positive ndims yields 1.
+static int mul(int64_t *dims, int ndims) {
+    int product = 1;
+    for (int left = ndims; left > 0; left--) {
+        product *= *dims++;
+    }
+
+    return product;
+}
+
+// repeat writes the character c to stderr n times; nothing is written when
+// n is zero or negative.
+static void repeat(char c, int n) {
+    for (; n > 0; n--) {
+        fputc(c, stderr);
+    }
+}
+
+// print_tensor recursively writes a bracketed, comma-separated dump of
+// tensor data to stderr.
+//
+//   tensor  pointer to the flat element data
+//   cb      prints the element at a given flat index
+//   shape   number of dimensions of the top-level tensor (used to compute
+//           the indentation of nested rows)
+//   dims    extents for this level; dims[0] is iterated here and the
+//           remaining ndims-1 extents are handled by recursion
+//   stride  flat index of the first element covered by this call
+//   nitems  number of entries shown at the start and at the end of each
+//           dimension; entries in between are replaced by "... (N more)"
+//   pad     extra indentation applied to continuation lines
+//
+// NOTE(review): flat offsets treat dims[0] as the slowest-varying
+// dimension — confirm this matches the memory layout of the tensors passed in.
+static void print_tensor(const void *tensor, void (*cb)(const void *, int),
+                         int shape,
+                         int64_t *dims, int ndims, int stride,
+                         int nitems, int pad) {
+    fprintf(stderr, "[");
+    // Index with int64_t so large extents are not truncated.
+    for (int64_t i = 0; i < dims[0]; i++) {
+        if (i >= nitems && i < dims[0] - nitems) {
+            int64_t skip = dims[0] - 2 * nitems;
+            // int64_t need not be long long; cast to match %lld.
+            fprintf(stderr, "... (%lld more), ", (long long)skip);
+            if (ndims > 1) {
+                stride += (int)(mul(dims + 1, ndims - 1) * skip);
+                repeat('\n', ndims - 1);
+                repeat(' ', shape - ndims + 1 + pad);
+            }
+            // Jump to the last skipped entry; the loop increment moves on
+            // to the first entry of the trailing group.
+            i += skip - 1;
+        } else if (ndims > 1) {
+            print_tensor(tensor, cb, shape, dims + 1, ndims - 1, stride,
+                         nitems, pad);
+            stride += mul(dims + 1, ndims - 1);
+            if (i < dims[0] - 1) {
+                fprintf(stderr, ", ");
+                repeat('\n', ndims - 1);
+                repeat(' ', shape - ndims + 1 + pad);
+            }
+        } else {
+            cb(tensor, stride + (int)i);
+            if (i < dims[0] - 1) {
+                fprintf(stderr, ", ");
+            }
+        }
+    }
+    fprintf(stderr, "]");
+}
+
+// Element printers passed to print_tensor. Each one treats tensor as a flat
+// array of its element type and writes element i to stderr.
+
+static void print_tensor_f16(const void *tensor, int i) {
+    const ggml_fp16_t *values = tensor;
+    fprintf(stderr, "%f", ggml_fp16_to_fp32(values[i]));
+}
+
+static void print_tensor_f32(const void *tensor, int i) {
+    const float *values = tensor;
+    fprintf(stderr, "%f", values[i]);
+}
+
+static void print_tensor_i32(const void *tensor, int i) {
+    const int32_t *values = tensor;
+    fprintf(stderr, "%d", values[i]);
+}
+
+// ollama_debug_tensor writes a description of tensor to stderr: a header
+// line "<prefix><name> <op> (<type>): [ne0 ne1 ne2 ne3]" and, when verbose
+// is set, the element values preceded by indent spaces. Values are printed
+// for F16, F32 and I32 tensors only, showing at most 3 leading and 3
+// trailing entries per dimension; other types print "<unsupported type>".
+static void ollama_debug_tensor(const struct ggml_tensor *tensor, bool verbose, const char *prefix, int indent) {
+    // The extents are passed on as int64_t, which need not be long long;
+    // cast so the arguments match the %lld conversions on every platform.
+    fprintf(stderr, "%s%s %s (%s): [%lld %lld %lld %lld]\n", prefix, tensor->name,
+            ggml_op_name(tensor->op), ggml_type_name(tensor->type),
+            (long long)tensor->ne[0], (long long)tensor->ne[1],
+            (long long)tensor->ne[2], (long long)tensor->ne[3]);
+
+    if (!verbose) {
+        return;
+    }
+
+    for (int i = 0; i < indent; i++) {
+        fprintf(stderr, " ");
+    }
+
+    // Pick the element printer once instead of repeating the dump call.
+    void (*print_element)(const void *, int);
+    switch (tensor->type) {
+    case GGML_TYPE_F16:
+        print_element = print_tensor_f16;
+        break;
+    case GGML_TYPE_F32:
+        print_element = print_tensor_f32;
+        break;
+    case GGML_TYPE_I32:
+        print_element = print_tensor_i32;
+        break;
+    default:
+        fprintf(stderr, "<unsupported type>\n");
+        return;
+    }
+
+    // Do not dereference a tensor that has no data attached.
+    const void *data = ggml_get_data(tensor);
+    if (data == NULL) {
+        fprintf(stderr, "<no data>\n");
+        return;
+    }
+
+    int ndims = ggml_n_dims(tensor);
+    print_tensor(data, print_element, ndims, (int64_t *)tensor->ne, ndims, 0, 3, indent);
+
+    fprintf(stderr, "\n");
+}
+
+// ollama_debug dumps tensor (prefixed ">>> ") followed by each of its first
+// two sources that is set (prefixed " ?? ") to stderr.
+void ollama_debug(const struct ggml_tensor *tensor, bool verbose) {
+    ollama_debug_tensor(tensor, verbose, ">>> ", 4);
+
+    for (int i = 0; i < 2; i++) {
+        const struct ggml_tensor *src = tensor->src[i];
+        if (src != NULL) {
+            ollama_debug_tensor(src, verbose, " ?? ", 4);
+        }
+    }
+}
--- a/model/bert/model.go
+++ b/model/bert/model.go
@@ -0,0 +1,185 @@
+package bert
+
+import (
+	"math"
+
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/ml/nn"
+	"github.com/ollama/ollama/model"
+)
+
+// init registers New as the constructor for the "bert" model name.
+func init() {
+	model.Register("bert", New)
+}
+
+// PoolingType selects how Model.Forward reduces the encoder output. It is
+// read from the "pooling_type" config value in New. Forward only acts on
+// PoolingTypeMean; for every other value the hidden states are returned
+// as they come out of the last layer.
+type PoolingType int
+
+// NOTE(review): the numbering presumably mirrors the values stored in the
+// model file — confirm before reordering these constants.
+const (
+	PoolingTypeNone PoolingType = iota
+	PoolingTypeMean
+	PoolingTypeCLS
+	PoolingTypeLast
+	PoolingTypeRank
+)
+
+// Options holds the hyperparameters shared by all layers. New fills them
+// from the model config: hiddenSize from "embedding_length", numHeads from
+// "attention.head_count", eps from "attention.layer_norm_epsilon" and
+// poolingType from "pooling_type".
+type Options struct {
+	hiddenSize, numHeads int64
+	eps                  float32
+	poolingType          PoolingType
+}
+
+// Model is the BERT encoder. The gguf tags name the tensors each field is
+// loaded from; Layers is sized from "block_count" in New.
+type Model struct {
+	model.Base
+	model.BytePairEncoding
+
+	// Embeddings summed and normalized at the start of Forward.
+	TokenEmbedding     *nn.Embedding `gguf:"token_embd"`
+	TypeEmbedding      *nn.Embedding `gguf:"type_embd,alt:token_types"`
+	PositionEmbedding  *nn.Embedding `gguf:"position_embd"`
+	TokenEmbeddingNorm *nn.LayerNorm `gguf:"token_embd_norm"`
+
+	// Encoder blocks, applied in order.
+	Layers []EncoderLayer `gguf:"blk"`
+
+	*Options
+}
+
+// Forward implements model.Model.
+//
+// It sums the token, type and position embeddings, normalizes the result
+// and runs it through every encoder layer. When the model uses
+// PoolingTypeMean the hidden states are then multiplied by an averaging
+// matrix; for any other pooling type they are returned unpooled. An error
+// is returned if one of the input tensors cannot be created.
+func (m *Model) Forward(ctx ml.Context, opts model.Options) (ml.Tensor, error) {
+	inputs, err := ctx.FromIntSlice(opts.Inputs(), len(opts.Inputs()))
+	if err != nil {
+		return nil, err
+	}
+
+	// A single type id of 0 is used for the whole input.
+	types, err := ctx.FromIntSlice([]int32{0}, 1)
+	if err != nil {
+		return nil, err
+	}
+
+	positions, err := ctx.FromIntSlice(opts.Positions(), len(opts.Positions()))
+	if err != nil {
+		return nil, err
+	}
+
+	hiddenState := m.TokenEmbedding.Forward(ctx, inputs)
+	hiddenState = hiddenState.Add(ctx, m.TypeEmbedding.Forward(ctx, types))
+	hiddenState = hiddenState.Add(ctx, m.PositionEmbedding.Forward(ctx, positions))
+	hiddenState = m.TokenEmbeddingNorm.Forward(ctx, hiddenState, m.eps)
+
+	for i, layer := range m.Layers {
+		hiddenState = layer.Forward(ctx, hiddenState, positions, opts.Cache.Sub(i), m.Options)
+	}
+
+	if m.poolingType == PoolingTypeMean {
+		// TODO: handle batch
+		//
+		// Each token contributes 1/n to the mean, where n is the number of
+		// tokens. Dividing by the sum of the position values instead would
+		// scale the result incorrectly and divide by zero for a single
+		// token at position 0. Only the first n entries are set; the rest
+		// of the n*n matrix stays zero.
+		n := len(opts.Positions())
+		weight := 1 / float32(n)
+		f32s := make([]float32, n*n)
+		for i := 0; i < n; i++ {
+			f32s[i] = weight
+		}
+
+		means, err := ctx.FromFloatSlice(f32s, n, n)
+		if err != nil {
+			return nil, err
+		}
+
+		hiddenState = hiddenState.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
+		hiddenState = hiddenState.Mulmat(ctx, means)
+	}
+
+	return hiddenState, nil
+}
+
+// EncoderLayer is one encoder block: self-attention and a feed-forward
+// network, each followed by a residual addition and a layer norm.
+type EncoderLayer struct {
+	*SelfAttention
+	MLPNorm *nn.LayerNorm `gguf:"attn_output_norm"`
+	*MLP
+	LayerOutputNorm *nn.LayerNorm `gguf:"layer_output_norm"`
+}
+
+// Forward applies the block to hiddenState and returns the normalized
+// output of the feed-forward stage.
+func (e *EncoderLayer) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache model.Cache, opts *Options) ml.Tensor {
+	// Attention, residual connection with the layer input, then norm.
+	attn := e.SelfAttention.Forward(ctx, hiddenState, positionIDs, cache, opts)
+	attn = e.MLPNorm.Forward(ctx, attn.Add(ctx, hiddenState), opts.eps)
+
+	// Feed-forward, residual connection with the attention output, then norm.
+	ffn := e.MLP.Forward(ctx, attn, opts)
+	return e.LayerOutputNorm.Forward(ctx, ffn.Add(ctx, attn), opts.eps)
+}
+
+// SelfAttention holds the query, key, value and output projections of one
+// attention block.
+type SelfAttention struct {
+	Query  *nn.Linear `gguf:"attn_q"`
+	Key    *nn.Linear `gguf:"attn_k"`
+	Value  *nn.Linear `gguf:"attn_v"`
+	Output *nn.Linear `gguf:"attn_output"`
+}
+
+// Forward computes scaled dot-product attention over hiddenState and
+// returns the result of the output projection. Keys and values are passed
+// through cache.Put before use. positionIDs is not referenced here.
+func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache model.Cache, opts *Options) ml.Tensor {
+	// NOTE(review): dimension 1 of hiddenState is presumably the number of
+	// tokens in the input — confirm against the caller.
+	batchSize := hiddenState.Dim(1)
+	headDim := opts.hiddenSize / opts.numHeads
+
+	// Project and split the hidden dimension into numHeads heads of headDim.
+	query := sa.Query.Forward(ctx, hiddenState)
+	query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
+
+	key := sa.Key.Forward(ctx, hiddenState)
+	key = key.Reshape(ctx, headDim, opts.numHeads, batchSize)
+
+	value := sa.Value.Forward(ctx, hiddenState)
+	value = value.Reshape(ctx, headDim, opts.numHeads, batchSize)
+
+	key, value = cache.Put(ctx, key, value, cache.Options)
+
+	// Rearrange for the two matrix multiplications below; value uses a
+	// different permutation than query and key.
+	query = query.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
+	key = key.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
+	value = value.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
+
+	// Attention weights: softmax(K·Q / sqrt(headDim)).
+	scores := key.Mulmat(ctx, query)
+	scores = scores.Scale(ctx, 1.0/math.Sqrt(float64(headDim)))
+	scores = scores.Softmax(ctx)
+
+	// Weight the values and merge the heads back into hiddenSize.
+	attention := value.Mulmat(ctx, scores)
+	attention = attention.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
+	attention = attention.Reshape(ctx, opts.hiddenSize, batchSize)
+
+	return sa.Output.Forward(ctx, attention)
+}
+
+// MLP is the feed-forward network of an encoder block: an up projection,
+// a GELU activation and a down projection.
+type MLP struct {
+	Up   *nn.Linear `gguf:"ffn_up"`
+	Down *nn.Linear `gguf:"ffn_down"`
+}
+
+// Forward applies Up, GELU and Down to hiddenState in that order. opts is
+// accepted for symmetry with the other layers and is not used.
+func (mlp *MLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *Options) ml.Tensor {
+	up := mlp.Up.Forward(ctx, hiddenState)
+	activated := up.GELU(ctx)
+	return mlp.Down.Forward(ctx, activated)
+}
+
+// New builds a Model from the config c: it allocates "block_count" encoder
+// layers, sets up the tokenizer from the tokenizer.ggml.* values and reads
+// the hyperparameters into Options. It always returns a nil error.
+func New(c ml.Config) (model.Model, error) {
+	layers := make([]EncoderLayer, c.Uint("block_count"))
+
+	// The second argument is the pretokenizer pattern used when the config
+	// does not provide one.
+	pretokenizer := c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`)
+	vocabulary := &model.Vocabulary{
+		Values: c.Strings("tokenizer.ggml.tokens"),
+		Types:  c.Uints("tokenizer.ggml.token_type"),
+		Merges: c.Strings("tokenizer.ggml.merges"),
+		BOS:    c.Uint("tokenizer.ggml.bos_token_id"),
+		EOS:    c.Uint("tokenizer.ggml.eos_token_id"),
+	}
+	tokenizer := model.NewBytePairEncoding(pretokenizer, vocabulary)
+
+	options := &Options{
+		hiddenSize:  int64(c.Uint("embedding_length")),
+		numHeads:    int64(c.Uint("attention.head_count")),
+		eps:         c.Float("attention.layer_norm_epsilon"),
+		poolingType: PoolingType(c.Uint("pooling_type")),
+	}
+
+	return &Model{
+		Layers:           layers,
+		BytePairEncoding: tokenizer,
+		Options:          options,
+	}, nil
+}
--- a/model/bert/model_test.go
+++ b/model/bert/model_test.go
@@ -0,0 +1,75 @@
+package bert_test
+
+import (
+	"encoding/json"
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/model"
+)
+
+// blob returns the path of the model blob for the given tag of the
+// "all-minilm" library model in the local store under ~/.ollama/models.
+//
+// It reads the tag's manifest, picks the first layer whose media type is
+// "application/vnd.ollama.image.model", and maps that layer's digest to a
+// file name in the blobs directory (":" replaced by "-").
+//
+// The calling test is skipped when the manifest does not exist, because that
+// only means the model has not been pulled on this machine. Every other
+// failure is fatal.
+func blob(t *testing.T, tag string) string {
+	t.Helper()
+	home, err := os.UserHomeDir()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	p := filepath.Join(home, ".ollama", "models")
+	manifestPath := filepath.Join(p, "manifests", "registry.ollama.ai", "library", "all-minilm", tag)
+	manifestBytes, err := os.ReadFile(manifestPath)
+	if os.IsNotExist(err) {
+		// A missing manifest is an environment precondition, not a bug.
+		t.Skipf("model manifest %s not found; pull all-minilm:%s to run this test", manifestPath, tag)
+	}
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	var manifest struct {
+		Layers []struct {
+			MediaType string `json:"mediaType"`
+			Digest    string `json:"digest"`
+		}
+	}
+
+	if err := json.Unmarshal(manifestBytes, &manifest); err != nil {
+		t.Fatal(err)
+	}
+
+	var digest string
+	for _, layer := range manifest.Layers {
+		if layer.MediaType == "application/vnd.ollama.image.model" {
+			digest = layer.Digest
+			break
+		}
+	}
+
+	if digest == "" {
+		t.Fatal("no model layer found")
+	}
+
+	return filepath.Join(p, "blobs", strings.ReplaceAll(digest, ":", "-"))
+}
+
+// TestEmbedding loads the model blob tagged "latest", encodes the contents of
+// ../testdata/war-and-peace.txt, runs a forward pass over the resulting ids
+// and logs a dump of the output tensor. It is a smoke test: it makes no
+// assertion on the computed values.
+func TestEmbedding(t *testing.T) {
+	m, err := model.New(blob(t, "latest"))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Check the assertion explicitly so a model without a text processor is
+	// reported as a test failure instead of a panic.
+	processor, ok := m.(model.TextProcessor)
+	if !ok {
+		t.Fatalf("%T does not implement model.TextProcessor", m)
+	}
+
+	text, err := os.ReadFile(filepath.Join("..", "testdata", "war-and-peace.txt"))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	inputIDs, err := processor.Encode(string(text))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	logit, err := model.Forward(m, model.WithInputIDs(inputIDs))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	t.Log(ml.Dump(logit))
+}
--- a/model/models/llama/model.go
+++ b/model/models/llama/model.go
@@ -3,7 +3,6 @@ package llama
 import (
 	"math"

-	"github.com/ollama/ollama/kvcache"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/ml/nn"
 	"github.com/ollama/ollama/model"
@@ -11,7 +10,7 @@ import (

 type Options struct {
 	RopeFactors                      ml.Tensor `gguf:"rope_freqs.weight"`
-	hiddenSize, numHeads, numKVHeads int
+	hiddenSize, numHeads, numKVHeads int64
 	eps, ropeBase, ropeScale         float32
 	ropeDim                          uint32
 }
@@ -29,32 +28,28 @@ type Model struct {
 }

 func New(c ml.Config) (model.Model, error) {
-	m := Model{
+	return &Model{
 		BytePairEncoding: model.NewBytePairEncoding(
 			c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
 			&model.Vocabulary{
 				Values: c.Strings("tokenizer.ggml.tokens"),
 				Types:  c.Uints("tokenizer.ggml.token_type"),
 				Merges: c.Strings("tokenizer.ggml.merges"),
-				BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id")),
-				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
+				BOS:    c.Uint("tokenizer.ggml.bos_token_id"),
+				EOS:    c.Uint("tokenizer.ggml.eos_token_id"),
 			},
 		),
 		Layers: make([]Layer, c.Uint("block_count")),
 		Options: &Options{
-			hiddenSize: int(c.Uint("embedding_length")),
-			numHeads:   int(c.Uint("attention.head_count")),
-			numKVHeads: int(c.Uint("attention.head_count_kv")),
+			hiddenSize: int64(c.Uint("embedding_length")),
+			numHeads:   int64(c.Uint("attention.head_count")),
+			numKVHeads: int64(c.Uint("attention.head_count_kv")),
 			eps:        c.Float("attention.layer_norm_rms_epsilon"),
 			ropeBase:   c.Float("rope.freq_base"),
 			ropeScale:  c.Float("rope.freq_scale", 1),
 			ropeDim:    c.Uint("rope.dimension_count"),
 		},
-	}
-
-	m.Cache = kvcache.NewCausalCache(m.Shift)
-
-	return &m, nil
+	}, nil
 }

 type SelfAttention struct {
@@ -64,32 +59,29 @@ type SelfAttention struct {
 	Output *nn.Linear `gguf:"attn_output"`
 }

-func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
+func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache model.Cache, opts *Options) ml.Tensor {
 	batchSize := hiddenState.Dim(1)
 	headDim := opts.hiddenSize / opts.numHeads
-	ropeType := uint32(0)

 	q := sa.Query.Forward(ctx, hiddenState)
 	q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
-	q = q.RoPE(ctx, positionIDs, opts.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
+	q = q.RoPE(ctx, positionIDs, opts.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)

 	k := sa.Key.Forward(ctx, hiddenState)
 	k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
-	k = k.RoPE(ctx, positionIDs, opts.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
+	k = k.RoPE(ctx, positionIDs, opts.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)

 	v := sa.Value.Forward(ctx, hiddenState)
 	v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)

-	cache.Put(ctx, k, v)
-	k, v, mask := cache.Get(ctx)
+	k, v = cache.Put(ctx, k, v, cache.Options)

 	q = q.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
 	k = k.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
 	v = v.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)

-	kq := k.MulmatFullPrec(ctx, q)
+	kq := k.Mulmat(ctx, q)
 	kq = kq.Scale(ctx, 1.0/math.Sqrt(float64(headDim)))
-	kq = kq.Add(ctx, mask)
 	kq = kq.Softmax(ctx)

 	kqv := v.Mulmat(ctx, kq)
@@ -99,10 +91,6 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
 	return sa.Output.Forward(ctx, kqv)
 }

-func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
-	return key.RoPE(ctx, shift, m.Options.RopeFactors, m.Options.ropeDim, uint32(0), m.Options.ropeBase, m.Options.ropeScale), nil
-}
-
 type MLP struct {
 	Up   *nn.Linear `gguf:"ffn_up"`
 	Down *nn.Linear `gguf:"ffn_down"`
@@ -121,7 +109,7 @@ type Layer struct {
 	MLP           *MLP
 }

-func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
+func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache model.Cache, opts *Options) ml.Tensor {
 	residual := hiddenState

 	hiddenState = l.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
@@ -135,12 +123,12 @@ func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cach
 }

 func (m *Model) Forward(ctx ml.Context, opts model.Options) (ml.Tensor, error) {
-	inputs, err := ctx.FromIntSlice(opts.Inputs, len(opts.Inputs))
+	inputs, err := ctx.FromIntSlice(opts.Inputs(), len(opts.Inputs()))
 	if err != nil {
 		return nil, err
 	}

-	positions, err := ctx.FromIntSlice(opts.Positions, len(opts.Positions))
+	positions, err := ctx.FromIntSlice(opts.Positions(), len(opts.Positions()))
 	if err != nil {
 		return nil, err
 	}
@@ -148,14 +136,13 @@ func (m *Model) Forward(ctx ml.Context, opts model.Options) (ml.Tensor, error) {
 	hiddenState := m.TokenEmbedding.Forward(ctx, inputs)

 	for i, layer := range m.Layers {
-		m.Cache.SetLayer(i)
-		hiddenState = layer.Forward(ctx, hiddenState, positions, m.Cache, m.Options)
+		hiddenState = layer.Forward(ctx, hiddenState, positions, opts.Cache.Sub(i), m.Options)
 	}

 	hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
 	hiddenState = m.Output.Forward(ctx, hiddenState)

-	outputs, err := ctx.FromIntSlice(opts.Outputs, len(opts.Outputs))
+	outputs, err := ctx.FromIntSlice([]int32{int32(len(opts.Positions())) - 1}, 1)
 	if err != nil {
 		return nil, err
 	}
--- a/model/models/mllama/imageproc.go
+++ b/model/models/mllama/imageproc.go
--- a/model/models/mllama/imageproc_test.go
+++ b/model/models/mllama/imageproc_test.go
--- a/model/models/mllama/model.go
+++ b/model/models/mllama/model.go
@@ -1,7 +1,6 @@
 package mllama

 import (
-	"github.com/ollama/ollama/kvcache"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/ml/nn"
 	"github.com/ollama/ollama/model"
@@ -19,31 +18,22 @@ type Model struct {
 	ImageProcessor
 }

-const (
-	crossAttentionLayer = iota
-	selfAttentionLayer
-)
-
 func New(c ml.Config) (model.Model, error) {
-	m := Model{
+	return &Model{
 		BytePairEncoding: model.NewBytePairEncoding(
 			c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
 			&model.Vocabulary{
 				Values: c.Strings("tokenizer.ggml.tokens"),
 				Types:  c.Uints("tokenizer.ggml.token_type"),
 				Merges: c.Strings("tokenizer.ggml.merges"),
-				BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id")),
-				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
+				BOS:    c.Uint("tokenizer.ggml.bos_token_id"),
+				EOS:    c.Uint("tokenizer.ggml.eos_token_id"),
 			},
 		),
 		ImageProcessor: newImageProcessor(c),
 		VisionModel:    newVisionModel(c),
 		TextModel:      newTextModel(c),
-	}
-
-	m.Cache = kvcache.NewWrapperCache(kvcache.NewEncoderCache(), kvcache.NewCausalCache(m.TextModel.Shift))
-
-	return &m, nil
+	}, nil
 }

 func (m *Model) Forward(ctx ml.Context, opts model.Options) (ml.Tensor, error) {
@@ -83,20 +73,20 @@ func (m *Model) Forward(ctx ml.Context, opts model.Options) (ml.Tensor, error) {
 		crossAttentionStates = m.Projector.Forward(ctx, crossAttentionStates)
 	}

-	inputs, err := ctx.FromIntSlice(opts.Inputs, len(opts.Inputs))
+	inputs, err := ctx.FromIntSlice(opts.Inputs(), len(opts.Inputs()))
 	if err != nil {
 		return nil, err
 	}

-	positions, err := ctx.FromIntSlice(opts.Positions, len(opts.Positions))
+	positions, err := ctx.FromIntSlice(opts.Positions(), len(opts.Positions()))
 	if err != nil {
 		return nil, err
 	}

 	// TODO: attention mask, cross attention mask
-	hiddenState := m.TextModel.Forward(ctx, inputs, positions, nil, crossAttentionStates, nil, m.Cache.(*kvcache.WrapperCache))
+	hiddenState := m.TextModel.Forward(ctx, inputs, positions, nil, crossAttentionStates, nil, opts.Cache)

-	outputs, err := ctx.FromIntSlice(opts.Outputs, len(opts.Outputs))
+	outputs, err := ctx.FromIntSlice([]int32{int32(len(opts.Positions())) - 1}, 1)
 	if err != nil {
 		return nil, err
 	}
--- a/model/models/mllama/model_text.go
+++ b/model/models/mllama/model_text.go
@@ -4,9 +4,9 @@ import (
 	"math"
 	"slices"

-	"github.com/ollama/ollama/kvcache"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/ml/nn"
+	"github.com/ollama/ollama/model"
 )

 type TextSelfAttention struct {
@@ -16,32 +16,34 @@ type TextSelfAttention struct {
 	Output *nn.Linear `gguf:"attn_output"`
 }

-func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions, _ ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
+func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions, mask ml.Tensor, cache model.Cache, opts *TextModelOptions) ml.Tensor {
 	batchSize := hiddenState.Dim(1)
 	headDim := opts.hiddenSize / opts.numHeads
-	ropeType := uint32(0)

 	query := sa.Query.Forward(ctx, hiddenState)
 	query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
-	query = query.RoPE(ctx, positions, opts.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
+	query = query.RoPE(ctx, positions, opts.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)

 	key := sa.Key.Forward(ctx, hiddenState)
 	key = key.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
-	key = key.RoPE(ctx, positions, opts.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
+	key = key.RoPE(ctx, positions, opts.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)

 	value := sa.Value.Forward(ctx, hiddenState)
 	value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)

-	cache.Put(ctx, key, value)
-	key, value, mask := cache.Get(ctx)
+	key, value = cache.Put(ctx, key, value, cache.Options)

 	query = query.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
 	key = key.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
 	value = value.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)

-	scores := key.MulmatFullPrec(ctx, query)
+	scores := key.Mulmat(ctx, query)
 	scores = scores.Scale(ctx, 1.0/math.Sqrt(float64(headDim)))
-	scores = scores.Add(ctx, mask)
+
+	if mask != nil {
+		scores = scores.Add(ctx, mask)
+	}
+
 	scores = scores.Softmax(ctx)

 	attention := value.Mulmat(ctx, scores)
@@ -51,11 +53,6 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions, _ m
 	return sa.Output.Forward(ctx, attention)
 }

-func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
-	// This will only get called for layers in the cache, which are just the self attention layers
-	return key.RoPE(ctx, shift, m.RopeFactors, m.ropeDim, uint32(0), m.ropeBase, m.ropeScale), nil
-}
-
 type TextMLP struct {
 	Up   *nn.Linear `gguf:"ffn_up"`
 	Down *nn.Linear `gguf:"ffn_down"`
@@ -75,7 +72,7 @@ type TextSelfAttentionDecoderLayer struct {
 	MLP     *TextMLP
 }

-func (d *TextSelfAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, positions, mask, _, _ ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
+func (d *TextSelfAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, positions, mask, _, _ ml.Tensor, cache model.Cache, opts *TextModelOptions) ml.Tensor {
 	residual := hiddenState

 	hiddenState = d.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
@@ -97,29 +94,23 @@ type TextCrossAttention struct {
 	Output    *nn.Linear  `gguf:"cross_attn_o_proj"`
 }

-func (ca *TextCrossAttention) Forward(ctx ml.Context, hiddenState, crossAttentionStates ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
+func (ca *TextCrossAttention) Forward(ctx ml.Context, hiddenState, crossAttentionStates ml.Tensor, cache model.Cache, opts *TextModelOptions) ml.Tensor {
 	batchSize := hiddenState.Dim(1)
 	headDim := opts.hiddenSize / opts.numHeads
+	numVisionTokens, numTiles := crossAttentionStates.Dim(1), crossAttentionStates.Dim(2)

 	query := ca.Query.Forward(ctx, hiddenState)
 	query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
 	query = ca.QueryNorm.Forward(ctx, query, opts.eps)

-	var key, value ml.Tensor
-	if crossAttentionStates != nil {
-		numVisionTokens, numTiles := crossAttentionStates.Dim(1), crossAttentionStates.Dim(2)
+	key := ca.Key.Forward(ctx, crossAttentionStates)
+	key = key.Reshape(ctx, headDim, opts.numKVHeads, numVisionTokens*numTiles)
+	key = ca.KeyNorm.Forward(ctx, key, opts.eps)

-		key = ca.Key.Forward(ctx, crossAttentionStates)
-		key = key.Reshape(ctx, headDim, opts.numKVHeads, numVisionTokens*numTiles)
-		key = ca.KeyNorm.Forward(ctx, key, opts.eps)
+	value := ca.Value.Forward(ctx, crossAttentionStates)
+	value = value.Reshape(ctx, headDim, opts.numKVHeads, numVisionTokens*numTiles)

-		value = ca.Value.Forward(ctx, crossAttentionStates)
-		value = value.Reshape(ctx, headDim, opts.numKVHeads, numVisionTokens*numTiles)
-
-		cache.Put(ctx, key, value)
-	} else {
-		key, value, _ = cache.Get(ctx)
-	}
+	// TODO cache key, value

 	query = query.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
 	key = key.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
@@ -146,7 +137,7 @@ type TextCrossAttentionDecoderLayer struct {
 	MLPGate ml.Tensor `gguf:"cross_attn_mlp_gate"`
 }

-func (d *TextCrossAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, _, _, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
+func (d TextCrossAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, _, _, crossAttentionStates, crossAttentionMask ml.Tensor, cache model.Cache, opts *TextModelOptions) ml.Tensor {
 	residual := hiddenState

 	hiddenState = d.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
@@ -162,25 +153,17 @@ func (d *TextCrossAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, _,
 }

 type TextDecoderLayer interface {
-	Forward(ctx ml.Context, hiddenState, positionIDs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor
+	Forward(ctx ml.Context, hiddenState, positionIDs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache model.Cache, opts *TextModelOptions) ml.Tensor
 }

 type TextDecoder struct {
 	Layers []TextDecoderLayer
 }

-func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
+func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache model.Cache, opts *TextModelOptions) ml.Tensor {
 	for i, layer := range d.Layers {
-		layerType := selfAttentionLayer
-		if slices.Contains(opts.crossAttentionLayers, uint32(i)) {
-			layerType = crossAttentionLayer
-		}
-
-		cache.SetLayer(i)
-		cache.SetLayerType(layerType)
-
-		if layerType == selfAttentionLayer || crossAttentionStates != nil || cache.UnderlyingCache().(*kvcache.EncoderCache).EncoderCached() {
-			hiddenState = layer.Forward(ctx, hiddenState, positionIDs, mask, crossAttentionStates, crossAttentionMask, cache, opts)
+		if !slices.Contains(opts.crossAttentionLayers, uint32(i)) || crossAttentionStates != nil {
+			hiddenState = layer.Forward(ctx, hiddenState, positionIDs, mask, crossAttentionStates, crossAttentionMask, cache.Sub(i), opts)
 		}
 	}

@@ -190,7 +173,7 @@ func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, mask, cr
 type TextModelOptions struct {
 	RopeFactors ml.Tensor `gguf:"rope_freqs.weight"`

-	hiddenSize, numHeads, numKVHeads int
+	hiddenSize, numHeads, numKVHeads int64
 	eps, ropeBase, ropeScale         float32
 	ropeDim                          uint32

@@ -206,7 +189,7 @@ type TextModel struct {
 	*TextModelOptions
 }

-func (m *TextModel) Forward(ctx ml.Context, inputIDs, positionIDs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache) ml.Tensor {
+func (m *TextModel) Forward(ctx ml.Context, inputIDs, positionIDs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache model.Cache) ml.Tensor {
 	hiddenState := m.TokenEmbedding.Forward(ctx, inputIDs)
 	hiddenState = m.Transformer.Forward(ctx, hiddenState, positionIDs, mask, crossAttentionStates, crossAttentionMask, cache, m.TextModelOptions)
 	hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
@@ -229,9 +212,9 @@ func newTextModel(c ml.Config) *TextModel {
 	return &TextModel{
 		Transformer: &TextDecoder{Layers: decoderLayers},
 		TextModelOptions: &TextModelOptions{
-			hiddenSize:           int(c.Uint("embedding_length")),
-			numHeads:             int(c.Uint("attention.head_count")),
-			numKVHeads:           int(c.Uint("attention.head_count_kv")),
+			hiddenSize:           int64(c.Uint("embedding_length")),
+			numHeads:             int64(c.Uint("attention.head_count")),
+			numKVHeads:           int64(c.Uint("attention.head_count_kv")),
 			eps:                  c.Float("attention.layer_norm_rms_epsilon"),
 			ropeBase:             c.Float("rope.freq_base"),
 			ropeScale:            c.Float("rope.freq_scale", 1),
--- a/model/models/mllama/model_vision.go
+++ b/model/models/mllama/model_vision.go
@@ -8,7 +8,7 @@ import (
 	"github.com/ollama/ollama/ml/nn"
 )

-var batchSize int = 1
+var batchSize int64 = 1

 type VisionSelfAttention struct {
 	Query  *nn.Linear `gguf:"attn_q"`
@@ -99,7 +99,7 @@ func (e *VisionEncoder) Forward(ctx ml.Context, hiddenState ml.Tensor, intermedi
 	var intermediateHiddenStates []ml.Tensor
 	for i, layer := range e.Layers {
 		if slices.Contains(intermediateLayersIndices, uint32(i)) {
-			intermediateHiddenStates = append(intermediateHiddenStates, hiddenState.Reshape(ctx, append([]int{1}, hiddenState.Shape()...)...))
+			intermediateHiddenStates = append(intermediateHiddenStates, hiddenState.Reshape(ctx, append([]int64{1}, hiddenState.Shape()...)...))
 		}

 		hiddenState = layer.Forward(ctx, hiddenState, opts)
@@ -131,7 +131,7 @@ type PrecomputedPositionEmbedding struct {
 	TilePositionEmbeddingGate ml.Tensor     `gguf:"tile_position_embd.gate"`
 }

-func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, positionIDs, aspectRatioIDs ml.Tensor, numPositions int, opts *VisionModelOptions) ml.Tensor {
+func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, positionIDs, aspectRatioIDs ml.Tensor, numPositions int64, opts *VisionModelOptions) ml.Tensor {
 	positionEmbedding := e.PositionEmbedding.Forward(ctx, positionIDs)
 	if e.PositionEmbeddingGate != nil {
 		positionEmbedding = positionEmbedding.Mul(ctx, e.PositionEmbeddingGate)
@@ -149,7 +149,7 @@ func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, posi
 }

 type VisionModelOptions struct {
-	hiddenSize, numHeads, numTiles int
+	hiddenSize, numHeads, numTiles int64
 	imageSize, patchSize           int
 	eps                            float32

@@ -174,7 +174,7 @@ type VisionModel struct {
 }

 func (m *VisionModel) Forward(ctx ml.Context, pixelValues, positionIDs, aspectRatioIDs ml.Tensor) ml.Tensor {
-	numPatches := (m.imageSize / m.patchSize) * (m.imageSize / m.patchSize)
+	numPatches := int64((m.imageSize / m.patchSize) * (m.imageSize / m.patchSize))
 	numPositions := numPatches
 	if m.ClassEmbedding != nil {
 		numPositions++
@@ -185,7 +185,7 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues, positionIDs, aspectRa
 	hiddenState = hiddenState.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)

 	hiddenState = m.PreTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, m.VisionModelOptions)
-	hiddenState = m.ClassEmbedding.Stack(ctx, 2, slices.Repeat([]ml.Tensor{m.ClassEmbedding}, m.numTiles-1)...).Concat(ctx, hiddenState, 1)
+	hiddenState = m.ClassEmbedding.Stack(ctx, 2, slices.Repeat([]ml.Tensor{m.ClassEmbedding}, int(m.numTiles)-1)...).Concat(ctx, hiddenState, 1)

 	hiddenState = m.PositionEmbedding.Forward(ctx, hiddenState, positionIDs, aspectRatioIDs, numPositions, m.VisionModelOptions)
 	hiddenState = m.PreLayerNorm.Forward(ctx, hiddenState, m.eps)
@@ -205,7 +205,7 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues, positionIDs, aspectRa
 	hiddenState, _ = m.GlobalTransformer.Forward(ctx, hiddenState, nil, m.VisionModelOptions)

 	hiddenStates := intermediateHiddenStates[0].Stack(ctx, 0, intermediateHiddenStates[1:]...)
-	hiddenStates = hiddenStates.Reshape(ctx, len(intermediateHiddenStates)*m.hiddenSize, numPositions+numPaddingPatches, m.numTiles, batchSize)
+	hiddenStates = hiddenStates.Reshape(ctx, int64(len(intermediateHiddenStates))*m.hiddenSize, numPositions+numPaddingPatches, m.numTiles, batchSize)
 	hiddenStates = hiddenStates.Unpad(ctx, 0, numPaddingPatches, 0, 0)

 	hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, m.numTiles, batchSize)
@@ -219,9 +219,9 @@ func newVisionModel(c ml.Config) *VisionModel {
 		GlobalTransformer: &VisionEncoder{Layers: make([]VisionEncoderLayer, c.Uint("vision.global.block_count"))},

 		VisionModelOptions: &VisionModelOptions{
-			hiddenSize: int(c.Uint("vision.embedding_length")),
-			numHeads:   int(c.Uint("vision.attention.head_count")),
-			numTiles:   int(c.Uint("vision.max_num_tiles")),
+			hiddenSize: int64(c.Uint("vision.embedding_length")),
+			numHeads:   int64(c.Uint("vision.attention.head_count")),
+			numTiles:   int64(c.Uint("vision.max_num_tiles")),

 			imageSize: int(c.Uint("vision.image_size")),
 			patchSize: int(c.Uint("vision.patch_size")),
--- a/model/models/mllama/process_image.go
+++ b/model/models/mllama/process_image.go
--- a/model/model.go
+++ b/model/model.go
@@ -1,7 +1,6 @@
 package model

 import (
-	"errors"
 	"fmt"
 	"image"
 	_ "image/jpeg"
@@ -16,51 +15,106 @@ import (
 	_ "golang.org/x/image/tiff"
 	_ "golang.org/x/image/webp"

-	"github.com/ollama/ollama/kvcache"
+	"github.com/ollama/ollama/cache"
 	"github.com/ollama/ollama/ml"
 	_ "github.com/ollama/ollama/ml/backend"
 )

-// Options contains the inputs for a model forward pass
+// Cache pairs a cache.Cache with the cache.Options that accompany it.
+// The embedded cache.Cache may be nil; the Sub and Put methods on this type
+// check for that and fall back to pass-through behaviour.
+type Cache struct {
+	cache.Cache
+	cache.Options
+}
+
+// Sub returns a Cache whose underlying cache is c.Cache.Sub(i) and whose
+// Options are carried over unchanged. When no underlying cache is set, c is
+// returned as is.
+func (c Cache) Sub(i int) Cache {
+	if c.Cache == nil {
+		return c
+	}
+
+	// c is a value receiver, so rebinding the field only affects the copy
+	// that is returned.
+	c.Cache = c.Cache.Sub(i)
+	return c
+}
+
+// Put forwards key and value to the underlying cache and returns whatever
+// it returns. When no underlying cache is set, key and value are returned
+// untouched and opts is ignored.
+func (c Cache) Put(ctx ml.Context, key, value ml.Tensor, opts cache.Options) (ml.Tensor, ml.Tensor) {
+	if c.Cache == nil {
+		return key, value
+	}
+
+	return c.Cache.Put(ctx, key, value, opts)
+}
+
 type Options struct {
-	Inputs    []int32
-	Positions []int32
-	Sequences []int
-	Outputs   []int32
+	inputs []int32
+
+	Offset int

 	Images []image.Image
+
+	Cache
 }

-type config struct {
-	Cache kvcache.Cache
+// Inputs returns the stored input ids from index Offset onward. The result
+// aliases the stored slice rather than copying it. It panics if Offset is
+// negative or greater than the number of stored ids.
+func (opts Options) Inputs() []int32 {
+	start := opts.Offset
+	return opts.inputs[start:]
+}
+
+// Positions returns one position per input id at or after Offset, counting
+// up from Offset: Offset, Offset+1, and so on. It panics if Offset exceeds
+// the number of stored ids.
+func (opts Options) Positions() []int32 {
+	count := len(opts.inputs) - opts.Offset
+	positions := make([]int32, count)
+	for i := 0; i < count; i++ {
+		positions[i] = int32(opts.Offset + i)
+	}
+
+	return positions
+}
+
+// OptionsFunc is a functional option for a forward pass: it receives the
+// model being run and mutates the Options that will be handed to it.
+type OptionsFunc func(Model, *Options)
+
+// WithInputIDs returns an option that stores ids as the input ids. The slice
+// is kept by reference, not copied.
+func WithInputIDs(ids []int32) OptionsFunc {
+	return func(_ Model, o *Options) {
+		o.inputs = ids
+	}
+}
+
+// WithOffset returns an option that records offset both as Options.Offset
+// and as the Position of the embedded cache options.
+func WithOffset(offset int) OptionsFunc {
+	return func(_ Model, o *Options) {
+		o.Offset = offset
+		o.Cache.Position = offset
+	}
+}
+
+// WithImage returns an option that appends img to Options.Images. Applying
+// it several times accumulates images in the order the options run.
+func WithImage(img image.Image) OptionsFunc {
+	return func(_ Model, o *Options) {
+		o.Images = append(o.Images, img)
+	}
+}
+
+// WithCache returns an option that installs c as the cache for the forward
+// pass. The whole Cache value is replaced, and its Position is seeded from
+// the Offset already present on the options when this option runs.
+func WithCache(c cache.Cache) OptionsFunc {
+	return func(_ Model, o *Options) {
+		cacheOptions := cache.Options{
+			Position: o.Offset,
+		}
+
+		o.Cache = Cache{
+			Cache:   c,
+			Options: cacheOptions,
+		}
+	}
 }

-// Base implements the common fields and methods for all models
 type Base struct {
 	b ml.Backend
-	config
 }

-// Backend returns the underlying backend that will run the model
 func (m *Base) Backend() ml.Backend {
 	return m.b
 }

-func (m *Base) Config() config {
-	return m.config
-}
-
-// Model implements a specific model architecture, defining the forward pass and any model-specific configuration
 type Model interface {
 	Forward(ml.Context, Options) (ml.Tensor, error)

 	Backend() ml.Backend
-	Config() config
 }

 var models = make(map[string]func(ml.Config) (Model, error))

-// Register registers a model constructor for the given architecture
 func Register(name string, f func(ml.Config) (Model, error)) {
 	if _, ok := models[name]; ok {
 		panic("model: model already registered")
@@ -69,9 +123,8 @@ func Register(name string, f func(ml.Config) (Model, error)) {
 	models[name] = f
 }

-// New initializes a new model instance with the provided configuration based on the metadata in the model file
-func New(modelPath string) (Model, error) {
-	r, err := os.Open(modelPath)
+func New(s string) (Model, error) {
+	r, err := os.Open(s)
 	if err != nil {
 		return nil, err
 	}
@@ -93,15 +146,16 @@ func New(modelPath string) (Model, error) {
 		return nil, err
 	}

-	base := Base{b: b, config: m.Config()}
-
 	v := reflect.ValueOf(m)
-	v.Elem().Set(populateFields(base, v.Elem()))
+	v.Elem().Set(populateFields(b, v))
 	return m, nil
 }

-func populateFields(base Base, v reflect.Value, tags ...Tag) reflect.Value {
+func populateFields(b ml.Backend, v reflect.Value, tags ...Tag) reflect.Value {
 	t := v.Type()
+	if t.Kind() == reflect.Pointer {
+		t, v = t.Elem(), v.Elem()
+	}

 	if t.Kind() == reflect.Struct {
 		allNil := true
@@ -119,7 +173,7 @@ func populateFields(base Base, v reflect.Value, tags ...Tag) reflect.Value {
 			}

 			if tt == reflect.TypeOf((*Base)(nil)).Elem() {
-				vv.Set(reflect.ValueOf(base))
+				vv.Set(reflect.ValueOf(Base{b: b}))
 			} else if tt == reflect.TypeOf((*ml.Tensor)(nil)).Elem() {
 				var fn func([]Tag) [][]string
 				fn = func(tags []Tag) (values [][]string) {
@@ -145,22 +199,24 @@ func populateFields(base Base, v reflect.Value, tags ...Tag) reflect.Value {

 				names := fn(tagsCopy)
 				for _, name := range names {
-					if tensor := base.Backend().Get(strings.Join(name, ".")); tensor != nil {
+					if tensor := b.Get(strings.Join(name, ".")); tensor != nil {
 						slog.Debug("found tensor", "", tensor)
 						vv.Set(reflect.ValueOf(tensor))
 						break
 					}
 				}
-			} else if tt.Kind() == reflect.Pointer || tt.Kind() == reflect.Interface {
-				setPointer(base, vv, tagsCopy)
+			} else if tt.Kind() == reflect.Pointer {
+				vvv := vv.Elem()
+				if vv.IsNil() {
+					vvv = reflect.New(tt.Elem())
+				}
+
+				if f := populateFields(b, vvv, tagsCopy...); f.CanAddr() {
+					vv.Set(f.Addr())
+				}
 			} else if tt.Kind() == reflect.Slice || tt.Kind() == reflect.Array {
 				for i := range vv.Len() {
-					vvv := vv.Index(i)
-					if vvv.Kind() == reflect.Pointer || vvv.Kind() == reflect.Interface {
-						setPointer(base, vvv, append(tagsCopy, Tag{Name: strconv.Itoa(i)}))
-					} else {
-						vvv.Set(populateFields(base, vvv, append(tagsCopy, Tag{Name: strconv.Itoa(i)})...))
-					}
+					vv.Index(i).Set(populateFields(b, vv.Index(i), append(tagsCopy, Tag{Name: strconv.Itoa(i)})...))
 				}
 			}

@@ -177,26 +233,6 @@ func populateFields(base Base, v reflect.Value, tags ...Tag) reflect.Value {
 	return v
 }

-func setPointer(base Base, v reflect.Value, tags []Tag) {
-	vv := v
-	if v.Kind() == reflect.Interface {
-		if v.IsNil() {
-			return
-		}
-
-		vv = vv.Elem()
-	}
-
-	vv = vv.Elem()
-	if v.IsNil() {
-		vv = reflect.New(v.Type().Elem()).Elem()
-	}
-
-	if f := populateFields(base, vv, tags...); f.CanAddr() {
-		v.Set(f.Addr())
-	}
-}
-
 type Tag struct {
 	Name      string
 	Alternate []string
@@ -226,30 +262,18 @@ func canNil(t reflect.Type) bool {
 		t.Kind() == reflect.Slice
 }

-func Forward(ctx ml.Context, m Model, opts Options) (ml.Tensor, error) {
-	if len(opts.Positions) != len(opts.Sequences) {
-		return nil, fmt.Errorf("length of positions (%v) must match length of seqs (%v)", len(opts.Positions), len(opts.Sequences))
-	}
-
-	if len(opts.Positions) < 1 {
-		return nil, errors.New("batch size cannot be less than 1")
-	}
-
-	cache := m.Config().Cache
-	if cache != nil {
-		err := cache.StartForward(ctx, opts.Positions, opts.Sequences)
-		if err != nil {
-			return nil, err
-		}
+// Forward runs a single forward pass of m.
+//
+// The given options are applied in order to a zero Options value, a new
+// context is created from the model's backend, and m.Forward is called with
+// both. On success the result of ctx.Compute on the model's output tensor is
+// returned; an error from m.Forward is returned unchanged. The context is
+// closed before Forward returns, on both the success and the error path.
+func Forward(m Model, optsFuncs ...OptionsFunc) (ml.Tensor, error) {
+	var opts Options
+	for _, optsFunc := range optsFuncs {
+		optsFunc(m, &opts)
 	}

+	ctx := m.Backend().NewContext()
+	// Register Close right away so the context is also released when the
+	// model's Forward returns an error.
+	defer ctx.Close()
+
 	t, err := m.Forward(ctx, opts)
 	if err != nil {
 		return nil, err
 	}

-	ctx.Forward(t)
-	ctx.Compute(t)
-
-	return t, nil
+	return ctx.Compute(t), nil
 }
--- a/model/model_test.go
+++ b/model/model_test.go
@@ -78,7 +78,7 @@ func TestPopulateFields(t *testing.T) {

 	var m fakeModel
 	v := reflect.ValueOf(&m)
-	v.Elem().Set(populateFields(Base{b: &fakeBackend{
+	v.Elem().Set(populateFields(&fakeBackend{
 		names: []string{
 			"input.weight",
 			"blk.0.attn_q.weight",
@@ -90,7 +90,7 @@ func TestPopulateFields(t *testing.T) {
 			"output_norm.weight",
 			"output.weight",
 		},
-	}}, v.Elem()))
+	}, v))

 	if diff := cmp.Diff(fakeModel{
 		Input:      &nn.Embedding{Weight: &fakeTensor{Name: "input.weight"}},
@@ -121,11 +121,11 @@ func TestPopulateFieldsAlternateName(t *testing.T) {

 	m := fakeModel{}
 	v := reflect.ValueOf(&m)
-	v.Elem().Set(populateFields(Base{b: &fakeBackend{
+	v.Elem().Set(populateFields(&fakeBackend{
 		names: []string{
 			"input.weight",
 		},
-	}}, v.Elem()))
+	}, v))

 	if diff := cmp.Diff(fakeModel{
 		Input:  &nn.Embedding{Weight: &fakeTensor{Name: "input.weight"}},
--- a/model/models/gemma2/model.go
+++ b/model/models/gemma2/model.go
@@ -1,193 +0,0 @@
-package gemma2
-
-import (
-	"math"
-
-	"github.com/ollama/ollama/kvcache"
-	"github.com/ollama/ollama/ml"
-	"github.com/ollama/ollama/ml/nn"
-	"github.com/ollama/ollama/model"
-)
-
-type Options struct {
-	hiddenSize, numHeads, numKVHeads int
-	attnKeyLen, attnValLen           int
-	eps, ropeBase, ropeScale         float32
-	attnLogitSoftcap                 float32
-	finalLogitSoftcap                float32
-}
-
-type Model struct {
-	model.Base
-	model.SentencePieceModel
-
-	TokenEmbedding *nn.Embedding `gguf:"token_embd"`
-	Layers         []Layer       `gguf:"blk"`
-	OutputNorm     *nn.RMSNorm   `gguf:"output_norm"`           // is this supposed to be root means square?
-	Output         *nn.Linear    `gguf:"output,alt:token_embd"` // just set to token_embd?
-
-	*Options
-}
-
-func New(c ml.Config) (model.Model, error) {
-	m := Model{
-		SentencePieceModel: model.NewSentencePieceModel(
-			c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
-			&model.Vocabulary{
-				Values: c.Strings("tokenizer.ggml.tokens"),
-				Scores: c.Floats("tokenizer.ggml.scores"),
-				Types:  c.Uints("tokenizer.ggml.token_type"),
-				BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id")),
-				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
-			},
-		),
-		Layers: make([]Layer, c.Uint("block_count")),
-		Options: &Options{
-			hiddenSize:        int(c.Uint("embedding_length")),
-			numHeads:          int(c.Uint("attention.head_count")),
-			numKVHeads:        int(c.Uint("attention.head_count_kv")),
-			attnKeyLen:        int(c.Uint("attention.key_length")),
-			attnValLen:        int(c.Uint("attention.value_length")),
-			eps:               c.Float("attention.layer_norm_rms_epsilon"),
-			ropeBase:          c.Float("rope.freq_base", 10000.0),
-			ropeScale:         c.Float("rope.freq_scale", 1.0),
-			attnLogitSoftcap:  c.Float("attn_logit_softcapping"),
-			finalLogitSoftcap: c.Float("final_logit_softcapping"),
-		},
-	}
-
-	slidingWindowLen := int32(c.Uint("attention.sliding_window"))
-	m.Cache = kvcache.NewWrapperCache(kvcache.NewSWACache(slidingWindowLen, m.Shift), kvcache.NewCausalCache(m.Shift))
-
-	return &m, nil
-}
-
-type SelfAttention struct {
-	Query  *nn.Linear `gguf:"attn_q"`
-	Key    *nn.Linear `gguf:"attn_k"`
-	Value  *nn.Linear `gguf:"attn_v"`
-	Output *nn.Linear `gguf:"attn_output"`
-}
-
-func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
-	batchSize := hiddenState.Dim(1)
-	ropeType := uint32(2)
-
-	q := sa.Query.Forward(ctx, hiddenState)
-	q = q.Reshape(ctx, opts.attnKeyLen, opts.numHeads, batchSize)
-	q = q.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, opts.ropeBase, opts.ropeScale)
-
-	// todo: this should be 1.0/math.Sqrt(float64(headDim)) for 27B models
-	q = q.Scale(ctx, 1.0/math.Sqrt(float64(opts.attnKeyLen)))
-
-	k := sa.Key.Forward(ctx, hiddenState)
-	k = k.Reshape(ctx, opts.attnKeyLen, opts.numKVHeads, batchSize)
-	k = k.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, opts.ropeBase, opts.ropeScale)
-
-	v := sa.Value.Forward(ctx, hiddenState)
-	v = v.Reshape(ctx, opts.attnValLen, opts.numKVHeads, batchSize)
-
-	cache.Put(ctx, k, v)
-	k, v, mask := cache.Get(ctx)
-
-	q = q.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
-	k = k.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
-	v = v.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
-
-	kq := k.Mulmat(ctx, q)
-
-	// logit softcap
-	kq = kq.Scale(ctx, 1.0/float64(opts.attnLogitSoftcap))
-	kq = kq.Tanh(ctx)
-	kq = kq.Scale(ctx, float64(opts.attnLogitSoftcap))
-
-	kq = kq.Add(ctx, mask)
-	kq = kq.Softmax(ctx)
-
-	kqv := v.Mulmat(ctx, kq)
-	kqv = kqv.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
-	kqv = kqv.Reshape(ctx, opts.attnValLen*opts.numHeads, batchSize)
-
-	return sa.Output.Forward(ctx, kqv)
-}
-
-func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
-	return key.RoPE(ctx, shift, nil, uint32(m.Options.attnKeyLen), uint32(2), m.Options.ropeBase, m.Options.ropeScale), nil
-}
-
-type MLP struct {
-	Up   *nn.Linear `gguf:"ffn_up"`
-	Down *nn.Linear `gguf:"ffn_down"`
-	Gate *nn.Linear `gguf:"ffn_gate"`
-}
-
-func (mlp *MLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *Options) ml.Tensor {
-	hiddenState = mlp.Gate.Forward(ctx, hiddenState).GELU(ctx).Mul(ctx, mlp.Up.Forward(ctx, hiddenState))
-	return mlp.Down.Forward(ctx, hiddenState)
-}
-
-type Layer struct {
-	AttentionNorm     *nn.RMSNorm `gguf:"attn_norm"`
-	SelfAttention     *SelfAttention
-	PostAttentionNorm *nn.RMSNorm `gguf:"post_attention_norm"`
-	MLPNorm           *nn.RMSNorm `gguf:"ffn_norm"`
-	MLP               *MLP
-	PostMLPNorm       *nn.RMSNorm `gguf:"post_ffw_norm"`
-}
-
-func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
-	residual := hiddenState
-
-	hiddenState = l.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
-	hiddenState = l.SelfAttention.Forward(ctx, hiddenState, positionIDs, cache, opts)
-	hiddenState = l.PostAttentionNorm.Forward(ctx, hiddenState, opts.eps)
-	hiddenState = hiddenState.Add(ctx, residual)
-	residual = hiddenState
-
-	hiddenState = l.MLPNorm.Forward(ctx, hiddenState, opts.eps)
-	hiddenState = l.MLP.Forward(ctx, hiddenState, opts)
-	hiddenState = l.PostMLPNorm.Forward(ctx, hiddenState, opts.eps)
-	return hiddenState.Add(ctx, residual)
-}
-
-func (m *Model) Forward(ctx ml.Context, opts model.Options) (ml.Tensor, error) {
-	inputs, err := ctx.FromIntSlice(opts.Inputs, len(opts.Inputs))
-	if err != nil {
-		return nil, err
-	}
-
-	positions, err := ctx.FromIntSlice(opts.Positions, len(opts.Positions))
-	if err != nil {
-		return nil, err
-	}
-
-	hiddenState := m.TokenEmbedding.Forward(ctx, inputs)
-	hiddenState = hiddenState.Scale(ctx, math.Sqrt(float64(m.Options.hiddenSize)))
-
-	for i, layer := range m.Layers {
-		cacheType := i % 2
-		m.Cache.SetLayer(i)
-		wc := m.Cache.(*kvcache.WrapperCache)
-		wc.SetLayerType(cacheType)
-		hiddenState = layer.Forward(ctx, hiddenState, positions, m.Cache, m.Options)
-	}
-
-	hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
-	hiddenState = m.Output.Forward(ctx, hiddenState)
-
-	// final logit softcap
-	hiddenState = hiddenState.Scale(ctx, 1.0/float64(m.Options.finalLogitSoftcap))
-	hiddenState = hiddenState.Tanh(ctx)
-	hiddenState = hiddenState.Scale(ctx, float64(m.Options.finalLogitSoftcap))
-
-	outputs, err := ctx.FromIntSlice(opts.Outputs, len(opts.Outputs))
-	if err != nil {
-		return nil, err
-	}
-
-	return hiddenState.Rows(ctx, outputs), nil
-}
-
-func init() {
-	model.Register("gemma2", New)
-}
--- a/model/models/models.go
+++ b/model/models/models.go
@@ -1,7 +0,0 @@
-package models
-
-import (
-	_ "github.com/ollama/ollama/model/models/gemma2"
-	_ "github.com/ollama/ollama/model/models/llama"
-	_ "github.com/ollama/ollama/model/models/mllama"
-)
--- a/model/models/pixtral/imageproc.go
+++ b/model/models/pixtral/imageproc.go
--- a/model/models/pixtral/imageproc_test.go
+++ b/model/models/pixtral/imageproc_test.go
--- a/model/process_text.go
+++ b/model/process_text.go
@@ -18,28 +18,19 @@ const (
 	SpecialEOS
 )

-const (
-	TOKEN_TYPE_NORMAL = iota + 1
-	TOKEN_TYPE_UNKNOWN
-	TOKEN_TYPE_CONTROL
-	TOKEN_TYPE_USER_DEFINED
-	TOKEN_TYPE_UNUSED
-	TOKEN_TYPE_BYTE
-)
-
 type TextProcessor interface {
 	Encode(string) ([]int32, error)
 	Decode([]int32) (string, error)
-	Is(int32, Special) bool
+	Is(uint32, Special) bool
 }

 type Vocabulary struct {
 	Values []string
 	Types  []uint32
-	Scores []float32
+	Scores []uint32
 	Merges []string

-	BOS, EOS int32
+	BOS, EOS uint32

 	specialOnce sync.Once
 	special     []string
@@ -51,7 +42,7 @@ type Vocabulary struct {
 	merge     map[string]int32
 }

-func (v *Vocabulary) Is(id int32, special Special) bool {
+func (v *Vocabulary) Is(id uint32, special Special) bool {
 	switch special {
 	case SpecialBOS:
 		return id == v.BOS
@@ -84,7 +75,7 @@ func (v *Vocabulary) Decode(id int32) string {
 func (v *Vocabulary) SpecialVocabulary() []string {
 	v.specialOnce.Do(func() {
 		for i := range v.Values {
-			if v.Types[i] == TOKEN_TYPE_CONTROL {
+			if v.Types[i] == 3 {
 				v.special = append(v.special, v.Values[i])
 			}
 		}
@@ -120,7 +111,7 @@ func NewBytePairEncoding(pre string, vocab *Vocabulary) BytePairEncoding {
 	}
 }

-func (bpe BytePairEncoding) Is(id int32, special Special) bool {
+func (bpe BytePairEncoding) Is(id uint32, special Special) bool {
 	return bpe.vocab.Is(id, special)
 }

--- a/model/process_text_spm.go
+++ b/model/process_text_spm.go
@@ -1,220 +0,0 @@
-package model
-
-import (
-	"iter"
-	"log/slog"
-	"strings"
-
-	"github.com/dlclark/regexp2"
-	queue "github.com/emirpasic/gods/queues/priorityqueue"
-)
-
-const spmWhitespaceSep = "▁"
-
-func replaceWhitespaceBySeperator(s string) string {
-	return strings.ReplaceAll(s, " ", spmWhitespaceSep)
-}
-
-type SentencePieceModel struct {
-	maxTokenLen int
-	pre         *regexp2.Regexp
-	vocab       *Vocabulary
-}
-
-func NewSentencePieceModel(pre string, vocab *Vocabulary) SentencePieceModel {
-	slog.Debug("Tokens", "num tokens", len(vocab.Values), "vals", vocab.Values[:3], "scores", vocab.Scores[:3], "types", vocab.Types[:3])
-
-	counter := map[int]int{}
-	var maxTokenLen int
-	for cnt := range vocab.Types {
-		switch vocab.Types[cnt] {
-		case TOKEN_TYPE_NORMAL, TOKEN_TYPE_USER_DEFINED, TOKEN_TYPE_UNUSED:
-			maxTokenLen = max(maxTokenLen, len(vocab.Values[cnt]))
-			fallthrough
-		default:
-			counter[int(vocab.Types[cnt])] += 1
-		}
-	}
-
-	slog.Debug("Token counts", "normal", counter[TOKEN_TYPE_NORMAL], "unknown", counter[TOKEN_TYPE_UNKNOWN], "control", counter[TOKEN_TYPE_CONTROL],
-		"user defined", counter[TOKEN_TYPE_USER_DEFINED], "unused", counter[TOKEN_TYPE_UNUSED], "byte", counter[TOKEN_TYPE_BYTE],
-		"max token len", maxTokenLen)
-
-	return SentencePieceModel{
-		maxTokenLen: maxTokenLen,
-		pre:         regexp2.MustCompile(pre, regexp2.Unicode|regexp2.RE2),
-		vocab:       vocab,
-	}
-}
-
-func (spm SentencePieceModel) Is(id int32, special Special) bool {
-	return spm.vocab.Is(id, special)
-}
-
-func (spm *SentencePieceModel) split(s string) iter.Seq[string] {
-	return func(yield func(string) bool) {
-		for m, _ := spm.pre.FindStringMatch(s); m != nil; m, _ = spm.pre.FindNextMatch(m) {
-			if !yield(m.String()) {
-				break
-			}
-		}
-	}
-}
-
-func (spm SentencePieceModel) Encode(s string) ([]int32, error) {
-	fragments := []fragment{{value: s}}
-	for _, special := range spm.vocab.SpecialVocabulary() {
-		// TODO: process special tokens concurrently
-		id := spm.vocab.Encode(special)
-		for i := 0; i < len(fragments); i++ {
-			frag := fragments[i]
-			if len(frag.ids) > 0 {
-				continue
-			}
-
-			var middle []fragment
-			switch i := strings.Index(frag.value, special); {
-			case i < 0:
-				middle = append(middle, frag)
-			case i > 0:
-				middle = append(middle, fragment{value: frag.value[:i]})
-				fallthrough
-			default:
-				middle = append(middle, fragment{value: special, ids: []int32{id}})
-				if rest := frag.value[i+len(special):]; rest != "" {
-					middle = append(middle, fragment{value: rest})
-				}
-			}
-
-			fragments = append(fragments[:i], append(middle, fragments[i+1:]...)...)
-		}
-	}
-	slog.Debug("fragments", "frags", fragments)
-
-	var ids []int32
-	for _, frag := range fragments {
-		if len(frag.ids) > 0 {
-			ids = append(ids, frag.ids...)
-			continue
-		}
-
-		for split := range spm.split(frag.value) {
-			split = replaceWhitespaceBySeperator(split)
-
-			var sb strings.Builder
-			sb.Write([]byte(split))
-			if id := spm.vocab.Encode(sb.String()); id >= 0 {
-				ids = append(ids, id)
-				continue
-			}
-
-			runes := []rune(sb.String())
-			pq := queue.NewWith(func(a, b any) int {
-				priA := a.(*candidate)
-				priB := b.(*candidate)
-				if priA.score > priB.score || (priA.score == priB.score && priA.a < priB.a) {
-					return 1
-				}
-				return -1
-			})
-
-			merges := make([]merge, len(runes))
-			for r := range runes {
-				merges[r] = merge{
-					p:     r - 1,
-					n:     r + 1,
-					runes: []rune{runes[r]},
-				}
-			}
-
-			pairwise := func(a, b int) *candidate {
-				if a < 0 || b >= len(runes) {
-					return nil
-				}
-
-				left, right := string(merges[a].runes), string(merges[b].runes)
-				if id := spm.vocab.Encode(left + right); id >= 0 {
-					return &candidate{
-						a:      a,
-						b:      b,
-						length: len(left + " " + right),
-						score:  spm.vocab.Scores[id],
-					}
-				}
-				return nil
-			}
-
-			for i := range len(runes) - 1 {
-				if pair := pairwise(i, i+1); pair != nil {
-					pq.Enqueue(pair)
-				}
-			}
-
-			pqv := pq.Values()
-			for _, v := range pqv {
-				e := v.(*candidate)
-				slog.Debug("candidate", "candidate", e)
-			}
-
-			for !pq.Empty() {
-				v, _ := pq.Dequeue()
-				pair := v.(*candidate)
-				left, right := merges[pair.a], merges[pair.b]
-
-				if len(left.runes) == 0 || len(right.runes) == 0 {
-					continue
-				}
-
-				merges[pair.a].runes = append(left.runes, right.runes...)
-				merges[pair.b].runes = nil
-				merges[pair.a].n = right.n
-				if right.n < len(merges) {
-					merges[right.n].p = pair.a
-				}
-
-				if pair := pairwise(merges[pair.a].p, pair.a); pair != nil {
-					pq.Enqueue(pair)
-				}
-
-				if pair := pairwise(pair.a, merges[pair.a].n); pair != nil {
-					pq.Enqueue(pair)
-				}
-			}
-
-			slog.Debug("merges", "merges", merges)
-
-			for _, merge := range merges {
-				if len(merge.runes) > 0 {
-					if id := spm.vocab.Encode(string(merge.runes)); id >= 0 {
-						ids = append(ids, id)
-					} else {
-						slog.Debug("missing token", "token", string(merge.runes))
-					}
-				}
-			}
-		}
-	}
-	slog.Debug("encoded", "ids", ids)
-
-	return ids, nil
-}
-
-type candidate struct {
-	a, b   int
-	score  float32
-	length int
-}
-
-func (spm SentencePieceModel) Decode(ids []int32) (string, error) {
-	var sb strings.Builder
-	for _, id := range ids {
-		data := spm.vocab.Decode(id)
-		data = strings.ReplaceAll(data, spmWhitespaceSep, " ")
-		if _, err := sb.WriteString(data); err != nil {
-			return "", err
-		}
-	}
-
-	slog.Debug("decoded", "ids", ids, "text", sb.String())
-	return sb.String(), nil
-}
--- a/model/models/qwen2vl/imageproc.go
+++ b/model/models/qwen2vl/imageproc.go
--- a/model/models/qwen2vl/imageproc_test.go
+++ b/model/models/qwen2vl/imageproc_test.go
--- a/openai/openai.go
+++ b/openai/openai.go
@@ -20,8 +20,6 @@ import (
 	"github.com/ollama/ollama/types/model"
 )

-var finishReasonToolCalls = "tool_calls"
-
 type Error struct {
 	Message string      `json:"message"`
 	Type    string      `json:"type"`
@@ -268,7 +266,7 @@ func toChatCompletion(id string, r api.ChatResponse) ChatCompletion {
 	}
 }

-func toChunk(id string, r api.ChatResponse, toolCallSent bool) ChatCompletionChunk {
+func toChunk(id string, r api.ChatResponse) ChatCompletionChunk {
 	toolCalls := toToolCalls(r.Message.ToolCalls)
 	return ChatCompletionChunk{
 		Id:                id,
@@ -281,9 +279,6 @@ func toChunk(id string, r api.ChatResponse, toolCallSent bool) ChatCompletionChu
 			Delta: Message{Role: "assistant", Content: r.Message.Content, ToolCalls: toolCalls},
 			FinishReason: func(reason string) *string {
 				if len(reason) > 0 {
-					if toolCallSent {
-						return &finishReasonToolCalls
-					}
 					return &reason
 				}
 				return nil
@@ -590,7 +585,6 @@ type ChatWriter struct {
 	stream        bool
 	streamOptions *StreamOptions
 	id            string
-	toolCallSent  bool
 	BaseWriter
 }

@@ -640,14 +634,11 @@ func (w *ChatWriter) writeResponse(data []byte) (int, error) {

 	// chat chunk
 	if w.stream {
-		c := toChunk(w.id, chatResponse, w.toolCallSent)
+		c := toChunk(w.id, chatResponse)
 		d, err := json.Marshal(c)
 		if err != nil {
 			return 0, err
 		}
-		if !w.toolCallSent && len(c.Choices) > 0 && len(c.Choices[0].Delta.ToolCalls) > 0 {
-			w.toolCallSent = true
-		}

 		w.ResponseWriter.Header().Set("Content-Type", "text/event-stream")
 		_, err = w.ResponseWriter.Write([]byte(fmt.Sprintf("data: %s\n\n", d)))
--- a/runner/ollamarunner/cache.go
+++ b/runner/ollamarunner/cache.go
@@ -1,280 +0,0 @@
-package ollamarunner
-
-import (
-	"errors"
-	"fmt"
-	"log/slog"
-	"math"
-	"reflect"
-	"time"
-
-	"github.com/ollama/ollama/kvcache"
-	"github.com/ollama/ollama/ml"
-	"github.com/ollama/ollama/model"
-)
-
-type InputCache struct {
-	// context window size (per slot)
-	numCtx int32
-
-	// does the cache store data or do we need to always send the full input?
-	// note that when enabled is false the underlying cache may either be nil
-	// or a non-nil dummy that doesn't actually store anything
-	enabled bool
-
-	// individual KV caches
-	slots []InputCacheSlot
-
-	// optimize cache eviction for multiple users
-	multiUserCache bool
-
-	cache kvcache.Cache
-}
-
-func NewInputCache(model model.Model, kvCacheType string, kvSize int32, numSlots int, multiUserCache bool) (*InputCache, error) {
-	if kvSize/int32(numSlots) < 1 {
-		return nil, fmt.Errorf("must have at least one kv cache entry per parallel sequence (kv: %v parallel: %v)", kvSize, numSlots)
-	}
-
-	slots := make([]InputCacheSlot, numSlots)
-
-	for i := range slots {
-		slots[i] = InputCacheSlot{
-			Id:     i,
-			Inputs: make([]input, 0),
-		}
-	}
-
-	cache := model.Config().Cache
-	if cache != nil {
-		cache.Init(model.Backend(), kvCacheTypeFromStr(kvCacheType), kvSize)
-	}
-
-	return &InputCache{
-		numCtx:         kvSize / int32(numSlots),
-		enabled:        cache != nil,
-		slots:          slots,
-		multiUserCache: multiUserCache,
-		cache:          cache,
-	}, nil
-}
-
-func kvCacheTypeFromStr(s string) ml.DType {
-	switch s {
-	case "q8_0":
-		panic("kv cache quantization not yet implemented")
-	case "q4_0":
-		panic("kv cache quantization not yet implemented")
-	default:
-		return ml.DTypeF16
-	}
-}
-
-func (c *InputCache) Close() {
-	c.cache.Close()
-}
-
-// Locking: Operations on InputCacheSlot (including finding one
-// through LoadCacheSlot) require a lock to be be held that serializes
-// these operations with each other and processBatch
-
-type InputCacheSlot struct {
-	// Index in the KV cache
-	Id int
-
-	// Inputs that are stored in the KV cache
-	Inputs []input
-
-	// is this cache actively being processed as part of a sequence?
-	InUse bool
-
-	// last time this cache was used (as of start of processing)
-	lastUsed time.Time
-}
-
-func (c *InputCache) LoadCacheSlot(prompt []input, cachePrompt bool) (*InputCacheSlot, []input, error) {
-	var slot *InputCacheSlot
-	var numPast int32
-	var err error
-
-	// In single-user scenarios, the longest cache slot works fine for getting good input
-	// cache hit rates and it keeps the footprint of the cache small, which improves throughput.
-	// For multiple users, the "best" cache slot produces better input cache hit rates
-	// at the cost of worse performance when we miss the input cache.
-	if !c.multiUserCache {
-		slot, numPast, err = c.findLongestCacheSlot(prompt)
-	} else {
-		slot, numPast, err = c.findBestCacheSlot(prompt)
-	}
-	if err != nil {
-		return nil, nil, err
-	}
-
-	if !cachePrompt {
-		numPast = 0
-	}
-
-	slot.InUse = true
-	slot.lastUsed = time.Now()
-
-	if numPast == int32(len(prompt)) {
-		// Leave one input to sample so we can get a response
-		numPast--
-	}
-
-	if c.cache != nil {
-		err = c.cache.Remove(slot.Id, numPast, math.MaxInt32)
-		if err != nil {
-			// Some models don't support partial erasure
-			err = c.cache.Remove(slot.Id, 0, math.MaxInt32)
-			if err != nil {
-				return nil, nil, err
-			}
-			numPast = 0
-		}
-	}
-
-	slog.Debug("loading cache slot", "id", slot.Id, "cache", len(slot.Inputs), "prompt", len(prompt),
-		"used", numPast, "remaining", int32(len(prompt))-numPast)
-
-	prompt = prompt[numPast:]
-	slot.Inputs = slot.Inputs[:numPast]
-
-	return slot, prompt, nil
-}
-
-func (c *InputCache) findLongestCacheSlot(prompt []input) (*InputCacheSlot, int32, error) {
-	longest := int32(-1)
-	var longestSlot *InputCacheSlot
-
-	for i, s := range c.slots {
-		if s.InUse {
-			continue
-		}
-
-		count := countCommonPrefix(s.Inputs, prompt)
-		if count > longest {
-			longest = count
-			longestSlot = &c.slots[i]
-		}
-	}
-
-	if longestSlot == nil {
-		return nil, 0, errors.New("no available cache slots")
-	}
-
-	return longestSlot, longest, nil
-}
-
-func (c *InputCache) findBestCacheSlot(prompt []input) (*InputCacheSlot, int32, error) {
-	oldest := time.Now()
-	var oldestSlot *InputCacheSlot
-
-	longest := int32(-1)
-	var longestSlot *InputCacheSlot
-
-	for i, s := range c.slots {
-		count := countCommonPrefix(s.Inputs, prompt)
-		if count > longest {
-			longest = count
-			longestSlot = &c.slots[i]
-		}
-
-		if s.lastUsed.Compare(oldest) < 0 && !s.InUse {
-			oldest = s.lastUsed
-			oldestSlot = &c.slots[i]
-		}
-	}
-
-	if longest == int32(len(longestSlot.Inputs)) && !longestSlot.InUse {
-		return longestSlot, longest, nil
-	}
-
-	if oldestSlot.InUse {
-		return nil, 0, errors.New("no available cache slots")
-	}
-
-	if len(oldestSlot.Inputs) != 0 {
-		slog.Debug("evicting cache slot", "id", oldestSlot.Id, "inputs", len(oldestSlot.Inputs),
-			"used", oldestSlot.lastUsed)
-	}
-
-	if longest > 0 && longestSlot != oldestSlot {
-		slog.Debug("forking cache slot", "src", longestSlot.Id, "dst", oldestSlot.Id, "inputs", longest, "total",
-			len(longestSlot.Inputs))
-		oldestSlot.Inputs = make([]input, longest)
-		copy(oldestSlot.Inputs, longestSlot.Inputs[:longest])
-		if c.cache != nil {
-			c.cache.CopyPrefix(longestSlot.Id, oldestSlot.Id, longest)
-		}
-	}
-
-	return oldestSlot, longest, nil
-}
-
-func countCommonPrefix(a []input, b []input) int32 {
-	var count int32
-
-	for i := range a {
-		if i >= len(b) {
-			break
-		}
-
-		if !reflect.DeepEqual(a[i], b[i]) {
-			break
-		}
-
-		count++
-	}
-
-	return count
-}
-
-func (c *InputCache) ShiftDiscard(inputLen int32, numKeep int32) int32 {
-	targetFree := (c.numCtx - numKeep) / 2
-	targetFree = max(targetFree, 1)
-
-	currentFree := c.numCtx - inputLen
-	discard := targetFree - currentFree
-
-	if discard < 0 {
-		discard = 0
-	}
-
-	return discard
-}
-
-// Frees up space in the KV cache by deleting the oldest half of history and shifting
-// the newest half into that space (saving numKeep inputs at the beginning).
-//
-// Assumes that at least 1 entry can be freed up by shifting (i.e. numKeep < numCtx)
-func (c *InputCache) ShiftCacheSlot(slot *InputCacheSlot, numKeep int32) error {
-	if numKeep >= c.numCtx {
-		return fmt.Errorf("unable to shift context - keep exceeds context (keep: %v context: %v)", numKeep, c.numCtx)
-	}
-
-	inputLen := int32(len(slot.Inputs))
-	discard := c.ShiftDiscard(inputLen, numKeep)
-
-	if discard <= 0 {
-		return nil
-	}
-
-	slog.Debug("context limit hit - shifting", "id", slot.Id, "limit", c.numCtx, "input", len(slot.Inputs),
-		"keep", numKeep, "discard", discard)
-
-	// TODO (jessegross): KV cache removal can fail for certain types of models
-	if c.cache != nil {
-		err := c.cache.Remove(slot.Id, numKeep, numKeep+discard)
-		if err != nil {
-			return fmt.Errorf("unable to remove old kv cache entries (id: %v, keep: %v discard: %v): %w", slot.Id, numKeep, discard, err)
-		}
-	}
-
-	for i := numKeep + discard; i < inputLen; i++ {
-		slot.Inputs[i-discard] = slot.Inputs[i]
-	}
-	slot.Inputs = slot.Inputs[:inputLen-discard]
-
-	return nil
-}
--- a/runner/ollamarunner/cache_test.go
+++ b/runner/ollamarunner/cache_test.go
@@ -1,291 +0,0 @@
-package ollamarunner
-
-import (
-	"image"
-	"testing"
-	"time"
-)
-
-func TestCountCommon(t *testing.T) {
-	imgA := image.NewRGBA(image.Rect(0, 0, 100, 100))
-	imgB := image.NewRGBA(image.Rect(0, 0, 50, 50))
-	imgC := image.NewRGBA(image.Rect(50, 50, 100, 100))
-
-	tests := []struct {
-		name     string
-		t1       []input
-		t2       []input
-		expected int32
-	}{
-		{
-			name:     "Equal",
-			t1:       []input{{token: 1}, {token: 2}, {token: 3}},
-			t2:       []input{{token: 1}, {token: 2}, {token: 3}},
-			expected: 3,
-		},
-		{
-			name:     "Prefix",
-			t1:       []input{{token: 1}},
-			t2:       []input{{token: 1}, {token: 2}, {token: 3}},
-			expected: 1,
-		},
-		{
-			name:     "Image Prefix",
-			t1:       []input{{image: imgA}},
-			t2:       []input{{image: imgA}, {image: imgB}, {image: imgC}},
-			expected: 1,
-		},
-		{
-			name:     "Mixed",
-			t1:       []input{{token: 1}, {image: imgA}},
-			t2:       []input{{token: 1}, {image: imgA}, {token: 5}},
-			expected: 2,
-		},
-		{
-			name:     "Empty",
-			t1:       []input{},
-			t2:       []input{{token: 1}, {token: 2}, {token: 3}},
-			expected: 0,
-		},
-		{
-			name:     "Both Empty",
-			t1:       []input{},
-			t2:       []input{},
-			expected: 0,
-		},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			result := countCommonPrefix(tt.t1, tt.t2)
-			if result != tt.expected {
-				t.Errorf("countCommonPrefix(%v, %v): have %v; want %v", tt.t1, tt.t2, result, tt.expected)
-			}
-		})
-	}
-}
-
-func TestFindCacheSlot(t *testing.T) {
-	type expected struct {
-		result int
-		len    int32
-	}
-
-	tests := []struct {
-		name    string
-		cache   InputCache
-		prompt  []input
-		longest expected
-		best    expected
-	}{
-		{
-			name: "Empty",
-			cache: InputCache{slots: []InputCacheSlot{
-				{
-					Id:       0,
-					Inputs:   []input{},
-					InUse:    false,
-					lastUsed: time.Time{},
-				},
-				{
-					Id:       1,
-					Inputs:   []input{},
-					InUse:    false,
-					lastUsed: time.Time{},
-				},
-			}},
-			prompt:  []input{{token: 1}},
-			longest: expected{result: 0, len: 0},
-			best:    expected{result: 0, len: 0},
-		},
-		{
-			name: "Extend",
-			cache: InputCache{slots: []InputCacheSlot{
-				{
-					Id:       0,
-					Inputs:   []input{{token: 1}},
-					InUse:    false,
-					lastUsed: time.Now().Add(-time.Second),
-				},
-				{
-					Id:       1,
-					Inputs:   []input{{token: 1}, {token: 2}},
-					InUse:    false,
-					lastUsed: time.Now().Add(-2 * time.Second),
-				},
-			}},
-			prompt:  []input{{token: 1}, {token: 2}},
-			longest: expected{result: 1, len: 2},
-			best:    expected{result: 1, len: 2},
-		},
-		{
-			name: "New",
-			cache: InputCache{slots: []InputCacheSlot{
-				{
-					Id:       0,
-					Inputs:   []input{{token: 1}, {token: 2}},
-					InUse:    false,
-					lastUsed: time.Now().Add(-time.Second),
-				},
-				{
-					Id:       1,
-					Inputs:   []input{},
-					InUse:    false,
-					lastUsed: time.Time{},
-				},
-			}},
-			prompt:  []input{{token: 2}},
-			longest: expected{result: 0, len: 0},
-			best:    expected{result: 1, len: 0},
-		},
-		{
-			name: "Fork",
-			cache: InputCache{
-				slots: []InputCacheSlot{
-					{
-						Id:       0,
-						Inputs:   []input{{token: 1}, {token: 2}},
-						InUse:    false,
-						lastUsed: time.Now().Add(-time.Second),
-					},
-					{
-						Id:       1,
-						Inputs:   []input{},
-						InUse:    false,
-						lastUsed: time.Time{},
-					},
-				},
-			},
-			prompt:  []input{{token: 1}},
-			longest: expected{result: 0, len: 1},
-			best:    expected{result: 1, len: 1},
-		},
-		{
-			name: "Evict",
-			cache: InputCache{slots: []InputCacheSlot{
-				{
-					Id:       0,
-					Inputs:   []input{{token: 1}},
-					InUse:    false,
-					lastUsed: time.Now().Add(-time.Second),
-				},
-				{
-					Id:       1,
-					Inputs:   []input{{token: 1}, {token: 2}},
-					InUse:    false,
-					lastUsed: time.Now().Add(-2 * time.Second),
-				},
-			}},
-			prompt:  []input{{token: 2}, {token: 3}},
-			longest: expected{result: 0, len: 0},
-			best:    expected{result: 1, len: 0},
-		},
-		{
-			name: "In use",
-			cache: InputCache{slots: []InputCacheSlot{
-				{
-					Id:       0,
-					Inputs:   []input{{token: 1}, {token: 2}},
-					InUse:    true,
-					lastUsed: time.Now().Add(-time.Second),
-				},
-				{
-					Id:       1,
-					Inputs:   []input{{token: 1}},
-					InUse:    false,
-					lastUsed: time.Now().Add(-2 * time.Second),
-				},
-			}},
-			prompt:  []input{{token: 1}, {token: 2}},
-			longest: expected{result: 1, len: 1},
-			best:    expected{result: 1, len: 2},
-		},
-	}
-
-	for _, tt := range tests {
-		t.Run("Longest-"+tt.name, func(t *testing.T) {
-			result, resultLen, err := tt.cache.findLongestCacheSlot(tt.prompt)
-			if err != nil {
-				t.Errorf("findLongestCacheSlot: err %v", err)
-			} else if result.Id != tt.longest.result || resultLen != tt.longest.len {
-				t.Errorf("findLongestCacheSlot: slot have %v, want %v len have %v, want %v",
-					result.Id, tt.longest.result, resultLen, tt.longest.len)
-			}
-		})
-	}
-
-	for _, tt := range tests {
-		t.Run("Best-"+tt.name, func(t *testing.T) {
-			result, resultLen, err := tt.cache.findBestCacheSlot(tt.prompt)
-			if err != nil {
-				t.Errorf("findBestCacheSlot: err %v", err)
-			} else if result.Id != tt.best.result || resultLen != tt.best.len {
-				t.Errorf("findBestCacheSlot: slot have %v, want %v len have %v, want %v",
-					result.Id, tt.best.result, resultLen, tt.best.len)
-			}
-		})
-	}
-}
-
-func TestShiftDiscard(t *testing.T) {
-	tests := []struct {
-		name     string
-		numCtx   int32
-		numKeep  int32
-		inputLen int32
-		expected int32
-	}{
-		{
-			name:     "Shift",
-			numCtx:   2048,
-			numKeep:  5,
-			inputLen: 2048,
-			expected: 1021,
-		},
-		{
-			name:     "Max Keep",
-			numCtx:   2048,
-			numKeep:  2047,
-			inputLen: 2048,
-			expected: 1,
-		},
-		{
-			name:     "No Keep",
-			numCtx:   2048,
-			numKeep:  0,
-			inputLen: 2048,
-			expected: 1024,
-		},
-		{
-			name:     "Truncate",
-			numCtx:   2048,
-			numKeep:  5,
-			inputLen: 5000,
-			expected: 3973,
-		},
-		{
-			name:     "Truncate Keep",
-			numCtx:   2048,
-			numKeep:  2047,
-			inputLen: 5000,
-			expected: 2953,
-		},
-		{
-			name:     "No Op",
-			numCtx:   2048,
-			numKeep:  5,
-			inputLen: 512,
-			expected: 0,
-		},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			c := InputCache{numCtx: tt.numCtx}
-			result := c.ShiftDiscard(tt.inputLen, tt.numKeep)
-			if result != tt.expected {
-				t.Errorf("shiftDiscard(ctx: %v, keep: %v input: %v): have %v; want %v", tt.numCtx, tt.numKeep, tt.inputLen, result, tt.expected)
-			}
-		})
-	}
-}
--- a/runner/ollamarunner/runner.go
+++ b/runner/ollamarunner/runner.go
@@ -1,946 +0,0 @@
-package ollamarunner
-
-import (
-	"bytes"
-	"context"
-	"encoding/json"
-	"errors"
-	"flag"
-	"fmt"
-	"image"
-	"log"
-	"log/slog"
-	"net"
-	"net/http"
-	"os"
-	"path/filepath"
-	"regexp"
-	"runtime"
-	"strconv"
-	"strings"
-	"sync"
-	"time"
-	"unicode/utf8"
-
-	"golang.org/x/sync/semaphore"
-
-	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/model"
-	"github.com/ollama/ollama/runner/common"
-	"github.com/ollama/ollama/sample"
-
-	_ "github.com/ollama/ollama/model/models"
-)
-
-// input is an element of the prompt to process, either a token or an image
-type input struct {
-	token int32
-
-	image image.Image
-}
-
-type Sequence struct {
-	// batch index
-	iBatch int
-
-	// prompt inputs left to evaluate
-	inputs []input
-
-	// inputs that have been added to a batch but not yet submitted to Forward
-	pendingInputs []input
-
-	// tokens that have been generated but not returned yet (e.g. for stop sequences)
-	pendingResponses []string
-
-	// input cache being used by this sequence
-	cache *InputCacheSlot
-
-	// channel to send responses over
-	responses chan string
-
-	// channel to stop decoding (such as if the remote connection is closed)
-	quit chan bool
-
-	// number of tokens to predict
-	numPredict int
-
-	// set of samplers to run on generated logits
-	samplers []sample.Sampler
-
-	// channel to send back the embedding if embedding only
-	embedding chan []float32
-
-	// stop sequences
-	stop []string
-
-	// number of inputs to keep at the beginning when shifting context window
-	numKeep int32
-
-	// true if an embedding are to be returned instead of text generation
-	embeddingOnly bool
-
-	doneReason string
-
-	// Metrics
-	startProcessingTime time.Time
-	startGenerationTime time.Time
-	numPredicted        int
-	numPromptInputs     int
-}
-
-type NewSequenceParams struct {
-	numPredict int
-	stop       []string
-	numKeep    int32
-	samplers   []sample.Sampler
-	embedding  bool
-}
-
-func (s *Server) NewSequence(prompt string, images []ImageData, params NewSequenceParams) (*Sequence, error) {
-	s.ready.Wait()
-
-	startTime := time.Now()
-
-	inputs, err := s.inputs(prompt, images)
-	if err != nil {
-		return nil, fmt.Errorf("failed to process inputs: %w", err)
-	} else if len(inputs) == 0 {
-		return nil, errors.New("no input provided")
-	}
-
-	if params.numKeep < 0 {
-		params.numKeep = int32(len(inputs))
-	}
-
-	// Ensure that at least 1 input can be discarded during shift
-	params.numKeep = min(params.numKeep, s.cache.numCtx-1)
-
-	if int32(len(inputs)) > s.cache.numCtx {
-		discard := int32(len(inputs)) - s.cache.numCtx
-		newInputs := inputs[:params.numKeep]
-		newInputs = append(newInputs, inputs[params.numKeep+discard:]...)
-
-		slog.Warn("truncating input prompt", "limit", s.cache.numCtx, "prompt", len(inputs), "keep", params.numKeep, "new", len(newInputs))
-		inputs = newInputs
-	}
-
-	// TODO(jessegross): Ingest cached history for grammar
-
-	return &Sequence{
-		inputs:              inputs,
-		numPromptInputs:     len(inputs),
-		startProcessingTime: startTime,
-		numPredict:          params.numPredict,
-		pendingResponses:    make([]string, 0),
-		responses:           make(chan string, 100),
-		quit:                make(chan bool, 1),
-		embedding:           make(chan []float32, 1),
-		samplers:            params.samplers,
-		embeddingOnly:       params.embedding,
-		stop:                params.stop,
-		numKeep:             params.numKeep,
-	}, nil
-}
-
-// inputs processes the prompt and images into a list of inputs
-// by splitting the prompt on [img-<n>] tags, tokenizing text and
-// decoding images
-func (s *Server) inputs(prompt string, images []ImageData) ([]input, error) {
-	var inputs []input
-	var parts []string
-	var matches [][]string
-
-	// TODO(jessegross): This can sometimes trigger for matching text in the
-	// user's prompt. We previously tried to avoid it by only looking for images
-	// on image models. We don't have a clear indication now but it would be better
-	// to properly escape it in any case.
-	re := regexp.MustCompile(`\[img-(\d+)\]`)
-	parts = re.Split(prompt, -1)
-	matches = re.FindAllStringSubmatch(prompt, -1)
-
-	for i, part := range parts {
-		// text - tokenize
-		tokens, err := s.model.(model.TextProcessor).Encode(part)
-		if err != nil {
-			return nil, err
-		}
-
-		for _, t := range tokens {
-			inputs = append(inputs, input{token: t})
-		}
-
-		// image - decode and store
-		if i < len(matches) {
-			n, _ := strconv.Atoi(matches[i][1])
-
-			imageIndex := -1
-			for j := range images {
-				if images[j].ID == n {
-					imageIndex = j
-					break
-				}
-			}
-
-			if imageIndex < 0 {
-				return nil, fmt.Errorf("invalid image index: %d", n)
-			}
-
-			image, _, err := image.Decode(bytes.NewReader(images[imageIndex].Data))
-			if err != nil {
-				return nil, err
-			}
-
-			inputs = append(inputs, input{image: image})
-		}
-	}
-
-	return inputs, nil
-}
-
-type Server struct {
-	// is the server ready to process requests?
-	// protects access to model and image
-	ready sync.WaitGroup
-
-	// loaded model
-	model model.Model
-
-	// status for external health reporting - loading, ready to serve, etc.
-	status ServerStatus
-
-	// current progress on loading the model
-	progress float32
-
-	// number of simultaneous requests to handle
-	parallel int
-
-	// maximum number of elements in a batch (per sequence)
-	// TODO (jmorganca): make this n_batch
-	batchSize int
-
-	// protects access to everything below this line
-	// this is context state needed for decoding
-	mu sync.Mutex
-
-	// indicates that data is ready for processing
-	cond *sync.Cond
-
-	// the list of simultaneous sequences being evaluated
-	seqs []*Sequence
-
-	// seqs can have a maximum of parallel entries, which
-	// is enfoced by seqSem
-	seqsSem *semaphore.Weighted
-
-	// KV cache
-	cache *InputCache
-
-	// next sequence for prompt processing to avoid starvation
-	nextSeq int
-}
-
-func (s *Server) allNil() bool {
-	for _, item := range s.seqs {
-		if item != nil {
-			return false
-		}
-	}
-	return true
-}
-
-func flushPending(seq *Sequence) bool {
-	joined := strings.Join(seq.pendingResponses, "")
-	seq.pendingResponses = []string{}
-
-	// Check if there are any partial UTF-8 characters remaining.
-	// We already check and queue as we are generating but some may
-	// still make it here:
-	// - Sequence is ending, e.g. generation limit has been hit
-	// - Invalid characters in the middle of a string
-	// This is a stricter check to ensure we never output invalid Unicode.
-	for !utf8.ValidString(joined) {
-		joined = joined[:len(joined)-1]
-	}
-
-	if len(joined) == 0 {
-		return true
-	}
-
-	select {
-	case seq.responses <- joined:
-		return true
-	case <-seq.quit:
-		return false
-	}
-}
-
-func (s *Server) removeSequence(seqIndex int, reason string) {
-	seq := s.seqs[seqIndex]
-
-	flushPending(seq)
-	seq.doneReason = reason
-	close(seq.responses)
-	close(seq.embedding)
-	seq.cache.InUse = false
-	s.seqs[seqIndex] = nil
-	s.seqsSem.Release(1)
-}
-
-func (s *Server) run(ctx context.Context) {
-	s.ready.Wait()
-
-	for {
-		select {
-		case <-ctx.Done():
-			return
-		default:
-			err := s.processBatch()
-			if err != nil {
-				panic(err)
-			}
-		}
-	}
-}
-
-func (s *Server) processBatch() error {
-	s.mu.Lock()
-	for s.allNil() {
-		s.cond.Wait() // Wait until an item is added
-	}
-	defer s.mu.Unlock()
-
-	var options model.Options
-	imgSeq := -1
-
-	seqIdx := s.nextSeq - 1
-	for range s.seqs {
-		seqIdx = (seqIdx + 1) % len(s.seqs)
-		seq := s.seqs[seqIdx]
-
-		if seq == nil {
-			continue
-		}
-
-		// if past the num predict limit
-		if seq.numPredict > 0 && seq.numPredicted >= seq.numPredict {
-			s.removeSequence(seqIdx, "limit")
-			continue
-		}
-
-		if !s.cache.enabled {
-			seq.inputs = append(seq.cache.Inputs, seq.inputs...)
-			seq.cache.Inputs = []input{}
-		}
-
-		for i, input := range seq.inputs {
-			if int32(len(seq.cache.Inputs)+len(seq.pendingInputs)+1) > s.cache.numCtx {
-				if len(seq.pendingInputs) == 0 {
-					err := s.cache.ShiftCacheSlot(seq.cache, seq.numKeep)
-					if err != nil {
-						return err
-					}
-				} else {
-					break
-				}
-			}
-
-			if i >= s.batchSize {
-				break
-			}
-
-			// TODO(jessegross): Image inputs need to be rethought - it's
-			// it doesn't work well for different types of models or multiple sequences
-			if input.image != nil {
-				if len(seq.pendingInputs) != len(options.Images) {
-					break
-				}
-
-				if imgSeq != seqIdx && imgSeq != -1 {
-					s.nextSeq = seqIdx
-					break
-				}
-
-				imgSeq = seqIdx
-				options.Images = append(options.Images, input.image)
-				seq.pendingInputs = append(seq.pendingInputs, input)
-				continue
-			}
-
-			options.Inputs = append(options.Inputs, input.token)
-			options.Positions = append(options.Positions, int32(len(seq.cache.Inputs)+len(seq.pendingInputs)))
-			options.Sequences = append(options.Sequences, seq.cache.Id)
-
-			seq.iBatch = len(options.Outputs)
-			if i+1 == len(seq.inputs) {
-				options.Outputs = append(options.Outputs, int32(len(options.Inputs)-1))
-			}
-			seq.pendingInputs = append(seq.pendingInputs, input)
-		}
-
-		seq.inputs = seq.inputs[len(seq.pendingInputs):]
-	}
-
-	if len(options.Inputs) == 0 {
-		return nil
-	}
-
-	ctx := s.model.Backend().NewContext()
-	defer ctx.Close()
-
-	modelOutput, err := model.Forward(ctx, s.model, options)
-	if err != nil {
-		return fmt.Errorf("failed to decode batch: %w", err)
-	}
-
-	f32s := modelOutput.Floats()
-
-	// TODO(jessegross): This will no longer be necessary once the sampling interface takes f32s
-	logits := make([]float64, len(f32s))
-	for i, f32 := range f32s {
-		logits[i] = float64(f32)
-	}
-
-	for i, seq := range s.seqs {
-		if seq == nil {
-			continue
-		}
-
-		// After calling Forward, pending inputs are now in the cache
-		if len(seq.pendingInputs) > 0 {
-			seq.cache.Inputs = append(seq.cache.Inputs, seq.pendingInputs...)
-			seq.pendingInputs = []input{}
-		}
-
-		// don't sample prompt processing
-		if len(seq.inputs) != 0 {
-			if !s.cache.enabled {
-				return errors.New("caching disabled but unable to fit entire input in a batch")
-			}
-			continue
-		}
-
-		seq.numPredicted++
-		if seq.numPredicted == 1 {
-			seq.startGenerationTime = time.Now()
-		}
-
-		// if done processing the prompt, generate an embedding and return
-		if seq.embeddingOnly {
-			// TODO(jessegross): Embedding support
-			s.removeSequence(i, "")
-			continue
-		}
-
-		// sample a token
-		vocabSize := len(f32s) / len(options.Outputs)
-		tokens, err := sample.Sample(logits[seq.iBatch*vocabSize:(seq.iBatch+1)*vocabSize], seq.samplers...)
-		if err != nil {
-			return err
-		}
-
-		// TODO(jessegross): Sampler will output a single int32 in the future
-		token := int32(tokens[0])
-
-		// if it's an end of sequence token, break
-		if s.model.(model.TextProcessor).Is(token, model.SpecialEOS) {
-			// TODO (jmorganca): we should send this back
-			// as it's important for the /api/generate context
-			// seq.responses <- piece
-
-			s.removeSequence(i, "stop")
-			continue
-		}
-
-		piece, err := s.model.(model.TextProcessor).Decode([]int32{token})
-		if err != nil {
-			return err
-		}
-
-		seq.inputs = []input{{token: token}}
-
-		seq.pendingResponses = append(seq.pendingResponses, piece)
-		sequence := strings.Join(seq.pendingResponses, "")
-
-		if ok, stop := common.FindStop(sequence, seq.stop); ok {
-			slog.Debug("hit stop token", "pending", seq.pendingResponses, "stop", stop)
-
-			var tokenTruncated bool
-			origLen := len(seq.pendingResponses)
-			seq.pendingResponses, tokenTruncated = common.TruncateStop(seq.pendingResponses, stop)
-			newLen := len(seq.pendingResponses)
-
-			// Update the cache based on the tokens that will be returned:
-			// - We have 1 token more than is currently in the cache because
-			// the last one generated wasn't submitted to Decode
-			// - Remove any stop sequences that we stripped out
-			// - If truncateStop removed a portion of a token, drop that
-			// - As defense-in-depth, if truncatedToken didn't find a stop token
-			// remove the extra one that we added to the cache len
-			tokenLen := len(seq.cache.Inputs) + 1
-			tokenLen -= origLen - newLen
-			if tokenTruncated || origLen == newLen {
-				tokenLen--
-			}
-			seq.cache.Inputs = seq.cache.Inputs[:tokenLen]
-
-			s.removeSequence(i, "stop")
-			continue
-		}
-
-		if common.ContainsStopSuffix(sequence, seq.stop) {
-			continue
-		}
-
-		if common.IncompleteUnicode(sequence) {
-			continue
-		}
-
-		if !flushPending(seq) {
-			s.removeSequence(i, "connection")
-		}
-	}
-
-	return nil
-}
-
-// TODO (jmorganca): use structs from the api package to avoid duplication
-// this way the api acts as a proxy instead of using a different api for the
-// runner
-type Options struct {
-	api.Runner
-
-	NumKeep          int      `json:"n_keep"`
-	Seed             int      `json:"seed"`
-	NumPredict       int      `json:"n_predict"`
-	TopK             int      `json:"top_k"`
-	TopP             float32  `json:"top_p"`
-	MinP             float32  `json:"min_p"`
-	TypicalP         float32  `json:"typical_p"`
-	RepeatLastN      int      `json:"repeat_last_n"`
-	Temperature      float32  `json:"temperature"`
-	RepeatPenalty    float32  `json:"repeat_penalty"`
-	PresencePenalty  float32  `json:"presence_penalty"`
-	FrequencyPenalty float32  `json:"frequency_penalty"`
-	Mirostat         int      `json:"mirostat"`
-	MirostatTau      float32  `json:"mirostat_tau"`
-	MirostatEta      float32  `json:"mirostat_eta"`
-	Stop             []string `json:"stop"`
-}
-
-type ImageData struct {
-	Data          []byte `json:"data"`
-	ID            int    `json:"id"`
-	AspectRatioID int    `json:"aspect_ratio_id"`
-}
-
-type CompletionRequest struct {
-	Prompt      string      `json:"prompt"`
-	Images      []ImageData `json:"image_data"`
-	Grammar     string      `json:"grammar"`
-	CachePrompt bool        `json:"cache_prompt"`
-
-	Options
-}
-
-type Timings struct {
-	PredictedN  int     `json:"predicted_n"`
-	PredictedMS float64 `json:"predicted_ms"`
-	PromptN     int     `json:"prompt_n"`
-	PromptMS    float64 `json:"prompt_ms"`
-}
-
-type CompletionResponse struct {
-	Content string `json:"content"`
-	Stop    bool   `json:"stop"`
-
-	Model        string  `json:"model,omitempty"`
-	Prompt       string  `json:"prompt,omitempty"`
-	StoppedLimit bool    `json:"stopped_limit,omitempty"`
-	PredictedN   int     `json:"predicted_n,omitempty"`
-	PredictedMS  float64 `json:"predicted_ms,omitempty"`
-	PromptN      int     `json:"prompt_n,omitempty"`
-	PromptMS     float64 `json:"prompt_ms,omitempty"`
-
-	Timings Timings `json:"timings"`
-}
-
-func getSamplers(_ CompletionRequest) []sample.Sampler {
-	// TODO(jessegross): Waiting for sampling code
-
-	/*samplingParams.TopK = req.TopK
-	samplingParams.TopP = req.TopP
-	samplingParams.MinP = req.MinP
-	samplingParams.TypicalP = req.TypicalP
-	samplingParams.Temp = req.Temperature
-	samplingParams.RepeatLastN = req.RepeatLastN
-	samplingParams.PenaltyRepeat = req.RepeatPenalty
-	samplingParams.PenaltyFreq = req.FrequencyPenalty
-	samplingParams.PenaltyPresent = req.PresencePenalty
-	samplingParams.Mirostat = req.Mirostat
-	samplingParams.MirostatTau = req.MirostatTau
-	samplingParams.MirostatEta = req.MirostatEta
-	samplingParams.Seed = uint32(req.Seed)
-	samplingParams.Grammar = req.Grammar*/
-
-	return []sample.Sampler{sample.Greedy()}
-}
-
-func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
-	var req CompletionRequest
-	req.Options = Options(api.DefaultOptions())
-	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
-		http.Error(w, "Bad request", http.StatusBadRequest)
-		return
-	}
-
-	// Set the headers to indicate streaming
-	w.Header().Set("Content-Type", "application/json")
-	w.Header().Set("Transfer-Encoding", "chunked")
-
-	flusher, ok := w.(http.Flusher)
-	if !ok {
-		http.Error(w, "Streaming not supported", http.StatusInternalServerError)
-		return
-	}
-
-	seq, err := s.NewSequence(req.Prompt, req.Images, NewSequenceParams{
-		numPredict: req.NumPredict,
-		stop:       req.Stop,
-		numKeep:    int32(req.NumKeep),
-		samplers:   getSamplers(req),
-		embedding:  false,
-	})
-	if err != nil {
-		http.Error(w, fmt.Sprintf("Failed to create new sequence: %v", err), http.StatusInternalServerError)
-		return
-	}
-
-	// Ensure there is a place to put the sequence, released when removed from s.seqs
-	if err := s.seqsSem.Acquire(r.Context(), 1); err != nil {
-		if errors.Is(err, context.Canceled) {
-			slog.Info("aborting completion request due to client closing the connection")
-		} else {
-			slog.Error("Failed to acquire semaphore", "error", err)
-		}
-		return
-	}
-
-	s.mu.Lock()
-	found := false
-	for i, sq := range s.seqs {
-		if sq == nil {
-			seq.cache, seq.inputs, err = s.cache.LoadCacheSlot(seq.inputs, req.CachePrompt)
-			if err != nil {
-				s.mu.Unlock()
-				http.Error(w, fmt.Sprintf("Failed to load cache: %v", err), http.StatusInternalServerError)
-				return
-			}
-
-			s.seqs[i] = seq
-			s.cond.Signal()
-			found = true
-			break
-		}
-	}
-	s.mu.Unlock()
-
-	if !found {
-		http.Error(w, "could not find an available sequence", http.StatusInternalServerError)
-		return
-	}
-
-	for {
-		select {
-		case <-r.Context().Done():
-			close(seq.quit)
-			return
-		case content, ok := <-seq.responses:
-			if ok {
-				if err := json.NewEncoder(w).Encode(&CompletionResponse{
-					Content: content,
-				}); err != nil {
-					http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
-					close(seq.quit)
-					return
-				}
-
-				flusher.Flush()
-			} else {
-				// Send the final response
-				if err := json.NewEncoder(w).Encode(&CompletionResponse{
-					Stop:         true,
-					StoppedLimit: seq.doneReason == "limit",
-					Timings: Timings{
-						PromptN:     seq.numPromptInputs,
-						PromptMS:    float64(seq.startGenerationTime.Sub(seq.startProcessingTime).Milliseconds()),
-						PredictedN:  seq.numPredicted,
-						PredictedMS: float64(time.Since(seq.startGenerationTime).Milliseconds()),
-					},
-				}); err != nil {
-					http.Error(w, fmt.Sprintf("failed to encode final response: %v", err), http.StatusInternalServerError)
-				}
-
-				return
-			}
-		}
-	}
-}
-
-type EmbeddingRequest struct {
-	Content     string `json:"content"`
-	CachePrompt bool   `json:"cache_prompt"`
-}
-
-type EmbeddingResponse struct {
-	Embedding []float32 `json:"embedding"`
-}
-
-func (s *Server) embeddings(w http.ResponseWriter, r *http.Request) {
-	var req EmbeddingRequest
-	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
-		http.Error(w, fmt.Sprintf("bad request: %s", err), http.StatusBadRequest)
-		return
-	}
-
-	w.Header().Set("Content-Type", "application/json")
-
-	slog.Debug("embedding request", "content", req.Content)
-
-	seq, err := s.NewSequence(req.Content, nil, NewSequenceParams{embedding: true})
-	if err != nil {
-		http.Error(w, fmt.Sprintf("Failed to create new sequence: %v", err), http.StatusInternalServerError)
-		return
-	}
-
-	// Ensure there is a place to put the sequence, released when removed from s.seqs
-	if err := s.seqsSem.Acquire(r.Context(), 1); err != nil {
-		if errors.Is(err, context.Canceled) {
-			slog.Info("aborting embeddings request due to client closing the connection")
-		} else {
-			slog.Error("Failed to acquire semaphore", "error", err)
-		}
-		return
-	}
-
-	s.mu.Lock()
-	found := false
-	for i, sq := range s.seqs {
-		if sq == nil {
-			seq.cache, seq.inputs, err = s.cache.LoadCacheSlot(seq.inputs, req.CachePrompt)
-			if err != nil {
-				s.mu.Unlock()
-				http.Error(w, fmt.Sprintf("Failed to load cache: %v", err), http.StatusInternalServerError)
-				return
-			}
-			s.seqs[i] = seq
-			s.cond.Signal()
-			found = true
-			break
-		}
-	}
-	s.mu.Unlock()
-
-	if !found {
-		http.Error(w, "could not find an available sequence", http.StatusInternalServerError)
-		return
-	}
-
-	embedding := <-seq.embedding
-
-	if err := json.NewEncoder(w).Encode(&EmbeddingResponse{
-		Embedding: embedding,
-	}); err != nil {
-		http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
-	}
-}
-
-type HealthResponse struct {
-	Status   string  `json:"status"`
-	Progress float32 `json:"progress"`
-}
-
-type ServerStatus int
-
-const (
-	ServerStatusReady ServerStatus = iota
-	ServerStatusLoadingModel
-	ServerStatusError
-)
-
-func (s ServerStatus) ToString() string {
-	switch s {
-	case ServerStatusReady:
-		return "ok"
-	case ServerStatusLoadingModel:
-		return "loading model"
-	default:
-		return "server error"
-	}
-}
-
-func (s *Server) health(w http.ResponseWriter, r *http.Request) {
-	w.Header().Set("Content-Type", "application/json")
-	if err := json.NewEncoder(w).Encode(&HealthResponse{
-		Status:   s.status.ToString(),
-		Progress: s.progress,
-	}); err != nil {
-		http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
-	}
-}
-
-type multiLPath []string
-
-func (m *multiLPath) Set(value string) error {
-	*m = append(*m, value)
-	return nil
-}
-
-func (m *multiLPath) String() string {
-	return strings.Join(*m, ", ")
-}
-
-func (s *Server) loadModel(
-	mpath string,
-	lpath multiLPath,
-	parallel int,
-	kvCacheType string,
-	kvSize int,
-	multiUserCache bool,
-) {
-	var err error
-	s.model, err = model.New(mpath)
-	if err != nil {
-		panic(err)
-	}
-
-	slog.Info("system", "info", s.model.Backend().SystemInfo() /* "threads", *threads */)
-
-	// TODO(jessegross): LoRA loading
-	if lpath.String() != "" {
-		panic("loras are not yet implemented")
-	}
-
-	s.cache, err = NewInputCache(s.model, kvCacheType, int32(kvSize), parallel, multiUserCache)
-	if err != nil {
-		panic(err)
-	}
-
-	if !s.cache.enabled && parallel > 1 {
-		parallel = 1
-		slog.Warn("model does not support caching, disabling parallel processing")
-	}
-
-	s.parallel = parallel
-	s.seqs = make([]*Sequence, s.parallel)
-	s.seqsSem = semaphore.NewWeighted(int64(s.parallel))
-
-	s.status = ServerStatusReady
-	s.ready.Done()
-}
-
-func Execute(args []string) error {
-	fs := flag.NewFlagSet("runner", flag.ExitOnError)
-	mpath := fs.String("model", "", "Path to model binary file")
-	parallel := fs.Int("parallel", 1, "Number of sequences to handle simultaneously")
-	batchSize := fs.Int("batch-size", 512, "Batch size")
-	_ = fs.Int("n-gpu-layers", 0, "Number of layers to offload to GPU")
-	_ = fs.Int("main-gpu", 0, "Main GPU")
-	_ = fs.Bool("flash-attn", false, "Enable flash attention")
-	kvSize := fs.Int("ctx-size", 2048, "Context (or KV cache) size")
-	kvCacheType := fs.String("kv-cache-type", "", "quantization type for KV cache (default: f16)")
-	port := fs.Int("port", 8080, "Port to expose the server on")
-	_ = fs.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
-	verbose := fs.Bool("verbose", false, "verbose output (default: disabled)")
-	_ = fs.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)")
-	_ = fs.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing")
-	_ = fs.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
-	multiUserCache := fs.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
-
-	var lpaths multiLPath
-	fs.Var(&lpaths, "lora", "Path to lora layer file (can be specified multiple times)")
-
-	fs.Usage = func() {
-		fmt.Fprintf(fs.Output(), "Runner usage\n")
-		fs.PrintDefaults()
-	}
-	if err := fs.Parse(args); err != nil {
-		return err
-	}
-	level := slog.LevelInfo
-	if *verbose {
-		level = slog.LevelDebug
-	}
-	handler := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
-		Level:     level,
-		AddSource: true,
-		ReplaceAttr: func(_ []string, attr slog.Attr) slog.Attr {
-			if attr.Key == slog.SourceKey {
-				source := attr.Value.Any().(*slog.Source)
-				source.File = filepath.Base(source.File)
-			}
-			return attr
-		},
-	})
-	slog.SetDefault(slog.New(handler))
-	slog.Info("starting ollama engine")
-
-	server := &Server{
-		batchSize: *batchSize,
-		status:    ServerStatusLoadingModel,
-	}
-
-	// TODO(jessegross): Parameters that need to be implemented:
-	//	n-gpu-layers
-	//	main-gpu
-	//	flash-attn
-	//	threads
-	//	no-mmap
-	//	mlock
-	//	tensor-split
-
-	/*var tensorSplitFloats []float32
-	if *tensorSplit != "" {
-		stringFloats := regexp.MustCompile(",").Split(*tensorSplit, -1)
-
-		tensorSplitFloats = make([]float32, 0, len(stringFloats))
-		for _, s := range stringFloats {
-			f, _ := strconv.ParseFloat(s, 32)
-			tensorSplitFloats = append(tensorSplitFloats, float32(f))
-		}
-	}*/
-
-	server.ready.Add(1)
-	go server.loadModel(*mpath, lpaths, *parallel, *kvCacheType, *kvSize, *multiUserCache)
-
-	server.cond = sync.NewCond(&server.mu)
-
-	ctx, cancel := context.WithCancel(context.Background())
-	go server.run(ctx)
-
-	addr := "127.0.0.1:" + strconv.Itoa(*port)
-	listener, err := net.Listen("tcp", addr)
-	if err != nil {
-		fmt.Println("Listen error:", err)
-		cancel()
-		return err
-	}
-	defer listener.Close()
-
-	mux := http.NewServeMux()
-	mux.HandleFunc("/embedding", server.embeddings)
-	mux.HandleFunc("/completion", server.completion)
-	mux.HandleFunc("/health", server.health)
-
-	httpServer := http.Server{
-		Handler: mux,
-	}
-
-	log.Println("Server listening on", addr)
-	if err := httpServer.Serve(listener); err != nil {
-		log.Fatal("server error:", err)
-		return err
-	}
-
-	cancel()
-	return nil
-}
--- a/runner/runner.go
+++ b/runner/runner.go
@@ -1,24 +0,0 @@
-package runner
-
-import (
-	"github.com/ollama/ollama/runner/llamarunner"
-	"github.com/ollama/ollama/runner/ollamarunner"
-)
-
-func Execute(args []string) error {
-	if args[0] == "runner" {
-		args = args[1:]
-	}
-
-	var newRunner bool
-	if args[0] == "--ollama-engine" {
-		args = args[1:]
-		newRunner = true
-	}
-
-	if newRunner {
-		return ollamarunner.Execute(args)
-	} else {
-		return llamarunner.Execute(args)
-	}
-}
--- a/server/prompt.go
+++ b/server/prompt.go
@@ -10,9 +10,8 @@ import (
 	"strings"

 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/llm"
-	"github.com/ollama/ollama/model/models/mllama"
+	"github.com/ollama/ollama/model/mllama"
 	"github.com/ollama/ollama/template"
 )

@@ -93,33 +92,26 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
 			var imgData llm.ImageData

 			if isMllama {
-				if envconfig.NewEngine() {
-					imgData = llm.ImageData{
-						ID:   len(images),
-						Data: i,
-					}
-				} else {
-					data, opts, err := mllama.Preprocess(bytes.NewReader(i))
-					if err != nil {
-						return "", nil, err
-					}
+				data, opts, err := mllama.Preprocess(bytes.NewReader(i))
+				if err != nil {
+					return "", nil, err
+				}

-					buf := new(bytes.Buffer)
-					err = binary.Write(buf, binary.LittleEndian, data)
-					if err != nil {
-						return "", nil, err
-					}
+				buf := new(bytes.Buffer)
+				err = binary.Write(buf, binary.LittleEndian, data)
+				if err != nil {
+					return "", nil, err
+				}

-					ar, ok := opts["aspectRatioIndex"].(int)
-					if !ok {
-						return "", nil, fmt.Errorf("missing aspect ratio for image")
-					}
+				ar, ok := opts["aspectRatioIndex"].(int)
+				if !ok {
+					return "", nil, fmt.Errorf("missing aspect ratio for image")
+				}

-					imgData = llm.ImageData{
-						ID:            len(images),
-						Data:          buf.Bytes(),
-						AspectRatioID: ar,
-					}
+				imgData = llm.ImageData{
+					ID:            len(images),
+					Data:          buf.Bytes(),
+					AspectRatioID: ar,
 				}
 				imgPrompt = "<|image|>"
 			} else {
--- a/server/routes.go
+++ b/server/routes.go
@@ -32,7 +32,7 @@ import (
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/llm"
-	"github.com/ollama/ollama/model/models/mllama"
+	"github.com/ollama/ollama/model/mllama"
 	"github.com/ollama/ollama/openai"
 	"github.com/ollama/ollama/template"
 	"github.com/ollama/ollama/types/errtypes"
@@ -203,7 +203,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {

 	images := make([]llm.ImageData, len(req.Images))
 	for i := range req.Images {
-		if isMllama && !envconfig.NewEngine() {
+		if isMllama {
 			data, opts, err := mllama.Preprocess(bytes.NewReader(req.Images[i]))
 			if err != nil {
 				c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"})
Author	SHA1	Message	Date
Michael Yang	8b51db204f	tmp	2025-02-11 22:35:00 -08:00
Michael Yang	760e8fa656	tmp	2025-02-11 22:34:09 -08:00
Michael Yang	cf1dbcfc5a	next bert	2025-02-11 16:06:55 -08:00
Michael Yang	95eb87a052	ml: update Dump to handle precision	2025-02-11 15:46:25 -08:00
Michael Yang	c4f127ee6d	remove unused file	2025-02-11 15:46:25 -08:00
Michael Yang	f46a4b07a3	model: benchmark bpe split	2025-02-11 15:46:24 -08:00
Michael Yang	37498836bd	refactor prcess text tests	2025-02-11 15:46:24 -08:00
Michael Yang	58382892ad	fix linter	2025-02-11 15:46:24 -08:00
Michael Yang	44b39749d5	next	2025-02-11 15:46:24 -08:00