Mirror of https://github.com/ollama/ollama.git (synced 2026-02-04 12:42:58 -05:00)

Compare commits: brucemacd/...mxyng/mlx (4 commits)

| Author | SHA1 | Date |
|---|---|---|
| | ba1a983c79 | |
| | 5f82c5ff0f | |
| | 77cb929a02 | |
| | 07d944fdfc | |

cmd/cmd.go (16 changed lines)
@@ -1419,10 +1419,10 @@ func thinkingOutputClosingText(plainText bool) string {
return readline.ColorGrey + readline.ColorBold + text + readline.ColorDefault
}

func chat(cmd *cobra.Command, opts runOptions) (*api.Message, *api.Metrics, error) {
func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
client, err := api.ClientFromEnvironment()
if err != nil {
return nil, nil, err
return nil, err
}

p := progress.NewProgress(os.Stderr)
@@ -1515,7 +1515,7 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, *api.Metrics, erro

if err := client.Chat(cancelCtx, req, fn); err != nil {
if errors.Is(err, context.Canceled) {
return nil, nil, nil
return nil, nil
}

// this error should ideally be wrapped properly by the client
@@ -1523,9 +1523,9 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, *api.Metrics, erro
p.StopAndClear()
fmt.Println("An error occurred while processing your message. Please try again.")
fmt.Println()
return nil, nil, nil
return nil, nil
}
return nil, nil, err
return nil, err
}

if len(opts.Messages) > 0 {
@@ -1535,14 +1535,14 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, *api.Metrics, erro

verbose, err := cmd.Flags().GetBool("verbose")
if err != nil {
return nil, nil, err
return nil, err
}

if verbose {
latest.Summary()
}

return &api.Message{Role: role, Thinking: thinkingContent.String(), Content: fullResponse.String()}, &latest.Metrics, nil
return &api.Message{Role: role, Thinking: thinkingContent.String(), Content: fullResponse.String()}, nil
}

func generate(cmd *cobra.Command, opts runOptions) error {
@@ -1963,7 +1963,7 @@ func NewCLI() *cobra.Command {
Use: "runner",
Hidden: true,
RunE: func(cmd *cobra.Command, args []string) error {
return runner.Execute(os.Args[1:])
return runner.Execute(os.Args[2:])
},
FParseErrWhitelist: cobra.FParseErrWhitelist{UnknownFlags: true},
}
@@ -6,8 +6,6 @@ import (
"os/exec"
"path/filepath"
"runtime"

"github.com/ollama/ollama/envconfig"
)

// Claude implements Runner for Claude Code integration
@@ -52,7 +50,7 @@ func (c *Claude) Run(model string) error {
cmd.Stdout = os.Stdout
cmd.Stderr = os.Stderr
cmd.Env = append(os.Environ(),
"ANTHROPIC_BASE_URL="+envconfig.Host().String(),
"ANTHROPIC_BASE_URL=http://localhost:11434",
"ANTHROPIC_API_KEY=",
"ANTHROPIC_AUTH_TOKEN=ollama",
)
@@ -1,195 +0,0 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"github.com/ollama/ollama/envconfig"
|
||||
)
|
||||
|
||||
type Clawdbot struct{}
|
||||
|
||||
func (c *Clawdbot) String() string { return "Clawdbot" }
|
||||
|
||||
const ansiGreen = "\033[32m"
|
||||
|
||||
func (c *Clawdbot) Run(model string) error {
|
||||
if _, err := exec.LookPath("clawdbot"); err != nil {
|
||||
return fmt.Errorf("clawdbot is not installed, install from https://docs.clawd.bot")
|
||||
}
|
||||
|
||||
models := []string{model}
|
||||
if config, err := loadIntegration("clawdbot"); err == nil && len(config.Models) > 0 {
|
||||
models = config.Models
|
||||
}
|
||||
if err := c.Edit(models); err != nil {
|
||||
return fmt.Errorf("setup failed: %w", err)
|
||||
}
|
||||
|
||||
cmd := exec.Command("clawdbot", "gateway")
|
||||
cmd.Stdin = os.Stdin
|
||||
|
||||
// Capture output to detect "already running" message
|
||||
var outputBuf bytes.Buffer
|
||||
cmd.Stdout = io.MultiWriter(os.Stdout, &outputBuf)
|
||||
cmd.Stderr = io.MultiWriter(os.Stderr, &outputBuf)
|
||||
|
||||
err := cmd.Run()
|
||||
if err != nil && strings.Contains(outputBuf.String(), "Gateway already running") {
|
||||
fmt.Fprintf(os.Stderr, "%sClawdbot has been configured with Ollama. Gateway is already running.%s\n", ansiGreen, ansiReset)
|
||||
return nil
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
func (c *Clawdbot) Paths() []string {
|
||||
home, _ := os.UserHomeDir()
|
||||
p := filepath.Join(home, ".clawdbot", "clawdbot.json")
|
||||
if _, err := os.Stat(p); err == nil {
|
||||
return []string{p}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *Clawdbot) Edit(models []string) error {
|
||||
if len(models) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
home, err := os.UserHomeDir()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
configPath := filepath.Join(home, ".clawdbot", "clawdbot.json")
|
||||
if err := os.MkdirAll(filepath.Dir(configPath), 0o755); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Read into map[string]any to preserve unknown fields
|
||||
config := make(map[string]any)
|
||||
if data, err := os.ReadFile(configPath); err == nil {
|
||||
_ = json.Unmarshal(data, &config)
|
||||
}
|
||||
|
||||
// Navigate/create: models.providers.ollama (preserving other providers)
|
||||
modelsSection, _ := config["models"].(map[string]any)
|
||||
if modelsSection == nil {
|
||||
modelsSection = make(map[string]any)
|
||||
}
|
||||
providers, _ := modelsSection["providers"].(map[string]any)
|
||||
if providers == nil {
|
||||
providers = make(map[string]any)
|
||||
}
|
||||
ollama, _ := providers["ollama"].(map[string]any)
|
||||
if ollama == nil {
|
||||
ollama = make(map[string]any)
|
||||
}
|
||||
|
||||
ollama["baseUrl"] = envconfig.Host().String() + "/v1"
|
||||
// needed to register provider
|
||||
ollama["apiKey"] = "ollama-local"
|
||||
// TODO(parthsareen): potentially move to responses
|
||||
ollama["api"] = "openai-completions"
|
||||
|
||||
// Build map of existing models to preserve user customizations
|
||||
existingModels, _ := ollama["models"].([]any)
|
||||
existingByID := make(map[string]map[string]any)
|
||||
for _, m := range existingModels {
|
||||
if entry, ok := m.(map[string]any); ok {
|
||||
if id, ok := entry["id"].(string); ok {
|
||||
existingByID[id] = entry
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var newModels []any
|
||||
for _, model := range models {
|
||||
entry := map[string]any{
|
||||
"id": model,
|
||||
"name": model,
|
||||
"reasoning": false,
|
||||
"input": []any{"text"},
|
||||
"cost": map[string]any{
|
||||
"input": 0,
|
||||
"output": 0,
|
||||
"cacheRead": 0,
|
||||
"cacheWrite": 0,
|
||||
},
|
||||
// TODO(parthsareen): get these values from API
|
||||
"contextWindow": 131072,
|
||||
"maxTokens": 16384,
|
||||
}
|
||||
// Merge existing fields (user customizations)
|
||||
if existing, ok := existingByID[model]; ok {
|
||||
for k, v := range existing {
|
||||
if _, isNew := entry[k]; !isNew {
|
||||
entry[k] = v
|
||||
}
|
||||
}
|
||||
}
|
||||
newModels = append(newModels, entry)
|
||||
}
|
||||
ollama["models"] = newModels
|
||||
|
||||
providers["ollama"] = ollama
|
||||
modelsSection["providers"] = providers
|
||||
config["models"] = modelsSection
|
||||
|
||||
// Update agents.defaults.model.primary (preserving other agent settings)
|
||||
agents, _ := config["agents"].(map[string]any)
|
||||
if agents == nil {
|
||||
agents = make(map[string]any)
|
||||
}
|
||||
defaults, _ := agents["defaults"].(map[string]any)
|
||||
if defaults == nil {
|
||||
defaults = make(map[string]any)
|
||||
}
|
||||
modelConfig, _ := defaults["model"].(map[string]any)
|
||||
if modelConfig == nil {
|
||||
modelConfig = make(map[string]any)
|
||||
}
|
||||
modelConfig["primary"] = "ollama/" + models[0]
|
||||
defaults["model"] = modelConfig
|
||||
agents["defaults"] = defaults
|
||||
config["agents"] = agents
|
||||
|
||||
data, err := json.MarshalIndent(config, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return writeWithBackup(configPath, data)
|
||||
}
|
||||
|
||||
func (c *Clawdbot) Models() []string {
|
||||
home, err := os.UserHomeDir()
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
config, err := readJSONFile(filepath.Join(home, ".clawdbot", "clawdbot.json"))
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
modelsSection, _ := config["models"].(map[string]any)
|
||||
providers, _ := modelsSection["providers"].(map[string]any)
|
||||
ollama, _ := providers["ollama"].(map[string]any)
|
||||
modelList, _ := ollama["models"].([]any)
|
||||
|
||||
var result []string
|
||||
for _, m := range modelList {
|
||||
if entry, ok := m.(map[string]any); ok {
|
||||
if id, ok := entry["id"].(string); ok {
|
||||
result = append(result, id)
|
||||
}
|
||||
}
|
||||
}
|
||||
return result
|
||||
}
|
||||
@@ -1,625 +0,0 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestClawdbotIntegration(t *testing.T) {
|
||||
c := &Clawdbot{}
|
||||
|
||||
t.Run("String", func(t *testing.T) {
|
||||
if got := c.String(); got != "Clawdbot" {
|
||||
t.Errorf("String() = %q, want %q", got, "Clawdbot")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("implements Runner", func(t *testing.T) {
|
||||
var _ Runner = c
|
||||
})
|
||||
|
||||
t.Run("implements Editor", func(t *testing.T) {
|
||||
var _ Editor = c
|
||||
})
|
||||
}
|
||||
|
||||
func TestClawdbotEdit(t *testing.T) {
|
||||
c := &Clawdbot{}
|
||||
tmpDir := t.TempDir()
|
||||
setTestHome(t, tmpDir)
|
||||
|
||||
configDir := filepath.Join(tmpDir, ".clawdbot")
|
||||
configPath := filepath.Join(configDir, "clawdbot.json")
|
||||
|
||||
cleanup := func() { os.RemoveAll(configDir) }
|
||||
|
||||
t.Run("fresh install", func(t *testing.T) {
|
||||
cleanup()
|
||||
if err := c.Edit([]string{"llama3.2"}); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
assertClawdbotModelExists(t, configPath, "llama3.2")
|
||||
assertClawdbotPrimaryModel(t, configPath, "ollama/llama3.2")
|
||||
})
|
||||
|
||||
t.Run("multiple models - first is primary", func(t *testing.T) {
|
||||
cleanup()
|
||||
if err := c.Edit([]string{"llama3.2", "mistral"}); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
assertClawdbotModelExists(t, configPath, "llama3.2")
|
||||
assertClawdbotModelExists(t, configPath, "mistral")
|
||||
assertClawdbotPrimaryModel(t, configPath, "ollama/llama3.2")
|
||||
})
|
||||
|
||||
t.Run("preserve other providers", func(t *testing.T) {
|
||||
cleanup()
|
||||
os.MkdirAll(configDir, 0o755)
|
||||
os.WriteFile(configPath, []byte(`{"models":{"providers":{"anthropic":{"apiKey":"xxx"}}}}`), 0o644)
|
||||
if err := c.Edit([]string{"llama3.2"}); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
data, _ := os.ReadFile(configPath)
|
||||
var cfg map[string]any
|
||||
json.Unmarshal(data, &cfg)
|
||||
models := cfg["models"].(map[string]any)
|
||||
providers := models["providers"].(map[string]any)
|
||||
if providers["anthropic"] == nil {
|
||||
t.Error("anthropic provider was removed")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("preserve top-level keys", func(t *testing.T) {
|
||||
cleanup()
|
||||
os.MkdirAll(configDir, 0o755)
|
||||
os.WriteFile(configPath, []byte(`{"theme":"dark","mcp":{"servers":{}}}`), 0o644)
|
||||
if err := c.Edit([]string{"llama3.2"}); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
data, _ := os.ReadFile(configPath)
|
||||
var cfg map[string]any
|
||||
json.Unmarshal(data, &cfg)
|
||||
if cfg["theme"] != "dark" {
|
||||
t.Error("theme was removed")
|
||||
}
|
||||
if cfg["mcp"] == nil {
|
||||
t.Error("mcp was removed")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("preserve user customizations on models", func(t *testing.T) {
|
||||
cleanup()
|
||||
c.Edit([]string{"llama3.2"})
|
||||
|
||||
// User adds custom field
|
||||
data, _ := os.ReadFile(configPath)
|
||||
var cfg map[string]any
|
||||
json.Unmarshal(data, &cfg)
|
||||
models := cfg["models"].(map[string]any)
|
||||
providers := models["providers"].(map[string]any)
|
||||
ollama := providers["ollama"].(map[string]any)
|
||||
modelList := ollama["models"].([]any)
|
||||
entry := modelList[0].(map[string]any)
|
||||
entry["customField"] = "user-value"
|
||||
configData, _ := json.MarshalIndent(cfg, "", " ")
|
||||
os.WriteFile(configPath, configData, 0o644)
|
||||
|
||||
// Re-run Edit
|
||||
c.Edit([]string{"llama3.2"})
|
||||
|
||||
data, _ = os.ReadFile(configPath)
|
||||
json.Unmarshal(data, &cfg)
|
||||
models = cfg["models"].(map[string]any)
|
||||
providers = models["providers"].(map[string]any)
|
||||
ollama = providers["ollama"].(map[string]any)
|
||||
modelList = ollama["models"].([]any)
|
||||
entry = modelList[0].(map[string]any)
|
||||
if entry["customField"] != "user-value" {
|
||||
t.Error("custom field was lost")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("edit replaces models list", func(t *testing.T) {
|
||||
cleanup()
|
||||
c.Edit([]string{"llama3.2", "mistral"})
|
||||
c.Edit([]string{"llama3.2"})
|
||||
|
||||
assertClawdbotModelExists(t, configPath, "llama3.2")
|
||||
assertClawdbotModelNotExists(t, configPath, "mistral")
|
||||
})
|
||||
|
||||
t.Run("empty models is no-op", func(t *testing.T) {
|
||||
cleanup()
|
||||
os.MkdirAll(configDir, 0o755)
|
||||
original := `{"existing":"data"}`
|
||||
os.WriteFile(configPath, []byte(original), 0o644)
|
||||
|
||||
c.Edit([]string{})
|
||||
|
||||
data, _ := os.ReadFile(configPath)
|
||||
if string(data) != original {
|
||||
t.Error("empty models should not modify file")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("corrupted JSON treated as empty", func(t *testing.T) {
|
||||
cleanup()
|
||||
os.MkdirAll(configDir, 0o755)
|
||||
os.WriteFile(configPath, []byte(`{corrupted`), 0o644)
|
||||
|
||||
if err := c.Edit([]string{"llama3.2"}); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
data, _ := os.ReadFile(configPath)
|
||||
var cfg map[string]any
|
||||
if err := json.Unmarshal(data, &cfg); err != nil {
|
||||
t.Error("result should be valid JSON")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("wrong type models section", func(t *testing.T) {
|
||||
cleanup()
|
||||
os.MkdirAll(configDir, 0o755)
|
||||
os.WriteFile(configPath, []byte(`{"models":"not a map"}`), 0o644)
|
||||
|
||||
if err := c.Edit([]string{"llama3.2"}); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
assertClawdbotModelExists(t, configPath, "llama3.2")
|
||||
})
|
||||
}
|
||||
|
||||
func TestClawdbotModels(t *testing.T) {
|
||||
c := &Clawdbot{}
|
||||
tmpDir := t.TempDir()
|
||||
setTestHome(t, tmpDir)
|
||||
|
||||
t.Run("no config returns nil", func(t *testing.T) {
|
||||
if models := c.Models(); len(models) > 0 {
|
||||
t.Errorf("expected nil/empty, got %v", models)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("returns all ollama models", func(t *testing.T) {
|
||||
configDir := filepath.Join(tmpDir, ".clawdbot")
|
||||
os.MkdirAll(configDir, 0o755)
|
||||
os.WriteFile(filepath.Join(configDir, "clawdbot.json"), []byte(`{
|
||||
"models":{"providers":{"ollama":{"models":[
|
||||
{"id":"llama3.2"},
|
||||
{"id":"mistral"}
|
||||
]}}}
|
||||
}`), 0o644)
|
||||
|
||||
models := c.Models()
|
||||
if len(models) != 2 {
|
||||
t.Errorf("expected 2 models, got %v", models)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// Helper functions
|
||||
func assertClawdbotModelExists(t *testing.T, path, model string) {
|
||||
t.Helper()
|
||||
data, _ := os.ReadFile(path)
|
||||
var cfg map[string]any
|
||||
json.Unmarshal(data, &cfg)
|
||||
models := cfg["models"].(map[string]any)
|
||||
providers := models["providers"].(map[string]any)
|
||||
ollama := providers["ollama"].(map[string]any)
|
||||
modelList := ollama["models"].([]any)
|
||||
for _, m := range modelList {
|
||||
if entry, ok := m.(map[string]any); ok {
|
||||
if entry["id"] == model {
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
t.Errorf("model %s not found", model)
|
||||
}
|
||||
|
||||
func assertClawdbotModelNotExists(t *testing.T, path, model string) {
|
||||
t.Helper()
|
||||
data, _ := os.ReadFile(path)
|
||||
var cfg map[string]any
|
||||
json.Unmarshal(data, &cfg)
|
||||
models, _ := cfg["models"].(map[string]any)
|
||||
providers, _ := models["providers"].(map[string]any)
|
||||
ollama, _ := providers["ollama"].(map[string]any)
|
||||
modelList, _ := ollama["models"].([]any)
|
||||
for _, m := range modelList {
|
||||
if entry, ok := m.(map[string]any); ok {
|
||||
if entry["id"] == model {
|
||||
t.Errorf("model %s should not exist", model)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func assertClawdbotPrimaryModel(t *testing.T, path, expected string) {
|
||||
t.Helper()
|
||||
data, _ := os.ReadFile(path)
|
||||
var cfg map[string]any
|
||||
json.Unmarshal(data, &cfg)
|
||||
agents := cfg["agents"].(map[string]any)
|
||||
defaults := agents["defaults"].(map[string]any)
|
||||
model := defaults["model"].(map[string]any)
|
||||
if model["primary"] != expected {
|
||||
t.Errorf("primary model = %v, want %v", model["primary"], expected)
|
||||
}
|
||||
}
|
||||
|
||||
func TestClawdbotPaths(t *testing.T) {
|
||||
c := &Clawdbot{}
|
||||
|
||||
t.Run("returns path when config exists", func(t *testing.T) {
|
||||
tmpDir := t.TempDir()
|
||||
setTestHome(t, tmpDir)
|
||||
configDir := filepath.Join(tmpDir, ".clawdbot")
|
||||
os.MkdirAll(configDir, 0o755)
|
||||
os.WriteFile(filepath.Join(configDir, "clawdbot.json"), []byte(`{}`), 0o644)
|
||||
|
||||
paths := c.Paths()
|
||||
if len(paths) != 1 {
|
||||
t.Errorf("expected 1 path, got %d", len(paths))
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("returns nil when config missing", func(t *testing.T) {
|
||||
tmpDir := t.TempDir()
|
||||
setTestHome(t, tmpDir)
|
||||
if paths := c.Paths(); paths != nil {
|
||||
t.Errorf("expected nil, got %v", paths)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestClawdbotModelsEdgeCases(t *testing.T) {
|
||||
c := &Clawdbot{}
|
||||
tmpDir := t.TempDir()
|
||||
setTestHome(t, tmpDir)
|
||||
configDir := filepath.Join(tmpDir, ".clawdbot")
|
||||
configPath := filepath.Join(configDir, "clawdbot.json")
|
||||
cleanup := func() { os.RemoveAll(configDir) }
|
||||
|
||||
t.Run("corrupted JSON returns nil", func(t *testing.T) {
|
||||
cleanup()
|
||||
os.MkdirAll(configDir, 0o755)
|
||||
os.WriteFile(configPath, []byte(`{corrupted`), 0o644)
|
||||
if models := c.Models(); models != nil {
|
||||
t.Errorf("expected nil, got %v", models)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("wrong type at models level", func(t *testing.T) {
|
||||
cleanup()
|
||||
os.MkdirAll(configDir, 0o755)
|
||||
os.WriteFile(configPath, []byte(`{"models":"string"}`), 0o644)
|
||||
if models := c.Models(); models != nil {
|
||||
t.Errorf("expected nil, got %v", models)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("wrong type at providers level", func(t *testing.T) {
|
||||
cleanup()
|
||||
os.MkdirAll(configDir, 0o755)
|
||||
os.WriteFile(configPath, []byte(`{"models":{"providers":"string"}}`), 0o644)
|
||||
if models := c.Models(); models != nil {
|
||||
t.Errorf("expected nil, got %v", models)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("wrong type at ollama level", func(t *testing.T) {
|
||||
cleanup()
|
||||
os.MkdirAll(configDir, 0o755)
|
||||
os.WriteFile(configPath, []byte(`{"models":{"providers":{"ollama":"string"}}}`), 0o644)
|
||||
if models := c.Models(); models != nil {
|
||||
t.Errorf("expected nil, got %v", models)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("model entry missing id", func(t *testing.T) {
|
||||
cleanup()
|
||||
os.MkdirAll(configDir, 0o755)
|
||||
os.WriteFile(configPath, []byte(`{"models":{"providers":{"ollama":{"models":[{"name":"test"}]}}}}`), 0o644)
|
||||
if len(c.Models()) != 0 {
|
||||
t.Error("expected empty for missing id")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("model id is not string", func(t *testing.T) {
|
||||
cleanup()
|
||||
os.MkdirAll(configDir, 0o755)
|
||||
os.WriteFile(configPath, []byte(`{"models":{"providers":{"ollama":{"models":[{"id":123}]}}}}`), 0o644)
|
||||
if len(c.Models()) != 0 {
|
||||
t.Error("expected empty for non-string id")
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestClawdbotEditSchemaFields(t *testing.T) {
|
||||
c := &Clawdbot{}
|
||||
tmpDir := t.TempDir()
|
||||
setTestHome(t, tmpDir)
|
||||
configPath := filepath.Join(tmpDir, ".clawdbot", "clawdbot.json")
|
||||
|
||||
if err := c.Edit([]string{"llama3.2"}); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
data, _ := os.ReadFile(configPath)
|
||||
var cfg map[string]any
|
||||
json.Unmarshal(data, &cfg)
|
||||
models := cfg["models"].(map[string]any)
|
||||
providers := models["providers"].(map[string]any)
|
||||
ollama := providers["ollama"].(map[string]any)
|
||||
modelList := ollama["models"].([]any)
|
||||
entry := modelList[0].(map[string]any)
|
||||
|
||||
// Verify required schema fields
|
||||
if entry["reasoning"] != false {
|
||||
t.Error("reasoning should be false")
|
||||
}
|
||||
if entry["input"] == nil {
|
||||
t.Error("input should be set")
|
||||
}
|
||||
if entry["contextWindow"] == nil {
|
||||
t.Error("contextWindow should be set")
|
||||
}
|
||||
if entry["maxTokens"] == nil {
|
||||
t.Error("maxTokens should be set")
|
||||
}
|
||||
cost := entry["cost"].(map[string]any)
|
||||
if cost["cacheRead"] == nil {
|
||||
t.Error("cost.cacheRead should be set")
|
||||
}
|
||||
if cost["cacheWrite"] == nil {
|
||||
t.Error("cost.cacheWrite should be set")
|
||||
}
|
||||
}
|
||||
|
||||
func TestClawdbotEditModelNames(t *testing.T) {
|
||||
c := &Clawdbot{}
|
||||
tmpDir := t.TempDir()
|
||||
setTestHome(t, tmpDir)
|
||||
configPath := filepath.Join(tmpDir, ".clawdbot", "clawdbot.json")
|
||||
cleanup := func() { os.RemoveAll(filepath.Join(tmpDir, ".clawdbot")) }
|
||||
|
||||
t.Run("model with colon tag", func(t *testing.T) {
|
||||
cleanup()
|
||||
if err := c.Edit([]string{"llama3.2:70b"}); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
assertClawdbotModelExists(t, configPath, "llama3.2:70b")
|
||||
assertClawdbotPrimaryModel(t, configPath, "ollama/llama3.2:70b")
|
||||
})
|
||||
|
||||
t.Run("model with slash", func(t *testing.T) {
|
||||
cleanup()
|
||||
if err := c.Edit([]string{"library/model:tag"}); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
assertClawdbotModelExists(t, configPath, "library/model:tag")
|
||||
assertClawdbotPrimaryModel(t, configPath, "ollama/library/model:tag")
|
||||
})
|
||||
|
||||
t.Run("model with hyphen", func(t *testing.T) {
|
||||
cleanup()
|
||||
if err := c.Edit([]string{"test-model"}); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
assertClawdbotModelExists(t, configPath, "test-model")
|
||||
})
|
||||
}
|
||||
|
||||
func TestClawdbotEditAgentsPreservation(t *testing.T) {
|
||||
c := &Clawdbot{}
|
||||
tmpDir := t.TempDir()
|
||||
setTestHome(t, tmpDir)
|
||||
configDir := filepath.Join(tmpDir, ".clawdbot")
|
||||
configPath := filepath.Join(configDir, "clawdbot.json")
|
||||
cleanup := func() { os.RemoveAll(configDir) }
|
||||
|
||||
t.Run("preserve other agent defaults", func(t *testing.T) {
|
||||
cleanup()
|
||||
os.MkdirAll(configDir, 0o755)
|
||||
os.WriteFile(configPath, []byte(`{"agents":{"defaults":{"model":{"primary":"old"},"temperature":0.7}}}`), 0o644)
|
||||
|
||||
c.Edit([]string{"llama3.2"})
|
||||
|
||||
data, _ := os.ReadFile(configPath)
|
||||
var cfg map[string]any
|
||||
json.Unmarshal(data, &cfg)
|
||||
agents := cfg["agents"].(map[string]any)
|
||||
defaults := agents["defaults"].(map[string]any)
|
||||
if defaults["temperature"] != 0.7 {
|
||||
t.Error("temperature setting was lost")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("preserve other agents besides defaults", func(t *testing.T) {
|
||||
cleanup()
|
||||
os.MkdirAll(configDir, 0o755)
|
||||
os.WriteFile(configPath, []byte(`{"agents":{"defaults":{},"custom-agent":{"foo":"bar"}}}`), 0o644)
|
||||
|
||||
c.Edit([]string{"llama3.2"})
|
||||
|
||||
data, _ := os.ReadFile(configPath)
|
||||
var cfg map[string]any
|
||||
json.Unmarshal(data, &cfg)
|
||||
agents := cfg["agents"].(map[string]any)
|
||||
if agents["custom-agent"] == nil {
|
||||
t.Error("custom-agent was lost")
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
const testClawdbotFixture = `{
|
||||
"theme": "dark",
|
||||
"mcp": {"servers": {"custom": {"enabled": true}}},
|
||||
"models": {
|
||||
"providers": {
|
||||
"anthropic": {"apiKey": "xxx"},
|
||||
"ollama": {
|
||||
"baseUrl": "http://127.0.0.1:11434/v1",
|
||||
"models": [{"id": "old-model", "customField": "preserved"}]
|
||||
}
|
||||
}
|
||||
},
|
||||
"agents": {
|
||||
"defaults": {"model": {"primary": "old"}, "temperature": 0.7},
|
||||
"custom-agent": {"foo": "bar"}
|
||||
}
|
||||
}`
|
||||
|
||||
func TestClawdbotEdit_RoundTrip(t *testing.T) {
|
||||
c := &Clawdbot{}
|
||||
tmpDir := t.TempDir()
|
||||
setTestHome(t, tmpDir)
|
||||
configDir := filepath.Join(tmpDir, ".clawdbot")
|
||||
configPath := filepath.Join(configDir, "clawdbot.json")
|
||||
|
||||
os.MkdirAll(configDir, 0o755)
|
||||
os.WriteFile(configPath, []byte(testClawdbotFixture), 0o644)
|
||||
|
||||
if err := c.Edit([]string{"llama3.2", "mistral"}); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
data, _ := os.ReadFile(configPath)
|
||||
var cfg map[string]any
|
||||
json.Unmarshal(data, &cfg)
|
||||
|
||||
// Verify top-level preserved
|
||||
if cfg["theme"] != "dark" {
|
||||
t.Error("theme not preserved")
|
||||
}
|
||||
mcp := cfg["mcp"].(map[string]any)
|
||||
servers := mcp["servers"].(map[string]any)
|
||||
if servers["custom"] == nil {
|
||||
t.Error("mcp.servers.custom not preserved")
|
||||
}
|
||||
|
||||
// Verify other providers preserved
|
||||
models := cfg["models"].(map[string]any)
|
||||
providers := models["providers"].(map[string]any)
|
||||
if providers["anthropic"] == nil {
|
||||
t.Error("anthropic provider not preserved")
|
||||
}
|
||||
|
||||
// Verify agents preserved
|
||||
agents := cfg["agents"].(map[string]any)
|
||||
if agents["custom-agent"] == nil {
|
||||
t.Error("custom-agent not preserved")
|
||||
}
|
||||
defaults := agents["defaults"].(map[string]any)
|
||||
if defaults["temperature"] != 0.7 {
|
||||
t.Error("temperature not preserved")
|
||||
}
|
||||
}
|
||||
|
||||
func TestClawdbotEdit_Idempotent(t *testing.T) {
|
||||
c := &Clawdbot{}
|
||||
tmpDir := t.TempDir()
|
||||
setTestHome(t, tmpDir)
|
||||
configDir := filepath.Join(tmpDir, ".clawdbot")
|
||||
configPath := filepath.Join(configDir, "clawdbot.json")
|
||||
|
||||
os.MkdirAll(configDir, 0o755)
|
||||
os.WriteFile(configPath, []byte(testClawdbotFixture), 0o644)
|
||||
|
||||
c.Edit([]string{"llama3.2", "mistral"})
|
||||
firstData, _ := os.ReadFile(configPath)
|
||||
|
||||
c.Edit([]string{"llama3.2", "mistral"})
|
||||
secondData, _ := os.ReadFile(configPath)
|
||||
|
||||
if string(firstData) != string(secondData) {
|
||||
t.Error("repeated edits with same models produced different results")
|
||||
}
|
||||
}
|
||||
|
||||
func TestClawdbotEdit_MultipleConsecutiveEdits(t *testing.T) {
|
||||
c := &Clawdbot{}
|
||||
tmpDir := t.TempDir()
|
||||
setTestHome(t, tmpDir)
|
||||
configDir := filepath.Join(tmpDir, ".clawdbot")
|
||||
configPath := filepath.Join(configDir, "clawdbot.json")
|
||||
|
||||
os.MkdirAll(configDir, 0o755)
|
||||
os.WriteFile(configPath, []byte(testClawdbotFixture), 0o644)
|
||||
|
||||
for i := range 10 {
|
||||
models := []string{"model-a", "model-b"}
|
||||
if i%2 == 0 {
|
||||
models = []string{"model-x", "model-y", "model-z"}
|
||||
}
|
||||
if err := c.Edit(models); err != nil {
|
||||
t.Fatalf("edit %d failed: %v", i, err)
|
||||
}
|
||||
}
|
||||
|
||||
data, _ := os.ReadFile(configPath)
|
||||
var cfg map[string]any
|
||||
if err := json.Unmarshal(data, &cfg); err != nil {
|
||||
t.Fatalf("file is not valid JSON after multiple edits: %v", err)
|
||||
}
|
||||
|
||||
if cfg["theme"] != "dark" {
|
||||
t.Error("theme lost after multiple edits")
|
||||
}
|
||||
}
|
||||
|
||||
func TestClawdbotEdit_BackupCreated(t *testing.T) {
|
||||
c := &Clawdbot{}
|
||||
tmpDir := t.TempDir()
|
||||
setTestHome(t, tmpDir)
|
||||
configDir := filepath.Join(tmpDir, ".clawdbot")
|
||||
configPath := filepath.Join(configDir, "clawdbot.json")
|
||||
backupDir := filepath.Join(os.TempDir(), "ollama-backups")
|
||||
|
||||
os.MkdirAll(configDir, 0o755)
|
||||
uniqueMarker := fmt.Sprintf("test-marker-%d", os.Getpid())
|
||||
original := fmt.Sprintf(`{"theme": "%s"}`, uniqueMarker)
|
||||
os.WriteFile(configPath, []byte(original), 0o644)
|
||||
|
||||
if err := c.Edit([]string{"model-a"}); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
backups, _ := filepath.Glob(filepath.Join(backupDir, "clawdbot.json.*"))
|
||||
foundBackup := false
|
||||
for _, backup := range backups {
|
||||
data, _ := os.ReadFile(backup)
|
||||
if string(data) == original {
|
||||
foundBackup = true
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if !foundBackup {
|
||||
t.Error("backup with original content not found")
|
||||
}
|
||||
}
|
||||
|
||||
func TestClawdbotEdit_CreatesDirectoryIfMissing(t *testing.T) {
|
||||
c := &Clawdbot{}
|
||||
tmpDir := t.TempDir()
|
||||
setTestHome(t, tmpDir)
|
||||
configDir := filepath.Join(tmpDir, ".clawdbot")
|
||||
|
||||
if _, err := os.Stat(configDir); !os.IsNotExist(err) {
|
||||
t.Fatal("directory should not exist before test")
|
||||
}
|
||||
|
||||
if err := c.Edit([]string{"model-a"}); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, err := os.Stat(configDir); os.IsNotExist(err) {
|
||||
t.Fatal("directory was not created")
|
||||
}
|
||||
}
|
||||
@@ -7,8 +7,6 @@ import (
"os/exec"
"path/filepath"
"slices"

"github.com/ollama/ollama/envconfig"
)

// Droid implements Runner and Editor for Droid integration
@@ -119,7 +117,7 @@ func (d *Droid) Edit(models []string) error {
newModels = append(newModels, modelEntry{
Model: model,
DisplayName: model,
BaseURL: envconfig.Host().String() + "/v1",
BaseURL: "http://localhost:11434/v1",
APIKey: "ollama",
Provider: "generic-chat-completion-api",
MaxOutputTokens: 64000,

@@ -218,7 +218,7 @@ func TestDroidEdit(t *testing.T) {
}
}

if model["baseUrl"] != "http://127.0.0.1:11434/v1" {
if model["baseUrl"] != "http://localhost:11434/v1" {
t.Errorf("unexpected baseUrl: %s", model["baseUrl"])
}
if model["apiKey"] != "ollama" {
@@ -447,7 +447,7 @@ const testDroidSettingsFixture = `{
{
"model": "existing-ollama-model",
"displayName": "existing-ollama-model",
"baseUrl": "http://127.0.0.1:11434/v1",
"baseUrl": "http://localhost:11434/v1",
"apiKey": "ollama",
"provider": "generic-chat-completion-api",
"maxOutputTokens": 64000,

@@ -41,7 +41,6 @@ type Editor interface {
// integrations is the registry of available integrations.
var integrations = map[string]Runner{
"claude": &Claude{},
"clawdbot": &Clawdbot{},
"codex": &Codex{},
"droid": &Droid{},
"opencode": &OpenCode{},
@@ -243,7 +242,6 @@ func LaunchCmd(checkServerHeartbeat func(cmd *cobra.Command, args []string) erro

Supported integrations:
claude Claude Code
clawdbot Clawdbot
codex Codex
droid Droid
opencode OpenCode

@@ -9,8 +9,6 @@ import (
"path/filepath"
"slices"
"strings"

"github.com/ollama/ollama/envconfig"
)

// OpenCode implements Runner and Editor for OpenCode integration
@@ -90,7 +88,7 @@ func (o *OpenCode) Edit(modelList []string) error {
"npm": "@ai-sdk/openai-compatible",
"name": "Ollama (local)",
"options": map[string]any{
"baseURL": envconfig.Host().String() + "/v1",
"baseURL": "http://localhost:11434/v1",
},
}
}
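For context on the integration registry and help text above, here is a minimal sketch of the Runner and Editor interfaces these integrations implement. The method sets are inferred from the implementations and test assertions in this diff, not copied from the real package, so treat the names and signatures as assumptions.

```go
package config

// Runner and Editor sketch the interfaces assumed by the registry above.
// Inferred from the Claude, Clawdbot, Droid, and OpenCode implementations
// in this diff and from the test assertions `var _ Runner = c` /
// `var _ Editor = c`; names and signatures are assumptions.
type Runner interface {
	String() string         // display name, e.g. "Clawdbot"
	Run(model string) error // configure the tool and launch it against Ollama
}

type Editor interface {
	Edit(models []string) error // write the Ollama provider/model entries into the tool's config
	Models() []string           // models currently configured for the Ollama provider
}

// The integrations also implement Paths() []string (config files touched);
// which interface it belongs to is not visible in this diff.
```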
@@ -30,9 +30,6 @@ const (
)

func generateInteractive(cmd *cobra.Command, opts runOptions) error {
var sessionPromptTokens int64
var sessionCompletionTokens int64

usage := func() {
fmt.Fprintln(os.Stderr, "Available Commands:")
fmt.Fprintln(os.Stderr, " /set Set session variables")
@@ -40,7 +37,6 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
fmt.Fprintln(os.Stderr, " /load <model> Load a session or model")
fmt.Fprintln(os.Stderr, " /save <model> Save your current session")
fmt.Fprintln(os.Stderr, " /clear Clear session context")
fmt.Fprintln(os.Stderr, " /usage Show session token usage")
fmt.Fprintln(os.Stderr, " /bye Exit")
fmt.Fprintln(os.Stderr, " /?, /help Help for a command")
fmt.Fprintln(os.Stderr, " /? shortcuts Help for keyboard shortcuts")
@@ -449,9 +445,6 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
} else {
usageShow()
}
case strings.HasPrefix(line, "/usage"):
fmt.Printf("prompt tokens: %d\n", sessionPromptTokens)
fmt.Printf("completion tokens: %d\n", sessionCompletionTokens)
case strings.HasPrefix(line, "/help"), strings.HasPrefix(line, "/?"):
args := strings.Fields(line)
if len(args) > 1 {
@@ -506,7 +499,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {

opts.Messages = append(opts.Messages, newMessage)

assistant, metrics, err := chat(cmd, opts)
assistant, err := chat(cmd, opts)
if err != nil {
if strings.Contains(err.Error(), "does not support thinking") ||
strings.Contains(err.Error(), "invalid think value") {
@@ -516,10 +509,6 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
}
return err
}
if metrics != nil {
sessionPromptTokens += int64(metrics.PromptEvalCount)
sessionCompletionTokens += int64(metrics.EvalCount)
}
if assistant != nil {
opts.Messages = append(opts.Messages, *assistant)
}
@@ -1,15 +0,0 @@
package main

import (
"fmt"
"os"

"github.com/ollama/ollama/runner"
)

func main() {
if err := runner.Execute(os.Args[1:]); err != nil {
fmt.Fprintf(os.Stderr, "error: %s\n", err)
os.Exit(1)
}
}
@@ -102,7 +102,6 @@
"group": "Integrations",
"pages": [
"/integrations/claude-code",
"/integrations/clawdbot",
"/integrations/cline",
"/integrations/codex",
"/integrations/droid",
@@ -1,48 +0,0 @@
---
title: Clawdbot
---

Clawdbot is a personal AI assistant that runs on your own devices. It bridges messaging services (WhatsApp, Telegram, Slack, Discord, iMessage, and more) to AI coding agents through a centralized gateway.

## Install

Install [Clawdbot](https://clawd.bot/):

```bash
npm install -g clawdbot@latest
```

Then run the onboarding wizard:

```bash
clawdbot onboard --install-daemon
```

<Note>Clawdbot requires a larger context window. It is recommended to use a context window of at least 64k tokens. See [Context length](/context-length) for more information.</Note>

## Usage with Ollama

### Quick setup

```bash
ollama launch clawdbot
```

This configures Clawdbot to use Ollama and starts the gateway.
If the gateway is already running, no changes are needed; it will auto-reload the updated configuration.

To configure without launching:

```shell
ollama launch clawdbot --config
```

## Recommended Models

- `qwen3-coder`
- `glm-4.7`
- `gpt-oss:20b`
- `gpt-oss:120b`

Cloud models are also available at [ollama.com/search?c=cloud](https://ollama.com/search?c=cloud).
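The quick-setup command above corresponds to the Clawdbot.Edit implementation earlier in this diff, which merges an `ollama` provider into `~/.clawdbot/clawdbot.json`. A rough sketch of the structure it writes follows; the model name is illustrative and the zeroed cost fields are omitted for brevity.

```go
package main

import (
	"encoding/json"
	"fmt"
)

func main() {
	// Illustrative sketch of the map Clawdbot.Edit builds before writing
	// ~/.clawdbot/clawdbot.json; values mirror the Edit code in this diff.
	cfg := map[string]any{
		"models": map[string]any{
			"providers": map[string]any{
				"ollama": map[string]any{
					"baseUrl": "http://127.0.0.1:11434/v1", // envconfig.Host().String() + "/v1"
					"apiKey":  "ollama-local",              // needed to register the provider
					"api":     "openai-completions",
					"models": []any{
						map[string]any{
							"id":            "qwen3-coder", // example model
							"name":          "qwen3-coder",
							"reasoning":     false,
							"input":         []any{"text"},
							"contextWindow": 131072,
							"maxTokens":     16384,
						},
					},
				},
			},
		},
		"agents": map[string]any{
			"defaults": map[string]any{
				"model": map[string]any{"primary": "ollama/qwen3-coder"},
			},
		},
	}

	out, _ := json.MarshalIndent(cfg, "", "  ")
	fmt.Println(string(out))
}
```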
go.mod (5 changed lines)
@@ -13,7 +13,7 @@ require (
github.com/mattn/go-sqlite3 v1.14.24
github.com/olekukonko/tablewriter v0.0.5
github.com/spf13/cobra v1.7.0
github.com/stretchr/testify v1.9.0
github.com/stretchr/testify v1.10.0
github.com/x448/float16 v0.8.4
golang.org/x/sync v0.17.0
golang.org/x/sys v0.37.0
@@ -28,6 +28,8 @@ require (
github.com/nlpodyssey/gopickle v0.3.0
github.com/pdevine/tensor v0.0.0-20240510204454-f88f4562727c
github.com/tkrajina/typescriptify-golang-structs v0.2.0
github.com/tree-sitter/go-tree-sitter v0.25.0
github.com/tree-sitter/tree-sitter-cpp v0.23.4
github.com/wk8/go-ordered-map/v2 v2.1.8
golang.org/x/image v0.22.0
golang.org/x/mod v0.30.0
@@ -49,6 +51,7 @@ require (
github.com/google/flatbuffers v24.3.25+incompatible // indirect
github.com/kr/text v0.2.0 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
github.com/mattn/go-pointer v0.0.1 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/rivo/uniseg v0.2.0 // indirect
go.sum (31 changed lines)
@@ -152,6 +152,8 @@ github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0
|
||||
github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc=
|
||||
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
|
||||
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
|
||||
github.com/mattn/go-pointer v0.0.1 h1:n+XhsuGeVO6MEAp7xyEukFINEa+Quek5psIR/ylA6o0=
|
||||
github.com/mattn/go-pointer v0.0.1/go.mod h1:2zXcozF6qYGgmsG+SeTZz3oAbFLdD3OWqnUbNvJZAlc=
|
||||
github.com/mattn/go-runewidth v0.0.9/go.mod h1:H031xJmbD/WCDINGzjvQ9THkh0rPKHF+m2gUSrubnMI=
|
||||
github.com/mattn/go-runewidth v0.0.14 h1:+xnbZSEeDbOIg5/mE6JF0w6n9duR1l3/WmbinWVwUuU=
|
||||
github.com/mattn/go-runewidth v0.0.14/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
|
||||
@@ -204,12 +206,39 @@ github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/
|
||||
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
|
||||
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
|
||||
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
|
||||
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
|
||||
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
|
||||
github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
|
||||
github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
|
||||
github.com/tkrajina/go-reflector v0.5.5 h1:gwoQFNye30Kk7NrExj8zm3zFtrGPqOkzFMLuQZg1DtQ=
|
||||
github.com/tkrajina/go-reflector v0.5.5/go.mod h1:ECbqLgccecY5kPmPmXg1MrHW585yMcDkVl6IvJe64T4=
|
||||
github.com/tkrajina/typescriptify-golang-structs v0.2.0 h1:ZedWk82egydDspGTryAatbX0/1NZDQbdiZLoCbOk4f8=
|
||||
github.com/tkrajina/typescriptify-golang-structs v0.2.0/go.mod h1:sjU00nti/PMEOZb07KljFlR+lJ+RotsC0GBQMv9EKls=
|
||||
github.com/tree-sitter/go-tree-sitter v0.25.0 h1:sx6kcg8raRFCvc9BnXglke6axya12krCJF5xJ2sftRU=
|
||||
github.com/tree-sitter/go-tree-sitter v0.25.0/go.mod h1:r77ig7BikoZhHrrsjAnv8RqGti5rtSyvDHPzgTPsUuU=
|
||||
github.com/tree-sitter/tree-sitter-c v0.23.4 h1:nBPH3FV07DzAD7p0GfNvXM+Y7pNIoPenQWBpvM++t4c=
|
||||
github.com/tree-sitter/tree-sitter-c v0.23.4/go.mod h1:MkI5dOiIpeN94LNjeCp8ljXN/953JCwAby4bClMr6bw=
|
||||
github.com/tree-sitter/tree-sitter-cpp v0.23.4 h1:LaWZsiqQKvR65yHgKmnaqA+uz6tlDJTJFCyFIeZU/8w=
|
||||
github.com/tree-sitter/tree-sitter-cpp v0.23.4/go.mod h1:doqNW64BriC7WBCQ1klf0KmJpdEvfxyXtoEybnBo6v8=
|
||||
github.com/tree-sitter/tree-sitter-embedded-template v0.23.2 h1:nFkkH6Sbe56EXLmZBqHHcamTpmz3TId97I16EnGy4rg=
|
||||
github.com/tree-sitter/tree-sitter-embedded-template v0.23.2/go.mod h1:HNPOhN0qF3hWluYLdxWs5WbzP/iE4aaRVPMsdxuzIaQ=
|
||||
github.com/tree-sitter/tree-sitter-go v0.23.4 h1:yt5KMGnTHS+86pJmLIAZMWxukr8W7Ae1STPvQUuNROA=
|
||||
github.com/tree-sitter/tree-sitter-go v0.23.4/go.mod h1:Jrx8QqYN0v7npv1fJRH1AznddllYiCMUChtVjxPK040=
|
||||
github.com/tree-sitter/tree-sitter-html v0.23.2 h1:1UYDV+Yd05GGRhVnTcbP58GkKLSHHZwVaN+lBZV11Lc=
|
||||
github.com/tree-sitter/tree-sitter-html v0.23.2/go.mod h1:gpUv/dG3Xl/eebqgeYeFMt+JLOY9cgFinb/Nw08a9og=
|
||||
github.com/tree-sitter/tree-sitter-java v0.23.5 h1:J9YeMGMwXYlKSP3K4Us8CitC6hjtMjqpeOf2GGo6tig=
|
||||
github.com/tree-sitter/tree-sitter-java v0.23.5/go.mod h1:NRKlI8+EznxA7t1Yt3xtraPk1Wzqh3GAIC46wxvc320=
|
||||
github.com/tree-sitter/tree-sitter-javascript v0.23.1 h1:1fWupaRC0ArlHJ/QJzsfQ3Ibyopw7ZfQK4xXc40Zveo=
|
||||
github.com/tree-sitter/tree-sitter-javascript v0.23.1/go.mod h1:lmGD1EJdCA+v0S1u2fFgepMg/opzSg/4pgFym2FPGAs=
|
||||
github.com/tree-sitter/tree-sitter-json v0.24.8 h1:tV5rMkihgtiOe14a9LHfDY5kzTl5GNUYe6carZBn0fQ=
|
||||
github.com/tree-sitter/tree-sitter-json v0.24.8/go.mod h1:F351KK0KGvCaYbZ5zxwx/gWWvZhIDl0eMtn+1r+gQbo=
|
||||
github.com/tree-sitter/tree-sitter-php v0.23.11 h1:iHewsLNDmznh8kgGyfWfujsZxIz1YGbSd2ZTEM0ZiP8=
|
||||
github.com/tree-sitter/tree-sitter-php v0.23.11/go.mod h1:T/kbfi+UcCywQfUNAJnGTN/fMSUjnwPXA8k4yoIks74=
|
||||
github.com/tree-sitter/tree-sitter-python v0.23.6 h1:qHnWFR5WhtMQpxBZRwiaU5Hk/29vGju6CVtmvu5Haas=
|
||||
github.com/tree-sitter/tree-sitter-python v0.23.6/go.mod h1:cpdthSy/Yoa28aJFBscFHlGiU+cnSiSh1kuDVtI8YeM=
|
||||
github.com/tree-sitter/tree-sitter-ruby v0.23.1 h1:T/NKHUA+iVbHM440hFx+lzVOzS4dV6z8Qw8ai+72bYo=
|
||||
github.com/tree-sitter/tree-sitter-ruby v0.23.1/go.mod h1:kUS4kCCQloFcdX6sdpr8p6r2rogbM6ZjTox5ZOQy8cA=
|
||||
github.com/tree-sitter/tree-sitter-rust v0.23.2 h1:6AtoooCW5GqNrRpfnvl0iUhxTAZEovEmLKDbyHlfw90=
|
||||
github.com/tree-sitter/tree-sitter-rust v0.23.2/go.mod h1:hfeGWic9BAfgTrc7Xf6FaOAguCFJRo3RBbs7QJ6D7MI=
|
||||
github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS4MhqMhdFk5YI=
|
||||
github.com/twitchyliquid64/golang-asm v0.15.1/go.mod h1:a1lVb/DtPvCB8fslRZhAngC2+aY1QWCk3Cedj/Gdt08=
|
||||
github.com/ugorji/go/codec v1.2.12 h1:9LC83zGrHhuUA9l16C9AHXAqEV/2wBQ4nkvumAE65EE=
|
||||
|
||||
@@ -34,6 +34,7 @@ import (
"github.com/ollama/ollama/logutil"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/model"
"github.com/ollama/ollama/tokenizer"
)

type filteredEnv []string
@@ -115,7 +116,7 @@ type llamaServer struct {
type ollamaServer struct {
llmServer

textProcessor model.TextProcessor // textProcessor handles text encoding/decoding
tokenizer tokenizer.Tokenizer // textProcessor handles text encoding/decoding
}

// LoadModel will load a model from disk. The model must be in the GGML format.
@@ -141,11 +142,11 @@ func LoadModel(model string, maxArraySize int) (*ggml.GGML, error) {
// NewLlamaServer will run a server for the given GPUs
func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath string, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
var llamaModel *llama.Model
var textProcessor model.TextProcessor
var tokenizer tokenizer.Tokenizer
var err error
if envconfig.NewEngine() || f.KV().OllamaEngineRequired() {
if len(projectors) == 0 {
textProcessor, err = model.NewTextProcessor(modelPath)
tokenizer, err = model.NewTextProcessor(modelPath)
} else {
err = errors.New("split vision models aren't supported")
}
@@ -154,7 +155,7 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st
slog.Debug("model not yet supported by Ollama engine, switching to compatibility mode", "model", modelPath, "error", err)
}
}
if textProcessor == nil {
if tokenizer == nil {
llamaModel, err = llama.LoadModelFromFile(modelPath, llama.ModelParams{VocabOnly: true})
if err != nil {
return nil, err
@@ -210,7 +211,7 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st

kvct := strings.ToLower(envconfig.KvCacheType())

if textProcessor == nil {
if tokenizer == nil {
flashAttention := ml.FlashAttentionAuto
if faUserSet {
if fa {
@@ -260,7 +261,7 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st
gpuLibs := ml.LibraryPaths(gpus)
status := NewStatusWriter(os.Stderr)
cmd, port, err := StartRunner(
textProcessor != nil,
tokenizer != nil,
modelPath,
gpuLibs,
status,
@@ -309,8 +310,8 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st
}
}()

if textProcessor != nil {
return &ollamaServer{llmServer: s, textProcessor: textProcessor}, nil
if tokenizer != nil {
return &ollamaServer{llmServer: s, tokenizer: tokenizer}, nil
} else {
return &llamaServer{llmServer: s, ggml: f}, nil
}
@@ -1772,7 +1773,7 @@ func (s *llamaServer) Tokenize(ctx context.Context, content string) ([]int, erro
}

func (s *ollamaServer) Tokenize(ctx context.Context, content string) ([]int, error) {
tokens, err := s.textProcessor.Encode(content, false)
tokens, err := s.tokenizer.Encode(content, false)
if err != nil {
return nil, err
}
@@ -1807,7 +1808,7 @@ func (s *ollamaServer) Detokenize(ctx context.Context, tokens []int) (string, er
toks[i] = int32(t)
}

content, err := s.textProcessor.Decode(toks)
content, err := s.tokenizer.Decode(toks)
if err != nil {
return "", err
}
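The rename above replaces model.TextProcessor with tokenizer.Tokenizer throughout the server. A minimal sketch of the interface shape implied by the Tokenize and Detokenize call sites is shown below; it is an assumption drawn from this diff, not the package's actual definition.

```go
package tokenizer

// Tokenizer sketches the interface the renamed field is assumed to satisfy.
// Signatures are inferred from call sites in this diff:
// s.tokenizer.Encode(content, false) returning token IDs, and
// s.tokenizer.Decode(toks) with toks []int32 returning a string.
// Concrete implementations such as BytePairEncoding (below) also expose
// vocabulary and special-token helpers not listed here.
type Tokenizer interface {
	Encode(s string, addSpecial bool) ([]int32, error)
	Decode(ids []int32) (string, error)
}
```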
@@ -1,272 +0,0 @@
|
||||
package model
|
||||
|
||||
import (
|
||||
"cmp"
|
||||
"iter"
|
||||
"slices"
|
||||
"strings"
|
||||
|
||||
"github.com/dlclark/regexp2"
|
||||
heap "github.com/emirpasic/gods/v2/trees/binaryheap"
|
||||
"github.com/ollama/ollama/logutil"
|
||||
)
|
||||
|
||||
type BytePairEncoding struct {
|
||||
vocab *Vocabulary
|
||||
regexps []*regexp2.Regexp
|
||||
}
|
||||
|
||||
var _ TextProcessor = (*BytePairEncoding)(nil)
|
||||
|
||||
func NewBytePairEncoding(vocab *Vocabulary, pretokenizers ...string) BytePairEncoding {
|
||||
if len(pretokenizers) == 0 {
|
||||
// set default byte-level pretokenizer if none provided, e.g.
|
||||
// https://github.com/huggingface/tokenizers/blob/main/tokenizers/src/pre_tokenizers/byte_level.rs#L44
|
||||
pretokenizers = []string{`'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+`}
|
||||
}
|
||||
|
||||
return BytePairEncoding{
|
||||
vocab: vocab,
|
||||
regexps: slices.Collect(func(yield func(*regexp2.Regexp) bool) {
|
||||
for _, p := range pretokenizers {
|
||||
if !yield(regexp2.MustCompile(p, regexp2.RE2)) {
|
||||
return
|
||||
}
|
||||
}
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
func (bpe BytePairEncoding) Vocabulary() *Vocabulary {
|
||||
return bpe.vocab
|
||||
}
|
||||
|
||||
func (bpe BytePairEncoding) Is(id int32, special Special) bool {
|
||||
return bpe.vocab.Is(id, special)
|
||||
}
|
||||
|
||||
func (bpe *BytePairEncoding) split(s string) iter.Seq[string] {
|
||||
parts := []string{s}
|
||||
for _, re := range bpe.regexps {
|
||||
parts = slices.Collect(func(yield func(string) bool) {
|
||||
for _, part := range parts {
|
||||
r := []rune(part)
|
||||
var offset int
|
||||
for m, _ := re.FindRunesMatch(r); m != nil; m, _ = re.FindNextMatch(m) {
|
||||
if offset-m.Index != 0 {
|
||||
if !yield(string(r[:m.Index])) {
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
if !yield(m.String()) {
|
||||
return
|
||||
}
|
||||
|
||||
offset = m.Index + m.Length
|
||||
}
|
||||
|
||||
if offset < len(r) {
|
||||
if !yield(string(r[offset:])) {
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
return slices.Values(parts)
|
||||
}
|
||||
|
||||
// fragment is a string fragment and their corresponding token IDs
|
||||
type fragment struct {
|
||||
value string
|
||||
ids []int32
|
||||
}
|
||||
|
||||
// pair is a pair of runes and its rank
|
||||
type pair struct {
|
||||
a, b int
|
||||
rank int
|
||||
value string
|
||||
}
|
||||
|
||||
type merge struct {
|
||||
p, n int
|
||||
runes []rune
|
||||
}
|
||||
|
||||
func (bpe BytePairEncoding) Encode(s string, addSpecial bool) ([]int32, error) {
|
||||
fragments := []fragment{{value: s}}
|
||||
for _, special := range bpe.vocab.SpecialVocabulary() {
|
||||
// TODO: process special tokens concurrently
|
||||
id := bpe.vocab.Encode(special)
|
||||
for i := 0; i < len(fragments); i++ {
|
||||
frag := fragments[i]
|
||||
if len(frag.ids) > 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
var middle []fragment
|
||||
switch i := strings.Index(frag.value, special); {
|
||||
case i < 0:
|
||||
middle = append(middle, frag)
|
||||
case i > 0:
|
||||
middle = append(middle, fragment{value: frag.value[:i]})
|
||||
fallthrough
|
||||
default:
|
||||
middle = append(middle, fragment{value: special, ids: []int32{id}})
|
||||
if rest := frag.value[i+len(special):]; rest != "" {
|
||||
middle = append(middle, fragment{value: rest})
|
||||
}
|
||||
}
|
||||
|
||||
fragments = append(fragments[:i], append(middle, fragments[i+1:]...)...)
|
||||
}
|
||||
}
|
||||
|
||||
var ids []int32
|
||||
for _, frag := range fragments {
|
||||
if len(frag.ids) > 0 {
|
||||
ids = append(ids, frag.ids...)
|
||||
continue
|
||||
}
|
||||
|
||||
for split := range bpe.split(frag.value) {
|
||||
// TODO: process splits concurrently
|
||||
var sb strings.Builder
|
||||
for _, b := range []byte(split) {
|
||||
r := rune(b)
|
||||
switch {
|
||||
case r == 0x00ad:
|
||||
r = 0x0143
|
||||
case r <= 0x0020:
|
||||
r = r + 0x0100
|
||||
case r >= 0x007f && r <= 0x00a0:
|
||||
r = r + 0x00a2
|
||||
}
|
||||
|
||||
sb.WriteRune(r)
|
||||
}
|
||||
|
||||
// short circuit if the fragment is in the vocabulary
|
||||
if id := bpe.vocab.Encode(sb.String()); id >= 0 {
|
||||
ids = append(ids, id)
|
||||
continue
|
||||
}
|
||||
|
||||
runes := []rune(sb.String())
|
||||
merges := make([]merge, len(runes))
|
||||
for r := range runes {
|
||||
merges[r] = merge{
|
||||
p: r - 1,
|
||||
n: r + 1,
|
||||
runes: []rune{runes[r]},
|
||||
}
|
||||
}
|
||||
|
||||
pairwise := func(a, b int) *pair {
|
||||
if a < 0 || b >= len(runes) {
|
||||
return nil
|
||||
}
|
||||
|
||||
left, right := string(merges[a].runes), string(merges[b].runes)
|
||||
rank := bpe.vocab.Merge(left, right)
|
||||
if rank < 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
return &pair{
|
||||
a: a,
|
||||
b: b,
|
||||
rank: rank,
|
||||
value: left + right,
|
||||
}
|
||||
}
|
||||
|
||||
pairs := heap.NewWith(func(i, j *pair) int {
|
||||
return cmp.Compare(i.rank, j.rank)
|
||||
})
|
||||
|
||||
for i := range len(runes) - 1 {
|
||||
if pair := pairwise(i, i+1); pair != nil {
|
||||
pairs.Push(pair)
|
||||
}
|
||||
}
|
||||
|
||||
for !pairs.Empty() {
|
||||
pair, _ := pairs.Pop()
|
||||
|
||||
left, right := merges[pair.a], merges[pair.b]
|
||||
if len(left.runes) == 0 || len(right.runes) == 0 ||
|
||||
string(left.runes)+string(right.runes) != pair.value {
|
||||
continue
|
||||
}
|
||||
|
||||
if id := bpe.vocab.Encode(pair.value); id < 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
merges[pair.a].runes = append(left.runes, right.runes...)
|
||||
merges[pair.b].runes = nil
|
||||
|
||||
merges[pair.a].n = right.n
|
||||
if right.n < len(merges) {
|
||||
merges[right.n].p = pair.a
|
||||
}
|
||||
|
||||
if pair := pairwise(merges[pair.a].p, pair.a); pair != nil {
|
||||
pairs.Push(pair)
|
||||
}
|
||||
|
||||
if pair := pairwise(pair.a, merges[pair.a].n); pair != nil {
|
||||
pairs.Push(pair)
|
||||
}
|
||||
}
|
||||
|
||||
for _, merge := range merges {
|
||||
if len(merge.runes) > 0 {
|
||||
// TODO: handle the edge case where the rune isn't in the vocabulary
|
||||
if id := bpe.vocab.Encode(string(merge.runes)); id >= 0 {
|
||||
ids = append(ids, id)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if addSpecial {
|
||||
ids = bpe.vocab.addSpecials(ids)
|
||||
}
|
||||
|
||||
logutil.Trace("encoded", "string", s, "ids", ids)
|
||||
return ids, nil
|
||||
}
|
||||
|
||||
func (bpe BytePairEncoding) Decode(ids []int32) (string, error) {
|
||||
var sb strings.Builder
|
||||
for _, id := range ids {
|
||||
for _, r := range bpe.vocab.Decode(id) {
|
||||
switch {
|
||||
case r == 0x0100:
|
||||
// this produces 0x00 aka NULL
|
||||
continue
|
||||
case r == 0x0143:
|
||||
r = 0x00ad
|
||||
case r > 0x0100 && r <= 0x0120:
|
||||
r = r - 0x0100
|
||||
case r > 0x0120 && r <= 0x0142:
|
||||
r = r - 0x00a2
|
||||
}
|
||||
|
||||
// NOTE: not using WriteRune here because it writes the UTF-8
|
||||
// encoding of the rune which is _not_ what we want
|
||||
if err := sb.WriteByte(byte(r)); err != nil {
|
||||
return "", err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
logutil.Trace("decoded", "string", sb.String(), "from", ids)
|
||||
return sb.String(), nil
|
||||
}
|
||||
model/ignore_test.go (410 lines; diff suppressed because one or more lines are too long)
@@ -23,6 +23,7 @@ import (
|
||||
_ "github.com/ollama/ollama/ml/backend"
|
||||
"github.com/ollama/ollama/ml/nn/pooling"
|
||||
"github.com/ollama/ollama/model/input"
|
||||
"github.com/ollama/ollama/tokenizer"
|
||||
)
|
||||
|
||||
var (
|
||||
@@ -133,7 +134,7 @@ func New(modelPath string, params ml.BackendParams) (Model, error) {
|
||||
return m, nil
|
||||
}
|
||||
|
||||
func NewTextProcessor(s string) (TextProcessor, error) {
|
||||
func NewTextProcessor(s string) (tokenizer.Tokenizer, error) {
|
||||
r, err := os.Open(s)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
@@ -150,7 +151,7 @@ func NewTextProcessor(s string) (TextProcessor, error) {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
tp, ok := m.(TextProcessor)
|
||||
tp, ok := m.(tokenizer.Tokenizer)
|
||||
if !ok {
|
||||
return nil, ErrUnsupportedTokenizer
|
||||
}
|
||||
|
||||
@@ -10,11 +10,12 @@ import (
|
||||
"github.com/ollama/ollama/ml/nn/pooling"
|
||||
"github.com/ollama/ollama/model"
|
||||
"github.com/ollama/ollama/model/input"
|
||||
"github.com/ollama/ollama/tokenizer"
|
||||
)
|
||||
|
||||
type Model struct {
|
||||
model.Base
|
||||
model.TextProcessor
|
||||
tokenizer.Tokenizer
|
||||
|
||||
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
|
||||
TypeEmbedding *nn.Embedding `gguf:"token_types"`
|
||||
@@ -129,7 +130,7 @@ func (o Options) headDim() int {
|
||||
}
|
||||
|
||||
func New(c fs.Config) (model.Model, error) {
|
||||
vocab := &model.Vocabulary{
|
||||
vocab := &tokenizer.Vocabulary{
|
||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||
Scores: c.Floats("tokenizer.ggml.scores"),
|
||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||
@@ -153,17 +154,17 @@ func New(c fs.Config) (model.Model, error) {
|
||||
},
|
||||
}
|
||||
|
||||
var processor model.TextProcessor
|
||||
var t tokenizer.Tokenizer
|
||||
switch c.String("tokenizer.ggml.model", "bert") {
|
||||
case "bert":
|
||||
processor = model.NewWordPiece(vocab, true)
|
||||
t = tokenizer.NewWordPiece(vocab, true)
|
||||
default:
|
||||
return nil, model.ErrUnsupportedTokenizer
|
||||
}
|
||||
|
||||
return &Model{
|
||||
TextProcessor: processor,
|
||||
Layers: make([]EncoderLayer, c.Uint("block_count")),
|
||||
Tokenizer: t,
|
||||
Layers: make([]EncoderLayer, c.Uint("block_count")),
|
||||
Options: Options{
|
||||
hiddenSize: int(c.Uint("embedding_length")),
|
||||
numHeads: int(c.Uint("attention.head_count")),
|
||||
|
||||
@@ -13,6 +13,7 @@ import (
|
||||
"github.com/ollama/ollama/ml/nn/rope"
|
||||
"github.com/ollama/ollama/model"
|
||||
"github.com/ollama/ollama/model/input"
|
||||
"github.com/ollama/ollama/tokenizer"
|
||||
)
|
||||
|
||||
type Options struct {
|
||||
@@ -222,7 +223,7 @@ func (t *Layer) Forward(ctx ml.Context, hiddenStates, positions, outputs ml.Tens
|
||||
|
||||
type Model struct {
|
||||
model.Base
|
||||
model.BytePairEncoding
|
||||
tokenizer.Tokenizer
|
||||
|
||||
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
|
||||
Layers []Layer `gguf:"blk"`
|
||||
@@ -277,8 +278,8 @@ func New(c fs.Config) (model.Model, error) {
|
||||
}
|
||||
|
||||
m := Model{
|
||||
BytePairEncoding: model.NewBytePairEncoding(
|
||||
&model.Vocabulary{
|
||||
Tokenizer: tokenizer.NewBytePairEncoding(
|
||||
&tokenizer.Vocabulary{
|
||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||
Merges: c.Strings("tokenizer.ggml.merges"),
|
||||
|
||||
@@ -10,11 +10,12 @@ import (
|
||||
"github.com/ollama/ollama/ml/nn"
|
||||
"github.com/ollama/ollama/model"
|
||||
"github.com/ollama/ollama/model/input"
|
||||
"github.com/ollama/ollama/tokenizer"
|
||||
)
|
||||
|
||||
type Model struct {
|
||||
model.Base
|
||||
model.TextProcessor
|
||||
tokenizer.Tokenizer
|
||||
|
||||
Sam *samModel `gguf:"s"`
|
||||
Vision *visionModel `gguf:"v"`
|
||||
@@ -134,8 +135,8 @@ func init() {
|
||||
}
|
||||
|
||||
m := Model{
|
||||
TextProcessor: model.NewBytePairEncoding(
|
||||
&model.Vocabulary{
|
||||
Tokenizer: tokenizer.NewBytePairEncoding(
|
||||
&tokenizer.Vocabulary{
|
||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||
Merges: c.Strings("tokenizer.ggml.merges"),
|
||||
|
||||
@@ -10,6 +10,7 @@ import (
|
||||
"github.com/ollama/ollama/ml/nn/rope"
|
||||
"github.com/ollama/ollama/model"
|
||||
"github.com/ollama/ollama/model/input"
|
||||
"github.com/ollama/ollama/tokenizer"
|
||||
)
|
||||
|
||||
type Options struct {
|
||||
@@ -27,7 +28,7 @@ func (o Options) applyRotaryPositionEmbeddings(ctx ml.Context, states, positions
|
||||
|
||||
type Model struct {
|
||||
model.Base
|
||||
model.SentencePiece
|
||||
tokenizer.Tokenizer
|
||||
|
||||
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
|
||||
Layers []Layer `gguf:"blk"`
|
||||
@@ -43,8 +44,8 @@ const (
|
||||
|
||||
func New(c fs.Config) (model.Model, error) {
|
||||
m := Model{
|
||||
SentencePiece: model.NewSentencePiece(
|
||||
&model.Vocabulary{
|
||||
Tokenizer: tokenizer.NewSentencePiece(
|
||||
&tokenizer.Vocabulary{
|
||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||
Scores: c.Floats("tokenizer.ggml.scores"),
|
||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||
|
||||
@@ -7,11 +7,12 @@ import (
|
||||
"github.com/ollama/ollama/ml/nn/pooling"
|
||||
"github.com/ollama/ollama/model"
|
||||
"github.com/ollama/ollama/model/input"
|
||||
"github.com/ollama/ollama/tokenizer"
|
||||
)
|
||||
|
||||
type embedModel struct {
|
||||
model.Base
|
||||
model.SentencePiece
|
||||
tokenizer.Tokenizer
|
||||
|
||||
*TextModel
|
||||
poolingType pooling.Type
|
||||
@@ -31,8 +32,8 @@ func (m *embedModel) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, erro
|
||||
|
||||
func newEmbedModel(c fs.Config) (model.Model, error) {
|
||||
m := &embedModel{
|
||||
SentencePiece: model.NewSentencePiece(
|
||||
&model.Vocabulary{
|
||||
Tokenizer: tokenizer.NewSentencePiece(
|
||||
&tokenizer.Vocabulary{
|
||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||
Scores: c.Floats("tokenizer.ggml.scores"),
|
||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||
|
||||
@@ -12,11 +12,12 @@ import (
|
||||
"github.com/ollama/ollama/ml/nn"
|
||||
"github.com/ollama/ollama/model"
|
||||
"github.com/ollama/ollama/model/input"
|
||||
"github.com/ollama/ollama/tokenizer"
|
||||
)
|
||||
|
||||
type Model struct {
|
||||
model.Base
|
||||
model.TextProcessor
|
||||
tokenizer.Tokenizer
|
||||
|
||||
*VisionModel `gguf:"v"`
|
||||
*TextModel
|
||||
@@ -54,7 +55,7 @@ func (p *MultiModalProjector) Forward(ctx ml.Context, visionOutputs ml.Tensor, i
|
||||
}
|
||||
|
||||
func New(c fs.Config) (model.Model, error) {
|
||||
vocabulary := model.Vocabulary{
|
||||
vocabulary := tokenizer.Vocabulary{
|
||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||
Scores: c.Floats("tokenizer.ggml.scores"),
|
||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||
@@ -70,19 +71,19 @@ func New(c fs.Config) (model.Model, error) {
),
}

var processor model.TextProcessor
var t tokenizer.Tokenizer
switch c.String("tokenizer.ggml.model") {
case "gpt2":
processor = model.NewBytePairEncoding(&vocabulary)
t = tokenizer.NewBytePairEncoding(&vocabulary)
default:
// Previous uploads of Gemma 3 on Ollama did not have token 106
// (i.e. "<end_of_turn>") so we need to add in case it's not already present
vocabulary.EOS = append(vocabulary.EOS, int32(c.Uint("tokenizer.ggml.eot_token_id", 106)))
processor = model.NewSentencePiece(&vocabulary)
t = tokenizer.NewSentencePiece(&vocabulary)
}

m := Model{
TextProcessor: processor,
Tokenizer: t,
ImageProcessor: newImageProcessor(c),
VisionModel: newVisionModel(c),
TextModel: newTextModel(c),

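The gemma3 hunk above picks the tokenizer implementation from the GGUF metadata key tokenizer.ggml.model. A runnable sketch of that selection logic with a stub config; stubConfig and the returned strings are illustrative only, while the real code constructs tokenizer.NewBytePairEncoding or tokenizer.NewSentencePiece values as shown above.

package main

import "fmt"

// stubConfig stands in for fs.Config in this sketch; only the single accessor
// used by the selection logic is modeled.
type stubConfig map[string]string

func (c stubConfig) String(key string, def ...string) string {
	if v, ok := c[key]; ok {
		return v
	}
	if len(def) > 0 {
		return def[0]
	}
	return ""
}

// tokenizerKind mirrors the switch in the gemma3 hunk: "gpt2" selects a
// byte-pair-encoding tokenizer, anything else falls through to SentencePiece.
func tokenizerKind(c stubConfig) string {
	switch c.String("tokenizer.ggml.model") {
	case "gpt2":
		return "byte-pair encoding"
	default:
		return "sentencepiece"
	}
}

func main() {
	fmt.Println(tokenizerKind(stubConfig{"tokenizer.ggml.model": "gpt2"}))
	fmt.Println(tokenizerKind(stubConfig{})) // gemma3 default: sentencepiece
}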
@@ -6,11 +6,12 @@ import (
|
||||
"github.com/ollama/ollama/ml"
|
||||
"github.com/ollama/ollama/model"
|
||||
"github.com/ollama/ollama/model/input"
|
||||
"github.com/ollama/ollama/tokenizer"
|
||||
)
|
||||
|
||||
type Model struct {
|
||||
model.Base
|
||||
model.SentencePiece
|
||||
tokenizer.Tokenizer
|
||||
|
||||
*TextModel
|
||||
}
|
||||
@@ -23,8 +24,8 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
|
||||
func New(c fs.Config) (model.Model, error) {
|
||||
m := Model{
|
||||
TextModel: newTextModel(c),
|
||||
SentencePiece: model.NewSentencePiece(
|
||||
&model.Vocabulary{
|
||||
Tokenizer: tokenizer.NewSentencePiece(
|
||||
&tokenizer.Vocabulary{
|
||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||
Scores: c.Floats("tokenizer.ggml.scores"),
|
||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||
|
||||
@@ -10,6 +10,7 @@ import (
|
||||
"github.com/ollama/ollama/ml/nn"
|
||||
"github.com/ollama/ollama/model"
|
||||
"github.com/ollama/ollama/model/input"
|
||||
"github.com/ollama/ollama/tokenizer"
|
||||
)
|
||||
|
||||
var ErrOldModelFormat = errors.New("this model uses a weight format that is no longer supported; please re-download it")
|
||||
@@ -198,7 +199,7 @@ func (t *Layer) Forward(ctx ml.Context, hiddenStates, positions, outputs ml.Tens
|
||||
|
||||
type Model struct {
|
||||
model.Base
|
||||
model.BytePairEncoding
|
||||
tokenizer.Tokenizer
|
||||
|
||||
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
|
||||
Layers []Layer `gguf:"blk"`
|
||||
@@ -236,8 +237,8 @@ func New(c fs.Config) (model.Model, error) {
|
||||
}
|
||||
|
||||
m := Model{
|
||||
BytePairEncoding: model.NewBytePairEncoding(
|
||||
&model.Vocabulary{
|
||||
Tokenizer: tokenizer.NewBytePairEncoding(
|
||||
&tokenizer.Vocabulary{
|
||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||
Merges: c.Strings("tokenizer.ggml.merges"),
|
||||
|
||||
@@ -12,11 +12,12 @@ import (
|
||||
"github.com/ollama/ollama/ml/nn/rope"
|
||||
"github.com/ollama/ollama/model"
|
||||
"github.com/ollama/ollama/model/input"
|
||||
"github.com/ollama/ollama/tokenizer"
|
||||
)
|
||||
|
||||
type Transformer struct {
|
||||
model.Base
|
||||
model.BytePairEncoding
|
||||
tokenizer.Tokenizer
|
||||
|
||||
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
|
||||
TransformerBlocks []TransformerBlock `gguf:"blk"`
|
||||
@@ -196,8 +197,8 @@ func (mlp *MLPBlock) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *Optio
|
||||
func New(c fs.Config) (model.Model, error) {
|
||||
m := Transformer{
|
||||
TransformerBlocks: make([]TransformerBlock, c.Uint("block_count")),
|
||||
BytePairEncoding: model.NewBytePairEncoding(
|
||||
&model.Vocabulary{
|
||||
Tokenizer: tokenizer.NewBytePairEncoding(
|
||||
&tokenizer.Vocabulary{
|
||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||
Merges: c.Strings("tokenizer.ggml.merges"),
|
||||
|
||||
@@ -10,6 +10,7 @@ import (
|
||||
"github.com/ollama/ollama/ml/nn/rope"
|
||||
"github.com/ollama/ollama/model"
|
||||
"github.com/ollama/ollama/model/input"
|
||||
"github.com/ollama/ollama/tokenizer"
|
||||
)
|
||||
|
||||
type Options struct {
|
||||
@@ -59,7 +60,7 @@ func (o Options) applyRotaryPositionEmbeddings(ctx ml.Context, states, positions
|
||||
|
||||
type Model struct {
|
||||
model.Base
|
||||
model.TextProcessor
|
||||
tokenizer.Tokenizer
|
||||
|
||||
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
|
||||
Layers []Layer `gguf:"blk"`
|
||||
@@ -78,7 +79,7 @@ func New(c fs.Config) (model.Model, error) {
|
||||
return nil, model.ErrUnsupportedTokenizer
|
||||
}
|
||||
|
||||
vocabulary := model.Vocabulary{
|
||||
vocabulary := tokenizer.Vocabulary{
|
||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||
Scores: c.Floats("tokenizer.ggml.scores"),
|
||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||
@@ -104,8 +105,8 @@ func New(c fs.Config) (model.Model, error) {
|
||||
}
|
||||
|
||||
m := Model{
|
||||
TextProcessor: model.NewBytePairEncoding(&vocabulary, pretokenizers...),
|
||||
Layers: make([]Layer, c.Uint("block_count")),
|
||||
Tokenizer: tokenizer.NewBytePairEncoding(&vocabulary, pretokenizers...),
|
||||
Layers: make([]Layer, c.Uint("block_count")),
|
||||
Options: Options{
|
||||
hiddenSize: int(c.Uint("embedding_length")),
|
||||
headDim: int(c.Uint("attention.key_length")),
|
||||
|
||||
@@ -11,6 +11,7 @@ import (
|
||||
"github.com/ollama/ollama/ml/nn/rope"
|
||||
"github.com/ollama/ollama/model"
|
||||
"github.com/ollama/ollama/model/input"
|
||||
"github.com/ollama/ollama/tokenizer"
|
||||
)
|
||||
|
||||
type Options struct {
|
||||
@@ -25,7 +26,7 @@ func (o Options) applyRotaryPositionEmbeddings(ctx ml.Context, states, positions
|
||||
|
||||
type Model struct {
|
||||
model.Base
|
||||
model.TextProcessor
|
||||
tokenizer.Tokenizer
|
||||
|
||||
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
|
||||
Layers []Layer `gguf:"blk"`
|
||||
@@ -41,8 +42,8 @@ func New(c fs.Config) (model.Model, error) {
|
||||
return nil, model.ErrUnsupportedModel
|
||||
}
|
||||
|
||||
var processor model.TextProcessor
|
||||
vocabulary := model.Vocabulary{
|
||||
var processor tokenizer.Tokenizer
|
||||
vocabulary := tokenizer.Vocabulary{
|
||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||
Scores: c.Floats("tokenizer.ggml.scores"),
|
||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||
@@ -80,16 +81,16 @@ func New(c fs.Config) (model.Model, error) {
|
||||
"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
||||
}
|
||||
}
|
||||
processor = model.NewBytePairEncoding(&vocabulary, pretokenizers...)
|
||||
processor = tokenizer.NewBytePairEncoding(&vocabulary, pretokenizers...)
|
||||
case "llama":
|
||||
processor = model.NewSentencePiece(&vocabulary)
|
||||
processor = tokenizer.NewSentencePiece(&vocabulary)
|
||||
default:
|
||||
return nil, model.ErrUnsupportedTokenizer
|
||||
}
|
||||
|
||||
m := Model{
|
||||
TextProcessor: processor,
|
||||
Layers: make([]Layer, c.Uint("block_count")),
|
||||
Tokenizer: processor,
|
||||
Layers: make([]Layer, c.Uint("block_count")),
|
||||
Options: Options{
|
||||
hiddenSize: int(c.Uint("embedding_length")),
|
||||
numHeads: int(c.Uint("attention.head_count")),
|
||||
|
||||
@@ -11,11 +11,12 @@ import (
|
||||
"github.com/ollama/ollama/ml/nn"
|
||||
"github.com/ollama/ollama/model"
|
||||
"github.com/ollama/ollama/model/input"
|
||||
"github.com/ollama/ollama/tokenizer"
|
||||
)
|
||||
|
||||
type Model struct {
|
||||
model.Base
|
||||
model.BytePairEncoding
|
||||
tokenizer.Tokenizer
|
||||
ImageProcessor
|
||||
|
||||
*VisionModel `gguf:"v"`
|
||||
@@ -33,8 +34,8 @@ func (p *Projector) Forward(ctx ml.Context, visionOutputs ml.Tensor) ml.Tensor {
|
||||
|
||||
func New(c fs.Config) (model.Model, error) {
|
||||
m := Model{
|
||||
BytePairEncoding: model.NewBytePairEncoding(
|
||||
&model.Vocabulary{
|
||||
Tokenizer: tokenizer.NewBytePairEncoding(
|
||||
&tokenizer.Vocabulary{
|
||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||
Merges: c.Strings("tokenizer.ggml.merges"),
|
||||
|
||||
@@ -11,11 +11,12 @@ import (
|
||||
"github.com/ollama/ollama/ml/nn"
|
||||
"github.com/ollama/ollama/model"
|
||||
"github.com/ollama/ollama/model/input"
|
||||
"github.com/ollama/ollama/tokenizer"
|
||||
)
|
||||
|
||||
type Model struct {
|
||||
model.Base
|
||||
model.BytePairEncoding
|
||||
tokenizer.Tokenizer
|
||||
|
||||
*TextModel
|
||||
*VisionModel `gguf:"v"`
|
||||
@@ -28,12 +29,12 @@ type Model struct {
|
||||
var _ model.MultimodalProcessor = (*Model)(nil)
|
||||
|
||||
// Implement TextProcessor interface
|
||||
var _ model.TextProcessor = (*Model)(nil)
|
||||
var _ tokenizer.Tokenizer = (*Model)(nil)
|
||||
|
||||
func New(c fs.Config) (model.Model, error) {
|
||||
m := &Model{
|
||||
BytePairEncoding: model.NewBytePairEncoding(
|
||||
&model.Vocabulary{
|
||||
Tokenizer: tokenizer.NewBytePairEncoding(
|
||||
&tokenizer.Vocabulary{
|
||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||
Merges: c.Strings("tokenizer.ggml.merges"),
|
||||
|
||||
@@ -11,11 +11,12 @@ import (
|
||||
"github.com/ollama/ollama/ml/nn"
|
||||
"github.com/ollama/ollama/model"
|
||||
"github.com/ollama/ollama/model/input"
|
||||
"github.com/ollama/ollama/tokenizer"
|
||||
)
|
||||
|
||||
type Model struct {
|
||||
model.Base
|
||||
model.BytePairEncoding
|
||||
tokenizer.Tokenizer
|
||||
|
||||
*VisionModel `gguf:"v"`
|
||||
*TextModel
|
||||
@@ -32,8 +33,8 @@ const (
|
||||
|
||||
func New(c fs.Config) (model.Model, error) {
|
||||
m := Model{
|
||||
BytePairEncoding: model.NewBytePairEncoding(
|
||||
&model.Vocabulary{
|
||||
Tokenizer: tokenizer.NewBytePairEncoding(
|
||||
&tokenizer.Vocabulary{
|
||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||
Merges: c.Strings("tokenizer.ggml.merges"),
|
||||
|
||||
@@ -11,11 +11,12 @@ import (
|
||||
"github.com/ollama/ollama/ml/nn/rope"
|
||||
"github.com/ollama/ollama/model"
|
||||
"github.com/ollama/ollama/model/input"
|
||||
"github.com/ollama/ollama/tokenizer"
|
||||
)
|
||||
|
||||
type Model struct {
|
||||
model.Base
|
||||
model.TextProcessor
|
||||
tokenizer.Tokenizer
|
||||
|
||||
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
|
||||
TypeEmbedding *nn.Embedding `gguf:"token_types"`
|
||||
@@ -178,8 +179,8 @@ func New(c fs.Config) (model.Model, error) {
|
||||
numHeads := int(c.Uint("attention.head_count"))
|
||||
headDim := hiddenSize / numHeads
|
||||
|
||||
processor := model.NewWordPiece(
|
||||
&model.Vocabulary{
|
||||
tokenizer := tokenizer.NewWordPiece(
|
||||
&tokenizer.Vocabulary{
|
||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||
Scores: c.Floats("tokenizer.ggml.scores"),
|
||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||
@@ -219,8 +220,8 @@ func New(c fs.Config) (model.Model, error) {
|
||||
}
|
||||
|
||||
return &Model{
|
||||
TextProcessor: processor,
|
||||
Layers: layers,
|
||||
Tokenizer: tokenizer,
|
||||
Layers: layers,
|
||||
Options: Options{
|
||||
hiddenSize: hiddenSize,
|
||||
numHeads: numHeads,
|
||||
|
||||
@@ -11,6 +11,7 @@ import (
|
||||
"github.com/ollama/ollama/ml/nn/rope"
|
||||
"github.com/ollama/ollama/model"
|
||||
"github.com/ollama/ollama/model/input"
|
||||
"github.com/ollama/ollama/tokenizer"
|
||||
)
|
||||
|
||||
const (
|
||||
@@ -33,7 +34,7 @@ type Options struct {
|
||||
|
||||
type Model struct {
|
||||
model.Base
|
||||
model.TextProcessor
|
||||
tokenizer.Tokenizer
|
||||
|
||||
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
|
||||
Layers []Layer `gguf:"blk"`
|
||||
@@ -44,7 +45,7 @@ type Model struct {
|
||||
}
|
||||
|
||||
func New(c fs.Config) (model.Model, error) {
|
||||
vocabulary := model.Vocabulary{
|
||||
vocabulary := tokenizer.Vocabulary{
|
||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||
Scores: c.Floats("tokenizer.ggml.scores"),
|
||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||
@@ -58,14 +59,14 @@ func New(c fs.Config) (model.Model, error) {
|
||||
),
|
||||
}
|
||||
|
||||
processor := model.NewBytePairEncoding(
|
||||
tokenizer := tokenizer.NewBytePairEncoding(
|
||||
&vocabulary,
|
||||
"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
||||
)
|
||||
|
||||
m := Model{
|
||||
TextProcessor: processor,
|
||||
Layers: make([]Layer, c.Uint("block_count")),
|
||||
Tokenizer: tokenizer,
|
||||
Layers: make([]Layer, c.Uint("block_count")),
|
||||
Options: Options{
|
||||
hiddenSize: int(c.Uint("embedding_length")),
|
||||
numHeads: int(c.Uint("attention.head_count")),
|
||||
|
||||
@@ -13,6 +13,7 @@ import (
|
||||
"github.com/ollama/ollama/ml/nn/rope"
|
||||
"github.com/ollama/ollama/model"
|
||||
"github.com/ollama/ollama/model/input"
|
||||
"github.com/ollama/ollama/tokenizer"
|
||||
)
|
||||
|
||||
type Options struct {
|
||||
@@ -92,7 +93,7 @@ func (d DecoderLayer) Forward(ctx ml.Context, hiddenStates, positions, outputs m
|
||||
|
||||
type Model struct {
|
||||
model.Base
|
||||
model.BytePairEncoding
|
||||
tokenizer.Tokenizer
|
||||
|
||||
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
|
||||
Layers []DecoderLayer `gguf:"blk"`
|
||||
@@ -139,8 +140,8 @@ func New(c fs.Config) (model.Model, error) {
|
||||
}
|
||||
m := Model{
|
||||
Layers: make([]DecoderLayer, c.Uint("block_count")),
|
||||
BytePairEncoding: model.NewBytePairEncoding(
|
||||
&model.Vocabulary{
|
||||
Tokenizer: tokenizer.NewBytePairEncoding(
|
||||
&tokenizer.Vocabulary{
|
||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||
Merges: c.Strings("tokenizer.ggml.merges"),
|
||||
|
||||
@@ -10,11 +10,12 @@ import (
|
||||
"github.com/ollama/ollama/ml"
|
||||
"github.com/ollama/ollama/model"
|
||||
"github.com/ollama/ollama/model/input"
|
||||
"github.com/ollama/ollama/tokenizer"
|
||||
)
|
||||
|
||||
type Model struct {
|
||||
model.Base
|
||||
model.BytePairEncoding
|
||||
tokenizer.Tokenizer
|
||||
|
||||
*TextModel
|
||||
*VisionModel `gguf:"v"`
|
||||
@@ -27,8 +28,8 @@ var _ model.MultimodalProcessor = (*Model)(nil)
|
||||
|
||||
func New(c fs.Config) (model.Model, error) {
|
||||
m := &Model{
|
||||
BytePairEncoding: model.NewBytePairEncoding(
|
||||
&model.Vocabulary{
|
||||
Tokenizer: tokenizer.NewBytePairEncoding(
|
||||
&tokenizer.Vocabulary{
|
||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||
Merges: c.Strings("tokenizer.ggml.merges"),
|
||||
|
||||
@@ -7,11 +7,12 @@ import (
|
||||
"github.com/ollama/ollama/ml/nn/pooling"
|
||||
"github.com/ollama/ollama/model"
|
||||
"github.com/ollama/ollama/model/input"
|
||||
"github.com/ollama/ollama/tokenizer"
|
||||
)
|
||||
|
||||
type embedModel struct {
|
||||
model.Base
|
||||
model.BytePairEncoding
|
||||
tokenizer.Tokenizer
|
||||
|
||||
*Model
|
||||
poolingType pooling.Type
|
||||
@@ -34,8 +35,8 @@ func newEmbed(c fs.Config) (model.Model, error) {
|
||||
layers[i].MLP = &dense{}
|
||||
}
|
||||
m := embedModel{
|
||||
BytePairEncoding: model.NewBytePairEncoding(
|
||||
&model.Vocabulary{
|
||||
Tokenizer: tokenizer.NewBytePairEncoding(
|
||||
&tokenizer.Vocabulary{
|
||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||
Merges: c.Strings("tokenizer.ggml.merges"),
|
||||
|
||||
@@ -12,6 +12,7 @@ import (
|
||||
"github.com/ollama/ollama/ml/nn/rope"
|
||||
"github.com/ollama/ollama/model"
|
||||
"github.com/ollama/ollama/model/input"
|
||||
"github.com/ollama/ollama/tokenizer"
|
||||
)
|
||||
|
||||
type Options struct {
|
||||
@@ -159,7 +160,7 @@ func (d *Layer) Forward(ctx ml.Context, hiddenStates, positions, outputs ml.Tens
|
||||
|
||||
type Model struct {
|
||||
model.Base
|
||||
model.BytePairEncoding
|
||||
tokenizer.Tokenizer
|
||||
|
||||
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
|
||||
OutputNorm *nn.RMSNorm `gguf:"output_norm"`
|
||||
@@ -218,8 +219,8 @@ func New(c fs.Config) (model.Model, error) {
|
||||
}
|
||||
|
||||
m := Model{
|
||||
BytePairEncoding: model.NewBytePairEncoding(
|
||||
&model.Vocabulary{
|
||||
Tokenizer: tokenizer.NewBytePairEncoding(
|
||||
&tokenizer.Vocabulary{
|
||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||
Merges: c.Strings("tokenizer.ggml.merges"),
|
||||
|
||||
@@ -10,11 +10,12 @@ import (
|
||||
"github.com/ollama/ollama/ml"
|
||||
"github.com/ollama/ollama/model"
|
||||
"github.com/ollama/ollama/model/input"
|
||||
"github.com/ollama/ollama/tokenizer"
|
||||
)
|
||||
|
||||
type Model struct {
|
||||
model.Base
|
||||
model.TextProcessor
|
||||
tokenizer.Tokenizer
|
||||
|
||||
*TextModel
|
||||
*VisionModel `gguf:"v"`
|
||||
@@ -172,8 +173,8 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
|
||||
|
||||
func New(c fs.Config) (model.Model, error) {
|
||||
m := Model{
|
||||
TextProcessor: model.NewBytePairEncoding(
|
||||
&model.Vocabulary{
|
||||
Tokenizer: tokenizer.NewBytePairEncoding(
|
||||
&tokenizer.Vocabulary{
|
||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||
Merges: c.Strings("tokenizer.ggml.merges"),
|
||||
|
||||
@@ -4,7 +4,6 @@ import (
"encoding/json"
"fmt"
"strings"
"unicode"

"github.com/ollama/ollama/api"
)
@@ -18,34 +17,12 @@ const (
ministralCollectingToolArgs
)

// ministralEvent represents an event emitted during parsing
type ministralEvent interface {
isMinistralEvent()
}

type ministralEventContent struct {
content string
}

type ministralEventThinking struct {
thinking string
}

type ministralEventToolCall struct {
name string
args string // raw JSON string
}

func (ministralEventContent) isMinistralEvent() {}
func (ministralEventThinking) isMinistralEvent() {}
func (ministralEventToolCall) isMinistralEvent() {}

type MinistralParser struct {
state ministralParserState
buffer strings.Builder
tools []api.Tool
hasThinkingSupport bool
pendingToolName string // stores tool name while collecting args
currentTool *api.Tool
}

func (p *MinistralParser) HasToolSupport() bool {
@@ -86,251 +63,74 @@ func toolByName(tools []api.Tool, n string) (*api.Tool, error) {
return nil, fmt.Errorf("tool '%s' not found", n)
}

const (
|
||||
ministralToolCallsTag = "[TOOL_CALLS]"
|
||||
ministralThinkTag = "[THINK]"
|
||||
ministralThinkEndTag = "[/THINK]"
|
||||
ministralArgsTag = "[ARGS]"
|
||||
)
|
||||
|
||||
// eat consumes the parser's buffer, and returns a list of any unambiguous
|
||||
// events from the current parser state. The second return value indicates
|
||||
// whether to keep looping (true when state transitions, false when waiting
|
||||
// for more data).
|
||||
func (p *MinistralParser) eat() ([]ministralEvent, bool) {
|
||||
var events []ministralEvent
|
||||
|
||||
switch p.state {
|
||||
case ministralCollectingContent:
|
||||
bufStr := p.buffer.String()
|
||||
|
||||
// Check for [TOOL_CALLS] tag
|
||||
if strings.Contains(bufStr, ministralToolCallsTag) {
|
||||
split := strings.SplitN(bufStr, ministralToolCallsTag, 2)
|
||||
before := strings.TrimRightFunc(split[0], unicode.IsSpace)
|
||||
if len(before) > 0 {
|
||||
events = append(events, ministralEventContent{content: before})
|
||||
}
|
||||
after := split[1]
|
||||
p.buffer.Reset()
|
||||
p.buffer.WriteString(after)
|
||||
p.state = ministralCollectingToolName
|
||||
return events, true
|
||||
}
|
||||
|
||||
// Check for [THINK] tag
|
||||
if strings.Contains(bufStr, ministralThinkTag) {
|
||||
split := strings.SplitN(bufStr, ministralThinkTag, 2)
|
||||
before := strings.TrimRightFunc(split[0], unicode.IsSpace)
|
||||
if len(before) > 0 {
|
||||
events = append(events, ministralEventContent{content: before})
|
||||
}
|
||||
after := split[1]
|
||||
p.buffer.Reset()
|
||||
p.buffer.WriteString(after)
|
||||
p.state = ministralCollectingThinkingContent
|
||||
return events, true
|
||||
}
|
||||
|
||||
// Check for partial tag overlap with [TOOL_CALLS] or [THINK]
|
||||
overlapToolCalls := overlap(bufStr, ministralToolCallsTag)
|
||||
overlapThink := overlap(bufStr, ministralThinkTag)
|
||||
maxOverlap := max(overlapToolCalls, overlapThink)
|
||||
|
||||
if maxOverlap > 0 {
|
||||
// Withhold the potential partial tag
|
||||
beforePartialTag := bufStr[:len(bufStr)-maxOverlap]
|
||||
trailingWS := trailingWhitespaceLen(beforePartialTag)
|
||||
ambiguousStart := len(beforePartialTag) - trailingWS
|
||||
unambiguous := bufStr[:ambiguousStart]
|
||||
ambiguous := bufStr[ambiguousStart:]
|
||||
p.buffer.Reset()
|
||||
p.buffer.WriteString(ambiguous)
|
||||
if len(unambiguous) > 0 {
|
||||
events = append(events, ministralEventContent{content: unambiguous})
|
||||
}
|
||||
return events, false
|
||||
}
|
||||
|
||||
// No tag found: emit content but withhold trailing whitespace
|
||||
whitespaceLen := trailingWhitespaceLen(bufStr)
|
||||
ambiguousStart := len(bufStr) - whitespaceLen
|
||||
unambiguous := bufStr[:ambiguousStart]
|
||||
ambiguous := bufStr[ambiguousStart:]
|
||||
p.buffer.Reset()
|
||||
p.buffer.WriteString(ambiguous)
|
||||
if len(unambiguous) > 0 {
|
||||
events = append(events, ministralEventContent{content: unambiguous})
|
||||
}
|
||||
return events, false
|
||||
|
||||
case ministralCollectingThinkingContent:
|
||||
bufStr := p.buffer.String()
|
||||
|
||||
if strings.Contains(bufStr, ministralThinkEndTag) {
|
||||
split := strings.SplitN(bufStr, ministralThinkEndTag, 2)
|
||||
thinkingContent := split[0]
|
||||
after := strings.TrimLeftFunc(split[1], unicode.IsSpace)
|
||||
p.buffer.Reset()
|
||||
p.buffer.WriteString(after)
|
||||
if len(thinkingContent) > 0 {
|
||||
events = append(events, ministralEventThinking{thinking: thinkingContent})
|
||||
}
|
||||
p.state = ministralCollectingContent
|
||||
return events, true
|
||||
}
|
||||
|
||||
// Check for partial overlap with [/THINK]
|
||||
if overlapLen := overlap(bufStr, ministralThinkEndTag); overlapLen > 0 {
|
||||
unambiguous := bufStr[:len(bufStr)-overlapLen]
|
||||
ambiguous := bufStr[len(bufStr)-overlapLen:]
|
||||
p.buffer.Reset()
|
||||
p.buffer.WriteString(ambiguous)
|
||||
if len(unambiguous) > 0 {
|
||||
events = append(events, ministralEventThinking{thinking: unambiguous})
|
||||
}
|
||||
return events, false
|
||||
}
|
||||
|
||||
// No tag found: emit all thinking content
|
||||
p.buffer.Reset()
|
||||
if len(bufStr) > 0 {
|
||||
events = append(events, ministralEventThinking{thinking: bufStr})
|
||||
}
|
||||
return events, false
|
||||
|
||||
case ministralCollectingToolName:
|
||||
bufStr := p.buffer.String()
|
||||
|
||||
if strings.Contains(bufStr, ministralArgsTag) {
|
||||
split := strings.SplitN(bufStr, ministralArgsTag, 2)
|
||||
toolName := split[0]
|
||||
after := split[1]
|
||||
p.pendingToolName = toolName
|
||||
p.buffer.Reset()
|
||||
p.buffer.WriteString(after)
|
||||
p.state = ministralCollectingToolArgs
|
||||
return events, true
|
||||
}
|
||||
// Wait for more data
|
||||
return events, false
|
||||
|
||||
case ministralCollectingToolArgs:
|
||||
bufStr := p.buffer.String()
|
||||
jsonEnd := findJSONEnd(bufStr)
|
||||
|
||||
if jsonEnd != -1 {
|
||||
jsonStr := bufStr[:jsonEnd+1]
|
||||
remaining := bufStr[jsonEnd+1:]
|
||||
|
||||
events = append(events, ministralEventToolCall{
|
||||
name: p.pendingToolName,
|
||||
args: jsonStr,
|
||||
})
|
||||
|
||||
p.pendingToolName = ""
|
||||
p.buffer.Reset()
|
||||
p.buffer.WriteString(remaining)
|
||||
p.state = ministralCollectingContent
|
||||
return events, true
|
||||
}
|
||||
// Wait for more data
|
||||
return events, false
|
||||
|
||||
default:
|
||||
panic("unexpected ministral event")
|
||||
}
|
||||
}
|
||||
|
||||
// parseEvents loops calling eat() until it returns false
|
||||
func (p *MinistralParser) parseEvents() []ministralEvent {
|
||||
var all []ministralEvent
|
||||
keepLooping := true
|
||||
for keepLooping {
|
||||
var events []ministralEvent
|
||||
events, keepLooping = p.eat()
|
||||
all = append(all, events...)
|
||||
}
|
||||
return all
|
||||
}
|
||||
|
||||
func (p *MinistralParser) Add(s string, done bool) (content string, thinking string, calls []api.ToolCall, err error) {
|
||||
p.buffer.WriteString(s)
|
||||
|
||||
events := p.parseEvents()
|
||||
|
||||
var contentBuilder, thinkingBuilder strings.Builder
|
||||
var toolCalls []api.ToolCall
|
||||
|
||||
for _, event := range events {
|
||||
switch e := event.(type) {
|
||||
case ministralEventContent:
|
||||
contentBuilder.WriteString(e.content)
|
||||
case ministralEventThinking:
|
||||
thinkingBuilder.WriteString(e.thinking)
|
||||
case ministralEventToolCall:
|
||||
// Validate tool exists
|
||||
tool, toolErr := toolByName(p.tools, e.name)
|
||||
if toolErr != nil {
|
||||
return contentBuilder.String(), thinkingBuilder.String(), toolCalls, toolErr
|
||||
switch p.state {
|
||||
case ministralCollectingContent:
|
||||
if strings.Contains(p.buffer.String(), "[TOOL_CALLS]") {
|
||||
before, _ := splitAtTag(&p.buffer, "[TOOL_CALLS]", false)
|
||||
if before != "" {
|
||||
return before, "", calls, nil
|
||||
}
|
||||
// Parse JSON arguments
|
||||
p.state = ministralCollectingToolName
|
||||
} else if strings.Contains(p.buffer.String(), "[THINK]") {
|
||||
p.state = ministralCollectingThinkingContent
|
||||
return "", "", calls, nil
|
||||
} else {
|
||||
p.buffer.Reset()
|
||||
return s, "", calls, nil
|
||||
}
|
||||
case ministralCollectingThinkingContent:
|
||||
if strings.Contains(p.buffer.String(), "[/THINK]") {
|
||||
thinkingContent, after := splitAtTag(&p.buffer, "[/THINK]", true)
|
||||
p.state = ministralCollectingContent
|
||||
if after != "" {
|
||||
p.buffer.Reset()
|
||||
return after, thinkingContent, calls, nil
|
||||
}
|
||||
return "", thinkingContent, calls, nil
|
||||
} else {
|
||||
p.buffer.Reset()
|
||||
return "", s, calls, nil
|
||||
}
|
||||
case ministralCollectingToolName:
|
||||
if strings.Contains(p.buffer.String(), "[ARGS]") {
|
||||
name, _ := splitAtTag(&p.buffer, "[ARGS]", false)
|
||||
|
||||
t, err := toolByName(p.tools, name)
|
||||
if err != nil {
|
||||
return "", "", calls, err
|
||||
}
|
||||
p.currentTool = t
|
||||
p.state = ministralCollectingToolArgs
|
||||
return "", "", calls, nil
|
||||
}
|
||||
return "", "", calls, nil
|
||||
case ministralCollectingToolArgs:
|
||||
if strings.Contains(p.buffer.String(), "}") {
|
||||
before, _ := splitAtTag(&p.buffer, "}", false)
|
||||
before += "}"
|
||||
|
||||
var args api.ToolCallFunctionArguments
|
||||
if jsonErr := json.Unmarshal([]byte(e.args), &args); jsonErr != nil {
|
||||
return contentBuilder.String(), thinkingBuilder.String(), toolCalls, jsonErr
|
||||
if err := json.Unmarshal([]byte(before), &args); err != nil {
|
||||
// TODO: return a more descriptive error when the tool-call arguments are not valid JSON
|
||||
return "", "", calls, err
|
||||
}
|
||||
toolCalls = append(toolCalls, api.ToolCall{
|
||||
|
||||
p.state = ministralCollectingContent
|
||||
|
||||
call := api.ToolCall{
|
||||
Function: api.ToolCallFunction{
|
||||
Name: tool.Function.Name,
|
||||
Name: p.currentTool.Function.Name,
|
||||
Arguments: args,
|
||||
},
|
||||
})
|
||||
}
|
||||
calls = append(calls, call)
|
||||
return "", "", calls, nil
|
||||
}
|
||||
return "", "", calls, nil
|
||||
}
|
||||
|
||||
return contentBuilder.String(), thinkingBuilder.String(), toolCalls, nil
|
||||
}
|
||||
|
||||
// findJSONEnd finds the index of the closing brace that completes a JSON object.
// It properly handles nested objects, arrays, and strings (including escaped characters).
// Returns -1 if the JSON is not yet complete.
func findJSONEnd(s string) int {
depth := 0
inString := false
escaped := false

for i, r := range s {
if inString {
switch {
case escaped:
// If the previous character was a backslash, skip this character
escaped = false
case r == '\\':
// Mark the next character as escaped
escaped = true
case r == '"':
// End of string literal
inString = false
}
continue
}

switch r {
case '"':
// Start of string literal
inString = true
case '{', '[':
// Increase nesting level for objects and arrays
depth++
case '}', ']':
// Decrease nesting level
depth--
if depth == 0 {
// Reached the end of the root JSON structure
return i
}
}
}

return -1
return p.buffer.String(), thinking, calls, nil
}

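Taken together, the ministral.go hunks replace the event-based eat/parseEvents machinery with a smaller state machine driven directly from Add. A hedged sketch of how a caller might stream model output into the parser; the parsers import path and the Init arguments mirror the test file below and are assumptions, not part of this diff.

package main

import (
	"fmt"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/model/parsers" // assumed import path for MinistralParser
)

func main() {
	tools := []api.Tool{{Function: api.ToolFunction{Name: "get_weather"}}}

	p := &parsers.MinistralParser{}
	p.Init(tools, nil, nil) // mirrors the test setup; the nil arguments are not shown in this diff

	// Feed streamed chunks; Add returns whatever content, thinking text and
	// completed tool calls it can emit so far.
	chunks := []string{"[TOOL_CALLS]get_weather", `[ARGS]{"location": "NYC"}`}
	for _, chunk := range chunks {
		content, thinking, calls, err := p.Add(chunk, false)
		if err != nil {
			panic(err)
		}
		fmt.Println(content, thinking, calls)
	}
}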
@@ -1,545 +0,0 @@
|
||||
package parsers
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
)
|
||||
|
||||
func TestMinistralParserStreaming(t *testing.T) {
|
||||
type step struct {
|
||||
input string
|
||||
wantEvents []ministralEvent
|
||||
}
|
||||
|
||||
cases := []struct {
|
||||
desc string
|
||||
tools []api.Tool
|
||||
steps []step
|
||||
think bool // whether to enable thinking support
|
||||
}{
|
||||
// Content streaming
|
||||
{
|
||||
desc: "simple content",
|
||||
steps: []step{
|
||||
{input: "Hello, how can I help you?", wantEvents: []ministralEvent{
|
||||
ministralEventContent{content: "Hello, how can I help you?"},
|
||||
}},
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "streaming content word by word",
|
||||
steps: []step{
|
||||
{input: "Hello,", wantEvents: []ministralEvent{ministralEventContent{content: "Hello,"}}},
|
||||
{input: " how", wantEvents: []ministralEvent{ministralEventContent{content: " how"}}},
|
||||
{input: " can I help?", wantEvents: []ministralEvent{ministralEventContent{content: " can I help?"}}},
|
||||
},
|
||||
},
|
||||
|
||||
// Simple tool calls
|
||||
{
|
||||
desc: "simple tool call",
|
||||
tools: []api.Tool{{Function: api.ToolFunction{Name: "get_weather"}}},
|
||||
steps: []step{
|
||||
{input: `[TOOL_CALLS]get_weather[ARGS]{"location": "San Francisco"}`, wantEvents: []ministralEvent{
|
||||
ministralEventToolCall{name: "get_weather", args: `{"location": "San Francisco"}`},
|
||||
}},
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "tool call with nested object",
|
||||
tools: []api.Tool{{Function: api.ToolFunction{Name: "create_entities"}}},
|
||||
steps: []step{
|
||||
{input: `[TOOL_CALLS]create_entities[ARGS]{"entities": [{"entityType": "Person", "name": "Jack", "observations": ["Works as a baker"]}]}`, wantEvents: []ministralEvent{
|
||||
ministralEventToolCall{name: "create_entities", args: `{"entities": [{"entityType": "Person", "name": "Jack", "observations": ["Works as a baker"]}]}`},
|
||||
}},
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "tool call with deeply nested objects",
|
||||
tools: []api.Tool{{Function: api.ToolFunction{Name: "update_config"}}},
|
||||
steps: []step{
|
||||
{input: `[TOOL_CALLS]update_config[ARGS]{"settings": {"user": {"profile": {"name": "John", "age": 30}}, "theme": "dark"}}`, wantEvents: []ministralEvent{
|
||||
ministralEventToolCall{name: "update_config", args: `{"settings": {"user": {"profile": {"name": "John", "age": 30}}, "theme": "dark"}}`},
|
||||
}},
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "tool call with array of objects",
|
||||
tools: []api.Tool{{Function: api.ToolFunction{Name: "process_items"}}},
|
||||
steps: []step{
|
||||
{input: `[TOOL_CALLS]process_items[ARGS]{"items": [{"id": 1}, {"id": 2}, {"id": 3}]}`, wantEvents: []ministralEvent{
|
||||
ministralEventToolCall{name: "process_items", args: `{"items": [{"id": 1}, {"id": 2}, {"id": 3}]}`},
|
||||
}},
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "tool call with escaped quotes in string",
|
||||
tools: []api.Tool{{Function: api.ToolFunction{Name: "search"}}},
|
||||
steps: []step{
|
||||
{input: `[TOOL_CALLS]search[ARGS]{"query": "say \"hello\""}`, wantEvents: []ministralEvent{
|
||||
ministralEventToolCall{name: "search", args: `{"query": "say \"hello\""}`},
|
||||
}},
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "tool call with braces inside string",
|
||||
tools: []api.Tool{{Function: api.ToolFunction{Name: "format"}}},
|
||||
steps: []step{
|
||||
{input: `[TOOL_CALLS]format[ARGS]{"template": "Hello {name}!"}`, wantEvents: []ministralEvent{
|
||||
ministralEventToolCall{name: "format", args: `{"template": "Hello {name}!"}`},
|
||||
}},
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "empty JSON object",
|
||||
tools: []api.Tool{{Function: api.ToolFunction{Name: "no_args"}}},
|
||||
steps: []step{
|
||||
{input: `[TOOL_CALLS]no_args[ARGS]{}`, wantEvents: []ministralEvent{
|
||||
ministralEventToolCall{name: "no_args", args: `{}`},
|
||||
}},
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "JSON with newlines in string",
|
||||
tools: []api.Tool{{Function: api.ToolFunction{Name: "write"}}},
|
||||
steps: []step{
|
||||
{input: `[TOOL_CALLS]write[ARGS]{"content": "line1\nline2\nline3"}`, wantEvents: []ministralEvent{
|
||||
ministralEventToolCall{name: "write", args: `{"content": "line1\nline2\nline3"}`},
|
||||
}},
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "backslash in string value",
|
||||
tools: []api.Tool{{Function: api.ToolFunction{Name: "path"}}},
|
||||
steps: []step{
|
||||
{input: `[TOOL_CALLS]path[ARGS]{"dir": "C:\\Users\\test"}`, wantEvents: []ministralEvent{
|
||||
ministralEventToolCall{name: "path", args: `{"dir": "C:\\Users\\test"}`},
|
||||
}},
|
||||
},
|
||||
},
|
||||
|
||||
// Content after tool call
|
||||
{
|
||||
desc: "content after tool call",
|
||||
tools: []api.Tool{{Function: api.ToolFunction{Name: "test"}}},
|
||||
steps: []step{
|
||||
// NOTE: It's unclear if this is valid Ministral output, but the parser
|
||||
// currently treats text after a tool call as regular content. This test
|
||||
// documents that behavior so we notice if it changes.
|
||||
{input: `[TOOL_CALLS]test[ARGS]{"a": 1}some content after`, wantEvents: []ministralEvent{
|
||||
ministralEventToolCall{name: "test", args: `{"a": 1}`},
|
||||
ministralEventContent{content: "some content after"},
|
||||
}},
|
||||
},
|
||||
},
|
||||
|
||||
// Multiple tool calls
|
||||
{
|
||||
desc: "multiple tool calls in sequence",
|
||||
tools: []api.Tool{
|
||||
{Function: api.ToolFunction{Name: "get_weather"}},
|
||||
{Function: api.ToolFunction{Name: "get_time"}},
|
||||
},
|
||||
steps: []step{
|
||||
{input: `[TOOL_CALLS]get_weather[ARGS]{"location": "NYC"}[TOOL_CALLS]get_time[ARGS]{"timezone": "EST"}`, wantEvents: []ministralEvent{
|
||||
ministralEventToolCall{name: "get_weather", args: `{"location": "NYC"}`},
|
||||
ministralEventToolCall{name: "get_time", args: `{"timezone": "EST"}`},
|
||||
}},
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "multiple tool calls streamed separately",
|
||||
tools: []api.Tool{
|
||||
{Function: api.ToolFunction{Name: "tool_a"}},
|
||||
{Function: api.ToolFunction{Name: "tool_b"}},
|
||||
},
|
||||
steps: []step{
|
||||
{input: `[TOOL_CALLS]tool_a[ARGS]{"x": 1}`, wantEvents: []ministralEvent{
|
||||
ministralEventToolCall{name: "tool_a", args: `{"x": 1}`},
|
||||
}},
|
||||
{input: `[TOOL_CALLS]tool_b[ARGS]{"y": 2}`, wantEvents: []ministralEvent{
|
||||
ministralEventToolCall{name: "tool_b", args: `{"y": 2}`},
|
||||
}},
|
||||
},
|
||||
},
|
||||
|
||||
// Streaming tool calls
|
||||
{
|
||||
desc: "streaming tool call with nested objects",
|
||||
tools: []api.Tool{{Function: api.ToolFunction{Name: "create_entities"}}},
|
||||
steps: []step{
|
||||
{input: "[TOOL_CALLS]create_entities[ARGS]", wantEvents: []ministralEvent{}},
|
||||
{input: `{"entities": [{"entityType": "Person",`, wantEvents: []ministralEvent{}},
|
||||
{input: ` "name": "Jack",`, wantEvents: []ministralEvent{}},
|
||||
{input: ` "observations": ["Works`, wantEvents: []ministralEvent{}},
|
||||
{input: ` as a baker"]}`, wantEvents: []ministralEvent{}},
|
||||
{input: `]}`, wantEvents: []ministralEvent{
|
||||
ministralEventToolCall{name: "create_entities", args: `{"entities": [{"entityType": "Person", "name": "Jack", "observations": ["Works as a baker"]}]}`},
|
||||
}},
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "streaming with incomplete JSON waits for completion",
|
||||
tools: []api.Tool{{Function: api.ToolFunction{Name: "test"}}},
|
||||
steps: []step{
|
||||
{input: "[TOOL_CALLS]test[ARGS]{", wantEvents: []ministralEvent{}},
|
||||
{input: `"a": {`, wantEvents: []ministralEvent{}},
|
||||
{input: `"b": 1`, wantEvents: []ministralEvent{}},
|
||||
{input: `}`, wantEvents: []ministralEvent{}},
|
||||
{input: `}`, wantEvents: []ministralEvent{
|
||||
ministralEventToolCall{name: "test", args: `{"a": {"b": 1}}`},
|
||||
}},
|
||||
},
|
||||
},
|
||||
|
||||
// Partial tag handling
|
||||
{
|
||||
desc: "partial tool tag fakeout",
|
||||
steps: []step{
|
||||
{input: "abc[TOOL", wantEvents: []ministralEvent{ministralEventContent{content: "abc"}}},
|
||||
{input: " not a tag", wantEvents: []ministralEvent{ministralEventContent{content: "[TOOL not a tag"}}},
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "tool call tag split across chunks",
|
||||
tools: []api.Tool{{Function: api.ToolFunction{Name: "test"}}},
|
||||
steps: []step{
|
||||
{input: "[TOOL_", wantEvents: []ministralEvent{}},
|
||||
{input: "CALLS]test[ARGS]{}", wantEvents: []ministralEvent{
|
||||
ministralEventToolCall{name: "test", args: `{}`},
|
||||
}},
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "content before tool call",
|
||||
tools: []api.Tool{{Function: api.ToolFunction{Name: "get_weather"}}},
|
||||
steps: []step{
|
||||
{input: "hello [TOOL_CALLS]get_weather[ARGS]{}", wantEvents: []ministralEvent{
|
||||
ministralEventContent{content: "hello"},
|
||||
ministralEventToolCall{name: "get_weather", args: `{}`},
|
||||
}},
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "whitespace between content and tool call is trimmed",
|
||||
tools: []api.Tool{{Function: api.ToolFunction{Name: "test"}}},
|
||||
steps: []step{
|
||||
{input: "content \n [TOOL_CALLS]test[ARGS]{}", wantEvents: []ministralEvent{
|
||||
ministralEventContent{content: "content"},
|
||||
ministralEventToolCall{name: "test", args: `{}`},
|
||||
}},
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "tabs and newlines before tool call are trimmed",
|
||||
tools: []api.Tool{{Function: api.ToolFunction{Name: "test"}}},
|
||||
steps: []step{
|
||||
{input: "content\t\n\t[TOOL_CALLS]test[ARGS]{}", wantEvents: []ministralEvent{
|
||||
ministralEventContent{content: "content"},
|
||||
ministralEventToolCall{name: "test", args: `{}`},
|
||||
}},
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "non-breaking space before tool call is trimmed",
|
||||
tools: []api.Tool{{Function: api.ToolFunction{Name: "test"}}},
|
||||
steps: []step{
|
||||
// \u00a0 is non-breaking space, which unicode.IsSpace considers whitespace
|
||||
{input: "content\u00a0[TOOL_CALLS]test[ARGS]{}", wantEvents: []ministralEvent{
|
||||
ministralEventContent{content: "content"},
|
||||
ministralEventToolCall{name: "test", args: `{}`},
|
||||
}},
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "whitespace before THINK tag is trimmed",
|
||||
steps: []step{
|
||||
{input: "content \n [THINK]thinking[/THINK]after", wantEvents: []ministralEvent{
|
||||
ministralEventContent{content: "content"},
|
||||
ministralEventThinking{thinking: "thinking"},
|
||||
ministralEventContent{content: "after"},
|
||||
}},
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "trailing whitespace withheld then emitted",
|
||||
steps: []step{
|
||||
{input: "Hello ", wantEvents: []ministralEvent{ministralEventContent{content: "Hello"}}},
|
||||
{input: "world", wantEvents: []ministralEvent{ministralEventContent{content: " world"}}},
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "trailing newline withheld then emitted",
|
||||
steps: []step{
|
||||
{input: "Hello\n", wantEvents: []ministralEvent{ministralEventContent{content: "Hello"}}},
|
||||
{input: "world", wantEvents: []ministralEvent{ministralEventContent{content: "\nworld"}}},
|
||||
},
|
||||
},
|
||||
|
||||
// Thinking support
|
||||
{
|
||||
desc: "thinking content",
|
||||
think: true,
|
||||
steps: []step{
|
||||
{input: "thinking here[/THINK]", wantEvents: []ministralEvent{
|
||||
ministralEventThinking{thinking: "thinking here"},
|
||||
}},
|
||||
{input: "content after", wantEvents: []ministralEvent{
|
||||
ministralEventContent{content: "content after"},
|
||||
}},
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "thinking with whitespace after end tag",
|
||||
think: true,
|
||||
steps: []step{
|
||||
{input: "my thoughts[/THINK] \n response", wantEvents: []ministralEvent{
|
||||
ministralEventThinking{thinking: "my thoughts"},
|
||||
ministralEventContent{content: "response"},
|
||||
}},
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "non-breaking space after think end tag is trimmed",
|
||||
think: true,
|
||||
steps: []step{
|
||||
// \u00a0 is non-breaking space
|
||||
{input: "thinking[/THINK]\u00a0response", wantEvents: []ministralEvent{
|
||||
ministralEventThinking{thinking: "thinking"},
|
||||
ministralEventContent{content: "response"},
|
||||
}},
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "partial think end tag",
|
||||
think: true,
|
||||
steps: []step{
|
||||
{input: "thinking[/THI", wantEvents: []ministralEvent{ministralEventThinking{thinking: "thinking"}}},
|
||||
{input: "NK]after", wantEvents: []ministralEvent{ministralEventContent{content: "after"}}},
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "think tag fakeout",
|
||||
think: true,
|
||||
steps: []step{
|
||||
{input: "thinking[/THI", wantEvents: []ministralEvent{ministralEventThinking{thinking: "thinking"}}},
|
||||
{input: "not end tag", wantEvents: []ministralEvent{ministralEventThinking{thinking: "[/THInot end tag"}}},
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "thinking then tool call",
|
||||
think: true,
|
||||
tools: []api.Tool{{Function: api.ToolFunction{Name: "test"}}},
|
||||
steps: []step{
|
||||
{input: "let me think[/THINK][TOOL_CALLS]test[ARGS]{}", wantEvents: []ministralEvent{
|
||||
ministralEventThinking{thinking: "let me think"},
|
||||
ministralEventToolCall{name: "test", args: `{}`},
|
||||
}},
|
||||
},
|
||||
},
|
||||
|
||||
// Content then THINK tag transition
|
||||
{
|
||||
desc: "content then think tag",
|
||||
steps: []step{
|
||||
{input: "content[THINK]thinking[/THINK]more", wantEvents: []ministralEvent{
|
||||
ministralEventContent{content: "content"},
|
||||
ministralEventThinking{thinking: "thinking"},
|
||||
ministralEventContent{content: "more"},
|
||||
}},
|
||||
},
|
||||
},
|
||||
|
||||
// Unicode handling
|
||||
{
|
||||
desc: "unicode content",
|
||||
steps: []step{
|
||||
{input: "你好 🌍 مرحبا", wantEvents: []ministralEvent{
|
||||
ministralEventContent{content: "你好 🌍 مرحبا"},
|
||||
}},
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "unicode in tool args",
|
||||
tools: []api.Tool{{Function: api.ToolFunction{Name: "greet"}}},
|
||||
steps: []step{
|
||||
{input: `[TOOL_CALLS]greet[ARGS]{"message": "你好 🌍"}`, wantEvents: []ministralEvent{
|
||||
ministralEventToolCall{name: "greet", args: `{"message": "你好 🌍"}`},
|
||||
}},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.desc, func(t *testing.T) {
|
||||
parser := MinistralParser{}
|
||||
parser.hasThinkingSupport = tc.think
|
||||
parser.Init(tc.tools, nil, nil)
|
||||
|
||||
for i, step := range tc.steps {
|
||||
parser.buffer.WriteString(step.input)
|
||||
gotEvents := parser.parseEvents()
|
||||
|
||||
if len(gotEvents) == 0 && len(step.wantEvents) == 0 {
|
||||
// avoid deep equal on empty vs. nil slices
|
||||
continue
|
||||
}
|
||||
|
||||
if !reflect.DeepEqual(gotEvents, step.wantEvents) {
|
||||
t.Errorf("step %d: input %q: got events %#v, want %#v", i, step.input, gotEvents, step.wantEvents)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestMinistralParser_Errors(t *testing.T) {
|
||||
t.Run("unknown tool returns error", func(t *testing.T) {
|
||||
p := &MinistralParser{}
|
||||
p.Init([]api.Tool{{Function: api.ToolFunction{Name: "known_tool"}}}, nil, nil)
|
||||
|
||||
_, _, _, err := p.Add(`[TOOL_CALLS]unknown_tool[ARGS]{"a": 1}`, true)
|
||||
if err == nil {
|
||||
t.Fatal("expected error for unknown tool")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("invalid JSON returns error", func(t *testing.T) {
|
||||
p := &MinistralParser{}
|
||||
p.Init([]api.Tool{{Function: api.ToolFunction{Name: "test"}}}, nil, nil)
|
||||
|
||||
_, _, _, err := p.Add(`[TOOL_CALLS]test[ARGS]{invalid json}`, true)
|
||||
if err == nil {
|
||||
t.Fatal("expected error for invalid JSON")
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestFindJSONEnd(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
input string
|
||||
expected int
|
||||
}{
|
||||
{
|
||||
name: "simple object",
|
||||
input: `{"a": 1}`,
|
||||
expected: 7,
|
||||
},
|
||||
{
|
||||
name: "nested object",
|
||||
input: `{"a": {"b": 2}}`,
|
||||
expected: 14,
|
||||
},
|
||||
{
|
||||
name: "array inside object",
|
||||
input: `{"items": [1, 2, 3]}`,
|
||||
expected: 19,
|
||||
},
|
||||
{
|
||||
name: "braces in string",
|
||||
input: `{"template": "Hello {name}!"}`,
|
||||
expected: 28,
|
||||
},
|
||||
{
|
||||
name: "escaped quotes",
|
||||
input: `{"msg": "say \"hi\""}`,
|
||||
expected: 20,
|
||||
},
|
||||
{
|
||||
name: "incomplete object",
|
||||
input: `{"a": {"b": 1}`,
|
||||
expected: -1,
|
||||
},
|
||||
{
|
||||
name: "deeply nested",
|
||||
input: `{"a": {"b": {"c": {"d": 1}}}}`,
|
||||
expected: 28,
|
||||
},
|
||||
{
|
||||
name: "object with trailing content",
|
||||
input: `{"a": 1} extra`,
|
||||
expected: 7,
|
||||
},
|
||||
{
|
||||
name: "array",
|
||||
input: `[{"a": 1}, {"b": 2}]`,
|
||||
expected: 19,
|
||||
},
|
||||
{
|
||||
name: "escaped backslash before quote",
|
||||
input: `{"path": "C:\\"}`,
|
||||
expected: 15,
|
||||
},
|
||||
{
|
||||
name: "empty string",
|
||||
input: "",
|
||||
expected: -1,
|
||||
},
|
||||
{
|
||||
name: "no opening brace",
|
||||
input: "hello world",
|
||||
expected: -1,
|
||||
},
|
||||
{
|
||||
name: "only opening brace",
|
||||
input: "{",
|
||||
expected: -1,
|
||||
},
|
||||
{
|
||||
name: "unclosed string",
|
||||
input: `{"key": "unclosed`,
|
||||
expected: -1,
|
||||
},
|
||||
{
|
||||
name: "double escaped backslash then quote",
|
||||
input: `{"path": "C:\\\\"}`,
|
||||
expected: 17,
|
||||
},
|
||||
{
|
||||
name: "unicode in key and value",
|
||||
input: `{"키": "값"}`,
|
||||
expected: 13,
|
||||
},
|
||||
{
|
||||
name: "nested arrays",
|
||||
input: `{"matrix": [[1, 2], [3, 4]]}`,
|
||||
expected: 27,
|
||||
},
|
||||
{
|
||||
name: "mixed nesting",
|
||||
input: `{"a": [{"b": {"c": [1, 2, 3]}}]}`,
|
||||
expected: 31,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := findJSONEnd(tt.input)
|
||||
if result != tt.expected {
|
||||
t.Errorf("findJSONEnd(%q) = %d, want %d", tt.input, result, tt.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestMinistralParser_HasToolSupport(t *testing.T) {
|
||||
p := &MinistralParser{}
|
||||
if !p.HasToolSupport() {
|
||||
t.Error("expected HasToolSupport to return true")
|
||||
}
|
||||
}
|
||||
|
||||
func TestMinistralParser_HasThinkingSupport(t *testing.T) {
|
||||
p := &MinistralParser{hasThinkingSupport: false}
|
||||
if p.HasThinkingSupport() {
|
||||
t.Error("expected HasThinkingSupport to return false")
|
||||
}
|
||||
|
||||
p = &MinistralParser{hasThinkingSupport: true}
|
||||
if !p.HasThinkingSupport() {
|
||||
t.Error("expected HasThinkingSupport to return true")
|
||||
}
|
||||
}
|
||||
@@ -3,7 +3,6 @@ package parsers
import (
"strings"
"unicode"
"unicode/utf8"

"github.com/ollama/ollama/api"
"github.com/ollama/ollama/harmony"
@@ -115,33 +114,3 @@ func splitAtTag(sb *strings.Builder, tag string, trimAfter bool) (string, string
sb.WriteString(after)
return before, after // return events
}

// overlap returns the longest overlap between the suffix of s and the prefix of delim
func overlap(s, delim string) int {
max := min(len(delim), len(s))
for i := max; i > 0; i-- {
if strings.HasSuffix(s, delim[:i]) {
return i
}
}
return 0
}

// trailingWhitespaceLen returns the length in bytes of trailing whitespace in s
func trailingWhitespaceLen(s string) int {
remaining := s
total := 0
for len(remaining) > 0 {
r, size := utf8.DecodeLastRuneInString(remaining)
// if it's an invalid utf8 rune, assume it isn't whitespace
if r == utf8.RuneError && size == 1 {
break
}
if !unicode.IsSpace(r) {
break
}
total += size
remaining = remaining[:len(remaining)-size]
}
return total
}

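overlap and trailingWhitespaceLen are removed from this shared file and re-added in qwen3coder.go below (the TODO there notes they should eventually return to a shared location). Since they are package-private, this standalone snippet copies the two bodies verbatim from the hunk above to show what they compute on a partially streamed tag.

package main

import (
	"fmt"
	"strings"
	"unicode"
	"unicode/utf8"
)

// Copied from the hunk above so the snippet is self-contained.
func overlap(s, delim string) int {
	max := min(len(delim), len(s))
	for i := max; i > 0; i-- {
		if strings.HasSuffix(s, delim[:i]) {
			return i
		}
	}
	return 0
}

// Copied from the hunk above so the snippet is self-contained.
func trailingWhitespaceLen(s string) int {
	remaining := s
	total := 0
	for len(remaining) > 0 {
		r, size := utf8.DecodeLastRuneInString(remaining)
		if r == utf8.RuneError && size == 1 {
			break
		}
		if !unicode.IsSpace(r) {
			break
		}
		total += size
		remaining = remaining[:len(remaining)-size]
	}
	return total
}

func main() {
	// A streamed chunk may end in the middle of a tag; overlap tells the
	// parser how many bytes to withhold until the next chunk arrives.
	fmt.Println(overlap("some content [TOOL", "[TOOL_CALLS]")) // 5

	// Trailing whitespace is withheld the same way so it is not emitted
	// before a tag that might follow it.
	fmt.Println(trailingWhitespaceLen("content \n ")) // 3
}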
@@ -11,6 +11,7 @@ import (
"strconv"
"strings"
"unicode"
"unicode/utf8"

"github.com/ollama/ollama/api"
"github.com/ollama/ollama/logutil"
@@ -193,6 +194,36 @@ func eat(p *Qwen3CoderParser) ([]qwenEvent, bool) {
}
}

// TODO(drifkin): move this to a shared location
// longest overlap between suffix of s and prefix of delim
func overlap(s, delim string) int {
max := min(len(delim), len(s))
for i := max; i > 0; i-- {
if strings.HasSuffix(s, delim[:i]) {
return i
}
}
return 0
}

func trailingWhitespaceLen(s string) int {
remaining := s
total := 0
for len(remaining) > 0 {
r, size := utf8.DecodeLastRuneInString(remaining)
// if it's an invalid utf8 rune, assume it isn't whitespace
if r == utf8.RuneError && size == 1 {
break
}
if !unicode.IsSpace(r) {
break
}
total += size
remaining = remaining[:len(remaining)-size]
}
return total
}

type XMLFunctionCall struct {
XMLName xml.Name `xml:"function"`
Name string `xml:"name,attr"`

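The trailing context lines above introduce XMLFunctionCall, which qwen3coder.go uses to decode <function name="..."> blocks. A minimal sketch of decoding into that shape with encoding/xml; only the two fields visible in this hunk are modeled, any further fields are cut off by the diff context and omitted here.

package main

import (
	"encoding/xml"
	"fmt"
)

// Mirrors the first two fields of XMLFunctionCall shown above.
type XMLFunctionCall struct {
	XMLName xml.Name `xml:"function"`
	Name    string   `xml:"name,attr"`
}

func main() {
	var call XMLFunctionCall
	if err := xml.Unmarshal([]byte(`<function name="get_weather"></function>`), &call); err != nil {
		panic(err)
	}
	fmt.Println(call.Name) // get_weather
}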
@@ -1,249 +0,0 @@
|
||||
package model
|
||||
|
||||
import (
|
||||
"container/heap"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/ollama/ollama/logutil"
|
||||
)
|
||||
|
||||
const spmWhitespaceSep = "▁"
|
||||
|
||||
type SentencePiece struct {
|
||||
maxTokenLen int
|
||||
vocab *Vocabulary
|
||||
}
|
||||
|
||||
var _ TextProcessor = (*SentencePiece)(nil)
|
||||
|
||||
func (spm SentencePiece) Vocabulary() *Vocabulary {
|
||||
return spm.vocab
|
||||
}
|
||||
|
||||
func NewSentencePiece(vocab *Vocabulary) SentencePiece {
|
||||
logutil.Trace("Tokens", "num tokens", len(vocab.Values), "vals", vocab.Values[:5], "scores", vocab.Scores[:5], "types", vocab.Types[:5])
|
||||
|
||||
counter := map[int]int{}
|
||||
var maxTokenLen int
|
||||
for cnt := range vocab.Types {
|
||||
switch vocab.Types[cnt] {
|
||||
case TOKEN_TYPE_NORMAL, TOKEN_TYPE_USER_DEFINED, TOKEN_TYPE_UNUSED:
|
||||
maxTokenLen = max(maxTokenLen, len(vocab.Values[cnt]))
|
||||
fallthrough
|
||||
default:
|
||||
counter[int(vocab.Types[cnt])] += 1
|
||||
}
|
||||
}
|
||||
|
||||
logutil.Trace("Token counts", "normal", counter[TOKEN_TYPE_NORMAL], "unknown", counter[TOKEN_TYPE_UNKNOWN], "control", counter[TOKEN_TYPE_CONTROL],
|
||||
"user defined", counter[TOKEN_TYPE_USER_DEFINED], "unused", counter[TOKEN_TYPE_UNUSED], "byte", counter[TOKEN_TYPE_BYTE],
|
||||
"max token len", maxTokenLen)
|
||||
|
||||
return SentencePiece{
|
||||
maxTokenLen: maxTokenLen,
|
||||
vocab: vocab,
|
||||
}
|
||||
}
|
||||
|
||||
func (spm SentencePiece) Is(id int32, special Special) bool {
|
||||
return spm.vocab.Is(id, special)
|
||||
}
|
||||
|
||||
func (spm SentencePiece) Encode(s string, addSpecial bool) ([]int32, error) {
|
||||
fragments := []fragment{{value: s}}
|
||||
for _, special := range spm.vocab.SpecialVocabulary() {
|
||||
id := spm.vocab.Encode(special)
|
||||
for i := 0; i < len(fragments); i++ {
|
||||
frag := fragments[i]
|
||||
if len(frag.ids) > 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
var middle []fragment
|
||||
switch i := strings.Index(frag.value, special); {
|
||||
case i < 0:
|
||||
middle = append(middle, frag)
|
||||
case i > 0:
|
||||
middle = append(middle, fragment{value: frag.value[:i]})
|
||||
fallthrough
|
||||
default:
|
||||
middle = append(middle, fragment{value: special, ids: []int32{id}})
|
||||
if rest := frag.value[i+len(special):]; rest != "" {
|
||||
middle = append(middle, fragment{value: rest})
|
||||
}
|
||||
}
|
||||
|
||||
fragments = append(fragments[:i], append(middle, fragments[i+1:]...)...)
|
||||
}
|
||||
}
|
||||
|
||||
var ids []int32
|
||||
for _, frag := range fragments {
|
||||
if len(frag.ids) > 0 {
|
||||
ids = append(ids, frag.ids...)
|
||||
continue
|
||||
}
|
||||
|
||||
text := strings.ReplaceAll(frag.value, " ", spmWhitespaceSep)
|
||||
|
||||
if id := spm.vocab.Encode(text); id >= 0 {
|
||||
ids = append(ids, id)
|
||||
continue
|
||||
}
|
||||
|
||||
q := &queue{}
|
||||
heap.Init(q)
|
||||
|
||||
runes := []rune(text)
|
||||
merges := make([]merge, len(runes))
|
||||
for r := range runes {
|
||||
merges[r] = merge{
|
||||
p: r - 1,
|
||||
n: r + 1,
|
||||
runes: []rune{runes[r]},
|
||||
}
|
||||
}
|
||||
|
||||
pairwise := func(a, b int) *candidate {
|
||||
if a < 0 || b >= len(runes) {
|
||||
return nil
|
||||
}
|
||||
|
||||
left, right := string(merges[a].runes), string(merges[b].runes)
|
||||
if id := spm.vocab.Encode(left + right); id >= 0 {
|
||||
return &candidate{
|
||||
a: a,
|
||||
b: b,
|
||||
score: spm.vocab.Scores[id],
|
||||
size: len(left) + len(right),
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
for i := range len(runes) - 1 {
|
||||
if pair := pairwise(i, i+1); pair != nil {
|
||||
heap.Push(q, pair)
|
||||
}
|
||||
}
|
||||
|
||||
for q.Len() > 0 {
|
||||
pair := heap.Pop(q).(*candidate)
|
||||
left, right := merges[pair.a], merges[pair.b]
|
||||
|
||||
if string(left.runes) == "" || string(right.runes) == "" || len(string(left.runes))+len(string(right.runes)) != pair.size {
|
||||
continue
|
||||
}
|
||||
|
||||
merges[pair.a].runes = append(left.runes, right.runes...)
|
||||
merges[pair.b].runes = nil
|
||||
merges[pair.a].n = right.n
|
||||
if right.n < len(merges) {
|
||||
merges[right.n].p = pair.a
|
||||
}
|
||||
|
||||
if pair := pairwise(merges[pair.a].p, pair.a); pair != nil {
|
||||
heap.Push(q, pair)
|
||||
}
|
||||
|
||||
if pair := pairwise(pair.a, merges[pair.a].n); pair != nil {
|
||||
heap.Push(q, pair)
|
||||
}
|
||||
}
|
||||
|
||||
for _, merge := range merges {
|
||||
if token := string(merge.runes); token != "" {
|
||||
id := spm.vocab.Encode(token)
|
||||
|
||||
if id >= 0 {
|
||||
ids = append(ids, id)
|
||||
continue
|
||||
}
|
||||
|
||||
// Fallback to byte tokenization
|
||||
var result []int32
|
||||
for _, b := range []byte(token) {
|
||||
byteToken := fmt.Sprintf("<0x%02X>", b)
|
||||
unknownID := spm.vocab.Encode(byteToken)
|
||||
if unknownID >= 0 {
|
||||
result = append(result, unknownID)
|
||||
} else {
|
||||
slog.Debug("unknown byte token", "byte", b, "token", byteToken)
|
||||
}
|
||||
}
|
||||
|
||||
ids = append(ids, result...)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if addSpecial {
|
||||
ids = spm.vocab.addSpecials(ids)
|
||||
}
|
||||
|
||||
logutil.Trace("encoded", "string", s, "ids", ids)
|
||||
return ids, nil
|
||||
}
|
||||
|
||||
type candidate struct {
|
||||
a, b int
|
||||
score float32
|
||||
size int
|
||||
}
|
||||
|
||||
type queue []*candidate
|
||||
|
||||
func (q queue) Len() int { return len(q) }
|
||||
|
||||
func (q queue) Less(i, j int) bool {
|
||||
return (q[i].score > q[j].score) || (q[i].score == q[j].score && q[i].a < q[j].a)
|
||||
}
|
||||
|
||||
func (q queue) Swap(i, j int) { q[i], q[j] = q[j], q[i] }
|
||||
|
||||
func (q *queue) Push(x interface{}) {
|
||||
item := x.(*candidate)
|
||||
*q = append(*q, item)
|
||||
}
|
||||
|
||||
func (q *queue) Pop() interface{} {
|
||||
old := *q
|
||||
n := len(old)
|
||||
item := old[n-1]
|
||||
*q = old[0 : n-1]
|
||||
return item
|
||||
}
|
||||
|
||||
func (spm SentencePiece) Decode(ids []int32) (string, error) {
|
||||
var sb strings.Builder
|
||||
for _, id := range ids {
|
||||
data := spm.vocab.Decode(id)
|
||||
data = strings.ReplaceAll(data, spmWhitespaceSep, " ")
|
||||
|
||||
// For tokenizers that use byte tokens like "<0xEA>"
|
||||
// convert them to the partial unicode character
|
||||
// so they are buffered correctly by the runner instead
|
||||
// of being sent back to the api as "<0xEA>"
|
||||
if len(data) == 6 && strings.HasPrefix(data, "<0x") && strings.HasSuffix(data, ">") {
|
||||
byteVal, err := strconv.ParseUint(data[1:5], 0, 8)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to parse hex byte: %v", err)
|
||||
}
|
||||
|
||||
if err := sb.WriteByte(byte(byteVal)); err != nil {
|
||||
return "", err
|
||||
}
|
||||
} else {
|
||||
if _, err := sb.WriteString(data); err != nil {
|
||||
return "", err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
logutil.Trace("decoded", "ids", ids, "string", sb.String())
|
||||
return sb.String(), nil
|
||||
}
|
||||
@@ -1,17 +0,0 @@
|
||||
package model
|
||||
|
||||
const (
|
||||
TOKEN_TYPE_NORMAL = iota + 1
|
||||
TOKEN_TYPE_UNKNOWN
|
||||
TOKEN_TYPE_CONTROL
|
||||
TOKEN_TYPE_USER_DEFINED
|
||||
TOKEN_TYPE_UNUSED
|
||||
TOKEN_TYPE_BYTE
|
||||
)
|
||||
|
||||
type TextProcessor interface {
|
||||
Encode(s string, addSpecial bool) ([]int32, error)
|
||||
Decode([]int32) (string, error)
|
||||
Is(int32, Special) bool
|
||||
Vocabulary() *Vocabulary
|
||||
}
|
||||
@@ -1,53 +0,0 @@
|
||||
package model
|
||||
|
||||
import (
|
||||
"slices"
|
||||
"testing"
|
||||
|
||||
"github.com/google/go-cmp/cmp"
|
||||
)
|
||||
|
||||
func TestWordPiece(t *testing.T) {
|
||||
wpm := NewWordPiece(
|
||||
&Vocabulary{
|
||||
Values: []string{"[UNK]", "[CLS]", "[SEP]", "▁hello", "▁world", "s", "▁!", "▁@", "▁#"},
|
||||
AddBOS: true,
|
||||
AddEOS: true,
|
||||
BOS: []int32{1},
|
||||
EOS: []int32{2},
|
||||
},
|
||||
true, // lowercase
|
||||
)
|
||||
|
||||
ids, err := wpm.Encode("Hello world!", true)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if diff := cmp.Diff([]int32{1, 3, 4, 6, 2}, ids); diff != "" {
|
||||
t.Errorf("unexpected ids (-want +got):\n%s", diff)
|
||||
}
|
||||
|
||||
words, err := wpm.Decode(ids)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if diff := cmp.Diff("[CLS] hello world! [SEP]", words); diff != "" {
|
||||
t.Errorf("unexpected words (-want +got):\n%s", diff)
|
||||
}
|
||||
}
|
||||
|
||||
func TestWordPieceWords(t *testing.T) {
|
||||
var wpm WordPiece
|
||||
|
||||
basic := slices.Collect(wpm.words("Hey friend! How are you?!?"))
|
||||
if diff := cmp.Diff([]string{"Hey", "friend", "!", "How", "are", "you", "?", "!", "?"}, basic); diff != "" {
|
||||
t.Errorf("unexpected words (-want +got):\n%s", diff)
|
||||
}
|
||||
|
||||
chinese := slices.Collect(wpm.words("野口里佳 Noguchi Rika"))
|
||||
if diff := cmp.Diff([]string{"野", "口", "里", "佳", "Noguchi", "Rika"}, chinese); diff != "" {
|
||||
t.Errorf("unexpected words (-want +got):\n%s", diff)
|
||||
}
|
||||
}
|
||||
@@ -37,6 +37,7 @@ import (
|
||||
"github.com/ollama/ollama/model/input"
|
||||
"github.com/ollama/ollama/runner/common"
|
||||
"github.com/ollama/ollama/sample"
|
||||
"github.com/ollama/ollama/tokenizer"
|
||||
|
||||
_ "github.com/ollama/ollama/model/models"
|
||||
)
|
||||
@@ -210,9 +211,9 @@ func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSe
|
||||
}
|
||||
|
||||
// calculateLogprobs converts raw logits to log probabilities and finds top K tokens
|
||||
func calculateLogprobs(logits []float32, selectedToken int32, topK int, textProcessor model.TextProcessor) []llm.Logprob {
|
||||
func calculateLogprobs(logits []float32, selectedToken int32, topK int, tokenizer tokenizer.Tokenizer) []llm.Logprob {
|
||||
decoder := func(tokenID int) string {
|
||||
text, _ := textProcessor.Decode([]int32{int32(tokenID)})
|
||||
text, _ := tokenizer.Decode([]int32{int32(tokenID)})
|
||||
return text
|
||||
}
|
||||
return common.CalculateLogprobs(logits, int(selectedToken), topK, decoder)
|
||||
@@ -242,7 +243,7 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]*input.Input,
|
||||
|
||||
for i, part := range parts {
|
||||
// text - tokenize
|
||||
tokens, err := s.model.(model.TextProcessor).Encode(part, i == 0)
|
||||
tokens, err := s.model.(tokenizer.Tokenizer).Encode(part, i == 0)
|
||||
if err != nil {
|
||||
return nil, nil, nil, err
|
||||
}
|
||||
@@ -766,7 +767,7 @@ func (s *Server) computeBatch(activeBatch batchState) {
|
||||
nextBatchTokens[i].Token = token
|
||||
|
||||
// if it's an end of sequence token, break
|
||||
if s.model.(model.TextProcessor).Is(token, model.SpecialEOS) {
|
||||
if s.model.(tokenizer.Tokenizer).Is(token, tokenizer.SpecialEOS) {
|
||||
// TODO (jmorganca): we should send this back
|
||||
// as it's important for the /api/generate context
|
||||
// seq.responses <- piece
|
||||
@@ -775,14 +776,14 @@ func (s *Server) computeBatch(activeBatch batchState) {
|
||||
continue
|
||||
}
|
||||
|
||||
piece, err := s.model.(model.TextProcessor).Decode([]int32{token})
|
||||
piece, err := s.model.(tokenizer.Tokenizer).Decode([]int32{token})
|
||||
if err != nil {
|
||||
panic("failed to decode token")
|
||||
}
|
||||
|
||||
// Calculate logprobs if requested (after EOS check to avoid logprobs for EOS tokens)
|
||||
if seq.logprobs {
|
||||
logprobs := calculateLogprobs(logits, token, seq.topLogprobs, s.model.(model.TextProcessor))
|
||||
logprobs := calculateLogprobs(logits, token, seq.topLogprobs, s.model.(tokenizer.Tokenizer))
|
||||
seq.pendingLogprobs = append(seq.pendingLogprobs, logprobs...)
|
||||
}
|
||||
|
||||
@@ -873,7 +874,7 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
|
||||
var grammar *sample.GrammarSampler
|
||||
var err error
|
||||
if req.Grammar != "" {
|
||||
grammar, err = sample.NewGrammarSampler(s.model.(model.TextProcessor), req.Grammar)
|
||||
grammar, err = sample.NewGrammarSampler(s.model.(tokenizer.Tokenizer), req.Grammar)
|
||||
if err != nil {
|
||||
http.Error(w, "failed to load model vocabulary required for format", http.StatusInternalServerError)
|
||||
return
|
||||
|
||||
@@ -4,29 +4,19 @@ import (
|
||||
"github.com/ollama/ollama/runner/llamarunner"
|
||||
"github.com/ollama/ollama/runner/ollamarunner"
|
||||
imagerunner "github.com/ollama/ollama/x/imagegen/runner"
|
||||
"github.com/ollama/ollama/x/mlxrunner"
|
||||
)
|
||||
|
||||
func Execute(args []string) error {
|
||||
if args[0] == "runner" {
|
||||
args = args[1:]
|
||||
}
|
||||
|
||||
var newRunner bool
|
||||
var imageRunner bool
|
||||
if len(args) > 0 && args[0] == "--ollama-engine" {
|
||||
args = args[1:]
|
||||
newRunner = true
|
||||
}
|
||||
if len(args) > 0 && args[0] == "--image-engine" {
|
||||
args = args[1:]
|
||||
imageRunner = true
|
||||
}
|
||||
|
||||
if imageRunner {
|
||||
return imagerunner.Execute(args)
|
||||
} else if newRunner {
|
||||
return ollamarunner.Execute(args)
|
||||
} else {
|
||||
return llamarunner.Execute(args)
|
||||
if len(args) > 0 {
|
||||
switch args[0] {
|
||||
case "--ollama-engine":
|
||||
return ollamarunner.Execute(args[1:])
|
||||
case "--image-engine":
|
||||
return imagerunner.Execute(args[1:])
|
||||
case "--mlx-engine":
|
||||
return mlxrunner.Execute(args[1:])
|
||||
}
|
||||
}
|
||||
return llamarunner.Execute(args)
|
||||
}
|
||||
|
||||
@@ -7,7 +7,7 @@ import (
|
||||
"slices"
|
||||
|
||||
"github.com/ollama/ollama/llama"
|
||||
"github.com/ollama/ollama/model"
|
||||
"github.com/ollama/ollama/tokenizer"
|
||||
)
|
||||
|
||||
// token represents information about a single token during sampling
|
||||
@@ -168,15 +168,15 @@ type GrammarSampler struct {
|
||||
grammar *llama.Grammar
|
||||
}
|
||||
|
||||
func NewGrammarSampler(model model.TextProcessor, grammarStr string) (*GrammarSampler, error) {
|
||||
vocabIds := make([]uint32, len(model.Vocabulary().Values))
|
||||
pieces := make([]string, len(model.Vocabulary().Values))
|
||||
for i := range model.Vocabulary().Values {
|
||||
pieces[i], _ = model.Decode([]int32{int32(i)})
|
||||
func NewGrammarSampler(tokenizer tokenizer.Tokenizer, grammarStr string) (*GrammarSampler, error) {
|
||||
vocabIds := make([]uint32, len(tokenizer.Vocabulary().Values))
|
||||
pieces := make([]string, len(tokenizer.Vocabulary().Values))
|
||||
for i := range tokenizer.Vocabulary().Values {
|
||||
pieces[i], _ = tokenizer.Decode([]int32{int32(i)})
|
||||
vocabIds[i] = uint32(i)
|
||||
}
|
||||
|
||||
grammar := llama.NewGrammar(grammarStr, vocabIds, pieces, model.Vocabulary().EOS)
|
||||
grammar := llama.NewGrammar(grammarStr, vocabIds, pieces, tokenizer.Vocabulary().EOS)
|
||||
if grammar == nil {
|
||||
return nil, errors.New("sample: failed to initialize grammar")
|
||||
}
|
||||
|
||||
@@ -8,7 +8,7 @@ import (
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"github.com/ollama/ollama/model"
|
||||
"github.com/ollama/ollama/tokenizer"
|
||||
)
|
||||
|
||||
func TestWeighted(t *testing.T) {
|
||||
@@ -60,10 +60,10 @@ func TestWeighted(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func modelHelper(t testing.TB) model.BytePairEncoding {
|
||||
func modelHelper(t testing.TB) tokenizer.Tokenizer {
|
||||
t.Helper()
|
||||
|
||||
f, err := os.Open(filepath.Join("..", "model", "testdata", "llama3.2", "encoder.json"))
|
||||
f, err := os.Open(filepath.Join("..", "testdata", "testdata", "llama3.2", "encoder.json"))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -81,8 +81,8 @@ func modelHelper(t testing.TB) model.BytePairEncoding {
|
||||
|
||||
merges := make([]string, 0, 1)
|
||||
// Only need vocab for Grammar Test
|
||||
return model.NewBytePairEncoding(
|
||||
&model.Vocabulary{
|
||||
return tokenizer.NewBytePairEncoding(
|
||||
&tokenizer.Vocabulary{
|
||||
Values: tokens,
|
||||
Types: make([]int32, len(vocab)),
|
||||
Merges: merges,
|
||||
|
||||
@@ -5,9 +5,13 @@ import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"math/rand"
|
||||
"os"
|
||||
"os/exec"
|
||||
"reflect"
|
||||
"slices"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
@@ -22,6 +26,7 @@ import (
|
||||
"github.com/ollama/ollama/ml"
|
||||
"github.com/ollama/ollama/types/model"
|
||||
"github.com/ollama/ollama/x/imagegen"
|
||||
"github.com/ollama/ollama/x/mlxrunner"
|
||||
)
|
||||
|
||||
type LlmRequest struct {
|
||||
@@ -195,9 +200,8 @@ func (s *Scheduler) processPending(ctx context.Context) {
|
||||
slog.Debug("updating default concurrency", "OLLAMA_MAX_LOADED_MODELS", maxRunners, "gpu_count", len(gpus))
|
||||
}
|
||||
|
||||
// Check for image generation model before attempting GGML load
|
||||
if slices.Contains(pending.model.Config.Capabilities, "image") {
|
||||
if s.loadImageGen(pending) {
|
||||
if pending.model.Config.ModelFormat == "safetensors" {
|
||||
if s.loadSafetensors(pending) {
|
||||
break
|
||||
}
|
||||
continue
|
||||
@@ -552,6 +556,74 @@ iGPUScan:
|
||||
return false
|
||||
}
|
||||
|
||||
func subproc(args, environ []string) (*exec.Cmd, int, error) {
|
||||
exe, err := os.Executable()
|
||||
if err != nil {
|
||||
return nil, 0, fmt.Errorf("unable to lookup executable path: %w", err)
|
||||
}
|
||||
|
||||
for range 3 {
|
||||
// get a random port in the ephemeral range
|
||||
port := rand.Intn(65535-49152) + 49152
|
||||
cmd := exec.Command(exe, slices.Concat([]string{"runner"}, args, []string{"--port", strconv.Itoa(port)})...)
|
||||
cmd.Env = slices.Concat(os.Environ(), environ)
|
||||
cmd.Stdout = os.Stderr
|
||||
cmd.Stderr = os.Stderr
|
||||
if err := cmd.Start(); err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
return cmd, port, nil
|
||||
}
|
||||
|
||||
return nil, 0, fmt.Errorf("unable to start subprocess after multiple attempts")
|
||||
}
|
||||
|
||||
func (s *Scheduler) loadSafetensors(req *LlmRequest) bool {
|
||||
if slices.Contains(req.model.Config.Capabilities, "image") {
|
||||
return s.loadImageGen(req)
|
||||
}
|
||||
|
||||
args := []string{"--mlx-engine", "--model", req.model.ShortName}
|
||||
environ := []string{}
|
||||
cmd, port, err := subproc(args, environ)
|
||||
if err != nil {
|
||||
req.errCh <- fmt.Errorf("failed to start mlx subprocess: %w", err)
|
||||
return true
|
||||
}
|
||||
|
||||
sessionDuration := envconfig.KeepAlive()
|
||||
if req.sessionDuration != nil {
|
||||
sessionDuration = req.sessionDuration.Duration
|
||||
}
|
||||
|
||||
runner := &runnerRef{
|
||||
model: req.model,
|
||||
modelPath: req.model.ModelPath,
|
||||
Options: &req.opts,
|
||||
loading: false,
|
||||
sessionDuration: sessionDuration,
|
||||
llama: &mlxrunner.Client{
|
||||
Cmd: cmd,
|
||||
Port: port,
|
||||
},
|
||||
}
|
||||
|
||||
s.loadedMu.Lock()
|
||||
s.loaded[req.model.ModelPath] = runner
|
||||
s.loadedMu.Unlock()
|
||||
|
||||
runner.refMu.Lock()
|
||||
if sessionDuration > 0 {
|
||||
runner.expireTimer = time.AfterFunc(sessionDuration, func() {
|
||||
s.expiredCh <- runner
|
||||
})
|
||||
}
|
||||
runner.refMu.Unlock()
|
||||
req.useLoadedRunner(runner, s.finishedReqCh)
|
||||
return true
|
||||
}
|
||||
|
||||
// loadImageGen loads an image generation model.
|
||||
func (s *Scheduler) loadImageGen(req *LlmRequest) bool {
|
||||
// Use model name for imagegen (it resolves manifests by name, not file path)
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
package model
|
||||
package tokenizer
|
||||
|
||||
import (
|
||||
"cmp"
|
||||
@@ -13,24 +13,24 @@ import (
|
||||
"github.com/ollama/ollama/logutil"
|
||||
)
|
||||
|
||||
type BytePairEncoding struct {
|
||||
type bytePairEncoding struct {
|
||||
vocab *Vocabulary
|
||||
regexps []*regexp2.Regexp
|
||||
}
|
||||
|
||||
var _ TextProcessor = (*BytePairEncoding)(nil)
|
||||
var _ Tokenizer = (*bytePairEncoding)(nil)
|
||||
|
||||
func NewBytePairEncoding(vocab *Vocabulary, pretokenizers ...string) BytePairEncoding {
|
||||
if len(pretokenizers) == 0 {
|
||||
func NewBytePairEncoding(vocab *Vocabulary, pretokenizer ...string) bytePairEncoding {
|
||||
if len(pretokenizer) == 0 {
|
||||
// set default byte-level pretokenizer if none provided, e.g.
|
||||
// https://github.com/huggingface/tokenizers/blob/main/tokenizers/src/pre_tokenizers/byte_level.rs#L44
|
||||
pretokenizers = []string{`'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+`}
|
||||
// https://github.com/huggingface/tokenizer/blob/main/tokenizer/src/pre_tokenizer/byte_level.rs#L44
|
||||
pretokenizer = []string{`'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+`}
|
||||
}
|
||||
|
||||
return BytePairEncoding{
|
||||
return bytePairEncoding{
|
||||
vocab: vocab,
|
||||
regexps: slices.Collect(func(yield func(*regexp2.Regexp) bool) {
|
||||
for _, p := range pretokenizers {
|
||||
for _, p := range pretokenizer {
|
||||
if !yield(regexp2.MustCompile(p, regexp2.RE2)) {
|
||||
return
|
||||
}
|
||||
@@ -39,15 +39,15 @@ func NewBytePairEncoding(vocab *Vocabulary, pretokenizers ...string) BytePairEnc
|
||||
}
|
||||
}
|
||||
|
||||
func (bpe BytePairEncoding) Vocabulary() *Vocabulary {
|
||||
func (bpe bytePairEncoding) Vocabulary() *Vocabulary {
|
||||
return bpe.vocab
|
||||
}
|
||||
|
||||
func (bpe BytePairEncoding) Is(id int32, special Special) bool {
|
||||
func (bpe bytePairEncoding) Is(id int32, special Special) bool {
|
||||
return bpe.vocab.Is(id, special)
|
||||
}
|
||||
|
||||
func (bpe *BytePairEncoding) split(s string) iter.Seq[string] {
|
||||
func (bpe *bytePairEncoding) split(s string) iter.Seq[string] {
|
||||
parts := []string{s}
|
||||
for _, re := range bpe.regexps {
|
||||
parts = slices.Collect(func(yield func(string) bool) {
|
||||
@@ -98,7 +98,7 @@ type merge struct {
|
||||
runes []rune
|
||||
}
|
||||
|
||||
func (bpe BytePairEncoding) Encode(s string, addSpecial bool) ([]int32, error) {
|
||||
func (bpe bytePairEncoding) Encode(s string, addSpecial bool) ([]int32, error) {
|
||||
fragments := []fragment{{value: s}}
|
||||
for _, special := range bpe.vocab.SpecialVocabulary() {
|
||||
// TODO: process special tokens concurrently
|
||||
@@ -253,7 +253,7 @@ func (l lazyIdsString) LogValue() slog.Value {
|
||||
return slog.AnyValue(fmt.Sprint(l.ids))
|
||||
}
|
||||
|
||||
func (bpe BytePairEncoding) Decode(ids []int32) (string, error) {
|
||||
func (bpe bytePairEncoding) Decode(ids []int32) (string, error) {
|
||||
var sb strings.Builder
|
||||
for _, id := range ids {
|
||||
for _, r := range bpe.vocab.Decode(id) {
|
||||
@@ -1,4 +1,4 @@
|
||||
package model
|
||||
package tokenizer
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
@@ -14,10 +14,10 @@ import (
|
||||
"github.com/google/go-cmp/cmp"
|
||||
)
|
||||
|
||||
func llama(t testing.TB) BytePairEncoding {
|
||||
func llama(t testing.TB) bytePairEncoding {
|
||||
t.Helper()
|
||||
|
||||
f, err := os.Open(filepath.Join("testdata", "llama3.2", "encoder.json"))
|
||||
f, err := os.Open(filepath.FromSlash("testdata/llama3.2/encoder.json"))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -43,7 +43,7 @@ func llama(t testing.TB) BytePairEncoding {
|
||||
}
|
||||
}
|
||||
|
||||
f, err = os.Open(filepath.Join("testdata", "llama3.2", "vocab.bpe"))
|
||||
f, err = os.Open(filepath.FromSlash("testdata/llama3.2/vocab.bpe"))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -1,4 +1,4 @@
|
||||
package model
|
||||
package tokenizer
|
||||
|
||||
import (
|
||||
"container/heap"
|
||||
@@ -12,18 +12,18 @@ import (
|
||||
|
||||
const spmWhitespaceSep = "▁"
|
||||
|
||||
type SentencePiece struct {
|
||||
type sentencePiece struct {
|
||||
maxTokenLen int
|
||||
vocab *Vocabulary
|
||||
}
|
||||
|
||||
var _ TextProcessor = (*SentencePiece)(nil)
|
||||
var _ Tokenizer = (*sentencePiece)(nil)
|
||||
|
||||
func (spm SentencePiece) Vocabulary() *Vocabulary {
|
||||
func (spm sentencePiece) Vocabulary() *Vocabulary {
|
||||
return spm.vocab
|
||||
}
|
||||
|
||||
func NewSentencePiece(vocab *Vocabulary) SentencePiece {
|
||||
func NewSentencePiece(vocab *Vocabulary) sentencePiece {
|
||||
logutil.Trace("Tokens", "num tokens", len(vocab.Values), "vals", vocab.Values[:5], "scores", vocab.Scores[:5], "types", vocab.Types[:5])
|
||||
|
||||
counter := map[int]int{}
|
||||
@@ -42,17 +42,17 @@ func NewSentencePiece(vocab *Vocabulary) SentencePiece {
|
||||
"user defined", counter[TOKEN_TYPE_USER_DEFINED], "unused", counter[TOKEN_TYPE_UNUSED], "byte", counter[TOKEN_TYPE_BYTE],
|
||||
"max token len", maxTokenLen)
|
||||
|
||||
return SentencePiece{
|
||||
return sentencePiece{
|
||||
maxTokenLen: maxTokenLen,
|
||||
vocab: vocab,
|
||||
}
|
||||
}
|
||||
|
||||
func (spm SentencePiece) Is(id int32, special Special) bool {
|
||||
func (spm sentencePiece) Is(id int32, special Special) bool {
|
||||
return spm.vocab.Is(id, special)
|
||||
}
|
||||
|
||||
func (spm SentencePiece) Encode(s string, addSpecial bool) ([]int32, error) {
|
||||
func (spm sentencePiece) Encode(s string, addSpecial bool) ([]int32, error) {
|
||||
fragments := []fragment{{value: s}}
|
||||
for _, special := range spm.vocab.SpecialVocabulary() {
|
||||
id := spm.vocab.Encode(special)
|
||||
@@ -218,13 +218,13 @@ func (q *queue) Pop() interface{} {
|
||||
return item
|
||||
}
|
||||
|
||||
func (spm SentencePiece) Decode(ids []int32) (string, error) {
|
||||
func (spm sentencePiece) Decode(ids []int32) (string, error) {
|
||||
var sb strings.Builder
|
||||
for _, id := range ids {
|
||||
data := spm.vocab.Decode(id)
|
||||
data = strings.ReplaceAll(data, spmWhitespaceSep, " ")
|
||||
|
||||
// For tokenizers that use byte tokens like "<0xEA>"
|
||||
// For tokenizer that use byte tokens like "<0xEA>"
|
||||
// convert them to the partial unicode character
|
||||
// so they are buffered correctly by the runner instead
|
||||
// of being sent back to the api as "<0xEA>"
|
||||
@@ -1,4 +1,4 @@
|
||||
package model
|
||||
package tokenizer
|
||||
|
||||
import (
|
||||
"log/slog"
|
||||
@@ -12,10 +12,10 @@ import (
|
||||
"github.com/ollama/ollama/convert/sentencepiece"
|
||||
)
|
||||
|
||||
func loadSentencePieceVocab(t *testing.T) SentencePiece {
|
||||
func loadSentencePieceVocab(t *testing.T) sentencePiece {
|
||||
t.Helper()
|
||||
|
||||
bts, err := os.ReadFile(filepath.Join("testdata", "gemma2", "tokenizer.model"))
|
||||
bts, err := os.ReadFile(filepath.FromSlash("testdata/gemma2/tokenizer.model"))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
310
tokenizer/tokenizer.go
Normal file
310
tokenizer/tokenizer.go
Normal file
@@ -0,0 +1,310 @@
|
||||
package tokenizer
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"io"
|
||||
"os"
|
||||
|
||||
"github.com/ollama/ollama/types/model"
|
||||
)
|
||||
|
||||
const (
|
||||
TOKEN_TYPE_NORMAL = iota + 1
|
||||
TOKEN_TYPE_UNKNOWN
|
||||
TOKEN_TYPE_CONTROL
|
||||
TOKEN_TYPE_USER_DEFINED
|
||||
TOKEN_TYPE_UNUSED
|
||||
TOKEN_TYPE_BYTE
|
||||
)
|
||||
|
||||
type Tokenizer interface {
|
||||
Encode(s string, addSpecial bool) ([]int32, error)
|
||||
Decode([]int32) (string, error)
|
||||
Is(int32, Special) bool
|
||||
Vocabulary() *Vocabulary
|
||||
}
|
||||
|
||||
func New(root *model.Root) (Tokenizer, error) {
|
||||
f, err := root.Open("tokenizer.json")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
var tokenizer struct {
|
||||
Model struct {
|
||||
Type string `json:"type"`
|
||||
Vocab map[string]int32 `json:"vocab"`
|
||||
Merges json.RawMessage `json:"merges"`
|
||||
} `json:"model"`
|
||||
|
||||
PreTokenizer json.RawMessage `json:"pre_tokenizer"`
|
||||
Decoder json.RawMessage `json:"decoder"`
|
||||
|
||||
AddedTokens []struct {
|
||||
ID int32 `json:"id"`
|
||||
Content string `json:"content"`
|
||||
Special bool `json:"special"`
|
||||
} `json:"added_tokens"`
|
||||
}
|
||||
|
||||
if err := json.NewDecoder(f).Decode(&tokenizer); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
special := make(map[int32]struct{})
|
||||
for _, token := range tokenizer.AddedTokens {
|
||||
tokenizer.Model.Vocab[token.Content] = token.ID
|
||||
special[token.ID] = struct{}{}
|
||||
}
|
||||
|
||||
vocab, err := specialTokens(root, tokenizer.Model.Vocab)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
vocab.Values = make([]string, len(tokenizer.Model.Vocab))
|
||||
vocab.Scores = make([]float32, len(tokenizer.Model.Vocab))
|
||||
vocab.Types = make([]int32, len(tokenizer.Model.Vocab))
|
||||
for content, id := range tokenizer.Model.Vocab {
|
||||
vocab.Values[id] = content
|
||||
vocab.Scores[id] = float32(id)
|
||||
vocab.Types[id] = TOKEN_TYPE_NORMAL
|
||||
if _, ok := special[id]; ok {
|
||||
vocab.Types[id] = TOKEN_TYPE_USER_DEFINED
|
||||
}
|
||||
}
|
||||
|
||||
if tokenizer.Model.Merges != nil {
|
||||
var pairs [][]string
|
||||
if err := json.Unmarshal(tokenizer.Model.Merges, &pairs); err == nil {
|
||||
vocab.Merges = make([]string, len(pairs))
|
||||
for i, pair := range pairs {
|
||||
vocab.Merges[i] = pair[0] + " " + pair[1]
|
||||
}
|
||||
} else if err := json.Unmarshal(tokenizer.Model.Merges, &vocab.Merges); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
vocab.valuesOnce.Do(func() {})
|
||||
vocab.values = tokenizer.Model.Vocab
|
||||
|
||||
if tokenizer.Model.Type == "WordPiece" {
|
||||
return NewWordPiece(vocab, true), nil
|
||||
}
|
||||
|
||||
if tokenizer.Decoder != nil {
|
||||
var decoder struct {
|
||||
Type string `json:"type"`
|
||||
Decoders []struct {
|
||||
Type string `json:"type"`
|
||||
Pattern struct {
|
||||
String string `json:"string"`
|
||||
} `json:"pattern"`
|
||||
} `json:"decoders"`
|
||||
}
|
||||
|
||||
if err := json.Unmarshal(tokenizer.Decoder, &decoder); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if decoder.Type == "Sequence" {
|
||||
for _, d := range decoder.Decoders {
|
||||
if d.Type == "Replace" && d.Pattern.String == "▁" {
|
||||
return NewSentencePiece(vocab), nil
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var pretokenizers []string
|
||||
if tokenizer.PreTokenizer != nil {
|
||||
var pretokenizer struct {
|
||||
Type string `json:"type"`
|
||||
Pretokenizers []struct {
|
||||
Type string `json:"type"`
|
||||
Pattern struct {
|
||||
Regex string
|
||||
} `json:"pattern"`
|
||||
IndividualDigits bool `json:"individual_digits"`
|
||||
}
|
||||
}
|
||||
|
||||
if err := json.Unmarshal(tokenizer.PreTokenizer, &pretokenizer); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if pretokenizer.Type == "Sequence" {
|
||||
for _, pretokenizer := range pretokenizer.Pretokenizers {
|
||||
switch pretokenizer.Type {
|
||||
case "Digits":
|
||||
if pretokenizer.IndividualDigits {
|
||||
pretokenizers = append(pretokenizers, `\d`)
|
||||
} else {
|
||||
pretokenizers = append(pretokenizers, `\d+`)
|
||||
}
|
||||
case "Punctuation":
|
||||
pretokenizers = append(pretokenizers, `[^\p{L}\p{N}]+`)
|
||||
case "Split":
|
||||
pretokenizers = append(pretokenizers, pretokenizer.Pattern.Regex)
|
||||
case "WhitespaceSplit":
|
||||
pretokenizers = append(pretokenizers, `\s+(?!\S)|\s+`)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return NewBytePairEncoding(vocab, pretokenizers...), nil
|
||||
}
|
||||
|
||||
// valueOrValues is a type that can unmarshal from either a single value or an array of values.
|
||||
type valueOrValues[E any] []E
|
||||
|
||||
func (m *valueOrValues[E]) UnmarshalJSON(data []byte) error {
|
||||
var s []E
|
||||
if err := json.Unmarshal(data, &s); err != nil {
|
||||
var e E
|
||||
if err := json.Unmarshal(data, &e); err != nil {
|
||||
return err
|
||||
}
|
||||
s = []E{e}
|
||||
}
|
||||
*m = valueOrValues[E](s)
|
||||
return nil
|
||||
}
|
||||
|
||||
type specialTokenIDs struct {
|
||||
BOSTokenID valueOrValues[int32] `json:"bos_token_id"`
|
||||
EOSTokenID valueOrValues[int32] `json:"eos_token_id"`
|
||||
}
|
||||
|
||||
// stringOrContent is a type that can unmarshal from either a string or an object with a "content" field.
|
||||
type stringOrContent string
|
||||
|
||||
func (t *stringOrContent) UnmarshalJSON(data []byte) error {
|
||||
var s string
|
||||
if err := json.Unmarshal(data, &s); err != nil {
|
||||
var m map[string]any
|
||||
if err := json.Unmarshal(data, &m); err != nil {
|
||||
return err
|
||||
}
|
||||
if content, ok := m["content"].(string); ok {
|
||||
s = content
|
||||
}
|
||||
}
|
||||
*t = stringOrContent(s)
|
||||
return nil
|
||||
}
|
||||
|
||||
func specialTokens(root *model.Root, values map[string]int32) (*Vocabulary, error) {
|
||||
var vocab Vocabulary
|
||||
for _, c := range []struct {
|
||||
name string
|
||||
fn func(io.Reader) error
|
||||
}{
|
||||
{
|
||||
name: "generation_config.json",
|
||||
fn: func(r io.Reader) error {
|
||||
var c specialTokenIDs
|
||||
if err := json.NewDecoder(r).Decode(&c); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
vocab.BOS = c.BOSTokenID
|
||||
vocab.EOS = c.EOSTokenID
|
||||
return nil
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "config.json",
|
||||
fn: func(r io.Reader) error {
|
||||
var c specialTokenIDs
|
||||
if err := json.NewDecoder(r).Decode(&c); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if len(vocab.BOS) == 0 {
|
||||
vocab.BOS = c.BOSTokenID
|
||||
}
|
||||
|
||||
if len(vocab.EOS) == 0 {
|
||||
vocab.EOS = c.EOSTokenID
|
||||
}
|
||||
|
||||
return nil
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "tokenizer_config.json",
|
||||
fn: func(r io.Reader) error {
|
||||
var c struct {
|
||||
BOSToken stringOrContent `json:"bos_token"`
|
||||
EOSToken stringOrContent `json:"eos_token"`
|
||||
PADToken stringOrContent `json:"pad_token"`
|
||||
AddBOSToken bool `json:"add_bos_token"`
|
||||
AddEOSToken bool `json:"add_eos_token"`
|
||||
}
|
||||
if err := json.NewDecoder(r).Decode(&c); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if len(vocab.BOS) == 0 && c.BOSToken != "" {
|
||||
if id, ok := values[string(c.BOSToken)]; ok {
|
||||
vocab.BOS = []int32{id}
|
||||
}
|
||||
}
|
||||
|
||||
if len(vocab.EOS) == 0 && c.EOSToken != "" {
|
||||
if id, ok := values[string(c.EOSToken)]; ok {
|
||||
vocab.EOS = []int32{id}
|
||||
}
|
||||
}
|
||||
|
||||
vocab.AddBOS = c.AddBOSToken
|
||||
vocab.AddEOS = c.AddEOSToken
|
||||
return nil
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "special_tokens_map.json",
|
||||
fn: func(r io.Reader) error {
|
||||
var c map[string]stringOrContent
|
||||
if err := json.NewDecoder(r).Decode(&c); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if bos, ok := c["bos_token"]; ok && len(vocab.BOS) == 0 {
|
||||
if id, ok := values[string(bos)]; ok {
|
||||
vocab.BOS = []int32{id}
|
||||
}
|
||||
}
|
||||
|
||||
if eos, ok := c["eos_token"]; ok && len(vocab.EOS) == 0 {
|
||||
if id, ok := values[string(eos)]; ok {
|
||||
vocab.EOS = []int32{id}
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
},
|
||||
},
|
||||
} {
|
||||
if err := func() error {
|
||||
f, err := root.Open(c.name)
|
||||
if errors.Is(err, os.ErrNotExist) {
|
||||
return nil
|
||||
} else if err != nil {
|
||||
return err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
return c.fn(f)
|
||||
}(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
return &vocab, nil
|
||||
}
|
||||
@@ -1,4 +1,4 @@
|
||||
package model
|
||||
package tokenizer
|
||||
|
||||
import (
|
||||
"log/slog"
|
||||
@@ -1,4 +1,4 @@
|
||||
package model
|
||||
package tokenizer
|
||||
|
||||
import (
|
||||
"testing"
|
||||
@@ -1,4 +1,4 @@
|
||||
package model
|
||||
package tokenizer
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
@@ -9,7 +9,7 @@ import (
|
||||
"github.com/ollama/ollama/logutil"
|
||||
)
|
||||
|
||||
type WordPiece struct {
|
||||
type wordPiece struct {
|
||||
vocab *Vocabulary
|
||||
lowercase bool
|
||||
}
|
||||
@@ -32,8 +32,8 @@ var wordPieceReplacer = strings.NewReplacer(
|
||||
" 're", "'re",
|
||||
)
|
||||
|
||||
// Decode implements TextProcessor.
|
||||
func (wpm WordPiece) Decode(ids []int32) (string, error) {
|
||||
// Decode implements Tokenizer.
|
||||
func (wpm wordPiece) Decode(ids []int32) (string, error) {
|
||||
var sb strings.Builder
|
||||
for i, id := range ids {
|
||||
if id < 0 || int(id) >= len(wpm.vocab.Values) {
|
||||
@@ -56,7 +56,7 @@ func (wpm WordPiece) Decode(ids []int32) (string, error) {
|
||||
|
||||
// words splits a string into words, treating CJK characters as separate words.
|
||||
// TODO: this is specifically for BERT and may need to be adjusted or refactored for other models.
|
||||
func (wpm WordPiece) words(s string) iter.Seq[string] {
|
||||
func (wpm wordPiece) words(s string) iter.Seq[string] {
|
||||
return func(yield func(string) bool) {
|
||||
runes := make([]rune, 0, len(s)*3)
|
||||
for _, r := range s {
|
||||
@@ -96,8 +96,8 @@ func (wpm WordPiece) words(s string) iter.Seq[string] {
|
||||
}
|
||||
}
|
||||
|
||||
// Encode implements TextProcessor.
|
||||
func (wpm WordPiece) Encode(s string, addSpecial bool) ([]int32, error) {
|
||||
// Encode implements Tokenizer.
|
||||
func (wpm wordPiece) Encode(s string, addSpecial bool) ([]int32, error) {
|
||||
var ids []int32
|
||||
|
||||
// TODO: use [UNK] from config
|
||||
@@ -151,20 +151,20 @@ func (wpm WordPiece) Encode(s string, addSpecial bool) ([]int32, error) {
|
||||
return ids, nil
|
||||
}
|
||||
|
||||
// Is implements TextProcessor.
|
||||
func (wpm WordPiece) Is(id int32, special Special) bool {
|
||||
// Is implements Tokenizer.
|
||||
func (wpm wordPiece) Is(id int32, special Special) bool {
|
||||
return wpm.vocab.Is(id, special)
|
||||
}
|
||||
|
||||
// Vocabulary implements TextProcessor.
|
||||
func (wpm WordPiece) Vocabulary() *Vocabulary {
|
||||
// Vocabulary implements Tokenizer.
|
||||
func (wpm wordPiece) Vocabulary() *Vocabulary {
|
||||
return wpm.vocab
|
||||
}
|
||||
|
||||
var _ TextProcessor = (*WordPiece)(nil)
|
||||
var _ Tokenizer = (*wordPiece)(nil)
|
||||
|
||||
func NewWordPiece(vocab *Vocabulary, lowercase bool) WordPiece {
|
||||
return WordPiece{
|
||||
func NewWordPiece(vocab *Vocabulary, lowercase bool) wordPiece {
|
||||
return wordPiece{
|
||||
vocab: vocab,
|
||||
lowercase: lowercase,
|
||||
}
|
||||
@@ -1,4 +1,4 @@
|
||||
package model
|
||||
package tokenizer
|
||||
|
||||
import (
|
||||
"slices"
|
||||
@@ -39,7 +39,7 @@ func TestWordPiece(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestWordPieceWords(t *testing.T) {
|
||||
var wpm WordPiece
|
||||
var wpm wordPiece
|
||||
|
||||
basic := slices.Collect(wpm.words("Hey friend! How are you?!?"))
|
||||
if diff := cmp.Diff([]string{"Hey", "friend", "!", "How", "are", "you", "?", "!", "?"}, basic); diff != "" {
|
||||
309
types/model/file.go
Normal file
309
types/model/file.go
Normal file
@@ -0,0 +1,309 @@
|
||||
package model
|
||||
|
||||
import (
|
||||
"crypto/sha256"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"hash"
|
||||
"io"
|
||||
"io/fs"
|
||||
"iter"
|
||||
"maps"
|
||||
"mime"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"github.com/ollama/ollama/envconfig"
|
||||
)
|
||||
|
||||
func root() (*os.Root, error) {
|
||||
root, err := os.OpenRoot(envconfig.Models())
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for _, sub := range []string{"manifests", "blobs"} {
|
||||
if _, err := root.Stat(sub); errors.Is(err, fs.ErrNotExist) {
|
||||
if err := root.MkdirAll(sub, 0o750); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
} else if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
return root, nil
|
||||
}
|
||||
|
||||
// Open opens an existing file for reading. It will return [fs.ErrNotExist]
|
||||
// if the file does not exist. The returned [*Root] can only be used for reading.
|
||||
// It is the caller's responsibility to close the file when done.
|
||||
func Open(n Name) (*Root, error) {
|
||||
r, err := root()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
f, err := r.Open(filepath.Join("manifests", n.Filepath()))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
var m manifest
|
||||
if err := json.NewDecoder(f).Decode(&m); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
blobs := make(map[string]*blob, len(m.Layers)+1)
|
||||
blobs[NamePrefix] = m.Config
|
||||
for _, layer := range m.Layers {
|
||||
if layer.Name == "" && layer.MediaType != "" {
|
||||
mediatype, _, err := mime.ParseMediaType(layer.MediaType)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if suffix, ok := strings.CutPrefix(mediatype, MediaTypePrefix); ok {
|
||||
layer.Name = NamePrefix + suffix
|
||||
}
|
||||
}
|
||||
|
||||
blobs[layer.Name] = layer
|
||||
}
|
||||
|
||||
return &Root{
|
||||
root: r,
|
||||
name: n,
|
||||
blobs: blobs,
|
||||
flags: os.O_RDONLY,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Create creates a new file. The returned [Root] can be used for both reading
|
||||
// and writing. It is the caller's responsibility to close the file when done
|
||||
// in order to finalize any new blobs and write the manifest.
|
||||
func Create(n Name) (*Root, error) {
|
||||
r, err := root()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return &Root{
|
||||
root: r,
|
||||
name: n,
|
||||
blobs: make(map[string]*blob),
|
||||
flags: os.O_RDWR,
|
||||
}, nil
|
||||
}
|
||||
|
||||
type blob struct {
|
||||
Digest string `json:"digest"`
|
||||
MediaType string `json:"mediaType"`
|
||||
Name string `json:"name,omitempty"`
|
||||
Size int64 `json:"size"`
|
||||
|
||||
// tempfile is the temporary file where the blob data is written.
|
||||
tempfile *os.File
|
||||
|
||||
// hash is the hash.Hash used to compute the blob digest.
|
||||
hash hash.Hash
|
||||
}
|
||||
|
||||
func (b *blob) Write(p []byte) (int, error) {
|
||||
return io.MultiWriter(b.tempfile, b.hash).Write(p)
|
||||
}
|
||||
|
||||
func (b *blob) Filepath() string {
|
||||
return strings.ReplaceAll(b.Digest, ":", "-")
|
||||
}
|
||||
|
||||
type manifest struct {
|
||||
SchemaVersion int `json:"schemaVersion"`
|
||||
MediaType string `json:"mediaType"`
|
||||
Config *blob `json:"config"`
|
||||
Layers []*blob `json:"layers"`
|
||||
}
|
||||
|
||||
// Root represents a model file. It can be used to read and write blobs
|
||||
// associated with the model.
|
||||
//
|
||||
// Blobs are identified by name. Certain names are special and reserved;
|
||||
// see [NamePrefix] for details.
|
||||
type Root struct {
|
||||
root *os.Root
|
||||
name Name
|
||||
blobs map[string]*blob
|
||||
flags int
|
||||
}
|
||||
|
||||
const MediaTypePrefix = "application/vnd.ollama"
|
||||
|
||||
// NamePrefix is the prefix used for identifying special names. Names
|
||||
// with this prefix are idenfitied by their media types:
|
||||
//
|
||||
// - name: NamePrefix + suffix
|
||||
// - mediaType: [MediaTypePrefix] + suffix
|
||||
//
|
||||
// For example:
|
||||
//
|
||||
// - name: "./..image.model"
|
||||
// - mediaType: "application/vnd.ollama.image.model"
|
||||
//
|
||||
// NamePrefix by itself identifies the manifest config.
|
||||
const NamePrefix = "./."
|
||||
|
||||
// Open opens the named blob for reading. It is the caller's responsibility
|
||||
// to close the returned [io.ReadCloser] when done. It will return
|
||||
// [fs.ErrNotExist] if the blob does not exist.
|
||||
func (r Root) Open(name string) (io.ReadCloser, error) {
|
||||
if b, ok := r.blobs[name]; ok {
|
||||
r, err := r.root.Open(filepath.Join("blobs", b.Filepath()))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return r, nil
|
||||
}
|
||||
|
||||
return nil, fs.ErrNotExist
|
||||
}
|
||||
|
||||
func (r Root) ReadFile(name string) ([]byte, error) {
|
||||
f, err := r.Open(name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
return io.ReadAll(f)
|
||||
}
|
||||
|
||||
// Create creates or replaces a named blob in the file. If the blob already
|
||||
// exists, it will be overwritten. It will return [fs.ErrInvalid] if the file
|
||||
// was opened in read-only mode. The returned [io.Writer] can be used to write
|
||||
// to the blob and does not need be closed, but the file must be closed to
|
||||
// finalize the blob.
|
||||
func (r *Root) Create(name string) (io.Writer, error) {
|
||||
if r.flags&os.O_RDWR != 0 {
|
||||
w, err := os.CreateTemp(r.root.Name(), "")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
r.blobs[name] = &blob{Name: name, tempfile: w, hash: sha256.New()}
|
||||
return r.blobs[name], nil
|
||||
}
|
||||
|
||||
return nil, fs.ErrInvalid
|
||||
}
|
||||
|
||||
// Close closes the file. If the file was opened in read-write mode, it
|
||||
// will finalize any writeable blobs and write the manifest.
|
||||
func (r *Root) Close() error {
|
||||
if r.flags&os.O_RDWR != 0 {
|
||||
for _, b := range r.blobs {
|
||||
if b.tempfile != nil {
|
||||
fi, err := b.tempfile.Stat()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if err := b.tempfile.Close(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
b.Size = fi.Size()
|
||||
b.Digest = fmt.Sprintf("sha256:%x", b.hash.Sum(nil))
|
||||
|
||||
if suffix, ok := strings.CutPrefix(b.Name, NamePrefix); ok {
|
||||
if b.Name == NamePrefix {
|
||||
b.MediaType = "application/vnd.docker.container.image.v1+json"
|
||||
} else {
|
||||
b.MediaType = MediaTypePrefix + suffix
|
||||
}
|
||||
b.Name = ""
|
||||
}
|
||||
|
||||
rel, err := filepath.Rel(r.root.Name(), b.tempfile.Name())
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if err := r.root.Rename(rel, filepath.Join("blobs", b.Filepath())); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
p := filepath.Join("manifests", r.name.Filepath())
|
||||
if _, err := r.root.Stat(filepath.Dir(p)); errors.Is(err, os.ErrNotExist) {
|
||||
if err := r.root.MkdirAll(filepath.Dir(p), 0o750); err != nil {
|
||||
return err
|
||||
}
|
||||
} else if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
f, err := r.root.OpenFile(p, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0o640)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
if err := json.NewEncoder(f).Encode(manifest{
|
||||
SchemaVersion: 2,
|
||||
MediaType: "application/vnd.docker.distribution.manifest.v2+json",
|
||||
Config: r.blobs[NamePrefix],
|
||||
Layers: func() []*blob {
|
||||
blobs := make([]*blob, 0, len(r.blobs))
|
||||
for name, b := range r.blobs {
|
||||
if name != NamePrefix {
|
||||
blobs = append(blobs, b)
|
||||
}
|
||||
}
|
||||
return blobs
|
||||
}(),
|
||||
}); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
return r.root.Close()
|
||||
}
|
||||
|
||||
// Name returns the name of the file.
|
||||
func (r Root) Name() Name {
|
||||
return r.name
|
||||
}
|
||||
|
||||
// Names returns an iterator over the names in the file.
|
||||
func (r Root) Names() iter.Seq[string] {
|
||||
return maps.Keys(r.blobs)
|
||||
}
|
||||
|
||||
// Glob returns an iterator over the names in the file that match the given
|
||||
// pattern.
|
||||
//
|
||||
// The pattern syntax is the same as [filepath.Match]. As with filepath.Match,
|
||||
// the only possible returned error is ErrBadPattern, when pattern is malformed.
|
||||
func (r Root) Glob(pattern string) (iter.Seq[string], error) {
|
||||
if _, err := filepath.Match(pattern, ""); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return func(yield func(string) bool) {
|
||||
for name, blob := range r.blobs {
|
||||
if matched, _ := filepath.Match(pattern, name); matched {
|
||||
if !yield(blob.Filepath()) {
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (r Root) JoinPath(parts ...string) string {
|
||||
return filepath.Join(append([]string{r.root.Name()}, parts...)...)
|
||||
}
|
||||
90
types/model/file_test.go
Normal file
90
types/model/file_test.go
Normal file
@@ -0,0 +1,90 @@
|
||||
package model
|
||||
|
||||
import (
|
||||
"io"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// setup is a helper function to set up the test environment.
|
||||
func setup(t *testing.T, models map[Name]map[string]io.Reader) {
|
||||
t.Setenv("OLLAMA_MODELS", t.TempDir())
|
||||
|
||||
for m, s := range models {
|
||||
f, err := Create(m)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
for n, r := range s {
|
||||
w, err := f.Create(n)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, err := io.Copy(w, r); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
if err := f.Close(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestOpen(t *testing.T) {
|
||||
setup(t, map[Name]map[string]io.Reader{
|
||||
ParseName("namespace/model"): {
|
||||
"./.": strings.NewReader(`{"key":"value"}`),
|
||||
},
|
||||
ParseName("namespace/model:8b"): {
|
||||
"./.": strings.NewReader(`{"foo":"bar"}`),
|
||||
},
|
||||
ParseName("another/model"): {
|
||||
"./.": strings.NewReader(`{"another":"config"}`),
|
||||
},
|
||||
})
|
||||
|
||||
f, err := Open(ParseName("namespace/model"))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
for _, name := range []string{"./."} {
|
||||
r, err := f.Open(name)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, err := io.ReadAll(r); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err := r.Close(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
if err := f.Close(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
t.Run("does not exist", func(t *testing.T) {
|
||||
if _, err := Open(ParseName("namespace/unknown")); err == nil {
|
||||
t.Error("expected error for unknown model")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("write", func(t *testing.T) {
|
||||
f, err := Open(ParseName("namespace/model"))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
if _, err := f.Create("new-blob"); err == nil {
|
||||
t.Error("expected error creating blob in read-only mode")
|
||||
}
|
||||
})
|
||||
}
|
||||
33
types/model/files.go
Normal file
33
types/model/files.go
Normal file
@@ -0,0 +1,33 @@
|
||||
package model
|
||||
|
||||
import (
|
||||
"io/fs"
|
||||
"iter"
|
||||
"path/filepath"
|
||||
)
|
||||
|
||||
func All() (iter.Seq[Name], error) {
|
||||
r, err := root()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
manifests, err := r.OpenRoot("manifests")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
matches, err := fs.Glob(manifests.FS(), "*/*/*/*")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return func(yield func(Name) bool) {
|
||||
for _, match := range matches {
|
||||
name := ParseNameFromFilepath(filepath.ToSlash(match))
|
||||
if !yield(name) {
|
||||
return
|
||||
}
|
||||
}
|
||||
}, nil
|
||||
}
|
||||
@@ -227,6 +227,17 @@ func (n Name) String() string {
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// Set implements [flag.Value]. It parses the provided input as a name string
|
||||
// and sets the receiver to the parsed value. If the parsed name is not valid,
|
||||
// ErrUnqualifiedName is returned.
|
||||
func (n *Name) Set(s string) error {
|
||||
*n = ParseName(s)
|
||||
if !n.IsValid() {
|
||||
return ErrUnqualifiedName
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// DisplayShortest returns a short string version of the name.
|
||||
func (n Name) DisplayShortest() string {
|
||||
var sb strings.Builder
|
||||
|
||||
@@ -266,10 +266,10 @@ func LoadFromBytes(data []byte) (*Tokenizer, error) {
|
||||
|
||||
// TokenizerConfig holds optional configuration data that can be passed to LoadFromBytesWithConfig.
|
||||
type TokenizerConfig struct {
|
||||
TokenizerConfigJSON []byte // tokenizer_config.json content
|
||||
GenerationConfigJSON []byte // generation_config.json content
|
||||
SpecialTokensMapJSON []byte // special_tokens_map.json content
|
||||
ConfigJSON []byte // config.json content
|
||||
TokenizerConfigJSON []byte // tokenizer_config.json content
|
||||
GenerationConfigJSON []byte // generation_config.json content
|
||||
SpecialTokensMapJSON []byte // special_tokens_map.json content
|
||||
ConfigJSON []byte // config.json content
|
||||
}
|
||||
|
||||
// LoadFromBytesWithConfig loads a tokenizer from tokenizer.json bytes with additional config files.
|
||||
@@ -445,7 +445,6 @@ func Load(path string) (*Tokenizer, error) {
|
||||
|
||||
// loadFromTokenizerJSON parses a tokenizer.json file
|
||||
func loadFromTokenizerJSON(data []byte, dir string) (*Tokenizer, error) {
|
||||
|
||||
var raw struct {
|
||||
Model struct {
|
||||
Type string `json:"type"` // "BPE" or "WordPiece"
|
||||
|
||||
@@ -1,77 +0,0 @@
|
||||
package kvcache
|
||||
|
||||
import (
|
||||
"errors"
|
||||
|
||||
"github.com/ollama/ollama/x/ml"
|
||||
"github.com/ollama/ollama/x/model/input"
|
||||
)
|
||||
|
||||
var (
|
||||
ErrKvCacheFull = errors.New("could not find a kv cache slot")
|
||||
ErrNotSupported = errors.New("model does not support operation")
|
||||
)
|
||||
|
||||
type Cache interface {
|
||||
// ** used by model implementations **
|
||||
|
||||
// SetLayer sets the active layer of the cache
|
||||
SetLayer(layer int)
|
||||
|
||||
// Get returns the history of key and value tensors plus a mask
|
||||
//
|
||||
// The shape of the tensors is documented in the specific
|
||||
// cache implementation used.
|
||||
Get(ctx ml.Context) (ml.Tensor, ml.Tensor, ml.Tensor)
|
||||
|
||||
// Put stores a batch of key and value in the cache
|
||||
//
|
||||
// The shape of the tensors is documented in the specific
|
||||
// cache implementation used.
|
||||
Put(ctx ml.Context, key, value ml.Tensor)
|
||||
|
||||
// SetConfig controls optimizations (mostly backend-specific) that may transform
|
||||
// the output of the cache to work better with specific kernels. If not called,
|
||||
// the backend settings will be used. This works well when calling Attention.
|
||||
//
|
||||
// The config can be overridden by models, especially if they require vanilla
|
||||
// output when implementing their own version of attention. To do this, pass
|
||||
// an empty ml.CacheConfig.
|
||||
//
|
||||
// Most models will not need to use this.
|
||||
SetConfig(ml.CacheConfig)
|
||||
|
||||
// ** cache management **
|
||||
|
||||
// Init sets up runtime parameters.
|
||||
// backend: Used to allocate cache data storage and execute management operations (such as defrag)
|
||||
// dtype: The data type for storing cache entries
|
||||
// maxSequences: The maximum number of sequences stored in the cache - across all batches
|
||||
// capacity: The number of cache entries to store, per sequence
|
||||
// maxBatch: The maximum number of tokens that can occur in a single batch
|
||||
Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity, maxBatch int)
|
||||
|
||||
// Close closes the cache and frees resources associated with it
|
||||
Close()
|
||||
|
||||
// StartForward is called before the start of the model's forward pass.
|
||||
// For each token in the coming batch, there must be a corresponding
|
||||
// entry in positions and seqs. reserve is to preallocate memory
|
||||
// without actually storing data in the cache.
|
||||
StartForward(ctx ml.Context, batch input.Batch, reserve bool) error
|
||||
|
||||
// CopyPrefix copies tokens in the range [0, len) from srcSeq to dstSeq
|
||||
CopyPrefix(srcSeq, dstSeq int, len int32)
|
||||
|
||||
// CanResume returns true if the cache can continue with the next token at
|
||||
// the given position and sequence. Assumes that the caller has already
|
||||
// verified the contents of the cache.
|
||||
CanResume(seq int, pos int32) bool
|
||||
|
||||
// Remove deletes tokens in the range [beginIndex, endIndex) from seq. Set
|
||||
// endIndex to math.MaxInt32 to remove everything starting at beginIndex.
|
||||
//
|
||||
// If an error occurs, the entire context for the sequence should be
|
||||
// removed by calling Remove(seq, 0, math.MaxInt32)
|
||||
Remove(seq int, beginIndex, endIndex int32) error
|
||||
}
|
||||
@@ -1,797 +0,0 @@
|
||||
package kvcache
|
||||
|
||||
// import (
|
||||
// "errors"
|
||||
// "fmt"
|
||||
// "log/slog"
|
||||
// "math"
|
||||
// "slices"
|
||||
|
||||
// "github.com/ollama/ollama/ml"
|
||||
// "github.com/ollama/ollama/model/input"
|
||||
// )
|
||||
|
||||
// type shiftFn func(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error)
|
||||
|
||||
// // Causal cache stores K and V tensors according to their position in the
|
||||
// // sequence. Returns the history and a mask for attending to past tokens
|
||||
// //
|
||||
// // The tensors are of shape embed dim, kv heads, batch size
|
||||
// // The mask is of shape history size, batch size
|
||||
// type Causal struct {
|
||||
// DType ml.DType
|
||||
|
||||
// // swaWindowSize is the number of tokens that will be included in the mask
|
||||
// // during attention operations. swaMemorySize is the number of tokens that
|
||||
// // will be retained in memory for partial prefix caching. Set to math.MaxInt32
|
||||
// // for unlimited or if sliding window attention is not being used.
|
||||
// swaWindowSize int32
|
||||
// swaMemorySize int32
|
||||
|
||||
// chunkSize int32
|
||||
|
||||
// opts CausalOptions
|
||||
|
||||
// // maxBatch is the largest batch that we might receive
|
||||
// maxBatch int
|
||||
|
||||
// // config controls mostly backend-specific optimizations
|
||||
// config *ml.CacheConfig
|
||||
|
||||
// // ** current forward pass **
|
||||
|
||||
// // size of the current batch
|
||||
// curBatchSize int
|
||||
|
||||
// // locations for data storage for this batch
|
||||
// curLoc ml.Tensor
|
||||
|
||||
// // mask of the cache as used by this batch
|
||||
// curMask ml.Tensor
|
||||
|
||||
// // the active layer for Get and Put
|
||||
// curLayer int
|
||||
|
||||
// // locations in the cache that are needed for this batch
|
||||
// curCellRange cellRange
|
||||
|
||||
// // curSequences is the sequences corresponding to this pass's entries in the cache
|
||||
// curSequences []int
|
||||
|
||||
// // curPositions is the positions corresponding to this pass's entries in the cache
|
||||
// curPositions []int32
|
||||
|
||||
// // ** cache metadata **
|
||||
|
||||
// // for each possible location in the cache, stores the position and set of sequences
|
||||
// // that reference the data there
|
||||
// cells []cacheCell
|
||||
|
||||
// // maps from sequence to the range of locations where it is stored in the cache
|
||||
// cellRanges map[int]cellRange
|
||||
|
||||
// // ** cache data storage **
|
||||
|
||||
// shiftFn shiftFn
|
||||
// backend ml.Backend
|
||||
// ctxs map[int]ml.Context
|
||||
// keys, values map[int]ml.Tensor
|
||||
|
||||
// kHeadDims, vHeadDims, numKVHeads map[int]int
|
||||
// }
|
||||
|
||||
// type cacheCell struct {
|
||||
// pos int32
|
||||
// sequences []int
|
||||
// }
|
||||
|
||||
// type cellRange struct {
|
||||
// min int
|
||||
// max int
|
||||
// }
|
||||
|
||||
// func NewCausalCache(shift shiftFn) *Causal {
|
||||
// return &Causal{
|
||||
// shiftFn: shift,
|
||||
// ctxs: make(map[int]ml.Context),
|
||||
// keys: make(map[int]ml.Tensor),
|
||||
// values: make(map[int]ml.Tensor),
|
||||
// kHeadDims: make(map[int]int),
|
||||
// vHeadDims: make(map[int]int),
|
||||
// numKVHeads: make(map[int]int),
|
||||
// }
|
||||
// }
|
||||
|
||||
// func NewSWACache(windowSize int32, shift shiftFn) *Causal {
|
||||
// return &Causal{
|
||||
// swaWindowSize: windowSize,
|
||||
// shiftFn: shift,
|
||||
// ctxs: make(map[int]ml.Context),
|
||||
// keys: make(map[int]ml.Tensor),
|
||||
// values: make(map[int]ml.Tensor),
|
||||
// kHeadDims: make(map[int]int),
|
||||
// vHeadDims: make(map[int]int),
|
||||
// numKVHeads: make(map[int]int),
|
||||
// }
|
||||
// }
|
||||
|
||||
// func NewSWAMemCache(windowSize int32, memorySize int32, shift shiftFn) *Causal {
|
||||
// return &Causal{
|
||||
// swaWindowSize: windowSize,
|
||||
// swaMemorySize: memorySize,
|
||||
// shiftFn: shift,
|
||||
// ctxs: make(map[int]ml.Context),
|
||||
// keys: make(map[int]ml.Tensor),
|
||||
// values: make(map[int]ml.Tensor),
|
||||
// kHeadDims: make(map[int]int),
|
||||
// vHeadDims: make(map[int]int),
|
||||
// numKVHeads: make(map[int]int),
|
||||
// }
|
||||
// }
|
||||
|
||||
// func NewChunkedAttentionCache(chunkSize int32, shift shiftFn) *Causal {
|
||||
// return &Causal{
|
||||
// chunkSize: chunkSize,
|
||||
// shiftFn: shift,
|
||||
// ctxs: make(map[int]ml.Context),
|
||||
// keys: make(map[int]ml.Tensor),
|
||||
// values: make(map[int]ml.Tensor),
|
||||
// kHeadDims: make(map[int]int),
|
||||
// vHeadDims: make(map[int]int),
|
||||
// numKVHeads: make(map[int]int),
|
||||
// }
|
||||
// }
|
||||
|
||||
// func (c *Causal) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity, maxBatch int) {
|
||||
// if c.config == nil {
|
||||
// var config ml.CacheConfig
|
||||
// if cc, ok := backend.(ml.BackendCacheConfig); ok {
|
||||
// config = cc.CacheConfig()
|
||||
// }
|
||||
// c.config = &config
|
||||
// }
|
||||
|
||||
// if c.config.CachePadding == 0 {
|
||||
// c.config.CachePadding = 1
|
||||
// }
|
||||
|
||||
// if c.config.MaskBatchPadding == 0 {
|
||||
// c.config.MaskBatchPadding = 1
|
||||
// }
|
||||
|
||||
// // TODO what types do we handle here?
|
||||
// // if c.config.MaskDType == ml.DTypeOther {
|
||||
// // c.config.MaskDType = ml.DTypeFloat32
|
||||
// // }
|
||||
|
||||
// if c.swaWindowSize == 0 {
|
||||
// c.swaWindowSize = math.MaxInt32
|
||||
// }
|
||||
// if c.swaMemorySize == 0 {
|
||||
// c.swaMemorySize = c.swaWindowSize
|
||||
// }
|
||||
// // We will allocate space in the cache for the stop token, which won't be part of a follow on
|
||||
// // sequence, so allocate an extra token of storage to ensure that we can jump back without
|
||||
// // causing a cache break. As an optimization, only do this when we have parallel sequences
|
||||
// // because the extra token will live in the batch buffer and won't get overwritten if we
|
||||
// // only have a single sequence.
|
||||
// if c.swaMemorySize != math.MaxInt32 && maxSequences > 1 {
|
||||
// c.swaMemorySize = max(c.swaMemorySize, c.swaWindowSize+1)
|
||||
// }
|
||||
// if int(c.swaMemorySize) >= capacity {
|
||||
// c.swaMemorySize = math.MaxInt32
|
||||
// }
|
||||
|
||||
// if c.swaMemorySize < c.swaWindowSize {
|
||||
// panic(fmt.Errorf("sliding window memory (%v) must be at least as large as the window (%v)", c.swaMemorySize, c.swaWindowSize))
|
||||
// }
|
||||
|
||||
// var cacheSize int
|
||||
// if c.swaMemorySize == math.MaxInt32 {
|
||||
// cacheSize = maxSequences * capacity
|
||||
// } else {
|
||||
// cacheSize = (maxSequences * int(c.swaMemorySize)) + maxBatch
|
||||
// }
|
||||
// cacheSize = roundUp(cacheSize, c.config.CachePadding)
|
||||
// c.cells = make([]cacheCell, cacheSize)
|
||||
|
||||
// c.DType = dtype
|
||||
// c.cellRanges = make(map[int]cellRange)
|
||||
// c.backend = backend
|
||||
// c.maxBatch = maxBatch
|
||||
// }
|
||||
|
||||
// func (c *Causal) SetConfig(config ml.CacheConfig) {
|
||||
// if c.config != nil {
|
||||
// panic("config cannot be changed after being previously set, either by the model or backend")
|
||||
// }
|
||||
|
||||
// c.config = &config
|
||||
// }
|
||||
|
||||
// func (c *Causal) Close() {
|
||||
// slog.Info("XXX Causal.Close called", "number of contexts", len(c.ctxs))
|
||||
// for _, ctx := range c.ctxs {
|
||||
// ctx.Close()
|
||||
// }
|
||||
// }
|
||||
|
||||
// func (c *Causal) StartForward(ctx ml.Context, batch input.Batch, reserve bool) error {
|
||||
// slog.Info("XXX Causal.StartForward", "cell count", len(c.cells), "prior batch size", c.curBatchSize, "positions", len(batch.Positions), "reserve", reserve, "batch", batch)
|
||||
// // panic("XXX Causal.StartForward")
|
||||
// c.curBatchSize = len(batch.Positions)
|
||||
// c.curSequences = batch.Sequences
|
||||
// c.curPositions = batch.Positions
|
||||
// c.opts.Except = nil
|
||||
|
||||
// var locs []int32
|
||||
// if !reserve {
|
||||
// c.updateSlidingWindow()
|
||||
|
||||
// var err error
|
||||
// locs, err = c.findLocs()
|
||||
// if err != nil {
|
||||
// return err
|
||||
// }
|
||||
// slog.Info("XXX Causal.StartForward", "findLocs len", len(locs))
|
||||
|
||||
// for i, pos := range batch.Positions {
|
||||
// seq := batch.Sequences[i]
|
||||
// loc := int(locs[i])
|
||||
|
||||
// c.cells[loc] = cacheCell{pos: pos, sequences: []int{seq}}
|
||||
|
||||
// seqRange, ok := c.cellRanges[seq]
|
||||
// if !ok {
|
||||
// seqRange = newRange()
|
||||
// }
|
||||
|
||||
// seqRange.min = min(seqRange.min, loc)
|
||||
// c.curCellRange.min = min(c.curCellRange.min, loc)
|
||||
|
||||
// seqRange.max = max(seqRange.max, loc)
|
||||
// c.curCellRange.max = max(c.curCellRange.max, loc)
|
||||
|
||||
// c.cellRanges[seq] = seqRange
|
||||
// }
|
||||
// } else {
|
||||
// // If we are reserving memory, don't update any of the cache metadata but set the size
|
||||
// // to the worst case.
|
||||
// locs = make([]int32, c.curBatchSize)
|
||||
// for i := range locs {
|
||||
// locs[i] = int32(i)
|
||||
// }
|
||||
// c.curCellRange.min = 0
|
||||
// c.curCellRange.max = len(c.cells) - 1
|
||||
// }
|
||||
|
||||
// // XXX Building up the locs for what's already processed (if any)
|
||||
// dummyLocs := []int{}
|
||||
// c.curCellRange.min = roundDown(c.curCellRange.min, c.config.CachePadding)
|
||||
// c.curCellRange.max = roundUp(c.curCellRange.max+1, c.config.CachePadding) - 1
|
||||
|
||||
// for i := range c.curBatchSize {
|
||||
// enabled := !slices.Contains(c.opts.Except, i)
|
||||
// for j := c.curCellRange.min; j <= c.curCellRange.max; j++ {
|
||||
// if !slices.Contains(c.cells[j].sequences, c.curSequences[i]) ||
|
||||
// (enabled && c.cells[j].pos > c.curPositions[i]) ||
|
||||
// c.chunkSize > 0 && c.cells[j].pos < c.curPositions[i]-c.curPositions[i]%c.chunkSize ||
|
||||
// c.cells[j].pos < c.curPositions[i]-c.swaWindowSize {
|
||||
// // mask[i*length+(j-c.curCellRange.min)] = float32(math.Inf(-1))
|
||||
// } else {
|
||||
// if len(dummyLocs) == 0 || dummyLocs[len(dummyLocs)-1] != i {
|
||||
// dummyLocs = append(dummyLocs, i)
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// slog.Info("XXX Causa.StartForward calculated locations", "locs", dummyLocs)
|
||||
|
||||
// slog.Info("XXX Causal.StartForward", "locs", locs)
|
||||
// c.curLoc = ctx.Input().FromInts(locs, len(locs))
|
||||
// c.curMask = c.buildMask(ctx)
|
||||
|
||||
// return nil
|
||||
// }

// func newRange() cellRange {
// return cellRange{
// min: math.MaxInt,
// max: 0,
// }
// }

// // Returns a slice of locations where each token in the batch should be stored
|
||||
// func (c *Causal) findLocs() ([]int32, error) {
|
||||
// loc := make([]int32, 0, c.curBatchSize)
|
||||
|
||||
// for i := range c.cells {
|
||||
// if len(c.cells[i].sequences) == 0 {
|
||||
// loc = append(loc, int32(i))
|
||||
// if len(loc) >= c.curBatchSize {
|
||||
// return loc, nil
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
// return nil, fmt.Errorf("%w (cache: %v batch: %v)", ErrKvCacheFull, len(c.cells), c.curBatchSize)
|
||||
// }
|
||||
|
||||
// func (c *Causal) updateSlidingWindow() {
|
||||
// c.curCellRange = newRange()
|
||||
|
||||
// if c.swaMemorySize == math.MaxInt32 {
|
||||
// for _, seq := range c.curSequences {
|
||||
// if seqRange, ok := c.cellRanges[seq]; ok {
|
||||
// c.curCellRange.min = min(c.curCellRange.min, seqRange.min)
|
||||
// c.curCellRange.max = max(c.curCellRange.max, seqRange.max)
|
||||
// }
|
||||
// }
|
||||
|
||||
// return
|
||||
// }
|
||||
|
||||
// type lowestPosition struct {
|
||||
// pos int32
|
||||
// curBatch bool
|
||||
// }
|
||||
|
||||
// // create a map of unique sequences to the lowest position in that sequence
|
||||
// lowestPos := make(map[int]lowestPosition)
|
||||
// for i := range c.curPositions {
|
||||
// seq := c.curSequences[i]
|
||||
|
||||
// lowest, ok := lowestPos[seq]
|
||||
// if !ok {
|
||||
// lowest = lowestPosition{pos: c.curPositions[i], curBatch: true}
|
||||
// } else if c.curPositions[i] < lowest.pos {
|
||||
// lowest.pos = c.curPositions[i]
|
||||
// }
|
||||
|
||||
// lowestPos[seq] = lowest
|
||||
// }

// // for any sequences that are not part of this batch, clean up any tokens
// // that are no longer needed after the processing of the previous
// // batch
// for seq, seqRange := range c.cellRanges {
|
||||
// if _, ok := lowestPos[seq]; !ok {
|
||||
// var last int32
|
||||
// for i := seqRange.min; i <= seqRange.max; i++ {
|
||||
// if slices.Contains(c.cells[i].sequences, seq) {
|
||||
// last = max(last, c.cells[i].pos)
|
||||
// }
|
||||
// }
|
||||
|
||||
// lowestPos[seq] = lowestPosition{pos: last + 1, curBatch: false}
|
||||
// }
|
||||
// }
|
||||
|
||||
// // delete any entries that are beyond the window of the oldest position in the sequence
|
||||
// for seq, lowest := range lowestPos {
|
||||
// oldRange, ok := c.cellRanges[seq]
|
||||
// if !ok {
|
||||
// continue
|
||||
// }
|
||||
|
||||
// newRange := newRange()
|
||||
|
||||
// for i := oldRange.min; i <= oldRange.max; i++ {
|
||||
// if slices.Contains(c.cells[i].sequences, seq) {
|
||||
// if c.cells[i].pos < lowest.pos-c.swaMemorySize {
|
||||
// c.cells[i].sequences = slices.DeleteFunc(c.cells[i].sequences, func(s int) bool { return s == seq })
|
||||
// } else {
|
||||
// newRange.min = min(newRange.min, i)
|
||||
// newRange.max = max(newRange.max, i)
|
||||
// }
|
||||
// if lowest.curBatch && c.cells[i].pos >= lowest.pos-c.swaWindowSize {
|
||||
// c.curCellRange.min = min(c.curCellRange.min, i)
|
||||
// c.curCellRange.max = max(c.curCellRange.max, i)
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
// c.cellRanges[seq] = newRange
|
||||
// }
|
||||
// }

// func roundDown(length, pad int) int {
// return (length / pad) * pad
// }

// func roundUp(length, pad int) int {
// return ((length + pad - 1) / pad) * pad
// }

// // Builds a mask of history x batch indicating whether for each token in the batch the
|
||||
// // token in the history should apply. This is based on both the sequence and causality (the
|
||||
// // position of the history is not ahead of the token in the batch).
|
||||
// func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {
|
||||
// // Align and pad the two dimensions as required by the backend
|
||||
// batchSize := roundUp(c.curBatchSize, c.config.MaskBatchPadding)
|
||||
|
||||
// c.curCellRange.min = roundDown(c.curCellRange.min, c.config.CachePadding)
|
||||
// c.curCellRange.max = roundUp(c.curCellRange.max+1, c.config.CachePadding) - 1
|
||||
|
||||
// length := c.curCellRange.max - c.curCellRange.min + 1
|
||||
|
||||
// mask := make([]float32, batchSize*length)
|
||||
|
||||
// for i := range c.curBatchSize {
|
||||
// enabled := !slices.Contains(c.opts.Except, i)
|
||||
// for j := c.curCellRange.min; j <= c.curCellRange.max; j++ {
|
||||
// if !slices.Contains(c.cells[j].sequences, c.curSequences[i]) ||
|
||||
// (enabled && c.cells[j].pos > c.curPositions[i]) ||
|
||||
// c.chunkSize > 0 && c.cells[j].pos < c.curPositions[i]-c.curPositions[i]%c.chunkSize ||
|
||||
// c.cells[j].pos < c.curPositions[i]-c.swaWindowSize {
|
||||
// mask[i*length+(j-c.curCellRange.min)] = float32(math.Inf(-1))
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
// // Mask out any padding tokens we added. For padding that we added to the cache history, this
|
||||
// // has already been masked out because the sequence doesn't match.
|
||||
// for i := c.curBatchSize * length; i < len(mask); i++ {
|
||||
// mask[i] = float32(math.Inf(-1))
|
||||
// }
|
||||
|
||||
// maskTensor := ctx.Input().FromFloats(mask, batchSize, length)
|
||||
|
||||
// // if c.config.MaskDType != ml.DTypeFloat32 {
|
||||
// // maskTensor = maskTensor.Cast(ctx, c.config.MaskDType)
|
||||
// // }
|
||||
|
||||
// slog.Info("XXX Causal.buildMask", "c.curBatchSize", c.curBatchSize, "c.config.MaskBatchPadding", c.config.MaskBatchPadding, "c.curCellRange.min", c.curCellRange.min, "c.curCellRange.max", c.curCellRange.max, "size", len(mask), "shape", []int{1, batchSize, length})
|
||||
|
||||
// return maskTensor
|
||||
// }
|
||||
|
||||
// func (c *Causal) SetLayer(layer int) {
|
||||
// c.curLayer = layer
|
||||
// }
|
||||
|
||||
// type CausalOptions struct {
|
||||
// // Enabled controls whether the causal mask is generated for a particular index in a batch
|
||||
// Except []int
|
||||
// }

// // SetCausal disables causal mask generation for a particular range of indices in
// // the current batch for subsequent calls to Get. The state resets for the next forward pass.
// func (c *Causal) SetCausal(ctx ml.Context, opts CausalOptions) {
// if !slices.Equal(c.opts.Except, opts.Except) {
// c.opts = opts
// if ctx != nil {
// c.curMask = c.buildMask(ctx)
// }
// }
// }

// func (c *Causal) Get(ctx ml.Context) (ml.Tensor, ml.Tensor, ml.Tensor) {
|
||||
// key := c.keys[c.curLayer]
|
||||
// value := c.values[c.curLayer]
|
||||
|
||||
// kHeadDim := c.kHeadDims[c.curLayer]
|
||||
// vHeadDim := c.vHeadDims[c.curLayer]
|
||||
// numKVHeads := c.numKVHeads[c.curLayer]
|
||||
// // rowSize := numKVHeads * c.curBatchSize
|
||||
// // cachedSize := c.curMask.Dim(1)
|
||||
// cachedSize := c.curLoc.Dim(0)
|
||||
// // kCellSize := kHeadDim * numKVHeads
|
||||
// // vCellSize := vHeadDim * numKVHeads
|
||||
|
||||
// slog.Info("XXX Causal.Get full cache", "key", key)
|
||||
// slog.Info("XXX Causal.Get full cache", "value", value)
|
||||
// slog.Info("XXX Causal.Get full cache", "curloc", c.curLoc)
|
||||
// slog.Info("XXX Causal.Get", "curMask", c.curMask)
|
||||
// slog.Info("XXX Causal.Get", "kHeadDim", kHeadDim, "numKVHeads", numKVHeads, "cachedSize", cachedSize, "kHeadDim", kHeadDim)
|
||||
// // panic("XXX")
|
||||
|
||||
// // fmt.Fprintln(os.Stderr, key.ToString())
|
||||
// // panic("full cache value")
|
||||
|
||||
// // TODO we should use TakeAxes to gather the cells from curLoc, but for now to be consistent with GGML, just grab a larger chunk and mask
|
||||
// key = key.TakeAxes(ctx, c.curLoc, 0).Reshape(ctx, 1, numKVHeads, cachedSize, kHeadDim)
|
||||
// // key = key.AsStrided(ctx, []int{1, numKVHeads, cachedSize, kHeadDim}, []int{}, rowSize*c.curCellRange.min)
|
||||
|
||||
// // slog.Info("XXX Causal.Get after AsStrided", "key", key)
|
||||
// // panic("XXX")
|
||||
|
||||
// // if c.config.PermutedV {
|
||||
// // panic("permuted")
|
||||
// // // TODO not converted
|
||||
// // vHeadDim := value.Dim(1)
|
||||
// // elemSize := value.Stride(2)
|
||||
|
||||
// // value = value.AsStrided(ctx,
|
||||
// // []int{numKVHeads, vHeadDim, cachedSize},
|
||||
// // []int{value.Stride(0), value.Stride(1)},
|
||||
// // elemSize*c.curCellRange.min,
|
||||
// // )
|
||||
// // } else {
|
||||
// // vHeadDim := c.vHeadDims[c.curLayer]
|
||||
// // rowSize := value.Stride(2)
|
||||
// // slog.Info("XXX Causal.Get before AsStrided", "vHeadDim", vHeadDim, "rowSize", rowSize)
|
||||
// // panic("XXX")
|
||||
|
||||
// // TODO we should use TakeAxes to gather the cells from curLoc, but for now to be consistent with GGML, just grab a larger chunk and mask
|
||||
// value = value.TakeAxes(ctx, c.curLoc, 0).Reshape(ctx, 1, numKVHeads, cachedSize, vHeadDim)
|
||||
// // value = value.AsStrided(ctx, []int{1, numKVHeads, cachedSize, vHeadDim}, []int{}, rowSize*c.curCellRange.min)
|
||||
|
||||
// // slog.Info("XXX Causal.Get after AsStrided", "value", value)
|
||||
// // panic("XXX")
|
||||
|
||||
// // }
|
||||
|
||||
// // // TODO The mask changes from X,X to 1,X, and with the Row-order change
|
||||
// // // the 1 becomes trailing and messes up later operations
|
||||
// // // This isn't the right solution, but works around it...
|
||||
// // if c.curMask.Dim(1) == 1 {
|
||||
// // return key, value, c.curMask.Transpose(ctx, 1, 0, 2, 3)
|
||||
// // }
|
||||
// // fmt.Fprintln(os.Stderr, key.ToString())
|
||||
// // fmt.Fprintln(os.Stderr, value.ToString())
|
||||
// // panic("XXX")
|
||||
// slog.Info("XXX Mask", "curLayer", c.curLayer, "shape", c.curMask.Shape())
|
||||
|
||||
// return key, value, c.curMask
|
||||
// }
|
||||
|
||||
// func (c *Causal) Put(ctx ml.Context, key, value ml.Tensor) {
|
||||
// kHeadDim := key.Dim(3)
|
||||
// vHeadDim := value.Dim(3)
|
||||
// numKVHeads := key.Dim(1)
|
||||
// batchSize := key.Dim(2)
|
||||
// kCellSize := kHeadDim * numKVHeads
|
||||
// vCellSize := vHeadDim * numKVHeads
|
||||
|
||||
// // slog.Info("XXX Causal.Put", "key", key, "value", value)
|
||||
// slog.Info("XXX Causal.Put", "kHeadDim", kHeadDim, "vHeadDim", vHeadDim, "numKVHeads", numKVHeads, "batchSize", batchSize)
|
||||
// // panic("XXX")
|
||||
|
||||
// if c.curBatchSize != batchSize {
|
||||
// panic(fmt.Errorf("inconsistent batch sizes (layer: %v, batch size: %v layer batch size: %v)", c.curLayer, c.curBatchSize, batchSize))
|
||||
// }
|
||||
|
||||
// // slog.Info("XXX", "c.ctxs", c.ctxs, "c.curLayer", c.curLayer, "backend", c.backend)
|
||||
// if _, ok := c.ctxs[c.curLayer]; !ok {
|
||||
// slog.Info("XXX Causal.Put creating new context", "c.curLayer", c.curLayer)
|
||||
// c.ctxs[c.curLayer] = c.backend.NewContext().Layer(c.curLayer)
|
||||
// }
|
||||
|
||||
// if _, ok := c.keys[c.curLayer]; !ok {
|
||||
// slog.Info("XXX Causal.Put allocating keys", "c.curLayer", c.curLayer, "shape", []int{len(c.cells), kCellSize})
|
||||
|
||||
// c.keys[c.curLayer] = c.ctxs[c.curLayer].Zeros(c.DType, len(c.cells), kCellSize)
|
||||
// c.kHeadDims[c.curLayer] = kHeadDim
|
||||
// c.vHeadDims[c.curLayer] = vHeadDim
|
||||
// c.numKVHeads[c.curLayer] = numKVHeads
|
||||
// }
|
||||
|
||||
// if _, ok := c.values[c.curLayer]; !ok {
|
||||
// // if c.config.PermutedV {
|
||||
// // c.values[c.curLayer] = c.ctxs[c.curLayer].Zeros(c.DType, numKVHeads, vHeadDim, len(c.cells))
|
||||
// // } else {
|
||||
// c.values[c.curLayer] = c.ctxs[c.curLayer].Zeros(c.DType, len(c.cells), vCellSize)
|
||||
// // }
|
||||
// }
|
||||
|
||||
// key = key.Reshape(ctx, batchSize, 1, kCellSize) //.Contiguous(ctx, false) // TODO contiguous may not be needed
|
||||
|
||||
// // slog.Info("XXX Causal.Put after reshape", "keyCache", keyCache)
|
||||
// // panic("XXX")
|
||||
// // curLoc := 0 // TODO c.curLoc is now a tensor
|
||||
// // kSize := numKVHeads * kHeadDim
|
||||
// // vSize := numKVHeads * vHeadDim
|
||||
// // start := []int{int(curLoc), 0}
|
||||
// // kStop := []int{int(curLoc + batchSize), int(kSize)}
|
||||
// // vStop := []int{int(curLoc + batchSize), int(vSize)}
|
||||
// // strides := []int{1, 1}
|
||||
|
||||
// // slog.Info("XXX Causal.Put Key SliceUpdate", "keyCache", keyCache)
|
||||
// // slog.Info("XXX Causal.Put Key SliceUpdate", "key", key)
|
||||
|
||||
// // slog.Info("XXX Causal.Put Key SliceUpdate", "start", start, "kStop", kStop, "strides", strides)
|
||||
|
||||
// // ctx.Forward(c.keys[c.curLayer].SliceUpdate(ctx, key, start, kStop, strides))
|
||||
// ctx.Forward(c.keys[c.curLayer].Scatter(ctx, []ml.Tensor{c.curLoc}, key, []int{0}))
|
||||
// // fmt.Fprintln(os.Stderr, keyCache.ToString())
|
||||
// // panic("input value")
|
||||
|
||||
// // fmt.Fprintln(os.Stderr, t.ToString())
|
||||
// // panic("XXX")
|
||||
|
||||
// // if c.config.PermutedV {
|
||||
// // panic("permuted")
|
||||
// // // TODO not adjusted
|
||||
// // value = value.Reshape(ctx, vHeadDim*numKVHeads, 1, batchSize)
|
||||
// // value = value.Transpose(ctx, 2, 0, 1, 3)
|
||||
|
||||
// // valueCache := c.values[c.curLayer]
|
||||
// // valueCache = valueCache.Reshape(ctx, 1, len(c.cells), vHeadDim*numKVHeads)
|
||||
|
||||
// // ctx.Forward(valueCache.SliceUpdate(ctx, value, start, vStop, strides))
|
||||
// // } else {
|
||||
// value = value.Reshape(ctx, batchSize, 1, vCellSize) //.Contiguous(ctx, false) // TODO contiguous may not be needed
|
||||
// // slog.Info("XXX Causal.Put Value SliceUpdate", "valueCache", valueCache)
|
||||
// // slog.Info("XXX Causal.Put Value SliceUpdate", "value", value)
|
||||
// // slog.Info("XXX Causal.Put Value SliceUpdate", "start", start, "vStop", vStop, "strides", strides)
|
||||
|
||||
// ctx.Forward(c.values[c.curLayer].Scatter(ctx, []ml.Tensor{c.curLoc}, value, []int{0}))
|
||||
// // }
|
||||
// // fmt.Fprintln(os.Stderr, c.keys[c.curLayer].ToString())
|
||||
// // fmt.Fprintln(os.Stderr, c.values[c.curLayer].ToString())
|
||||
// // panic("XXX")
|
||||
|
||||
// }
|
||||
|
||||
// func (c *Causal) CopyPrefix(srcSeq, dstSeq int, len int32) {
|
||||
// seqRange := newRange()
|
||||
|
||||
// for i := range c.cells {
|
||||
// // Remove the contents of dstSeq so that we only have the copied prefix, metadata will be reset at the end
|
||||
// if slices.Contains(c.cells[i].sequences, dstSeq) {
|
||||
// c.cells[i].sequences = slices.DeleteFunc(c.cells[i].sequences, func(s int) bool { return s == dstSeq })
|
||||
// }
|
||||
|
||||
// if slices.Contains(c.cells[i].sequences, srcSeq) && c.cells[i].pos < len {
|
||||
// c.cells[i].sequences = append(c.cells[i].sequences, dstSeq)
|
||||
// if i < seqRange.min {
|
||||
// seqRange.min = i
|
||||
// }
|
||||
// if i > seqRange.max {
|
||||
// seqRange.max = i
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
// c.cellRanges[dstSeq] = seqRange
|
||||
// }
|
||||
|
||||
// func (c *Causal) CanResume(seq int, pos int32) bool {
|
||||
// if c.swaMemorySize == math.MaxInt32 {
|
||||
// return true
|
||||
// }
|
||||
|
||||
// seqRange, ok := c.cellRanges[seq]
|
||||
// if !ok {
|
||||
// return false
|
||||
// }
|
||||
|
||||
// // for sliding window, check that the window of the new sequence is contained in
|
||||
// // the window of what we are storing
|
||||
// var first int32 = math.MaxInt32
|
||||
// var last int32 = -1
|
||||
// for i := seqRange.min; i <= seqRange.max; i++ {
|
||||
// if slices.Contains(c.cells[i].sequences, seq) {
|
||||
// first = min(first, c.cells[i].pos)
|
||||
// last = max(last, c.cells[i].pos)
|
||||
// }
|
||||
// }
|
||||
|
||||
// if last == -1 {
|
||||
// return false
|
||||
// }
|
||||
|
||||
// posWindowStart := max(0, pos-c.swaWindowSize)
|
||||
// return posWindowStart >= first && pos <= last+1
|
||||
// }
|
||||
|
||||
// func (c *Causal) shift(seq int, beginIndex, offset int32) error {
|
||||
// if c.shiftFn == nil {
|
||||
// return ErrNotSupported
|
||||
// }
|
||||
|
||||
// seqRange := c.cellRanges[seq]
|
||||
|
||||
// for start := seqRange.min; start <= seqRange.max; start += c.maxBatch {
|
||||
// size := min(seqRange.max-start+1, c.maxBatch)
|
||||
// offsets := make([]int32, size)
|
||||
|
||||
// var batchFirst, batchLast int
|
||||
|
||||
// batchFirst = -1
|
||||
// for i := range offsets {
|
||||
// cell := c.cells[start+i]
|
||||
|
||||
// if slices.Contains(cell.sequences, seq) && cell.pos >= beginIndex {
|
||||
// offsets[i] = offset
|
||||
// if batchFirst < 0 {
|
||||
// batchFirst = i
|
||||
// }
|
||||
// batchLast = i
|
||||
// }
|
||||
// }
|
||||
|
||||
// if batchFirst < 0 {
|
||||
// continue
|
||||
// }
|
||||
|
||||
// offsets = offsets[batchFirst : batchLast+1]
|
||||
|
||||
// slog.Info("XXX Causal.shift creating new temporary context")
|
||||
// ctx := c.backend.NewContext()
|
||||
// kShift := ctx.Input().FromInts(offsets, len(offsets))
|
||||
|
||||
// for i, key := range c.keys {
|
||||
// if key == nil {
|
||||
// continue
|
||||
// }
|
||||
|
||||
// kHeadDim := key.Dim(2)
|
||||
// numKVHeads := key.Dim(1)
|
||||
// rowSize := key.Stride(0)
|
||||
|
||||
// key = key.AsStrided(ctx,
|
||||
// []int{len(offsets), numKVHeads, kHeadDim},
|
||||
// []int{key.Stride(0), key.Stride(1)},
|
||||
// rowSize*(start+batchFirst),
|
||||
// )
|
||||
|
||||
// roped, err := c.shiftFn(ctx, i, key, kShift)
|
||||
// if err != nil {
|
||||
// ctx.Close()
|
||||
// return err
|
||||
// }
|
||||
|
||||
// ctx.Forward(roped.Copy(ctx, key))
|
||||
// }
|
||||
|
||||
// ctx.Compute()
|
||||
// ctx.Close()
|
||||
// }
|
||||
|
||||
// return nil
|
||||
// }
|
||||
|
||||
// func (c *Causal) Remove(seq int, beginIndex, endIndex int32) error {
|
||||
// // TODO(jessegross): We should check to see if removing the middle of the sequence will
|
||||
// // cause the sliding window to encompass tokens that we no longer have. If so, then we
|
||||
// // should return an error, which will trigger the runner to evaluate the full history and
|
||||
// // rebuild the window. However, if we have multimodal inputs in our history, this reuse
|
||||
// // results in use after free, so we don't do it for now.
|
||||
|
||||
// var offset int32
|
||||
// if endIndex != math.MaxInt32 {
|
||||
// offset = beginIndex - endIndex
|
||||
// }
|
||||
|
||||
// seqRange := newRange()
|
||||
|
||||
// for i := range c.cells {
|
||||
// if slices.Contains(c.cells[i].sequences, seq) {
|
||||
// if c.cells[i].pos >= beginIndex && c.cells[i].pos < endIndex {
|
||||
// c.cells[i].sequences = slices.DeleteFunc(c.cells[i].sequences, func(s int) bool { return s == seq })
|
||||
// } else {
|
||||
// if c.cells[i].pos >= endIndex {
|
||||
// if slices.ContainsFunc(c.cells[i].sequences, func(s int) bool { return s != seq }) {
|
||||
// return errors.New("shifting cells shared by multiple sequences not supported")
|
||||
// }
|
||||
|
||||
// c.cells[i].pos += offset
|
||||
// }
|
||||
// if i < seqRange.min {
|
||||
// seqRange.min = i
|
||||
// }
|
||||
// if i > seqRange.max {
|
||||
// seqRange.max = i
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
// if seqRange == newRange() {
|
||||
// delete(c.cellRanges, seq)
|
||||
// return nil
|
||||
// }
|
||||
|
||||
// c.cellRanges[seq] = seqRange
|
||||
|
||||
// if endIndex != math.MaxInt32 {
|
||||
// err := c.shift(seq, endIndex+offset, offset)
|
||||
// if err != nil {
|
||||
// return err
|
||||
// }
|
||||
// }
|
||||
|
||||
// return nil
|
||||
// }
|
||||
@@ -1,973 +0,0 @@
package kvcache

// import (
// "fmt"
// "math"
// "slices"
// "testing"

// "github.com/ollama/ollama/ml"
// "github.com/ollama/ollama/model/input"
// )

// type testCase struct {
|
||||
// name string
|
||||
// in []float32
|
||||
// inShape []int
|
||||
// seqs []int
|
||||
// pos []int32
|
||||
// expected []float32
|
||||
// expectedShape []int
|
||||
// expectedMask []float32
|
||||
// }
|
||||
|
||||
// func runPermutedVariants(t *testing.T, fn func(t *testing.T, backend *testBackend)) {
|
||||
// t.Helper()
|
||||
// for _, permuted := range []bool{false, true} {
|
||||
// t.Run(fmt.Sprintf("PermutedV=%t", permuted), func(t *testing.T) {
|
||||
// fn(t, &testBackend{permutedV: permuted})
|
||||
// })
|
||||
// }
|
||||
// }
|
||||
|
||||
// func TestStore(t *testing.T) {
|
||||
// runPermutedVariants(t, func(t *testing.T, backend *testBackend) {
|
||||
// cache := NewCausalCache(nil)
|
||||
// defer cache.Close()
|
||||
|
||||
// cache.Init(backend, ml.DTypeF16, 1, 16, 16)
|
||||
|
||||
// tests := []testCase{
|
||||
// {
|
||||
// name: "FirstBatch",
|
||||
// in: []float32{111, 211, 121, 221, 131, 231, 112, 212, 122, 222, 132, 232, 113, 213, 123, 223, 133, 233, 114, 214, 124, 224, 134, 234},
|
||||
// inShape: []int{2, 3, 4},
|
||||
// seqs: []int{0, 0, 0, 0},
|
||||
// pos: []int32{0, 1, 2, 3},
|
||||
// expected: []float32{111, 211, 121, 221, 131, 231, 112, 212, 122, 222, 132, 232, 113, 213, 123, 223, 133, 233, 114, 214, 124, 224, 134, 234},
|
||||
// expectedShape: []int{2, 3, 4},
|
||||
// expectedMask: []float32{0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, float32(math.Inf(-1)), 0, 0, 0, 0},
|
||||
// },
|
||||
// {
|
||||
// name: "SecondBatch",
|
||||
// in: []float32{115, 215, 125, 225, 135, 235},
|
||||
// inShape: []int{2, 3, 1},
|
||||
// seqs: []int{0},
|
||||
// pos: []int32{4},
|
||||
// expected: []float32{111, 211, 121, 221, 131, 231, 112, 212, 122, 222, 132, 232, 113, 213, 123, 223, 133, 233, 114, 214, 124, 224, 134, 234, 115, 215, 125, 225, 135, 235},
|
||||
// expectedShape: []int{2, 3, 5},
|
||||
// expectedMask: []float32{0, 0, 0, 0, 0},
|
||||
// },
|
||||
// }
|
||||
|
||||
// testCache(t, backend, cache, tests)
|
||||
// })
|
||||
// }
|
||||
|
||||
// func TestSWA(t *testing.T) {
|
||||
// runPermutedVariants(t, func(t *testing.T, backend *testBackend) {
|
||||
// cache := NewSWACache(1, nil)
|
||||
// defer cache.Close()
|
||||
|
||||
// cache.Init(backend, ml.DTypeF16, 1, 16, 16)
|
||||
|
||||
// x := float32(math.Inf(-1))
|
||||
|
||||
// tests := []testCase{
|
||||
// {
|
||||
// name: "FirstBatch",
|
||||
// in: []float32{1, 2, 3, 4},
|
||||
// inShape: []int{1, 1, 4},
|
||||
// seqs: []int{0, 0, 0, 0},
|
||||
// pos: []int32{0, 1, 2, 3},
|
||||
// expected: []float32{1, 2, 3, 4},
|
||||
// expectedShape: []int{1, 1, 4},
|
||||
// expectedMask: []float32{
|
||||
// 0, x, x, x,
|
||||
// 0, 0, x, x,
|
||||
// x, 0, 0, x,
|
||||
// x, x, 0, 0,
|
||||
// },
|
||||
// },
|
||||
// {
|
||||
// name: "SecondBatch",
|
||||
// in: []float32{5, 6},
|
||||
// inShape: []int{1, 1, 2},
|
||||
// seqs: []int{0, 0},
|
||||
// pos: []int32{4, 5},
|
||||
// expected: []float32{5, 6, 3, 4},
|
||||
// expectedShape: []int{1, 1, 4},
|
||||
// expectedMask: []float32{
|
||||
// 0, x, x, 0,
|
||||
// 0, 0, x, x,
|
||||
// },
|
||||
// },
|
||||
// }
|
||||
|
||||
// testCache(t, backend, cache, tests)
|
||||
// })
|
||||
// }
|
||||
|
||||
// func TestSWASeparateBatches(t *testing.T) {
|
||||
// runPermutedVariants(t, func(t *testing.T, backend *testBackend) {
|
||||
// cache := NewSWACache(1, nil)
|
||||
// defer cache.Close()
|
||||
|
||||
// cache.Init(backend, ml.DTypeF16, 2, 16, 2)
|
||||
|
||||
// x := float32(math.Inf(-1))
|
||||
|
||||
// tests := []testCase{
|
||||
// {
|
||||
// name: "First seq 0",
|
||||
// in: []float32{1, 2},
|
||||
// inShape: []int{1, 1, 2},
|
||||
// seqs: []int{0, 0},
|
||||
// pos: []int32{0, 1},
|
||||
// expected: []float32{1, 2},
|
||||
// expectedShape: []int{1, 1, 2},
|
||||
// expectedMask: []float32{
|
||||
// 0, x,
|
||||
// 0, 0,
|
||||
// },
|
||||
// },
|
||||
// {
|
||||
// name: "Second seq 0",
|
||||
// in: []float32{3, 4},
|
||||
// inShape: []int{1, 1, 2},
|
||||
// seqs: []int{0, 0},
|
||||
// pos: []int32{2, 3},
|
||||
// expected: []float32{2, 3, 4},
|
||||
// expectedShape: []int{1, 1, 3},
|
||||
// expectedMask: []float32{
|
||||
// 0, 0, x,
|
||||
// x, 0, 0,
|
||||
// },
|
||||
// },
|
||||
// {
|
||||
// name: "First seq 1",
|
||||
// in: []float32{5, 6},
|
||||
// inShape: []int{1, 1, 2},
|
||||
// seqs: []int{1, 1},
|
||||
// pos: []int32{0, 1},
|
||||
// expected: []float32{5, 6},
|
||||
// expectedShape: []int{1, 1, 2},
|
||||
// expectedMask: []float32{
|
||||
// 0, x,
|
||||
// 0, 0,
|
||||
// },
|
||||
// },
|
||||
// {
|
||||
// name: "Second seq 1",
|
||||
// in: []float32{7, 8},
|
||||
// inShape: []int{1, 1, 2},
|
||||
// seqs: []int{1, 1},
|
||||
// pos: []int32{2, 3},
|
||||
// expected: []float32{6, 3, 4, 7, 8},
|
||||
// expectedShape: []int{1, 1, 5},
|
||||
// expectedMask: []float32{
|
||||
// 0, x, x, 0, x,
|
||||
// x, x, x, 0, 0,
|
||||
// },
|
||||
// },
|
||||
// {
|
||||
// name: "Third seq 0",
|
||||
// in: []float32{9, 10},
|
||||
// inShape: []int{1, 1, 2},
|
||||
// seqs: []int{0, 0},
|
||||
// pos: []int32{4, 5},
|
||||
// expected: []float32{9, 10, 3, 4},
|
||||
// expectedShape: []int{1, 1, 4},
|
||||
// expectedMask: []float32{
|
||||
// 0, x, x, 0,
|
||||
// 0, 0, x, x,
|
||||
// },
|
||||
// },
|
||||
// }
|
||||
|
||||
// testCache(t, backend, cache, tests)
|
||||
// })
|
||||
// }
|
||||
|
||||
// func TestSWAMem(t *testing.T) {
|
||||
// runPermutedVariants(t, func(t *testing.T, backend *testBackend) {
|
||||
// cache := NewSWAMemCache(1, 3, nil)
|
||||
// defer cache.Close()
|
||||
|
||||
// cache.Init(backend, ml.DTypeF16, 1, 16, 16)
|
||||
|
||||
// x := float32(math.Inf(-1))
|
||||
|
||||
// tests := []testCase{
|
||||
// {
|
||||
// name: "FirstBatch",
|
||||
// in: []float32{1, 2, 3, 4},
|
||||
// inShape: []int{1, 1, 4},
|
||||
// seqs: []int{0, 0, 0, 0},
|
||||
// pos: []int32{0, 1, 2, 3},
|
||||
// expected: []float32{1, 2, 3, 4},
|
||||
// expectedShape: []int{1, 1, 4},
|
||||
// expectedMask: []float32{
|
||||
// 0, x, x, x,
|
||||
// 0, 0, x, x,
|
||||
// x, 0, 0, x,
|
||||
// x, x, 0, 0,
|
||||
// },
|
||||
// },
|
||||
// {
|
||||
// name: "SecondBatch",
|
||||
// in: []float32{5, 6},
|
||||
// inShape: []int{1, 1, 2},
|
||||
// seqs: []int{0, 0},
|
||||
// pos: []int32{4, 5},
|
||||
// expected: []float32{5, 2, 3, 4, 6},
|
||||
// expectedShape: []int{1, 1, 5},
|
||||
// expectedMask: []float32{
|
||||
// 0, x, x, 0, x,
|
||||
// 0, x, x, x, 0,
|
||||
// },
|
||||
// },
|
||||
// }
|
||||
|
||||
// testCache(t, backend, cache, tests)
|
||||
// })
|
||||
// }
|
||||
|
||||
// func TestChunkedAttention(t *testing.T) {
|
||||
// runPermutedVariants(t, func(t *testing.T, backend *testBackend) {
|
||||
// cache := NewChunkedAttentionCache(2, nil)
|
||||
// defer cache.Close()
|
||||
|
||||
// cache.Init(backend, ml.DTypeF16, 1, 16, 16)
|
||||
|
||||
// x := float32(math.Inf(-1))
|
||||
|
||||
// testCache(
|
||||
// t, backend, cache,
|
||||
// []testCase{
|
||||
// {
|
||||
// name: "FirstBatch",
|
||||
// in: []float32{1, 2, 3, 4},
|
||||
// inShape: []int{1, 1, 4},
|
||||
// seqs: []int{0, 0, 0, 0},
|
||||
// pos: []int32{0, 1, 2, 3},
|
||||
// expected: []float32{1, 2, 3, 4},
|
||||
// expectedShape: []int{1, 1, 4},
|
||||
// expectedMask: []float32{
|
||||
// 0, x, x, x,
|
||||
// 0, 0, x, x,
|
||||
// x, x, 0, x,
|
||||
// x, x, 0, 0,
|
||||
// },
|
||||
// },
|
||||
// {
|
||||
// name: "SecondBatch",
|
||||
// in: []float32{5, 6, 7},
|
||||
// inShape: []int{1, 1, 3},
|
||||
// seqs: []int{0, 0, 0},
|
||||
// pos: []int32{4, 5, 6},
|
||||
// expected: []float32{1, 2, 3, 4, 5, 6, 7},
|
||||
// expectedShape: []int{1, 1, 7},
|
||||
// expectedMask: []float32{
|
||||
// x, x, x, x, 0, x, x,
|
||||
// x, x, x, x, 0, 0, x,
|
||||
// x, x, x, x, x, x, 0,
|
||||
// },
|
||||
// },
|
||||
// {
|
||||
// name: "ThirdBatch",
|
||||
// in: []float32{8, 9},
|
||||
// inShape: []int{1, 1, 2},
|
||||
// seqs: []int{0, 0},
|
||||
// pos: []int32{7, 8},
|
||||
// expected: []float32{1, 2, 3, 4, 5, 6, 7, 8, 9},
|
||||
// expectedShape: []int{1, 1, 9},
|
||||
// expectedMask: []float32{
|
||||
// x, x, x, x, x, x, 0, 0, x,
|
||||
// x, x, x, x, x, x, x, x, 0,
|
||||
// },
|
||||
// },
|
||||
// },
|
||||
// )
|
||||
// })
|
||||
// }
|
||||
|
||||
// func TestSequences(t *testing.T) {
|
||||
// runPermutedVariants(t, func(t *testing.T, backend *testBackend) {
|
||||
// cache := NewCausalCache(nil)
|
||||
// defer cache.Close()
|
||||
|
||||
// cache.Init(backend, ml.DTypeF16, 1, 16, 16)
|
||||
|
||||
// tests := []testCase{
|
||||
// {
|
||||
// name: "FirstBatch",
|
||||
// in: []float32{1, 2, 3, 4},
|
||||
// inShape: []int{1, 1, 4},
|
||||
// seqs: []int{0, 0, 1, 1},
|
||||
// pos: []int32{0, 1, 0, 1},
|
||||
// expected: []float32{1, 2, 3, 4},
|
||||
// expectedShape: []int{1, 1, 4},
|
||||
// expectedMask: []float32{0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0},
|
||||
// },
|
||||
// {
|
||||
// name: "SecondBatch",
|
||||
// in: []float32{5, 6},
|
||||
// inShape: []int{1, 1, 2},
|
||||
// seqs: []int{0, 1},
|
||||
// pos: []int32{2, 2},
|
||||
// expected: []float32{1, 2, 3, 4, 5, 6},
|
||||
// expectedShape: []int{1, 1, 6},
|
||||
// expectedMask: []float32{0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, float32(math.Inf(-1)), 0},
|
||||
// },
|
||||
// }
|
||||
|
||||
// testCache(t, backend, cache, tests)
|
||||
// })
|
||||
// }
|
||||
|
||||
// func TestRemove(t *testing.T) {
|
||||
// runPermutedVariants(t, func(t *testing.T, backend *testBackend) {
|
||||
// cache := NewCausalCache(func(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
|
||||
// return key.Add(ctx, shift), nil
|
||||
// })
|
||||
// defer cache.Close()
|
||||
|
||||
// cache.Init(backend, ml.DTypeF16, 1, 16, 16)
|
||||
|
||||
// x := float32(math.Inf(-1))
|
||||
|
||||
// tests := []testCase{
|
||||
// {
|
||||
// name: "FirstBatch",
|
||||
// in: []float32{1, 2, 3, 4},
|
||||
// inShape: []int{1, 1, 4},
|
||||
// seqs: []int{0, 0, 1, 1},
|
||||
// pos: []int32{0, 1, 0, 1},
|
||||
// expected: []float32{1, 2, 3, 4},
|
||||
// expectedShape: []int{1, 1, 4},
|
||||
// expectedMask: []float32{
|
||||
// 0, x, x, x,
|
||||
// 0, 0, x, x,
|
||||
// x, x, 0, x,
|
||||
// x, x, 0, 0,
|
||||
// },
|
||||
// },
|
||||
// }
|
||||
|
||||
// testCache(t, backend, cache, tests)
|
||||
|
||||
// err := cache.Remove(0, 1, math.MaxInt32)
|
||||
// if err != nil {
|
||||
// panic(err)
|
||||
// }
|
||||
|
||||
// tests = []testCase{
|
||||
// {
|
||||
// name: "RemoveEnd",
|
||||
// in: []float32{5, 6},
|
||||
// inShape: []int{1, 1, 2},
|
||||
// seqs: []int{0, 1},
|
||||
// pos: []int32{1, 2},
|
||||
// expected: []float32{1, 5, 3, 4, 6},
|
||||
// expectedShape: []int{1, 1, 5},
|
||||
// expectedMask: []float32{
|
||||
// 0, 0, x, x, x,
|
||||
// x, x, 0, 0, 0,
|
||||
// },
|
||||
// },
|
||||
// }
|
||||
|
||||
// testCache(t, backend, cache, tests)
|
||||
|
||||
// err = cache.Remove(0, 0, 1)
|
||||
// if err != nil {
|
||||
// panic(err)
|
||||
// }
|
||||
|
||||
// tests = []testCase{
|
||||
// {
|
||||
// name: "RemoveMiddle",
|
||||
// in: []float32{7, 8},
|
||||
// inShape: []int{1, 1, 2},
|
||||
// seqs: []int{0, 0},
|
||||
// pos: []int32{1, 2},
|
||||
// expected: []float32{7, 4, 3, 4, 6, 8},
|
||||
// expectedShape: []int{1, 1, 6},
|
||||
// expectedMask: []float32{
|
||||
// 0, 0, x, x, x, x,
|
||||
// 0, 0, x, x, x, 0,
|
||||
// },
|
||||
// },
|
||||
// }
|
||||
|
||||
// testCache(t, backend, cache, tests)
|
||||
// })
|
||||
// }
|
||||
|
||||
// func TestCopy(t *testing.T) {
|
||||
// runPermutedVariants(t, func(t *testing.T, backend *testBackend) {
|
||||
// cache := NewCausalCache(func(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) { return key, nil })
|
||||
// defer cache.Close()
|
||||
|
||||
// cache.Init(backend, ml.DTypeF16, 1, 16, 16)
|
||||
|
||||
// tests := []testCase{
|
||||
// {
|
||||
// name: "FirstBatch",
|
||||
// in: []float32{1, 2, 3, 4},
|
||||
// inShape: []int{1, 1, 4},
|
||||
// seqs: []int{0, 0, 0, 0},
|
||||
// pos: []int32{0, 1, 2, 3},
|
||||
// expected: []float32{1, 2, 3, 4},
|
||||
// expectedShape: []int{1, 1, 4},
|
||||
// expectedMask: []float32{0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, float32(math.Inf(-1)), 0, 0, 0, 0},
|
||||
// },
|
||||
// }
|
||||
|
||||
// testCache(t, backend, cache, tests)
|
||||
|
||||
// cache.CopyPrefix(0, 1, 2)
|
||||
|
||||
// tests = []testCase{
|
||||
// {
|
||||
// name: "Copy",
|
||||
// in: []float32{5, 6},
|
||||
// inShape: []int{1, 1, 2},
|
||||
// seqs: []int{1, 1},
|
||||
// pos: []int32{3, 4},
|
||||
// expected: []float32{1, 2, 3, 4, 5, 6},
|
||||
// expectedShape: []int{1, 1, 6},
|
||||
// expectedMask: []float32{0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), 0, float32(math.Inf(-1)), 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0},
|
||||
// },
|
||||
// }
|
||||
|
||||
// testCache(t, backend, cache, tests)
|
||||
// })
|
||||
// }
|
||||
|
||||
// func testCache(t *testing.T, backend ml.Backend, cache Cache, tests []testCase) {
|
||||
// for _, test := range tests {
|
||||
// t.Run(test.name, func(t *testing.T) {
|
||||
// context := backend.NewContext()
|
||||
// defer context.Close()
|
||||
|
||||
// err := cache.StartForward(context, input.Batch{Positions: test.pos, Sequences: test.seqs}, false)
|
||||
// if err != nil {
|
||||
// panic(err)
|
||||
// }
|
||||
|
||||
// cache.SetLayer(0)
|
||||
// tensor := context.FromFloats(test.in, test.inShape...)
|
||||
// cache.Put(context, tensor, tensor)
|
||||
|
||||
// out, _, mask := cache.Get(context)
|
||||
|
||||
// context.Forward(out, mask).Compute(out, mask)
|
||||
|
||||
// if !slices.Equal(out.Floats(), test.expected) {
|
||||
// t.Errorf("TestCache: have %v; want %v", out.Floats(), test.expected)
|
||||
// }
|
||||
|
||||
// if !slices.Equal(out.Shape(), test.expectedShape) {
|
||||
// t.Errorf("TestCache: has shape %v; want %v", out.Shape(), test.expectedShape)
|
||||
// }
|
||||
|
||||
// if !slices.Equal(mask.Floats(), test.expectedMask) {
|
||||
// t.Errorf("TestCache: have mask: have %v want %v", mask.Floats(), test.expectedMask)
|
||||
// }
|
||||
// })
|
||||
// }
|
||||
// }
|
||||
|
||||
// func TestCanResume(t *testing.T) {
|
||||
// runPermutedVariants(t, func(t *testing.T, backend *testBackend) {
|
||||
// windowSize := int32(4)
|
||||
// cache := NewSWACache(windowSize, nil)
|
||||
// defer cache.Close()
|
||||
|
||||
// cache.Init(backend, ml.DTypeF16, 1, 16, 16)
|
||||
|
||||
// context := backend.NewContext()
|
||||
// defer context.Close()
|
||||
|
||||
// err := cache.StartForward(context, input.Batch{
|
||||
// Positions: []int32{0, 1, 2, 3, 4},
|
||||
// Sequences: []int{0, 0, 0, 0, 0},
|
||||
// }, false)
|
||||
// if err != nil {
|
||||
// t.Fatalf("StartForward failed: %v", err)
|
||||
// }
|
||||
|
||||
// cache.SetLayer(0)
|
||||
// tensor := context.FromFloats([]float32{1, 2, 3, 4, 5}, 1, 1, 5)
|
||||
// cache.Put(context, tensor, tensor)
|
||||
|
||||
// // with window size 4, nothing has slid out of the window yet
|
||||
// if !cache.CanResume(0, 0) {
|
||||
// t.Errorf("CanResume(0, 0) = false, want true (within window)")
|
||||
// }
|
||||
// if !cache.CanResume(0, 1) {
|
||||
// t.Errorf("CanResume(0, 1) = false, want true (within window)")
|
||||
// }
|
||||
// if !cache.CanResume(0, 2) {
|
||||
// t.Errorf("CanResume(0, 2) = false, want true (within window)")
|
||||
// }
|
||||
// if !cache.CanResume(0, 3) {
|
||||
// t.Errorf("CanResume(0, 3) = false, want true (latest position)")
|
||||
// }
|
||||
// if !cache.CanResume(0, 4) {
|
||||
// t.Errorf("CanResume(0, 4) = false, want true (latest position)")
|
||||
// }
|
||||
|
||||
// // shift window by adding position 5
|
||||
// err = cache.StartForward(context, input.Batch{
|
||||
// Positions: []int32{5},
|
||||
// Sequences: []int{0},
|
||||
// }, false)
|
||||
// if err != nil {
|
||||
// t.Fatalf("StartForward failed: %v", err)
|
||||
// }
|
||||
|
||||
// cache.SetLayer(0)
|
||||
// tensor = context.FromFloats([]float32{6}, 1, 1, 1)
|
||||
// cache.Put(context, tensor, tensor)
|
||||
|
||||
// // only the latest position has overlapping windows
|
||||
// if cache.CanResume(0, 0) {
|
||||
// t.Errorf("after shift: CanResume(0, 0) = true, want false (outside window)")
|
||||
// }
|
||||
// if cache.CanResume(0, 1) {
|
||||
// t.Errorf("after shift: CanResume(0, 1) = true, want false (outside window)")
|
||||
// }
|
||||
// if cache.CanResume(0, 2) {
|
||||
// t.Errorf("after shift: CanResume(0, 2) = true, want false (outside window)")
|
||||
// }
|
||||
// if cache.CanResume(0, 3) {
|
||||
// t.Errorf("after shift: CanResume(0, 3) = true, want false (outside window)")
|
||||
// }
|
||||
// if cache.CanResume(0, 4) {
|
||||
// t.Errorf("after shift: CanResume(0, 4) = true, want false (outside window)")
|
||||
// }
|
||||
// if !cache.CanResume(0, 5) {
|
||||
// t.Errorf("after shift: CanResume(0, 5) = false, want true (latest position)")
|
||||
// }
|
||||
// })
|
||||
// }
|
||||
|
||||
// func TestCanResumeSWAMem(t *testing.T) {
|
||||
// runPermutedVariants(t, func(t *testing.T, backend *testBackend) {
|
||||
// windowSize := int32(4)
|
||||
// memSize := int32(5)
|
||||
// cache := NewSWAMemCache(windowSize, memSize, nil)
|
||||
// defer cache.Close()
|
||||
|
||||
// cache.Init(backend, ml.DTypeF16, 1, 16, 16)
|
||||
|
||||
// context := backend.NewContext()
|
||||
// defer context.Close()
|
||||
|
||||
// err := cache.StartForward(context, input.Batch{
|
||||
// Positions: []int32{0, 1, 2, 3, 4, 5, 6},
|
||||
// Sequences: []int{0, 0, 0, 0, 0, 0, 0},
|
||||
// }, false)
|
||||
// if err != nil {
|
||||
// t.Fatalf("StartForward failed: %v", err)
|
||||
// }
|
||||
|
||||
// cache.SetLayer(0)
|
||||
// tensor := context.FromFloats([]float32{1, 2, 3, 4, 5, 6, 7}, 1, 1, 7)
|
||||
// cache.Put(context, tensor, tensor)
|
||||
|
||||
// // shift window by adding position 7
|
||||
// err = cache.StartForward(context, input.Batch{
|
||||
// Positions: []int32{7},
|
||||
// Sequences: []int{0},
|
||||
// }, false)
|
||||
// if err != nil {
|
||||
// t.Fatalf("StartForward failed: %v", err)
|
||||
// }
|
||||
|
||||
// cache.SetLayer(0)
|
||||
// tensor = context.FromFloats([]float32{8}, 1, 1, 1)
|
||||
// cache.Put(context, tensor, tensor)
|
||||
|
||||
// // only the latest position has overlapping windows
|
||||
// if cache.CanResume(0, 0) {
|
||||
// t.Errorf("after shift: CanResume(0, 0) = true, want false (outside window)")
|
||||
// }
|
||||
// if cache.CanResume(0, 1) {
|
||||
// t.Errorf("after shift: CanResume(0, 1) = true, want false (outside window)")
|
||||
// }
|
||||
// if cache.CanResume(0, 2) {
|
||||
// t.Errorf("after shift: CanResume(0, 2) = true, want false (outside window)")
|
||||
// }
|
||||
// if cache.CanResume(0, 3) {
|
||||
// t.Errorf("after shift: CanResume(0, 3) = true, want false (outside window)")
|
||||
// }
|
||||
// if cache.CanResume(0, 4) {
|
||||
// t.Errorf("after shift: CanResume(0, 4) = true, want false (outside window)")
|
||||
// }
|
||||
// if cache.CanResume(0, 5) {
|
||||
// t.Errorf("after shift: CanResume(0, 5) = true, want false (outside window)")
|
||||
// }
|
||||
// if !cache.CanResume(0, 6) {
|
||||
// t.Errorf("after shift: CanResume(0, 6) = false, want true (inside window)")
|
||||
// }
|
||||
// if !cache.CanResume(0, 7) {
|
||||
// t.Errorf("after shift: CanResume(0, 7) = false, want true (latest position)")
|
||||
// }
|
||||
// })
|
||||
// }
|
||||
|
||||
// type testBackend struct {
|
||||
// ml.Backend
|
||||
// permutedV bool
|
||||
// }
|
||||
|
||||
// func (b *testBackend) NewContext() ml.Context {
|
||||
// return &testContext{}
|
||||
// }
|
||||
|
||||
// func (b *testBackend) NewContextSize(int) ml.Context {
|
||||
// return &testContext{}
|
||||
// }
|
||||
|
||||
// func (b *testBackend) CacheConfig() ml.CacheConfig {
|
||||
// return ml.CacheConfig{PermutedV: b.permutedV}
|
||||
// }
|
||||
|
||||
// type testContext struct {
|
||||
// ml.Context
|
||||
// }
|
||||
|
||||
// func (c *testContext) Empty(dtype ml.DType, shape ...int) ml.Tensor {
|
||||
// total := 0
|
||||
|
||||
// if len(shape) > 0 {
|
||||
// total = 1
|
||||
// for _, s := range shape {
|
||||
// total *= s
|
||||
// }
|
||||
// }
|
||||
|
||||
// return &testTensor{dtype: dtype, elementSize: 4, data: make([]float32, total), shape: shape}
|
||||
// }
|
||||
|
||||
// func (c *testContext) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
|
||||
// return c.Empty(dtype, shape...)
|
||||
// }
|
||||
|
||||
// func (c *testContext) FromFloats(s []float32, shape ...int) ml.Tensor {
|
||||
// t := c.Empty(ml.DTypeF32, shape...).(*testTensor)
|
||||
|
||||
// copy(t.data, s)
|
||||
|
||||
// return t
|
||||
// }
|
||||
|
||||
// func (c *testContext) FromInts(s []int32, shape ...int) ml.Tensor {
|
||||
// f := make([]float32, len(s))
|
||||
// for i := range f {
|
||||
// f[i] = float32(s[i])
|
||||
// }
|
||||
|
||||
// out := c.FromFloats(f, shape...)
|
||||
// out.(*testTensor).dtype = ml.DTypeI32
|
||||
|
||||
// return out
|
||||
// }
|
||||
|
||||
// func (c *testContext) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
|
||||
// s := make([]float32, 0, int((stop-start)/step))
|
||||
// for i := start; i < stop; i += step {
|
||||
// s = append(s, i)
|
||||
// }
|
||||
|
||||
// out := c.FromFloats(s, len(s))
|
||||
// out.(*testTensor).dtype = dtype
|
||||
// return out
|
||||
// }
|
||||
|
||||
// func (c *testContext) Input() ml.Context { return c }
|
||||
// func (c *testContext) Layer(int) ml.Context { return c }
|
||||
|
||||
// func (c *testContext) Forward(...ml.Tensor) ml.Context { return c }
|
||||
|
||||
// func (c *testContext) Compute(...ml.Tensor) {}
|
||||
|
||||
// func (c *testContext) Reserve() {}
|
||||
|
||||
// func (c *testContext) MaxGraphNodes() int {
|
||||
// return 10
|
||||
// }
|
||||
|
||||
// func (c *testContext) Close() {}
|
||||
|
||||
// type testTensor struct {
|
||||
// ml.Tensor
|
||||
|
||||
// dtype ml.DType
|
||||
// elementSize int
|
||||
// data []float32
|
||||
// shape []int
|
||||
// }
|
||||
|
||||
// func (t *testTensor) Dim(n int) int {
|
||||
// return t.shape[n]
|
||||
// }
|
||||
|
||||
// func (t *testTensor) Stride(n int) int {
|
||||
// stride := t.elementSize
|
||||
// for i := range n {
|
||||
// stride *= t.shape[i]
|
||||
// }
|
||||
|
||||
// return stride
|
||||
// }
|
||||
|
||||
// func (t *testTensor) Shape() []int {
|
||||
// return t.shape
|
||||
// }
|
||||
|
||||
// func (t *testTensor) DType() ml.DType {
|
||||
// return t.dtype
|
||||
// }
|
||||
|
||||
// func (t *testTensor) Floats() []float32 {
|
||||
// out := make([]float32, len(t.data))
|
||||
// copy(out, t.data)
|
||||
// return out
|
||||
// }
|
||||
|
||||
// func (t *testTensor) Neg(ctx ml.Context) ml.Tensor {
|
||||
// out := ctx.Empty(t.DType(), t.Shape()...).(*testTensor)
|
||||
// for i := range out.data {
|
||||
// out.data[i] = -t.data[i]
|
||||
// }
|
||||
// return out
|
||||
// }
|
||||
|
||||
// func (t *testTensor) Add(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
|
||||
// out := ctx.Empty(t.DType(), t.Shape()...).(*testTensor)
|
||||
|
||||
// for i := range out.data {
|
||||
// out.data[i] = t.data[i] + t2.(*testTensor).data[i]
|
||||
// }
|
||||
|
||||
// return out
|
||||
// }
|
||||
|
||||
// func (t *testTensor) Reshape(ctx ml.Context, shape ...int) ml.Tensor {
|
||||
// return &testTensor{
|
||||
// dtype: t.dtype,
|
||||
// elementSize: t.elementSize,
|
||||
// data: t.data,
|
||||
// shape: shape,
|
||||
// }
|
||||
// }
|
||||
|
||||
// func (t *testTensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
|
||||
// offset /= t.elementSize
|
||||
|
||||
// var s []int
|
||||
|
||||
// switch len(shape) {
|
||||
// case 1:
|
||||
// s = []int{shape[0]}
|
||||
// case 3:
|
||||
// s = []int{shape[0], shape[2]}
|
||||
// case 5:
|
||||
// s = []int{shape[0], shape[2], shape[4]}
|
||||
// default:
|
||||
// panic("unsupported number of dimensions")
|
||||
// }
|
||||
|
||||
// context := &testContext{}
|
||||
|
||||
// view := context.Empty(t.dtype, s...).(*testTensor)
|
||||
// view.data = t.data[offset : offset+len(view.data)]
|
||||
|
||||
// return view
|
||||
// }
|
||||
|
||||
// func (t *testTensor) Permute(ctx ml.Context, order ...int) ml.Tensor {
|
||||
// if len(t.shape) > 4 || len(order) > 4 {
|
||||
// panic("permute only supports up to 4 dimensions")
|
||||
// }
|
||||
|
||||
// if len(order) != len(t.shape) && len(order) != 4 {
|
||||
// panic("invalid number of dimensions for permute")
|
||||
// }
|
||||
|
||||
// // ggml_permute expects 4 axes, so fill in any missing dimensions.
|
||||
// orderFull := append(make([]int, 0, 4), order...)
|
||||
// for len(orderFull) < 4 {
|
||||
// orderFull = append(orderFull, len(orderFull))
|
||||
// }
|
||||
|
||||
// seen := [4]bool{}
|
||||
|
||||
// shape4 := [4]int{1, 1, 1, 1}
|
||||
// for i := 0; i < len(t.shape) && i < 4; i++ {
|
||||
// shape4[i] = t.shape[i]
|
||||
// }
|
||||
|
||||
// newShape4 := [4]int{1, 1, 1, 1}
|
||||
// for axis := range 4 {
|
||||
// dst := orderFull[axis]
|
||||
// if dst < 0 || dst >= 4 {
|
||||
// panic("invalid axis for permute")
|
||||
// }
|
||||
// if seen[dst] {
|
||||
// panic("duplicate axis for permute")
|
||||
// }
|
||||
// seen[dst] = true
|
||||
// newShape4[dst] = shape4[axis]
|
||||
// }
|
||||
|
||||
// total := len(t.data)
|
||||
// newData := make([]float32, total)
|
||||
|
||||
// if total > 0 {
|
||||
// oldDims := shape4
|
||||
// newDims := newShape4
|
||||
|
||||
// oldStride := [4]int{1, 1, 1, 1}
|
||||
// newStride := [4]int{1, 1, 1, 1}
|
||||
// for i := 1; i < 4; i++ {
|
||||
// oldStride[i] = oldStride[i-1] * oldDims[i-1]
|
||||
// newStride[i] = newStride[i-1] * newDims[i-1]
|
||||
// }
|
||||
|
||||
// var coords [4]int
|
||||
// var newCoords [4]int
|
||||
|
||||
// for idx := range total {
|
||||
// remainder := idx
|
||||
// for axis := range 4 {
|
||||
// dim := oldDims[axis]
|
||||
// if dim == 0 {
|
||||
// coords[axis] = 0
|
||||
// continue
|
||||
// }
|
||||
// coords[axis] = remainder % dim
|
||||
// remainder /= dim
|
||||
// }
|
||||
|
||||
// for axis := range 4 {
|
||||
// newCoords[orderFull[axis]] = coords[axis]
|
||||
// }
|
||||
|
||||
// newIndex := 0
|
||||
// for axis := range 4 {
|
||||
// if newDims[axis] == 0 {
|
||||
// continue
|
||||
// }
|
||||
// newIndex += newCoords[axis] * newStride[axis]
|
||||
// }
|
||||
|
||||
// newData[newIndex] = t.data[idx]
|
||||
// }
|
||||
// }
|
||||
|
||||
// numDims := 4
|
||||
// for numDims > 1 && newShape4[numDims-1] <= 1 {
|
||||
// numDims--
|
||||
// }
|
||||
|
||||
// newShape := make([]int, numDims)
|
||||
// copy(newShape, newShape4[:numDims])
|
||||
|
||||
// return &testTensor{
|
||||
// dtype: t.dtype,
|
||||
// elementSize: t.elementSize,
|
||||
// data: newData,
|
||||
// shape: newShape,
|
||||
// }
|
||||
// }
|
||||
|
||||
// func (t *testTensor) SetRows(ctx ml.Context, src ml.Tensor, idxs ml.Tensor) ml.Tensor {
|
||||
// dst := t
|
||||
// srcTensor := src.(*testTensor)
|
||||
// idxTensor := idxs.(*testTensor)
|
||||
|
||||
// shapeTo4D := func(shape []int) [4]int {
|
||||
// out := [4]int{1, 1, 1, 1}
|
||||
// for i := 0; i < len(shape) && i < 4; i++ {
|
||||
// out[i] = shape[i]
|
||||
// }
|
||||
// return out
|
||||
// }
|
||||
|
||||
// computeStrides := func(shape [4]int) [4]int {
|
||||
// out := [4]int{1, 1, 1, 1}
|
||||
// for i := 1; i < 4; i++ {
|
||||
// out[i] = out[i-1] * shape[i-1]
|
||||
// }
|
||||
// return out
|
||||
// }
|
||||
|
||||
// dstShape4D := shapeTo4D(dst.shape)
|
||||
// srcShape4D := shapeTo4D(srcTensor.shape)
|
||||
// idxShape4D := shapeTo4D(idxTensor.shape)
|
||||
|
||||
// if dstShape4D[0] != srcShape4D[0] || dstShape4D[2] != srcShape4D[2] || dstShape4D[3] != srcShape4D[3] {
|
||||
// panic("SetRows requires matching tensor shapes")
|
||||
// }
|
||||
|
||||
// if srcShape4D[1] != idxShape4D[0] {
|
||||
// panic("SetRows rows/index mismatch")
|
||||
// }
|
||||
|
||||
// if srcShape4D[2]%idxShape4D[1] != 0 || srcShape4D[3]%idxShape4D[2] != 0 {
|
||||
// panic("SetRows cannot broadcast indices")
|
||||
// }
|
||||
|
||||
// if idxShape4D[3] != 1 {
|
||||
// panic("SetRows expects 1D or 2D index tensors")
|
||||
// }
|
||||
|
||||
// dstStride := computeStrides(dstShape4D)
|
||||
// srcStride := computeStrides(srcShape4D)
|
||||
// idxStride := computeStrides(idxShape4D)
|
||||
|
||||
// numColumns := srcShape4D[0]
|
||||
// numRows := srcShape4D[1]
|
||||
|
||||
// for dim3Index := range dstShape4D[3] {
|
||||
// for dim2Index := range dstShape4D[2] {
|
||||
// idxDim2 := 0
|
||||
// idxDim3 := 0
|
||||
// if idxShape4D[1] > 0 {
|
||||
// idxDim2 = dim2Index % idxShape4D[1]
|
||||
// }
|
||||
// if idxShape4D[2] > 0 {
|
||||
// idxDim3 = dim3Index % idxShape4D[2]
|
||||
// }
|
||||
|
||||
// idxBase := idxDim3*idxStride[2] + idxDim2*idxStride[1]
|
||||
// srcBase := dim3Index*srcStride[3] + dim2Index*srcStride[2]
|
||||
// dstBase := dim3Index*dstStride[3] + dim2Index*dstStride[2]
|
||||
|
||||
// for row := range numRows {
|
||||
// idx := int(idxTensor.data[idxBase+row*idxStride[0]])
|
||||
// if idx < 0 || idx >= dstShape4D[1] {
|
||||
// panic("SetRows index out of range")
|
||||
// }
|
||||
|
||||
// srcOffset := srcBase + row*srcStride[1]
|
||||
// dstOffset := dstBase + idx*dstStride[1]
|
||||
|
||||
// copy(dst.data[dstOffset:dstOffset+numColumns], srcTensor.data[srcOffset:srcOffset+numColumns])
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
// return dst
|
||||
// }
|
||||
|
||||
// func (t *testTensor) Copy(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
|
||||
// copy(t2.(*testTensor).data, t.data)
|
||||
// return nil
|
||||
// }
|
||||
@@ -1,156 +0,0 @@
|
||||
package kvcache
|
||||
|
||||
// import (
|
||||
// "fmt"
|
||||
|
||||
// "github.com/ollama/ollama/ml"
|
||||
// "github.com/ollama/ollama/model/input"
|
||||
// )
|
||||
|
||||
// // Encoder cache stores K and V tensors that are position independent
|
||||
// //
|
||||
// // The tensors can be of any shape and will be returned as they were stored
|
||||
// // The mask is currently always nil
|
||||
// //
|
||||
// // Not currently safe for multiple sequences
|
||||
// type EncoderCache struct {
|
||||
// // config controls mostly backend-specific optimizations
|
||||
// config *ml.CacheConfig
|
||||
|
||||
// // ** current forward pass **
|
||||
|
||||
// // the active layer for Get and Put
|
||||
// curLayer int
|
||||
|
||||
// // if something is stored during this pass, this
|
||||
// // will be the position (but there is no guarantee
|
||||
// // anything will be stored)
|
||||
// curPos int32
|
||||
|
||||
// // curReserve indicates that this forward pass is only for
|
||||
// // memory reservation and we should not update our metadata
|
||||
// // based on it.
|
||||
// curReserve bool
|
||||
|
||||
// // ** cache metadata **
|
||||
|
||||
// // was something stored in the cache?
|
||||
// encoderCached bool
|
||||
|
||||
// // position of the cached data
|
||||
// encoderPos int32
|
||||
|
||||
// // ** cache data storage **
|
||||
// backend ml.Backend
|
||||
// ctxs map[int]ml.Context
|
||||
// keys, values map[int]ml.Tensor
|
||||
// }
|
||||
|
||||
// func NewEncoderCache() *EncoderCache {
|
||||
// return &EncoderCache{
|
||||
// ctxs: make(map[int]ml.Context),
|
||||
// keys: make(map[int]ml.Tensor),
|
||||
// values: make(map[int]ml.Tensor),
|
||||
// }
|
||||
// }
|
||||
|
||||
// func (c *EncoderCache) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity, maxBatch int) {
|
||||
// if c.config == nil {
|
||||
// var config ml.CacheConfig
|
||||
// if cc, ok := backend.(ml.BackendCacheConfig); ok {
|
||||
// config = cc.CacheConfig()
|
||||
// }
|
||||
// c.config = &config
|
||||
// }
|
||||
|
||||
// if maxSequences > 1 {
|
||||
// panic(fmt.Errorf("encoder cache does not support multiple sequences; requested: %v", maxSequences))
|
||||
// }
|
||||
|
||||
// if c.config.CachePadding != 0 && c.config.CachePadding != 1 {
|
||||
// panic(fmt.Errorf("encoder cache is unable to enforce requested CachePadding (%v)", c.config.CachePadding))
|
||||
// }
|
||||
|
||||
// c.backend = backend
|
||||
// }
|
||||
|
||||
// func (c *EncoderCache) SetConfig(config ml.CacheConfig) {
|
||||
// if c.config != nil {
|
||||
// panic("config cannot be changed after being previously set, either by the model or backend")
|
||||
// }
|
||||
|
||||
// c.config = &config
|
||||
// }
|
||||
|
||||
// func (c *EncoderCache) Close() {
|
||||
// for _, ctx := range c.ctxs {
|
||||
// ctx.Close()
|
||||
// }
|
||||
// }
|
||||
|
||||
// func (c *EncoderCache) StartForward(ctx ml.Context, batch input.Batch, reserve bool) error {
|
||||
// // We work with the most recent image
|
||||
// if len(batch.Multimodal) > 0 {
|
||||
// c.curPos = batch.Positions[batch.Multimodal[len(batch.Multimodal)-1].Index]
|
||||
// }
|
||||
|
||||
// c.curReserve = reserve
|
||||
|
||||
// return nil
|
||||
// }
|
||||
|
||||
// func (c *EncoderCache) SetLayer(layer int) {
|
||||
// c.curLayer = layer
|
||||
// }
|
||||
|
||||
// func (c *EncoderCache) EncoderCached() bool {
|
||||
// return c.encoderCached
|
||||
// }
|
||||
|
||||
// func (c *EncoderCache) Get(ctx ml.Context) (ml.Tensor, ml.Tensor, ml.Tensor) {
|
||||
// return c.keys[c.curLayer], c.values[c.curLayer], nil
|
||||
// }
|
||||
|
||||
// func (c *EncoderCache) Put(ctx ml.Context, key, value ml.Tensor) {
|
||||
// if !c.curReserve {
|
||||
// c.encoderPos = c.curPos
|
||||
// c.encoderCached = true
|
||||
// }
|
||||
|
||||
// if c.config.PermutedV {
|
||||
// value = value.Transpose(ctx, 1, 2, 0, 3)
|
||||
// }
|
||||
|
||||
// if _, ok := c.ctxs[c.curLayer]; !ok {
|
||||
// c.ctxs[c.curLayer] = c.backend.NewContext().Layer(c.curLayer)
|
||||
// }
|
||||
|
||||
// if _, ok := c.keys[c.curLayer]; !ok {
|
||||
// c.keys[c.curLayer] = c.ctxs[c.curLayer].Empty(key.DType(), key.Shape()...)
|
||||
// }
|
||||
|
||||
// if _, ok := c.values[c.curLayer]; !ok {
|
||||
// c.values[c.curLayer] = c.ctxs[c.curLayer].Empty(value.DType(), value.Shape()...)
|
||||
// }
|
||||
|
||||
// ctx.Forward(
|
||||
// key.Copy(ctx, c.keys[c.curLayer]),
|
||||
// value.Copy(ctx, c.values[c.curLayer]),
|
||||
// )
|
||||
// }
|
||||
|
||||
// func (c *EncoderCache) CopyPrefix(srcSeq, dstSeq int, len int32) {
|
||||
// panic("encoder cache does not support multiple sequences")
|
||||
// }
|
||||
|
||||
// func (c *EncoderCache) CanResume(seq int, pos int32) bool {
|
||||
// return true
|
||||
// }
|
||||
|
||||
// func (c *EncoderCache) Remove(seq int, beginIndex, endIndex int32) error {
|
||||
// if c.encoderPos >= beginIndex && c.encoderPos < endIndex {
|
||||
// c.encoderCached = false
|
||||
// }
|
||||
|
||||
// return nil
|
||||
// }
|
||||
144
x/kvcache/mlx.go
@@ -1,144 +0,0 @@
//go:build mlx
|
||||
|
||||
package kvcache
|
||||
|
||||
import (
|
||||
"github.com/ollama/ollama/x/ml"
|
||||
"github.com/ollama/ollama/x/model/input"
|
||||
)
|
||||
|
||||
// Causal cache stores K and V tensors according to their position in the
|
||||
// sequence. Returns the history and a mask for attending to past tokens
|
||||
type MLXCausal struct {
|
||||
DType ml.DType
|
||||
|
||||
// locations for data storage for this batch
|
||||
curLocPut ml.Tensor
|
||||
|
||||
// locations for data retrieval for this batch
|
||||
curLocGet ml.Tensor
|
||||
|
||||
// the active layer for Get and Put
|
||||
curLayer int
|
||||
|
||||
capacity int
|
||||
|
||||
offset int
|
||||
|
||||
backend ml.Backend
|
||||
ctxs map[int]ml.Context
|
||||
keys, values map[int]ml.Tensor
|
||||
|
||||
// TODO is this needed per layer, or will it always be consistent?
|
||||
kHeadDims, vHeadDims, numKVHeads map[int]int
|
||||
}
|
||||
|
||||
func NewMLXCausalCache() *MLXCausal {
|
||||
return &MLXCausal{
|
||||
ctxs: make(map[int]ml.Context),
|
||||
keys: make(map[int]ml.Tensor),
|
||||
values: make(map[int]ml.Tensor),
|
||||
kHeadDims: make(map[int]int),
|
||||
vHeadDims: make(map[int]int),
|
||||
numKVHeads: make(map[int]int),
|
||||
}
|
||||
}
|
||||
|
||||
func (c *MLXCausal) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity, maxBatch int) {
|
||||
c.DType = dtype
|
||||
c.capacity = capacity
|
||||
c.backend = backend
|
||||
}
|
||||
|
||||
func (c *MLXCausal) SetConfig(config ml.CacheConfig) {}
|
||||
|
||||
func (c *MLXCausal) SetLayer(layer int) {
|
||||
c.curLayer = layer
|
||||
}
|
||||
|
||||
func (c *MLXCausal) Close() {
|
||||
// slog.Info("XXX MLXCausal.Close called", "number of contexts", len(c.ctxs))
|
||||
for _, ctx := range c.ctxs {
|
||||
ctx.Close()
|
||||
}
|
||||
}
|
||||
|
||||
func (c *MLXCausal) StartForward(ctx ml.Context, batch input.Batch, reserve bool) error {
|
||||
locsPut := make([]int32, len(batch.Positions))
|
||||
for i := c.offset; i < len(batch.Positions); i++ {
|
||||
locsPut[i-c.offset] = int32(i)
|
||||
}
|
||||
c.offset += len(batch.Positions)
|
||||
locsGet := make([]int32, c.offset)
|
||||
for i := range c.offset {
|
||||
locsGet[i] = int32(i)
|
||||
}
|
||||
c.curLocGet = ctx.Input().FromInts(locsGet, len(locsGet))
|
||||
c.curLocPut = ctx.Input().FromInts(locsPut, len(locsPut))
|
||||
// slog.Info("XXX MLXCausal.StartForward", "offset", c.offset, "put", locsPut, "get", locsGet)
|
||||
|
||||
return nil
|
||||
}
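// Worked example (illustrative only, derived from the loops above): on the
// first forward pass with offset 0 and a 3-token batch, locsPut = [0 1 2];
// after offset advances to 3, locsGet = [0 1 2], i.e. the new tokens are
// written to fresh cache rows and the full history is read back.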
func (c *MLXCausal) Put(ctx ml.Context, key, value ml.Tensor) {
|
||||
kHeadDim := key.Dim(3)
|
||||
vHeadDim := value.Dim(3)
|
||||
numKVHeads := key.Dim(1)
|
||||
batchSize := key.Dim(2)
|
||||
kCellSize := kHeadDim * numKVHeads
|
||||
vCellSize := vHeadDim * numKVHeads
|
||||
// slog.Info("XXX Causal.Put", "kHeadDim", kHeadDim, "vHeadDim", vHeadDim, "numKVHeads", numKVHeads, "batchSize", batchSize, "kCellSize", kCellSize, "vCellSize", vCellSize)
|
||||
|
||||
if _, ok := c.ctxs[c.curLayer]; !ok {
|
||||
// slog.Info("XXX Causal.Put creating new context", "c.curLayer", c.curLayer)
|
||||
c.ctxs[c.curLayer] = c.backend.NewContext().Layer(c.curLayer)
|
||||
}
|
||||
|
||||
if _, ok := c.keys[c.curLayer]; !ok {
|
||||
// slog.Info("XXX MLXCausal.Put allocating keys and values", "c.curLayer", c.curLayer, "shape", []int{c.capacity, kCellSize})
|
||||
c.keys[c.curLayer] = c.ctxs[c.curLayer].Zeros(c.DType, c.capacity, kCellSize)
|
||||
c.values[c.curLayer] = c.ctxs[c.curLayer].Zeros(c.DType, c.capacity, vCellSize)
|
||||
c.kHeadDims[c.curLayer] = kHeadDim
|
||||
c.vHeadDims[c.curLayer] = vHeadDim
|
||||
c.numKVHeads[c.curLayer] = numKVHeads
|
||||
}
|
||||
key = key.Reshape(ctx, batchSize, 1, kCellSize)
|
||||
|
||||
// slog.Info("XXX MLXCausal.Put ", "c.keys[c.curLayer]", c.keys[c.curLayer])
|
||||
// slog.Info("XXX MLXCausal.Put ", "c.curLocPut", c.curLocPut)
|
||||
// slog.Info("XXX MLXCausal.Put ", "key", key)
|
||||
ctx.Forward(c.keys[c.curLayer].Scatter(ctx, []ml.Tensor{c.curLocPut}, key, []int{0}))
|
||||
value = value.Reshape(ctx, batchSize, 1, vCellSize)
|
||||
ctx.Forward(c.values[c.curLayer].Scatter(ctx, []ml.Tensor{c.curLocPut}, value, []int{0}))
|
||||
|
||||
}
|
||||
|
||||
func (c *MLXCausal) Get(ctx ml.Context) (ml.Tensor, ml.Tensor, ml.Tensor) {
|
||||
key := c.keys[c.curLayer]
|
||||
value := c.values[c.curLayer]
|
||||
|
||||
kHeadDim := c.kHeadDims[c.curLayer]
|
||||
vHeadDim := c.vHeadDims[c.curLayer]
|
||||
numKVHeads := c.numKVHeads[c.curLayer]
|
||||
// rowSize := numKVHeads * c.curBatchSize
|
||||
// cachedSize := c.curMask.Dim(1)
|
||||
cachedSize := c.curLocGet.Dim(0)
|
||||
// kCellSize := kHeadDim * numKVHeads
|
||||
// vCellSize := vHeadDim * numKVHeads
|
||||
// slog.Info("XXX MLXCausal.Get", "shape", []int{1, numKVHeads, cachedSize, kHeadDim})
|
||||
|
||||
key = key.TakeAxes(ctx, c.curLocGet, 0).Reshape(ctx, 1, numKVHeads, cachedSize, kHeadDim)
|
||||
value = value.TakeAxes(ctx, c.curLocGet, 0).Reshape(ctx, 1, numKVHeads, cachedSize, vHeadDim)
|
||||
return key, value, nil
|
||||
}
|
||||
|
||||
func (c *MLXCausal) CopyPrefix(srcSeq, dstSeq int, len int32) {
|
||||
panic("not implemented")
|
||||
}
|
||||
|
||||
func (c *MLXCausal) CanResume(seq int, pos int32) bool {
|
||||
panic("not implemented")
|
||||
}
|
||||
|
||||
func (c *MLXCausal) Remove(seq int, beginIndex, endIndex int32) error {
|
||||
panic("not implemented")
|
||||
}
|
||||
@@ -1,110 +0,0 @@
|
||||
package kvcache
|
||||
|
||||
// import (
|
||||
// "math"
|
||||
|
||||
// "github.com/ollama/ollama/ml"
|
||||
// "github.com/ollama/ollama/model/input"
|
||||
// )
|
||||
|
||||
// // Wrapper cache is a container for multiple types of caches,
|
||||
// // such as for the encoding and decoding portions of a model.
|
||||
// type WrapperCache struct {
|
||||
// // caches we are wrapping
|
||||
// caches []Cache
|
||||
|
||||
// // cache to be used for this layer
|
||||
// curType int
|
||||
// }
|
||||
|
||||
// func NewWrapperCache(caches ...Cache) *WrapperCache {
|
||||
// return &WrapperCache{
|
||||
// caches: caches,
|
||||
// }
|
||||
// }
|
||||
|
||||
// func (c *WrapperCache) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity, maxBatch int) {
|
||||
// for _, cache := range c.caches {
|
||||
// cache.Init(backend, dtype, maxSequences, capacity, maxBatch)
|
||||
// }
|
||||
// }
|
||||
|
||||
// func (c *WrapperCache) SetConfig(config ml.CacheConfig) {
|
||||
// for _, cache := range c.caches {
|
||||
// cache.SetConfig(config)
|
||||
// }
|
||||
// }
|
||||
|
||||
// func (c *WrapperCache) Close() {
|
||||
// for _, cache := range c.caches {
|
||||
// cache.Close()
|
||||
// }
|
||||
// }
|
||||
|
||||
// func (c *WrapperCache) StartForward(ctx ml.Context, batch input.Batch, reserve bool) error {
|
||||
// for i, cache := range c.caches {
|
||||
// err := cache.StartForward(ctx, batch, reserve)
|
||||
// if err != nil {
|
||||
// // unwind on error - Remove with endIndex set to math.MaxInt32 does not fail
|
||||
// for j := i - 1; j >= 0; j-- {
|
||||
// for k := range batch.Positions {
|
||||
// _ = c.caches[j].Remove(batch.Sequences[k], batch.Positions[k], math.MaxInt32)
|
||||
// }
|
||||
// }
|
||||
// return err
|
||||
// }
|
||||
// }
|
||||
|
||||
// c.curType = 0
|
||||
// return nil
|
||||
// }
|
||||
|
||||
// func (c *WrapperCache) SetLayer(layer int) {
|
||||
// for _, cache := range c.caches {
|
||||
// cache.SetLayer(layer)
|
||||
// }
|
||||
// }
|
||||
|
||||
// func (c *WrapperCache) SetLayerType(layerType int) {
|
||||
// c.curType = layerType
|
||||
// }
|
||||
|
||||
// func (c *WrapperCache) UnderlyingCache() Cache {
|
||||
// return c.caches[c.curType]
|
||||
// }
|
||||
|
||||
// func (c *WrapperCache) Get(ctx ml.Context) (ml.Tensor, ml.Tensor, ml.Tensor) {
|
||||
// return c.caches[c.curType].Get(ctx)
|
||||
// }
|
||||
|
||||
// func (c *WrapperCache) Put(ctx ml.Context, key, value ml.Tensor) {
|
||||
// c.caches[c.curType].Put(ctx, key, value)
|
||||
// }
|
||||
|
||||
// func (c *WrapperCache) CopyPrefix(srcSeq, dstSeq int, len int32) {
|
||||
// for _, cache := range c.caches {
|
||||
// cache.CopyPrefix(srcSeq, dstSeq, len)
|
||||
// }
|
||||
// }
|
||||
|
||||
// func (c *WrapperCache) CanResume(seq int, pos int32) bool {
|
||||
// for _, cache := range c.caches {
|
||||
// if !cache.CanResume(seq, pos) {
|
||||
// return false
|
||||
// }
|
||||
// }
|
||||
|
||||
// return true
|
||||
// }
|
||||
|
||||
// func (c *WrapperCache) Remove(seq int, beginIndex, endIndex int32) error {
|
||||
// // If the one of these fails, the caller is supposed to retry with endIndex set to math.MaxInt32, which should not fail
|
||||
// for _, cache := range c.caches {
|
||||
// err := cache.Remove(seq, beginIndex, endIndex)
|
||||
// if err != nil {
|
||||
// return err
|
||||
// }
|
||||
// }
|
||||
|
||||
// return nil
|
||||
// }
|
||||
433
x/ml/backend.go
@@ -1,433 +0,0 @@
package ml
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
|
||||
"github.com/ollama/ollama/fs"
|
||||
)
|
||||
|
||||
type Backend interface {
|
||||
// Close frees all memory associated with this backend
|
||||
// Close()
|
||||
|
||||
// Load(ctx context.Context, progress func(float32)) error
|
||||
|
||||
// BackendMemory returns the memory allocations that were made for this model
|
||||
// BackendMemory() BackendMemory
|
||||
|
||||
Config() fs.Config
|
||||
Get(name string) Tensor
|
||||
NewContext() Context
|
||||
// NewContextSize(size int) Context
|
||||
|
||||
// Enumerate the devices available for inference via this backend
|
||||
// BackendDevices() []DeviceInfo
|
||||
}
|
||||
|
||||
// BackendCacheConfig should be implemented by backends that need special output
|
||||
// from the cache to meet specific requirements. It is frequently implemented in
|
||||
// conjunction with ScaledDotProductAttention.
|
||||
type BackendCacheConfig interface {
|
||||
CacheConfig() CacheConfig
|
||||
}
|
||||
|
||||
// CacheConfig controls optimizations (mostly backend-specific) that may transform
// the output of the cache to work better with specific kernels.
|
||||
type CacheConfig struct {
|
||||
// CachePadding specifies the multiple for the number of tokens of cache history
|
||||
// that will be returned from cache Get for k, v and mask. The capacity of the
|
||||
// cache itself will also be increased to a multiple of this size if needed.
|
||||
CachePadding int
|
||||
|
||||
// PermutedV performs Permute(ctx, 1, 2, 0, 3) on v tensors stored via Put
|
||||
// and returns the permuted version via Get. This uses the cache copy operation
|
||||
// to avoid a Contiguous call on the permuted tensor.
|
||||
PermutedV bool
|
||||
|
||||
// MaskDType specifies the data type for generating the mask. If unset it will
|
||||
// default to DTypeF32.
|
||||
MaskDType DType
|
||||
|
||||
// MaskBatchPadding specifies the multiple for the batch size dimension in the mask.
|
||||
// Any position that does not correspond to an actual token will be filled with -Inf.
|
||||
MaskBatchPadding int
|
||||
}
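// A minimal sketch (not part of this file) of how a backend might honor
// CachePadding when sizing its cache; the helper name is hypothetical:
//
//	func paddedCapacity(capacity int, cfg CacheConfig) int {
//		if cfg.CachePadding <= 1 {
//			return capacity
//		}
//		// round capacity up to the next multiple of CachePadding
//		return (capacity + cfg.CachePadding - 1) / cfg.CachePadding * cfg.CachePadding
//	}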
|
||||
// BackendParams controls how the backend loads and executes models
|
||||
type BackendParams struct {
|
||||
// AllocMemory causes the backend to allocate memory for the model. If
|
||||
// false, this is only being used for discovering the required amount of
|
||||
// memory and cannot load the model for running.
|
||||
AllocMemory bool
|
||||
|
||||
// NumThreads sets the number of threads to use if running on the CPU
|
||||
NumThreads int
|
||||
|
||||
// GPULayers is the set of layers to offload to GPUs
|
||||
GPULayers GPULayersList
|
||||
|
||||
// FlashAttention indicates that we should use a fused flash attention kernel
|
||||
FlashAttention bool
|
||||
}
|
||||
|
||||
var backends = make(map[string]func(string, BackendParams) (Backend, error))
|
||||
|
||||
func RegisterBackend(name string, f func(string, BackendParams) (Backend, error)) {
|
||||
if _, ok := backends[name]; ok {
|
||||
panic("backend: backend already registered")
|
||||
}
|
||||
|
||||
backends[name] = f
|
||||
}
|
||||
|
||||
func NewBackend(modelPath string, params BackendParams) (Backend, error) {
|
||||
be := os.Getenv("OLLAMA_BACKEND")
|
||||
if be == "" {
|
||||
be = "mlx"
|
||||
slog.Info("Defaulting to " + be + ". Set OLLAMA_BACKEND to override")
|
||||
}
|
||||
slog.Info("Loading new engine", "backend", be)
|
||||
if backend, ok := backends[be]; ok {
|
||||
return backend(modelPath, params)
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("unsupported backend")
|
||||
}
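// A hypothetical backend package would typically register itself in an init
// function so NewBackend can resolve it by name (sketch only; newMLXBackend
// is an assumed constructor, not part of this file):
//
//	func init() {
//		RegisterBackend("mlx", func(modelPath string, params BackendParams) (Backend, error) {
//			return newMLXBackend(modelPath, params)
//		})
//	}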
|
||||
type Context interface {
|
||||
Empty(dtype DType, shape ...int) Tensor
|
||||
Zeros(dtype DType, shape ...int) Tensor
|
||||
// FromBytes(dtype DType, s []byte, shape ...int) Tensor
|
||||
FromFloats(s []float32, shape ...int) Tensor
|
||||
FromInts(s []int32, shape ...int) Tensor
|
||||
RandomNormal(shape []int, dtype DType, loc, scale float32, key Tensor) Tensor
|
||||
|
||||
// Arange creates a 1D tensor with values in the half-open interval [start, stop), increasing by step.
|
||||
Arange(start, stop, step float32, dtype DType) Tensor
|
||||
|
||||
Forward(...Tensor) Context
|
||||
|
||||
// SetBatchSize provides a hint on the batch size to optimize processing
|
||||
// Uses heuristics if not set
|
||||
// SetBatchSize(int)
|
||||
|
||||
Compute(...Tensor)
|
||||
// ComputeWithNotify(func(), ...Tensor) // notify callback once compute has begun
|
||||
|
||||
// Reserve is analogous to Compute but rather than executing a
|
||||
// graph, simply preallocates memory. Typically called with a
|
||||
// worst case graph to ensure all resources are available
// for future inference.
|
||||
// Reserve()
|
||||
|
||||
// MaxGraphNodes() int
|
||||
Close()
|
||||
|
||||
// Input returns a context appropriate for creating tensors that are
|
||||
// inputs to the model (which includes things like output locations)
|
||||
Input() Context
|
||||
|
||||
// Layer returns a context appropriate for creating intermediate tensors
|
||||
Layer(int) Context
|
||||
|
||||
// Load a tensor from "filename" safetensors file, and compare with the input tensor
|
||||
// Returns error if the shape is inconsistent, or similarity measures are below 99%
|
||||
CompareWith(filename string, tensors map[string]Tensor, abortOnError bool) error
|
||||
}
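// A minimal usage sketch, mirroring the MLX tests later in this diff (it
// assumes a concrete Backend value named backend is available):
//
//	ctx := backend.NewContext()
//	defer ctx.Close()
//	a := ctx.FromFloats([]float32{1, 2, 3, 4}, 2, 2)
//	b := ctx.FromFloats([]float32{4, 3, 2, 1}, 2, 2)
//	sum := a.Add(ctx, b)
//	ctx.Compute(sum)
//	_ = sum.Floats() // expected: [5 5 5 5]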
|
||||
type RoPEOptions struct {
|
||||
Base *float32
|
||||
Freqs Tensor
|
||||
}
|
||||
|
||||
func WithRoPEBase(base float32) func(*RoPEOptions) {
|
||||
return func(opts *RoPEOptions) {
|
||||
opts.Base = &base
|
||||
}
|
||||
}
|
||||
|
||||
func WithRoPEFreqs(freqs Tensor) func(*RoPEOptions) {
|
||||
return func(opts *RoPEOptions) {
|
||||
opts.Freqs = freqs
|
||||
}
|
||||
}
|
||||
|
||||
type Tensor interface {
|
||||
ToString() string
|
||||
RoPE(ctx Context, dims int, traditional bool, scale float32, offset int, options ...func(*RoPEOptions)) Tensor
|
||||
ScaledDotProductAttention(ctx Context, keys, values Tensor, scale float64, maskMode string, mask Tensor, sinks Tensor) Tensor
|
||||
TakeAxes(ctx Context, indicies Tensor, axes int) Tensor
|
||||
// TakeAxes(ctx Context, axes int, indicies ...int) Tensor
|
||||
|
||||
Dim(n int) int
|
||||
Stride(n int) int
|
||||
|
||||
Shape() []int
|
||||
DType() DType
|
||||
// Cast(ctx Context, dtype DType) Tensor
|
||||
|
||||
// Bytes() []byte
|
||||
Floats() []float32
|
||||
Ints() []int32
|
||||
|
||||
// FromBytes([]byte)
|
||||
// FromFloats([]float32)
|
||||
// FromInts([]int32)
|
||||
|
||||
Add(ctx Context, t2 Tensor) Tensor
|
||||
Sub(ctx Context, t2 Tensor) Tensor
|
||||
// Mul(ctx Context, t2 Tensor) Tensor
|
||||
// Div(ctx Context, t2 Tensor) Tensor
|
||||
|
||||
Max(ctx Context, axes []int, keepDims bool) Tensor
|
||||
Min(ctx Context, axes []int, keepDims bool) Tensor
|
||||
|
||||
Matmul(ctx Context, a2 Tensor) Tensor
|
||||
// Mulmat(ctx Context, t2 Tensor) Tensor
|
||||
// MulmatFullPrec(ctx Context, t2 Tensor) Tensor
|
||||
// MulmatID(ctx Context, t2, ids Tensor) Tensor
|
||||
// AddID(ctx Context, t2, ids Tensor) Tensor
|
||||
|
||||
Softmax(ctx Context) Tensor
|
||||
L2Norm(ctx Context, eps float32) Tensor
|
||||
LayerNorm(ctx Context, weight, bias Tensor, eps float32) Tensor
|
||||
RMSNorm(ctx Context, weight Tensor, eps float32) Tensor
|
||||
Scale(ctx Context, s float64) Tensor
|
||||
// SumRows(ctx Context) Tensor
|
||||
|
||||
AvgPool2D(ctx Context, k, s int, p float32) Tensor
|
||||
Conv2D(ctx Context, weight Tensor, stride0, stride1, padding0, padding1, dilation0, dilation1, groups int) Tensor
|
||||
Conv3D(ctx Context, weight Tensor, stride0, stride1, stride2, padding0, padding1, padding2, dilation0, dilation1, dilation2, groups int) Tensor
|
||||
|
||||
// IM2Col(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
|
||||
|
||||
// Sin(ctx Context) Tensor
|
||||
// Cos(ctx Context) Tensor
|
||||
// Tanh(ctx Context) Tensor
|
||||
GELU(ctx Context, up ...Tensor) Tensor
|
||||
// QuickGELU(ctx Context, up ...Tensor) Tensor
|
||||
// SILU(ctx Context, up ...Tensor) Tensor
|
||||
// RELU(ctx Context, up ...Tensor) Tensor
|
||||
// Sigmoid(ctx Context) Tensor
|
||||
|
||||
// SILUAlphaLimit is a variant of SILU that clamps the input to the range [-limit, limit]
|
||||
// SILUAlphaLimit(ctx Context, up Tensor, alpha, limit float32) Tensor
|
||||
|
||||
Reshape(ctx Context, shape ...int) Tensor
|
||||
AsStrided(ctx Context, shape, strides []int, offset int) Tensor
|
||||
Transpose(ctx Context, shape ...int) Tensor
|
||||
Contiguous(ctx Context, allowColMajor bool) Tensor
|
||||
|
||||
// Pad(ctx Context, shape ...int) Tensor
|
||||
|
||||
// Stack(ctx Context, dim int, s ...Tensor) Tensor
|
||||
|
||||
// Repeat repeats the tensor n times along dimension dim
|
||||
// Repeat(ctx Context, dim, n int) Tensor
|
||||
// Concat(ctx Context, t2 Tensor, dim int) Tensor
|
||||
// Rows(ctx Context, t2 Tensor) Tensor
|
||||
|
||||
// TODO these probably aren't actually needed - false starts on trying to wire up cache
|
||||
// SliceUpdate(ctx Context, update Tensor, start, stop, strides []int) Tensor
|
||||
// SliceUpdateDynamic(ctx Context, update, start Tensor, axes []int) Tensor
|
||||
// PutAlongAxis(ctx Context, indicies, values Tensor, axis int) Tensor
|
||||
|
||||
Scatter(ctx Context, indicies []Tensor, updates Tensor, axes []int) Tensor
|
||||
|
||||
Copy(ctx Context, t2 Tensor) Tensor
|
||||
// Duplicate(ctx Context) Tensor
|
||||
|
||||
// Slice(ctx Context, dim, low, high, step int) Tensor
|
||||
// Chunk(ctx Context, dim int, size int) []Tensor
|
||||
// ChunkSections(ctx Context, dim int, sections ...int) []Tensor
|
||||
|
||||
// TopK(ctx Context, k int) Tensor
|
||||
// Argsort(ctx Context) Tensor
|
||||
// Mean(ctx Context) Tensor
|
||||
// Variance(ctx Context) Tensor
|
||||
// Stddev(ctx Context) Tensor
|
||||
// Sqr(ctx Context) Tensor
|
||||
// Sqrt(ctx Context) Tensor
|
||||
|
||||
// Interpolate(ctx Context, dims [4]int, samplingMode SamplingMode) Tensor
|
||||
}
|
||||
|
||||
// ScaledDotProductAttention implements a fused attention
|
||||
// operation equivalent to following code on a tensor named
|
||||
// query:
|
||||
//
|
||||
// query = query.Permute(ctx, 0, 2, 1, 3)
|
||||
// key = key.Permute(ctx, 0, 2, 1, 3)
|
||||
// value = value.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
|
||||
//
|
||||
// kq := key.MulmatFullPrec(ctx, query)
|
||||
//
|
||||
// kq = kq.Scale(ctx, scale)
|
||||
//
|
||||
// if mask != nil {
|
||||
// kq = kq.Add(ctx, mask)
|
||||
// }
|
||||
//
|
||||
// kq = kq.Softmax(ctx)
|
||||
//
|
||||
// kqv := value.Mulmat(ctx, kq)
|
||||
// return kqv.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
|
||||
// type ScaledDotProductAttention interface {
|
||||
// ScaledDotProductAttention(ctx Context, key, value, mask, sinks Tensor, vmla Tensor, scale float64) Tensor
|
||||
// }
|
||||
|
||||
// type number interface {
|
||||
// ~int | ~int8 | ~int16 | ~int32 | ~int64 |
|
||||
// ~uint | ~uint8 | ~uint16 | ~uint32 | ~uint64 |
|
||||
// ~float32 | ~float64 |
|
||||
// ~complex64 | ~complex128
|
||||
// }
|
||||
|
||||
// func mul[T number](s ...T) T {
|
||||
// p := T(1)
|
||||
// for _, v := range s {
|
||||
// p *= v
|
||||
// }
|
||||
|
||||
// return p
|
||||
// }
|
||||
|
||||
// type DumpOptions func(*dumpOptions)
|
||||
|
||||
// // DumpWithPrecision sets the number of decimal places to print. Applies to float32 and float64.
|
||||
// func DumpWithPrecision(n int) DumpOptions {
|
||||
// return func(opts *dumpOptions) {
|
||||
// opts.Precision = n
|
||||
// }
|
||||
// }
|
||||
|
||||
// // DumpWithThreshold sets the threshold for printing the entire tensor. If the number of elements
|
||||
// // is less than or equal to this value, the entire tensor will be printed. Otherwise, only the
|
||||
// // beginning and end of each dimension will be printed.
|
||||
// func DumpWithThreshold(n int) DumpOptions {
|
||||
// return func(opts *dumpOptions) {
|
||||
// opts.Threshold = n
|
||||
// }
|
||||
// }
|
||||
|
||||
// // DumpWithEdgeItems sets the number of elements to print at the beginning and end of each dimension.
|
||||
// func DumpWithEdgeItems(n int) DumpOptions {
|
||||
// return func(opts *dumpOptions) {
|
||||
// opts.EdgeItems = n
|
||||
// }
|
||||
// }
|
||||
|
||||
// type dumpOptions struct {
|
||||
// Precision, Threshold, EdgeItems int
|
||||
// }
|
||||
|
||||
// func Dump(ctx Context, t Tensor, optsFuncs ...DumpOptions) string {
|
||||
// opts := dumpOptions{Precision: 4, Threshold: 1000, EdgeItems: 3}
|
||||
// for _, optsFunc := range optsFuncs {
|
||||
// optsFunc(&opts)
|
||||
// }
|
||||
|
||||
// if mul(t.Shape()...) <= opts.Threshold {
|
||||
// opts.EdgeItems = math.MaxInt
|
||||
// }
|
||||
|
||||
// switch t.DType() {
|
||||
// case DTypeFloat32:
|
||||
// return dump[[]float32](ctx, t, opts.EdgeItems, func(f float32) string {
|
||||
// return strconv.FormatFloat(float64(f), 'f', opts.Precision, 32)
|
||||
// })
|
||||
// case DTypeFloat16: // TODO other types...
|
||||
// f32 := ctx.Input().Empty(DTypeFloat32, t.Shape()...)
|
||||
// f32 = t.Copy(ctx, f32)
|
||||
// return dump[[]float32](ctx, f32, opts.EdgeItems, func(f float32) string {
|
||||
// return strconv.FormatFloat(float64(f), 'f', opts.Precision, 32)
|
||||
// })
|
||||
// case DTypeInt32:
|
||||
// return dump[[]int32](ctx, t, opts.EdgeItems, func(i int32) string {
|
||||
// return strconv.FormatInt(int64(i), 10)
|
||||
// })
|
||||
// default:
|
||||
// return "<unsupported>"
|
||||
// }
|
||||
// }
|
||||
|
||||
// func dump[S ~[]E, E number](ctx Context, t Tensor, items int, fn func(E) string) string {
|
||||
// if t.Bytes() == nil {
|
||||
// ctx.Compute(t)
|
||||
// }
|
||||
|
||||
// s := make(S, mul(t.Shape()...))
|
||||
// if err := binary.Read(bytes.NewBuffer(t.Bytes()), binary.LittleEndian, &s); err != nil {
|
||||
// panic(err)
|
||||
// }
|
||||
|
||||
// shape := t.Shape()
|
||||
// slices.Reverse(shape)
|
||||
|
||||
// var sb strings.Builder
|
||||
// var f func([]int, int)
|
||||
// f = func(dims []int, stride int) {
|
||||
// prefix := strings.Repeat(" ", len(shape)-len(dims)+1)
|
||||
// sb.WriteString("[")
|
||||
// defer func() { sb.WriteString("]") }()
|
||||
// for i := 0; i < dims[0]; i++ {
|
||||
// if i >= items && i < dims[0]-items {
|
||||
// sb.WriteString("..., ")
|
||||
// // skip to next printable element
|
||||
// skip := dims[0] - 2*items
|
||||
// if len(dims) > 1 {
|
||||
// stride += mul(append(dims[1:], skip)...)
|
||||
// fmt.Fprint(&sb, strings.Repeat("\n", len(dims)-1), prefix)
|
||||
// }
|
||||
// i += skip - 1
|
||||
// } else if len(dims) > 1 {
|
||||
// f(dims[1:], stride)
|
||||
// stride += mul(dims[1:]...)
|
||||
// if i < dims[0]-1 {
|
||||
// fmt.Fprint(&sb, ",", strings.Repeat("\n", len(dims)-1), prefix)
|
||||
// }
|
||||
// } else {
|
||||
// text := fn(s[stride+i])
|
||||
// if len(text) > 0 && text[0] != '-' {
|
||||
// sb.WriteString(" ")
|
||||
// }
|
||||
|
||||
// sb.WriteString(text)
|
||||
// if i < dims[0]-1 {
|
||||
// sb.WriteString(", ")
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// f(shape, 0)
|
||||
|
||||
// return sb.String()
|
||||
// }
|
||||
|
||||
type DType int
|
||||
|
||||
const (
|
||||
DTypeBool DType = iota
|
||||
DTypeUint8
|
||||
DTypeUint16
|
||||
DTypeUint32
|
||||
DTypeUint64
|
||||
DTypeInt8
|
||||
DTypeInt16
|
||||
DTypeInt32
|
||||
DTypeInt64
|
||||
DTypeFloat16
|
||||
DTypeFloat32
|
||||
DTypeFloat64
|
||||
DTypeBfloat16
|
||||
DTypeComplex64
|
||||
)
|
||||
|
||||
type SamplingMode int
|
||||
|
||||
const (
|
||||
SamplingModeNearest SamplingMode = iota
|
||||
SamplingModeBilinear
|
||||
)
|
||||
@@ -1,3 +0,0 @@
|
||||
package backend
|
||||
|
||||
// _ "github.com/ollama/ollama/x/ml/backend/mlx"
|
||||
@@ -1,61 +0,0 @@
|
||||
include(FetchContent)
|
||||
|
||||
# Read MLX version from top-level file (shared with Dockerfile)
|
||||
file(READ "${CMAKE_SOURCE_DIR}/MLX_VERSION" MLX_C_GIT_TAG)
|
||||
string(STRIP "${MLX_C_GIT_TAG}" MLX_C_GIT_TAG)
|
||||
|
||||
set(MLX_C_BUILD_EXAMPLES OFF)
|
||||
|
||||
set(MLX_BUILD_GGUF OFF)
|
||||
set(MLX_BUILD_SAFETENSORS ON)
|
||||
|
||||
function(set_target_output_directory _target)
|
||||
if(TARGET ${_target})
|
||||
set_target_properties(${_target} PROPERTIES
|
||||
RUNTIME_OUTPUT_DIRECTORY ${OLLAMA_BUILD_DIR}
|
||||
LIBRARY_OUTPUT_DIRECTORY ${OLLAMA_BUILD_DIR}
|
||||
ARCHIVE_OUTPUT_DIRECTORY ${OLLAMA_BUILD_DIR}
|
||||
)
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
# Check for Metal support (macOS only)
|
||||
if(CMAKE_SYSTEM_NAME MATCHES "Darwin")
|
||||
execute_process(
|
||||
COMMAND
|
||||
zsh "-c"
|
||||
"echo \"__METAL_VERSION__\" | xcrun -sdk macosx metal ${XCRUN_FLAGS} -E -x metal -P - | tail -1 | tr -d '\n'"
|
||||
OUTPUT_VARIABLE MLX_METAL_VERSION COMMAND_ERROR_IS_FATAL ANY)
|
||||
|
||||
if(NOT MLX_METAL_VERSION)
|
||||
message(STATUS "`xcrun metal` error. Setting MLX_BUILD_METAL=OFF")
|
||||
set(MLX_BUILD_METAL OFF)
|
||||
endif()
|
||||
else()
|
||||
# On Linux, disable Metal backend
|
||||
message(STATUS "Non-macOS platform detected. Setting MLX_BUILD_METAL=OFF")
|
||||
set(MLX_BUILD_METAL OFF)
|
||||
endif()
|
||||
|
||||
# Map CMAKE_CUDA_ARCHITECTURES to MLX_CUDA_ARCHITECTURES if not explicitly set
|
||||
if(NOT MLX_CUDA_ARCHITECTURES AND CMAKE_CUDA_ARCHITECTURES)
|
||||
set(MLX_CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES})
|
||||
message(STATUS "Using CMAKE_CUDA_ARCHITECTURES for MLX: ${MLX_CUDA_ARCHITECTURES}")
|
||||
endif()
|
||||
|
||||
# Enable CUDA backend if CUDA architectures are specified and CUDA compiler is available
|
||||
if(MLX_CUDA_ARCHITECTURES AND CMAKE_CUDA_COMPILER)
|
||||
set(MLX_BUILD_CUDA ON CACHE BOOL "Build CUDA backend for MLX" FORCE)
|
||||
message(STATUS "Enabling MLX CUDA backend with architectures: ${MLX_CUDA_ARCHITECTURES}")
|
||||
elseif(MLX_CUDA_ARCHITECTURES)
|
||||
message(WARNING "MLX_CUDA_ARCHITECTURES specified but CUDA compiler not found, CUDA backend will be disabled")
|
||||
endif()
|
||||
|
||||
FetchContent_Declare(
|
||||
mlx-c
|
||||
GIT_REPOSITORY "https://github.com/ml-explore/mlx-c.git"
|
||||
GIT_TAG ${MLX_C_GIT_TAG})
|
||||
FetchContent_MakeAvailable(mlx-c)
|
||||
|
||||
set_target_output_directory(mlx)
|
||||
set_target_output_directory(mlxc)
|
||||
File diff suppressed because it is too large
@@ -1,92 +0,0 @@
|
||||
// mlx_dynamic.c - Dynamic loading wrapper for MLX-C library
|
||||
// This file provides runtime dynamic loading of libmlxc instead of link-time binding
|
||||
|
||||
#include "mlx_dynamic.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#ifdef _WIN32
|
||||
#include <windows.h>
|
||||
typedef HMODULE lib_handle_t;
|
||||
#define LOAD_LIB(path) LoadLibraryA(path)
|
||||
#define GET_SYMBOL(handle, name) GetProcAddress(handle, name)
|
||||
#define CLOSE_LIB(handle) FreeLibrary(handle)
|
||||
#define LIB_ERROR() "LoadLibrary failed"
|
||||
static const char* LIB_NAMES[] = {"libmlxc.dll", NULL};
|
||||
#else
|
||||
#include <dlfcn.h>
|
||||
typedef void* lib_handle_t;
|
||||
#define LOAD_LIB(path) dlopen(path, RTLD_LAZY | RTLD_GLOBAL)
|
||||
#define GET_SYMBOL(handle, name) dlsym(handle, name)
|
||||
#define CLOSE_LIB(handle) dlclose(handle)
|
||||
#define LIB_ERROR() dlerror()
|
||||
#ifdef __APPLE__
|
||||
static const char* LIB_NAMES[] = {
|
||||
"libmlxc.dylib",
|
||||
"@loader_path/../build/lib/ollama/libmlxc.dylib",
|
||||
"@executable_path/../build/lib/ollama/libmlxc.dylib",
|
||||
"build/lib/ollama/libmlxc.dylib",
|
||||
"../build/lib/ollama/libmlxc.dylib",
|
||||
NULL
|
||||
};
|
||||
#else
|
||||
static const char* LIB_NAMES[] = {
|
||||
"libmlxc.so",
|
||||
"$ORIGIN/../build/lib/ollama/libmlxc.so",
|
||||
"build/lib/ollama/libmlxc.so",
|
||||
"../build/lib/ollama/libmlxc.so",
|
||||
NULL
|
||||
};
|
||||
#endif
|
||||
#endif
|
||||
|
||||
static lib_handle_t mlx_handle = NULL;
|
||||
static int mlx_initialized = 0;
|
||||
static char mlx_error_buffer[512] = {0};
|
||||
|
||||
// Initialize MLX dynamic library
|
||||
// Returns 0 on success, -1 on failure
|
||||
// On failure, call mlx_dynamic_error() to get error message
|
||||
int mlx_dynamic_init(void) {
|
||||
if (mlx_initialized) {
|
||||
return 0; // Already initialized
|
||||
}
|
||||
|
||||
// Try each possible library path
|
||||
for (int i = 0; LIB_NAMES[i] != NULL; i++) {
|
||||
mlx_handle = LOAD_LIB(LIB_NAMES[i]);
|
||||
if (mlx_handle != NULL) {
|
||||
mlx_initialized = 1;
|
||||
snprintf(mlx_error_buffer, sizeof(mlx_error_buffer),
|
||||
"MLX: Successfully loaded %s", LIB_NAMES[i]);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
// Failed to load library
|
||||
const char* err = LIB_ERROR();
|
||||
snprintf(mlx_error_buffer, sizeof(mlx_error_buffer),
|
||||
"MLX: Failed to load libmlxc library. %s",
|
||||
err ? err : "Unknown error");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Get the last error message
|
||||
const char* mlx_dynamic_error(void) {
|
||||
return mlx_error_buffer;
|
||||
}
|
||||
|
||||
// Check if MLX is initialized
|
||||
int mlx_dynamic_is_initialized(void) {
|
||||
return mlx_initialized;
|
||||
}
|
||||
|
||||
// Cleanup (optional, called at program exit)
|
||||
void mlx_dynamic_cleanup(void) {
|
||||
if (mlx_handle != NULL) {
|
||||
CLOSE_LIB(mlx_handle);
|
||||
mlx_handle = NULL;
|
||||
mlx_initialized = 0;
|
||||
}
|
||||
}
|
||||
@@ -1,26 +0,0 @@
|
||||
// mlx_dynamic.h - Dynamic loading interface for MLX-C library
|
||||
#ifndef MLX_DYNAMIC_H
|
||||
#define MLX_DYNAMIC_H
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Initialize the MLX dynamic library
|
||||
// Returns 0 on success, -1 on failure
|
||||
int mlx_dynamic_init(void);
|
||||
|
||||
// Get the last error message from dynamic loading
|
||||
const char* mlx_dynamic_error(void);
|
||||
|
||||
// Check if MLX is initialized
|
||||
int mlx_dynamic_is_initialized(void);
|
||||
|
||||
// Cleanup resources (optional, for clean shutdown)
|
||||
void mlx_dynamic_cleanup(void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // MLX_DYNAMIC_H
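/*
A minimal caller sketch (not part of this header) for the dynamic-loading API
declared above:

    #include <stdio.h>
    #include "mlx_dynamic.h"

    int main(void) {
        if (mlx_dynamic_init() != 0) {
            // loading libmlxc failed; report the last error and exit
            fprintf(stderr, "%s\n", mlx_dynamic_error());
            return 1;
        }
        // ... MLX-backed work goes here ...
        mlx_dynamic_cleanup();
        return 0;
    }
*/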
@@ -1,314 +0,0 @@
|
||||
//go:build mlx
|
||||
|
||||
package mlx
|
||||
|
||||
import (
|
||||
"log/slog"
|
||||
"os"
|
||||
"reflect"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/runner/common"
|
||||
"github.com/ollama/ollama/sample"
|
||||
"github.com/ollama/ollama/x/ml"
|
||||
"github.com/ollama/ollama/x/model"
|
||||
"github.com/ollama/ollama/x/model/input"
|
||||
_ "github.com/ollama/ollama/x/model/models/gemma3"
|
||||
)
|
||||
|
||||
func init() {
|
||||
logger := slog.New(slog.NewTextHandler(os.Stdout, &slog.HandlerOptions{Level: slog.LevelDebug}))
|
||||
slog.SetDefault(logger)
|
||||
}
|
||||
|
||||
func TestLoadModel(t *testing.T) {
|
||||
dir := "/Users/daniel/Models/gemma-3-4b-it/"
|
||||
b := &Backend{}
|
||||
err := b.LoadSafeTensors(dir)
|
||||
if err != nil {
|
||||
t.Fatalf("load failed: %s", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFromInts(t *testing.T) {
|
||||
b := &Backend{}
|
||||
c := b.NewContext()
|
||||
defer c.Close()
|
||||
data := []int32{1, 2, 3, 4, 5, 6}
|
||||
a := c.FromInts(data, 2, 3)
|
||||
slog.Info("", "array", a)
|
||||
t.Log(a.ToString())
|
||||
if !reflect.DeepEqual(a.Shape(), []int{2, 3}) {
|
||||
t.Fatalf("incorrect shape: %v", a.Shape())
|
||||
}
|
||||
}
|
||||
|
||||
func TestFromFloats(t *testing.T) {
|
||||
b := &Backend{}
|
||||
c := b.NewContext()
|
||||
defer c.Close()
|
||||
data := []float32{1, 2, 3, 4, 5, 6}
|
||||
a := c.FromFloats(data, 2, 3)
|
||||
slog.Info("", "array", a)
|
||||
t.Log(a.ToString())
|
||||
if !reflect.DeepEqual(a.Shape(), []int{2, 3}) {
|
||||
t.Fatalf("incorrect shape: %v", a.Shape())
|
||||
}
|
||||
res := a.Floats()
|
||||
if !reflect.DeepEqual(res, data) {
|
||||
t.Fatalf("incorrect results: %v", res)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAdd(t *testing.T) {
|
||||
b := &Backend{}
|
||||
c := b.NewContext()
|
||||
defer c.Close()
|
||||
t1 := c.Arange(0, 24, 1, ml.DTypeFloat16)
|
||||
t2 := c.Arange(0, 24, 1, ml.DTypeFloat16)
|
||||
exp := c.Arange(0, 48, 2, ml.DTypeFloat16)
|
||||
t3 := t1.Add(c, t2)
|
||||
c.Compute(t3, exp)
|
||||
t3f := t3.Floats()
|
||||
if !reflect.DeepEqual(t3f, exp.Floats()) {
|
||||
t.Fatalf("incorrect result: %v", t3f)
|
||||
}
|
||||
}
|
||||
|
||||
func TestReshapeTranspose(t *testing.T) {
|
||||
b := &Backend{}
|
||||
c := b.NewContext()
|
||||
defer c.Close()
|
||||
t1 := c.Arange(0, 24, 1, ml.DTypeFloat16).Reshape(c, 2, 3, 4).Transpose(c, 0, 2, 1).Contiguous(c, false)
|
||||
c.Compute(t1)
|
||||
t1f := t1.Floats()
|
||||
exp := []float32{
|
||||
0, 4, 8,
|
||||
1, 5, 9,
|
||||
2, 6, 10,
|
||||
3, 7, 11,
|
||||
12, 16, 20,
|
||||
13, 17, 21,
|
||||
14, 18, 22,
|
||||
15, 19, 23,
|
||||
}
|
||||
if !reflect.DeepEqual(t1f, exp) {
|
||||
t.Fatalf("incorrect results: %v", t1f)
|
||||
}
|
||||
}
|
||||
|
||||
func prod(vals ...int) int {
|
||||
r := 1
|
||||
for _, v := range vals {
|
||||
r *= v
|
||||
}
|
||||
return r
|
||||
}
|
||||
func TestMatmul(t *testing.T) {
|
||||
// TODO create scenarios...
|
||||
b := &Backend{}
|
||||
c := b.NewContext()
|
||||
defer c.Close()
|
||||
s1 := []int{1, 3, 2, 4}
|
||||
t1 := c.Arange(0, float32(prod(s1...)), 1, ml.DTypeFloat16).Reshape(c, s1...)
|
||||
s2 := []int{4, 2}
|
||||
t2 := c.Arange(0, float32(prod(s2...)), 1, ml.DTypeFloat16).Reshape(c, s2...)
|
||||
t3 := t1.Matmul(c, t2)
|
||||
exp := []float32{
|
||||
28, 34,
|
||||
76, 98,
|
||||
|
||||
124, 162,
|
||||
172, 226,
|
||||
|
||||
220, 290,
|
||||
268, 354,
|
||||
}
|
||||
c.Compute(t3)
|
||||
t3f := t3.Floats()
|
||||
if !reflect.DeepEqual(t3f, exp) {
|
||||
t.Fatalf("incorrect result: %v", t3f)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRows(t *testing.T) {
|
||||
b := &Backend{}
|
||||
c := b.NewContext()
|
||||
defer c.Close()
|
||||
t1 := c.Arange(0, 12, 1, ml.DTypeFloat32).Reshape(c, 1, 4, 3)
|
||||
outputs := c.Zeros(ml.DTypeInt32, 1)
|
||||
t2 := t1.TakeAxes(c, outputs, 1)
|
||||
c.Forward(t1, t2).Compute(t1, t2)
|
||||
t.Log(t1.ToString())
|
||||
t.Log(t2.ToString())
|
||||
f := t2.Floats()
|
||||
t.Logf("Result: %v", f)
|
||||
}
|
||||
|
||||
func TestCaching(t *testing.T) {
|
||||
// Validate the caching algorithm
|
||||
b := &Backend{}
|
||||
c := b.NewContext()
|
||||
defer c.Close()
|
||||
batchSize := 3
|
||||
headDim := 4
|
||||
numKVHeads := 2
|
||||
// Make cache twice the size of one test batch
|
||||
cells := batchSize * 2
|
||||
cellSize := numKVHeads * headDim
|
||||
shape := []int{1, numKVHeads, batchSize, headDim}
|
||||
stop := float32(1)
|
||||
for _, x := range shape {
|
||||
stop *= float32(x)
|
||||
}
|
||||
// Create the cache
|
||||
cache := c.Zeros(ml.DTypeFloat16, cells, cellSize)
|
||||
t.Logf("Empty Cache shape%v\n"+cache.ToString(), []int{cells, cellSize})
|
||||
|
||||
// Input tensor
|
||||
t1 := c.Arange(0, stop, 1, ml.DTypeFloat16).Reshape(c, shape...)
|
||||
t.Logf("Initial Data shape%v\n"+t1.ToString(), shape)
|
||||
|
||||
// Reshape to copy into the cache
|
||||
/*
|
||||
From MLX python/src/indexing.cpp mlx_scatter_args_array
|
||||
// The update shape must broadcast with indices.shape + [1] + src.shape[1:]
|
||||
auto up_shape = indices.shape();
|
||||
up_shape.insert(up_shape.end(), src.shape().begin() + 1, src.shape().end());
|
||||
up = broadcast_to(up, up_shape);
|
||||
up_shape.insert(up_shape.begin() + indices.ndim(), 1);
|
||||
up = reshape(up, up_shape);
|
||||
*/
|
||||
numRows := 3
|
||||
up := t1.Reshape(c, numRows, 1, cellSize) // The shape has to look like this for scatter to work properly
|
||||
t.Logf("Data reshaped for cache input shape%v\n"+up.ToString(), []int{batchSize, numKVHeads * headDim})
|
||||
|
||||
// Simulate cells 1,3,5 are available
|
||||
indicies := []ml.Tensor{c.FromInts([]int32{1, 3, 5}, numRows)}
|
||||
t.Logf("Indicies shape%v\n"+indicies[0].ToString(), []int{numRows})
|
||||
axis := []int{0} // The 1,3,5 of the indicies are in reference to axis 0 in the cache shape
|
||||
cache.Scatter(c, indicies, up, axis)
|
||||
|
||||
c.Forward(cache)
|
||||
// Cache should contain the data now
|
||||
t.Log("Cache after put\n" + cache.ToString())
|
||||
|
||||
// Retrieve cache content and verify it matches
|
||||
out := cache.TakeAxes(c, indicies[0], 0).Reshape(c, shape...)
|
||||
t.Logf("Output shape%v\n"+out.ToString(), out.Shape())
|
||||
|
||||
t1f := t1.Floats()
|
||||
outf := out.Floats()
|
||||
if !reflect.DeepEqual(t1f, outf) {
|
||||
t.Fatalf("mismatched in->out\n%v\n ->\n%v", t1f, outf)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGemma3(t *testing.T) {
|
||||
// Why is the sky blue
|
||||
inputs := []int32{2, 105, 2364, 107, 36425, 563, 506, 7217, 3730, 106, 107, 105, 4368}
|
||||
limit := 50
|
||||
|
||||
// TODO generalize this
|
||||
dir := "/Users/daniel/Models/gemma-3-4b-it/"
|
||||
|
||||
m, err := model.New(dir, ml.BackendParams{})
|
||||
if err != nil {
|
||||
t.Fatalf("unable to load model: %s", err)
|
||||
}
|
||||
b := m.Backend()
|
||||
ctx := b.NewContext()
|
||||
defer ctx.Close()
|
||||
|
||||
batch := input.Batch{
|
||||
Inputs: ctx.FromInts(inputs[:], 1, len(inputs)),
|
||||
Positions: make([]int32, len(inputs)),
|
||||
Sequences: make([]int, len(inputs)),
|
||||
Outputs: ctx.FromInts([]int32{int32(len(inputs) - 1)}, 1),
|
||||
Offset: 0,
|
||||
}
|
||||
for i := range len(inputs) {
|
||||
batch.Positions[i] = int32(i)
|
||||
}
|
||||
offset := len(inputs)
|
||||
|
||||
cache := m.Config().Cache
|
||||
if cache != nil {
|
||||
numSlots := 1
|
||||
batchSize := 512
|
||||
numCtx := 4096
|
||||
|
||||
// Note: this is inconsistent with mlx-py, but trying to be consistent with the GGML cache impl to get things working
|
||||
// cache.SetConfig(ml.CacheConfig{CachePadding: 256, MaskDType: ml.DTypeBfloat16, MaskBatchPadding: 64})
|
||||
cache.SetConfig(ml.CacheConfig{CachePadding: 0, MaskDType: ml.DTypeBfloat16, MaskBatchPadding: 0})
|
||||
|
||||
cache.Init(b, ml.DTypeBfloat16, numSlots, int(numCtx), batchSize)
|
||||
err := cache.StartForward(ctx, batch, false)
|
||||
if err != nil {
|
||||
t.Fatalf("failed cache.StartForward: %s", err)
|
||||
}
|
||||
}
|
||||
opts := api.DefaultOptions()
|
||||
var grammar *sample.GrammarSampler
|
||||
sampler := sample.NewSampler(
|
||||
opts.Temperature,
|
||||
opts.TopK,
|
||||
opts.TopP,
|
||||
opts.MinP,
|
||||
opts.Seed,
|
||||
grammar,
|
||||
)
|
||||
|
||||
t.Log("Starting Forward pass loop")
|
||||
pendingResponses := []string{}
|
||||
for {
|
||||
out, err := m.Forward(ctx, batch)
|
||||
if err != nil {
|
||||
t.Fatalf("failed forward pass: %s", err)
|
||||
}
|
||||
ctx.Forward(out)
|
||||
outputs := out.Floats()
|
||||
t.Logf("finished forward pass! length:%d", len(outputs))
|
||||
// sample a token
|
||||
logits := outputs
|
||||
token, err := sampler.Sample(logits)
|
||||
if err != nil {
|
||||
t.Fatalf("unable to sample token: %s", err)
|
||||
}
|
||||
t.Logf("Sampled token: %v", token)
|
||||
if m.(model.TextProcessor).Is(token, model.SpecialEOS) {
|
||||
t.Log("hit EOS")
|
||||
break
|
||||
}
|
||||
piece, err := m.(model.TextProcessor).Decode([]int32{token})
|
||||
if err != nil {
|
||||
t.Fatalf("unable to decode token: %s", err)
|
||||
}
|
||||
|
||||
pendingResponses = append(pendingResponses, piece)
|
||||
sequence := strings.Join(pendingResponses, "")
|
||||
if ok, stop := common.FindStop(sequence, opts.Stop); ok {
|
||||
t.Logf("hit stop token: %v", stop)
|
||||
break
|
||||
}
|
||||
t.Logf("RESULTS: %s", sequence)
|
||||
batch = input.Batch{
|
||||
Inputs: ctx.FromInts([]int32{token}, 1, 1),
|
||||
Positions: make([]int32, 1),
|
||||
Sequences: make([]int, 1),
|
||||
Outputs: ctx.FromInts([]int32{0}, 1),
|
||||
Offset: offset,
|
||||
}
|
||||
offset++
|
||||
batch.Positions[0] = 0
|
||||
err = cache.StartForward(ctx, batch, false)
|
||||
if err != nil {
|
||||
t.Fatalf("failed cache.StartForward: %s", err)
|
||||
}
|
||||
if offset > limit {
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,335 +0,0 @@
|
||||
//go:build mlx
|
||||
|
||||
package mlx
|
||||
|
||||
/*
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "mlx/c/array.h"
|
||||
#include "mlx/c/ops.h"
|
||||
|
||||
// Derived from https://github.com/ml-explore/mlx/blob/main/mlx/io/gguf_quants.cpp
|
||||
|
||||
void unpack_32_4(uint8_t* data, int8_t* dst) {
|
||||
memset(dst, 0, 16);
|
||||
for (int j = 0; j < 16; ++j) {
|
||||
uint8_t x = (data[j + 2] & 0x0F); // j+2 to skip scale bytes.
|
||||
if (j % 2 != 0) {
|
||||
x <<= 4;
|
||||
}
|
||||
dst[j / 2] += x;
|
||||
}
|
||||
// Last 16 weights are in the higher bits
|
||||
for (int j = 0; j < 16; ++j) {
|
||||
uint8_t x = (data[j + 2] >> 4);
|
||||
if (j % 2 != 0) {
|
||||
x <<= 4;
|
||||
}
|
||||
dst[8 + j / 2] += x;
|
||||
}
|
||||
}
|
||||
|
||||
// Extracts (weight, scales, biases) from Q4_0 tensors.
|
||||
// Data layout is: |16 bit scale|32 x 4bit weights|.
|
||||
void extract_q4_0_data(
|
||||
uint8_t* data,
|
||||
mlx_array* weights_arr,
|
||||
mlx_array* scales_arr,
|
||||
mlx_array* biases_arr) {
|
||||
const uint64_t bytes_per_block = 18; // 2 bytes scale, 32x0.5 byte weights
|
||||
uint8_t* weights = mlx_array_data_uint8(*weights_arr);
|
||||
float16_t* scales = mlx_array_data_float16(*scales_arr);
|
||||
float16_t* biases = mlx_array_data_float16(*biases_arr);
|
||||
for (int64_t i = 0; i < mlx_array_size(*scales_arr); i++) {
|
||||
scales[i] = *((float16_t*)data);
|
||||
biases[i] = -8 * scales[i];
|
||||
unpack_32_4(data, weights);
|
||||
weights += 16;
|
||||
data += bytes_per_block;
|
||||
}
|
||||
}
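// Worked example for one Q4_0 block (based on the code above): each 4-bit
// quant q in [0, 15] appears intended to be dequantized in the affine form
// scale*q + bias; since bias = -8*scale this is scale*(q - 8), e.g.
// q = 15 -> 7*scale and q = 0 -> -8*scale.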
|
||||
// Extracts (weight, scales, biases) from Q4_1 tensors.
|
||||
// Data layout is: |16 bit scale|16 bit bias|32 x 4bit weights|.
|
||||
void extract_q4_1_data(
|
||||
uint8_t* data,
|
||||
mlx_array* weights_arr,
|
||||
mlx_array* scales_arr,
|
||||
mlx_array* biases_arr) {
|
||||
const uint64_t bytes_per_block = 20; // 2 bytes scale, 2 bytes bias, 32x0.5 byte weights
|
||||
uint8_t* weights = mlx_array_data_uint8(*weights_arr);
|
||||
float16_t* scales = mlx_array_data_float16(*scales_arr);
|
||||
float16_t* biases = mlx_array_data_float16(*biases_arr);
|
||||
for (int64_t i = 0; i < mlx_array_size(*scales_arr); i++) {
|
||||
scales[i] = *((float16_t*)data);
|
||||
biases[i] = *((float16_t*)(data) + 1);
|
||||
unpack_32_4(data, weights);
|
||||
weights += 16;
|
||||
data += bytes_per_block;
|
||||
}
|
||||
}
|
||||
|
||||
// Extracts (weight, scales, biases) from Q8_0 tensors.
|
||||
// Data layout is: |16 bit scale|32 x 8bit weights|.
|
||||
void extract_q8_0_data(
|
||||
uint8_t* data,
|
||||
mlx_array* weights_arr,
|
||||
mlx_array* scales_arr,
|
||||
mlx_array* biases_arr) {
|
||||
const uint64_t weights_per_block = 32;
|
||||
const uint64_t bytes_per_block = 34; // 2 bytes scale, 32x1 byte weights
|
||||
uint8_t* weights = mlx_array_data_uint8(*weights_arr);
|
||||
float16_t* scales = mlx_array_data_float16(*scales_arr);
|
||||
float16_t* biases = mlx_array_data_float16(*biases_arr);
|
||||
for (int64_t i = 0; i < mlx_array_size(*scales_arr); i++) {
|
||||
uint8_t* block_data = data + i * bytes_per_block;
|
||||
scales[i] = *((float16_t*)block_data);
|
||||
biases[i] = -128 * scales[i];
|
||||
for (int64_t j = 0; j < weights_per_block; ++j) {
|
||||
uint8_t x = block_data[j + 2]; // j+2 to skip the scale bytes.
|
||||
// Original data is in int8_t, so we add a bias of -128 and invert the
|
||||
// first bit.
|
||||
x ^= 1 << 7;
|
||||
weights[i * weights_per_block + j] = x;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Derived from ggml-quants.c
|
||||
|
||||
#define QK_K 256
|
||||
|
||||
// 6-bit quantization
|
||||
// weight is represented as x = a * q
|
||||
// 16 blocks of 16 elements each
|
||||
// Effectively 6.5625 bits per weight
|
||||
typedef struct {
|
||||
uint8_t ql[QK_K/2]; // quants, lower 4 bits
|
||||
uint8_t qh[QK_K/4]; // quants, upper 2 bits
|
||||
int8_t scales[QK_K/16]; // scales, quantized with 8 bits
|
||||
uint16_t d; // super-block scale
|
||||
} block_q6_K;
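// Size check for the "6.5625 bits per weight" note above: one block holds
// QK_K/2 + QK_K/4 + QK_K/16 + 2 = 128 + 64 + 16 + 2 = 210 bytes for 256
// weights, i.e. 210*8/256 = 6.5625 bits per weight.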
|
||||
void dequant_row_q6_K(const void * restrict vx, void * restrict vy, int k) {
|
||||
const int64_t nb = k / QK_K;
|
||||
block_q6_K *x = (block_q6_K *)vx;
|
||||
float16_t* y = (float16_t *)vy;
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
float16_t d = 0.0;
|
||||
memcpy(&d, &x[i].d, sizeof(d));
|
||||
|
||||
const uint8_t * restrict ql = x[i].ql;
|
||||
const uint8_t * restrict qh = x[i].qh;
|
||||
const int8_t * restrict sc = x[i].scales;
|
||||
|
||||
for (int n = 0; n < QK_K; n += 128) {
|
||||
for (int l = 0; l < 32; ++l) {
|
||||
int is = l/16;
|
||||
const int8_t q1 = (int8_t)((ql[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
|
||||
const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
|
||||
const int8_t q3 = (int8_t)((ql[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
|
||||
const int8_t q4 = (int8_t)((ql[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
|
||||
y[l + 0] = d * sc[is + 0] * q1;
|
||||
y[l + 32] = d * sc[is + 2] * q2;
|
||||
y[l + 64] = d * sc[is + 4] * q3;
|
||||
y[l + 96] = d * sc[is + 6] * q4;
|
||||
}
|
||||
y += 128;
|
||||
ql += 64;
|
||||
qh += 32;
|
||||
sc += 8;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#define K_SCALE_SIZE 12
|
||||
#define GGML_COMMON_AGGR_U
|
||||
#define GGML_COMMON_AGGR_S
|
||||
|
||||
// 4-bit quantization
|
||||
// 8 blocks of 32 elements each
|
||||
// weight is represented as x = a * q + b
|
||||
// Effectively 4.5 bits per weight
|
||||
typedef struct {
|
||||
union {
|
||||
struct {
|
||||
uint16_t d; // super-block scale for quantized scales
|
||||
uint16_t dmin; // super-block scale for quantized mins
|
||||
} GGML_COMMON_AGGR_S;
|
||||
uint16_t dm;
|
||||
} GGML_COMMON_AGGR_U;
|
||||
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
|
||||
uint8_t qs[QK_K/2]; // 4-bit quants
|
||||
} block_q4_K;
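// Size check for the "4.5 bits per weight" note above: 2 + 2 + 12 + 128 =
// 144 bytes per 256-weight block, i.e. 144*8/256 = 4.5 bits per weight.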
|
||||
static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) {
|
||||
if (j < 4) {
|
||||
*d = q[j] & 63; *m = q[j + 4] & 63;
|
||||
} else {
|
||||
*d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4);
|
||||
*m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4);
|
||||
}
|
||||
}
|
||||
|
||||
void dequant_row_q4_K(const void * restrict vx, void * restrict vy, int k) {
|
||||
block_q4_K *x = (block_q4_K *)vx;
|
||||
float16_t* y = (float16_t *)vy;
|
||||
const int nb = k / QK_K;
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
const uint8_t * q = x[i].qs;
|
||||
float16_t d = 0.0;
|
||||
memcpy(&d, &x[i].d, sizeof(d));
|
||||
float16_t min = 0.0;
|
||||
memcpy(&min, &x[i].dmin, sizeof(d));
|
||||
|
||||
int is = 0;
|
||||
uint8_t sc, m;
|
||||
for (int j = 0; j < QK_K; j += 64) {
|
||||
get_scale_min_k4(is + 0, x[i].scales, &sc, &m);
|
||||
const float16_t d1 = d * sc; const float16_t m1 = min * m;
|
||||
get_scale_min_k4(is + 1, x[i].scales, &sc, &m);
|
||||
const float16_t d2 = d * sc; const float16_t m2 = min * m;
|
||||
for (int l = 0; l < 32; ++l) *y++ = d1 * (q[l] & 0xF) - m1;
|
||||
for (int l = 0; l < 32; ++l) *y++ = d2 * (q[l] >> 4) - m2;
|
||||
q += 32; is += 2;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
*/
|
||||
import "C"
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"unsafe"
|
||||
|
||||
"github.com/x448/float16"
|
||||
)
|
||||
|
||||
func gguf_load_quantized(data unsafe.Pointer, name string, final_shape []C.int, dtype uint32, stream C.mlx_stream) (r C.mlx_array, err error) {
	shape := append([]C.int{}, final_shape...)
	var weights_per_byte C.int
	if dtype == 2 || dtype == 3 {
		weights_per_byte = 2
	} else if dtype == 8 {
		weights_per_byte = 1
	} else {
		return r, fmt.Errorf("unsupported tensor type %d", dtype)
	}

	weights_per_block := C.int(32)
	if shape[len(shape)-1]%weights_per_block != 0 {
		return r, fmt.Errorf("[load_gguf] tensor has incompatible last dim shape: %d", shape[len(shape)-1])
	}

	weights_shape := append([]C.int{}, shape...)
	weights_shape[len(weights_shape)-1] /= (weights_per_byte * 4)
	w_nbytes := C.int(unsafe.Sizeof(uint32(0)))
	for i := range weights_shape {
		w_nbytes *= weights_shape[i]
	}
	w_data := make([]byte, w_nbytes)
	cbytes := C.CBytes(w_data)
	defer C.free(cbytes)
	weights := C.mlx_array_new_data(
		cbytes,
		&weights_shape[0],
		C.int(len(weights_shape)),
		C.MLX_UINT32,
	)

	// For scales and bias
	shape[len(shape)-1] = shape[len(shape)-1] / weights_per_block
	sb_nbytes := C.int(unsafe.Sizeof(float16.Float16(0)))
	for i := range shape {
		sb_nbytes *= shape[i]
	}

	s_data := make([]byte, sb_nbytes)
	cbytes = C.CBytes(s_data)
	defer C.free(cbytes)
	scales := C.mlx_array_new_data(
		cbytes,
		&shape[0],
		C.int(len(shape)),
		C.MLX_FLOAT16,
	)
	b_data := make([]byte, sb_nbytes)
	cbytes = C.CBytes(b_data)
	defer C.free(cbytes)
	biases := C.mlx_array_new_data(
		cbytes,
		&shape[0],
		C.int(len(shape)),
		C.MLX_FLOAT16,
	)
	var bits C.int
	switch dtype {
	case 2:
		C.extract_q4_0_data((*C.uint8_t)(data), &weights, &scales, &biases)
		bits = 4
	case 3:
		C.extract_q4_1_data((*C.uint8_t)(data), &weights, &scales, &biases)
		bits = 4
	case 8:
		C.extract_q8_0_data((*C.uint8_t)(data), &weights, &scales, &biases)
		bits = 8
	}
	groupSize := C.mlx_optional_int{value: 32, has_value: true}
	bitsOpt := C.mlx_optional_int{value: bits, has_value: true}
	var dtypeOpt C.mlx_optional_dtype // has_value defaults to false
	C.mlx_dequantize(
		&r,
		weights,
		scales,
		biases,
		groupSize,
		bitsOpt,
		nil, // TODO mode
		dtypeOpt,
		stream,
	)
	C.mlx_array_free(weights)
	C.mlx_array_free(scales)
	C.mlx_array_free(biases)

	return r, nil
}

func load_k_quantized(data unsafe.Pointer, name string, shape []C.int, dtype uint32, stream C.mlx_stream) (r C.mlx_array, err error) {
	size := 1
	for _, d := range shape {
		size *= int(d)
	}
	fdata := make([]float16.Float16, size)
	switch dtype {
	case 14:
		C.dequant_row_q6_K(
			data,
			unsafe.Pointer(&fdata[0]),
			C.int(size),
		)

	case 12:
		C.dequant_row_q4_K(
			data,
			unsafe.Pointer(&fdata[0]),
			C.int(size),
		)
	default:
		return r, fmt.Errorf("unsupported K quant")
	}

	r = C.mlx_array_new_data(
		unsafe.Pointer(&fdata[0]),
		&shape[0],
		C.int(len(shape)),
		C.MLX_FLOAT16,
	)
	return r, nil
}

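The two loaders above hand MLX the packed 4- or 8-bit weights together with per-group float16 scales and biases, then let mlx_dequantize (group size 32, 4 or 8 bits) or the K-quant dequant rows rebuild float16 tensors. As a rough sketch of the affine group dequantization being requested here (the exact convention is MLX's and is not spelled out in this diff): for each group g of 32 quantized values q_i with scale s_g and bias beta_g,

$$\hat{w}_i = s_g\,q_i + \beta_g,\qquad q_i \in \{0,\dots,2^{\text{bits}}-1\}$$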
643
x/ml/device.go
643
x/ml/device.go
@@ -1,643 +0,0 @@
|
||||
package ml
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/binary"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"hash/maphash"
|
||||
"io"
|
||||
"log/slog"
|
||||
"math"
|
||||
"net/http"
|
||||
"runtime"
|
||||
"slices"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/ollama/ollama/format"
|
||||
"github.com/ollama/ollama/logutil"
|
||||
)
|
||||
|
||||
// GPULayers is a set of layers to be allocated on a single GPU
|
||||
type GPULayers struct {
|
||||
DeviceID
|
||||
|
||||
// Layers is a set of layer indices to load
|
||||
Layers []int
|
||||
}
|
||||
|
||||
// FirstLayer returns the smallest layer index scheduled on this GPU, or MaxInt when empty.
|
||||
func (g GPULayers) FirstLayer() int {
|
||||
if len(g.Layers) == 0 {
|
||||
return math.MaxInt
|
||||
}
|
||||
|
||||
first := g.Layers[0]
|
||||
for i := 1; i < len(g.Layers); i++ {
|
||||
if g.Layers[i] < first {
|
||||
first = g.Layers[i]
|
||||
}
|
||||
}
|
||||
|
||||
return first
|
||||
}
|
||||
|
||||
func (g GPULayers) String() string {
|
||||
if len(g.Layers) == 0 {
|
||||
return ""
|
||||
}
|
||||
|
||||
slices.Sort(g.Layers)
|
||||
|
||||
contiguous := true
|
||||
base := g.Layers[0]
|
||||
for i := range g.Layers {
|
||||
if g.Layers[i] != base+i {
|
||||
contiguous = false
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if contiguous {
|
||||
return fmt.Sprintf("ID:%v Layers:%v(%v..%v)", g.ID, len(g.Layers), g.Layers[0], g.Layers[len(g.Layers)-1])
|
||||
} else {
|
||||
return fmt.Sprintf("ID:%v Layers:%v%v", g.ID, len(g.Layers), g.Layers)
|
||||
}
|
||||
}
|
||||
|
||||
// GPULayersList is a set of layer allocations across multiple GPUs
|
||||
type GPULayersList []GPULayers
|
||||
|
||||
func (l GPULayersList) Len() int { return len(l) }
|
||||
func (l GPULayersList) Swap(i, j int) { l[i], l[j] = l[j], l[i] }
|
||||
|
||||
// Sort by the ordering of the layers offloaded
|
||||
func (l GPULayersList) Less(i, j int) bool {
|
||||
li := l[i].FirstLayer()
|
||||
lj := l[j].FirstLayer()
|
||||
|
||||
return li < lj
|
||||
}
|
||||
|
||||
func (l GPULayersList) String() string {
|
||||
if l.Sum() > 0 {
|
||||
return fmt.Sprintf("%v%v", l.Sum(), []GPULayers(l))
|
||||
} else {
|
||||
return fmt.Sprintf("%v", []GPULayers(l))
|
||||
}
|
||||
}
|
||||
|
||||
// Sum is the total number of layers assigned across all GPUs
|
||||
func (l GPULayersList) Sum() int {
|
||||
var sum int
|
||||
|
||||
for _, g := range l {
|
||||
sum += len(g.Layers)
|
||||
}
|
||||
|
||||
return sum
|
||||
}
|
||||
|
||||
var h maphash.Hash
|
||||
|
||||
// Hash is an identifier of this layer assignment
|
||||
func (l GPULayersList) Hash() uint64 {
|
||||
h.Reset()
|
||||
for _, g := range l {
|
||||
if len(g.Layers) > 0 {
|
||||
h.WriteString(g.ID + g.Library)
|
||||
for _, l := range g.Layers {
|
||||
binary.Write(&h, binary.NativeEndian, int64(l))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return h.Sum64()
|
||||
}
|
||||
|
||||
// ErrNoMem is returned when panicing due to insufficient memory. It includes
|
||||
// the attempted memory allocation.
|
||||
type ErrNoMem struct {
|
||||
BackendMemory
|
||||
}
|
||||
|
||||
func (e ErrNoMem) Error() string {
|
||||
return fmt.Sprintf("insufficient memory - required allocations: %+v", e.BackendMemory)
|
||||
}
|
||||
|
||||
// Minimal unique device identification
|
||||
type DeviceID struct {
|
||||
// ID is an identifier for the device for matching with system
|
||||
// management libraries. The ID is only unique for other devices
|
||||
// using the same Library.
|
||||
// This ID represents a "post filtered" view of the enumerated devices
|
||||
// if the ID is numeric
|
||||
ID string `json:"id"`
|
||||
|
||||
// Library identifies which library is used for the device (e.g. CUDA, ROCm, etc.)
|
||||
Library string `json:"backend,omitempty"`
|
||||
}
|
||||
|
||||
// DeviceMemory provides a breakdown of the memory needed
|
||||
// per device, such as a CPU or GPU.
|
||||
type DeviceMemory struct {
|
||||
DeviceID
|
||||
|
||||
// Name is the name of the device as labeled by the backend. It
|
||||
// may not be persistent across instances of the runner.
|
||||
Name string
|
||||
|
||||
// Weights is the per-layer memory needed for the model weights.
|
||||
Weights []uint64
|
||||
|
||||
// Cache is the per-layer memory needed for the KV cache.
|
||||
Cache []uint64
|
||||
|
||||
// Graph is the size of the compute graph. It is not per-layer.
|
||||
Graph uint64
|
||||
}
|
||||
|
||||
func sumMemory(mem []uint64) uint64 {
|
||||
var sum uint64
|
||||
|
||||
for _, m := range mem {
|
||||
sum += m
|
||||
}
|
||||
|
||||
return sum
|
||||
}
|
||||
|
||||
// Size returns the total size of the memory required by this device
|
||||
func (m DeviceMemory) Size() uint64 {
|
||||
return sumMemory(m.Weights) + sumMemory(m.Cache) + m.Graph
|
||||
}
|
||||
|
||||
func memoryPresent(mem []uint64) bool {
|
||||
return slices.ContainsFunc(mem, func(m uint64) bool { return m != 0 })
|
||||
}
|
||||
|
||||
func (m DeviceMemory) LogValue() slog.Value {
|
||||
var attrs []slog.Attr
|
||||
if memoryPresent(m.Weights) {
|
||||
attrs = append(attrs, slog.Any("Weights", m.Weights))
|
||||
}
|
||||
|
||||
if memoryPresent(m.Cache) {
|
||||
attrs = append(attrs, slog.Any("Cache", m.Cache))
|
||||
}
|
||||
|
||||
if m.Graph != 0 {
|
||||
attrs = append(attrs, slog.Any("Graph", m.Graph))
|
||||
}
|
||||
|
||||
if len(attrs) > 0 && m.ID != "" {
|
||||
attrs = append([]slog.Attr{slog.String("ID", m.ID)}, attrs...)
|
||||
}
|
||||
|
||||
return slog.GroupValue(attrs...)
|
||||
}
|
||||
|
||||
// BackendMemory provides the amount of memory required to load the model
|
||||
// per device based on the BackendParams. In some cases, not all required
|
||||
// allocations will be known at this point. However, the size of the most recent
|
||||
// allocation is guaranteed to be provided so that if it failed, the caller can
|
||||
// accommodate that to make forward progress.
|
||||
type BackendMemory struct {
|
||||
// InputWeights are always located on the CPU and cannot be moved
|
||||
InputWeights uint64
|
||||
|
||||
// CPU model components are located in system memory. This does not
|
||||
// include unified memory allocated through the GPU.
|
||||
CPU DeviceMemory
|
||||
|
||||
// GPU model components are located on one or more GPUs.
|
||||
GPUs []DeviceMemory
|
||||
}
|
||||
|
||||
func (m BackendMemory) LogValue() slog.Value {
|
||||
var attrs []slog.Attr
|
||||
if m.InputWeights != 0 {
|
||||
attrs = append(attrs, slog.Any("InputWeights", m.InputWeights))
|
||||
}
|
||||
|
||||
attrs = append(attrs, slog.Any(m.CPU.Name, m.CPU))
|
||||
for _, g := range m.GPUs {
|
||||
attrs = append(attrs, slog.Any(g.Name, g))
|
||||
}
|
||||
|
||||
return slog.GroupValue(attrs...)
|
||||
}
|
||||
|
||||
// Log prints a high level summary of the memory
|
||||
func (m BackendMemory) Log(level slog.Level) {
|
||||
var total uint64
|
||||
|
||||
for _, gpu := range m.GPUs {
|
||||
if sum := sumMemory(gpu.Weights); sum > 0 {
|
||||
slog.Log(context.TODO(), level, "model weights", "device", gpu.Name, "size", format.HumanBytes2(sum))
|
||||
total += sum
|
||||
}
|
||||
}
|
||||
if sum := m.InputWeights + sumMemory(m.CPU.Weights); sum > 0 {
|
||||
slog.Log(context.TODO(), level, "model weights", "device", m.CPU.Name, "size", format.HumanBytes2(sum))
|
||||
total += sum
|
||||
}
|
||||
|
||||
for _, gpu := range m.GPUs {
|
||||
if sum := sumMemory(gpu.Cache); sum > 0 {
|
||||
slog.Log(context.TODO(), level, "kv cache", "device", gpu.Name, "size", format.HumanBytes2(sum))
|
||||
total += sum
|
||||
}
|
||||
}
|
||||
if sum := sumMemory(m.CPU.Cache); sum > 0 {
|
||||
slog.Log(context.TODO(), level, "kv cache", "device", m.CPU.Name, "size", format.HumanBytes2(sum))
|
||||
total += sum
|
||||
}
|
||||
|
||||
for _, gpu := range m.GPUs {
|
||||
if sum := gpu.Graph; sum > 0 {
|
||||
slog.Log(context.TODO(), level, "compute graph", "device", gpu.Name, "size", format.HumanBytes2(sum))
|
||||
total += sum
|
||||
}
|
||||
}
|
||||
if sum := m.CPU.Graph; sum > 0 {
|
||||
slog.Log(context.TODO(), level, "compute graph", "device", m.CPU.Name, "size", format.HumanBytes2(sum))
|
||||
total += sum
|
||||
}
|
||||
|
||||
if total > 0 {
|
||||
slog.Log(context.TODO(), level, "total memory", "size", format.HumanBytes2(total))
|
||||
}
|
||||
}
|
||||
|
||||
type DeviceInfo struct {
|
||||
DeviceID
|
||||
|
||||
// Name is the name of the device as labeled by the backend. It
|
||||
// may not be persistent across instances of the runner.
|
||||
Name string `json:"name"`
|
||||
|
||||
// Description is the longer user-friendly identification of the device
|
||||
Description string `json:"description"`
|
||||
|
||||
// FilterID is populated with the unfiltered device ID if a numeric ID is used
|
||||
// so the device can be included.
|
||||
FilterID string `json:"filter_id,omitempty"`
|
||||
|
||||
// Integrated is set true for integrated GPUs, false for discrete GPUs
|
||||
Integrated bool `json:"integration,omitempty"`
|
||||
|
||||
// PCIID is the bus, device and domain ID of the device for deduplication
|
||||
// when discovered by multiple backends
|
||||
PCIID string `json:"pci_id,omitempty"`
|
||||
|
||||
// TotalMemory is the total amount of memory the device can use for loading models
|
||||
TotalMemory uint64 `json:"total_memory"`
|
||||
|
||||
// FreeMemory is the amount of memory currently available on the device for loading models
|
||||
FreeMemory uint64 `json:"free_memory,omitempty"`
|
||||
|
||||
// ComputeMajor is the major version of capabilities of the device
|
||||
// if unsupported by the backend, -1 will be returned
|
||||
ComputeMajor int
|
||||
|
||||
// ComputeMinor is the minor version of capabilities of the device
|
||||
// if unsupported by the backend, -1 will be returned
|
||||
ComputeMinor int
|
||||
|
||||
// Driver Information
|
||||
DriverMajor int `json:"driver_major,omitempty"`
|
||||
DriverMinor int `json:"driver_minor,omitempty"`
|
||||
|
||||
// Where backends were loaded from
|
||||
LibraryPath []string
|
||||
}
|
||||
|
||||
type SystemInfo struct {
|
||||
// ThreadCount is the optimal number of threads to use for inference
|
||||
ThreadCount int `json:"threads,omitempty"`
|
||||
|
||||
// TotalMemory is the total amount of system memory
|
||||
TotalMemory uint64 `json:"total_memory,omitempty"`
|
||||
|
||||
// FreeMemory is the amount of memory currently available on the system for loading models
|
||||
FreeMemory uint64 `json:"free_memory,omitempty"`
|
||||
|
||||
// FreeSwap is the amount of system swap space reported as available
|
||||
FreeSwap uint64 `json:"free_swap,omitempty"`
|
||||
}
|
||||
|
||||
func (d DeviceInfo) Compute() string {
|
||||
// AMD gfx is encoded into the major minor in hex form
|
||||
if strings.EqualFold(d.Library, "ROCm") {
|
||||
return fmt.Sprintf("gfx%x%02x", d.ComputeMajor, d.ComputeMinor)
|
||||
}
|
||||
return strconv.Itoa(d.ComputeMajor) + "." + strconv.Itoa(d.ComputeMinor)
|
||||
}
|
||||
|
||||
func (d DeviceInfo) Driver() string {
|
||||
return strconv.Itoa(d.DriverMajor) + "." + strconv.Itoa(d.DriverMinor)
|
||||
}
|
||||
|
||||
// MinimumMemory reports the amount of memory that should be set aside
|
||||
// on the device for overhead (e.g. VRAM consumed by context structures independent
|
||||
// of model allocations)
|
||||
func (d DeviceInfo) MinimumMemory() uint64 {
|
||||
if d.Library == "Metal" {
|
||||
return 512 * format.MebiByte
|
||||
}
|
||||
return 457 * format.MebiByte
|
||||
}
|
||||
|
||||
// Sort by Free Space.
|
||||
// iGPUs are reported first, thus Reverse() yields the largest discrete GPU first
|
||||
type ByFreeMemory []DeviceInfo
|
||||
|
||||
func (a ByFreeMemory) Len() int { return len(a) }
|
||||
func (a ByFreeMemory) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
|
||||
func (a ByFreeMemory) Less(i, j int) bool {
|
||||
if a[i].Integrated && !a[j].Integrated {
|
||||
return true
|
||||
} else if !a[i].Integrated && a[j].Integrated {
|
||||
return false
|
||||
}
|
||||
return a[i].FreeMemory < a[j].FreeMemory
|
||||
}
|
||||
|
||||
// ByPerformance groups devices by similar speed
|
||||
func ByPerformance(l []DeviceInfo) [][]DeviceInfo {
|
||||
resp := [][]DeviceInfo{}
|
||||
scores := []bool{}
|
||||
for _, info := range l {
|
||||
found := false
|
||||
requested := info.Integrated
|
||||
for i, score := range scores {
|
||||
if score == requested {
|
||||
resp[i] = append(resp[i], info)
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
scores = append(scores, requested)
|
||||
resp = append(resp, []DeviceInfo{info})
|
||||
}
|
||||
}
|
||||
return resp
|
||||
}
|
||||
|
||||
func ByLibrary(l []DeviceInfo) [][]DeviceInfo {
|
||||
resp := [][]DeviceInfo{}
|
||||
libs := []string{}
|
||||
for _, info := range l {
|
||||
found := false
|
||||
requested := info.Library
|
||||
for i, lib := range libs {
|
||||
if lib == requested {
|
||||
resp[i] = append(resp[i], info)
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
libs = append(libs, requested)
|
||||
resp = append(resp, []DeviceInfo{info})
|
||||
}
|
||||
}
|
||||
return resp
|
||||
}
|
||||
|
||||
func LibraryPaths(l []DeviceInfo) []string {
|
||||
gpuLibs := []string{LibOllamaPath}
|
||||
for _, gpu := range l {
|
||||
for _, dir := range gpu.LibraryPath {
|
||||
needed := true
|
||||
for _, existing := range gpuLibs {
|
||||
if dir == existing {
|
||||
needed = false
|
||||
break
|
||||
}
|
||||
}
|
||||
if needed {
|
||||
gpuLibs = append(gpuLibs, dir)
|
||||
}
|
||||
}
|
||||
}
|
||||
return gpuLibs
|
||||
}
|
||||
|
||||
type DeviceComparison int
|
||||
|
||||
const (
|
||||
UniqueDevice DeviceComparison = iota
|
||||
SameBackendDevice // The device is the same, and the library/backend is the same
|
||||
DuplicateDevice // The same physical device but different library/backend (overlapping device)
|
||||
)
|
||||
|
||||
func (a DeviceInfo) Compare(b DeviceInfo) DeviceComparison {
|
||||
if a.PCIID != b.PCIID {
|
||||
return UniqueDevice
|
||||
}
|
||||
// If PCIID is empty, we have to use ID + library for uniqueness
|
||||
if a.PCIID == "" && a.DeviceID != b.DeviceID {
|
||||
return UniqueDevice
|
||||
}
|
||||
if a.Library == b.Library {
|
||||
return SameBackendDevice
|
||||
}
|
||||
return DuplicateDevice
|
||||
}
|
||||
|
||||
// For a SameBackendDevice, return true if b is better than a
|
||||
// e.g. newer GPU library version
|
||||
func (a DeviceInfo) IsBetter(b DeviceInfo) bool {
|
||||
aLib := a.LibraryPath[len(a.LibraryPath)-1]
|
||||
bLib := b.LibraryPath[len(b.LibraryPath)-1]
|
||||
if aLib == bLib {
|
||||
return false
|
||||
}
|
||||
aLibSplit := strings.SplitN(aLib, "_", 2)
|
||||
bLibSplit := strings.SplitN(bLib, "_", 2)
|
||||
if len(aLibSplit) < 2 || len(bLibSplit) < 2 {
|
||||
return false
|
||||
}
|
||||
if aLibSplit[0] != bLibSplit[0] {
|
||||
slog.Debug("unexpected libraries", "a", aLib, "b", bLib)
|
||||
return false
|
||||
}
|
||||
if aLibSplit[1] == bLibSplit[1] {
|
||||
return false
|
||||
}
|
||||
cmp := []string{aLibSplit[1], bLibSplit[1]}
|
||||
sort.Sort(sort.Reverse(sort.StringSlice(cmp)))
|
||||
return cmp[0] == bLibSplit[1]
|
||||
}
|
||||
|
||||
// FlashAttentionSupported returns true only if every GPU in the list supports flash attention
|
||||
func FlashAttentionSupported(l []DeviceInfo) bool {
|
||||
for _, gpu := range l {
|
||||
supportsFA := gpu.Library == "cpu" ||
|
||||
gpu.Name == "Metal" || gpu.Library == "Metal" ||
|
||||
(gpu.Library == "CUDA" && gpu.DriverMajor >= 7 && !(gpu.ComputeMajor == 7 && gpu.ComputeMinor == 2)) ||
|
||||
gpu.Library == "ROCm" ||
|
||||
gpu.Library == "Vulkan"
|
||||
|
||||
if !supportsFA {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// Given the list of GPUs this instantiation is targeted for,
|
||||
// figure out the visible devices environment variables
|
||||
// Set mustFilter true to enable filtering of CUDA devices
|
||||
func GetVisibleDevicesEnv(l []DeviceInfo, mustFilter bool) map[string]string {
|
||||
if len(l) == 0 {
|
||||
return nil
|
||||
}
|
||||
env := map[string]string{}
|
||||
for _, d := range l {
|
||||
d.updateVisibleDevicesEnv(env, mustFilter)
|
||||
}
|
||||
return env
|
||||
}
|
||||
|
||||
// NeedsInitValidation returns true if the device in question has the potential
|
||||
// to crash at inference time and requires deeper validation before we include
|
||||
// it in the supported devices list.
|
||||
func (d DeviceInfo) NeedsInitValidation() bool {
|
||||
// ROCm: rocblas will crash on unsupported devices.
|
||||
// CUDA: verify CC is supported by the version of the library
|
||||
return d.Library == "ROCm" || d.Library == "CUDA"
|
||||
}
|
||||
|
||||
// Set the init validation environment variable
|
||||
func (d DeviceInfo) AddInitValidation(env map[string]string) {
|
||||
env["GGML_CUDA_INIT"] = "1" // force deep initialization to trigger crash on unsupported GPUs
|
||||
}
|
||||
|
||||
// PreferredLibrary returns true if this library is preferred over the other input
|
||||
// library
|
||||
// Used to filter out Vulkan in favor of CUDA or ROCm
|
||||
func (d DeviceInfo) PreferredLibrary(other DeviceInfo) bool {
|
||||
// TODO in the future if we find Vulkan is better than ROCm on some devices
|
||||
// that implementation can live here.
|
||||
|
||||
if d.Library == "CUDA" || d.Library == "ROCm" {
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func (d DeviceInfo) updateVisibleDevicesEnv(env map[string]string, mustFilter bool) {
|
||||
var envVar string
|
||||
switch d.Library {
|
||||
case "ROCm":
|
||||
// ROCm must be filtered as it can crash the runner on unsupported devices
|
||||
envVar = "ROCR_VISIBLE_DEVICES"
|
||||
if runtime.GOOS != "linux" {
|
||||
envVar = "HIP_VISIBLE_DEVICES"
|
||||
}
|
||||
case "CUDA":
|
||||
if !mustFilter {
|
||||
// By default we try to avoid filtering CUDA devices because ROCm also
|
||||
// looks at the CUDA env var, and gets confused in mixed vendor environments.
|
||||
return
|
||||
}
|
||||
envVar = "CUDA_VISIBLE_DEVICES"
|
||||
default:
|
||||
// Vulkan is not filtered via env var, but via scheduling decisions
|
||||
return
|
||||
}
|
||||
v, existing := env[envVar]
|
||||
if existing {
|
||||
v = v + ","
|
||||
}
|
||||
if d.FilterID != "" {
|
||||
v = v + d.FilterID
|
||||
} else {
|
||||
v = v + d.ID
|
||||
}
|
||||
env[envVar] = v
|
||||
}
|
||||
|
||||
type BaseRunner interface {
|
||||
// GetPort returns the localhost port number the runner is running on
|
||||
GetPort() int
|
||||
|
||||
// HasExited indicates if the runner is no longer running. This can be used during
|
||||
// bootstrap to detect if a given filtered device is incompatible and triggered an assert
|
||||
HasExited() bool
|
||||
}
|
||||
|
||||
type RunnerDiscovery interface {
|
||||
BaseRunner
|
||||
|
||||
// GetDeviceInfos will perform a query of the underlying device libraries
|
||||
// for device identification and free VRAM information
|
||||
// During bootstrap scenarios, this routine may take seconds to complete
|
||||
GetDeviceInfos(ctx context.Context) []DeviceInfo
|
||||
}
|
||||
|
||||
type FilteredRunnerDiscovery interface {
|
||||
RunnerDiscovery
|
||||
|
||||
// GetActiveDeviceIDs returns the filtered set of devices actively in
|
||||
// use by this runner for running models. If the runner is a bootstrap runner, no devices
|
||||
// will be active yet so no device IDs are returned.
|
||||
// This routine will not query the underlying device and will return immediately
|
||||
GetActiveDeviceIDs() []DeviceID
|
||||
}
|
||||
|
||||
func GetDevicesFromRunner(ctx context.Context, runner BaseRunner) ([]DeviceInfo, error) {
|
||||
var moreDevices []DeviceInfo
|
||||
port := runner.GetPort()
|
||||
tick := time.Tick(10 * time.Millisecond)
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return nil, fmt.Errorf("failed to finish discovery before timeout")
|
||||
case <-tick:
|
||||
r, err := http.NewRequestWithContext(ctx, http.MethodGet, fmt.Sprintf("http://127.0.0.1:%d/info", port), nil)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create request: %w", err)
|
||||
}
|
||||
r.Header.Set("Content-Type", "application/json")
|
||||
|
||||
resp, err := http.DefaultClient.Do(r)
|
||||
if err != nil {
|
||||
// slog.Warn("failed to send request", "error", err)
|
||||
if runner.HasExited() {
|
||||
return nil, fmt.Errorf("runner crashed")
|
||||
}
|
||||
continue
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode == http.StatusNotFound {
|
||||
// old runner, fall back to bootstrapping model
|
||||
return nil, fmt.Errorf("llamarunner free vram reporting not supported")
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
slog.Warn("failed to read response", "error", err)
|
||||
continue
|
||||
}
|
||||
if resp.StatusCode != 200 {
|
||||
logutil.Trace("runner failed to discover free VRAM", "status", resp.StatusCode, "response", body)
|
||||
return nil, fmt.Errorf("runner error: %s", string(body))
|
||||
}
|
||||
|
||||
if err := json.Unmarshal(body, &moreDevices); err != nil {
|
||||
slog.Warn("unmarshal encode response", "error", err)
|
||||
continue
|
||||
}
|
||||
return moreDevices, nil
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,103 +0,0 @@
|
||||
package nn
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/ollama/ollama/x/kvcache"
|
||||
"github.com/ollama/ollama/x/ml"
|
||||
)
|
||||
|
||||
// Attention implements scaled dot-product attention for transformer models:
|
||||
// Attention(Q, K, V) = softmax(QK^T/√d_k)V
|
||||
//
|
||||
// Parameters:
|
||||
// - ctx: Context for tensor operations
|
||||
// - query: Query tensor (Q) with shape [d_k, heads, seq_len_q]
|
||||
// - key: Key tensor (K) with shape [d_k, kv_heads, seq_len_k], can be nil to read from cache only
|
||||
// - value: Value tensor (V) with shape [d_v, kv_heads, seq_len_k], can be nil to read from cache only
|
||||
// - scale: Scaling factor, typically 1/√d_k where d_k is the key dimension
|
||||
// - cache: KV cache to store key/value and get past history, can be nil to only use provided key/value
|
||||
//
|
||||
// Returns:
|
||||
//
|
||||
// Attention output with shape [d_v, heads, seq_len_q]
|
||||
func Attention(ctx ml.Context, query, key, value ml.Tensor, scale float64, cache kvcache.Cache) ml.Tensor {
|
||||
return AttentionWithVMLA(ctx, query, key, value, nil, nil, scale, cache)
|
||||
}
|
||||
|
||||
func AttentionWithSinks(ctx ml.Context, query, key, value, sinks ml.Tensor, scale float64, cache kvcache.Cache) ml.Tensor {
|
||||
return AttentionWithVMLA(ctx, query, key, value, sinks, nil, scale, cache)
|
||||
}
|
||||
|
||||
func AttentionWithVMLA(ctx ml.Context, query, key, value, sinks ml.Tensor, vmla ml.Tensor, scale float64, cache kvcache.Cache) ml.Tensor {
|
||||
ctx.Forward(query)
|
||||
|
||||
if key != nil && value != nil {
|
||||
if query.Dim(0) != key.Dim(0) {
|
||||
panic(fmt.Errorf("d_k in attention operation does not match between query(%v) and key(%v)", query.Dim(0), key.Dim(0)))
|
||||
}
|
||||
|
||||
if key.Dim(1) != value.Dim(1) {
|
||||
panic(fmt.Errorf("kv_heads in attention operation does not match between key(%v) and value(%v)", key.Dim(1), value.Dim(1)))
|
||||
}
|
||||
|
||||
if key.Dim(2) != value.Dim(2) {
|
||||
panic(fmt.Errorf("seq_len_k in attention operation does not match between key(%v) and value(%v)", key.Dim(2), value.Dim(2)))
|
||||
}
|
||||
|
||||
ctx.Forward(key, value)
|
||||
if cache != nil {
|
||||
cache.Put(ctx, key, value)
|
||||
}
|
||||
} else if cache == nil {
|
||||
panic("key & value tensors must be provided if cache is nil")
|
||||
}
|
||||
|
||||
// ctx.CompareWith("/tmp/test", map[string]ml.Tensor{"q": query, "k": key, "v": value}, true)
|
||||
// panic("after cache get") //
|
||||
// 2025/12/10 16:02:33 INFO XXX tensors are similar q=0.9999869465827942 shape="[1 8 13 256]" min_difference=[-0.07926178] max_difference=[0.07012844]
|
||||
// 2025/12/10 16:02:33 INFO XXX tensors are similar k=0.9999891519546509 shape="[1 4 13 256]" min_difference=[-0.21365738] max_difference=[0.19916534]
|
||||
// 2025/12/10 16:02:33 INFO XXX tensors are similar v=0.9999960660934448 shape="[1 4 13 256]" min_difference=[-0.32923126] max_difference=[0.32646942]
|
||||
|
||||
// var mask ml.Tensor
|
||||
if cache != nil {
|
||||
key, value, _ = cache.Get(ctx)
|
||||
}
|
||||
// ctx.CompareWith("/tmp/test", map[string]ml.Tensor{"q": query.Contiguous(ctx, false), "k": key.Contiguous(ctx, false), "v": value.Contiguous(ctx, false)}, true)
|
||||
// panic("after cache get") //
|
||||
// 2025/12/10 15:34:03 INFO XXX tensors are similar q=0.9999869465827942 shape="[1 8 13 256]" min_difference=[-0.07926178] max_difference=[0.07012844]
|
||||
// 2025/12/10 15:34:03 INFO XXX tensors are similar k=0.9999881982803345 shape="[1 4 13 256]" min_difference=[-0.25] max_difference=[0.25]
|
||||
// 2025/12/10 15:34:03 INFO XXX tensors are similar v=0.9999913573265076 shape="[1 4 13 256]" min_difference=[-0.5] max_difference=[0.5]
|
||||
|
||||
// Only use the fast SDPA implementation if we have a cache, since that's what
|
||||
// will do any expected backend-specific transformations for us
|
||||
|
||||
if cache != nil {
|
||||
// TODO what to do with vmla?
|
||||
// return query.Transpose(ctx, 0, 2, 1, 3).ScaledDotProductAttention(ctx, key.Transpose(ctx, 0, 2, 1, 3), value.Transpose(ctx, 0, 2, 1, 3), scale, "array", mask, sinks)
|
||||
return query.ScaledDotProductAttention(ctx, key, value, scale, "causal", nil, sinks)
|
||||
|
||||
// TODO these two produce identical output, but not similar enough - 92.9% - should be 99.999%
|
||||
} else {
|
||||
panic("else case not supported")
|
||||
// TODO transpose shapes are wrong
|
||||
// key = key.Transpose(ctx, 0, 2, 1, 3)
|
||||
// value = value.Transpose(ctx, 1, 2, 0, 3).Contiguous(ctx, false)
|
||||
|
||||
// kq := query.Matmul(ctx, key)
|
||||
|
||||
// kq = kq.Scale(ctx, scale)
|
||||
// if mask != nil {
|
||||
// kq = kq.Add(ctx, mask)
|
||||
// }
|
||||
// kq = kq.Softmax(ctx)
|
||||
|
||||
// kqv := kq.Matmul(ctx, value)
|
||||
|
||||
// if vmla != nil {
|
||||
// kqv = kqv.Matmul(ctx, vmla)
|
||||
// }
|
||||
|
||||
// return kqv.Transpose(ctx, 0, 2, 1, 3).Contiguous(ctx, false)
|
||||
}
|
||||
}
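For reference, the fast SDPA path above asks the backend for standard causal scaled dot-product attention; in math form (a textbook statement rather than anything defined in this file), with the passed-in scale taken as 1/sqrt(d_k):

$$\mathrm{Attention}(Q,K,V)=\mathrm{softmax}\!\left(\frac{QK^{\top}}{\sqrt{d_k}}+M\right)V,\qquad M_{ij}=\begin{cases}0 & j\le i\\ -\infty & j> i\end{cases}$$

With grouped-query attention, each of the kv_heads key/value heads is shared by heads/kv_heads query heads, which is why the shapes in the doc comment allow kv_heads to differ from heads.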
|
||||
@@ -1,30 +0,0 @@
|
||||
package nn
|
||||
|
||||
import "github.com/ollama/ollama/x/ml"
|
||||
|
||||
type Conv2D struct {
|
||||
Weight ml.Tensor `gguf:"weight"`
|
||||
Bias ml.Tensor `gguf:"bias"`
|
||||
}
|
||||
|
||||
func (m *Conv2D) Forward(ctx ml.Context, t ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor {
|
||||
t = m.Weight.Conv2D(ctx, t, s0, s1, p0, p1, d0, d1, 1)
|
||||
if m.Bias != nil {
|
||||
// Bias shape is (out_channels,) while t shape is (width, height, out_channels, batch)
|
||||
t = t.Add(ctx, m.Bias.Reshape(ctx, 1, 1, -1))
|
||||
}
|
||||
return t
|
||||
}
|
||||
|
||||
type Conv3D struct {
|
||||
Weight ml.Tensor `gguf:"weight"`
|
||||
Bias ml.Tensor `gguf:"bias"`
|
||||
}
|
||||
|
||||
func (m *Conv3D) Forward(ctx ml.Context, t ml.Tensor, s0, s1, s2, p0, p1, p2, d0, d1, d2, g int) ml.Tensor {
|
||||
t = m.Weight.Conv3D(ctx, t, s0, s1, s2, p0, p1, p2, d0, d1, d2, g)
|
||||
if m.Bias != nil {
|
||||
t = t.Add(ctx, m.Bias)
|
||||
}
|
||||
return t
|
||||
}
|
||||
@@ -1,11 +0,0 @@
|
||||
package nn
|
||||
|
||||
import "github.com/ollama/ollama/x/ml"
|
||||
|
||||
type Embedding struct {
|
||||
Weight ml.Tensor `gguf:"weight"`
|
||||
}
|
||||
|
||||
func (m *Embedding) Forward(ctx ml.Context, hiddenState ml.Tensor) ml.Tensor {
|
||||
return m.Weight.TakeAxes(ctx, hiddenState, 0)
|
||||
}
|
||||
@@ -1,32 +0,0 @@
|
||||
package nn
|
||||
|
||||
import "github.com/ollama/ollama/x/ml"
|
||||
|
||||
type Linear struct {
|
||||
Weight ml.Tensor `gguf:"weight"`
|
||||
Bias ml.Tensor `gguf:"bias"`
|
||||
}
|
||||
|
||||
func (m *Linear) Forward(ctx ml.Context, t ml.Tensor) ml.Tensor {
|
||||
t = t.Matmul(ctx, m.Weight.Transpose(ctx))
|
||||
if m.Bias != nil {
|
||||
t = t.Add(ctx, m.Bias)
|
||||
}
|
||||
|
||||
return t
|
||||
}
|
||||
|
||||
type LinearBatch struct {
|
||||
Weight ml.Tensor `gguf:"weight"`
|
||||
Bias ml.Tensor `gguf:"bias"`
|
||||
}
|
||||
|
||||
func (m *LinearBatch) Forward(ctx ml.Context, t, indices ml.Tensor) ml.Tensor {
|
||||
panic("not yet ported")
|
||||
// t = m.Weight.MulmatID(ctx, t, indices)
|
||||
// if m.Bias != nil {
|
||||
// t = t.AddID(ctx, m.Bias, indices)
|
||||
// }
|
||||
|
||||
// return t
|
||||
}
|
||||
@@ -1,29 +0,0 @@
|
||||
package nn
|
||||
|
||||
import (
|
||||
"github.com/ollama/ollama/x/ml"
|
||||
)
|
||||
|
||||
type LayerNorm struct {
|
||||
Weight ml.Tensor `gguf:"weight"`
|
||||
Bias ml.Tensor `gguf:"bias"`
|
||||
}
|
||||
|
||||
func (m *LayerNorm) Forward(ctx ml.Context, t ml.Tensor, eps float32) ml.Tensor {
|
||||
return t.LayerNorm(ctx, m.Weight, m.Bias, eps)
|
||||
}
|
||||
|
||||
type RMSNorm struct {
|
||||
Weight ml.Tensor `gguf:"weight"`
|
||||
}
|
||||
|
||||
func (m *RMSNorm) Forward(ctx ml.Context, t ml.Tensor, eps float32) ml.Tensor {
|
||||
// slog.Info("RMSNorm", "eps", eps)
|
||||
// fmt.Fprintln(os.Stderr, t.ToString())
|
||||
// fmt.Fprintln(os.Stderr, m.Weight.ToString())
|
||||
|
||||
// TODO this is probably model specific, not generalized...
|
||||
w := m.Weight.Add(ctx, ctx.FromFloats([]float32{1.0}, 1))
|
||||
|
||||
return t.RMSNorm(ctx, w, eps)
|
||||
}
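The `+1` offset above means the checkpoint stores the RMSNorm weight as (w - 1), as some checkpoints (e.g. Gemma-style models) do; the TODO notes this is probably model-specific. In math form, what the layer then computes is (a standard RMSNorm statement, not something defined in this file):

$$\mathrm{RMSNorm}(x)_i=\frac{x_i}{\sqrt{\tfrac{1}{d}\sum_{j=1}^{d}x_j^2+\varepsilon}}\,(1+w_i)$$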
|
||||
@@ -1,41 +0,0 @@
|
||||
package pooling
|
||||
|
||||
import (
|
||||
"github.com/ollama/ollama/x/ml"
|
||||
)
|
||||
|
||||
type Type uint32
|
||||
|
||||
const (
|
||||
TypeNone Type = iota
|
||||
TypeMean
|
||||
TypeCLS
|
||||
TypeLast
|
||||
)
|
||||
|
||||
func (t Type) String() string {
|
||||
switch t {
|
||||
case TypeMean:
|
||||
return "Mean"
|
||||
case TypeCLS:
|
||||
return "CLS"
|
||||
case TypeLast:
|
||||
return "Last"
|
||||
default:
|
||||
return "Unknown"
|
||||
}
|
||||
}
|
||||
|
||||
func (t Type) Forward(ctx ml.Context, hiddenStates ml.Tensor) ml.Tensor {
|
||||
switch t {
|
||||
// case TypeMean:
|
||||
// hiddenStates = hiddenStates.Transpose(ctx, 1, 0, 2, 3).Contiguous(ctx, false).Mean(ctx)
|
||||
// return hiddenStates.Transpose(ctx, 1, 0, 2, 3).Contiguous(ctx, false)
|
||||
// case TypeCLS:
|
||||
// return hiddenStates.Slice(ctx, 1, 0, 1, 1)
|
||||
// case TypeLast:
|
||||
// return hiddenStates.Slice(ctx, 1, hiddenStates.Dim(1)-1, hiddenStates.Dim(1), 1)
|
||||
default:
|
||||
panic("unknown pooling type")
|
||||
}
|
||||
}
|
||||
@@ -1,72 +0,0 @@
|
||||
package rope
|
||||
|
||||
import "github.com/ollama/ollama/x/ml"
|
||||
|
||||
// Options contains optional parameters for RoPE function
|
||||
type Options struct {
|
||||
Type int
|
||||
Factors ml.Tensor
|
||||
|
||||
// YaRN options
|
||||
YaRN struct {
|
||||
OriginalContextLength int
|
||||
ExtrapolationFactor,
|
||||
AttentionFactor,
|
||||
BetaFast,
|
||||
BetaSlow float32
|
||||
}
|
||||
|
||||
// MRoPE options
|
||||
MRoPE struct {
|
||||
Sections []int
|
||||
}
|
||||
}
|
||||
|
||||
// WithTypeNeoX sets RoPE type to NeoX
|
||||
func WithTypeNeoX() func(*Options) {
|
||||
return func(opts *Options) {
|
||||
opts.Type = 2
|
||||
}
|
||||
}
|
||||
|
||||
// WithFactors sets custom rope factors
|
||||
func WithFactors(factors ml.Tensor) func(*Options) {
|
||||
return func(opts *Options) {
|
||||
if factors != nil {
|
||||
opts.Factors = factors
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// WithOriginalContextLength sets a custom context length
|
||||
func WithOriginalContextLength(n int) func(*Options) {
|
||||
return func(opts *Options) {
|
||||
opts.YaRN.OriginalContextLength = n
|
||||
}
|
||||
}
|
||||
|
||||
func WithExtrapolationFactor(extrapolationFactor float32) func(*Options) {
|
||||
return func(opts *Options) {
|
||||
opts.YaRN.ExtrapolationFactor = extrapolationFactor
|
||||
}
|
||||
}
|
||||
|
||||
func WithAttentionFactor(attentionFactor float32) func(*Options) {
|
||||
return func(opts *Options) {
|
||||
opts.YaRN.AttentionFactor = attentionFactor
|
||||
}
|
||||
}
|
||||
|
||||
func WithMRoPE(sections []int) func(*Options) {
|
||||
return func(opts *Options) {
|
||||
opts.Type |= 1 << 3
|
||||
opts.MRoPE.Sections = sections
|
||||
}
|
||||
}
|
||||
|
||||
func WithInterleaveMRoPE(sections []int) func(*Options) {
|
||||
return func(opts *Options) {
|
||||
opts.Type |= 1<<3 | 1<<5
|
||||
opts.MRoPE.Sections = sections
|
||||
}
|
||||
}
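These options only parameterize RoPE (NeoX layout, YaRN scaling, MRoPE sections); the rotation itself is applied elsewhere. For orientation, standard RoPE rotates each pair of dimensions of a query/key vector at position p by an angle that decays with the pair index (a textbook statement with base b and head dimension d, not defined in this file):

$$\begin{pmatrix}x'_{2i}\\x'_{2i+1}\end{pmatrix}=\begin{pmatrix}\cos(p\,\theta_i)&-\sin(p\,\theta_i)\\ \sin(p\,\theta_i)&\cos(p\,\theta_i)\end{pmatrix}\begin{pmatrix}x_{2i}\\x_{2i+1}\end{pmatrix},\qquad \theta_i=b^{-2i/d}$$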
|
||||
56
x/ml/path.go
56
x/ml/path.go
@@ -1,56 +0,0 @@
|
||||
package ml
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
)
|
||||
|
||||
// LibOllamaPath is a path used to look up dynamic libraries.
// In development it's usually 'build/lib/ollama'.
// In distribution builds it's 'lib/ollama' on Windows,
// '../lib/ollama' on Linux, and the executable's directory on macOS.
// Note: in distribution builds, additional GPU-specific libraries are
// found in subdirectories of the returned path, such as
// 'cuda_v12', 'rocm', etc.
|
||||
var LibOllamaPath string = func() string {
|
||||
exe, err := os.Executable()
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
|
||||
if eval, err := filepath.EvalSymlinks(exe); err == nil {
|
||||
exe = eval
|
||||
}
|
||||
|
||||
var libPath string
|
||||
switch runtime.GOOS {
|
||||
case "windows":
|
||||
libPath = filepath.Join(filepath.Dir(exe), "lib", "ollama")
|
||||
case "linux":
|
||||
libPath = filepath.Join(filepath.Dir(exe), "..", "lib", "ollama")
|
||||
case "darwin":
|
||||
libPath = filepath.Dir(exe)
|
||||
}
|
||||
|
||||
cwd, err := os.Getwd()
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
|
||||
paths := []string{
|
||||
libPath,
|
||||
|
||||
// build paths for development
|
||||
filepath.Join(filepath.Dir(exe), "build", "lib", "ollama"),
|
||||
filepath.Join(cwd, "build", "lib", "ollama"),
|
||||
}
|
||||
|
||||
for _, p := range paths {
|
||||
if _, err := os.Stat(p); err == nil {
|
||||
return p
|
||||
}
|
||||
}
|
||||
|
||||
return filepath.Dir(exe)
|
||||
}()
|
||||
94
x/mlxrunner/cache.go
Normal file
94
x/mlxrunner/cache.go
Normal file
@@ -0,0 +1,94 @@
package mlxrunner

import (
	"log/slog"

	"github.com/ollama/ollama/x/mlxrunner/cache"
)

type CacheEntry struct {
	Caches  []cache.Cache
	Count   int
	Entries map[int32]*CacheEntry
}

func (s Runner) FindNearestCache(tokens []int32) ([]cache.Cache, []int32) {
	current := &CacheEntry{Entries: s.CacheEntries}
	index, cacheIndex := 0, -1
	for _, token := range tokens {
		if _, ok := current.Entries[token]; !ok {
			break
		}

		current = current.Entries[token]
		if len(current.Caches) > 0 {
			cacheIndex = index
		}

		index += 1
	}

	if cacheIndex == len(tokens)-1 {
		slog.Info("Cache hit", "type", "exact", "total", len(tokens), "cached", len(tokens), "left", len(tokens))
		return current.Caches, []int32{}
	} else if cacheIndex > 1 {
		slog.Info("Cache hit", "type", "partial", "total", len(tokens), "cached", cacheIndex+1, "left", len(tokens[cacheIndex+1:]))
		return current.Caches, tokens[cacheIndex+1:]
	} else if index > 0 && cacheIndex < 0 {
		type stackItem struct {
			entry  *CacheEntry
			tokens []int32
		}

		var best, item stackItem
		stack := []stackItem{{entry: current, tokens: []int32{}}}
		for len(stack) > 0 {
			item, stack = stack[len(stack)-1], stack[:len(stack)-1]
			if len(item.entry.Caches) > 0 {
				if len(best.tokens) == 0 || len(item.tokens) < len(best.tokens) {
					best = item
				}
			} else {
				for token, entry := range item.entry.Entries {
					stack = append(stack, stackItem{
						entry:  entry,
						tokens: append(item.tokens, token),
					})
				}
			}
		}

		prefix := min(len(tokens)-1, index)
		caches := make([]cache.Cache, len(best.entry.Caches))
		trim := len(best.tokens) + 1
		for i := range caches {
			caches[i] = best.entry.Caches[i].Clone()
			caches[i].Trim(trim)
		}

		slog.Info("Cache hit", "type", "prefix", "total", len(tokens), "cached", prefix, "left", len(tokens[prefix:]), "trimmed", trim)
		return caches, tokens[prefix:]
	}

	slog.Info("Cache miss", "left", len(tokens))
	return nil, tokens
}

func (s *Runner) InsertCache(tokens []int32, caches []cache.Cache) {
	current := &CacheEntry{Entries: s.CacheEntries}
	for _, token := range tokens {
		if _, ok := current.Entries[token]; !ok {
			current.Entries[token] = &CacheEntry{
				Entries: make(map[int32]*CacheEntry),
			}
		}

		current = current.Entries[token]
	}

	if len(current.Caches) > 0 {
		current.Count += 1
	} else {
		current.Caches = caches
	}
}

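FindNearestCache and InsertCache together implement a token-prefix trie: each processed prompt stores its per-layer caches at the node reached by its token sequence, and a new request reuses the deepest node with caches along its own prefix, only paying for the tokens that remain. A minimal self-contained sketch of that lookup idea (the `node`, `insert`, and `longestCachedPrefix` names are hypothetical stand-ins for CacheEntry and its methods, with no cloning or trimming):

```go
package main

import "fmt"

// node is a simplified stand-in for the CacheEntry trie above: each edge is a
// token ID, and a node may carry a cached state for the prefix that reaches it.
type node struct {
	children map[int32]*node
	hasState bool
}

// insert marks the full token sequence as having a cached state.
func insert(root *node, tokens []int32) {
	cur := root
	for _, t := range tokens {
		if cur.children[t] == nil {
			cur.children[t] = &node{children: map[int32]*node{}}
		}
		cur = cur.children[t]
	}
	cur.hasState = true
}

// longestCachedPrefix walks the trie and returns how many leading tokens of
// the request are covered by some cached state.
func longestCachedPrefix(root *node, tokens []int32) int {
	cur, best := root, 0
	for i, t := range tokens {
		next := cur.children[t]
		if next == nil {
			break
		}
		cur = next
		if cur.hasState {
			best = i + 1
		}
	}
	return best
}

func main() {
	root := &node{children: map[int32]*node{}}
	insert(root, []int32{1, 2, 3}) // e.g. a previously processed prompt

	fmt.Println(longestCachedPrefix(root, []int32{1, 2, 3, 4, 5})) // 3: only tokens 4, 5 remain
	fmt.Println(longestCachedPrefix(root, []int32{9, 9}))          // 0: cache miss
}
```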
196
x/mlxrunner/cache/cache.go
vendored
Normal file
196
x/mlxrunner/cache/cache.go
vendored
Normal file
@@ -0,0 +1,196 @@
|
||||
package cache
|
||||
|
||||
import (
|
||||
"log/slog"
|
||||
|
||||
"github.com/ollama/ollama/x/mlxrunner/mlx"
|
||||
)
|
||||
|
||||
type Cache interface {
|
||||
Update(keys, values *mlx.Tensor) (newKeys, newValues *mlx.Tensor)
|
||||
State() (keys, values *mlx.Tensor)
|
||||
Trim(int) int
|
||||
Clone() Cache
|
||||
Offset() int
|
||||
Len() int
|
||||
}
|
||||
|
||||
type KVCache struct {
|
||||
keys, values *mlx.Tensor
|
||||
offset int
|
||||
step int
|
||||
}
|
||||
|
||||
func NewKVCache() *KVCache {
|
||||
return &KVCache{step: 256, keys: &mlx.Tensor{}, values: &mlx.Tensor{}}
|
||||
}
|
||||
|
||||
func (c *KVCache) Update(keys, values *mlx.Tensor) (*mlx.Tensor, *mlx.Tensor) {
|
||||
B, H, L, Dk, Dv := keys.Dim(0), keys.Dim(1), keys.Dim(2), keys.Dim(3), values.Dim(3)
|
||||
|
||||
prev := c.offset
|
||||
|
||||
// Grow buffer if needed
|
||||
if !c.keys.Valid() || (prev+L) > c.keys.Dim(2) {
|
||||
steps := (c.step + L - 1) / c.step
|
||||
newKeys := mlx.Zeros(keys.DType(), B, H, steps*c.step, Dk)
|
||||
newValues := mlx.Zeros(values.DType(), B, H, steps*c.step, Dv)
|
||||
|
||||
if c.keys.Valid() {
|
||||
if prev%c.step != 0 {
|
||||
c.keys.Set(c.keys.Slice(mlx.Slice(), mlx.Slice(), mlx.Slice(0, prev), mlx.Slice()))
|
||||
c.values.Set(c.values.Slice(mlx.Slice(), mlx.Slice(), mlx.Slice(0, prev), mlx.Slice()))
|
||||
}
|
||||
c.keys.Set(c.keys.Concatenate(2, newKeys))
|
||||
c.values.Set(c.values.Concatenate(2, newValues))
|
||||
} else {
|
||||
c.keys, c.values = newKeys, newValues
|
||||
}
|
||||
}
|
||||
|
||||
c.offset += L
|
||||
c.keys.Set(c.keys.SliceUpdate(keys, mlx.Slice(), mlx.Slice(), mlx.Slice(prev, c.offset), mlx.Slice()))
|
||||
c.values.Set(c.values.SliceUpdate(values, mlx.Slice(), mlx.Slice(), mlx.Slice(prev, c.offset), mlx.Slice()))
|
||||
|
||||
return c.keys.Slice(mlx.Slice(), mlx.Slice(), mlx.Slice(0, c.offset), mlx.Slice()),
|
||||
c.values.Slice(mlx.Slice(), mlx.Slice(), mlx.Slice(0, c.offset), mlx.Slice())
|
||||
}
|
||||
|
||||
func (c *KVCache) State() (*mlx.Tensor, *mlx.Tensor) {
|
||||
if c.offset == c.keys.Dim(2) {
|
||||
return c.keys, c.values
|
||||
}
|
||||
return c.keys.Slice(mlx.Slice(), mlx.Slice(), mlx.Slice(0, c.offset), mlx.Slice()),
|
||||
c.values.Slice(mlx.Slice(), mlx.Slice(), mlx.Slice(0, c.offset), mlx.Slice())
|
||||
}
|
||||
|
||||
func (c *KVCache) Trim(n int) int {
|
||||
n = min(c.offset, n)
|
||||
c.offset -= n
|
||||
return n
|
||||
}
|
||||
|
||||
func (c *KVCache) Clone() Cache {
|
||||
return &KVCache{
|
||||
keys: c.keys.Clone(),
|
||||
values: c.values.Clone(),
|
||||
offset: c.offset,
|
||||
step: c.step,
|
||||
}
|
||||
}
|
||||
|
||||
func (c *KVCache) Offset() int { return c.offset }
|
||||
func (c *KVCache) Len() int { return c.offset }
|
||||
|
||||
// RotatingKVCache implements sliding window attention with bounded memory
|
||||
type RotatingKVCache struct {
|
||||
maxSize int
|
||||
idx int
|
||||
|
||||
*KVCache
|
||||
}
|
||||
|
||||
func NewRotatingKVCache(maxSize int) *RotatingKVCache {
|
||||
return &RotatingKVCache{maxSize: maxSize, KVCache: NewKVCache()}
|
||||
}
|
||||
|
||||
func (c *RotatingKVCache) Update(keys, values *mlx.Tensor) (*mlx.Tensor, *mlx.Tensor) {
|
||||
if keys.Dim(2) > 1 {
|
||||
return c.concat(keys, values)
|
||||
}
|
||||
return c.update(keys, values)
|
||||
}
|
||||
|
||||
func (c *RotatingKVCache) concat(keys, values *mlx.Tensor) (newK *mlx.Tensor, newV *mlx.Tensor) {
|
||||
slog.Debug("(*RotatingKVCache).concat", "keys_dim", keys.Dims(), "values_dim", values.Dims(), "offset", c.offset, "idx", c.idx, "max_size", c.maxSize)
|
||||
if !c.keys.Valid() {
|
||||
c.keys, c.values = keys, values
|
||||
} else {
|
||||
if c.idx < c.keys.Dim(2) {
|
||||
c.keys.Set(c.keys.Slice(mlx.Slice(), mlx.Slice(), mlx.Slice(0, c.idx), mlx.Slice()))
|
||||
c.values.Set(c.values.Slice(mlx.Slice(), mlx.Slice(), mlx.Slice(0, c.idx), mlx.Slice()))
|
||||
}
|
||||
|
||||
// Trim to max_size to maintain sliding window
|
||||
if trim := c.idx - c.maxSize + 1; trim > 0 {
|
||||
c.keys.Set(c.keys.Slice(mlx.Slice(), mlx.Slice(), mlx.Slice(trim, c.keys.Dim(2)), mlx.Slice()))
|
||||
c.values.Set(c.values.Slice(mlx.Slice(), mlx.Slice(), mlx.Slice(trim, c.values.Dim(2)), mlx.Slice()))
|
||||
}
|
||||
|
||||
c.keys.Set(c.keys.Concatenate(2, keys))
|
||||
c.values.Set(c.values.Concatenate(2, values))
|
||||
c.idx = c.keys.Dim(2)
|
||||
}
|
||||
|
||||
c.offset += keys.Dim(2)
|
||||
c.idx = c.keys.Dim(2)
|
||||
return c.keys, c.values
|
||||
}
|
||||
|
||||
func (c *RotatingKVCache) update(keys, values *mlx.Tensor) (*mlx.Tensor, *mlx.Tensor) {
|
||||
slog.Debug("(*RotatingKVCache).update", "keys_dim", keys.Dims(), "values_dim", values.Dims(), "offset", c.offset, "idx", c.idx, "max_size", c.maxSize)
|
||||
B, H, L, Dk, Dv := keys.Dim(0), keys.Dim(1), keys.Dim(2), keys.Dim(3), values.Dim(3)
|
||||
|
||||
prev := c.offset
|
||||
|
||||
// Grow buffer if not yet at max
|
||||
if !c.keys.Valid() || (prev >= c.keys.Dim(2) && c.keys.Dim(2) < c.maxSize) {
|
||||
newSize := min(c.step, c.maxSize-prev)
|
||||
newKeys := mlx.Zeros(keys.DType(), B, H, newSize, Dk)
|
||||
newValues := mlx.Zeros(values.DType(), B, H, newSize, Dv)
|
||||
if c.keys.Valid() {
|
||||
c.keys.Set(c.keys.Concatenate(2, newKeys))
|
||||
c.values.Set(c.values.Concatenate(2, newValues))
|
||||
} else {
|
||||
c.keys, c.values = newKeys, newValues
|
||||
}
|
||||
c.idx = prev
|
||||
}
|
||||
|
||||
// Trim to max_size to maintain sliding window
|
||||
if trim := c.keys.Dim(2) - c.maxSize; trim > 0 {
|
||||
c.keys.Set(c.keys.Slice(mlx.Slice(), mlx.Slice(), mlx.Slice(trim, c.keys.Dim(2)), mlx.Slice()))
|
||||
c.values.Set(c.values.Slice(mlx.Slice(), mlx.Slice(), mlx.Slice(trim, c.values.Dim(2)), mlx.Slice()))
|
||||
c.idx = c.maxSize
|
||||
}
|
||||
|
||||
// Rotate when hitting max
|
||||
if c.idx >= c.maxSize {
|
||||
c.idx = 0
|
||||
}
|
||||
|
||||
c.keys.Set(c.keys.SliceUpdate(keys, mlx.Slice(), mlx.Slice(), mlx.Slice(c.idx, c.idx+L), mlx.Slice()))
|
||||
c.values.Set(c.values.SliceUpdate(values, mlx.Slice(), mlx.Slice(), mlx.Slice(c.idx, c.idx+L), mlx.Slice()))
|
||||
|
||||
c.offset += L
|
||||
c.idx += L
|
||||
|
||||
validLen := min(c.offset, c.maxSize)
|
||||
return c.keys.Slice(mlx.Slice(), mlx.Slice(), mlx.Slice(0, validLen), mlx.Slice()),
|
||||
c.values.Slice(mlx.Slice(), mlx.Slice(), mlx.Slice(0, validLen), mlx.Slice())
|
||||
}
|
||||
|
||||
func (c *RotatingKVCache) State() (*mlx.Tensor, *mlx.Tensor) {
|
||||
if c.offset < c.keys.Dim(2) {
|
||||
return c.keys.Slice(mlx.Slice(), mlx.Slice(), mlx.Slice(0, c.offset), mlx.Slice()),
|
||||
c.values.Slice(mlx.Slice(), mlx.Slice(), mlx.Slice(0, c.offset), mlx.Slice())
|
||||
}
|
||||
return c.keys, c.values
|
||||
}
|
||||
|
||||
func (c *RotatingKVCache) Trim(n int) int {
|
||||
n = min(c.offset, n)
|
||||
c.offset -= n
|
||||
c.idx -= n
|
||||
return n
|
||||
}
|
||||
|
||||
func (c *RotatingKVCache) Clone() Cache {
|
||||
return &RotatingKVCache{
|
||||
maxSize: c.maxSize,
|
||||
idx: c.idx,
|
||||
KVCache: c.KVCache.Clone().(*KVCache),
|
||||
}
|
||||
}
|
||||
|
||||
func (c *RotatingKVCache) Len() int { return min(c.offset, c.maxSize) }
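RotatingKVCache keeps at most maxSize positions: prompt chunks are concatenated and trimmed, while single-token decode steps write into a ring, wrapping idx back to 0 once it reaches maxSize, so the logical length is min(offset, maxSize). A tiny self-contained sketch of that index arithmetic (plain ints instead of tensors, purely illustrative; `buf`, `idx`, and `offset` mirror the fields above only loosely):

```go
package main

import "fmt"

func main() {
	const maxSize = 4
	buf := make([]int, 0, maxSize) // ring of the last maxSize tokens
	idx, offset := 0, 0            // write position and total tokens seen

	for token := 1; token <= 7; token++ {
		if len(buf) < maxSize {
			buf = append(buf, token) // still growing toward maxSize
		} else {
			if idx >= maxSize {
				idx = 0 // wrap: overwrite the oldest slot
			}
			buf[idx] = token
		}
		idx++
		offset++
	}

	valid := min(offset, maxSize)
	fmt.Println(buf, "valid:", valid) // [5 6 7 4] valid: 4
}
```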
|
||||
169
x/mlxrunner/client.go
Normal file
169
x/mlxrunner/client.go
Normal file
@@ -0,0 +1,169 @@
|
||||
package mlxrunner
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"net"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/ollama/ollama/llm"
|
||||
"github.com/ollama/ollama/ml"
|
||||
)
|
||||
|
||||
type Client struct {
|
||||
Port int
|
||||
*exec.Cmd
|
||||
}
|
||||
|
||||
func (c *Client) JoinPath(path string) string {
|
||||
return (&url.URL{
|
||||
Scheme: "http",
|
||||
Host: net.JoinHostPort("127.0.0.1", strconv.Itoa(c.Port)),
|
||||
}).JoinPath(path).String()
|
||||
}
|
||||
|
||||
func (c *Client) CheckError(w *http.Response) error {
|
||||
if w.StatusCode >= 400 {
|
||||
return errors.New(w.Status)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Close implements llm.LlamaServer.
|
||||
func (c *Client) Close() error {
|
||||
return c.Cmd.Process.Kill()
|
||||
}
|
||||
|
||||
// Completion implements llm.LlamaServer.
|
||||
func (c *Client) Completion(ctx context.Context, req llm.CompletionRequest, fn func(llm.CompletionResponse)) error {
|
||||
var b bytes.Buffer
|
||||
if err := json.NewEncoder(&b).Encode(req); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
w, err := http.Post(c.JoinPath("/v1/completions"), "application/json", &b)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer w.Body.Close()
|
||||
|
||||
if err := c.CheckError(w); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
scanner := bufio.NewScanner(w.Body)
|
||||
for scanner.Scan() {
|
||||
bts := scanner.Bytes()
|
||||
|
||||
var resp llm.CompletionResponse
|
||||
if err := json.Unmarshal(bts, &resp); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
fn(resp)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Detokenize implements llm.LlamaServer.
|
||||
func (c *Client) Detokenize(ctx context.Context, tokens []int) (string, error) {
|
||||
panic("unimplemented")
|
||||
}
|
||||
|
||||
// Embedding implements llm.LlamaServer.
|
||||
func (c *Client) Embedding(ctx context.Context, input string) ([]float32, int, error) {
|
||||
panic("unimplemented")
|
||||
}
|
||||
|
||||
// GetDeviceInfos implements llm.LlamaServer.
|
||||
func (c *Client) GetDeviceInfos(ctx context.Context) []ml.DeviceInfo {
|
||||
panic("unimplemented")
|
||||
}
|
||||
|
||||
// GetPort implements llm.LlamaServer.
|
||||
func (c *Client) GetPort() int {
|
||||
return c.Port
|
||||
}
|
||||
|
||||
// HasExited implements llm.LlamaServer.
|
||||
func (c *Client) HasExited() bool {
|
||||
panic("unimplemented")
|
||||
}
|
||||
|
||||
// Load implements llm.LlamaServer.
|
||||
func (c *Client) Load(ctx context.Context, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) ([]ml.DeviceID, error) {
|
||||
w, err := http.Post(c.JoinPath("/v1/models"), "application/json", nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer w.Body.Close()
|
||||
|
||||
return []ml.DeviceID{}, nil
|
||||
}
|
||||
|
||||
// ModelPath implements llm.LlamaServer.
|
||||
func (c *Client) ModelPath() string {
|
||||
panic("unimplemented")
|
||||
}
|
||||
|
||||
// Pid implements llm.LlamaServer.
|
||||
func (c *Client) Pid() int {
|
||||
panic("unimplemented")
|
||||
}
|
||||
|
||||
// Ping implements llm.LlamaServer.
|
||||
func (c *Client) Ping(ctx context.Context) error {
|
||||
w, err := http.Get(c.JoinPath("/v1/status"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer w.Body.Close()
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Tokenize implements llm.LlamaServer.
|
||||
func (c *Client) Tokenize(ctx context.Context, content string) ([]int, error) {
|
||||
w, err := http.Post(c.JoinPath("/v1/tokenize"), "text/plain", strings.NewReader(content))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer w.Body.Close()
|
||||
|
||||
var tokens []int
|
||||
if err := json.NewDecoder(w.Body).Decode(&tokens); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return tokens, nil
|
||||
}
|
||||
|
||||
// TotalSize implements llm.LlamaServer.
|
||||
func (c *Client) TotalSize() uint64 {
|
||||
panic("unimplemented")
|
||||
}
|
||||
|
||||
// VRAMByGPU implements llm.LlamaServer.
|
||||
func (c *Client) VRAMByGPU(id ml.DeviceID) uint64 {
|
||||
panic("unimplemented")
|
||||
}
|
||||
|
||||
// VRAMSize implements llm.LlamaServer.
|
||||
func (c *Client) VRAMSize() uint64 {
|
||||
panic("unimplemented")
|
||||
}
|
||||
|
||||
// WaitUntilRunning implements llm.LlamaServer.
|
||||
func (c *Client) WaitUntilRunning(ctx context.Context) error {
|
||||
panic("unimplemented")
|
||||
}
|
||||
|
||||
var _ llm.LlamaServer = (*Client)(nil)
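Completion above streams newline-delimited JSON from the runner and decodes one CompletionResponse per line with a bufio.Scanner. A minimal self-contained sketch of that consumption pattern (the `response` type is a hypothetical stand-in for llm.CompletionResponse, and the reader stands in for the HTTP response body):

```go
package main

import (
	"bufio"
	"encoding/json"
	"fmt"
	"strings"
)

// response is a hypothetical stand-in for llm.CompletionResponse.
type response struct {
	Content string `json:"content"`
	Done    bool   `json:"done"`
}

func main() {
	// In the client above, this reader is the HTTP response body.
	body := strings.NewReader(`{"content":"Hel","done":false}
{"content":"lo","done":true}
`)

	scanner := bufio.NewScanner(body)
	for scanner.Scan() {
		var resp response
		if err := json.Unmarshal(scanner.Bytes(), &resp); err != nil {
			panic(err)
		}
		fmt.Printf("%+v\n", resp) // one decoded chunk per streamed line
	}
	if err := scanner.Err(); err != nil {
		panic(err)
	}
}
```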
|
||||
3
x/mlxrunner/mlx/.gitignore
vendored
Normal file
3
x/mlxrunner/mlx/.gitignore
vendored
Normal file
@@ -0,0 +1,3 @@
_deps
build
dist
26
x/mlxrunner/mlx/CMakeLists.txt
Normal file
26
x/mlxrunner/mlx/CMakeLists.txt
Normal file
@@ -0,0 +1,26 @@
cmake_minimum_required(VERSION 3.5)

project(mlx)

if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
  set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_SOURCE_DIR}/dist" CACHE PATH "" FORCE)
endif()

set(MLX_BUILD_GGUF OFF CACHE BOOL "" FORCE)
set(MLX_BUILD_SAFETENSORS ON CACHE BOOL "" FORCE)
set(MLX_C_BUILD_EXAMPLES OFF CACHE BOOL "" FORCE)
set(BUILD_SHARED_LIBS ON CACHE BOOL "" FORCE)

set(CMAKE_INSTALL_RPATH "@loader_path")

include(FetchContent)

set(MLX_C_GIT_TAG "v0.4.0" CACHE STRING "")

FetchContent_Declare(
  mlx-c
  GIT_REPOSITORY "https://github.com/ml-explore/mlx-c.git"
  GIT_TAG ${MLX_C_GIT_TAG}
)

FetchContent_MakeAvailable(mlx-c)

21
x/mlxrunner/mlx/act.go
Normal file
21
x/mlxrunner/mlx/act.go
Normal file
@@ -0,0 +1,21 @@
package mlx

// #include "generated.h"
import "C"
import "math"

func GELUApprox(t *Tensor) *Tensor {
	return t.Multiply(
		FromValue[float32](0.5),
	).Multiply(
		t.Add(
			t.Power(FromValue[float32](3.0)).Multiply(FromValue[float32](0.044715)),
		).Multiply(
			FromValue(float32(math.Sqrt(2 / math.Pi))),
		).Tanh().Add(FromValue[float32](1.0)),
	).AsType(t.DType())
}

func SILU(t *Tensor) *Tensor {
	return t.Multiply(t.Sigmoid()).AsType(t.DType())
}

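GELUApprox is the usual tanh approximation of GELU and SILU is x times sigmoid(x); in math form, the two activations built from MLX primitives above are:

$$\mathrm{GELU}(x) \approx \tfrac{1}{2}\,x\left(1 + \tanh\!\left(\sqrt{\tfrac{2}{\pi}}\,\bigl(x + 0.044715\,x^{3}\bigr)\right)\right), \qquad \mathrm{SiLU}(x) = x\,\sigma(x)$$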
94
x/mlxrunner/mlx/dtype.go
Normal file
94
x/mlxrunner/mlx/dtype.go
Normal file
@@ -0,0 +1,94 @@
|
||||
package mlx
|
||||
|
||||
// #include "generated.h"
|
||||
import "C"
|
||||
|
||||
type DType int
|
||||
|
||||
func (t DType) String() string {
|
||||
switch t {
|
||||
case DTypeBool:
|
||||
return "BOOL"
|
||||
case DTypeUint8:
|
||||
return "U8"
|
||||
case DTypeUint16:
|
||||
return "U16"
|
||||
case DTypeUint32:
|
||||
return "U32"
|
||||
case DTypeUint64:
|
||||
return "U64"
|
||||
case DTypeInt8:
|
||||
return "I8"
|
||||
case DTypeInt16:
|
||||
return "I16"
|
||||
case DTypeInt32:
|
||||
return "I32"
|
||||
case DTypeInt64:
|
||||
return "I64"
|
||||
case DTypeFloat16:
|
||||
return "F16"
|
||||
case DTypeFloat32:
|
||||
return "F32"
|
||||
case DTypeFloat64:
|
||||
return "F64"
|
||||
case DTypeBFloat16:
|
||||
return "BF16"
|
||||
case DTypeComplex64:
|
||||
return "C64"
|
||||
default:
|
||||
return "Unknown"
|
||||
}
|
||||
}
|
||||
|
||||
func (t *DType) UnmarshalJSON(b []byte) error {
|
||||
switch string(b) {
|
||||
case `"BOOL"`:
|
||||
*t = DTypeBool
|
||||
case `"U8"`:
|
||||
*t = DTypeUint8
|
||||
case `"U16"`:
|
||||
*t = DTypeUint16
|
||||
case `"U32"`:
|
||||
*t = DTypeUint32
|
||||
case `"U64"`:
|
||||
*t = DTypeUint64
|
||||
case `"I8"`:
|
||||
*t = DTypeInt8
|
||||
case `"I16"`:
|
||||
*t = DTypeInt16
|
||||
case `"I32"`:
|
||||
*t = DTypeInt32
|
||||
case `"I64"`:
|
||||
*t = DTypeInt64
|
||||
case `"F16"`:
|
||||
*t = DTypeFloat16
|
||||
case `"F64"`:
|
||||
*t = DTypeFloat64
|
||||
case `"F32"`:
|
||||
*t = DTypeFloat32
|
||||
case `"BF16"`:
|
||||
*t = DTypeBFloat16
|
||||
case `"C64"`:
|
||||
*t = DTypeComplex64
|
||||
default:
|
||||
return nil
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
const (
|
||||
DTypeBool DType = C.MLX_BOOL
|
||||
DTypeUint8 DType = C.MLX_UINT8
|
||||
DTypeUint16 DType = C.MLX_UINT16
|
||||
DTypeUint32 DType = C.MLX_UINT32
|
||||
DTypeUint64 DType = C.MLX_UINT64
|
||||
DTypeInt8 DType = C.MLX_INT8
|
||||
DTypeInt16 DType = C.MLX_INT16
|
||||
DTypeInt32 DType = C.MLX_INT32
|
||||
DTypeInt64 DType = C.MLX_INT64
|
||||
DTypeFloat16 DType = C.MLX_FLOAT16
|
||||
DTypeFloat32 DType = C.MLX_FLOAT32
|
||||
DTypeFloat64 DType = C.MLX_FLOAT64
|
||||
DTypeBFloat16 DType = C.MLX_BFLOAT16
|
||||
DTypeComplex64 DType = C.MLX_COMPLEX64
|
||||
)
|
||||
34
x/mlxrunner/mlx/dynamic.c
Normal file
34
x/mlxrunner/mlx/dynamic.c
Normal file
@@ -0,0 +1,34 @@
#include "dynamic.h"

#include <stdio.h>

#ifdef _WIN32
#include <windows.h>
#define DLOPEN(path) LoadLibraryA(path)
#define DLCLOSE(handle) FreeLibrary((HMODULE)(handle))
#else
#ifdef __APPLE__
#include <mach-o/dyld.h>
#include <libgen.h>
#endif
#include <dlfcn.h>
#define DLOPEN(path) dlopen(path, RTLD_LAZY | RTLD_GLOBAL)
#define DLCLOSE(handle) dlclose(handle)
#endif

static int mlx_dynamic_open(mlx_dynamic_handle* handle, const char* path) {
    handle->ctx = (void*) DLOPEN(path);
    CHECK(handle->ctx != NULL);
    return 0;
}

int mlx_dynamic_load(mlx_dynamic_handle* handle, const char *path) {
    return mlx_dynamic_open(handle, path);
}

void mlx_dynamic_unload(mlx_dynamic_handle* handle) {
    if (handle->ctx) {
        DLCLOSE(handle->ctx);
        handle->ctx = NULL;
    }
}

Some files were not shown because too many files have changed in this diff.