docs: update instructions for ollama config command

These tools can be configured automatically using the new `ollama config` command.
2026-02-02 03:33:38 -05:00 · 2026-01-21 17:03:41 -08:00
88 changed files with 346 additions and 8847 deletions
--- a/4
+++ b/4
@@ -169,10 +169,8 @@ COPY . .
 RUN git clone --depth 1 --branch "$(cat MLX_VERSION)" https://github.com/ml-explore/mlx-c.git build/_deps/mlx-c-src
 ARG GOFLAGS="'-ldflags=-w -s'"
 ENV CGO_ENABLED=1
-ARG CGO_CFLAGS
+ENV CGO_CFLAGS="-I/go/src/github.com/ollama/ollama/build/_deps/mlx-c-src"
 ARG CGO_CXXFLAGS
-ENV CGO_CFLAGS="${CGO_CFLAGS} -I/go/src/github.com/ollama/ollama/build/_deps/mlx-c-src"
-ENV CGO_CXXFLAGS="${CGO_CXXFLAGS}"
 RUN --mount=type=cache,target=/root/.cache/go-build \
    go build -tags mlx -trimpath -buildmode=pie -o /bin/ollama .

--- a/README.md
+++ b/README.md
@@ -558,7 +558,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [LiteLLM](https://github.com/BerriAI/litellm)
 - [OllamaFarm for Go](https://github.com/presbrey/ollamafarm)
 - [OllamaSharp for .NET](https://github.com/awaescher/OllamaSharp)
-- [Ollama for Ruby](https://github.com/crmne/ruby_llm)
+- [Ollama for Ruby](https://github.com/gbaptista/ollama-ai)
 - [Ollama-rs for Rust](https://github.com/pepperoni21/ollama-rs)
 - [Ollama-hpp for C++](https://github.com/jmont-dev/ollama-hpp)
 - [Ollama4j for Java](https://github.com/ollama4j/ollama4j)
--- a/app/README.md
+++ b/app/README.md
@@ -75,9 +75,9 @@ The `-dev` flag enables:
 CI builds with Xcode 14.1 for OS compatibility prior to v13.  If you want to manually build v11+ support, you can download the older Xcode [here](https://developer.apple.com/services-account/download?path=/Developer_Tools/Xcode_14.1/Xcode_14.1.xip), extract, then `mv ./Xcode.app /Applications/Xcode_14.1.0.app` then activate with:

 ```
-export CGO_CFLAGS="-O3 -mmacosx-version-min=12.0"
-export CGO_CXXFLAGS="-O3 -mmacosx-version-min=12.0"
-export CGO_LDFLAGS="-mmacosx-version-min=12.0"
+export CGO_CFLAGS=-mmacosx-version-min=12.0
+export CGO_CXXFLAGS=-mmacosx-version-min=12.0
+export CGO_LDFLAGS=-mmacosx-version-min=12.0
 export SDKROOT=/Applications/Xcode_14.1.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk
 export DEVELOPER_DIR=/Applications/Xcode_14.1.0.app/Contents/Developer
 ```
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -35,7 +35,6 @@ import (
 	"golang.org/x/term"

 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/cmd/config"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/parser"
@@ -1019,10 +1018,8 @@ func showInfo(resp *api.ShowResponse, verbose bool, w io.Writer) error {
 		}

 		if resp.ModelInfo != nil {
-			arch, _ := resp.ModelInfo["general.architecture"].(string)
-			if arch != "" {
-				rows = append(rows, []string{"", "architecture", arch})
-			}
+			arch := resp.ModelInfo["general.architecture"].(string)
+			rows = append(rows, []string{"", "architecture", arch})

 			var paramStr string
 			if resp.Details.ParameterSize != "" {
@@ -1032,9 +1029,7 @@ func showInfo(resp *api.ShowResponse, verbose bool, w io.Writer) error {
 					paramStr = format.HumanNumber(uint64(f))
 				}
 			}
-			if paramStr != "" {
-				rows = append(rows, []string{"", "parameters", paramStr})
-			}
+			rows = append(rows, []string{"", "parameters", paramStr})

 			if v, ok := resp.ModelInfo[fmt.Sprintf("%s.context_length", arch)]; ok {
 				if f, ok := v.(float64); ok {
@@ -2031,7 +2026,6 @@ func NewCLI() *cobra.Command {
 		copyCmd,
 		deleteCmd,
 		runnerCmd,
-		config.LaunchCmd(checkServerHeartbeat),
 	)

 	return rootCmd
--- a/cmd/config/claude.go
+++ b/cmd/config/claude.go
@@ -1,58 +0,0 @@
-package config
-
-import (
-	"fmt"
-	"os"
-	"os/exec"
-	"path/filepath"
-	"runtime"
-)
-
-// Claude implements Runner for Claude Code integration
-type Claude struct{}
-
-func (c *Claude) String() string { return "Claude Code" }
-
-func (c *Claude) args(model string) []string {
-	if model != "" {
-		return []string{"--model", model}
-	}
-	return nil
-}
-
-func (c *Claude) findPath() (string, error) {
-	if p, err := exec.LookPath("claude"); err == nil {
-		return p, nil
-	}
-	home, err := os.UserHomeDir()
-	if err != nil {
-		return "", err
-	}
-	name := "claude"
-	if runtime.GOOS == "windows" {
-		name = "claude.exe"
-	}
-	fallback := filepath.Join(home, ".claude", "local", name)
-	if _, err := os.Stat(fallback); err != nil {
-		return "", err
-	}
-	return fallback, nil
-}
-
-func (c *Claude) Run(model string) error {
-	claudePath, err := c.findPath()
-	if err != nil {
-		return fmt.Errorf("claude is not installed, install from https://code.claude.com/docs/en/quickstart")
-	}
-
-	cmd := exec.Command(claudePath, c.args(model)...)
-	cmd.Stdin = os.Stdin
-	cmd.Stdout = os.Stdout
-	cmd.Stderr = os.Stderr
-	cmd.Env = append(os.Environ(),
-		"ANTHROPIC_BASE_URL=http://localhost:11434",
-		"ANTHROPIC_API_KEY=",
-		"ANTHROPIC_AUTH_TOKEN=ollama",
-	)
-	return cmd.Run()
-}
--- a/cmd/config/claude_test.go
+++ b/cmd/config/claude_test.go
@@ -1,101 +0,0 @@
-package config
-
-import (
-	"os"
-	"path/filepath"
-	"runtime"
-	"slices"
-	"testing"
-)
-
-func TestClaudeIntegration(t *testing.T) {
-	c := &Claude{}
-
-	t.Run("String", func(t *testing.T) {
-		if got := c.String(); got != "Claude Code" {
-			t.Errorf("String() = %q, want %q", got, "Claude Code")
-		}
-	})
-
-	t.Run("implements Runner", func(t *testing.T) {
-		var _ Runner = c
-	})
-}
-
-func TestClaudeFindPath(t *testing.T) {
-	c := &Claude{}
-
-	t.Run("finds claude in PATH", func(t *testing.T) {
-		tmpDir := t.TempDir()
-		name := "claude"
-		if runtime.GOOS == "windows" {
-			name = "claude.exe"
-		}
-		fakeBin := filepath.Join(tmpDir, name)
-		os.WriteFile(fakeBin, []byte("#!/bin/sh\n"), 0o755)
-		t.Setenv("PATH", tmpDir)
-
-		got, err := c.findPath()
-		if err != nil {
-			t.Fatalf("unexpected error: %v", err)
-		}
-		if got != fakeBin {
-			t.Errorf("findPath() = %q, want %q", got, fakeBin)
-		}
-	})
-
-	t.Run("falls back to ~/.claude/local/claude", func(t *testing.T) {
-		tmpDir := t.TempDir()
-		setTestHome(t, tmpDir)
-		t.Setenv("PATH", t.TempDir()) // empty dir, no claude binary
-
-		name := "claude"
-		if runtime.GOOS == "windows" {
-			name = "claude.exe"
-		}
-		fallback := filepath.Join(tmpDir, ".claude", "local", name)
-		os.MkdirAll(filepath.Dir(fallback), 0o755)
-		os.WriteFile(fallback, []byte("#!/bin/sh\n"), 0o755)
-
-		got, err := c.findPath()
-		if err != nil {
-			t.Fatalf("unexpected error: %v", err)
-		}
-		if got != fallback {
-			t.Errorf("findPath() = %q, want %q", got, fallback)
-		}
-	})
-
-	t.Run("returns error when neither PATH nor fallback exists", func(t *testing.T) {
-		tmpDir := t.TempDir()
-		setTestHome(t, tmpDir)
-		t.Setenv("PATH", t.TempDir()) // empty dir, no claude binary
-
-		_, err := c.findPath()
-		if err == nil {
-			t.Fatal("expected error, got nil")
-		}
-	})
-}
-
-func TestClaudeArgs(t *testing.T) {
-	c := &Claude{}
-
-	tests := []struct {
-		name  string
-		model string
-		want  []string
-	}{
-		{"with model", "llama3.2", []string{"--model", "llama3.2"}},
-		{"empty model", "", nil},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			got := c.args(tt.model)
-			if !slices.Equal(got, tt.want) {
-				t.Errorf("args(%q) = %v, want %v", tt.model, got, tt.want)
-			}
-		})
-	}
-}
--- a/cmd/config/codex.go
+++ b/cmd/config/codex.go
@@ -1,61 +0,0 @@
-package config
-
-import (
-	"fmt"
-	"os"
-	"os/exec"
-	"strings"
-
-	"golang.org/x/mod/semver"
-)
-
-// Codex implements Runner for Codex integration
-type Codex struct{}
-
-func (c *Codex) String() string { return "Codex" }
-
-func (c *Codex) args(model string) []string {
-	args := []string{"--oss"}
-	if model != "" {
-		args = append(args, "-m", model)
-	}
-	return args
-}
-
-func (c *Codex) Run(model string) error {
-	if err := checkCodexVersion(); err != nil {
-		return err
-	}
-
-	cmd := exec.Command("codex", c.args(model)...)
-	cmd.Stdin = os.Stdin
-	cmd.Stdout = os.Stdout
-	cmd.Stderr = os.Stderr
-	return cmd.Run()
-}
-
-func checkCodexVersion() error {
-	if _, err := exec.LookPath("codex"); err != nil {
-		return fmt.Errorf("codex is not installed, install with: npm install -g @openai/codex")
-	}
-
-	out, err := exec.Command("codex", "--version").Output()
-	if err != nil {
-		return fmt.Errorf("failed to get codex version: %w", err)
-	}
-
-	// Parse output like "codex-cli 0.87.0"
-	fields := strings.Fields(strings.TrimSpace(string(out)))
-	if len(fields) < 2 {
-		return fmt.Errorf("unexpected codex version output: %s", string(out))
-	}
-
-	version := "v" + fields[len(fields)-1]
-	minVersion := "v0.81.0"
-
-	if semver.Compare(version, minVersion) < 0 {
-		return fmt.Errorf("codex version %s is too old, minimum required is %s, update with: npm update -g @openai/codex", fields[len(fields)-1], "0.81.0")
-	}
-
-	return nil
-}
--- a/cmd/config/codex_test.go
+++ b/cmd/config/codex_test.go
@@ -1,28 +0,0 @@
-package config
-
-import (
-	"slices"
-	"testing"
-)
-
-func TestCodexArgs(t *testing.T) {
-	c := &Codex{}
-
-	tests := []struct {
-		name  string
-		model string
-		want  []string
-	}{
-		{"with model", "llama3.2", []string{"--oss", "-m", "llama3.2"}},
-		{"empty model", "", []string{"--oss"}},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			got := c.args(tt.model)
-			if !slices.Equal(got, tt.want) {
-				t.Errorf("args(%q) = %v, want %v", tt.model, got, tt.want)
-			}
-		})
-	}
-}
--- a/cmd/config/config.go
+++ b/cmd/config/config.go
@@ -1,115 +0,0 @@
-// Package config provides integration configuration for external coding tools
-// (Claude Code, Codex, Droid, OpenCode) to use Ollama models.
-package config
-
-import (
-	"encoding/json"
-	"errors"
-	"fmt"
-	"os"
-	"path/filepath"
-	"strings"
-)
-
-type integration struct {
-	Models []string `json:"models"`
-}
-
-type config struct {
-	Integrations map[string]*integration `json:"integrations"`
-}
-
-func configPath() (string, error) {
-	home, err := os.UserHomeDir()
-	if err != nil {
-		return "", err
-	}
-	return filepath.Join(home, ".ollama", "config", "config.json"), nil
-}
-
-func load() (*config, error) {
-	path, err := configPath()
-	if err != nil {
-		return nil, err
-	}
-
-	data, err := os.ReadFile(path)
-	if err != nil {
-		if os.IsNotExist(err) {
-			return &config{Integrations: make(map[string]*integration)}, nil
-		}
-		return nil, err
-	}
-
-	var cfg config
-	if err := json.Unmarshal(data, &cfg); err != nil {
-		return nil, fmt.Errorf("failed to parse config: %w, at: %s", err, path)
-	}
-	if cfg.Integrations == nil {
-		cfg.Integrations = make(map[string]*integration)
-	}
-	return &cfg, nil
-}
-
-func save(cfg *config) error {
-	path, err := configPath()
-	if err != nil {
-		return err
-	}
-
-	if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil {
-		return err
-	}
-
-	data, err := json.MarshalIndent(cfg, "", "  ")
-	if err != nil {
-		return err
-	}
-
-	return writeWithBackup(path, data)
-}
-
-func saveIntegration(appName string, models []string) error {
-	if appName == "" {
-		return errors.New("app name cannot be empty")
-	}
-
-	cfg, err := load()
-	if err != nil {
-		return err
-	}
-
-	cfg.Integrations[strings.ToLower(appName)] = &integration{
-		Models: models,
-	}
-
-	return save(cfg)
-}
-
-func loadIntegration(appName string) (*integration, error) {
-	cfg, err := load()
-	if err != nil {
-		return nil, err
-	}
-
-	ic, ok := cfg.Integrations[strings.ToLower(appName)]
-	if !ok {
-		return nil, os.ErrNotExist
-	}
-
-	return ic, nil
-}
-
-func listIntegrations() ([]integration, error) {
-	cfg, err := load()
-	if err != nil {
-		return nil, err
-	}
-
-	result := make([]integration, 0, len(cfg.Integrations))
-	for _, ic := range cfg.Integrations {
-		result = append(result, *ic)
-	}
-
-	return result, nil
-}
--- a/cmd/config/config_test.go
+++ b/cmd/config/config_test.go
@@ -1,373 +0,0 @@
-package config
-
-import (
-	"os"
-	"path/filepath"
-	"strings"
-	"testing"
-)
-
-// setTestHome sets both HOME (Unix) and USERPROFILE (Windows) for cross-platform tests
-func setTestHome(t *testing.T, dir string) {
-	t.Setenv("HOME", dir)
-	t.Setenv("USERPROFILE", dir)
-}
-
-// editorPaths is a test helper that safely calls Paths if the runner implements Editor
-func editorPaths(r Runner) []string {
-	if editor, ok := r.(Editor); ok {
-		return editor.Paths()
-	}
-	return nil
-}
-
-func TestIntegrationConfig(t *testing.T) {
-	tmpDir := t.TempDir()
-	setTestHome(t, tmpDir)
-
-	t.Run("save and load round-trip", func(t *testing.T) {
-		models := []string{"llama3.2", "mistral", "qwen2.5"}
-		if err := saveIntegration("claude", models); err != nil {
-			t.Fatal(err)
-		}
-
-		config, err := loadIntegration("claude")
-		if err != nil {
-			t.Fatal(err)
-		}
-
-		if len(config.Models) != len(models) {
-			t.Errorf("expected %d models, got %d", len(models), len(config.Models))
-		}
-		for i, m := range models {
-			if config.Models[i] != m {
-				t.Errorf("model %d: expected %s, got %s", i, m, config.Models[i])
-			}
-		}
-	})
-
-	t.Run("defaultModel returns first model", func(t *testing.T) {
-		saveIntegration("codex", []string{"model-a", "model-b"})
-
-		config, _ := loadIntegration("codex")
-		defaultModel := ""
-		if len(config.Models) > 0 {
-			defaultModel = config.Models[0]
-		}
-		if defaultModel != "model-a" {
-			t.Errorf("expected model-a, got %s", defaultModel)
-		}
-	})
-
-	t.Run("defaultModel returns empty for no models", func(t *testing.T) {
-		config := &integration{Models: []string{}}
-		defaultModel := ""
-		if len(config.Models) > 0 {
-			defaultModel = config.Models[0]
-		}
-		if defaultModel != "" {
-			t.Errorf("expected empty string, got %s", defaultModel)
-		}
-	})
-
-	t.Run("app name is case-insensitive", func(t *testing.T) {
-		saveIntegration("Claude", []string{"model-x"})
-
-		config, err := loadIntegration("claude")
-		if err != nil {
-			t.Fatal(err)
-		}
-		defaultModel := ""
-		if len(config.Models) > 0 {
-			defaultModel = config.Models[0]
-		}
-		if defaultModel != "model-x" {
-			t.Errorf("expected model-x, got %s", defaultModel)
-		}
-	})
-
-	t.Run("multiple integrations in single file", func(t *testing.T) {
-		saveIntegration("app1", []string{"model-1"})
-		saveIntegration("app2", []string{"model-2"})
-
-		config1, _ := loadIntegration("app1")
-		config2, _ := loadIntegration("app2")
-
-		defaultModel1 := ""
-		if len(config1.Models) > 0 {
-			defaultModel1 = config1.Models[0]
-		}
-		defaultModel2 := ""
-		if len(config2.Models) > 0 {
-			defaultModel2 = config2.Models[0]
-		}
-		if defaultModel1 != "model-1" {
-			t.Errorf("expected model-1, got %s", defaultModel1)
-		}
-		if defaultModel2 != "model-2" {
-			t.Errorf("expected model-2, got %s", defaultModel2)
-		}
-	})
-}
-
-func TestListIntegrations(t *testing.T) {
-	tmpDir := t.TempDir()
-	setTestHome(t, tmpDir)
-
-	t.Run("returns empty when no integrations", func(t *testing.T) {
-		configs, err := listIntegrations()
-		if err != nil {
-			t.Fatal(err)
-		}
-		if len(configs) != 0 {
-			t.Errorf("expected 0 integrations, got %d", len(configs))
-		}
-	})
-
-	t.Run("returns all saved integrations", func(t *testing.T) {
-		saveIntegration("claude", []string{"model-1"})
-		saveIntegration("droid", []string{"model-2"})
-
-		configs, err := listIntegrations()
-		if err != nil {
-			t.Fatal(err)
-		}
-		if len(configs) != 2 {
-			t.Errorf("expected 2 integrations, got %d", len(configs))
-		}
-	})
-}
-
-func TestEditorPaths(t *testing.T) {
-	tmpDir := t.TempDir()
-	setTestHome(t, tmpDir)
-
-	t.Run("returns empty for claude (no Editor)", func(t *testing.T) {
-		r := integrations["claude"]
-		paths := editorPaths(r)
-		if len(paths) != 0 {
-			t.Errorf("expected no paths for claude, got %v", paths)
-		}
-	})
-
-	t.Run("returns empty for codex (no Editor)", func(t *testing.T) {
-		r := integrations["codex"]
-		paths := editorPaths(r)
-		if len(paths) != 0 {
-			t.Errorf("expected no paths for codex, got %v", paths)
-		}
-	})
-
-	t.Run("returns empty for droid when no config exists", func(t *testing.T) {
-		r := integrations["droid"]
-		paths := editorPaths(r)
-		if len(paths) != 0 {
-			t.Errorf("expected no paths, got %v", paths)
-		}
-	})
-
-	t.Run("returns path for droid when config exists", func(t *testing.T) {
-		settingsDir, _ := os.UserHomeDir()
-		settingsDir = filepath.Join(settingsDir, ".factory")
-		os.MkdirAll(settingsDir, 0o755)
-		os.WriteFile(filepath.Join(settingsDir, "settings.json"), []byte(`{}`), 0o644)
-
-		r := integrations["droid"]
-		paths := editorPaths(r)
-		if len(paths) != 1 {
-			t.Errorf("expected 1 path, got %d", len(paths))
-		}
-	})
-
-	t.Run("returns paths for opencode when configs exist", func(t *testing.T) {
-		home, _ := os.UserHomeDir()
-		configDir := filepath.Join(home, ".config", "opencode")
-		stateDir := filepath.Join(home, ".local", "state", "opencode")
-		os.MkdirAll(configDir, 0o755)
-		os.MkdirAll(stateDir, 0o755)
-		os.WriteFile(filepath.Join(configDir, "opencode.json"), []byte(`{}`), 0o644)
-		os.WriteFile(filepath.Join(stateDir, "model.json"), []byte(`{}`), 0o644)
-
-		r := integrations["opencode"]
-		paths := editorPaths(r)
-		if len(paths) != 2 {
-			t.Errorf("expected 2 paths, got %d: %v", len(paths), paths)
-		}
-	})
-}
-
-func TestLoadIntegration_CorruptedJSON(t *testing.T) {
-	tmpDir := t.TempDir()
-	setTestHome(t, tmpDir)
-
-	// Create corrupted config.json file
-	dir := filepath.Join(tmpDir, ".ollama", "config")
-	os.MkdirAll(dir, 0o755)
-	os.WriteFile(filepath.Join(dir, "config.json"), []byte(`{corrupted json`), 0o644)
-
-	// Corrupted file is treated as empty, so loadIntegration returns not found
-	_, err := loadIntegration("test")
-	if err == nil {
-		t.Error("expected error for nonexistent integration in corrupted file")
-	}
-}
-
-func TestSaveIntegration_NilModels(t *testing.T) {
-	tmpDir := t.TempDir()
-	setTestHome(t, tmpDir)
-
-	if err := saveIntegration("test", nil); err != nil {
-		t.Fatalf("saveIntegration with nil models failed: %v", err)
-	}
-
-	config, err := loadIntegration("test")
-	if err != nil {
-		t.Fatalf("loadIntegration failed: %v", err)
-	}
-
-	if config.Models == nil {
-		// nil is acceptable
-	} else if len(config.Models) != 0 {
-		t.Errorf("expected empty or nil models, got %v", config.Models)
-	}
-}
-
-func TestSaveIntegration_EmptyAppName(t *testing.T) {
-	tmpDir := t.TempDir()
-	setTestHome(t, tmpDir)
-
-	err := saveIntegration("", []string{"model"})
-	if err == nil {
-		t.Error("expected error for empty app name, got nil")
-	}
-	if err != nil && !strings.Contains(err.Error(), "app name cannot be empty") {
-		t.Errorf("expected 'app name cannot be empty' error, got: %v", err)
-	}
-}
-
-func TestLoadIntegration_NonexistentIntegration(t *testing.T) {
-	tmpDir := t.TempDir()
-	setTestHome(t, tmpDir)
-
-	_, err := loadIntegration("nonexistent")
-	if err == nil {
-		t.Error("expected error for nonexistent integration, got nil")
-	}
-	if !os.IsNotExist(err) {
-		t.Logf("error type is os.ErrNotExist as expected: %v", err)
-	}
-}
-
-func TestConfigPath(t *testing.T) {
-	tmpDir := t.TempDir()
-	setTestHome(t, tmpDir)
-
-	path, err := configPath()
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	expected := filepath.Join(tmpDir, ".ollama", "config", "config.json")
-	if path != expected {
-		t.Errorf("expected %s, got %s", expected, path)
-	}
-}
-
-func TestLoad(t *testing.T) {
-	tmpDir := t.TempDir()
-	setTestHome(t, tmpDir)
-
-	t.Run("returns empty config when file does not exist", func(t *testing.T) {
-		cfg, err := load()
-		if err != nil {
-			t.Fatal(err)
-		}
-		if cfg == nil {
-			t.Fatal("expected non-nil config")
-		}
-		if cfg.Integrations == nil {
-			t.Error("expected non-nil Integrations map")
-		}
-		if len(cfg.Integrations) != 0 {
-			t.Errorf("expected empty Integrations, got %d", len(cfg.Integrations))
-		}
-	})
-
-	t.Run("loads existing config", func(t *testing.T) {
-		path, _ := configPath()
-		os.MkdirAll(filepath.Dir(path), 0o755)
-		os.WriteFile(path, []byte(`{"integrations":{"test":{"models":["model-a"]}}}`), 0o644)
-
-		cfg, err := load()
-		if err != nil {
-			t.Fatal(err)
-		}
-		if cfg.Integrations["test"] == nil {
-			t.Fatal("expected test integration")
-		}
-		if len(cfg.Integrations["test"].Models) != 1 {
-			t.Errorf("expected 1 model, got %d", len(cfg.Integrations["test"].Models))
-		}
-	})
-
-	t.Run("returns error for corrupted JSON", func(t *testing.T) {
-		path, _ := configPath()
-		os.MkdirAll(filepath.Dir(path), 0o755)
-		os.WriteFile(path, []byte(`{corrupted`), 0o644)
-
-		_, err := load()
-		if err == nil {
-			t.Error("expected error for corrupted JSON")
-		}
-	})
-}
-
-func TestSave(t *testing.T) {
-	tmpDir := t.TempDir()
-	setTestHome(t, tmpDir)
-
-	t.Run("creates config file", func(t *testing.T) {
-		cfg := &config{
-			Integrations: map[string]*integration{
-				"test": {Models: []string{"model-a", "model-b"}},
-			},
-		}
-
-		if err := save(cfg); err != nil {
-			t.Fatal(err)
-		}
-
-		path, _ := configPath()
-		if _, err := os.Stat(path); os.IsNotExist(err) {
-			t.Error("config file was not created")
-		}
-	})
-
-	t.Run("round-trip preserves data", func(t *testing.T) {
-		cfg := &config{
-			Integrations: map[string]*integration{
-				"claude": {Models: []string{"llama3.2", "mistral"}},
-				"codex":  {Models: []string{"qwen2.5"}},
-			},
-		}
-
-		if err := save(cfg); err != nil {
-			t.Fatal(err)
-		}
-
-		loaded, err := load()
-		if err != nil {
-			t.Fatal(err)
-		}
-
-		if len(loaded.Integrations) != 2 {
-			t.Errorf("expected 2 integrations, got %d", len(loaded.Integrations))
-		}
-		if loaded.Integrations["claude"] == nil {
-			t.Error("missing claude integration")
-		}
-		if len(loaded.Integrations["claude"].Models) != 2 {
-			t.Errorf("expected 2 models for claude, got %d", len(loaded.Integrations["claude"].Models))
-		}
-	})
-}
--- a/cmd/config/droid.go
+++ b/cmd/config/droid.go
@@ -1,184 +0,0 @@
-package config
-
-import (
-	"encoding/json"
-	"fmt"
-	"os"
-	"os/exec"
-	"path/filepath"
-	"slices"
-)
-
-// Droid implements Runner and Editor for Droid integration
-type Droid struct{}
-
-// droidSettings represents the Droid settings.json file (only fields we use)
-type droidSettings struct {
-	CustomModels           []modelEntry    `json:"customModels"`
-	SessionDefaultSettings sessionSettings `json:"sessionDefaultSettings"`
-}
-
-type sessionSettings struct {
-	Model           string `json:"model"`
-	ReasoningEffort string `json:"reasoningEffort"`
-}
-
-type modelEntry struct {
-	Model           string `json:"model"`
-	DisplayName     string `json:"displayName"`
-	BaseURL         string `json:"baseUrl"`
-	APIKey          string `json:"apiKey"`
-	Provider        string `json:"provider"`
-	MaxOutputTokens int    `json:"maxOutputTokens"`
-	SupportsImages  bool   `json:"supportsImages"`
-	ID              string `json:"id"`
-	Index           int    `json:"index"`
-}
-
-func (d *Droid) String() string { return "Droid" }
-
-func (d *Droid) Run(model string) error {
-	if _, err := exec.LookPath("droid"); err != nil {
-		return fmt.Errorf("droid is not installed, install from https://docs.factory.ai/cli/getting-started/quickstart")
-	}
-
-	// Call Edit() to ensure config is up-to-date before launch
-	models := []string{model}
-	if config, err := loadIntegration("droid"); err == nil && len(config.Models) > 0 {
-		models = config.Models
-	}
-	if err := d.Edit(models); err != nil {
-		return fmt.Errorf("setup failed: %w", err)
-	}
-
-	cmd := exec.Command("droid")
-	cmd.Stdin = os.Stdin
-	cmd.Stdout = os.Stdout
-	cmd.Stderr = os.Stderr
-	return cmd.Run()
-}
-
-func (d *Droid) Paths() []string {
-	home, err := os.UserHomeDir()
-	if err != nil {
-		return nil
-	}
-	p := filepath.Join(home, ".factory", "settings.json")
-	if _, err := os.Stat(p); err == nil {
-		return []string{p}
-	}
-	return nil
-}
-
-func (d *Droid) Edit(models []string) error {
-	if len(models) == 0 {
-		return nil
-	}
-
-	home, err := os.UserHomeDir()
-	if err != nil {
-		return err
-	}
-
-	settingsPath := filepath.Join(home, ".factory", "settings.json")
-	if err := os.MkdirAll(filepath.Dir(settingsPath), 0o755); err != nil {
-		return err
-	}
-
-	// Read file once, unmarshal twice:
-	// map preserves unknown fields for writing back (including extra fields in model entries)
-	settingsMap := make(map[string]any)
-	var settings droidSettings
-	if data, err := os.ReadFile(settingsPath); err == nil {
-		if err := json.Unmarshal(data, &settingsMap); err != nil {
-			return fmt.Errorf("failed to parse settings file: %w, at: %s", err, settingsPath)
-		}
-		json.Unmarshal(data, &settings) // ignore error, zero values are fine
-	}
-
-	// Keep only non-Ollama models from the raw map (preserves extra fields)
-	// Rebuild Ollama models
-	var nonOllamaModels []any
-	if rawModels, ok := settingsMap["customModels"].([]any); ok {
-		for _, raw := range rawModels {
-			if m, ok := raw.(map[string]any); ok {
-				if m["apiKey"] != "ollama" {
-					nonOllamaModels = append(nonOllamaModels, raw)
-				}
-			}
-		}
-	}
-
-	// Build new Ollama model entries with sequential indices (0, 1, 2, ...)
-	var newModels []any
-	var defaultModelID string
-	for i, model := range models {
-		modelID := fmt.Sprintf("custom:%s-%d", model, i)
-		newModels = append(newModels, modelEntry{
-			Model:           model,
-			DisplayName:     model,
-			BaseURL:         "http://localhost:11434/v1",
-			APIKey:          "ollama",
-			Provider:        "generic-chat-completion-api",
-			MaxOutputTokens: 64000,
-			SupportsImages:  false,
-			ID:              modelID,
-			Index:           i,
-		})
-		if i == 0 {
-			defaultModelID = modelID
-		}
-	}
-
-	settingsMap["customModels"] = append(newModels, nonOllamaModels...)
-
-	// Update session default settings (preserve unknown fields in the nested object)
-	sessionSettings, ok := settingsMap["sessionDefaultSettings"].(map[string]any)
-	if !ok {
-		sessionSettings = make(map[string]any)
-	}
-	sessionSettings["model"] = defaultModelID
-
-	if !isValidReasoningEffort(settings.SessionDefaultSettings.ReasoningEffort) {
-		sessionSettings["reasoningEffort"] = "none"
-	}
-
-	settingsMap["sessionDefaultSettings"] = sessionSettings
-
-	data, err := json.MarshalIndent(settingsMap, "", "  ")
-	if err != nil {
-		return err
-	}
-	return writeWithBackup(settingsPath, data)
-}
-
-func (d *Droid) Models() []string {
-	home, err := os.UserHomeDir()
-	if err != nil {
-		return nil
-	}
-
-	data, err := os.ReadFile(filepath.Join(home, ".factory", "settings.json"))
-	if err != nil {
-		return nil
-	}
-
-	var settings droidSettings
-	if err := json.Unmarshal(data, &settings); err != nil {
-		return nil
-	}
-
-	var result []string
-	for _, m := range settings.CustomModels {
-		if m.APIKey == "ollama" {
-			result = append(result, m.Model)
-		}
-	}
-	return result
-}
-
-var validReasoningEfforts = []string{"high", "medium", "low", "none"}
-
-func isValidReasoningEffort(effort string) bool {
-	return slices.Contains(validReasoningEfforts, effort)
-}
--- a/cmd/config/droid_test.go
+++ b/cmd/config/droid_test.go
--- a/cmd/config/files.go
+++ b/cmd/config/files.go
@@ -1,99 +0,0 @@
-package config
-
-import (
-	"bytes"
-	"encoding/json"
-	"fmt"
-	"os"
-	"path/filepath"
-	"time"
-)
-
-func readJSONFile(path string) (map[string]any, error) {
-	data, err := os.ReadFile(path)
-	if err != nil {
-		return nil, err
-	}
-	var result map[string]any
-	if err := json.Unmarshal(data, &result); err != nil {
-		return nil, err
-	}
-	return result, nil
-}
-
-func copyFile(src, dst string) error {
-	info, err := os.Stat(src)
-	if err != nil {
-		return err
-	}
-	data, err := os.ReadFile(src)
-	if err != nil {
-		return err
-	}
-	return os.WriteFile(dst, data, info.Mode().Perm())
-}
-
-func backupDir() string {
-	return filepath.Join(os.TempDir(), "ollama-backups")
-}
-
-func backupToTmp(srcPath string) (string, error) {
-	dir := backupDir()
-	if err := os.MkdirAll(dir, 0o755); err != nil {
-		return "", err
-	}
-
-	backupPath := filepath.Join(dir, fmt.Sprintf("%s.%d", filepath.Base(srcPath), time.Now().Unix()))
-	if err := copyFile(srcPath, backupPath); err != nil {
-		return "", err
-	}
-	return backupPath, nil
-}
-
-// writeWithBackup writes data to path via temp file + rename, backing up any existing file first
-func writeWithBackup(path string, data []byte) error {
-	var backupPath string
-	// backup must be created before any writes to the target file
-	if existingContent, err := os.ReadFile(path); err == nil {
-		if !bytes.Equal(existingContent, data) {
-			backupPath, err = backupToTmp(path)
-			if err != nil {
-				return fmt.Errorf("backup failed: %w", err)
-			}
-		}
-	} else if !os.IsNotExist(err) {
-		return fmt.Errorf("read existing file: %w", err)
-	}
-
-	dir := filepath.Dir(path)
-	tmp, err := os.CreateTemp(dir, ".tmp-*")
-	if err != nil {
-		return fmt.Errorf("create temp failed: %w", err)
-	}
-	tmpPath := tmp.Name()
-
-	if _, err := tmp.Write(data); err != nil {
-		_ = tmp.Close()
-		_ = os.Remove(tmpPath)
-		return fmt.Errorf("write failed: %w", err)
-	}
-	if err := tmp.Sync(); err != nil {
-		_ = tmp.Close()
-		_ = os.Remove(tmpPath)
-		return fmt.Errorf("sync failed: %w", err)
-	}
-	if err := tmp.Close(); err != nil {
-		_ = os.Remove(tmpPath)
-		return fmt.Errorf("close failed: %w", err)
-	}
-
-	if err := os.Rename(tmpPath, path); err != nil {
-		_ = os.Remove(tmpPath)
-		if backupPath != "" {
-			_ = copyFile(backupPath, path)
-		}
-		return fmt.Errorf("rename failed: %w", err)
-	}
-
-	return nil
-}
--- a/cmd/config/files_test.go
+++ b/cmd/config/files_test.go
@@ -1,502 +0,0 @@
-package config
-
-import (
-	"encoding/json"
-	"fmt"
-	"os"
-	"path/filepath"
-	"runtime"
-	"testing"
-)
-
-func mustMarshal(t *testing.T, v any) []byte {
-	t.Helper()
-	data, err := json.MarshalIndent(v, "", "  ")
-	if err != nil {
-		t.Fatal(err)
-	}
-	return data
-}
-
-func TestWriteWithBackup(t *testing.T) {
-	tmpDir := t.TempDir()
-
-	t.Run("creates file", func(t *testing.T) {
-		path := filepath.Join(tmpDir, "new.json")
-		data := mustMarshal(t, map[string]string{"key": "value"})
-
-		if err := writeWithBackup(path, data); err != nil {
-			t.Fatal(err)
-		}
-
-		content, err := os.ReadFile(path)
-		if err != nil {
-			t.Fatal(err)
-		}
-
-		var result map[string]string
-		if err := json.Unmarshal(content, &result); err != nil {
-			t.Fatal(err)
-		}
-		if result["key"] != "value" {
-			t.Errorf("expected value, got %s", result["key"])
-		}
-	})
-
-	t.Run("creates backup in /tmp/ollama-backups", func(t *testing.T) {
-		path := filepath.Join(tmpDir, "backup.json")
-
-		os.WriteFile(path, []byte(`{"original": true}`), 0o644)
-
-		data := mustMarshal(t, map[string]bool{"updated": true})
-		if err := writeWithBackup(path, data); err != nil {
-			t.Fatal(err)
-		}
-
-		entries, err := os.ReadDir(backupDir())
-		if err != nil {
-			t.Fatal("backup directory not created")
-		}
-
-		var foundBackup bool
-		for _, entry := range entries {
-			if filepath.Ext(entry.Name()) != ".json" {
-				name := entry.Name()
-				if len(name) > len("backup.json.") && name[:len("backup.json.")] == "backup.json." {
-					backupPath := filepath.Join(backupDir(), name)
-					backup, err := os.ReadFile(backupPath)
-					if err == nil {
-						var backupData map[string]bool
-						json.Unmarshal(backup, &backupData)
-						if backupData["original"] {
-							foundBackup = true
-							os.Remove(backupPath)
-							break
-						}
-					}
-				}
-			}
-		}
-
-		if !foundBackup {
-			t.Error("backup file not created in /tmp/ollama-backups")
-		}
-
-		current, _ := os.ReadFile(path)
-		var currentData map[string]bool
-		json.Unmarshal(current, &currentData)
-		if !currentData["updated"] {
-			t.Error("file doesn't contain updated data")
-		}
-	})
-
-	t.Run("no backup for new file", func(t *testing.T) {
-		path := filepath.Join(tmpDir, "nobak.json")
-
-		data := mustMarshal(t, map[string]string{"new": "file"})
-		if err := writeWithBackup(path, data); err != nil {
-			t.Fatal(err)
-		}
-
-		entries, _ := os.ReadDir(backupDir())
-		for _, entry := range entries {
-			if len(entry.Name()) > len("nobak.json.") && entry.Name()[:len("nobak.json.")] == "nobak.json." {
-				t.Error("backup should not exist for new file")
-			}
-		}
-	})
-
-	t.Run("no backup when content unchanged", func(t *testing.T) {
-		path := filepath.Join(tmpDir, "unchanged.json")
-
-		data := mustMarshal(t, map[string]string{"key": "value"})
-
-		if err := writeWithBackup(path, data); err != nil {
-			t.Fatal(err)
-		}
-
-		entries1, _ := os.ReadDir(backupDir())
-		countBefore := 0
-		for _, e := range entries1 {
-			if len(e.Name()) > len("unchanged.json.") && e.Name()[:len("unchanged.json.")] == "unchanged.json." {
-				countBefore++
-			}
-		}
-
-		if err := writeWithBackup(path, data); err != nil {
-			t.Fatal(err)
-		}
-
-		entries2, _ := os.ReadDir(backupDir())
-		countAfter := 0
-		for _, e := range entries2 {
-			if len(e.Name()) > len("unchanged.json.") && e.Name()[:len("unchanged.json.")] == "unchanged.json." {
-				countAfter++
-			}
-		}
-
-		if countAfter != countBefore {
-			t.Errorf("backup was created when content unchanged (before=%d, after=%d)", countBefore, countAfter)
-		}
-	})
-
-	t.Run("backup filename contains unix timestamp", func(t *testing.T) {
-		path := filepath.Join(tmpDir, "timestamped.json")
-
-		os.WriteFile(path, []byte(`{"v": 1}`), 0o644)
-		data := mustMarshal(t, map[string]int{"v": 2})
-		if err := writeWithBackup(path, data); err != nil {
-			t.Fatal(err)
-		}
-
-		entries, _ := os.ReadDir(backupDir())
-		var found bool
-		for _, entry := range entries {
-			name := entry.Name()
-			if len(name) > len("timestamped.json.") && name[:len("timestamped.json.")] == "timestamped.json." {
-				timestamp := name[len("timestamped.json."):]
-				for _, c := range timestamp {
-					if c < '0' || c > '9' {
-						t.Errorf("backup filename timestamp contains non-numeric character: %s", name)
-					}
-				}
-				found = true
-				os.Remove(filepath.Join(backupDir(), name))
-				break
-			}
-		}
-		if !found {
-			t.Error("backup file with timestamp not found")
-		}
-	})
-}
-
-// Edge case tests for files.go
-
-// TestWriteWithBackup_FailsIfBackupFails documents critical behavior: if backup fails, we must not proceed.
-// User could lose their config with no way to recover.
-func TestWriteWithBackup_FailsIfBackupFails(t *testing.T) {
-	if runtime.GOOS == "windows" {
-		t.Skip("permission tests unreliable on Windows")
-	}
-
-	tmpDir := t.TempDir()
-	path := filepath.Join(tmpDir, "config.json")
-
-	// Create original file
-	originalContent := []byte(`{"original": true}`)
-	os.WriteFile(path, originalContent, 0o644)
-
-	// Make backup directory read-only to force backup failure
-	backupDir := backupDir()
-	os.MkdirAll(backupDir, 0o755)
-	os.Chmod(backupDir, 0o444) // Read-only
-	defer os.Chmod(backupDir, 0o755)
-
-	newContent := []byte(`{"updated": true}`)
-	err := writeWithBackup(path, newContent)
-
-	// Should fail because backup couldn't be created
-	if err == nil {
-		t.Error("expected error when backup fails, got nil")
-	}
-
-	// Original file should be preserved
-	current, _ := os.ReadFile(path)
-	if string(current) != string(originalContent) {
-		t.Errorf("original file was modified despite backup failure: got %s", string(current))
-	}
-}
-
-// TestWriteWithBackup_PermissionDenied verifies clear error when target file has wrong permissions.
-// Common issue when config owned by root or wrong perms.
-func TestWriteWithBackup_PermissionDenied(t *testing.T) {
-	if runtime.GOOS == "windows" {
-		t.Skip("permission tests unreliable on Windows")
-	}
-
-	tmpDir := t.TempDir()
-
-	// Create a read-only directory
-	readOnlyDir := filepath.Join(tmpDir, "readonly")
-	os.MkdirAll(readOnlyDir, 0o755)
-	os.Chmod(readOnlyDir, 0o444)
-	defer os.Chmod(readOnlyDir, 0o755)
-
-	path := filepath.Join(readOnlyDir, "config.json")
-	err := writeWithBackup(path, []byte(`{"test": true}`))
-
-	if err == nil {
-		t.Error("expected permission error, got nil")
-	}
-}
-
-// TestWriteWithBackup_DirectoryDoesNotExist verifies behavior when target directory doesn't exist.
-// writeWithBackup doesn't create directories - caller is responsible.
-func TestWriteWithBackup_DirectoryDoesNotExist(t *testing.T) {
-	tmpDir := t.TempDir()
-	path := filepath.Join(tmpDir, "nonexistent", "subdir", "config.json")
-
-	err := writeWithBackup(path, []byte(`{"test": true}`))
-
-	// Should fail because directory doesn't exist
-	if err == nil {
-		t.Error("expected error for nonexistent directory, got nil")
-	}
-}
-
-// TestWriteWithBackup_SymlinkTarget documents behavior when target is a symlink.
-// Documents what happens if user symlinks their config file.
-func TestWriteWithBackup_SymlinkTarget(t *testing.T) {
-	if runtime.GOOS == "windows" {
-		t.Skip("symlink tests may require admin on Windows")
-	}
-
-	tmpDir := t.TempDir()
-	realFile := filepath.Join(tmpDir, "real.json")
-	symlink := filepath.Join(tmpDir, "link.json")
-
-	// Create real file and symlink
-	os.WriteFile(realFile, []byte(`{"v": 1}`), 0o644)
-	os.Symlink(realFile, symlink)
-
-	// Write through symlink
-	err := writeWithBackup(symlink, []byte(`{"v": 2}`))
-	if err != nil {
-		t.Fatalf("writeWithBackup through symlink failed: %v", err)
-	}
-
-	// The real file should be updated (symlink followed for temp file creation)
-	content, _ := os.ReadFile(symlink)
-	if string(content) != `{"v": 2}` {
-		t.Errorf("symlink target not updated correctly: got %s", string(content))
-	}
-}
-
-// TestBackupToTmp_SpecialCharsInFilename verifies backup works with special characters.
-// User may have config files with unusual names.
-func TestBackupToTmp_SpecialCharsInFilename(t *testing.T) {
-	tmpDir := t.TempDir()
-
-	// File with spaces and special chars
-	path := filepath.Join(tmpDir, "my config (backup).json")
-	os.WriteFile(path, []byte(`{"test": true}`), 0o644)
-
-	backupPath, err := backupToTmp(path)
-	if err != nil {
-		t.Fatalf("backupToTmp with special chars failed: %v", err)
-	}
-
-	// Verify backup exists and has correct content
-	content, err := os.ReadFile(backupPath)
-	if err != nil {
-		t.Fatalf("could not read backup: %v", err)
-	}
-	if string(content) != `{"test": true}` {
-		t.Errorf("backup content mismatch: got %s", string(content))
-	}
-
-	os.Remove(backupPath)
-}
-
-// TestCopyFile_PreservesPermissions verifies that copyFile preserves file permissions.
-func TestCopyFile_PreservesPermissions(t *testing.T) {
-	if runtime.GOOS == "windows" {
-		t.Skip("permission preservation tests unreliable on Windows")
-	}
-
-	tmpDir := t.TempDir()
-	src := filepath.Join(tmpDir, "src.json")
-	dst := filepath.Join(tmpDir, "dst.json")
-
-	// Create source with specific permissions
-	os.WriteFile(src, []byte(`{"test": true}`), 0o600)
-
-	err := copyFile(src, dst)
-	if err != nil {
-		t.Fatalf("copyFile failed: %v", err)
-	}
-
-	srcInfo, _ := os.Stat(src)
-	dstInfo, _ := os.Stat(dst)
-
-	if srcInfo.Mode().Perm() != dstInfo.Mode().Perm() {
-		t.Errorf("permissions not preserved: src=%v, dst=%v", srcInfo.Mode().Perm(), dstInfo.Mode().Perm())
-	}
-}
-
-// TestCopyFile_SourceNotFound verifies clear error when source doesn't exist.
-func TestCopyFile_SourceNotFound(t *testing.T) {
-	tmpDir := t.TempDir()
-	src := filepath.Join(tmpDir, "nonexistent.json")
-	dst := filepath.Join(tmpDir, "dst.json")
-
-	err := copyFile(src, dst)
-	if err == nil {
-		t.Error("expected error for nonexistent source, got nil")
-	}
-}
-
-// TestWriteWithBackup_TargetIsDirectory verifies error when path points to a directory.
-func TestWriteWithBackup_TargetIsDirectory(t *testing.T) {
-	tmpDir := t.TempDir()
-	dirPath := filepath.Join(tmpDir, "actualdir")
-	os.MkdirAll(dirPath, 0o755)
-
-	err := writeWithBackup(dirPath, []byte(`{"test": true}`))
-	if err == nil {
-		t.Error("expected error when target is a directory, got nil")
-	}
-}
-
-// TestWriteWithBackup_EmptyData verifies writing zero bytes works correctly.
-func TestWriteWithBackup_EmptyData(t *testing.T) {
-	tmpDir := t.TempDir()
-	path := filepath.Join(tmpDir, "empty.json")
-
-	err := writeWithBackup(path, []byte{})
-	if err != nil {
-		t.Fatalf("writeWithBackup with empty data failed: %v", err)
-	}
-
-	content, err := os.ReadFile(path)
-	if err != nil {
-		t.Fatalf("could not read file: %v", err)
-	}
-	if len(content) != 0 {
-		t.Errorf("expected empty file, got %d bytes", len(content))
-	}
-}
-
-// TestWriteWithBackup_FileUnreadableButDirWritable verifies behavior when existing file
-// cannot be read (for backup comparison) but directory is writable.
-func TestWriteWithBackup_FileUnreadableButDirWritable(t *testing.T) {
-	if runtime.GOOS == "windows" {
-		t.Skip("permission tests unreliable on Windows")
-	}
-
-	tmpDir := t.TempDir()
-	path := filepath.Join(tmpDir, "unreadable.json")
-
-	// Create file and make it unreadable
-	os.WriteFile(path, []byte(`{"original": true}`), 0o644)
-	os.Chmod(path, 0o000)
-	defer os.Chmod(path, 0o644)
-
-	// Should fail because we can't read the file to compare/backup
-	err := writeWithBackup(path, []byte(`{"updated": true}`))
-	if err == nil {
-		t.Error("expected error when file is unreadable, got nil")
-	}
-}
-
-// TestWriteWithBackup_RapidSuccessiveWrites verifies backup works with multiple writes
-// within the same second (timestamp collision scenario).
-func TestWriteWithBackup_RapidSuccessiveWrites(t *testing.T) {
-	tmpDir := t.TempDir()
-	path := filepath.Join(tmpDir, "rapid.json")
-
-	// Create initial file
-	os.WriteFile(path, []byte(`{"v": 0}`), 0o644)
-
-	// Rapid successive writes
-	for i := 1; i <= 3; i++ {
-		data := []byte(fmt.Sprintf(`{"v": %d}`, i))
-		if err := writeWithBackup(path, data); err != nil {
-			t.Fatalf("write %d failed: %v", i, err)
-		}
-	}
-
-	// Verify final content
-	content, _ := os.ReadFile(path)
-	if string(content) != `{"v": 3}` {
-		t.Errorf("expected final content {\"v\": 3}, got %s", string(content))
-	}
-
-	// Verify at least one backup exists
-	entries, _ := os.ReadDir(backupDir())
-	var backupCount int
-	for _, e := range entries {
-		if len(e.Name()) > len("rapid.json.") && e.Name()[:len("rapid.json.")] == "rapid.json." {
-			backupCount++
-		}
-	}
-	if backupCount == 0 {
-		t.Error("expected at least one backup file from rapid writes")
-	}
-}
-
-// TestWriteWithBackup_BackupDirIsFile verifies error when backup directory path is a file.
-func TestWriteWithBackup_BackupDirIsFile(t *testing.T) {
-	if runtime.GOOS == "windows" {
-		t.Skip("test modifies system temp directory")
-	}
-
-	// Create a file at the backup directory path
-	backupPath := backupDir()
-	// Clean up any existing directory first
-	os.RemoveAll(backupPath)
-	// Create a file instead of directory
-	os.WriteFile(backupPath, []byte("not a directory"), 0o644)
-	defer func() {
-		os.Remove(backupPath)
-		os.MkdirAll(backupPath, 0o755)
-	}()
-
-	tmpDir := t.TempDir()
-	path := filepath.Join(tmpDir, "test.json")
-	os.WriteFile(path, []byte(`{"original": true}`), 0o644)
-
-	err := writeWithBackup(path, []byte(`{"updated": true}`))
-	if err == nil {
-		t.Error("expected error when backup dir is a file, got nil")
-	}
-}
-
-// TestWriteWithBackup_NoOrphanTempFiles verifies temp files are cleaned up on failure.
-func TestWriteWithBackup_NoOrphanTempFiles(t *testing.T) {
-	if runtime.GOOS == "windows" {
-		t.Skip("permission tests unreliable on Windows")
-	}
-
-	tmpDir := t.TempDir()
-
-	// Count existing temp files
-	countTempFiles := func() int {
-		entries, _ := os.ReadDir(tmpDir)
-		count := 0
-		for _, e := range entries {
-			if len(e.Name()) > 4 && e.Name()[:4] == ".tmp" {
-				count++
-			}
-		}
-		return count
-	}
-
-	before := countTempFiles()
-
-	// Create a file, then make directory read-only to cause rename failure
-	path := filepath.Join(tmpDir, "orphan.json")
-	os.WriteFile(path, []byte(`{"v": 1}`), 0o644)
-
-	// Make a subdirectory and try to write there after making parent read-only
-	subDir := filepath.Join(tmpDir, "subdir")
-	os.MkdirAll(subDir, 0o755)
-	subPath := filepath.Join(subDir, "config.json")
-	os.WriteFile(subPath, []byte(`{"v": 1}`), 0o644)
-
-	// Make subdir read-only after creating temp file would succeed but rename would fail
-	// This is tricky to test - the temp file is created in the same dir, so if we can't
-	// rename, we also couldn't create. Let's just verify normal failure cleanup works.
-
-	// Force a failure by making the target a directory
-	badPath := filepath.Join(tmpDir, "isdir")
-	os.MkdirAll(badPath, 0o755)
-
-	_ = writeWithBackup(badPath, []byte(`{"test": true}`))
-
-	after := countTempFiles()
-	if after > before {
-		t.Errorf("orphan temp files left behind: before=%d, after=%d", before, after)
-	}
-}
--- a/cmd/config/integrations.go
+++ b/cmd/config/integrations.go
@@ -1,353 +0,0 @@
-package config
-
-import (
-	"context"
-	"errors"
-	"fmt"
-	"maps"
-	"os"
-	"os/exec"
-	"runtime"
-	"slices"
-	"strings"
-	"time"
-
-	"github.com/ollama/ollama/api"
-	"github.com/spf13/cobra"
-)
-
-// Runners execute the launching of a model with the integration - claude, codex
-// Editors can edit config files (supports multi-model selection) - opencode, droid
-// They are composable interfaces where in some cases an editor is also a runner - opencode, droid
-// Runner can run an integration with a model.
-
-type Runner interface {
-	Run(model string) error
-	// String returns the human-readable name of the integration
-	String() string
-}
-
-// Editor can edit config files (supports multi-model selection)
-type Editor interface {
-	// Paths returns the paths to the config files for the integration
-	Paths() []string
-	// Edit updates the config files for the integration with the given models
-	Edit(models []string) error
-	// Models returns the models currently configured for the integration
-	// TODO(parthsareen): add error return to Models()
-	Models() []string
-}
-
-// integrations is the registry of available integrations.
-var integrations = map[string]Runner{
-	"claude":   &Claude{},
-	"codex":    &Codex{},
-	"droid":    &Droid{},
-	"opencode": &OpenCode{},
-}
-
-func selectIntegration() (string, error) {
-	if len(integrations) == 0 {
-		return "", fmt.Errorf("no integrations available")
-	}
-
-	names := slices.Sorted(maps.Keys(integrations))
-	var items []selectItem
-	for _, name := range names {
-		r := integrations[name]
-		description := r.String()
-		if conn, err := loadIntegration(name); err == nil && len(conn.Models) > 0 {
-			description = fmt.Sprintf("%s (%s)", r.String(), conn.Models[0])
-		}
-		items = append(items, selectItem{Name: name, Description: description})
-	}
-
-	return selectPrompt("Select integration:", items)
-}
-
-// selectModels lets the user select models for an integration
-func selectModels(ctx context.Context, name, current string) ([]string, error) {
-	r, ok := integrations[name]
-	if !ok {
-		return nil, fmt.Errorf("unknown integration: %s", name)
-	}
-
-	client, err := api.ClientFromEnvironment()
-	if err != nil {
-		return nil, err
-	}
-
-	models, err := client.List(ctx)
-	if err != nil {
-		return nil, err
-	}
-
-	if len(models.Models) == 0 {
-		return nil, fmt.Errorf("no models available, run 'ollama pull <model>' first")
-	}
-
-	var items []selectItem
-	cloudModels := make(map[string]bool)
-	for _, m := range models.Models {
-		if m.RemoteModel != "" {
-			cloudModels[m.Name] = true
-		}
-		items = append(items, selectItem{Name: m.Name})
-	}
-
-	if len(items) == 0 {
-		return nil, fmt.Errorf("no local models available, run 'ollama pull <model>' first")
-	}
-
-	// Get previously configured models (saved config takes precedence)
-	var preChecked []string
-	if saved, err := loadIntegration(name); err == nil {
-		preChecked = saved.Models
-	} else if editor, ok := r.(Editor); ok {
-		preChecked = editor.Models()
-	}
-	checked := make(map[string]bool, len(preChecked))
-	for _, n := range preChecked {
-		checked[n] = true
-	}
-
-	// Resolve current to full name (e.g., "llama3.2" -> "llama3.2:latest")
-	for _, item := range items {
-		if item.Name == current || strings.HasPrefix(item.Name, current+":") {
-			current = item.Name
-			break
-		}
-	}
-
-	// If current model is configured, move to front of preChecked
-	if checked[current] {
-		preChecked = append([]string{current}, slices.DeleteFunc(preChecked, func(m string) bool { return m == current })...)
-	}
-
-	// Sort: checked first, then alphabetical
-	slices.SortFunc(items, func(a, b selectItem) int {
-		ac, bc := checked[a.Name], checked[b.Name]
-		if ac != bc {
-			if ac {
-				return -1
-			}
-			return 1
-		}
-		return strings.Compare(strings.ToLower(a.Name), strings.ToLower(b.Name))
-	})
-
-	var selected []string
-	// only editors support multi-model selection
-	if _, ok := r.(Editor); ok {
-		selected, err = multiSelectPrompt(fmt.Sprintf("Select models for %s:", r), items, preChecked)
-		if err != nil {
-			return nil, err
-		}
-	} else {
-		model, err := selectPrompt(fmt.Sprintf("Select model for %s:", r), items)
-		if err != nil {
-			return nil, err
-		}
-		selected = []string{model}
-	}
-
-	// if any model in selected is a cloud model, ensure signed in
-	var selectedCloudModels []string
-	for _, m := range selected {
-		if cloudModels[m] {
-			selectedCloudModels = append(selectedCloudModels, m)
-		}
-	}
-	if len(selectedCloudModels) > 0 {
-		// ensure user is signed in
-		user, err := client.Whoami(ctx)
-		if err == nil && user != nil && user.Name != "" {
-			return selected, nil
-		}
-
-		var aErr api.AuthorizationError
-		if !errors.As(err, &aErr) || aErr.SigninURL == "" {
-			return nil, err
-		}
-
-		modelList := strings.Join(selectedCloudModels, ", ")
-		yes, err := confirmPrompt(fmt.Sprintf("sign in to use %s?", modelList))
-		if err != nil || !yes {
-			return nil, fmt.Errorf("%s requires sign in", modelList)
-		}
-
-		fmt.Fprintf(os.Stderr, "\nTo sign in, navigate to:\n    %s\n\n", aErr.SigninURL)
-
-		// TODO(parthsareen): extract into auth package for cmd
-		// Auto-open browser (best effort, fail silently)
-		switch runtime.GOOS {
-		case "darwin":
-			_ = exec.Command("open", aErr.SigninURL).Start()
-		case "linux":
-			_ = exec.Command("xdg-open", aErr.SigninURL).Start()
-		case "windows":
-			_ = exec.Command("rundll32", "url.dll,FileProtocolHandler", aErr.SigninURL).Start()
-		}
-
-		spinnerFrames := []string{"|", "/", "-", "\\"}
-		frame := 0
-
-		fmt.Fprintf(os.Stderr, "\033[90mwaiting for sign in to complete... %s\033[0m", spinnerFrames[0])
-
-		ticker := time.NewTicker(200 * time.Millisecond)
-		defer ticker.Stop()
-
-		for {
-			select {
-			case <-ctx.Done():
-				fmt.Fprintf(os.Stderr, "\r\033[K")
-				return nil, ctx.Err()
-			case <-ticker.C:
-				frame++
-				fmt.Fprintf(os.Stderr, "\r\033[90mwaiting for sign in to complete... %s\033[0m", spinnerFrames[frame%len(spinnerFrames)])
-
-				// poll every 10th frame (~2 seconds)
-				if frame%10 == 0 {
-					u, err := client.Whoami(ctx)
-					if err == nil && u != nil && u.Name != "" {
-						fmt.Fprintf(os.Stderr, "\r\033[K\033[A\r\033[K\033[1msigned in:\033[0m %s\n", u.Name)
-						return selected, nil
-					}
-				}
-			}
-		}
-	}
-
-	return selected, nil
-}
-
-func runIntegration(name, modelName string) error {
-	r, ok := integrations[name]
-	if !ok {
-		return fmt.Errorf("unknown integration: %s", name)
-	}
-	fmt.Fprintf(os.Stderr, "\nLaunching %s with %s...\n", r, modelName)
-	return r.Run(modelName)
-}
-
-// LaunchCmd returns the cobra command for launching integrations.
-func LaunchCmd(checkServerHeartbeat func(cmd *cobra.Command, args []string) error) *cobra.Command {
-	var modelFlag string
-	var configFlag bool
-
-	cmd := &cobra.Command{
-		Use:   "launch [INTEGRATION]",
-		Short: "Launch an integration with Ollama",
-		Long: `Launch an integration configured with Ollama models.
-
-Supported integrations:
-  claude    Claude Code
-  codex     Codex
-  droid     Droid
-  opencode  OpenCode
-
-Examples:
-  ollama launch
-  ollama launch claude
-  ollama launch claude --model <model>
-  ollama launch droid --config (does not auto-launch)`,
-		Args:    cobra.MaximumNArgs(1),
-		PreRunE: checkServerHeartbeat,
-		RunE: func(cmd *cobra.Command, args []string) error {
-			var name string
-			if len(args) > 0 {
-				name = args[0]
-			} else {
-				var err error
-				name, err = selectIntegration()
-				if errors.Is(err, errCancelled) {
-					return nil
-				}
-				if err != nil {
-					return err
-				}
-			}
-
-			r, ok := integrations[strings.ToLower(name)]
-			if !ok {
-				return fmt.Errorf("unknown integration: %s", name)
-			}
-
-			// If launching without --model, use saved config if available
-			if !configFlag && modelFlag == "" {
-				if config, err := loadIntegration(name); err == nil && len(config.Models) > 0 {
-					return runIntegration(name, config.Models[0])
-				}
-			}
-
-			var models []string
-			if modelFlag != "" {
-				// When --model is specified, merge with existing models (new model becomes default)
-				models = []string{modelFlag}
-				if existing, err := loadIntegration(name); err == nil && len(existing.Models) > 0 {
-					for _, m := range existing.Models {
-						if m != modelFlag {
-							models = append(models, m)
-						}
-					}
-				}
-			} else {
-				var err error
-				models, err = selectModels(cmd.Context(), name, "")
-				if errors.Is(err, errCancelled) {
-					return nil
-				}
-				if err != nil {
-					return err
-				}
-			}
-
-			if editor, isEditor := r.(Editor); isEditor {
-				paths := editor.Paths()
-				if len(paths) > 0 {
-					fmt.Fprintf(os.Stderr, "This will modify your %s configuration:\n", r)
-					for _, p := range paths {
-						fmt.Fprintf(os.Stderr, "  %s\n", p)
-					}
-					fmt.Fprintf(os.Stderr, "Backups will be saved to %s/\n\n", backupDir())
-
-					if ok, _ := confirmPrompt("Proceed?"); !ok {
-						return nil
-					}
-				}
-			}
-
-			if err := saveIntegration(name, models); err != nil {
-				return fmt.Errorf("failed to save: %w", err)
-			}
-
-			if editor, isEditor := r.(Editor); isEditor {
-				if err := editor.Edit(models); err != nil {
-					return fmt.Errorf("setup failed: %w", err)
-				}
-			}
-
-			if _, isEditor := r.(Editor); isEditor {
-				if len(models) == 1 {
-					fmt.Fprintf(os.Stderr, "Added %s to %s\n", models[0], r)
-				} else {
-					fmt.Fprintf(os.Stderr, "Added %d models to %s (default: %s)\n", len(models), r, models[0])
-				}
-			}
-
-			if configFlag {
-				if launch, _ := confirmPrompt(fmt.Sprintf("\nLaunch %s now?", r)); launch {
-					return runIntegration(name, models[0])
-				}
-				fmt.Fprintf(os.Stderr, "Run 'ollama launch %s' to start with %s\n", strings.ToLower(name), models[0])
-				return nil
-			}
-
-			return runIntegration(name, models[0])
-		},
-	}
-
-	cmd.Flags().StringVar(&modelFlag, "model", "", "Model to use")
-	cmd.Flags().BoolVar(&configFlag, "config", false, "Configure without launching")
-	return cmd
-}
--- a/cmd/config/integrations_test.go
+++ b/cmd/config/integrations_test.go
@@ -1,188 +0,0 @@
-package config
-
-import (
-	"slices"
-	"strings"
-	"testing"
-
-	"github.com/spf13/cobra"
-)
-
-func TestIntegrationLookup(t *testing.T) {
-	tests := []struct {
-		name      string
-		input     string
-		wantFound bool
-		wantName  string
-	}{
-		{"claude lowercase", "claude", true, "Claude Code"},
-		{"claude uppercase", "CLAUDE", true, "Claude Code"},
-		{"claude mixed case", "Claude", true, "Claude Code"},
-		{"codex", "codex", true, "Codex"},
-		{"droid", "droid", true, "Droid"},
-		{"opencode", "opencode", true, "OpenCode"},
-		{"unknown integration", "unknown", false, ""},
-		{"empty string", "", false, ""},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			r, found := integrations[strings.ToLower(tt.input)]
-			if found != tt.wantFound {
-				t.Errorf("integrations[%q] found = %v, want %v", tt.input, found, tt.wantFound)
-			}
-			if found && r.String() != tt.wantName {
-				t.Errorf("integrations[%q].String() = %q, want %q", tt.input, r.String(), tt.wantName)
-			}
-		})
-	}
-}
-
-func TestIntegrationRegistry(t *testing.T) {
-	expectedIntegrations := []string{"claude", "codex", "droid", "opencode"}
-
-	for _, name := range expectedIntegrations {
-		t.Run(name, func(t *testing.T) {
-			r, ok := integrations[name]
-			if !ok {
-				t.Fatalf("integration %q not found in registry", name)
-			}
-			if r.String() == "" {
-				t.Error("integration.String() should not be empty")
-			}
-		})
-	}
-}
-
-func TestHasLocalModel(t *testing.T) {
-	tests := []struct {
-		name   string
-		models []string
-		want   bool
-	}{
-		{"empty list", []string{}, false},
-		{"single local model", []string{"llama3.2"}, true},
-		{"single cloud model", []string{"cloud-model"}, false},
-		{"mixed models", []string{"cloud-model", "llama3.2"}, true},
-		{"multiple local models", []string{"llama3.2", "qwen2.5"}, true},
-		{"multiple cloud models", []string{"cloud-a", "cloud-b"}, false},
-		{"local model first", []string{"llama3.2", "cloud-model"}, true},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			got := slices.ContainsFunc(tt.models, func(m string) bool {
-				return !strings.Contains(m, "cloud")
-			})
-			if got != tt.want {
-				t.Errorf("hasLocalModel(%v) = %v, want %v", tt.models, got, tt.want)
-			}
-		})
-	}
-}
-
-func TestLaunchCmd(t *testing.T) {
-	// Mock checkServerHeartbeat that always succeeds
-	mockCheck := func(cmd *cobra.Command, args []string) error {
-		return nil
-	}
-
-	cmd := LaunchCmd(mockCheck)
-
-	t.Run("command structure", func(t *testing.T) {
-		if cmd.Use != "launch [INTEGRATION]" {
-			t.Errorf("Use = %q, want %q", cmd.Use, "launch [INTEGRATION]")
-		}
-		if cmd.Short == "" {
-			t.Error("Short description should not be empty")
-		}
-		if cmd.Long == "" {
-			t.Error("Long description should not be empty")
-		}
-	})
-
-	t.Run("flags exist", func(t *testing.T) {
-		modelFlag := cmd.Flags().Lookup("model")
-		if modelFlag == nil {
-			t.Error("--model flag should exist")
-		}
-
-		configFlag := cmd.Flags().Lookup("config")
-		if configFlag == nil {
-			t.Error("--config flag should exist")
-		}
-	})
-
-	t.Run("PreRunE is set", func(t *testing.T) {
-		if cmd.PreRunE == nil {
-			t.Error("PreRunE should be set to checkServerHeartbeat")
-		}
-	})
-}
-
-func TestRunIntegration_UnknownIntegration(t *testing.T) {
-	err := runIntegration("unknown-integration", "model")
-	if err == nil {
-		t.Error("expected error for unknown integration, got nil")
-	}
-	if !strings.Contains(err.Error(), "unknown integration") {
-		t.Errorf("error should mention 'unknown integration', got: %v", err)
-	}
-}
-
-func TestHasLocalModel_DocumentsHeuristic(t *testing.T) {
-	tests := []struct {
-		name   string
-		models []string
-		want   bool
-		reason string
-	}{
-		{"empty list", []string{}, false, "empty list has no local models"},
-		{"contains-cloud-substring", []string{"deepseek-r1:cloud"}, false, "model with 'cloud' substring is considered cloud"},
-		{"cloud-in-name", []string{"my-cloud-model"}, false, "'cloud' anywhere in name = cloud model"},
-		{"cloudless", []string{"cloudless-model"}, false, "'cloudless' still contains 'cloud'"},
-		{"local-model", []string{"llama3.2"}, true, "no 'cloud' = local"},
-		{"mixed", []string{"cloud-model", "llama3.2"}, true, "one local model = hasLocalModel true"},
-		{"all-cloud", []string{"cloud-a", "cloud-b"}, false, "all contain 'cloud'"},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			got := slices.ContainsFunc(tt.models, func(m string) bool {
-				return !strings.Contains(m, "cloud")
-			})
-			if got != tt.want {
-				t.Errorf("hasLocalModel(%v) = %v, want %v (%s)", tt.models, got, tt.want, tt.reason)
-			}
-		})
-	}
-}
-
-func TestLaunchCmd_NilHeartbeat(t *testing.T) {
-	// This should not panic - cmd creation should work even with nil
-	cmd := LaunchCmd(nil)
-	if cmd == nil {
-		t.Fatal("LaunchCmd returned nil")
-	}
-
-	// PreRunE should be nil when passed nil
-	if cmd.PreRunE != nil {
-		t.Log("Note: PreRunE is set even when nil is passed (acceptable)")
-	}
-}
-
-func TestAllIntegrations_HaveRequiredMethods(t *testing.T) {
-	for name, r := range integrations {
-		t.Run(name, func(t *testing.T) {
-			// Test String() doesn't panic and returns non-empty
-			displayName := r.String()
-			if displayName == "" {
-				t.Error("String() should not return empty")
-			}
-
-			// Test Run() exists (we can't call it without actually running the command)
-			// Just verify the method is available
-			var _ func(string) error = r.Run
-		})
-	}
-}
--- a/cmd/config/opencode.go
+++ b/cmd/config/opencode.go
@@ -1,224 +0,0 @@
-package config
-
-import (
-	"encoding/json"
-	"fmt"
-	"maps"
-	"os"
-	"os/exec"
-	"path/filepath"
-	"slices"
-	"strings"
-)
-
-// OpenCode implements Runner and Editor for OpenCode integration
-type OpenCode struct{}
-
-func (o *OpenCode) String() string { return "OpenCode" }
-
-func (o *OpenCode) Run(model string) error {
-	if _, err := exec.LookPath("opencode"); err != nil {
-		return fmt.Errorf("opencode is not installed, install from https://opencode.ai")
-	}
-
-	// Call Edit() to ensure config is up-to-date before launch
-	models := []string{model}
-	if config, err := loadIntegration("opencode"); err == nil && len(config.Models) > 0 {
-		models = config.Models
-	}
-	if err := o.Edit(models); err != nil {
-		return fmt.Errorf("setup failed: %w", err)
-	}
-
-	cmd := exec.Command("opencode")
-	cmd.Stdin = os.Stdin
-	cmd.Stdout = os.Stdout
-	cmd.Stderr = os.Stderr
-	return cmd.Run()
-}
-
-func (o *OpenCode) Paths() []string {
-	home, err := os.UserHomeDir()
-	if err != nil {
-		return nil
-	}
-
-	var paths []string
-	p := filepath.Join(home, ".config", "opencode", "opencode.json")
-	if _, err := os.Stat(p); err == nil {
-		paths = append(paths, p)
-	}
-	sp := filepath.Join(home, ".local", "state", "opencode", "model.json")
-	if _, err := os.Stat(sp); err == nil {
-		paths = append(paths, sp)
-	}
-	return paths
-}
-
-func (o *OpenCode) Edit(modelList []string) error {
-	if len(modelList) == 0 {
-		return nil
-	}
-
-	home, err := os.UserHomeDir()
-	if err != nil {
-		return err
-	}
-
-	configPath := filepath.Join(home, ".config", "opencode", "opencode.json")
-	if err := os.MkdirAll(filepath.Dir(configPath), 0o755); err != nil {
-		return err
-	}
-
-	config := make(map[string]any)
-	if data, err := os.ReadFile(configPath); err == nil {
-		_ = json.Unmarshal(data, &config) // Ignore parse errors; treat missing/corrupt files as empty
-	}
-
-	config["$schema"] = "https://opencode.ai/config.json"
-
-	provider, ok := config["provider"].(map[string]any)
-	if !ok {
-		provider = make(map[string]any)
-	}
-
-	ollama, ok := provider["ollama"].(map[string]any)
-	if !ok {
-		ollama = map[string]any{
-			"npm":  "@ai-sdk/openai-compatible",
-			"name": "Ollama (local)",
-			"options": map[string]any{
-				"baseURL": "http://localhost:11434/v1",
-			},
-		}
-	}
-
-	models, ok := ollama["models"].(map[string]any)
-	if !ok {
-		models = make(map[string]any)
-	}
-
-	selectedSet := make(map[string]bool)
-	for _, m := range modelList {
-		selectedSet[m] = true
-	}
-
-	for name, cfg := range models {
-		if cfgMap, ok := cfg.(map[string]any); ok {
-			if isOllamaModel(cfgMap) && !selectedSet[name] {
-				delete(models, name)
-			}
-		}
-	}
-
-	for _, model := range modelList {
-		if existing, ok := models[model].(map[string]any); ok {
-			// migrate existing models without _launch marker
-			if isOllamaModel(existing) {
-				existing["_launch"] = true
-				if name, ok := existing["name"].(string); ok {
-					existing["name"] = strings.TrimSuffix(name, " [Ollama]")
-				}
-			}
-			continue
-		}
-		models[model] = map[string]any{
-			"name":    model,
-			"_launch": true,
-		}
-	}
-
-	ollama["models"] = models
-	provider["ollama"] = ollama
-	config["provider"] = provider
-
-	configData, err := json.MarshalIndent(config, "", "  ")
-	if err != nil {
-		return err
-	}
-	if err := writeWithBackup(configPath, configData); err != nil {
-		return err
-	}
-
-	statePath := filepath.Join(home, ".local", "state", "opencode", "model.json")
-	if err := os.MkdirAll(filepath.Dir(statePath), 0o755); err != nil {
-		return err
-	}
-
-	state := map[string]any{
-		"recent":   []any{},
-		"favorite": []any{},
-		"variant":  map[string]any{},
-	}
-	if data, err := os.ReadFile(statePath); err == nil {
-		_ = json.Unmarshal(data, &state) // Ignore parse errors; use defaults
-	}
-
-	recent, _ := state["recent"].([]any)
-
-	modelSet := make(map[string]bool)
-	for _, m := range modelList {
-		modelSet[m] = true
-	}
-
-	// Filter out existing Ollama models we're about to re-add
-	newRecent := slices.DeleteFunc(slices.Clone(recent), func(entry any) bool {
-		e, ok := entry.(map[string]any)
-		if !ok || e["providerID"] != "ollama" {
-			return false
-		}
-		modelID, _ := e["modelID"].(string)
-		return modelSet[modelID]
-	})
-
-	// Prepend models in reverse order so first model ends up first
-	for _, model := range slices.Backward(modelList) {
-		newRecent = slices.Insert(newRecent, 0, any(map[string]any{
-			"providerID": "ollama",
-			"modelID":    model,
-		}))
-	}
-
-	const maxRecentModels = 10
-	newRecent = newRecent[:min(len(newRecent), maxRecentModels)]
-
-	state["recent"] = newRecent
-
-	stateData, err := json.MarshalIndent(state, "", "  ")
-	if err != nil {
-		return err
-	}
-	return writeWithBackup(statePath, stateData)
-}
-
-func (o *OpenCode) Models() []string {
-	home, err := os.UserHomeDir()
-	if err != nil {
-		return nil
-	}
-	config, err := readJSONFile(filepath.Join(home, ".config", "opencode", "opencode.json"))
-	if err != nil {
-		return nil
-	}
-	provider, _ := config["provider"].(map[string]any)
-	ollama, _ := provider["ollama"].(map[string]any)
-	models, _ := ollama["models"].(map[string]any)
-	if len(models) == 0 {
-		return nil
-	}
-	keys := slices.Collect(maps.Keys(models))
-	slices.Sort(keys)
-	return keys
-}
-
-// isOllamaModel reports whether a model config entry is managed by us
-func isOllamaModel(cfg map[string]any) bool {
-	if v, ok := cfg["_launch"].(bool); ok && v {
-		return true
-	}
-	// previously used [Ollama] as a suffix for the model managed by ollama launch
-	if name, ok := cfg["name"].(string); ok {
-		return strings.HasSuffix(name, "[Ollama]")
-	}
-	return false
-}
--- a/cmd/config/opencode_test.go
+++ b/cmd/config/opencode_test.go
@@ -1,507 +0,0 @@
-package config
-
-import (
-	"encoding/json"
-	"os"
-	"path/filepath"
-	"testing"
-)
-
-func TestOpenCodeIntegration(t *testing.T) {
-	o := &OpenCode{}
-
-	t.Run("String", func(t *testing.T) {
-		if got := o.String(); got != "OpenCode" {
-			t.Errorf("String() = %q, want %q", got, "OpenCode")
-		}
-	})
-
-	t.Run("implements Runner", func(t *testing.T) {
-		var _ Runner = o
-	})
-
-	t.Run("implements Editor", func(t *testing.T) {
-		var _ Editor = o
-	})
-}
-
-func TestOpenCodeEdit(t *testing.T) {
-	o := &OpenCode{}
-	tmpDir := t.TempDir()
-	setTestHome(t, tmpDir)
-
-	configDir := filepath.Join(tmpDir, ".config", "opencode")
-	configPath := filepath.Join(configDir, "opencode.json")
-	stateDir := filepath.Join(tmpDir, ".local", "state", "opencode")
-	statePath := filepath.Join(stateDir, "model.json")
-
-	cleanup := func() {
-		os.RemoveAll(configDir)
-		os.RemoveAll(stateDir)
-	}
-
-	t.Run("fresh install", func(t *testing.T) {
-		cleanup()
-		if err := o.Edit([]string{"llama3.2"}); err != nil {
-			t.Fatal(err)
-		}
-		assertOpenCodeModelExists(t, configPath, "llama3.2")
-		assertOpenCodeRecentModel(t, statePath, 0, "ollama", "llama3.2")
-	})
-
-	t.Run("preserve other providers", func(t *testing.T) {
-		cleanup()
-		os.MkdirAll(configDir, 0o755)
-		os.WriteFile(configPath, []byte(`{"provider":{"anthropic":{"apiKey":"xxx"}}}`), 0o644)
-		if err := o.Edit([]string{"llama3.2"}); err != nil {
-			t.Fatal(err)
-		}
-		data, _ := os.ReadFile(configPath)
-		var cfg map[string]any
-		json.Unmarshal(data, &cfg)
-		provider := cfg["provider"].(map[string]any)
-		if provider["anthropic"] == nil {
-			t.Error("anthropic provider was removed")
-		}
-		assertOpenCodeModelExists(t, configPath, "llama3.2")
-	})
-
-	t.Run("preserve other models", func(t *testing.T) {
-		cleanup()
-		os.MkdirAll(configDir, 0o755)
-		os.WriteFile(configPath, []byte(`{"provider":{"ollama":{"models":{"mistral":{"name":"Mistral"}}}}}`), 0o644)
-		if err := o.Edit([]string{"llama3.2"}); err != nil {
-			t.Fatal(err)
-		}
-		assertOpenCodeModelExists(t, configPath, "mistral")
-		assertOpenCodeModelExists(t, configPath, "llama3.2")
-	})
-
-	t.Run("update existing model", func(t *testing.T) {
-		cleanup()
-		o.Edit([]string{"llama3.2"})
-		o.Edit([]string{"llama3.2"})
-		assertOpenCodeModelExists(t, configPath, "llama3.2")
-	})
-
-	t.Run("preserve top-level keys", func(t *testing.T) {
-		cleanup()
-		os.MkdirAll(configDir, 0o755)
-		os.WriteFile(configPath, []byte(`{"theme":"dark","keybindings":{}}`), 0o644)
-		if err := o.Edit([]string{"llama3.2"}); err != nil {
-			t.Fatal(err)
-		}
-		data, _ := os.ReadFile(configPath)
-		var cfg map[string]any
-		json.Unmarshal(data, &cfg)
-		if cfg["theme"] != "dark" {
-			t.Error("theme was removed")
-		}
-		if cfg["keybindings"] == nil {
-			t.Error("keybindings was removed")
-		}
-	})
-
-	t.Run("model state - insert at index 0", func(t *testing.T) {
-		cleanup()
-		os.MkdirAll(stateDir, 0o755)
-		os.WriteFile(statePath, []byte(`{"recent":[{"providerID":"anthropic","modelID":"claude"}],"favorite":[],"variant":{}}`), 0o644)
-		if err := o.Edit([]string{"llama3.2"}); err != nil {
-			t.Fatal(err)
-		}
-		assertOpenCodeRecentModel(t, statePath, 0, "ollama", "llama3.2")
-		assertOpenCodeRecentModel(t, statePath, 1, "anthropic", "claude")
-	})
-
-	t.Run("model state - preserve favorites and variants", func(t *testing.T) {
-		cleanup()
-		os.MkdirAll(stateDir, 0o755)
-		os.WriteFile(statePath, []byte(`{"recent":[],"favorite":[{"providerID":"x","modelID":"y"}],"variant":{"a":"b"}}`), 0o644)
-		if err := o.Edit([]string{"llama3.2"}); err != nil {
-			t.Fatal(err)
-		}
-		data, _ := os.ReadFile(statePath)
-		var state map[string]any
-		json.Unmarshal(data, &state)
-		if len(state["favorite"].([]any)) != 1 {
-			t.Error("favorite was modified")
-		}
-		if state["variant"].(map[string]any)["a"] != "b" {
-			t.Error("variant was modified")
-		}
-	})
-
-	t.Run("model state - deduplicate on re-add", func(t *testing.T) {
-		cleanup()
-		os.MkdirAll(stateDir, 0o755)
-		os.WriteFile(statePath, []byte(`{"recent":[{"providerID":"ollama","modelID":"llama3.2"},{"providerID":"anthropic","modelID":"claude"}],"favorite":[],"variant":{}}`), 0o644)
-		if err := o.Edit([]string{"llama3.2"}); err != nil {
-			t.Fatal(err)
-		}
-		data, _ := os.ReadFile(statePath)
-		var state map[string]any
-		json.Unmarshal(data, &state)
-		recent := state["recent"].([]any)
-		if len(recent) != 2 {
-			t.Errorf("expected 2 recent entries, got %d", len(recent))
-		}
-		assertOpenCodeRecentModel(t, statePath, 0, "ollama", "llama3.2")
-	})
-
-	t.Run("remove model", func(t *testing.T) {
-		cleanup()
-		// First add two models
-		o.Edit([]string{"llama3.2", "mistral"})
-		assertOpenCodeModelExists(t, configPath, "llama3.2")
-		assertOpenCodeModelExists(t, configPath, "mistral")
-
-		// Then remove one by only selecting the other
-		o.Edit([]string{"llama3.2"})
-		assertOpenCodeModelExists(t, configPath, "llama3.2")
-		assertOpenCodeModelNotExists(t, configPath, "mistral")
-	})
-
-	t.Run("preserve user customizations on managed models", func(t *testing.T) {
-		cleanup()
-		if err := o.Edit([]string{"llama3.2"}); err != nil {
-			t.Fatal(err)
-		}
-
-		// Add custom fields to the model entry (simulating user edits)
-		data, _ := os.ReadFile(configPath)
-		var cfg map[string]any
-		json.Unmarshal(data, &cfg)
-		provider := cfg["provider"].(map[string]any)
-		ollama := provider["ollama"].(map[string]any)
-		models := ollama["models"].(map[string]any)
-		entry := models["llama3.2"].(map[string]any)
-		entry["_myPref"] = "custom-value"
-		entry["_myNum"] = 42
-		configData, _ := json.MarshalIndent(cfg, "", "  ")
-		os.WriteFile(configPath, configData, 0o644)
-
-		// Re-run Edit — should preserve custom fields
-		if err := o.Edit([]string{"llama3.2"}); err != nil {
-			t.Fatal(err)
-		}
-
-		data, _ = os.ReadFile(configPath)
-		json.Unmarshal(data, &cfg)
-		provider = cfg["provider"].(map[string]any)
-		ollama = provider["ollama"].(map[string]any)
-		models = ollama["models"].(map[string]any)
-		entry = models["llama3.2"].(map[string]any)
-
-		if entry["_myPref"] != "custom-value" {
-			t.Errorf("_myPref was lost: got %v", entry["_myPref"])
-		}
-		if entry["_myNum"] != float64(42) {
-			t.Errorf("_myNum was lost: got %v", entry["_myNum"])
-		}
-		if v, ok := entry["_launch"].(bool); !ok || !v {
-			t.Errorf("_launch marker missing or false: got %v", entry["_launch"])
-		}
-	})
-
-	t.Run("migrate legacy [Ollama] suffix entries", func(t *testing.T) {
-		cleanup()
-		// Write a config with a legacy entry (has [Ollama] suffix but no _launch marker)
-		os.MkdirAll(configDir, 0o755)
-		os.WriteFile(configPath, []byte(`{"provider":{"ollama":{"models":{"llama3.2":{"name":"llama3.2 [Ollama]"}}}}}`), 0o644)
-
-		if err := o.Edit([]string{"llama3.2"}); err != nil {
-			t.Fatal(err)
-		}
-
-		data, _ := os.ReadFile(configPath)
-		var cfg map[string]any
-		json.Unmarshal(data, &cfg)
-		provider := cfg["provider"].(map[string]any)
-		ollama := provider["ollama"].(map[string]any)
-		models := ollama["models"].(map[string]any)
-		entry := models["llama3.2"].(map[string]any)
-
-		// _launch marker should be added
-		if v, ok := entry["_launch"].(bool); !ok || !v {
-			t.Errorf("_launch marker not added during migration: got %v", entry["_launch"])
-		}
-		// [Ollama] suffix should be stripped
-		if name, ok := entry["name"].(string); !ok || name != "llama3.2" {
-			t.Errorf("name suffix not stripped: got %q", entry["name"])
-		}
-	})
-
-	t.Run("remove model preserves non-ollama models", func(t *testing.T) {
-		cleanup()
-		os.MkdirAll(configDir, 0o755)
-		// Add a non-Ollama model manually
-		os.WriteFile(configPath, []byte(`{"provider":{"ollama":{"models":{"external":{"name":"External Model"}}}}}`), 0o644)
-
-		o.Edit([]string{"llama3.2"})
-		assertOpenCodeModelExists(t, configPath, "llama3.2")
-		assertOpenCodeModelExists(t, configPath, "external") // Should be preserved
-	})
-}
-
-func assertOpenCodeModelExists(t *testing.T, path, model string) {
-	t.Helper()
-	data, err := os.ReadFile(path)
-	if err != nil {
-		t.Fatal(err)
-	}
-	var cfg map[string]any
-	if err := json.Unmarshal(data, &cfg); err != nil {
-		t.Fatal(err)
-	}
-	provider, ok := cfg["provider"].(map[string]any)
-	if !ok {
-		t.Fatal("provider not found")
-	}
-	ollama, ok := provider["ollama"].(map[string]any)
-	if !ok {
-		t.Fatal("ollama provider not found")
-	}
-	models, ok := ollama["models"].(map[string]any)
-	if !ok {
-		t.Fatal("models not found")
-	}
-	if models[model] == nil {
-		t.Errorf("model %s not found", model)
-	}
-}
-
-func assertOpenCodeModelNotExists(t *testing.T, path, model string) {
-	t.Helper()
-	data, err := os.ReadFile(path)
-	if err != nil {
-		t.Fatal(err)
-	}
-	var cfg map[string]any
-	if err := json.Unmarshal(data, &cfg); err != nil {
-		t.Fatal(err)
-	}
-	provider, ok := cfg["provider"].(map[string]any)
-	if !ok {
-		return // No provider means no model
-	}
-	ollama, ok := provider["ollama"].(map[string]any)
-	if !ok {
-		return // No ollama means no model
-	}
-	models, ok := ollama["models"].(map[string]any)
-	if !ok {
-		return // No models means no model
-	}
-	if models[model] != nil {
-		t.Errorf("model %s should not exist but was found", model)
-	}
-}
-
-func assertOpenCodeRecentModel(t *testing.T, path string, index int, providerID, modelID string) {
-	t.Helper()
-	data, err := os.ReadFile(path)
-	if err != nil {
-		t.Fatal(err)
-	}
-	var state map[string]any
-	if err := json.Unmarshal(data, &state); err != nil {
-		t.Fatal(err)
-	}
-	recent, ok := state["recent"].([]any)
-	if !ok {
-		t.Fatal("recent not found")
-	}
-	if index >= len(recent) {
-		t.Fatalf("index %d out of range (len=%d)", index, len(recent))
-	}
-	entry, ok := recent[index].(map[string]any)
-	if !ok {
-		t.Fatal("entry is not a map")
-	}
-	if entry["providerID"] != providerID {
-		t.Errorf("expected providerID %s, got %s", providerID, entry["providerID"])
-	}
-	if entry["modelID"] != modelID {
-		t.Errorf("expected modelID %s, got %s", modelID, entry["modelID"])
-	}
-}
-
-// Edge case tests for opencode.go
-
-func TestOpenCodeEdit_CorruptedConfigJSON(t *testing.T) {
-	o := &OpenCode{}
-	tmpDir := t.TempDir()
-	setTestHome(t, tmpDir)
-
-	configDir := filepath.Join(tmpDir, ".config", "opencode")
-	configPath := filepath.Join(configDir, "opencode.json")
-
-	os.MkdirAll(configDir, 0o755)
-	os.WriteFile(configPath, []byte(`{corrupted json content`), 0o644)
-
-	// Should not panic - corrupted JSON should be treated as empty
-	err := o.Edit([]string{"llama3.2"})
-	if err != nil {
-		t.Fatalf("Edit failed with corrupted config: %v", err)
-	}
-
-	// Verify valid JSON was created
-	data, _ := os.ReadFile(configPath)
-	var cfg map[string]any
-	if err := json.Unmarshal(data, &cfg); err != nil {
-		t.Errorf("resulting config is not valid JSON: %v", err)
-	}
-}
-
-func TestOpenCodeEdit_CorruptedStateJSON(t *testing.T) {
-	o := &OpenCode{}
-	tmpDir := t.TempDir()
-	setTestHome(t, tmpDir)
-
-	stateDir := filepath.Join(tmpDir, ".local", "state", "opencode")
-	statePath := filepath.Join(stateDir, "model.json")
-
-	os.MkdirAll(stateDir, 0o755)
-	os.WriteFile(statePath, []byte(`{corrupted state`), 0o644)
-
-	err := o.Edit([]string{"llama3.2"})
-	if err != nil {
-		t.Fatalf("Edit failed with corrupted state: %v", err)
-	}
-
-	// Verify valid state was created
-	data, _ := os.ReadFile(statePath)
-	var state map[string]any
-	if err := json.Unmarshal(data, &state); err != nil {
-		t.Errorf("resulting state is not valid JSON: %v", err)
-	}
-}
-
-func TestOpenCodeEdit_WrongTypeProvider(t *testing.T) {
-	o := &OpenCode{}
-	tmpDir := t.TempDir()
-	setTestHome(t, tmpDir)
-
-	configDir := filepath.Join(tmpDir, ".config", "opencode")
-	configPath := filepath.Join(configDir, "opencode.json")
-
-	os.MkdirAll(configDir, 0o755)
-	os.WriteFile(configPath, []byte(`{"provider": "not a map"}`), 0o644)
-
-	err := o.Edit([]string{"llama3.2"})
-	if err != nil {
-		t.Fatalf("Edit with wrong type provider failed: %v", err)
-	}
-
-	// Verify provider is now correct type
-	data, _ := os.ReadFile(configPath)
-	var cfg map[string]any
-	json.Unmarshal(data, &cfg)
-
-	provider, ok := cfg["provider"].(map[string]any)
-	if !ok {
-		t.Fatalf("provider should be map after setup, got %T", cfg["provider"])
-	}
-	if provider["ollama"] == nil {
-		t.Error("ollama provider should be created")
-	}
-}
-
-func TestOpenCodeEdit_WrongTypeRecent(t *testing.T) {
-	o := &OpenCode{}
-	tmpDir := t.TempDir()
-	setTestHome(t, tmpDir)
-
-	stateDir := filepath.Join(tmpDir, ".local", "state", "opencode")
-	statePath := filepath.Join(stateDir, "model.json")
-
-	os.MkdirAll(stateDir, 0o755)
-	os.WriteFile(statePath, []byte(`{"recent": "not an array", "favorite": [], "variant": {}}`), 0o644)
-
-	err := o.Edit([]string{"llama3.2"})
-	if err != nil {
-		t.Fatalf("Edit with wrong type recent failed: %v", err)
-	}
-
-	// The function should handle this gracefully
-	data, _ := os.ReadFile(statePath)
-	var state map[string]any
-	json.Unmarshal(data, &state)
-
-	// recent should be properly set after setup
-	recent, ok := state["recent"].([]any)
-	if !ok {
-		t.Logf("Note: recent type after setup is %T (documenting behavior)", state["recent"])
-	} else if len(recent) == 0 {
-		t.Logf("Note: recent is empty (documenting behavior)")
-	}
-}
-
-func TestOpenCodeEdit_EmptyModels(t *testing.T) {
-	o := &OpenCode{}
-	tmpDir := t.TempDir()
-	setTestHome(t, tmpDir)
-
-	configDir := filepath.Join(tmpDir, ".config", "opencode")
-	configPath := filepath.Join(configDir, "opencode.json")
-
-	os.MkdirAll(configDir, 0o755)
-	originalContent := `{"provider":{"ollama":{"models":{"existing":{}}}}}`
-	os.WriteFile(configPath, []byte(originalContent), 0o644)
-
-	// Empty models should be no-op
-	err := o.Edit([]string{})
-	if err != nil {
-		t.Fatalf("Edit with empty models failed: %v", err)
-	}
-
-	// Original content should be preserved (file not modified)
-	data, _ := os.ReadFile(configPath)
-	if string(data) != originalContent {
-		t.Errorf("empty models should not modify file, but content changed")
-	}
-}
-
-func TestOpenCodeEdit_SpecialCharsInModelName(t *testing.T) {
-	o := &OpenCode{}
-	tmpDir := t.TempDir()
-	setTestHome(t, tmpDir)
-
-	// Model name with special characters (though unusual)
-	specialModel := `model-with-"quotes"`
-
-	err := o.Edit([]string{specialModel})
-	if err != nil {
-		t.Fatalf("Edit with special chars failed: %v", err)
-	}
-
-	// Verify it was stored correctly
-	configDir := filepath.Join(tmpDir, ".config", "opencode")
-	configPath := filepath.Join(configDir, "opencode.json")
-	data, _ := os.ReadFile(configPath)
-
-	var cfg map[string]any
-	if err := json.Unmarshal(data, &cfg); err != nil {
-		t.Fatalf("resulting config is invalid JSON: %v", err)
-	}
-
-	// Model should be accessible
-	provider, _ := cfg["provider"].(map[string]any)
-	ollama, _ := provider["ollama"].(map[string]any)
-	models, _ := ollama["models"].(map[string]any)
-
-	if models[specialModel] == nil {
-		t.Errorf("model with special chars not found in config")
-	}
-}
-
-func TestOpenCodeModels_NoConfig(t *testing.T) {
-	o := &OpenCode{}
-	tmpDir := t.TempDir()
-	setTestHome(t, tmpDir)
-
-	models := o.Models()
-	if len(models) > 0 {
-		t.Errorf("expected nil/empty for missing config, got %v", models)
-	}
-}
--- a/cmd/config/selector.go
+++ b/cmd/config/selector.go
@@ -1,499 +0,0 @@
-package config
-
-import (
-	"errors"
-	"fmt"
-	"io"
-	"os"
-	"strings"
-
-	"golang.org/x/term"
-)
-
-// ANSI escape sequences for terminal formatting.
-const (
-	ansiHideCursor = "\033[?25l"
-	ansiShowCursor = "\033[?25h"
-	ansiBold       = "\033[1m"
-	ansiReset      = "\033[0m"
-	ansiGray       = "\033[37m"
-	ansiClearDown  = "\033[J"
-)
-
-const maxDisplayedItems = 10
-
-var errCancelled = errors.New("cancelled")
-
-type selectItem struct {
-	Name        string
-	Description string
-}
-
-type inputEvent int
-
-const (
-	eventNone inputEvent = iota
-	eventEnter
-	eventEscape
-	eventUp
-	eventDown
-	eventTab
-	eventBackspace
-	eventChar
-)
-
-type selectState struct {
-	items        []selectItem
-	filter       string
-	selected     int
-	scrollOffset int
-}
-
-func newSelectState(items []selectItem) *selectState {
-	return &selectState{items: items}
-}
-
-func (s *selectState) filtered() []selectItem {
-	return filterItems(s.items, s.filter)
-}
-
-func (s *selectState) handleInput(event inputEvent, char byte) (done bool, result string, err error) {
-	filtered := s.filtered()
-
-	switch event {
-	case eventEnter:
-		if len(filtered) > 0 && s.selected < len(filtered) {
-			return true, filtered[s.selected].Name, nil
-		}
-	case eventEscape:
-		return true, "", errCancelled
-	case eventBackspace:
-		if len(s.filter) > 0 {
-			s.filter = s.filter[:len(s.filter)-1]
-			s.selected = 0
-			s.scrollOffset = 0
-		}
-	case eventUp:
-		if s.selected > 0 {
-			s.selected--
-			if s.selected < s.scrollOffset {
-				s.scrollOffset = s.selected
-			}
-		}
-	case eventDown:
-		if s.selected < len(filtered)-1 {
-			s.selected++
-			if s.selected >= s.scrollOffset+maxDisplayedItems {
-				s.scrollOffset = s.selected - maxDisplayedItems + 1
-			}
-		}
-	case eventChar:
-		s.filter += string(char)
-		s.selected = 0
-		s.scrollOffset = 0
-	}
-
-	return false, "", nil
-}
-
-type multiSelectState struct {
-	items         []selectItem
-	itemIndex     map[string]int
-	filter        string
-	highlighted   int
-	scrollOffset  int
-	checked       map[int]bool
-	checkOrder    []int
-	focusOnButton bool
-}
-
-func newMultiSelectState(items []selectItem, preChecked []string) *multiSelectState {
-	s := &multiSelectState{
-		items:     items,
-		itemIndex: make(map[string]int, len(items)),
-		checked:   make(map[int]bool),
-	}
-
-	for i, item := range items {
-		s.itemIndex[item.Name] = i
-	}
-
-	for _, name := range preChecked {
-		if idx, ok := s.itemIndex[name]; ok {
-			s.checked[idx] = true
-			s.checkOrder = append(s.checkOrder, idx)
-		}
-	}
-
-	return s
-}
-
-func (s *multiSelectState) filtered() []selectItem {
-	return filterItems(s.items, s.filter)
-}
-
-func (s *multiSelectState) toggleItem() {
-	filtered := s.filtered()
-	if len(filtered) == 0 || s.highlighted >= len(filtered) {
-		return
-	}
-
-	item := filtered[s.highlighted]
-	origIdx := s.itemIndex[item.Name]
-
-	if s.checked[origIdx] {
-		delete(s.checked, origIdx)
-		for i, idx := range s.checkOrder {
-			if idx == origIdx {
-				s.checkOrder = append(s.checkOrder[:i], s.checkOrder[i+1:]...)
-				break
-			}
-		}
-	} else {
-		s.checked[origIdx] = true
-		s.checkOrder = append(s.checkOrder, origIdx)
-	}
-}
-
-func (s *multiSelectState) handleInput(event inputEvent, char byte) (done bool, result []string, err error) {
-	filtered := s.filtered()
-
-	switch event {
-	case eventEnter:
-		if s.focusOnButton && len(s.checkOrder) > 0 {
-			var res []string
-			for _, idx := range s.checkOrder {
-				res = append(res, s.items[idx].Name)
-			}
-			return true, res, nil
-		} else if !s.focusOnButton {
-			s.toggleItem()
-		}
-	case eventTab:
-		if len(s.checkOrder) > 0 {
-			s.focusOnButton = !s.focusOnButton
-		}
-	case eventEscape:
-		return true, nil, errCancelled
-	case eventBackspace:
-		if len(s.filter) > 0 {
-			s.filter = s.filter[:len(s.filter)-1]
-			s.highlighted = 0
-			s.scrollOffset = 0
-			s.focusOnButton = false
-		}
-	case eventUp:
-		if s.focusOnButton {
-			s.focusOnButton = false
-		} else if s.highlighted > 0 {
-			s.highlighted--
-			if s.highlighted < s.scrollOffset {
-				s.scrollOffset = s.highlighted
-			}
-		}
-	case eventDown:
-		if s.focusOnButton {
-			s.focusOnButton = false
-		} else if s.highlighted < len(filtered)-1 {
-			s.highlighted++
-			if s.highlighted >= s.scrollOffset+maxDisplayedItems {
-				s.scrollOffset = s.highlighted - maxDisplayedItems + 1
-			}
-		}
-	case eventChar:
-		s.filter += string(char)
-		s.highlighted = 0
-		s.scrollOffset = 0
-		s.focusOnButton = false
-	}
-
-	return false, nil, nil
-}
-
-func (s *multiSelectState) selectedCount() int {
-	return len(s.checkOrder)
-}
-
-// Terminal I/O handling
-
-type terminalState struct {
-	fd       int
-	oldState *term.State
-}
-
-func enterRawMode() (*terminalState, error) {
-	fd := int(os.Stdin.Fd())
-	oldState, err := term.MakeRaw(fd)
-	if err != nil {
-		return nil, err
-	}
-	fmt.Fprint(os.Stderr, ansiHideCursor)
-	return &terminalState{fd: fd, oldState: oldState}, nil
-}
-
-func (t *terminalState) restore() {
-	fmt.Fprint(os.Stderr, ansiShowCursor)
-	term.Restore(t.fd, t.oldState)
-}
-
-func clearLines(n int) {
-	if n > 0 {
-		fmt.Fprintf(os.Stderr, "\033[%dA", n)
-		fmt.Fprint(os.Stderr, ansiClearDown)
-	}
-}
-
-func parseInput(r io.Reader) (inputEvent, byte, error) {
-	buf := make([]byte, 3)
-	n, err := r.Read(buf)
-	if err != nil {
-		return 0, 0, err
-	}
-
-	switch {
-	case n == 1 && buf[0] == 13:
-		return eventEnter, 0, nil
-	case n == 1 && (buf[0] == 3 || buf[0] == 27):
-		return eventEscape, 0, nil
-	case n == 1 && buf[0] == 9:
-		return eventTab, 0, nil
-	case n == 1 && buf[0] == 127:
-		return eventBackspace, 0, nil
-	case n == 3 && buf[0] == 27 && buf[1] == 91 && buf[2] == 65:
-		return eventUp, 0, nil
-	case n == 3 && buf[0] == 27 && buf[1] == 91 && buf[2] == 66:
-		return eventDown, 0, nil
-	case n == 1 && buf[0] >= 32 && buf[0] < 127:
-		return eventChar, buf[0], nil
-	}
-
-	return eventNone, 0, nil
-}
-
-// Rendering
-
-func renderSelect(w io.Writer, prompt string, s *selectState) int {
-	filtered := s.filtered()
-
-	fmt.Fprintf(w, "%s %s\r\n", prompt, s.filter)
-	lineCount := 1
-
-	if len(filtered) == 0 {
-		fmt.Fprintf(w, "  %s(no matches)%s\r\n", ansiGray, ansiReset)
-		lineCount++
-	} else {
-		displayCount := min(len(filtered), maxDisplayedItems)
-
-		for i := range displayCount {
-			idx := s.scrollOffset + i
-			if idx >= len(filtered) {
-				break
-			}
-			item := filtered[idx]
-			prefix := "    "
-			if idx == s.selected {
-				prefix = "  " + ansiBold + "> "
-			}
-			if item.Description != "" {
-				fmt.Fprintf(w, "%s%s%s %s- %s%s\r\n", prefix, item.Name, ansiReset, ansiGray, item.Description, ansiReset)
-			} else {
-				fmt.Fprintf(w, "%s%s%s\r\n", prefix, item.Name, ansiReset)
-			}
-			lineCount++
-		}
-
-		if remaining := len(filtered) - s.scrollOffset - displayCount; remaining > 0 {
-			fmt.Fprintf(w, "  %s... and %d more%s\r\n", ansiGray, remaining, ansiReset)
-			lineCount++
-		}
-	}
-
-	return lineCount
-}
-
-func renderMultiSelect(w io.Writer, prompt string, s *multiSelectState) int {
-	filtered := s.filtered()
-
-	fmt.Fprintf(w, "%s %s\r\n", prompt, s.filter)
-	lineCount := 1
-
-	if len(filtered) == 0 {
-		fmt.Fprintf(w, "  %s(no matches)%s\r\n", ansiGray, ansiReset)
-		lineCount++
-	} else {
-		displayCount := min(len(filtered), maxDisplayedItems)
-
-		for i := range displayCount {
-			idx := s.scrollOffset + i
-			if idx >= len(filtered) {
-				break
-			}
-			item := filtered[idx]
-			origIdx := s.itemIndex[item.Name]
-
-			checkbox := "[ ]"
-			if s.checked[origIdx] {
-				checkbox = "[x]"
-			}
-
-			prefix := "  "
-			suffix := ""
-			if idx == s.highlighted && !s.focusOnButton {
-				prefix = "> "
-			}
-			if len(s.checkOrder) > 0 && s.checkOrder[0] == origIdx {
-				suffix = " " + ansiGray + "(default)" + ansiReset
-			}
-
-			if idx == s.highlighted && !s.focusOnButton {
-				fmt.Fprintf(w, "  %s%s %s %s%s%s\r\n", ansiBold, prefix, checkbox, item.Name, ansiReset, suffix)
-			} else {
-				fmt.Fprintf(w, "  %s %s %s%s\r\n", prefix, checkbox, item.Name, suffix)
-			}
-			lineCount++
-		}
-
-		if remaining := len(filtered) - s.scrollOffset - displayCount; remaining > 0 {
-			fmt.Fprintf(w, "  %s... and %d more%s\r\n", ansiGray, remaining, ansiReset)
-			lineCount++
-		}
-	}
-
-	fmt.Fprintf(w, "\r\n")
-	lineCount++
-	count := s.selectedCount()
-	switch {
-	case count == 0:
-		fmt.Fprintf(w, "  %sSelect at least one model.%s\r\n", ansiGray, ansiReset)
-	case s.focusOnButton:
-		fmt.Fprintf(w, "  %s> [ Continue ]%s %s(%d selected)%s\r\n", ansiBold, ansiReset, ansiGray, count, ansiReset)
-	default:
-		fmt.Fprintf(w, "    %s[ Continue ] (%d selected) - press Tab%s\r\n", ansiGray, count, ansiReset)
-	}
-	lineCount++
-
-	return lineCount
-}
-
-// selectPrompt prompts the user to select a single item from a list.
-func selectPrompt(prompt string, items []selectItem) (string, error) {
-	if len(items) == 0 {
-		return "", fmt.Errorf("no items to select from")
-	}
-
-	ts, err := enterRawMode()
-	if err != nil {
-		return "", err
-	}
-	defer ts.restore()
-
-	state := newSelectState(items)
-	var lastLineCount int
-
-	render := func() {
-		clearLines(lastLineCount)
-		lastLineCount = renderSelect(os.Stderr, prompt, state)
-	}
-
-	render()
-
-	for {
-		event, char, err := parseInput(os.Stdin)
-		if err != nil {
-			return "", err
-		}
-
-		done, result, err := state.handleInput(event, char)
-		if done {
-			clearLines(lastLineCount)
-			if err != nil {
-				return "", err
-			}
-			return result, nil
-		}
-
-		render()
-	}
-}
-
-// multiSelectPrompt prompts the user to select multiple items from a list.
-func multiSelectPrompt(prompt string, items []selectItem, preChecked []string) ([]string, error) {
-	if len(items) == 0 {
-		return nil, fmt.Errorf("no items to select from")
-	}
-
-	ts, err := enterRawMode()
-	if err != nil {
-		return nil, err
-	}
-	defer ts.restore()
-
-	state := newMultiSelectState(items, preChecked)
-	var lastLineCount int
-
-	render := func() {
-		clearLines(lastLineCount)
-		lastLineCount = renderMultiSelect(os.Stderr, prompt, state)
-	}
-
-	render()
-
-	for {
-		event, char, err := parseInput(os.Stdin)
-		if err != nil {
-			return nil, err
-		}
-
-		done, result, err := state.handleInput(event, char)
-		if done {
-			clearLines(lastLineCount)
-			if err != nil {
-				return nil, err
-			}
-			return result, nil
-		}
-
-		render()
-	}
-}
-
-func confirmPrompt(prompt string) (bool, error) {
-	fd := int(os.Stdin.Fd())
-	oldState, err := term.MakeRaw(fd)
-	if err != nil {
-		return false, err
-	}
-	defer term.Restore(fd, oldState)
-
-	fmt.Fprintf(os.Stderr, "%s (\033[1my\033[0m/n) ", prompt)
-
-	buf := make([]byte, 1)
-	for {
-		if _, err := os.Stdin.Read(buf); err != nil {
-			return false, err
-		}
-
-		switch buf[0] {
-		case 'Y', 'y', 13:
-			fmt.Fprintf(os.Stderr, "yes\r\n")
-			return true, nil
-		case 'N', 'n', 27, 3:
-			fmt.Fprintf(os.Stderr, "no\r\n")
-			return false, nil
-		}
-	}
-}
-
-func filterItems(items []selectItem, filter string) []selectItem {
-	if filter == "" {
-		return items
-	}
-	var result []selectItem
-	filterLower := strings.ToLower(filter)
-	for _, item := range items {
-		if strings.Contains(strings.ToLower(item.Name), filterLower) {
-			result = append(result, item)
-		}
-	}
-	return result
-}
--- a/cmd/config/selector_test.go
+++ b/cmd/config/selector_test.go
@@ -1,913 +0,0 @@
-package config
-
-import (
-	"bytes"
-	"strings"
-	"testing"
-)
-
-func TestFilterItems(t *testing.T) {
-	items := []selectItem{
-		{Name: "llama3.2:latest"},
-		{Name: "qwen2.5:7b"},
-		{Name: "deepseek-v3:cloud"},
-		{Name: "GPT-OSS:20b"},
-	}
-
-	t.Run("EmptyFilter_ReturnsAllItems", func(t *testing.T) {
-		result := filterItems(items, "")
-		if len(result) != len(items) {
-			t.Errorf("expected %d items, got %d", len(items), len(result))
-		}
-	})
-
-	t.Run("CaseInsensitive_UppercaseFilterMatchesLowercase", func(t *testing.T) {
-		result := filterItems(items, "LLAMA")
-		if len(result) != 1 || result[0].Name != "llama3.2:latest" {
-			t.Errorf("expected llama3.2:latest, got %v", result)
-		}
-	})
-
-	t.Run("CaseInsensitive_LowercaseFilterMatchesUppercase", func(t *testing.T) {
-		result := filterItems(items, "gpt")
-		if len(result) != 1 || result[0].Name != "GPT-OSS:20b" {
-			t.Errorf("expected GPT-OSS:20b, got %v", result)
-		}
-	})
-
-	t.Run("PartialMatch", func(t *testing.T) {
-		result := filterItems(items, "deep")
-		if len(result) != 1 || result[0].Name != "deepseek-v3:cloud" {
-			t.Errorf("expected deepseek-v3:cloud, got %v", result)
-		}
-	})
-
-	t.Run("NoMatch_ReturnsEmpty", func(t *testing.T) {
-		result := filterItems(items, "nonexistent")
-		if len(result) != 0 {
-			t.Errorf("expected 0 items, got %d", len(result))
-		}
-	})
-}
-
-func TestSelectState(t *testing.T) {
-	items := []selectItem{
-		{Name: "item1"},
-		{Name: "item2"},
-		{Name: "item3"},
-	}
-
-	t.Run("InitialState", func(t *testing.T) {
-		s := newSelectState(items)
-		if s.selected != 0 {
-			t.Errorf("expected selected=0, got %d", s.selected)
-		}
-		if s.filter != "" {
-			t.Errorf("expected empty filter, got %q", s.filter)
-		}
-		if s.scrollOffset != 0 {
-			t.Errorf("expected scrollOffset=0, got %d", s.scrollOffset)
-		}
-	})
-
-	t.Run("Enter_SelectsCurrentItem", func(t *testing.T) {
-		s := newSelectState(items)
-		done, result, err := s.handleInput(eventEnter, 0)
-		if !done || result != "item1" || err != nil {
-			t.Errorf("expected (true, item1, nil), got (%v, %v, %v)", done, result, err)
-		}
-	})
-
-	t.Run("Enter_WithFilter_SelectsFilteredItem", func(t *testing.T) {
-		s := newSelectState(items)
-		s.filter = "item3"
-		done, result, err := s.handleInput(eventEnter, 0)
-		if !done || result != "item3" || err != nil {
-			t.Errorf("expected (true, item3, nil), got (%v, %v, %v)", done, result, err)
-		}
-	})
-
-	t.Run("Enter_EmptyFilteredList_DoesNothing", func(t *testing.T) {
-		s := newSelectState(items)
-		s.filter = "nonexistent"
-		done, result, err := s.handleInput(eventEnter, 0)
-		if done || result != "" || err != nil {
-			t.Errorf("expected (false, '', nil), got (%v, %v, %v)", done, result, err)
-		}
-	})
-
-	t.Run("Escape_ReturnsCancelledError", func(t *testing.T) {
-		s := newSelectState(items)
-		done, result, err := s.handleInput(eventEscape, 0)
-		if !done || result != "" || err != errCancelled {
-			t.Errorf("expected (true, '', errCancelled), got (%v, %v, %v)", done, result, err)
-		}
-	})
-
-	t.Run("Down_MovesSelection", func(t *testing.T) {
-		s := newSelectState(items)
-		s.handleInput(eventDown, 0)
-		if s.selected != 1 {
-			t.Errorf("expected selected=1, got %d", s.selected)
-		}
-	})
-
-	t.Run("Down_AtBottom_StaysAtBottom", func(t *testing.T) {
-		s := newSelectState(items)
-		s.selected = 2
-		s.handleInput(eventDown, 0)
-		if s.selected != 2 {
-			t.Errorf("expected selected=2 (stayed at bottom), got %d", s.selected)
-		}
-	})
-
-	t.Run("Up_MovesSelection", func(t *testing.T) {
-		s := newSelectState(items)
-		s.selected = 2
-		s.handleInput(eventUp, 0)
-		if s.selected != 1 {
-			t.Errorf("expected selected=1, got %d", s.selected)
-		}
-	})
-
-	t.Run("Up_AtTop_StaysAtTop", func(t *testing.T) {
-		s := newSelectState(items)
-		s.handleInput(eventUp, 0)
-		if s.selected != 0 {
-			t.Errorf("expected selected=0 (stayed at top), got %d", s.selected)
-		}
-	})
-
-	t.Run("Char_AppendsToFilter", func(t *testing.T) {
-		s := newSelectState(items)
-		s.handleInput(eventChar, 'i')
-		s.handleInput(eventChar, 't')
-		s.handleInput(eventChar, 'e')
-		s.handleInput(eventChar, 'm')
-		s.handleInput(eventChar, '2')
-		if s.filter != "item2" {
-			t.Errorf("expected filter='item2', got %q", s.filter)
-		}
-		filtered := s.filtered()
-		if len(filtered) != 1 || filtered[0].Name != "item2" {
-			t.Errorf("expected [item2], got %v", filtered)
-		}
-	})
-
-	t.Run("Char_ResetsSelectionToZero", func(t *testing.T) {
-		s := newSelectState(items)
-		s.selected = 2
-		s.handleInput(eventChar, 'x')
-		if s.selected != 0 {
-			t.Errorf("expected selected=0 after typing, got %d", s.selected)
-		}
-	})
-
-	t.Run("Backspace_RemovesLastFilterChar", func(t *testing.T) {
-		s := newSelectState(items)
-		s.filter = "test"
-		s.handleInput(eventBackspace, 0)
-		if s.filter != "tes" {
-			t.Errorf("expected filter='tes', got %q", s.filter)
-		}
-	})
-
-	t.Run("Backspace_EmptyFilter_DoesNothing", func(t *testing.T) {
-		s := newSelectState(items)
-		s.handleInput(eventBackspace, 0)
-		if s.filter != "" {
-			t.Errorf("expected filter='', got %q", s.filter)
-		}
-	})
-
-	t.Run("Backspace_ResetsSelectionToZero", func(t *testing.T) {
-		s := newSelectState(items)
-		s.filter = "test"
-		s.selected = 2
-		s.handleInput(eventBackspace, 0)
-		if s.selected != 0 {
-			t.Errorf("expected selected=0 after backspace, got %d", s.selected)
-		}
-	})
-
-	t.Run("Scroll_DownPastVisibleItems_ScrollsViewport", func(t *testing.T) {
-		// maxDisplayedItems is 10, so with 15 items we need to scroll
-		manyItems := make([]selectItem, 15)
-		for i := range manyItems {
-			manyItems[i] = selectItem{Name: string(rune('a' + i))}
-		}
-		s := newSelectState(manyItems)
-
-		// move down 12 times (past the 10-item viewport)
-		for range 12 {
-			s.handleInput(eventDown, 0)
-		}
-
-		if s.selected != 12 {
-			t.Errorf("expected selected=12, got %d", s.selected)
-		}
-		if s.scrollOffset != 3 {
-			t.Errorf("expected scrollOffset=3 (12-10+1), got %d", s.scrollOffset)
-		}
-	})
-
-	t.Run("Scroll_UpPastScrollOffset_ScrollsViewport", func(t *testing.T) {
-		manyItems := make([]selectItem, 15)
-		for i := range manyItems {
-			manyItems[i] = selectItem{Name: string(rune('a' + i))}
-		}
-		s := newSelectState(manyItems)
-		s.selected = 5
-		s.scrollOffset = 5
-
-		s.handleInput(eventUp, 0)
-
-		if s.selected != 4 {
-			t.Errorf("expected selected=4, got %d", s.selected)
-		}
-		if s.scrollOffset != 4 {
-			t.Errorf("expected scrollOffset=4, got %d", s.scrollOffset)
-		}
-	})
-}
-
-func TestMultiSelectState(t *testing.T) {
-	items := []selectItem{
-		{Name: "item1"},
-		{Name: "item2"},
-		{Name: "item3"},
-	}
-
-	t.Run("InitialState_NoPrechecked", func(t *testing.T) {
-		s := newMultiSelectState(items, nil)
-		if s.highlighted != 0 {
-			t.Errorf("expected highlighted=0, got %d", s.highlighted)
-		}
-		if s.selectedCount() != 0 {
-			t.Errorf("expected 0 selected, got %d", s.selectedCount())
-		}
-		if s.focusOnButton {
-			t.Error("expected focusOnButton=false initially")
-		}
-	})
-
-	t.Run("InitialState_WithPrechecked", func(t *testing.T) {
-		s := newMultiSelectState(items, []string{"item2", "item3"})
-		if s.selectedCount() != 2 {
-			t.Errorf("expected 2 selected, got %d", s.selectedCount())
-		}
-		if !s.checked[1] || !s.checked[2] {
-			t.Error("expected item2 and item3 to be checked")
-		}
-	})
-
-	t.Run("Prechecked_PreservesSelectionOrder", func(t *testing.T) {
-		// order matters: first checked = default model
-		s := newMultiSelectState(items, []string{"item3", "item1"})
-		if len(s.checkOrder) != 2 {
-			t.Fatalf("expected 2 in checkOrder, got %d", len(s.checkOrder))
-		}
-		if s.checkOrder[0] != 2 || s.checkOrder[1] != 0 {
-			t.Errorf("expected checkOrder=[2,0] (item3 first), got %v", s.checkOrder)
-		}
-	})
-
-	t.Run("Prechecked_IgnoresInvalidNames", func(t *testing.T) {
-		s := newMultiSelectState(items, []string{"item1", "nonexistent"})
-		if s.selectedCount() != 1 {
-			t.Errorf("expected 1 selected (nonexistent ignored), got %d", s.selectedCount())
-		}
-	})
-
-	t.Run("Toggle_ChecksUncheckedItem", func(t *testing.T) {
-		s := newMultiSelectState(items, nil)
-		s.toggleItem()
-		if !s.checked[0] {
-			t.Error("expected item1 to be checked after toggle")
-		}
-	})
-
-	t.Run("Toggle_UnchecksCheckedItem", func(t *testing.T) {
-		s := newMultiSelectState(items, []string{"item1"})
-		s.toggleItem()
-		if s.checked[0] {
-			t.Error("expected item1 to be unchecked after toggle")
-		}
-	})
-
-	t.Run("Toggle_RemovesFromCheckOrder", func(t *testing.T) {
-		s := newMultiSelectState(items, []string{"item1", "item2", "item3"})
-		s.highlighted = 1 // toggle item2
-		s.toggleItem()
-
-		if len(s.checkOrder) != 2 {
-			t.Fatalf("expected 2 in checkOrder, got %d", len(s.checkOrder))
-		}
-		// should be [0, 2] (item1, item3) with item2 removed
-		if s.checkOrder[0] != 0 || s.checkOrder[1] != 2 {
-			t.Errorf("expected checkOrder=[0,2], got %v", s.checkOrder)
-		}
-	})
-
-	t.Run("Enter_TogglesWhenNotOnButton", func(t *testing.T) {
-		s := newMultiSelectState(items, nil)
-		s.handleInput(eventEnter, 0)
-		if !s.checked[0] {
-			t.Error("expected item1 to be checked after enter")
-		}
-	})
-
-	t.Run("Enter_OnButton_ReturnsSelection", func(t *testing.T) {
-		s := newMultiSelectState(items, []string{"item2", "item1"})
-		s.focusOnButton = true
-
-		done, result, err := s.handleInput(eventEnter, 0)
-
-		if !done || err != nil {
-			t.Errorf("expected done=true, err=nil, got done=%v, err=%v", done, err)
-		}
-		// result should preserve selection order
-		if len(result) != 2 || result[0] != "item2" || result[1] != "item1" {
-			t.Errorf("expected [item2, item1], got %v", result)
-		}
-	})
-
-	t.Run("Enter_OnButton_EmptySelection_DoesNothing", func(t *testing.T) {
-		s := newMultiSelectState(items, nil)
-		s.focusOnButton = true
-		done, result, err := s.handleInput(eventEnter, 0)
-		if done || result != nil || err != nil {
-			t.Errorf("expected (false, nil, nil), got (%v, %v, %v)", done, result, err)
-		}
-	})
-
-	t.Run("Tab_SwitchesToButton_WhenHasSelection", func(t *testing.T) {
-		s := newMultiSelectState(items, []string{"item1"})
-		s.handleInput(eventTab, 0)
-		if !s.focusOnButton {
-			t.Error("expected focus on button after tab")
-		}
-	})
-
-	t.Run("Tab_DoesNothing_WhenNoSelection", func(t *testing.T) {
-		s := newMultiSelectState(items, nil)
-		s.handleInput(eventTab, 0)
-		if s.focusOnButton {
-			t.Error("tab should not focus button when nothing selected")
-		}
-	})
-
-	t.Run("Tab_TogglesButtonFocus", func(t *testing.T) {
-		s := newMultiSelectState(items, []string{"item1"})
-		s.handleInput(eventTab, 0)
-		if !s.focusOnButton {
-			t.Error("expected focus on button after first tab")
-		}
-		s.handleInput(eventTab, 0)
-		if s.focusOnButton {
-			t.Error("expected focus back on list after second tab")
-		}
-	})
-
-	t.Run("Escape_ReturnsCancelledError", func(t *testing.T) {
-		s := newMultiSelectState(items, []string{"item1"})
-		done, result, err := s.handleInput(eventEscape, 0)
-		if !done || result != nil || err != errCancelled {
-			t.Errorf("expected (true, nil, errCancelled), got (%v, %v, %v)", done, result, err)
-		}
-	})
-
-	t.Run("IsDefault_TrueForFirstChecked", func(t *testing.T) {
-		s := newMultiSelectState(items, []string{"item2", "item1"})
-		if !(len(s.checkOrder) > 0 && s.checkOrder[0] == 1) {
-			t.Error("expected item2 (idx 1) to be default (first checked)")
-		}
-		if len(s.checkOrder) > 0 && s.checkOrder[0] == 0 {
-			t.Error("expected item1 (idx 0) to NOT be default")
-		}
-	})
-
-	t.Run("IsDefault_FalseWhenNothingChecked", func(t *testing.T) {
-		s := newMultiSelectState(items, nil)
-		if len(s.checkOrder) > 0 && s.checkOrder[0] == 0 {
-			t.Error("expected isDefault=false when nothing checked")
-		}
-	})
-
-	t.Run("Down_MovesHighlight", func(t *testing.T) {
-		s := newMultiSelectState(items, nil)
-		s.handleInput(eventDown, 0)
-		if s.highlighted != 1 {
-			t.Errorf("expected highlighted=1, got %d", s.highlighted)
-		}
-	})
-
-	t.Run("Up_MovesHighlight", func(t *testing.T) {
-		s := newMultiSelectState(items, nil)
-		s.highlighted = 1
-		s.handleInput(eventUp, 0)
-		if s.highlighted != 0 {
-			t.Errorf("expected highlighted=0, got %d", s.highlighted)
-		}
-	})
-
-	t.Run("Arrow_ReturnsFocusFromButton", func(t *testing.T) {
-		s := newMultiSelectState(items, []string{"item1"})
-		s.focusOnButton = true
-		s.handleInput(eventDown, 0)
-		if s.focusOnButton {
-			t.Error("expected focus to return to list on arrow key")
-		}
-	})
-
-	t.Run("Char_AppendsToFilter", func(t *testing.T) {
-		s := newMultiSelectState(items, nil)
-		s.handleInput(eventChar, 'x')
-		if s.filter != "x" {
-			t.Errorf("expected filter='x', got %q", s.filter)
-		}
-	})
-
-	t.Run("Char_ResetsHighlightAndScroll", func(t *testing.T) {
-		manyItems := make([]selectItem, 15)
-		for i := range manyItems {
-			manyItems[i] = selectItem{Name: string(rune('a' + i))}
-		}
-		s := newMultiSelectState(manyItems, nil)
-		s.highlighted = 10
-		s.scrollOffset = 5
-
-		s.handleInput(eventChar, 'x')
-
-		if s.highlighted != 0 {
-			t.Errorf("expected highlighted=0, got %d", s.highlighted)
-		}
-		if s.scrollOffset != 0 {
-			t.Errorf("expected scrollOffset=0, got %d", s.scrollOffset)
-		}
-	})
-
-	t.Run("Backspace_RemovesLastFilterChar", func(t *testing.T) {
-		s := newMultiSelectState(items, nil)
-		s.filter = "test"
-		s.handleInput(eventBackspace, 0)
-		if s.filter != "tes" {
-			t.Errorf("expected filter='tes', got %q", s.filter)
-		}
-	})
-
-	t.Run("Backspace_RemovesFocusFromButton", func(t *testing.T) {
-		s := newMultiSelectState(items, []string{"item1"})
-		s.filter = "x"
-		s.focusOnButton = true
-		s.handleInput(eventBackspace, 0)
-		if s.focusOnButton {
-			t.Error("expected focusOnButton=false after backspace")
-		}
-	})
-}
-
-func TestParseInput(t *testing.T) {
-	t.Run("Enter", func(t *testing.T) {
-		event, char, err := parseInput(bytes.NewReader([]byte{13}))
-		if err != nil || event != eventEnter || char != 0 {
-			t.Errorf("expected (eventEnter, 0, nil), got (%v, %v, %v)", event, char, err)
-		}
-	})
-
-	t.Run("Escape", func(t *testing.T) {
-		event, _, err := parseInput(bytes.NewReader([]byte{27}))
-		if err != nil || event != eventEscape {
-			t.Errorf("expected eventEscape, got %v", event)
-		}
-	})
-
-	t.Run("CtrlC_TreatedAsEscape", func(t *testing.T) {
-		event, _, err := parseInput(bytes.NewReader([]byte{3}))
-		if err != nil || event != eventEscape {
-			t.Errorf("expected eventEscape for Ctrl+C, got %v", event)
-		}
-	})
-
-	t.Run("Tab", func(t *testing.T) {
-		event, _, err := parseInput(bytes.NewReader([]byte{9}))
-		if err != nil || event != eventTab {
-			t.Errorf("expected eventTab, got %v", event)
-		}
-	})
-
-	t.Run("Backspace", func(t *testing.T) {
-		event, _, err := parseInput(bytes.NewReader([]byte{127}))
-		if err != nil || event != eventBackspace {
-			t.Errorf("expected eventBackspace, got %v", event)
-		}
-	})
-
-	t.Run("UpArrow", func(t *testing.T) {
-		event, _, err := parseInput(bytes.NewReader([]byte{27, 91, 65}))
-		if err != nil || event != eventUp {
-			t.Errorf("expected eventUp, got %v", event)
-		}
-	})
-
-	t.Run("DownArrow", func(t *testing.T) {
-		event, _, err := parseInput(bytes.NewReader([]byte{27, 91, 66}))
-		if err != nil || event != eventDown {
-			t.Errorf("expected eventDown, got %v", event)
-		}
-	})
-
-	t.Run("PrintableChars", func(t *testing.T) {
-		tests := []struct {
-			name string
-			char byte
-		}{
-			{"lowercase", 'a'},
-			{"uppercase", 'Z'},
-			{"digit", '5'},
-			{"space", ' '},
-			{"tilde", '~'},
-		}
-		for _, tt := range tests {
-			t.Run(tt.name, func(t *testing.T) {
-				event, char, err := parseInput(bytes.NewReader([]byte{tt.char}))
-				if err != nil || event != eventChar || char != tt.char {
-					t.Errorf("expected (eventChar, %q), got (%v, %q)", tt.char, event, char)
-				}
-			})
-		}
-	})
-}
-
-func TestRenderSelect(t *testing.T) {
-	items := []selectItem{
-		{Name: "item1", Description: "first item"},
-		{Name: "item2"},
-	}
-
-	t.Run("ShowsPromptAndItems", func(t *testing.T) {
-		s := newSelectState(items)
-		var buf bytes.Buffer
-		lineCount := renderSelect(&buf, "Select:", s)
-
-		output := buf.String()
-		if !strings.Contains(output, "Select:") {
-			t.Error("expected prompt in output")
-		}
-		if !strings.Contains(output, "item1") {
-			t.Error("expected item1 in output")
-		}
-		if !strings.Contains(output, "first item") {
-			t.Error("expected description in output")
-		}
-		if !strings.Contains(output, "item2") {
-			t.Error("expected item2 in output")
-		}
-		if lineCount != 3 { // 1 prompt + 2 items
-			t.Errorf("expected 3 lines, got %d", lineCount)
-		}
-	})
-
-	t.Run("EmptyFilteredList_ShowsNoMatches", func(t *testing.T) {
-		s := newSelectState(items)
-		s.filter = "xyz"
-		var buf bytes.Buffer
-		renderSelect(&buf, "Select:", s)
-
-		if !strings.Contains(buf.String(), "no matches") {
-			t.Error("expected 'no matches' message")
-		}
-	})
-
-	t.Run("LongList_ShowsRemainingCount", func(t *testing.T) {
-		manyItems := make([]selectItem, 15)
-		for i := range manyItems {
-			manyItems[i] = selectItem{Name: string(rune('a' + i))}
-		}
-		s := newSelectState(manyItems)
-		var buf bytes.Buffer
-		renderSelect(&buf, "Select:", s)
-
-		// 15 items - 10 displayed = 5 more
-		if !strings.Contains(buf.String(), "5 more") {
-			t.Error("expected '5 more' indicator")
-		}
-	})
-}
-
-func TestRenderMultiSelect(t *testing.T) {
-	items := []selectItem{
-		{Name: "item1"},
-		{Name: "item2"},
-	}
-
-	t.Run("ShowsCheckboxes", func(t *testing.T) {
-		s := newMultiSelectState(items, []string{"item1"})
-		var buf bytes.Buffer
-		renderMultiSelect(&buf, "Select:", s)
-
-		output := buf.String()
-		if !strings.Contains(output, "[x]") {
-			t.Error("expected checked checkbox [x]")
-		}
-		if !strings.Contains(output, "[ ]") {
-			t.Error("expected unchecked checkbox [ ]")
-		}
-	})
-
-	t.Run("ShowsDefaultMarker", func(t *testing.T) {
-		s := newMultiSelectState(items, []string{"item1"})
-		var buf bytes.Buffer
-		renderMultiSelect(&buf, "Select:", s)
-
-		if !strings.Contains(buf.String(), "(default)") {
-			t.Error("expected (default) marker for first checked item")
-		}
-	})
-
-	t.Run("ShowsSelectedCount", func(t *testing.T) {
-		s := newMultiSelectState(items, []string{"item1", "item2"})
-		var buf bytes.Buffer
-		renderMultiSelect(&buf, "Select:", s)
-
-		if !strings.Contains(buf.String(), "2 selected") {
-			t.Error("expected '2 selected' in output")
-		}
-	})
-
-	t.Run("NoSelection_ShowsHelperText", func(t *testing.T) {
-		s := newMultiSelectState(items, nil)
-		var buf bytes.Buffer
-		renderMultiSelect(&buf, "Select:", s)
-
-		if !strings.Contains(buf.String(), "Select at least one") {
-			t.Error("expected 'Select at least one' helper text")
-		}
-	})
-}
-
-func TestErrCancelled(t *testing.T) {
-	t.Run("NotNil", func(t *testing.T) {
-		if errCancelled == nil {
-			t.Error("errCancelled should not be nil")
-		}
-	})
-
-	t.Run("Message", func(t *testing.T) {
-		if errCancelled.Error() != "cancelled" {
-			t.Errorf("expected 'cancelled', got %q", errCancelled.Error())
-		}
-	})
-}
-
-// Edge case tests for selector.go
-
-// TestSelectState_SingleItem verifies that single item list works without crash.
-// List with only one item should still work.
-func TestSelectState_SingleItem(t *testing.T) {
-	items := []selectItem{{Name: "only-one"}}
-
-	s := newSelectState(items)
-
-	// Down should do nothing (already at bottom)
-	s.handleInput(eventDown, 0)
-	if s.selected != 0 {
-		t.Errorf("down on single item: expected selected=0, got %d", s.selected)
-	}
-
-	// Up should do nothing (already at top)
-	s.handleInput(eventUp, 0)
-	if s.selected != 0 {
-		t.Errorf("up on single item: expected selected=0, got %d", s.selected)
-	}
-
-	// Enter should select the only item
-	done, result, err := s.handleInput(eventEnter, 0)
-	if !done || result != "only-one" || err != nil {
-		t.Errorf("enter on single item: expected (true, 'only-one', nil), got (%v, %q, %v)", done, result, err)
-	}
-}
-
-// TestSelectState_ExactlyMaxItems verifies boundary condition at maxDisplayedItems.
-// List with exactly maxDisplayedItems items should not scroll.
-func TestSelectState_ExactlyMaxItems(t *testing.T) {
-	items := make([]selectItem, maxDisplayedItems)
-	for i := range items {
-		items[i] = selectItem{Name: string(rune('a' + i))}
-	}
-
-	s := newSelectState(items)
-
-	// Move to last item
-	for range maxDisplayedItems - 1 {
-		s.handleInput(eventDown, 0)
-	}
-
-	if s.selected != maxDisplayedItems-1 {
-		t.Errorf("expected selected=%d, got %d", maxDisplayedItems-1, s.selected)
-	}
-
-	// Should not scroll when exactly at max
-	if s.scrollOffset != 0 {
-		t.Errorf("expected scrollOffset=0 for exactly maxDisplayedItems, got %d", s.scrollOffset)
-	}
-
-	// One more down should do nothing
-	s.handleInput(eventDown, 0)
-	if s.selected != maxDisplayedItems-1 {
-		t.Errorf("down at max: expected selected=%d, got %d", maxDisplayedItems-1, s.selected)
-	}
-}
-
-// TestFilterItems_RegexSpecialChars verifies that filter is literal, not regex.
-// User typing "model.v1" shouldn't match "modelsv1".
-func TestFilterItems_RegexSpecialChars(t *testing.T) {
-	items := []selectItem{
-		{Name: "model.v1"},
-		{Name: "modelsv1"},
-		{Name: "model-v1"},
-	}
-
-	// Filter with dot should only match literal dot
-	result := filterItems(items, "model.v1")
-	if len(result) != 1 {
-		t.Errorf("expected 1 exact match, got %d", len(result))
-	}
-	if len(result) > 0 && result[0].Name != "model.v1" {
-		t.Errorf("expected 'model.v1', got %s", result[0].Name)
-	}
-
-	// Other regex special chars should be literal too
-	items2 := []selectItem{
-		{Name: "test[0]"},
-		{Name: "test0"},
-		{Name: "test(1)"},
-	}
-
-	result2 := filterItems(items2, "test[0]")
-	if len(result2) != 1 || result2[0].Name != "test[0]" {
-		t.Errorf("expected only 'test[0]', got %v", result2)
-	}
-}
-
-// TestMultiSelectState_DuplicateNames documents handling of duplicate item names.
-// itemIndex uses name as key - duplicates cause collision. This documents
-// the current behavior: the last index for a duplicate name is stored
-func TestMultiSelectState_DuplicateNames(t *testing.T) {
-	// Duplicate names - this is an edge case that shouldn't happen in practice
-	items := []selectItem{
-		{Name: "duplicate"},
-		{Name: "duplicate"},
-		{Name: "unique"},
-	}
-
-	s := newMultiSelectState(items, nil)
-
-	// DOCUMENTED BEHAVIOR: itemIndex maps name to LAST index
-	// When there are duplicates, only the last occurrence's index is stored
-	if s.itemIndex["duplicate"] != 1 {
-		t.Errorf("itemIndex should map 'duplicate' to last index (1), got %d", s.itemIndex["duplicate"])
-	}
-
-	// Toggle item at highlighted=0 (first "duplicate")
-	// Due to name collision, toggleItem uses itemIndex["duplicate"] = 1
-	// So it actually toggles the SECOND duplicate item, not the first
-	s.toggleItem()
-
-	// This documents the potentially surprising behavior:
-	// We toggled at highlighted=0, but itemIndex lookup returned 1
-	if !s.checked[1] {
-		t.Error("toggle should check index 1 (due to name collision in itemIndex)")
-	}
-	if s.checked[0] {
-		t.Log("Note: index 0 is NOT checked, even though highlighted=0 (name collision behavior)")
-	}
-}
-
-// TestSelectState_FilterReducesBelowSelection verifies selection resets when filter reduces list.
-// Prevents index-out-of-bounds on next keystroke
-func TestSelectState_FilterReducesBelowSelection(t *testing.T) {
-	items := []selectItem{
-		{Name: "apple"},
-		{Name: "banana"},
-		{Name: "cherry"},
-	}
-
-	s := newSelectState(items)
-	s.selected = 2 // Select "cherry"
-
-	// Type a filter that removes cherry from results
-	s.handleInput(eventChar, 'a') // Filter to "a" - matches "apple" and "banana"
-
-	// Selection should reset to 0
-	if s.selected != 0 {
-		t.Errorf("expected selected=0 after filter, got %d", s.selected)
-	}
-
-	filtered := s.filtered()
-	if len(filtered) != 2 {
-		t.Errorf("expected 2 filtered items, got %d", len(filtered))
-	}
-}
-
-// TestFilterItems_UnicodeCharacters verifies filtering works with UTF-8.
-// Model names might contain unicode characters
-func TestFilterItems_UnicodeCharacters(t *testing.T) {
-	items := []selectItem{
-		{Name: "llama-日本語"},
-		{Name: "模型-chinese"},
-		{Name: "émoji-🦙"},
-		{Name: "regular-model"},
-	}
-
-	t.Run("filter japanese", func(t *testing.T) {
-		result := filterItems(items, "日本")
-		if len(result) != 1 || result[0].Name != "llama-日本語" {
-			t.Errorf("expected llama-日本語, got %v", result)
-		}
-	})
-
-	t.Run("filter chinese", func(t *testing.T) {
-		result := filterItems(items, "模型")
-		if len(result) != 1 || result[0].Name != "模型-chinese" {
-			t.Errorf("expected 模型-chinese, got %v", result)
-		}
-	})
-
-	t.Run("filter emoji", func(t *testing.T) {
-		result := filterItems(items, "🦙")
-		if len(result) != 1 || result[0].Name != "émoji-🦙" {
-			t.Errorf("expected émoji-🦙, got %v", result)
-		}
-	})
-
-	t.Run("filter accented char", func(t *testing.T) {
-		result := filterItems(items, "émoji")
-		if len(result) != 1 || result[0].Name != "émoji-🦙" {
-			t.Errorf("expected émoji-🦙, got %v", result)
-		}
-	})
-}
-
-// TestMultiSelectState_FilterReducesBelowHighlight verifies highlight resets when filter reduces list.
-func TestMultiSelectState_FilterReducesBelowHighlight(t *testing.T) {
-	items := []selectItem{
-		{Name: "apple"},
-		{Name: "banana"},
-		{Name: "cherry"},
-	}
-
-	s := newMultiSelectState(items, nil)
-	s.highlighted = 2 // Highlight "cherry"
-
-	// Type a filter that removes cherry
-	s.handleInput(eventChar, 'a')
-
-	if s.highlighted != 0 {
-		t.Errorf("expected highlighted=0 after filter, got %d", s.highlighted)
-	}
-}
-
-// TestMultiSelectState_EmptyItems verifies handling of empty item list.
-// Empty list should be handled gracefully.
-func TestMultiSelectState_EmptyItems(t *testing.T) {
-	s := newMultiSelectState([]selectItem{}, nil)
-
-	// Toggle should not panic on empty list
-	s.toggleItem()
-
-	if s.selectedCount() != 0 {
-		t.Errorf("expected 0 selected for empty list, got %d", s.selectedCount())
-	}
-
-	// Render should handle empty list
-	var buf bytes.Buffer
-	lineCount := renderMultiSelect(&buf, "Select:", s)
-	if lineCount == 0 {
-		t.Error("renderMultiSelect should produce output even for empty list")
-	}
-	if !strings.Contains(buf.String(), "no matches") {
-		t.Error("expected 'no matches' for empty list")
-	}
-}
-
-// TestSelectState_RenderWithDescriptions verifies rendering items with descriptions.
-func TestSelectState_RenderWithDescriptions(t *testing.T) {
-	items := []selectItem{
-		{Name: "item1", Description: "First item description"},
-		{Name: "item2", Description: ""},
-		{Name: "item3", Description: "Third item"},
-	}
-
-	s := newSelectState(items)
-	var buf bytes.Buffer
-	renderSelect(&buf, "Select:", s)
-
-	output := buf.String()
-	if !strings.Contains(output, "First item description") {
-		t.Error("expected description to be rendered")
-	}
-	if !strings.Contains(output, "item2") {
-		t.Error("expected item without description to be rendered")
-	}
-}
--- a/cmd/interactive.go
+++ b/cmd/interactive.go
@@ -159,7 +159,6 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 			sb.WriteString(before)
 			if !ok {
 				fmt.Fprintln(&sb)
-				scanner.Prompt.UseAlt = true
 				continue
 			}

--- a/convert/convert.go
+++ b/convert/convert.go
@@ -313,8 +313,6 @@ func LoadModelMetadata(fsys fs.FS) (ModelKV, *Tokenizer, error) {
 		conv = &deepseek2Model{}
 	case "Glm4MoeLiteForCausalLM":
 		conv = &glm4MoeLiteModel{}
-	case "GlmOcrForConditionalGeneration":
-		conv = &glmOcrModel{}
 	case "Lfm2ForCausalLM":
 		conv = &lfm2Model{}
 	default:
--- a/convert/convert_glm4moelite.go
+++ b/convert/convert_glm4moelite.go
@@ -6,10 +6,6 @@ import (
 	"log/slog"
 	"regexp"
 	"strconv"
-	"strings"
-
-	"github.com/pdevine/tensor"
-	"github.com/pdevine/tensor/native"

 	"github.com/ollama/ollama/fs/ggml"
 )
@@ -73,9 +69,6 @@ func (p *glm4MoeLiteModel) KV(t *Tokenizer) KV {
 	kv["glm4moelite.rope.dimension_count"] = p.QKRopeHeadDim
 	kv["glm4moelite.rope.freq_base"] = cmp.Or(p.RopeTheta, float32(1000000.0))

-	kv["glm4moelite.attention.key_length_mla"] = p.KVLoraRank + p.QKRopeHeadDim
-	kv["glm4moelite.attention.value_length_mla"] = p.KVLoraRank
-
 	kv["tokenizer.ggml.pre"] = "glm4"

 	return kv
@@ -107,67 +100,6 @@ func (p *glm4MoeLiteModel) Replacements() []string {
 	}
 }

-// repackKVB extracts K or V from the combined KV_B tensor for MLA absorption.
-// K output row-major: [n_head, kv_lora_rank, qk_nope] -> GGML ne[]={qk_nope, kv_lora_rank, n_head}
-// V output row-major: [n_head, v_head, kv_lora_rank] -> GGML ne[]={kv_lora_rank, v_head, n_head}
-func (p *glm4MoeLiteModel) repackKVB(extractK bool, kvFirst bool, numHeads int) Repacker {
-	qkNope := int(p.QKNopeHeadDim)
-	vHeadDim := int(p.VHeadDim)
-	kvLoraRank := int(p.KVLoraRank)
-	kvPerHead := qkNope + vHeadDim
-
-	return func(_ string, data []float32, shape []uint64) ([]float32, error) {
-		dims := make([]int, len(shape))
-		for i := range shape {
-			dims[i] = int(shape[i])
-		}
-
-		var tt tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
-		var err error
-
-		// Normalize to [n_head * (qk_nope + v_head), kv_lora_rank] layout
-		if kvFirst {
-			tt, err = tensor.Transpose(tt, 1, 0)
-			if err != nil {
-				return nil, err
-			}
-			tt = tensor.Materialize(tt)
-		}
-
-		// Reshape to [n_head, qk_nope + v_head, kv_lora_rank]
-		if err := tt.Reshape(numHeads, kvPerHead, kvLoraRank); err != nil {
-			return nil, err
-		}
-
-		if extractK {
-			// Slice K: [n_head, qk_nope, kv_lora_rank]
-			tt, err = tt.Slice(nil, tensor.S(0, qkNope), nil)
-			if err != nil {
-				return nil, err
-			}
-			tt = tensor.Materialize(tt)
-			// Transpose to [n_head, kv_lora_rank, qk_nope]
-			tt, err = tensor.Transpose(tt, 0, 2, 1)
-			if err != nil {
-				return nil, err
-			}
-			tt = tensor.Materialize(tt)
-		} else {
-			// Slice V: [n_head, v_head, kv_lora_rank] - already correct layout
-			tt, err = tt.Slice(nil, tensor.S(qkNope, kvPerHead), nil)
-			if err != nil {
-				return nil, err
-			}
-			tt = tensor.Materialize(tt)
-		}
-
-		if err := tt.Reshape(tt.Shape().TotalSize()); err != nil {
-			return nil, err
-		}
-		return native.VectorF32(tt.(*tensor.Dense))
-	}
-}
-
 func (p *glm4MoeLiteModel) Tensors(s []Tensor) (out []*ggml.Tensor) {
 	merges := make([]merge, p.HiddenLayers*3)
 	for i := range p.HiddenLayers {
@@ -207,52 +139,6 @@ func (p *glm4MoeLiteModel) Tensors(s []Tensor) (out []*ggml.Tensor) {
 			slog.Debug("skipping layer", "name", t.Name())
 			continue
 		}
-
-		// Split attn_kv_b into separate attn_k_b and attn_v_b for MLA absorption
-		if strings.HasSuffix(t.Name(), ".attn_kv_b.weight") {
-			qkNope := int(p.QKNopeHeadDim)
-			vHeadDim := int(p.VHeadDim)
-			kvLoraRank := int(p.KVLoraRank)
-			kvPerHead := qkNope + vHeadDim
-			numHeads := int(p.NumAttentionHeads)
-			kvFirst := true
-			if len(t.Shape()) == 2 {
-				switch {
-				case int(t.Shape()[0]) == kvLoraRank:
-					if kvPerHead > 0 && int(t.Shape()[1])%kvPerHead == 0 {
-						numHeads = int(t.Shape()[1]) / kvPerHead
-					}
-					kvFirst = true
-				case int(t.Shape()[1]) == kvLoraRank:
-					if kvPerHead > 0 && int(t.Shape()[0])%kvPerHead == 0 {
-						numHeads = int(t.Shape()[0]) / kvPerHead
-					}
-					kvFirst = false
-				default:
-					slog.Warn("glm4moelite: unexpected attn_kv_b layout", "name", t.Name(), "shape", t.Shape())
-				}
-			}
-
-			kTensor := t.Clone()
-			kTensor.SetRepacker(p.repackKVB(true, kvFirst, numHeads))
-			out = append(out, &ggml.Tensor{
-				Name:     strings.Replace(t.Name(), "attn_kv_b", "attn_k_b", 1),
-				Kind:     t.Kind(),
-				Shape:    []uint64{uint64(numHeads), uint64(kvLoraRank), uint64(qkNope)},
-				WriterTo: kTensor,
-			})
-
-			vTensor := t.Clone()
-			vTensor.SetRepacker(p.repackKVB(false, kvFirst, numHeads))
-			out = append(out, &ggml.Tensor{
-				Name:     strings.Replace(t.Name(), "attn_kv_b", "attn_v_b", 1),
-				Kind:     t.Kind(),
-				Shape:    []uint64{uint64(numHeads), uint64(vHeadDim), uint64(kvLoraRank)},
-				WriterTo: vTensor,
-			})
-			continue
-		}
-
 		out = append(out, &ggml.Tensor{
 			Name:     t.Name(),
 			Kind:     t.Kind(),
--- a/convert/convert_glmocr.go
+++ b/convert/convert_glmocr.go
@@ -1,469 +0,0 @@
-package convert
-
-import (
-	"cmp"
-	"encoding/json"
-	"io/fs"
-	"log/slog"
-	"regexp"
-	"strconv"
-	"strings"
-
-	"github.com/ollama/ollama/fs/ggml"
-	"github.com/pdevine/tensor"
-	"github.com/pdevine/tensor/native"
-)
-
-// normalToNeoXRepacker creates a repacker that permutes Q/K weights from interleaved (LLaMA)
-// to NeoX ordering for compatibility with GGML's M-RoPE kernel.
-//
-// For weights: reshape [out, in] -> [n_heads, head_dim, in], permute rotary dims, reshape back
-// For biases: reshape [out] -> [n_heads, head_dim], permute rotary dims, reshape back
-func normalToNeoXRepacker(nHeads, headDim int, partialRotaryFactor float32) func(string, []float32, []uint64) ([]float32, error) {
-	return func(_ string, data []float32, shape []uint64) ([]float32, error) {
-		rotaryDim := int(float32(headDim) * partialRotaryFactor)
-		if rotaryDim%2 != 0 {
-			rotaryDim = (rotaryDim / 2) * 2 // Round down to even
-		}
-
-		// Handle 1D (bias) or 2D (weight) tensors
-		is1D := len(shape) == 1
-		var inFeatures int
-		if is1D {
-			inFeatures = 1
-		} else {
-			inFeatures = int(shape[1])
-		}
-		outFeatures := int(shape[0])
-		nEffectiveHeads := outFeatures / headDim
-
-		if nEffectiveHeads != nHeads {
-			slog.Warn("normalToNeoX: unexpected head count", "effective", nEffectiveHeads, "expected", nHeads)
-		}
-
-		// Reshape to [n_heads, head_dim, in_features]
-		reshaped := make([]float32, len(data))
-		copy(reshaped, data)
-
-		// Permute the rotary dimensions: even indices first, then odd
-		// For each head, reorder [0,1,2,3,4,5...] to [0,2,4...,1,3,5...]
-		result := make([]float32, len(data))
-		halfRotary := rotaryDim / 2
-
-		for h := range nEffectiveHeads {
-			for f := range inFeatures {
-				for i := range halfRotary {
-					// Even dim (0, 2, 4, ...) -> position i
-					srcIdx := h*headDim*inFeatures + (2*i)*inFeatures + f
-					dstIdx := h*headDim*inFeatures + i*inFeatures + f
-					result[dstIdx] = reshaped[srcIdx]
-
-					// Odd dim (1, 3, 5, ...) -> position halfRotary + i
-					srcIdx = h*headDim*inFeatures + (2*i+1)*inFeatures + f
-					dstIdx = h*headDim*inFeatures + (halfRotary+i)*inFeatures + f
-					result[dstIdx] = reshaped[srcIdx]
-				}
-
-				// Non-rotary part: copy as-is
-				for i := rotaryDim; i < headDim; i++ {
-					srcIdx := h*headDim*inFeatures + i*inFeatures + f
-					result[srcIdx] = reshaped[srcIdx]
-				}
-			}
-		}
-
-		return result, nil
-	}
-}
-
-type glmOcrModel struct {
-	ModelParameters
-
-	TextConfig struct {
-		HiddenSize          uint32  `json:"hidden_size"`
-		IntermediateSize    uint32  `json:"intermediate_size"`
-		NumHiddenLayers     uint32  `json:"num_hidden_layers"`
-		NumAttentionHeads   uint32  `json:"num_attention_heads"`
-		NumKeyValueHeads    uint32  `json:"num_key_value_heads"`
-		HeadDim             uint32  `json:"head_dim"`
-		MaxPositionEmbed    uint32  `json:"max_position_embeddings"`
-		RMSNormEps          float32 `json:"rms_norm_eps"`
-		PartialRotaryFactor float32 `json:"partial_rotary_factor"`
-		RopeParameters      struct {
-			RopeType            string  `json:"rope_type"`
-			MRopeSection        []int32 `json:"mrope_section"`
-			RopeTheta           float32 `json:"rope_theta"`
-			PartialRotaryFactor float32 `json:"partial_rotary_factor"`
-		} `json:"rope_parameters"`
-	} `json:"text_config"`
-
-	VisionConfig struct {
-		HiddenSize        uint32  `json:"hidden_size"`
-		IntermediateSize  uint32  `json:"intermediate_size"`
-		Depth             uint32  `json:"depth"`
-		NumHeads          uint32  `json:"num_heads"`
-		ImageSize         uint32  `json:"image_size"`
-		PatchSize         uint32  `json:"patch_size"`
-		OutHiddenSize     uint32  `json:"out_hidden_size"`
-		RMSNormEps        float32 `json:"rms_norm_eps"`
-		SpatialMergeSize  uint32  `json:"spatial_merge_size"`
-		TemporalPatchSize uint32  `json:"temporal_patch_size"`
-	} `json:"vision_config"`
-
-	ImageStartTokenID uint32 `json:"image_start_token_id"`
-	ImageEndTokenID   uint32 `json:"image_end_token_id"`
-	VideoStartTokenID uint32 `json:"video_start_token_id"`
-	VideoEndTokenID   uint32 `json:"video_end_token_id"`
-	ImageTokenID      uint32 `json:"image_token_id"`
-	VideoTokenID      uint32 `json:"video_token_id"`
-
-	// Preprocessor config (preprocessor_config.json)
-	Preprocessor struct {
-		Size struct {
-			ShortestEdge uint32 `json:"shortest_edge"`
-			LongestEdge  uint32 `json:"longest_edge"`
-		} `json:"size"`
-		PatchSize         uint32    `json:"patch_size"`
-		TemporalPatchSize uint32    `json:"temporal_patch_size"`
-		MergeSize         uint32    `json:"merge_size"`
-		ImageMean         []float32 `json:"image_mean"`
-		ImageStd          []float32 `json:"image_std"`
-	} `json:"-"`
-}
-
-var _ ModelConverter = (*glmOcrModel)(nil)
-
-func (m *glmOcrModel) parseMore(fsys fs.FS) error {
-	bts, err := fs.ReadFile(fsys, "preprocessor_config.json")
-	if err != nil {
-		return err
-	}
-
-	return json.Unmarshal(bts, &m.Preprocessor)
-}
-
-func (m *glmOcrModel) KV(t *Tokenizer) KV {
-	kv := m.ModelParameters.KV(t)
-	kv["general.architecture"] = "glmocr"
-
-	// Text model parameters
-	kv["glmocr.block_count"] = cmp.Or(m.TextConfig.NumHiddenLayers, 16)
-	kv["glmocr.embedding_length"] = cmp.Or(m.TextConfig.HiddenSize, 1536)
-	kv["glmocr.attention.head_count"] = cmp.Or(m.TextConfig.NumAttentionHeads, 16)
-	kv["glmocr.attention.head_count_kv"] = cmp.Or(m.TextConfig.NumKeyValueHeads, 8)
-	headDim := cmp.Or(m.TextConfig.HeadDim, m.TextConfig.HiddenSize/m.TextConfig.NumAttentionHeads)
-	kv["glmocr.attention.key_length"] = headDim
-	kv["glmocr.attention.value_length"] = headDim
-	kv["glmocr.feed_forward_length"] = cmp.Or(m.TextConfig.IntermediateSize, 4608)
-	kv["glmocr.attention.layer_norm_rms_epsilon"] = cmp.Or(m.TextConfig.RMSNormEps, 1e-5)
-	kv["glmocr.context_length"] = cmp.Or(m.TextConfig.MaxPositionEmbed, 131072)
-	kv["glmocr.rope.freq_base"] = cmp.Or(m.TextConfig.RopeParameters.RopeTheta, float32(10000))
-	kv["glmocr.rope.partial_rotary_factor"] = cmp.Or(m.TextConfig.RopeParameters.PartialRotaryFactor, m.TextConfig.PartialRotaryFactor, float32(1.0))
-	if len(m.TextConfig.RopeParameters.MRopeSection) > 0 {
-		kv["glmocr.rope.mrope_section"] = m.TextConfig.RopeParameters.MRopeSection
-	}
-
-	// Vision model parameters
-	kv["glmocr.vision.block_count"] = cmp.Or(m.VisionConfig.Depth, 24)
-	kv["glmocr.vision.embedding_length"] = cmp.Or(m.VisionConfig.HiddenSize, 1024)
-	kv["glmocr.vision.attention.head_count"] = cmp.Or(m.VisionConfig.NumHeads, 16)
-	kv["glmocr.vision.image_size"] = cmp.Or(m.VisionConfig.ImageSize, 336)
-	kv["glmocr.vision.patch_size"] = cmp.Or(m.VisionConfig.PatchSize, m.Preprocessor.PatchSize, 14)
-	kv["glmocr.vision.spatial_merge_size"] = cmp.Or(m.VisionConfig.SpatialMergeSize, m.Preprocessor.MergeSize, 2)
-	kv["glmocr.vision.temporal_patch_size"] = cmp.Or(m.VisionConfig.TemporalPatchSize, m.Preprocessor.TemporalPatchSize, 2)
-	kv["glmocr.vision.out_hidden_size"] = cmp.Or(m.VisionConfig.OutHiddenSize, 1536)
-	kv["glmocr.vision.intermediate_size"] = cmp.Or(m.VisionConfig.IntermediateSize, 4096)
-	kv["glmocr.vision.attention.layer_norm_rms_epsilon"] = cmp.Or(m.VisionConfig.RMSNormEps, 1e-5)
-
-	// Preprocessor-derived image settings (min/max pixels and normalization)
-	// Note: fs.Config.keyValue() auto-prepends architecture prefix, so use full key
-	if m.Preprocessor.Size.ShortestEdge > 0 {
-		kv["glmocr.vision.min_pixels"] = m.Preprocessor.Size.ShortestEdge
-	}
-	if m.Preprocessor.Size.LongestEdge > 0 {
-		kv["glmocr.vision.max_pixels"] = m.Preprocessor.Size.LongestEdge
-	}
-	if len(m.Preprocessor.ImageMean) == 3 {
-		kv["glmocr.vision.image_mean"] = m.Preprocessor.ImageMean
-	}
-	if len(m.Preprocessor.ImageStd) == 3 {
-		kv["glmocr.vision.image_std"] = m.Preprocessor.ImageStd
-	}
-
-	// Special tokens
-	kv["glmocr.image_token_id"] = m.ImageTokenID
-	kv["glmocr.image_start_token_id"] = m.ImageStartTokenID
-	kv["glmocr.image_end_token_id"] = m.ImageEndTokenID
-	kv["glmocr.video_token_id"] = m.VideoTokenID
-	kv["glmocr.video_start_token_id"] = m.VideoStartTokenID
-	kv["glmocr.video_end_token_id"] = m.VideoEndTokenID
-
-	return kv
-}
-
-func (m *glmOcrModel) Tensors(ts []Tensor) []*ggml.Tensor {
-	var out []*ggml.Tensor
-
-	// Skip layers >= num_hidden_layers (Multi-Token Prediction layers not needed for basic inference)
-	numLayers := int(cmp.Or(m.TextConfig.NumHiddenLayers, 16))
-	skipLayer := func(name string) bool {
-		// Tensor names are already replaced to "blk.N.xxx" format
-		re := regexp.MustCompile(`^blk\.(\d+)`)
-		matches := re.FindStringSubmatch(name)
-		if matches == nil {
-			return false
-		}
-		blkNum, err := strconv.Atoi(matches[1])
-		if err != nil {
-			return false
-		}
-		return blkNum >= numLayers
-	}
-
-	for _, t := range ts {
-		name := t.Name()
-
-		// Skip next-n prediction layers (layers >= num_hidden_layers)
-		if skipLayer(name) {
-			continue
-		}
-
-		// Split ffn_gate_up into separate gate and up projections
-		if strings.Contains(name, "ffn_gate_up") {
-			for t := range splitDim(t, 0,
-				split{Replacer: strings.NewReplacer("ffn_gate_up", "ffn_gate")},
-				split{Replacer: strings.NewReplacer("ffn_gate_up", "ffn_up")},
-			) {
-				out = append(out, t)
-			}
-			continue
-		}
-
-		// Split 5D Conv3D patch_embed weight into two Conv2D weights along temporal dimension
-		// Shape: [out_channels, in_channels, temporal=2, height, width] -> 2x [out_channels, in_channels, height, width]
-		// NOTE: Tensor names are already renamed via Replacements() before Tensors() is called,
-		// so we check for "patch_embd" (renamed) not "patch_embed" (original safetensors name)
-		// NOTE: Ollama Conv2D expects PyTorch format [OC, IC, KH, KW] - no transpose needed
-		if strings.HasSuffix(name, "patch_embd.weight") {
-			shape := t.Shape()
-			if len(shape) == 5 && shape[2] == 2 {
-				// Original shape: [OC, IC, 2, KH, KW] -> [OC, IC, KH, KW] (PyTorch format, no transpose)
-				newShape := []uint64{shape[0], shape[1], shape[3], shape[4]}
-
-				// Create repacker for first temporal slice (t=0)
-				t0 := t.Clone()
-				t0.SetRepacker(func(_ string, data []float32, shape []uint64) ([]float32, error) {
-					dims := make([]int, len(shape))
-					for i := range shape {
-						dims[i] = int(shape[i])
-					}
-					var tt tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
-					// Slice first temporal frame: [:, :, 0, :, :]
-					tt, err := tt.Slice(nil, nil, tensor.S(0, 1), nil, nil)
-					if err != nil {
-						return nil, err
-					}
-					tt = tensor.Materialize(tt)
-					// Reshape to 4D by squeezing temporal dim [OC, IC, 1, KH, KW] -> [OC, IC, KH, KW]
-					newDims := []int{int(shape[0]), int(shape[1]), int(shape[3]), int(shape[4])}
-					if err := tt.Reshape(newDims...); err != nil {
-						return nil, err
-					}
-					// No transpose - keep PyTorch format
-					if err := tt.Reshape(tt.Shape().TotalSize()); err != nil {
-						return nil, err
-					}
-					return native.VectorF32(tt.(*tensor.Dense))
-				})
-				out = append(out, &ggml.Tensor{
-					Name:     strings.Replace(name, "patch_embd.weight", "patch_embd_0.weight", 1),
-					Kind:     t.Kind(),
-					Shape:    newShape,
-					WriterTo: t0,
-				})
-
-				// Create repacker for second temporal slice (t=1)
-				t1 := t.Clone()
-				t1.SetRepacker(func(_ string, data []float32, shape []uint64) ([]float32, error) {
-					dims := make([]int, len(shape))
-					for i := range shape {
-						dims[i] = int(shape[i])
-					}
-					var tt tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
-					// Slice second temporal frame: [:, :, 1, :, :]
-					tt, err := tt.Slice(nil, nil, tensor.S(1, 2), nil, nil)
-					if err != nil {
-						return nil, err
-					}
-					tt = tensor.Materialize(tt)
-					// Reshape to 4D by squeezing temporal dim [OC, IC, 1, KH, KW] -> [OC, IC, KH, KW]
-					newDims := []int{int(shape[0]), int(shape[1]), int(shape[3]), int(shape[4])}
-					if err := tt.Reshape(newDims...); err != nil {
-						return nil, err
-					}
-					// No transpose - keep PyTorch format
-					if err := tt.Reshape(tt.Shape().TotalSize()); err != nil {
-						return nil, err
-					}
-					return native.VectorF32(tt.(*tensor.Dense))
-				})
-				out = append(out, &ggml.Tensor{
-					Name:     strings.Replace(name, "patch_embd.weight", "patch_embd_1.weight", 1),
-					Kind:     t.Kind(),
-					Shape:    newShape,
-					WriterTo: t1,
-				})
-
-				continue
-			}
-
-			if len(shape) == 4 {
-				out = append(out, &ggml.Tensor{
-					Name:     strings.Replace(name, "patch_embd.weight", "patch_embd_0.weight", 1),
-					Kind:     t.Kind(),
-					Shape:    t.Shape(),
-					WriterTo: t,
-				})
-				continue
-			}
-
-			slog.Warn("glmocr: patch_embed weight has unexpected shape - not splitting", "shape", shape)
-			// Fall through to default handling
-		}
-
-		// Handle pre-split patch embedding weights
-		// Pattern 1: v.patch_embd.0.weight, v.patch_embd.1.weight -> patch_embd_0.weight, patch_embd_1.weight
-		// Pattern 2: v.patch_embd.weight.0, v.patch_embd.weight.1 -> patch_embd_0.weight, patch_embd_1.weight
-		if strings.Contains(name, "patch_embd.0.") {
-			out = append(out, &ggml.Tensor{
-				Name:     strings.Replace(name, "patch_embd.0.", "patch_embd_0.", 1),
-				Kind:     t.Kind(),
-				Shape:    t.Shape(),
-				WriterTo: t,
-			})
-			continue
-		}
-		if strings.Contains(name, "patch_embd.1.") {
-			out = append(out, &ggml.Tensor{
-				Name:     strings.Replace(name, "patch_embd.1.", "patch_embd_1.", 1),
-				Kind:     t.Kind(),
-				Shape:    t.Shape(),
-				WriterTo: t,
-			})
-			continue
-		}
-		// Handle .weight.0 and .weight.1 suffix patterns
-		if strings.HasSuffix(name, "patch_embd.weight.0") {
-			out = append(out, &ggml.Tensor{
-				Name:     strings.Replace(name, "patch_embd.weight.0", "patch_embd_0.weight", 1),
-				Kind:     t.Kind(),
-				Shape:    t.Shape(),
-				WriterTo: t,
-			})
-			continue
-		}
-		if strings.HasSuffix(name, "patch_embd.weight.1") {
-			out = append(out, &ggml.Tensor{
-				Name:     strings.Replace(name, "patch_embd.weight.1", "patch_embd_1.weight", 1),
-				Kind:     t.Kind(),
-				Shape:    t.Shape(),
-				WriterTo: t,
-			})
-			continue
-		}
-
-		// Permute Q/K weights for M-RoPE compatibility (interleaved -> NeoX ordering)
-		// GGML's M-RoPE kernel uses NeoX-style rotation, but GLM-OCR uses interleaved (LLaMA-style)
-		// We permute at conversion time so the weights work correctly with GGML's kernel
-		// This aligns Q/K rotary dimensions with GGML's NeoX-style rotation
-		if len(m.TextConfig.RopeParameters.MRopeSection) > 0 &&
-			strings.Contains(name, "blk.") && (strings.Contains(name, "attn_q.") || strings.Contains(name, "attn_k.")) {
-			// Get config values for permutation
-			nHeads := int(cmp.Or(m.TextConfig.NumAttentionHeads, 16))
-			nKVHeads := int(cmp.Or(m.TextConfig.NumKeyValueHeads, 8))
-			hiddenSize := int(cmp.Or(m.TextConfig.HiddenSize, 1536))
-			headDim := int(cmp.Or(m.TextConfig.HeadDim, uint32(hiddenSize/nHeads)))
-			partialRotaryFactor := cmp.Or(m.TextConfig.PartialRotaryFactor, m.TextConfig.RopeParameters.PartialRotaryFactor, float32(1.0))
-
-			// Use appropriate head count: nHeads for Q, nKVHeads for K
-			effectiveHeads := nHeads
-			if strings.Contains(name, "attn_k.") {
-				effectiveHeads = nKVHeads
-			}
-
-			permutedT := t.Clone()
-			permutedT.SetRepacker(normalToNeoXRepacker(effectiveHeads, headDim, partialRotaryFactor))
-			out = append(out, &ggml.Tensor{
-				Name:     name,
-				Kind:     t.Kind(),
-				Shape:    t.Shape(),
-				WriterTo: permutedT,
-			})
-			continue
-		}
-
-		out = append(out, &ggml.Tensor{
-			Name:     name,
-			Kind:     t.Kind(),
-			Shape:    t.Shape(),
-			WriterTo: t,
-		})
-	}
-
-	return out
-}
-
-func (m *glmOcrModel) Replacements() []string {
-	return []string{
-		// Vision encoder
-		"model.visual.patch_embed.proj_1", "v.patch_embd_1", // Second temporal split
-		"model.visual.patch_embed.proj", "v.patch_embd",
-		"model.visual.blocks", "v.blk",
-		"model.visual.post_layernorm", "v.post_ln",
-		"model.visual.downsample", "mm.patch_merger",
-
-		// Vision attention
-		"attn.qkv", "attn_qkv",
-		"attn.proj", "attn_out",
-		"attn.q_norm", "attn_q_norm",
-		"attn.k_norm", "attn_k_norm",
-
-		// Vision norms
-		"norm1", "ln1",
-		"norm2", "ln2",
-
-		// Vision MLP
-		"mlp.gate_proj", "ffn_gate",
-		"mlp.up_proj", "ffn_up",
-		"mlp.down_proj", "ffn_down",
-
-		// Merger (multimodal projector)
-		"model.visual.merger.proj", "mm.model.fc",
-		"model.visual.merger.post_projection_norm", "mm.post_norm",
-		"model.visual.merger.gate_proj", "mm.gate",
-		"model.visual.merger.up_proj", "mm.up",
-		"model.visual.merger.down_proj", "mm.down",
-
-		// Language model
-		"model.language_model.embed_tokens", "token_embd",
-		"model.language_model.layers", "blk",
-		"model.language_model.norm", "output_norm",
-		"lm_head", "output",
-
-		// Language model attention
-		"self_attn.q_proj", "attn_q",
-		"self_attn.k_proj", "attn_k",
-		"self_attn.v_proj", "attn_v",
-		"self_attn.o_proj", "attn_out",
-
-		// Language model norms
-		"input_layernorm", "attn_norm",
-		"post_attention_layernorm", "ffn_norm",
-		"post_self_attn_layernorm", "post_attn_norm",
-		"post_mlp_layernorm", "post_ffn_norm",
-
-		// Language model MLP (remove mlp. prefix so ffn_* names work)
-		"mlp.gate_up_proj", "ffn_gate_up",
-		"mlp.down_proj", "ffn_down",
-	}
-}
--- a/convert/reader_safetensors.go
+++ b/convert/reader_safetensors.go
@@ -99,7 +99,6 @@ func (st safetensor) Kind() uint32 {
 	if st.dtype == "BF16" &&
 		!strings.HasPrefix(st.name, "v.") &&
 		!strings.HasPrefix(st.name, "s.") &&
-		!strings.HasPrefix(st.name, "mm.") &&
 		kind != tensorKindFP32 {
 		kind = tensorKindBF16
 	}
--- a/docs/api/anthropic-compatibility.mdx
+++ b/docs/api/anthropic-compatibility.mdx
@@ -4,6 +4,16 @@ title: Anthropic compatibility

 Ollama provides compatibility with the [Anthropic Messages API](https://docs.anthropic.com/en/api/messages) to help connect existing applications to Ollama, including tools like Claude Code.

+## Recommended models
+
+For coding use cases, models like `glm-4.7:cloud`, `minimax-m2.1:cloud`, and `qwen3-coder` are recommended.
+
+Pull a model before use:
+```shell
+ollama pull qwen3-coder
+ollama pull glm-4.7:cloud
+```
+
 ## Usage

 ### Environment variables
@@ -12,8 +22,8 @@ To use Ollama with tools that expect the Anthropic API (like Claude Code), set t

 ```shell
 export ANTHROPIC_AUTH_TOKEN=ollama  # required but ignored
-export ANTHROPIC_API_KEY="" # required but ignored
 export ANTHROPIC_BASE_URL=http://localhost:11434
+export ANTHROPIC_API_KEY=ollama  # required but ignored
 ```

 ### Simple `/v1/messages` example
@@ -235,41 +245,10 @@ curl -X POST http://localhost:11434/v1/messages \

 ## Using with Claude Code

-[Claude Code](https://code.claude.com/docs/en/overview) can be configured to use Ollama as its backend. 
-
-### Recommended models
-
-For coding use cases, models like `glm-4.7`, `minimax-m2.1`, and `qwen3-coder` are recommended.
-
-Download a model before use:
+[Claude Code](https://code.claude.com/docs/en/overview) can be configured to use Ollama as its backend:

 ```shell
-ollama pull qwen3-coder
-```
-> Note: Qwen 3 coder is a 30B parameter model requiring at least 24GB of VRAM to run smoothly. More is required for longer context lengths. 
-
-```shell
-ollama pull glm-4.7:cloud
-```
-
-### Quick setup
-
-```shell
-ollama launch claude
-```
-
-This will prompt you to select a model, configure Claude Code automatically, and launch it. To configure without launching:
-
-```shell
-ollama launch claude --config
-```
-
-### Manual setup
-
-Set the environment variables and run Claude Code:
-
-```shell
-ANTHROPIC_AUTH_TOKEN=ollama ANTHROPIC_BASE_URL=http://localhost:11434 ANTHROPIC_API_KEY="" claude --model qwen3-coder
+ANTHROPIC_AUTH_TOKEN=ollama ANTHROPIC_BASE_URL=http://localhost:11434 ANTHROPIC_API_KEY=ollama claude --model qwen3-coder
 ```

 Or set the environment variables in your shell profile:
@@ -277,13 +256,19 @@ Or set the environment variables in your shell profile:
 ```shell
 export ANTHROPIC_AUTH_TOKEN=ollama
 export ANTHROPIC_BASE_URL=http://localhost:11434
-export ANTHROPIC_API_KEY=""
+export ANTHROPIC_API_KEY=ollama
 ```

 Then run Claude Code with any Ollama model:

 ```shell
+# Local models
 claude --model qwen3-coder
+claude --model gpt-oss:20b
+
+# Cloud models
+claude --model glm-4.7:cloud
+claude --model minimax-m2.1:cloud
 ```

 ## Endpoints
--- a/docs/cli.mdx
+++ b/docs/cli.mdx
@@ -8,47 +8,6 @@ title: CLI Reference
 ollama run gemma3
 ```

-### Launch integrations
-
-```
-ollama launch
-```
-
-Configure and launch external applications to use Ollama models. This provides an interactive way to set up and start integrations with supported apps.
-
-#### Supported integrations
-
- **OpenCode** - Open-source coding assistant
- **Claude Code** - Anthropic's agentic coding tool
- **Codex** - OpenAI's coding assistant
- **Droid** - Factory's AI coding agent
-
-#### Examples
-
-Launch an integration interactively:
-
-```
-ollama launch
-```
-
-Launch a specific integration:
-
-```
-ollama launch claude
-```
-
-Launch with a specific model:
-
-```
-ollama launch claude --model qwen3-coder
-```
-
-Configure without launching:
-
-```
-ollama launch droid --config
-```
-
 #### Multiline input

 For multiline input, you can wrap text with `"""`:
--- a/docs/cloud.mdx
+++ b/docs/cloud.mdx
@@ -3,6 +3,8 @@ title: Cloud
 sidebarTitle: Cloud
 ---

+<Info>Ollama's cloud is currently in preview.</Info>
+
 ## Cloud Models

 Ollama's cloud models are a new kind of model in Ollama that can run without a powerful GPU. Instead, cloud models are automatically offloaded to Ollama's cloud service while offering the same capabilities as local models, making it possible to keep using your local tools while running larger models that wouldn't fit on a personal computer.
--- a/docs/context-length.mdx
+++ b/docs/context-length.mdx
@@ -8,7 +8,7 @@ Context length is the maximum number of tokens that the model has access to in m
  The default context length in Ollama is 4096 tokens.
 </Note>

-Tasks which require large context like web search, agents, and coding tools should be set to at least 64000 tokens.
+For tasks that require a large context, such as web search, agents, and coding tools, set the context length to at least 32000 tokens.

 ## Setting context length

@@ -24,7 +24,7 @@ Change the slider in the Ollama app under settings to your desired context lengt
 ### CLI
 If editing the context length for Ollama is not possible, the context length can also be updated when serving Ollama.  
 ```
-OLLAMA_CONTEXT_LENGTH=64000 ollama serve
+OLLAMA_CONTEXT_LENGTH=32000 ollama serve
 ```

 ### Check allocated context length and model offloading
--- a/docs/docs.json
+++ b/docs/docs.json
@@ -102,19 +102,18 @@
            "group": "Integrations",
            "pages": [
              "/integrations/claude-code",
-              "/integrations/cline",
+              "/integrations/vscode",
+              "/integrations/jetbrains",
              "/integrations/codex",
+              "/integrations/cline",
              "/integrations/droid",
              "/integrations/goose",
-              "/integrations/jetbrains",
-              "/integrations/marimo",
-              "/integrations/n8n",
-              "/integrations/onyx",
-              "/integrations/opencode",
+              "/integrations/zed",
              "/integrations/roo-code",
-              "/integrations/vscode",
+              "/integrations/n8n",
              "/integrations/xcode",
-              "/integrations/zed"
+              "/integrations/onyx",
+              "/integrations/marimo"
            ]
          },
          {
--- a/docs/index.mdx
+++ b/docs/index.mdx
@@ -9,7 +9,7 @@ sidebarTitle: Welcome

 <CardGroup cols={2}>
  <Card title="Quickstart" icon="rocket" href="/quickstart">
-    Get up and running with your first model or integrate Ollama with your favorite tools
+    Get up and running with your first model
  </Card>
  <Card
    title="Download Ollama"
--- a/docs/integrations/claude-code.mdx
+++ b/docs/integrations/claude-code.mdx
@@ -2,9 +2,9 @@
 title: Claude Code
 ---

-Claude Code is Anthropic's agentic coding tool that can read, modify, and execute code in your working directory. 
+Claude Code is Anthropic's agentic coding tool that can read, modify, and execute code in your working directory.

-Open models can be used with Claude Code through Ollama's Anthropic-compatible API, enabling you to use models such as `glm-4.7`, `qwen3-coder`, `gpt-oss`. 
+Open models can be used with Claude Code through Ollama's Anthropic-compatible API, enabling you to use models such as `qwen3-coder` and `gpt-oss:20b`.

 ![Claude Code with Ollama](https://files.ollama.com/claude-code.png)

@@ -26,19 +26,15 @@ irm https://claude.ai/install.ps1 | iex

 ## Usage with Ollama

-### Quick setup
+Configure Claude Code to use Ollama:

 ```shell
-ollama launch claude
+ollama config claude
 ```

-To configure without launching:
+This will prompt you to select a model and automatically configure Claude Code to use Ollama.

-```shell
-ollama launch claude --config
-```
-
-### Manual setup
+<Accordion title="Manual Configuration">

 Claude Code connects to Ollama using the Anthropic-compatible API.

@@ -46,7 +42,6 @@ Claude Code connects to Ollama using the Anthropic-compatible API.

 ```shell
 export ANTHROPIC_AUTH_TOKEN=ollama
-export ANTHROPIC_API_KEY=""
 export ANTHROPIC_BASE_URL=http://localhost:11434
 ```

@@ -59,17 +54,37 @@ claude --model gpt-oss:20b
 Or run with environment variables inline:

 ```shell
-ANTHROPIC_AUTH_TOKEN=ollama ANTHROPIC_BASE_URL=http://localhost:11434 ANTHROPIC_API_KEY="" claude --model qwen3-coder 
+ANTHROPIC_AUTH_TOKEN=ollama ANTHROPIC_BASE_URL=http://localhost:11434 claude --model gpt-oss:20b
 ```

-**Note:** Claude Code requires a large context window. We recommend at least 64k tokens. See the [context length documentation](/context-length) for how to adjust context length in Ollama.
+</Accordion>
+
+<Note>Claude Code requires a large context window. We recommend at least 32K tokens. See the [context length documentation](/context-length) for how to adjust context length in Ollama.</Note>
+
+## Connecting to ollama.com
+
+1. Create an [API key](https://ollama.com/settings/keys) on ollama.com
+2. Set the environment variables:
+
+```shell
+export ANTHROPIC_BASE_URL=https://ollama.com
+export ANTHROPIC_API_KEY=<your-api-key>
+```
+
+3. Run Claude Code with a cloud model:
+
+```shell
+claude --model glm-4.7:cloud
+```

 ## Recommended Models

- `qwen3-coder` 
- `glm-4.7`
- `gpt-oss:20b`
- `gpt-oss:120b`
-
-Cloud models are also available at [ollama.com/search?c=cloud](https://ollama.com/search?c=cloud).
+### Cloud models
+- `glm-4.7:cloud` - High-performance cloud model
+- `minimax-m2.1:cloud` - Fast cloud model
+- `qwen3-coder:480b-cloud` - Large coding model

+### Local models
+- `qwen3-coder` - Excellent for coding tasks
+- `gpt-oss:20b` - Strong general-purpose model
+- `gpt-oss:120b` - Larger general-purpose model for more complex tasks
--- a/docs/integrations/codex.mdx
+++ b/docs/integrations/codex.mdx
@@ -2,36 +2,31 @@
 title: Codex
 ---

+Codex is OpenAI's agentic coding tool for the command line.

 ## Install

 Install the [Codex CLI](https://developers.openai.com/codex/cli/):

-```
+```shell
 npm install -g @openai/codex
 ```

 ## Usage with Ollama

-<Note>Codex requires a larger context window. It is recommended to use a context window of at least 64k tokens.</Note>
-
-### Quick setup
-
-```
-ollama launch codex
-```
-
-To configure without launching:
+Configure Codex to use Ollama:

 ```shell
-ollama launch codex --config
+ollama config codex
 ```

-### Manual setup
+This will prompt you to select a model and automatically configure Codex to use Ollama.
+
+<Accordion title="Manual Configuration">

 To use `codex` with Ollama, use the `--oss` flag:

-```
+```shell
 codex --oss
 ```

@@ -39,20 +34,22 @@ codex --oss

 By default, codex will use the local `gpt-oss:20b` model. However, you can specify a different model with the `-m` flag:

-```
+```shell
 codex --oss -m gpt-oss:120b
 ```

 ### Cloud Models

-```
+```shell
 codex --oss -m gpt-oss:120b-cloud
 ```

+</Accordion>
+
+<Note>Codex requires a larger context window. It is recommended to use a context window of at least 32K tokens.</Note>

 ## Connecting to ollama.com

-
 Create an [API key](https://ollama.com/settings/keys) from ollama.com and export it as `OLLAMA_API_KEY`.

 To use ollama.com directly, edit your `~/.codex/config.toml` file to point to ollama.com.
--- a/docs/integrations/droid.mdx
+++ b/docs/integrations/droid.mdx
@@ -2,6 +2,7 @@
 title: Droid
 ---

+Droid is Factory's agentic coding tool for the command line.

 ## Install

@@ -11,77 +12,77 @@ Install the [Droid CLI](https://factory.ai/):
 curl -fsSL https://app.factory.ai/cli | sh
 ```

-<Note>Droid requires a larger context window. It is recommended to use a context window of at least 64k tokens. See [Context length](/context-length) for more information.</Note>
-
 ## Usage with Ollama

-### Quick setup
-
-```bash
-ollama launch droid
-```
-
-To configure without launching:
+Configure Droid to use Ollama:

 ```shell
-ollama launch droid --config
+ollama config droid
 ```

-### Manual setup
+This will prompt you to select models and automatically configure Droid to use Ollama.

-Add a local configuration block to `~/.factory/config.json`:
+<Accordion title="Manual Configuration">
+
+Add a local configuration block to `~/.factory/settings.json`:

 ```json
 {
-  "custom_models": [
+  "customModels": [
    {
-      "model_display_name": "qwen3-coder [Ollama]",
      "model": "qwen3-coder",
-      "base_url": "http://localhost:11434/v1/",
-      "api_key": "not-needed",
+      "displayName": "qwen3-coder [Ollama]",
+      "baseUrl": "http://localhost:11434/v1",
+      "apiKey": "ollama",
      "provider": "generic-chat-completion-api",
-      "max_tokens": 32000 
+      "maxOutputTokens": 32000
    }
  ]
 }
 ```

+Adjust `maxOutputTokens` based on your model's context length (the automated setup detects this for you).
+
+### Cloud Models

-## Cloud Models
 `qwen3-coder:480b-cloud` is the recommended model for use with Droid.

-Add the cloud configuration block to `~/.factory/config.json`:
+Add the cloud configuration block to `~/.factory/settings.json`:

 ```json
 {
-  "custom_models": [
+  "customModels": [
    {
-      "model_display_name": "qwen3-coder [Ollama Cloud]",
      "model": "qwen3-coder:480b-cloud",
-      "base_url": "http://localhost:11434/v1/",
-      "api_key": "not-needed",
+      "displayName": "qwen3-coder:480b-cloud [Ollama]",
+      "baseUrl": "http://localhost:11434/v1",
+      "apiKey": "ollama",
      "provider": "generic-chat-completion-api",
-      "max_tokens": 128000
+      "maxOutputTokens": 128000
    }
  ]
 }
 ```

+</Accordion>
+
+<Note>Droid requires a larger context window. It is recommended to use a context window of at least 32K tokens. See [Context length](/context-length) for more information.</Note>
+
 ## Connecting to ollama.com

 1. Create an [API key](https://ollama.com/settings/keys) from ollama.com and export it as `OLLAMA_API_KEY`.
-2. Add the cloud configuration block to `~/.factory/config.json`:
+2. Add the cloud configuration block to `~/.factory/settings.json`:

   ```json
   {
-     "custom_models": [
+     "customModels": [
       {
-         "model_display_name": "qwen3-coder [Ollama Cloud]",
         "model": "qwen3-coder:480b",
-         "base_url": "https://ollama.com/v1/",
-         "api_key": "OLLAMA_API_KEY",
+         "displayName": "qwen3-coder:480b [Ollama Cloud]",
+         "baseUrl": "https://ollama.com/v1",
+         "apiKey": "OLLAMA_API_KEY",
         "provider": "generic-chat-completion-api",
-         "max_tokens": 128000
+         "maxOutputTokens": 128000
       }
     ]
   }
--- a/docs/integrations/opencode.mdx
+++ b/docs/integrations/opencode.mdx
@@ -2,35 +2,29 @@
 title: OpenCode
 ---

-OpenCode is an open-source AI coding assistant that runs in your terminal.
+OpenCode is an agentic coding tool for the terminal.

 ## Install

-Install the [OpenCode CLI](https://opencode.ai):
+Install [OpenCode](https://opencode.ai):

-```bash
-curl -fsSL https://opencode.ai/install.sh | bash
+```shell
+curl -fsSL https://opencode.ai/install | bash
 ```

-<Note>OpenCode requires a larger context window. It is recommended to use a context window of at least 64k tokens. See [Context length](/context-length) for more information.</Note>
-
 ## Usage with Ollama

-### Quick setup
-
-```bash
-ollama launch opencode
-```
-
-To configure without launching:
+Configure OpenCode to use Ollama:

 ```shell
-ollama launch opencode --config
+ollama config opencode
 ```

-### Manual setup
+This will prompt you to select models and automatically configure OpenCode to use Ollama.

-Add a configuration block to `~/.config/opencode/opencode.json`:
+<Accordion title="Manual Configuration">
+
+Add the Ollama provider to `~/.config/opencode/opencode.json`:

 ```json
 {
@@ -38,13 +32,13 @@ Add a configuration block to `~/.config/opencode/opencode.json`:
  "provider": {
    "ollama": {
      "npm": "@ai-sdk/openai-compatible",
-      "name": "Ollama",
+      "name": "Ollama (local)",
      "options": {
        "baseURL": "http://localhost:11434/v1"
      },
      "models": {
        "qwen3-coder": {
-          "name": "qwen3-coder"
+          "name": "qwen3-coder [Ollama]"
        }
      }
    }
@@ -52,55 +46,18 @@ Add a configuration block to `~/.config/opencode/opencode.json`:
 }
 ```

-## Cloud Models
+</Accordion>

-`glm-4.7:cloud` is the recommended model for use with OpenCode.
+<Note>OpenCode requires a larger context window. It is recommended to use a context window of at least 32K tokens. See [Context length](/context-length) for more information.</Note>

-Add the cloud configuration to `~/.config/opencode/opencode.json`:
+## Recommended Models

-```json
-{
-  "$schema": "https://opencode.ai/config.json",
-  "provider": {
-    "ollama": {
-      "npm": "@ai-sdk/openai-compatible",
-      "name": "Ollama",
-      "options": {
-        "baseURL": "http://localhost:11434/v1"
-      },
-      "models": {
-        "glm-4.7:cloud": {
-          "name": "glm-4.7:cloud"
-        }
-      }
-    }
-  }
-}
-```
+### Cloud models
+- `qwen3-coder:480b-cloud` - Large coding model
+- `glm-4.7:cloud` - High-performance cloud model
+- `minimax-m2.1:cloud` - Fast cloud model

-## Connecting to ollama.com
-
-1. Create an [API key](https://ollama.com/settings/keys) from ollama.com and export it as `OLLAMA_API_KEY`.
-2. Update `~/.config/opencode/opencode.json` to point to ollama.com:
-
-```json
-{
-  "$schema": "https://opencode.ai/config.json",
-  "provider": {
-    "ollama": {
-      "npm": "@ai-sdk/openai-compatible",
-      "name": "Ollama Cloud",
-      "options": {
-        "baseURL": "https://ollama.com/v1"
-      },
-      "models": {
-        "glm-4.7:cloud": {
-          "name": "glm-4.7:cloud"
-        }
-      }
-    }
-  }
-}
-```
-
-Run `opencode` in a new terminal to load the new settings.
+### Local models
+- `qwen3-coder` - Excellent for coding tasks
+- `gpt-oss:20b` - Strong general-purpose model
+- `gpt-oss:120b` - Larger general-purpose model for more complex tasks
--- a/docs/quickstart.mdx
+++ b/docs/quickstart.mdx
@@ -18,13 +18,13 @@ This quickstart will walk your through running your first model with Ollama. To
  <Tab title="CLI">
    Open a terminal and run the command:

-    ```sh
+    ```
    ollama run gemma3
    ```

  </Tab>
  <Tab title="cURL">
-    ```sh
+    ```
    ollama pull gemma3
    ```

@@ -45,13 +45,13 @@ This quickstart will walk your through running your first model with Ollama. To
  <Tab title="Python">
    Start by downloading a model:

-    ```sh
+    ```
    ollama pull gemma3
    ```

    Then install Ollama's Python library:

-    ```sh
+    ```
    pip install ollama
    ```

@@ -101,42 +101,3 @@ This quickstart will walk your through running your first model with Ollama. To
 </Tabs>

 See a full list of available models [here](https://ollama.com/models).
-
-## Coding 
-
-For coding use cases, we recommend using the `glm-4.7-flash` model. 
-
-Note: this model requires 23 GB of VRAM with 64000 tokens context length.
-```sh
-ollama pull glm-4.7-flash 
-```
-
-Alternatively, you can use a more powerful cloud model (with full context length):
-```sh
-ollama pull glm-4.7:cloud
-```
-
-Use `ollama launch` to quickly set up a coding tool with Ollama models:
-
-```sh
-ollama launch
-```
-
-### Supported integrations
-
- [OpenCode](/integrations/opencode) - Open-source coding assistant
- [Claude Code](/integrations/claude-code) - Anthropic's agentic coding tool
- [Codex](/integrations/codex) - OpenAI's coding assistant
- [Droid](/integrations/droid) - Factory's AI coding agent
-
-### Launch with a specific model
-
-```sh
-ollama launch claude --model glm-4.7-flash
-```
-
-### Configure without launching
-
-```sh
-ollama launch claude --config
-```
--- a/fs/ggml/ggml.go
+++ b/fs/ggml/ggml.go
@@ -270,7 +270,6 @@ func (kv KV) OllamaEngineRequired() bool {
 		"qwen3", "qwen3moe",
 		"qwen3vl", "qwen3vlmoe",
 		"glm4moelite",
-		"glmocr",
 		"lfm2",
 	}, kv.Architecture())
 }
@@ -860,7 +859,6 @@ func (f GGML) FlashAttention() bool {
 		"bert",
 		"gemma3",
 		"glm4moelite",
-		"glmocr",
 		"gptoss", "gpt-oss",
 		"lfm2",
 		"mistral3",
--- a/llama/patches/0032-ggml-enable-MLA-flash-attention-for-GLM-4.7-flash.patch
+++ b/llama/patches/0032-ggml-enable-MLA-flash-attention-for-GLM-4.7-flash.patch
@@ -1,309 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: nobody <>
-Date: Sat, 24 Jan 2026 02:31:01 +0000
-Subject: [PATCH] ggml: enable MLA flash attention for GLM-4.7-flash
-
-Add support for gqa_ratio 4 in MLA flash attention kernels. GLM-4.7-flash
-uses head size 576 with gqa_ratio 4, which was previously only supported
-for gqa_ratio 16 (DeepSeek).
-
-Metal changes:
- Enable head size 576 for flash attention
- Increase simdgroups to 8 for large heads (>=512)
- Add case 8 kernel dispatch for 8 simdgroups
-
-CUDA changes:
- Add gqa_ratio 4 support for head 576/512
- Add tile configs for (576, 512, 4) and (576, 512, 8)
- Add MMA config cases for ncols 4
- Add template instances for ncols2=4
- Fix nbatch_fa values in nvidia_fp32 config (32->64)
---
- ggml/src/ggml-cuda/fattn-mma-f16.cuh          | 40 +++++++++++++++----
- ggml/src/ggml-cuda/fattn-tile.cuh             | 16 ++++++++
- ggml/src/ggml-cuda/fattn.cu                   | 12 ++++--
- ...ttn-mma-f16-instance-ncols1_16-ncols2_4.cu |  1 +
- ...attn-mma-f16-instance-ncols1_2-ncols2_4.cu |  1 +
- ...attn-mma-f16-instance-ncols1_4-ncols2_4.cu |  1 +
- ...attn-mma-f16-instance-ncols1_8-ncols2_4.cu |  1 +
- ggml/src/ggml-metal/ggml-metal-device.m       |  8 +---
- ggml/src/ggml-metal/ggml-metal-ops.cpp        |  2 +-
- ggml/src/ggml-metal/ggml-metal.metal          |  1 +
- 10 files changed, 64 insertions(+), 19 deletions(-)
-
-diff --git a/ggml/src/ggml-cuda/fattn-mma-f16.cuh b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
-index 7bd1044c1..3dea2205e 100644
--- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh
-+++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
-@@ -66,7 +66,8 @@ static constexpr __host__ __device__ fattn_mma_config ggml_cuda_fattn_mma_get_co
-     GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 32, 128, 2,  32, 128, 128, 128, 2, true);
-     GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 64, 128, 2,  32, 128, 128, 128, 2, true);
- 
-    GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512,  8,  64, 4,  32, 288, 256, 128, 1, false);
-+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512,  4,  64, 4,  32, 288, 256, 128, 1, false);
-+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512,  8,  64, 4,  32, 288, 256, 128, 1, true);
-     GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 16,  64, 4,  32, 288, 256, 128, 1, false);
-     GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 32, 128, 2,  32, 160, 128, 128, 1, false);
-     GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 64, 256, 1,  32, 160, 128, 128, 1, false);
-@@ -80,7 +81,8 @@ static constexpr __host__ __device__ fattn_mma_config ggml_cuda_fattn_mma_get_co
-     GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 32, 128, 2,  64, 128, 128,  64, 2, true);
-     GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 64, 128, 2,  64, 128, 128,  64, 2, true);
- 
-    GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512,  8,  64, 4,  32,  96,  64, 128, 1, false);
-+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512,  4,  64, 4,  32,  96,  64, 128, 1, false);
-+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512,  8,  64, 4,  32,  96,  64, 128, 1, true);
-     GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 16,  64, 4,  32,  96,  64, 128, 1, false);
-     GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 32, 128, 2,  32, 160, 128, 128, 1, false);
-     GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 64, 256, 1,  32, 160, 128, 128, 1, false);
-@@ -89,7 +91,8 @@ static constexpr __host__ __device__ fattn_mma_config ggml_cuda_fattn_mma_get_co
- }
- 
- static constexpr __host__ __device__ fattn_mma_config ggml_cuda_fattn_mma_get_config_volta(const int DKQ, const int DV, const int ncols) {
-    GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512,  8,  64, 4,  32, 288, 256,  64, 1, false);
-+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512,  4,  64, 4,  32, 288, 256,  64, 1, false);
-+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512,  8,  64, 4,  32, 288, 256,  64, 1, true);
-     GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 16,  64, 4,  32, 288, 256,  64, 1, false);
-     GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 32, 128, 2,  32, 160, 128,  64, 1, false);
-     GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 64, 256, 1,  32, 160, 128,  64, 1, false);
-@@ -397,7 +400,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
-     constexpr int  ncols           = ncols1 * ncols2;
-     constexpr int  cols_per_warp   = T_B_KQ::I;
-     constexpr int  cols_per_thread = 2; // This is specifically KQ columns, Volta only has a single VKQ column.
-    constexpr int  np              = nwarps * (cols_per_warp/ncols2) / ncols1; // Number of parallel CUDA warps per Q column.
-+    constexpr int  np              = cols_per_warp > ncols ? nwarps : nwarps * cols_per_warp/ncols; // Number of parallel CUDA warps per Q column.
-     constexpr int  nbatch_fa       = ggml_cuda_fattn_mma_get_nbatch_fa(DKQ, DV, ncols);
-     constexpr int  nbatch_K2       = ggml_cuda_fattn_mma_get_nbatch_K2(DKQ, DV, ncols);
-     constexpr int  nbatch_V2       = ggml_cuda_fattn_mma_get_nbatch_V2(DKQ, DV, ncols);
-@@ -467,7 +470,6 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
-                 }
-             }
-         } else {
-            static_assert(cols_per_warp != 8, "cols_per_warp == 8 not implemented");
- #pragma unroll
-             for (int k_KQ_0 = k0_start; k_KQ_0 < k0_stop; k_KQ_0 += T_A_KQ::J) {
-                 load_ldmatrix(Q_B[0], tile_Q + (threadIdx.y / np)*(T_B_KQ::I*stride_tile_Q) + k_KQ_0, stride_tile_Q);
-@@ -479,8 +481,18 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
-                     T_A_KQ K_A;
-                     load_ldmatrix(K_A, tile_K + i_KQ_0*stride_tile_K + (k_KQ_0 - k0_start), stride_tile_K);
- 
-                    // Wide version of KQ_C is column-major => swap A and B.
-                    mma(KQ_C[i_KQ_00/(np*T_A_KQ::I)], Q_B[0], K_A);
-+                    if constexpr (cols_per_warp == 8) {
-+                        mma(KQ_C[i_KQ_00/(np*T_A_KQ::I)], K_A, Q_B[0]);
-+                    } else {
-+                        // Wide version of KQ_C is column-major
-+#if defined(AMD_WMMA_AVAILABLE)
-+                        // RDNA matrix C is column-major.
-+                        mma(KQ_C[i_KQ_00/(np*T_A_KQ::I)], K_A, Q_B[0]);
-+#else
-+                        // swap A and B for CUDA.
-+                        mma(KQ_C[i_KQ_00/(np*T_A_KQ::I)], Q_B[0], K_A);
-+#endif // defined(AMD_WMMA_AVAILABLE)
-+                    }
-                 }
-             }
-         }
-@@ -841,7 +853,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
- 
-     constexpr int  cols_per_warp   = T_B_KQ::I;
-     constexpr int  cols_per_thread = 2; // This is specifically KQ columns, Volta only has a single VKQ column.
-    constexpr int  np              = nwarps * (cols_per_warp/ncols2) / ncols1; // Number of parallel CUDA warps per Q column.
-+    constexpr int  np              = cols_per_warp > ncols ? nwarps : nwarps * cols_per_warp/ncols; // Number of parallel CUDA warps per Q column.
-     constexpr int  nbatch_fa       = ggml_cuda_fattn_mma_get_nbatch_fa     (DKQ, DV, ncols);
-     constexpr int  nbatch_K2       = ggml_cuda_fattn_mma_get_nbatch_K2     (DKQ, DV, ncols);
-     constexpr int  nbatch_V2       = ggml_cuda_fattn_mma_get_nbatch_V2     (DKQ, DV, ncols);
-@@ -1353,6 +1365,13 @@ static __global__ void flash_attn_ext_f16(
-         NO_DEVICE_CODE;
-         return;
-     }
-+#ifdef VOLTA_MMA_AVAILABLE
-+    if (ncols1*ncols2 < 32) {
-+        NO_DEVICE_CODE;
-+        return;
-+    }
-+#endif // VOLTA_MMA_AVAILABLE
-+
- #if __CUDA_ARCH__ == GGML_CUDA_CC_TURING
-     if (ncols1*ncols2 > 32) {
-         NO_DEVICE_CODE;
-@@ -1585,3 +1604,8 @@ DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 256,  64)
- extern DECL_FATTN_MMA_F16_CASE(576, 512, 1, 16);
- extern DECL_FATTN_MMA_F16_CASE(576, 512, 2, 16);
- extern DECL_FATTN_MMA_F16_CASE(576, 512, 4, 16);
-+
-+// For GLM 4.7 Flash
-+extern DECL_FATTN_MMA_F16_CASE(576, 512,  4,  4);
-+extern DECL_FATTN_MMA_F16_CASE(576, 512,  8,  4);
-+extern DECL_FATTN_MMA_F16_CASE(576, 512, 16,  4);
-diff --git a/ggml/src/ggml-cuda/fattn-tile.cuh b/ggml/src/ggml-cuda/fattn-tile.cuh
-index 7c4d6fe67..371be7442 100644
--- a/ggml/src/ggml-cuda/fattn-tile.cuh
-+++ b/ggml/src/ggml-cuda/fattn-tile.cuh
-@@ -68,6 +68,8 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
-     GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2,  64,  64)
-     GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2,  64,  64)
- 
-+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 128, 2,  64,  64)
-+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  8, 256, 2,  64,  64)
-     GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2,  64,  64)
- 
-     return 0;
-@@ -122,6 +124,8 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
-     GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2,  32, 128)
-     GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2,  32,  64)
- 
-+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 128, 2,  32,  64)
-+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  8, 256, 2,  32,  64)
-     GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2,  32,  64)
- 
-     return 0;
-@@ -183,6 +187,8 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am
-     GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2,  32, 128)
-     GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2,  32, 128)
- 
-+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 128, 2,  64,  64)
-+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  8, 256, 2,  64,  64)
-     GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2,  64,  64)
-     GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 32, 512, 1, 128,  64)
- 
-@@ -245,6 +251,8 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am
-     GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 5,  32, 256)
-     GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 3,  64, 128)
- 
-+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 128, 2,  64,  64)
-+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  8, 256, 2,  64,  64)
-     GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 4,  64,  64)
-     GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 32, 256, 2, 128,  64)
- 
-@@ -1187,6 +1195,14 @@ static void launch_fattn_tile_switch_ncols2(ggml_backend_cuda_context & ctx, ggm
-             launch_fattn_tile_switch_ncols1<DKQ, DV, 16, use_logit_softcap>(ctx, dst);
-             return;
-         }
-+        if (use_gqa_opt && gqa_ratio % 8 == 0) {
-+            launch_fattn_tile_switch_ncols1<DKQ, DV, 8, use_logit_softcap>(ctx, dst);
-+            return;
-+        }
-+        if (use_gqa_opt && gqa_ratio % 4 == 0) {
-+            launch_fattn_tile_switch_ncols1<DKQ, DV, 4, use_logit_softcap>(ctx, dst);
-+            return;
-+        }
-     }
- 
-     if constexpr (DV <= 256) {
-diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu
-index 015540666..1693479cb 100644
--- a/ggml/src/ggml-cuda/fattn.cu
-+++ b/ggml/src/ggml-cuda/fattn.cu
-@@ -111,7 +111,7 @@ static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, gg
-             ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2<256, 256>(ctx, dst);
-             break;
-         case 576: {
-            // For Deepseek, go straight to the ncols1 switch to avoid compiling unnecessary kernels.
-+            // For Deepseek/GLM4, go straight to the ncols1 switch to avoid compiling unnecessary kernels.
-             GGML_ASSERT(V->ne[0] == 512);
-             float max_bias = 0.0f;
-             memcpy(&max_bias, (const float *) KQV->op_params + 1, sizeof(float));
-@@ -121,8 +121,12 @@ static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, gg
- 
-             GGML_ASSERT(Q->ne[2] % K->ne[2] == 0);
-             const int gqa_ratio = Q->ne[2] / K->ne[2];
-            GGML_ASSERT(gqa_ratio % 16 == 0);
-            ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<576, 512, 16>(ctx, dst);
-+            GGML_ASSERT(gqa_ratio % 4 == 0);
-+            if (gqa_ratio % 16 == 0) {
-+                ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<576, 512, 16>(ctx, dst);
-+            } else {
-+                ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<576, 512,  4>(ctx, dst);
-+            }
-         } break;
-         default:
-             GGML_ABORT("fatal error");
-@@ -251,7 +255,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
-             if (V->ne[0] != 512) {
-                 return BEST_FATTN_KERNEL_NONE;
-             }
-            if (!gqa_opt_applies || gqa_ratio % 16 != 0) {
-+            if (!gqa_opt_applies || gqa_ratio % 4 != 0) {
-                 return BEST_FATTN_KERNEL_NONE;
-             }
-             break;
-diff --git a/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu
-index 2074e954a..517993cb0 100644
--- a/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu
-+++ b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu
-@@ -8,3 +8,4 @@ DECL_FATTN_MMA_F16_CASE(96, 96, 16, 4);
- DECL_FATTN_MMA_F16_CASE(112, 112, 16, 4);
- DECL_FATTN_MMA_F16_CASE(128, 128, 16, 4);
- DECL_FATTN_MMA_F16_CASE(256, 256, 16, 4);
-+DECL_FATTN_MMA_F16_CASE(576, 512, 16, 4);
-diff --git a/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu
-index 24c64cf00..97b19c67a 100644
--- a/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu
-+++ b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu
-@@ -8,3 +8,4 @@ DECL_FATTN_MMA_F16_CASE(96, 96, 2, 4);
- DECL_FATTN_MMA_F16_CASE(112, 112, 2, 4);
- DECL_FATTN_MMA_F16_CASE(128, 128, 2, 4);
- DECL_FATTN_MMA_F16_CASE(256, 256, 2, 4);
-+DECL_FATTN_MMA_F16_CASE(576, 512, 2, 4);
-diff --git a/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu
-index 1ada657f1..989626dfa 100644
--- a/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu
-+++ b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu
-@@ -8,3 +8,4 @@ DECL_FATTN_MMA_F16_CASE(96, 96, 4, 4);
- DECL_FATTN_MMA_F16_CASE(112, 112, 4, 4);
- DECL_FATTN_MMA_F16_CASE(128, 128, 4, 4);
- DECL_FATTN_MMA_F16_CASE(256, 256, 4, 4);
-+DECL_FATTN_MMA_F16_CASE(576, 512, 4, 4);
-diff --git a/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu
-index 86d4ffae2..173de7aac 100644
--- a/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu
-+++ b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu
-@@ -8,3 +8,4 @@ DECL_FATTN_MMA_F16_CASE(96, 96, 8, 4);
- DECL_FATTN_MMA_F16_CASE(112, 112, 8, 4);
- DECL_FATTN_MMA_F16_CASE(128, 128, 8, 4);
- DECL_FATTN_MMA_F16_CASE(256, 256, 8, 4);
-+DECL_FATTN_MMA_F16_CASE(576, 512, 8, 4);
-diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m
-index f24270bb1..7b5ee968c 100644
--- a/ggml/src/ggml-metal/ggml-metal-device.m
-+++ b/ggml/src/ggml-metal/ggml-metal-device.m
-@@ -1071,12 +1071,8 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
-                 op->src[0]->ne[0] != 112 &&
-                 op->src[0]->ne[0] != 128 &&
-                 op->src[0]->ne[0] != 192 &&
-                op->src[0]->ne[0] != 256) {
-                return false;
-            }
-            if (op->src[0]->ne[0] == 576) {
-                // DeepSeek sizes
-                // TODO: disabled for now, until optmized
-+                op->src[0]->ne[0] != 256 &&
-+                op->src[0]->ne[0] != 576) {
-                 return false;
-             }
-             if (op->src[1]->type != op->src[2]->type) {
-diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp
-index e99c1763f..80864f303 100644
--- a/ggml/src/ggml-metal/ggml-metal-ops.cpp
-+++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp
-@@ -2456,7 +2456,7 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
- 
-         // simdgroups per threadgroup (a.k.a. warps)
-         //nsg = ne01 <= nqptg ? MAX(4, MIN(nsgmax, MIN(ne11/ncpsg, (int64_t) pipeline.maxTotalThreadsPerThreadgroup/32))) : 4;
-        int32_t nsg = 4;
-+        int32_t nsg = ne00 >= 512 ? 8 : 4;
- 
-         const size_t smem = FATTN_SMEM(nsg);
- 
-diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
-index c98d269d1..d33c16079 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
-+++ b/ggml/src/ggml-metal/ggml-metal.metal
-@@ -6166,6 +6166,7 @@ kernel void kernel_flash_attn_ext(
-       //case 1: kernel_flash_attn_ext_impl<FWD_TMPL, 1>(FWD_ARGS); break;
-       //case 2: kernel_flash_attn_ext_impl<FWD_TMPL, 2>(FWD_ARGS); break;
-         case 4: kernel_flash_attn_ext_impl<FWD_TMPL, 4>(FWD_ARGS); break;
-+        case 8: kernel_flash_attn_ext_impl<FWD_TMPL, 8>(FWD_ARGS); break;
-     }
- #undef FWD_TMPL
- #undef FWD_ARGS
--- a/llm/server.go
+++ b/llm/server.go
@@ -242,6 +242,7 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st
 	} else {
 		// For Ollama engine, use our SupportsFlashAttention logic
 		if fa {
+			slog.Info("enabling flash attention")
 			loadRequest.FlashAttention = ml.FlashAttentionEnabled

 			// Flash Attention also supports kv cache quantization
--- a/middleware/openai.go
+++ b/middleware/openai.go
@@ -609,49 +609,3 @@ func ImageGenerationsMiddleware() gin.HandlerFunc {
 		c.Next()
 	}
 }
-
-func ImageEditsMiddleware() gin.HandlerFunc {
-	return func(c *gin.Context) {
-		var req openai.ImageEditRequest
-		if err := c.ShouldBindJSON(&req); err != nil {
-			c.AbortWithStatusJSON(http.StatusBadRequest, openai.NewError(http.StatusBadRequest, err.Error()))
-			return
-		}
-
-		if req.Prompt == "" {
-			c.AbortWithStatusJSON(http.StatusBadRequest, openai.NewError(http.StatusBadRequest, "prompt is required"))
-			return
-		}
-
-		if req.Model == "" {
-			c.AbortWithStatusJSON(http.StatusBadRequest, openai.NewError(http.StatusBadRequest, "model is required"))
-			return
-		}
-
-		if req.Image == "" {
-			c.AbortWithStatusJSON(http.StatusBadRequest, openai.NewError(http.StatusBadRequest, "image is required"))
-			return
-		}
-
-		genReq, err := openai.FromImageEditRequest(req)
-		if err != nil {
-			c.AbortWithStatusJSON(http.StatusBadRequest, openai.NewError(http.StatusBadRequest, err.Error()))
-			return
-		}
-
-		var b bytes.Buffer
-		if err := json.NewEncoder(&b).Encode(genReq); err != nil {
-			c.AbortWithStatusJSON(http.StatusInternalServerError, openai.NewError(http.StatusInternalServerError, err.Error()))
-			return
-		}
-
-		c.Request.Body = io.NopCloser(&b)
-
-		w := &ImageWriter{
-			BaseWriter: BaseWriter{ResponseWriter: c.Writer},
-		}
-
-		c.Writer = w
-		c.Next()
-	}
-}
--- a/middleware/openai_test.go
+++ b/middleware/openai_test.go
@@ -1112,129 +1112,3 @@ func TestImageWriterResponse(t *testing.T) {
 		t.Errorf("expected image data 'dGVzdC1pbWFnZS1kYXRh', got %s", imageResp.Data[0].B64JSON)
 	}
 }
-
-func TestImageEditsMiddleware(t *testing.T) {
-	type testCase struct {
-		name string
-		body string
-		req  api.GenerateRequest
-		err  openai.ErrorResponse
-	}
-
-	var capturedRequest *api.GenerateRequest
-
-	// Base64-encoded test image (1x1 pixel PNG)
-	testImage := "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNk+A8AAQUBAScY42YAAAAASUVORK5CYII="
-	decodedImage, _ := base64.StdEncoding.DecodeString("iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNk+A8AAQUBAScY42YAAAAASUVORK5CYII=")
-
-	testCases := []testCase{
-		{
-			name: "image edit basic",
-			body: `{
-				"model": "test-model",
-				"prompt": "make it blue",
-				"image": "` + testImage + `"
-			}`,
-			req: api.GenerateRequest{
-				Model:  "test-model",
-				Prompt: "make it blue",
-				Images: []api.ImageData{decodedImage},
-			},
-		},
-		{
-			name: "image edit with size",
-			body: `{
-				"model": "test-model",
-				"prompt": "make it blue",
-				"image": "` + testImage + `",
-				"size": "512x768"
-			}`,
-			req: api.GenerateRequest{
-				Model:  "test-model",
-				Prompt: "make it blue",
-				Images: []api.ImageData{decodedImage},
-				Width:  512,
-				Height: 768,
-			},
-		},
-		{
-			name: "image edit missing prompt",
-			body: `{
-				"model": "test-model",
-				"image": "` + testImage + `"
-			}`,
-			err: openai.ErrorResponse{
-				Error: openai.Error{
-					Message: "prompt is required",
-					Type:    "invalid_request_error",
-				},
-			},
-		},
-		{
-			name: "image edit missing model",
-			body: `{
-				"prompt": "make it blue",
-				"image": "` + testImage + `"
-			}`,
-			err: openai.ErrorResponse{
-				Error: openai.Error{
-					Message: "model is required",
-					Type:    "invalid_request_error",
-				},
-			},
-		},
-		{
-			name: "image edit missing image",
-			body: `{
-				"model": "test-model",
-				"prompt": "make it blue"
-			}`,
-			err: openai.ErrorResponse{
-				Error: openai.Error{
-					Message: "image is required",
-					Type:    "invalid_request_error",
-				},
-			},
-		},
-	}
-
-	endpoint := func(c *gin.Context) {
-		c.Status(http.StatusOK)
-	}
-
-	gin.SetMode(gin.TestMode)
-	router := gin.New()
-	router.Use(ImageEditsMiddleware(), captureRequestMiddleware(&capturedRequest))
-	router.Handle(http.MethodPost, "/api/generate", endpoint)
-
-	for _, tc := range testCases {
-		t.Run(tc.name, func(t *testing.T) {
-			req, _ := http.NewRequest(http.MethodPost, "/api/generate", strings.NewReader(tc.body))
-			req.Header.Set("Content-Type", "application/json")
-
-			defer func() { capturedRequest = nil }()
-
-			resp := httptest.NewRecorder()
-			router.ServeHTTP(resp, req)
-
-			if tc.err.Error.Message != "" {
-				var errResp openai.ErrorResponse
-				if err := json.Unmarshal(resp.Body.Bytes(), &errResp); err != nil {
-					t.Fatal(err)
-				}
-				if diff := cmp.Diff(tc.err, errResp); diff != "" {
-					t.Fatalf("errors did not match:\n%s", diff)
-				}
-				return
-			}
-
-			if resp.Code != http.StatusOK {
-				t.Fatalf("expected status 200, got %d: %s", resp.Code, resp.Body.String())
-			}
-
-			if diff := cmp.Diff(&tc.req, capturedRequest); diff != "" {
-				t.Fatalf("requests did not match:\n%s", diff)
-			}
-		})
-	}
-}
--- a/ml/backend.go
+++ b/ml/backend.go
@@ -170,7 +170,6 @@ type Tensor interface {
 	Cos(ctx Context) Tensor
 	Tanh(ctx Context) Tensor
 	GELU(ctx Context, up ...Tensor) Tensor
-	GELU_ERF(ctx Context) Tensor
 	QuickGELU(ctx Context, up ...Tensor) Tensor
 	SILU(ctx Context, up ...Tensor) Tensor
 	RELU(ctx Context, up ...Tensor) Tensor
--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@@ -1581,13 +1581,6 @@ func (t *Tensor) GELU(ctx ml.Context, t2 ...ml.Tensor) ml.Tensor {
 	}
 }

-func (t *Tensor) GELU_ERF(ctx ml.Context) ml.Tensor {
-	return &Tensor{
-		b: t.b,
-		t: C.ggml_gelu_erf_inplace(ctx.(*Context).ctx, t.t),
-	}
-}
-
 func (t *Tensor) QuickGELU(ctx ml.Context, t2 ...ml.Tensor) ml.Tensor {
 	var tt *C.struct_ggml_tensor
 	if len(t2) > 0 {
--- a/ml/backend/ggml/ggml/src/ggml-cuda/fattn-mma-f16.cuh
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-mma-f16.cuh
@@ -66,8 +66,7 @@ static constexpr __host__ __device__ fattn_mma_config ggml_cuda_fattn_mma_get_co
    GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 32, 128, 2,  32, 128, 128, 128, 2, true);
    GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 64, 128, 2,  32, 128, 128, 128, 2, true);

-    GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512,  4,  64, 4,  32, 288, 256, 128, 1, false);
-    GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512,  8,  64, 4,  32, 288, 256, 128, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512,  8,  64, 4,  32, 288, 256, 128, 1, false);
    GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 16,  64, 4,  32, 288, 256, 128, 1, false);
    GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 32, 128, 2,  32, 160, 128, 128, 1, false);
    GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 64, 256, 1,  32, 160, 128, 128, 1, false);
@@ -81,8 +80,7 @@ static constexpr __host__ __device__ fattn_mma_config ggml_cuda_fattn_mma_get_co
    GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 32, 128, 2,  64, 128, 128,  64, 2, true);
    GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 64, 128, 2,  64, 128, 128,  64, 2, true);

-    GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512,  4,  64, 4,  32,  96,  64, 128, 1, false);
-    GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512,  8,  64, 4,  32,  96,  64, 128, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512,  8,  64, 4,  32,  96,  64, 128, 1, false);
    GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 16,  64, 4,  32,  96,  64, 128, 1, false);
    GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 32, 128, 2,  32, 160, 128, 128, 1, false);
    GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 64, 256, 1,  32, 160, 128, 128, 1, false);
@@ -91,8 +89,7 @@ static constexpr __host__ __device__ fattn_mma_config ggml_cuda_fattn_mma_get_co
 }

 static constexpr __host__ __device__ fattn_mma_config ggml_cuda_fattn_mma_get_config_volta(const int DKQ, const int DV, const int ncols) {
-    GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512,  4,  64, 4,  32, 288, 256,  64, 1, false);
-    GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512,  8,  64, 4,  32, 288, 256,  64, 1, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512,  8,  64, 4,  32, 288, 256,  64, 1, false);
    GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 16,  64, 4,  32, 288, 256,  64, 1, false);
    GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 32, 128, 2,  32, 160, 128,  64, 1, false);
    GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 64, 256, 1,  32, 160, 128,  64, 1, false);
@@ -400,7 +397,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
    constexpr int  ncols           = ncols1 * ncols2;
    constexpr int  cols_per_warp   = T_B_KQ::I;
    constexpr int  cols_per_thread = 2; // This is specifically KQ columns, Volta only has a single VKQ column.
-    constexpr int  np              = cols_per_warp > ncols ? nwarps : nwarps * cols_per_warp/ncols; // Number of parallel CUDA warps per Q column.
+    constexpr int  np              = nwarps * (cols_per_warp/ncols2) / ncols1; // Number of parallel CUDA warps per Q column.
    constexpr int  nbatch_fa       = ggml_cuda_fattn_mma_get_nbatch_fa(DKQ, DV, ncols);
    constexpr int  nbatch_K2       = ggml_cuda_fattn_mma_get_nbatch_K2(DKQ, DV, ncols);
    constexpr int  nbatch_V2       = ggml_cuda_fattn_mma_get_nbatch_V2(DKQ, DV, ncols);
@@ -470,6 +467,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
                }
            }
        } else {
+            static_assert(cols_per_warp != 8, "cols_per_warp == 8 not implemented");
 #pragma unroll
            for (int k_KQ_0 = k0_start; k_KQ_0 < k0_stop; k_KQ_0 += T_A_KQ::J) {
                load_ldmatrix(Q_B[0], tile_Q + (threadIdx.y / np)*(T_B_KQ::I*stride_tile_Q) + k_KQ_0, stride_tile_Q);
@@ -481,18 +479,8 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
                    T_A_KQ K_A;
                    load_ldmatrix(K_A, tile_K + i_KQ_0*stride_tile_K + (k_KQ_0 - k0_start), stride_tile_K);

-                    if constexpr (cols_per_warp == 8) {
-                        mma(KQ_C[i_KQ_00/(np*T_A_KQ::I)], K_A, Q_B[0]);
-                    } else {
-                        // Wide version of KQ_C is column-major
-#if defined(AMD_WMMA_AVAILABLE)
-                        // RDNA matrix C is column-major.
-                        mma(KQ_C[i_KQ_00/(np*T_A_KQ::I)], K_A, Q_B[0]);
-#else
-                        // swap A and B for CUDA.
-                        mma(KQ_C[i_KQ_00/(np*T_A_KQ::I)], Q_B[0], K_A);
-#endif // defined(AMD_WMMA_AVAILABLE)
-                    }
+                    // Wide version of KQ_C is column-major => swap A and B.
+                    mma(KQ_C[i_KQ_00/(np*T_A_KQ::I)], Q_B[0], K_A);
                }
            }
        }
@@ -853,7 +841,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(

    constexpr int  cols_per_warp   = T_B_KQ::I;
    constexpr int  cols_per_thread = 2; // This is specifically KQ columns, Volta only has a single VKQ column.
-    constexpr int  np              = cols_per_warp > ncols ? nwarps : nwarps * cols_per_warp/ncols; // Number of parallel CUDA warps per Q column.
+    constexpr int  np              = nwarps * (cols_per_warp/ncols2) / ncols1; // Number of parallel CUDA warps per Q column.
    constexpr int  nbatch_fa       = ggml_cuda_fattn_mma_get_nbatch_fa     (DKQ, DV, ncols);
    constexpr int  nbatch_K2       = ggml_cuda_fattn_mma_get_nbatch_K2     (DKQ, DV, ncols);
    constexpr int  nbatch_V2       = ggml_cuda_fattn_mma_get_nbatch_V2     (DKQ, DV, ncols);
@@ -1365,13 +1353,6 @@ static __global__ void flash_attn_ext_f16(
        NO_DEVICE_CODE;
        return;
    }
-#ifdef VOLTA_MMA_AVAILABLE
-    if (ncols1*ncols2 < 32) {
-        NO_DEVICE_CODE;
-        return;
-    }
-#endif // VOLTA_MMA_AVAILABLE
-
 #if __CUDA_ARCH__ == GGML_CUDA_CC_TURING
    if (ncols1*ncols2 > 32) {
        NO_DEVICE_CODE;
@@ -1604,8 +1585,3 @@ DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 256,  64)
 extern DECL_FATTN_MMA_F16_CASE(576, 512, 1, 16);
 extern DECL_FATTN_MMA_F16_CASE(576, 512, 2, 16);
 extern DECL_FATTN_MMA_F16_CASE(576, 512, 4, 16);
-
-// For GLM 4.7 Flash
-extern DECL_FATTN_MMA_F16_CASE(576, 512,  4,  4);
-extern DECL_FATTN_MMA_F16_CASE(576, 512,  8,  4);
-extern DECL_FATTN_MMA_F16_CASE(576, 512, 16,  4);
--- a/ml/backend/ggml/ggml/src/ggml-cuda/fattn-tile.cuh
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-tile.cuh
@@ -68,8 +68,6 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2,  64,  64)

-    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 128, 2,  64,  64)
-    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  8, 256, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2,  64,  64)

    return 0;
@@ -124,8 +122,6 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2,  32, 128)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2,  32,  64)

-    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 128, 2,  32,  64)
-    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  8, 256, 2,  32,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2,  32,  64)

    return 0;
@@ -187,8 +183,6 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2,  32, 128)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2,  32, 128)

-    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 128, 2,  64,  64)
-    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  8, 256, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 32, 512, 1, 128,  64)

@@ -251,8 +245,6 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 5,  32, 256)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 3,  64, 128)

-    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 128, 2,  64,  64)
-    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  8, 256, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 4,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 32, 256, 2, 128,  64)

@@ -1195,14 +1187,6 @@ static void launch_fattn_tile_switch_ncols2(ggml_backend_cuda_context & ctx, ggm
            launch_fattn_tile_switch_ncols1<DKQ, DV, 16, use_logit_softcap>(ctx, dst);
            return;
        }
-        if (use_gqa_opt && gqa_ratio % 8 == 0) {
-            launch_fattn_tile_switch_ncols1<DKQ, DV, 8, use_logit_softcap>(ctx, dst);
-            return;
-        }
-        if (use_gqa_opt && gqa_ratio % 4 == 0) {
-            launch_fattn_tile_switch_ncols1<DKQ, DV, 4, use_logit_softcap>(ctx, dst);
-            return;
-        }
    }

    if constexpr (DV <= 256) {
--- a/ml/backend/ggml/ggml/src/ggml-cuda/fattn.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/fattn.cu
@@ -111,7 +111,7 @@ static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, gg
            ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2<256, 256>(ctx, dst);
            break;
        case 576: {
-            // For Deepseek/GLM4, go straight to the ncols1 switch to avoid compiling unnecessary kernels.
+            // For Deepseek, go straight to the ncols1 switch to avoid compiling unnecessary kernels.
            GGML_ASSERT(V->ne[0] == 512);
            float max_bias = 0.0f;
            memcpy(&max_bias, (const float *) KQV->op_params + 1, sizeof(float));
@@ -121,12 +121,8 @@ static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, gg

            GGML_ASSERT(Q->ne[2] % K->ne[2] == 0);
            const int gqa_ratio = Q->ne[2] / K->ne[2];
-            GGML_ASSERT(gqa_ratio % 4 == 0);
-            if (gqa_ratio % 16 == 0) {
-                ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<576, 512, 16>(ctx, dst);
-            } else {
-                ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<576, 512,  4>(ctx, dst);
-            }
+            GGML_ASSERT(gqa_ratio % 16 == 0);
+            ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<576, 512, 16>(ctx, dst);
        } break;
        default:
            GGML_ABORT("fatal error");
@@ -255,7 +251,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
            if (V->ne[0] != 512) {
                return BEST_FATTN_KERNEL_NONE;
            }
-            if (!gqa_opt_applies || gqa_ratio % 4 != 0) {
+            if (!gqa_opt_applies || gqa_ratio % 16 != 0) {
                return BEST_FATTN_KERNEL_NONE;
            }
            break;
--- a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu
@@ -8,4 +8,3 @@ DECL_FATTN_MMA_F16_CASE(96, 96, 16, 4);
 DECL_FATTN_MMA_F16_CASE(112, 112, 16, 4);
 DECL_FATTN_MMA_F16_CASE(128, 128, 16, 4);
 DECL_FATTN_MMA_F16_CASE(256, 256, 16, 4);
-DECL_FATTN_MMA_F16_CASE(576, 512, 16, 4);
--- a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu
@@ -8,4 +8,3 @@ DECL_FATTN_MMA_F16_CASE(96, 96, 2, 4);
 DECL_FATTN_MMA_F16_CASE(112, 112, 2, 4);
 DECL_FATTN_MMA_F16_CASE(128, 128, 2, 4);
 DECL_FATTN_MMA_F16_CASE(256, 256, 2, 4);
-DECL_FATTN_MMA_F16_CASE(576, 512, 2, 4);
--- a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu
@@ -8,4 +8,3 @@ DECL_FATTN_MMA_F16_CASE(96, 96, 4, 4);
 DECL_FATTN_MMA_F16_CASE(112, 112, 4, 4);
 DECL_FATTN_MMA_F16_CASE(128, 128, 4, 4);
 DECL_FATTN_MMA_F16_CASE(256, 256, 4, 4);
-DECL_FATTN_MMA_F16_CASE(576, 512, 4, 4);
--- a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu
@@ -8,4 +8,3 @@ DECL_FATTN_MMA_F16_CASE(96, 96, 8, 4);
 DECL_FATTN_MMA_F16_CASE(112, 112, 8, 4);
 DECL_FATTN_MMA_F16_CASE(128, 128, 8, 4);
 DECL_FATTN_MMA_F16_CASE(256, 256, 8, 4);
-DECL_FATTN_MMA_F16_CASE(576, 512, 8, 4);
--- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-device.m
+++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-device.m
@@ -1071,8 +1071,12 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
                op->src[0]->ne[0] != 112 &&
                op->src[0]->ne[0] != 128 &&
                op->src[0]->ne[0] != 192 &&
-                op->src[0]->ne[0] != 256 &&
-                op->src[0]->ne[0] != 576) {
+                op->src[0]->ne[0] != 256) {
+                return false;
+            }
+            if (op->src[0]->ne[0] == 576) {
+                // DeepSeek sizes
+                // TODO: disabled for now, until optimized
                return false;
            }
            if (op->src[1]->type != op->src[2]->type) {
--- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal
+++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal
@@ -8967,7 +8967,6 @@ kernel void kernel_flash_attn_ext(
      //case 1: kernel_flash_attn_ext_impl<FWD_TMPL, 1>(FWD_ARGS); break;
      //case 2: kernel_flash_attn_ext_impl<FWD_TMPL, 2>(FWD_ARGS); break;
        case 4: kernel_flash_attn_ext_impl<FWD_TMPL, 4>(FWD_ARGS); break;
-        case 8: kernel_flash_attn_ext_impl<FWD_TMPL, 8>(FWD_ARGS); break;
    }
 #undef FWD_TMPL
 #undef FWD_ARGS
--- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-ops.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-ops.cpp
@@ -2456,7 +2456,7 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {

        // simdgroups per threadgroup (a.k.a. warps)
        //nsg = ne01 <= nqptg ? MAX(4, MIN(nsgmax, MIN(ne11/ncpsg, (int64_t) pipeline.maxTotalThreadsPerThreadgroup/32))) : 4;
-        int32_t nsg = ne00 >= 512 ? 8 : 4;
+        int32_t nsg = 4;

        const size_t smem = FATTN_SMEM(nsg);

--- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal
@@ -6166,7 +6166,6 @@ kernel void kernel_flash_attn_ext(
      //case 1: kernel_flash_attn_ext_impl<FWD_TMPL, 1>(FWD_ARGS); break;
      //case 2: kernel_flash_attn_ext_impl<FWD_TMPL, 2>(FWD_ARGS); break;
        case 4: kernel_flash_attn_ext_impl<FWD_TMPL, 4>(FWD_ARGS); break;
-        case 8: kernel_flash_attn_ext_impl<FWD_TMPL, 8>(FWD_ARGS); break;
    }
 #undef FWD_TMPL
 #undef FWD_ARGS
--- a/model/imageproc/images.go
+++ b/model/imageproc/images.go
@@ -20,7 +20,6 @@ const (
 	ResizeBilinear = iota
 	ResizeNearestNeighbor
 	ResizeApproxBilinear
-	ResizeBicubic
 	ResizeCatmullrom
 )

@@ -46,7 +45,6 @@ func Resize(img image.Image, newSize image.Point, method int) image.Image {
 		ResizeBilinear:        draw.BiLinear,
 		ResizeNearestNeighbor: draw.NearestNeighbor,
 		ResizeApproxBilinear:  draw.ApproxBiLinear,
-		ResizeBicubic:         draw.CatmullRom,
 		ResizeCatmullrom:      draw.CatmullRom,
 	}

--- a/model/model.go
+++ b/model/model.go
@@ -39,13 +39,6 @@ type Model interface {
 	Config() config
 }

-// Validator is an optional interface that models can implement to perform
-// validation after tensors have been loaded. If validation fails, model
-// loading will fail with the returned error.
-type Validator interface {
-	Validate() error
-}
-
 // MultimodalProcessor must be implemented by multimodal models.
 type MultimodalProcessor interface {
 	// EncodeMultimodal processes a single input (such as an image) and
@@ -123,13 +116,6 @@ func New(modelPath string, params ml.BackendParams) (Model, error) {
 	base := Base{b: b, config: m.Config()}
 	v := reflect.ValueOf(m)
 	v.Elem().Set(populateFields(base, v.Elem()))
-
-	if validator, ok := m.(Validator); ok {
-		if err := validator.Validate(); err != nil {
-			return nil, err
-		}
-	}
-
 	return m, nil
 }

--- a/model/models/glm4moelite/model.go
+++ b/model/models/glm4moelite/model.go
@@ -1,7 +1,6 @@
 package glm4moelite

 import (
-	"errors"
 	"math"

 	"github.com/ollama/ollama/fs"
@@ -12,8 +11,6 @@ import (
 	"github.com/ollama/ollama/model/input"
 )

-var ErrOldModelFormat = errors.New("this model uses a weight format that is no longer supported; please re-download it")
-
 type Options struct {
 	numExpertsUsed      int
 	numExperts          int
@@ -50,9 +47,7 @@ type Attention struct {

 	KVA     *nn.Linear  `gguf:"attn_kv_a_mqa"`
 	KVANorm *nn.RMSNorm `gguf:"attn_kv_a_norm"`
-
-	KB *nn.Linear `gguf:"attn_k_b"`
-	VB *nn.Linear `gguf:"attn_v_b"`
+	KVB     *nn.Linear  `gguf:"attn_kv_b"`

 	Output *nn.Linear `gguf:"attn_out,alt:attn_output"`
 }
@@ -83,16 +78,15 @@ func (attn *Attention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor
 	qRot := opts.applyRotaryPositionEmbeddings(ctx, queryChunks[1], positions)
 	kRot = opts.applyRotaryPositionEmbeddings(ctx, kRot, positions)
 	kPass = attn.KVANorm.Forward(ctx, kPass, opts.eps)
+	kPass = attn.KVB.Forward(ctx, kPass)

-	// MLA absorption: absorb K projection into query
-	qPass := queryChunks[0].Permute(ctx, 0, 2, 1, 3)
-	qPassAbsorb := attn.KB.Forward(ctx, qPass).Permute(ctx, 0, 2, 1, 3)
-	query = qRot.Concat(ctx, qPassAbsorb, 0)
+	kv := kPass.Reshape(ctx, kPass.Dim(0)/opts.numKVHeads, opts.numKVHeads, seqLength)
+	kvChunks := kv.ChunkSections(ctx, 0, opts.kqNopeHeadDim, opts.vHeadDim)

-	kPass = kPass.Reshape(ctx, opts.kvLoraRank, 1, seqLength)
-	key := kRot.Concat(ctx, kPass, 0)
-
-	attention := nn.AttentionWithVMLA(ctx, query, key, kPass, nil, attn.VB.Weight, opts.kqScale, cache)
+	kRot = kRot.Repeat(ctx, 1, queryChunks[0].Dim(1))
+	query = qRot.Concat(ctx, queryChunks[0], 0)
+	key := kRot.Concat(ctx, kvChunks[0], 0)
+	attention := nn.Attention(ctx, query, key, kvChunks[1], opts.kqScale, cache)

 	attention = attention.Reshape(ctx, attention.Dim(0)*attention.Dim(1), seqLength)
 	return attn.Output.Forward(ctx, attention)
@@ -223,6 +217,7 @@ func New(c fs.Config) (model.Model, error) {

 	keyLength := int(c.Uint("attention.key_length"))
 	valueLength := int(c.Uint("attention.value_length"))
+
 	kqScale := 1.0 / math.Sqrt(float64(keyLength))

 	var pre []string
@@ -241,7 +236,7 @@ func New(c fs.Config) (model.Model, error) {
 				Values: c.Strings("tokenizer.ggml.tokens"),
 				Types:  c.Ints("tokenizer.ggml.token_type"),
 				Merges: c.Strings("tokenizer.ggml.merges"),
-				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", false),
+				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
 				BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
 				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
 				EOS: append(
@@ -284,15 +279,6 @@ func (m Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor
 	return m.applyRotaryPositionEmbeddings(ctx, key, shift), nil
 }

-func (m *Model) Validate() error {
-	for _, layer := range m.Layers {
-		if layer.Attention != nil && (layer.Attention.KB == nil || layer.Attention.VB == nil) {
-			return ErrOldModelFormat
-		}
-	}
-	return nil
-}
-
 func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
 	positions := ctx.Input().FromInts(batch.Positions, len(batch.Positions))

--- a/model/models/glm4moelite/model_test.go
+++ b/model/models/glm4moelite/model_test.go
@@ -1,73 +0,0 @@
-package glm4moelite
-
-import (
-	"testing"
-
-	"github.com/ollama/ollama/ml/nn"
-)
-
-func TestValidate(t *testing.T) {
-	tests := []struct {
-		name    string
-		model   *Model
-		wantErr bool
-	}{
-		{
-			name: "valid model with KB and VB",
-			model: &Model{
-				Layers: []Layer{
-					{Attention: &Attention{KB: &nn.Linear{}, VB: &nn.Linear{}}},
-				},
-			},
-			wantErr: false,
-		},
-		{
-			name: "missing KB",
-			model: &Model{
-				Layers: []Layer{
-					{Attention: &Attention{VB: &nn.Linear{}}},
-				},
-			},
-			wantErr: true,
-		},
-		{
-			name: "missing VB",
-			model: &Model{
-				Layers: []Layer{
-					{Attention: &Attention{KB: &nn.Linear{}}},
-				},
-			},
-			wantErr: true,
-		},
-		{
-			name: "missing both KB and VB",
-			model: &Model{
-				Layers: []Layer{
-					{Attention: &Attention{}},
-				},
-			},
-			wantErr: true,
-		},
-		{
-			name: "nil Attention is ok",
-			model: &Model{
-				Layers: []Layer{
-					{Attention: nil},
-				},
-			},
-			wantErr: false,
-		},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			err := tt.model.Validate()
-			if (err != nil) != tt.wantErr {
-				t.Errorf("Validate() error = %v, wantErr %v", err, tt.wantErr)
-			}
-			if tt.wantErr && err != ErrOldModelFormat {
-				t.Errorf("Validate() error = %v, want %v", err, ErrOldModelFormat)
-			}
-		})
-	}
-}
--- a/model/models/glmocr/imageprocessor.go
+++ b/model/models/glmocr/imageprocessor.go
@@ -1,171 +0,0 @@
-package glmocr
-
-import (
-	"image"
-	"math"
-
-	"github.com/ollama/ollama/fs"
-	"github.com/ollama/ollama/model/imageproc"
-)
-
-type ImageProcessor struct {
-	imageSize         int
-	patchSize         int
-	temporalPatchSize int
-	spatialMergeSize  int
-	minPixels         int
-	maxPixels         int
-	factor            int
-	imageMean         [3]float32
-	imageStd          [3]float32
-}
-
-func newImageProcessor(c fs.Config) ImageProcessor {
-	patchSize := int(c.Uint("vision.patch_size", 14))
-	spatialMergeSize := int(c.Uint("vision.spatial_merge_size", 2))
-	temporalPatchSize := int(c.Uint("vision.temporal_patch_size", 2))
-
-	// Read normalization values from config if available, otherwise use CLIP defaults
-	imageMean := c.Floats("vision.image_mean", imageproc.ClipDefaultMean[:])
-	imageStd := c.Floats("vision.image_std", imageproc.ClipDefaultSTD[:])
-
-	// Default max_pixels: 2048 * patchSize² * mergeSize² * temporal = ~3.2M pixels
-	// This limits to ~16k patches (4k output tokens) to keep memory stable without flash attention
-	defaultMaxPixels := 2048 * patchSize * patchSize * spatialMergeSize * spatialMergeSize * temporalPatchSize
-
-	return ImageProcessor{
-		imageSize:         int(c.Uint("vision.image_size", 336)),
-		patchSize:         patchSize,
-		temporalPatchSize: temporalPatchSize,
-		spatialMergeSize:  spatialMergeSize,
-		minPixels:         int(c.Uint("vision.min_pixels", uint32(8*patchSize*patchSize*spatialMergeSize*spatialMergeSize*temporalPatchSize))),
-		maxPixels:         int(c.Uint("vision.max_pixels", uint32(defaultMaxPixels))),
-		factor:            patchSize * spatialMergeSize,
-		imageMean:         [3]float32{imageMean[0], imageMean[1], imageMean[2]},
-		imageStd:          [3]float32{imageStd[0], imageStd[1], imageStd[2]},
-	}
-}
-
-func (p *ImageProcessor) SmartResize(height, width int) (int, int) {
-	factor := p.factor
-	temporalFactor := p.temporalPatchSize
-	numFrames := temporalFactor // single image
-
-	if height < factor || width < factor {
-		// Scale up small images
-		scale := float64(factor) / float64(min(height, width))
-		height = int(math.Ceil(float64(height) * scale))
-		width = int(math.Ceil(float64(width) * scale))
-	}
-
-	if temporalFactor <= 0 {
-		panic("temporal_patch_size must be > 0")
-	}
-	if numFrames < temporalFactor {
-		panic("num_frames must be >= temporal_patch_size")
-	}
-	if aspectRatio := float64(max(height, width)) / float64(min(height, width)); aspectRatio > 200 {
-		panic("absolute aspect ratio must be smaller than 200")
-	}
-
-	round := func(x float64) int { return int(math.RoundToEven(x)) }
-
-	hBar := round(float64(height)/float64(factor)) * factor
-	wBar := round(float64(width)/float64(factor)) * factor
-	tBar := round(float64(numFrames)/float64(temporalFactor)) * temporalFactor
-
-	if tBar*hBar*wBar > p.maxPixels {
-		beta := math.Sqrt(float64(numFrames*height*width) / float64(p.maxPixels))
-		hBar = int(math.Floor(float64(height)/beta/float64(factor))) * factor
-		wBar = int(math.Floor(float64(width)/beta/float64(factor))) * factor
-	} else if tBar*hBar*wBar < p.minPixels {
-		beta := math.Sqrt(float64(p.minPixels) / float64(numFrames*height*width))
-		hBar = int(math.Ceil(float64(height)*beta/float64(factor))) * factor
-		wBar = int(math.Ceil(float64(width)*beta/float64(factor))) * factor
-	}
-
-	return hBar, wBar
-}
-
-func (p *ImageProcessor) ProcessImage(img image.Image) ([]float32, *Grid, error) {
-	img = imageproc.Composite(img)
-
-	origWidth := img.Bounds().Dx()
-	origHeight := img.Bounds().Dy()
-
-	// Calculate smart resize dimensions
-	resizedHeight, resizedWidth := p.SmartResize(origHeight, origWidth)
-
-	// Resize image
-	resizedImg := imageproc.Resize(img, image.Point{X: resizedWidth, Y: resizedHeight}, imageproc.ResizeBicubic)
-
-	// Normalize pixels - output format is [C, H, W] with rescale and channelFirst
-	// We keep [C, H, W] for patch extraction
-	normalizedPixels := imageproc.Normalize(resizedImg, p.imageMean, p.imageStd, true, true)
-
-	// Calculate grid dimensions (after Conv2D patching)
-	grid := &Grid{
-		Height:      resizedHeight / p.patchSize,
-		Width:       resizedWidth / p.patchSize,
-		Temporal:    1, // Single image
-		ImageHeight: resizedHeight,
-		ImageWidth:  resizedWidth,
-	}
-
-	patches, err := p.createPatches(normalizedPixels, resizedHeight, resizedWidth, grid)
-	if err != nil {
-		return nil, nil, err
-	}
-
-	return patches, grid, nil
-}
-
-func (p *ImageProcessor) createPatches(pixels []float32, height, width int, grid *Grid) ([]float32, error) {
-	channels := 3
-	patchSize := p.patchSize
-	mergeSize := p.spatialMergeSize
-	temporalPatchSize := p.temporalPatchSize
-
-	numPatches := grid.Temporal * grid.Height * grid.Width
-	patchDim := channels * temporalPatchSize * patchSize * patchSize
-	result := make([]float32, numPatches*patchDim)
-	patchIndex := 0
-
-	// Single temporal frame handling (copies to all frames)
-	for range grid.Temporal {
-		for h := 0; h < grid.Height; h += mergeSize {
-			for w := 0; w < grid.Width; w += mergeSize {
-				for mh := range mergeSize {
-					for mw := range mergeSize {
-						baseOffset := patchIndex * patchDim
-						for c := range channels {
-							channelOffset := baseOffset + (c * temporalPatchSize * patchSize * patchSize)
-							for py := range patchSize {
-								for px := range patchSize {
-									y := (h+mh)*patchSize + py
-									x := (w+mw)*patchSize + px
-									srcIdx := c*height*width + y*width + x
-									dstIdx := channelOffset + (py * patchSize) + px
-									result[dstIdx] = pixels[srcIdx]
-								}
-							}
-
-							if temporalPatchSize > 1 {
-								frameSize := patchSize * patchSize
-								for tp := 1; tp < temporalPatchSize; tp++ {
-									currentFrameOffset := channelOffset + (tp * frameSize)
-									copy(result[currentFrameOffset:currentFrameOffset+frameSize],
-										result[channelOffset:channelOffset+frameSize])
-								}
-							}
-						}
-
-						patchIndex++
-					}
-				}
-			}
-		}
-	}
-
-	return result, nil
-}
--- a/model/models/glmocr/model.go
+++ b/model/models/glmocr/model.go
@@ -1,235 +0,0 @@
-package glmocr
-
-import (
-	"bytes"
-	"errors"
-	"image"
-	"slices"
-
-	"github.com/ollama/ollama/fs"
-	"github.com/ollama/ollama/kvcache"
-	"github.com/ollama/ollama/ml"
-	"github.com/ollama/ollama/model"
-	"github.com/ollama/ollama/model/input"
-)
-
-type Model struct {
-	model.Base
-	model.BytePairEncoding
-
-	*TextModel
-	*VisionModel     `gguf:"v"`
-	VisionDownsample *VisionDownsample `gguf:"mm.patch_merger"`
-	PatchMerger      *PatchMerger      `gguf:"mm"`
-
-	ImageProcessor
-
-	imageTokenID      int32
-	imageStartTokenID int32
-	imageEndTokenID   int32
-}
-
-var _ model.MultimodalProcessor = (*Model)(nil)
-
-func New(c fs.Config) (model.Model, error) {
-	eosTokenID := int32(c.Uint("tokenizer.ggml.eos_token_id"))
-	eosTokenIDs := c.Ints("tokenizer.ggml.eos_token_ids")
-	allEOS := append([]int32{eosTokenID}, eosTokenIDs...)
-
-	m := &Model{
-		BytePairEncoding: model.NewBytePairEncoding(
-			&model.Vocabulary{
-				Values: c.Strings("tokenizer.ggml.tokens"),
-				Types:  c.Ints("tokenizer.ggml.token_type"),
-				Merges: c.Strings("tokenizer.ggml.merges"),
-				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", false),
-				BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
-				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
-				EOS:    allEOS,
-			},
-			`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
-		),
-		TextModel:         newTextModel(c),
-		VisionModel:       newVisionModel(c),
-		ImageProcessor:    newImageProcessor(c),
-		imageTokenID:      int32(c.Uint("image_token_id", 59280)),
-		imageStartTokenID: int32(c.Uint("image_start_token_id", 59256)),
-		imageEndTokenID:   int32(c.Uint("image_end_token_id", 59257)),
-	}
-
-	m.Cache = kvcache.NewCausalCache(m.TextModel.Shift)
-
-	return m, nil
-}
-
-func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
-	if len(m.VisionModel.Blocks) == 0 {
-		return nil, model.ErrNoVisionModel
-	}
-
-	img, _, err := image.Decode(bytes.NewReader(multimodalData))
-	if err != nil {
-		return nil, err
-	}
-
-	f32s, grid, err := m.ImageProcessor.ProcessImage(img)
-	if err != nil {
-		return nil, err
-	}
-
-	// Create pixel values tensor from flattened patches
-	// Shape: [patchDim, numPatches]
-	patchDim := m.VisionModel.numChannels * m.temporalPatchSize * m.patchSize * m.patchSize
-	numPatches := grid.Temporal * grid.Height * grid.Width
-	pixelValues := ctx.Input().FromFloats(f32s, patchDim, numPatches)
-
-	// Forward through vision encoder
-	visionOutputs := m.VisionModel.Forward(ctx, pixelValues, grid)
-
-	// Forward through downsample (patch merger)
-	if m.VisionDownsample == nil || m.VisionDownsample.Weight == nil {
-		return nil, errors.New("glmocr: missing vision downsample weights")
-	}
-	visionOutputs = m.VisionDownsample.Forward(ctx, visionOutputs, grid, m.VisionModel.VisionModelOptions)
-
-	// Forward through patch merger (FC + LayerNorm + GELU + SwiGLU FFN)
-	if m.PatchMerger == nil {
-		return nil, errors.New("glmocr: missing patch merger weights")
-	}
-	visionOutputs = m.PatchMerger.Forward(ctx, visionOutputs, m.VisionModel.VisionModelOptions)
-
-	return []input.Multimodal{{Tensor: visionOutputs, Data: grid}}, nil
-}
-
-func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
-	var result []*input.Input
-
-	// Reset position cache
-	m.TextModel.positionCache = m.TextModel.positionCache[:0]
-	m.TextModel.ropeDelta = 0
-
-	pos := int32(0)
-	for _, inp := range inputs {
-		if inp.Multimodal == nil {
-			result = append(result, inp)
-			m.TextModel.positionCache = append(m.TextModel.positionCache, pos)
-			pos++
-			continue
-		}
-
-		// Get grid info for position calculation
-		grid := inp.Multimodal[0].Data.(*Grid)
-		mergedH := grid.Height / m.VisionModel.spatialMergeSize
-		mergedW := grid.Width / m.VisionModel.spatialMergeSize
-
-		// Add image start token
-		result = append(result, &input.Input{Token: m.imageStartTokenID})
-		m.TextModel.positionCache = append(m.TextModel.positionCache, pos)
-		pos++
-
-		// Add image tokens with multimodal data
-		// All image tokens share the same base position for temporal dimension
-		tokensPerGrid := inp.Multimodal[0].Tensor.Dim(1)
-		basePos := pos
-		sameBatch := tokensPerGrid - 1
-		if sameBatch < 0 {
-			sameBatch = 0
-		}
-		result = append(result, &input.Input{
-			Token:          m.imageTokenID,
-			Multimodal:     inp.Multimodal,
-			MultimodalHash: inp.MultimodalHash,
-			SameBatch:      sameBatch,
-		})
-		m.TextModel.positionCache = append(m.TextModel.positionCache, basePos)
-
-		// Add placeholder tokens for remaining positions
-		// All image tokens use the same base position (temporal stays constant)
-		for range tokensPerGrid - 1 {
-			result = append(result, &input.Input{Token: m.imageTokenID})
-			m.TextModel.positionCache = append(m.TextModel.positionCache, basePos)
-		}
-
-		// Advance position by max(mergedH, mergedW) after image tokens
-		pos = basePos + int32(max(mergedH, mergedW))
-
-		// Add image end token
-		result = append(result, &input.Input{Token: m.imageEndTokenID})
-		m.TextModel.positionCache = append(m.TextModel.positionCache, pos)
-		pos++
-	}
-
-	// Compute rope delta for continuation after the prefill segment:
-	// delta = (max_position_id + 1) - sequence_length
-	if len(m.TextModel.positionCache) > 0 {
-		last := m.TextModel.positionCache[len(m.TextModel.positionCache)-1]
-		m.TextModel.ropeDelta = last + 1 - int32(len(m.TextModel.positionCache))
-	}
-
-	return result, nil
-}
-
-func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
-	// Initial token embedding
-	hiddenStates := m.TokenEmbedding.Forward(ctx, batch.Inputs).Duplicate(ctx)
-	ctx.Forward(hiddenStates)
-
-	// Build position slices for M-RoPE
-	positionSlice := func() [][]int32 {
-		s := [][]int32{
-			make([]int32, len(batch.Positions)), // temporal
-			make([]int32, len(batch.Positions)), // height
-			make([]int32, len(batch.Positions)), // width
-			make([]int32, len(batch.Positions)), // unused (zeros)
-		}
-		for i, position := range batch.Positions {
-			// Translate through position cache or continue sequence
-			if position < int32(len(m.TextModel.positionCache)) {
-				position = m.TextModel.positionCache[position]
-			} else if len(m.TextModel.positionCache) > 0 {
-				// Continue sequence after cached positions using ropeDelta
-				position = position + m.TextModel.ropeDelta
-			}
-
-			s[0][i] = position
-			s[1][i] = position
-			s[2][i] = position
-		}
-		return s
-	}()
-
-	// Inject vision embeddings and adjust positions for image tokens
-	for _, mi := range batch.Multimodal {
-		img := mi.Multimodal[0].Tensor
-		ctx.Forward(img.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), img.Dim(0)*img.Dim(1))))
-
-		if grid, ok := mi.Multimodal[0].Data.(*Grid); ok {
-			w := grid.Width / m.VisionModel.spatialMergeSize
-			for i := range img.Dim(1) {
-				positionSlice[1][mi.Index+i] += int32(i / w)
-				positionSlice[2][mi.Index+i] += int32(i % w)
-			}
-		}
-	}
-
-	positions := ctx.Input().FromInts(slices.Concat(positionSlice...), len(positionSlice[0])*len(positionSlice))
-
-	// Process through transformer layers
-	for i, layer := range m.TextModel.Layers {
-		m.Cache.SetLayer(i)
-
-		var lastLayerOutputs ml.Tensor
-		if i == len(m.TextModel.Layers)-1 {
-			lastLayerOutputs = batch.Outputs
-		}
-
-		hiddenStates = layer.Forward(ctx, hiddenStates, positions, lastLayerOutputs, m.Cache, m.TextModel.TextModelOptions)
-	}
-
-	hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, m.TextModel.eps)
-	return m.Output.Forward(ctx, hiddenStates), nil
-}
-
-func init() {
-	model.Register("glmocr", New)
-}
--- a/model/models/glmocr/model_text.go
+++ b/model/models/glmocr/model_text.go
@@ -1,180 +0,0 @@
-package glmocr
-
-import (
-	"math"
-
-	"github.com/ollama/ollama/fs"
-	"github.com/ollama/ollama/kvcache"
-	"github.com/ollama/ollama/ml"
-	"github.com/ollama/ollama/ml/nn"
-	"github.com/ollama/ollama/ml/nn/rope"
-)
-
-type TextModelOptions struct {
-	hiddenSize       int
-	numHeads         int
-	numKVHeads       int
-	headDim          int
-	rotaryDim        int
-	intermediateSize int
-	eps              float32
-	ropeBase         float32
-	mropeSections    []int
-}
-
-func (o *TextModelOptions) applyMRoPE(ctx ml.Context, states, positions ml.Tensor) ml.Tensor {
-	// GLM4 uses standard M-RoPE (not interleaved like Qwen3VL)
-	// With 4 sections for [temporal, height, width, unused]
-	return nn.RoPE(ctx, states, positions, o.rotaryDim, o.ropeBase, 1.0, rope.WithMRoPE(o.mropeSections))
-}
-
-type TextSelfAttention struct {
-	Query  *nn.Linear `gguf:"attn_q"`
-	Key    *nn.Linear `gguf:"attn_k"`
-	Value  *nn.Linear `gguf:"attn_v"`
-	Output *nn.Linear `gguf:"attn_out"`
-}
-
-func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, cache kvcache.Cache, opts *TextModelOptions) ml.Tensor {
-	batchSize := hiddenStates.Dim(1)
-
-	// Separate Q, K, V projections
-	q := sa.Query.Forward(ctx, hiddenStates)
-	k := sa.Key.Forward(ctx, hiddenStates)
-	v := sa.Value.Forward(ctx, hiddenStates)
-
-	// Reshape for GQA
-	q = q.Reshape(ctx, opts.headDim, opts.numHeads, batchSize)
-	k = k.Reshape(ctx, opts.headDim, opts.numKVHeads, batchSize)
-	v = v.Reshape(ctx, opts.headDim, opts.numKVHeads, batchSize)
-
-	// Apply M-RoPE (multi-resolution rotary position embeddings)
-	q = opts.applyMRoPE(ctx, q, positions)
-	k = opts.applyMRoPE(ctx, k, positions)
-
-	// Scaled dot-product attention with KV cache
-	scaleFactor := 1.0 / math.Sqrt(float64(opts.headDim))
-	kqv := nn.Attention(ctx, q, k, v, scaleFactor, cache)
-	// Reshape attention output: [headDim, numHeads, batchSize] -> [numHeads*headDim, batchSize]
-	// Note: numHeads * headDim = 16 * 128 = 2048, which is the attention hidden size
-	kqv = kqv.Reshape(ctx, opts.numHeads*opts.headDim, batchSize)
-
-	return sa.Output.Forward(ctx, kqv)
-}
-
-type TextMLP struct {
-	Gate *nn.Linear `gguf:"ffn_gate"`
-	Up   *nn.Linear `gguf:"ffn_up"`
-	Down *nn.Linear `gguf:"ffn_down"`
-}
-
-func (mlp *TextMLP) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *TextModelOptions) ml.Tensor {
-	// SwiGLU: down(silu(gate(x)) * up(x))
-	gate := mlp.Gate.Forward(ctx, hiddenStates).SILU(ctx, mlp.Up.Forward(ctx, hiddenStates))
-	return mlp.Down.Forward(ctx, gate)
-}
-
-type TextDecoderLayer struct {
-	// Input layernorm (before attention)
-	AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
-	SelfAttention *TextSelfAttention
-	// Post self-attention layernorm (after attention, before residual add)
-	PostAttnNorm *nn.RMSNorm `gguf:"post_attn_norm"`
-
-	// FFN input layernorm (after first residual, before MLP)
-	FFNNorm *nn.RMSNorm `gguf:"ffn_norm"`
-	MLP     *TextMLP
-	// Post MLP layernorm (after MLP, before residual add)
-	PostFFNNorm *nn.RMSNorm `gguf:"post_ffn_norm"`
-}
-
-func (l *TextDecoderLayer) Forward(ctx ml.Context, hiddenStates, positions, outputs ml.Tensor, cache kvcache.Cache, opts *TextModelOptions) ml.Tensor {
-	// Attention block
-	residual := hiddenStates
-	hiddenStates = l.AttentionNorm.Forward(ctx, hiddenStates, opts.eps)
-	hiddenStates = l.SelfAttention.Forward(ctx, hiddenStates, positions, cache, opts)
-	hiddenStates = l.PostAttnNorm.Forward(ctx, hiddenStates, opts.eps)
-
-	// Prune to output positions in final layer
-	if outputs != nil {
-		hiddenStates = hiddenStates.Rows(ctx, outputs)
-		residual = residual.Rows(ctx, outputs)
-	}
-
-	hiddenStates = hiddenStates.Add(ctx, residual)
-
-	// MLP block
-	residual = hiddenStates
-	hiddenStates = l.FFNNorm.Forward(ctx, hiddenStates, opts.eps)
-	hiddenStates = l.MLP.Forward(ctx, hiddenStates, opts)
-	hiddenStates = l.PostFFNNorm.Forward(ctx, hiddenStates, opts.eps)
-	hiddenStates = hiddenStates.Add(ctx, residual)
-
-	return hiddenStates
-}
-
-type TextModel struct {
-	TokenEmbedding *nn.Embedding      `gguf:"token_embd"`
-	Layers         []TextDecoderLayer `gguf:"blk"`
-	OutputNorm     *nn.RMSNorm        `gguf:"output_norm"`
-	Output         *nn.Linear         `gguf:"output,alt:token_embd"`
-
-	*TextModelOptions
-
-	// positionCache stores the M-RoPE position for each token in the sequence.
-	// This is needed because image tokens share the same base position but have
-	// different height/width offsets, and the end token position depends on the
-	// image grid dimensions.
-	positionCache []int32
-	ropeDelta     int32
-}
-
-func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
-	// Clear position cache when KV cache shifts
-	m.positionCache = nil
-	m.ropeDelta = 0
-	return m.applyMRoPE(ctx, key, shift), nil
-}
-
-func newTextModel(c fs.Config) *TextModel {
-	hiddenSize := int(c.Uint("embedding_length", 1536))
-	numHeads := int(c.Uint("attention.head_count", 16))
-	numKVHeads := int(c.Uint("attention.head_count_kv", 8))
-	intermediateSize := int(c.Uint("feed_forward_length", 4608))
-	eps := c.Float("attention.layer_norm_rms_epsilon", 1e-5)
-	ropeBase := c.Float("rope.freq_base", 10000)
-
-	headDim := int(c.Uint("attention.key_length", uint32(hiddenSize/numHeads)))
-
-	mropeSections := c.Ints("rope.mrope_section")
-	var sectionInts []int
-
-	if len(mropeSections) > 0 {
-		sectionInts = make([]int, len(mropeSections))
-		for i, section := range mropeSections {
-			sectionInts[i] = int(section)
-		}
-	} else {
-		// Default: 3 sections like GLM-OCR
-		sectionInts = []int{16, 24, 24}
-	}
-
-	// rotaryDim = headDim (128) to rotate all dimensions
-	// GGML rope_multi: sector = (dim_pair) % sum(sections), mapping each pair to its position dim
-	rotaryDim := headDim
-
-	return &TextModel{
-		Layers: make([]TextDecoderLayer, c.Uint("block_count", 16)),
-		TextModelOptions: &TextModelOptions{
-			hiddenSize:       hiddenSize,
-			numHeads:         numHeads,
-			numKVHeads:       numKVHeads,
-			headDim:          headDim,
-			rotaryDim:        rotaryDim,
-			intermediateSize: intermediateSize,
-			eps:              eps,
-			ropeBase:         ropeBase,
-			mropeSections:    sectionInts,
-		},
-	}
-}
--- a/model/models/glmocr/model_vision.go
+++ b/model/models/glmocr/model_vision.go
@@ -1,348 +0,0 @@
-package glmocr
-
-import (
-	"log/slog"
-	"math"
-	"slices"
-
-	"github.com/ollama/ollama/fs"
-	"github.com/ollama/ollama/ml"
-	"github.com/ollama/ollama/ml/nn"
-	"github.com/ollama/ollama/ml/nn/rope"
-)
-
-type Grid struct {
-	Height      int // Number of patches in height direction
-	Width       int // Number of patches in width direction
-	Temporal    int
-	ImageHeight int // Full image height in pixels
-	ImageWidth  int // Full image width in pixels
-}
-
-type VisionModelOptions struct {
-	hiddenSize        int
-	numHeads          int
-	headDim           int
-	numChannels       int
-	patchSize         int
-	temporalPatchSize int
-	imageSize         int
-	spatialMergeSize  int
-	outHiddenSize     int
-	intermediateSize  int
-	eps               float32
-}
-
-type VisionPatchEmbed struct {
-	Proj  *nn.Conv2D `gguf:"patch_embd_0"`
-	Proj1 *nn.Conv2D `gguf:"patch_embd_1"`
-	Bias  ml.Tensor  `gguf:"patch_embd.bias"`
-}
-
-func (pe *VisionPatchEmbed) Forward(ctx ml.Context, pixelValues ml.Tensor, grid *Grid, opts *VisionModelOptions) ml.Tensor {
-	_ = grid // patches are already in merge-block order
-
-	// pixelValues shape: [patchDim, numPatches]
-	numPatches := pixelValues.Shape()[1]
-
-	// Reshape to [patchSize*patchSize, temporalPatchSize, numChannels, numPatches]
-	pixelValues = pixelValues.Reshape(ctx, opts.patchSize*opts.patchSize, opts.temporalPatchSize, opts.numChannels, numPatches)
-	// Permute to [temporalPatchSize, patchSize*patchSize, numChannels, numPatches]
-	pixelValues = pixelValues.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
-
-	// Slice temporal frames for Conv2D (simulate Conv3D)
-	in0 := pixelValues.View(ctx, 0, 1, pixelValues.Stride(1), pixelValues.Dim(1), pixelValues.Stride(2), pixelValues.Dim(2), pixelValues.Stride(3), pixelValues.Dim(3)).Contiguous(ctx)
-	in0 = in0.Reshape(ctx, opts.patchSize, opts.patchSize, opts.numChannels, numPatches)
-
-	s0, s1 := opts.patchSize, opts.patchSize
-	p0, p1 := 0, 0
-	d0, d1 := 1, 1
-	hiddenStates := pe.Proj.Forward(ctx, in0, s0, s1, p0, p1, d0, d1)
-
-	if pe.Proj1 != nil && opts.temporalPatchSize > 1 {
-		in1 := pixelValues.View(ctx, pixelValues.Stride(0), 1, pixelValues.Stride(1), pixelValues.Dim(1), pixelValues.Stride(2), pixelValues.Dim(2), pixelValues.Stride(3), pixelValues.Dim(3)).Contiguous(ctx)
-		in1 = in1.Reshape(ctx, opts.patchSize, opts.patchSize, opts.numChannels, numPatches)
-		out1 := pe.Proj1.Forward(ctx, in1, s0, s1, p0, p1, d0, d1)
-		hiddenStates = hiddenStates.Add(ctx, out1)
-	}
-
-	// Flatten to [hidden_size, num_patches]
-	hiddenStates = hiddenStates.Reshape(ctx, opts.hiddenSize, numPatches)
-
-	// Add patch bias - reshape from [hidden_size] to [hidden_size, 1] for broadcasting
-	if pe.Bias != nil {
-		hiddenStates = hiddenStates.Add(ctx, pe.Bias.Reshape(ctx, opts.hiddenSize, 1))
-	}
-
-	return hiddenStates
-}
-
-type VisionSelfAttention struct {
-	QKV    *nn.Linear  `gguf:"attn_qkv"`
-	QNorm  *nn.RMSNorm `gguf:"attn_q_norm"`
-	KNorm  *nn.RMSNorm `gguf:"attn_k_norm"`
-	Output *nn.Linear  `gguf:"attn_out"`
-}
-
-func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, opts *VisionModelOptions) ml.Tensor {
-	batchSize := hiddenStates.Dim(1)
-
-	// Combined QKV projection: [3*hidden_size, batch_size]
-	qkv := sa.QKV.Forward(ctx, hiddenStates)
-
-	// Split using ChunkSections along dim 0 (handles byte offsets correctly)
-	// ChunkSections returns views - must make contiguous before further operations
-	chunks := qkv.ChunkSections(ctx, 0, opts.hiddenSize, opts.hiddenSize, opts.hiddenSize)
-	q := chunks[0].Contiguous(ctx)
-	k := chunks[1].Contiguous(ctx)
-	v := chunks[2].Contiguous(ctx)
-
-	// Reshape for multi-head attention: [hiddenSize, N] -> [headDim, numHeads, N]
-	q = q.Reshape(ctx, opts.headDim, opts.numHeads, batchSize)
-	k = k.Reshape(ctx, opts.headDim, opts.numHeads, batchSize)
-	v = v.Reshape(ctx, opts.headDim, opts.numHeads, batchSize)
-
-	// Apply Q-norm and K-norm after head reshape
-	// Weights are [headDim]=64, tensor is [headDim, numHeads, N]
-	q = sa.QNorm.Forward(ctx, q, opts.eps)
-	k = sa.KNorm.Forward(ctx, k, opts.eps)
-
-	// Apply rotary position embeddings with vision-style 2D positions
-	// Each section of headDim/4 pairs is assigned to one position dimension
-	// Positions are [height, width, height, width] repeated for rotation
-	ropeFreqBase := float32(10000.0)
-	sections := []int{opts.headDim / 4, opts.headDim / 4, opts.headDim / 4, opts.headDim / 4}
-	q = nn.RoPE(ctx, q, positions, opts.headDim/2, ropeFreqBase, 1.0, rope.WithVision(sections))
-	k = nn.RoPE(ctx, k, positions, opts.headDim/2, ropeFreqBase, 1.0, rope.WithVision(sections))
-
-	// Scale factor for scaled dot-product attention
-	scale := 1.0 / math.Sqrt(float64(opts.headDim))
-
-	// Try flash attention first (ScaledDotProductAttention), fall back to manual
-	if sdpa, ok := q.(ml.ScaledDotProductAttention); ok {
-		attention := sdpa.ScaledDotProductAttention(ctx, k, v, nil, nil, nil, scale, false)
-		attention = attention.Reshape(ctx, opts.hiddenSize, batchSize)
-		return sa.Output.Forward(ctx, attention)
-	}
-
-	slog.Warn("glmocr: vision attention falling back to manual attention",
-		"batchSize", batchSize, "numHeads", opts.numHeads,
-		"hint", "set OLLAMA_FLASH_ATTENTION=1 to enable flash attention")
-
-	// Manual attention fallback
-	// q, k, v are [headDim, numHeads, batchSize] - GGML treats as 4D with implicit dim 3 = 1
-	q = q.Permute(ctx, 0, 2, 1, 3)
-	k = k.Permute(ctx, 0, 2, 1, 3)
-	v = v.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
-
-	// Attention scores
-	kq := k.MulmatFullPrec(ctx, q)
-	kq = kq.Scale(ctx, scale)
-	kq = kq.Softmax(ctx)
-
-	// Attention output: v @ kq (note: v first)
-	kqv := v.Mulmat(ctx, kq)
-	attention := kqv.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
-	attention = attention.Reshape(ctx, opts.hiddenSize, batchSize)
-
-	return sa.Output.Forward(ctx, attention)
-}
-
-type VisionMLP struct {
-	Gate *nn.Linear `gguf:"ffn_gate"`
-	Up   *nn.Linear `gguf:"ffn_up"`
-	Down *nn.Linear `gguf:"ffn_down"`
-}
-
-func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenStates ml.Tensor) ml.Tensor {
-	// SwiGLU: down(silu(gate(x)) * up(x))
-	gate := mlp.Gate.Forward(ctx, hiddenStates).SILU(ctx, mlp.Up.Forward(ctx, hiddenStates))
-	return mlp.Down.Forward(ctx, gate)
-}
-
-type VisionBlock struct {
-	Norm1         *nn.RMSNorm `gguf:"ln1"`
-	SelfAttention *VisionSelfAttention
-	Norm2         *nn.RMSNorm `gguf:"ln2"`
-	MLP           *VisionMLP
-}
-
-func (b *VisionBlock) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, opts *VisionModelOptions) ml.Tensor {
-	// Pre-norm architecture
-	residual := hiddenStates
-	hiddenStates = b.Norm1.Forward(ctx, hiddenStates, opts.eps)
-	hiddenStates = b.SelfAttention.Forward(ctx, hiddenStates, positions, opts)
-	hiddenStates = hiddenStates.Add(ctx, residual)
-
-	residual = hiddenStates
-	hiddenStates = b.Norm2.Forward(ctx, hiddenStates, opts.eps)
-	hiddenStates = b.MLP.Forward(ctx, hiddenStates)
-	hiddenStates = hiddenStates.Add(ctx, residual)
-
-	return hiddenStates
-}
-
-type VisionDownsample struct {
-	*nn.Conv2D // Embedded to get mm.patch_merger.weight/bias directly
-}
-
-func (d *VisionDownsample) Forward(ctx ml.Context, hiddenStates ml.Tensor, grid *Grid, opts *VisionModelOptions) ml.Tensor {
-	// Apply spatial downsampling via Conv2D
-	// Input: [hidden_size, num_patches] where patches are in merge-block order
-
-	if d.Conv2D == nil || d.Weight == nil {
-		panic("VisionDownsample weights not loaded")
-	}
-
-	merge := opts.spatialMergeSize
-	numOutputTokens := (grid.Height / merge) * (grid.Width / merge)
-
-	// Step 1: Reshape to [hidden_size, merge, merge, num_output_tokens]
-	hiddenStates = hiddenStates.Reshape(ctx, opts.hiddenSize, merge, merge, numOutputTokens)
-
-	// Step 2: Permute to [merge, merge, hidden_size, num_output_tokens]
-	// ggml semantics: result.ne[perm[i]] = input.ne[i]
-	// So permute(2,0,1,3) on [1024,2,2,N] gives: ne[2]=1024, ne[0]=2, ne[1]=2, ne[3]=N -> [2,2,1024,N]
-	hiddenStates = hiddenStates.Permute(ctx, 2, 0, 1, 3).Contiguous(ctx)
-
-	// Step 3: Apply Conv2D without bias (bias added after reshape)
-	// Note: ggml_conv_2d takes (kernel, input) - kernel must be receiver in ollama
-	s0, s1 := merge, merge
-	p0, p1 := 0, 0
-	d0, d1 := 1, 1
-	hiddenStates = d.Weight.Conv2D(ctx, hiddenStates, s0, s1, p0, p1, d0, d1)
-
-	// Step 4: Reshape to [out_hidden_size, num_output_tokens]
-	hiddenStates = hiddenStates.Reshape(ctx, opts.outHiddenSize, numOutputTokens)
-
-	// Step 5: Add bias after reshape
-	// Reshape bias from [out_hidden_size] to [out_hidden_size, 1] for proper broadcasting
-	if d.Bias != nil {
-		hiddenStates = hiddenStates.Add(ctx, d.Bias.Reshape(ctx, opts.outHiddenSize, 1))
-	}
-
-	return hiddenStates
-}
-
-type PatchMerger struct {
-	// GGUF tags align with mm.* keys used by the model
-	Proj     *nn.Linear    `gguf:"model.fc"`  // mm.model.fc.weight
-	PostLN   *nn.LayerNorm `gguf:"post_norm"` // mm.post_norm.weight/bias
-	GateProj *nn.Linear    `gguf:"gate"`      // mm.gate.weight
-	UpProj   *nn.Linear    `gguf:"up"`        // mm.up.weight
-	DownProj *nn.Linear    `gguf:"down"`      // mm.down.weight
-}
-
-func (m *PatchMerger) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *VisionModelOptions) ml.Tensor {
-	// Linear projection
-	hiddenStates = m.Proj.Forward(ctx, hiddenStates)
-
-	// Post-projection layer norm + GELU ERF
-	hiddenStates = m.PostLN.Forward(ctx, hiddenStates, opts.eps)
-	hiddenStates = hiddenStates.GELU_ERF(ctx)
-	// Force a copy to avoid in-place mutation issues with GELU_ERF
-	hiddenStates = hiddenStates.Contiguous(ctx)
-
-	// SwiGLU MLP: down(silu(gate(x)) * up(x))
-	gateOut := m.GateProj.Forward(ctx, hiddenStates)
-	upOut := m.UpProj.Forward(ctx, hiddenStates)
-	gate := gateOut.SILU(ctx, upOut)
-	return m.DownProj.Forward(ctx, gate)
-}
-
-type VisionModel struct {
-	PatchEmbed *VisionPatchEmbed
-	Blocks     []VisionBlock `gguf:"blk"`
-	PostLN     *nn.RMSNorm   `gguf:"post_ln"`
-	// Note: Downsample is applied at the model level so mm.patch_merger stays separate
-
-	*VisionModelOptions
-}
-
-func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor, grid *Grid) ml.Tensor {
-	// Extract patch embeddings from flattened patches
-	hiddenStates := m.PatchEmbed.Forward(ctx, pixelValues, grid, m.VisionModelOptions)
-
-	// Create position IDs for RoPE (spatial grid)
-	// Patches are already in merge-block order from preprocessing
-	positions := m.createPositions(ctx, grid)
-
-	// Process through vision blocks
-	for _, block := range m.Blocks {
-		hiddenStates = block.Forward(ctx, hiddenStates, positions, m.VisionModelOptions)
-	}
-
-	// Post-layernorm
-	hiddenStates = m.PostLN.Forward(ctx, hiddenStates, m.eps)
-
-	// Note: Downsample is now applied separately in Model.EncodeMultimodal
-	// so mm.patch_merger remains a distinct module
-
-	return hiddenStates
-}
-
-func (m *VisionModel) createPositions(ctx ml.Context, grid *Grid) ml.Tensor {
-	// Create spatial position IDs for vision RoPE
-	// Position layout: [height, width, height, width] - 4 sections for mrope
-	// Patches are in MERGE-BLOCK order after VisionPatchEmbed interleaving
-	// This follows the GLM-OCR rot_pos_emb layout
-	numPatches := grid.Height * grid.Width
-	mergeRatio := m.spatialMergeSize
-
-	// Build position arrays in merge-block order
-	// Each merge_ratio x merge_ratio block of patches is grouped together
-	hpos := make([]int32, numPatches)
-	wpos := make([]int32, numPatches)
-	ptr := 0
-	for y := 0; y < grid.Height; y += mergeRatio {
-		for x := 0; x < grid.Width; x += mergeRatio {
-			for dy := range mergeRatio {
-				for dx := range mergeRatio {
-					hpos[ptr] = int32(y + dy)
-					wpos[ptr] = int32(x + dx)
-					ptr++
-				}
-			}
-		}
-	}
-
-	// Build position arrays for 4 sections (mrope)
-	s := [][]int32{
-		hpos,               // Section 0: height
-		wpos,               // Section 1: width
-		slices.Clone(hpos), // Section 2: height (repeated)
-		slices.Clone(wpos), // Section 3: width (repeated)
-	}
-
-	return ctx.Input().FromInts(slices.Concat(s...), numPatches*4)
-}
-
-func newVisionModel(c fs.Config) *VisionModel {
-	hiddenSize := int(c.Uint("vision.embedding_length", 1024))
-	numHeads := int(c.Uint("vision.attention.head_count", 16))
-	numChannels := int(c.Uint("vision.num_channels", 3))
-	patchSize := int(c.Uint("vision.patch_size", 14))
-	temporalPatchSize := int(c.Uint("vision.temporal_patch_size", 2))
-	imageSize := int(c.Uint("vision.image_size", 336))
-	spatialMergeSize := int(c.Uint("vision.spatial_merge_size", 2))
-	outHiddenSize := int(c.Uint("vision.out_hidden_size", 1536))
-	intermediateSize := int(c.Uint("vision.intermediate_size", 4096))
-	eps := c.Float("vision.attention.layer_norm_rms_epsilon", 1e-5)
-
-	return &VisionModel{
-		Blocks: make([]VisionBlock, c.Uint("vision.block_count", 24)),
-		VisionModelOptions: &VisionModelOptions{
-			hiddenSize:        hiddenSize,
-			numHeads:          numHeads,
-			headDim:           hiddenSize / numHeads,
-			numChannels:       numChannels,
-			patchSize:         patchSize,
-			temporalPatchSize: temporalPatchSize,
-			imageSize:         imageSize,
-			spatialMergeSize:  spatialMergeSize,
-			outHiddenSize:     outHiddenSize,
-			intermediateSize:  intermediateSize,
-			eps:               eps,
-		},
-	}
-}
--- a/model/models/models.go
+++ b/model/models/models.go
@@ -8,7 +8,6 @@ import (
 	_ "github.com/ollama/ollama/model/models/gemma3"
 	_ "github.com/ollama/ollama/model/models/gemma3n"
 	_ "github.com/ollama/ollama/model/models/glm4moelite"
-	_ "github.com/ollama/ollama/model/models/glmocr"
 	_ "github.com/ollama/ollama/model/models/gptoss"
 	_ "github.com/ollama/ollama/model/models/lfm2"
 	_ "github.com/ollama/ollama/model/models/llama"
--- a/model/parsers/glmocr.go
+++ b/model/parsers/glmocr.go
@@ -1,19 +0,0 @@
-package parsers
-
-import "github.com/ollama/ollama/api"
-
-type GlmOcrParser struct {
-	GLM47Parser
-}
-
-func (p *GlmOcrParser) HasThinkingSupport() bool {
-	return false
-}
-
-func (p *GlmOcrParser) Init(tools []api.Tool, lastMessage *api.Message, thinkValue *api.ThinkValue) []api.Tool {
-	p.tools = tools
-	if thinkValue != nil && thinkValue.Bool() {
-		p.state = glm46ParserState_CollectingThinking
-	}
-	return tools
-}
--- a/model/parsers/parsers.go
+++ b/model/parsers/parsers.go
@@ -70,8 +70,6 @@ func ParserForName(name string) Parser {
 		return &FunctionGemmaParser{}
 	case "glm-4.7":
 		return &GLM47Parser{}
-	case "glm-ocr":
-		return &GlmOcrParser{}
 	case "lfm2":
 		return &LFM2Parser{hasThinkingSupport: false}
 	case "lfm2-thinking":
--- a/model/renderers/glmocr.go
+++ b/model/renderers/glmocr.go
@@ -1,109 +0,0 @@
-package renderers
-
-import (
-	"encoding/json"
-	"fmt"
-	"strings"
-
-	"github.com/ollama/ollama/api"
-)
-
-type GlmOcrRenderer struct{}
-
-func (r *GlmOcrRenderer) Render(messages []api.Message, tools []api.Tool, thinkValue *api.ThinkValue) (string, error) {
-	var sb strings.Builder
-
-	sb.WriteString("[gMASK]<sop>")
-
-	if len(tools) > 0 {
-		sb.WriteString("<|system|>\n")
-		sb.WriteString("# Tools\n\n")
-		sb.WriteString("You may call one or more functions to assist with the user query.\n\n")
-		sb.WriteString("You are provided with function signatures within <tools></tools> XML tags:\n")
-		sb.WriteString("<tools>\n")
-		for _, tool := range tools {
-			d, _ := json.Marshal(tool)
-			sb.WriteString(formatGLM47ToolJSON(d))
-			sb.WriteString("\n")
-		}
-		sb.WriteString("</tools>\n\n")
-		sb.WriteString("For each function call, output the function name and arguments within the following XML format:\n")
-		sb.WriteString("<tool_call>{function-name}<arg_key>{arg-key-1}</arg_key><arg_value>{arg-value-1}</arg_value><arg_key>{arg-key-2}</arg_key><arg_value>{arg-value-2}</arg_value>...</tool_call>")
-	}
-
-	enableThinking := false
-	thinkingExplicitlySet := false
-	if thinkValue != nil {
-		enableThinking = thinkValue.Bool()
-		thinkingExplicitlySet = true
-	}
-
-	for i, message := range messages {
-		switch message.Role {
-		case "user":
-			sb.WriteString("<|user|>\n")
-			sb.WriteString(message.Content)
-			if thinkingExplicitlySet && !enableThinking && !strings.HasSuffix(message.Content, "/nothink") {
-				sb.WriteString("/nothink")
-			}
-		case "assistant":
-			sb.WriteString("<|assistant|>\n")
-			if message.Thinking != "" {
-				sb.WriteString("<think>" + strings.TrimSpace(message.Thinking) + "</think>")
-			} else {
-				sb.WriteString("<think></think>")
-			}
-			if message.Content != "" {
-				sb.WriteString("\n" + strings.TrimSpace(message.Content))
-			}
-			if len(message.ToolCalls) > 0 {
-				for _, toolCall := range message.ToolCalls {
-					sb.WriteString("\n<tool_call>" + toolCall.Function.Name)
-					sb.WriteString(renderGlmOcrToolArguments(toolCall.Function.Arguments))
-					sb.WriteString("</tool_call>")
-				}
-			}
-			sb.WriteString("\n")
-		case "tool":
-			if i == 0 || messages[i-1].Role != "tool" {
-				sb.WriteString("<|observation|>")
-			}
-			sb.WriteString("\n<tool_response>\n")
-			sb.WriteString(message.Content)
-			sb.WriteString("\n</tool_response>\n")
-		case "system":
-			sb.WriteString("<|system|>\n")
-			sb.WriteString(message.Content)
-			sb.WriteString("\n")
-		}
-	}
-
-	sb.WriteString("<|assistant|>\n")
-	if thinkingExplicitlySet && !enableThinking {
-		sb.WriteString("<think></think>\n")
-	}
-
-	return sb.String(), nil
-}
-
-func renderGlmOcrToolArguments(args api.ToolCallFunctionArguments) string {
-	var sb strings.Builder
-	for key, value := range args.All() {
-		sb.WriteString("<arg_key>" + key + "</arg_key>")
-		var valueStr string
-		if str, ok := value.(string); ok {
-			valueStr = str
-		} else {
-			jsonBytes, err := json.Marshal(value)
-			if err != nil {
-				valueStr = fmt.Sprintf("%v", value)
-			} else {
-				valueStr = string(jsonBytes)
-			}
-		}
-
-		sb.WriteString("<arg_value>" + valueStr + "</arg_value>")
-	}
-
-	return sb.String()
-}
--- a/model/renderers/renderer.go
+++ b/model/renderers/renderer.go
@@ -82,8 +82,6 @@ func rendererForName(name string) Renderer {
 		return &FunctionGemmaRenderer{}
 	case "glm-4.7":
 		return &GLM47Renderer{}
-	case "glm-ocr":
-		return &GlmOcrRenderer{}
 	case "lfm2":
 		return &LFM2Renderer{IsThinking: false}
 	case "lfm2-thinking":
--- a/openai/openai.go
+++ b/openai/openai.go
@@ -794,47 +794,3 @@ func ToImageGenerationResponse(resp api.GenerateResponse) ImageGenerationRespons
 		Data:    data,
 	}
 }
-
-// ImageEditRequest is an OpenAI-compatible image edit request.
-type ImageEditRequest struct {
-	Model  string `json:"model"`
-	Prompt string `json:"prompt"`
-	Image  string `json:"image"`          // Base64-encoded image data
-	Size   string `json:"size,omitempty"` // e.g., "1024x1024"
-	Seed   *int64 `json:"seed,omitempty"`
-}
-
-// FromImageEditRequest converts an OpenAI image edit request to an Ollama GenerateRequest.
-func FromImageEditRequest(r ImageEditRequest) (api.GenerateRequest, error) {
-	req := api.GenerateRequest{
-		Model:  r.Model,
-		Prompt: r.Prompt,
-	}
-
-	// Decode the input image
-	if r.Image != "" {
-		imgData, err := decodeImageURL(r.Image)
-		if err != nil {
-			return api.GenerateRequest{}, fmt.Errorf("invalid image: %w", err)
-		}
-		req.Images = append(req.Images, imgData)
-	}
-
-	// Parse size if provided (e.g., "1024x768")
-	if r.Size != "" {
-		var w, h int32
-		if _, err := fmt.Sscanf(r.Size, "%dx%d", &w, &h); err == nil {
-			req.Width = w
-			req.Height = h
-		}
-	}
-
-	if r.Seed != nil {
-		if req.Options == nil {
-			req.Options = map[string]any{}
-		}
-		req.Options["seed"] = *r.Seed
-	}
-
-	return req, nil
-}
--- a/openai/openai_test.go
+++ b/openai/openai_test.go
@@ -448,86 +448,3 @@ func TestFromChatRequest_TopLogprobsRange(t *testing.T) {
 		})
 	}
 }
-
-func TestFromImageEditRequest_Basic(t *testing.T) {
-	req := ImageEditRequest{
-		Model:  "test-model",
-		Prompt: "make it blue",
-		Image:  prefix + image,
-	}
-
-	result, err := FromImageEditRequest(req)
-	if err != nil {
-		t.Fatalf("unexpected error: %v", err)
-	}
-
-	if result.Model != "test-model" {
-		t.Errorf("expected model 'test-model', got %q", result.Model)
-	}
-
-	if result.Prompt != "make it blue" {
-		t.Errorf("expected prompt 'make it blue', got %q", result.Prompt)
-	}
-
-	if len(result.Images) != 1 {
-		t.Fatalf("expected 1 image, got %d", len(result.Images))
-	}
-}
-
-func TestFromImageEditRequest_WithSize(t *testing.T) {
-	req := ImageEditRequest{
-		Model:  "test-model",
-		Prompt: "make it blue",
-		Image:  prefix + image,
-		Size:   "512x768",
-	}
-
-	result, err := FromImageEditRequest(req)
-	if err != nil {
-		t.Fatalf("unexpected error: %v", err)
-	}
-
-	if result.Width != 512 {
-		t.Errorf("expected width 512, got %d", result.Width)
-	}
-
-	if result.Height != 768 {
-		t.Errorf("expected height 768, got %d", result.Height)
-	}
-}
-
-func TestFromImageEditRequest_WithSeed(t *testing.T) {
-	seed := int64(12345)
-	req := ImageEditRequest{
-		Model:  "test-model",
-		Prompt: "make it blue",
-		Image:  prefix + image,
-		Seed:   &seed,
-	}
-
-	result, err := FromImageEditRequest(req)
-	if err != nil {
-		t.Fatalf("unexpected error: %v", err)
-	}
-
-	if result.Options == nil {
-		t.Fatal("expected options to be set")
-	}
-
-	if result.Options["seed"] != seed {
-		t.Errorf("expected seed %d, got %v", seed, result.Options["seed"])
-	}
-}
-
-func TestFromImageEditRequest_InvalidImage(t *testing.T) {
-	req := ImageEditRequest{
-		Model:  "test-model",
-		Prompt: "make it blue",
-		Image:  "not-valid-base64",
-	}
-
-	_, err := FromImageEditRequest(req)
-	if err == nil {
-		t.Error("expected error for invalid image")
-	}
-}
--- a/readline/readline.go
+++ b/readline/readline.go
@@ -95,21 +95,7 @@ func (i *Instance) Readline() (string, error) {

 	var currentLineBuf []rune

-	// draining tracks if we're processing buffered input from cooked mode.
-	// In cooked mode Enter sends \n, but in raw mode Ctrl+J sends \n.
-	// We treat \n from cooked mode as submit, not multiline.
-	// We check Buffered() after the first read since the bufio buffer is
-	// empty until then. This is compatible with """ multiline mode in
-	// interactive.go since each Readline() call is independent.
-	var draining, stopDraining bool
-
 	for {
-		// Apply deferred state change from previous iteration
-		if stopDraining {
-			draining = false
-			stopDraining = false
-		}
-
 		// don't show placeholder when pasting unless we're in multiline mode
 		showPlaceholder := !i.Pasting || i.Prompt.UseAlt
 		if buf.IsEmpty() && showPlaceholder {
@@ -119,15 +105,6 @@ func (i *Instance) Readline() (string, error) {

 		r, err := i.Terminal.Read()

-		// After reading, check if there's more buffered data. If so, we're
-		// processing cooked-mode input. Once buffer empties, the current
-		// char is the last buffered one (still drain it), then stop next iteration.
-		if i.Terminal.reader.Buffered() > 0 {
-			draining = true
-		} else if draining {
-			stopDraining = true
-		}
-
 		if buf.IsEmpty() {
 			fmt.Print(ClearToEOL)
 		}
@@ -255,20 +232,15 @@ func (i *Instance) Readline() (string, error) {
 			fd := os.Stdin.Fd()
 			return handleCharCtrlZ(fd, i.Terminal.termios)
 		case CharCtrlJ:
-			// If not draining cooked-mode input, treat as multiline
-			if !draining {
-				i.pastedLines = append(i.pastedLines, buf.String())
-				buf.Buf.Clear()
-				buf.Pos = 0
-				buf.DisplayPos = 0
-				buf.LineHasSpace.Clear()
-				fmt.Println()
-				fmt.Print(i.Prompt.AltPrompt)
-				i.Prompt.UseAlt = true
-				continue
-			}
-			// Draining cooked-mode input: treat \n as submit
-			fallthrough
+			i.pastedLines = append(i.pastedLines, buf.String())
+			buf.Buf.Clear()
+			buf.Pos = 0
+			buf.DisplayPos = 0
+			buf.LineHasSpace.Clear()
+			fmt.Println()
+			fmt.Print(i.Prompt.AltPrompt)
+			i.Prompt.UseAlt = true
+			continue
 		case CharEnter:
 			output := buf.String()
 			if len(i.pastedLines) > 0 {
--- a/scripts/build_darwin.sh
+++ b/scripts/build_darwin.sh
@@ -14,8 +14,8 @@
 VOL_NAME=${VOL_NAME:-"Ollama"}
 export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")}
 export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${VERSION#v}\" \"-X=github.com/ollama/ollama/server.mode=release\"'"
-export CGO_CFLAGS="-O3 -mmacosx-version-min=14.0"
-export CGO_CXXFLAGS="-O3 -mmacosx-version-min=14.0"
+export CGO_CFLAGS="-mmacosx-version-min=14.0"
+export CGO_CXXFLAGS="-mmacosx-version-min=14.0"
 export CGO_LDFLAGS="-mmacosx-version-min=14.0"

 set -e
--- a/scripts/build_windows.ps1
+++ b/scripts/build_windows.ps1
@@ -56,12 +56,6 @@ function checkEnv {

    $script:DIST_DIR="${script:SRC_DIR}\dist\windows-${script:TARGET_ARCH}"
    $env:CGO_ENABLED="1"
-    if (-not $env:CGO_CFLAGS) {
-        $env:CGO_CFLAGS = "-O3"
-    }
-    if (-not $env:CGO_CXXFLAGS) {
-        $env:CGO_CXXFLAGS = "-O3"
-    }
    Write-Output "Checking version"
    if (!$env:VERSION) {
        $data=(git describe --tags --first-parent --abbrev=7 --long --dirty --always)
--- a/server/images.go
+++ b/server/images.go
@@ -75,6 +75,12 @@ type Model struct {
 func (m *Model) Capabilities() []model.Capability {
 	capabilities := []model.Capability{}

+	// Check for image generation model via config capabilities
+	if slices.Contains(m.Config.Capabilities, "image") {
+		return []model.Capability{model.CapabilityImage}
+	}
+
+	// Check for completion capability
 	if m.ModelPath != "" {
 		f, err := gguf.Open(m.ModelPath)
 		if err == nil {
--- a/server/images_test.go
+++ b/server/images_test.go
@@ -56,15 +56,6 @@ func TestModelCapabilities(t *testing.T) {
 			},
 			expectedCaps: []model.Capability{model.CapabilityImage},
 		},
-		{
-			name: "model with image and vision capability (image editing)",
-			model: Model{
-				Config: model.ConfigV2{
-					Capabilities: []string{"image", "vision"},
-				},
-			},
-			expectedCaps: []model.Capability{model.CapabilityImage, model.CapabilityVision},
-		},
 		{
 			name: "model with completion capability",
 			model: Model{
--- a/server/quantization.go
+++ b/server/quantization.go
@@ -95,13 +95,6 @@ func getTensorNewType(kv fsggml.KV, qs *quantizeState, newType fsggml.TensorType
 			// for the 8-expert model, bumping this to Q8_0 trades just ~128MB
 			newType = fsggml.TensorTypeQ8_0
 		}
-	} else if strings.Contains(name, "attn_k_b.weight") ||
-		strings.Contains(name, "attn_v_b.weight") ||
-		strings.Contains(name, "attn_kv_a_mqa.weight") ||
-		strings.Contains(name, "attn_q_a.weight") ||
-		strings.Contains(name, "attn_q_b.weight") {
-		// MLA tensors need higher precision to avoid quality degradation
-		newType = fsggml.TensorTypeQ8_0
 	} else if strings.Contains(name, "ffn_down") {
 		iLayer := qs.iFfnDown
 		n_layer := qs.nFfnDown
--- a/server/routes.go
+++ b/server/routes.go
@@ -1604,9 +1604,8 @@ func (s *Server) GenerateRoutes(rc *ollama.Registry) (http.Handler, error) {
 	r.GET("/v1/models", middleware.ListMiddleware(), s.ListHandler)
 	r.GET("/v1/models/:model", middleware.RetrieveMiddleware(), s.ShowHandler)
 	r.POST("/v1/responses", middleware.ResponsesMiddleware(), s.ChatHandler)
-	// OpenAI-compatible image generation endpoints
+	// OpenAI-compatible image generation endpoint
 	r.POST("/v1/images/generations", middleware.ImageGenerationsMiddleware(), s.GenerateHandler)
-	r.POST("/v1/images/edits", middleware.ImageEditsMiddleware(), s.GenerateHandler)

 	// Inference (Anthropic compatibility)
 	r.POST("/v1/messages", middleware.AnthropicMessagesMiddleware(), s.ChatHandler)
@@ -2508,14 +2507,8 @@ func (s *Server) handleImageGenerate(c *gin.Context, req api.GenerateRequest, mo
 		return
 	}

-	// Check streaming preference
-	isStreaming := req.Stream == nil || *req.Stream
-
-	contentType := "application/x-ndjson"
-	if !isStreaming {
-		contentType = "application/json; charset=utf-8"
-	}
-	c.Header("Content-Type", contentType)
+	// Set headers for streaming response
+	c.Header("Content-Type", "application/x-ndjson")

 	// Get seed from options if provided
 	var seed int64
@@ -2530,21 +2523,13 @@ func (s *Server) handleImageGenerate(c *gin.Context, req api.GenerateRequest, mo
 		}
 	}

-	var images []llm.ImageData
-	for i, imgData := range req.Images {
-		images = append(images, llm.ImageData{ID: i, Data: imgData})
-	}
-
 	var streamStarted bool
-	var finalResponse api.GenerateResponse
-
 	if err := runner.Completion(c.Request.Context(), llm.CompletionRequest{
 		Prompt: req.Prompt,
 		Width:  req.Width,
 		Height: req.Height,
 		Steps:  req.Steps,
 		Seed:   seed,
-		Images: images,
 	}, func(cr llm.CompletionResponse) {
 		streamStarted = true
 		res := api.GenerateResponse{
@@ -2568,11 +2553,6 @@ func (s *Server) handleImageGenerate(c *gin.Context, req api.GenerateRequest, mo
 			res.Metrics.LoadDuration = checkpointLoaded.Sub(checkpointStart)
 		}

-		if !isStreaming {
-			finalResponse = res
-			return
-		}
-
 		data, _ := json.Marshal(res)
 		c.Writer.Write(append(data, '\n'))
 		c.Writer.Flush()
@@ -2582,10 +2562,5 @@ func (s *Server) handleImageGenerate(c *gin.Context, req api.GenerateRequest, mo
 		if !streamStarted {
 			c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		}
-		return
-	}
-
-	if !isStreaming {
-		c.JSON(http.StatusOK, finalResponse)
 	}
 }
--- a/server/routes_generate_test.go
+++ b/server/routes_generate_test.go
@@ -19,9 +19,7 @@ import (
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/llm"
-	"github.com/ollama/ollama/manifest"
 	"github.com/ollama/ollama/ml"
-	"github.com/ollama/ollama/types/model"
 )

 // testPropsMap creates a ToolPropertiesMap from a map (convenience function for tests)
@@ -73,8 +71,6 @@ func (mockRunner) Tokenize(_ context.Context, s string) (tokens []int, err error
 	return
 }

-func (mockRunner) Ping(_ context.Context) error { return nil }
-
 func newMockServer(mock *mockRunner) func(ml.SystemInfo, []ml.DeviceInfo, string, *ggml.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
 	return func(_ ml.SystemInfo, _ []ml.DeviceInfo, _ string, _ *ggml.GGML, _, _ []string, _ api.Options, _ int) (llm.LlamaServer, error) {
 		return mock, nil
@@ -2197,246 +2193,3 @@ func TestGenerateUnload(t *testing.T) {
 		}
 	})
 }
-
-func TestGenerateWithImages(t *testing.T) {
-	gin.SetMode(gin.TestMode)
-
-	mock := mockRunner{
-		CompletionResponse: llm.CompletionResponse{
-			Done:               true,
-			DoneReason:         llm.DoneReasonStop,
-			PromptEvalCount:    1,
-			PromptEvalDuration: 1,
-			EvalCount:          1,
-			EvalDuration:       1,
-		},
-	}
-
-	s := Server{
-		sched: &Scheduler{
-			pendingReqCh:    make(chan *LlmRequest, 1),
-			finishedReqCh:   make(chan *LlmRequest, 1),
-			expiredCh:       make(chan *runnerRef, 1),
-			unloadedCh:      make(chan any, 1),
-			loaded:          make(map[string]*runnerRef),
-			newServerFn:     newMockServer(&mock),
-			getGpuFn:        getGpuFn,
-			getSystemInfoFn: getSystemInfoFn,
-			waitForRecovery: 250 * time.Millisecond,
-			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
-				time.Sleep(time.Millisecond)
-				req.successCh <- &runnerRef{
-					llama: &mock,
-				}
-				return false
-			},
-		},
-	}
-
-	go s.sched.Run(t.Context())
-
-	_, digest := createBinFile(t, ggml.KV{
-		"general.architecture":          "llama",
-		"llama.block_count":             uint32(1),
-		"llama.context_length":          uint32(8192),
-		"llama.embedding_length":        uint32(4096),
-		"llama.attention.head_count":    uint32(32),
-		"llama.attention.head_count_kv": uint32(8),
-		"tokenizer.ggml.tokens":         []string{""},
-		"tokenizer.ggml.scores":         []float32{0},
-		"tokenizer.ggml.token_type":     []int32{0},
-	}, []*ggml.Tensor{
-		{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.ffn_gate.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.ffn_up.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.ffn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.attn_k.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.attn_output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.attn_q.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.attn_v.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-	})
-
-	w := createRequest(t, s.CreateHandler, api.CreateRequest{
-		Model:  "test",
-		Files:  map[string]string{"file.gguf": digest},
-		Stream: &stream,
-	})
-
-	if w.Code != http.StatusOK {
-		t.Fatalf("expected status 200, got %d", w.Code)
-	}
-
-	t.Run("images passed to completion request", func(t *testing.T) {
-		testImage := []byte("test-image-data")
-
-		mock.CompletionResponse.Content = "Image processed"
-		w := createRequest(t, s.GenerateHandler, api.GenerateRequest{
-			Model:  "test",
-			Prompt: "Describe this image",
-			Images: []api.ImageData{testImage},
-			Stream: &stream,
-		})
-
-		if w.Code != http.StatusOK {
-			t.Fatalf("expected status 200, got %d: %s", w.Code, w.Body.String())
-		}
-
-		// Verify images were passed to the completion request
-		if len(mock.CompletionRequest.Images) != 1 {
-			t.Fatalf("expected 1 image in completion request, got %d", len(mock.CompletionRequest.Images))
-		}
-
-		if !bytes.Equal(mock.CompletionRequest.Images[0].Data, testImage) {
-			t.Errorf("image data mismatch in completion request")
-		}
-
-		if mock.CompletionRequest.Images[0].ID != 0 {
-			t.Errorf("expected image ID 0, got %d", mock.CompletionRequest.Images[0].ID)
-		}
-	})
-
-	t.Run("multiple images passed to completion request", func(t *testing.T) {
-		testImage1 := []byte("test-image-1")
-		testImage2 := []byte("test-image-2")
-
-		mock.CompletionResponse.Content = "Images processed"
-		w := createRequest(t, s.GenerateHandler, api.GenerateRequest{
-			Model:  "test",
-			Prompt: "Compare these images",
-			Images: []api.ImageData{testImage1, testImage2},
-			Stream: &stream,
-		})
-
-		if w.Code != http.StatusOK {
-			t.Fatalf("expected status 200, got %d: %s", w.Code, w.Body.String())
-		}
-
-		// Verify both images were passed
-		if len(mock.CompletionRequest.Images) != 2 {
-			t.Fatalf("expected 2 images in completion request, got %d", len(mock.CompletionRequest.Images))
-		}
-
-		if !bytes.Equal(mock.CompletionRequest.Images[0].Data, testImage1) {
-			t.Errorf("first image data mismatch")
-		}
-
-		if !bytes.Equal(mock.CompletionRequest.Images[1].Data, testImage2) {
-			t.Errorf("second image data mismatch")
-		}
-
-		if mock.CompletionRequest.Images[0].ID != 0 || mock.CompletionRequest.Images[1].ID != 1 {
-			t.Errorf("expected image IDs 0 and 1, got %d and %d",
-				mock.CompletionRequest.Images[0].ID, mock.CompletionRequest.Images[1].ID)
-		}
-	})
-
-	t.Run("no images when none provided", func(t *testing.T) {
-		mock.CompletionResponse.Content = "No images"
-		w := createRequest(t, s.GenerateHandler, api.GenerateRequest{
-			Model:  "test",
-			Prompt: "Hello",
-			Stream: &stream,
-		})
-
-		if w.Code != http.StatusOK {
-			t.Fatalf("expected status 200, got %d: %s", w.Code, w.Body.String())
-		}
-
-		// Verify no images in completion request
-		if len(mock.CompletionRequest.Images) != 0 {
-			t.Fatalf("expected 0 images in completion request, got %d", len(mock.CompletionRequest.Images))
-		}
-	})
-}
-
-// TestImageGenerateStreamFalse tests that image generation respects stream=false
-// and returns a single JSON response instead of streaming ndjson.
-func TestImageGenerateStreamFalse(t *testing.T) {
-	gin.SetMode(gin.TestMode)
-
-	p := t.TempDir()
-	t.Setenv("OLLAMA_MODELS", p)
-
-	mock := mockRunner{}
-	mock.CompletionFn = func(ctx context.Context, r llm.CompletionRequest, fn func(r llm.CompletionResponse)) error {
-		fn(llm.CompletionResponse{Step: 1, TotalSteps: 3, Done: false})
-		fn(llm.CompletionResponse{Step: 2, TotalSteps: 3, Done: false})
-		fn(llm.CompletionResponse{Step: 3, TotalSteps: 3, Done: true, DoneReason: llm.DoneReasonStop, Image: "base64image"})
-		return nil
-	}
-
-	opts := api.DefaultOptions()
-	s := Server{
-		sched: &Scheduler{
-			pendingReqCh:  make(chan *LlmRequest, 1),
-			finishedReqCh: make(chan *LlmRequest, 1),
-			expiredCh:     make(chan *runnerRef, 1),
-			unloadedCh:    make(chan any, 1),
-			loaded: map[string]*runnerRef{
-				"": {
-					llama:       &mock,
-					Options:     &opts,
-					model:       &Model{Config: model.ConfigV2{Capabilities: []string{"image"}}},
-					numParallel: 1,
-				},
-			},
-			newServerFn:     newMockServer(&mock),
-			getGpuFn:        getGpuFn,
-			getSystemInfoFn: getSystemInfoFn,
-		},
-	}
-
-	go s.sched.Run(t.Context())
-
-	// Create model manifest with image capability
-	n := model.ParseName("test-image")
-	cfg := model.ConfigV2{Capabilities: []string{"image"}}
-	var b bytes.Buffer
-	if err := json.NewEncoder(&b).Encode(&cfg); err != nil {
-		t.Fatal(err)
-	}
-	configLayer, err := manifest.NewLayer(&b, "application/vnd.docker.container.image.v1+json")
-	if err != nil {
-		t.Fatal(err)
-	}
-	if err := manifest.WriteManifest(n, configLayer, nil); err != nil {
-		t.Fatal(err)
-	}
-
-	streamFalse := false
-	w := createRequest(t, s.GenerateHandler, api.GenerateRequest{
-		Model:  "test-image",
-		Prompt: "test prompt",
-		Stream: &streamFalse,
-	})
-
-	if w.Code != http.StatusOK {
-		t.Fatalf("expected status 200, got %d: %s", w.Code, w.Body.String())
-	}
-
-	if ct := w.Header().Get("Content-Type"); ct != "application/json; charset=utf-8" {
-		t.Errorf("expected Content-Type 'application/json; charset=utf-8', got %q", ct)
-	}
-
-	body := w.Body.String()
-	lines := strings.Split(strings.TrimSpace(body), "\n")
-	if len(lines) != 1 {
-		t.Errorf("expected 1 response line, got %d:\n%s", len(lines), body)
-	}
-
-	var resp api.GenerateResponse
-	if err := json.Unmarshal([]byte(lines[0]), &resp); err != nil {
-		t.Fatalf("failed to parse response: %v", err)
-	}
-
-	if resp.Image != "base64image" {
-		t.Errorf("expected image 'base64image', got %q", resp.Image)
-	}
-
-	if !resp.Done {
-		t.Errorf("expected done=true")
-	}
-}
--- a/x/create/client/create.go
+++ b/x/create/client/create.go
@@ -11,8 +11,6 @@ import (
 	"encoding/json"
 	"fmt"
 	"io"
-	"os"
-	"path/filepath"

 	"github.com/ollama/ollama/manifest"
 	"github.com/ollama/ollama/progress"
@@ -211,23 +209,10 @@ func newManifestWriter(opts CreateOptions, capabilities []string) create.Manifes
 			return fmt.Errorf("invalid model name: %s", modelName)
 		}

-		// TODO: find a better way to detect image input support
-		// For now, hardcode Flux2KleinPipeline as supporting vision (image input)
-		caps := capabilities
-		modelIndex := filepath.Join(opts.ModelDir, "model_index.json")
-		if data, err := os.ReadFile(modelIndex); err == nil {
-			var cfg struct {
-				ClassName string `json:"_class_name"`
-			}
-			if json.Unmarshal(data, &cfg) == nil && cfg.ClassName == "Flux2KleinPipeline" {
-				caps = append(caps, "vision")
-			}
-		}
-
 		// Create config blob with version requirement
 		configData := model.ConfigV2{
 			ModelFormat:  "safetensors",
-			Capabilities: caps,
+			Capabilities: capabilities,
 			Requires:     MinOllamaVersion,
 		}
 		configJSON, err := json.Marshal(configData)
--- a/x/imagegen/cli.go
+++ b/x/imagegen/cli.go
@@ -10,10 +10,7 @@ import (
 	"errors"
 	"fmt"
 	"io"
-	"net/http"
 	"os"
-	"regexp"
-	"slices"
 	"strconv"
 	"strings"
 	"time"
@@ -78,7 +75,6 @@ Image Generation Flags (experimental):
 // RunCLI handles the CLI for image generation models.
 // Returns true if it handled the request, false if the caller should continue with normal flow.
 // Supports flags: --width, --height, --steps, --seed, --negative
-// Image paths can be included in the prompt and will be extracted automatically.
 func RunCLI(cmd *cobra.Command, name string, prompt string, interactive bool, keepAlive *api.Duration) error {
 	// Get options from flags (with env var defaults)
 	opts := DefaultOptions()
@@ -115,16 +111,9 @@ func generateImageWithOptions(cmd *cobra.Command, modelName, prompt string, keep
 		return err
 	}

-	// Extract any image paths from the prompt
-	prompt, images, err := extractFileData(prompt)
-	if err != nil {
-		return err
-	}
-
 	req := &api.GenerateRequest{
 		Model:  modelName,
 		Prompt: prompt,
-		Images: images,
 		Width:  int32(opts.Width),
 		Height: int32(opts.Height),
 		Steps:  int32(opts.Steps),
@@ -265,33 +254,14 @@ func runInteractive(cmd *cobra.Command, modelName string, keepAlive *api.Duratio
 			printCurrentSettings(opts)
 			continue
 		case strings.HasPrefix(line, "/"):
-			// Check if it's a file path, not a command
-			args := strings.Fields(line)
-			isFile := false
-			for _, f := range extractFileNames(line) {
-				if strings.HasPrefix(f, args[0]) {
-					isFile = true
-					break
-				}
-			}
-			if !isFile {
-				fmt.Fprintf(os.Stderr, "Unknown command: %s (try /help)\n", args[0])
-				continue
-			}
-		}
-
-		// Extract any image paths from the input
-		prompt, images, err := extractFileData(line)
-		if err != nil {
-			fmt.Fprintf(os.Stderr, "Error: %v\n", err)
+			fmt.Fprintf(os.Stderr, "Unknown command: %s (try /help)\n", line)
 			continue
 		}

 		// Generate image with current options
 		req := &api.GenerateRequest{
 			Model:  modelName,
-			Prompt: prompt,
-			Images: images,
+			Prompt: line,
 			Width:  int32(opts.Width),
 			Height: int32(opts.Height),
 			Steps:  int32(opts.Steps),
@@ -516,61 +486,3 @@ func displayImageInTerminal(imagePath string) bool {
 		return false
 	}
 }
-
-// extractFileNames finds image file paths in the input string.
-func extractFileNames(input string) []string {
-	// Regex to match file paths with image extensions
-	regexPattern := `(?:[a-zA-Z]:)?(?:\./|/|\\)[\S\\ ]+?\.(?i:jpg|jpeg|png|webp)\b`
-	re := regexp.MustCompile(regexPattern)
-	return re.FindAllString(input, -1)
-}
-
-// extractFileData extracts image data from file paths found in the input.
-// Returns the cleaned prompt (with file paths removed) and the image data.
-func extractFileData(input string) (string, []api.ImageData, error) {
-	filePaths := extractFileNames(input)
-	var imgs []api.ImageData
-
-	for _, fp := range filePaths {
-		// Normalize shell escapes
-		nfp := strings.ReplaceAll(fp, "\\ ", " ")
-		nfp = strings.ReplaceAll(nfp, "\\(", "(")
-		nfp = strings.ReplaceAll(nfp, "\\)", ")")
-		nfp = strings.ReplaceAll(nfp, "%20", " ")
-
-		data, err := getImageData(nfp)
-		if errors.Is(err, os.ErrNotExist) {
-			continue
-		} else if err != nil {
-			return "", nil, err
-		}
-		fmt.Fprintf(os.Stderr, "Added image '%s'\n", nfp)
-		input = strings.ReplaceAll(input, fp, "")
-		imgs = append(imgs, data)
-	}
-	return strings.TrimSpace(input), imgs, nil
-}
-
-// getImageData reads and validates image data from a file.
-func getImageData(filePath string) ([]byte, error) {
-	file, err := os.Open(filePath)
-	if err != nil {
-		return nil, err
-	}
-	defer file.Close()
-
-	buf := make([]byte, 512)
-	_, err = file.Read(buf)
-	if err != nil {
-		return nil, err
-	}
-
-	contentType := http.DetectContentType(buf)
-	allowedTypes := []string{"image/jpeg", "image/jpg", "image/png", "image/webp"}
-	if !slices.Contains(allowedTypes, contentType) {
-		return nil, fmt.Errorf("invalid image type: %s", contentType)
-	}
-
-	// Re-read the full file
-	return os.ReadFile(filePath)
-}
--- a/x/imagegen/image.go
+++ b/x/imagegen/image.go
@@ -7,8 +7,6 @@ import (
 	"encoding/base64"
 	"fmt"
 	"image"
-	"image/color"
-	"image/draw"
 	_ "image/jpeg"
 	"image/png"
 	"os"
@@ -113,7 +111,6 @@ func clampF(v, min, max float32) float32 {
 }

 // DecodeImage decodes image bytes with EXIF orientation applied.
-// Transparent images are composited onto a white background.
 func DecodeImage(data []byte) (image.Image, error) {
 	orientation := readJPEGOrientation(data)

@@ -122,33 +119,9 @@ func DecodeImage(data []byte) (image.Image, error) {
 		return nil, err
 	}

-	img = flattenAlpha(img)
 	return applyOrientation(img, orientation), nil
 }

-// flattenAlpha composites an image onto a white background,
-// removing any transparency. This is needed because image
-// generation models don't handle alpha channels well.
-func flattenAlpha(img image.Image) image.Image {
-	if _, ok := img.(*image.RGBA); !ok {
-		if _, ok := img.(*image.NRGBA); !ok {
-			// No alpha channel, return as-is
-			return img
-		}
-	}
-
-	bounds := img.Bounds()
-	dst := image.NewRGBA(bounds)
-
-	// Fill with white background
-	draw.Draw(dst, bounds, &image.Uniform{color.White}, image.Point{}, draw.Src)
-
-	// Composite the image on top
-	draw.Draw(dst, bounds, img, bounds.Min, draw.Over)
-
-	return dst
-}
-
 // readJPEGOrientation extracts EXIF orientation from JPEG bytes.
 // Returns 1 (normal) for non-JPEG or if orientation not found.
 func readJPEGOrientation(data []byte) int {
--- a/x/imagegen/manifest.go
+++ b/x/imagegen/manifest.go
@@ -161,17 +161,6 @@ func (m *ModelManifest) HasTensorLayers() bool {
 	return false
 }

-// TotalTensorSize returns the total size in bytes of all tensor layers.
-func (m *ModelManifest) TotalTensorSize() int64 {
-	var total int64
-	for _, layer := range m.Manifest.Layers {
-		if layer.MediaType == "application/vnd.ollama.image.tensor" {
-			total += layer.Size
-		}
-	}
-	return total
-}
-
 // ModelInfo contains metadata about an image generation model.
 type ModelInfo struct {
 	Architecture   string
--- a/x/imagegen/manifest_test.go
+++ b/x/imagegen/manifest_test.go
@@ -5,37 +5,6 @@ import (
 	"testing"
 )

-func TestTotalTensorSize(t *testing.T) {
-	m := &ModelManifest{
-		Manifest: &Manifest{
-			Layers: []ManifestLayer{
-				{MediaType: "application/vnd.ollama.image.tensor", Size: 1000},
-				{MediaType: "application/vnd.ollama.image.tensor", Size: 2000},
-				{MediaType: "application/vnd.ollama.image.json", Size: 500}, // not a tensor
-				{MediaType: "application/vnd.ollama.image.tensor", Size: 3000},
-			},
-		},
-	}
-
-	got := m.TotalTensorSize()
-	want := int64(6000)
-	if got != want {
-		t.Errorf("TotalTensorSize() = %d, want %d", got, want)
-	}
-}
-
-func TestTotalTensorSizeEmpty(t *testing.T) {
-	m := &ModelManifest{
-		Manifest: &Manifest{
-			Layers: []ManifestLayer{},
-		},
-	}
-
-	if got := m.TotalTensorSize(); got != 0 {
-		t.Errorf("TotalTensorSize() = %d, want 0", got)
-	}
-}
-
 func TestManifestAndBlobDirsRespectOLLAMAModels(t *testing.T) {
 	modelsDir := filepath.Join(t.TempDir(), "models")

--- a/x/imagegen/memory.go
+++ b/x/imagegen/memory.go
@@ -16,9 +16,18 @@ import (
 	"runtime"
 )

+// GB is the number of bytes in one gigabyte, using the binary definition (1024^3).
+const GB = 1024 * 1024 * 1024
+
 // SupportedBackends lists the backends that support image generation.
 var SupportedBackends = []string{"metal", "cuda", "cpu"}

+// modelVRAMEstimates maps pipeline class names (the keys EstimateVRAM looks up) to estimated VRAM in bytes.
+var modelVRAMEstimates = map[string]uint64{
+	"ZImagePipeline": 21 * GB, // ~21GB for Z-Image (text encoder + transformer + VAE)
+	"FluxPipeline":   20 * GB, // ~20GB for Flux
+}
+
 // CheckPlatformSupport validates that image generation is supported on the current platform.
 // Returns nil if supported, or an error describing why it's not supported.
 func CheckPlatformSupport() error {
@@ -38,6 +47,17 @@ func CheckPlatformSupport() error {
 	}
 }

+// CheckMemoryRequirements compares availableMemory (in bytes) against EstimateVRAM(modelName).
+// It returns nil when availableMemory is at least the estimate, otherwise an error reporting both values in whole GB.
+func CheckMemoryRequirements(modelName string, availableMemory uint64) error {
+	required := EstimateVRAM(modelName)
+	if availableMemory < required {
+		return fmt.Errorf("insufficient memory for image generation: need %d GB, have %d GB",
+			required/GB, availableMemory/GB)
+	}
+	return nil
+}
+
 // ResolveModelName checks if a model name is a known image generation model.
 // Returns the normalized model name if found, empty string otherwise.
 func ResolveModelName(modelName string) string {
@@ -48,6 +68,16 @@ func ResolveModelName(modelName string) string {
 	return ""
 }

+// EstimateVRAM returns the modelVRAMEstimates entry for the type that DetectModelType reports for modelName.
+// It falls back to a conservative 21GB when that type has no entry, which includes the case where detection fails.
+func EstimateVRAM(modelName string) uint64 {
+	className := DetectModelType(modelName)
+	if estimate, ok := modelVRAMEstimates[className]; ok {
+		return estimate
+	}
+	return 21 * GB
+}
+
 // DetectModelType reads model_index.json and returns the model type.
 // Checks both "architecture" (Ollama format) and "_class_name" (diffusers format).
 // Returns empty string if detection fails.
--- a/x/imagegen/memory_test.go
+++ b/x/imagegen/memory_test.go
@@ -30,6 +30,69 @@ func TestCheckPlatformSupport(t *testing.T) {
 	}
 }

+func TestCheckMemoryRequirements(t *testing.T) {
+	tests := []struct {
+		name            string
+		availableMemory uint64
+		wantErr         bool
+	}{
+		{
+			name:            "sufficient memory",
+			availableMemory: 32 * GB,
+			wantErr:         false,
+		},
+		{
+			name:            "exactly enough memory",
+			availableMemory: 21 * GB,
+			wantErr:         false,
+		},
+		{
+			name:            "insufficient memory",
+			availableMemory: 16 * GB,
+			wantErr:         true,
+		},
+		{
+			name:            "zero memory",
+			availableMemory: 0,
+			wantErr:         true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			// "nonexistent-model" is expected to be undetectable, so the 21GB default estimate applies
+			err := CheckMemoryRequirements("nonexistent-model", tt.availableMemory)
+			if (err != nil) != tt.wantErr {
+				t.Errorf("CheckMemoryRequirements() error = %v, wantErr %v", err, tt.wantErr)
+			}
+		})
+	}
+}
+
+func TestModelVRAMEstimates(t *testing.T) {
+	// Every pipeline listed here must be present in modelVRAMEstimates with exactly this value
+	expected := map[string]uint64{
+		"ZImagePipeline": 21 * GB,
+		"FluxPipeline":   20 * GB,
+	}
+
+	for name, expectedVRAM := range expected {
+		if actual, ok := modelVRAMEstimates[name]; !ok {
+			t.Errorf("Missing VRAM estimate for %s", name)
+		} else if actual != expectedVRAM {
+			t.Errorf("VRAM estimate for %s = %d GB, want %d GB", name, actual/GB, expectedVRAM/GB)
+		}
+	}
+}
+
+func TestEstimateVRAMDefault(t *testing.T) {
+	// A model name with no detectable type should yield the 21GB fallback
+	vram := EstimateVRAM("nonexistent-model-that-does-not-exist")
+	if vram != 21*GB {
+		t.Errorf("EstimateVRAM() = %d GB, want 21 GB", vram/GB)
+	}
+}
+
 func TestResolveModelName(t *testing.T) {
 	// Non-existent model should return empty string
 	result := ResolveModelName("nonexistent-model")
--- a/x/imagegen/models/flux2/flux2.go
+++ b/x/imagegen/models/flux2/flux2.go
@@ -177,20 +177,6 @@ func (m *Model) GenerateImage(ctx context.Context, prompt string, width, height
 	})
 }

-// GenerateImageWithInputs implements runner.ImageEditModel interface.
-// It generates an image conditioned on the provided input images for image editing.
-func (m *Model) GenerateImageWithInputs(ctx context.Context, prompt string, width, height int32, steps int, seed int64, inputImages []image.Image, progress func(step, total int)) (*mlx.Array, error) {
-	return m.GenerateFromConfig(ctx, &GenerateConfig{
-		Prompt:      prompt,
-		Width:       width,
-		Height:      height,
-		Steps:       steps,
-		Seed:        seed,
-		InputImages: inputImages,
-		Progress:    progress,
-	})
-}
-
 // MaxOutputPixels is the maximum output resolution (4 megapixels, ~2048x2048)
 const MaxOutputPixels = 2048 * 2048

--- a/x/imagegen/runner/runner.go
+++ b/x/imagegen/runner/runner.go
@@ -9,7 +9,6 @@ import (
 	"encoding/json"
 	"flag"
 	"fmt"
-	"image"
 	"log/slog"
 	"net/http"
 	"os"
@@ -26,12 +25,11 @@ import (

 // Request is the image generation request format
 type Request struct {
-	Prompt string   `json:"prompt"`
-	Width  int32    `json:"width,omitempty"`
-	Height int32    `json:"height,omitempty"`
-	Steps  int      `json:"steps,omitempty"`
-	Seed   int64    `json:"seed,omitempty"`
-	Images [][]byte `json:"images,omitempty"` // Input images for image editing/conditioning
+	Prompt string `json:"prompt"`
+	Width  int32  `json:"width,omitempty"`
+	Height int32  `json:"height,omitempty"`
+	Steps  int    `json:"steps,omitempty"`
+	Seed   int64  `json:"seed,omitempty"`
 }

 // Response is streamed back for each progress update
@@ -48,13 +46,6 @@ type ImageModel interface {
 	GenerateImage(ctx context.Context, prompt string, width, height int32, steps int, seed int64, progress func(step, total int)) (*mlx.Array, error)
 }

-// ImageEditModel extends ImageModel with image editing/conditioning capability.
-// Models that support input images for editing should implement this interface.
-type ImageEditModel interface {
-	ImageModel
-	GenerateImageWithInputs(ctx context.Context, prompt string, width, height int32, steps int, seed int64, inputImages []image.Image, progress func(step, total int)) (*mlx.Array, error)
-}
-
 // Server holds the model and handles requests
 type Server struct {
 	mu        sync.Mutex
@@ -87,6 +78,14 @@ func Execute(args []string) error {
 	slog.Info("MLX library initialized")
 	slog.Info("starting image runner", "model", *modelName, "port", *port)

+	// Check memory requirements before loading
+	requiredMemory := imagegen.EstimateVRAM(*modelName)
+	availableMemory := mlx.GetMemoryLimit()
+	if availableMemory > 0 && availableMemory < requiredMemory {
+		return fmt.Errorf("insufficient memory for image generation: need %d GB, have %d GB",
+			requiredMemory/(1024*1024*1024), availableMemory/(1024*1024*1024))
+	}
+
 	// Detect model type and load appropriate model
 	modelType := imagegen.DetectModelType(*modelName)
 	slog.Info("detected model type", "type", modelType)
@@ -162,44 +161,6 @@ func (s *Server) completionHandler(w http.ResponseWriter, r *http.Request) {
 		return
 	}

-	// Validate and decode input images
-	const maxInputImages = 2
-	if len(req.Images) > maxInputImages {
-		http.Error(w, fmt.Sprintf("too many input images, maximum is %d", maxInputImages), http.StatusBadRequest)
-		return
-	}
-
-	var inputImages []image.Image
-	if len(req.Images) > 0 {
-		// TODO: add memory check for input images
-
-		inputImages = make([]image.Image, len(req.Images))
-		for i, imgBytes := range req.Images {
-			img, err := imagegen.DecodeImage(imgBytes)
-			if err != nil {
-				http.Error(w, fmt.Sprintf("invalid image %d: %v", i, err), http.StatusBadRequest)
-				return
-			}
-			inputImages[i] = img
-		}
-		slog.Info("decoded input images", "count", len(inputImages))
-
-		// Default width/height to first input image dimensions, scaled to max 1024
-		bounds := inputImages[0].Bounds()
-		w, h := bounds.Dx(), bounds.Dy()
-		if w > 1024 || h > 1024 {
-			if w > h {
-				h = h * 1024 / w
-				w = 1024
-			} else {
-				w = w * 1024 / h
-				h = 1024
-			}
-		}
-		req.Width = int32(w)
-		req.Height = int32(h)
-	}
-
 	// Serialize generation requests - MLX model may not handle concurrent generation
 	s.mu.Lock()
 	defer s.mu.Unlock()
@@ -231,19 +192,7 @@ func (s *Server) completionHandler(w http.ResponseWriter, r *http.Request) {
 		flusher.Flush()
 	}

-	// Use ImageEditModel if available and images provided, otherwise use basic ImageModel
-	var img *mlx.Array
-	var err error
-	if len(inputImages) > 0 {
-		editModel, ok := s.model.(ImageEditModel)
-		if !ok {
-			http.Error(w, "model does not support image editing", http.StatusBadRequest)
-			return
-		}
-		img, err = editModel.GenerateImageWithInputs(ctx, req.Prompt, req.Width, req.Height, req.Steps, req.Seed, inputImages, progress)
-	} else {
-		img, err = s.model.GenerateImage(ctx, req.Prompt, req.Width, req.Height, req.Steps, req.Seed, progress)
-	}
+	img, err := s.model.GenerateImage(ctx, req.Prompt, req.Width, req.Height, req.Steps, req.Seed, progress)

 	if err != nil {
 		// Don't send error for cancellation
--- a/x/imagegen/server.go
+++ b/x/imagegen/server.go
@@ -7,7 +7,6 @@ import (
 	"encoding/json"
 	"errors"
 	"fmt"
-	"io"
 	"log/slog"
 	"math/rand"
 	"net"
@@ -105,17 +104,11 @@ func NewServer(modelName string) (*Server, error) {
 		slog.Debug("mlx subprocess library path", "LD_LIBRARY_PATH", pathEnvVal)
 	}

-	// Get total weight size from manifest
-	var weightSize uint64
-	if manifest, err := LoadManifest(modelName); err == nil {
-		weightSize = uint64(manifest.TotalTensorSize())
-	}
-
 	s := &Server{
 		cmd:       cmd,
 		port:      port,
 		modelName: modelName,
-		vramSize:  weightSize,
+		vramSize:  EstimateVRAM(modelName),
 		done:      make(chan error, 1),
 		client:    &http.Client{Timeout: 10 * time.Minute},
 	}
@@ -233,27 +226,19 @@ func (s *Server) Completion(ctx context.Context, req llm.CompletionRequest, fn f
 		seed = time.Now().UnixNano()
 	}

-	// Extract raw image bytes from llm.ImageData slice
-	var images [][]byte
-	for _, img := range req.Images {
-		images = append(images, img.Data)
-	}
-
 	// Build request for subprocess
 	creq := struct {
-		Prompt string   `json:"prompt"`
-		Width  int32    `json:"width,omitempty"`
-		Height int32    `json:"height,omitempty"`
-		Steps  int32    `json:"steps,omitempty"`
-		Seed   int64    `json:"seed,omitempty"`
-		Images [][]byte `json:"images,omitempty"`
+		Prompt string `json:"prompt"`
+		Width  int32  `json:"width,omitempty"`
+		Height int32  `json:"height,omitempty"`
+		Steps  int32  `json:"steps,omitempty"`
+		Seed   int64  `json:"seed,omitempty"`
 	}{
 		Prompt: req.Prompt,
 		Width:  req.Width,
 		Height: req.Height,
 		Steps:  req.Steps,
 		Seed:   seed,
-		Images: images,
 	}

 	body, err := json.Marshal(creq)
@@ -275,8 +260,7 @@ func (s *Server) Completion(ctx context.Context, req llm.CompletionRequest, fn f
 	defer resp.Body.Close()

 	if resp.StatusCode != http.StatusOK {
-		body, _ := io.ReadAll(resp.Body)
-		return fmt.Errorf("%s", strings.TrimSpace(string(body)))
+		return fmt.Errorf("request failed: %d", resp.StatusCode)
 	}

 	scanner := bufio.NewScanner(resp.Body)
--- a/x/imagegen/server_test.go
+++ b/x/imagegen/server_test.go
@@ -38,6 +38,40 @@ func TestPlatformSupport(t *testing.T) {
 	}
 }

+// TestMemoryRequirementsError checks that CheckMemoryRequirements errors below the estimate and passes above it.
+func TestMemoryRequirementsError(t *testing.T) {
+	// 8GB is below the 21GB default estimate, so an error is expected
+	err := CheckMemoryRequirements("test-model", 8*GB)
+	if err == nil {
+		t.Error("Expected error for insufficient memory (8GB < 21GB default)")
+	}
+
+	// 32GB exceeds the 21GB default estimate, so no error is expected
+	err = CheckMemoryRequirements("test-model", 32*GB)
+	if err != nil {
+		t.Errorf("Expected no error for sufficient memory (32GB), got: %v", err)
+	}
+}
+
+// TestEstimateVRAMReturnsReasonableDefaults bounds-checks the unknown-model estimate and every modelVRAMEstimates entry.
+func TestEstimateVRAMReturnsReasonableDefaults(t *testing.T) {
+	// The estimate for an unknown model must land in the 10-100 GB range
+	vram := EstimateVRAM("unknown-model")
+	if vram < 10*GB || vram > 100*GB {
+		t.Errorf("VRAM estimate %d GB is outside reasonable range (10-100 GB)", vram/GB)
+	}
+
+	// Each known pipeline estimate must be between 10 GB and 200 GB
+	for name, estimate := range modelVRAMEstimates {
+		if estimate < 10*GB {
+			t.Errorf("VRAM estimate for %s (%d GB) is suspiciously low", name, estimate/GB)
+		}
+		if estimate > 200*GB {
+			t.Errorf("VRAM estimate for %s (%d GB) is suspiciously high", name, estimate/GB)
+		}
+	}
+}
+
 // TestServerInterfaceCompliance verifies Server implements llm.LlamaServer.
 // This is a compile-time check but we document it as a test.
 func TestServerInterfaceCompliance(t *testing.T) {