Mirror of https://github.com/ollama/ollama.git (synced 2026-02-02 19:54:33 -05:00)

Compare commits: v0.15.0-rc...parth-laun (32 commits)
| SHA1 |
|---|
| d8b954f8e2 |
| e4c0575f8f |
| 970621df27 |
| a3024436a8 |
| 6582f6da5c |
| 0334ffa625 |
| d11fbd2c60 |
| 6a7c3f188e |
| 427e2c962a |
| 27db7f806f |
| 3590fbfa76 |
| cd0094f772 |
| 06bc8e6712 |
| fc5f9bb448 |
| a0740f7ef7 |
| a0923cbdd0 |
| f92e362b2e |
| aa23d8ecd2 |
| 7b62c41060 |
| 26acab64b7 |
| e0f03790b1 |
| 3ab842b0f5 |
| b8e8ef8929 |
| 465d124183 |
| d310e56fa3 |
| a1ca428c90 |
| 16750865d1 |
| f3b476c592 |
| 5267d31d56 |
| b44f56319f |
| 0209c268bb |
| 912d984346 |
```diff
@@ -169,8 +169,10 @@ COPY . .
 RUN git clone --depth 1 --branch "$(cat MLX_VERSION)" https://github.com/ml-explore/mlx-c.git build/_deps/mlx-c-src
 ARG GOFLAGS="'-ldflags=-w -s'"
 ENV CGO_ENABLED=1
-ENV CGO_CFLAGS="-I/go/src/github.com/ollama/ollama/build/_deps/mlx-c-src"
+ARG CGO_CFLAGS
+ARG CGO_CXXFLAGS
+ENV CGO_CFLAGS="${CGO_CFLAGS} -I/go/src/github.com/ollama/ollama/build/_deps/mlx-c-src"
+ENV CGO_CXXFLAGS="${CGO_CXXFLAGS}"
 RUN --mount=type=cache,target=/root/.cache/go-build \
     go build -tags mlx -trimpath -buildmode=pie -o /bin/ollama .
```
```diff
@@ -358,6 +358,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Odin Runes](https://github.com/leonid20000/OdinRunes)
 - [LLM-X](https://github.com/mrdjohnson/llm-x) (Progressive Web App)
 - [AnythingLLM (Docker + MacOs/Windows/Linux native app)](https://github.com/Mintplex-Labs/anything-llm)
+- [Screenpipe](https://github.com/mediar-ai/screenpipe) (24/7 screen & mic recording with AI-powered search, uses Ollama for local LLM features)
 - [Ollama Basic Chat: Uses HyperDiv Reactive UI](https://github.com/rapidarchitect/ollama_basic_chat)
 - [Ollama-chats RPG](https://github.com/drazdra/ollama-chats)
 - [IntelliBar](https://intellibar.app/) (AI-powered assistant for macOS)
```
```diff
@@ -465,6 +466,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Clueless](https://github.com/KashyapTan/clueless) (Open Source & Local Cluely: A desktop application LLM assistant to help you talk to anything on your screen using locally served Ollama models. Also undetectable to screenshare)
 - [ollama-co2](https://github.com/carbonatedWaterOrg/ollama-co2) (FastAPI web interface for monitoring and managing local and remote Ollama servers with real-time model monitoring and concurrent downloads)
 - [Hillnote](https://hillnote.com) (A Markdown-first workspace designed to supercharge your AI workflow. Create documents ready to integrate with Claude, ChatGPT, Gemini, Cursor, and more - all while keeping your work on your device.)
+- [Stakpak](https://github.com/stakpak/agent) (An open source, vendor neutral DevOps agent that works with any model, and any stack, for teams who just want to ship)
 
 ### Cloud
 
```
```diff
@@ -558,7 +560,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [LiteLLM](https://github.com/BerriAI/litellm)
 - [OllamaFarm for Go](https://github.com/presbrey/ollamafarm)
 - [OllamaSharp for .NET](https://github.com/awaescher/OllamaSharp)
-- [Ollama for Ruby](https://github.com/gbaptista/ollama-ai)
+- [Ollama for Ruby](https://github.com/crmne/ruby_llm)
 - [Ollama-rs for Rust](https://github.com/pepperoni21/ollama-rs)
 - [Ollama-hpp for C++](https://github.com/jmont-dev/ollama-hpp)
 - [Ollama4j for Java](https://github.com/ollama4j/ollama4j)
```
@@ -75,9 +75,9 @@ The `-dev` flag enables:
|
||||
CI builds with Xcode 14.1 for OS compatibility prior to v13. If you want to manually build v11+ support, you can download the older Xcode [here](https://developer.apple.com/services-account/download?path=/Developer_Tools/Xcode_14.1/Xcode_14.1.xip), extract, then `mv ./Xcode.app /Applications/Xcode_14.1.0.app` then activate with:
|
||||
|
||||
```
|
||||
export CGO_CFLAGS=-mmacosx-version-min=12.0
|
||||
export CGO_CXXFLAGS=-mmacosx-version-min=12.0
|
||||
export CGO_LDFLAGS=-mmacosx-version-min=12.0
|
||||
export CGO_CFLAGS="-O3 -mmacosx-version-min=12.0"
|
||||
export CGO_CXXFLAGS="-O3 -mmacosx-version-min=12.0"
|
||||
export CGO_LDFLAGS="-mmacosx-version-min=12.0"
|
||||
export SDKROOT=/Applications/Xcode_14.1.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk
|
||||
export DEVELOPER_DIR=/Applications/Xcode_14.1.0.app/Contents/Developer
|
||||
```
|
||||
|
||||
```diff
@@ -1888,7 +1888,7 @@ func NewCLI() *cobra.Command {
 	serveCmd := &cobra.Command{
 		Use:     "serve",
 		Aliases: []string{"start"},
-		Short:   "Start ollama",
+		Short:   "Start Ollama",
 		Args:    cobra.ExactArgs(0),
 		RunE:    RunServer,
 	}
```
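The only change here is the capitalization of the help string; the `Aliases` field shown in context is what lets `ollama start` resolve to the same command. A runnable sketch of that cobra behavior (the command tree below is illustrative, not ollama's actual wiring):

```go
package main

import (
	"fmt"

	"github.com/spf13/cobra"
)

func main() {
	root := &cobra.Command{Use: "ollama"}
	serve := &cobra.Command{
		Use:     "serve",
		Aliases: []string{"start"}, // "ollama start" dispatches here too
		Short:   "Start Ollama",
		RunE: func(cmd *cobra.Command, args []string) error {
			fmt.Println("starting server")
			return nil
		},
	}
	root.AddCommand(serve)

	// Simulate invoking the alias.
	root.SetArgs([]string{"start"})
	_ = root.Execute()
}
```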
```diff
@@ -4,6 +4,10 @@ import (
 	"fmt"
 	"os"
 	"os/exec"
+	"path/filepath"
+	"runtime"
+
+	"github.com/ollama/ollama/envconfig"
 )
 
 // Claude implements Runner for Claude Code integration
@@ -18,17 +22,37 @@ func (c *Claude) args(model string) []string {
 	return nil
 }
 
+func (c *Claude) findPath() (string, error) {
+	if p, err := exec.LookPath("claude"); err == nil {
+		return p, nil
+	}
+	home, err := os.UserHomeDir()
+	if err != nil {
+		return "", err
+	}
+	name := "claude"
+	if runtime.GOOS == "windows" {
+		name = "claude.exe"
+	}
+	fallback := filepath.Join(home, ".claude", "local", name)
+	if _, err := os.Stat(fallback); err != nil {
+		return "", err
+	}
+	return fallback, nil
+}
+
 func (c *Claude) Run(model string) error {
-	if _, err := exec.LookPath("claude"); err != nil {
+	claudePath, err := c.findPath()
+	if err != nil {
 		return fmt.Errorf("claude is not installed, install from https://code.claude.com/docs/en/quickstart")
 	}
 
-	cmd := exec.Command("claude", c.args(model)...)
+	cmd := exec.Command(claudePath, c.args(model)...)
 	cmd.Stdin = os.Stdin
 	cmd.Stdout = os.Stdout
 	cmd.Stderr = os.Stderr
 	cmd.Env = append(os.Environ(),
-		"ANTHROPIC_BASE_URL=http://localhost:11434",
+		"ANTHROPIC_BASE_URL="+envconfig.Host().String(),
 		"ANTHROPIC_API_KEY=",
 		"ANTHROPIC_AUTH_TOKEN=ollama",
 	)
```
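Two behaviors change in `Run`: the binary is resolved with a PATH-first, home-directory-fallback lookup, and `ANTHROPIC_BASE_URL` now tracks `OLLAMA_HOST` via `envconfig.Host()` instead of a hardcoded localhost. A standalone sketch of the same lookup pattern (binary name and fallback directory are taken from the diff; the `main` wrapper is illustrative):

```go
package main

import (
	"fmt"
	"os"
	"os/exec"
	"path/filepath"
	"runtime"
)

// lookup mirrors (*Claude).findPath: prefer PATH, then fall back to the
// Claude Code installer's default location under the home directory.
func lookup() (string, error) {
	if p, err := exec.LookPath("claude"); err == nil {
		return p, nil
	}
	home, err := os.UserHomeDir()
	if err != nil {
		return "", err
	}
	name := "claude"
	if runtime.GOOS == "windows" {
		name = "claude.exe"
	}
	fallback := filepath.Join(home, ".claude", "local", name)
	if _, err := os.Stat(fallback); err != nil {
		return "", err // neither PATH nor the fallback has the binary
	}
	return fallback, nil
}

func main() {
	p, err := lookup()
	if err != nil {
		fmt.Fprintln(os.Stderr, "claude not found:", err)
		os.Exit(1)
	}
	fmt.Println(p)
}
```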
```diff
@@ -1,6 +1,9 @@
 package config
 
 import (
+	"os"
+	"path/filepath"
+	"runtime"
 	"slices"
 	"testing"
 )
@@ -19,6 +22,62 @@ func TestClaudeIntegration(t *testing.T) {
 	})
 }
 
+func TestClaudeFindPath(t *testing.T) {
+	c := &Claude{}
+
+	t.Run("finds claude in PATH", func(t *testing.T) {
+		tmpDir := t.TempDir()
+		name := "claude"
+		if runtime.GOOS == "windows" {
+			name = "claude.exe"
+		}
+		fakeBin := filepath.Join(tmpDir, name)
+		os.WriteFile(fakeBin, []byte("#!/bin/sh\n"), 0o755)
+		t.Setenv("PATH", tmpDir)
+
+		got, err := c.findPath()
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if got != fakeBin {
+			t.Errorf("findPath() = %q, want %q", got, fakeBin)
+		}
+	})
+
+	t.Run("falls back to ~/.claude/local/claude", func(t *testing.T) {
+		tmpDir := t.TempDir()
+		setTestHome(t, tmpDir)
+		t.Setenv("PATH", t.TempDir()) // empty dir, no claude binary
+
+		name := "claude"
+		if runtime.GOOS == "windows" {
+			name = "claude.exe"
+		}
+		fallback := filepath.Join(tmpDir, ".claude", "local", name)
+		os.MkdirAll(filepath.Dir(fallback), 0o755)
+		os.WriteFile(fallback, []byte("#!/bin/sh\n"), 0o755)
+
+		got, err := c.findPath()
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if got != fallback {
+			t.Errorf("findPath() = %q, want %q", got, fallback)
+		}
+	})
+
+	t.Run("returns error when neither PATH nor fallback exists", func(t *testing.T) {
+		tmpDir := t.TempDir()
+		setTestHome(t, tmpDir)
+		t.Setenv("PATH", t.TempDir()) // empty dir, no claude binary
+
+		_, err := c.findPath()
+		if err == nil {
+			t.Fatal("expected error, got nil")
+		}
+	})
+}
+
 func TestClaudeArgs(t *testing.T) {
 	c := &Claude{}
 
```
```diff
@@ -7,6 +7,8 @@ import (
 	"os/exec"
 	"path/filepath"
 	"slices"
+
+	"github.com/ollama/ollama/envconfig"
 )
 
 // Droid implements Runner and Editor for Droid integration
@@ -117,7 +119,7 @@ func (d *Droid) Edit(models []string) error {
 		newModels = append(newModels, modelEntry{
 			Model:           model,
 			DisplayName:     model,
-			BaseURL:         "http://localhost:11434/v1",
+			BaseURL:         envconfig.Host().String() + "/v1",
 			APIKey:          "ollama",
 			Provider:        "generic-chat-completion-api",
 			MaxOutputTokens: 64000,
```
```diff
@@ -218,7 +218,7 @@ func TestDroidEdit(t *testing.T) {
 		}
 	}
 
-	if model["baseUrl"] != "http://localhost:11434/v1" {
+	if model["baseUrl"] != "http://127.0.0.1:11434/v1" {
 		t.Errorf("unexpected baseUrl: %s", model["baseUrl"])
 	}
 	if model["apiKey"] != "ollama" {
@@ -447,7 +447,7 @@ const testDroidSettingsFixture = `{
     {
       "model": "existing-ollama-model",
       "displayName": "existing-ollama-model",
-      "baseUrl": "http://localhost:11434/v1",
+      "baseUrl": "http://127.0.0.1:11434/v1",
       "apiKey": "ollama",
       "provider": "generic-chat-completion-api",
       "maxOutputTokens": 64000,
```
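The fixtures switch from `localhost` to `127.0.0.1` because the code now derives the base URL from `envconfig.Host()`, whose string form uses the explicit loopback address when `OLLAMA_HOST` is unset. A quick hedged sketch to confirm the value on a default setup:

```go
package main

import (
	"fmt"

	"github.com/ollama/ollama/envconfig"
)

func main() {
	// With OLLAMA_HOST unset this prints http://127.0.0.1:11434/v1,
	// matching the updated expectations above; setting OLLAMA_HOST
	// changes what Edit writes into the Droid settings.
	fmt.Println(envconfig.Host().String() + "/v1")
}
```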
```diff
@@ -13,6 +13,7 @@ import (
 	"time"
 
 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/progress"
 	"github.com/spf13/cobra"
 )
```
```diff
@@ -41,9 +42,26 @@ type Editor interface {
 // integrations is the registry of available integrations.
 var integrations = map[string]Runner{
 	"claude":   &Claude{},
+	"clawdbot": &Openclaw{},
 	"codex":    &Codex{},
+	"moltbot":  &Openclaw{},
 	"droid":    &Droid{},
 	"opencode": &OpenCode{},
+	"openclaw": &Openclaw{},
 }
 
+// recommendedModels are shown when the user has no models or as suggestions.
+// Order matters: local models first, then cloud models.
+var recommendedModels = []selectItem{
+	{Name: "glm-4.7-flash", Description: "Recommended (requires ~25GB VRAM)"},
+	{Name: "glm-4.7:cloud", Description: "recommended"},
+	{Name: "kimi-k2.5:cloud", Description: "recommended"},
+}
+
+// integrationAliases are hidden from the interactive selector but work as CLI arguments.
+var integrationAliases = map[string]bool{
+	"clawdbot": true,
+	"moltbot":  true,
+}
+
 func selectIntegration() (string, error) {
```
```diff
@@ -54,6 +72,9 @@ func selectIntegration() (string, error) {
 	names := slices.Sorted(maps.Keys(integrations))
 	var items []selectItem
 	for _, name := range names {
+		if integrationAliases[name] {
+			continue
+		}
 		r := integrations[name]
 		description := r.String()
 		if conn, err := loadIntegration(name); err == nil && len(conn.Models) > 0 {
```
```diff
@@ -82,62 +103,25 @@ func selectModels(ctx context.Context, name, current string) ([]string, error) {
 		return nil, err
 	}
 
-	if len(models.Models) == 0 {
-		return nil, fmt.Errorf("no models available, run 'ollama pull <model>' first")
-	}
-
-	var items []selectItem
-	cloudModels := make(map[string]bool)
+	var existing []modelInfo
 	for _, m := range models.Models {
-		if m.RemoteModel != "" {
-			cloudModels[m.Name] = true
-		}
-		items = append(items, selectItem{Name: m.Name})
+		existing = append(existing, modelInfo{Name: m.Name, Remote: m.RemoteModel != ""})
 	}
 
-	if len(items) == 0 {
-		return nil, fmt.Errorf("no local models available, run 'ollama pull <model>' first")
-	}
-
 	// Get previously configured models (saved config takes precedence)
 	var preChecked []string
 	if saved, err := loadIntegration(name); err == nil {
 		preChecked = saved.Models
 	} else if editor, ok := r.(Editor); ok {
 		preChecked = editor.Models()
 	}
-	checked := make(map[string]bool, len(preChecked))
-	for _, n := range preChecked {
-		checked[n] = true
-	}
-
-	// Resolve current to full name (e.g., "llama3.2" -> "llama3.2:latest")
-	for _, item := range items {
-		if item.Name == current || strings.HasPrefix(item.Name, current+":") {
-			current = item.Name
-			break
-		}
-	}
+	items, preChecked, existingModels, cloudModels := buildModelList(existing, preChecked, current)
 
-	// If current model is configured, move to front of preChecked
-	if checked[current] {
-		preChecked = append([]string{current}, slices.DeleteFunc(preChecked, func(m string) bool { return m == current })...)
-	}
+	if len(items) == 0 {
+		return nil, fmt.Errorf("no models available")
+	}
 
-	// Sort: checked first, then alphabetical
-	slices.SortFunc(items, func(a, b selectItem) int {
-		ac, bc := checked[a.Name], checked[b.Name]
-		if ac != bc {
-			if ac {
-				return -1
-			}
-			return 1
-		}
-		return strings.Compare(strings.ToLower(a.Name), strings.ToLower(b.Name))
-	})
-
 	var selected []string
 	// only editors support multi-model selection
 	if _, ok := r.(Editor); ok {
 		selected, err = multiSelectPrompt(fmt.Sprintf("Select models for %s:", r), items, preChecked)
 		if err != nil {
```
```diff
@@ -151,7 +135,27 @@ func selectModels(ctx context.Context, name, current string) ([]string, error) {
 		selected = []string{model}
 	}
 
 	// if any model in selected is a cloud model, ensure signed in
+	var toPull []string
+	for _, m := range selected {
+		if !existingModels[m] {
+			toPull = append(toPull, m)
+		}
+	}
+	if len(toPull) > 0 {
+		msg := fmt.Sprintf("Download %s?", strings.Join(toPull, ", "))
+		if ok, err := confirmPrompt(msg); err != nil {
+			return nil, err
+		} else if !ok {
+			return nil, errCancelled
+		}
+		for _, m := range toPull {
+			fmt.Fprintf(os.Stderr, "\n")
+			if err := pullModel(ctx, client, m); err != nil {
+				return nil, fmt.Errorf("failed to pull %s: %w", m, err)
+			}
+		}
+	}
+
 	var selectedCloudModels []string
 	for _, m := range selected {
 		if cloudModels[m] {
```
```diff
@@ -245,6 +249,7 @@ Supported integrations:
   codex     Codex
   droid     Droid
   opencode  OpenCode
+  openclaw  OpenClaw (aliases: clawdbot, moltbot)
 
 Examples:
   ollama launch
```
```diff
@@ -273,7 +278,6 @@ Examples:
 		return fmt.Errorf("unknown integration: %s", name)
 	}
-
 	// If launching without --model, use saved config if available
 	if !configFlag && modelFlag == "" {
 		if config, err := loadIntegration(name); err == nil && len(config.Models) > 0 {
 			return runIntegration(name, config.Models[0])
```
```diff
@@ -282,7 +286,6 @@ Examples:
-
 	var models []string
 	if modelFlag != "" {
 		// When --model is specified, merge with existing models (new model becomes default)
 		models = []string{modelFlag}
 		if existing, err := loadIntegration(name); err == nil && len(existing.Models) > 0 {
 			for _, m := range existing.Models {
```
```diff
@@ -351,3 +354,150 @@ Examples:
 	cmd.Flags().BoolVar(&configFlag, "config", false, "Configure without launching")
 	return cmd
 }
+
+type modelInfo struct {
+	Name   string
+	Remote bool
+}
+
+// buildModelList merges existing models with recommendations, sorts them, and returns
+// the ordered items along with maps of existing and cloud model names.
+func buildModelList(existing []modelInfo, preChecked []string, current string) (items []selectItem, orderedChecked []string, existingModels, cloudModels map[string]bool) {
+	existingModels = make(map[string]bool)
+	cloudModels = make(map[string]bool)
+	recommended := make(map[string]bool)
+	var hasLocalModel, hasCloudModel bool
+
+	for _, rec := range recommendedModels {
+		recommended[rec.Name] = true
+	}
+
+	for _, m := range existing {
+		existingModels[m.Name] = true
+		if m.Remote {
+			cloudModels[m.Name] = true
+			hasCloudModel = true
+		} else {
+			hasLocalModel = true
+		}
+		displayName := strings.TrimSuffix(m.Name, ":latest")
+		existingModels[displayName] = true
+		item := selectItem{Name: displayName}
+		if recommended[displayName] {
+			item.Description = "recommended"
+		}
+		items = append(items, item)
+	}
+
+	for _, rec := range recommendedModels {
+		if existingModels[rec.Name] || existingModels[rec.Name+":latest"] {
+			continue
+		}
+		items = append(items, rec)
+		if isCloudModel(rec.Name) {
+			cloudModels[rec.Name] = true
+		}
+	}
+
+	checked := make(map[string]bool, len(preChecked))
+	for _, n := range preChecked {
+		checked[n] = true
+	}
+
+	// Resolve current to full name (e.g., "llama3.2" -> "llama3.2:latest")
+	for _, item := range items {
+		if item.Name == current || strings.HasPrefix(item.Name, current+":") {
+			current = item.Name
+			break
+		}
+	}
+
+	if checked[current] {
+		preChecked = append([]string{current}, slices.DeleteFunc(preChecked, func(m string) bool { return m == current })...)
+	}
+
+	// Non-existing models get "install?" suffix and are pushed to the bottom.
+	// When user has no models, preserve recommended order.
+	notInstalled := make(map[string]bool)
+	for i := range items {
+		if !existingModels[items[i].Name] {
+			notInstalled[items[i].Name] = true
+			items[i].Description = "recommended, install?"
+		}
+	}
+
+	if hasLocalModel || hasCloudModel {
+		slices.SortStableFunc(items, func(a, b selectItem) int {
+			ac, bc := checked[a.Name], checked[b.Name]
+			aNew, bNew := notInstalled[a.Name], notInstalled[b.Name]
+
+			if ac != bc {
+				if ac {
+					return -1
+				}
+				return 1
+			}
+			if !ac && !bc && aNew != bNew {
+				if aNew {
+					return 1
+				}
+				return -1
+			}
+			return strings.Compare(strings.ToLower(a.Name), strings.ToLower(b.Name))
+		})
+	}
+
+	return items, preChecked, existingModels, cloudModels
+}
+
+func isCloudModel(name string) bool {
+	return strings.HasSuffix(name, ":cloud")
+}
+
+func pullModel(ctx context.Context, client *api.Client, model string) error {
+	p := progress.NewProgress(os.Stderr)
+	defer p.Stop()
+
+	bars := make(map[string]*progress.Bar)
+	var status string
+	var spinner *progress.Spinner
+
+	fn := func(resp api.ProgressResponse) error {
+		if resp.Digest != "" {
+			if resp.Completed == 0 {
+				return nil
+			}
+
+			if spinner != nil {
+				spinner.Stop()
+			}
+
+			bar, ok := bars[resp.Digest]
+			if !ok {
+				name, isDigest := strings.CutPrefix(resp.Digest, "sha256:")
+				name = strings.TrimSpace(name)
+				if isDigest {
+					name = name[:min(12, len(name))]
+				}
+				bar = progress.NewBar(fmt.Sprintf("pulling %s:", name), resp.Total, resp.Completed)
+				bars[resp.Digest] = bar
+				p.Add(resp.Digest, bar)
+			}
+
+			bar.Set(resp.Completed)
+		} else if status != resp.Status {
+			if spinner != nil {
+				spinner.Stop()
+			}
+
+			status = resp.Status
+			spinner = progress.NewSpinner(status)
+			p.Add(status, spinner)
+		}
+
+		return nil
+	}
+
+	request := api.PullRequest{Name: model}
+	return client.Pull(ctx, &request, fn)
+}
```
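`pullModel` is a thin wrapper over `(*api.Client).Pull` that renders a progress bar per layer digest and spinners for status messages. A hedged sketch of driving the underlying call directly (it assumes `api.ClientFromEnvironment` from the ollama/api package, which resolves `OLLAMA_HOST` the same way the CLI does; the model name is illustrative):

```go
package main

import (
	"context"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	// pullModel (above) feeds these responses into progress bars keyed by
	// resp.Digest; here we just log them.
	fn := func(resp api.ProgressResponse) error {
		log.Printf("%s %d/%d", resp.Status, resp.Completed, resp.Total)
		return nil
	}

	req := &api.PullRequest{Name: "llama3.2"}
	if err := client.Pull(context.Background(), req, fn); err != nil {
		log.Fatal(err)
	}
}
```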
```diff
@@ -5,6 +5,7 @@ import (
 	"strings"
 	"testing"
 
+	"github.com/google/go-cmp/cmp"
 	"github.com/spf13/cobra"
 )
```
```diff
@@ -174,15 +175,226 @@ func TestLaunchCmd_NilHeartbeat(t *testing.T) {
 func TestAllIntegrations_HaveRequiredMethods(t *testing.T) {
 	for name, r := range integrations {
 		t.Run(name, func(t *testing.T) {
 			// Test String() doesn't panic and returns non-empty
 			displayName := r.String()
 			if displayName == "" {
 				t.Error("String() should not return empty")
 			}
 
 			// Test Run() exists (we can't call it without actually running the command)
 			// Just verify the method is available
 			var _ func(string) error = r.Run
 		})
 	}
 }
+
+func TestIsCloudModel(t *testing.T) {
+	tests := []struct {
+		name string
+		want bool
+	}{
+		{"glm-4.7:cloud", true},
+		{"kimi-k2.5:cloud", true},
+		{"glm-4.7-flash", false},
+		{"glm-4.7-flash:latest", false},
+		{"cloud-model", false},
+		{"model:cloudish", false},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if got := isCloudModel(tt.name); got != tt.want {
+				t.Errorf("isCloudModel(%q) = %v, want %v", tt.name, got, tt.want)
+			}
+		})
+	}
+}
+
+func names(items []selectItem) []string {
+	var out []string
+	for _, item := range items {
+		out = append(out, item.Name)
+	}
+	return out
+}
+
+func TestBuildModelList_NoExistingModels(t *testing.T) {
+	items, _, _, _ := buildModelList(nil, nil, "")
+
+	want := []string{"glm-4.7-flash", "glm-4.7:cloud", "kimi-k2.5:cloud"}
+	if diff := cmp.Diff(want, names(items)); diff != "" {
+		t.Errorf("with no existing models, items should be recommended in order (-want +got):\n%s", diff)
+	}
+
+	for _, item := range items {
+		if item.Description != "recommended, install?" {
+			t.Errorf("item %q should have description 'install?', got %q", item.Name, item.Description)
+		}
+	}
+}
+
+func TestBuildModelList_OnlyLocalModels_CloudRecsAtBottom(t *testing.T) {
+	existing := []modelInfo{
+		{Name: "llama3.2:latest", Remote: false},
+		{Name: "qwen2.5:latest", Remote: false},
+	}
+
+	items, _, _, _ := buildModelList(existing, nil, "")
+	got := names(items)
+
+	want := []string{"llama3.2", "qwen2.5", "glm-4.7-flash", "glm-4.7:cloud", "kimi-k2.5:cloud"}
+	if diff := cmp.Diff(want, got); diff != "" {
+		t.Errorf("cloud recs should be at bottom (-want +got):\n%s", diff)
+	}
+}
+
+func TestBuildModelList_BothCloudAndLocal_RegularSort(t *testing.T) {
+	existing := []modelInfo{
+		{Name: "llama3.2:latest", Remote: false},
+		{Name: "glm-4.7:cloud", Remote: true},
+	}
+
+	items, _, _, _ := buildModelList(existing, nil, "")
+	got := names(items)
+
+	want := []string{"glm-4.7:cloud", "llama3.2", "glm-4.7-flash", "kimi-k2.5:cloud"}
+	if diff := cmp.Diff(want, got); diff != "" {
+		t.Errorf("mixed models should be alphabetical (-want +got):\n%s", diff)
+	}
+}
+
+func TestBuildModelList_PreCheckedFirst(t *testing.T) {
+	existing := []modelInfo{
+		{Name: "llama3.2:latest", Remote: false},
+		{Name: "glm-4.7:cloud", Remote: true},
+	}
+
+	items, _, _, _ := buildModelList(existing, []string{"llama3.2"}, "")
+	got := names(items)
+
+	if got[0] != "llama3.2" {
+		t.Errorf("pre-checked model should be first, got %v", got)
+	}
+}
+
+func TestBuildModelList_ExistingRecommendedMarked(t *testing.T) {
+	existing := []modelInfo{
+		{Name: "glm-4.7-flash", Remote: false},
+		{Name: "glm-4.7:cloud", Remote: true},
+	}
+
+	items, _, _, _ := buildModelList(existing, nil, "")
+
+	for _, item := range items {
+		switch item.Name {
+		case "glm-4.7-flash", "glm-4.7:cloud":
+			if item.Description != "recommended" {
+				t.Errorf("installed recommended %q should have description 'recommended', got %q", item.Name, item.Description)
+			}
+		case "kimi-k2.5:cloud":
+			if item.Description != "recommended, install?" {
+				t.Errorf("non-installed recommended %q should have description 'install?', got %q", item.Name, item.Description)
+			}
+		}
+	}
+}
+
+func TestBuildModelList_ExistingCloudModelsNotPushedToBottom(t *testing.T) {
+	existing := []modelInfo{
+		{Name: "glm-4.7-flash", Remote: false},
+		{Name: "glm-4.7:cloud", Remote: true},
+	}
+
+	items, _, _, _ := buildModelList(existing, nil, "")
+	got := names(items)
+
+	// glm-4.7-flash and glm-4.7:cloud are installed so they sort normally;
+	// kimi-k2.5:cloud and qwen3:0.6b are not installed so they go to the bottom
+	want := []string{"glm-4.7-flash", "glm-4.7:cloud", "kimi-k2.5:cloud"}
+	if diff := cmp.Diff(want, got); diff != "" {
+		t.Errorf("existing cloud models should sort normally (-want +got):\n%s", diff)
+	}
+}
+
+func TestBuildModelList_HasRecommendedCloudModel_OnlyNonInstalledAtBottom(t *testing.T) {
+	existing := []modelInfo{
+		{Name: "llama3.2:latest", Remote: false},
+		{Name: "kimi-k2.5:cloud", Remote: true},
+	}
+
+	items, _, _, _ := buildModelList(existing, nil, "")
+	got := names(items)
+
+	// kimi-k2.5:cloud is installed so it sorts normally;
+	// the rest of the recommendations are not installed so they go to the bottom
+	want := []string{"kimi-k2.5:cloud", "llama3.2", "glm-4.7-flash", "glm-4.7:cloud"}
+	if diff := cmp.Diff(want, got); diff != "" {
+		t.Errorf("only non-installed models should be at bottom (-want +got):\n%s", diff)
+	}
+
+	// Non-installed models should have "recommended, install?" description
+	for _, item := range items {
+		if !slices.Contains([]string{"kimi-k2.5:cloud", "llama3.2"}, item.Name) {
+			if item.Description != "recommended, install?" {
+				t.Errorf("non-installed %q should have description 'install?', got %q", item.Name, item.Description)
+			}
+		}
+	}
+}
+
+func TestBuildModelList_LatestTagStripped(t *testing.T) {
+	existing := []modelInfo{
+		{Name: "glm-4.7-flash:latest", Remote: false},
+		{Name: "llama3.2:latest", Remote: false},
+	}
+
+	items, _, existingModels, _ := buildModelList(existing, nil, "")
+	got := names(items)
+
+	// :latest should be stripped from display names
+	for _, name := range got {
+		if strings.HasSuffix(name, ":latest") {
+			t.Errorf("name %q should not have :latest suffix", name)
+		}
+	}
+
+	// glm-4.7-flash should not be duplicated (existing :latest matches the recommendation)
+	count := 0
+	for _, name := range got {
+		if name == "glm-4.7-flash" {
+			count++
+		}
+	}
+	if count != 1 {
+		t.Errorf("glm-4.7-flash should appear exactly once, got %d in %v", count, got)
+	}
+
+	// Stripped name should be in existingModels so it won't be pulled
+	if !existingModels["glm-4.7-flash"] {
+		t.Error("glm-4.7-flash should be in existingModels")
+	}
+}
+
+func TestBuildModelList_ReturnsExistingAndCloudMaps(t *testing.T) {
+	existing := []modelInfo{
+		{Name: "llama3.2:latest", Remote: false},
+		{Name: "glm-4.7:cloud", Remote: true},
+	}
+
+	_, _, existingModels, cloudModels := buildModelList(existing, nil, "")
+
+	if !existingModels["llama3.2"] {
+		t.Error("llama3.2 should be in existingModels")
+	}
+	if !existingModels["glm-4.7:cloud"] {
+		t.Error("glm-4.7:cloud should be in existingModels")
+	}
+	if existingModels["glm-4.7-flash"] {
+		t.Error("glm-4.7-flash should not be in existingModels (it's a recommendation)")
+	}
+
+	if !cloudModels["glm-4.7:cloud"] {
+		t.Error("glm-4.7:cloud should be in cloudModels")
+	}
+	if !cloudModels["kimi-k2.5:cloud"] {
+		t.Error("kimi-k2.5:cloud should be in cloudModels (recommended cloud)")
+	}
+	if cloudModels["llama3.2"] {
+		t.Error("llama3.2 should not be in cloudModels")
+	}
+}
```
cmd/config/openclaw.go (new file, 254 lines)

```go
package config

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io"
	"os"
	"os/exec"
	"path/filepath"
	"strings"

	"github.com/ollama/ollama/envconfig"
)

type Openclaw struct{}

func (c *Openclaw) String() string { return "OpenClaw" }

const ansiGreen = "\033[32m"

func (c *Openclaw) Run(model string) error {
	bin := "openclaw"
	if _, err := exec.LookPath(bin); err != nil {
		bin = "clawdbot"
		if _, err := exec.LookPath(bin); err != nil {
			return fmt.Errorf("openclaw is not installed, install from https://docs.openclaw.ai")
		}
	}

	models := []string{model}
	if config, err := loadIntegration("openclaw"); err == nil && len(config.Models) > 0 {
		models = config.Models
	} else if config, err := loadIntegration("clawdbot"); err == nil && len(config.Models) > 0 {
		models = config.Models
	}
	if err := c.Edit(models); err != nil {
		return fmt.Errorf("setup failed: %w", err)
	}

	if !c.onboarded() {
		// Onboarding not completed: run it (model already set via Edit)
		// Use "ollama" as gateway token for simple local access
		cmd := exec.Command(bin, "onboard",
			"--auth-choice", "skip",
			"--gateway-token", "ollama",
		)
		cmd.Stdin = os.Stdin
		cmd.Stdout = os.Stdout
		cmd.Stderr = os.Stderr
		return cmd.Run()
	}

	// Onboarding completed: run gateway
	cmd := exec.Command(bin, "gateway")
	cmd.Stdin = os.Stdin

	// Capture output to detect "already running" message
	var outputBuf bytes.Buffer
	cmd.Stdout = io.MultiWriter(os.Stdout, &outputBuf)
	cmd.Stderr = io.MultiWriter(os.Stderr, &outputBuf)

	err := cmd.Run()
	if err != nil && strings.Contains(outputBuf.String(), "Gateway already running") {
		fmt.Fprintf(os.Stderr, "%sOpenClaw has been configured with Ollama. Gateway is already running.%s\n", ansiGreen, ansiReset)
		return nil
	}
	return err
}

// onboarded checks if OpenClaw onboarding wizard was completed
// by looking for the wizard.lastRunAt marker in the config
func (c *Openclaw) onboarded() bool {
	home, err := os.UserHomeDir()
	if err != nil {
		return false
	}

	configPath := filepath.Join(home, ".openclaw", "openclaw.json")
	legacyPath := filepath.Join(home, ".clawdbot", "clawdbot.json")

	config := make(map[string]any)
	if data, err := os.ReadFile(configPath); err == nil {
		_ = json.Unmarshal(data, &config)
	} else if data, err := os.ReadFile(legacyPath); err == nil {
		_ = json.Unmarshal(data, &config)
	} else {
		return false
	}

	// Check for wizard.lastRunAt marker (set when onboarding completes)
	wizard, _ := config["wizard"].(map[string]any)
	if wizard == nil {
		return false
	}
	lastRunAt, _ := wizard["lastRunAt"].(string)
	return lastRunAt != ""
}

func (c *Openclaw) Paths() []string {
	home, _ := os.UserHomeDir()
	p := filepath.Join(home, ".openclaw", "openclaw.json")
	if _, err := os.Stat(p); err == nil {
		return []string{p}
	}
	legacy := filepath.Join(home, ".clawdbot", "clawdbot.json")
	if _, err := os.Stat(legacy); err == nil {
		return []string{legacy}
	}
	return nil
}

func (c *Openclaw) Edit(models []string) error {
	if len(models) == 0 {
		return nil
	}

	home, err := os.UserHomeDir()
	if err != nil {
		return err
	}

	configPath := filepath.Join(home, ".openclaw", "openclaw.json")
	legacyPath := filepath.Join(home, ".clawdbot", "clawdbot.json")
	if err := os.MkdirAll(filepath.Dir(configPath), 0o755); err != nil {
		return err
	}

	// Read into map[string]any to preserve unknown fields
	config := make(map[string]any)
	if data, err := os.ReadFile(configPath); err == nil {
		_ = json.Unmarshal(data, &config)
	} else if data, err := os.ReadFile(legacyPath); err == nil {
		_ = json.Unmarshal(data, &config)
	}

	// Navigate/create: models.providers.ollama (preserving other providers)
	modelsSection, _ := config["models"].(map[string]any)
	if modelsSection == nil {
		modelsSection = make(map[string]any)
	}
	providers, _ := modelsSection["providers"].(map[string]any)
	if providers == nil {
		providers = make(map[string]any)
	}
	ollama, _ := providers["ollama"].(map[string]any)
	if ollama == nil {
		ollama = make(map[string]any)
	}

	ollama["baseUrl"] = envconfig.Host().String() + "/v1"
	// needed to register provider
	ollama["apiKey"] = "ollama-local"
	// TODO(parthsareen): potentially move to responses
	ollama["api"] = "openai-completions"

	// Build map of existing models to preserve user customizations
	existingModels, _ := ollama["models"].([]any)
	existingByID := make(map[string]map[string]any)
	for _, m := range existingModels {
		if entry, ok := m.(map[string]any); ok {
			if id, ok := entry["id"].(string); ok {
				existingByID[id] = entry
			}
		}
	}

	var newModels []any
	for _, model := range models {
		entry := map[string]any{
			"id":        model,
			"name":      model,
			"reasoning": false,
			"input":     []any{"text"},
			"cost": map[string]any{
				"input":      0,
				"output":     0,
				"cacheRead":  0,
				"cacheWrite": 0,
			},
			// TODO(parthsareen): get these values from API
			"contextWindow": 131072,
			"maxTokens":     16384,
		}
		// Merge existing fields (user customizations)
		if existing, ok := existingByID[model]; ok {
			for k, v := range existing {
				if _, isNew := entry[k]; !isNew {
					entry[k] = v
				}
			}
		}
		newModels = append(newModels, entry)
	}
	ollama["models"] = newModels

	providers["ollama"] = ollama
	modelsSection["providers"] = providers
	config["models"] = modelsSection

	// Update agents.defaults.model.primary (preserving other agent settings)
	agents, _ := config["agents"].(map[string]any)
	if agents == nil {
		agents = make(map[string]any)
	}
	defaults, _ := agents["defaults"].(map[string]any)
	if defaults == nil {
		defaults = make(map[string]any)
	}
	modelConfig, _ := defaults["model"].(map[string]any)
	if modelConfig == nil {
		modelConfig = make(map[string]any)
	}
	modelConfig["primary"] = "ollama/" + models[0]
	defaults["model"] = modelConfig
	agents["defaults"] = defaults
	config["agents"] = agents

	data, err := json.MarshalIndent(config, "", " ")
	if err != nil {
		return err
	}
	return writeWithBackup(configPath, data)
}

func (c *Openclaw) Models() []string {
	home, err := os.UserHomeDir()
	if err != nil {
		return nil
	}

	config, err := readJSONFile(filepath.Join(home, ".openclaw", "openclaw.json"))
	if err != nil {
		config, err = readJSONFile(filepath.Join(home, ".clawdbot", "clawdbot.json"))
		if err != nil {
			return nil
		}
	}

	modelsSection, _ := config["models"].(map[string]any)
	providers, _ := modelsSection["providers"].(map[string]any)
	ollama, _ := providers["ollama"].(map[string]any)
	modelList, _ := ollama["models"].([]any)

	var result []string
	for _, m := range modelList {
		if entry, ok := m.(map[string]any); ok {
			if id, ok := entry["id"].(string); ok {
				result = append(result, id)
			}
		}
	}
	return result
}
```
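`Edit` rewrites only the `models.providers.ollama` subtree and `agents.defaults.model.primary`, preserving everything else in the file and backing up the previous version via `writeWithBackup`. A hedged in-package sketch of the call sequence the tests below exercise (model names are illustrative; this writes to the real home directory unless redirected, as the tests do with `setTestHome`):

```go
package config

import (
	"fmt"
	"log"
)

func exampleOpenclawEdit() {
	c := &Openclaw{}
	// Registers llama3.2 and mistral under providers.ollama and points
	// agents.defaults.model.primary at "ollama/llama3.2".
	if err := c.Edit([]string{"llama3.2", "mistral"}); err != nil {
		log.Fatal(err)
	}
	fmt.Println(c.Models()) // [llama3.2 mistral]
}
```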
cmd/config/openclaw_test.go (new file, 878 lines)

```go
package config

import (
	"encoding/json"
	"fmt"
	"os"
	"path/filepath"
	"testing"
)

func TestOpenclawIntegration(t *testing.T) {
	c := &Openclaw{}

	t.Run("String", func(t *testing.T) {
		if got := c.String(); got != "OpenClaw" {
			t.Errorf("String() = %q, want %q", got, "OpenClaw")
		}
	})

	t.Run("implements Runner", func(t *testing.T) {
		var _ Runner = c
	})

	t.Run("implements Editor", func(t *testing.T) {
		var _ Editor = c
	})
}

func TestOpenclawEdit(t *testing.T) {
	c := &Openclaw{}
	tmpDir := t.TempDir()
	setTestHome(t, tmpDir)

	configDir := filepath.Join(tmpDir, ".openclaw")
	configPath := filepath.Join(configDir, "openclaw.json")

	cleanup := func() { os.RemoveAll(configDir) }

	t.Run("fresh install", func(t *testing.T) {
		cleanup()
		if err := c.Edit([]string{"llama3.2"}); err != nil {
			t.Fatal(err)
		}
		assertOpenclawModelExists(t, configPath, "llama3.2")
		assertOpenclawPrimaryModel(t, configPath, "ollama/llama3.2")
	})

	t.Run("multiple models - first is primary", func(t *testing.T) {
		cleanup()
		if err := c.Edit([]string{"llama3.2", "mistral"}); err != nil {
			t.Fatal(err)
		}
		assertOpenclawModelExists(t, configPath, "llama3.2")
		assertOpenclawModelExists(t, configPath, "mistral")
		assertOpenclawPrimaryModel(t, configPath, "ollama/llama3.2")
	})

	t.Run("preserve other providers", func(t *testing.T) {
		cleanup()
		os.MkdirAll(configDir, 0o755)
		os.WriteFile(configPath, []byte(`{"models":{"providers":{"anthropic":{"apiKey":"xxx"}}}}`), 0o644)
		if err := c.Edit([]string{"llama3.2"}); err != nil {
			t.Fatal(err)
		}
		data, _ := os.ReadFile(configPath)
		var cfg map[string]any
		json.Unmarshal(data, &cfg)
		models := cfg["models"].(map[string]any)
		providers := models["providers"].(map[string]any)
		if providers["anthropic"] == nil {
			t.Error("anthropic provider was removed")
		}
	})

	t.Run("preserve top-level keys", func(t *testing.T) {
		cleanup()
		os.MkdirAll(configDir, 0o755)
		os.WriteFile(configPath, []byte(`{"theme":"dark","mcp":{"servers":{}}}`), 0o644)
		if err := c.Edit([]string{"llama3.2"}); err != nil {
			t.Fatal(err)
		}
		data, _ := os.ReadFile(configPath)
		var cfg map[string]any
		json.Unmarshal(data, &cfg)
		if cfg["theme"] != "dark" {
			t.Error("theme was removed")
		}
		if cfg["mcp"] == nil {
			t.Error("mcp was removed")
		}
	})

	t.Run("preserve user customizations on models", func(t *testing.T) {
		cleanup()
		c.Edit([]string{"llama3.2"})

		// User adds custom field
		data, _ := os.ReadFile(configPath)
		var cfg map[string]any
		json.Unmarshal(data, &cfg)
		models := cfg["models"].(map[string]any)
		providers := models["providers"].(map[string]any)
		ollama := providers["ollama"].(map[string]any)
		modelList := ollama["models"].([]any)
		entry := modelList[0].(map[string]any)
		entry["customField"] = "user-value"
		configData, _ := json.MarshalIndent(cfg, "", " ")
		os.WriteFile(configPath, configData, 0o644)

		// Re-run Edit
		c.Edit([]string{"llama3.2"})

		data, _ = os.ReadFile(configPath)
		json.Unmarshal(data, &cfg)
		models = cfg["models"].(map[string]any)
		providers = models["providers"].(map[string]any)
		ollama = providers["ollama"].(map[string]any)
		modelList = ollama["models"].([]any)
		entry = modelList[0].(map[string]any)
		if entry["customField"] != "user-value" {
			t.Error("custom field was lost")
		}
	})

	t.Run("edit replaces models list", func(t *testing.T) {
		cleanup()
		c.Edit([]string{"llama3.2", "mistral"})
		c.Edit([]string{"llama3.2"})

		assertOpenclawModelExists(t, configPath, "llama3.2")
		assertOpenclawModelNotExists(t, configPath, "mistral")
	})

	t.Run("empty models is no-op", func(t *testing.T) {
		cleanup()
		os.MkdirAll(configDir, 0o755)
		original := `{"existing":"data"}`
		os.WriteFile(configPath, []byte(original), 0o644)

		c.Edit([]string{})

		data, _ := os.ReadFile(configPath)
		if string(data) != original {
			t.Error("empty models should not modify file")
		}
	})

	t.Run("corrupted JSON treated as empty", func(t *testing.T) {
		cleanup()
		os.MkdirAll(configDir, 0o755)
		os.WriteFile(configPath, []byte(`{corrupted`), 0o644)

		if err := c.Edit([]string{"llama3.2"}); err != nil {
			t.Fatal(err)
		}

		data, _ := os.ReadFile(configPath)
		var cfg map[string]any
		if err := json.Unmarshal(data, &cfg); err != nil {
			t.Error("result should be valid JSON")
		}
	})

	t.Run("wrong type models section", func(t *testing.T) {
		cleanup()
		os.MkdirAll(configDir, 0o755)
		os.WriteFile(configPath, []byte(`{"models":"not a map"}`), 0o644)

		if err := c.Edit([]string{"llama3.2"}); err != nil {
			t.Fatal(err)
		}
		assertOpenclawModelExists(t, configPath, "llama3.2")
	})
}

func TestOpenclawModels(t *testing.T) {
	c := &Openclaw{}
	tmpDir := t.TempDir()
	setTestHome(t, tmpDir)

	t.Run("no config returns nil", func(t *testing.T) {
		if models := c.Models(); len(models) > 0 {
			t.Errorf("expected nil/empty, got %v", models)
		}
	})

	t.Run("returns all ollama models", func(t *testing.T) {
		configDir := filepath.Join(tmpDir, ".openclaw")
		os.MkdirAll(configDir, 0o755)
		os.WriteFile(filepath.Join(configDir, "openclaw.json"), []byte(`{
			"models":{"providers":{"ollama":{"models":[
				{"id":"llama3.2"},
				{"id":"mistral"}
			]}}}
		}`), 0o644)

		models := c.Models()
		if len(models) != 2 {
			t.Errorf("expected 2 models, got %v", models)
		}
	})
}

// Helper functions
func assertOpenclawModelExists(t *testing.T, path, model string) {
	t.Helper()
	data, _ := os.ReadFile(path)
	var cfg map[string]any
	json.Unmarshal(data, &cfg)
	models := cfg["models"].(map[string]any)
	providers := models["providers"].(map[string]any)
	ollama := providers["ollama"].(map[string]any)
	modelList := ollama["models"].([]any)
	for _, m := range modelList {
		if entry, ok := m.(map[string]any); ok {
			if entry["id"] == model {
				return
			}
		}
	}
	t.Errorf("model %s not found", model)
}

func assertOpenclawModelNotExists(t *testing.T, path, model string) {
	t.Helper()
	data, _ := os.ReadFile(path)
	var cfg map[string]any
	json.Unmarshal(data, &cfg)
	models, _ := cfg["models"].(map[string]any)
	providers, _ := models["providers"].(map[string]any)
	ollama, _ := providers["ollama"].(map[string]any)
	modelList, _ := ollama["models"].([]any)
	for _, m := range modelList {
		if entry, ok := m.(map[string]any); ok {
			if entry["id"] == model {
				t.Errorf("model %s should not exist", model)
			}
		}
	}
}

func assertOpenclawPrimaryModel(t *testing.T, path, expected string) {
	t.Helper()
	data, _ := os.ReadFile(path)
	var cfg map[string]any
	json.Unmarshal(data, &cfg)
	agents := cfg["agents"].(map[string]any)
	defaults := agents["defaults"].(map[string]any)
	model := defaults["model"].(map[string]any)
	if model["primary"] != expected {
		t.Errorf("primary model = %v, want %v", model["primary"], expected)
	}
}

func TestOpenclawPaths(t *testing.T) {
	c := &Openclaw{}

	t.Run("returns path when config exists", func(t *testing.T) {
		tmpDir := t.TempDir()
		setTestHome(t, tmpDir)
		configDir := filepath.Join(tmpDir, ".openclaw")
		os.MkdirAll(configDir, 0o755)
		os.WriteFile(filepath.Join(configDir, "openclaw.json"), []byte(`{}`), 0o644)

		paths := c.Paths()
		if len(paths) != 1 {
			t.Errorf("expected 1 path, got %d", len(paths))
		}
	})

	t.Run("returns nil when config missing", func(t *testing.T) {
		tmpDir := t.TempDir()
		setTestHome(t, tmpDir)
		if paths := c.Paths(); paths != nil {
			t.Errorf("expected nil, got %v", paths)
		}
	})
}

func TestOpenclawModelsEdgeCases(t *testing.T) {
	c := &Openclaw{}
	tmpDir := t.TempDir()
	setTestHome(t, tmpDir)
	configDir := filepath.Join(tmpDir, ".openclaw")
	configPath := filepath.Join(configDir, "openclaw.json")
	cleanup := func() { os.RemoveAll(configDir) }

	t.Run("corrupted JSON returns nil", func(t *testing.T) {
		cleanup()
		os.MkdirAll(configDir, 0o755)
		os.WriteFile(configPath, []byte(`{corrupted`), 0o644)
		if models := c.Models(); models != nil {
			t.Errorf("expected nil, got %v", models)
		}
	})

	t.Run("wrong type at models level", func(t *testing.T) {
		cleanup()
		os.MkdirAll(configDir, 0o755)
		os.WriteFile(configPath, []byte(`{"models":"string"}`), 0o644)
		if models := c.Models(); models != nil {
			t.Errorf("expected nil, got %v", models)
		}
	})

	t.Run("wrong type at providers level", func(t *testing.T) {
		cleanup()
		os.MkdirAll(configDir, 0o755)
		os.WriteFile(configPath, []byte(`{"models":{"providers":"string"}}`), 0o644)
		if models := c.Models(); models != nil {
			t.Errorf("expected nil, got %v", models)
		}
	})

	t.Run("wrong type at ollama level", func(t *testing.T) {
		cleanup()
		os.MkdirAll(configDir, 0o755)
		os.WriteFile(configPath, []byte(`{"models":{"providers":{"ollama":"string"}}}`), 0o644)
		if models := c.Models(); models != nil {
			t.Errorf("expected nil, got %v", models)
		}
	})

	t.Run("model entry missing id", func(t *testing.T) {
		cleanup()
		os.MkdirAll(configDir, 0o755)
		os.WriteFile(configPath, []byte(`{"models":{"providers":{"ollama":{"models":[{"name":"test"}]}}}}`), 0o644)
		if len(c.Models()) != 0 {
			t.Error("expected empty for missing id")
		}
	})

	t.Run("model id is not string", func(t *testing.T) {
		cleanup()
		os.MkdirAll(configDir, 0o755)
		os.WriteFile(configPath, []byte(`{"models":{"providers":{"ollama":{"models":[{"id":123}]}}}}`), 0o644)
		if len(c.Models()) != 0 {
			t.Error("expected empty for non-string id")
		}
	})
}

func TestOpenclawEditSchemaFields(t *testing.T) {
	c := &Openclaw{}
	tmpDir := t.TempDir()
	setTestHome(t, tmpDir)
	configPath := filepath.Join(tmpDir, ".openclaw", "openclaw.json")

	if err := c.Edit([]string{"llama3.2"}); err != nil {
		t.Fatal(err)
	}

	data, _ := os.ReadFile(configPath)
	var cfg map[string]any
	json.Unmarshal(data, &cfg)
	models := cfg["models"].(map[string]any)
	providers := models["providers"].(map[string]any)
	ollama := providers["ollama"].(map[string]any)
	modelList := ollama["models"].([]any)
	entry := modelList[0].(map[string]any)

	// Verify required schema fields
	if entry["reasoning"] != false {
		t.Error("reasoning should be false")
	}
	if entry["input"] == nil {
		t.Error("input should be set")
	}
	if entry["contextWindow"] == nil {
		t.Error("contextWindow should be set")
	}
	if entry["maxTokens"] == nil {
		t.Error("maxTokens should be set")
	}
	cost := entry["cost"].(map[string]any)
	if cost["cacheRead"] == nil {
		t.Error("cost.cacheRead should be set")
	}
	if cost["cacheWrite"] == nil {
		t.Error("cost.cacheWrite should be set")
	}
}

func TestOpenclawEditModelNames(t *testing.T) {
	c := &Openclaw{}
	tmpDir := t.TempDir()
	setTestHome(t, tmpDir)
	configPath := filepath.Join(tmpDir, ".openclaw", "openclaw.json")
	cleanup := func() { os.RemoveAll(filepath.Join(tmpDir, ".openclaw")) }

	t.Run("model with colon tag", func(t *testing.T) {
		cleanup()
		if err := c.Edit([]string{"llama3.2:70b"}); err != nil {
			t.Fatal(err)
		}
		assertOpenclawModelExists(t, configPath, "llama3.2:70b")
		assertOpenclawPrimaryModel(t, configPath, "ollama/llama3.2:70b")
	})

	t.Run("model with slash", func(t *testing.T) {
		cleanup()
		if err := c.Edit([]string{"library/model:tag"}); err != nil {
			t.Fatal(err)
		}
		assertOpenclawModelExists(t, configPath, "library/model:tag")
		assertOpenclawPrimaryModel(t, configPath, "ollama/library/model:tag")
	})

	t.Run("model with hyphen", func(t *testing.T) {
		cleanup()
		if err := c.Edit([]string{"test-model"}); err != nil {
			t.Fatal(err)
		}
		assertOpenclawModelExists(t, configPath, "test-model")
	})
}

func TestOpenclawEditAgentsPreservation(t *testing.T) {
	c := &Openclaw{}
	tmpDir := t.TempDir()
	setTestHome(t, tmpDir)
	configDir := filepath.Join(tmpDir, ".openclaw")
	configPath := filepath.Join(configDir, "openclaw.json")
	cleanup := func() { os.RemoveAll(configDir) }

	t.Run("preserve other agent defaults", func(t *testing.T) {
		cleanup()
		os.MkdirAll(configDir, 0o755)
		os.WriteFile(configPath, []byte(`{"agents":{"defaults":{"model":{"primary":"old"},"temperature":0.7}}}`), 0o644)

		c.Edit([]string{"llama3.2"})

		data, _ := os.ReadFile(configPath)
		var cfg map[string]any
		json.Unmarshal(data, &cfg)
		agents := cfg["agents"].(map[string]any)
		defaults := agents["defaults"].(map[string]any)
		if defaults["temperature"] != 0.7 {
			t.Error("temperature setting was lost")
		}
	})

	t.Run("preserve other agents besides defaults", func(t *testing.T) {
		cleanup()
		os.MkdirAll(configDir, 0o755)
		os.WriteFile(configPath, []byte(`{"agents":{"defaults":{},"custom-agent":{"foo":"bar"}}}`), 0o644)

		c.Edit([]string{"llama3.2"})

		data, _ := os.ReadFile(configPath)
		var cfg map[string]any
		json.Unmarshal(data, &cfg)
		agents := cfg["agents"].(map[string]any)
		if agents["custom-agent"] == nil {
			t.Error("custom-agent was lost")
		}
	})
}

const testOpenclawFixture = `{
  "theme": "dark",
  "mcp": {"servers": {"custom": {"enabled": true}}},
  "models": {
    "providers": {
      "anthropic": {"apiKey": "xxx"},
      "ollama": {
        "baseUrl": "http://127.0.0.1:11434/v1",
        "models": [{"id": "old-model", "customField": "preserved"}]
      }
    }
  },
  "agents": {
    "defaults": {"model": {"primary": "old"}, "temperature": 0.7},
    "custom-agent": {"foo": "bar"}
  }
}`

func TestOpenclawEdit_RoundTrip(t *testing.T) {
	c := &Openclaw{}
	tmpDir := t.TempDir()
	setTestHome(t, tmpDir)
	configDir := filepath.Join(tmpDir, ".openclaw")
	configPath := filepath.Join(configDir, "openclaw.json")

	os.MkdirAll(configDir, 0o755)
	os.WriteFile(configPath, []byte(testOpenclawFixture), 0o644)

	if err := c.Edit([]string{"llama3.2", "mistral"}); err != nil {
		t.Fatal(err)
	}

	data, _ := os.ReadFile(configPath)
	var cfg map[string]any
	json.Unmarshal(data, &cfg)

	// Verify top-level preserved
	if cfg["theme"] != "dark" {
		t.Error("theme not preserved")
	}
	mcp := cfg["mcp"].(map[string]any)
	servers := mcp["servers"].(map[string]any)
	if servers["custom"] == nil {
		t.Error("mcp.servers.custom not preserved")
	}

	// Verify other providers preserved
	models := cfg["models"].(map[string]any)
	providers := models["providers"].(map[string]any)
	if providers["anthropic"] == nil {
		t.Error("anthropic provider not preserved")
	}

	// Verify agents preserved
	agents := cfg["agents"].(map[string]any)
	if agents["custom-agent"] == nil {
		t.Error("custom-agent not preserved")
	}
	defaults := agents["defaults"].(map[string]any)
	if defaults["temperature"] != 0.7 {
		t.Error("temperature not preserved")
	}
}

func TestOpenclawEdit_Idempotent(t *testing.T) {
	c := &Openclaw{}
	tmpDir := t.TempDir()
	setTestHome(t, tmpDir)
	configDir := filepath.Join(tmpDir, ".openclaw")
	configPath := filepath.Join(configDir, "openclaw.json")

	os.MkdirAll(configDir, 0o755)
	os.WriteFile(configPath, []byte(testOpenclawFixture), 0o644)

	c.Edit([]string{"llama3.2", "mistral"})
	firstData, _ := os.ReadFile(configPath)

	c.Edit([]string{"llama3.2", "mistral"})
	secondData, _ := os.ReadFile(configPath)

	if string(firstData) != string(secondData) {
		t.Error("repeated edits with same models produced different results")
	}
}

func TestOpenclawEdit_MultipleConsecutiveEdits(t *testing.T) {
	c := &Openclaw{}
	tmpDir := t.TempDir()
	setTestHome(t, tmpDir)
	configDir := filepath.Join(tmpDir, ".openclaw")
	configPath := filepath.Join(configDir, "openclaw.json")

	os.MkdirAll(configDir, 0o755)
	os.WriteFile(configPath, []byte(testOpenclawFixture), 0o644)

	for i := range 10 {
		models := []string{"model-a", "model-b"}
		if i%2 == 0 {
			models = []string{"model-x", "model-y", "model-z"}
		}
		if err := c.Edit(models); err != nil {
			t.Fatalf("edit %d failed: %v", i, err)
		}
	}

	data, _ := os.ReadFile(configPath)
	var cfg map[string]any
	if err := json.Unmarshal(data, &cfg); err != nil {
		t.Fatalf("file is not valid JSON after multiple edits: %v", err)
	}

	if cfg["theme"] != "dark" {
		t.Error("theme lost after multiple edits")
	}
}

func TestOpenclawEdit_BackupCreated(t *testing.T) {
	c := &Openclaw{}
	tmpDir := t.TempDir()
	setTestHome(t, tmpDir)
	configDir := filepath.Join(tmpDir, ".openclaw")
	configPath := filepath.Join(configDir, "openclaw.json")
	backupDir := filepath.Join(os.TempDir(), "ollama-backups")

	os.MkdirAll(configDir, 0o755)
	uniqueMarker := fmt.Sprintf("test-marker-%d", os.Getpid())
	original := fmt.Sprintf(`{"theme": "%s"}`, uniqueMarker)
	os.WriteFile(configPath, []byte(original), 0o644)

	if err := c.Edit([]string{"model-a"}); err != nil {
		t.Fatal(err)
	}

	backups, _ := filepath.Glob(filepath.Join(backupDir, "openclaw.json.*"))
	foundBackup := false
	for _, backup := range backups {
		data, _ := os.ReadFile(backup)
		if string(data) == original {
			foundBackup = true
			break
		}
	}

	if !foundBackup {
		t.Error("backup with original content not found")
	}
}

func TestOpenclawClawdbotAlias(t *testing.T) {
	for _, alias := range []string{"clawdbot", "moltbot"} {
		t.Run(alias+" alias resolves to Openclaw runner", func(t *testing.T) {
			r, ok := integrations[alias]
			if !ok {
				t.Fatalf("%s not found in integrations", alias)
			}
			if _, ok := r.(*Openclaw); !ok {
				t.Errorf("%s integration is %T, want *Openclaw", alias, r)
			}
		})

		t.Run(alias+" is hidden from selector", func(t *testing.T) {
			if !integrationAliases[alias] {
				t.Errorf("%s should be in integrationAliases", alias)
			}
		})
	}
}

func TestOpenclawLegacyPaths(t *testing.T) {
	c := &Openclaw{}

	t.Run("falls back to legacy clawdbot path", func(t *testing.T) {
		tmpDir := t.TempDir()
		setTestHome(t, tmpDir)
		legacyDir := filepath.Join(tmpDir, ".clawdbot")
		os.MkdirAll(legacyDir, 0o755)
		os.WriteFile(filepath.Join(legacyDir, "clawdbot.json"), []byte(`{}`), 0o644)

		paths := c.Paths()
		if len(paths) != 1 {
			t.Fatalf("expected 1 path, got %d", len(paths))
		}
		if paths[0] != filepath.Join(legacyDir, "clawdbot.json") {
			t.Errorf("expected legacy path, got %s", paths[0])
		}
	})

	t.Run("prefers new path over legacy", func(t *testing.T) {
		tmpDir := t.TempDir()
		setTestHome(t, tmpDir)
		newDir := filepath.Join(tmpDir, ".openclaw")
		legacyDir := filepath.Join(tmpDir, ".clawdbot")
		os.MkdirAll(newDir, 0o755)
		os.MkdirAll(legacyDir, 0o755)
		os.WriteFile(filepath.Join(newDir, "openclaw.json"), []byte(`{}`), 0o644)
		os.WriteFile(filepath.Join(legacyDir, "clawdbot.json"), []byte(`{}`), 0o644)

		paths := c.Paths()
		if len(paths) != 1 {
			t.Fatalf("expected 1 path, got %d", len(paths))
		}
		if paths[0] != filepath.Join(newDir, "openclaw.json") {
			t.Errorf("expected new path, got %s", paths[0])
		}
	})

	t.Run("Models reads from legacy path", func(t *testing.T) {
		tmpDir := t.TempDir()
		setTestHome(t, tmpDir)
		legacyDir := filepath.Join(tmpDir, ".clawdbot")
		os.MkdirAll(legacyDir, 0o755)
		os.WriteFile(filepath.Join(legacyDir, "clawdbot.json"), []byte(`{
			"models":{"providers":{"ollama":{"models":[{"id":"llama3.2"}]}}}
		}`), 0o644)

		models := c.Models()
		if len(models) != 1 || models[0] != "llama3.2" {
			t.Errorf("expected [llama3.2], got %v", models)
		}
	})

	t.Run("Models prefers new path over legacy", func(t *testing.T) {
		tmpDir := t.TempDir()
		setTestHome(t, tmpDir)
		newDir := filepath.Join(tmpDir, ".openclaw")
		legacyDir := filepath.Join(tmpDir, ".clawdbot")
		os.MkdirAll(newDir, 0o755)
		os.MkdirAll(legacyDir, 0o755)
		os.WriteFile(filepath.Join(newDir, "openclaw.json"), []byte(`{
```
|
||||
"models":{"providers":{"ollama":{"models":[{"id":"new-model"}]}}}
|
||||
}`), 0o644)
|
||||
os.WriteFile(filepath.Join(legacyDir, "clawdbot.json"), []byte(`{
|
||||
"models":{"providers":{"ollama":{"models":[{"id":"legacy-model"}]}}}
|
||||
}`), 0o644)
|
||||
|
||||
models := c.Models()
|
||||
if len(models) != 1 || models[0] != "new-model" {
|
||||
t.Errorf("expected [new-model], got %v", models)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("Edit reads new path over legacy when both exist", func(t *testing.T) {
|
||||
tmpDir := t.TempDir()
|
||||
setTestHome(t, tmpDir)
|
||||
newDir := filepath.Join(tmpDir, ".openclaw")
|
||||
legacyDir := filepath.Join(tmpDir, ".clawdbot")
|
||||
os.MkdirAll(newDir, 0o755)
|
||||
os.MkdirAll(legacyDir, 0o755)
|
||||
os.WriteFile(filepath.Join(newDir, "openclaw.json"), []byte(`{"theme":"new"}`), 0o644)
|
||||
os.WriteFile(filepath.Join(legacyDir, "clawdbot.json"), []byte(`{"theme":"legacy"}`), 0o644)
|
||||
|
||||
if err := c.Edit([]string{"llama3.2"}); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
data, _ := os.ReadFile(filepath.Join(newDir, "openclaw.json"))
|
||||
var cfg map[string]any
|
||||
json.Unmarshal(data, &cfg)
|
||||
if cfg["theme"] != "new" {
|
||||
t.Errorf("expected theme from new config, got %v", cfg["theme"])
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("Edit migrates from legacy config", func(t *testing.T) {
|
||||
tmpDir := t.TempDir()
|
||||
setTestHome(t, tmpDir)
|
||||
legacyDir := filepath.Join(tmpDir, ".clawdbot")
|
||||
os.MkdirAll(legacyDir, 0o755)
|
||||
os.WriteFile(filepath.Join(legacyDir, "clawdbot.json"), []byte(`{"theme":"dark"}`), 0o644)
|
||||
|
||||
if err := c.Edit([]string{"llama3.2"}); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Should write to new path
|
||||
newPath := filepath.Join(tmpDir, ".openclaw", "openclaw.json")
|
||||
data, err := os.ReadFile(newPath)
|
||||
if err != nil {
|
||||
t.Fatal("expected new config file to be created")
|
||||
}
|
||||
var cfg map[string]any
|
||||
json.Unmarshal(data, &cfg)
|
||||
if cfg["theme"] != "dark" {
|
||||
t.Error("legacy theme setting was not migrated")
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestOpenclawEdit_CreatesDirectoryIfMissing(t *testing.T) {
|
||||
c := &Openclaw{}
|
||||
tmpDir := t.TempDir()
|
||||
setTestHome(t, tmpDir)
|
||||
configDir := filepath.Join(tmpDir, ".openclaw")
|
||||
|
||||
if _, err := os.Stat(configDir); !os.IsNotExist(err) {
|
||||
t.Fatal("directory should not exist before test")
|
||||
}
|
||||
|
||||
if err := c.Edit([]string{"model-a"}); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, err := os.Stat(configDir); os.IsNotExist(err) {
|
||||
t.Fatal("directory was not created")
|
||||
}
|
||||
}
|
||||
|
||||
func TestOpenclawOnboarded(t *testing.T) {
|
||||
c := &Openclaw{}
|
||||
|
||||
t.Run("returns false when no config exists", func(t *testing.T) {
|
||||
tmpDir := t.TempDir()
|
||||
setTestHome(t, tmpDir)
|
||||
if c.onboarded() {
|
||||
t.Error("expected false when no config exists")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("returns false when config exists but no wizard section", func(t *testing.T) {
|
||||
tmpDir := t.TempDir()
|
||||
setTestHome(t, tmpDir)
|
||||
configDir := filepath.Join(tmpDir, ".openclaw")
|
||||
os.MkdirAll(configDir, 0o755)
|
||||
os.WriteFile(filepath.Join(configDir, "openclaw.json"), []byte(`{"theme":"dark"}`), 0o644)
|
||||
|
||||
if c.onboarded() {
|
||||
t.Error("expected false when no wizard section")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("returns false when wizard section exists but no lastRunAt", func(t *testing.T) {
|
||||
tmpDir := t.TempDir()
|
||||
setTestHome(t, tmpDir)
|
||||
configDir := filepath.Join(tmpDir, ".openclaw")
|
||||
os.MkdirAll(configDir, 0o755)
|
||||
os.WriteFile(filepath.Join(configDir, "openclaw.json"), []byte(`{"wizard":{}}`), 0o644)
|
||||
|
||||
if c.onboarded() {
|
||||
t.Error("expected false when wizard.lastRunAt is missing")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("returns false when wizard.lastRunAt is empty string", func(t *testing.T) {
|
||||
tmpDir := t.TempDir()
|
||||
setTestHome(t, tmpDir)
|
||||
configDir := filepath.Join(tmpDir, ".openclaw")
|
||||
os.MkdirAll(configDir, 0o755)
|
||||
os.WriteFile(filepath.Join(configDir, "openclaw.json"), []byte(`{"wizard":{"lastRunAt":""}}`), 0o644)
|
||||
|
||||
if c.onboarded() {
|
||||
t.Error("expected false when wizard.lastRunAt is empty")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("returns true when wizard.lastRunAt is set", func(t *testing.T) {
|
||||
tmpDir := t.TempDir()
|
||||
setTestHome(t, tmpDir)
|
||||
configDir := filepath.Join(tmpDir, ".openclaw")
|
||||
os.MkdirAll(configDir, 0o755)
|
||||
os.WriteFile(filepath.Join(configDir, "openclaw.json"), []byte(`{"wizard":{"lastRunAt":"2024-01-01T00:00:00Z"}}`), 0o644)
|
||||
|
||||
if !c.onboarded() {
|
||||
t.Error("expected true when wizard.lastRunAt is set")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("checks legacy clawdbot path", func(t *testing.T) {
|
||||
tmpDir := t.TempDir()
|
||||
setTestHome(t, tmpDir)
|
||||
legacyDir := filepath.Join(tmpDir, ".clawdbot")
|
||||
os.MkdirAll(legacyDir, 0o755)
|
||||
os.WriteFile(filepath.Join(legacyDir, "clawdbot.json"), []byte(`{"wizard":{"lastRunAt":"2024-01-01T00:00:00Z"}}`), 0o644)
|
||||
|
||||
if !c.onboarded() {
|
||||
t.Error("expected true when legacy config has wizard.lastRunAt")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("prefers new path over legacy", func(t *testing.T) {
|
||||
tmpDir := t.TempDir()
|
||||
setTestHome(t, tmpDir)
|
||||
newDir := filepath.Join(tmpDir, ".openclaw")
|
||||
legacyDir := filepath.Join(tmpDir, ".clawdbot")
|
||||
os.MkdirAll(newDir, 0o755)
|
||||
os.MkdirAll(legacyDir, 0o755)
|
||||
// New path has no wizard marker
|
||||
os.WriteFile(filepath.Join(newDir, "openclaw.json"), []byte(`{}`), 0o644)
|
||||
// Legacy has wizard marker
|
||||
os.WriteFile(filepath.Join(legacyDir, "clawdbot.json"), []byte(`{"wizard":{"lastRunAt":"2024-01-01T00:00:00Z"}}`), 0o644)
|
||||
|
||||
if c.onboarded() {
|
||||
t.Error("expected false - should prefer new path which has no wizard marker")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("handles corrupted JSON gracefully", func(t *testing.T) {
|
||||
tmpDir := t.TempDir()
|
||||
setTestHome(t, tmpDir)
|
||||
configDir := filepath.Join(tmpDir, ".openclaw")
|
||||
os.MkdirAll(configDir, 0o755)
|
||||
os.WriteFile(filepath.Join(configDir, "openclaw.json"), []byte(`{corrupted`), 0o644)
|
||||
|
||||
if c.onboarded() {
|
||||
t.Error("expected false for corrupted JSON")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("handles wrong type for wizard section", func(t *testing.T) {
|
||||
tmpDir := t.TempDir()
|
||||
setTestHome(t, tmpDir)
|
||||
configDir := filepath.Join(tmpDir, ".openclaw")
|
||||
os.MkdirAll(configDir, 0o755)
|
||||
os.WriteFile(filepath.Join(configDir, "openclaw.json"), []byte(`{"wizard":"not a map"}`), 0o644)
|
||||
|
||||
if c.onboarded() {
|
||||
t.Error("expected false when wizard is wrong type")
|
||||
}
|
||||
})
|
||||
}
|
||||
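These tests all rely on a `setTestHome` helper that is defined elsewhere in the package and is not part of this diff. A minimal sketch of what such a helper might look like, assuming it only needs to point home-directory lookups at the test's temp dir (the Windows variable is an assumption):

```go
// Hypothetical sketch of the setTestHome helper referenced above; the real
// implementation is not shown in this changeset.
func setTestHome(t *testing.T, dir string) {
	t.Helper()
	t.Setenv("HOME", dir)        // os.UserHomeDir on Unix reads $HOME
	t.Setenv("USERPROFILE", dir) // os.UserHomeDir on Windows reads %USERPROFILE% (assumption)
}
```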
@@ -9,6 +9,8 @@ import (
	"path/filepath"
	"slices"
	"strings"

	"github.com/ollama/ollama/envconfig"
)

// OpenCode implements Runner and Editor for OpenCode integration
@@ -88,7 +90,7 @@ func (o *OpenCode) Edit(modelList []string) error {
			"npm":  "@ai-sdk/openai-compatible",
			"name": "Ollama (local)",
			"options": map[string]any{
				"baseURL": "http://localhost:11434/v1",
				"baseURL": envconfig.Host().String() + "/v1",
			},
		}
	}
@@ -105,17 +107,26 @@ func (o *OpenCode) Edit(modelList []string) error {

	for name, cfg := range models {
		if cfgMap, ok := cfg.(map[string]any); ok {
			if displayName, ok := cfgMap["name"].(string); ok {
				if strings.HasSuffix(displayName, "[Ollama]") && !selectedSet[name] {
					delete(models, name)
				}
			if isOllamaModel(cfgMap) && !selectedSet[name] {
				delete(models, name)
			}
		}
	}

	for _, model := range modelList {
		if existing, ok := models[model].(map[string]any); ok {
			// migrate existing models without _launch marker
			if isOllamaModel(existing) {
				existing["_launch"] = true
				if name, ok := existing["name"].(string); ok {
					existing["name"] = strings.TrimSuffix(name, " [Ollama]")
				}
			}
			continue
		}
		models[model] = map[string]any{
			"name": fmt.Sprintf("%s [Ollama]", model),
			"name": model,
			"_launch": true,
		}
	}

@@ -201,3 +212,15 @@ func (o *OpenCode) Models() []string {
	slices.Sort(keys)
	return keys
}

// isOllamaModel reports whether a model config entry is managed by us
func isOllamaModel(cfg map[string]any) bool {
	if v, ok := cfg["_launch"].(bool); ok && v {
		return true
	}
	// previously used [Ollama] as a suffix for the model managed by ollama launch
	if name, ok := cfg["name"].(string); ok {
		return strings.HasSuffix(name, "[Ollama]")
	}
	return false
}
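To make the detection rule concrete, a small self-contained sketch (the `isOllamaModel` logic is copied from the hunk above; the sample config maps are illustrative) showing that both the new `_launch` marker and the legacy `[Ollama]` name suffix are recognized, while user-defined entries are left alone:

```go
package main

import (
	"fmt"
	"strings"
)

// isOllamaModel mirrors the helper added in the diff above.
func isOllamaModel(cfg map[string]any) bool {
	if v, ok := cfg["_launch"].(bool); ok && v {
		return true
	}
	// legacy entries were tagged with an "[Ollama]" name suffix
	if name, ok := cfg["name"].(string); ok {
		return strings.HasSuffix(name, "[Ollama]")
	}
	return false
}

func main() {
	marked := map[string]any{"name": "qwen3-coder", "_launch": true}
	legacy := map[string]any{"name": "qwen3-coder [Ollama]"}
	user := map[string]any{"name": "my-custom-model"}

	fmt.Println(isOllamaModel(marked)) // true: new _launch marker
	fmt.Println(isOllamaModel(legacy)) // true: legacy name suffix, migrated by Edit
	fmt.Println(isOllamaModel(user))   // false: left untouched by Edit
}
```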
@@ -161,6 +161,76 @@ func TestOpenCodeEdit(t *testing.T) {
		assertOpenCodeModelNotExists(t, configPath, "mistral")
	})

	t.Run("preserve user customizations on managed models", func(t *testing.T) {
		cleanup()
		if err := o.Edit([]string{"llama3.2"}); err != nil {
			t.Fatal(err)
		}

		// Add custom fields to the model entry (simulating user edits)
		data, _ := os.ReadFile(configPath)
		var cfg map[string]any
		json.Unmarshal(data, &cfg)
		provider := cfg["provider"].(map[string]any)
		ollama := provider["ollama"].(map[string]any)
		models := ollama["models"].(map[string]any)
		entry := models["llama3.2"].(map[string]any)
		entry["_myPref"] = "custom-value"
		entry["_myNum"] = 42
		configData, _ := json.MarshalIndent(cfg, "", " ")
		os.WriteFile(configPath, configData, 0o644)

		// Re-run Edit — should preserve custom fields
		if err := o.Edit([]string{"llama3.2"}); err != nil {
			t.Fatal(err)
		}

		data, _ = os.ReadFile(configPath)
		json.Unmarshal(data, &cfg)
		provider = cfg["provider"].(map[string]any)
		ollama = provider["ollama"].(map[string]any)
		models = ollama["models"].(map[string]any)
		entry = models["llama3.2"].(map[string]any)

		if entry["_myPref"] != "custom-value" {
			t.Errorf("_myPref was lost: got %v", entry["_myPref"])
		}
		if entry["_myNum"] != float64(42) {
			t.Errorf("_myNum was lost: got %v", entry["_myNum"])
		}
		if v, ok := entry["_launch"].(bool); !ok || !v {
			t.Errorf("_launch marker missing or false: got %v", entry["_launch"])
		}
	})

	t.Run("migrate legacy [Ollama] suffix entries", func(t *testing.T) {
		cleanup()
		// Write a config with a legacy entry (has [Ollama] suffix but no _launch marker)
		os.MkdirAll(configDir, 0o755)
		os.WriteFile(configPath, []byte(`{"provider":{"ollama":{"models":{"llama3.2":{"name":"llama3.2 [Ollama]"}}}}}`), 0o644)

		if err := o.Edit([]string{"llama3.2"}); err != nil {
			t.Fatal(err)
		}

		data, _ := os.ReadFile(configPath)
		var cfg map[string]any
		json.Unmarshal(data, &cfg)
		provider := cfg["provider"].(map[string]any)
		ollama := provider["ollama"].(map[string]any)
		models := ollama["models"].(map[string]any)
		entry := models["llama3.2"].(map[string]any)

		// _launch marker should be added
		if v, ok := entry["_launch"].(bool); !ok || !v {
			t.Errorf("_launch marker not added during migration: got %v", entry["_launch"])
		}
		// [Ollama] suffix should be stripped
		if name, ok := entry["name"].(string); !ok || name != "llama3.2" {
			t.Errorf("name suffix not stripped: got %q", entry["name"])
		}
	})

	t.Run("remove model preserves non-ollama models", func(t *testing.T) {
		cleanup()
		os.MkdirAll(configDir, 0o755)
@@ -275,7 +275,11 @@ func parseInput(r io.Reader) (inputEvent, byte, error) {
func renderSelect(w io.Writer, prompt string, s *selectState) int {
	filtered := s.filtered()

	fmt.Fprintf(w, "%s %s\r\n", prompt, s.filter)
	if s.filter == "" {
		fmt.Fprintf(w, "%s %sType to filter...%s\r\n", prompt, ansiGray, ansiReset)
	} else {
		fmt.Fprintf(w, "%s %s\r\n", prompt, s.filter)
	}
	lineCount := 1

	if len(filtered) == 0 {
@@ -314,7 +318,11 @@ func renderSelect(w io.Writer, prompt string, s *selectState) int {
func renderMultiSelect(w io.Writer, prompt string, s *multiSelectState) int {
	filtered := s.filtered()

	fmt.Fprintf(w, "%s %s\r\n", prompt, s.filter)
	if s.filter == "" {
		fmt.Fprintf(w, "%s %sType to filter...%s\r\n", prompt, ansiGray, ansiReset)
	} else {
		fmt.Fprintf(w, "%s %s\r\n", prompt, s.filter)
	}
	lineCount := 1

	if len(filtered) == 0 {
@@ -345,10 +353,15 @@ func renderMultiSelect(w io.Writer, prompt string, s *multiSelectState) int {
			suffix = " " + ansiGray + "(default)" + ansiReset
		}

		desc := ""
		if item.Description != "" {
			desc = " " + ansiGray + "- " + item.Description + ansiReset
		}

		if idx == s.highlighted && !s.focusOnButton {
			fmt.Fprintf(w, "  %s%s %s %s%s%s\r\n", ansiBold, prefix, checkbox, item.Name, ansiReset, suffix)
			fmt.Fprintf(w, "  %s%s %s %s%s%s%s\r\n", ansiBold, prefix, checkbox, item.Name, ansiReset, desc, suffix)
		} else {
			fmt.Fprintf(w, "  %s %s %s%s\r\n", prefix, checkbox, item.Name, suffix)
			fmt.Fprintf(w, "  %s %s %s%s%s\r\n", prefix, checkbox, item.Name, desc, suffix)
		}
		lineCount++
	}
@@ -4,16 +4,6 @@ title: Anthropic compatibility

Ollama provides compatibility with the [Anthropic Messages API](https://docs.anthropic.com/en/api/messages) to help connect existing applications to Ollama, including tools like Claude Code.

## Recommended models

For coding use cases, models like `glm-4.7:cloud`, `minimax-m2.1:cloud`, and `qwen3-coder` are recommended.

Pull a model before use:

```shell
ollama pull qwen3-coder
ollama pull glm-4.7:cloud
```

## Usage

### Environment variables

@@ -22,8 +12,8 @@ To use Ollama with tools that expect the Anthropic API (like Claude Code), set t

```shell
export ANTHROPIC_AUTH_TOKEN=ollama # required but ignored
export ANTHROPIC_API_KEY="" # required but ignored
export ANTHROPIC_BASE_URL=http://localhost:11434
export ANTHROPIC_API_KEY=ollama # required but ignored
```

### Simple `/v1/messages` example
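The body of this example is elided from the hunk below. As an illustration, a minimal Go request against the endpoint documented here; the model name and prompt are placeholders, and the request shape follows the Anthropic Messages API that this page documents:

```go
package main

import (
	"bytes"
	"fmt"
	"io"
	"net/http"
)

func main() {
	// Minimal Anthropic-style messages request against a local Ollama server.
	body := []byte(`{
		"model": "qwen3-coder",
		"max_tokens": 256,
		"messages": [{"role": "user", "content": "Why is the sky blue?"}]
	}`)

	req, err := http.NewRequest("POST", "http://localhost:11434/v1/messages", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("x-api-key", "ollama") // required but ignored by Ollama

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	out, _ := io.ReadAll(resp.Body)
	fmt.Println(string(out))
}
```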
@@ -245,10 +235,41 @@ curl -X POST http://localhost:11434/v1/messages \

## Using with Claude Code

[Claude Code](https://code.claude.com/docs/en/overview) can be configured to use Ollama as its backend:
[Claude Code](https://code.claude.com/docs/en/overview) can be configured to use Ollama as its backend.

### Recommended models

For coding use cases, models like `glm-4.7`, `minimax-m2.1`, and `qwen3-coder` are recommended.

Download a model before use:

```shell
ANTHROPIC_AUTH_TOKEN=ollama ANTHROPIC_BASE_URL=http://localhost:11434 ANTHROPIC_API_KEY=ollama claude --model qwen3-coder
ollama pull qwen3-coder
```

> Note: Qwen 3 coder is a 30B parameter model requiring at least 24GB of VRAM to run smoothly. More is required for longer context lengths.

```shell
ollama pull glm-4.7:cloud
```

### Quick setup

```shell
ollama launch claude
```

This will prompt you to select a model, configure Claude Code automatically, and launch it. To configure without launching:

```shell
ollama launch claude --config
```

### Manual setup

Set the environment variables and run Claude Code:

```shell
ANTHROPIC_AUTH_TOKEN=ollama ANTHROPIC_BASE_URL=http://localhost:11434 ANTHROPIC_API_KEY="" claude --model qwen3-coder
```

Or set the environment variables in your shell profile:

@@ -256,19 +277,13 @@ Or set the environment variables in your shell profile:

```shell
export ANTHROPIC_AUTH_TOKEN=ollama
export ANTHROPIC_BASE_URL=http://localhost:11434
export ANTHROPIC_API_KEY=ollama
export ANTHROPIC_API_KEY=""
```

Then run Claude Code with any Ollama model:

```shell
# Local models
claude --model qwen3-coder
claude --model gpt-oss:20b

# Cloud models
claude --model glm-4.7:cloud
claude --model minimax-m2.1:cloud
```

## Endpoints
docs/cli.mdx (41 lines changed)
@@ -8,6 +8,47 @@ title: CLI Reference
ollama run gemma3
```

### Launch integrations

```
ollama launch
```

Configure and launch external applications to use Ollama models. This provides an interactive way to set up and start integrations with supported apps.

#### Supported integrations

- **OpenCode** - Open-source coding assistant
- **Claude Code** - Anthropic's agentic coding tool
- **Codex** - OpenAI's coding assistant
- **Droid** - Factory's AI coding agent

#### Examples

Launch an integration interactively:

```
ollama launch
```

Launch a specific integration:

```
ollama launch claude
```

Launch with a specific model:

```
ollama launch claude --model qwen3-coder
```

Configure without launching:

```
ollama launch droid --config
```

#### Multiline input

For multiline input, you can wrap text with `"""`:
@@ -3,8 +3,6 @@ title: Cloud
sidebarTitle: Cloud
---

<Info>Ollama's cloud is currently in preview.</Info>

## Cloud Models

Ollama's cloud models are a new kind of model in Ollama that can run without a powerful GPU. Instead, cloud models are automatically offloaded to Ollama's cloud service while offering the same capabilities as local models, making it possible to keep using your local tools while running larger models that wouldn't fit on a personal computer.
@@ -8,7 +8,7 @@ Context length is the maximum number of tokens that the model has access to in m
The default context length in Ollama is 4096 tokens.
</Note>

Tasks which require large context, like web search, agents, and coding tools, should use at least 32000 tokens.
Tasks which require large context, like web search, agents, and coding tools, should use at least 64000 tokens.

## Setting context length

@@ -24,7 +24,7 @@ Change the slider in the Ollama app under settings to your desired context lengt
### CLI
If editing the context length in the app is not possible, the context length can also be set when serving Ollama.
```
OLLAMA_CONTEXT_LENGTH=32000 ollama serve
OLLAMA_CONTEXT_LENGTH=64000 ollama serve
```
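Context length can also be set per request from code. A minimal sketch using Ollama's Go client (assuming the standard `github.com/ollama/ollama/api` package), where the `num_ctx` option overrides the server default for that request only:

```go
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	req := &api.GenerateRequest{
		Model:  "qwen3-coder",
		Prompt: "Summarize this repository layout.",
		// num_ctx sets the context length for this request only.
		Options: map[string]any{"num_ctx": 64000},
	}

	err = client.Generate(context.Background(), req, func(resp api.GenerateResponse) error {
		fmt.Print(resp.Response)
		return nil
	})
	if err != nil {
		log.Fatal(err)
	}
}
```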
### Check allocated context length and model offloading
@@ -71,6 +71,10 @@
    {
      "source": "/api",
      "destination": "/api/introduction"
    },
    {
      "source": "/integrations/clawdbot",
      "destination": "/integrations/openclaw"
    }
  ],
  "navigation": {
@@ -102,18 +106,20 @@
        "group": "Integrations",
        "pages": [
          "/integrations/claude-code",
          "/integrations/vscode",
          "/integrations/jetbrains",
          "/integrations/codex",
          "/integrations/cline",
          "/integrations/openclaw",
          "/integrations/codex",
          "/integrations/droid",
          "/integrations/goose",
          "/integrations/zed",
          "/integrations/roo-code",
          "/integrations/jetbrains",
          "/integrations/marimo",
          "/integrations/n8n",
          "/integrations/xcode",
          "/integrations/onyx",
          "/integrations/marimo"
          "/integrations/opencode",
          "/integrations/roo-code",
          "/integrations/vscode",
          "/integrations/xcode",
          "/integrations/zed"
        ]
      },
      {
@@ -10,6 +10,7 @@ Check your compute compatibility to see if your card is supported:

| Compute Capability | Family              | Cards                                                                   |
| ------------------ | ------------------- | ----------------------------------------------------------------------- |
| 12.1               | NVIDIA              | `GB10 (DGX Spark)`                                                      |
| 12.0               | GeForce RTX 50xx    | `RTX 5060` `RTX 5060 Ti` `RTX 5070` `RTX 5070 Ti` `RTX 5080` `RTX 5090` |
|                    | NVIDIA Professional | `RTX PRO 4000 Blackwell` `RTX PRO 4500 Blackwell` `RTX PRO 5000 Blackwell` `RTX PRO 6000 Blackwell` |
| 9.0                | NVIDIA              | `H200` `H100`                                                           |
@@ -163,4 +164,4 @@ To select specific Vulkan GPU(s), you can set the environment variable
`GGML_VK_VISIBLE_DEVICES` to one or more numeric IDs on the Ollama server as
described in the [FAQ](faq#how-do-i-configure-ollama-server). If you
encounter any problems with Vulkan based GPUs, you can disable all Vulkan GPUs
by setting `GGML_VK_VISIBLE_DEVICES=-1`
@@ -134,22 +134,12 @@ success

### Supported Quantizations

- `q4_0`
- `q4_1`
- `q5_0`
- `q5_1`
- `q8_0`

#### K-means Quantizations

- `q3_K_S`
- `q3_K_M`
- `q3_K_L`
- `q4_K_S`
- `q4_K_M`
- `q5_K_S`
- `q5_K_M`
- `q6_K`

## Sharing your model on ollama.com
@@ -9,7 +9,7 @@ sidebarTitle: Welcome

<CardGroup cols={2}>
  <Card title="Quickstart" icon="rocket" href="/quickstart">
    Get up and running with your first model
    Get up and running with your first model or integrate Ollama with your favorite tools
  </Card>
  <Card
    title="Download Ollama"
@@ -4,7 +4,7 @@ title: Claude Code

Claude Code is Anthropic's agentic coding tool that can read, modify, and execute code in your working directory.

Open models can be used with Claude Code through Ollama's Anthropic-compatible API, enabling you to use models such as `qwen3-coder`, `gpt-oss:20b`, or other models.
Open models can be used with Claude Code through Ollama's Anthropic-compatible API, enabling you to use models such as `glm-4.7`, `qwen3-coder`, `gpt-oss`.
@@ -26,12 +26,27 @@ irm https://claude.ai/install.ps1 | iex

## Usage with Ollama

### Quick setup

```shell
ollama launch claude
```

To configure without launching:

```shell
ollama launch claude --config
```

### Manual setup

Claude Code connects to Ollama using the Anthropic-compatible API.

1. Set the environment variables:

```shell
export ANTHROPIC_AUTH_TOKEN=ollama
export ANTHROPIC_API_KEY=""
export ANTHROPIC_BASE_URL=http://localhost:11434
```
@@ -44,35 +59,17 @@ claude --model gpt-oss:20b
Or run with environment variables inline:

```shell
ANTHROPIC_AUTH_TOKEN=ollama ANTHROPIC_BASE_URL=http://localhost:11434 claude --model gpt-oss:20b
ANTHROPIC_AUTH_TOKEN=ollama ANTHROPIC_BASE_URL=http://localhost:11434 ANTHROPIC_API_KEY="" claude --model qwen3-coder
```

**Note:** Claude Code requires a large context window. We recommend at least 32K tokens. See the [context length documentation](/context-length) for how to adjust context length in Ollama.

## Connecting to ollama.com

1. Create an [API key](https://ollama.com/settings/keys) on ollama.com
2. Set the environment variables:

```shell
export ANTHROPIC_BASE_URL=https://ollama.com
export ANTHROPIC_API_KEY=<your-api-key>
```

3. Run Claude Code with a cloud model:

```shell
claude --model glm-4.7:cloud
```
**Note:** Claude Code requires a large context window. We recommend at least 64k tokens. See the [context length documentation](/context-length) for how to adjust context length in Ollama.

## Recommended Models

### Cloud models
- `glm-4.7:cloud` - High-performance cloud model
- `minimax-m2.1:cloud` - Fast cloud model
- `qwen3-coder:480b` - Large coding model
- `qwen3-coder`
- `glm-4.7`
- `gpt-oss:20b`
- `gpt-oss:120b`

Cloud models are also available at [ollama.com/search?c=cloud](https://ollama.com/search?c=cloud).

### Local models
- `qwen3-coder` - Excellent for coding tasks
- `gpt-oss:20b` - Strong general-purpose model
- `gpt-oss:120b` - Larger general-purpose model for more complex tasks
@@ -13,7 +13,21 @@ npm install -g @openai/codex

## Usage with Ollama

<Note>Codex requires a larger context window. It is recommended to use a context window of at least 32K tokens.</Note>
<Note>Codex requires a larger context window. It is recommended to use a context window of at least 64k tokens.</Note>

### Quick setup

```
ollama launch codex
```

To configure without launching:

```shell
ollama launch codex --config
```

### Manual setup

To use `codex` with Ollama, use the `--oss` flag:
@@ -11,10 +11,24 @@ Install the [Droid CLI](https://factory.ai/):
curl -fsSL https://app.factory.ai/cli | sh
```

<Note>Droid requires a larger context window. It is recommended to use a context window of at least 32K tokens. See [Context length](/context-length) for more information.</Note>
<Note>Droid requires a larger context window. It is recommended to use a context window of at least 64k tokens. See [Context length](/context-length) for more information.</Note>

## Usage with Ollama

### Quick setup

```bash
ollama launch droid
```

To configure without launching:

```shell
ollama launch droid --config
```

### Manual setup

Add a local configuration block to `~/.factory/config.json`:

```json
@@ -73,4 +87,4 @@ Add the cloud configuration block to `~/.factory/config.json`:
}
```

Run `droid` in a new terminal to load the new settings.
docs/integrations/openclaw.mdx (new file, 50 lines)
@@ -0,0 +1,50 @@
---
title: OpenClaw
---

OpenClaw is a personal AI assistant that runs on your own devices. It bridges messaging services (WhatsApp, Telegram, Slack, Discord, iMessage, and more) to AI coding agents through a centralized gateway.

## Install

Install [OpenClaw](https://openclaw.ai/):

```bash
npm install -g openclaw@latest
```

Then run the onboarding wizard:

```bash
openclaw onboard --install-daemon
```

<Note>OpenClaw requires a larger context window. It is recommended to use a context window of at least 64k tokens. See [Context length](/context-length) for more information.</Note>

## Usage with Ollama

### Quick setup

```bash
ollama launch openclaw
```

<Note>Previously known as Clawdbot. `ollama launch clawdbot` still works as an alias.</Note>

This configures OpenClaw to use Ollama and starts the gateway. If the gateway is already running, no changes are needed; it will auto-reload the updated configuration.

To configure without launching:

```shell
ollama launch openclaw --config
```

## Recommended Models

- `qwen3-coder`
- `glm-4.7`
- `gpt-oss:20b`
- `gpt-oss:120b`

Cloud models are also available at [ollama.com/search?c=cloud](https://ollama.com/search?c=cloud).
docs/integrations/opencode.mdx (new file, 106 lines)
@@ -0,0 +1,106 @@
---
title: OpenCode
---

OpenCode is an open-source AI coding assistant that runs in your terminal.

## Install

Install the [OpenCode CLI](https://opencode.ai):

```bash
curl -fsSL https://opencode.ai/install | bash
```

<Note>OpenCode requires a larger context window. It is recommended to use a context window of at least 64k tokens. See [Context length](/context-length) for more information.</Note>

## Usage with Ollama

### Quick setup

```bash
ollama launch opencode
```

To configure without launching:

```shell
ollama launch opencode --config
```

### Manual setup

Add a configuration block to `~/.config/opencode/opencode.json`:

```json
{
  "$schema": "https://opencode.ai/config.json",
  "provider": {
    "ollama": {
      "npm": "@ai-sdk/openai-compatible",
      "name": "Ollama",
      "options": {
        "baseURL": "http://localhost:11434/v1"
      },
      "models": {
        "qwen3-coder": {
          "name": "qwen3-coder"
        }
      }
    }
  }
}
```

## Cloud Models

`glm-4.7:cloud` is the recommended model for use with OpenCode.

Add the cloud configuration to `~/.config/opencode/opencode.json`:

```json
{
  "$schema": "https://opencode.ai/config.json",
  "provider": {
    "ollama": {
      "npm": "@ai-sdk/openai-compatible",
      "name": "Ollama",
      "options": {
        "baseURL": "http://localhost:11434/v1"
      },
      "models": {
        "glm-4.7:cloud": {
          "name": "glm-4.7:cloud"
        }
      }
    }
  }
}
```

## Connecting to ollama.com

1. Create an [API key](https://ollama.com/settings/keys) from ollama.com and export it as `OLLAMA_API_KEY`.
2. Update `~/.config/opencode/opencode.json` to point to ollama.com:

```json
{
  "$schema": "https://opencode.ai/config.json",
  "provider": {
    "ollama": {
      "npm": "@ai-sdk/openai-compatible",
      "name": "Ollama Cloud",
      "options": {
        "baseURL": "https://ollama.com/v1"
      },
      "models": {
        "glm-4.7:cloud": {
          "name": "glm-4.7:cloud"
        }
      }
    }
  }
}
```

Run `opencode` in a new terminal to load the new settings.
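The same ollama.com endpoint can be exercised directly from code. A minimal Go sketch; the request shape follows the OpenAI-compatible `/v1/chat/completions` route that the `@ai-sdk/openai-compatible` provider above targets, and `OLLAMA_API_KEY` must already be exported as described in step 1:

```go
package main

import (
	"bytes"
	"fmt"
	"io"
	"net/http"
	"os"
)

func main() {
	body := []byte(`{
		"model": "glm-4.7:cloud",
		"messages": [{"role": "user", "content": "Write a haiku about terminals."}]
	}`)

	req, err := http.NewRequest("POST", "https://ollama.com/v1/chat/completions", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	req.Header.Set("Content-Type", "application/json")
	// The same key the OpenCode config relies on via OLLAMA_API_KEY.
	req.Header.Set("Authorization", "Bearer "+os.Getenv("OLLAMA_API_KEY"))

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	out, _ := io.ReadAll(resp.Body)
	fmt.Println(string(out))
}
```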
@@ -18,13 +18,13 @@ This quickstart will walk you through running your first model with Ollama. To
  <Tab title="CLI">
    Open a terminal and run the command:

    ```
    ```sh
    ollama run gemma3
    ```

  </Tab>
  <Tab title="cURL">
    ```
    ```sh
    ollama pull gemma3
    ```

@@ -45,13 +45,13 @@ This quickstart will walk you through running your first model with Ollama. To
  <Tab title="Python">
    Start by downloading a model:

    ```
    ```sh
    ollama pull gemma3
    ```

    Then install Ollama's Python library:

    ```
    ```sh
    pip install ollama
    ```
@@ -101,3 +101,42 @@ This quickstart will walk you through running your first model with Ollama. To
</Tabs>

See a full list of available models [here](https://ollama.com/models).

## Coding

For coding use cases, we recommend using the `glm-4.7-flash` model.

Note: this model requires 23 GB of VRAM at a 64000-token context length.

```sh
ollama pull glm-4.7-flash
```

Alternatively, you can use a more powerful cloud model (with full context length):

```sh
ollama pull glm-4.7:cloud
```

Use `ollama launch` to quickly set up a coding tool with Ollama models:

```sh
ollama launch
```

### Supported integrations

- [OpenCode](/integrations/opencode) - Open-source coding assistant
- [Claude Code](/integrations/claude-code) - Anthropic's agentic coding tool
- [Codex](/integrations/codex) - OpenAI's coding assistant
- [Droid](/integrations/droid) - Factory's AI coding agent

### Launch with a specific model

```sh
ollama launch claude --model glm-4.7-flash
```

### Configure without launching

```sh
ollama launch claude --config
```
@@ -201,7 +201,7 @@ var (
	// Enable the new Ollama engine
	NewEngine = Bool("OLLAMA_NEW_ENGINE")
	// ContextLength sets the default context length
	ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 4096)
	ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 0)
	// Auth enables authentication between the Ollama client and server
	UseAuth = Bool("OLLAMA_AUTH")
	// Enable Vulkan backend
@@ -290,7 +290,7 @@ func AsMap() map[string]EnvVar {
	"OLLAMA_ORIGINS":         {"OLLAMA_ORIGINS", AllowedOrigins(), "A comma separated list of allowed origins"},
	"OLLAMA_SCHED_SPREAD":    {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
	"OLLAMA_MULTIUSER_CACHE": {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"},
	"OLLAMA_CONTEXT_LENGTH":  {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default: 4096)"},
	"OLLAMA_CONTEXT_LENGTH":  {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default: 4k/32k/256k based on VRAM)"},
	"OLLAMA_NEW_ENGINE":      {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"},
	"OLLAMA_REMOTES":         {"OLLAMA_REMOTES", Remotes(), "Allowed hosts for remote models (default \"ollama.com\")"},
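The new zero default signals "pick a context length dynamically". The selection logic itself is not part of this hunk, but an illustrative sketch of the 4k/32k/256k tiering the help text describes might look like the following; the VRAM thresholds here are placeholders, not the shipped values:

```go
package main

import "fmt"

// defaultContextLength is a hypothetical illustration of the tiered default
// described in the OLLAMA_CONTEXT_LENGTH help text above; the real VRAM
// thresholds used by the server are not shown in this diff.
func defaultContextLength(userSetting uint, freeVRAMBytes uint64) uint {
	if userSetting != 0 {
		return userSetting // an explicit OLLAMA_CONTEXT_LENGTH always wins
	}
	const gib = uint64(1) << 30
	switch {
	case freeVRAMBytes >= 64*gib: // placeholder threshold
		return 262144 // 256k
	case freeVRAMBytes >= 16*gib: // placeholder threshold
		return 32768 // 32k
	default:
		return 4096 // 4k
	}
}

func main() {
	fmt.Println(defaultContextLength(0, 8<<30))    // 4096
	fmt.Println(defaultContextLength(0, 24<<30))   // 32768
	fmt.Println(defaultContextLength(2048, 8<<30)) // 2048: user override
}
```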
@@ -282,7 +282,7 @@ func TestVar(t *testing.T) {

func TestContextLength(t *testing.T) {
	cases := map[string]uint{
		"":     4096,
		"":     0,
		"2048": 2048,
	}
@@ -19,20 +19,20 @@ CUDA changes:
 - Add template instances for ncols2=4
 - Fix nbatch_fa values in nvidia_fp32 config (32->64)
---
 ggml/src/ggml-cuda/fattn-mma-f16.cuh           | 15 ++++++++++++---
 ggml/src/ggml-cuda/fattn-tile.cuh              | 18 +++++++++++++++++-
 ggml/src/ggml-cuda/fattn.cu                    | 12 ++++++++----
 ...attn-mma-f16-instance-ncols1_16-ncols2_4.cu |  1 +
 ...fattn-mma-f16-instance-ncols1_2-ncols2_4.cu |  1 +
 ...fattn-mma-f16-instance-ncols1_4-ncols2_4.cu |  1 +
 ...fattn-mma-f16-instance-ncols1_8-ncols2_4.cu |  1 +
 ggml/src/ggml-metal/ggml-metal-device.m        |  8 ++------
 ggml/src/ggml-metal/ggml-metal-ops.cpp         |  2 +-
 ggml/src/ggml-metal/ggml-metal.metal           |  1 +
 10 files changed, 45 insertions(+), 15 deletions(-)
 ggml/src/ggml-cuda/fattn-mma-f16.cuh          | 40 +++++++++++++++----
 ggml/src/ggml-cuda/fattn-tile.cuh             | 16 ++++++++
 ggml/src/ggml-cuda/fattn.cu                   | 12 ++++--
 ...ttn-mma-f16-instance-ncols1_16-ncols2_4.cu |  1 +
 ...attn-mma-f16-instance-ncols1_2-ncols2_4.cu |  1 +
 ...attn-mma-f16-instance-ncols1_4-ncols2_4.cu |  1 +
 ...attn-mma-f16-instance-ncols1_8-ncols2_4.cu |  1 +
 ggml/src/ggml-metal/ggml-metal-device.m       |  8 +---
 ggml/src/ggml-metal/ggml-metal-ops.cpp        |  2 +-
 ggml/src/ggml-metal/ggml-metal.metal          |  1 +
 10 files changed, 64 insertions(+), 19 deletions(-)
diff --git a/ggml/src/ggml-cuda/fattn-mma-f16.cuh b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
index 7bd1044c1..a627302f9 100644
index 7bd1044c1..3dea2205e 100644
--- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh
+++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
@@ -66,7 +66,8 @@ static constexpr __host__ __device__ fattn_mma_config ggml_cuda_fattn_mma_get_co
@@ -65,45 +65,103 @@ index 7bd1044c1..a627302f9 100644
     GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 16,  64, 4, 32, 288, 256, 64, 1, false);
     GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 32, 128, 2, 32, 160, 128, 64, 1, false);
     GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 64, 256, 1, 32, 160, 128, 64, 1, false);
@@ -1585,3 +1588,9 @@ DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 256, 64)
@@ -397,7 +400,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
     constexpr int ncols           = ncols1 * ncols2;
     constexpr int cols_per_warp   = T_B_KQ::I;
     constexpr int cols_per_thread = 2; // This is specifically KQ columns, Volta only has a single VKQ column.
-    constexpr int np              = nwarps * (cols_per_warp/ncols2) / ncols1; // Number of parallel CUDA warps per Q column.
+    constexpr int np              = cols_per_warp > ncols ? nwarps : nwarps * cols_per_warp/ncols; // Number of parallel CUDA warps per Q column.
     constexpr int nbatch_fa       = ggml_cuda_fattn_mma_get_nbatch_fa(DKQ, DV, ncols);
     constexpr int nbatch_K2       = ggml_cuda_fattn_mma_get_nbatch_K2(DKQ, DV, ncols);
     constexpr int nbatch_V2       = ggml_cuda_fattn_mma_get_nbatch_V2(DKQ, DV, ncols);
@@ -467,7 +470,6 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
             }
         }
     } else {
-        static_assert(cols_per_warp != 8, "cols_per_warp == 8 not implemented");
 #pragma unroll
         for (int k_KQ_0 = k0_start; k_KQ_0 < k0_stop; k_KQ_0 += T_A_KQ::J) {
             load_ldmatrix(Q_B[0], tile_Q + (threadIdx.y / np)*(T_B_KQ::I*stride_tile_Q) + k_KQ_0, stride_tile_Q);
@@ -479,8 +481,18 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
                 T_A_KQ K_A;
                 load_ldmatrix(K_A, tile_K + i_KQ_0*stride_tile_K + (k_KQ_0 - k0_start), stride_tile_K);

-                // Wide version of KQ_C is column-major => swap A and B.
-                mma(KQ_C[i_KQ_00/(np*T_A_KQ::I)], Q_B[0], K_A);
+                if constexpr (cols_per_warp == 8) {
+                    mma(KQ_C[i_KQ_00/(np*T_A_KQ::I)], K_A, Q_B[0]);
+                } else {
+                    // Wide version of KQ_C is column-major
+#if defined(AMD_WMMA_AVAILABLE)
+                    // RDNA matrix C is column-major.
+                    mma(KQ_C[i_KQ_00/(np*T_A_KQ::I)], K_A, Q_B[0]);
+#else
+                    // swap A and B for CUDA.
+                    mma(KQ_C[i_KQ_00/(np*T_A_KQ::I)], Q_B[0], K_A);
+#endif // defined(AMD_WMMA_AVAILABLE)
+                }
             }
         }
     }
@@ -841,7 +853,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(

     constexpr int cols_per_warp   = T_B_KQ::I;
     constexpr int cols_per_thread = 2; // This is specifically KQ columns, Volta only has a single VKQ column.
-    constexpr int np              = nwarps * (cols_per_warp/ncols2) / ncols1; // Number of parallel CUDA warps per Q column.
+    constexpr int np              = cols_per_warp > ncols ? nwarps : nwarps * cols_per_warp/ncols; // Number of parallel CUDA warps per Q column.
     constexpr int nbatch_fa       = ggml_cuda_fattn_mma_get_nbatch_fa (DKQ, DV, ncols);
     constexpr int nbatch_K2       = ggml_cuda_fattn_mma_get_nbatch_K2 (DKQ, DV, ncols);
     constexpr int nbatch_V2       = ggml_cuda_fattn_mma_get_nbatch_V2 (DKQ, DV, ncols);
@@ -1353,6 +1365,13 @@ static __global__ void flash_attn_ext_f16(
         NO_DEVICE_CODE;
         return;
     }
+#ifdef VOLTA_MMA_AVAILABLE
+    if (ncols1*ncols2 < 32) {
+        NO_DEVICE_CODE;
+        return;
+    }
+#endif // VOLTA_MMA_AVAILABLE
+
 #if __CUDA_ARCH__ == GGML_CUDA_CC_TURING
     if (ncols1*ncols2 > 32) {
         NO_DEVICE_CODE;
@@ -1585,3 +1604,8 @@ DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 256, 64)
 extern DECL_FATTN_MMA_F16_CASE(576, 512, 1, 16);
 extern DECL_FATTN_MMA_F16_CASE(576, 512, 2, 16);
 extern DECL_FATTN_MMA_F16_CASE(576, 512, 4, 16);
+
+// GLM 4.7 Flash uses gqa_ratio 4:
+extern DECL_FATTN_MMA_F16_CASE(576, 512, 2, 4);
+extern DECL_FATTN_MMA_F16_CASE(576, 512, 4, 4);
+extern DECL_FATTN_MMA_F16_CASE(576, 512, 8, 4);
+extern DECL_FATTN_MMA_F16_CASE(576, 512, 16, 4);
+// For GLM 4.7 Flash
+extern DECL_FATTN_MMA_F16_CASE(576, 512, 4, 4);
+extern DECL_FATTN_MMA_F16_CASE(576, 512, 8, 4);
+extern DECL_FATTN_MMA_F16_CASE(576, 512, 16, 4);
diff --git a/ggml/src/ggml-cuda/fattn-tile.cuh b/ggml/src/ggml-cuda/fattn-tile.cuh
index 7c4d6fe67..6389ba5c4 100644
index 7c4d6fe67..371be7442 100644
--- a/ggml/src/ggml-cuda/fattn-tile.cuh
+++ b/ggml/src/ggml-cuda/fattn-tile.cuh
@@ -68,6 +68,8 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
     GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2, 64,  64)
     GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2, 64,  64)

+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 256, 2, 64,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 128, 2, 64,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  8, 256, 2, 64,  64)
     GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2, 64,  64)

     return 0;
@@ -122,7 +124,9 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
@@ -122,6 +124,8 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
     GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2, 32, 128)
     GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2, 32,  64)

-    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2, 32,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 256, 2, 64,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  8, 256, 2, 64,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2, 64,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 128, 2, 32,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  8, 256, 2, 32,  64)
     GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2, 32,  64)

     return 0;
 }
@@ -183,6 +187,8 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am
     GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2, 32, 128)
     GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2, 32, 128)

+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 256, 2, 64,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 128, 2, 64,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  8, 256, 2, 64,  64)
     GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2, 64,  64)
     GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 32, 512, 1, 128, 64)
@@ -112,8 +170,8 @@ index 7c4d6fe67..6389ba5c4 100644
     GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 5, 32, 256)
     GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 3, 64, 128)

+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 256, 4, 64,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  8, 256, 4, 64,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 128, 2, 64,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  8, 256, 2, 64,  64)
     GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 4, 64,  64)
     GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 32, 256, 2, 128, 64)
@@ -80,6 +80,7 @@ type LlamaServer interface {
	GetPort() int
	GetDeviceInfos(ctx context.Context) []ml.DeviceInfo
	HasExited() bool
	ContextLength() int
}

// llmServer is an instance of a runner hosting a single model
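The interface gains a `ContextLength` accessor. A hedged sketch of how a caller might use it; the warning helper and fake server here are illustrative and not part of this changeset:

```go
package main

import "fmt"

// contextLengthProvider captures just the new method for illustration; in the
// real code this is the LlamaServer interface shown above.
type contextLengthProvider interface {
	ContextLength() int
}

type fakeServer struct{ numCtx int }

func (f fakeServer) ContextLength() int { return f.numCtx }

// warnIfSmallContext is an illustrative consumer of the new accessor.
func warnIfSmallContext(s contextLengthProvider, required int) {
	if n := s.ContextLength(); n < required {
		fmt.Printf("warning: allocated context %d is below recommended %d\n", n, required)
	}
}

func main() {
	warnIfSmallContext(fakeServer{numCtx: 4096}, 64000)
}
```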
@@ -1200,7 +1201,8 @@ func (s *llmServer) initModel(ctx context.Context, req LoadRequest, operation Lo

	resp, err := http.DefaultClient.Do(r)
	if err != nil {
		return nil, fmt.Errorf("do load request: %w", err)
		slog.Error("do load request", "error", err)
		return nil, errors.New("model failed to load, this may be due to resource limitations or an internal error, check ollama server logs for details")
	}
	defer resp.Body.Close()
@@ -1901,6 +1903,10 @@ func (s *llmServer) VRAMByGPU(id ml.DeviceID) uint64 {
	return 0
}

func (s *llmServer) ContextLength() int {
	return s.options.NumCtx
}

func (s *ollamaServer) GetDeviceInfos(ctx context.Context) []ml.DeviceInfo {
	devices, err := ml.GetDevicesFromRunner(ctx, s)
	if err != nil {
@@ -400,7 +400,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
    constexpr int ncols           = ncols1 * ncols2;
    constexpr int cols_per_warp   = T_B_KQ::I;
    constexpr int cols_per_thread = 2; // This is specifically KQ columns, Volta only has a single VKQ column.
    constexpr int np              = nwarps * (cols_per_warp/ncols2) / ncols1; // Number of parallel CUDA warps per Q column.
    constexpr int np              = cols_per_warp > ncols ? nwarps : nwarps * cols_per_warp/ncols; // Number of parallel CUDA warps per Q column.
    constexpr int nbatch_fa       = ggml_cuda_fattn_mma_get_nbatch_fa(DKQ, DV, ncols);
    constexpr int nbatch_K2       = ggml_cuda_fattn_mma_get_nbatch_K2(DKQ, DV, ncols);
    constexpr int nbatch_V2       = ggml_cuda_fattn_mma_get_nbatch_V2(DKQ, DV, ncols);
@@ -470,7 +470,6 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
            }
        }
    } else {
        static_assert(cols_per_warp != 8, "cols_per_warp == 8 not implemented");
#pragma unroll
        for (int k_KQ_0 = k0_start; k_KQ_0 < k0_stop; k_KQ_0 += T_A_KQ::J) {
            load_ldmatrix(Q_B[0], tile_Q + (threadIdx.y / np)*(T_B_KQ::I*stride_tile_Q) + k_KQ_0, stride_tile_Q);
@@ -482,8 +481,18 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
                T_A_KQ K_A;
                load_ldmatrix(K_A, tile_K + i_KQ_0*stride_tile_K + (k_KQ_0 - k0_start), stride_tile_K);

                // Wide version of KQ_C is column-major => swap A and B.
                mma(KQ_C[i_KQ_00/(np*T_A_KQ::I)], Q_B[0], K_A);
                if constexpr (cols_per_warp == 8) {
                    mma(KQ_C[i_KQ_00/(np*T_A_KQ::I)], K_A, Q_B[0]);
                } else {
                    // Wide version of KQ_C is column-major
#if defined(AMD_WMMA_AVAILABLE)
                    // RDNA matrix C is column-major.
                    mma(KQ_C[i_KQ_00/(np*T_A_KQ::I)], K_A, Q_B[0]);
#else
                    // swap A and B for CUDA.
                    mma(KQ_C[i_KQ_00/(np*T_A_KQ::I)], Q_B[0], K_A);
#endif // defined(AMD_WMMA_AVAILABLE)
                }
            }
        }
    }
@@ -844,7 +853,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(

    constexpr int cols_per_warp   = T_B_KQ::I;
    constexpr int cols_per_thread = 2; // This is specifically KQ columns, Volta only has a single VKQ column.
    constexpr int np              = nwarps * (cols_per_warp/ncols2) / ncols1; // Number of parallel CUDA warps per Q column.
    constexpr int np              = cols_per_warp > ncols ? nwarps : nwarps * cols_per_warp/ncols; // Number of parallel CUDA warps per Q column.
    constexpr int nbatch_fa       = ggml_cuda_fattn_mma_get_nbatch_fa (DKQ, DV, ncols);
    constexpr int nbatch_K2       = ggml_cuda_fattn_mma_get_nbatch_K2 (DKQ, DV, ncols);
    constexpr int nbatch_V2       = ggml_cuda_fattn_mma_get_nbatch_V2 (DKQ, DV, ncols);
@@ -1356,6 +1365,13 @@ static __global__ void flash_attn_ext_f16(
        NO_DEVICE_CODE;
        return;
    }
#ifdef VOLTA_MMA_AVAILABLE
    if (ncols1*ncols2 < 32) {
        NO_DEVICE_CODE;
        return;
    }
#endif // VOLTA_MMA_AVAILABLE

#if __CUDA_ARCH__ == GGML_CUDA_CC_TURING
    if (ncols1*ncols2 > 32) {
        NO_DEVICE_CODE;
@@ -1589,8 +1605,7 @@ extern DECL_FATTN_MMA_F16_CASE(576, 512, 1, 16);
extern DECL_FATTN_MMA_F16_CASE(576, 512, 2, 16);
extern DECL_FATTN_MMA_F16_CASE(576, 512, 4, 16);

// GLM 4.7 Flash uses gqa_ratio 4:
extern DECL_FATTN_MMA_F16_CASE(576, 512, 2, 4);
extern DECL_FATTN_MMA_F16_CASE(576, 512, 4, 4);
extern DECL_FATTN_MMA_F16_CASE(576, 512, 8, 4);
extern DECL_FATTN_MMA_F16_CASE(576, 512, 16, 4);
// For GLM 4.7 Flash
extern DECL_FATTN_MMA_F16_CASE(576, 512, 4, 4);
extern DECL_FATTN_MMA_F16_CASE(576, 512, 8, 4);
extern DECL_FATTN_MMA_F16_CASE(576, 512, 16, 4);
@@ -68,7 +68,7 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2, 64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2, 64,  64)

    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 256, 2, 64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 128, 2, 64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  8, 256, 2, 64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2, 64,  64)

@@ -124,9 +124,9 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2, 32, 128)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2, 32,  64)

    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 256, 2, 64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  8, 256, 2, 64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2, 64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 128, 2, 32,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  8, 256, 2, 32,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2, 32,  64)

    return 0;
}
@@ -187,7 +187,7 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2, 32, 128)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2, 32, 128)

    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 256, 2, 64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 128, 2, 64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  8, 256, 2, 64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2, 64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 32, 512, 1, 128, 64)
@@ -251,8 +251,8 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 5, 32, 256)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 3, 64, 128)

    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 256, 4, 64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  8, 256, 4, 64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 128, 2, 64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  8, 256, 2, 64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 4, 64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 32, 256, 2, 128, 64)
@@ -223,12 +223,7 @@ func New(c fs.Config) (model.Model, error) {

	keyLength := int(c.Uint("attention.key_length"))
	valueLength := int(c.Uint("attention.value_length"))
	kvLoraRank := int(c.Uint("attention.kv_lora_rank"))
	qkRopeHeadDim := int(c.Uint("rope.dimension_count"))

	// For MLA absorption, the effective key dimension is kvLoraRank + qkRopeHeadDim
	mlaKeyLength := kvLoraRank + qkRopeHeadDim
	kqScale := 1.0 / math.Sqrt(float64(mlaKeyLength))
	kqScale := 1.0 / math.Sqrt(float64(keyLength))

	var pre []string
	switch c.String("tokenizer.ggml.pre") {
@@ -246,7 +241,7 @@ func New(c fs.Config) (model.Model, error) {
		Values: c.Strings("tokenizer.ggml.tokens"),
		Types:  c.Ints("tokenizer.ggml.token_type"),
		Merges: c.Strings("tokenizer.ggml.merges"),
		AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
		AddBOS: c.Bool("tokenizer.ggml.add_bos_token", false),
		BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
		AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
		EOS: append(
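Aside: the MLA arithmetic in this hunk is easy to sanity-check, and it also explains the DKQ=576/DV=512 tile configurations in the CUDA hunks above. A minimal sketch, assuming DeepSeek-style metadata values (in the diff, the real numbers are read from the GGUF keys "attention.kv_lora_rank" and "rope.dimension_count"):

package main

import (
	"fmt"
	"math"
)

func main() {
	// Assumed values in the style of DeepSeek MLA metadata, not read
	// from a real GGUF file.
	kvLoraRank := 512
	qkRopeHeadDim := 64

	// Effective key dimension under MLA absorption, per the comment above.
	mlaKeyLength := kvLoraRank + qkRopeHeadDim        // 576
	kqScale := 1.0 / math.Sqrt(float64(mlaKeyLength)) // 1/24 ≈ 0.0417

	fmt.Println(mlaKeyLength, kqScale)
}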
@@ -4,6 +4,7 @@ import (
	"encoding/json"
	"fmt"
	"strings"
	"unicode"

	"github.com/ollama/ollama/api"
)
@@ -17,12 +18,34 @@ const (
	ministralCollectingToolArgs
)

// ministralEvent represents an event emitted during parsing
type ministralEvent interface {
	isMinistralEvent()
}

type ministralEventContent struct {
	content string
}

type ministralEventThinking struct {
	thinking string
}

type ministralEventToolCall struct {
	name string
	args string // raw JSON string
}

func (ministralEventContent) isMinistralEvent()  {}
func (ministralEventThinking) isMinistralEvent() {}
func (ministralEventToolCall) isMinistralEvent() {}

type MinistralParser struct {
	state              ministralParserState
	buffer             strings.Builder
	tools              []api.Tool
	hasThinkingSupport bool
	currentTool        *api.Tool
	pendingToolName    string // stores tool name while collecting args
}

func (p *MinistralParser) HasToolSupport() bool {
@@ -63,74 +86,251 @@ func toolByName(tools []api.Tool, n string) (*api.Tool, error) {
	return nil, fmt.Errorf("tool '%s' not found", n)
}

func (p *MinistralParser) Add(s string, done bool) (content string, thinking string, calls []api.ToolCall, err error) {
	p.buffer.WriteString(s)
const (
	ministralToolCallsTag = "[TOOL_CALLS]"
	ministralThinkTag     = "[THINK]"
	ministralThinkEndTag  = "[/THINK]"
	ministralArgsTag      = "[ARGS]"
)

// eat consumes the parser's buffer, and returns a list of any unambiguous
// events from the current parser state. The second return value indicates
// whether to keep looping (true when state transitions, false when waiting
// for more data).
func (p *MinistralParser) eat() ([]ministralEvent, bool) {
	var events []ministralEvent

	switch p.state {
	case ministralCollectingContent:
		if strings.Contains(p.buffer.String(), "[TOOL_CALLS]") {
			before, _ := splitAtTag(&p.buffer, "[TOOL_CALLS]", false)
			if before != "" {
				return before, "", calls, nil
		bufStr := p.buffer.String()

		// Check for [TOOL_CALLS] tag
		if strings.Contains(bufStr, ministralToolCallsTag) {
			split := strings.SplitN(bufStr, ministralToolCallsTag, 2)
			before := strings.TrimRightFunc(split[0], unicode.IsSpace)
			if len(before) > 0 {
				events = append(events, ministralEventContent{content: before})
			}
			after := split[1]
			p.buffer.Reset()
			p.buffer.WriteString(after)
			p.state = ministralCollectingToolName
		} else if strings.Contains(p.buffer.String(), "[THINK]") {
			return events, true
		}

		// Check for [THINK] tag
		if strings.Contains(bufStr, ministralThinkTag) {
			split := strings.SplitN(bufStr, ministralThinkTag, 2)
			before := strings.TrimRightFunc(split[0], unicode.IsSpace)
			if len(before) > 0 {
				events = append(events, ministralEventContent{content: before})
			}
			after := split[1]
			p.buffer.Reset()
			p.buffer.WriteString(after)
			p.state = ministralCollectingThinkingContent
			return "", "", calls, nil
		} else {
			p.buffer.Reset()
			return s, "", calls, nil
			return events, true
		}

		// Check for partial tag overlap with [TOOL_CALLS] or [THINK]
		overlapToolCalls := overlap(bufStr, ministralToolCallsTag)
		overlapThink := overlap(bufStr, ministralThinkTag)
		maxOverlap := max(overlapToolCalls, overlapThink)

		if maxOverlap > 0 {
			// Withhold the potential partial tag
			beforePartialTag := bufStr[:len(bufStr)-maxOverlap]
			trailingWS := trailingWhitespaceLen(beforePartialTag)
			ambiguousStart := len(beforePartialTag) - trailingWS
			unambiguous := bufStr[:ambiguousStart]
			ambiguous := bufStr[ambiguousStart:]
			p.buffer.Reset()
			p.buffer.WriteString(ambiguous)
			if len(unambiguous) > 0 {
				events = append(events, ministralEventContent{content: unambiguous})
			}
			return events, false
		}

		// No tag found: emit content but withhold trailing whitespace
		whitespaceLen := trailingWhitespaceLen(bufStr)
		ambiguousStart := len(bufStr) - whitespaceLen
		unambiguous := bufStr[:ambiguousStart]
		ambiguous := bufStr[ambiguousStart:]
		p.buffer.Reset()
		p.buffer.WriteString(ambiguous)
		if len(unambiguous) > 0 {
			events = append(events, ministralEventContent{content: unambiguous})
		}
		return events, false

	case ministralCollectingThinkingContent:
		if strings.Contains(p.buffer.String(), "[/THINK]") {
			thinkingContent, after := splitAtTag(&p.buffer, "[/THINK]", true)
			p.state = ministralCollectingContent
			if after != "" {
				p.buffer.Reset()
				return after, thinkingContent, calls, nil
			}
			return "", thinkingContent, calls, nil
		} else {
		bufStr := p.buffer.String()

		if strings.Contains(bufStr, ministralThinkEndTag) {
			split := strings.SplitN(bufStr, ministralThinkEndTag, 2)
			thinkingContent := split[0]
			after := strings.TrimLeftFunc(split[1], unicode.IsSpace)
			p.buffer.Reset()
			return "", s, calls, nil
		}
	case ministralCollectingToolName:
		if strings.Contains(p.buffer.String(), "[ARGS]") {
			name, _ := splitAtTag(&p.buffer, "[ARGS]", false)

			t, err := toolByName(p.tools, name)
			if err != nil {
				return "", "", calls, err
			p.buffer.WriteString(after)
			if len(thinkingContent) > 0 {
				events = append(events, ministralEventThinking{thinking: thinkingContent})
			}
			p.currentTool = t
			p.state = ministralCollectingToolArgs
			return "", "", calls, nil
		}
		return "", "", calls, nil
	case ministralCollectingToolArgs:
		if strings.Contains(p.buffer.String(), "}") {
			before, _ := splitAtTag(&p.buffer, "}", false)
			before += "}"

			var args api.ToolCallFunctionArguments
			if err := json.Unmarshal([]byte(before), &args); err != nil {
				// todo - throw a better error
				return "", "", calls, err
			}

			p.state = ministralCollectingContent
			return events, true
		}

			call := api.ToolCall{
		// Check for partial overlap with [/THINK]
		if overlapLen := overlap(bufStr, ministralThinkEndTag); overlapLen > 0 {
			unambiguous := bufStr[:len(bufStr)-overlapLen]
			ambiguous := bufStr[len(bufStr)-overlapLen:]
			p.buffer.Reset()
			p.buffer.WriteString(ambiguous)
			if len(unambiguous) > 0 {
				events = append(events, ministralEventThinking{thinking: unambiguous})
			}
			return events, false
		}

		// No tag found: emit all thinking content
		p.buffer.Reset()
		if len(bufStr) > 0 {
			events = append(events, ministralEventThinking{thinking: bufStr})
		}
		return events, false

	case ministralCollectingToolName:
		bufStr := p.buffer.String()

		if strings.Contains(bufStr, ministralArgsTag) {
			split := strings.SplitN(bufStr, ministralArgsTag, 2)
			toolName := split[0]
			after := split[1]
			p.pendingToolName = toolName
			p.buffer.Reset()
			p.buffer.WriteString(after)
			p.state = ministralCollectingToolArgs
			return events, true
		}
		// Wait for more data
		return events, false

	case ministralCollectingToolArgs:
		bufStr := p.buffer.String()
		jsonEnd := findJSONEnd(bufStr)

		if jsonEnd != -1 {
			jsonStr := bufStr[:jsonEnd+1]
			remaining := bufStr[jsonEnd+1:]

			events = append(events, ministralEventToolCall{
				name: p.pendingToolName,
				args: jsonStr,
			})

			p.pendingToolName = ""
			p.buffer.Reset()
			p.buffer.WriteString(remaining)
			p.state = ministralCollectingContent
			return events, true
		}
		// Wait for more data
		return events, false

	default:
		panic("unexpected ministral event")
	}
}

// parseEvents loops calling eat() until it returns false
func (p *MinistralParser) parseEvents() []ministralEvent {
	var all []ministralEvent
	keepLooping := true
	for keepLooping {
		var events []ministralEvent
		events, keepLooping = p.eat()
		all = append(all, events...)
	}
	return all
}

func (p *MinistralParser) Add(s string, done bool) (content string, thinking string, calls []api.ToolCall, err error) {
	p.buffer.WriteString(s)

	events := p.parseEvents()

	var contentBuilder, thinkingBuilder strings.Builder
	var toolCalls []api.ToolCall

	for _, event := range events {
		switch e := event.(type) {
		case ministralEventContent:
			contentBuilder.WriteString(e.content)
		case ministralEventThinking:
			thinkingBuilder.WriteString(e.thinking)
		case ministralEventToolCall:
			// Validate tool exists
			tool, toolErr := toolByName(p.tools, e.name)
			if toolErr != nil {
				return contentBuilder.String(), thinkingBuilder.String(), toolCalls, toolErr
			}
			// Parse JSON arguments
			var args api.ToolCallFunctionArguments
			if jsonErr := json.Unmarshal([]byte(e.args), &args); jsonErr != nil {
				return contentBuilder.String(), thinkingBuilder.String(), toolCalls, jsonErr
			}
			toolCalls = append(toolCalls, api.ToolCall{
				Function: api.ToolCallFunction{
					Name: p.currentTool.Function.Name,
					Name: tool.Function.Name,
					Arguments: args,
				},
			}
			calls = append(calls, call)
			return "", "", calls, nil
			})
		}
		return "", "", calls, nil
	}

	return p.buffer.String(), thinking, calls, nil
	return contentBuilder.String(), thinkingBuilder.String(), toolCalls, nil
}

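How a caller drives this parser, as a minimal sketch: the tests below construct MinistralParser the same way (Init's trailing arguments are passed nil there too), and the import path is an assumption based on the file location. Since the state fields are unexported, in practice this loop runs inside the parsers package or through the server's parser plumbing.

package main

import (
	"fmt"
	"log"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/model/parsers" // assumed import path
)

func main() {
	p := &parsers.MinistralParser{}
	p.Init([]api.Tool{{Function: api.ToolFunction{Name: "get_weather"}}}, nil, nil)

	// Feed the model output chunk by chunk, as a streaming server would.
	for _, chunk := range []string{"[TOOL_", "CALLS]get_weather[ARGS]", `{"location": "NYC"}`} {
		content, thinking, calls, err := p.Add(chunk, false)
		if err != nil {
			log.Fatal(err)
		}
		fmt.Printf("content=%q thinking=%q calls=%d\n", content, thinking, len(calls))
	}
}

Only the final chunk yields a tool call: "[TOOL_" is withheld as a possible partial tag, and the arguments buffer until findJSONEnd (next) sees a complete object.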
// findJSONEnd finds the index of the closing brace that completes a JSON object.
// It properly handles nested objects, arrays, and strings (including escaped characters).
// Returns -1 if the JSON is not yet complete.
func findJSONEnd(s string) int {
	depth := 0
	inString := false
	escaped := false

	for i, r := range s {
		if inString {
			switch {
			case escaped:
				// If the previous character was a backslash, skip this character
				escaped = false
			case r == '\\':
				// Mark the next character as escaped
				escaped = true
			case r == '"':
				// End of string literal
				inString = false
			}
			continue
		}

		switch r {
		case '"':
			// Start of string literal
			inString = true
		case '{', '[':
			// Increase nesting level for objects and arrays
			depth++
		case '}', ']':
			// Decrease nesting level
			depth--
			if depth == 0 {
				// Reached the end of the root JSON structure
				return i
			}
		}
	}

	return -1
}

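For intuition, here is how the incremental check behaves on a few inputs; the expected values are taken directly from the TestFindJSONEnd table below (the example function itself is hypothetical):

package parsers

import "fmt"

func ExampleFindJSONEnd() {
	fmt.Println(findJSONEnd(`{"a": {"b": 1}`))                // root object not closed yet: keep buffering
	fmt.Println(findJSONEnd(`{"a": 1} extra`))                // index of the brace closing the root object
	fmt.Println(findJSONEnd(`{"template": "Hello {name}!"}`)) // braces inside strings are ignored
	// Output:
	// -1
	// 7
	// 28
}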
545 model/parsers/ministral_test.go (new file)
@@ -0,0 +1,545 @@
package parsers

import (
	"reflect"
	"testing"

	"github.com/ollama/ollama/api"
)

func TestMinistralParserStreaming(t *testing.T) {
	type step struct {
		input      string
		wantEvents []ministralEvent
	}

	cases := []struct {
		desc  string
		tools []api.Tool
		steps []step
		think bool // whether to enable thinking support
	}{
		// Content streaming
		{
			desc: "simple content",
			steps: []step{
				{input: "Hello, how can I help you?", wantEvents: []ministralEvent{
					ministralEventContent{content: "Hello, how can I help you?"},
				}},
			},
		},
		{
			desc: "streaming content word by word",
			steps: []step{
				{input: "Hello,", wantEvents: []ministralEvent{ministralEventContent{content: "Hello,"}}},
				{input: " how", wantEvents: []ministralEvent{ministralEventContent{content: " how"}}},
				{input: " can I help?", wantEvents: []ministralEvent{ministralEventContent{content: " can I help?"}}},
			},
		},

		// Simple tool calls
		{
			desc:  "simple tool call",
			tools: []api.Tool{{Function: api.ToolFunction{Name: "get_weather"}}},
			steps: []step{
				{input: `[TOOL_CALLS]get_weather[ARGS]{"location": "San Francisco"}`, wantEvents: []ministralEvent{
					ministralEventToolCall{name: "get_weather", args: `{"location": "San Francisco"}`},
				}},
			},
		},
		{
			desc:  "tool call with nested object",
			tools: []api.Tool{{Function: api.ToolFunction{Name: "create_entities"}}},
			steps: []step{
				{input: `[TOOL_CALLS]create_entities[ARGS]{"entities": [{"entityType": "Person", "name": "Jack", "observations": ["Works as a baker"]}]}`, wantEvents: []ministralEvent{
					ministralEventToolCall{name: "create_entities", args: `{"entities": [{"entityType": "Person", "name": "Jack", "observations": ["Works as a baker"]}]}`},
				}},
			},
		},
		{
			desc:  "tool call with deeply nested objects",
			tools: []api.Tool{{Function: api.ToolFunction{Name: "update_config"}}},
			steps: []step{
				{input: `[TOOL_CALLS]update_config[ARGS]{"settings": {"user": {"profile": {"name": "John", "age": 30}}, "theme": "dark"}}`, wantEvents: []ministralEvent{
					ministralEventToolCall{name: "update_config", args: `{"settings": {"user": {"profile": {"name": "John", "age": 30}}, "theme": "dark"}}`},
				}},
			},
		},
		{
			desc:  "tool call with array of objects",
			tools: []api.Tool{{Function: api.ToolFunction{Name: "process_items"}}},
			steps: []step{
				{input: `[TOOL_CALLS]process_items[ARGS]{"items": [{"id": 1}, {"id": 2}, {"id": 3}]}`, wantEvents: []ministralEvent{
					ministralEventToolCall{name: "process_items", args: `{"items": [{"id": 1}, {"id": 2}, {"id": 3}]}`},
				}},
			},
		},
		{
			desc:  "tool call with escaped quotes in string",
			tools: []api.Tool{{Function: api.ToolFunction{Name: "search"}}},
			steps: []step{
				{input: `[TOOL_CALLS]search[ARGS]{"query": "say \"hello\""}`, wantEvents: []ministralEvent{
					ministralEventToolCall{name: "search", args: `{"query": "say \"hello\""}`},
				}},
			},
		},
		{
			desc:  "tool call with braces inside string",
			tools: []api.Tool{{Function: api.ToolFunction{Name: "format"}}},
			steps: []step{
				{input: `[TOOL_CALLS]format[ARGS]{"template": "Hello {name}!"}`, wantEvents: []ministralEvent{
					ministralEventToolCall{name: "format", args: `{"template": "Hello {name}!"}`},
				}},
			},
		},
		{
			desc:  "empty JSON object",
			tools: []api.Tool{{Function: api.ToolFunction{Name: "no_args"}}},
			steps: []step{
				{input: `[TOOL_CALLS]no_args[ARGS]{}`, wantEvents: []ministralEvent{
					ministralEventToolCall{name: "no_args", args: `{}`},
				}},
			},
		},
		{
			desc:  "JSON with newlines in string",
			tools: []api.Tool{{Function: api.ToolFunction{Name: "write"}}},
			steps: []step{
				{input: `[TOOL_CALLS]write[ARGS]{"content": "line1\nline2\nline3"}`, wantEvents: []ministralEvent{
					ministralEventToolCall{name: "write", args: `{"content": "line1\nline2\nline3"}`},
				}},
			},
		},
		{
			desc:  "backslash in string value",
			tools: []api.Tool{{Function: api.ToolFunction{Name: "path"}}},
			steps: []step{
				{input: `[TOOL_CALLS]path[ARGS]{"dir": "C:\\Users\\test"}`, wantEvents: []ministralEvent{
					ministralEventToolCall{name: "path", args: `{"dir": "C:\\Users\\test"}`},
				}},
			},
		},

		// Content after tool call
		{
			desc:  "content after tool call",
			tools: []api.Tool{{Function: api.ToolFunction{Name: "test"}}},
			steps: []step{
				// NOTE: It's unclear if this is valid Ministral output, but the parser
				// currently treats text after a tool call as regular content. This test
				// documents that behavior so we notice if it changes.
				{input: `[TOOL_CALLS]test[ARGS]{"a": 1}some content after`, wantEvents: []ministralEvent{
					ministralEventToolCall{name: "test", args: `{"a": 1}`},
					ministralEventContent{content: "some content after"},
				}},
			},
		},

		// Multiple tool calls
		{
			desc: "multiple tool calls in sequence",
			tools: []api.Tool{
				{Function: api.ToolFunction{Name: "get_weather"}},
				{Function: api.ToolFunction{Name: "get_time"}},
			},
			steps: []step{
				{input: `[TOOL_CALLS]get_weather[ARGS]{"location": "NYC"}[TOOL_CALLS]get_time[ARGS]{"timezone": "EST"}`, wantEvents: []ministralEvent{
					ministralEventToolCall{name: "get_weather", args: `{"location": "NYC"}`},
					ministralEventToolCall{name: "get_time", args: `{"timezone": "EST"}`},
				}},
			},
		},
		{
			desc: "multiple tool calls streamed separately",
			tools: []api.Tool{
				{Function: api.ToolFunction{Name: "tool_a"}},
				{Function: api.ToolFunction{Name: "tool_b"}},
			},
			steps: []step{
				{input: `[TOOL_CALLS]tool_a[ARGS]{"x": 1}`, wantEvents: []ministralEvent{
					ministralEventToolCall{name: "tool_a", args: `{"x": 1}`},
				}},
				{input: `[TOOL_CALLS]tool_b[ARGS]{"y": 2}`, wantEvents: []ministralEvent{
					ministralEventToolCall{name: "tool_b", args: `{"y": 2}`},
				}},
			},
		},

		// Streaming tool calls
		{
			desc:  "streaming tool call with nested objects",
			tools: []api.Tool{{Function: api.ToolFunction{Name: "create_entities"}}},
			steps: []step{
				{input: "[TOOL_CALLS]create_entities[ARGS]", wantEvents: []ministralEvent{}},
				{input: `{"entities": [{"entityType": "Person",`, wantEvents: []ministralEvent{}},
				{input: ` "name": "Jack",`, wantEvents: []ministralEvent{}},
				{input: ` "observations": ["Works`, wantEvents: []ministralEvent{}},
				{input: ` as a baker"]}`, wantEvents: []ministralEvent{}},
				{input: `]}`, wantEvents: []ministralEvent{
					ministralEventToolCall{name: "create_entities", args: `{"entities": [{"entityType": "Person", "name": "Jack", "observations": ["Works as a baker"]}]}`},
				}},
			},
		},
		{
			desc:  "streaming with incomplete JSON waits for completion",
			tools: []api.Tool{{Function: api.ToolFunction{Name: "test"}}},
			steps: []step{
				{input: "[TOOL_CALLS]test[ARGS]{", wantEvents: []ministralEvent{}},
				{input: `"a": {`, wantEvents: []ministralEvent{}},
				{input: `"b": 1`, wantEvents: []ministralEvent{}},
				{input: `}`, wantEvents: []ministralEvent{}},
				{input: `}`, wantEvents: []ministralEvent{
					ministralEventToolCall{name: "test", args: `{"a": {"b": 1}}`},
				}},
			},
		},

		// Partial tag handling
		{
			desc: "partial tool tag fakeout",
			steps: []step{
				{input: "abc[TOOL", wantEvents: []ministralEvent{ministralEventContent{content: "abc"}}},
				{input: " not a tag", wantEvents: []ministralEvent{ministralEventContent{content: "[TOOL not a tag"}}},
			},
		},
		{
			desc:  "tool call tag split across chunks",
			tools: []api.Tool{{Function: api.ToolFunction{Name: "test"}}},
			steps: []step{
				{input: "[TOOL_", wantEvents: []ministralEvent{}},
				{input: "CALLS]test[ARGS]{}", wantEvents: []ministralEvent{
					ministralEventToolCall{name: "test", args: `{}`},
				}},
			},
		},
		{
			desc:  "content before tool call",
			tools: []api.Tool{{Function: api.ToolFunction{Name: "get_weather"}}},
			steps: []step{
				{input: "hello [TOOL_CALLS]get_weather[ARGS]{}", wantEvents: []ministralEvent{
					ministralEventContent{content: "hello"},
					ministralEventToolCall{name: "get_weather", args: `{}`},
				}},
			},
		},
		{
			desc:  "whitespace between content and tool call is trimmed",
			tools: []api.Tool{{Function: api.ToolFunction{Name: "test"}}},
			steps: []step{
				{input: "content \n [TOOL_CALLS]test[ARGS]{}", wantEvents: []ministralEvent{
					ministralEventContent{content: "content"},
					ministralEventToolCall{name: "test", args: `{}`},
				}},
			},
		},
		{
			desc:  "tabs and newlines before tool call are trimmed",
			tools: []api.Tool{{Function: api.ToolFunction{Name: "test"}}},
			steps: []step{
				{input: "content\t\n\t[TOOL_CALLS]test[ARGS]{}", wantEvents: []ministralEvent{
					ministralEventContent{content: "content"},
					ministralEventToolCall{name: "test", args: `{}`},
				}},
			},
		},
		{
			desc:  "non-breaking space before tool call is trimmed",
			tools: []api.Tool{{Function: api.ToolFunction{Name: "test"}}},
			steps: []step{
				// \u00a0 is non-breaking space, which unicode.IsSpace considers whitespace
				{input: "content\u00a0[TOOL_CALLS]test[ARGS]{}", wantEvents: []ministralEvent{
					ministralEventContent{content: "content"},
					ministralEventToolCall{name: "test", args: `{}`},
				}},
			},
		},
		{
			desc: "whitespace before THINK tag is trimmed",
			steps: []step{
				{input: "content \n [THINK]thinking[/THINK]after", wantEvents: []ministralEvent{
					ministralEventContent{content: "content"},
					ministralEventThinking{thinking: "thinking"},
					ministralEventContent{content: "after"},
				}},
			},
		},
		{
			desc: "trailing whitespace withheld then emitted",
			steps: []step{
				{input: "Hello ", wantEvents: []ministralEvent{ministralEventContent{content: "Hello"}}},
				{input: "world", wantEvents: []ministralEvent{ministralEventContent{content: " world"}}},
			},
		},
		{
			desc: "trailing newline withheld then emitted",
			steps: []step{
				{input: "Hello\n", wantEvents: []ministralEvent{ministralEventContent{content: "Hello"}}},
				{input: "world", wantEvents: []ministralEvent{ministralEventContent{content: "\nworld"}}},
			},
		},

		// Thinking support
		{
			desc:  "thinking content",
			think: true,
			steps: []step{
				{input: "thinking here[/THINK]", wantEvents: []ministralEvent{
					ministralEventThinking{thinking: "thinking here"},
				}},
				{input: "content after", wantEvents: []ministralEvent{
					ministralEventContent{content: "content after"},
				}},
			},
		},
		{
			desc:  "thinking with whitespace after end tag",
			think: true,
			steps: []step{
				{input: "my thoughts[/THINK] \n response", wantEvents: []ministralEvent{
					ministralEventThinking{thinking: "my thoughts"},
					ministralEventContent{content: "response"},
				}},
			},
		},
		{
			desc:  "non-breaking space after think end tag is trimmed",
			think: true,
			steps: []step{
				// \u00a0 is non-breaking space
				{input: "thinking[/THINK]\u00a0response", wantEvents: []ministralEvent{
					ministralEventThinking{thinking: "thinking"},
					ministralEventContent{content: "response"},
				}},
			},
		},
		{
			desc:  "partial think end tag",
			think: true,
			steps: []step{
				{input: "thinking[/THI", wantEvents: []ministralEvent{ministralEventThinking{thinking: "thinking"}}},
				{input: "NK]after", wantEvents: []ministralEvent{ministralEventContent{content: "after"}}},
			},
		},
		{
			desc:  "think tag fakeout",
			think: true,
			steps: []step{
				{input: "thinking[/THI", wantEvents: []ministralEvent{ministralEventThinking{thinking: "thinking"}}},
				{input: "not end tag", wantEvents: []ministralEvent{ministralEventThinking{thinking: "[/THInot end tag"}}},
			},
		},
		{
			desc:  "thinking then tool call",
			think: true,
			tools: []api.Tool{{Function: api.ToolFunction{Name: "test"}}},
			steps: []step{
				{input: "let me think[/THINK][TOOL_CALLS]test[ARGS]{}", wantEvents: []ministralEvent{
					ministralEventThinking{thinking: "let me think"},
					ministralEventToolCall{name: "test", args: `{}`},
				}},
			},
		},

		// Content then THINK tag transition
		{
			desc: "content then think tag",
			steps: []step{
				{input: "content[THINK]thinking[/THINK]more", wantEvents: []ministralEvent{
					ministralEventContent{content: "content"},
					ministralEventThinking{thinking: "thinking"},
					ministralEventContent{content: "more"},
				}},
			},
		},

		// Unicode handling
		{
			desc: "unicode content",
			steps: []step{
				{input: "你好 🌍 مرحبا", wantEvents: []ministralEvent{
					ministralEventContent{content: "你好 🌍 مرحبا"},
				}},
			},
		},
		{
			desc:  "unicode in tool args",
			tools: []api.Tool{{Function: api.ToolFunction{Name: "greet"}}},
			steps: []step{
				{input: `[TOOL_CALLS]greet[ARGS]{"message": "你好 🌍"}`, wantEvents: []ministralEvent{
					ministralEventToolCall{name: "greet", args: `{"message": "你好 🌍"}`},
				}},
			},
		},
	}

	for _, tc := range cases {
		t.Run(tc.desc, func(t *testing.T) {
			parser := MinistralParser{}
			parser.hasThinkingSupport = tc.think
			parser.Init(tc.tools, nil, nil)

			for i, step := range tc.steps {
				parser.buffer.WriteString(step.input)
				gotEvents := parser.parseEvents()

				if len(gotEvents) == 0 && len(step.wantEvents) == 0 {
					// avoid deep equal on empty vs. nil slices
					continue
				}

				if !reflect.DeepEqual(gotEvents, step.wantEvents) {
					t.Errorf("step %d: input %q: got events %#v, want %#v", i, step.input, gotEvents, step.wantEvents)
				}
			}
		})
	}
}

func TestMinistralParser_Errors(t *testing.T) {
	t.Run("unknown tool returns error", func(t *testing.T) {
		p := &MinistralParser{}
		p.Init([]api.Tool{{Function: api.ToolFunction{Name: "known_tool"}}}, nil, nil)

		_, _, _, err := p.Add(`[TOOL_CALLS]unknown_tool[ARGS]{"a": 1}`, true)
		if err == nil {
			t.Fatal("expected error for unknown tool")
		}
	})

	t.Run("invalid JSON returns error", func(t *testing.T) {
		p := &MinistralParser{}
		p.Init([]api.Tool{{Function: api.ToolFunction{Name: "test"}}}, nil, nil)

		_, _, _, err := p.Add(`[TOOL_CALLS]test[ARGS]{invalid json}`, true)
		if err == nil {
			t.Fatal("expected error for invalid JSON")
		}
	})
}

func TestFindJSONEnd(t *testing.T) {
	tests := []struct {
		name     string
		input    string
		expected int
	}{
		{
			name:     "simple object",
			input:    `{"a": 1}`,
			expected: 7,
		},
		{
			name:     "nested object",
			input:    `{"a": {"b": 2}}`,
			expected: 14,
		},
		{
			name:     "array inside object",
			input:    `{"items": [1, 2, 3]}`,
			expected: 19,
		},
		{
			name:     "braces in string",
			input:    `{"template": "Hello {name}!"}`,
			expected: 28,
		},
		{
			name:     "escaped quotes",
			input:    `{"msg": "say \"hi\""}`,
			expected: 20,
		},
		{
			name:     "incomplete object",
			input:    `{"a": {"b": 1}`,
			expected: -1,
		},
		{
			name:     "deeply nested",
			input:    `{"a": {"b": {"c": {"d": 1}}}}`,
			expected: 28,
		},
		{
			name:     "object with trailing content",
			input:    `{"a": 1} extra`,
			expected: 7,
		},
		{
			name:     "array",
			input:    `[{"a": 1}, {"b": 2}]`,
			expected: 19,
		},
		{
			name:     "escaped backslash before quote",
			input:    `{"path": "C:\\"}`,
			expected: 15,
		},
		{
			name:     "empty string",
			input:    "",
			expected: -1,
		},
		{
			name:     "no opening brace",
			input:    "hello world",
			expected: -1,
		},
		{
			name:     "only opening brace",
			input:    "{",
			expected: -1,
		},
		{
			name:     "unclosed string",
			input:    `{"key": "unclosed`,
			expected: -1,
		},
		{
			name:     "double escaped backslash then quote",
			input:    `{"path": "C:\\\\"}`,
			expected: 17,
		},
		{
			name:     "unicode in key and value",
			input:    `{"키": "값"}`,
			expected: 13,
		},
		{
			name:     "nested arrays",
			input:    `{"matrix": [[1, 2], [3, 4]]}`,
			expected: 27,
		},
		{
			name:     "mixed nesting",
			input:    `{"a": [{"b": {"c": [1, 2, 3]}}]}`,
			expected: 31,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result := findJSONEnd(tt.input)
			if result != tt.expected {
				t.Errorf("findJSONEnd(%q) = %d, want %d", tt.input, result, tt.expected)
			}
		})
	}
}

func TestMinistralParser_HasToolSupport(t *testing.T) {
	p := &MinistralParser{}
	if !p.HasToolSupport() {
		t.Error("expected HasToolSupport to return true")
	}
}

func TestMinistralParser_HasThinkingSupport(t *testing.T) {
	p := &MinistralParser{hasThinkingSupport: false}
	if p.HasThinkingSupport() {
		t.Error("expected HasThinkingSupport to return false")
	}

	p = &MinistralParser{hasThinkingSupport: true}
	if !p.HasThinkingSupport() {
		t.Error("expected HasThinkingSupport to return true")
	}
}
@@ -3,6 +3,7 @@ package parsers
import (
	"strings"
	"unicode"
	"unicode/utf8"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/harmony"
@@ -114,3 +115,33 @@ func splitAtTag(sb *strings.Builder, tag string, trimAfter bool) (string, string
	sb.WriteString(after)
	return before, after // return events
}

// overlap returns the longest overlap between the suffix of s and the prefix of delim
func overlap(s, delim string) int {
	max := min(len(delim), len(s))
	for i := max; i > 0; i-- {
		if strings.HasSuffix(s, delim[:i]) {
			return i
		}
	}
	return 0
}

// trailingWhitespaceLen returns the length in bytes of trailing whitespace in s
func trailingWhitespaceLen(s string) int {
	remaining := s
	total := 0
	for len(remaining) > 0 {
		r, size := utf8.DecodeLastRuneInString(remaining)
		// if it's an invalid utf8 rune, assume it isn't whitespace
		if r == utf8.RuneError && size == 1 {
			break
		}
		if !unicode.IsSpace(r) {
			break
		}
		total += size
		remaining = remaining[:len(remaining)-size]
	}
	return total
}

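These two helpers drive the withholding behavior in both parsers: overlap decides how many trailing bytes might be the start of a tag, and trailingWhitespaceLen keeps whitespace back so it can be trimmed if a tag follows. A quick illustration (hypothetical snippet, runnable inside the package):

fmt.Println(overlap("abc[TOOL", "[TOOL_CALLS]")) // 5: "[TOOL" could be the start of the tag, so it is withheld
fmt.Println(overlap("abc", "[TOOL_CALLS]"))      // 0: no ambiguity, everything can be emitted
fmt.Println(trailingWhitespaceLen("hi \u00a0"))  // 3: a 2-byte non-breaking space plus one ASCII space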
@@ -11,7 +11,6 @@ import (
	"strconv"
	"strings"
	"unicode"
	"unicode/utf8"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/logutil"
@@ -194,36 +193,6 @@ func eat(p *Qwen3CoderParser) ([]qwenEvent, bool) {
	}
}

// TODO(drifkin): move this to a shared location
// longest overlap between suffix of s and prefix of delim
func overlap(s, delim string) int {
	max := min(len(delim), len(s))
	for i := max; i > 0; i-- {
		if strings.HasSuffix(s, delim[:i]) {
			return i
		}
	}
	return 0
}

func trailingWhitespaceLen(s string) int {
	remaining := s
	total := 0
	for len(remaining) > 0 {
		r, size := utf8.DecodeLastRuneInString(remaining)
		// if it's an invalid utf8 rune, assume it isn't whitespace
		if r == utf8.RuneError && size == 1 {
			break
		}
		if !unicode.IsSpace(r) {
			break
		}
		total += size
		remaining = remaining[:len(remaining)-size]
	}
	return total
}

type XMLFunctionCall struct {
	XMLName xml.Name `xml:"function"`
	Name    string   `xml:"name,attr"`

@@ -1358,7 +1358,7 @@ func (s *Server) info(w http.ResponseWriter, r *http.Request) {
	// Dummy load to get the backend wired up
	f, err := os.CreateTemp("", "*.bin")
	if err != nil {
		http.Error(w, fmt.Sprintf("failed to initialize baackend: %v", err), http.StatusInternalServerError)
		http.Error(w, fmt.Sprintf("failed to initialize backend: %v", err), http.StatusInternalServerError)
		return
	}
	defer f.Close()
@@ -1368,13 +1368,13 @@ func (s *Server) info(w http.ResponseWriter, r *http.Request) {
		"general.architecture": "llama",
		"tokenizer.ggml.model": "gpt2",
	}, nil); err != nil {
		http.Error(w, fmt.Sprintf("failed to initialize baackend: %v", err), http.StatusInternalServerError)
		http.Error(w, fmt.Sprintf("failed to initialize backend: %v", err), http.StatusInternalServerError)
		return
	}

	m, err = model.New(f.Name(), ml.BackendParams{NumThreads: runtime.NumCPU(), AllocMemory: false, GPULayers: ml.GPULayersList{{}}})
	if err != nil {
		http.Error(w, fmt.Sprintf("failed to initialize baackend: %v", err), http.StatusInternalServerError)
		http.Error(w, fmt.Sprintf("failed to initialize backend: %v", err), http.StatusInternalServerError)
		return
	}
	slog.Debug("dummy model load took", "duration", time.Since(startLoad))

@@ -14,8 +14,8 @@
VOL_NAME=${VOL_NAME:-"Ollama"}
export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")}
export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${VERSION#v}\" \"-X=github.com/ollama/ollama/server.mode=release\"'"
export CGO_CFLAGS="-mmacosx-version-min=14.0"
export CGO_CXXFLAGS="-mmacosx-version-min=14.0"
export CGO_CFLAGS="-O3 -mmacosx-version-min=14.0"
export CGO_CXXFLAGS="-O3 -mmacosx-version-min=14.0"
export CGO_LDFLAGS="-mmacosx-version-min=14.0"

set -e

@@ -56,6 +56,12 @@ function checkEnv {

    $script:DIST_DIR="${script:SRC_DIR}\dist\windows-${script:TARGET_ARCH}"
    $env:CGO_ENABLED="1"
    if (-not $env:CGO_CFLAGS) {
        $env:CGO_CFLAGS = "-O3"
    }
    if (-not $env:CGO_CXXFLAGS) {
        $env:CGO_CXXFLAGS = "-O3"
    }
    Write-Output "Checking version"
    if (!$env:VERSION) {
        $data=(git describe --tags --first-parent --abbrev=7 --long --dirty --always)

@@ -95,6 +95,13 @@ func getTensorNewType(kv fsggml.KV, qs *quantizeState, newType fsggml.TensorType
			// for the 8-expert model, bumping this to Q8_0 trades just ~128MB
			newType = fsggml.TensorTypeQ8_0
		}
	} else if strings.Contains(name, "attn_k_b.weight") ||
		strings.Contains(name, "attn_v_b.weight") ||
		strings.Contains(name, "attn_kv_a_mqa.weight") ||
		strings.Contains(name, "attn_q_a.weight") ||
		strings.Contains(name, "attn_q_b.weight") {
		// MLA tensors need higher precision to avoid quality degradation
		newType = fsggml.TensorTypeQ8_0
	} else if strings.Contains(name, "ffn_down") {
		iLayer := qs.iFfnDown
		n_layer := qs.nFfnDown

@@ -75,16 +75,12 @@ func experimentEnabled(name string) bool {

var useClient2 = experimentEnabled("client2")

// Low VRAM mode is based on the sum of total VRAM (not free) and triggers
// reduced context length on some models
var lowVRAMThreshold uint64 = 20 * format.GibiByte

var mode string = gin.DebugMode

type Server struct {
	addr    net.Addr
	sched   *Scheduler
	lowVRAM bool
	addr          net.Addr
	sched         *Scheduler
	defaultNumCtx int
}

func init() {
@@ -107,8 +103,12 @@ var (
	errBadTemplate = errors.New("template error")
)

func modelOptions(model *Model, requestOpts map[string]any) (api.Options, error) {
func (s *Server) modelOptions(model *Model, requestOpts map[string]any) (api.Options, error) {
	opts := api.DefaultOptions()
	if opts.NumCtx == 0 {
		opts.NumCtx = s.defaultNumCtx
	}

	if err := opts.FromMap(model.Options); err != nil {
		return api.Options{}, err
	}
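Taken together with api.DefaultOptions() (which seeds NumCtx from OLLAMA_CONTEXT_LENGTH and leaves it 0 when the variable is unset), the effective precedence is: request options > Modelfile options > OLLAMA_CONTEXT_LENGTH > VRAM-tier default. A sketch of that chain with a hypothetical helper name; the real flow is the method above plus FromMap, and the ordering is exactly what TestModelOptionsNumCtxPriority (further down) verifies:

func effectiveNumCtx(envCtx, vramDefault, modelCtx, requestCtx int) int {
	n := envCtx // 0 when OLLAMA_CONTEXT_LENGTH is unset
	if n == 0 {
		n = vramDefault // tier chosen at startup from total VRAM
	}
	if modelCtx != 0 {
		n = modelCtx // num_ctx from the Modelfile options
	}
	if requestCtx != 0 {
		n = requestCtx // num_ctx from the API request wins
	}
	return n
}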
@@ -140,20 +140,11 @@ func (s *Server) scheduleRunner(ctx context.Context, name string, caps []model.C
		return nil, nil, nil, fmt.Errorf("%s %w", name, err)
	}

	opts, err := modelOptions(model, requestOpts)
	opts, err := s.modelOptions(model, requestOpts)
	if err != nil {
		return nil, nil, nil, err
	}

	// This model is much more capable with a larger context, so set that
	// unless it would penalize performance too much
	if !s.lowVRAM && slices.Contains([]string{
		"gptoss", "gpt-oss",
		"qwen3vl", "qwen3vlmoe",
	}, model.Config.ModelFamily) {
		opts.NumCtx = max(opts.NumCtx, 8192)
	}

	runnerCh, errCh := s.sched.GetRunner(ctx, model, opts, keepAlive)
	var runner *runnerRef
	select {
@@ -1720,10 +1711,18 @@ func Serve(ln net.Listener) error {
	for _, gpu := range gpus {
		totalVRAM += gpu.TotalMemory - envconfig.GpuOverhead()
	}
	if totalVRAM < lowVRAMThreshold {
		s.lowVRAM = true
		slog.Info("entering low vram mode", "total vram", format.HumanBytes2(totalVRAM), "threshold", format.HumanBytes2(lowVRAMThreshold))

	// Set default context based on VRAM tier
	// Use slightly lower thresholds (47/23 GiB vs. 48/24 GiB) to account for small differences in the exact value
	switch {
	case totalVRAM >= 47*format.GibiByte:
		s.defaultNumCtx = 262144
	case totalVRAM >= 23*format.GibiByte:
		s.defaultNumCtx = 32768
	default:
		s.defaultNumCtx = 4096
	}
	slog.Info("vram-based default context", "total_vram", format.HumanBytes2(totalVRAM), "default_num_ctx", s.defaultNumCtx)

	err = srvr.Serve(ln)
	// If server is closed from the signal handler, wait for the ctx to be done
@@ -1897,8 +1896,8 @@ func (s *Server) PsHandler(c *gin.Context) {
			Details:   modelDetails,
			ExpiresAt: v.expiresAt,
		}
		if v.Options != nil {
			mr.ContextLength = v.Options.NumCtx
		if v.llama != nil {
			mr.ContextLength = v.llama.ContextLength()
		}
		// The scheduler waits to set expiresAt, so if a model is loading it's
		// possible that it will be set to the unix epoch. For those cases, just

@@ -15,6 +15,7 @@ import (
)

func TestGenerateDebugRenderOnly(t *testing.T) {
	t.Setenv("OLLAMA_CONTEXT_LENGTH", "4096")
	gin.SetMode(gin.TestMode)

	mock := mockRunner{
@@ -208,6 +209,7 @@ func TestGenerateDebugRenderOnly(t *testing.T) {
}

func TestChatDebugRenderOnly(t *testing.T) {
	t.Setenv("OLLAMA_CONTEXT_LENGTH", "4096")
	gin.SetMode(gin.TestMode)

	mock := mockRunner{

@@ -20,6 +20,7 @@ import (
// TestGenerateWithBuiltinRenderer tests that api/generate uses built-in renderers
// when in chat-like flow (messages present, no suffix, no template)
func TestGenerateWithBuiltinRenderer(t *testing.T) {
	t.Setenv("OLLAMA_CONTEXT_LENGTH", "4096")
	gin.SetMode(gin.TestMode)

	mock := mockRunner{
@@ -204,6 +205,7 @@ func TestGenerateWithBuiltinRenderer(t *testing.T) {

// TestGenerateWithDebugRenderOnly tests that debug_render_only works with built-in renderers
func TestGenerateWithDebugRenderOnly(t *testing.T) {
	t.Setenv("OLLAMA_CONTEXT_LENGTH", "4096")
	gin.SetMode(gin.TestMode)

	mock := mockRunner{

@@ -162,6 +162,7 @@ func TestGenerateChatRemote(t *testing.T) {
}

func TestGenerateChat(t *testing.T) {
	t.Setenv("OLLAMA_CONTEXT_LENGTH", "4096")
	gin.SetMode(gin.TestMode)

	mock := mockRunner{
@@ -878,6 +879,7 @@ func TestGenerateChat(t *testing.T) {
}

func TestGenerate(t *testing.T) {
	t.Setenv("OLLAMA_CONTEXT_LENGTH", "4096")
	gin.SetMode(gin.TestMode)

	mock := mockRunner{
@@ -2355,6 +2357,7 @@ func TestGenerateWithImages(t *testing.T) {
// TestImageGenerateStreamFalse tests that image generation respects stream=false
// and returns a single JSON response instead of streaming ndjson.
func TestImageGenerateStreamFalse(t *testing.T) {
	t.Setenv("OLLAMA_CONTEXT_LENGTH", "4096")
	gin.SetMode(gin.TestMode)

	p := t.TempDir()

127 server/routes_options_test.go (new file)
@@ -0,0 +1,127 @@
package server

import (
	"testing"
)

func TestModelOptionsNumCtxPriority(t *testing.T) {
	tests := []struct {
		name           string
		envContextLen  string // empty means not set (uses 0 sentinel)
		defaultNumCtx  int    // VRAM-based default
		modelNumCtx    int    // 0 means not set in model
		requestNumCtx  int    // 0 means not set in request
		expectedNumCtx int
	}{
		{
			name:           "vram default when nothing else set",
			envContextLen:  "",
			defaultNumCtx:  32768,
			modelNumCtx:    0,
			requestNumCtx:  0,
			expectedNumCtx: 32768,
		},
		{
			name:           "env var overrides vram default",
			envContextLen:  "8192",
			defaultNumCtx:  32768,
			modelNumCtx:    0,
			requestNumCtx:  0,
			expectedNumCtx: 8192,
		},
		{
			name:           "model overrides vram default",
			envContextLen:  "",
			defaultNumCtx:  32768,
			modelNumCtx:    16384,
			requestNumCtx:  0,
			expectedNumCtx: 16384,
		},
		{
			name:           "model overrides env var",
			envContextLen:  "8192",
			defaultNumCtx:  32768,
			modelNumCtx:    16384,
			requestNumCtx:  0,
			expectedNumCtx: 16384,
		},
		{
			name:           "request overrides everything",
			envContextLen:  "8192",
			defaultNumCtx:  32768,
			modelNumCtx:    16384,
			requestNumCtx:  4096,
			expectedNumCtx: 4096,
		},
		{
			name:           "request overrides vram default",
			envContextLen:  "",
			defaultNumCtx:  32768,
			modelNumCtx:    0,
			requestNumCtx:  4096,
			expectedNumCtx: 4096,
		},
		{
			name:           "request overrides model",
			envContextLen:  "",
			defaultNumCtx:  32768,
			modelNumCtx:    16384,
			requestNumCtx:  4096,
			expectedNumCtx: 4096,
		},
		{
			name:           "low vram tier default",
			envContextLen:  "",
			defaultNumCtx:  4096,
			modelNumCtx:    0,
			requestNumCtx:  0,
			expectedNumCtx: 4096,
		},
		{
			name:           "high vram tier default",
			envContextLen:  "",
			defaultNumCtx:  262144,
			modelNumCtx:    0,
			requestNumCtx:  0,
			expectedNumCtx: 262144,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			// Set or clear environment variable
			if tt.envContextLen != "" {
				t.Setenv("OLLAMA_CONTEXT_LENGTH", tt.envContextLen)
			}

			// Create server with VRAM-based default
			s := &Server{
				defaultNumCtx: tt.defaultNumCtx,
			}

			// Create model options (use float64 as FromMap expects JSON-style numbers)
			var modelOpts map[string]any
			if tt.modelNumCtx != 0 {
				modelOpts = map[string]any{"num_ctx": float64(tt.modelNumCtx)}
			}
			model := &Model{
				Options: modelOpts,
			}

			// Create request options (use float64 as FromMap expects JSON-style numbers)
			var requestOpts map[string]any
			if tt.requestNumCtx != 0 {
				requestOpts = map[string]any{"num_ctx": float64(tt.requestNumCtx)}
			}

			opts, err := s.modelOptions(model, requestOpts)
			if err != nil {
				t.Fatalf("modelOptions failed: %v", err)
			}

			if opts.NumCtx != tt.expectedNumCtx {
				t.Errorf("NumCtx = %d, want %d", opts.NumCtx, tt.expectedNumCtx)
			}
		})
	}
}
@@ -804,6 +804,7 @@ func (s *mockLlm) GetPort() int { return -
func (s *mockLlm) GetDeviceInfos(ctx context.Context) []ml.DeviceInfo { return nil }
func (s *mockLlm) HasExited() bool                                    { return false }
func (s *mockLlm) GetActiveDeviceIDs() []ml.DeviceID                  { return nil }
func (s *mockLlm) ContextLength() int                                 { return 0 }

// TestImageGenRunnerCanBeEvicted verifies that an image generation model
// loaded in the scheduler can be evicted when idle.

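The one-line mock addition mirrors a new method on the runner interface, which PsHandler (above) now queries instead of the requested options. A sketch of the shape; the interface name is assumed from context, and everything except ContextLength is elided:

type LlamaServer interface {
	// ...existing load/completion/embedding methods elided...

	// ContextLength reports the context length the loaded runner was
	// actually started with; implementations return 0 when not applicable
	// (e.g. the image generation server in the next hunk).
	ContextLength() int
}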
@@ -347,6 +347,11 @@ func (s *Server) VRAMByGPU(id ml.DeviceID) uint64 {
	return s.vramSize
}

// Context length is not applicable for image generation.
func (s *Server) ContextLength() int {
	return 0
}

func (s *Server) Embedding(ctx context.Context, input string) ([]float32, int, error) {
	return nil, 0, errors.New("not supported")
}