Compare commits

...

22 Commits

Author SHA1 Message Date
Jeffrey Morgan
da70c3222e model: support for qwen3.5 architecture (#14378) 2026-02-24 20:08:05 -08:00
Bruce MacDonald
9d902d63ce ggml: ensure tensor size is valid (#14406)
When quantizing tensors during model creation, validate that the resulting sizes match what is expected based on the shape.
2026-02-24 21:52:44 -04:00
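The validation described above can be sketched as follows. This is a hypothetical helper, not the actual ggml code; the Q4_0 block geometry (32 elements per block, 18 bytes per block) matches GGML's layout, but the function name and shapes are illustrative:

```go
package main

import "fmt"

// expectedSize returns the byte size a block-quantized tensor should
// occupy: total elements divided by elements-per-block, times bytes-per-block.
// A shape that doesn't divide evenly into blocks is an error.
func expectedSize(shape []int64, blockElems, blockBytes int64) (int64, error) {
	elems := int64(1)
	for _, d := range shape {
		elems *= d
	}
	if elems%blockElems != 0 {
		return 0, fmt.Errorf("tensor with %d elements not divisible by block size %d", elems, blockElems)
	}
	return elems / blockElems * blockBytes, nil
}

func main() {
	// A 4096x4096 weight in Q4_0: 16777216 elements / 32 per block * 18 bytes.
	size, err := expectedSize([]int64{4096, 4096}, 32, 18)
	if err != nil {
		panic(err)
	}
	fmt.Println(size) // 9437184
}
```

A quantizer would compare this expected size against the byte count actually produced and fail model creation on a mismatch.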
Daniel Hiltgen
f4f0a4a471 update mlx-c bindings to 0.5.0 (#14380)
* chore: update mlx-c bindings to 0.5.0 (#14303)

* linux: use gcc 11

---------

Co-authored-by: Patrick Devine <patrick@infrahq.com>
2026-02-23 16:44:29 -08:00
Eva H
3323c1d319 app: add upgrade configuration to settings page (#13512) 2026-02-23 18:08:52 -05:00
Jesse Gross
f20dc6b698 mlx: don't default to affine quantization for unquantized models
Otherwise, the BF16 versions of models trigger segfaults when they
call into quantized kernels.
2026-02-23 15:03:53 -08:00
Jeffrey Morgan
4b2ac1f369 model: improvements to LFM architectures (#14368) 2026-02-23 14:38:10 -08:00
Jesse Gross
8daf47fb3a mlxrunner: Fix duplicate log prefixes and reduce log noise
Pass subprocess stdout/stderr through to the parent's stderr directly
instead of re-wrapping each line with slog. The subprocess already
writes structured slog output, so the re-wrapping produced nested
timestamps, levels, and message fields that were hard to read.

Also downgrade verbose KV cache debug logs to trace level.
2026-02-23 14:09:20 -08:00
Eva H
6c980579cd ui: use capability-based detection for web search (#14336) 2026-02-23 15:00:09 -05:00
Jesse Gross
5c73c4e2ee mlxrunner: Simplify KV cache to single-entry prefix matching
The KV cache previously used a tree structure that could store
multiple divergent sequences, which is good for cache reuse.
However, that design is typically paired with paged attention,
so each node in the tree stores just a chunk of the KV cache
and the chunks are stitched together later. We don't currently
do this, so the cache was storing a full copy of the cache for
each past sequence.

This redundancy, plus the lack of resource limits, caused significant
memory use as a conversation grew. Instead, this change stores
a single entry for the cache, which can be prefix matched. Although
that is less ideal for multiple users, it largely matches Ollama's
current behavior and can be improved as additional pieces are fleshed
out.
2026-02-23 09:50:07 -08:00
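The single-entry scheme boils down to finding how much of the cached sequence a new prompt shares. A minimal sketch, with made-up token IDs (the real runner works on tokenized prompts and reuses the matching cache positions):

```go
package main

import "fmt"

// commonPrefixLen returns how many leading tokens two sequences share.
// With one cached entry, the runner can reuse the first n cached
// positions and only recompute the remainder of the prompt.
func commonPrefixLen(cached, prompt []int) int {
	n := 0
	for n < len(cached) && n < len(prompt) && cached[n] == prompt[n] {
		n++
	}
	return n
}

func main() {
	cached := []int{1, 5, 9, 12, 7}
	prompt := []int{1, 5, 9, 3, 4, 8} // diverges at position 3
	fmt.Println(commonPrefixLen(cached, prompt)) // 3
}
```

This is why the approach works well for a single ongoing conversation (the new prompt is usually the old one plus a suffix) but loses cache reuse when multiple users interleave divergent prompts.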
Jesse Gross
5daf59cc66 mlxrunner: Fix memory leaks with pin/sweep lifecycle management
The previous approach tracked array lifecycles through reference
counting, where each array recorded its inputs and a reference count
that was decremented as dependents were freed. This is not really
necessary, as MLX tracks references internally. It is also
error-prone, since it is easy to create new arrays and forget to free
them when the Go variable goes out of scope.

Instead, we can pin just the arrays we want (typically outputs and
specific intermediates, like the cache). All other arrays are freed
by default when we run sweep. This avoids most causes of memory leaks
while still giving the freedom to save what we want.
2026-02-23 09:50:07 -08:00
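The pin/sweep lifecycle can be sketched like this. The types and method names are illustrative, not the runner's actual API; `Array` stands in for an MLX array handle that would be freed through C:

```go
package main

import "fmt"

// Array stands in for an MLX array handle; the real handle lives in C.
type Array struct {
	id    int
	freed bool
}

// pool tracks every array created since the last sweep. Pin marks the
// ones to keep (outputs, cache entries); Sweep frees everything else.
type pool struct {
	tracked []*Array
	pinned  map[*Array]bool
}

func newPool() *pool { return &pool{pinned: map[*Array]bool{}} }

func (p *pool) New(id int) *Array {
	a := &Array{id: id}
	p.tracked = append(p.tracked, a)
	return a
}

func (p *pool) Pin(a *Array) { p.pinned[a] = true }

// Sweep frees all unpinned arrays created since the last sweep and
// resets tracking; pinned arrays survive to the next iteration.
func (p *pool) Sweep() int {
	freed := 0
	for _, a := range p.tracked {
		if !p.pinned[a] {
			a.freed = true // the real code would call the C free here
			freed++
		}
	}
	p.tracked = p.tracked[:0]
	return freed
}

func main() {
	p := newPool()
	tmp := p.New(1)   // intermediate activation
	out := p.New(2)   // model output
	cache := p.New(3) // KV cache entry
	p.Pin(out)
	p.Pin(cache)
	fmt.Println(p.Sweep(), tmp.freed, out.freed) // frees only the intermediate
}
```

The design choice is that leaks now require forgetting to sweep (one call per step) rather than forgetting to free each individual array.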
Jeffrey Morgan
0ade9205cc models: add nemotronh architecture support (#14356) 2026-02-22 15:09:14 -08:00
Parth Sareen
06edabdde1 cmd/config: install web search plugin to user-level extensions dir (#14362) 2026-02-22 02:17:03 -08:00
Jeffrey Morgan
8b4e5a82a8 mlx: remove noisy error output from dynamic library loading (#14346)
The recent change in #14322 added tryLoadByName(), which attempts to
load libmlxc.dylib via rpath before searching directories. This is an
optimization for Homebrew installations where rpath is correctly set.

However, when rpath isn't set (which is the common case for app bundle
installations), dlopen fails and the CHECK macro prints an error to
stderr:

  ERROR - dynamic.c:21 - CHECK failed: handle->ctx != NULL

This error is misleading because it's an expected failure path: the
code correctly falls back to searching the executable directory and
loads the library successfully. The error message causes user confusion
and makes it appear that something is broken.

Replace the CHECK macro with a simple return code so the C code fails
silently. The Go code already handles error logging appropriately:
tryLoadByName() fails silently (intentional fallback), while
tryLoadFromDir() logs via slog.Error() when explicit path loading fails.
2026-02-20 23:46:07 -08:00
Parth Sareen
3445223311 cmd: openclaw onboarding (#14344) 2026-02-20 19:08:38 -08:00
Jeffrey Morgan
fa6c0127e6 app: expose server's default context length to UI (#14037)
Parse the default_num_ctx from the server's "vram-based default context"
log line and expose it through the inference compute API. This eliminates
duplicate VRAM tier calculation logic in the frontend.

- Add InferenceInfo struct with Computes and DefaultContextLength
- Rename GetInferenceComputer to GetInferenceInfo
- Handle missing default context line gracefully (older servers)
- Add DefaultContextLength to InferenceComputeResponse
- Update Settings UI to use server's default, disable slider while loading
- Add disabled prop to Slider component (grays out + hides handle)
- Migrate existing users with context_length=4096 to 0 (auto mode)
2026-02-20 18:56:30 -08:00
Patrick Devine
97323d1c68 consolidate the tokenizer (#14327)
This change adds a new x/tokenizer package which includes:
  * New BPE and SentencePiece tokenizers
  * Removing the dependency on the imagegen tokenizers
  * Fixes to multibyte decoding in the pipeline
  * Various correctness and benchmark tests

Not included in this PR is the WordPiece tokenizer for BERT models, which will be
added when we add embedding models. The imagegen tokenizers will also be removed in
a follow-up PR.
2026-02-19 15:55:45 -08:00
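As an illustration of the BPE side of the new tokenizer package, one merge pass can be sketched like this. The tokens and merge rules are made up, and the real x/tokenizer implementation is considerably more involved (ranked merges, byte-level pretokenization):

```go
package main

import "fmt"

// applyMerge performs one BPE merge pass: every adjacent pair (a, b)
// in the token sequence is replaced by the merged token.
func applyMerge(tokens []string, a, b, merged string) []string {
	out := make([]string, 0, len(tokens))
	for i := 0; i < len(tokens); i++ {
		if i+1 < len(tokens) && tokens[i] == a && tokens[i+1] == b {
			out = append(out, merged)
			i++ // skip the second half of the merged pair
			continue
		}
		out = append(out, tokens[i])
	}
	return out
}

func main() {
	tokens := []string{"l", "o", "w", "e", "r"}
	tokens = applyMerge(tokens, "l", "o", "lo")
	tokens = applyMerge(tokens, "e", "r", "er")
	fmt.Println(tokens) // [lo w er]
}
```

A full encoder repeatedly applies the highest-ranked applicable merge until no merge rule matches, then maps the surviving tokens to vocabulary IDs.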
natl-set
458dd1b9d9 mlx: try loading library via rpath before searching directories (#14322)
The existing code manually searches directories for libmlxc.* and passes
full paths to dlopen, bypassing the binary's rpath. This means MLX
libraries installed via package managers (e.g., Homebrew) aren't found
even when rpath is correctly set at link time.

This change adds a fallback that tries loading via rpath first (using
just the library name), before falling back to the existing directory
search. This follows standard Unix/macOS conventions and works with any
installation that sets rpath.

Fixes library loading on macOS with Homebrew-installed mlx-c without
requiring OLLAMA_LIBRARY_PATH environment variable.

Co-authored-by: Natl <nat@MacBook-Pro.local>
2026-02-19 10:55:02 -08:00
Bruce MacDonald
9d02d1d767 install: prevent partial download script execution (#14311)
Wrap the script in a main function so that a truncated partial download doesn't end up executing half of the script.
2026-02-18 18:32:45 -08:00
Bruce MacDonald
1a636fb47a cmd: set codex env vars on launch and handle zstd request bodies (#14122)
The Codex runner was not setting OPENAI_BASE_URL or OPENAI_API_KEY; setting these prevents Codex from sending requests to api.openai.com instead of the local Ollama server. This mirrors the approach used by the Claude runner.

Codex v0.98.0 sends zstd-compressed request bodies to the /v1/responses endpoint. Add decompression support in ResponsesMiddleware with an 8MB max decompressed size limit to prevent resource exhaustion.
2026-02-18 17:19:36 -08:00
Patrick Devine
0759fface9 Revert "chore: update mlx-c bindings to 0.5.0 (#14303)" (#14316)
This reverts commit f01a9a7859.
2026-02-18 17:01:25 -08:00
Parth Sareen
325b72bc31 cmd/tui: default to single-select for editor integrations (#14302) 2026-02-17 18:17:27 -08:00
Patrick Devine
f01a9a7859 chore: update mlx-c bindings to 0.5.0 (#14303) 2026-02-17 16:48:16 -08:00
141 changed files with 13605 additions and 3785 deletions

View File

@@ -9,15 +9,10 @@ ARG JETPACK6VERSION=r36.4.0
ARG CMAKEVERSION=3.31.2
ARG VULKANVERSION=1.4.321.1
# We require gcc v10 minimum. v10.3 has regressions, so the rockylinux 8.5 AppStream has the latest compatible version
FROM --platform=linux/amd64 rocm/dev-almalinux-8:${ROCMVERSION}-complete AS base-amd64
RUN yum install -y yum-utils \
&& yum-config-manager --add-repo https://dl.rockylinux.org/vault/rocky/8.5/AppStream/\$basearch/os/ \
&& rpm --import https://dl.rockylinux.org/pub/rocky/RPM-GPG-KEY-Rocky-8 \
&& dnf install -y yum-utils ccache gcc-toolset-10-gcc-10.2.1-8.2.el8 gcc-toolset-10-gcc-c++-10.2.1-8.2.el8 gcc-toolset-10-binutils-2.35-11.el8 \
&& dnf install -y ccache \
RUN dnf install -y yum-utils ccache gcc-toolset-11-gcc gcc-toolset-11-gcc-c++ gcc-toolset-11-binutils \
&& yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo
ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH
ENV PATH=/opt/rh/gcc-toolset-11/root/usr/bin:$PATH
ARG VULKANVERSION
RUN wget https://sdk.lunarg.com/sdk/download/${VULKANVERSION}/linux/vulkansdk-linux-x86_64-${VULKANVERSION}.tar.xz -O /tmp/vulkansdk-linux-x86_64-${VULKANVERSION}.tar.xz \
&& tar xvf /tmp/vulkansdk-linux-x86_64-${VULKANVERSION}.tar.xz \

View File

@@ -1 +1 @@
v0.4.1
v0.5.0

View File

@@ -253,6 +253,8 @@ func main() {
done <- osrv.Run(octx)
}()
upd := &updater.Updater{Store: st}
uiServer := ui.Server{
Token: token,
Restart: func() {
@@ -267,6 +269,10 @@ func main() {
ToolRegistry: toolRegistry,
Dev: devMode,
Logger: slog.Default(),
Updater: upd,
UpdateAvailableFunc: func() {
UpdateAvailable("")
},
}
srv := &http.Server{
@@ -284,8 +290,13 @@ func main() {
slog.Debug("background desktop server done")
}()
updater := &updater.Updater{Store: st}
updater.StartBackgroundUpdaterChecker(ctx, UpdateAvailable)
upd.StartBackgroundUpdaterChecker(ctx, UpdateAvailable)
// Check for pending updates on startup (show tray notification if update is ready)
if updater.IsUpdatePending() {
slog.Debug("update pending on startup, showing tray notification")
UpdateAvailable("")
}
hasCompletedFirstRun, err := st.HasCompletedFirstRun()
if err != nil {
@@ -348,6 +359,18 @@ func startHiddenTasks() {
// CLI triggered app startup use-case
slog.Info("deferring pending update for fast startup")
} else {
// Check if auto-update is enabled before automatically upgrading
st := &store.Store{}
settings, err := st.Settings()
if err != nil {
slog.Warn("failed to load settings for upgrade check", "error", err)
} else if !settings.AutoUpdateEnabled {
slog.Info("auto-update disabled, skipping automatic upgrade at startup")
// Still show tray notification so user knows update is ready
UpdateAvailable("")
return
}
if err := updater.DoUpgradeAtStartup(); err != nil {
slog.Info("unable to perform upgrade at startup", "error", err)
// Make sure the restart to upgrade menu shows so we can attempt an interactive upgrade to get authorization

View File

@@ -41,6 +41,11 @@ type InferenceCompute struct {
VRAM string
}
type InferenceInfo struct {
Computes []InferenceCompute
DefaultContextLength int
}
func New(s *store.Store, devMode bool) *Server {
p := resolvePath("ollama")
return &Server{store: s, bin: p, dev: devMode}
@@ -272,9 +277,12 @@ func openRotatingLog() (io.WriteCloser, error) {
// Attempt to retrieve inference compute information from the server
// log. Set ctx to timeout to control how long to wait for the logs to appear
func GetInferenceComputer(ctx context.Context) ([]InferenceCompute, error) {
inference := []InferenceCompute{}
marker := regexp.MustCompile(`inference compute.*library=`)
func GetInferenceInfo(ctx context.Context) (*InferenceInfo, error) {
info := &InferenceInfo{}
computeMarker := regexp.MustCompile(`inference compute.*library=`)
defaultCtxMarker := regexp.MustCompile(`vram-based default context`)
defaultCtxRegex := regexp.MustCompile(`default_num_ctx=(\d+)`)
q := `inference compute.*%s=["]([^"]*)["]`
nq := `inference compute.*%s=(\S+)\s`
type regex struct {
@@ -340,8 +348,8 @@ func GetInferenceComputer(ctx context.Context) ([]InferenceCompute, error) {
scanner := bufio.NewScanner(file)
for scanner.Scan() {
line := scanner.Text()
match := marker.FindStringSubmatch(line)
if len(match) > 0 {
// Check for inference compute lines
if computeMarker.MatchString(line) {
ic := InferenceCompute{
Library: get("library", line),
Variant: get("variant", line),
@@ -352,12 +360,25 @@ func GetInferenceComputer(ctx context.Context) ([]InferenceCompute, error) {
}
slog.Info("Matched", "inference compute", ic)
inference = append(inference, ic)
} else {
// Break out on first non matching line after we start matching
if len(inference) > 0 {
return inference, nil
info.Computes = append(info.Computes, ic)
continue
}
// Check for default context length line
if defaultCtxMarker.MatchString(line) {
match := defaultCtxRegex.FindStringSubmatch(line)
if len(match) > 1 {
numCtx, err := strconv.Atoi(match[1])
if err == nil {
info.DefaultContextLength = numCtx
slog.Info("Matched default context length", "default_num_ctx", numCtx)
}
}
return info, nil
}
// If we've found compute info but hit a non-matching line, return what we have
// This handles older server versions that don't log the default context line
if len(info.Computes) > 0 {
return info, nil
}
}
time.Sleep(100 * time.Millisecond)

View File

@@ -205,44 +205,50 @@ func TestServerCmdCloudSettingEnv(t *testing.T) {
}
}
func TestGetInferenceComputer(t *testing.T) {
func TestGetInferenceInfo(t *testing.T) {
tests := []struct {
name string
log string
exp []InferenceCompute
name string
log string
expComputes []InferenceCompute
expDefaultCtxLen int
}{
{
name: "metal",
log: `time=2025-06-30T09:23:07.374-07:00 level=DEBUG source=sched.go:108 msg="starting llm scheduler"
time=2025-06-30T09:23:07.416-07:00 level=INFO source=types.go:130 msg="inference compute" id=0 library=metal variant="" compute="" driver=0.0 name="" total="96.0 GiB" available="96.0 GiB"
time=2025-06-30T09:23:07.417-07:00 level=INFO source=routes.go:1721 msg="vram-based default context" total_vram="96.0 GiB" default_num_ctx=262144
time=2025-06-30T09:25:56.197-07:00 level=DEBUG source=ggml.go:155 msg="key not found" key=general.alignment default=32
`,
exp: []InferenceCompute{{
expComputes: []InferenceCompute{{
Library: "metal",
Driver: "0.0",
VRAM: "96.0 GiB",
}},
expDefaultCtxLen: 262144,
},
{
name: "cpu",
log: `time=2025-07-01T17:59:51.470Z level=INFO source=gpu.go:377 msg="no compatible GPUs were discovered"
time=2025-07-01T17:59:51.470Z level=INFO source=types.go:130 msg="inference compute" id=0 library=cpu variant="" compute="" driver=0.0 name="" total="31.3 GiB" available="30.4 GiB"
time=2025-07-01T17:59:51.471Z level=INFO source=routes.go:1721 msg="vram-based default context" total_vram="31.3 GiB" default_num_ctx=32768
[GIN] 2025/07/01 - 18:00:09 | 200 | 50.263µs | 100.126.204.152 | HEAD "/"
`,
exp: []InferenceCompute{{
expComputes: []InferenceCompute{{
Library: "cpu",
Driver: "0.0",
VRAM: "31.3 GiB",
}},
expDefaultCtxLen: 32768,
},
{
name: "cuda1",
log: `time=2025-07-01T19:33:43.162Z level=DEBUG source=amd_linux.go:419 msg="amdgpu driver not detected /sys/module/amdgpu"
releasing cuda driver library
time=2025-07-01T19:33:43.162Z level=INFO source=types.go:130 msg="inference compute" id=GPU-452cac9f-6960-839c-4fb3-0cec83699196 library=cuda variant=v12 compute=6.1 driver=12.7 name="NVIDIA GeForce GT 1030" total="3.9 GiB" available="3.9 GiB"
time=2025-07-01T19:33:43.163Z level=INFO source=routes.go:1721 msg="vram-based default context" total_vram="3.9 GiB" default_num_ctx=4096
[GIN] 2025/07/01 - 18:00:09 | 200 | 50.263µs | 100.126.204.152 | HEAD "/"
`,
exp: []InferenceCompute{{
expComputes: []InferenceCompute{{
Library: "cuda",
Variant: "v12",
Compute: "6.1",
@@ -250,6 +256,7 @@ time=2025-07-01T19:33:43.162Z level=INFO source=types.go:130 msg="inference comp
Name: "NVIDIA GeForce GT 1030",
VRAM: "3.9 GiB",
}},
expDefaultCtxLen: 4096,
},
{
name: "frank",
@@ -257,9 +264,10 @@ time=2025-07-01T19:33:43.162Z level=INFO source=types.go:130 msg="inference comp
releasing cuda driver library
time=2025-07-01T19:36:13.315Z level=INFO source=types.go:130 msg="inference compute" id=GPU-d6de3398-9932-6902-11ec-fee8e424c8a2 library=cuda variant=v12 compute=7.5 driver=12.8 name="NVIDIA GeForce RTX 2080 Ti" total="10.6 GiB" available="10.4 GiB"
time=2025-07-01T19:36:13.315Z level=INFO source=types.go:130 msg="inference compute" id=GPU-9abb57639fa80c50 library=rocm variant="" compute=gfx1030 driver=6.3 name=1002:73bf total="16.0 GiB" available="1.3 GiB"
time=2025-07-01T19:36:13.316Z level=INFO source=routes.go:1721 msg="vram-based default context" total_vram="26.6 GiB" default_num_ctx=32768
[GIN] 2025/07/01 - 18:00:09 | 200 | 50.263µs | 100.126.204.152 | HEAD "/"
`,
exp: []InferenceCompute{
expComputes: []InferenceCompute{
{
Library: "cuda",
Variant: "v12",
@@ -276,6 +284,20 @@ time=2025-07-01T19:33:43.162Z level=INFO source=types.go:130 msg="inference comp
VRAM: "16.0 GiB",
},
},
expDefaultCtxLen: 32768,
},
{
name: "missing_default_context",
log: `time=2025-06-30T09:23:07.374-07:00 level=DEBUG source=sched.go:108 msg="starting llm scheduler"
time=2025-06-30T09:23:07.416-07:00 level=INFO source=types.go:130 msg="inference compute" id=0 library=metal variant="" compute="" driver=0.0 name="" total="96.0 GiB" available="96.0 GiB"
time=2025-06-30T09:25:56.197-07:00 level=DEBUG source=ggml.go:155 msg="key not found" key=general.alignment default=32
`,
expComputes: []InferenceCompute{{
Library: "metal",
Driver: "0.0",
VRAM: "96.0 GiB",
}},
expDefaultCtxLen: 0, // No default context line, should return 0
},
}
for _, tt := range tests {
@@ -288,18 +310,21 @@ time=2025-07-01T19:33:43.162Z level=INFO source=types.go:130 msg="inference comp
}
ctx, cancel := context.WithTimeout(t.Context(), 10*time.Millisecond)
defer cancel()
ics, err := GetInferenceComputer(ctx)
info, err := GetInferenceInfo(ctx)
if err != nil {
t.Fatalf(" failed to get inference compute: %v", err)
t.Fatalf("failed to get inference info: %v", err)
}
if !reflect.DeepEqual(ics, tt.exp) {
t.Fatalf("got:\n%#v\nwant:\n%#v", ics, tt.exp)
if !reflect.DeepEqual(info.Computes, tt.expComputes) {
t.Fatalf("computes mismatch\ngot:\n%#v\nwant:\n%#v", info.Computes, tt.expComputes)
}
if info.DefaultContextLength != tt.expDefaultCtxLen {
t.Fatalf("default context length mismatch: got %d, want %d", info.DefaultContextLength, tt.expDefaultCtxLen)
}
})
}
}
func TestGetInferenceComputerTimeout(t *testing.T) {
func TestGetInferenceInfoTimeout(t *testing.T) {
ctx, cancel := context.WithTimeout(t.Context(), 10*time.Millisecond)
defer cancel()
tmpDir := t.TempDir()
@@ -308,7 +333,7 @@ func TestGetInferenceComputerTimeout(t *testing.T) {
if err != nil {
t.Fatalf("failed to write log file %s: %s", serverLogPath, err)
}
_, err = GetInferenceComputer(ctx)
_, err = GetInferenceInfo(ctx)
if err == nil {
t.Fatal("expected timeout")
}

View File

@@ -9,12 +9,12 @@ import (
"strings"
"time"
sqlite3 "github.com/mattn/go-sqlite3"
_ "github.com/mattn/go-sqlite3"
)
// currentSchemaVersion defines the current database schema version.
// Increment this when making schema changes that require migrations.
const currentSchemaVersion = 13
const currentSchemaVersion = 15
// database wraps the SQLite connection.
// SQLite handles its own locking for concurrent access:
@@ -73,7 +73,7 @@ func (db *database) init() error {
agent BOOLEAN NOT NULL DEFAULT 0,
tools BOOLEAN NOT NULL DEFAULT 0,
working_dir TEXT NOT NULL DEFAULT '',
context_length INTEGER NOT NULL DEFAULT 4096,
context_length INTEGER NOT NULL DEFAULT 0,
window_width INTEGER NOT NULL DEFAULT 0,
window_height INTEGER NOT NULL DEFAULT 0,
config_migrated BOOLEAN NOT NULL DEFAULT 0,
@@ -86,6 +86,7 @@ func (db *database) init() error {
think_level TEXT NOT NULL DEFAULT '',
cloud_setting_migrated BOOLEAN NOT NULL DEFAULT 0,
remote TEXT NOT NULL DEFAULT '', -- deprecated
auto_update_enabled BOOLEAN NOT NULL DEFAULT 1,
schema_version INTEGER NOT NULL DEFAULT %d
);
@@ -251,6 +252,18 @@ func (db *database) migrate() error {
return fmt.Errorf("migrate v12 to v13: %w", err)
}
version = 13
case 13:
// change default context_length from 4096 to 0 (VRAM-based tiered defaults)
if err := db.migrateV13ToV14(); err != nil {
return fmt.Errorf("migrate v13 to v14: %w", err)
}
version = 14
case 14:
// add auto_update_enabled column to settings table
if err := db.migrateV14ToV15(); err != nil {
return fmt.Errorf("migrate v14 to v15: %w", err)
}
version = 15
default:
// If we have a version we don't recognize, just set it to current
// This might happen during development
@@ -474,6 +487,37 @@ func (db *database) migrateV12ToV13() error {
return nil
}
// migrateV13ToV14 changes the default context_length from 4096 to 0.
// When context_length is 0, the ollama server uses VRAM-based tiered defaults.
func (db *database) migrateV13ToV14() error {
_, err := db.conn.Exec(`UPDATE settings SET context_length = 0 WHERE context_length = 4096`)
if err != nil {
return fmt.Errorf("update context_length default: %w", err)
}
_, err = db.conn.Exec(`UPDATE settings SET schema_version = 14`)
if err != nil {
return fmt.Errorf("update schema version: %w", err)
}
return nil
}
// migrateV14ToV15 adds the auto_update_enabled column to the settings table
func (db *database) migrateV14ToV15() error {
_, err := db.conn.Exec(`ALTER TABLE settings ADD COLUMN auto_update_enabled BOOLEAN NOT NULL DEFAULT 1`)
if err != nil && !duplicateColumnError(err) {
return fmt.Errorf("add auto_update_enabled column: %w", err)
}
_, err = db.conn.Exec(`UPDATE settings SET schema_version = 15`)
if err != nil {
return fmt.Errorf("update schema version: %w", err)
}
return nil
}
// cleanupOrphanedData removes orphaned records that may exist due to the foreign key bug
func (db *database) cleanupOrphanedData() error {
_, err := db.conn.Exec(`
@@ -504,19 +548,11 @@ func (db *database) cleanupOrphanedData() error {
}
func duplicateColumnError(err error) bool {
if sqlite3Err, ok := err.(sqlite3.Error); ok {
return sqlite3Err.Code == sqlite3.ErrError &&
strings.Contains(sqlite3Err.Error(), "duplicate column name")
}
return false
return err != nil && strings.Contains(err.Error(), "duplicate column name")
}
func columnNotExists(err error) bool {
if sqlite3Err, ok := err.(sqlite3.Error); ok {
return sqlite3Err.Code == sqlite3.ErrError &&
strings.Contains(sqlite3Err.Error(), "no such column")
}
return false
return err != nil && strings.Contains(err.Error(), "no such column")
}
func (db *database) getAllChats() ([]Chat, error) {
@@ -1130,9 +1166,9 @@ func (db *database) getSettings() (Settings, error) {
var s Settings
err := db.conn.QueryRow(`
SELECT expose, survey, browser, models, agent, tools, working_dir, context_length, turbo_enabled, websearch_enabled, selected_model, sidebar_open, think_enabled, think_level
SELECT expose, survey, browser, models, agent, tools, working_dir, context_length, turbo_enabled, websearch_enabled, selected_model, sidebar_open, think_enabled, think_level, auto_update_enabled
FROM settings
`).Scan(&s.Expose, &s.Survey, &s.Browser, &s.Models, &s.Agent, &s.Tools, &s.WorkingDir, &s.ContextLength, &s.TurboEnabled, &s.WebSearchEnabled, &s.SelectedModel, &s.SidebarOpen, &s.ThinkEnabled, &s.ThinkLevel)
`).Scan(&s.Expose, &s.Survey, &s.Browser, &s.Models, &s.Agent, &s.Tools, &s.WorkingDir, &s.ContextLength, &s.TurboEnabled, &s.WebSearchEnabled, &s.SelectedModel, &s.SidebarOpen, &s.ThinkEnabled, &s.ThinkLevel, &s.AutoUpdateEnabled)
if err != nil {
return Settings{}, fmt.Errorf("get settings: %w", err)
}
@@ -1142,9 +1178,9 @@ func (db *database) getSettings() (Settings, error) {
func (db *database) setSettings(s Settings) error {
_, err := db.conn.Exec(`
UPDATE settings
SET expose = ?, survey = ?, browser = ?, models = ?, agent = ?, tools = ?, working_dir = ?, context_length = ?, turbo_enabled = ?, websearch_enabled = ?, selected_model = ?, sidebar_open = ?, think_enabled = ?, think_level = ?
`, s.Expose, s.Survey, s.Browser, s.Models, s.Agent, s.Tools, s.WorkingDir, s.ContextLength, s.TurboEnabled, s.WebSearchEnabled, s.SelectedModel, s.SidebarOpen, s.ThinkEnabled, s.ThinkLevel)
UPDATE settings
SET expose = ?, survey = ?, browser = ?, models = ?, agent = ?, tools = ?, working_dir = ?, context_length = ?, turbo_enabled = ?, websearch_enabled = ?, selected_model = ?, sidebar_open = ?, think_enabled = ?, think_level = ?, auto_update_enabled = ?
`, s.Expose, s.Survey, s.Browser, s.Models, s.Agent, s.Tools, s.WorkingDir, s.ContextLength, s.TurboEnabled, s.WebSearchEnabled, s.SelectedModel, s.SidebarOpen, s.ThinkEnabled, s.ThinkLevel, s.AutoUpdateEnabled)
if err != nil {
return fmt.Errorf("set settings: %w", err)
}

View File

@@ -98,6 +98,43 @@ func TestSchemaMigrations(t *testing.T) {
})
}
func TestMigrationV13ToV14ContextLength(t *testing.T) {
tmpDir := t.TempDir()
dbPath := filepath.Join(tmpDir, "test.db")
db, err := newDatabase(dbPath)
if err != nil {
t.Fatalf("failed to create database: %v", err)
}
defer db.Close()
_, err = db.conn.Exec("UPDATE settings SET context_length = 4096, schema_version = 13")
if err != nil {
t.Fatalf("failed to seed v13 settings row: %v", err)
}
if err := db.migrate(); err != nil {
t.Fatalf("migration from v13 to v14 failed: %v", err)
}
var contextLength int
if err := db.conn.QueryRow("SELECT context_length FROM settings").Scan(&contextLength); err != nil {
t.Fatalf("failed to read context_length: %v", err)
}
if contextLength != 0 {
t.Fatalf("expected context_length to migrate to 0, got %d", contextLength)
}
version, err := db.getSchemaVersion()
if err != nil {
t.Fatalf("failed to get schema version: %v", err)
}
if version != currentSchemaVersion {
t.Fatalf("expected schema version %d, got %d", currentSchemaVersion, version)
}
}
func TestChatDeletionWithCascade(t *testing.T) {
t.Run("chat deletion cascades to related messages", func(t *testing.T) {
tmpDir := t.TempDir()

View File

@@ -166,6 +166,9 @@ type Settings struct {
// SidebarOpen indicates if the chat sidebar is open
SidebarOpen bool
// AutoUpdateEnabled indicates if automatic updates should be downloaded
AutoUpdateEnabled bool
}
type Store struct {

View File

@@ -13,7 +13,7 @@ CREATE TABLE IF NOT EXISTS settings (
agent BOOLEAN NOT NULL DEFAULT 0,
tools BOOLEAN NOT NULL DEFAULT 0,
working_dir TEXT NOT NULL DEFAULT '',
context_length INTEGER NOT NULL DEFAULT 4096,
context_length INTEGER NOT NULL DEFAULT 0,
window_width INTEGER NOT NULL DEFAULT 0,
window_height INTEGER NOT NULL DEFAULT 0,
config_migrated BOOLEAN NOT NULL DEFAULT 0,

View File

@@ -289,10 +289,12 @@ export class InferenceCompute {
}
export class InferenceComputeResponse {
inferenceComputes: InferenceCompute[];
defaultContextLength: number;
constructor(source: any = {}) {
if ('string' === typeof source) source = JSON.parse(source);
this.inferenceComputes = this.convertValues(source["inferenceComputes"], InferenceCompute);
this.defaultContextLength = source["defaultContextLength"];
}
convertValues(a: any, classs: any, asMap: boolean = false): any {
@@ -412,6 +414,7 @@ export class Settings {
ThinkLevel: string;
SelectedModel: string;
SidebarOpen: boolean;
AutoUpdateEnabled: boolean;
constructor(source: any = {}) {
if ('string' === typeof source) source = JSON.parse(source);
@@ -429,6 +432,7 @@ export class Settings {
this.ThinkLevel = source["ThinkLevel"];
this.SelectedModel = source["SelectedModel"];
this.SidebarOpen = source["SidebarOpen"];
this.AutoUpdateEnabled = source["AutoUpdateEnabled"];
}
}
export class SettingsResponse {

View File

@@ -4,7 +4,6 @@ import {
ChatEvent,
DownloadEvent,
ErrorEvent,
InferenceCompute,
InferenceComputeResponse,
ModelCapabilitiesResponse,
Model,
@@ -407,7 +406,7 @@ export async function* pullModel(
}
}
export async function getInferenceCompute(): Promise<InferenceCompute[]> {
export async function getInferenceCompute(): Promise<InferenceComputeResponse> {
const response = await fetch(`${API_BASE}/api/v1/inference-compute`);
if (!response.ok) {
throw new Error(
@@ -416,8 +415,7 @@ export async function getInferenceCompute(): Promise<InferenceCompute[]> {
}
const data = await response.json();
const inferenceComputeResponse = new InferenceComputeResponse(data);
return inferenceComputeResponse.inferenceComputes || [];
return new InferenceComputeResponse(data);
}
export async function fetchHealth(): Promise<boolean> {

View File

@@ -17,7 +17,10 @@ import {
} from "@/hooks/useChats";
import { useNavigate } from "@tanstack/react-router";
import { useSelectedModel } from "@/hooks/useSelectedModel";
import { useHasVisionCapability } from "@/hooks/useModelCapabilities";
import {
useHasVisionCapability,
useHasToolsCapability,
} from "@/hooks/useModelCapabilities";
import { useUser } from "@/hooks/useUser";
import { DisplayLogin } from "@/components/DisplayLogin";
import { ErrorEvent, Message } from "@/gotypes";
@@ -149,12 +152,7 @@ function ChatForm({
} = useSettings();
const { cloudDisabled } = useCloudStatus();
// current supported models for web search
const modelLower = selectedModel?.model.toLowerCase() || "";
const supportsWebSearch =
modelLower.startsWith("gpt-oss") ||
modelLower.startsWith("qwen3") ||
modelLower.startsWith("deepseek-v3");
const supportsWebSearch = useHasToolsCapability(selectedModel?.model);
// Use per-chat thinking level instead of global
const thinkLevel: ThinkingLevel =
settingsThinkLevel === "none" || !settingsThinkLevel

View File

@@ -15,6 +15,7 @@ import {
XMarkIcon,
CogIcon,
ArrowLeftIcon,
ArrowDownTrayIcon,
} from "@heroicons/react/20/solid";
import { Settings as SettingsType } from "@/gotypes";
import { useNavigate } from "@tanstack/react-router";
@@ -26,6 +27,7 @@ import {
type CloudStatusResponse,
updateCloudSetting,
updateSettings,
getInferenceCompute,
} from "@/api";
function AnimatedDots() {
@@ -77,6 +79,13 @@ export default function Settings() {
const settings = settingsData?.settings || null;
const { data: inferenceComputeResponse } = useQuery({
queryKey: ["inferenceCompute"],
queryFn: getInferenceCompute,
});
const defaultContextLength = inferenceComputeResponse?.defaultContextLength;
const updateSettingsMutation = useMutation({
mutationFn: updateSettings,
onSuccess: () => {
@@ -204,7 +213,7 @@ export default function Settings() {
Models: "",
Agent: false,
Tools: false,
ContextLength: 4096,
ContextLength: 0,
});
updateSettingsMutation.mutate(defaultSettings);
}
@@ -432,6 +441,29 @@ export default function Settings() {
</div>
</Field>
{/* Auto Update */}
<Field>
<div className="flex items-start justify-between gap-4">
<div className="flex items-start space-x-3 flex-1">
<ArrowDownTrayIcon className="mt-1 h-5 w-5 flex-shrink-0 text-black dark:text-neutral-100" />
<div>
<Label>Auto-download updates</Label>
<Description>
{settings.AutoUpdateEnabled
? "Automatically download updates when available."
: "Updates will not be downloaded automatically."}
</Description>
</div>
</div>
<div className="flex-shrink-0">
<Switch
checked={settings.AutoUpdateEnabled}
onChange={(checked) => handleChange("AutoUpdateEnabled", checked)}
/>
</div>
</div>
</Field>
{/* Expose Ollama */}
<Field>
<div className="flex items-start justify-between gap-4">
@@ -507,13 +539,11 @@ export default function Settings() {
</Description>
<div className="mt-3">
<Slider
value={(() => {
// Otherwise use the settings value
return settings.ContextLength || 4096;
})()}
value={settings.ContextLength || defaultContextLength || 0}
onChange={(value) => {
handleChange("ContextLength", value);
}}
disabled={!defaultContextLength}
options={[
{ value: 4096, label: "4k" },
{ value: 8192, label: "8k" },

View File

@@ -6,10 +6,11 @@ export interface SliderProps {
value?: number;
onChange?: (value: number) => void;
className?: string;
disabled?: boolean;
}
const Slider = React.forwardRef<HTMLDivElement, SliderProps>(
({ label, options, value = 0, onChange }, ref) => {
({ label, options, value = 0, onChange, disabled = false }, ref) => {
const [selectedValue, setSelectedValue] = React.useState(value);
const [isDragging, setIsDragging] = React.useState(false);
const containerRef = React.useRef<HTMLDivElement>(null);
@@ -20,6 +21,7 @@ const Slider = React.forwardRef<HTMLDivElement, SliderProps>(
}, [value]);
const handleClick = (optionValue: number) => {
if (disabled) return;
setSelectedValue(optionValue);
onChange?.(optionValue);
};
@@ -39,6 +41,7 @@ const Slider = React.forwardRef<HTMLDivElement, SliderProps>(
};
const handleMouseDown = (e: React.MouseEvent) => {
if (disabled) return;
setIsDragging(true);
e.preventDefault();
};
@@ -77,7 +80,7 @@ const Slider = React.forwardRef<HTMLDivElement, SliderProps>(
}
return (
<div className="space-y-2" ref={ref}>
<div className={`space-y-2 ${disabled ? "opacity-50" : ""}`} ref={ref}>
{label && <label className="text-sm font-medium">{label}</label>}
<div className="relative">
<div className="absolute top-[9px] left-2 right-2 h-1 bg-neutral-200 dark:bg-neutral-700 pointer-events-none rounded-full" />
@@ -88,10 +91,11 @@ const Slider = React.forwardRef<HTMLDivElement, SliderProps>(
<button
onClick={() => handleClick(option.value)}
onMouseDown={handleMouseDown}
className="relative px-3 py-6 -mx-3 -my-6 z-10 cursor-pointer"
disabled={disabled}
className={`relative px-3 py-6 -mx-3 -my-6 z-10 ${disabled ? "cursor-not-allowed" : "cursor-pointer"}`}
>
<div className="relative w-5 h-5 flex items-center justify-center">
{selectedValue === option.value && (
{selectedValue === option.value && !disabled && (
<div className="w-4 h-4 bg-white dark:bg-white border border-neutral-400 dark:border-neutral-500 rounded-full cursor-grab active:cursor-grabbing" />
)}
</div>

View File

@@ -20,3 +20,8 @@ export function useHasVisionCapability(modelName: string | undefined) {
const { data: capabilitiesResponse } = useModelCapabilities(modelName);
return capabilitiesResponse?.capabilities?.includes("vision") ?? false;
}
export function useHasToolsCapability(modelName: string | undefined) {
const { data: capabilitiesResponse } = useModelCapabilities(modelName);
return capabilitiesResponse?.capabilities?.includes("tools") ?? false;
}

View File

@@ -28,12 +28,14 @@ export function useSelectedModel(currentChatId?: string, searchQuery?: string) {
currentChatId && currentChatId !== "new" ? currentChatId : "",
);
const { data: inferenceComputes = [] } = useQuery({
queryKey: ["inference-compute"],
const { data: inferenceComputeResponse } = useQuery({
queryKey: ["inferenceCompute"],
queryFn: getInferenceCompute,
enabled: !settings.selectedModel, // Only fetch if no model is selected
});
const inferenceComputes = inferenceComputeResponse?.inferenceComputes || [];
const totalVRAM = useMemo(
() => getTotalVRAM(inferenceComputes),
[inferenceComputes],

View File

@@ -45,7 +45,8 @@ type InferenceCompute struct {
}
type InferenceComputeResponse struct {
InferenceComputes []InferenceCompute `json:"inferenceComputes"`
InferenceComputes []InferenceCompute `json:"inferenceComputes"`
DefaultContextLength int `json:"defaultContextLength"`
}
type ModelCapabilitiesResponse struct {

View File

@@ -28,6 +28,7 @@ import (
"github.com/ollama/ollama/app/tools"
"github.com/ollama/ollama/app/types/not"
"github.com/ollama/ollama/app/ui/responses"
"github.com/ollama/ollama/app/updater"
"github.com/ollama/ollama/app/version"
ollamaAuth "github.com/ollama/ollama/auth"
"github.com/ollama/ollama/envconfig"
@@ -106,6 +107,10 @@ type Server struct {
// Dev is true if the server is running in development mode
Dev bool
// Updater for checking and downloading updates
Updater *updater.Updater
UpdateAvailableFunc func()
}
func (s *Server) log() *slog.Logger {
@@ -829,8 +834,9 @@ func (s *Server) chat(w http.ResponseWriter, r *http.Request) error {
if !hasAttachments {
WebSearchEnabled := req.WebSearch != nil && *req.WebSearch
hasToolsCapability := slices.Contains(details.Capabilities, model.CapabilityTools)
if WebSearchEnabled {
if WebSearchEnabled && hasToolsCapability {
if supportsBrowserTools(req.Model) {
browserState, ok := s.browserState(chat)
if !ok {
@@ -840,7 +846,7 @@ func (s *Server) chat(w http.ResponseWriter, r *http.Request) error {
registry.Register(tools.NewBrowserSearch(browser))
registry.Register(tools.NewBrowserOpen(browser))
registry.Register(tools.NewBrowserFind(browser))
} else if supportsWebSearchTools(req.Model) {
} else {
registry.Register(&tools.WebSearch{})
registry.Register(&tools.WebFetch{})
}
@@ -1420,11 +1426,6 @@ func (s *Server) getSettings(w http.ResponseWriter, r *http.Request) error {
settings.Models = envconfig.Models()
}
// set default context length if not set
if settings.ContextLength == 0 {
settings.ContextLength = 4096
}
// Include current runtime settings
settings.Agent = s.Agent
settings.Tools = s.Tools
@@ -1451,6 +1452,24 @@ func (s *Server) settings(w http.ResponseWriter, r *http.Request) error {
return fmt.Errorf("failed to save settings: %w", err)
}
// Handle auto-update toggle changes
if old.AutoUpdateEnabled != settings.AutoUpdateEnabled {
if !settings.AutoUpdateEnabled {
// Auto-update disabled: cancel any ongoing download
if s.Updater != nil {
s.Updater.CancelOngoingDownload()
}
} else {
// Auto-update re-enabled: show notification if update is already staged, or trigger immediate check
if (updater.IsUpdatePending() || updater.UpdateDownloaded) && s.UpdateAvailableFunc != nil {
s.UpdateAvailableFunc()
} else if s.Updater != nil {
// Trigger the background checker to run immediately
s.Updater.TriggerImmediateCheck()
}
}
}
if old.ContextLength != settings.ContextLength ||
old.Models != settings.Models ||
old.Expose != settings.Expose {
@@ -1500,14 +1519,14 @@ func (s *Server) writeCloudStatus(w http.ResponseWriter) error {
func (s *Server) getInferenceCompute(w http.ResponseWriter, r *http.Request) error {
ctx, cancel := context.WithTimeout(r.Context(), 500*time.Millisecond)
defer cancel()
serverInferenceComputes, err := server.GetInferenceComputer(ctx)
info, err := server.GetInferenceInfo(ctx)
if err != nil {
s.log().Error("failed to get inference compute", "error", err)
return fmt.Errorf("failed to get inference compute: %w", err)
s.log().Error("failed to get inference info", "error", err)
return fmt.Errorf("failed to get inference info: %w", err)
}
inferenceComputes := make([]responses.InferenceCompute, len(serverInferenceComputes))
for i, ic := range serverInferenceComputes {
inferenceComputes := make([]responses.InferenceCompute, len(info.Computes))
for i, ic := range info.Computes {
inferenceComputes[i] = responses.InferenceCompute{
Library: ic.Library,
Variant: ic.Variant,
@@ -1519,7 +1538,8 @@ func (s *Server) getInferenceCompute(w http.ResponseWriter, r *http.Request) err
}
response := responses.InferenceComputeResponse{
InferenceComputes: inferenceComputes,
InferenceComputes: inferenceComputes,
DefaultContextLength: info.DefaultContextLength,
}
w.Header().Set("Content-Type", "application/json")
@@ -1652,17 +1672,6 @@ func supportsBrowserTools(model string) bool {
return strings.HasPrefix(strings.ToLower(model), "gpt-oss")
}
// Web search tools are simpler, providing only basic web search and fetch capabilities (e.g., "web_search", "web_fetch") without simulating a browser. Currently only qwen3 and deepseek-v3 support web search tools.
func supportsWebSearchTools(model string) bool {
model = strings.ToLower(model)
prefixes := []string{"qwen3", "deepseek-v3"}
for _, p := range prefixes {
if strings.HasPrefix(model, p) {
return true
}
}
return false
}
// buildChatRequest converts store.Chat to api.ChatRequest
func (s *Server) buildChatRequest(chat *store.Chat, model string, think any, availableTools []map[string]any) (*api.ChatRequest, error) {

View File

@@ -4,6 +4,7 @@ package ui
import (
"bytes"
"context"
"encoding/json"
"io"
"net/http"
@@ -11,9 +12,11 @@ import (
"path/filepath"
"runtime"
"strings"
"sync/atomic"
"testing"
"github.com/ollama/ollama/app/store"
"github.com/ollama/ollama/app/updater"
)
func TestHandlePostApiSettings(t *testing.T) {
@@ -522,3 +525,290 @@ func TestUserAgentTransport(t *testing.T) {
t.Logf("User-Agent transport successfully set: %s", receivedUA)
}
func TestSupportsBrowserTools(t *testing.T) {
tests := []struct {
model string
want bool
}{
{"gpt-oss", true},
{"gpt-oss-latest", true},
{"GPT-OSS", true},
{"Gpt-Oss-v2", true},
{"qwen3", false},
{"deepseek-v3", false},
{"llama3.3", false},
{"", false},
}
for _, tt := range tests {
t.Run(tt.model, func(t *testing.T) {
if got := supportsBrowserTools(tt.model); got != tt.want {
t.Errorf("supportsBrowserTools(%q) = %v, want %v", tt.model, got, tt.want)
}
})
}
}
func TestWebSearchToolRegistration(t *testing.T) {
// Validates that the capability-gating logic in chat() correctly
// decides which tools to register based on model capabilities and
// the web search flag.
tests := []struct {
name string
webSearchEnabled bool
hasToolsCap bool
model string
wantBrowser bool // expects browser tools (gpt-oss)
wantWebSearch bool // expects basic web search/fetch tools
wantNone bool // expects no tools registered
}{
{
name: "web search enabled with tools capability - browser model",
webSearchEnabled: true,
hasToolsCap: true,
model: "gpt-oss-latest",
wantBrowser: true,
},
{
name: "web search enabled with tools capability - non-browser model",
webSearchEnabled: true,
hasToolsCap: true,
model: "qwen3",
wantWebSearch: true,
},
{
name: "web search enabled without tools capability",
webSearchEnabled: true,
hasToolsCap: false,
model: "llama3.3",
wantNone: true,
},
{
name: "web search disabled with tools capability",
webSearchEnabled: false,
hasToolsCap: true,
model: "qwen3",
wantNone: true,
},
{
name: "web search disabled without tools capability",
webSearchEnabled: false,
hasToolsCap: false,
model: "llama3.3",
wantNone: true,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
// Replicate the decision logic from chat() handler
gotBrowser := false
gotWebSearch := false
if tt.webSearchEnabled && tt.hasToolsCap {
if supportsBrowserTools(tt.model) {
gotBrowser = true
} else {
gotWebSearch = true
}
}
if tt.wantBrowser && !gotBrowser {
t.Error("expected browser tools to be registered")
}
if tt.wantWebSearch && !gotWebSearch {
t.Error("expected web search tools to be registered")
}
if tt.wantNone && (gotBrowser || gotWebSearch) {
t.Error("expected no tools to be registered")
}
if !tt.wantBrowser && gotBrowser {
t.Error("unexpected browser tools registered")
}
if !tt.wantWebSearch && gotWebSearch {
t.Error("unexpected web search tools registered")
}
})
}
}
func TestSettingsToggleAutoUpdateOff_CancelsDownload(t *testing.T) {
testStore := &store.Store{
DBPath: filepath.Join(t.TempDir(), "db.sqlite"),
}
defer testStore.Close()
// Start with auto-update enabled
settings, err := testStore.Settings()
if err != nil {
t.Fatal(err)
}
settings.AutoUpdateEnabled = true
if err := testStore.SetSettings(settings); err != nil {
t.Fatal(err)
}
upd := &updater.Updater{Store: &store.Store{
DBPath: filepath.Join(t.TempDir(), "db2.sqlite"),
}}
defer upd.Store.Close()
// We can't easily mock CancelOngoingDownload, but we can verify
// that the full settings handler flow completes without error
server := &Server{
Store: testStore,
Restart: func() {},
Updater: upd,
}
// Disable auto-update via settings API
settings.AutoUpdateEnabled = false
body, err := json.Marshal(settings)
if err != nil {
t.Fatal(err)
}
req := httptest.NewRequest("POST", "/api/v1/settings", bytes.NewReader(body))
req.Header.Set("Content-Type", "application/json")
rr := httptest.NewRecorder()
if err := server.settings(rr, req); err != nil {
t.Fatalf("settings() error = %v", err)
}
if rr.Code != http.StatusOK {
t.Fatalf("settings() status = %d, want %d", rr.Code, http.StatusOK)
}
// Verify settings were saved with auto-update disabled
saved, err := testStore.Settings()
if err != nil {
t.Fatal(err)
}
if saved.AutoUpdateEnabled {
t.Fatal("expected AutoUpdateEnabled to be false after toggle off")
}
}
func TestSettingsToggleAutoUpdateOn_WithPendingUpdate_ShowsNotification(t *testing.T) {
testStore := &store.Store{
DBPath: filepath.Join(t.TempDir(), "db.sqlite"),
}
defer testStore.Close()
// Start with auto-update disabled
settings, err := testStore.Settings()
if err != nil {
t.Fatal(err)
}
settings.AutoUpdateEnabled = false
if err := testStore.SetSettings(settings); err != nil {
t.Fatal(err)
}
// Simulate that an update was previously downloaded
oldVal := updater.UpdateDownloaded
updater.UpdateDownloaded = true
defer func() { updater.UpdateDownloaded = oldVal }()
var notificationCalled atomic.Bool
server := &Server{
Store: testStore,
Restart: func() {},
UpdateAvailableFunc: func() {
notificationCalled.Store(true)
},
}
// Re-enable auto-update via settings API
settings.AutoUpdateEnabled = true
body, err := json.Marshal(settings)
if err != nil {
t.Fatal(err)
}
req := httptest.NewRequest("POST", "/api/v1/settings", bytes.NewReader(body))
req.Header.Set("Content-Type", "application/json")
rr := httptest.NewRecorder()
if err := server.settings(rr, req); err != nil {
t.Fatalf("settings() error = %v", err)
}
if rr.Code != http.StatusOK {
t.Fatalf("settings() status = %d, want %d", rr.Code, http.StatusOK)
}
if !notificationCalled.Load() {
t.Fatal("expected UpdateAvailableFunc to be called when re-enabling with a downloaded update")
}
}
func TestSettingsToggleAutoUpdateOn_NoPendingUpdate_TriggersCheck(t *testing.T) {
testStore := &store.Store{
DBPath: filepath.Join(t.TempDir(), "db.sqlite"),
}
defer testStore.Close()
// Start with auto-update disabled
settings, err := testStore.Settings()
if err != nil {
t.Fatal(err)
}
settings.AutoUpdateEnabled = false
if err := testStore.SetSettings(settings); err != nil {
t.Fatal(err)
}
// Ensure no pending update - clear both the downloaded flag and the stage dir
oldVal := updater.UpdateDownloaded
updater.UpdateDownloaded = false
defer func() { updater.UpdateDownloaded = oldVal }()
oldStageDir := updater.UpdateStageDir
updater.UpdateStageDir = t.TempDir() // empty dir means IsUpdatePending() returns false
defer func() { updater.UpdateStageDir = oldStageDir }()
upd := &updater.Updater{Store: &store.Store{
DBPath: filepath.Join(t.TempDir(), "db2.sqlite"),
}}
defer upd.Store.Close()
// Start the background checker so the checkNow channel is initialized;
// otherwise TriggerImmediateCheck would see a nil channel and do nothing
ctx, cancel := context.WithCancel(t.Context())
upd.StartBackgroundUpdaterChecker(ctx, func(string) error { return nil })
defer cancel()
var notificationCalled atomic.Bool
server := &Server{
Store: testStore,
Restart: func() {},
Updater: upd,
UpdateAvailableFunc: func() {
notificationCalled.Store(true)
},
}
// Re-enable auto-update via settings API
settings.AutoUpdateEnabled = true
body, err := json.Marshal(settings)
if err != nil {
t.Fatal(err)
}
req := httptest.NewRequest("POST", "/api/v1/settings", bytes.NewReader(body))
req.Header.Set("Content-Type", "application/json")
rr := httptest.NewRecorder()
if err := server.settings(rr, req); err != nil {
t.Fatalf("settings() error = %v", err)
}
if rr.Code != http.StatusOK {
t.Fatalf("settings() status = %d, want %d", rr.Code, http.StatusOK)
}
// UpdateAvailableFunc should NOT be called since there's no pending update
if notificationCalled.Load() {
t.Fatal("UpdateAvailableFunc should not be called when there is no pending update")
}
}

View File

@@ -19,6 +19,7 @@ import (
"runtime"
"strconv"
"strings"
"sync"
"time"
"github.com/ollama/ollama/app/store"
@@ -58,7 +59,8 @@ func (u *Updater) checkForUpdate(ctx context.Context) (bool, UpdateResponse) {
query := requestURL.Query()
query.Add("os", runtime.GOOS)
query.Add("arch", runtime.GOARCH)
query.Add("version", version.Version)
currentVersion := version.Version
query.Add("version", currentVersion)
query.Add("ts", strconv.FormatInt(time.Now().Unix(), 10))
// The original macOS app used to use the device ID
@@ -131,15 +133,27 @@ func (u *Updater) checkForUpdate(ctx context.Context) (bool, UpdateResponse) {
}
func (u *Updater) DownloadNewRelease(ctx context.Context, updateResp UpdateResponse) error {
// Create a cancellable context for this download
downloadCtx, cancel := context.WithCancel(ctx)
u.cancelDownloadLock.Lock()
u.cancelDownload = cancel
u.cancelDownloadLock.Unlock()
defer func() {
u.cancelDownloadLock.Lock()
u.cancelDownload = nil
u.cancelDownloadLock.Unlock()
cancel()
}()
// Do a head first to check etag info
req, err := http.NewRequestWithContext(ctx, http.MethodHead, updateResp.UpdateURL, nil)
req, err := http.NewRequestWithContext(downloadCtx, http.MethodHead, updateResp.UpdateURL, nil)
if err != nil {
return err
}
// In case of slow downloads, continue the update check in the background
bgctx, cancel := context.WithCancel(ctx)
defer cancel()
bgctx, bgcancel := context.WithCancel(downloadCtx)
defer bgcancel()
go func() {
for {
select {
@@ -176,6 +190,7 @@ func (u *Updater) DownloadNewRelease(ctx context.Context, updateResp UpdateRespo
_, err = os.Stat(stageFilename)
if err == nil {
slog.Info("update already downloaded", "bundle", stageFilename)
UpdateDownloaded = true
return nil
}
@@ -244,33 +259,84 @@ func cleanupOldDownloads(stageDir string) {
}
type Updater struct {
Store *store.Store
Store *store.Store
cancelDownload context.CancelFunc
cancelDownloadLock sync.Mutex
checkNow chan struct{}
}
// CancelOngoingDownload cancels any currently running download
func (u *Updater) CancelOngoingDownload() {
u.cancelDownloadLock.Lock()
defer u.cancelDownloadLock.Unlock()
if u.cancelDownload != nil {
slog.Info("cancelling ongoing update download")
u.cancelDownload()
u.cancelDownload = nil
}
}
// TriggerImmediateCheck signals the background checker to check for updates immediately
func (u *Updater) TriggerImmediateCheck() {
if u.checkNow != nil {
select {
case u.checkNow <- struct{}{}:
default:
// Check already pending, no need to queue another
}
}
}
func (u *Updater) StartBackgroundUpdaterChecker(ctx context.Context, cb func(string) error) {
u.checkNow = make(chan struct{}, 1)
go func() {
// Don't blast an update message immediately after startup
time.Sleep(UpdateCheckInitialDelay)
slog.Info("beginning update checker", "interval", UpdateCheckInterval)
ticker := time.NewTicker(UpdateCheckInterval)
defer ticker.Stop()
for {
available, resp := u.checkForUpdate(ctx)
if available {
err := u.DownloadNewRelease(ctx, resp)
if err != nil {
slog.Error(fmt.Sprintf("failed to download new release: %s", err))
} else {
err = cb(resp.UpdateVersion)
if err != nil {
slog.Warn(fmt.Sprintf("failed to register update available with tray: %s", err))
}
}
}
select {
case <-ctx.Done():
slog.Debug("stopping background update checker")
return
default:
time.Sleep(UpdateCheckInterval)
case <-u.checkNow:
// Immediate check triggered
case <-ticker.C:
// Regular interval check
}
// Always check for updates
available, resp := u.checkForUpdate(ctx)
if !available {
continue
}
// Update is available - check if auto-update is enabled for downloading
settings, err := u.Store.Settings()
if err != nil {
slog.Error("failed to load settings", "error", err)
continue
}
if !settings.AutoUpdateEnabled {
// Auto-update disabled - don't download, just log
slog.Debug("update available but auto-update disabled", "version", resp.UpdateVersion)
continue
}
// Auto-update is enabled - download
err = u.DownloadNewRelease(ctx, resp)
if err != nil {
slog.Error("failed to download new release", "error", err)
continue
}
// Download successful - show the tray notification
err = cb(resp.UpdateVersion)
if err != nil {
slog.Warn("failed to register update available with tray", "error", err)
}
}
}()

View File

@@ -11,6 +11,8 @@ import (
"log/slog"
"net/http"
"net/http/httptest"
"path/filepath"
"sync/atomic"
"testing"
"time"
@@ -33,7 +35,7 @@ func TestIsNewReleaseAvailable(t *testing.T) {
defer server.Close()
slog.Debug("server", "url", server.URL)
updater := &Updater{Store: &store.Store{}}
updater := &Updater{Store: &store.Store{DBPath: filepath.Join(t.TempDir(), "test.db")}}
defer updater.Store.Close() // Ensure database is closed
UpdateCheckURLBase = server.URL + "/update.json"
updatePresent, resp := updater.checkForUpdate(t.Context())
@@ -84,8 +86,18 @@ func TestBackgoundChecker(t *testing.T) {
defer server.Close()
UpdateCheckURLBase = server.URL + "/update.json"
updater := &Updater{Store: &store.Store{}}
defer updater.Store.Close() // Ensure database is closed
updater := &Updater{Store: &store.Store{DBPath: filepath.Join(t.TempDir(), "test.db")}}
defer updater.Store.Close()
settings, err := updater.Store.Settings()
if err != nil {
t.Fatal(err)
}
settings.AutoUpdateEnabled = true
if err := updater.Store.SetSettings(settings); err != nil {
t.Fatal(err)
}
updater.StartBackgroundUpdaterChecker(ctx, cb)
select {
case <-stallTimer.C:
@@ -99,3 +111,264 @@ func TestBackgoundChecker(t *testing.T) {
}
}
}
func TestAutoUpdateDisabledSkipsDownload(t *testing.T) {
UpdateStageDir = t.TempDir()
var downloadAttempted atomic.Bool
done := make(chan struct{})
ctx, cancel := context.WithCancel(t.Context())
defer cancel()
UpdateCheckInitialDelay = 5 * time.Millisecond
UpdateCheckInterval = 5 * time.Millisecond
VerifyDownload = func() error {
return nil
}
var server *httptest.Server
server = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if r.URL.Path == "/update.json" {
w.Write([]byte(
fmt.Sprintf(`{"version": "9.9.9", "url": "%s"}`,
server.URL+"/9.9.9/"+Installer)))
} else if r.URL.Path == "/9.9.9/"+Installer {
downloadAttempted.Store(true)
buf := &bytes.Buffer{}
zw := zip.NewWriter(buf)
zw.Close()
io.Copy(w, buf)
}
}))
defer server.Close()
UpdateCheckURLBase = server.URL + "/update.json"
updater := &Updater{Store: &store.Store{DBPath: filepath.Join(t.TempDir(), "test.db")}}
defer updater.Store.Close()
// Ensure auto-update is disabled
settings, err := updater.Store.Settings()
if err != nil {
t.Fatal(err)
}
settings.AutoUpdateEnabled = false
if err := updater.Store.SetSettings(settings); err != nil {
t.Fatal(err)
}
cb := func(ver string) error {
t.Fatal("callback should not be called when auto-update is disabled")
return nil
}
updater.StartBackgroundUpdaterChecker(ctx, cb)
// Wait enough time for multiple check cycles
time.Sleep(50 * time.Millisecond)
close(done)
if downloadAttempted.Load() {
t.Fatal("download should not be attempted when auto-update is disabled")
}
}
func TestAutoUpdateReenabledDownloadsUpdate(t *testing.T) {
UpdateStageDir = t.TempDir()
var downloadAttempted atomic.Bool
callbackCalled := make(chan struct{}, 1)
ctx, cancel := context.WithCancel(t.Context())
defer cancel()
UpdateCheckInitialDelay = 5 * time.Millisecond
UpdateCheckInterval = 5 * time.Millisecond
VerifyDownload = func() error {
return nil
}
var server *httptest.Server
server = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if r.URL.Path == "/update.json" {
w.Write([]byte(
fmt.Sprintf(`{"version": "9.9.9", "url": "%s"}`,
server.URL+"/9.9.9/"+Installer)))
} else if r.URL.Path == "/9.9.9/"+Installer {
downloadAttempted.Store(true)
buf := &bytes.Buffer{}
zw := zip.NewWriter(buf)
zw.Close()
io.Copy(w, buf)
}
}))
defer server.Close()
UpdateCheckURLBase = server.URL + "/update.json"
upd := &Updater{Store: &store.Store{DBPath: filepath.Join(t.TempDir(), "test.db")}}
defer upd.Store.Close()
// Start with auto-update disabled
settings, err := upd.Store.Settings()
if err != nil {
t.Fatal(err)
}
settings.AutoUpdateEnabled = false
if err := upd.Store.SetSettings(settings); err != nil {
t.Fatal(err)
}
cb := func(ver string) error {
select {
case callbackCalled <- struct{}{}:
default:
}
return nil
}
upd.StartBackgroundUpdaterChecker(ctx, cb)
// Wait for a few cycles with auto-update disabled - no download should happen
time.Sleep(50 * time.Millisecond)
if downloadAttempted.Load() {
t.Fatal("download should not happen while auto-update is disabled")
}
// Re-enable auto-update
settings.AutoUpdateEnabled = true
if err := upd.Store.SetSettings(settings); err != nil {
t.Fatal(err)
}
// Wait for the checker to pick it up and download
select {
case <-callbackCalled:
// Success: download happened and callback was called after re-enabling
if !downloadAttempted.Load() {
t.Fatal("expected download to be attempted after re-enabling")
}
case <-time.After(5 * time.Second):
t.Fatal("expected download and callback after re-enabling auto-update")
}
}
func TestCancelOngoingDownload(t *testing.T) {
UpdateStageDir = t.TempDir()
downloadStarted := make(chan struct{})
downloadCancelled := make(chan struct{})
ctx := t.Context()
VerifyDownload = func() error {
return nil
}
var server *httptest.Server
server = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if r.URL.Path == "/update.json" {
w.Write([]byte(
fmt.Sprintf(`{"version": "9.9.9", "url": "%s"}`,
server.URL+"/9.9.9/"+Installer)))
} else if r.URL.Path == "/9.9.9/"+Installer {
if r.Method == http.MethodHead {
w.Header().Set("Content-Length", "1000000")
w.WriteHeader(http.StatusOK)
return
}
// Signal that download has started
close(downloadStarted)
// Wait for cancellation or timeout
select {
case <-r.Context().Done():
close(downloadCancelled)
return
case <-time.After(5 * time.Second):
t.Error("download was not cancelled in time")
}
}
}))
defer server.Close()
UpdateCheckURLBase = server.URL + "/update.json"
updater := &Updater{Store: &store.Store{DBPath: filepath.Join(t.TempDir(), "test.db")}}
defer updater.Store.Close()
_, resp := updater.checkForUpdate(ctx)
// Start download in goroutine
go func() {
_ = updater.DownloadNewRelease(ctx, resp)
}()
// Wait for download to start
select {
case <-downloadStarted:
case <-time.After(2 * time.Second):
t.Fatal("download did not start in time")
}
// Cancel the download
updater.CancelOngoingDownload()
// Verify cancellation was received
select {
case <-downloadCancelled:
// Success
case <-time.After(2 * time.Second):
t.Fatal("download cancellation was not received by server")
}
}
func TestTriggerImmediateCheck(t *testing.T) {
UpdateStageDir = t.TempDir()
checkCount := atomic.Int32{}
checkDone := make(chan struct{}, 10)
ctx, cancel := context.WithCancel(t.Context())
defer cancel()
// Set a very long interval so only TriggerImmediateCheck causes checks
UpdateCheckInitialDelay = 1 * time.Millisecond
UpdateCheckInterval = 1 * time.Hour
VerifyDownload = func() error {
return nil
}
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if r.URL.Path == "/update.json" {
checkCount.Add(1)
select {
case checkDone <- struct{}{}:
default:
}
// Return no update available
w.WriteHeader(http.StatusNoContent)
}
}))
defer server.Close()
UpdateCheckURLBase = server.URL + "/update.json"
updater := &Updater{Store: &store.Store{DBPath: filepath.Join(t.TempDir(), "test.db")}}
defer updater.Store.Close()
cb := func(ver string) error {
return nil
}
updater.StartBackgroundUpdaterChecker(ctx, cb)
// Wait for goroutine to start and pass initial delay
time.Sleep(10 * time.Millisecond)
// With 1 hour interval, no check should have happened yet
initialCount := checkCount.Load()
// Trigger immediate check
updater.TriggerImmediateCheck()
// Wait for the triggered check
select {
case <-checkDone:
case <-time.After(2 * time.Second):
t.Fatal("triggered check did not happen")
}
finalCount := checkCount.Load()
if finalCount <= initialCount {
t.Fatalf("TriggerImmediateCheck did not cause additional check: initial=%d, final=%d", initialCount, finalCount)
}
}

View File

@@ -369,25 +369,6 @@ func (t *winTray) addSeparatorMenuItem(menuItemId, parentId uint32) error {
return nil
}
// func (t *winTray) hideMenuItem(menuItemId, parentId uint32) error {
// const ERROR_SUCCESS syscall.Errno = 0
// t.muMenus.RLock()
// menu := uintptr(t.menus[parentId])
// t.muMenus.RUnlock()
// res, _, err := pRemoveMenu.Call(
// menu,
// uintptr(menuItemId),
// MF_BYCOMMAND,
// )
// if res == 0 && err.(syscall.Errno) != ERROR_SUCCESS {
// return err
// }
// t.delFromVisibleItems(parentId, menuItemId)
// return nil
// }
func (t *winTray) showMenu() error {
p := point{}
boolRet, _, err := pGetCursorPos.Call(uintptr(unsafe.Pointer(&p)))

View File

@@ -51,7 +51,6 @@ const (
IMAGE_ICON = 1 // Loads an icon
LR_DEFAULTSIZE = 0x00000040 // Loads default-size icon for windows(SM_CXICON x SM_CYICON) if cx, cy are set to zero
LR_LOADFROMFILE = 0x00000010 // Loads the stand-alone image from the file
MF_BYCOMMAND = 0x00000000
MFS_DISABLED = 0x00000003
MFT_SEPARATOR = 0x00000800
MFT_STRING = 0x00000000

View File

@@ -1956,6 +1956,10 @@ func runInteractiveTUI(cmd *cobra.Command) {
}
launchIntegration := func(name string) bool {
if err := config.EnsureInstalled(name); err != nil {
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
return true
}
// If not configured or model no longer exists, prompt for model selection
configuredModel := config.IntegrationModel(name)
if configuredModel == "" || !config.ModelExists(cmd.Context(), configuredModel) || config.IsCloudModelDisabled(cmd.Context(), configuredModel) {

View File

@@ -6,6 +6,7 @@ import (
"os/exec"
"strings"
"github.com/ollama/ollama/envconfig"
"golang.org/x/mod/semver"
)
@@ -32,6 +33,10 @@ func (c *Codex) Run(model string, args []string) error {
cmd.Stdin = os.Stdin
cmd.Stdout = os.Stdout
cmd.Stderr = os.Stderr
cmd.Env = append(os.Environ(),
"OPENAI_BASE_URL="+envconfig.Host().String()+"/v1/",
"OPENAI_API_KEY=ollama",
)
return cmd.Run()
}

View File

@@ -15,8 +15,9 @@ import (
)
type integration struct {
Models []string `json:"models"`
Aliases map[string]string `json:"aliases,omitempty"`
Models []string `json:"models"`
Aliases map[string]string `json:"aliases,omitempty"`
Onboarded bool `json:"onboarded,omitempty"`
}
type config struct {
@@ -139,34 +140,54 @@ func SaveIntegration(appName string, models []string) error {
key := strings.ToLower(appName)
existing := cfg.Integrations[key]
var aliases map[string]string
if existing != nil && existing.Aliases != nil {
var onboarded bool
if existing != nil {
aliases = existing.Aliases
onboarded = existing.Onboarded
}
cfg.Integrations[key] = &integration{
Models: models,
Aliases: aliases,
Models: models,
Aliases: aliases,
Onboarded: onboarded,
}
return save(cfg)
}
// integrationOnboarded marks an integration as onboarded in ollama's config.
func integrationOnboarded(appName string) error {
cfg, err := load()
if err != nil {
return err
}
key := strings.ToLower(appName)
existing := cfg.Integrations[key]
if existing == nil {
existing = &integration{}
}
existing.Onboarded = true
cfg.Integrations[key] = existing
return save(cfg)
}
// IntegrationModel returns the first configured model for an integration, or empty string if not configured.
func IntegrationModel(appName string) string {
ic, err := loadIntegration(appName)
if err != nil || len(ic.Models) == 0 {
integrationConfig, err := loadIntegration(appName)
if err != nil || len(integrationConfig.Models) == 0 {
return ""
}
return ic.Models[0]
return integrationConfig.Models[0]
}
// IntegrationModels returns all configured models for an integration, or nil.
func IntegrationModels(appName string) []string {
ic, err := loadIntegration(appName)
if err != nil || len(ic.Models) == 0 {
integrationConfig, err := loadIntegration(appName)
if err != nil || len(integrationConfig.Models) == 0 {
return nil
}
return ic.Models
return integrationConfig.Models
}
// LastModel returns the last model that was run, or empty string if none.
@@ -234,12 +255,12 @@ func loadIntegration(appName string) (*integration, error) {
return nil, err
}
ic, ok := cfg.Integrations[strings.ToLower(appName)]
integrationConfig, ok := cfg.Integrations[strings.ToLower(appName)]
if !ok {
return nil, os.ErrNotExist
}
return ic, nil
return integrationConfig, nil
}
func saveAliases(appName string, aliases map[string]string) error {
@@ -272,8 +293,8 @@ func listIntegrations() ([]integration, error) {
}
result := make([]integration, 0, len(cfg.Integrations))
for _, ic := range cfg.Integrations {
result = append(result, *ic)
for _, integrationConfig := range cfg.Integrations {
result = append(result, *integrationConfig)
}
return result, nil

View File

@@ -228,6 +228,31 @@ func IsIntegrationInstalled(name string) bool {
}
}
// AutoInstallable returns true if the integration can be automatically
// installed when not found (e.g. via npm).
func AutoInstallable(name string) bool {
switch strings.ToLower(name) {
case "openclaw", "clawdbot", "moltbot":
return true
default:
return false
}
}
// EnsureInstalled checks if an auto-installable integration is present and
// offers to install it if missing. Returns nil for non-auto-installable
// integrations or when the binary is already on PATH.
func EnsureInstalled(name string) error {
if !AutoInstallable(name) {
return nil
}
if IsIntegrationInstalled(name) {
return nil
}
_, err := ensureOpenclawInstalled()
return err
}
// IsEditorIntegration returns true if the named integration uses multi-model
// selection (implements the Editor interface).
func IsEditorIntegration(name string) bool {
@@ -926,6 +951,10 @@ Examples:
return fmt.Errorf("unknown integration: %s", name)
}
if err := EnsureInstalled(name); err != nil {
return err
}
if modelFlag != "" && IsCloudModelDisabled(cmd.Context(), modelFlag) {
modelFlag = ""
}


@@ -1,81 +1,287 @@
package config
import (
"bytes"
"context"
"encoding/json"
"errors"
"fmt"
"io"
"net"
"net/url"
"os"
"os/exec"
"path/filepath"
"runtime"
"slices"
"strings"
"time"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/types/model"
)
const defaultGatewayPort = 18789
// Bound model capability probing so launch/config cannot hang on slow/unreachable API calls.
var openclawModelShowTimeout = 5 * time.Second
type Openclaw struct{}
func (c *Openclaw) String() string { return "OpenClaw" }
func (c *Openclaw) Run(model string, args []string) error {
models := []string{model}
if config, err := loadIntegration("openclaw"); err == nil && len(config.Models) > 0 {
models = config.Models
} else if config, err := loadIntegration("clawdbot"); err == nil && len(config.Models) > 0 {
models = config.Models
}
var err error
models, err = resolveEditorModels("openclaw", models, func() ([]string, error) {
return selectModels(context.Background(), "openclaw", "")
})
if errors.Is(err, errCancelled) {
return nil
}
bin, err := ensureOpenclawInstalled()
if err != nil {
return err
}
if err := c.Edit(models); err != nil {
return fmt.Errorf("setup failed: %w", err)
}
firstLaunch := true
if integrationConfig, err := loadIntegration("openclaw"); err == nil {
firstLaunch = !integrationConfig.Onboarded
}
if firstLaunch {
fmt.Fprintf(os.Stderr, "\n%sSecurity%s\n\n", ansiBold, ansiReset)
fmt.Fprintf(os.Stderr, " OpenClaw can read files and run actions when tools are enabled.\n")
fmt.Fprintf(os.Stderr, " A bad prompt can trick it into doing unsafe things.\n\n")
fmt.Fprintf(os.Stderr, "%s Learn more: https://docs.openclaw.ai/gateway/security%s\n\n", ansiGray, ansiReset)
ok, err := confirmPrompt("I understand the risks. Continue?")
if err != nil {
return err
}
if !ok {
return nil
}
}
if !c.onboarded() {
// Onboarding not completed: run it (model already set via Edit)
// Use "ollama" as gateway token for simple local access
fmt.Fprintf(os.Stderr, "\n%sSetting up OpenClaw with Ollama...%s\n", ansiGreen, ansiReset)
fmt.Fprintf(os.Stderr, "%s Model: %s%s\n\n", ansiGray, model, ansiReset)
cmd := exec.Command(bin, "onboard",
"--non-interactive",
"--accept-risk",
"--auth-choice", "skip",
"--gateway-token", "ollama",
"--install-daemon",
"--skip-channels",
"--skip-skills",
)
cmd.Stdin = os.Stdin
cmd.Stdout = os.Stdout
cmd.Stderr = os.Stderr
if err := cmd.Run(); err != nil {
return windowsHint(fmt.Errorf("openclaw onboarding failed: %w\n\nTry running: openclaw onboard", err))
}
patchDeviceScopes()
// Onboarding overwrites openclaw.json, so re-apply the model config
// that Edit() wrote before Run() was called.
if err := c.Edit([]string{model}); err != nil {
fmt.Fprintf(os.Stderr, "%s Warning: could not re-apply model config: %v%s\n", ansiYellow, err, ansiReset)
}
}
if strings.HasSuffix(model, ":cloud") || strings.HasSuffix(model, "-cloud") {
if ensureWebSearchPlugin() {
registerWebSearchPlugin()
}
}
if firstLaunch {
fmt.Fprintf(os.Stderr, "\n%sPreparing your assistant — this may take a moment...%s\n\n", ansiGray, ansiReset)
} else {
fmt.Fprintf(os.Stderr, "\n%sStarting your assistant — this may take a moment...%s\n\n", ansiGray, ansiReset)
}
// When extra args are passed through, run exactly what the user asked for
// after setup and skip the built-in gateway+TUI convenience flow.
if len(args) > 0 {
cmd := exec.Command(bin, args...)
cmd.Env = openclawEnv()
cmd.Stdin = os.Stdin
cmd.Stdout = os.Stdout
cmd.Stderr = os.Stderr
if err := cmd.Run(); err != nil {
return windowsHint(err)
}
if firstLaunch {
if err := integrationOnboarded("openclaw"); err != nil {
return fmt.Errorf("failed to save onboarding state: %w", err)
}
}
return nil
}
token, port := c.gatewayInfo()
addr := fmt.Sprintf("localhost:%d", port)
// If the gateway is already running (e.g. via the daemon), restart it
// so it picks up any config changes from Edit() above (model, provider, etc.).
if portOpen(addr) {
restart := exec.Command(bin, "daemon", "restart")
restart.Env = openclawEnv()
if err := restart.Run(); err != nil {
fmt.Fprintf(os.Stderr, "%s Warning: daemon restart failed: %v%s\n", ansiYellow, err, ansiReset)
}
if !waitForPort(addr, 10*time.Second) {
fmt.Fprintf(os.Stderr, "%s Warning: gateway did not come back after restart%s\n", ansiYellow, ansiReset)
}
}
// If the gateway isn't running, start it as a background child process.
if !portOpen(addr) {
gw := exec.Command(bin, "gateway", "run", "--force")
gw.Env = openclawEnv()
if err := gw.Start(); err != nil {
return windowsHint(fmt.Errorf("failed to start gateway: %w", err))
}
defer func() {
if gw.Process != nil {
_ = gw.Process.Kill()
_ = gw.Wait()
}
}()
}
fmt.Fprintf(os.Stderr, "%sStarting gateway...%s\n", ansiGray, ansiReset)
if !waitForPort(addr, 30*time.Second) {
return windowsHint(fmt.Errorf("gateway did not start on %s", addr))
}
printOpenclawReady(bin, token, port, firstLaunch)
tuiArgs := []string{"tui"}
if firstLaunch {
tuiArgs = append(tuiArgs, "--message", "Wake up, my friend!")
}
tui := exec.Command(bin, tuiArgs...)
tui.Env = openclawEnv()
tui.Stdin = os.Stdin
tui.Stdout = os.Stdout
tui.Stderr = os.Stderr
if err := tui.Run(); err != nil {
return windowsHint(err)
}
if firstLaunch {
if err := integrationOnboarded("openclaw"); err != nil {
return fmt.Errorf("failed to save onboarding state: %w", err)
}
}
return nil
}
// gatewayInfo reads the gateway auth token and port from the OpenClaw config.
func (c *Openclaw) gatewayInfo() (token string, port int) {
port = defaultGatewayPort
home, err := os.UserHomeDir()
if err != nil {
return "", port
}
for _, path := range []string{
filepath.Join(home, ".openclaw", "openclaw.json"),
filepath.Join(home, ".clawdbot", "clawdbot.json"),
} {
data, err := os.ReadFile(path)
if err != nil {
continue
}
var config map[string]any
if json.Unmarshal(data, &config) != nil {
continue
}
gw, _ := config["gateway"].(map[string]any)
if p, ok := gw["port"].(float64); ok && p > 0 {
port = int(p)
}
auth, _ := gw["auth"].(map[string]any)
if t, _ := auth["token"].(string); t != "" {
token = t
}
return token, port
}
return "", port
}
func printOpenclawReady(bin, token string, port int, firstLaunch bool) {
u := fmt.Sprintf("http://localhost:%d", port)
if token != "" {
u += "/#token=" + url.QueryEscape(token)
}
fmt.Fprintf(os.Stderr, "\n%s✓ OpenClaw is running%s\n\n", ansiGreen, ansiReset)
fmt.Fprintf(os.Stderr, " Open the Web UI:\n")
fmt.Fprintf(os.Stderr, " %s\n\n", hyperlink(u, u))
if firstLaunch {
fmt.Fprintf(os.Stderr, "%s Quick start:%s\n", ansiBold, ansiReset)
fmt.Fprintf(os.Stderr, "%s /help see all commands%s\n", ansiGray, ansiReset)
fmt.Fprintf(os.Stderr, "%s %s configure --section channels connect WhatsApp, Telegram, etc.%s\n", ansiGray, bin, ansiReset)
fmt.Fprintf(os.Stderr, "%s %s skills browse and install skills%s\n\n", ansiGray, bin, ansiReset)
fmt.Fprintf(os.Stderr, "%s The OpenClaw gateway is running in the background.%s\n", ansiYellow, ansiReset)
fmt.Fprintf(os.Stderr, "%s Stop it with: %s gateway stop%s\n\n", ansiYellow, bin, ansiReset)
} else {
fmt.Fprintf(os.Stderr, "%sTip: connect WhatsApp, Telegram, and more with: %s configure --section channels%s\n", ansiGray, bin, ansiReset)
}
}
// openclawEnv returns the current environment with provider API keys cleared
// so openclaw only uses the Ollama gateway, not keys from the user's shell.
func openclawEnv() []string {
clear := map[string]bool{
"ANTHROPIC_API_KEY": true,
"ANTHROPIC_OAUTH_TOKEN": true,
"OPENAI_API_KEY": true,
"GEMINI_API_KEY": true,
"MISTRAL_API_KEY": true,
"GROQ_API_KEY": true,
"XAI_API_KEY": true,
"OPENROUTER_API_KEY": true,
}
var env []string
for _, e := range os.Environ() {
key, _, _ := strings.Cut(e, "=")
if !clear[key] {
env = append(env, e)
}
}
return env
}
// portOpen checks if a TCP port is currently accepting connections.
func portOpen(addr string) bool {
conn, err := net.DialTimeout("tcp", addr, 500*time.Millisecond)
if err != nil {
return false
}
conn.Close()
return true
}
func waitForPort(addr string, timeout time.Duration) bool {
deadline := time.Now().Add(timeout)
for time.Now().Before(deadline) {
conn, err := net.DialTimeout("tcp", addr, 500*time.Millisecond)
if err == nil {
conn.Close()
return true
}
time.Sleep(250 * time.Millisecond)
}
return false
}
func windowsHint(err error) error {
if runtime.GOOS != "windows" {
return err
}
return fmt.Errorf("%w\n\n"+
"OpenClaw runs best on WSL2.\n"+
"Quick setup: wsl --install\n"+
"Guide: https://docs.openclaw.ai/windows", err)
}
// onboarded checks if OpenClaw onboarding wizard was completed
@@ -107,6 +313,144 @@ func (c *Openclaw) onboarded() bool {
return lastRunAt != ""
}
// patchDeviceScopes upgrades the local CLI device's paired scopes to include
// operator.admin. Only patches the local device, not remote ones.
// Best-effort: silently returns on any error.
func patchDeviceScopes() {
home, err := os.UserHomeDir()
if err != nil {
return
}
deviceID := readLocalDeviceID(home)
if deviceID == "" {
return
}
path := filepath.Join(home, ".openclaw", "devices", "paired.json")
data, err := os.ReadFile(path)
if err != nil {
return
}
var devices map[string]map[string]any
if err := json.Unmarshal(data, &devices); err != nil {
return
}
dev, ok := devices[deviceID]
if !ok {
return
}
required := []string{
"operator.read",
"operator.admin",
"operator.approvals",
"operator.pairing",
}
changed := patchScopes(dev, "scopes", required)
if tokens, ok := dev["tokens"].(map[string]any); ok {
for _, tok := range tokens {
if tokenMap, ok := tok.(map[string]any); ok {
if patchScopes(tokenMap, "scopes", required) {
changed = true
}
}
}
}
if !changed {
return
}
out, err := json.MarshalIndent(devices, "", " ")
if err != nil {
return
}
_ = os.WriteFile(path, out, 0o600)
}
// readLocalDeviceID reads the local device ID from openclaw's identity file.
func readLocalDeviceID(home string) string {
data, err := os.ReadFile(filepath.Join(home, ".openclaw", "identity", "device-auth.json"))
if err != nil {
return ""
}
var auth map[string]any
if err := json.Unmarshal(data, &auth); err != nil {
return ""
}
id, _ := auth["deviceId"].(string)
return id
}
// patchScopes ensures obj[key] contains all required scopes. Returns true if
// any scopes were added.
func patchScopes(obj map[string]any, key string, required []string) bool {
existing, _ := obj[key].([]any)
have := make(map[string]bool, len(existing))
for _, s := range existing {
if str, ok := s.(string); ok {
have[str] = true
}
}
added := false
for _, s := range required {
if !have[s] {
existing = append(existing, s)
added = true
}
}
if added {
obj[key] = existing
}
return added
}
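patchScopes is a set-union merge over a JSON array of strings: required scopes are appended only if missing, and the return value signals whether anything changed. Since the function is unexported, a re-declared equivalent (mergeScopes, a hypothetical name) shows the contract in isolation:

```go
package main

import "fmt"

// mergeScopes adds any missing required scopes to obj[key] and reports
// whether something was added — the same contract as patchScopes above.
func mergeScopes(obj map[string]any, key string, required []string) bool {
	existing, _ := obj[key].([]any)
	have := make(map[string]bool, len(existing))
	for _, s := range existing {
		if str, ok := s.(string); ok {
			have[str] = true
		}
	}
	added := false
	for _, s := range required {
		if !have[s] {
			existing = append(existing, s)
			added = true
		}
	}
	if added {
		obj[key] = existing
	}
	return added
}

func main() {
	dev := map[string]any{"scopes": []any{"operator.read"}}
	changed := mergeScopes(dev, "scopes", []string{"operator.read", "operator.admin"})
	fmt.Println(changed, dev["scopes"]) // true [operator.read operator.admin]
}
```

Calling it a second time with the same required set returns false, which is what lets patchDeviceScopes skip the file write when nothing changed.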
func ensureOpenclawInstalled() (string, error) {
if _, err := exec.LookPath("openclaw"); err == nil {
return "openclaw", nil
}
if _, err := exec.LookPath("clawdbot"); err == nil {
return "clawdbot", nil
}
if _, err := exec.LookPath("npm"); err != nil {
return "", fmt.Errorf("openclaw is not installed and npm was not found\n\n" +
"Install Node.js first:\n" +
" https://nodejs.org/\n\n" +
"Then rerun:\n" +
" ollama launch\n" +
"and select OpenClaw")
}
ok, err := confirmPrompt("OpenClaw is not installed. Install with npm?")
if err != nil {
return "", err
}
if !ok {
return "", fmt.Errorf("openclaw installation cancelled")
}
fmt.Fprintf(os.Stderr, "\nInstalling OpenClaw...\n")
cmd := exec.Command("npm", "install", "-g", "openclaw@latest")
cmd.Stdin = os.Stdin
cmd.Stdout = os.Stdout
cmd.Stderr = os.Stderr
if err := cmd.Run(); err != nil {
return "", fmt.Errorf("failed to install openclaw: %w", err)
}
if _, err := exec.LookPath("openclaw"); err != nil {
return "", fmt.Errorf("openclaw was installed but the binary was not found on PATH\n\nYou may need to restart your shell")
}
fmt.Fprintf(os.Stderr, "%sOpenClaw installed successfully%s\n\n", ansiGreen, ansiReset)
return "openclaw", nil
}
func (c *Openclaw) Paths() []string {
home, _ := os.UserHomeDir()
p := filepath.Join(home, ".openclaw", "openclaw.json")
@@ -161,8 +505,7 @@ func (c *Openclaw) Edit(models []string) error {
ollama["baseUrl"] = envconfig.Host().String() + "/v1"
// needed to register provider
ollama["apiKey"] = "ollama-local"
ollama["api"] = "ollama"
// Build map of existing models to preserve user customizations
existingModels, _ := ollama["models"].([]any)
@@ -175,25 +518,13 @@ func (c *Openclaw) Edit(models []string) error {
}
}
client, _ := api.ClientFromEnvironment()
var newModels []any
for _, m := range models {
entry, _ := openclawModelConfig(context.Background(), client, m)
// Merge existing fields (user customizations)
if existing, ok := existingByID[m]; ok {
for k, v := range existing {
if _, isNew := entry[k]; !isNew {
entry[k] = v
@@ -230,7 +561,213 @@ func (c *Openclaw) Edit(models []string) error {
if err != nil {
return err
}
if err := writeWithBackup(configPath, data); err != nil {
return err
}
// Clear any per-session model overrides so the new primary takes effect
// immediately rather than being shadowed by a cached modelOverride.
clearSessionModelOverride(models[0])
return nil
}
// clearSessionModelOverride removes per-session model overrides from the main
// agent session so the global primary model takes effect on the next TUI launch.
func clearSessionModelOverride(primary string) {
home, err := os.UserHomeDir()
if err != nil {
return
}
path := filepath.Join(home, ".openclaw", "agents", "main", "sessions", "sessions.json")
data, err := os.ReadFile(path)
if err != nil {
return
}
var sessions map[string]map[string]any
if json.Unmarshal(data, &sessions) != nil {
return
}
changed := false
for _, sess := range sessions {
if override, _ := sess["modelOverride"].(string); override != "" && override != primary {
delete(sess, "modelOverride")
delete(sess, "providerOverride")
sess["model"] = primary
changed = true
}
}
if !changed {
return
}
out, err := json.MarshalIndent(sessions, "", " ")
if err != nil {
return
}
_ = os.WriteFile(path, out, 0o600)
}
const webSearchNpmPackage = "@ollama/openclaw-web-search"
// ensureWebSearchPlugin installs the openclaw-web-search extension into the
// user-level extensions directory (~/.openclaw/extensions/) if it isn't already
// present. Returns true if the extension is available.
func ensureWebSearchPlugin() bool {
home, err := os.UserHomeDir()
if err != nil {
return false
}
pluginDir := filepath.Join(home, ".openclaw", "extensions", "openclaw-web-search")
if _, err := os.Stat(filepath.Join(pluginDir, "index.ts")); err == nil {
return true // already installed
}
npmBin, err := exec.LookPath("npm")
if err != nil {
return false
}
if err := os.MkdirAll(pluginDir, 0o755); err != nil {
return false
}
// Download the tarball via `npm pack`, extract it flat into the plugin dir.
pack := exec.Command(npmBin, "pack", webSearchNpmPackage, "--pack-destination", pluginDir)
out, err := pack.Output()
if err != nil {
fmt.Fprintf(os.Stderr, "%s Warning: could not download web search plugin: %v%s\n", ansiYellow, err, ansiReset)
return false
}
tgzName := strings.TrimSpace(string(out))
tgzPath := filepath.Join(pluginDir, tgzName)
defer os.Remove(tgzPath)
tar := exec.Command("tar", "xzf", tgzPath, "--strip-components=1", "-C", pluginDir)
if err := tar.Run(); err != nil {
fmt.Fprintf(os.Stderr, "%s Warning: could not extract web search plugin: %v%s\n", ansiYellow, err, ansiReset)
return false
}
fmt.Fprintf(os.Stderr, "%s ✓ Installed web search plugin%s\n", ansiGreen, ansiReset)
return true
}
// registerWebSearchPlugin adds plugins.entries.openclaw-web-search to the OpenClaw
// config so the gateway activates it on next start. Best-effort; silently returns
// on any error.
func registerWebSearchPlugin() {
home, err := os.UserHomeDir()
if err != nil {
return
}
configPath := filepath.Join(home, ".openclaw", "openclaw.json")
data, err := os.ReadFile(configPath)
if err != nil {
return
}
var config map[string]any
if json.Unmarshal(data, &config) != nil {
return
}
plugins, _ := config["plugins"].(map[string]any)
if plugins == nil {
plugins = make(map[string]any)
}
entries, _ := plugins["entries"].(map[string]any)
if entries == nil {
entries = make(map[string]any)
}
if _, ok := entries["openclaw-web-search"]; ok {
return // already registered
}
entries["openclaw-web-search"] = map[string]any{"enabled": true}
plugins["entries"] = entries
config["plugins"] = plugins
// Disable the built-in web search since our plugin replaces it.
tools, _ := config["tools"].(map[string]any)
if tools == nil {
tools = make(map[string]any)
}
web, _ := tools["web"].(map[string]any)
if web == nil {
web = make(map[string]any)
}
web["search"] = map[string]any{"enabled": false}
tools["web"] = web
config["tools"] = tools
out, err := json.MarshalIndent(config, "", " ")
if err != nil {
return
}
_ = os.WriteFile(configPath, out, 0o600)
}
// openclawModelConfig builds an OpenClaw model config entry with capability detection.
// The second return value indicates whether the model is a cloud (remote) model.
func openclawModelConfig(ctx context.Context, client *api.Client, modelID string) (map[string]any, bool) {
entry := map[string]any{
"id": modelID,
"name": modelID,
"input": []any{"text"},
"cost": map[string]any{
"input": 0,
"output": 0,
"cacheRead": 0,
"cacheWrite": 0,
},
}
if client == nil {
return entry, false
}
showCtx := ctx
if _, hasDeadline := ctx.Deadline(); !hasDeadline {
var cancel context.CancelFunc
showCtx, cancel = context.WithTimeout(ctx, openclawModelShowTimeout)
defer cancel()
}
resp, err := client.Show(showCtx, &api.ShowRequest{Model: modelID})
if err != nil {
return entry, false
}
// Set input types based on vision capability
if slices.Contains(resp.Capabilities, model.CapabilityVision) {
entry["input"] = []any{"text", "image"}
}
// Set reasoning based on thinking capability
if slices.Contains(resp.Capabilities, model.CapabilityThinking) {
entry["reasoning"] = true
}
// Cloud models: use hardcoded limits for context/output tokens.
// Capability detection above still applies (vision, thinking).
if resp.RemoteModel != "" {
if l, ok := lookupCloudModelLimit(modelID); ok {
entry["contextWindow"] = l.Context
entry["maxTokens"] = l.Output
}
return entry, true
}
// Extract context window from ModelInfo (local models only)
for key, val := range resp.ModelInfo {
if strings.HasSuffix(key, ".context_length") {
if ctxLen, ok := val.(float64); ok && ctxLen > 0 {
entry["contextWindow"] = int(ctxLen)
}
break
}
}
return entry, false
}
func (c *Openclaw) Models() []string {


@@ -1,11 +1,21 @@
package config
import (
"bytes"
"context"
"encoding/json"
"fmt"
"net/http"
"net/http/httptest"
"net/url"
"os"
"path/filepath"
"runtime"
"strings"
"testing"
"time"
"github.com/ollama/ollama/api"
)
func TestOpenclawIntegration(t *testing.T) {
@@ -26,6 +36,124 @@ func TestOpenclawIntegration(t *testing.T) {
})
}
func TestOpenclawRunPassthroughArgs(t *testing.T) {
if runtime.GOOS == "windows" {
t.Skip("uses a POSIX shell test binary")
}
tmpDir := t.TempDir()
setTestHome(t, tmpDir)
t.Setenv("PATH", tmpDir)
if err := integrationOnboarded("openclaw"); err != nil {
t.Fatal(err)
}
configDir := filepath.Join(tmpDir, ".openclaw")
if err := os.MkdirAll(configDir, 0o755); err != nil {
t.Fatal(err)
}
if err := os.WriteFile(filepath.Join(configDir, "openclaw.json"), []byte(`{
"wizard": {"lastRunAt": "2026-01-01T00:00:00Z"}
}`), 0o644); err != nil {
t.Fatal(err)
}
bin := filepath.Join(tmpDir, "openclaw")
if err := os.WriteFile(bin, []byte("#!/bin/sh\nprintf '%s\\n' \"$*\" >> \"$HOME/invocations.log\"\n"), 0o755); err != nil {
t.Fatal(err)
}
c := &Openclaw{}
if err := c.Run("llama3.2", []string{"gateway", "--someflag"}); err != nil {
t.Fatalf("Run() error = %v", err)
}
data, err := os.ReadFile(filepath.Join(tmpDir, "invocations.log"))
if err != nil {
t.Fatal(err)
}
lines := strings.Split(strings.TrimSpace(string(data)), "\n")
if len(lines) != 1 {
t.Fatalf("expected exactly 1 invocation, got %d: %v", len(lines), lines)
}
if lines[0] != "gateway --someflag" {
t.Fatalf("invocation = %q, want %q", lines[0], "gateway --someflag")
}
}
func TestOpenclawRunFirstLaunchPersistence(t *testing.T) {
if runtime.GOOS == "windows" {
t.Skip("uses a POSIX shell test binary")
}
oldHook := DefaultConfirmPrompt
DefaultConfirmPrompt = func(prompt string) (bool, error) {
return true, nil
}
defer func() { DefaultConfirmPrompt = oldHook }()
t.Run("success persists onboarding flag", func(t *testing.T) {
tmpDir := t.TempDir()
setTestHome(t, tmpDir)
t.Setenv("PATH", tmpDir)
configDir := filepath.Join(tmpDir, ".openclaw")
if err := os.MkdirAll(configDir, 0o755); err != nil {
t.Fatal(err)
}
// Mark OpenClaw onboarding complete so Run takes passthrough path directly.
if err := os.WriteFile(filepath.Join(configDir, "openclaw.json"), []byte(`{
"wizard": {"lastRunAt": "2026-01-01T00:00:00Z"}
}`), 0o644); err != nil {
t.Fatal(err)
}
if err := os.WriteFile(filepath.Join(tmpDir, "openclaw"), []byte("#!/bin/sh\nexit 0\n"), 0o755); err != nil {
t.Fatal(err)
}
c := &Openclaw{}
if err := c.Run("llama3.2", []string{"gateway", "--status"}); err != nil {
t.Fatalf("Run() error = %v", err)
}
integrationConfig, err := loadIntegration("openclaw")
if err != nil {
t.Fatalf("loadIntegration() error = %v", err)
}
if !integrationConfig.Onboarded {
t.Fatal("expected onboarding flag to be persisted after successful run")
}
})
t.Run("failure does not persist onboarding flag", func(t *testing.T) {
tmpDir := t.TempDir()
setTestHome(t, tmpDir)
t.Setenv("PATH", tmpDir)
configDir := filepath.Join(tmpDir, ".openclaw")
if err := os.MkdirAll(configDir, 0o755); err != nil {
t.Fatal(err)
}
if err := os.WriteFile(filepath.Join(configDir, "openclaw.json"), []byte(`{
"wizard": {"lastRunAt": "2026-01-01T00:00:00Z"}
}`), 0o644); err != nil {
t.Fatal(err)
}
if err := os.WriteFile(filepath.Join(tmpDir, "openclaw"), []byte("#!/bin/sh\nexit 1\n"), 0o755); err != nil {
t.Fatal(err)
}
c := &Openclaw{}
if err := c.Run("llama3.2", []string{"gateway", "--status"}); err == nil {
t.Fatal("expected run failure")
}
integrationConfig, err := loadIntegration("openclaw")
if err == nil && integrationConfig.Onboarded {
t.Fatal("expected onboarding flag to remain unset after failed run")
}
})
}
func TestOpenclawEdit(t *testing.T) {
c := &Openclaw{}
tmpDir := t.TempDir()
@@ -359,19 +487,16 @@ func TestOpenclawEditSchemaFields(t *testing.T) {
modelList := ollama["models"].([]any)
entry := modelList[0].(map[string]any)
// Verify base schema fields (always set regardless of API availability)
if entry["id"] != "llama3.2" {
t.Errorf("id = %v, want llama3.2", entry["id"])
}
if entry["name"] != "llama3.2" {
t.Errorf("name = %v, want llama3.2", entry["name"])
}
if entry["input"] == nil {
t.Error("input should be set")
}
cost := entry["cost"].(map[string]any)
if cost["cacheRead"] == nil {
t.Error("cost.cacheRead should be set")
@@ -876,3 +1001,589 @@ func TestOpenclawOnboarded(t *testing.T) {
}
})
}
func TestOpenclawGatewayInfo(t *testing.T) {
c := &Openclaw{}
t.Run("returns defaults when no config exists", func(t *testing.T) {
tmpDir := t.TempDir()
setTestHome(t, tmpDir)
token, port := c.gatewayInfo()
if token != "" {
t.Errorf("expected empty token, got %q", token)
}
if port != defaultGatewayPort {
t.Errorf("expected default port %d, got %d", defaultGatewayPort, port)
}
})
t.Run("reads token and port from config", func(t *testing.T) {
tmpDir := t.TempDir()
setTestHome(t, tmpDir)
configDir := filepath.Join(tmpDir, ".openclaw")
os.MkdirAll(configDir, 0o755)
os.WriteFile(filepath.Join(configDir, "openclaw.json"), []byte(`{
"gateway": {
"port": 9999,
"auth": {"mode": "token", "token": "my-secret"}
}
}`), 0o644)
token, port := c.gatewayInfo()
if token != "my-secret" {
t.Errorf("expected token %q, got %q", "my-secret", token)
}
if port != 9999 {
t.Errorf("expected port 9999, got %d", port)
}
})
t.Run("uses default port when not in config", func(t *testing.T) {
tmpDir := t.TempDir()
setTestHome(t, tmpDir)
configDir := filepath.Join(tmpDir, ".openclaw")
os.MkdirAll(configDir, 0o755)
os.WriteFile(filepath.Join(configDir, "openclaw.json"), []byte(`{
"gateway": {"auth": {"token": "tok"}}
}`), 0o644)
token, port := c.gatewayInfo()
if token != "tok" {
t.Errorf("expected token %q, got %q", "tok", token)
}
if port != defaultGatewayPort {
t.Errorf("expected default port %d, got %d", defaultGatewayPort, port)
}
})
t.Run("falls back to legacy clawdbot config", func(t *testing.T) {
tmpDir := t.TempDir()
setTestHome(t, tmpDir)
legacyDir := filepath.Join(tmpDir, ".clawdbot")
os.MkdirAll(legacyDir, 0o755)
os.WriteFile(filepath.Join(legacyDir, "clawdbot.json"), []byte(`{
"gateway": {"port": 12345, "auth": {"token": "legacy-token"}}
}`), 0o644)
token, port := c.gatewayInfo()
if token != "legacy-token" {
t.Errorf("expected token %q, got %q", "legacy-token", token)
}
if port != 12345 {
t.Errorf("expected port 12345, got %d", port)
}
})
t.Run("handles corrupted JSON gracefully", func(t *testing.T) {
tmpDir := t.TempDir()
setTestHome(t, tmpDir)
configDir := filepath.Join(tmpDir, ".openclaw")
os.MkdirAll(configDir, 0o755)
os.WriteFile(filepath.Join(configDir, "openclaw.json"), []byte(`{corrupted`), 0o644)
token, port := c.gatewayInfo()
if token != "" {
t.Errorf("expected empty token, got %q", token)
}
if port != defaultGatewayPort {
t.Errorf("expected default port, got %d", port)
}
})
t.Run("handles missing gateway section", func(t *testing.T) {
tmpDir := t.TempDir()
setTestHome(t, tmpDir)
configDir := filepath.Join(tmpDir, ".openclaw")
os.MkdirAll(configDir, 0o755)
os.WriteFile(filepath.Join(configDir, "openclaw.json"), []byte(`{"theme":"dark"}`), 0o644)
token, port := c.gatewayInfo()
if token != "" {
t.Errorf("expected empty token, got %q", token)
}
if port != defaultGatewayPort {
t.Errorf("expected default port, got %d", port)
}
})
}
func TestPrintOpenclawReady(t *testing.T) {
t.Run("includes port in URL", func(t *testing.T) {
var buf bytes.Buffer
old := os.Stderr
r, w, _ := os.Pipe()
os.Stderr = w
printOpenclawReady("openclaw", "", 9999, false)
w.Close()
os.Stderr = old
buf.ReadFrom(r)
output := buf.String()
if !strings.Contains(output, "localhost:9999") {
t.Errorf("expected port 9999 in output, got:\n%s", output)
}
if strings.Contains(output, "#token=") {
t.Error("should not include token fragment when token is empty")
}
})
t.Run("URL-escapes token", func(t *testing.T) {
var buf bytes.Buffer
old := os.Stderr
r, w, _ := os.Pipe()
os.Stderr = w
printOpenclawReady("openclaw", "my token&special=chars", defaultGatewayPort, false)
w.Close()
os.Stderr = old
buf.ReadFrom(r)
output := buf.String()
escaped := url.QueryEscape("my token&special=chars")
if !strings.Contains(output, "#token="+escaped) {
t.Errorf("expected URL-escaped token %q in output, got:\n%s", escaped, output)
}
})
t.Run("simple token is not mangled", func(t *testing.T) {
var buf bytes.Buffer
old := os.Stderr
r, w, _ := os.Pipe()
os.Stderr = w
printOpenclawReady("openclaw", "ollama", defaultGatewayPort, false)
w.Close()
os.Stderr = old
buf.ReadFrom(r)
output := buf.String()
if !strings.Contains(output, "#token=ollama") {
t.Errorf("expected #token=ollama in output, got:\n%s", output)
}
})
t.Run("includes web UI hint", func(t *testing.T) {
var buf bytes.Buffer
old := os.Stderr
r, w, _ := os.Pipe()
os.Stderr = w
printOpenclawReady("openclaw", "", defaultGatewayPort, false)
w.Close()
os.Stderr = old
buf.ReadFrom(r)
output := buf.String()
if !strings.Contains(output, "Open the Web UI") {
t.Errorf("expected web UI hint in output, got:\n%s", output)
}
})
t.Run("first launch shows quick start tips", func(t *testing.T) {
var buf bytes.Buffer
old := os.Stderr
r, w, _ := os.Pipe()
os.Stderr = w
printOpenclawReady("openclaw", "ollama", defaultGatewayPort, true)
w.Close()
os.Stderr = old
buf.ReadFrom(r)
output := buf.String()
for _, want := range []string{"/help", "channels", "skills", "gateway"} {
if !strings.Contains(output, want) {
t.Errorf("expected %q in first-launch output, got:\n%s", want, output)
}
}
})
t.Run("subsequent launch shows single tip", func(t *testing.T) {
var buf bytes.Buffer
old := os.Stderr
r, w, _ := os.Pipe()
os.Stderr = w
printOpenclawReady("openclaw", "ollama", defaultGatewayPort, false)
w.Close()
os.Stderr = old
buf.ReadFrom(r)
output := buf.String()
if !strings.Contains(output, "Tip:") {
t.Errorf("expected single tip line, got:\n%s", output)
}
if strings.Contains(output, "Quick start") {
t.Errorf("should not show quick start on subsequent launch")
}
})
}
func TestOpenclawModelConfig(t *testing.T) {
t.Run("nil client returns base config", func(t *testing.T) {
cfg, _ := openclawModelConfig(context.Background(), nil, "llama3.2")
if cfg["id"] != "llama3.2" {
t.Errorf("id = %v, want llama3.2", cfg["id"])
}
if cfg["name"] != "llama3.2" {
t.Errorf("name = %v, want llama3.2", cfg["name"])
}
if cfg["cost"] == nil {
t.Error("cost should be set")
}
// Should not have capability fields without API
if _, ok := cfg["reasoning"]; ok {
t.Error("reasoning should not be set without API")
}
if _, ok := cfg["contextWindow"]; ok {
t.Error("contextWindow should not be set without API")
}
})
t.Run("sets vision input when model has vision capability", func(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if r.URL.Path == "/api/show" {
fmt.Fprintf(w, `{"capabilities":["vision"],"model_info":{"llama.context_length":4096}}`)
return
}
w.WriteHeader(http.StatusNotFound)
}))
defer srv.Close()
u, _ := url.Parse(srv.URL)
client := api.NewClient(u, srv.Client())
cfg, _ := openclawModelConfig(context.Background(), client, "llava:7b")
input, ok := cfg["input"].([]any)
if !ok || len(input) != 2 {
t.Errorf("input = %v, want [text image]", cfg["input"])
}
})
t.Run("sets text-only input when model lacks vision", func(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if r.URL.Path == "/api/show" {
fmt.Fprintf(w, `{"capabilities":["completion"],"model_info":{}}`)
return
}
w.WriteHeader(http.StatusNotFound)
}))
defer srv.Close()
u, _ := url.Parse(srv.URL)
client := api.NewClient(u, srv.Client())
cfg, _ := openclawModelConfig(context.Background(), client, "llama3.2")
input, ok := cfg["input"].([]any)
if !ok || len(input) != 1 {
t.Errorf("input = %v, want [text]", cfg["input"])
}
if _, ok := cfg["reasoning"]; ok {
t.Error("reasoning should not be set for non-thinking model")
}
})
t.Run("sets reasoning when model has thinking capability", func(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if r.URL.Path == "/api/show" {
fmt.Fprintf(w, `{"capabilities":["thinking"],"model_info":{}}`)
return
}
w.WriteHeader(http.StatusNotFound)
}))
defer srv.Close()
u, _ := url.Parse(srv.URL)
client := api.NewClient(u, srv.Client())
cfg, _ := openclawModelConfig(context.Background(), client, "qwq")
if cfg["reasoning"] != true {
t.Error("expected reasoning = true for thinking model")
}
})
t.Run("extracts context window from model info", func(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if r.URL.Path == "/api/show" {
fmt.Fprintf(w, `{"capabilities":[],"model_info":{"llama.context_length":131072}}`)
return
}
w.WriteHeader(http.StatusNotFound)
}))
defer srv.Close()
u, _ := url.Parse(srv.URL)
client := api.NewClient(u, srv.Client())
cfg, _ := openclawModelConfig(context.Background(), client, "llama3.2")
if cfg["contextWindow"] != 131072 {
t.Errorf("contextWindow = %v, want 131072", cfg["contextWindow"])
}
})
t.Run("handles all capabilities together", func(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if r.URL.Path == "/api/show" {
fmt.Fprintf(w, `{"capabilities":["vision","thinking"],"model_info":{"qwen3.context_length":32768}}`)
return
}
w.WriteHeader(http.StatusNotFound)
}))
defer srv.Close()
u, _ := url.Parse(srv.URL)
client := api.NewClient(u, srv.Client())
cfg, _ := openclawModelConfig(context.Background(), client, "qwen3-vision")
input, ok := cfg["input"].([]any)
if !ok || len(input) != 2 {
t.Errorf("input = %v, want [text image]", cfg["input"])
}
if cfg["reasoning"] != true {
t.Error("expected reasoning = true")
}
if cfg["contextWindow"] != 32768 {
t.Errorf("contextWindow = %v, want 32768", cfg["contextWindow"])
}
})
t.Run("returns base config when show fails", func(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusNotFound)
fmt.Fprintf(w, `{"error":"model not found"}`)
}))
defer srv.Close()
u, _ := url.Parse(srv.URL)
client := api.NewClient(u, srv.Client())
cfg, _ := openclawModelConfig(context.Background(), client, "missing-model")
if cfg["id"] != "missing-model" {
t.Errorf("id = %v, want missing-model", cfg["id"])
}
// Should still have input (default)
if cfg["input"] == nil {
t.Error("input should always be set")
}
if _, ok := cfg["reasoning"]; ok {
t.Error("reasoning should not be set when show fails")
}
if _, ok := cfg["contextWindow"]; ok {
t.Error("contextWindow should not be set when show fails")
}
})
t.Run("times out slow show and returns base config", func(t *testing.T) {
oldTimeout := openclawModelShowTimeout
openclawModelShowTimeout = 50 * time.Millisecond
t.Cleanup(func() { openclawModelShowTimeout = oldTimeout })
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if r.URL.Path == "/api/show" {
time.Sleep(300 * time.Millisecond)
fmt.Fprintf(w, `{"capabilities":["thinking"],"model_info":{"llama.context_length":4096}}`)
return
}
w.WriteHeader(http.StatusNotFound)
}))
defer srv.Close()
u, _ := url.Parse(srv.URL)
client := api.NewClient(u, srv.Client())
start := time.Now()
cfg, _ := openclawModelConfig(context.Background(), client, "slow-model")
elapsed := time.Since(start)
if elapsed >= 250*time.Millisecond {
t.Fatalf("openclawModelConfig took too long: %v", elapsed)
}
if cfg["id"] != "slow-model" {
t.Errorf("id = %v, want slow-model", cfg["id"])
}
if _, ok := cfg["reasoning"]; ok {
t.Error("reasoning should not be set on timeout")
}
if _, ok := cfg["contextWindow"]; ok {
t.Error("contextWindow should not be set on timeout")
}
})
t.Run("skips zero context length", func(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if r.URL.Path == "/api/show" {
fmt.Fprintf(w, `{"capabilities":[],"model_info":{"llama.context_length":0}}`)
return
}
w.WriteHeader(http.StatusNotFound)
}))
defer srv.Close()
u, _ := url.Parse(srv.URL)
client := api.NewClient(u, srv.Client())
cfg, _ := openclawModelConfig(context.Background(), client, "test-model")
if _, ok := cfg["contextWindow"]; ok {
t.Error("contextWindow should not be set for zero value")
}
})
t.Run("cloud model uses hardcoded limits", func(t *testing.T) {
// Use a model name that's in cloudModelLimits and make the server
// report it as a remote/cloud model
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if r.URL.Path == "/api/show" {
fmt.Fprintf(w, `{"capabilities":[],"model_info":{},"remote_model":"minimax-m2.5"}`)
return
}
w.WriteHeader(http.StatusNotFound)
}))
defer srv.Close()
u, _ := url.Parse(srv.URL)
client := api.NewClient(u, srv.Client())
cfg, isCloud := openclawModelConfig(context.Background(), client, "minimax-m2.5:cloud")
if !isCloud {
t.Error("expected isCloud = true for cloud model")
}
if cfg["contextWindow"] != 204_800 {
t.Errorf("contextWindow = %v, want 204800", cfg["contextWindow"])
}
if cfg["maxTokens"] != 128_000 {
t.Errorf("maxTokens = %v, want 128000", cfg["maxTokens"])
}
})
t.Run("cloud model with vision capability gets image input", func(t *testing.T) {
// Regression test: cloud models must not skip capability detection.
// A cloud model that reports vision capability should have input: [text, image].
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if r.URL.Path == "/api/show" {
fmt.Fprintf(w, `{"capabilities":["vision"],"model_info":{},"remote_model":"qwen3-vl"}`)
return
}
w.WriteHeader(http.StatusNotFound)
}))
defer srv.Close()
u, _ := url.Parse(srv.URL)
client := api.NewClient(u, srv.Client())
cfg, isCloud := openclawModelConfig(context.Background(), client, "qwen3-vl:235b-cloud")
if !isCloud {
t.Error("expected isCloud = true for cloud vision model")
}
input, ok := cfg["input"].([]any)
if !ok || len(input) != 2 {
t.Errorf("input = %v, want [text image] for cloud vision model", cfg["input"])
}
})
t.Run("cloud model with thinking capability gets reasoning flag", func(t *testing.T) {
// Regression test: cloud models must not skip capability detection.
// A cloud model that reports thinking capability should have reasoning: true.
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if r.URL.Path == "/api/show" {
fmt.Fprintf(w, `{"capabilities":["thinking"],"model_info":{},"remote_model":"qwq-cloud"}`)
return
}
w.WriteHeader(http.StatusNotFound)
}))
defer srv.Close()
u, _ := url.Parse(srv.URL)
client := api.NewClient(u, srv.Client())
cfg, isCloud := openclawModelConfig(context.Background(), client, "qwq:cloud")
if !isCloud {
t.Error("expected isCloud = true for cloud thinking model")
}
if cfg["reasoning"] != true {
t.Error("expected reasoning = true for cloud thinking model")
}
})
}
func TestIntegrationOnboarded(t *testing.T) {
t.Run("returns false when not set", func(t *testing.T) {
tmpDir := t.TempDir()
setTestHome(t, tmpDir)
integrationConfig, err := loadIntegration("openclaw")
if err == nil && integrationConfig.Onboarded {
t.Error("expected false for fresh config")
}
})
t.Run("returns true after integrationOnboarded", func(t *testing.T) {
tmpDir := t.TempDir()
setTestHome(t, tmpDir)
os.MkdirAll(filepath.Join(tmpDir, ".ollama"), 0o755)
if err := integrationOnboarded("openclaw"); err != nil {
t.Fatal(err)
}
integrationConfig, err := loadIntegration("openclaw")
if err != nil || !integrationConfig.Onboarded {
t.Error("expected true after integrationOnboarded")
}
})
t.Run("is case insensitive", func(t *testing.T) {
tmpDir := t.TempDir()
setTestHome(t, tmpDir)
os.MkdirAll(filepath.Join(tmpDir, ".ollama"), 0o755)
if err := integrationOnboarded("OpenClaw"); err != nil {
t.Fatal(err)
}
integrationConfig, err := loadIntegration("openclaw")
if err != nil || !integrationConfig.Onboarded {
t.Error("expected true when set with different case")
}
})
t.Run("preserves existing integration data", func(t *testing.T) {
tmpDir := t.TempDir()
setTestHome(t, tmpDir)
os.MkdirAll(filepath.Join(tmpDir, ".ollama"), 0o755)
if err := SaveIntegration("openclaw", []string{"llama3.2", "mistral"}); err != nil {
t.Fatal(err)
}
if err := integrationOnboarded("openclaw"); err != nil {
t.Fatal(err)
}
// Verify onboarded is set
integrationConfig, err := loadIntegration("openclaw")
if err != nil || !integrationConfig.Onboarded {
t.Error("expected true after integrationOnboarded")
}
// Verify models are preserved
model := IntegrationModel("openclaw")
if model != "llama3.2" {
t.Errorf("expected first model llama3.2, got %q", model)
}
})
}

View File

@@ -10,10 +10,11 @@ import (
// ANSI escape sequences for terminal formatting.
const (
-ansiBold = "\033[1m"
-ansiReset = "\033[0m"
-ansiGray = "\033[37m"
-ansiGreen = "\033[32m"
+ansiBold   = "\033[1m"
+ansiReset  = "\033[0m"
+ansiGray   = "\033[37m"
+ansiGreen  = "\033[32m"
+ansiYellow = "\033[33m"
)
// ErrCancelled is returned when the user cancels a selection.

View File

@@ -415,6 +415,12 @@ type multiSelectorModel struct {
cancelled bool
confirmed bool
width int
// multi enables full multi-select editing mode. The zero value (false)
// shows a single-select picker where Enter adds the chosen model to
// the existing list. Tab toggles between modes.
multi bool
singleAdd string // model picked in single mode
}
func newMultiSelectorModel(title string, items []SelectItem, preChecked []string) multiSelectorModel {
@@ -429,13 +435,23 @@ func newMultiSelectorModel(title string, items []SelectItem, preChecked []string
m.itemIndex[item.Name] = i
}
-for _, name := range preChecked {
-if idx, ok := m.itemIndex[name]; ok {
+// Reverse order so preChecked[0] (the current default) ends up last
+// in checkOrder, matching the "last checked = default" convention.
+for i := len(preChecked) - 1; i >= 0; i-- {
+if idx, ok := m.itemIndex[preChecked[i]]; ok {
m.checked[idx] = true
m.checkOrder = append(m.checkOrder, idx)
}
}
+// Position cursor on the current default model
+if len(preChecked) > 0 {
+if idx, ok := m.itemIndex[preChecked[0]]; ok {
+m.cursor = idx
+m.updateScroll(m.otherStart())
+}
+}
return m
}
@@ -546,14 +562,25 @@ func (m multiSelectorModel) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
m.cancelled = true
return m, tea.Quit
+case tea.KeyTab:
+m.multi = !m.multi
case tea.KeyEnter:
-if len(m.checkOrder) > 0 {
+if !m.multi {
+if len(filtered) > 0 && m.cursor < len(filtered) {
+m.singleAdd = filtered[m.cursor].Name
+m.confirmed = true
+return m, tea.Quit
+}
+} else if len(m.checkOrder) > 0 {
m.confirmed = true
return m, tea.Quit
}
case tea.KeySpace:
-m.toggleItem()
+if m.multi {
+m.toggleItem()
+}
case tea.KeyUp:
if m.cursor > 0 {
@@ -592,7 +619,9 @@ func (m multiSelectorModel) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
// On some terminals (e.g. Windows PowerShell), space arrives as
// KeyRunes instead of KeySpace. Intercept it so toggle still works.
if len(msg.Runes) == 1 && msg.Runes[0] == ' ' {
-m.toggleItem()
+if m.multi {
+m.toggleItem()
+}
} else {
m.filter += string(msg.Runes)
m.cursor = 0
@@ -604,6 +633,19 @@ func (m multiSelectorModel) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
return m, nil
}
func (m multiSelectorModel) renderSingleItem(s *strings.Builder, item SelectItem, idx int) {
if idx == m.cursor {
s.WriteString(selectorSelectedItemStyle.Render("▸ " + item.Name))
} else {
s.WriteString(selectorItemStyle.Render(item.Name))
}
s.WriteString("\n")
if item.Description != "" {
s.WriteString(selectorDescLineStyle.Render(item.Description))
s.WriteString("\n")
}
}
func (m multiSelectorModel) renderMultiItem(s *strings.Builder, item SelectItem, idx int) {
origIdx := m.itemIndex[item.Name]
@@ -615,7 +657,7 @@ func (m multiSelectorModel) renderMultiItem(s *strings.Builder, item SelectItem,
}
suffix := ""
-if len(m.checkOrder) > 0 && m.checkOrder[0] == origIdx {
+if len(m.checkOrder) > 0 && m.checkOrder[len(m.checkOrder)-1] == origIdx {
suffix = " " + selectorDefaultTagStyle.Render("(default)")
}
@@ -637,6 +679,11 @@ func (m multiSelectorModel) View() string {
return ""
}
renderItem := m.renderSingleItem
if m.multi {
renderItem = m.renderMultiItem
}
var s strings.Builder
s.WriteString(selectorTitleStyle.Render(m.title))
@@ -661,7 +708,7 @@ func (m multiSelectorModel) View() string {
if idx >= len(filtered) {
break
}
-m.renderMultiItem(&s, filtered[idx], idx)
+renderItem(&s, filtered[idx], idx)
}
if remaining := len(filtered) - m.scrollOffset - displayCount; remaining > 0 {
@@ -684,7 +731,7 @@ func (m multiSelectorModel) View() string {
s.WriteString(sectionHeaderStyle.Render("Recommended"))
s.WriteString("\n")
for _, idx := range recItems {
-m.renderMultiItem(&s, filtered[idx], idx)
+renderItem(&s, filtered[idx], idx)
}
}
@@ -704,7 +751,7 @@ func (m multiSelectorModel) View() string {
if idx >= len(otherItems) {
break
}
-m.renderMultiItem(&s, filtered[otherItems[idx]], otherItems[idx])
+renderItem(&s, filtered[otherItems[idx]], otherItems[idx])
}
if remaining := len(otherItems) - m.scrollOffset - displayCount; remaining > 0 {
@@ -716,15 +763,18 @@ func (m multiSelectorModel) View() string {
s.WriteString("\n")
-count := m.selectedCount()
-if count == 0 {
-s.WriteString(selectorDescStyle.Render(" Select at least one model."))
-} else {
-s.WriteString(selectorDescStyle.Render(fmt.Sprintf(" %d selected - press enter to continue", count)))
-}
-s.WriteString("\n\n")
-s.WriteString(selectorHelpStyle.Render("↑/↓ navigate • space toggle • enter confirm • esc cancel"))
+if !m.multi {
+s.WriteString(selectorHelpStyle.Render("↑/↓ navigate • enter select • tab add multiple • esc cancel"))
+} else {
+count := m.selectedCount()
+if count == 0 {
+s.WriteString(selectorDescStyle.Render(" Select at least one model."))
+} else {
+s.WriteString(selectorDescStyle.Render(fmt.Sprintf(" %d selected - press enter to continue", count)))
+}
+s.WriteString("\n\n")
+s.WriteString(selectorHelpStyle.Render("↑/↓ navigate • space toggle • tab select single • enter confirm • esc cancel"))
+}
result := s.String()
if m.width > 0 {
@@ -747,18 +797,28 @@ func SelectMultiple(title string, items []SelectItem, preChecked []string) ([]st
}
fm := finalModel.(multiSelectorModel)
-if fm.cancelled {
+if fm.cancelled || !fm.confirmed {
return nil, ErrCancelled
}
-if !fm.confirmed {
-return nil, ErrCancelled
-}
+// Single-add mode: prepend the picked model, keep existing models deduped
+if fm.singleAdd != "" {
+result := []string{fm.singleAdd}
+for _, name := range preChecked {
+if name != fm.singleAdd {
+result = append(result, name)
+}
+}
+return result, nil
+}
-var result []string
+// Multi-edit mode: last checked is default (first in result)
+last := fm.checkOrder[len(fm.checkOrder)-1]
+result := []string{fm.items[last].Name}
for _, idx := range fm.checkOrder {
-result = append(result, fm.items[idx].Name)
+if idx != last {
+result = append(result, fm.items[idx].Name)
+}
}
}
return result, nil
}

View File

@@ -539,6 +539,7 @@ func TestMultiView_CursorIndicator(t *testing.T) {
func TestMultiView_CheckedItemShowsX(t *testing.T) {
m := newMultiSelectorModel("Pick:", items("a", "b"), []string{"a"})
m.multi = true
content := m.View()
if !strings.Contains(content, "[x]") {
@@ -550,11 +551,18 @@ func TestMultiView_CheckedItemShowsX(t *testing.T) {
}
func TestMultiView_DefaultTag(t *testing.T) {
-m := newMultiSelectorModel("Pick:", items("a", "b"), []string{"a"})
+m := newMultiSelectorModel("Pick:", items("a", "b", "c"), []string{"a", "b"})
+m.multi = true
content := m.View()
if !strings.Contains(content, "(default)") {
-t.Error("first checked item should have (default) tag")
+t.Error("should have (default) tag")
}
// preChecked[0] ("a") should be the default (last in checkOrder)
aIdx := strings.Index(content, "a")
defaultIdx := strings.Index(content, "(default)")
if defaultIdx < aIdx {
t.Error("(default) tag should appear after 'a' (the current default)")
}
}
@@ -585,6 +593,7 @@ func TestMultiView_OverflowIndicator(t *testing.T) {
func TestMultiUpdate_SpaceTogglesItem(t *testing.T) {
m := newMultiSelectorModel("Pick:", items("a", "b", "c"), nil)
m.multi = true
m.cursor = 1
// Simulate space delivered as tea.KeySpace
@@ -601,6 +610,7 @@ func TestMultiUpdate_SpaceTogglesItem(t *testing.T) {
func TestMultiUpdate_SpaceRuneTogglesItem(t *testing.T) {
m := newMultiSelectorModel("Pick:", items("a", "b", "c"), nil)
m.multi = true
m.cursor = 1
// Simulate space delivered as tea.KeyRunes (Windows PowerShell behavior)
@@ -618,6 +628,161 @@ func TestMultiUpdate_SpaceRuneTogglesItem(t *testing.T) {
}
}
// --- Single-add mode ---
func TestMulti_StartsInSingleMode(t *testing.T) {
m := newMultiSelectorModel("Pick:", items("a", "b"), nil)
if m.multi {
t.Error("should start in single mode (multi=false)")
}
}
func TestMulti_SingleModeNoCheckboxes(t *testing.T) {
m := newMultiSelectorModel("Pick:", items("a", "b"), nil)
content := m.View()
if strings.Contains(content, "[x]") || strings.Contains(content, "[ ]") {
t.Error("single mode should not show checkboxes")
}
if !strings.Contains(content, "▸") {
t.Error("single mode should show cursor indicator")
}
}
func TestMulti_SingleModeEnterPicksItem(t *testing.T) {
m := newMultiSelectorModel("Pick:", items("a", "b", "c"), nil)
m.cursor = 1
updated, _ := m.Update(tea.KeyMsg{Type: tea.KeyEnter})
m = updated.(multiSelectorModel)
if m.singleAdd != "b" {
t.Errorf("enter in single mode should pick cursor item, got %q", m.singleAdd)
}
if !m.confirmed {
t.Error("should set confirmed")
}
}
func TestMulti_SingleModeSpaceIsNoop(t *testing.T) {
m := newMultiSelectorModel("Pick:", items("a", "b"), nil)
m.cursor = 0
updated, _ := m.Update(tea.KeyMsg{Type: tea.KeySpace})
m = updated.(multiSelectorModel)
if len(m.checked) != 0 {
t.Error("space in single mode should not toggle items")
}
}
func TestMulti_SingleModeSpaceRuneIsNoop(t *testing.T) {
m := newMultiSelectorModel("Pick:", items("a", "b"), nil)
m.cursor = 0
updated, _ := m.Update(tea.KeyMsg{Type: tea.KeyRunes, Runes: []rune{' '}})
m = updated.(multiSelectorModel)
if len(m.checked) != 0 {
t.Error("space rune in single mode should not toggle items")
}
if m.filter != "" {
t.Error("space rune in single mode should not add to filter")
}
}
func TestMulti_TabTogglesMode(t *testing.T) {
m := newMultiSelectorModel("Pick:", items("a", "b"), nil)
if m.multi {
t.Fatal("should start in single mode")
}
updated, _ := m.Update(tea.KeyMsg{Type: tea.KeyTab})
m = updated.(multiSelectorModel)
if !m.multi {
t.Error("tab should switch to multi mode")
}
updated, _ = m.Update(tea.KeyMsg{Type: tea.KeyTab})
m = updated.(multiSelectorModel)
if m.multi {
t.Error("tab should switch back to single mode")
}
}
func TestMulti_SingleModeHelpText(t *testing.T) {
m := newMultiSelectorModel("Pick:", items("a"), nil)
content := m.View()
if !strings.Contains(content, "tab add multiple") {
t.Error("single mode should show 'tab add multiple' in help")
}
}
func TestMulti_MultiModeHelpText(t *testing.T) {
m := newMultiSelectorModel("Pick:", items("a"), nil)
m.multi = true
content := m.View()
if !strings.Contains(content, "tab select single") {
t.Error("multi mode should show 'tab select single' in help")
}
}
// --- preChecked initialization order ---
func TestMulti_PreCheckedDefaultIsLast(t *testing.T) {
// preChecked[0] ("a") is the current default and should end up
// last in checkOrder so it gets the (default) tag.
m := newMultiSelectorModel("Pick:", items("a", "b", "c"), []string{"a", "b", "c"})
if len(m.checkOrder) != 3 {
t.Fatalf("expected 3 in checkOrder, got %d", len(m.checkOrder))
}
lastIdx := m.checkOrder[len(m.checkOrder)-1]
if m.items[lastIdx].Name != "a" {
t.Errorf("preChecked[0] should be last in checkOrder, got %q", m.items[lastIdx].Name)
}
}
func TestMulti_CursorOnDefaultModel(t *testing.T) {
// preChecked[0] ("b") is the default; cursor should start on it
m := newMultiSelectorModel("Pick:", items("a", "b", "c"), []string{"b", "c"})
if m.cursor != 1 {
t.Errorf("cursor should be on preChecked[0] ('b') at index 1, got %d", m.cursor)
}
}
// --- Multi-mode last-checked is default ---
func TestMulti_LastCheckedIsDefault(t *testing.T) {
m := newMultiSelectorModel("Pick:", items("alpha", "beta", "gamma"), nil)
m.multi = true
// Check "alpha" then "gamma"
m.cursor = 0
m.toggleItem()
m.cursor = 2
m.toggleItem()
// Last checked ("gamma") should be at the end of checkOrder
lastIdx := m.checkOrder[len(m.checkOrder)-1]
if m.items[lastIdx].Name != "gamma" {
t.Errorf("last checked should be 'gamma', got %q", m.items[lastIdx].Name)
}
// The (default) tag renders based on checkOrder[len-1]
content := m.View()
if !strings.Contains(content, "(default)") {
t.Fatal("should show (default) tag")
}
// "alpha" line should NOT have the default tag
for _, line := range strings.Split(content, "\n") {
if strings.Contains(line, "alpha") && strings.Contains(line, "(default)") {
t.Error("'alpha' (first checked) should not have (default) tag")
}
}
}
// Key message helpers for testing
type keyType = int

View File

@@ -429,8 +429,24 @@ func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
}
if m.multiModalSelector.confirmed {
var selected []string
-for _, idx := range m.multiModalSelector.checkOrder {
-selected = append(selected, m.multiModalSelector.items[idx].Name)
-}
+if m.multiModalSelector.singleAdd != "" {
+// Single-add mode: prepend picked model, keep existing deduped
+selected = []string{m.multiModalSelector.singleAdd}
+for _, name := range config.IntegrationModels(m.items[m.cursor].integration) {
+if name != m.multiModalSelector.singleAdd {
+selected = append(selected, name)
+}
+}
+} else {
+// Last checked is default (first in result)
+co := m.multiModalSelector.checkOrder
+last := co[len(co)-1]
+selected = []string{m.multiModalSelector.items[last].Name}
+for _, idx := range co {
+if idx != last {
+selected = append(selected, m.multiModalSelector.items[idx].Name)
+}
+}
+}
}
if len(selected) > 0 {
m.changeModels = selected
@@ -508,7 +524,7 @@ func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
case "enter", " ":
item := m.items[m.cursor]
-if item.integration != "" && !config.IsIntegrationInstalled(item.integration) {
+if item.integration != "" && !config.IsIntegrationInstalled(item.integration) && !config.AutoInstallable(item.integration) {
return m, nil
}
@@ -539,6 +555,12 @@ func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
item := m.items[m.cursor]
if item.integration != "" || item.isRunModel {
if item.integration != "" && !config.IsIntegrationInstalled(item.integration) {
if config.AutoInstallable(item.integration) {
// Auto-installable: select to trigger install flow
m.selected = true
m.quitting = true
return m, tea.Quit
}
return m, nil
}
if item.integration != "" && config.IsEditorIntegration(item.integration) {
@@ -602,7 +624,11 @@ func (m model) View() string {
var modelSuffix string
if item.integration != "" {
if !isInstalled {
-title += " " + notInstalledStyle.Render("(not installed)")
+if config.AutoInstallable(item.integration) {
+title += " " + notInstalledStyle.Render("(install)")
+} else {
+title += " " + notInstalledStyle.Render("(not installed)")
+}
} else if m.cursor == i {
if mdl := config.IntegrationModel(item.integration); mdl != "" && m.modelExists(mdl) {
modelSuffix = " " + modelStyle.Render("("+mdl+")")
@@ -618,7 +644,9 @@ func (m model) View() string {
desc := item.description
if !isInstalled && item.integration != "" && m.cursor == i {
-if hint := config.IntegrationInstallHint(item.integration); hint != "" {
+if config.AutoInstallable(item.integration) {
+desc = "Press enter to install"
+} else if hint := config.IntegrationInstallHint(item.integration); hint != "" {
desc = hint
} else {
desc = "not installed"

View File

@@ -257,10 +257,11 @@ func LoadModelMetadata(fsys fs.FS) (ModelKV, *Tokenizer, error) {
if err != nil {
return nil, nil, err
}
bts = sanitizeNonFiniteJSON(bts)
var p ModelParameters
if err := json.Unmarshal(bts, &p); err != nil {
-return nil, nil, err
+return nil, nil, fmt.Errorf("parse config.json: %w", err)
}
if len(p.Architectures) < 1 {
@@ -315,16 +316,20 @@ func LoadModelMetadata(fsys fs.FS) (ModelKV, *Tokenizer, error) {
conv = &glm4MoeLiteModel{}
case "GlmOcrForConditionalGeneration":
conv = &glmOcrModel{}
-case "Lfm2ForCausalLM":
+case "Lfm2ForCausalLM", "Lfm2MoeForCausalLM":
conv = &lfm2Model{}
-case "Qwen3NextForCausalLM":
+case "Lfm2VlForConditionalGeneration":
+conv = &lfm2VLTextModel{}
+case "Qwen3NextForCausalLM", "Qwen3_5ForConditionalGeneration", "Qwen3_5MoeForConditionalGeneration":
conv = &qwen3NextModel{}
case "NemotronHForCausalLM":
conv = &nemotronHModel{}
default:
return nil, nil, fmt.Errorf("unsupported architecture %q", p.Architectures[0])
}
if err := json.Unmarshal(bts, conv); err != nil {
-return nil, nil, err
+return nil, nil, fmt.Errorf("parse config.json for %q: %w", p.Architectures[0], err)
}
if t, ok := conv.(moreParser); ok {

View File

@@ -1,6 +1,8 @@
package convert
import (
"cmp"
"fmt"
"slices"
"strings"
@@ -13,42 +15,149 @@ type lfm2Model struct {
NumHiddenLayers uint32 `json:"num_hidden_layers"`
MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
IntermediateSize uint32 `json:"intermediate_size"`
BlockFFDim uint32 `json:"block_ff_dim"`
BlockMultipleOf uint32 `json:"block_multiple_of"`
BlockAutoAdjustFFDim bool `json:"block_auto_adjust_ff_dim"`
BlockFFNDimMultiplier float32 `json:"block_ffn_dim_multiplier"`
NumAttentionHeads uint32 `json:"num_attention_heads"`
NumKeyValueHeads uint32 `json:"num_key_value_heads"`
RopeTheta float32 `json:"rope_theta"`
NormEps float32 `json:"norm_eps"`
ConvLCache uint32 `json:"conv_L_cache"`
MoEIntermediateSize uint32 `json:"moe_intermediate_size"`
NumExperts uint32 `json:"num_experts"`
NumLocalExperts uint32 `json:"num_local_experts"`
NumExpertsPerToken uint32 `json:"num_experts_per_tok"`
NumDenseLayers uint32 `json:"num_dense_layers"`
RoutedScalingFactor float32 `json:"routed_scaling_factor"`
LayerTypes []string `json:"layer_types"`
TieEmbedding bool `json:"tie_embedding"`
RopeParameters struct {
RopeTheta float32 `json:"rope_theta"`
} `json:"rope_parameters"`
}
var _ ModelConverter = (*lfm2Model)(nil)
const (
defaultMaxPositionEmbeddings = uint32(128_000)
fallbackContextLength = uint32(32_768)
)
func (p *lfm2Model) isMoE() bool {
return p.ModelType == "lfm2_moe" || p.expertCount() > 0
}
func (p *lfm2Model) ropeFreqBase() float32 {
if p.RopeTheta != 0 {
return p.RopeTheta
}
return p.RopeParameters.RopeTheta
}
func (p *lfm2Model) expertCount() uint32 {
if p.NumLocalExperts > 0 {
return p.NumLocalExperts
}
return p.NumExperts
}
func (p *lfm2Model) feedForwardLength() uint32 {
ff := p.IntermediateSize
if p.BlockFFDim != 0 {
ff = p.BlockFFDim
}
if !p.BlockAutoAdjustFFDim || p.BlockMultipleOf == 0 {
return ff
}
ff = (2 * ff) / 3
// Keep default multiplier behavior consistent with llama.cpp conversion.
if p.BlockFFNDimMultiplier != 0 {
ff = uint32(float32(ff) * p.BlockFFNDimMultiplier)
}
m := p.BlockMultipleOf
return m * ((ff + m - 1) / m)
}
func (p *lfm2Model) hasKnownContextLengthFallbackSignature() bool {
return p.isMoE() &&
p.VocabSize == 65536 &&
p.HiddenSize == 2048 &&
p.NumHiddenLayers == 40 &&
p.IntermediateSize == 11776 &&
p.NumAttentionHeads == 32 &&
p.NumKeyValueHeads == 8 &&
p.NumDenseLayers == 2 &&
p.expertCount() == 64 &&
p.NumExpertsPerToken == 4 &&
p.MoEIntermediateSize == 1536
}
func (p *lfm2Model) contextLength() uint32 {
if p.MaxPositionEmbeddings == defaultMaxPositionEmbeddings && p.hasKnownContextLengthFallbackSignature() {
return fallbackContextLength
}
return p.MaxPositionEmbeddings
}
func (p *lfm2Model) KV(t *Tokenizer) KV {
+architecture := "lfm2"
+if p.isMoE() {
+architecture = "lfm2moe"
+}
kv := p.ModelParameters.KV(t)
-kv["general.architecture"] = "lfm2"
-kv["lfm2.vocab_size"] = p.VocabSize
-kv["lfm2.block_count"] = p.NumHiddenLayers
-kv["lfm2.embedding_length"] = p.HiddenSize
-kv["lfm2.feed_forward_length"] = p.IntermediateSize
-kv["lfm2.context_length"] = p.MaxPositionEmbeddings
+kv["general.architecture"] = architecture
+kv["tokenizer.ggml.pre"] = "lfm2"
+kv["vocab_size"] = p.VocabSize
+kv["block_count"] = p.NumHiddenLayers
+kv["embedding_length"] = p.HiddenSize
+kv["feed_forward_length"] = p.feedForwardLength()
+kv["context_length"] = p.contextLength()
// Build per-layer KV head count array based on layer_types
-// (0 = shortconv layer, non-zero = attention layer with that many KV heads)
+// (0 = shortconv layer, non-zero = attention layer with that many KV heads).
+//
+// Dense LFM2 in HF defaults to all attention layers when layer_types is absent.
+// Preserve that behavior to avoid accidentally emitting all-conv metadata.
kvHeadCounts := make([]uint32, p.NumHiddenLayers)
-for i := range p.NumHiddenLayers {
-if int(i) < len(p.LayerTypes) && p.LayerTypes[i] == "full_attention" {
+if len(p.LayerTypes) == 0 {
+for i := range p.NumHiddenLayers {
+kvHeadCounts[i] = p.NumKeyValueHeads
+}
+} else {
+for i := range p.NumHiddenLayers {
+if int(i) < len(p.LayerTypes) && p.LayerTypes[i] == "full_attention" {
kvHeadCounts[i] = p.NumKeyValueHeads
}
}
+}
-kv["lfm2.attention.head_count"] = p.NumAttentionHeads
-kv["lfm2.attention.head_count_kv"] = kvHeadCounts
-kv["lfm2.attention.key_length"] = p.HiddenSize / p.NumAttentionHeads
-kv["lfm2.attention.value_length"] = p.HiddenSize / p.NumAttentionHeads
-kv["lfm2.attention.layer_norm_rms_epsilon"] = p.NormEps
-kv["lfm2.rope.freq_base"] = p.RopeTheta
-kv["lfm2.shortconv.l_cache"] = p.ConvLCache
+kv["attention.head_count"] = p.NumAttentionHeads
+kv["attention.head_count_kv"] = kvHeadCounts
+kv["attention.key_length"] = p.HiddenSize / p.NumAttentionHeads
+kv["attention.value_length"] = p.HiddenSize / p.NumAttentionHeads
+kv["attention.layer_norm_rms_epsilon"] = p.NormEps
+kv["shortconv.l_cache"] = p.ConvLCache
+if ropeFreqBase := p.ropeFreqBase(); ropeFreqBase != 0 {
+kv["rope.freq_base"] = ropeFreqBase
+}
+if p.isMoE() {
+kv["expert_count"] = p.expertCount()
+kv["expert_used_count"] = p.NumExpertsPerToken
+kv["expert_feed_forward_length"] = p.MoEIntermediateSize
+kv["leading_dense_block_count"] = p.NumDenseLayers
+kv["expert_gating_func"] = uint32(2) // sigmoid
+kv["expert_weights_scale"] = cmp.Or(p.RoutedScalingFactor, float32(1.0))
+}
return kv
}
@@ -56,6 +165,30 @@ func (p *lfm2Model) KV(t *Tokenizer) KV {
func (p *lfm2Model) Tensors(ts []Tensor) []*ggml.Tensor {
var out []*ggml.Tensor
if p.isMoE() {
merges := make([]merge, 0, p.NumHiddenLayers*3)
for i := range p.NumHiddenLayers {
if i < p.NumDenseLayers {
continue
}
merges = append(merges, merge{
fmt.Sprintf("blk.%d.feed_forward.experts.*.w1.weight", i),
fmt.Sprintf("blk.%d.ffn_gate_exps.weight", i),
}, merge{
fmt.Sprintf("blk.%d.feed_forward.experts.*.w2.weight", i),
fmt.Sprintf("blk.%d.ffn_down_exps.weight", i),
}, merge{
fmt.Sprintf("blk.%d.feed_forward.experts.*.w3.weight", i),
fmt.Sprintf("blk.%d.ffn_up_exps.weight", i),
})
}
merged, remaining := mergeTensors(ts, merges...)
out = append(out, merged...)
ts = remaining
}
for _, t := range ts {
shape := t.Shape()
@@ -80,7 +213,7 @@ func (p *lfm2Model) Tensors(ts []Tensor) []*ggml.Tensor {
func (p *lfm2Model) Replacements() []string {
return []string{
"model.embed_tokens", "token_embd",
-"model.embedding_norm", "output_norm",
+"model.embedding_norm", "token_embd_norm",
"model.layers", "blk",
"operator_norm", "attn_norm",
"self_attn.q_proj", "attn_q",
@@ -92,6 +225,8 @@ func (p *lfm2Model) Replacements() []string {
"conv.conv", "shortconv.conv",
"conv.in_proj", "shortconv.in_proj",
"conv.out_proj", "shortconv.out_proj",
"feed_forward.gate", "ffn_gate_inp",
"feed_forward.expert_bias", "exp_probs_b.bias",
"feed_forward.w1", "ffn_gate",
"feed_forward.w2", "ffn_down",
"feed_forward.w3", "ffn_up",


@@ -0,0 +1,271 @@
package convert
import (
"io"
"slices"
"strings"
"testing"
)
type lfm2StubTensor struct {
tensorBase
}
func newLFM2StubTensor(name string, shape []uint64) *lfm2StubTensor {
return &lfm2StubTensor{
tensorBase: tensorBase{
name: name,
shape: shape,
},
}
}
func (t *lfm2StubTensor) WriteTo(io.Writer) (int64, error) {
return 0, nil
}
func (t *lfm2StubTensor) Clone() Tensor {
return &lfm2StubTensor{
tensorBase: tensorBase{
name: t.name,
shape: slices.Clone(t.shape),
},
}
}
func TestLFM2MoEKV(t *testing.T) {
var p lfm2Model
p.ModelParameters.ModelType = "lfm2_moe"
p.VocabSize = 65536
p.HiddenSize = 2048
p.NumHiddenLayers = 4
p.MaxPositionEmbeddings = 128000
p.IntermediateSize = 11776
p.NumAttentionHeads = 32
p.NumKeyValueHeads = 8
p.LayerTypes = []string{"conv", "full_attention", "conv", "full_attention"}
p.NormEps = 1e-5
p.ConvLCache = 3
p.MoEIntermediateSize = 1536
p.NumExperts = 64
p.NumExpertsPerToken = 4
p.NumDenseLayers = 2
p.RopeParameters.RopeTheta = 1_000_000
kv := p.KV(&Tokenizer{Vocabulary: &Vocabulary{Model: "gpt2"}})
if got, want := kv["general.architecture"], "lfm2moe"; got != want {
t.Fatalf("general.architecture = %v, want %v", got, want)
}
if got, want := kv["tokenizer.ggml.pre"], "lfm2"; got != want {
t.Fatalf("tokenizer.ggml.pre = %v, want %v", got, want)
}
if got, want := kv["expert_count"], uint32(64); got != want {
t.Fatalf("expert_count = %v, want %v", got, want)
}
if got, want := kv["expert_used_count"], uint32(4); got != want {
t.Fatalf("expert_used_count = %v, want %v", got, want)
}
if got, want := kv["expert_feed_forward_length"], uint32(1536); got != want {
t.Fatalf("expert_feed_forward_length = %v, want %v", got, want)
}
if got, want := kv["leading_dense_block_count"], uint32(2); got != want {
t.Fatalf("leading_dense_block_count = %v, want %v", got, want)
}
if got, want := kv["expert_gating_func"], uint32(2); got != want {
t.Fatalf("expert_gating_func = %v, want %v", got, want)
}
gotHeadCounts, ok := kv["attention.head_count_kv"].([]uint32)
if !ok {
t.Fatalf("attention.head_count_kv has unexpected type %T", kv["attention.head_count_kv"])
}
wantHeadCounts := []uint32{0, 8, 0, 8}
if !slices.Equal(gotHeadCounts, wantHeadCounts) {
t.Fatalf("attention.head_count_kv = %v, want %v", gotHeadCounts, wantHeadCounts)
}
if got, want := kv["rope.freq_base"], float32(1_000_000); got != want {
t.Fatalf("rope.freq_base = %v, want %v", got, want)
}
}
func TestLFM2DenseKV(t *testing.T) {
p := lfm2Model{
ModelParameters: ModelParameters{ModelType: "lfm2", VocabSize: 32000},
HiddenSize: 1024,
NumHiddenLayers: 2,
MaxPositionEmbeddings: 32768,
IntermediateSize: 4096,
NumAttentionHeads: 16,
NumKeyValueHeads: 4,
LayerTypes: []string{"conv", "full_attention"},
NormEps: 1e-5,
ConvLCache: 3,
RopeTheta: 10000,
}
kv := p.KV(&Tokenizer{Vocabulary: &Vocabulary{Model: "gpt2"}})
if got, want := kv["general.architecture"], "lfm2"; got != want {
t.Fatalf("general.architecture = %v, want %v", got, want)
}
if got, want := kv["tokenizer.ggml.pre"], "lfm2"; got != want {
t.Fatalf("tokenizer.ggml.pre = %v, want %v", got, want)
}
if _, ok := kv["expert_count"]; ok {
t.Fatalf("expert_count should not be set for dense lfm2")
}
}
func TestLFM2MoETensors(t *testing.T) {
p := lfm2Model{
ModelParameters: ModelParameters{ModelType: "lfm2_moe"},
NumHiddenLayers: 4,
NumDenseLayers: 2,
}
in := []Tensor{
newLFM2StubTensor("blk.2.feed_forward.experts.0.w1.weight", []uint64{1536, 2048}),
newLFM2StubTensor("blk.2.feed_forward.experts.1.w1.weight", []uint64{1536, 2048}),
newLFM2StubTensor("blk.2.feed_forward.experts.0.w2.weight", []uint64{2048, 1536}),
newLFM2StubTensor("blk.2.feed_forward.experts.1.w2.weight", []uint64{2048, 1536}),
newLFM2StubTensor("blk.2.feed_forward.experts.0.w3.weight", []uint64{1536, 2048}),
newLFM2StubTensor("blk.2.feed_forward.experts.1.w3.weight", []uint64{1536, 2048}),
newLFM2StubTensor("blk.0.shortconv.conv.weight", []uint64{2048, 1, 3}),
}
out := p.Tensors(in)
byName := make(map[string][]uint64, len(out))
for _, tns := range out {
byName[tns.Name] = tns.Shape
}
if got, ok := byName["blk.2.ffn_gate_exps.weight"]; !ok {
t.Fatalf("missing merged tensor blk.2.ffn_gate_exps.weight")
} else if !slices.Equal(got, []uint64{2, 1536, 2048}) {
t.Fatalf("blk.2.ffn_gate_exps.weight shape = %v, want [2 1536 2048]", got)
}
if got, ok := byName["blk.2.ffn_down_exps.weight"]; !ok {
t.Fatalf("missing merged tensor blk.2.ffn_down_exps.weight")
} else if !slices.Equal(got, []uint64{2, 2048, 1536}) {
t.Fatalf("blk.2.ffn_down_exps.weight shape = %v, want [2 2048 1536]", got)
}
if got, ok := byName["blk.2.ffn_up_exps.weight"]; !ok {
t.Fatalf("missing merged tensor blk.2.ffn_up_exps.weight")
} else if !slices.Equal(got, []uint64{2, 1536, 2048}) {
t.Fatalf("blk.2.ffn_up_exps.weight shape = %v, want [2 1536 2048]", got)
}
if got, ok := byName["blk.0.shortconv.conv.weight"]; !ok {
t.Fatalf("missing shortconv tensor")
} else if !slices.Equal(got, []uint64{2048, 3}) {
t.Fatalf("blk.0.shortconv.conv.weight shape = %v, want [2048 3]", got)
}
if _, ok := byName["blk.2.feed_forward.experts.0.w1.weight"]; ok {
t.Fatalf("unmerged expert tensor should not be present")
}
}
func TestLFM2MoEReplacements(t *testing.T) {
p := lfm2Model{}
replacer := strings.NewReplacer(p.Replacements()...)
if got, want := replacer.Replace("model.layers.2.feed_forward.expert_bias"), "blk.2.exp_probs_b.bias"; got != want {
t.Fatalf("expert bias replacement = %q, want %q", got, want)
}
if got, want := replacer.Replace("model.layers.2.feed_forward.gate.weight"), "blk.2.ffn_gate_inp.weight"; got != want {
t.Fatalf("gate replacement = %q, want %q", got, want)
}
}
func TestLFM2KVContextLengthEdgeCaseFallbackOverride(t *testing.T) {
p := lfm2Model{
ModelParameters: ModelParameters{ModelType: "lfm2_moe", VocabSize: 65536},
HiddenSize: 2048,
NumHiddenLayers: 40,
MaxPositionEmbeddings: 128000,
IntermediateSize: 11776,
NumAttentionHeads: 32,
NumKeyValueHeads: 8,
LayerTypes: make([]string, 40),
NormEps: 1e-5,
ConvLCache: 3,
MoEIntermediateSize: 1536,
NumExperts: 64,
NumExpertsPerToken: 4,
NumDenseLayers: 2,
}
for i := range p.LayerTypes {
p.LayerTypes[i] = "conv"
}
p.LayerTypes[2] = "full_attention"
kv := p.KV(&Tokenizer{Vocabulary: &Vocabulary{Model: "gpt2"}})
if got, want := kv["context_length"], uint32(32768); got != want {
t.Fatalf("context_length = %v, want %v", got, want)
}
}
func TestLFM2KVContextLengthNoOverride(t *testing.T) {
p := lfm2Model{
ModelParameters: ModelParameters{ModelType: "lfm2_moe", VocabSize: 65536},
HiddenSize: 2048,
NumHiddenLayers: 39, // mismatch: should not trigger edge case
MaxPositionEmbeddings: 128000,
IntermediateSize: 11776,
NumAttentionHeads: 32,
NumKeyValueHeads: 8,
LayerTypes: []string{"conv", "full_attention"},
NormEps: 1e-5,
ConvLCache: 3,
MoEIntermediateSize: 1536,
NumExperts: 64,
NumExpertsPerToken: 4,
NumDenseLayers: 2,
}
kv := p.KV(&Tokenizer{Vocabulary: &Vocabulary{Model: "gpt2"}})
if got, want := kv["context_length"], uint32(128000); got != want {
t.Fatalf("context_length = %v, want %v", got, want)
}
}
func TestLFM2KVFeedForwardLengthAutoAdjust(t *testing.T) {
p := lfm2Model{
ModelParameters: ModelParameters{ModelType: "lfm2", VocabSize: 65536},
HiddenSize: 2048,
NumHiddenLayers: 16,
MaxPositionEmbeddings: 128000,
IntermediateSize: 12288, // should be ignored when block_ff_dim is set
BlockFFDim: 12288,
BlockAutoAdjustFFDim: true,
BlockMultipleOf: 256,
BlockFFNDimMultiplier: 1.0,
NumAttentionHeads: 32,
NumKeyValueHeads: 8,
LayerTypes: []string{"conv", "full_attention"},
NormEps: 1e-5,
ConvLCache: 3,
}
kv := p.KV(&Tokenizer{Vocabulary: &Vocabulary{Model: "gpt2"}})
if got, want := kv["feed_forward_length"], uint32(8192); got != want {
t.Fatalf("feed_forward_length = %v, want %v", got, want)
}
}

convert/convert_lfm2_vl.go Normal file

@@ -0,0 +1,417 @@
package convert
import (
"cmp"
"encoding/json"
"errors"
"fmt"
"io/fs"
"slices"
"strings"
"github.com/ollama/ollama/fs/ggml"
)
// lfm2VLTextModel converts the language model component of LFM2 VL checkpoints.
type lfm2VLTextModel struct {
TextConfig lfm2Model `json:"text_config"`
DoImageSplitting *bool `json:"do_image_splitting"`
DownsampleFactor uint32 `json:"downsample_factor"`
EncoderPatchSize uint32 `json:"encoder_patch_size"`
ImageTokenID uint32 `json:"image_token_id"`
MaxImageTokens uint32 `json:"max_image_tokens"`
MinImageTokens uint32 `json:"min_image_tokens"`
MaxTiles uint32 `json:"max_tiles"`
MinTiles uint32 `json:"min_tiles"`
TileSize uint32 `json:"tile_size"`
MaxPixelsTolerance float32 `json:"max_pixels_tolerance"`
ProjectorUseLayernorm bool `json:"projector_use_layernorm"`
ProjectorHiddenSize uint32 `json:"projector_hidden_size"`
ProjectorHiddenAct string `json:"projector_hidden_act"`
UseImageSpecialTokens *bool `json:"use_image_special_tokens"`
UseThumbnail *bool `json:"use_thumbnail"`
VisionConfig struct {
HiddenSize uint32 `json:"hidden_size"`
IntermediateSize uint32 `json:"intermediate_size"`
NumAttentionHeads uint32 `json:"num_attention_heads"`
NumHiddenLayers uint32 `json:"num_hidden_layers"`
NumChannels uint32 `json:"num_channels"`
PatchSize uint32 `json:"patch_size"`
LayerNormEpsilon float32 `json:"layer_norm_eps"`
} `json:"vision_config"`
Processor struct {
ImageProcessor struct {
DoImageSplitting *bool `json:"do_image_splitting"`
DownsampleFactor uint32 `json:"downsample_factor"`
MaxImageTokens uint32 `json:"max_image_tokens"`
MinImageTokens uint32 `json:"min_image_tokens"`
MaxTiles uint32 `json:"max_tiles"`
MinTiles uint32 `json:"min_tiles"`
MaxPixelsTol float32 `json:"max_pixels_tolerance"`
TileSize uint32 `json:"tile_size"`
UseThumbnail *bool `json:"use_thumbnail"`
ImageMean []float32 `json:"image_mean"`
ImageStd []float32 `json:"image_std"`
Size struct {
Height uint32 `json:"height"`
Width uint32 `json:"width"`
} `json:"size"`
} `json:"image_processor"`
}
}
func (p *lfm2VLTextModel) textModel() *lfm2Model {
return &p.TextConfig
}
func (p *lfm2VLTextModel) specialTokenTypes() []string {
return p.textModel().specialTokenTypes()
}
func (p *lfm2VLTextModel) parseMore(fsys fs.FS) error {
bts, err := fs.ReadFile(fsys, "processor_config.json")
if err != nil {
if errors.Is(err, fs.ErrNotExist) {
return nil
}
return err
}
return json.Unmarshal(bts, &p.Processor)
}
func (p *lfm2VLTextModel) visionImageSize() uint32 {
// LFM2-VL's image processor operates on 512×512 tiles and downsamples by a
// factor of 2 before projection. Keep a fixed square image size compatible
// with position embeddings and the simplified runtime image pipeline.
tile := cmp.Or(
p.Processor.ImageProcessor.TileSize,
p.Processor.ImageProcessor.Size.Height,
p.Processor.ImageProcessor.Size.Width,
uint32(512),
)
downsample := cmp.Or(p.DownsampleFactor, p.Processor.ImageProcessor.DownsampleFactor, uint32(2))
if downsample == 0 {
return tile
}
return max(uint32(1), tile/downsample)
}
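The fallback chain above relies on `cmp.Or` picking the first non-zero candidate before dividing by the downsample factor. A minimal standalone sketch of that logic (the helper name and parameters here are illustrative, not part of the converter's API):

```go
package main

import (
	"cmp"
	"fmt"
)

// visionImageSize mirrors the method above: take the first non-zero
// tile-size candidate (falling back to 512), then divide by the
// downsample factor, guarding against zero and clamping to at least 1.
func visionImageSize(tileSize, height, width, downsample uint32) uint32 {
	tile := cmp.Or(tileSize, height, width, uint32(512))
	if downsample == 0 {
		return tile
	}
	return max(uint32(1), tile/downsample)
}

func main() {
	fmt.Println(visionImageSize(512, 0, 0, 2)) // explicit tile size: 512/2
	fmt.Println(visionImageSize(0, 0, 0, 2))   // all candidates zero: default 512/2
	fmt.Println(visionImageSize(0, 384, 0, 0)) // zero downsample: tile returned as-is
}
```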
func (p *lfm2VLTextModel) KV(t *Tokenizer) KV {
kv := p.textModel().KV(t)
boolOr := func(defaultValue bool, values ...*bool) bool {
for _, v := range values {
if v != nil {
return *v
}
}
return defaultValue
}
kv["vision.block_count"] = cmp.Or(p.VisionConfig.NumHiddenLayers, uint32(27))
kv["vision.embedding_length"] = cmp.Or(p.VisionConfig.HiddenSize, uint32(1152))
kv["vision.feed_forward_length"] = cmp.Or(p.VisionConfig.IntermediateSize, uint32(4304))
kv["vision.attention.head_count"] = cmp.Or(p.VisionConfig.NumAttentionHeads, uint32(16))
kv["vision.attention.layer_norm_epsilon"] = cmp.Or(p.VisionConfig.LayerNormEpsilon, float32(1e-6))
kv["vision.patch_size"] = cmp.Or(p.VisionConfig.PatchSize, p.EncoderPatchSize, uint32(16))
kv["vision.num_channels"] = cmp.Or(p.VisionConfig.NumChannels, uint32(3))
kv["vision.image_size"] = p.visionImageSize()
kv["vision.projector.scale_factor"] = cmp.Or(p.DownsampleFactor, p.Processor.ImageProcessor.DownsampleFactor, uint32(2))
kv["vision.projector.use_layernorm"] = p.ProjectorUseLayernorm
kv["vision.do_image_splitting"] = boolOr(true, p.DoImageSplitting, p.Processor.ImageProcessor.DoImageSplitting)
kv["vision.min_tiles"] = cmp.Or(p.MinTiles, p.Processor.ImageProcessor.MinTiles, uint32(2))
kv["vision.max_tiles"] = cmp.Or(p.MaxTiles, p.Processor.ImageProcessor.MaxTiles, uint32(10))
kv["vision.tile_size"] = cmp.Or(p.TileSize, p.Processor.ImageProcessor.TileSize, uint32(512))
kv["vision.min_image_tokens"] = cmp.Or(p.MinImageTokens, p.Processor.ImageProcessor.MinImageTokens, uint32(64))
kv["vision.max_image_tokens"] = cmp.Or(p.MaxImageTokens, p.Processor.ImageProcessor.MaxImageTokens, uint32(256))
kv["vision.max_pixels_tolerance"] = cmp.Or(p.MaxPixelsTolerance, p.Processor.ImageProcessor.MaxPixelsTol, float32(2.0))
kv["vision.use_thumbnail"] = boolOr(true, p.UseThumbnail, p.Processor.ImageProcessor.UseThumbnail)
kv["vision.use_image_special_tokens"] = boolOr(true, p.UseImageSpecialTokens)
kv["vision.image_mean"] = slices.Clone(defaultFloat32Slice(p.Processor.ImageProcessor.ImageMean, []float32{0.5, 0.5, 0.5}))
kv["vision.image_std"] = slices.Clone(defaultFloat32Slice(p.Processor.ImageProcessor.ImageStd, []float32{0.5, 0.5, 0.5}))
kv["vision.image_token_id"] = cmp.Or(p.ImageTokenID, uint32(396))
setVisionTokenID := func(k, token string) {
if t == nil || t.Vocabulary == nil {
return
}
for i, v := range t.Vocabulary.Tokens {
if v == token {
kv[k] = uint32(i)
return
}
}
}
setVisionTokenID("vision.image_start_token_id", "<|image_start|>")
setVisionTokenID("vision.image_end_token_id", "<|image_end|>")
setVisionTokenID("vision.image_thumbnail_token_id", "<|img_thumbnail|>")
return kv
}
func (p *lfm2VLTextModel) Tensors(ts []Tensor) []*ggml.Tensor {
patchSize := int(cmp.Or(p.VisionConfig.PatchSize, p.EncoderPatchSize, uint32(16)))
numChannels := int(cmp.Or(p.VisionConfig.NumChannels, uint32(3)))
for _, t := range ts {
if t.Name() == "v.patch_embd.weight" {
shape := t.Shape()
if len(shape) == 2 {
inputDim := uint64(numChannels * patchSize * patchSize)
if shape[1] == inputDim {
channels := numChannels
patch := patchSize
t.SetRepacker(func(_ string, data []float32, srcShape []uint64) ([]float32, error) {
return repackPatchEmbeddingWeight(data, srcShape, channels, patch)
})
}
}
}
}
out := p.textModel().Tensors(ts)
for _, t := range out {
if t.Name == "v.patch_embd.weight" && len(t.Shape) == 2 {
t.Shape = []uint64{t.Shape[0], uint64(numChannels), uint64(patchSize), uint64(patchSize)}
}
}
return out
}
func (p *lfm2VLTextModel) Replacements() []string {
out := make([]string, 0, 96)
addText := func(from, to string) {
out = append(out, from, to)
if strings.HasPrefix(from, "model.") {
suffix := strings.TrimPrefix(from, "model.")
out = append(out,
"model.language_model."+suffix, to,
"model.language_model.model."+suffix, to,
)
}
}
base := p.textModel().Replacements()
for i := 0; i+1 < len(base); i += 2 {
addText(base[i], base[i+1])
}
// Vision tower + multimodal projector tensors (single-file conversion).
out = append(out,
"model.vision_tower.vision_model.embeddings.patch_embedding", "v.patch_embd",
"model.vision_tower.vision_model.embeddings.position_embedding", "v.position_embd",
"model.vision_tower.vision_model.encoder.layers", "v.blk",
"model.vision_tower.vision_model.post_layernorm", "v.post_ln",
"model.multi_modal_projector.layer_norm", "mm.layer_norm",
"model.multi_modal_projector.linear_1", "mm.1",
"model.multi_modal_projector.linear_2", "mm.2",
"self_attn.q_proj", "attn_q",
"self_attn.k_proj", "attn_k",
"self_attn.v_proj", "attn_v",
"self_attn.out_proj", "attn_out",
"layer_norm1", "ln1",
"layer_norm2", "ln2",
"mlp.fc1", "ffn_up",
"mlp.fc2", "ffn_down",
)
return out
}
// lfm2VLProjectorModel converts the vision encoder + projector component of LFM2 VL checkpoints.
type lfm2VLProjectorModel struct {
ModelParameters
DownsampleFactor uint32 `json:"downsample_factor"`
ProjectorHiddenDim uint32 `json:"projector_hidden_size"`
VisionModel struct {
HiddenSize uint32 `json:"hidden_size"`
IntermediateSize uint32 `json:"intermediate_size"`
NumAttentionHeads uint32 `json:"num_attention_heads"`
NumHiddenLayers uint32 `json:"num_hidden_layers"`
NumChannels uint32 `json:"num_channels"`
PatchSize uint32 `json:"patch_size"`
LayerNormEpsilon float32 `json:"layer_norm_eps"`
ImageSize uint32 `json:"image_size"`
} `json:"vision_config"`
Processor struct {
ImageProcessor struct {
DownsampleFactor uint32 `json:"downsample_factor"`
TileSize uint32 `json:"tile_size"`
ImageMean []float32 `json:"image_mean"`
ImageStd []float32 `json:"image_std"`
Size struct {
Height uint32 `json:"height"`
Width uint32 `json:"width"`
} `json:"size"`
} `json:"image_processor"`
}
}
var (
_ ModelConverter = (*lfm2VLTextModel)(nil)
_ ModelConverter = (*lfm2VLProjectorModel)(nil)
_ moreParser = (*lfm2VLTextModel)(nil)
_ moreParser = (*lfm2VLProjectorModel)(nil)
)
func (p *lfm2VLProjectorModel) parseMore(fsys fs.FS) error {
bts, err := fs.ReadFile(fsys, "processor_config.json")
if err != nil {
if errors.Is(err, fs.ErrNotExist) {
return nil
}
return err
}
return json.Unmarshal(bts, &p.Processor)
}
func (p *lfm2VLProjectorModel) imageSize() uint32 {
if p.VisionModel.ImageSize > 0 {
return p.VisionModel.ImageSize
}
downsample := cmp.Or(p.DownsampleFactor, p.Processor.ImageProcessor.DownsampleFactor, uint32(2))
baseSize := cmp.Or(
p.Processor.ImageProcessor.TileSize,
p.Processor.ImageProcessor.Size.Height,
p.Processor.ImageProcessor.Size.Width,
uint32(256),
)
if downsample == 0 {
return baseSize
}
return max(uint32(1), baseSize/downsample)
}
func (p *lfm2VLProjectorModel) KV(_ *Tokenizer) KV {
kv := KV{
"general.architecture": "clip",
"general.type": "mmproj",
"general.file_type": uint32(1),
"general.quantization_version": uint32(2),
"clip.has_vision_encoder": true,
"clip.projector_type": "lfm2",
"clip.use_gelu": true,
}
kv["clip.vision.block_count"] = cmp.Or(p.VisionModel.NumHiddenLayers, uint32(27))
kv["clip.vision.embedding_length"] = cmp.Or(p.VisionModel.HiddenSize, uint32(1152))
kv["clip.vision.feed_forward_length"] = cmp.Or(p.VisionModel.IntermediateSize, uint32(4304))
kv["clip.vision.attention.head_count"] = cmp.Or(p.VisionModel.NumAttentionHeads, uint32(16))
kv["clip.vision.attention.layer_norm_epsilon"] = cmp.Or(p.VisionModel.LayerNormEpsilon, float32(1e-6))
kv["clip.vision.patch_size"] = cmp.Or(p.VisionModel.PatchSize, uint32(16))
kv["clip.vision.image_size"] = p.imageSize()
kv["clip.vision.projection_dim"] = cmp.Or(p.ProjectorHiddenDim, uint32(2048))
kv["clip.vision.projector.scale_factor"] = cmp.Or(p.DownsampleFactor, p.Processor.ImageProcessor.DownsampleFactor, uint32(2))
kv["clip.vision.image_mean"] = slices.Clone(defaultFloat32Slice(p.Processor.ImageProcessor.ImageMean, []float32{0.5, 0.5, 0.5}))
kv["clip.vision.image_std"] = slices.Clone(defaultFloat32Slice(p.Processor.ImageProcessor.ImageStd, []float32{0.5, 0.5, 0.5}))
return kv
}
func defaultFloat32Slice(v, fallback []float32) []float32 {
if len(v) > 0 {
return v
}
return fallback
}
func (p *lfm2VLProjectorModel) Tensors(ts []Tensor) []*ggml.Tensor {
var out []*ggml.Tensor
numChannels := cmp.Or(p.VisionModel.NumChannels, uint32(3))
patchSize := cmp.Or(p.VisionModel.PatchSize, uint32(16))
for _, t := range ts {
name := t.Name()
if !(strings.HasPrefix(name, "v.") || strings.HasPrefix(name, "mm.")) {
continue
}
shape := t.Shape()
if name == "v.patch_embd.weight" && len(shape) == 2 {
inputDim := uint64(numChannels * patchSize * patchSize)
if shape[1] == inputDim {
shape = []uint64{shape[0], uint64(numChannels), uint64(patchSize), uint64(patchSize)}
channels := int(numChannels)
patch := int(patchSize)
t.SetRepacker(func(_ string, data []float32, srcShape []uint64) ([]float32, error) {
return repackPatchEmbeddingWeight(data, srcShape, channels, patch)
})
}
}
out = append(out, &ggml.Tensor{
Name: name,
Kind: t.Kind(),
Shape: slices.Clone(shape),
WriterTo: t,
})
}
return out
}
func (p *lfm2VLProjectorModel) Replacements() []string {
return []string{
"model.multi_modal_projector.linear_1", "mm.1",
"model.multi_modal_projector.linear_2", "mm.2",
"model.vision_tower.vision_model.embeddings.patch_embedding", "v.patch_embd",
"model.vision_tower.vision_model.embeddings.position_embedding", "v.position_embd",
"model.vision_tower.vision_model.encoder.layers", "v.blk",
"self_attn.q_proj", "attn_q",
"self_attn.k_proj", "attn_k",
"self_attn.v_proj", "attn_v",
"self_attn.out_proj", "attn_out",
"layer_norm1", "ln1",
"layer_norm2", "ln2",
"mlp.fc1", "ffn_up",
"mlp.fc2", "ffn_down",
"model.vision_tower.vision_model.post_layernorm", "v.post_ln",
}
}
func repackPatchEmbeddingWeight(data []float32, srcShape []uint64, channels, patch int) ([]float32, error) {
if len(srcShape) != 2 {
return nil, fmt.Errorf("invalid patch embedding shape rank: %d", len(srcShape))
}
outDim := int(srcShape[0])
flatInputDim := int(srcShape[1])
expectedInputDim := channels * patch * patch
if flatInputDim != expectedInputDim {
return nil, fmt.Errorf("invalid patch embedding input dim: got %d, want %d", flatInputDim, expectedInputDim)
}
expectedSize := outDim * flatInputDim
if len(data) != expectedSize {
return nil, fmt.Errorf("invalid patch embedding data size: got %d, want %d", len(data), expectedSize)
}
repacked := make([]float32, len(data))
perChannel := patch * patch
for o := range outDim {
inBase := o * flatInputDim
outBase := o * flatInputDim
for y := range patch {
for x := range patch {
inPixelBase := inBase + (y*patch+x)*channels
for c := range channels {
src := inPixelBase + c
dst := outBase + c*perChannel + y*patch + x
repacked[dst] = data[src]
}
}
}
}
return repacked, nil
}


@@ -0,0 +1,249 @@
package convert
import (
"slices"
"strings"
"testing"
)
func TestLFM2VLTextModelKVUsesTextConfig(t *testing.T) {
p := lfm2VLTextModel{
TextConfig: lfm2Model{
ModelParameters: ModelParameters{ModelType: "lfm2", VocabSize: 65536},
HiddenSize: 2048,
NumHiddenLayers: 16,
MaxPositionEmbeddings: 128000,
IntermediateSize: 12288,
BlockFFDim: 12288,
BlockAutoAdjustFFDim: true,
BlockMultipleOf: 256,
BlockFFNDimMultiplier: 1.0,
NumAttentionHeads: 32,
NumKeyValueHeads: 8,
LayerTypes: []string{"conv", "full_attention"},
NormEps: 1e-5,
ConvLCache: 3,
},
DownsampleFactor: 2,
VisionConfig: struct {
HiddenSize uint32 `json:"hidden_size"`
IntermediateSize uint32 `json:"intermediate_size"`
NumAttentionHeads uint32 `json:"num_attention_heads"`
NumHiddenLayers uint32 `json:"num_hidden_layers"`
NumChannels uint32 `json:"num_channels"`
PatchSize uint32 `json:"patch_size"`
LayerNormEpsilon float32 `json:"layer_norm_eps"`
}{
HiddenSize: 1152,
IntermediateSize: 4304,
NumAttentionHeads: 16,
NumHiddenLayers: 27,
NumChannels: 3,
PatchSize: 16,
LayerNormEpsilon: 1e-6,
},
}
p.Processor.ImageProcessor.TileSize = 512
p.Processor.ImageProcessor.ImageMean = []float32{0.5, 0.5, 0.5}
p.Processor.ImageProcessor.ImageStd = []float32{0.5, 0.5, 0.5}
kv := p.KV(&Tokenizer{
Vocabulary: &Vocabulary{
Model: "gpt2",
Tokens: []string{"<|pad|>", "<image>", "<|image_start|>", "<|image_end|>", "<|img_thumbnail|>"},
},
})
if got, want := kv["general.architecture"], "lfm2"; got != want {
t.Fatalf("general.architecture = %v, want %v", got, want)
}
if got, want := kv["feed_forward_length"], uint32(8192); got != want {
t.Fatalf("feed_forward_length = %v, want %v", got, want)
}
if got, want := kv["vision.block_count"], uint32(27); got != want {
t.Fatalf("vision.block_count = %v, want %v", got, want)
}
if got, want := kv["vision.image_size"], uint32(256); got != want {
t.Fatalf("vision.image_size = %v, want %v", got, want)
}
if got, want := kv["vision.image_token_id"], uint32(396); got != want {
t.Fatalf("vision.image_token_id = %v, want %v", got, want)
}
if got, want := kv["vision.image_start_token_id"], uint32(2); got != want {
t.Fatalf("vision.image_start_token_id = %v, want %v", got, want)
}
if got, want := kv["vision.do_image_splitting"], true; got != want {
t.Fatalf("vision.do_image_splitting = %v, want %v", got, want)
}
if got, want := kv["vision.min_tiles"], uint32(2); got != want {
t.Fatalf("vision.min_tiles = %v, want %v", got, want)
}
if got, want := kv["vision.max_tiles"], uint32(10); got != want {
t.Fatalf("vision.max_tiles = %v, want %v", got, want)
}
if got, want := kv["vision.tile_size"], uint32(512); got != want {
t.Fatalf("vision.tile_size = %v, want %v", got, want)
}
if got, want := kv["vision.use_thumbnail"], true; got != want {
t.Fatalf("vision.use_thumbnail = %v, want %v", got, want)
}
if got, want := kv["vision.use_image_special_tokens"], true; got != want {
t.Fatalf("vision.use_image_special_tokens = %v, want %v", got, want)
}
}
func TestLFM2VLTextModelTensorsIncludeVision(t *testing.T) {
p := lfm2VLTextModel{}
p.VisionConfig.PatchSize = 16
p.VisionConfig.NumChannels = 3
input := []Tensor{
newLFM2StubTensor("model.embed_tokens.weight", []uint64{65536, 2048}),
newLFM2StubTensor("model.layers.0.ffn_norm.weight", []uint64{2048}),
newLFM2StubTensor("v.patch_embd.weight", []uint64{1152, 768}),
newLFM2StubTensor("v.blk.0.attn_q.weight", []uint64{1152, 1152}),
newLFM2StubTensor("mm.1.weight", []uint64{2048, 4608}),
}
out := p.Tensors(input)
if len(out) == 0 {
t.Fatal("expected non-empty tensor list")
}
foundPatch := false
foundVision := false
for _, tns := range out {
if tns.Name == "v.patch_embd.weight" {
foundPatch = true
if !slices.Equal(tns.Shape, []uint64{1152, 3, 16, 16}) {
t.Fatalf("v.patch_embd.weight shape = %v, want [1152 3 16 16]", tns.Shape)
}
}
if strings.HasPrefix(tns.Name, "v.") || strings.HasPrefix(tns.Name, "mm.") {
foundVision = true
}
}
if !foundPatch {
t.Fatal("expected v.patch_embd.weight in output tensors")
}
if !foundVision {
t.Fatal("expected at least one vision/projector tensor in output")
}
}
func TestLFM2VLTextModelReplacements(t *testing.T) {
p := lfm2VLTextModel{}
r := strings.NewReplacer(p.Replacements()...)
tests := []struct {
name string
in string
want string
}{
{
name: "language_model_embed_tokens",
in: "model.language_model.embed_tokens.weight",
want: "token_embd.weight",
},
{
name: "language_model_layers",
in: "model.language_model.layers.2.self_attn.q_proj.weight",
want: "blk.2.attn_q.weight",
},
{
name: "nested_language_model_prefix",
in: "model.language_model.model.embedding_norm.weight",
want: "token_embd_norm.weight",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
if got := r.Replace(tt.in); got != tt.want {
t.Fatalf("replacement(%q) = %q, want %q", tt.in, got, tt.want)
}
})
}
}
func TestLFM2VLProjectorKV(t *testing.T) {
p := lfm2VLProjectorModel{
DownsampleFactor: 2,
ProjectorHiddenDim: 2048,
}
p.VisionModel.NumHiddenLayers = 27
p.VisionModel.HiddenSize = 1152
p.VisionModel.IntermediateSize = 4304
p.VisionModel.NumAttentionHeads = 16
p.VisionModel.PatchSize = 16
p.VisionModel.LayerNormEpsilon = 1e-6
p.Processor.ImageProcessor.TileSize = 512
p.Processor.ImageProcessor.ImageMean = []float32{0.5, 0.5, 0.5}
p.Processor.ImageProcessor.ImageStd = []float32{0.5, 0.5, 0.5}
kv := p.KV(nil)
if got, want := kv["general.architecture"], "clip"; got != want {
t.Fatalf("general.architecture = %v, want %v", got, want)
}
if got, want := kv["clip.projector_type"], "lfm2"; got != want {
t.Fatalf("clip.projector_type = %v, want %v", got, want)
}
if got, want := kv["clip.vision.image_size"], uint32(256); got != want {
t.Fatalf("clip.vision.image_size = %v, want %v", got, want)
}
}
func TestLFM2VLProjectorTensorsPatchReshape(t *testing.T) {
p := lfm2VLProjectorModel{}
p.VisionModel.NumChannels = 3
p.VisionModel.PatchSize = 16
input := []Tensor{
newLFM2StubTensor("v.patch_embd.weight", []uint64{1152, 768}),
newLFM2StubTensor("mm.1.weight", []uint64{2048, 4608}),
newLFM2StubTensor("model.embed_tokens.weight", []uint64{65536, 2048}),
}
out := p.Tensors(input)
if len(out) != 2 {
t.Fatalf("expected 2 tensors, got %d", len(out))
}
var patchShape []uint64
for _, tns := range out {
if tns.Name == "v.patch_embd.weight" {
patchShape = tns.Shape
break
}
}
if !slices.Equal(patchShape, []uint64{1152, 3, 16, 16}) {
t.Fatalf("v.patch_embd.weight shape = %v, want [1152 3 16 16]", patchShape)
}
}
func TestRepackPatchEmbeddingWeight(t *testing.T) {
data := []float32{
0, 1, // y=0,x=0
2, 3, // y=0,x=1
4, 5, // y=1,x=0
6, 7, // y=1,x=1
}
got, err := repackPatchEmbeddingWeight(data, []uint64{1, 8}, 2, 2)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
want := []float32{0, 2, 4, 6, 1, 3, 5, 7}
if !slices.Equal(got, want) {
t.Fatalf("repacked data = %v, want %v", got, want)
}
}


@@ -0,0 +1,385 @@
package convert
import (
"cmp"
"encoding/json"
"fmt"
"io/fs"
"math"
"slices"
"strings"
"github.com/ollama/ollama/fs/ggml"
)
type hybridPattern string
func (p *hybridPattern) UnmarshalJSON(data []byte) error {
if string(data) == "null" {
*p = ""
return nil
}
var single string
if err := json.Unmarshal(data, &single); err == nil {
*p = hybridPattern(strings.TrimSpace(single))
return nil
}
var parts []string
if err := json.Unmarshal(data, &parts); err == nil {
*p = hybridPattern(strings.Join(parts, ""))
return nil
}
return fmt.Errorf("hybrid_override_pattern must be a string or string array")
}
type nemotronHModel struct {
ModelParameters
MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
HiddenSize uint32 `json:"hidden_size"`
NumHiddenLayers uint32 `json:"num_hidden_layers"`
NumAttentionHeads uint32 `json:"num_attention_heads"`
NumKeyValueHeads uint32 `json:"num_key_value_heads"`
HeadDim uint32 `json:"head_dim"`
LayerNormEpsilon float32 `json:"layer_norm_epsilon"`
NormEpsilon float32 `json:"norm_eps"`
RopeTheta float32 `json:"rope_theta"`
PartialRotaryFactor float32 `json:"partial_rotary_factor"`
ConvKernel uint32 `json:"conv_kernel"`
SSMStateSize uint32 `json:"ssm_state_size"`
MambaNumHeads uint32 `json:"mamba_num_heads"`
MambaHeadDim uint32 `json:"mamba_head_dim"`
NGroups uint32 `json:"n_groups"`
IntermediateSize uint32 `json:"intermediate_size"`
HybridOverridePattern hybridPattern `json:"hybrid_override_pattern"`
// MoE
NumExperts uint32 `json:"num_experts"`
NumSharedExperts uint32 `json:"num_shared_experts"`
NRoutedExperts uint32 `json:"n_routed_experts"`
NSharedExperts uint32 `json:"n_shared_experts"`
NumExpertsPerTok uint32 `json:"num_experts_per_tok"`
MoEIntermediateSize uint32 `json:"moe_intermediate_size"`
MoESharedExpertIntermediate uint32 `json:"moe_shared_expert_intermediate_size"`
NormTopKProb bool `json:"norm_topk_prob"`
RoutedScalingFactor float32 `json:"routed_scaling_factor"`
ExpertGroupCount uint32 `json:"n_group"`
ExpertGroupUsedCount uint32 `json:"topk_group"`
}
var _ ModelConverter = (*nemotronHModel)(nil)
func (n *nemotronHModel) parseMore(_ fs.FS) error {
if n.NumHiddenLayers == 0 {
return fmt.Errorf("nemotron_h: num_hidden_layers must be set")
}
if n.HiddenSize == 0 {
return fmt.Errorf("nemotron_h: hidden_size must be set")
}
if n.NumAttentionHeads == 0 {
return fmt.Errorf("nemotron_h: num_attention_heads must be set")
}
if n.HeadDim == 0 {
if n.HiddenSize%n.NumAttentionHeads != 0 {
return fmt.Errorf("nemotron_h: hidden_size (%d) must be divisible by num_attention_heads (%d)", n.HiddenSize, n.NumAttentionHeads)
}
n.HeadDim = n.HiddenSize / n.NumAttentionHeads
}
if n.NumKeyValueHeads == 0 {
n.NumKeyValueHeads = n.NumAttentionHeads
}
if n.ConvKernel == 0 {
return fmt.Errorf("nemotron_h: conv_kernel must be set")
}
if n.SSMStateSize == 0 {
return fmt.Errorf("nemotron_h: ssm_state_size must be set")
}
if n.ssmHeadCount() == 0 {
return fmt.Errorf("nemotron_h: mamba_num_heads must be set")
}
if n.MambaHeadDim == 0 {
return fmt.Errorf("nemotron_h: mamba_head_dim must be set")
}
if n.NGroups == 0 {
n.NGroups = 1
}
if _, _, err := n.layerArrays(); err != nil {
return err
}
if n.isMoE() {
if n.routedExpertCount() == 0 {
return fmt.Errorf("nemotron_h: routed expert count must be set for MoE models")
}
if n.NumExpertsPerTok == 0 {
return fmt.Errorf("nemotron_h: num_experts_per_tok must be set for MoE models")
}
if n.NumExpertsPerTok > n.routedExpertCount() {
return fmt.Errorf("nemotron_h: num_experts_per_tok (%d) cannot exceed expert_count (%d)", n.NumExpertsPerTok, n.routedExpertCount())
}
if n.moeIntermediateSize() == 0 {
return fmt.Errorf("nemotron_h: moe_intermediate_size must be set for MoE models")
}
}
return nil
}
func (n *nemotronHModel) isMoE() bool {
return cmp.Or(n.routedExpertCount(), n.NumExpertsPerTok, n.MoEIntermediateSize) > 0
}
func (n *nemotronHModel) routedExpertCount() uint32 {
return cmp.Or(n.NRoutedExperts, n.NumExperts)
}
func (n *nemotronHModel) sharedExpertCount() uint32 {
return cmp.Or(n.NSharedExperts, n.NumSharedExperts)
}
func (n *nemotronHModel) ssmHeadCount() uint32 {
return n.MambaNumHeads
}
func (n *nemotronHModel) ssmInnerSize() uint32 {
return n.MambaHeadDim * n.ssmHeadCount()
}
func (n *nemotronHModel) epsilon() float32 {
return cmp.Or(n.NormEpsilon, n.LayerNormEpsilon, float32(1e-5))
}
func (n *nemotronHModel) moeIntermediateSize() uint32 {
return cmp.Or(n.MoEIntermediateSize, n.IntermediateSize)
}
func (n *nemotronHModel) denseIntermediateSize() uint32 {
return cmp.Or(n.IntermediateSize, n.MoEIntermediateSize)
}
func (n *nemotronHModel) layerArrays() (headCountKV []uint32, ffnLengths []uint32, err error) {
pattern := strings.TrimSpace(string(n.HybridOverridePattern))
if pattern == "" {
return nil, nil, fmt.Errorf("nemotron_h: hybrid_override_pattern must be set")
}
runes := []rune(pattern)
if len(runes) != int(n.NumHiddenLayers) {
return nil, nil, fmt.Errorf("nemotron_h: hybrid_override_pattern length (%d) must match num_hidden_layers (%d)", len(runes), n.NumHiddenLayers)
}
headCountKV = make([]uint32, n.NumHiddenLayers)
ffnLengths = make([]uint32, n.NumHiddenLayers)
attnKVHeads := cmp.Or(n.NumKeyValueHeads, n.NumAttentionHeads)
moeFFN := n.moeIntermediateSize()
denseFFN := n.denseIntermediateSize()
for i, layerType := range runes {
switch layerType {
case 'M':
// Recurrent layer: no KV heads and no FFN.
case '*', 'A':
// Attention-only layer.
headCountKV[i] = attnKVHeads
case 'E':
// MoE layer.
if moeFFN == 0 {
return nil, nil, fmt.Errorf("nemotron_h: moe layer at index %d but moe_intermediate_size is zero", i)
}
ffnLengths[i] = moeFFN
case '-':
// Dense FFN layer.
if denseFFN == 0 {
return nil, nil, fmt.Errorf("nemotron_h: dense FFN layer at index %d but intermediate_size is zero", i)
}
ffnLengths[i] = denseFFN
default:
return nil, nil, fmt.Errorf("nemotron_h: unsupported layer type %q in hybrid_override_pattern at index %d", layerType, i)
}
}
return headCountKV, ffnLengths, nil
}
func (n *nemotronHModel) KV(t *Tokenizer) KV {
kv := n.ModelParameters.KV(t)
arch := "nemotron_h"
if n.isMoE() {
arch = "nemotron_h_moe"
}
kv["general.architecture"] = arch
kv["block_count"] = n.NumHiddenLayers
kv["context_length"] = n.MaxPositionEmbeddings
kv["embedding_length"] = n.HiddenSize
kv["attention.head_count"] = n.NumAttentionHeads
kv["attention.key_length"] = n.HeadDim
kv["attention.value_length"] = n.HeadDim
kv["attention.layer_norm_epsilon"] = n.epsilon()
kv["attention.layer_norm_rms_epsilon"] = n.epsilon()
kv["rope.freq_base"] = cmp.Or(n.RopeTheta, float32(10000))
if n.PartialRotaryFactor > 0 && n.PartialRotaryFactor <= 1 {
kv["rope.dimension_count"] = uint32(float32(n.HeadDim) * n.PartialRotaryFactor)
}
if headCountKV, ffnLengths, err := n.layerArrays(); err == nil {
kv["attention.head_count_kv"] = headCountKV
kv["feed_forward_length"] = ffnLengths
}
kv["ssm.conv_kernel"] = n.ConvKernel
kv["ssm.inner_size"] = n.ssmInnerSize()
kv["ssm.state_size"] = n.SSMStateSize
kv["ssm.group_count"] = n.NGroups
kv["ssm.time_step_rank"] = n.ssmHeadCount()
if n.isMoE() {
kv["expert_count"] = n.routedExpertCount()
kv["expert_used_count"] = n.NumExpertsPerTok
kv["expert_feed_forward_length"] = n.moeIntermediateSize()
if n.sharedExpertCount() > 0 {
kv["expert_shared_count"] = n.sharedExpertCount()
}
if n.MoESharedExpertIntermediate > 0 {
kv["expert_shared_feed_forward_length"] = n.MoESharedExpertIntermediate
}
kv["expert_weights_norm"] = n.NormTopKProb
kv["expert_weights_scale"] = n.RoutedScalingFactor
if n.ExpertGroupCount > 0 {
kv["expert_group_count"] = n.ExpertGroupCount
}
if n.ExpertGroupUsedCount > 0 {
kv["expert_group_used_count"] = n.ExpertGroupUsedCount
}
}
return kv
}
func normalizeVectorShapeToColumn(shape []uint64) []uint64 {
switch len(shape) {
case 1:
return []uint64{shape[0], 1}
case 2:
if shape[0] == 1 && shape[1] > 1 {
return []uint64{shape[1], 1}
}
if shape[1] == 1 && shape[0] > 1 {
return []uint64{shape[0], 1}
}
}
return slices.Clone(shape)
}
func (n *nemotronHModel) Tensors(ts []Tensor) []*ggml.Tensor {
var out []*ggml.Tensor
remaining := ts
if n.isMoE() {
merges := make([]merge, 0, n.NumHiddenLayers*2)
for i := range n.NumHiddenLayers {
merges = append(merges, merge{
fmt.Sprintf("blk.%d.mixer.experts.*.up_proj.weight", i),
fmt.Sprintf("blk.%d.ffn_up_exps.weight", i),
}, merge{
fmt.Sprintf("blk.%d.mixer.experts.*.down_proj.weight", i),
fmt.Sprintf("blk.%d.ffn_down_exps.weight", i),
})
}
merged, rest := mergeTensors(ts, merges...)
out = append(out, merged...)
remaining = rest
}
nGroups := uint64(cmp.Or(n.NGroups, uint32(1)))
for _, t := range remaining {
name := t.Name()
shape := slices.Clone(t.Shape())
switch {
case strings.HasSuffix(name, ".ssm_a"):
shape = normalizeVectorShapeToColumn(shape)
t.SetRepacker(func(_ string, data []float32, _ []uint64) ([]float32, error) {
out := make([]float32, len(data))
for i, v := range data {
out[i] = -float32(math.Exp(float64(v)))
}
return out, nil
})
case strings.HasSuffix(name, ".ssm_d"):
shape = normalizeVectorShapeToColumn(shape)
case strings.HasSuffix(name, ".ssm_norm.weight"):
switch len(shape) {
case 1:
if nGroups > 0 && shape[0]%nGroups == 0 {
shape = []uint64{nGroups, shape[0] / nGroups}
}
case 2:
if shape[0] == 1 && nGroups > 0 && shape[1]%nGroups == 0 {
shape = []uint64{nGroups, shape[1] / nGroups}
}
}
case strings.HasSuffix(name, ".ssm_conv1d.weight"):
if len(shape) == 3 {
if shape[0] == 1 {
shape = []uint64{shape[1], shape[2]}
} else if shape[1] == 1 {
shape = []uint64{shape[0], shape[2]}
}
}
}
out = append(out, &ggml.Tensor{
Name: name,
Kind: t.Kind(),
Shape: shape,
WriterTo: t,
})
}
return out
}
func (n *nemotronHModel) Replacements() []string {
return []string{
// Embedding and output
"lm_head", "output",
"backbone.embeddings", "token_embd",
"backbone.norm_f", "output_norm",
"backbone.layers", "blk",
// Recurrent (Mamba2) tensors
"mixer.in_proj", "ssm_in",
"mixer.out_proj", "ssm_out",
"mixer.dt_bias", "ssm_dt.bias",
"mixer.A_log", "ssm_a",
"mixer.D", "ssm_d",
"mixer.conv1d", "ssm_conv1d",
"mixer.norm.weight", "ssm_norm.weight",
// Attention tensors
"mixer.q_proj", "attn_q",
"mixer.k_proj", "attn_k",
"mixer.v_proj", "attn_v",
"mixer.o_proj", "attn_output",
// FFN / MoE tensors
"mixer.gate.e_score_correction_bias", "exp_probs_b.bias",
"mixer.gate", "ffn_gate_inp",
"mixer.fc1_latent_proj", "ffn_latent_in",
"mixer.fc2_latent_proj", "ffn_latent_out",
"mixer.shared_experts.up_proj", "ffn_up_shexp",
"mixer.shared_experts.down_proj", "ffn_down_shexp",
"mixer.up_proj", "ffn_up",
"mixer.down_proj", "ffn_down",
// Per-layer pre-norm
".norm.weight", ".attn_norm.weight",
}
}


@@ -0,0 +1,230 @@
package convert
import (
"bytes"
"encoding/binary"
"encoding/json"
"io"
"os"
"path/filepath"
"slices"
"strings"
"testing"
)
func TestHybridPatternUnmarshal(t *testing.T) {
t.Run("string", func(t *testing.T) {
var p hybridPattern
if err := json.Unmarshal([]byte(`"MEM*"`), &p); err != nil {
t.Fatal(err)
}
if got, want := string(p), "MEM*"; got != want {
t.Fatalf("unexpected pattern: got %q want %q", got, want)
}
})
t.Run("array", func(t *testing.T) {
var p hybridPattern
if err := json.Unmarshal([]byte(`["M","E","M","*"]`), &p); err != nil {
t.Fatal(err)
}
if got, want := string(p), "MEM*"; got != want {
t.Fatalf("unexpected pattern: got %q want %q", got, want)
}
})
}
func TestNemotronHLayerArrays(t *testing.T) {
m := &nemotronHModel{
NumHiddenLayers: 5,
NumAttentionHeads: 32,
NumKeyValueHeads: 8,
HybridOverridePattern: "MEM*E",
NRoutedExperts: 128,
NumExpertsPerTok: 6,
MoEIntermediateSize: 1856,
}
headsKV, ffn, err := m.layerArrays()
if err != nil {
t.Fatal(err)
}
if got, want := headsKV, []uint32{0, 0, 0, 8, 0}; !slices.Equal(got, want) {
t.Fatalf("unexpected head_count_kv: got %v want %v", got, want)
}
if got, want := ffn, []uint32{0, 1856, 0, 0, 1856}; !slices.Equal(got, want) {
t.Fatalf("unexpected feed_forward_length: got %v want %v", got, want)
}
}
func TestNemotronHKV(t *testing.T) {
m := &nemotronHModel{
MaxPositionEmbeddings: 1048576,
HiddenSize: 2688,
NumHiddenLayers: 5,
NumAttentionHeads: 32,
NumKeyValueHeads: 2,
HeadDim: 128,
LayerNormEpsilon: 1e-5,
RopeTheta: 10000,
PartialRotaryFactor: 0.5,
ConvKernel: 4,
SSMStateSize: 128,
MambaNumHeads: 64,
MambaHeadDim: 64,
NGroups: 8,
HybridOverridePattern: "MEM*E",
NRoutedExperts: 128,
NSharedExperts: 1,
NumExpertsPerTok: 6,
MoEIntermediateSize: 1856,
MoESharedExpertIntermediate: 3712,
NormTopKProb: true,
RoutedScalingFactor: 2.5,
}
if err := m.parseMore(nil); err != nil {
t.Fatal(err)
}
kv := m.KV(&Tokenizer{Vocabulary: &Vocabulary{}})
if got, want := kv["general.architecture"], "nemotron_h_moe"; got != want {
t.Fatalf("unexpected architecture: got %v want %v", got, want)
}
headCountKV, ok := kv["attention.head_count_kv"].([]uint32)
if !ok {
t.Fatalf("attention.head_count_kv has unexpected type: %T", kv["attention.head_count_kv"])
}
if got, want := headCountKV, []uint32{0, 0, 0, 2, 0}; !slices.Equal(got, want) {
t.Fatalf("unexpected attention.head_count_kv: got %v want %v", got, want)
}
ffnLength, ok := kv["feed_forward_length"].([]uint32)
if !ok {
t.Fatalf("feed_forward_length has unexpected type: %T", kv["feed_forward_length"])
}
if got, want := ffnLength, []uint32{0, 1856, 0, 0, 1856}; !slices.Equal(got, want) {
t.Fatalf("unexpected feed_forward_length: got %v want %v", got, want)
}
}
func TestNemotronHTensorsTransforms(t *testing.T) {
m := &nemotronHModel{NGroups: 8}
in := []Tensor{
&fakeTensor{
name: "blk.0.ssm_a",
shape: []uint64{4},
data: []float32{0, 1, 2, 3},
},
&fakeTensor{
name: "blk.0.ssm_d",
shape: []uint64{4},
data: []float32{0, 1, 2, 3},
},
&fakeTensor{
name: "blk.0.ssm_norm.weight",
shape: []uint64{16},
data: make([]float32, 16),
},
&fakeTensor{
name: "blk.0.ssm_conv1d.weight",
shape: []uint64{10, 1, 4},
data: make([]float32, 40),
},
}
out := m.Tensors(in)
if len(out) != len(in) {
t.Fatalf("unexpected output tensor count: got %d want %d", len(out), len(in))
}
got := map[string]struct {
shape []uint64
writer io.WriterTo
}{}
for _, t := range out {
got[t.Name] = struct {
shape []uint64
writer io.WriterTo
}{shape: t.Shape, writer: t.WriterTo}
}
if shape := got["blk.0.ssm_a"].shape; !slices.Equal(shape, []uint64{4, 1}) {
t.Fatalf("unexpected ssm_a shape: %v", shape)
}
if shape := got["blk.0.ssm_d"].shape; !slices.Equal(shape, []uint64{4, 1}) {
t.Fatalf("unexpected ssm_d shape: %v", shape)
}
if shape := got["blk.0.ssm_norm.weight"].shape; !slices.Equal(shape, []uint64{8, 2}) {
t.Fatalf("unexpected ssm_norm shape: %v", shape)
}
if shape := got["blk.0.ssm_conv1d.weight"].shape; !slices.Equal(shape, []uint64{10, 4}) {
t.Fatalf("unexpected ssm_conv1d shape: %v", shape)
}
var b bytes.Buffer
if _, err := got["blk.0.ssm_a"].writer.WriteTo(&b); err != nil {
t.Fatal(err)
}
values := make([]float32, 4)
if err := binary.Read(&b, binary.LittleEndian, &values); err != nil {
t.Fatal(err)
}
// 0 -> -exp(0) == -1
if values[0] != -1 {
t.Fatalf("unexpected transformed ssm_a[0]: got %v want -1", values[0])
}
}
func TestNemotronHLoadModelMetadata(t *testing.T) {
tempDir := t.TempDir()
config := `{
"architectures": ["NemotronHForCausalLM"],
"model_type": "nemotron_h",
"num_hidden_layers": 4,
"hidden_size": 512,
"max_position_embeddings": 32768,
"num_attention_heads": 8,
"num_key_value_heads": 2,
"head_dim": 64,
"layer_norm_epsilon": 1e-5,
"conv_kernel": 4,
"ssm_state_size": 128,
"mamba_num_heads": 16,
"mamba_head_dim": 32,
"n_groups": 8,
"hybrid_override_pattern": "ME*M",
"n_routed_experts": 16,
"num_experts_per_tok": 4,
"moe_intermediate_size": 256
}`
if err := os.WriteFile(filepath.Join(tempDir, "config.json"), []byte(config), 0o644); err != nil {
t.Fatal(err)
}
if err := os.WriteFile(filepath.Join(tempDir, "tokenizer.json"), []byte(`{}`), 0o644); err != nil {
t.Fatal(err)
}
kv, _, err := LoadModelMetadata(os.DirFS(tempDir))
if err != nil {
t.Fatal(err)
}
if _, ok := kv.(*nemotronHModel); !ok {
t.Fatalf("unexpected converter type: %T", kv)
}
}
func TestNemotronHReplacementsLatentProjections(t *testing.T) {
m := &nemotronHModel{}
r := strings.NewReplacer(m.Replacements()...)
if got, want := r.Replace("backbone.layers.1.mixer.fc1_latent_proj.weight"), "blk.1.ffn_latent_in.weight"; got != want {
t.Fatalf("unexpected fc1 replacement: got %q want %q", got, want)
}
if got, want := r.Replace("backbone.layers.1.mixer.fc2_latent_proj.weight"), "blk.1.ffn_latent_out.weight"; got != want {
t.Fatalf("unexpected fc2 replacement: got %q want %q", got, want)
}
}

@@ -1,6 +1,7 @@
package convert
import (
"encoding/json"
"fmt"
"io/fs"
"math"
@@ -13,8 +14,21 @@ import (
"github.com/ollama/ollama/fs/ggml"
)
type qwen3NextRopeScaling struct {
Type string `json:"type"`
Factor ropeFactor `json:"factor"`
MropeSection []int32 `json:"mrope_section"`
}
type qwen3NextRopeParams struct {
MRopeInterleaved bool `json:"mrope_interleaved"`
MropeSection []int32 `json:"mrope_section"`
RopeType string `json:"rope_type"`
RopeTheta float32 `json:"rope_theta"`
PartialRotaryFactor float32 `json:"partial_rotary_factor"`
}
type qwen3NextTextConfig struct {
MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
HiddenSize uint32 `json:"hidden_size"`
NumHiddenLayers uint32 `json:"num_hidden_layers"`
@@ -28,12 +42,13 @@ type qwen3NextModel struct {
// MoE config
NumExperts uint32 `json:"num_experts"`
NumExpertsPerToken uint32 `json:"num_experts_per_tok"`
NormTopkProb *bool `json:"norm_topk_prob"`
MoEIntermediateSize uint32 `json:"moe_intermediate_size"`
SharedExpertIntermSize uint32 `json:"shared_expert_intermediate_size"`
// Hybrid attention config
FullAttentionInterval uint32 `json:"full_attention_interval"`
LayerTypes []string `json:"layer_types"`
// Linear attention (Gated Delta Net) config
LinearConvKernelDim uint32 `json:"linear_conv_kernel_dim"`
@@ -43,16 +58,102 @@ type qwen3NextModel struct {
LinearValueHeadDim uint32 `json:"linear_value_head_dim"`
// RoPE config
PartialRotaryFactor float32 `json:"partial_rotary_factor"`
RopeScaling qwen3NextRopeScaling `json:"rope_scaling"`
RopeParameters qwen3NextRopeParams `json:"rope_parameters"`
}
type qwen3NextVisionConfig struct {
Depth uint32 `json:"depth"`
HiddenSize uint32 `json:"hidden_size"`
NumHeads uint32 `json:"num_heads"`
InChannels uint32 `json:"in_channels"`
PatchSize uint32 `json:"patch_size"`
SpatialMergeSize uint32 `json:"spatial_merge_size"`
RMSNormEps float32 `json:"layer_norm_epsilon"`
RopeTheta float32 `json:"rope_theta"`
TemporalPatchSize uint32 `json:"temporal_patch_size"`
DeepstackVisualIndexes []int32 `json:"deepstack_visual_indexes"`
Size struct {
ShortestEdge uint32 `json:"shortest_edge"`
LongestEdge uint32 `json:"longest_edge"`
} `json:"size"`
ImageMean []float32 `json:"image_mean"`
ImageStd []float32 `json:"image_std"`
}
type qwen3NextModel struct {
ModelParameters
qwen3NextTextConfig
TextConfig *qwen3NextTextConfig `json:"text_config"`
VisionModel qwen3NextVisionConfig `json:"vision_config"`
ImageTokenID uint32 `json:"image_token_id"`
VisionStartTokenID uint32 `json:"vision_start_token_id"`
VisionEndTokenID uint32 `json:"vision_end_token_id"`
}
var _ ModelConverter = (*qwen3NextModel)(nil)
func (q *qwen3NextModel) parseMore(fsys fs.FS) error {
if q.TextConfig != nil {
q.qwen3NextTextConfig = *q.TextConfig
}
if q.RopeTheta == 0 {
q.RopeTheta = q.RopeParameters.RopeTheta
}
if q.PartialRotaryFactor == 0 {
q.PartialRotaryFactor = q.RopeParameters.PartialRotaryFactor
}
if q.RopeScaling.Type == "" && q.RopeParameters.RopeType != "" {
q.RopeScaling.Type = q.RopeParameters.RopeType
}
// Pull vision preprocessing fields when present.
if q.VisionModel.Depth > 0 {
if bts, err := fs.ReadFile(fsys, "preprocessor_config.json"); err == nil {
var pre struct {
Size struct {
ShortestEdge uint32 `json:"shortest_edge"`
LongestEdge uint32 `json:"longest_edge"`
} `json:"size"`
PatchSize uint32 `json:"patch_size"`
TemporalPatchSize uint32 `json:"temporal_patch_size"`
MergeSize uint32 `json:"merge_size"`
ImageMean []float32 `json:"image_mean"`
ImageStd []float32 `json:"image_std"`
}
if json.Unmarshal(bts, &pre) == nil {
if q.VisionModel.PatchSize == 0 {
q.VisionModel.PatchSize = pre.PatchSize
}
if q.VisionModel.TemporalPatchSize == 0 {
q.VisionModel.TemporalPatchSize = pre.TemporalPatchSize
}
if q.VisionModel.SpatialMergeSize == 0 {
q.VisionModel.SpatialMergeSize = pre.MergeSize
}
if q.VisionModel.Size.ShortestEdge == 0 {
q.VisionModel.Size.ShortestEdge = pre.Size.ShortestEdge
}
if q.VisionModel.Size.LongestEdge == 0 {
q.VisionModel.Size.LongestEdge = pre.Size.LongestEdge
}
if len(q.VisionModel.ImageMean) == 0 {
q.VisionModel.ImageMean = pre.ImageMean
}
if len(q.VisionModel.ImageStd) == 0 {
q.VisionModel.ImageStd = pre.ImageStd
}
}
}
}
if q.NumHiddenLayers == 0 {
return fmt.Errorf("qwen3next: num_hidden_layers must be set")
}
@@ -74,36 +175,96 @@ func (q *qwen3NextModel) parseMore(_ fs.FS) error {
if q.LinearNumKeyHeads == 0 || q.LinearNumValueHeads == 0 || q.LinearKeyHeadDim == 0 || q.LinearValueHeadDim == 0 {
return fmt.Errorf("qwen3next: linear attention config must be set (linear_num_key_heads, linear_num_value_heads, linear_key_head_dim, linear_value_head_dim)")
}
if _, err := q.kvHeadCounts(); err != nil {
return err
}
return nil
}
func (q *qwen3NextModel) kvHeadCounts() ([]uint32, error) {
if len(q.LayerTypes) > 0 {
kv := make([]uint32, q.NumHiddenLayers)
hasFull := false
hasRecurrent := false
for i := range q.NumHiddenLayers {
layerType := ""
if i < uint32(len(q.LayerTypes)) {
layerType = q.LayerTypes[i]
}
if layerType == "full_attention" {
kv[i] = q.NumKeyValueHeads
hasFull = true
} else {
hasRecurrent = true
}
}
if !hasFull || !hasRecurrent {
return nil, fmt.Errorf("qwen3next: layer_types must include both full_attention and linear_attention")
}
return kv, nil
}
if q.FullAttentionInterval == 0 {
return nil, fmt.Errorf("qwen3next: full_attention_interval must be set")
}
if q.FullAttentionInterval > q.NumHiddenLayers {
return nil, fmt.Errorf("qwen3next: full_attention_interval (%d) exceeds num_hidden_layers (%d)", q.FullAttentionInterval, q.NumHiddenLayers)
}
kv := make([]uint32, q.NumHiddenLayers)
hasFull := false
for i := range q.NumHiddenLayers {
if (i+1)%q.FullAttentionInterval == 0 {
kv[i] = q.NumKeyValueHeads
hasFull = true
}
}
if !hasFull {
return nil, fmt.Errorf("qwen3next: head_count_kv would be all zeros (full_attention_interval=%d, num_hidden_layers=%d)", q.FullAttentionInterval, q.NumHiddenLayers)
}
return kv, nil
}
func (q *qwen3NextModel) ropeSections() []int32 {
if len(q.RopeParameters.MropeSection) > 0 {
return q.RopeParameters.MropeSection
}
return q.RopeScaling.MropeSection
}
func (q *qwen3NextModel) shouldReorderVHeads() bool {
modelType := strings.ToLower(q.ModelType)
if strings.Contains(modelType, "qwen3_next") || strings.Contains(modelType, "qwen3next") {
return false
}
for _, arch := range q.Architectures {
arch = strings.ToLower(arch)
if strings.Contains(arch, "qwen3next") || strings.Contains(arch, "qwen3_next") {
return false
}
}
// Default to qwen3.5 layout for all other qwen3next-family imports.
return true
}
func (q *qwen3NextModel) KV(t *Tokenizer) KV {
kv := q.ModelParameters.KV(t)
arch := "qwen35"
if q.NumExperts > 0 {
arch = "qwen35moe"
}
kv["general.architecture"] = arch
kv["tokenizer.ggml.pre"] = "qwen35"
kv["block_count"] = q.NumHiddenLayers
kv["context_length"] = q.MaxPositionEmbeddings
kv["embedding_length"] = q.HiddenSize
kv["feed_forward_length"] = q.IntermediateSize
kv["attention.head_count"] = q.NumAttentionHeads
headDim := q.HeadDim
if headDim == 0 && q.NumAttentionHeads > 0 {
headDim = q.HiddenSize / q.NumAttentionHeads
@@ -113,18 +274,31 @@ func (q *qwen3NextModel) KV(t *Tokenizer) KV {
kv["attention.layer_norm_rms_epsilon"] = q.RMSNormEPS
kv["rope.freq_base"] = q.RopeTheta
// RoPE dimension count (partial rotary)
// partial_rotary_factor = 0.25 means only 25% of head_dim uses RoPE
partialRotary := q.PartialRotaryFactor
if partialRotary > 0 && partialRotary <= 1 {
kv["rope.dimension_count"] = uint32(float32(headDim) * partialRotary)
}
if sections := q.ropeSections(); len(sections) > 0 {
kv["mrope_sections"] = sections
kv["rope.mrope_section"] = sections
kv["rope.dimension_sections"] = sections
}
if q.RopeParameters.MRopeInterleaved {
kv["rope.mrope_interleaved"] = true
}
if q.RopeScaling.Type != "" && q.RopeScaling.Type != "default" {
kv["rope.scaling.type"] = q.RopeScaling.Type
kv["rope.scaling.factor"] = q.RopeScaling.Factor
}
if q.NumExperts > 0 {
kv["expert_count"] = q.NumExperts
kv["expert_used_count"] = q.NumExpertsPerToken
if q.NormTopkProb != nil {
kv["norm_top_k_prob"] = *q.NormTopkProb
}
if q.MoEIntermediateSize > 0 {
kv["expert_feed_forward_length"] = q.MoEIntermediateSize
}
@@ -133,33 +307,66 @@ func (q *qwen3NextModel) KV(t *Tokenizer) KV {
}
}
// SSM/Linear attention config
// d_inner = linear_value_head_dim * linear_num_value_heads
dInner := q.LinearValueHeadDim * q.LinearNumValueHeads
kv["ssm.inner_size"] = dInner
kv["ssm.state_size"] = q.LinearKeyHeadDim
kv["ssm.group_count"] = q.LinearNumKeyHeads
kv["ssm.time_step_rank"] = q.LinearNumValueHeads
kv["ssm.conv_kernel"] = q.LinearConvKernelDim
if q.shouldReorderVHeads() {
kv["ssm.v_head_reordered"] = true
}
if q.FullAttentionInterval > 0 {
kv["full_attention_interval"] = q.FullAttentionInterval
}
if headCounts, err := q.kvHeadCounts(); err == nil {
kv["attention.head_count_kv"] = headCounts
}
if q.VisionModel.Depth > 0 {
kv["vision.block_count"] = q.VisionModel.Depth
kv["vision.embedding_length"] = q.VisionModel.HiddenSize
kv["vision.attention.head_count"] = q.VisionModel.NumHeads
kv["vision.num_channels"] = q.VisionModel.InChannels
if q.VisionModel.PatchSize > 0 {
kv["vision.patch_size"] = q.VisionModel.PatchSize
}
if q.VisionModel.SpatialMergeSize > 0 {
kv["vision.spatial_merge_size"] = q.VisionModel.SpatialMergeSize
}
if q.VisionModel.RMSNormEps > 0 {
kv["vision.attention.layer_norm_epsilon"] = q.VisionModel.RMSNormEps
}
if q.VisionModel.RopeTheta > 0 {
kv["vision.rope.freq_base"] = q.VisionModel.RopeTheta
}
if q.VisionModel.TemporalPatchSize > 0 {
kv["vision.temporal_patch_size"] = q.VisionModel.TemporalPatchSize
}
kv["vision.deepstack_visual_indexes"] = q.VisionModel.DeepstackVisualIndexes
if q.VisionModel.Size.ShortestEdge > 0 {
kv["vision.shortest_edge"] = q.VisionModel.Size.ShortestEdge
}
if q.VisionModel.Size.LongestEdge > 0 {
kv["vision.longest_edge"] = q.VisionModel.Size.LongestEdge
}
if len(q.VisionModel.ImageMean) > 0 {
kv["vision.image_mean"] = q.VisionModel.ImageMean
}
if len(q.VisionModel.ImageStd) > 0 {
kv["vision.image_std"] = q.VisionModel.ImageStd
}
}
if q.ImageTokenID > 0 {
kv["image_token_id"] = q.ImageTokenID
}
if q.VisionStartTokenID > 0 {
kv["vision_start_token_id"] = q.VisionStartTokenID
}
if q.VisionEndTokenID > 0 {
kv["vision_end_token_id"] = q.VisionEndTokenID
}
return kv
@@ -168,7 +375,6 @@ func (q *qwen3NextModel) KV(t *Tokenizer) KV {
func (q *qwen3NextModel) Tensors(ts []Tensor) []*ggml.Tensor {
var out []*ggml.Tensor
// Create merges for expert tensors - stack individual experts into batched tensors
merges := make([]merge, q.NumHiddenLayers*3)
for i := range q.NumHiddenLayers {
merges[i*3+0] = merge{
@@ -185,16 +391,13 @@ func (q *qwen3NextModel) Tensors(ts []Tensor) []*ggml.Tensor {
}
}
// Merge expert tensors
merged, remaining := mergeTensors(ts, merges...)
out = append(out, merged...)
// Process remaining tensors
for _, t := range remaining {
name := t.Name()
shape := t.Shape()
// Split linear_attn.in_proj_qkvz (ssm_in) into attn_qkv + attn_gate when possible
if strings.HasSuffix(name, ".ssm_in.weight") {
if qkv, gate, ok := q.splitQKVZTensor(t); ok {
out = append(out, qkv, gate)
@@ -204,84 +407,299 @@ func (q *qwen3NextModel) Tensors(ts []Tensor) []*ggml.Tensor {
}
switch {
case strings.Contains(name, ".mlp.experts.gate_up_proj"):
out = append(out, slices.Collect(splitDim(t, 1,
split{Replacer: strings.NewReplacer(".mlp.experts.gate_up_proj", ".ffn_gate_exps.weight")},
split{Replacer: strings.NewReplacer(".mlp.experts.gate_up_proj", ".ffn_up_exps.weight")},
))...)
case strings.Contains(name, ".mlp.experts.down_proj"):
out = append(out, &ggml.Tensor{
Name: strings.NewReplacer(".mlp.experts.down_proj", ".ffn_down_exps.weight").Replace(name),
Kind: t.Kind(),
Shape: slices.Clone(shape),
WriterTo: t,
})
case strings.HasPrefix(name, "v.blk.") && strings.Contains(name, ".attn_qkv"):
out = append(out, slices.Collect(splitDim(t, 0,
split{Replacer: strings.NewReplacer("attn_qkv", "attn_q")},
split{Replacer: strings.NewReplacer("attn_qkv", "attn_k")},
split{Replacer: strings.NewReplacer("attn_qkv", "attn_v")},
))...)
case strings.Contains(name, "patch_embed") && strings.HasSuffix(name, "weight"):
out = append(out, &ggml.Tensor{
Name: name,
Kind: t.Kind(),
Shape: append([]uint64{shape[0] * shape[1]}, shape[2:]...),
WriterTo: t,
})
// Add 1 to norm weights (except ssm_norm, which is linear_attn.norm);
// this matches the Python converter behavior for qwen3next.
case strings.HasSuffix(name, "_norm.weight") && !strings.HasSuffix(name, ".ssm_norm.weight"):
t.SetRepacker(q.addOne)
out = append(out, &ggml.Tensor{Name: name, Kind: t.Kind(), Shape: slices.Clone(shape), WriterTo: t})
// Handle linear attention A_log -> ssm_a (negate and exp)
// Note: name has already been transformed by Replacements at this point
case strings.HasSuffix(name, ".ssm_a"):
t.SetRepacker(q.repackSSMA())
out = append(out, &ggml.Tensor{Name: name, Kind: t.Kind(), Shape: slices.Clone(shape), WriterTo: t})
case strings.HasSuffix(name, ".attn_qkv.weight"):
if q.shouldReorderVHeads() {
t.SetRepacker(q.repackAttnQKV())
}
out = append(out, &ggml.Tensor{Name: name, Kind: t.Kind(), Shape: slices.Clone(shape), WriterTo: t})
case strings.HasSuffix(name, ".attn_gate.weight"):
if q.shouldReorderVHeads() {
// HF tensor layout is [out_features, in_features]; reorder rows.
t.SetRepacker(q.repackReorderDim(0, int(q.LinearValueHeadDim)))
}
out = append(out, &ggml.Tensor{Name: name, Kind: t.Kind(), Shape: slices.Clone(shape), WriterTo: t})
case strings.HasSuffix(name, ".ssm_beta.weight"), strings.HasSuffix(name, ".ssm_alpha.weight"):
if q.shouldReorderVHeads() {
// HF tensor layout is [out_features, in_features]; reorder rows.
t.SetRepacker(q.repackReorderDim(0, 1))
}
out = append(out, &ggml.Tensor{Name: name, Kind: t.Kind(), Shape: slices.Clone(shape), WriterTo: t})
case strings.HasSuffix(name, ".ssm_dt"):
if q.shouldReorderVHeads() {
t.SetRepacker(q.repackReorderDim(0, 1))
}
out = append(out, &ggml.Tensor{Name: name, Kind: t.Kind(), Shape: slices.Clone(shape), WriterTo: t})
case strings.HasSuffix(name, ".ssm_out.weight"):
if q.shouldReorderVHeads() {
// HF out_proj layout is [out_features, in_features]; reorder columns.
t.SetRepacker(q.repackReorderDim(1, int(q.LinearValueHeadDim)))
}
out = append(out, &ggml.Tensor{Name: name, Kind: t.Kind(), Shape: slices.Clone(shape), WriterTo: t})
// Squeeze conv1d weights: [1, D, K] or [D, 1, K] -> [D, K]
case strings.HasSuffix(name, ".ssm_conv1d.weight"):
newShape := slices.Clone(shape)
if len(shape) == 3 {
if shape[0] == 1 {
// [1, D, K] -> [D, K]
newShape = []uint64{shape[1], shape[2]}
} else if shape[1] == 1 {
// [D, 1, K] -> [D, K]
newShape = []uint64{shape[0], shape[2]}
}
}
if q.shouldReorderVHeads() {
t.SetRepacker(q.repackConv1D())
}
out = append(out, &ggml.Tensor{Name: name, Kind: t.Kind(), Shape: newShape, WriterTo: t})
default:
out = append(out, &ggml.Tensor{Name: name, Kind: t.Kind(), Shape: slices.Clone(shape), WriterTo: t})
}
}
return out
}
func (q *qwen3NextModel) repackReorderDim(dim, headDim int) Repacker {
return func(_ string, data []float32, shape []uint64) ([]float32, error) {
if !q.shouldReorderVHeads() {
return data, nil
}
numK := int(q.LinearNumKeyHeads)
numVPerK := int(q.LinearNumValueHeads / q.LinearNumKeyHeads)
return reorderHeadLayout(data, shape, dim, numK, numVPerK, headDim)
}
}
func (q *qwen3NextModel) repackAttnQKV() Repacker {
return func(_ string, data []float32, shape []uint64) ([]float32, error) {
if !q.shouldReorderVHeads() || len(shape) != 2 {
return data, nil
}
rows := int(shape[0])
cols := int(shape[1])
numK := int(q.LinearNumKeyHeads)
numV := int(q.LinearNumValueHeads)
headK := int(q.LinearKeyHeadDim)
headV := int(q.LinearValueHeadDim)
qDim := headK * numK
kDim := headK * numK
vDim := headV * numV
qkvDim := qDim + kDim + vDim
switch {
case rows == qkvDim:
// HF layout: [out_features, in_features]. Keep Q/K rows unchanged and
// reorder only V rows from grouped -> tiled head layout.
out := make([]float32, len(data))
qkRows := qDim + kDim
qkSize := qkRows * cols
copy(out[:qkSize], data[:qkSize])
vStart := qkSize
vEnd := vStart + vDim*cols
reorderedV, err := reorderHeadLayout(data[vStart:vEnd], []uint64{uint64(vDim), uint64(cols)}, 0, numK, numV/numK, headV)
if err != nil {
return nil, err
}
copy(out[vStart:vEnd], reorderedV)
copy(out[vEnd:], data[vEnd:])
return out, nil
case cols == qkvDim:
// Fallback for already-transposed [in_features, out_features] tensors.
out := make([]float32, len(data))
copy(out, data)
for r := range rows {
base := r * cols
vStart := base + qDim + kDim
vEnd := vStart + vDim
reorderedV, err := reorderHeadLayout(out[vStart:vEnd], []uint64{uint64(vDim)}, 0, numK, numV/numK, headV)
if err != nil {
return nil, err
}
copy(out[vStart:vEnd], reorderedV)
}
return out, nil
default:
return data, nil
}
}
}
func (q *qwen3NextModel) repackConv1D() Repacker {
return func(_ string, data []float32, shape []uint64) ([]float32, error) {
if !q.shouldReorderVHeads() {
return data, nil
}
normShape := slices.Clone(shape)
if len(shape) == 3 {
if shape[0] == 1 {
normShape = []uint64{shape[1], shape[2]}
} else if shape[1] == 1 {
normShape = []uint64{shape[0], shape[2]}
}
}
if len(normShape) != 2 {
return data, nil
}
rows := int(normShape[0])
cols := int(normShape[1])
numK := int(q.LinearNumKeyHeads)
numV := int(q.LinearNumValueHeads)
headK := int(q.LinearKeyHeadDim)
headV := int(q.LinearValueHeadDim)
qkChannels := 2 * headK * numK
totalChannels := qkChannels + headV*numV
if qkChannels <= 0 {
return data, nil
}
switch {
case rows == totalChannels:
// HF layout after squeeze: [channels, kernel]
out := make([]float32, len(data))
prefix := qkChannels * cols
copy(out[:prefix], data[:prefix])
reorderedV, err := reorderHeadLayout(data[prefix:], []uint64{uint64(totalChannels - qkChannels), uint64(cols)}, 0, numK, numV/numK, headV)
if err != nil {
return nil, err
}
copy(out[prefix:], reorderedV)
return out, nil
case cols == totalChannels:
// Fallback for transposed [kernel, channels]
out := make([]float32, len(data))
copy(out, data)
vChannels := totalChannels - qkChannels
for r := range rows {
base := r * cols
vStart := base + qkChannels
vEnd := vStart + vChannels
reorderedV, err := reorderHeadLayout(out[vStart:vEnd], []uint64{uint64(vChannels)}, 0, numK, numV/numK, headV)
if err != nil {
return nil, err
}
copy(out[vStart:vEnd], reorderedV)
}
return out, nil
default:
return data, nil
}
}
}
func (q *qwen3NextModel) repackSSMA() Repacker {
return func(_ string, data []float32, shape []uint64) ([]float32, error) {
result := make([]float32, len(data))
for i, v := range data {
result[i] = -float32(math.Exp(float64(v)))
}
if !q.shouldReorderVHeads() {
return result, nil
}
numK := int(q.LinearNumKeyHeads)
numVPerK := int(q.LinearNumValueHeads / q.LinearNumKeyHeads)
return reorderHeadLayout(result, shape, 0, numK, numVPerK, 1)
}
}
func reorderHeadLayout(data []float32, shape []uint64, dim int, numKHeads, numVPerK, headDim int) ([]float32, error) {
if len(shape) == 0 || numKHeads <= 0 || numVPerK <= 0 || headDim <= 0 {
return data, nil
}
dims := make([]int, len(shape))
for i := range shape {
dims[i] = int(shape[i])
}
if dim < 0 {
dim += len(dims)
}
if dim < 0 || dim >= len(dims) {
return data, nil
}
expected := numKHeads * numVPerK * headDim
if dims[dim] != expected {
return data, nil
}
newShape := make([]int, 0, len(dims)+2)
newShape = append(newShape, dims[:dim]...)
newShape = append(newShape, numKHeads, numVPerK, headDim)
newShape = append(newShape, dims[dim+1:]...)
var tt tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
if err := tt.Reshape(newShape...); err != nil {
return nil, err
}
perm := make([]int, len(newShape))
for i := range perm {
perm[i] = i
}
perm[dim], perm[dim+1] = perm[dim+1], perm[dim]
tt, err := tensor.Transpose(tt, perm...)
if err != nil {
return nil, err
}
tt = tensor.Materialize(tt)
total := 1
for _, d := range dims {
total *= d
}
if err := tt.Reshape(total); err != nil {
return nil, err
}
return native.VectorF32(tt.(*tensor.Dense))
}
type qkvzSplitSpec struct {
hidden int
headKDim int
@@ -369,7 +787,6 @@ func (q *qwen3NextModel) repackQKVZ(spec qkvzSplitSpec, extractGate bool) Repack
var tt tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
var err error
// Convert to [hidden, out_features] layout for slicing
tt, err = tensor.Transpose(tt, 1, 0)
if err != nil {
return nil, err
@@ -444,7 +861,6 @@ func (q *qwen3NextModel) repackQKVZ(spec qkvzSplitSpec, extractGate bool) Repack
}
}
// addOne adds 1.0 to all elements in the tensor (for norm weights)
func (*qwen3NextModel) addOne(_ string, data []float32, shape []uint64) ([]float32, error) {
n := tensor.New(tensor.WithShape(int(shape[0])), tensor.WithBacking(data))
ones := tensor.Ones(tensor.Float32, int(shape[0]))
@@ -471,10 +887,21 @@ func (q *qwen3NextModel) Replacements() []string {
return []string{
// Embeddings and output
"lm_head", "output",
"model.language_model.embed_tokens", "token_embd",
"model.language_model.norm", "output_norm",
"model.language_model.layers", "blk",
"model.embed_tokens", "token_embd",
"model.norm", "output_norm",
"model.layers", "blk",
// Vision
"model.visual", "v",
"patch_embed.proj", "patch_embed",
"blocks", "blk",
"attn.qkv", "attn_qkv",
"attn.proj", "attn_out",
"deepstack_merger_list", "deepstack_merger",
// Layer norms
"input_layernorm", "attn_norm",
"post_attention_layernorm", "post_attention_norm",
@@ -487,9 +914,16 @@ func (q *qwen3NextModel) Replacements() []string {
"self_attn.v_proj", "attn_v",
"self_attn.o_proj", "attn_output",
// Linear attention (legacy qwen3next)
"linear_attn.in_proj_qkvz", "ssm_in",
"linear_attn.in_proj_ba", "ssm_ba",
// Linear attention (qwen35)
"linear_attn.in_proj_qkv", "attn_qkv",
"linear_attn.in_proj_z", "attn_gate",
"linear_attn.in_proj_a", "ssm_alpha",
"linear_attn.in_proj_b", "ssm_beta",
"linear_attn.conv1d", "ssm_conv1d",
"linear_attn.dt_bias", "ssm_dt",
"linear_attn.dt_proj", "ssm_dt",
@@ -497,14 +931,14 @@ func (q *qwen3NextModel) Replacements() []string {
"linear_attn.norm", "ssm_norm",
"linear_attn.out_proj", "ssm_out",
// MoE
"mlp.gate.weight", "ffn_gate_inp.weight",
"mlp.shared_expert.down_proj", "ffn_down_shexp",
"mlp.shared_expert.gate_proj", "ffn_gate_shexp",
"mlp.shared_expert.up_proj", "ffn_up_shexp",
"mlp.shared_expert_gate", "ffn_gate_inp_shexp",
// Dense FFN
"mlp.down_proj", "ffn_down",
"mlp.gate_proj", "ffn_gate",
"mlp.up_proj", "ffn_up",


@@ -0,0 +1,563 @@
package convert
import (
"bytes"
"encoding/binary"
"os"
"slices"
"strings"
"testing"
"github.com/ollama/ollama/fs/ggml"
)
func boolPtr(v bool) *bool {
return &v
}
func readTensorData(t *testing.T, tensor *ggml.Tensor) []float32 {
t.Helper()
var b bytes.Buffer
if _, err := tensor.WriteTo(&b); err != nil {
t.Fatal(err)
}
numel := 1
for _, d := range tensor.Shape {
numel *= int(d)
}
values := make([]float32, numel)
if err := binary.Read(&b, binary.LittleEndian, &values); err != nil {
t.Fatal(err)
}
return values
}
func TestQwen3NextLegacyModelTypeDisablesReorder(t *testing.T) {
m := &qwen3NextModel{
ModelParameters: ModelParameters{
ModelType: "qwen3_next",
},
}
if m.shouldReorderVHeads() {
t.Fatalf("legacy qwen3_next model_type should not reorder v-head layout")
}
}
func TestQwen3NextLegacyArchitectureDisablesReorder(t *testing.T) {
m := &qwen3NextModel{
ModelParameters: ModelParameters{
Architectures: []string{"Qwen3NextForCausalLM"},
},
}
if m.shouldReorderVHeads() {
t.Fatalf("legacy Qwen3Next architecture should not reorder v-head layout")
}
}
func TestQwen3NextKVLegacyConfig(t *testing.T) {
m := &qwen3NextModel{
ModelParameters: ModelParameters{
ModelType: "qwen3_next",
},
qwen3NextTextConfig: qwen3NextTextConfig{
MaxPositionEmbeddings: 8192,
HiddenSize: 512,
NumHiddenLayers: 4,
IntermediateSize: 2048,
NumAttentionHeads: 8,
NumKeyValueHeads: 2,
HeadDim: 64,
RopeTheta: 1_000_000,
RMSNormEPS: 1e-6,
NumExperts: 8,
NumExpertsPerToken: 2,
NormTopkProb: boolPtr(true),
MoEIntermediateSize: 256,
SharedExpertIntermSize: 512,
FullAttentionInterval: 2,
LinearConvKernelDim: 4,
LinearKeyHeadDim: 64,
LinearNumKeyHeads: 2,
LinearNumValueHeads: 4,
LinearValueHeadDim: 64,
PartialRotaryFactor: 0.25,
},
}
if err := m.parseMore(os.DirFS(t.TempDir())); err != nil {
t.Fatal(err)
}
kv := m.KV(&Tokenizer{Vocabulary: &Vocabulary{}})
if got, want := kv["general.architecture"], "qwen35moe"; got != want {
t.Fatalf("unexpected architecture: got %v want %v", got, want)
}
if got, want := kv["tokenizer.ggml.pre"], "qwen35"; got != want {
t.Fatalf("unexpected tokenizer pre: got %v want %v", got, want)
}
headCountKV, ok := kv["attention.head_count_kv"].([]uint32)
if !ok {
t.Fatalf("attention.head_count_kv has unexpected type: %T", kv["attention.head_count_kv"])
}
if got, want := headCountKV, []uint32{0, 2, 0, 2}; !slices.Equal(got, want) {
t.Fatalf("unexpected attention.head_count_kv: got %v want %v", got, want)
}
if _, ok := kv["ssm.v_head_reordered"]; ok {
t.Fatalf("legacy qwen3next should not enable ssm.v_head_reordered")
}
if got, want := kv["norm_top_k_prob"], true; got != want {
t.Fatalf("unexpected norm_top_k_prob: got %v want %v", got, want)
}
}
func TestQwen35MoeOmitsNormTopKProbWhenUnset(t *testing.T) {
m := &qwen3NextModel{
ModelParameters: ModelParameters{
ModelType: "qwen3_5",
},
qwen3NextTextConfig: qwen3NextTextConfig{
MaxPositionEmbeddings: 4096,
HiddenSize: 512,
NumHiddenLayers: 4,
IntermediateSize: 2048,
NumAttentionHeads: 8,
NumKeyValueHeads: 2,
HeadDim: 64,
RopeTheta: 1_000_000,
RMSNormEPS: 1e-6,
NumExperts: 8,
NumExpertsPerToken: 2,
FullAttentionInterval: 2,
LinearConvKernelDim: 4,
LinearKeyHeadDim: 64,
LinearNumKeyHeads: 2,
LinearNumValueHeads: 4,
LinearValueHeadDim: 64,
PartialRotaryFactor: 0.25,
},
}
if err := m.parseMore(os.DirFS(t.TempDir())); err != nil {
t.Fatal(err)
}
kv := m.KV(&Tokenizer{Vocabulary: &Vocabulary{}})
if _, ok := kv["norm_top_k_prob"]; ok {
t.Fatalf("expected norm_top_k_prob to be omitted when not set in config")
}
}
func TestQwen35KVFromTextConfig(t *testing.T) {
m := &qwen3NextModel{
ModelParameters: ModelParameters{
ModelType: "qwen3_5",
},
TextConfig: &qwen3NextTextConfig{
MaxPositionEmbeddings: 16384,
HiddenSize: 1024,
NumHiddenLayers: 4,
IntermediateSize: 4096,
NumAttentionHeads: 8,
NumKeyValueHeads: 4,
HeadDim: 128,
RMSNormEPS: 1e-6,
LayerTypes: []string{
"linear_attention",
"full_attention",
"linear_attention",
"full_attention",
},
LinearConvKernelDim: 4,
LinearKeyHeadDim: 128,
LinearNumKeyHeads: 2,
LinearNumValueHeads: 4,
LinearValueHeadDim: 128,
RopeParameters: qwen3NextRopeParams{
MRopeInterleaved: true,
MropeSection: []int32{11, 11, 10},
RopeType: "default",
RopeTheta: 10_000_000,
PartialRotaryFactor: 0.25,
},
},
VisionModel: qwen3NextVisionConfig{
Depth: 2,
HiddenSize: 128,
NumHeads: 4,
InChannels: 3,
PatchSize: 16,
SpatialMergeSize: 2,
RMSNormEps: 1e-6,
RopeTheta: 10_000,
TemporalPatchSize: 2,
DeepstackVisualIndexes: []int32{1},
},
ImageTokenID: 1001,
VisionStartTokenID: 1002,
VisionEndTokenID: 1003,
}
m.VisionModel.Size.ShortestEdge = 224
m.VisionModel.Size.LongestEdge = 4096
m.VisionModel.ImageMean = []float32{0.5, 0.5, 0.5}
m.VisionModel.ImageStd = []float32{0.2, 0.2, 0.2}
if err := m.parseMore(os.DirFS(t.TempDir())); err != nil {
t.Fatal(err)
}
kv := m.KV(&Tokenizer{Vocabulary: &Vocabulary{}})
if got, want := kv["general.architecture"], "qwen35"; got != want {
t.Fatalf("unexpected architecture: got %v want %v", got, want)
}
headCountKV, ok := kv["attention.head_count_kv"].([]uint32)
if !ok {
t.Fatalf("attention.head_count_kv has unexpected type: %T", kv["attention.head_count_kv"])
}
if got, want := headCountKV, []uint32{0, 4, 0, 4}; !slices.Equal(got, want) {
t.Fatalf("unexpected attention.head_count_kv: got %v want %v", got, want)
}
if got, ok := kv["ssm.v_head_reordered"].(bool); !ok || !got {
t.Fatalf("expected ssm.v_head_reordered=true, got %v (%T)", kv["ssm.v_head_reordered"], kv["ssm.v_head_reordered"])
}
mrope, ok := kv["mrope_sections"].([]int32)
if !ok {
t.Fatalf("mrope_sections has unexpected type: %T", kv["mrope_sections"])
}
if got, want := mrope, []int32{11, 11, 10}; !slices.Equal(got, want) {
t.Fatalf("unexpected mrope_sections: got %v want %v", got, want)
}
ropeSections, ok := kv["rope.dimension_sections"].([]int32)
if !ok {
t.Fatalf("rope.dimension_sections has unexpected type: %T", kv["rope.dimension_sections"])
}
if got, want := ropeSections, []int32{11, 11, 10}; !slices.Equal(got, want) {
t.Fatalf("unexpected rope.dimension_sections: got %v want %v", got, want)
}
if got, ok := kv["rope.mrope_interleaved"].(bool); !ok || !got {
t.Fatalf("expected rope.mrope_interleaved=true, got %v (%T)", kv["rope.mrope_interleaved"], kv["rope.mrope_interleaved"])
}
if got, want := kv["vision.block_count"], uint32(2); got != want {
t.Fatalf("unexpected vision.block_count: got %v want %v", got, want)
}
}
func TestQwen3NextReplacements(t *testing.T) {
r := strings.NewReplacer((&qwen3NextModel{}).Replacements()...)
if got, want := r.Replace("model.language_model.layers.1.linear_attn.in_proj_qkv.weight"), "blk.1.attn_qkv.weight"; got != want {
t.Fatalf("unexpected language-model replacement: got %q want %q", got, want)
}
if got, want := r.Replace("model.visual.blocks.0.attn.qkv.weight"), "v.blk.0.attn_qkv.weight"; got != want {
t.Fatalf("unexpected vision replacement: got %q want %q", got, want)
}
if got, want := r.Replace("model.layers.1.linear_attn.in_proj_qkvz.weight"), "blk.1.ssm_in.weight"; got != want {
t.Fatalf("unexpected legacy replacement: got %q want %q", got, want)
}
}
func TestQwen35ReordersVHeads(t *testing.T) {
m := &qwen3NextModel{
ModelParameters: ModelParameters{
ModelType: "qwen3_5",
},
qwen3NextTextConfig: qwen3NextTextConfig{
LinearNumKeyHeads: 2,
LinearNumValueHeads: 4,
LinearValueHeadDim: 1,
},
}
out := m.Tensors([]Tensor{
&fakeTensor{
name: "blk.0.attn_gate.weight",
shape: []uint64{4, 2},
data: []float32{0, 1, 2, 3, 4, 5, 6, 7},
},
})
if len(out) != 1 {
t.Fatalf("unexpected output tensor count: got %d want 1", len(out))
}
if got, want := readTensorData(t, out[0]), []float32{0, 1, 4, 5, 2, 3, 6, 7}; !slices.Equal(got, want) {
t.Fatalf("unexpected data: got %v want %v", got, want)
}
}
func TestQwen35ReordersAttnQKVOutputDim(t *testing.T) {
m := &qwen3NextModel{
ModelParameters: ModelParameters{
ModelType: "qwen3_5",
},
qwen3NextTextConfig: qwen3NextTextConfig{
LinearNumKeyHeads: 2,
LinearNumValueHeads: 4,
LinearKeyHeadDim: 1,
LinearValueHeadDim: 1,
},
}
out := m.Tensors([]Tensor{
&fakeTensor{
name: "blk.0.attn_qkv.weight",
shape: []uint64{8, 2}, // [out_features, in_features] (HF layout)
data: []float32{
0, 1, // q0
2, 3, // q1
4, 5, // k0
6, 7, // k1
10, 11, // v(k0,v0)
12, 13, // v(k0,v1)
20, 21, // v(k1,v0)
22, 23, // v(k1,v1)
},
},
})
if len(out) != 1 {
t.Fatalf("unexpected output tensor count: got %d want 1", len(out))
}
if got, want := readTensorData(t, out[0]), []float32{
0, 1, 2, 3, 4, 5, 6, 7,
10, 11, 20, 21, 12, 13, 22, 23,
}; !slices.Equal(got, want) {
t.Fatalf("unexpected qkv data: got %v want %v", got, want)
}
}
func TestQwen35ReordersSsmOutInputDim(t *testing.T) {
m := &qwen3NextModel{
ModelParameters: ModelParameters{
ModelType: "qwen3_5",
},
qwen3NextTextConfig: qwen3NextTextConfig{
LinearNumKeyHeads: 2,
LinearNumValueHeads: 4,
LinearValueHeadDim: 1,
},
}
out := m.Tensors([]Tensor{
&fakeTensor{
name: "blk.0.ssm_out.weight",
shape: []uint64{2, 4},
data: []float32{0, 1, 2, 3, 4, 5, 6, 7},
},
})
if len(out) != 1 {
t.Fatalf("unexpected output tensor count: got %d want 1", len(out))
}
if got, want := readTensorData(t, out[0]), []float32{0, 2, 1, 3, 4, 6, 5, 7}; !slices.Equal(got, want) {
t.Fatalf("unexpected ssm_out data: got %v want %v", got, want)
}
}
func TestQwen35ReordersSsmBetaRows(t *testing.T) {
m := &qwen3NextModel{
ModelParameters: ModelParameters{
ModelType: "qwen3_5",
},
qwen3NextTextConfig: qwen3NextTextConfig{
LinearNumKeyHeads: 2,
LinearNumValueHeads: 4,
},
}
out := m.Tensors([]Tensor{
&fakeTensor{
name: "blk.0.ssm_beta.weight",
shape: []uint64{4, 2},
data: []float32{0, 1, 2, 3, 4, 5, 6, 7},
},
})
if len(out) != 1 {
t.Fatalf("unexpected output tensor count: got %d want 1", len(out))
}
if got, want := readTensorData(t, out[0]), []float32{0, 1, 4, 5, 2, 3, 6, 7}; !slices.Equal(got, want) {
t.Fatalf("unexpected ssm_beta data: got %v want %v", got, want)
}
}
func TestQwen35ReordersConv1DChannelDim(t *testing.T) {
m := &qwen3NextModel{
ModelParameters: ModelParameters{
ModelType: "qwen3_5",
},
qwen3NextTextConfig: qwen3NextTextConfig{
LinearNumKeyHeads: 2,
LinearNumValueHeads: 4,
LinearKeyHeadDim: 1,
LinearValueHeadDim: 1,
},
}
out := m.Tensors([]Tensor{
&fakeTensor{
name: "blk.0.ssm_conv1d.weight",
shape: []uint64{8, 2}, // [channels, kernel] after squeeze
data: []float32{
0, 1, // q0
2, 3, // q1
4, 5, // k0
6, 7, // k1
10, 11, // v(k0,v0)
12, 13, // v(k0,v1)
20, 21, // v(k1,v0)
22, 23, // v(k1,v1)
},
},
})
if len(out) != 1 {
t.Fatalf("unexpected output tensor count: got %d want 1", len(out))
}
if got, want := readTensorData(t, out[0]), []float32{
0, 1, 2, 3, 4, 5, 6, 7,
10, 11, 20, 21, 12, 13, 22, 23,
}; !slices.Equal(got, want) {
t.Fatalf("unexpected conv1d data: got %v want %v", got, want)
}
}
func TestLegacyQwen3NextDoesNotReorderVHeads(t *testing.T) {
m := &qwen3NextModel{
ModelParameters: ModelParameters{
ModelType: "qwen3_next",
},
qwen3NextTextConfig: qwen3NextTextConfig{
LinearNumKeyHeads: 2,
LinearNumValueHeads: 4,
LinearValueHeadDim: 1,
},
}
out := m.Tensors([]Tensor{
&fakeTensor{
name: "blk.0.attn_gate.weight",
shape: []uint64{4, 1},
data: []float32{0, 1, 2, 3},
},
})
if len(out) != 1 {
t.Fatalf("unexpected output tensor count: got %d want 1", len(out))
}
if got, want := readTensorData(t, out[0]), []float32{0, 1, 2, 3}; !slices.Equal(got, want) {
t.Fatalf("unexpected data for legacy qwen3next: got %v want %v", got, want)
}
}
func TestQwen35MoePackedExperts(t *testing.T) {
m := &qwen3NextModel{
qwen3NextTextConfig: qwen3NextTextConfig{
NumHiddenLayers: 1,
},
}
out := m.Tensors([]Tensor{
&fakeTensor{
name: "blk.0.mlp.experts.gate_up_proj",
shape: []uint64{2, 4, 3},
data: []float32{
0, 1, 2,
3, 4, 5,
6, 7, 8,
9, 10, 11,
12, 13, 14,
15, 16, 17,
18, 19, 20,
21, 22, 23,
},
},
&fakeTensor{
name: "blk.0.mlp.experts.down_proj",
shape: []uint64{2, 5, 3},
data: make([]float32, 2*5*3),
},
})
get := func(name string) *ggml.Tensor {
for _, tensor := range out {
if tensor.Name == name {
return tensor
}
}
return nil
}
gate := get("blk.0.ffn_gate_exps.weight")
if gate == nil {
t.Fatalf("missing tensor %q", "blk.0.ffn_gate_exps.weight")
}
if got, want := gate.Shape, []uint64{2, 2, 3}; !slices.Equal(got, want) {
t.Fatalf("unexpected gate shape: got %v want %v", got, want)
}
if got, want := readTensorData(t, gate), []float32{
0, 1, 2, 3, 4, 5,
12, 13, 14, 15, 16, 17,
}; !slices.Equal(got, want) {
t.Fatalf("unexpected gate values: got %v want %v", got, want)
}
up := get("blk.0.ffn_up_exps.weight")
if up == nil {
t.Fatalf("missing tensor %q", "blk.0.ffn_up_exps.weight")
}
if got, want := up.Shape, []uint64{2, 2, 3}; !slices.Equal(got, want) {
t.Fatalf("unexpected up shape: got %v want %v", got, want)
}
if got, want := readTensorData(t, up), []float32{
6, 7, 8, 9, 10, 11,
18, 19, 20, 21, 22, 23,
}; !slices.Equal(got, want) {
t.Fatalf("unexpected up values: got %v want %v", got, want)
}
down := get("blk.0.ffn_down_exps.weight")
if down == nil {
t.Fatalf("missing tensor %q", "blk.0.ffn_down_exps.weight")
}
if got, want := down.Shape, []uint64{2, 5, 3}; !slices.Equal(got, want) {
t.Fatalf("unexpected down shape: got %v want %v", got, want)
}
}
func TestQwen35SharedExpertGateKeepsMatrixShape(t *testing.T) {
m := &qwen3NextModel{}
out := m.Tensors([]Tensor{
&fakeTensor{
name: "blk.0.ffn_gate_inp_shexp.weight",
shape: []uint64{1, 4},
data: []float32{0, 1, 2, 3},
},
})
if len(out) != 1 {
t.Fatalf("unexpected output tensor count: got %d want 1", len(out))
}
if got, want := out[0].Shape, []uint64{1, 4}; !slices.Equal(got, want) {
t.Fatalf("unexpected shared gate shape: got %v want %v", got, want)
}
}

convert/json_compat.go

@@ -0,0 +1,97 @@
package convert
// sanitizeNonFiniteJSON rewrites non-standard JSON numeric tokens that some
// HF configs emit (Infinity, -Infinity, NaN) into standard JSON numbers.
//
// This is intentionally conservative:
// - only runs outside quoted strings
// - only rewrites full tokens
//
// We map these values to 0 because encoding/json rejects non-finite values,
// and these fields are typically model-side metadata not consumed by the
// converter.
func sanitizeNonFiniteJSON(in []byte) []byte {
if len(in) == 0 {
return in
}
out := make([]byte, 0, len(in))
inString := false
escape := false
for i := 0; i < len(in); {
c := in[i]
if inString {
out = append(out, c)
if escape {
escape = false
} else if c == '\\' {
escape = true
} else if c == '"' {
inString = false
}
i++
continue
}
if c == '"' {
inString = true
out = append(out, c)
i++
continue
}
if hasToken(in, i, "-Infinity") {
out = append(out, '0')
i += len("-Infinity")
continue
}
if hasToken(in, i, "Infinity") {
out = append(out, '0')
i += len("Infinity")
continue
}
if hasToken(in, i, "NaN") {
out = append(out, '0')
i += len("NaN")
continue
}
out = append(out, c)
i++
}
return out
}
func hasToken(in []byte, at int, tok string) bool {
end := at + len(tok)
if at < 0 || end > len(in) {
return false
}
if string(in[at:end]) != tok {
return false
}
if at > 0 && !isJSONValuePrefixBoundary(in[at-1]) {
return false
}
if end < len(in) && !isJSONValueSuffixBoundary(in[end]) {
return false
}
return true
}
func isJSONWhitespace(b byte) bool {
return b == ' ' || b == '\t' || b == '\n' || b == '\r'
}
func isJSONValuePrefixBoundary(b byte) bool {
return isJSONWhitespace(b) || b == ':' || b == ',' || b == '['
}
func isJSONValueSuffixBoundary(b byte) bool {
return isJSONWhitespace(b) || b == ',' || b == ']' || b == '}'
}


@@ -0,0 +1,46 @@
package convert
import "testing"
func TestSanitizeNonFiniteJSON(t *testing.T) {
tests := []struct {
name string
in string
want string
}{
{
name: "infinity token",
in: `{"a":[0,Infinity,1]}`,
want: `{"a":[0,0,1]}`,
},
{
name: "negative infinity token",
in: `{"a":-Infinity}`,
want: `{"a":0}`,
},
{
name: "nan token",
in: `{"a":NaN}`,
want: `{"a":0}`,
},
{
name: "tokens inside strings untouched",
in: `{"a":"Infinity -Infinity NaN","b":Infinity}`,
want: `{"a":"Infinity -Infinity NaN","b":0}`,
},
{
name: "identifier-like token untouched",
in: `{"a":InfinityValue}`,
want: `{"a":InfinityValue}`,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got := string(sanitizeNonFiniteJSON([]byte(tt.in)))
if got != tt.want {
t.Fatalf("sanitizeNonFiniteJSON() = %q, want %q", got, tt.want)
}
})
}
}


@@ -101,6 +101,8 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
t.Pre = "deepseek-coder"
case "1ff7f41064896984db5d1bb6ff64fa4bc29007d08c1b439e505b7392777a319e":
t.Pre = "qwen2"
case "00431aed57e696b747435f734d1e3b9b1bfd931a121fb5cac7129e97c181e9ba":
t.Pre = "qwen35"
case "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855":
// noop, empty pretokenizer
default:
@@ -212,8 +214,13 @@ type tokenizer struct {
PreTokenizer struct {
PreTokenizers []struct {
Type string `json:"type"`
Behavior string `json:"behavior"`
Invert bool `json:"invert"`
AddPrefixSpace bool `json:"add_prefix_space"`
TrimOffsets bool `json:"trim_offsets"`
UseRegex bool `json:"use_regex"`
Pattern struct {
Regex string `json:"Regex"`
} `json:"pattern"`
} `json:"pretokenizers"`


@@ -191,6 +191,84 @@ func TestParseTokenizer(t *testing.T) {
Pre: "default",
},
},
{
name: "llama-bpe pretokenizer and control tokens",
fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
"tokenizer.json": strings.NewReader(`{
"added_tokens": [
{"id": 1, "content": "<|startoftext|>", "special": true},
{"id": 6, "content": "<|im_start|>", "special": true},
{"id": 7, "content": "<|im_end|>", "special": true},
{"id": 8, "content": "<|tool_list_start|>", "special": true},
{"id": 9, "content": "<|tool_list_end|>", "special": true},
{"id": 10, "content": "<|tool_call_start|>", "special": true},
{"id": 11, "content": "<|tool_call_end|>", "special": true},
{"id": 12, "content": "<|tool_response_start|>", "special": true},
{"id": 13, "content": "<|tool_response_end|>", "special": true},
{"id": 396, "content": "<image>", "special": true},
{"id": 64400, "content": "<think>", "special": true},
{"id": 64401, "content": "</think>", "special": true}
],
"model": {
"vocab": {
"<|startoftext|>": 1,
"<|im_start|>": 6,
"<|im_end|>": 7,
"<|tool_list_start|>": 8,
"<|tool_list_end|>": 9,
"<|tool_call_start|>": 10,
"<|tool_call_end|>": 11,
"<|tool_response_start|>": 12,
"<|tool_response_end|>": 13,
"<image>": 396,
"<think>": 64400,
"</think>": 64401
}
},
"pre_tokenizer": {
"type": "Sequence",
"pretokenizers": [
{
"type": "Split",
"pattern": {
"Regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
},
"behavior": "Isolated",
"invert": false
},
{
"type": "ByteLevel",
"add_prefix_space": false,
"trim_offsets": true,
"use_regex": false
}
]
}
}`),
}),
want: &Tokenizer{
Vocabulary: &Vocabulary{
Model: "gpt2",
Tokens: []string{
"<|startoftext|>",
"<|im_start|>",
"<|im_end|>",
"<|tool_list_start|>",
"<|tool_list_end|>",
"<|tool_call_start|>",
"<|tool_call_end|>",
"<|tool_response_start|>",
"<|tool_response_end|>",
"<image>",
"<think>",
"</think>",
},
Scores: []float32{1, 6, 7, 8, 9, 10, 11, 12, 13, 396, 64400, 64401},
Types: []int32{3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3},
},
Pre: "llama-bpe",
},
},
{
name: "list string merges",
fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
@@ -308,6 +386,28 @@ func TestParseTokenizer(t *testing.T) {
Pre: "default",
},
},
{
name: "qwen35 pretokenizer",
fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
"tokenizer.json": strings.NewReader(`{
"pre_tokenizer": {
"type": "Sequence",
"pretokenizers": [
{
"type": "Split",
"pattern": {
"Regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?[\\p{L}\\p{M}]+|\\p{N}| ?[^\\s\\p{L}\\p{M}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
}
}
]
}
}`),
}),
want: &Tokenizer{
Vocabulary: &Vocabulary{Model: "gpt2"},
Pre: "qwen35",
},
},
}
for _, tt := range cases {


@@ -4,47 +4,65 @@ title: OpenClaw
OpenClaw is a personal AI assistant that runs on your own devices. It bridges messaging services (WhatsApp, Telegram, Slack, Discord, iMessage, and more) to AI coding agents through a centralized gateway.
## Install
Install [OpenClaw](https://openclaw.ai/):
```bash
npm install -g openclaw@latest
```
Then run the onboarding wizard:
```bash
openclaw onboard --install-daemon
```
## Quick start
```bash
ollama launch openclaw
```
Ollama handles everything automatically:
1. **Install** — If OpenClaw isn't installed, Ollama prompts to install it via npm
2. **Security** — On the first launch, a security notice explains the risks of tool access
3. **Model** — Pick a model from the selector (local or cloud)
4. **Onboarding** — Ollama configures the provider, installs the gateway daemon, and sets your model as the primary
5. **Gateway** — Starts in the background and opens the OpenClaw TUI
<Note>OpenClaw requires a larger context window. It is recommended to use a context window of at least 64k tokens if using local models. See [Context length](/context-length) for more information.</Note>
<Note>Previously known as Clawdbot. `ollama launch clawdbot` still works as an alias.</Note>
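If you run a local model and need a larger window, one option is setting the `OLLAMA_CONTEXT_LENGTH` environment variable when starting the server (see the linked Context length page for the current mechanism):

```bash
OLLAMA_CONTEXT_LENGTH=65536 ollama serve
```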
This configures OpenClaw to use Ollama and starts the gateway.
If the gateway is already running, no manual changes are needed; it reloads the new configuration automatically.
## Configure without launching
To configure OpenClaw without starting the gateway and TUI:
```bash
ollama launch openclaw --config
```
To use a specific model directly:
```bash
ollama launch openclaw --model kimi-k2.5:cloud
```
If the gateway is already running, it restarts automatically to pick up the new model.
## Recommended models
**Cloud models:**
- `kimi-k2.5:cloud` — Multimodal reasoning with subagents
- `minimax-m2.5:cloud` — Fast, efficient coding and real-world productivity
- `glm-5:cloud` — Reasoning and code generation
**Local models:**
- `glm-4.7-flash` — Reasoning and code generation locally (~25 GB VRAM)
More models at [ollama.com/search](https://ollama.com/search?c=cloud).
## Connect messaging apps
```bash
openclaw configure --section channels
```
Link WhatsApp, Telegram, Slack, Discord, or iMessage to chat with your local models from anywhere.
## Stopping the gateway
```bash
openclaw gateway stop
```


@@ -160,6 +160,27 @@ func (kv KV) SSMGroupCount() uint64 {
return uint64(kv.Uint("ssm.group_count"))
}
func (kv KV) FFNLength() []uint64 {
ffnLengthDefault := uint32(0)
ffnLength := kv.UintOrArrayValueAsArray("feed_forward_length", ffnLengthDefault)
if len(ffnLength) == 1 {
ffnLengthDefault = ffnLength[0]
}
nLayers := int(kv.BlockCount())
if len(ffnLength) > nLayers {
slog.Warn("got more elements of feed_forward_length than layers", "len(ffnLength)", len(ffnLength), "layers", nLayers)
}
out := make([]uint64, nLayers)
for i := range nLayers {
if i >= len(ffnLength) {
out[i] = uint64(ffnLengthDefault)
} else {
out[i] = uint64(ffnLength[i])
}
}
return out
}
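The fill behavior above — a single configured value is broadcast to every layer, while per-layer values are honored where present and the default fills the remainder — can be sketched standalone. A hypothetical minimal `ffnLengths` helper mirroring the logic:

```go
package main

import "fmt"

// ffnLengths mirrors FFNLength's fill logic: when exactly one value is
// configured it becomes the default for every layer; otherwise layers
// beyond the configured slice fall back to zero.
func ffnLengths(configured []uint32, nLayers int) []uint64 {
	def := uint32(0)
	if len(configured) == 1 {
		def = configured[0]
	}
	out := make([]uint64, nLayers)
	for i := range out {
		if i < len(configured) {
			out[i] = uint64(configured[i])
		} else {
			out[i] = uint64(def)
		}
	}
	return out
}

func main() {
	fmt.Println(ffnLengths([]uint32{2048}, 3))      // [2048 2048 2048]
	fmt.Println(ffnLengths([]uint32{1024, 512}, 3)) // [1024 512 0]
}
```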
// general types
func (kv KV) String(key string, defaultValue ...string) string {
@@ -264,15 +285,18 @@ func (kv KV) OllamaEngineRequired() bool {
"llama4",
"mistral3",
"mllama",
"nemotron_h", "nemotron_h_moe",
"nomic-bert",
"olmo3",
"qwen25vl",
"qwen3", "qwen3moe",
"qwen35", "qwen35moe",
"qwen3next",
"qwen3vl", "qwen3vlmoe",
"glm4moelite",
"glmocr",
"lfm2",
"lfm2moe",
}, kv.Architecture())
}
@@ -845,7 +869,12 @@ func (f GGML) SupportsFlashAttention() bool {
return false
}
arch := f.KV().Architecture()
if slices.Contains([]string{"qwen35", "qwen35moe", "qwen3next"}, arch) {
return true
}
if slices.Contains([]string{"gemma2"}, arch) {
return false
}
@@ -864,9 +893,12 @@ func (f GGML) FlashAttention() bool {
"glmocr",
"gptoss", "gpt-oss",
"lfm2",
"lfm2moe",
"mistral3",
"nemotron_h", "nemotron_h_moe",
"olmo3",
"qwen3", "qwen3moe",
"qwen35", "qwen35moe",
"qwen3next",
"qwen3vl", "qwen3vlmoe",
}, f.KV().String("general.architecture"))


@@ -245,7 +245,22 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
padding := ggufPadding(offset, int64(alignment))
llm.tensorOffset = uint64(offset + padding)
// get file size to validate tensor bounds
fileSize, err := rs.Seek(0, io.SeekEnd)
if err != nil {
return fmt.Errorf("failed to determine file size: %w", err)
}
if _, err := rs.Seek(offset, io.SeekStart); err != nil {
return fmt.Errorf("failed to seek back after size check: %w", err)
}
for _, tensor := range llm.tensors {
tensorEnd := llm.tensorOffset + tensor.Offset + tensor.Size()
if tensorEnd > uint64(fileSize) {
return fmt.Errorf("tensor %q offset+size (%d) exceeds file size (%d)", tensor.Name, tensorEnd, fileSize)
}
offset, err := rs.Seek(0, io.SeekCurrent)
if err != nil {
return fmt.Errorf("failed to get current offset: %w", err)


@@ -11,21 +11,21 @@ import (
)
func TestWriteGGUF(t *testing.T) {
tensorData := make([]byte, 2*3*4) // 6 F32 elements = 24 bytes
for range 8 {
t.Run("shuffle", func(t *testing.T) {
t.Parallel()
ts := []*Tensor{
{Name: "token_embd.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewReader(tensorData)},
{Name: "blk.0.ffn_norm.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewReader(tensorData)},
{Name: "blk.0.attn_norm.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewReader(tensorData)},
{Name: "blk.1.ffn_up.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewReader(tensorData)},
{Name: "blk.2.ffn_norm.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewReader(tensorData)},
{Name: "blk.1.ffn_down.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewReader(tensorData)},
{Name: "blk.0.attn_k.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewReader(tensorData)},
{Name: "output_norm.weight", Shape: []uint64{3, 2}, WriterTo: bytes.NewReader(tensorData)},
{Name: "output.weight", Shape: []uint64{3, 2}, WriterTo: bytes.NewReader(tensorData)},
}
rand.Shuffle(len(ts), func(i, j int) {
@@ -98,4 +98,32 @@ func TestWriteGGUF(t *testing.T) {
}
})
}
t.Run("truncated_tensor_data", func(t *testing.T) {
t.Parallel()
ts := []*Tensor{
{Name: "blk.0.attn.weight", Kind: 0, Shape: []uint64{512, 2}, WriterTo: bytes.NewBuffer(make([]byte, 32))},
}
w, err := os.CreateTemp(t.TempDir(), "truncated_*.bin")
if err != nil {
t.Fatal(err)
}
defer w.Close()
if err := WriteGGUF(w, KV{"general.architecture": "test"}, ts); err != nil {
t.Fatal(err)
}
r, err := os.Open(w.Name())
if err != nil {
t.Fatal(err)
}
defer r.Close()
if _, err := Decode(r, -1); err == nil {
t.Error("Decode should reject GGUF files where tensor data extends beyond file size")
}
})
}

go.mod

@@ -26,6 +26,7 @@ require (
github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1
github.com/dlclark/regexp2 v1.11.4
github.com/emirpasic/gods/v2 v2.0.0-alpha
github.com/klauspost/compress v1.18.3
github.com/mattn/go-runewidth v0.0.16
github.com/nlpodyssey/gopickle v0.3.0
github.com/pdevine/tensor v0.0.0-20240510204454-f88f4562727c

go.sum

@@ -122,7 +122,6 @@ github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaS
github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY=
github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek=
github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps=
github.com/golang/snappy v0.0.3 h1:fHPg5GQYlCeLIPB9BZqMVR5nR9A+IM5zcgeTdjMYmLA=
github.com/golang/snappy v0.0.3/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
github.com/google/flatbuffers v2.0.0+incompatible/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8=
github.com/google/flatbuffers v24.3.25+incompatible h1:CX395cjN9Kke9mmalRoL3d81AtFUxJM+yDthflgJGkI=
@@ -150,8 +149,9 @@ github.com/jung-kurt/gofpdf v1.0.0/go.mod h1:7Id9E/uU8ce6rXgefFLlgrJj/GYY22cpxn+
github.com/jung-kurt/gofpdf v1.0.3-0.20190309125859-24315acbbda5/go.mod h1:7Id9E/uU8ce6rXgefFLlgrJj/GYY22cpxn+r32jIOes=
github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
github.com/klauspost/compress v1.13.1 h1:wXr2uRxZTJXHLly6qhJabee5JqIhTRoLBhDOA74hDEQ=
github.com/klauspost/compress v1.13.1/go.mod h1:8dP1Hq4DHOhN9w426knH3Rhby4rFm6D8eO+e+Dq5Gzg=
github.com/klauspost/compress v1.18.3 h1:9PJRvfbmTabkOX8moIpXPbMMbYN60bWImDDU7L+/6zw=
github.com/klauspost/compress v1.18.3/go.mod h1:R0h/fSBs8DE4ENlcrlib3PsXS61voFxhIs2DeRhCvJ4=
github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
github.com/klauspost/cpuid/v2 v2.2.7 h1:ZWSB3igEs+d0qvnxR/ZBzXVmxkgt8DdzP6m9pfuVLDM=
github.com/klauspost/cpuid/v2 v2.2.7/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws=

kvcache/recurrent.go (new file)

@@ -0,0 +1,752 @@
package kvcache
import (
"errors"
"fmt"
"math"
"slices"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/model/input"
)
const (
DefaultCheckpointCount = 24
DefaultCheckpointMinPos = int32(16)
DefaultCheckpointInterval = int32(1664)
)
var ErrInvalidRecurrentShape = errors.New("kvcache: invalid recurrent state shape")
// RecurrentConfig configures a shared hybrid recurrent cache.
type RecurrentConfig struct {
Shift func(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error)
ConvDim int
ConvChannels int
RecurrentStateSize int
CheckpointLogPrefix string
}
var (
_ Cache = (*Recurrent)(nil)
_ CheckpointCache = (*Recurrent)(nil)
)
// Recurrent stores:
// - a standard causal KV cache
// - per-sequence conv state for recurrent operators
// - per-sequence recurrent state for recurrent operators
//
// Conv state shape (per layer, per sequence): [convDim, convChannels]
// Recurrent state shape (per layer, per sequence): [recurrentStateSize]
type Recurrent struct {
kv *Causal
backend ml.Backend
dtype ml.DType
maxSequences int
// Conv state dimensions
convDim int
convChannels int
// Recurrent state dimensions
recurrentStateSize int
logPrefix string
// slot mapping for recurrent state (copy-on-write)
slotForSeq map[int]int
refCount []int
freeSlots []int
seqCounts map[int]int
slotScratch [1]int32
// per-layer conv state buffers (allocated lazily)
convCtxs map[int]ml.Context
convStates map[int]ml.Tensor // [convDim*convChannels, maxSlots]
// per-layer recurrent state buffers (allocated lazily)
recurrentCtxs map[int]ml.Context
recurrentStates map[int]ml.Tensor // [recurrentStateSize, maxSlots]
// recurrent checkpoints (per slot)
checkpointCount int
checkpointMinPos int32
checkpointInterval int32
checkpointCtxSize int
checkpoints map[int]*slotCheckpointStore
pendingRestore map[int]checkpointRestore
curCheckpointPos []int32
curCheckpointSlots map[int]int
reserveCheckpoints bool
checkpointConvCtxs map[int]ml.Context
checkpointRecurCtxs map[int]ml.Context
checkpointReserved map[int]struct{}
// current forward batch (derived in StartForward)
curSeqs []int
curSlots []int
curSlotsInput ml.Tensor
curSeqTokens int
// track if EnsureWritable has been called for this forward pass
writableEnsured bool
writableError error
}
func NewRecurrentCache(config RecurrentConfig) *Recurrent {
return &Recurrent{
kv: NewCausalCache(config.Shift),
convDim: config.ConvDim,
convChannels: config.ConvChannels,
recurrentStateSize: config.RecurrentStateSize,
logPrefix: config.CheckpointLogPrefix,
slotForSeq: make(map[int]int),
seqCounts: make(map[int]int),
convCtxs: make(map[int]ml.Context),
convStates: make(map[int]ml.Tensor),
recurrentCtxs: make(map[int]ml.Context),
recurrentStates: make(map[int]ml.Tensor),
checkpointCount: DefaultCheckpointCount,
checkpointMinPos: DefaultCheckpointMinPos,
checkpointInterval: DefaultCheckpointInterval,
checkpoints: make(map[int]*slotCheckpointStore),
pendingRestore: make(map[int]checkpointRestore),
curCheckpointSlots: make(map[int]int),
checkpointConvCtxs: make(map[int]ml.Context),
checkpointRecurCtxs: make(map[int]ml.Context),
checkpointReserved: make(map[int]struct{}),
}
}
func (c *Recurrent) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity, maxBatch int) {
c.backend = backend
c.dtype = dtype
c.maxSequences = maxSequences
c.checkpoints = make(map[int]*slotCheckpointStore)
c.pendingRestore = make(map[int]checkpointRestore)
c.curCheckpointPos = c.curCheckpointPos[:0]
c.curCheckpointSlots = make(map[int]int)
c.checkpointReserved = make(map[int]struct{})
c.checkpointCtxSize = c.checkpointCount * c.maxSequences
if c.checkpointCtxSize < 8 {
c.checkpointCtxSize = 8
}
// initialize slot allocator
c.refCount = make([]int, maxSequences)
c.freeSlots = c.freeSlots[:0]
for i := maxSequences - 1; i >= 0; i-- {
c.freeSlots = append(c.freeSlots, i)
}
c.kv.Init(backend, dtype, maxSequences, capacity, maxBatch)
}
func (c *Recurrent) Close() {
for _, ctx := range c.convCtxs {
ctx.Close()
}
for _, ctx := range c.recurrentCtxs {
ctx.Close()
}
for _, ctx := range c.checkpointConvCtxs {
ctx.Close()
}
for _, ctx := range c.checkpointRecurCtxs {
ctx.Close()
}
c.kv.Close()
}
func (c *Recurrent) SetConfig(config ml.CacheConfig) {
c.kv.SetConfig(config)
}
func (c *Recurrent) SetLayer(layer int) {
c.kv.SetLayer(layer)
}
func (c *Recurrent) Get(ctx ml.Context) (ml.Tensor, ml.Tensor, ml.Tensor) {
return c.kv.Get(ctx)
}
func (c *Recurrent) Put(ctx ml.Context, key, value ml.Tensor) {
c.kv.Put(ctx, key, value)
}
func (c *Recurrent) StartForward(ctx ml.Context, batch input.Batch, reserve bool) error {
if err := c.kv.StartForward(ctx, batch, reserve); err != nil {
return err
}
nTokens := len(batch.Sequences)
if nTokens == 0 {
c.curSeqs = c.curSeqs[:0]
c.curSlots = c.curSlots[:0]
c.curSlotsInput = nil
c.curSeqTokens = 0
c.reserveCheckpoints = false
c.writableEnsured = false
c.writableError = nil
return nil
}
// Fast path for single-sequence batches (common during decode and prefill).
firstSeq := batch.Sequences[0]
singleSeq := true
for _, s := range batch.Sequences[1:] {
if s != firstSeq {
singleSeq = false
break
}
}
if singleSeq {
return c.startForwardSingleSeq(ctx, firstSeq, nTokens, batch, reserve)
}
// Derive equal-length sequence layout for recurrent layers.
seqCounts := c.seqCounts
for s := range seqCounts {
delete(seqCounts, s)
}
c.curSeqs = c.curSeqs[:0]
for _, s := range batch.Sequences {
if seqCounts[s] == 0 {
c.curSeqs = append(c.curSeqs, s)
}
seqCounts[s]++
}
nSeqs := len(c.curSeqs)
want := nTokens / nSeqs
for _, s := range c.curSeqs {
if seqCounts[s] != want {
return ErrNotSupported
}
}
c.curSeqTokens = want
if reserve {
c.curSlots = c.curSlots[:0]
for i := range nSeqs {
c.curSlots = append(c.curSlots, i)
}
c.finalizeStartForward(ctx, batch, true)
return nil
}
// Ensure slots exist for sequences in this batch.
c.curSlots = c.curSlots[:0]
var newSlots []int
for _, s := range c.curSeqs {
slot, ok := c.slotForSeq[s]
if !ok {
var err error
slot, err = c.allocSlot()
if err != nil {
return err
}
c.slotForSeq[s] = slot
c.refCount[slot] = 1
newSlots = append(newSlots, slot)
}
c.curSlots = append(c.curSlots, slot)
}
if len(newSlots) > 0 {
c.zeroSlots(ctx, newSlots)
}
c.finalizeStartForward(ctx, batch, false)
return nil
}
func (c *Recurrent) startForwardSingleSeq(ctx ml.Context, seq, seqTokens int, batch input.Batch, reserve bool) error {
c.curSeqs = append(c.curSeqs[:0], seq)
c.curSeqTokens = seqTokens
if reserve {
c.curSlots = append(c.curSlots[:0], 0)
c.finalizeStartForward(ctx, batch, true)
return nil
}
slot, ok := c.slotForSeq[seq]
if !ok {
var err error
slot, err = c.allocSlot()
if err != nil {
return err
}
c.slotForSeq[seq] = slot
c.refCount[slot] = 1
slotList := [1]int{slot}
c.zeroSlots(ctx, slotList[:])
}
c.curSlots = append(c.curSlots[:0], slot)
c.finalizeStartForward(ctx, batch, false)
return nil
}
func (c *Recurrent) finalizeStartForward(ctx ml.Context, batch input.Batch, reserve bool) {
c.setCurSlotsInput(ctx)
c.writableEnsured = false
c.writableError = nil
c.reserveCheckpoints = reserve
c.planCheckpoints(batch)
}
func (c *Recurrent) setCurSlotsInput(ctx ml.Context) {
c.curSlotsInput = c.slotsInput(ctx, c.curSlots)
}
func (c *Recurrent) slotsInput(ctx ml.Context, slots []int) ml.Tensor {
switch len(slots) {
case 0:
return nil
case 1:
c.slotScratch[0] = int32(slots[0])
return ctx.Input().FromInts(c.slotScratch[:], 1)
default:
slotIndices := make([]int32, len(slots))
for i, v := range slots {
slotIndices[i] = int32(v)
}
return ctx.Input().FromInts(slotIndices, len(slotIndices))
}
}
func (c *Recurrent) allocSlot() (int, error) {
if len(c.freeSlots) == 0 {
return 0, ErrKvCacheFull
}
slot := c.freeSlots[len(c.freeSlots)-1]
c.freeSlots = c.freeSlots[:len(c.freeSlots)-1]
return slot, nil
}
func (c *Recurrent) freeSlot(slot int) {
if slot >= 0 && slot < c.maxSequences {
c.freeSlots = append(c.freeSlots, slot)
}
}
// zeroSlots zeros recurrent state for the given slots across all cached layers.
func (c *Recurrent) zeroSlots(ctx ml.Context, slots []int) {
if len(slots) == 0 {
return
}
inputCtx := ctx.Input()
slotsTensor := c.slotsInput(ctx, slots)
if len(c.convStates) > 0 {
zeros := inputCtx.Zeros(ml.DTypeF32, c.convDim*c.convChannels, len(slots))
for _, buf := range c.convStates {
ctx.Forward(buf.SetRows(ctx, zeros, slotsTensor))
}
}
if len(c.recurrentStates) > 0 {
zeros := inputCtx.Zeros(ml.DTypeF32, c.recurrentStateSize, len(slots))
for _, buf := range c.recurrentStates {
ctx.Forward(buf.SetRows(ctx, zeros, slotsTensor))
}
}
}
// EnsureWritable ensures sequences have private slots (copy-on-write).
func (c *Recurrent) EnsureWritable(ctx ml.Context) error {
for i, seq := range c.curSeqs {
slot, ok := c.slotForSeq[seq]
if !ok {
continue
}
if slot < 0 || slot >= len(c.refCount) {
continue
}
if c.refCount[slot] <= 1 {
continue
}
newSlot, err := c.allocSlot()
if err != nil {
return err
}
c.refCount[slot]--
c.refCount[newSlot] = 1
c.slotForSeq[seq] = newSlot
c.curSlots[i] = newSlot
c.copyRecurrentState(ctx, slot, newSlot)
c.copyCheckpoints(ctx, slot, newSlot)
}
c.setCurSlotsInput(ctx)
return nil
}
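EnsureWritable above implements copy-on-write: after CopyPrefix, several sequences may point at the same state slot, and the first sequence that wants to write detaches onto a freshly allocated slot. A minimal sketch of that refcounted bookkeeping (the `cowSlots` type is hypothetical; the real cache additionally copies the conv/recurrent tensors and checkpoints, elided here):

```go
package main

import (
	"errors"
	"fmt"
)

// cowSlots models only the slot accounting: which slot each sequence
// maps to, how many sequences share each slot, and the free list.
type cowSlots struct {
	slotForSeq map[int]int
	refCount   []int
	freeSlots  []int
}

func newCOWSlots(maxSlots int) *cowSlots {
	c := &cowSlots{slotForSeq: map[int]int{}, refCount: make([]int, maxSlots)}
	for i := maxSlots - 1; i >= 0; i-- {
		c.freeSlots = append(c.freeSlots, i)
	}
	return c
}

func (c *cowSlots) alloc() (int, error) {
	if len(c.freeSlots) == 0 {
		return 0, errors.New("cache full")
	}
	s := c.freeSlots[len(c.freeSlots)-1]
	c.freeSlots = c.freeSlots[:len(c.freeSlots)-1]
	return s, nil
}

// share points dst at src's slot and bumps the refcount, as CopyPrefix does.
func (c *cowSlots) share(src, dst int) {
	slot := c.slotForSeq[src]
	c.slotForSeq[dst] = slot
	c.refCount[slot]++
}

// ensureWritable gives seq a private slot when its current slot is shared;
// the caller would then copy state from the old slot to the new one.
func (c *cowSlots) ensureWritable(seq int) (int, error) {
	slot := c.slotForSeq[seq]
	if c.refCount[slot] <= 1 {
		return slot, nil
	}
	newSlot, err := c.alloc()
	if err != nil {
		return 0, err
	}
	c.refCount[slot]--
	c.refCount[newSlot] = 1
	c.slotForSeq[seq] = newSlot
	return newSlot, nil
}

func main() {
	c := newCOWSlots(4)
	s, _ := c.alloc()
	c.slotForSeq[1] = s
	c.refCount[s] = 1
	c.share(1, 2) // seq 2 now shares seq 1's slot
	w, _ := c.ensureWritable(2)
	fmt.Println(c.slotForSeq[1] != w) // true: seq 2 detached onto its own slot
}
```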
func (c *Recurrent) copyRecurrentState(ctx ml.Context, srcSlot, dstSlot int) {
src := ctx.Input().FromInts([]int32{int32(srcSlot)}, 1)
dst := ctx.Input().FromInts([]int32{int32(dstSlot)}, 1)
for _, buf := range c.convStates {
rows := buf.Rows(ctx, src)
if rows.DType() != ml.DTypeF32 {
rows = rows.Cast(ctx, ml.DTypeF32)
}
ctx.Forward(buf.SetRows(ctx, rows, dst))
}
for _, buf := range c.recurrentStates {
rows := buf.Rows(ctx, src)
if rows.DType() != ml.DTypeF32 {
rows = rows.Cast(ctx, ml.DTypeF32)
}
ctx.Forward(buf.SetRows(ctx, rows, dst))
}
}
func (c *Recurrent) CopyPrefix(srcSeq, dstSeq int, prefixLen int32) {
c.kv.CopyPrefix(srcSeq, dstSeq, prefixLen)
if dstSlot, ok := c.slotForSeq[dstSeq]; ok {
if c.validSlot(dstSlot) {
c.refCount[dstSlot]--
if c.refCount[dstSlot] <= 0 {
c.refCount[dstSlot] = 0
c.freeSlot(dstSlot)
}
}
delete(c.slotForSeq, dstSeq)
}
srcSlot, ok := c.slotForSeq[srcSeq]
if !ok {
return
}
if c.validSlot(srcSlot) {
c.slotForSeq[dstSeq] = srcSlot
c.refCount[srcSlot]++
}
}
func (c *Recurrent) CanResume(seq int, pos int32) bool {
if !c.kv.CanResume(seq, pos) {
return false
}
if pos == 0 {
return true
}
return c.hasCheckpoint(seq, pos)
}
func (c *Recurrent) Remove(seq int, beginIndex, endIndex int32) error {
if beginIndex > 0 && endIndex != math.MaxInt32 {
if err := c.kv.Remove(seq, beginIndex, endIndex); err != nil {
return err
}
delete(c.pendingRestore, seq)
slot, ok := c.slotForSeq[seq]
if !ok || !c.validSlot(slot) {
return nil
}
// Detach shared recurrent state/checkpoints before mutating checkpoint positions.
if c.refCount[slot] > 1 {
newSlot, err := c.allocSlot()
if err != nil {
return err
}
ctx := c.backend.NewContext()
c.copyRecurrentState(ctx, slot, newSlot)
c.copyCheckpoints(ctx, slot, newSlot)
if len(c.convStates) > 0 || len(c.recurrentStates) > 0 {
ctx.Compute()
}
ctx.Close()
c.refCount[slot]--
c.refCount[newSlot] = 1
c.slotForSeq[seq] = newSlot
slot = newSlot
}
c.shiftCheckpoints(slot, beginIndex, endIndex)
return nil
}
if beginIndex > 0 {
restore, ok := c.pendingRestore[seq]
if !ok || restore.pos+1 != beginIndex {
return ErrNotSupported
}
if !c.restoreComplete(restore) {
return ErrNotSupported
}
if slot, ok := c.slotForSeq[seq]; ok && c.validSlot(slot) && c.refCount[slot] > 1 {
newSlot, err := c.allocSlot()
if err != nil {
return err
}
ctx := c.backend.NewContext()
c.copyRecurrentState(ctx, slot, newSlot)
c.copyCheckpoints(ctx, slot, newSlot)
if len(c.convStates) > 0 || len(c.recurrentStates) > 0 {
ctx.Compute()
}
ctx.Close()
c.refCount[slot]--
c.refCount[newSlot] = 1
c.slotForSeq[seq] = newSlot
restore.slot = newSlot
c.pendingRestore[seq] = restore
}
}
if err := c.kv.Remove(seq, beginIndex, endIndex); err != nil {
return err
}
if beginIndex > 0 {
restore := c.pendingRestore[seq]
delete(c.pendingRestore, seq)
return c.applyCheckpointRestore(restore)
}
slot, ok := c.slotForSeq[seq]
delete(c.pendingRestore, seq)
if !ok {
return nil
}
if !c.validSlot(slot) {
delete(c.slotForSeq, seq)
return nil
}
c.refCount[slot]--
if c.refCount[slot] <= 0 {
c.refCount[slot] = 0
c.clearCheckpoints(slot)
c.freeSlot(slot)
}
delete(c.slotForSeq, seq)
return nil
}
func (c *Recurrent) validSlot(slot int) bool {
return slot >= 0 && slot < len(c.refCount)
}
func (c *Recurrent) SlotsTensor() ml.Tensor {
return c.curSlotsInput
}
// contiguousSlots returns the starting slot if current slots are contiguous and ordered.
func (c *Recurrent) contiguousSlots() (int, bool) {
if len(c.curSlots) == 0 {
return 0, false
}
start := c.curSlots[0]
for i, s := range c.curSlots {
if s != start+i {
return 0, false
}
}
return start, true
}
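The contiguousSlots fast path above lets currentSlotRows address the batch's state rows with a single strided view instead of a gather through Rows/SetRows. The run detection itself is a few lines (sketched here as a free function with a hypothetical name):

```go
package main

import "fmt"

// contiguousRun reports whether slots form an ascending run like [3,4,5],
// returning the starting slot when they do.
func contiguousRun(slots []int) (start int, ok bool) {
	if len(slots) == 0 {
		return 0, false
	}
	start = slots[0]
	for i, s := range slots {
		if s != start+i {
			return 0, false
		}
	}
	return start, true
}

func main() {
	fmt.Println(contiguousRun([]int{3, 4, 5})) // 3 true
	fmt.Println(contiguousRun([]int{2, 0, 1})) // 0 false
}
```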
func (c *Recurrent) SeqTokens() int {
return c.curSeqTokens
}
func (c *Recurrent) NumSeqs() int {
return len(c.curSeqs)
}
func (c *Recurrent) convBuffer(layer int) ml.Tensor {
if buf, ok := c.convStates[layer]; ok {
return buf
}
if _, ok := c.convCtxs[layer]; !ok {
c.convCtxs[layer] = c.backend.NewContextSize(1).Layer(layer)
}
buf := c.convCtxs[layer].Zeros(ml.DTypeF32, c.convDim*c.convChannels, c.maxSequences)
c.convStates[layer] = buf
return buf
}
func (c *Recurrent) recurrentBuffer(layer int) ml.Tensor {
if buf, ok := c.recurrentStates[layer]; ok {
return buf
}
if _, ok := c.recurrentCtxs[layer]; !ok {
c.recurrentCtxs[layer] = c.backend.NewContextSize(1).Layer(layer)
}
buf := c.recurrentCtxs[layer].Zeros(ml.DTypeF32, c.recurrentStateSize, c.maxSequences)
c.recurrentStates[layer] = buf
return buf
}
func (c *Recurrent) ensureWritable(ctx ml.Context) error {
c.ensureWritableOnce(ctx)
return c.writableError
}
func (c *Recurrent) currentSlotRows(ctx ml.Context, buf ml.Tensor, rowSize int) ml.Tensor {
if start, ok := c.contiguousSlots(); ok {
offset := start * buf.Stride(1)
return buf.View(ctx, offset, rowSize, buf.Stride(1), c.NumSeqs())
}
return buf.Rows(ctx, c.SlotsTensor())
}
func (c *Recurrent) writeCurrentSlotRows(ctx ml.Context, buf ml.Tensor, rowSize int, src ml.Tensor) {
if start, ok := c.contiguousSlots(); ok {
offset := start * buf.Stride(1)
view := buf.View(ctx, offset, rowSize, buf.Stride(1), c.NumSeqs())
ctx.Forward(src.Copy(ctx, view))
return
}
ctx.Forward(buf.SetRows(ctx, src, c.SlotsTensor()))
}
func (c *Recurrent) ensureWritableOnce(ctx ml.Context) {
if !c.writableEnsured {
needsWritable := false
for _, seq := range c.curSeqs {
slot, ok := c.slotForSeq[seq]
if !ok {
continue
}
if slot >= 0 && slot < len(c.refCount) && c.refCount[slot] > 1 {
needsWritable = true
break
}
}
if needsWritable {
if err := c.EnsureWritable(ctx); err != nil {
c.writableError = err
}
}
c.writableEnsured = true
}
}
// ConvState returns conv state for current batch sequences as [convDim, convChannels, nSeqs].
func (c *Recurrent) ConvState(ctx ml.Context, layer int) (ml.Tensor, error) {
if err := c.ensureWritable(ctx); err != nil {
return nil, err
}
buf := c.convBuffer(layer)
cur := c.currentSlotRows(ctx, buf, c.convDim*c.convChannels)
return cur.Reshape(ctx, c.convDim, c.convChannels, c.NumSeqs()), nil
}
// UpdateConvState writes new conv state for current batch sequences.
func (c *Recurrent) UpdateConvState(ctx ml.Context, layer int, newState ml.Tensor) {
buf := c.convBuffer(layer)
src := newState.Reshape(ctx, c.convDim*c.convChannels, c.NumSeqs())
srcF32 := src
if src.DType() != ml.DTypeF32 {
srcF32 = src.Cast(ctx, ml.DTypeF32)
}
c.writeCurrentSlotRows(ctx, buf, c.convDim*c.convChannels, srcF32)
c.captureConvCheckpoint(ctx, layer, srcF32)
}
// RecurrentState returns recurrent state for current batch sequences with shape [dims..., nSeqs].
func (c *Recurrent) RecurrentState(ctx ml.Context, layer int, dims ...int) (ml.Tensor, error) {
if err := c.ensureWritable(ctx); err != nil {
return nil, err
}
if len(dims) == 0 {
return nil, ErrInvalidRecurrentShape
}
size := 1
for _, d := range dims {
if d <= 0 {
return nil, ErrInvalidRecurrentShape
}
size *= d
}
if size != c.recurrentStateSize {
return nil, fmt.Errorf("%w: got %v (size %d), want size %d", ErrInvalidRecurrentShape, dims, size, c.recurrentStateSize)
}
buf := c.recurrentBuffer(layer)
cur := c.currentSlotRows(ctx, buf, c.recurrentStateSize)
shape := make([]int, 0, len(dims)+1)
shape = append(shape, dims...)
shape = append(shape, c.NumSeqs())
return cur.Reshape(ctx, shape...), nil
}
// RecurrentState4D returns recurrent state as [dim0, dim1, dim2, nSeqs].
func (c *Recurrent) RecurrentState4D(ctx ml.Context, layer int, dim0, dim1, dim2 int) (ml.Tensor, error) {
if err := c.ensureWritable(ctx); err != nil {
return nil, err
}
if dim0 <= 0 || dim1 <= 0 || dim2 <= 0 {
return nil, ErrInvalidRecurrentShape
}
size := dim0 * dim1 * dim2
if size != c.recurrentStateSize {
return nil, fmt.Errorf("%w: got [%d %d %d] (size %d), want size %d", ErrInvalidRecurrentShape, dim0, dim1, dim2, size, c.recurrentStateSize)
}
buf := c.recurrentBuffer(layer)
cur := c.currentSlotRows(ctx, buf, c.recurrentStateSize)
return cur.Reshape(ctx, dim0, dim1, dim2, c.NumSeqs()), nil
}
// UpdateRecurrentState writes new recurrent state for current batch sequences.
func (c *Recurrent) UpdateRecurrentState(ctx ml.Context, layer int, newState ml.Tensor) {
buf := c.recurrentBuffer(layer)
src := newState.Reshape(ctx, c.recurrentStateSize, c.NumSeqs())
srcF32 := src
if src.DType() != ml.DTypeF32 {
srcF32 = src.Cast(ctx, ml.DTypeF32)
}
c.writeCurrentSlotRows(ctx, buf, c.recurrentStateSize, srcF32)
c.captureRecurrentCheckpoint(ctx, layer, srcF32)
}
// IsSupportedForBatch returns true if the current batch layout supports recurrent layers.
func (c *Recurrent) IsSupportedForBatch() bool {
return c.curSeqTokens > 0 && len(c.curSeqs) > 0
}
// Seqs returns the ordered unique sequences for the current forward pass.
func (c *Recurrent) Seqs() []int {
return slices.Clone(c.curSeqs)
}


@@ -1,27 +1,20 @@
package qwen3next
package kvcache
import (
"log/slog"
"math"
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/model/input"
)
const (
checkpointCountDefault = 32
checkpointMinPosDefault = int32(16)
checkpointIntervalDefault = int32(1280)
)
// TODO(jmorganca): Add byte-serialized host-RAM checkpoints to reduce GPU
// memory usage while preserving prefix reuse for recurrent state.
type checkpointEntry struct {
pos int32
conv map[int]ml.Tensor
delta map[int]ml.Tensor
pos int32
conv map[int]ml.Tensor
recurrent map[int]ml.Tensor
}
type slotCheckpointStore struct {
@@ -132,6 +125,63 @@ func (s *slotCheckpointStore) pruneAfter(pos int32) {
s.lastPos = pos
}
func (s *slotCheckpointStore) shiftRange(beginIndex, endIndex int32) {
if len(s.entries) == 0 {
s.size = 0
s.next = 0
s.lastPos = -1
return
}
offset := beginIndex - endIndex
size := 0
next := -1
minPos := int32(math.MaxInt32)
maxPos := int32(-1)
minIdx := 0
for i := range s.entries {
pos := s.entries[i].pos
if pos >= 0 {
if pos >= beginIndex && pos < endIndex {
s.entries[i].pos = -1
} else if pos >= endIndex {
s.entries[i].pos = pos + offset
}
}
pos = s.entries[i].pos
if pos >= 0 {
size++
if pos < minPos {
minPos = pos
minIdx = i
}
if pos > maxPos {
maxPos = pos
}
} else if next == -1 {
next = i
}
}
s.size = size
if size == 0 {
s.next = 0
s.lastPos = -1
return
}
if next != -1 {
s.next = next
} else {
// Full ring: overwrite the oldest checkpoint next.
s.next = minIdx
}
s.lastPos = maxPos
}
func (s *slotCheckpointStore) window() (size int, minPos, maxPos, lastPos int32) {
minPos = int32(math.MaxInt32)
maxPos = int32(-1)
@@ -155,7 +205,14 @@ func (s *slotCheckpointStore) window() (size int, minPos, maxPos, lastPos int32)
return size, minPos, maxPos, s.lastPos
}
func (c *HybridCache) planCheckpoints(batch input.Batch) {
func (c *Recurrent) checkpointTag() string {
if c.logPrefix == "" {
return "kvcache.recurrent"
}
return c.logPrefix
}
func (c *Recurrent) planCheckpoints(batch input.Batch) {
if c.checkpointCount == 0 || len(c.curSeqs) == 0 {
c.curCheckpointPos = c.curCheckpointPos[:0]
for k := range c.curCheckpointSlots {
@@ -201,7 +258,7 @@ func (c *HybridCache) planCheckpoints(batch input.Batch) {
}
}
func (c *HybridCache) checkpointStore(slot int) *slotCheckpointStore {
func (c *Recurrent) checkpointStore(slot int) *slotCheckpointStore {
store, ok := c.checkpoints[slot]
if ok {
return store
@@ -211,7 +268,7 @@ func (c *HybridCache) checkpointStore(slot int) *slotCheckpointStore {
return store
}
func (c *HybridCache) checkpointIndexForSlot(slot int, pos int32) int {
func (c *Recurrent) checkpointIndexForSlot(slot int, pos int32) int {
if c.checkpointCount == 0 {
return -1
}
@@ -226,7 +283,7 @@ func (c *HybridCache) checkpointIndexForSlot(slot int, pos int32) int {
return idx
}
func (c *HybridCache) hasCheckpoint(seq int, pos int32) bool {
func (c *Recurrent) hasCheckpoint(seq int, pos int32) bool {
if pos <= 0 {
return false
}
@@ -242,7 +299,7 @@ func (c *HybridCache) hasCheckpoint(seq int, pos int32) bool {
return ok
}
func (c *HybridCache) PrepareRestore(seq int, targetPos int32) (int32, bool) {
func (c *Recurrent) PrepareRestore(seq int, targetPos int32) (int32, bool) {
if targetPos <= 0 {
return 0, false
}
@@ -252,13 +309,13 @@ func (c *HybridCache) PrepareRestore(seq int, targetPos int32) (int32, bool) {
}
store, ok := c.checkpoints[slot]
if !ok {
slog.Debug("qwen3next: checkpoint miss", "seq", seq, "slot", slot, "target", targetPos, "size", 0)
slog.Debug(c.checkpointTag()+": checkpoint miss", "seq", seq, "slot", slot, "target", targetPos, "size", 0)
return 0, false
}
idx, pos, ok := store.bestIndex(targetPos)
if !ok {
size, minPos, maxPos, lastPos := store.window()
slog.Debug("qwen3next: checkpoint miss", "seq", seq, "slot", slot, "target", targetPos, "size", size,
slog.Debug(c.checkpointTag()+": checkpoint miss", "seq", seq, "slot", slot, "target", targetPos, "size", size,
"min", minPos, "max", maxPos, "last", lastPos)
return 0, false
}
@@ -270,10 +327,10 @@ func (c *HybridCache) PrepareRestore(seq int, targetPos int32) (int32, bool) {
return pos + 1, true
}
func (c *HybridCache) applyCheckpointRestore(restore checkpointRestore) error {
func (c *Recurrent) applyCheckpointRestore(restore checkpointRestore) error {
entry, ok := c.restoreEntry(restore)
if !ok {
return kvcache.ErrNotSupported
return ErrNotSupported
}
ctx := c.backend.NewContext()
@@ -281,15 +338,15 @@ func (c *HybridCache) applyCheckpointRestore(restore checkpointRestore) error {
slotIdx := ctx.Input().FromInts([]int32{int32(restore.slot)}, 1)
for layer, src := range entry.conv {
buf := c.convBuffer(ctx, layer)
buf := c.convBuffer(layer)
ctx.Forward(buf.SetRows(ctx, src, slotIdx))
}
for layer, src := range entry.delta {
buf := c.deltaBuffer(ctx, layer)
for layer, src := range entry.recurrent {
buf := c.recurrentBuffer(layer)
ctx.Forward(buf.SetRows(ctx, src, slotIdx))
}
if len(entry.conv) > 0 || len(entry.delta) > 0 {
if len(entry.conv) > 0 || len(entry.recurrent) > 0 {
ctx.Compute()
}
store := c.checkpoints[restore.slot]
@@ -297,12 +354,12 @@ func (c *HybridCache) applyCheckpointRestore(restore checkpointRestore) error {
return nil
}
func (c *HybridCache) restoreComplete(restore checkpointRestore) bool {
func (c *Recurrent) restoreComplete(restore checkpointRestore) bool {
_, ok := c.restoreEntry(restore)
return ok
}
func (c *HybridCache) restoreEntry(restore checkpointRestore) (*checkpointEntry, bool) {
func (c *Recurrent) restoreEntry(restore checkpointRestore) (*checkpointEntry, bool) {
store, ok := c.checkpoints[restore.slot]
if !ok || restore.idx < 0 || restore.idx >= len(store.entries) {
return nil, false
@@ -317,27 +374,33 @@ func (c *HybridCache) restoreEntry(restore checkpointRestore) (*checkpointEntry,
return entry, true
}
func (c *HybridCache) entryComplete(entry *checkpointEntry) bool {
func (c *Recurrent) entryComplete(entry *checkpointEntry) bool {
for layer := range c.convStates {
if entry.conv == nil || entry.conv[layer] == nil {
return false
}
}
for layer := range c.deltaStates {
if entry.delta == nil || entry.delta[layer] == nil {
for layer := range c.recurrentStates {
if entry.recurrent == nil || entry.recurrent[layer] == nil {
return false
}
}
return true
}
func (c *HybridCache) clearCheckpoints(slot int) {
func (c *Recurrent) clearCheckpoints(slot int) {
if store, ok := c.checkpoints[slot]; ok {
store.reset()
}
}
func (c *HybridCache) copyCheckpoints(ctx ml.Context, srcSlot, dstSlot int) {
func (c *Recurrent) shiftCheckpoints(slot int, beginIndex, endIndex int32) {
if store, ok := c.checkpoints[slot]; ok {
store.shiftRange(beginIndex, endIndex)
}
}
func (c *Recurrent) copyCheckpoints(ctx ml.Context, srcSlot, dstSlot int) {
if c.checkpointCount == 0 {
return
}
@@ -363,19 +426,19 @@ func (c *HybridCache) copyCheckpoints(ctx ml.Context, srcSlot, dstSlot int) {
ctx.Forward(src.Copy(ctx, dst))
}
}
if srcEntry.delta != nil {
if dstEntry.delta == nil {
dstEntry.delta = make(map[int]ml.Tensor)
if srcEntry.recurrent != nil {
if dstEntry.recurrent == nil {
dstEntry.recurrent = make(map[int]ml.Tensor)
}
for layer, src := range srcEntry.delta {
dst := c.ensureCheckpointDelta(layer, dstEntry)
for layer, src := range srcEntry.recurrent {
dst := c.ensureCheckpointRecurrent(layer, dstEntry)
ctx.Forward(src.Copy(ctx, dst))
}
}
}
}
func (c *HybridCache) captureConvCheckpoint(ctx ml.Context, layer int, src ml.Tensor) {
func (c *Recurrent) captureConvCheckpoint(ctx ml.Context, layer int, src ml.Tensor) {
if c.checkpointCount == 0 {
return
}
@@ -402,12 +465,12 @@ func (c *HybridCache) captureConvCheckpoint(ctx ml.Context, layer int, src ml.Te
}
}
func (c *HybridCache) captureDeltaCheckpoint(ctx ml.Context, layer int, src ml.Tensor) {
func (c *Recurrent) captureRecurrentCheckpoint(ctx ml.Context, layer int, src ml.Tensor) {
if c.checkpointCount == 0 {
return
}
if c.reserveCheckpoints {
c.reserveCheckpointDelta(layer)
c.reserveCheckpointRecurrent(layer)
return
}
if len(c.curCheckpointPos) == 0 {
@@ -423,13 +486,13 @@ func (c *HybridCache) captureDeltaCheckpoint(ctx ml.Context, layer int, src ml.T
continue
}
entry := &c.checkpoints[slot].entries[idx]
dst := c.ensureCheckpointDelta(layer, entry)
dst := c.ensureCheckpointRecurrent(layer, entry)
seqSlice := src.Slice(ctx, 1, i, i+1, 1)
ctx.Forward(seqSlice.Copy(ctx, dst))
}
}
func (c *HybridCache) ensureCheckpointConv(layer int, entry *checkpointEntry) ml.Tensor {
func (c *Recurrent) ensureCheckpointConv(layer int, entry *checkpointEntry) ml.Tensor {
if entry.conv == nil {
entry.conv = make(map[int]ml.Tensor)
}
@@ -446,24 +509,24 @@ func (c *HybridCache) ensureCheckpointConv(layer int, entry *checkpointEntry) ml
return t
}
func (c *HybridCache) ensureCheckpointDelta(layer int, entry *checkpointEntry) ml.Tensor {
if entry.delta == nil {
entry.delta = make(map[int]ml.Tensor)
func (c *Recurrent) ensureCheckpointRecurrent(layer int, entry *checkpointEntry) ml.Tensor {
if entry.recurrent == nil {
entry.recurrent = make(map[int]ml.Tensor)
}
if t, ok := entry.delta[layer]; ok {
if t, ok := entry.recurrent[layer]; ok {
return t
}
ctx, ok := c.checkpointDeltaCtxs[layer]
ctx, ok := c.checkpointRecurCtxs[layer]
if !ok {
ctx = c.backend.NewContextSize(c.checkpointCtxSize).Layer(layer)
c.checkpointDeltaCtxs[layer] = ctx
c.checkpointRecurCtxs[layer] = ctx
}
t := ctx.Zeros(ml.DTypeF32, c.deltaStateSize, 1)
entry.delta[layer] = t
t := ctx.Zeros(ml.DTypeF32, c.recurrentStateSize, 1)
entry.recurrent[layer] = t
return t
}
func (c *HybridCache) reserveCheckpointConv(layer int) {
func (c *Recurrent) reserveCheckpointConv(layer int) {
key := checkpointReserveKey(layer, 0)
if _, ok := c.checkpointReserved[key]; ok {
return
@@ -478,7 +541,7 @@ func (c *HybridCache) reserveCheckpointConv(layer int) {
c.checkpointReserved[key] = struct{}{}
}
func (c *HybridCache) reserveCheckpointDelta(layer int) {
func (c *Recurrent) reserveCheckpointRecurrent(layer int) {
key := checkpointReserveKey(layer, 1)
if _, ok := c.checkpointReserved[key]; ok {
return
@@ -487,7 +550,7 @@ func (c *HybridCache) reserveCheckpointDelta(layer int) {
store := c.checkpointStore(slot)
for i := range store.entries {
entry := &store.entries[i]
_ = c.ensureCheckpointDelta(layer, entry)
_ = c.ensureCheckpointRecurrent(layer, entry)
}
}
c.checkpointReserved[key] = struct{}{}


@@ -1,40 +1,16 @@
package qwen3next
package kvcache
import (
"errors"
"math"
"os"
"slices"
"testing"
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
)
func newTestBackend(tb testing.TB) ml.Backend {
tb.Helper()
f, err := os.CreateTemp(tb.TempDir(), "*.gguf")
if err != nil {
tb.Fatal(err)
}
if err := ggml.WriteGGUF(f, ggml.KV{"general.architecture": "test"}, nil); err != nil {
_ = f.Close()
tb.Fatal(err)
}
if err := f.Close(); err != nil {
tb.Fatal(err)
}
b, err := ml.NewBackend(f.Name(), ml.BackendParams{AllocMemory: true})
if err != nil {
tb.Fatal(err)
}
tb.Cleanup(func() {
b.Close()
})
return b
func newTestCache() *Recurrent {
return NewRecurrentCache(RecurrentConfig{ConvDim: 1, ConvChannels: 2, RecurrentStateSize: 2})
}
func TestSlotCheckpointStoreBestIndex(t *testing.T) {
@@ -59,8 +35,8 @@ func TestSlotCheckpointStoreBestIndex(t *testing.T) {
}
}
func TestHybridCachePrepareRestore(t *testing.T) {
cache := NewHybridCache(nil, 1, 1, 1)
func TestCachePrepareRestore(t *testing.T) {
cache := newTestCache()
cache.checkpointCount = 3
cache.checkpoints = make(map[int]*slotCheckpointStore)
cache.pendingRestore = make(map[int]checkpointRestore)
@@ -110,45 +86,8 @@ func TestSlotCheckpointStorePruneAfter(t *testing.T) {
}
}
func TestHybridCacheRestoreDetachesSharedSlot(t *testing.T) {
backend := newTestBackend(t)
cache := NewHybridCache(nil, 1, 2, 2)
cache.Init(backend, ml.DTypeF16, 2, 8, 2)
cache.slotForSeq[1] = 0
cache.slotForSeq[2] = 0
cache.refCount[0] = 2
cache.refCount[1] = 0
cache.freeSlots = []int{1}
store := cache.checkpointStore(0)
idx := store.record(9)
cache.pendingRestore[1] = checkpointRestore{slot: 0, idx: idx, pos: 9}
if err := cache.Remove(1, 10, math.MaxInt32); err != nil {
t.Fatalf("Remove failed: %v", err)
}
if cache.slotForSeq[1] == cache.slotForSeq[2] {
t.Fatalf("expected restore to detach shared slot, got same slot %d", cache.slotForSeq[1])
}
if cache.slotForSeq[1] != 1 {
t.Fatalf("expected seq 1 to move to slot 1, got %d", cache.slotForSeq[1])
}
if cache.slotForSeq[2] != 0 {
t.Fatalf("expected seq 2 to remain on slot 0, got %d", cache.slotForSeq[2])
}
if cache.refCount[0] != 1 || cache.refCount[1] != 1 {
t.Fatalf("unexpected refCounts: slot0=%d slot1=%d", cache.refCount[0], cache.refCount[1])
}
if _, ok := cache.pendingRestore[1]; ok {
t.Fatalf("expected pending restore to be cleared")
}
}
func TestHybridCacheRestoreRejectsIncompleteCheckpoint(t *testing.T) {
cache := NewHybridCache(nil, 1, 2, 2)
func TestCacheRestoreRejectsIncompleteCheckpoint(t *testing.T) {
cache := newTestCache()
cache.checkpointCount = 3
cache.checkpoints = make(map[int]*slotCheckpointStore)
cache.pendingRestore = make(map[int]checkpointRestore)
@@ -157,27 +96,26 @@ func TestHybridCacheRestoreRejectsIncompleteCheckpoint(t *testing.T) {
cache.refCount = []int{1}
cache.freeSlots = nil
// Simulate that layer 0 has both conv and delta state (so entryComplete expects both)
cache.convStates[0] = nil // placeholder to indicate layer 0 exists
cache.deltaStates[0] = nil // placeholder to indicate layer 0 exists
// Simulate layer 0 requires both conv and recurrent checkpoints.
cache.convStates[0] = nil
cache.recurrentStates[0] = nil
store := cache.checkpointStore(0)
idx := store.record(9)
entry := &store.entries[idx]
// Only set conv checkpoint, not delta - making it incomplete
entry.conv = map[int]ml.Tensor{0: nil}
// entry.delta is not set, so checkpoint is incomplete
// entry.recurrent intentionally missing
cache.pendingRestore[1] = checkpointRestore{slot: 0, idx: idx, pos: 9}
err := cache.Remove(1, 10, math.MaxInt32)
if !errors.Is(err, kvcache.ErrNotSupported) {
if !errors.Is(err, ErrNotSupported) {
t.Fatalf("expected ErrNotSupported for incomplete checkpoint, got %v", err)
}
}
func TestHybridCacheRestoreAcceptsCompleteCheckpoint(t *testing.T) {
cache := NewHybridCache(nil, 1, 2, 2)
func TestCacheRestoreAcceptsCompleteCheckpoint(t *testing.T) {
cache := newTestCache()
cache.checkpointCount = 3
cache.checkpoints = make(map[int]*slotCheckpointStore)
cache.pendingRestore = make(map[int]checkpointRestore)
@@ -186,55 +124,111 @@ func TestHybridCacheRestoreAcceptsCompleteCheckpoint(t *testing.T) {
cache.refCount = []int{1}
cache.freeSlots = nil
// Don't set convStates/deltaStates - with no layers to check,
// entryComplete will return true as long as entry.pos >= 0
store := cache.checkpointStore(0)
idx := store.record(9)
cache.pendingRestore[1] = checkpointRestore{slot: 0, idx: idx, pos: 9}
// Test that restoreComplete returns true when no layers need checkpoints
restore := cache.pendingRestore[1]
if !cache.restoreComplete(restore) {
t.Fatalf("expected restoreComplete to return true for complete checkpoint")
}
}
func TestCacheRecurrentStateShapeValidation(t *testing.T) {
cache := newTestCache()
_, err := cache.RecurrentState(nil, 0, 3)
if !errors.Is(err, ErrInvalidRecurrentShape) {
t.Fatalf("expected ErrInvalidRecurrentShape, got %v", err)
}
}
func TestSlotCheckpointStoreShiftRange(t *testing.T) {
store := newSlotCheckpointStore(5)
store.record(1)
store.record(4)
store.record(7)
store.record(10)
store.shiftRange(2, 6)
var positions []int32
for i := range store.entries {
if store.entries[i].pos >= 0 {
positions = append(positions, store.entries[i].pos)
}
}
slices.Sort(positions)
want := []int32{1, 3, 6}
if !slices.Equal(positions, want) {
t.Fatalf("unexpected shifted positions: got=%v want=%v", positions, want)
}
if store.lastPos != 6 {
t.Fatalf("expected lastPos 6, got %d", store.lastPos)
}
}
func TestCacheRemoveMiddleShiftsCheckpoints(t *testing.T) {
cache := newTestCache()
cache.slotForSeq[1] = 0
cache.refCount = []int{1}
cache.pendingRestore[1] = checkpointRestore{slot: 0, idx: 0, pos: 1}
store := cache.checkpointStore(0)
store.record(1)
store.record(4)
store.record(7)
store.record(10)
if err := cache.Remove(1, 2, 6); err != nil {
t.Fatalf("expected middle remove to succeed, got %v", err)
}
if _, ok := cache.pendingRestore[1]; ok {
t.Fatalf("expected pending restore to be cleared after middle remove")
}
var positions []int32
for i := range store.entries {
if store.entries[i].pos >= 0 {
positions = append(positions, store.entries[i].pos)
}
}
slices.Sort(positions)
want := []int32{1, 3, 6}
if !slices.Equal(positions, want) {
t.Fatalf("unexpected checkpoint positions after remove: got=%v want=%v", positions, want)
}
}
func TestSlotCheckpointStoreRingBufferWrapAround(t *testing.T) {
// Test that ring buffer wrap-around reuses entries without clearing maps.
store := newSlotCheckpointStore(3)
// Fill the buffer
store.record(10)
store.record(20)
store.record(30)
// Create fake tensor data in the first entry's maps
store.entries[0].conv = make(map[int]ml.Tensor)
store.entries[0].conv[0] = nil // Simulated tensor reference
store.entries[0].delta = make(map[int]ml.Tensor)
store.entries[0].delta[0] = nil // Simulated tensor reference
store.entries[0].conv[0] = nil
store.entries[0].recurrent = make(map[int]ml.Tensor)
store.entries[0].recurrent[0] = nil
// Record another entry, which should wrap around and overwrite entry 0
store.record(40)
// Verify the maps are still present (we reuse tensors)
if store.entries[0].conv == nil {
t.Fatalf("expected conv map to be preserved on reuse")
}
if store.entries[0].delta == nil {
t.Fatalf("expected delta map to be preserved on reuse")
if store.entries[0].recurrent == nil {
t.Fatalf("expected recurrent map to be preserved on reuse")
}
// Verify the new position was recorded
if store.entries[0].pos != 40 {
t.Fatalf("expected entry 0 pos to be 40, got %d", store.entries[0].pos)
}
}
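The wrap-around behavior this test relies on can be illustrated with a minimal fixed-capacity ring buffer. The type and field names here are illustrative assumptions, not the package's actual `slotCheckpointStore`; the point is only that once the buffer is full, `record` overwrites the oldest entry in place rather than growing or clearing it.

```go
package main

import "fmt"

// ringStore is a minimal fixed-capacity checkpoint ring: once full,
// record overwrites the oldest entry instead of growing.
type ringStore struct {
	pos  []int32
	next int
	size int
}

func newRingStore(capacity int) *ringStore {
	return &ringStore{pos: make([]int32, capacity)}
}

// record stores p and returns the index it landed in, wrapping
// around to index 0 when the buffer is full.
func (r *ringStore) record(p int32) int {
	if len(r.pos) == 0 {
		return -1 // zero-capacity store records nothing
	}
	idx := r.next
	r.pos[idx] = p
	r.next = (r.next + 1) % len(r.pos)
	if r.size < len(r.pos) {
		r.size++
	}
	return idx
}

func main() {
	s := newRingStore(3)
	for _, p := range []int32{10, 20, 30, 40} {
		s.record(p)
	}
	// The fourth record wraps and overwrites index 0, as in the test.
	fmt.Println(s.pos, s.size) // → [40 20 30] 3
}
```

Reusing the overwritten entry's maps (rather than clearing them) is what lets the real store keep its preallocated checkpoint tensors alive across wrap-arounds.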
func TestSlotCheckpointStoreFullCapacity(t *testing.T) {
// Test behavior when buffer is exactly at capacity
store := newSlotCheckpointStore(2)
idx1 := store.record(10)
@@ -243,12 +237,10 @@ func TestSlotCheckpointStoreFullCapacity(t *testing.T) {
if idx1 != 0 || idx2 != 1 {
t.Fatalf("expected indices 0, 1, got %d, %d", idx1, idx2)
}
if store.size != 2 {
t.Fatalf("expected size 2, got %d", store.size)
}
// Verify both checkpoints are accessible
_, pos1, ok1 := store.bestIndex(15)
_, pos2, ok2 := store.bestIndex(25)
@@ -261,7 +253,6 @@ func TestSlotCheckpointStoreFullCapacity(t *testing.T) {
}
func TestSlotCheckpointStoreEmptyBuffer(t *testing.T) {
// Test behavior with zero-size buffer
store := newSlotCheckpointStore(0)
idx := store.record(10)
@@ -276,19 +267,16 @@ func TestSlotCheckpointStoreEmptyBuffer(t *testing.T) {
}
func TestSlotCheckpointStorePruneAfterAll(t *testing.T) {
// Test pruning that removes all checkpoints
store := newSlotCheckpointStore(3)
store.record(10)
store.record(20)
store.record(30)
// Prune everything by setting threshold below all positions
store.pruneAfter(5)
if store.size != 0 {
t.Fatalf("expected size 0 after pruning all, got %d", store.size)
}
// When all checkpoints are pruned, lastPos is reset to -1
if store.lastPos != -1 {
t.Fatalf("expected lastPos -1 after pruning all, got %d", store.lastPos)
}


@@ -0,0 +1,37 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Sun, 22 Feb 2026 14:12:30 -0800
Subject: [PATCH] ggml-metal: guard mul_mat_id map0 and add ne20=22
specialization
---
ggml/src/ggml-metal/ggml-metal-ops.cpp | 3 ++-
ggml/src/ggml-metal/ggml-metal.metal | 1 +
2 files changed, 3 insertions(+), 1 deletion(-)
diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp
index 4ac135603..ac5ad53db 100644
--- a/ggml/src/ggml-metal/ggml-metal-ops.cpp
+++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp
@@ -1961,7 +1961,8 @@ int ggml_metal_op_mul_mat_id(ggml_metal_op_t ctx, int idx) {
// ne21 = n_rows (batch size)
const int ne21_mm_id_min = 32;
- if (props_dev->has_simdgroup_mm && ne00 >= 64 && (ne21 >= ne21_mm_id_min)) {
+ if (props_dev->has_simdgroup_mm && ne00 >= 64 && (ne21 >= ne21_mm_id_min) &&
+ (ne20 == 1 || ne20 == 2 || ne20 == 4 || ne20 == 6 || ne20 == 8 || ne20 == 10 || ne20 == 16 || ne20 == 22)) {
// some Metal matrix data types require aligned pointers
// ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (Table 2.5)
//switch (op->src[0]->type) {
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index c37447a10..4f338aa13 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -9427,6 +9427,7 @@ template [[host_name("kernel_mul_mm_id_map0_ne20_6" )]] kernel kernel_mul_mm_id_
template [[host_name("kernel_mul_mm_id_map0_ne20_8" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<8>;
template [[host_name("kernel_mul_mm_id_map0_ne20_10")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<10>;
template [[host_name("kernel_mul_mm_id_map0_ne20_16")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<16>;
+template [[host_name("kernel_mul_mm_id_map0_ne20_22")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<22>;
template<typename S0, typename S0_4x4, typename S0_8x8, typename S1, typename S1_2x4, typename S1_8x8, typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread S0_4x4 &), typename T0, typename T0_4x4, typename T1, typename T1_2x4>
kernel void kernel_mul_mm_id(


@@ -11,6 +11,7 @@ import (
"time"
"github.com/gin-gonic/gin"
"github.com/klauspost/compress/zstd"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/openai"
@@ -496,6 +497,17 @@ func (w *ResponsesWriter) Write(data []byte) (int, error) {
func ResponsesMiddleware() gin.HandlerFunc {
return func(c *gin.Context) {
if c.GetHeader("Content-Encoding") == "zstd" {
reader, err := zstd.NewReader(c.Request.Body, zstd.WithDecoderMaxMemory(8<<20))
if err != nil {
c.AbortWithStatusJSON(http.StatusBadRequest, openai.NewError(http.StatusBadRequest, "failed to decompress zstd body"))
return
}
defer reader.Close()
c.Request.Body = io.NopCloser(reader)
c.Request.Header.Del("Content-Encoding")
}
var req openai.ResponsesRequest
if err := c.ShouldBindJSON(&req); err != nil {
c.AbortWithStatusJSON(http.StatusBadRequest, openai.NewError(http.StatusBadRequest, err.Error()))


@@ -14,6 +14,7 @@ import (
"github.com/gin-gonic/gin"
"github.com/google/go-cmp/cmp"
"github.com/klauspost/compress/zstd"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/openai"
@@ -1238,3 +1239,102 @@ func TestImageEditsMiddleware(t *testing.T) {
})
}
}
func zstdCompress(t *testing.T, data []byte) []byte {
t.Helper()
var buf bytes.Buffer
w, err := zstd.NewWriter(&buf)
if err != nil {
t.Fatal(err)
}
if _, err := w.Write(data); err != nil {
t.Fatal(err)
}
if err := w.Close(); err != nil {
t.Fatal(err)
}
return buf.Bytes()
}
func TestResponsesMiddlewareZstd(t *testing.T) {
tests := []struct {
name string
body string
useZstd bool
oversized bool
wantCode int
wantModel string
wantMessage string
}{
{
name: "plain JSON",
body: `{"model": "test-model", "input": "Hello"}`,
wantCode: http.StatusOK,
wantModel: "test-model",
wantMessage: "Hello",
},
{
name: "zstd compressed",
body: `{"model": "test-model", "input": "Hello"}`,
useZstd: true,
wantCode: http.StatusOK,
wantModel: "test-model",
wantMessage: "Hello",
},
{
name: "zstd over max decompressed size",
oversized: true,
useZstd: true,
wantCode: http.StatusBadRequest,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
var capturedRequest *api.ChatRequest
gin.SetMode(gin.TestMode)
router := gin.New()
router.Use(ResponsesMiddleware(), captureRequestMiddleware(&capturedRequest))
router.Handle(http.MethodPost, "/v1/responses", func(c *gin.Context) {
c.Status(http.StatusOK)
})
var bodyReader io.Reader
if tt.oversized {
bodyReader = bytes.NewReader(zstdCompress(t, bytes.Repeat([]byte("A"), 9<<20)))
} else if tt.useZstd {
bodyReader = bytes.NewReader(zstdCompress(t, []byte(tt.body)))
} else {
bodyReader = strings.NewReader(tt.body)
}
req, _ := http.NewRequest(http.MethodPost, "/v1/responses", bodyReader)
req.Header.Set("Content-Type", "application/json")
if tt.useZstd || tt.oversized {
req.Header.Set("Content-Encoding", "zstd")
}
resp := httptest.NewRecorder()
router.ServeHTTP(resp, req)
if resp.Code != tt.wantCode {
t.Fatalf("expected status %d, got %d: %s", tt.wantCode, resp.Code, resp.Body.String())
}
if tt.wantCode != http.StatusOK {
return
}
if capturedRequest == nil {
t.Fatal("expected captured request, got nil")
}
if capturedRequest.Model != tt.wantModel {
t.Fatalf("expected model %q, got %q", tt.wantModel, capturedRequest.Model)
}
if len(capturedRequest.Messages) != 1 || capturedRequest.Messages[0].Content != tt.wantMessage {
t.Fatalf("expected single user message %q, got %+v", tt.wantMessage, capturedRequest.Messages)
}
})
}
}


@@ -163,6 +163,7 @@ type Tensor interface {
Conv2D(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
Conv3D(ctx Context, weight Tensor, c, s0, s1, s2, p0, p1, p2, d0, d1, d2 int) Tensor
SSMConv(ctx Context, kernel Tensor) Tensor
SSMScan(ctx Context, x, dt, A, B, C, ids Tensor) Tensor
IM2Col(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor


@@ -1662,6 +1662,13 @@ func (t *Tensor) SSMConv(ctx ml.Context, kernel ml.Tensor) ml.Tensor {
}
}
func (t *Tensor) SSMScan(ctx ml.Context, x, dt, A, B, C, ids ml.Tensor) ml.Tensor {
return &Tensor{
b: t.b,
t: C.ggml_ssm_scan(ctx.(*Context).ctx, t.t, x.(*Tensor).t, dt.(*Tensor).t, A.(*Tensor).t, B.(*Tensor).t, C.(*Tensor).t, ids.(*Tensor).t),
}
}
func (t *Tensor) AvgPool2D(ctx ml.Context, k, s int, p float32) ml.Tensor {
return &Tensor{
b: t.b,


@@ -12249,6 +12249,7 @@ template [[host_name("kernel_mul_mm_id_map0_ne20_6" )]] kernel kernel_mul_mm_id_
template [[host_name("kernel_mul_mm_id_map0_ne20_8" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<8>;
template [[host_name("kernel_mul_mm_id_map0_ne20_10")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<10>;
template [[host_name("kernel_mul_mm_id_map0_ne20_16")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<16>;
template [[host_name("kernel_mul_mm_id_map0_ne20_22")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<22>;
template<typename S0, typename S0_4x4, typename S0_8x8, typename S1, typename S1_2x4, typename S1_8x8, typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread S0_4x4 &), typename T0, typename T0_4x4, typename T1, typename T1_2x4>
kernel void kernel_mul_mm_id(


@@ -1961,7 +1961,8 @@ int ggml_metal_op_mul_mat_id(ggml_metal_op_t ctx, int idx) {
// ne21 = n_rows (batch size)
const int ne21_mm_id_min = 32;
if (props_dev->has_simdgroup_mm && ne00 >= 64 && (ne21 >= ne21_mm_id_min)) {
if (props_dev->has_simdgroup_mm && ne00 >= 64 && (ne21 >= ne21_mm_id_min) &&
(ne20 == 1 || ne20 == 2 || ne20 == 4 || ne20 == 6 || ne20 == 8 || ne20 == 10 || ne20 == 16 || ne20 == 22)) {
// some Metal matrix data types require aligned pointers
// ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (Table 2.5)
//switch (op->src[0]->type) {


@@ -9427,6 +9427,7 @@ template [[host_name("kernel_mul_mm_id_map0_ne20_6" )]] kernel kernel_mul_mm_id_
template [[host_name("kernel_mul_mm_id_map0_ne20_8" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<8>;
template [[host_name("kernel_mul_mm_id_map0_ne20_10")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<10>;
template [[host_name("kernel_mul_mm_id_map0_ne20_16")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<16>;
template [[host_name("kernel_mul_mm_id_map0_ne20_22")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<22>;
template<typename S0, typename S0_4x4, typename S0_8x8, typename S1, typename S1_2x4, typename S1_8x8, typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread S0_4x4 &), typename T0, typename T0_4x4, typename T1, typename T1_2x4>
kernel void kernel_mul_mm_id(


@@ -67,6 +67,7 @@ func (f *fakeTensor) Tri(ctx ml.Context, _ int) ml.Tensor
func (f *fakeTensor) Fill(ctx ml.Context, _ float32) ml.Tensor { return f }
func (f *fakeTensor) Repeat4D(ctx ml.Context, _, _, _, _ int) ml.Tensor { return f }
func (f *fakeTensor) SolveTri(ctx ml.Context, _ ml.Tensor, _, _, _ bool) ml.Tensor { return f }
func (f *fakeTensor) SSMScan(ctx ml.Context, _, _, _, _, _, _ ml.Tensor) ml.Tensor { return f }
func (m *fakeBackend) Get(name string) ml.Tensor {
if slices.Contains(m.names, name) {


@@ -1,410 +1,44 @@
package lfm2
import (
"slices"
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/model/input"
)
var _ kvcache.Cache = (*HybridCache)(nil)
var (
_ kvcache.Cache = (*HybridCache)(nil)
_ kvcache.CheckpointCache = (*HybridCache)(nil)
)
// HybridCache stores:
// - a standard causal KV cache for attention layers
// - a per-sequence recurrent conv state for shortconv layers
// HybridCache adapts the shared recurrent cache for LFM2:
// - KV attention cache is handled by the embedded causal cache
// - shortconv recurrent state uses conv slots [dConv, hiddenSize]
//
// Conv state shape (per layer, per sequence): [dConv, hiddenSize] where dConv = L_cache - 1.
// Stored internally as a tensor of shape [dConv * hiddenSize, maxSlots].
// This reuses shared checkpoint/restore logic for prefix mismatch recovery.
type HybridCache struct {
kv *kvcache.Causal
backend ml.Backend
dtype ml.DType
maxSequences int
hiddenSize int
dConv int
// slot mapping for recurrent state
slotForSeq map[int]int
refCount []int
freeSlots []int
// per-layer conv state buffers (allocated lazily)
convCtxs map[int]ml.Context
convStates map[int]ml.Tensor // [dConv*hiddenSize, maxSlots]
// current forward batch (derived in StartForward)
curSeqs []int
curSlots []int
curSlotsInput ml.Tensor
curSeqTokens int
// track if EnsureWritable has been called for this forward pass
writableEnsured bool
// track any error from EnsureWritable to propagate later
writableError error
*kvcache.Recurrent
}
func NewHybridCache(shift func(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error), hiddenSize, dConv int) *HybridCache {
return &HybridCache{
kv: kvcache.NewCausalCache(shift),
hiddenSize: hiddenSize,
dConv: dConv,
slotForSeq: make(map[int]int),
convCtxs: make(map[int]ml.Context),
convStates: make(map[int]ml.Tensor),
}
}
base := kvcache.NewRecurrentCache(kvcache.RecurrentConfig{
Shift: shift,
ConvDim: dConv,
ConvChannels: hiddenSize,
RecurrentStateSize: 1, // LFM2 uses only conv state; keep a minimal recurrent buffer size.
CheckpointLogPrefix: "lfm2",
})
func (c *HybridCache) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity, maxBatch int) {
c.backend = backend
c.dtype = dtype
c.maxSequences = maxSequences
// initialize slot allocator
c.refCount = make([]int, maxSequences)
c.freeSlots = c.freeSlots[:0]
for i := maxSequences - 1; i >= 0; i-- {
c.freeSlots = append(c.freeSlots, i)
}
c.kv.Init(backend, dtype, maxSequences, capacity, maxBatch)
}
func (c *HybridCache) Close() {
for _, ctx := range c.convCtxs {
ctx.Close()
}
c.kv.Close()
}
func (c *HybridCache) SetConfig(config ml.CacheConfig) {
c.kv.SetConfig(config)
}
func (c *HybridCache) SetLayer(layer int) {
c.kv.SetLayer(layer)
}
func (c *HybridCache) Get(ctx ml.Context) (ml.Tensor, ml.Tensor, ml.Tensor) {
return c.kv.Get(ctx)
}
func (c *HybridCache) Put(ctx ml.Context, key, value ml.Tensor) {
c.kv.Put(ctx, key, value)
}
func (c *HybridCache) StartForward(ctx ml.Context, batch input.Batch, reserve bool) error {
if err := c.kv.StartForward(ctx, batch, reserve); err != nil {
return err
}
// Derive equal-length sequence layout for shortconv.
// LFM2 shortconv assumes tokens form a [seq_tokens, seqs] grid.
seqCounts := make(map[int]int)
c.curSeqs = c.curSeqs[:0]
for _, s := range batch.Sequences {
if _, ok := seqCounts[s]; !ok {
c.curSeqs = append(c.curSeqs, s)
}
seqCounts[s]++
}
if len(c.curSeqs) == 0 {
return nil
}
nTokens := len(batch.Sequences)
nSeqs := len(c.curSeqs)
want := nTokens / nSeqs
for _, s := range c.curSeqs {
if seqCounts[s] != want {
return kvcache.ErrNotSupported
}
}
c.curSeqTokens = want
// When reserving memory for estimation, use fake slot assignments
// without modifying permanent state (slotForSeq, refCount)
if reserve {
c.curSlots = c.curSlots[:0]
slots := make([]int32, nSeqs)
for i := range nSeqs {
c.curSlots = append(c.curSlots, i)
slots[i] = int32(i)
}
c.curSlotsInput = ctx.Input().FromInts(slots, len(slots))
return nil
}
// Ensure slots exist for sequences in this batch
c.curSlots = c.curSlots[:0]
var newSlots []int // track newly allocated slots that need zeroing
for _, s := range c.curSeqs {
slot, ok := c.slotForSeq[s]
if !ok {
var err error
slot, err = c.allocSlot()
if err != nil {
return err
}
c.slotForSeq[s] = slot
c.refCount[slot] = 1
newSlots = append(newSlots, slot)
}
c.curSlots = append(c.curSlots, slot)
}
// Zero conv state for newly allocated slots to clear stale data from previous sequences
if len(newSlots) > 0 {
c.zeroConvSlots(ctx, newSlots)
}
// Create a tensor for the current slots
slots := make([]int32, len(c.curSlots))
for i, v := range c.curSlots {
slots[i] = int32(v)
}
c.curSlotsInput = ctx.Input().FromInts(slots, len(slots))
// Reset writable state for new forward pass
c.writableEnsured = false
c.writableError = nil
return nil
}
func (c *HybridCache) allocSlot() (int, error) {
if len(c.freeSlots) == 0 {
return 0, kvcache.ErrKvCacheFull
}
slot := c.freeSlots[len(c.freeSlots)-1]
c.freeSlots = c.freeSlots[:len(c.freeSlots)-1]
return slot, nil
}
func (c *HybridCache) freeSlot(slot int) {
// Bounds check before freeing
if slot >= 0 && slot < c.maxSequences {
c.freeSlots = append(c.freeSlots, slot)
}
}
// zeroConvSlots zeros the conv state for the given slots across all layers.
// This must be called when recycling slots to prevent stale state from affecting new sequences.
func (c *HybridCache) zeroConvSlots(ctx ml.Context, slots []int) {
if len(slots) == 0 || len(c.convStates) == 0 {
return
}
// Use input context for creating tensors
inputCtx := ctx.Input()
// Create slot indices tensor
slotIndices := make([]int32, len(slots))
for i, s := range slots {
slotIndices[i] = int32(s)
}
slotsTensor := inputCtx.FromInts(slotIndices, len(slotIndices))
// Create zero tensor for the slots (SetRows requires F32 source)
zeros := inputCtx.Zeros(ml.DTypeF32, c.dConv*c.hiddenSize, len(slots))
// Zero each layer's conv state for these slots
for _, buf := range c.convStates {
ctx.Forward(buf.SetRows(ctx, zeros, slotsTensor))
}
}
// EnsureWritable ensures that sequences in the current batch have private (non-shared) conv slots.
// Returns an error if slot allocation fails.
func (c *HybridCache) EnsureWritable(ctx ml.Context) error {
for i, seq := range c.curSeqs {
slot, ok := c.slotForSeq[seq]
if !ok {
continue
}
// Bounds check
if slot < 0 || slot >= len(c.refCount) {
continue
}
if c.refCount[slot] <= 1 {
continue
}
newSlot, err := c.allocSlot()
if err != nil {
return err
}
c.refCount[slot]--
c.refCount[newSlot] = 1
c.slotForSeq[seq] = newSlot
c.curSlots[i] = newSlot
// Copy existing conv state for all initialized layers
for _, buf := range c.convStates {
// buf: [dConv*hiddenSize, maxSlots]
src := buf.Rows(ctx, ctx.Input().FromInts([]int32{int32(slot)}, 1))
// SetRows requires F32 source
srcF32 := src.Cast(ctx, ml.DTypeF32)
ctx.Forward(buf.SetRows(ctx, srcF32, ctx.Input().FromInts([]int32{int32(newSlot)}, 1)))
}
}
// Rebuild current slots tensor
slots := make([]int32, len(c.curSlots))
for i, v := range c.curSlots {
slots[i] = int32(v)
}
c.curSlotsInput = ctx.Input().FromInts(slots, len(slots))
return nil
}
func (c *HybridCache) CopyPrefix(srcSeq, dstSeq int, prefixLen int32) {
// KV cache shares prefix metadata (no copy) which is correct for prefix reuse.
c.kv.CopyPrefix(srcSeq, dstSeq, prefixLen)
// For shortconv state we implement copy-on-write: dst shares the same slot as src.
// On the first write to dst, EnsureWritable will create a private slot.
if dstSlot, ok := c.slotForSeq[dstSeq]; ok {
// Bounds check before decrementing
if dstSlot >= 0 && dstSlot < len(c.refCount) {
c.refCount[dstSlot]--
if c.refCount[dstSlot] <= 0 {
c.refCount[dstSlot] = 0
c.freeSlot(dstSlot)
}
}
delete(c.slotForSeq, dstSeq)
}
srcSlot, ok := c.slotForSeq[srcSeq]
if !ok {
// src may not have a slot yet; dst will allocate on demand
return
}
// Bounds check before incrementing
if srcSlot >= 0 && srcSlot < len(c.refCount) {
c.slotForSeq[dstSeq] = srcSlot
c.refCount[srcSlot]++
}
}
func (c *HybridCache) CanResume(seq int, pos int32) bool {
return c.kv.CanResume(seq, pos)
}
func (c *HybridCache) Remove(seq int, beginIndex, endIndex int32) error {
if err := c.kv.Remove(seq, beginIndex, endIndex); err != nil {
return err
}
// For recurrent state, any removal invalidates the state because
// the state at position N depends on all previous positions.
// Drop the slot mapping so it resets on next use.
slot, ok := c.slotForSeq[seq]
if !ok {
return nil
}
// Bounds check
if slot < 0 || slot >= len(c.refCount) {
delete(c.slotForSeq, seq)
return nil
}
c.refCount[slot]--
if c.refCount[slot] <= 0 {
c.refCount[slot] = 0
c.freeSlot(slot)
}
delete(c.slotForSeq, seq)
return nil
return &HybridCache{Recurrent: base}
}
func (c *HybridCache) slotsTensor() ml.Tensor {
return c.curSlotsInput
return c.SlotsTensor()
}
func (c *HybridCache) seqTokens() int {
return c.curSeqTokens
return c.SeqTokens()
}
func (c *HybridCache) numSeqs() int {
return len(c.curSeqs)
}
func (c *HybridCache) convBuffer(ctx ml.Context, layer int) ml.Tensor {
if buf, ok := c.convStates[layer]; ok {
return buf
}
if _, ok := c.convCtxs[layer]; !ok {
c.convCtxs[layer] = c.backend.NewContextSize(1).Layer(layer)
}
buf := c.convCtxs[layer].Zeros(c.dtype, c.dConv*c.hiddenSize, c.maxSequences)
c.convStates[layer] = buf
return buf
}
// ConvState returns the conv state for current batch sequences as shape [dConv, hiddenSize, nSeqs].
// Returns an error if copy-on-write allocation fails.
func (c *HybridCache) ConvState(ctx ml.Context, layer int) (ml.Tensor, error) {
if !c.writableEnsured {
needsWritable := false
for _, seq := range c.curSeqs {
slot, ok := c.slotForSeq[seq]
if !ok {
continue
}
if slot >= 0 && slot < len(c.refCount) && c.refCount[slot] > 1 {
needsWritable = true
break
}
}
if needsWritable {
if err := c.EnsureWritable(ctx); err != nil {
c.writableError = err
}
}
c.writableEnsured = true
}
if c.writableError != nil {
return nil, c.writableError
}
buf := c.convBuffer(ctx, layer)
cur := buf.Rows(ctx, c.slotsTensor())
return cur.Reshape(ctx, c.dConv, c.hiddenSize, c.numSeqs()), nil
}
// UpdateConvState writes a new conv state for current batch sequences.
// newState must have shape [dConv, hiddenSize, nSeqs].
func (c *HybridCache) UpdateConvState(ctx ml.Context, layer int, newState ml.Tensor) {
buf := c.convBuffer(ctx, layer)
src := newState.Reshape(ctx, c.dConv*c.hiddenSize, c.numSeqs())
// SetRows requires F32 source
srcF32 := src.Cast(ctx, ml.DTypeF32)
ctx.Forward(buf.SetRows(ctx, srcF32, c.slotsTensor()))
}
// IsSupportedForBatch returns true if the current batch layout supports shortconv.
func (c *HybridCache) IsSupportedForBatch() bool {
return c.curSeqTokens > 0 && len(c.curSeqs) > 0
}
// Seqs returns the ordered unique sequences for the current forward pass.
func (c *HybridCache) Seqs() []int {
return slices.Clone(c.curSeqs)
return c.NumSeqs()
}
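The copy-on-write slot scheme that `CopyPrefix` and `EnsureWritable` implement above can be reduced to a small sketch. Names here are illustrative, not the cache's actual API: sequences map to slots, a shared slot carries a refcount, and the first write to a shared slot detaches the writer onto a private slot (the real code also copies the conv state tensor at that point).

```go
package main

import "fmt"

// cowSlots is a stripped-down sketch of the copy-on-write slot
// scheme used for recurrent state.
type cowSlots struct {
	slotForSeq map[int]int
	refCount   []int
	freeSlots  []int
}

func newCowSlots(n int) *cowSlots {
	c := &cowSlots{slotForSeq: map[int]int{}, refCount: make([]int, n)}
	for i := n - 1; i >= 0; i-- {
		c.freeSlots = append(c.freeSlots, i) // LIFO free list, slot 0 on top
	}
	return c
}

func (c *cowSlots) alloc(seq int) int {
	slot := c.freeSlots[len(c.freeSlots)-1]
	c.freeSlots = c.freeSlots[:len(c.freeSlots)-1]
	c.slotForSeq[seq] = slot
	c.refCount[slot] = 1
	return slot
}

// share points dst at src's slot without copying (as CopyPrefix does).
func (c *cowSlots) share(src, dst int) {
	slot := c.slotForSeq[src]
	c.slotForSeq[dst] = slot
	c.refCount[slot]++
}

// ensureWritable detaches seq onto a private slot before a write
// (as EnsureWritable does), leaving the old slot to the other sharer.
func (c *cowSlots) ensureWritable(seq int) int {
	slot := c.slotForSeq[seq]
	if c.refCount[slot] <= 1 {
		return slot
	}
	c.refCount[slot]--
	return c.alloc(seq) // the real cache also copies the conv state here
}

func main() {
	c := newCowSlots(4)
	c.alloc(1)    // seq 1 → slot 0
	c.share(1, 2) // seq 2 shares slot 0, refcount 2
	s := c.ensureWritable(2)
	fmt.Println(c.slotForSeq[1], s, c.refCount[0]) // → 0 1 1
}
```

This is why the tests above can exercise allocation, sharing, and detachment purely through `slotForSeq`, `refCount`, and `freeSlots` without touching a backend.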


@@ -4,441 +4,39 @@ import (
"testing"
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
)
// TestHybridCache tests verify the slot management logic of HybridCache.
// These tests focus on the recurrent state slot allocation, reference counting,
// and copy-on-write semantics without requiring a full ML backend.
func TestHybridCache_New(t *testing.T) {
cache := NewHybridCache(nil, 512, 2)
if cache == nil {
t.Fatal("expected cache to be created")
}
// createSlotOnlyCache creates a HybridCache with only the slot management
// fields initialized. Used to test slot logic in isolation.
func createSlotOnlyCache(maxSequences int) *HybridCache {
return &HybridCache{
hiddenSize: 256,
dConv: 3,
maxSequences: maxSequences,
refCount: make([]int, maxSequences),
freeSlots: initFreeSlots(maxSequences),
slotForSeq: make(map[int]int),
convCtxs: make(map[int]ml.Context),
convStates: make(map[int]ml.Tensor),
if cache.Recurrent == nil {
t.Fatal("expected embedded recurrent cache to be created")
}
}
func initFreeSlots(n int) []int {
slots := make([]int, 0, n)
for i := n - 1; i >= 0; i-- {
slots = append(slots, i)
}
return slots
}
func TestHybridCache_ImplementsCheckpointCache(t *testing.T) {
cache := NewHybridCache(nil, 512, 2)
func TestHybridCache_SlotAllocation(t *testing.T) {
cache := createSlotOnlyCache(4)
// Verify initial state
if len(cache.freeSlots) != 4 {
t.Errorf("expected 4 free slots, got %d", len(cache.freeSlots))
}
// Allocate all slots
for range 4 {
slot, err := cache.allocSlot()
if err != nil {
t.Fatalf("allocSlot failed: %v", err)
}
cache.refCount[slot] = 1
}
// Should be full now
if len(cache.freeSlots) != 0 {
t.Errorf("expected 0 free slots, got %d", len(cache.freeSlots))
}
// Trying to allocate another should fail
_, err := cache.allocSlot()
if err != kvcache.ErrKvCacheFull {
t.Errorf("expected ErrKvCacheFull, got %v", err)
if _, ok := any(cache).(kvcache.CheckpointCache); !ok {
t.Fatal("expected HybridCache to implement CheckpointCache")
}
}
func TestHybridCache_SlotReuse(t *testing.T) {
cache := createSlotOnlyCache(4)
func TestHybridCache_DefaultBatchState(t *testing.T) {
cache := NewHybridCache(nil, 512, 2)
// Allocate a slot
slot1, _ := cache.allocSlot()
cache.refCount[slot1] = 1
// Free it
cache.refCount[slot1] = 0
cache.freeSlot(slot1)
// Allocate again - should get the same slot back (LIFO)
slot2, _ := cache.allocSlot()
if slot2 != slot1 {
t.Errorf("expected slot %d to be reused, got %d", slot1, slot2)
}
}
func TestHybridCache_SlotRefCounting_ShareSlot(t *testing.T) {
cache := createSlotOnlyCache(4)
// Allocate slot for seq 1
slot1, _ := cache.allocSlot()
cache.slotForSeq[1] = slot1
cache.refCount[slot1] = 1
// Simulate sharing slot with seq 2 (copy-on-write style)
cache.slotForSeq[2] = slot1
cache.refCount[slot1]++
// Should share the same slot
if cache.slotForSeq[2] != slot1 {
t.Errorf("expected seq 2 to share slot %d, got %d", slot1, cache.slotForSeq[2])
}
// Ref count should be 2
if cache.refCount[slot1] != 2 {
t.Errorf("expected refCount 2, got %d", cache.refCount[slot1])
}
}
func TestHybridCache_SlotRefCounting_DecRef(t *testing.T) {
cache := createSlotOnlyCache(4)
// Allocate slot for seq 1
slot1, _ := cache.allocSlot()
cache.slotForSeq[1] = slot1
cache.refCount[slot1] = 1
// Share with seq 2
cache.slotForSeq[2] = slot1
cache.refCount[slot1]++
// Unshare seq 2
cache.refCount[slot1]--
delete(cache.slotForSeq, 2)
// Ref count should be back to 1
if cache.refCount[slot1] != 1 {
t.Errorf("expected refCount 1 after unshare, got %d", cache.refCount[slot1])
}
// Seq 2 should no longer have a slot
if _, ok := cache.slotForSeq[2]; ok {
t.Error("seq 2 should not have a slot after unshare")
}
}
func TestHybridCache_SlotFreeWhenUnused(t *testing.T) {
cache := createSlotOnlyCache(4)
initialFreeSlots := len(cache.freeSlots)
// Allocate slot for seq 1
slot1, _ := cache.allocSlot()
cache.slotForSeq[1] = slot1
cache.refCount[slot1] = 1
// Free the slot when refCount drops to 0
cache.refCount[slot1]--
if cache.refCount[slot1] <= 0 {
cache.refCount[slot1] = 0
cache.freeSlot(slot1)
}
delete(cache.slotForSeq, 1)
// Slot should be freed
if len(cache.freeSlots) != initialFreeSlots {
t.Errorf("expected %d free slots, got %d", initialFreeSlots, len(cache.freeSlots))
}
// Ref count should be 0
if cache.refCount[slot1] != 0 {
t.Errorf("expected refCount 0, got %d", cache.refCount[slot1])
}
}
func TestHybridCache_SlotOverwrite(t *testing.T) {
cache := createSlotOnlyCache(4)
// Allocate slots for seq 1 and seq 2
slot1, _ := cache.allocSlot()
cache.slotForSeq[1] = slot1
cache.refCount[slot1] = 1
slot2, _ := cache.allocSlot()
cache.slotForSeq[2] = slot2
cache.refCount[slot2] = 1
initialFreeSlots := len(cache.freeSlots)
// Simulate overwriting seq 2's slot with slot1 (sharing)
// First free the old slot
cache.refCount[slot2]--
if cache.refCount[slot2] <= 0 {
cache.refCount[slot2] = 0
cache.freeSlot(slot2)
}
// Then share slot1
cache.slotForSeq[2] = slot1
cache.refCount[slot1]++
// Seq 2 should now share slot1
if cache.slotForSeq[2] != slot1 {
t.Errorf("expected seq 2 to share slot %d, got %d", slot1, cache.slotForSeq[2])
}
// Old slot2 should be freed
if len(cache.freeSlots) != initialFreeSlots+1 {
t.Errorf("expected %d free slots, got %d", initialFreeSlots+1, len(cache.freeSlots))
}
}
func TestHybridCache_BoundsChecking(t *testing.T) {
cache := createSlotOnlyCache(4)
// Test freeing invalid slot (should not panic)
cache.freeSlot(-1)
cache.freeSlot(100) // out of bounds
// freeSlot does bounds checking, so invalid slots should be ignored
if len(cache.freeSlots) != 4 {
t.Errorf("invalid slots should not affect free list, got %d slots", len(cache.freeSlots))
}
}
func TestHybridCache_MultipleSequences_RefCounting(t *testing.T) {
cache := createSlotOnlyCache(8)
// Allocate slot for seq 1
slot1, _ := cache.allocSlot()
cache.slotForSeq[1] = slot1
cache.refCount[slot1] = 1
// Fork to seq 2, 3, 4 (all share slot1)
for _, seq := range []int{2, 3, 4} {
cache.slotForSeq[seq] = slot1
cache.refCount[slot1]++
}
// Ref count should be 4
if cache.refCount[slot1] != 4 {
t.Errorf("expected refCount 4, got %d", cache.refCount[slot1])
}
// Remove seq 2, 3
for _, seq := range []int{2, 3} {
delete(cache.slotForSeq, seq)
cache.refCount[slot1]--
}
if cache.refCount[slot1] != 2 {
t.Errorf("expected refCount 2, got %d", cache.refCount[slot1])
}
// Slot should still be allocated (not in free list)
found := false
for _, s := range cache.freeSlots {
if s == slot1 {
found = true
break
}
}
if found {
t.Error("slot1 should not be in free list yet")
}
// Remove remaining sequences
for _, seq := range []int{1, 4} {
delete(cache.slotForSeq, seq)
cache.refCount[slot1]--
}
if cache.refCount[slot1] != 0 {
t.Errorf("expected refCount 0, got %d", cache.refCount[slot1])
}
}
func TestHybridCache_ChainedSharing(t *testing.T) {
cache := createSlotOnlyCache(8)
// Create seq 1
slot1, _ := cache.allocSlot()
cache.slotForSeq[1] = slot1
cache.refCount[slot1] = 1
// Share 1 -> 2
cache.slotForSeq[2] = slot1
cache.refCount[slot1]++
// Share 2 -> 3 (should still share slot1)
cache.slotForSeq[3] = cache.slotForSeq[2] // which is slot1
cache.refCount[slot1]++
// All should share slot1
if cache.slotForSeq[1] != slot1 || cache.slotForSeq[2] != slot1 || cache.slotForSeq[3] != slot1 {
t.Error("all sequences should share slot1")
}
if cache.refCount[slot1] != 3 {
t.Errorf("expected refCount 3, got %d", cache.refCount[slot1])
}
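The sharing tests above manipulate `slotForSeq` and `refCount` by hand. A standalone sketch of the same reference-counted, copy-on-write-style slot sharing (the `shareMap` type here is illustrative, not the real cache):

```go
package main

import "fmt"

// shareMap tracks which slot each sequence uses plus per-slot refcounts,
// mirroring the sharing behavior the tests above exercise directly.
type shareMap struct {
	slotForSeq map[int]int
	refCount   map[int]int
}

func newShareMap() *shareMap {
	return &shareMap{slotForSeq: map[int]int{}, refCount: map[int]int{}}
}

// share points dst at src's slot and bumps the refcount.
func (s *shareMap) share(src, dst int) {
	slot := s.slotForSeq[src]
	s.slotForSeq[dst] = slot
	s.refCount[slot]++
}

// drop detaches seq; it reports true when the slot became unused.
func (s *shareMap) drop(seq int) bool {
	slot, ok := s.slotForSeq[seq]
	if !ok {
		return false
	}
	delete(s.slotForSeq, seq)
	s.refCount[slot]--
	return s.refCount[slot] == 0
}

func main() {
	s := newShareMap()
	s.slotForSeq[1] = 7
	s.refCount[7] = 1
	s.share(1, 2)
	s.share(2, 3) // chained: all three sequences share slot 7
	fmt.Println(s.refCount[7])        // 3
	fmt.Println(s.drop(2), s.drop(3)) // false false
	fmt.Println(s.drop(1))            // true: last reference frees the slot
}
```

The `drop` return value is where the real cache would call `freeSlot` to recycle storage.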
}
func TestHybridCache_CacheParameters(t *testing.T) {
cache := NewHybridCache(nil, 512, 5) // hiddenSize=512, dConv=5
if cache.hiddenSize != 512 {
t.Errorf("expected hiddenSize 512, got %d", cache.hiddenSize)
}
if cache.dConv != 5 {
t.Errorf("expected dConv 5, got %d", cache.dConv)
}
}
func TestHybridCache_NumSeqs(t *testing.T) {
cache := createSlotOnlyCache(4)
// Initially no sequences
if cache.numSeqs() != 0 {
t.Errorf("expected 0 seqs, got %d", cache.numSeqs())
}
// Manually set up current batch state
cache.curSeqs = []int{1, 2, 3}
if cache.numSeqs() != 3 {
t.Errorf("expected 3 seqs, got %d", cache.numSeqs())
}
}
func TestHybridCache_SeqTokens(t *testing.T) {
cache := createSlotOnlyCache(4)
// Initially 0
if cache.seqTokens() != 0 {
t.Errorf("expected 0 seqTokens, got %d", cache.seqTokens())
}
// Manually set up current batch state
cache.curSeqTokens = 16
if cache.seqTokens() != 16 {
t.Errorf("expected 16 seqTokens, got %d", cache.seqTokens())
}
}
// Test that Seqs returns a clone of curSeqs
func TestHybridCache_Seqs_ReturnsClone(t *testing.T) {
cache := createSlotOnlyCache(4)
cache.curSeqs = []int{1, 2, 3}
seqs := cache.Seqs()
// Modify returned slice
seqs[0] = 999
// Original should be unchanged
if cache.curSeqs[0] != 1 {
t.Error("Seqs should return a clone, not the original slice")
}
}
func TestHybridCache_IsSupportedForBatch(t *testing.T) {
cache := createSlotOnlyCache(4)
// Initially not supported (no batch set up)
if cache.IsSupportedForBatch() {
t.Error("expected IsSupportedForBatch to be false initially")
}
// Set up a valid batch
cache.curSeqTokens = 1
cache.curSeqs = []int{1}
if !cache.IsSupportedForBatch() {
t.Error("expected IsSupportedForBatch to be true with valid batch")
}
}
func TestHybridCache_ZeroConvSlots_EmptyInputs(t *testing.T) {
cache := createSlotOnlyCache(4)
// zeroConvSlots should handle empty slots without panicking
cache.zeroConvSlots(nil, nil)
cache.zeroConvSlots(nil, []int{})
// zeroConvSlots should handle empty convStates without panicking
cache.zeroConvSlots(nil, []int{0, 1, 2})
}
func TestHybridCache_SlotRecycling_TracksNewSlots(t *testing.T) {
cache := createSlotOnlyCache(4)
// Allocate slot for seq 1
slot1, _ := cache.allocSlot()
cache.slotForSeq[1] = slot1
cache.refCount[slot1] = 1
// Free the slot (simulating sequence removal)
cache.refCount[slot1]--
cache.freeSlot(slot1)
delete(cache.slotForSeq, 1)
// Verify slot is in free list
if len(cache.freeSlots) != 4 {
t.Errorf("expected 4 free slots after freeing, got %d", len(cache.freeSlots))
}
// Allocate for new seq 2 - should get recycled slot
slot2, _ := cache.allocSlot()
if slot2 != slot1 {
t.Errorf("expected recycled slot %d, got %d", slot1, slot2)
}
// This recycled slot would need zeroing in the real implementation
// The actual zeroing is tested via integration tests since it requires ML context
}
func TestHybridCache_NewSequence_GetsTrackedForZeroing(t *testing.T) {
cache := createSlotOnlyCache(4)
// Simulate the slot allocation flow from StartForward
// When a sequence doesn't have a slot, it gets allocated and tracked as "new"
newSlots := []int{}
// Seq 1 doesn't have a slot - allocate and track
seq := 1
if _, ok := cache.slotForSeq[seq]; !ok {
slot, err := cache.allocSlot()
if err != nil {
t.Fatalf("allocSlot failed: %v", err)
}
cache.slotForSeq[seq] = slot
cache.refCount[slot] = 1
newSlots = append(newSlots, slot)
}
// Verify newSlots contains the allocated slot
if len(newSlots) != 1 {
t.Errorf("expected 1 new slot, got %d", len(newSlots))
}
// Seq 1 already has a slot - should NOT be tracked as new
newSlots2 := []int{}
if _, ok := cache.slotForSeq[seq]; !ok {
slot, _ := cache.allocSlot()
cache.slotForSeq[seq] = slot
cache.refCount[slot] = 1
newSlots2 = append(newSlots2, slot)
}
// Verify no new slots for existing sequence
if len(newSlots2) != 0 {
t.Errorf("expected 0 new slots for existing sequence, got %d", len(newSlots2))
}
}

View File

@@ -1,7 +1,11 @@
package lfm2
import (
"bytes"
"cmp"
"errors"
"fmt"
"image"
"math"
"github.com/ollama/ollama/fs"
@@ -25,8 +29,20 @@ type Options struct {
// per-layer head counts (LFM2 alternates attention and recurrent layers)
numHeadsByLayer []int
numKVHeadsByLayer []int
// MoE config
numExperts int
numExpertsUsed int
normTopKProb bool
expertWeightsScale float32
expertGatingFunc uint32
}
const (
expertGatingFuncSoftmax = uint32(0)
expertGatingFuncSigmoid = uint32(2)
)
func (o Options) headDimValue() int {
// Head dim is shared across layers; fall back to first attention layer head count.
for _, h := range o.numHeadsByLayer {
@@ -67,18 +83,138 @@ type Model struct {
OutputNorm *nn.RMSNorm `gguf:"output_norm,alt:token_embd_norm"`
Output *nn.Linear `gguf:"output,alt:token_embd"`
VisionModel *VisionModel `gguf:"v"`
VisionProjector *VisionProjector `gguf:"mm"`
ImageProcessor ImageProcessor
imageTokenID int32
imageStartToken int32
imageEndToken int32
imageThumbnailID int32
imageRowColIDs map[imageGridPos]int32
useSpecialTokens bool
projectorOptions VisionProjectorOptions
Options
}
func New(c fs.Config) (model.Model, error) {
if c.Uint("expert_count") > 0 {
return nil, model.ErrUnsupportedModel
}
var _ model.MultimodalProcessor = (*Model)(nil)
type imageGridPos struct {
row int
col int
}
type visionEmbeddingLayout struct {
rows int
cols int
hasThumbnail bool
}
type visionChunkData struct {
tokens int
row int
col int
thumbnail bool
layout *visionEmbeddingLayout
}
func (m *Model) Validate() error {
if m.TokenEmbedding == nil {
return errors.New("lfm2: missing token_embd tensor")
}
if m.OutputNorm == nil {
return errors.New("lfm2: missing output_norm tensor")
}
if m.Output == nil {
return errors.New("lfm2: missing output tensor")
}
for i, layer := range m.Layers {
if layer.AttentionNorm == nil {
return fmt.Errorf("lfm2: missing blk.%d.attn_norm tensor", i)
}
if layer.MLPNorm == nil {
return fmt.Errorf("lfm2: missing blk.%d.ffn_norm tensor", i)
}
switch ff := layer.MLP.(type) {
case nil:
return fmt.Errorf("lfm2: missing blk.%d feed-forward tensors", i)
case *denseMLP:
if ff.Up == nil || ff.Down == nil || ff.Gate == nil {
return fmt.Errorf("lfm2: missing blk.%d dense feed-forward tensors", i)
}
case *sparseMLP:
if ff.Router == nil || ff.Gate == nil || ff.Up == nil || ff.Down == nil {
return fmt.Errorf("lfm2: missing blk.%d sparse feed-forward tensors", i)
}
default:
return fmt.Errorf("lfm2: unsupported feed-forward type at blk.%d", i)
}
switch op := layer.Operator.(type) {
case *Attention:
if op == nil || op.Query == nil || op.Key == nil || op.Value == nil || op.Output == nil || op.QueryNorm == nil || op.KeyNorm == nil {
return fmt.Errorf("lfm2: missing blk.%d attention tensors", i)
}
case *ShortConv:
if op == nil || op.Conv == nil || op.Conv.Weight == nil || op.InProj == nil || op.OutProj == nil {
return fmt.Errorf("lfm2: missing blk.%d shortconv tensors", i)
}
default:
return fmt.Errorf("lfm2: unsupported operator at blk.%d", i)
}
}
if m.VisionModel != nil {
if m.VisionModel.PatchEmbedding == nil {
return errors.New("lfm2: missing vision patch embedding tensors")
}
if m.VisionModel.PositionEmbedding == nil {
return errors.New("lfm2: missing vision position embedding tensors")
}
if m.VisionModel.PostLayerNorm == nil {
return errors.New("lfm2: missing vision post layer norm tensors")
}
if len(m.VisionModel.Layers) == 0 {
return errors.New("lfm2: missing vision encoder layers")
}
for i, layer := range m.VisionModel.Layers {
if layer.LayerNorm1 == nil || layer.LayerNorm2 == nil || layer.SelfAttention == nil || layer.MLP == nil {
return fmt.Errorf("lfm2: missing vision layer tensors at v.blk.%d", i)
}
if layer.SelfAttention.Query == nil || layer.SelfAttention.Key == nil || layer.SelfAttention.Value == nil || layer.SelfAttention.Output == nil {
return fmt.Errorf("lfm2: missing vision attention tensors at v.blk.%d", i)
}
if layer.MLP.Up == nil || layer.MLP.Down == nil {
return fmt.Errorf("lfm2: missing vision feed-forward tensors at v.blk.%d", i)
}
}
if m.VisionProjector == nil || m.VisionProjector.Linear1 == nil || m.VisionProjector.Linear2 == nil {
return errors.New("lfm2: missing multimodal projector tensors")
}
}
return nil
}
func New(c fs.Config) (model.Model, error) {
if c.String("tokenizer.ggml.model") != "gpt2" {
return nil, model.ErrUnsupportedTokenizer
}
numExperts := int(c.Uint("expert_count"))
isMoE := numExperts > 0
numExpertsUsed := int(c.Uint("expert_used_count"))
if isMoE {
if numExperts <= 0 {
return nil, fmt.Errorf("lfm2: invalid expert_count=%d", numExperts)
}
if numExpertsUsed <= 0 || numExpertsUsed > numExperts {
return nil, fmt.Errorf("lfm2: invalid expert_used_count=%d for expert_count=%d", numExpertsUsed, numExperts)
}
}
vocabulary := tokenizer.Vocabulary{
Values: c.Strings("tokenizer.ggml.tokens"),
Scores: c.Floats("tokenizer.ggml.scores"),
@@ -105,8 +241,16 @@ func New(c fs.Config) (model.Model, error) {
}
m := Model{
Tokenizer: tokenizer.NewBytePairEncoding(&vocabulary, pretokenizers...),
Layers: make([]Layer, c.Uint("block_count")),
ImageProcessor: newImageProcessor(c),
VisionModel: newVisionModel(c),
VisionProjector: &VisionProjector{},
imageRowColIDs: make(map[imageGridPos]int32),
projectorOptions: VisionProjectorOptions{
scaleFactor: int(c.Uint("vision.projector.scale_factor", 2)),
useLayerNorm: c.Bool("vision.projector.use_layernorm", false),
},
Options: Options{
hiddenSize: int(c.Uint("embedding_length")),
headDim: int(c.Uint("attention.key_length")),
@@ -116,9 +260,66 @@ func New(c fs.Config) (model.Model, error) {
ropeBase: c.Float("rope.freq_base"),
ropeScale: c.Float("rope.scaling.factor", 1),
originalContextLength: int(c.Uint("rope.scaling.original_context_length")),
numExperts: numExperts,
numExpertsUsed: numExpertsUsed,
normTopKProb: c.Bool("norm_top_k_prob", true),
expertWeightsScale: c.Float("expert_weights_scale", 1.0),
expertGatingFunc: c.Uint("expert_gating_func", expertGatingFuncSoftmax),
},
}
lookupTokenID := func(token string) int32 {
for i, t := range vocabulary.Values {
if t == token {
return int32(i)
}
}
return 0
}
resolveTokenID := func(explicitKey, token string, fallback uint32) int32 {
if explicitKey != "" {
if id := c.Uint(explicitKey); id != 0 {
return int32(id)
}
}
if tokenID := lookupTokenID(token); tokenID != 0 {
return tokenID
}
return int32(fallback)
}
m.imageTokenID = resolveTokenID("vision.image_token_id", "<image>", 396)
m.imageStartToken = resolveTokenID("vision.image_start_token_id", "<|image_start|>", 0)
m.imageEndToken = resolveTokenID("vision.image_end_token_id", "<|image_end|>", 0)
m.imageThumbnailID = resolveTokenID("vision.image_thumbnail_token_id", "<|img_thumbnail|>", 0)
m.useSpecialTokens = c.Bool("vision.use_image_special_tokens", true)
maxGridTokens := int(c.Uint("vision.max_tiles", 10))
if maxGridTokens <= 0 {
maxGridTokens = 10
}
for row := 1; row <= maxGridTokens; row++ {
for col := 1; col <= maxGridTokens; col++ {
token := fmt.Sprintf("<|img_row_%d_col_%d|>", row, col)
if tokenID := lookupTokenID(token); tokenID > 0 {
m.imageRowColIDs[imageGridPos{row: row, col: col}] = tokenID
}
}
}
if !m.useSpecialTokens {
m.imageStartToken = 0
m.imageEndToken = 0
m.imageThumbnailID = 0
m.imageRowColIDs = map[imageGridPos]int32{}
}
if c.Uint("vision.block_count") == 0 {
m.VisionModel = nil
m.VisionProjector = nil
}
type headCounts interface {
HeadCount() []uint64
HeadCountKV() []uint64
@@ -133,6 +334,14 @@ func New(c fs.Config) (model.Model, error) {
m.numHeadsByLayer = make([]int, len(m.Layers))
m.numKVHeadsByLayer = make([]int, len(m.Layers))
leadingDenseBlockCount := int(c.Uint("leading_dense_block_count"))
if leadingDenseBlockCount < 0 {
leadingDenseBlockCount = 0
}
if leadingDenseBlockCount > len(m.Layers) {
leadingDenseBlockCount = len(m.Layers)
}
for i := range m.Layers {
m.numHeadsByLayer[i] = int(headCount[i])
m.numKVHeadsByLayer[i] = int(headCountKV[i])
@@ -142,6 +351,12 @@ func New(c fs.Config) (model.Model, error) {
} else {
m.Layers[i].Operator = &Attention{}
}
if isMoE && i >= leadingDenseBlockCount {
m.Layers[i].MLP = &sparseMLP{}
} else {
m.Layers[i].MLP = &denseMLP{}
}
}
lCache := int(c.Uint("shortconv.l_cache"))
@@ -188,22 +403,77 @@ func (sa *Attention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor,
return sa.Output.Forward(ctx, attention)
}
type FeedForward interface {
Forward(ml.Context, ml.Tensor, *Options) ml.Tensor
}
type denseMLP struct {
Up *nn.Linear `gguf:"ffn_up"`
Down *nn.Linear `gguf:"ffn_down"`
Gate *nn.Linear `gguf:"ffn_gate"`
}
func (mlp *denseMLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *Options) ml.Tensor {
hiddenState = mlp.Gate.Forward(ctx, hiddenState).SILU(ctx, mlp.Up.Forward(ctx, hiddenState))
return mlp.Down.Forward(ctx, hiddenState)
}
type sparseMLP struct {
Router *nn.Linear `gguf:"ffn_gate_inp"`
Gate *nn.LinearBatch `gguf:"ffn_gate_exps"`
Up *nn.LinearBatch `gguf:"ffn_up_exps"`
Down *nn.LinearBatch `gguf:"ffn_down_exps"`
Bias ml.Tensor `gguf:"exp_probs_b.bias,alt:exp_probs_b"`
}
func (mlp *sparseMLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *Options) ml.Tensor {
// hiddenState: [hidden, tokens]
routerLogits := mlp.Router.Forward(ctx, hiddenState)
probs := routerLogits.Softmax(ctx)
if opts.expertGatingFunc == expertGatingFuncSigmoid {
probs = routerLogits.Sigmoid(ctx)
}
selectionProbs := probs
if mlp.Bias != nil {
selectionProbs = selectionProbs.Add(ctx, mlp.Bias)
}
selectedExperts := selectionProbs.TopK(ctx, opts.numExpertsUsed)
routingWeights := probs.Reshape(ctx, 1, opts.numExperts, hiddenState.Dim(1)).Rows(ctx, selectedExperts)
if opts.normTopKProb {
routingWeights = routingWeights.Reshape(ctx, opts.numExpertsUsed, hiddenState.Dim(1))
weightsSum := routingWeights.SumRows(ctx)
weightsSum = weightsSum.Clamp(ctx, 1e-6, float32(math.Inf(1)))
routingWeights = routingWeights.Div(ctx, weightsSum)
routingWeights = routingWeights.Reshape(ctx, 1, opts.numExpertsUsed, hiddenState.Dim(1))
}
if opts.expertWeightsScale != 1 {
routingWeights = routingWeights.Scale(ctx, float64(opts.expertWeightsScale))
}
// Build routing-weights branch early to enable topk-MoE fusion.
ctx.Forward(routingWeights)
hiddenState3D := hiddenState.Reshape(ctx, hiddenState.Dim(0), 1, hiddenState.Dim(1))
experts := mlp.Gate.Forward(ctx, hiddenState3D, selectedExperts).SILU(ctx, mlp.Up.Forward(ctx, hiddenState3D, selectedExperts))
experts = mlp.Down.Forward(ctx, experts, selectedExperts)
experts = experts.Mul(ctx, routingWeights)
nextState := experts.View(ctx, 0, experts.Dim(0), experts.Stride(2), experts.Dim(2))
for i := 1; i < opts.numExpertsUsed; i++ {
nextState = nextState.Add(ctx, experts.View(ctx, i*experts.Stride(1), experts.Dim(0), experts.Stride(2), experts.Dim(2)))
}
return nextState
}
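The routing math in `sparseMLP.Forward` above (gating probabilities, top-k expert selection, optional renormalization of the selected weights) can be reduced to plain float slices. This is an illustrative reduction only, assuming softmax gating and `normTopKProb` enabled, and omitting the sigmoid path, selection bias, and weight scale:

```go
package main

import (
	"fmt"
	"math"
	"sort"
)

// softmax converts router logits to a probability distribution.
func softmax(logits []float64) []float64 {
	maxv := math.Inf(-1)
	for _, v := range logits {
		maxv = math.Max(maxv, v)
	}
	out := make([]float64, len(logits))
	var sum float64
	for i, v := range logits {
		out[i] = math.Exp(v - maxv)
		sum += out[i]
	}
	for i := range out {
		out[i] /= sum
	}
	return out
}

// topKRoute picks the k highest-probability experts and renormalizes
// their weights so they sum to 1 (the normTopKProb path above).
func topKRoute(probs []float64, k int) (idx []int, weights []float64) {
	idx = make([]int, len(probs))
	for i := range idx {
		idx[i] = i
	}
	sort.Slice(idx, func(a, b int) bool { return probs[idx[a]] > probs[idx[b]] })
	idx = idx[:k]
	var sum float64
	for _, i := range idx {
		sum += probs[i]
	}
	weights = make([]float64, k)
	for j, i := range idx {
		weights[j] = probs[i] / sum
	}
	return idx, weights
}

func main() {
	probs := softmax([]float64{2, 0.5, 1, -1})
	idx, w := topKRoute(probs, 2)
	fmt.Println(idx, w) // two selected experts with renormalized weights
}
```

The tensor code additionally clamps the weight sum away from zero before dividing, which matters when the selected probabilities are all tiny.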
type Layer struct {
AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
Operator Operator
MLPNorm *nn.RMSNorm `gguf:"ffn_norm"`
MLP FeedForward
}
func (l *Layer) Forward(ctx ml.Context, layer int, hiddenState, positions, outputs ml.Tensor, cache *HybridCache, opts *Options) ml.Tensor {
@@ -229,10 +499,233 @@ func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tenso
return m.applyRotaryPositionEmbeddings(ctx, key, shift), nil
}
func multimodalTokenCount(mm input.Multimodal) int {
if mm.Tensor != nil {
return mm.Tensor.Dim(1)
}
switch data := mm.Data.(type) {
case int:
return data
case int32:
return int(data)
case visionChunkData:
return data.tokens
case *visionChunkData:
if data != nil {
return data.tokens
}
}
return 0
}
func multimodalChunkInfo(mm input.Multimodal) visionChunkData {
switch data := mm.Data.(type) {
case visionChunkData:
return data
case *visionChunkData:
if data != nil {
return *data
}
}
return visionChunkData{
tokens: multimodalTokenCount(mm),
}
}
func multimodalLayout(mm []input.Multimodal) visionEmbeddingLayout {
layout := visionEmbeddingLayout{rows: 1, cols: 1}
if len(mm) == 0 {
return layout
}
first := multimodalChunkInfo(mm[0])
if first.layout != nil {
return *first.layout
}
return layout
}
func (m *Model) imageRowColToken(row, col int) int32 {
if row <= 0 || col <= 0 {
return 0
}
return m.imageRowColIDs[imageGridPos{row: row, col: col}]
}
func (m *Model) appendImageChunk(result []*input.Input, chunk input.Multimodal, imageToken int32, hash uint64) ([]*input.Input, error) {
tokenCount := multimodalTokenCount(chunk)
if tokenCount <= 0 {
return nil, errors.New("lfm2: multimodal input has no tokens")
}
result = append(result, &input.Input{
Token: imageToken,
Multimodal: []input.Multimodal{chunk},
MultimodalHash: hash,
SameBatch: tokenCount - 1,
})
for range tokenCount - 1 {
result = append(result, &input.Input{Token: imageToken})
}
return result, nil
}
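`appendImageChunk` above expands one image chunk into a run of placeholder tokens, pinning the multimodal payload and `SameBatch` count to the first token only. A minimal standalone sketch of that expansion, with `tok` as a simplified stand-in for `input.Input`:

```go
package main

import "fmt"

// tok is a simplified stand-in for input.Input: a token ID plus whether it
// carries the multimodal payload and how many following tokens must be
// processed in the same batch.
type tok struct {
	id        int32
	hasMM     bool
	sameBatch int
}

// expandChunk emits tokenCount placeholder tokens for one image chunk,
// attaching the payload only to the first token.
func expandChunk(imageToken int32, tokenCount int) []tok {
	out := []tok{{id: imageToken, hasMM: true, sameBatch: tokenCount - 1}}
	for i := 1; i < tokenCount; i++ {
		out = append(out, tok{id: imageToken})
	}
	return out
}

func main() {
	run := expandChunk(396, 4)
	fmt.Println(len(run), run[0].sameBatch) // prints: 4 3
}
```

Keeping the payload on a single token is what lets the remaining placeholders be plain inputs while `SameBatch` forces them into the same forward pass.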
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
if m.VisionModel == nil || m.VisionProjector == nil || len(m.VisionModel.Layers) == 0 {
return nil, model.ErrNoVisionModel
}
img, _, err := image.Decode(bytes.NewReader(multimodalData))
if err != nil {
return nil, err
}
processedImages, layout, err := m.ImageProcessor.ProcessImage(img)
if err != nil {
return nil, err
}
if m.ImageProcessor.patchSize <= 0 {
return nil, errors.New("lfm2: invalid vision patch size")
}
layoutInfo := &visionEmbeddingLayout{
rows: layout.rows,
cols: layout.cols,
hasThumbnail: layout.hasThumbnail,
}
mm := make([]input.Multimodal, 0, len(processedImages))
for i, processed := range processedImages {
patches := visionPatchGrid{
Width: processed.size.X / m.ImageProcessor.patchSize,
Height: processed.size.Y / m.ImageProcessor.patchSize,
}
if patches.Width == 0 || patches.Height == 0 {
return nil, errors.New("lfm2: invalid resized image dimensions")
}
pixelValues := ctx.Input().FromFloats(processed.data, processed.size.X, processed.size.Y, m.ImageProcessor.numChannels)
visionOutputs := m.VisionModel.Forward(ctx, pixelValues, patches)
projected := m.VisionProjector.Forward(ctx, visionOutputs, patches, m.projectorOptions)
chunk := visionChunkData{
tokens: projected.Dim(1),
row: processed.row,
col: processed.col,
thumbnail: processed.thumbnail,
}
if i == 0 {
chunk.layout = layoutInfo
}
mm = append(mm, input.Multimodal{
Tensor: projected,
Data: chunk,
})
}
return mm, nil
}
func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
var result []*input.Input
imageToken := m.imageTokenID
if imageToken == 0 {
imageToken = 396
}
useSpecialTokens := m.useSpecialTokens || m.imageStartToken > 0 || m.imageEndToken > 0 || m.imageThumbnailID > 0 || len(m.imageRowColIDs) > 0
for _, inp := range inputs {
if len(inp.Multimodal) == 0 {
result = append(result, inp)
continue
}
layout := multimodalLayout(inp.Multimodal)
if layout.rows <= 0 {
layout.rows = 1
}
if layout.cols <= 0 {
layout.cols = 1
}
tiles := layout.rows * layout.cols
multitile := tiles > 1
if useSpecialTokens && m.imageStartToken > 0 {
result = append(result, &input.Input{Token: m.imageStartToken})
}
for i, mm := range inp.Multimodal {
chunk := multimodalChunkInfo(mm)
if chunk.tokens <= 0 {
chunk.tokens = multimodalTokenCount(mm)
}
if multitile && !chunk.thumbnail && chunk.row == 0 && chunk.col == 0 && i < tiles {
chunk.row = i/layout.cols + 1
chunk.col = i%layout.cols + 1
}
if multitile && layout.hasThumbnail && i == tiles {
chunk.thumbnail = true
}
if useSpecialTokens && multitile {
if chunk.thumbnail {
if m.imageThumbnailID > 0 {
result = append(result, &input.Input{Token: m.imageThumbnailID})
}
} else if marker := m.imageRowColToken(chunk.row, chunk.col); marker > 0 {
result = append(result, &input.Input{Token: marker})
}
}
var err error
result, err = m.appendImageChunk(result, input.Multimodal{
Tensor: mm.Tensor,
Data: chunk,
}, imageToken, inp.MultimodalHash)
if err != nil {
return nil, err
}
}
if useSpecialTokens && m.imageEndToken > 0 {
result = append(result, &input.Input{Token: m.imageEndToken})
}
}
return result, nil
}
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
positions := ctx.Input().FromInts(batch.Positions, len(batch.Positions))
hiddenState := m.TokenEmbedding.Forward(ctx, batch.Inputs)
if len(batch.Multimodal) > 0 {
// We splice vision embeddings into token embeddings in-place; duplicate to
// avoid aliasing the raw embedding output graph.
hiddenState = hiddenState.Duplicate(ctx)
}
for _, mm := range batch.Multimodal {
offset := mm.Index
for _, multimodal := range mm.Multimodal {
if multimodal.Tensor == nil {
continue
}
visionOutputs := multimodal.Tensor
ctx.Forward(visionOutputs.Copy(ctx, hiddenState.View(ctx, offset*hiddenState.Stride(1), visionOutputs.Dim(0)*visionOutputs.Dim(1))))
offset += visionOutputs.Dim(1)
}
}
for i, layer := range m.Layers {
m.Cache.SetLayer(i)
@@ -251,4 +744,5 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
func init() {
model.Register("lfm2", New)
model.Register("lfm2moe", New)
}

View File

@@ -0,0 +1,160 @@
package lfm2
import (
"testing"
"github.com/ollama/ollama/model/input"
)
func TestPostTokenizeWithSpecialImageTokens(t *testing.T) {
m := &Model{
imageTokenID: 396,
imageStartToken: 2,
imageEndToken: 3,
useSpecialTokens: true,
}
in := []*input.Input{
{Token: 11},
{Multimodal: []input.Multimodal{{Data: 64}}, MultimodalHash: 123},
{Token: 12},
}
out, err := m.PostTokenize(in)
if err != nil {
t.Fatalf("PostTokenize returned error: %v", err)
}
if len(out) != 68 {
t.Fatalf("expected 68 tokens, got %d", len(out))
}
if out[0].Token != 11 {
t.Fatalf("out[0].Token = %d, want 11", out[0].Token)
}
if out[1].Token != 2 {
t.Fatalf("out[1].Token = %d, want 2", out[1].Token)
}
firstImage := out[2]
if firstImage.Token != 396 {
t.Fatalf("out[2].Token = %d, want 396", firstImage.Token)
}
if len(firstImage.Multimodal) != 1 {
t.Fatalf("expected multimodal payload on first image token")
}
if firstImage.MultimodalHash != 123 {
t.Fatalf("out[2].MultimodalHash = %d, want 123", firstImage.MultimodalHash)
}
if firstImage.SameBatch != 63 {
t.Fatalf("out[2].SameBatch = %d, want 63", firstImage.SameBatch)
}
for i := 3; i < 66; i++ {
if out[i].Token != 396 {
t.Fatalf("out[%d].Token = %d, want 396", i, out[i].Token)
}
if len(out[i].Multimodal) != 0 {
t.Fatalf("out[%d] should not carry multimodal payload", i)
}
}
if out[66].Token != 3 {
t.Fatalf("out[66].Token = %d, want 3", out[66].Token)
}
if out[67].Token != 12 {
t.Fatalf("out[67].Token = %d, want 12", out[67].Token)
}
}
func TestPostTokenizeWithoutSpecialImageTokens(t *testing.T) {
m := &Model{
imageTokenID: 777,
useSpecialTokens: false,
}
in := []*input.Input{
{Multimodal: []input.Multimodal{{Data: 5}}, MultimodalHash: 9},
}
out, err := m.PostTokenize(in)
if err != nil {
t.Fatalf("PostTokenize returned error: %v", err)
}
if len(out) != 5 {
t.Fatalf("expected 5 tokens, got %d", len(out))
}
if out[0].Token != 777 || out[0].SameBatch != 4 || len(out[0].Multimodal) != 1 {
t.Fatalf("unexpected first token: %+v", *out[0])
}
for i := 1; i < 5; i++ {
if out[i].Token != 777 {
t.Fatalf("out[%d].Token = %d, want 777", i, out[i].Token)
}
if len(out[i].Multimodal) != 0 {
t.Fatalf("out[%d] should not carry multimodal payload", i)
}
}
}
func TestPostTokenizeMultiTileLayoutTokens(t *testing.T) {
m := &Model{
imageTokenID: 396,
imageStartToken: 498,
imageEndToken: 499,
imageThumbnailID: 497,
imageRowColIDs: map[imageGridPos]int32{
{row: 1, col: 1}: 397,
{row: 1, col: 2}: 398,
},
useSpecialTokens: true,
}
layout := &visionEmbeddingLayout{rows: 1, cols: 2, hasThumbnail: true}
in := []*input.Input{{
Multimodal: []input.Multimodal{
{Data: visionChunkData{tokens: 3, row: 1, col: 1, layout: layout}},
{Data: visionChunkData{tokens: 3, row: 1, col: 2}},
{Data: visionChunkData{tokens: 2, thumbnail: true}},
},
MultimodalHash: 1,
}}
out, err := m.PostTokenize(in)
if err != nil {
t.Fatalf("PostTokenize returned error: %v", err)
}
got := make([]int32, len(out))
for i := range out {
got[i] = out[i].Token
}
want := []int32{
498, // <|image_start|>
397, // <|img_row_1_col_1|>
396, 396, 396,
398, // <|img_row_1_col_2|>
396, 396, 396,
497, // <|img_thumbnail|>
396, 396,
499, // <|image_end|>
}
if len(got) != len(want) {
t.Fatalf("len(out) = %d, want %d", len(got), len(want))
}
for i := range want {
if got[i] != want[i] {
t.Fatalf("out[%d].Token = %d, want %d", i, got[i], want[i])
}
}
if len(out[2].Multimodal) != 1 || len(out[6].Multimodal) != 1 || len(out[10].Multimodal) != 1 {
t.Fatalf("expected multimodal payload on first token of each chunk")
}
if out[2].SameBatch != 2 || out[6].SameBatch != 2 || out[10].SameBatch != 1 {
t.Fatalf("unexpected SameBatch values: [%d %d %d]", out[2].SameBatch, out[6].SameBatch, out[10].SameBatch)
}
}

View File

@@ -0,0 +1,184 @@
package lfm2
import (
"math"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
)
const lfm2VisionBatchSize = 1
type visionPatchGrid struct {
Width int
Height int
}
type VisionSelfAttention struct {
Query *nn.Linear `gguf:"attn_q"`
Key *nn.Linear `gguf:"attn_k"`
Value *nn.Linear `gguf:"attn_v"`
Output *nn.Linear `gguf:"attn_output,alt:attn_out"`
}
func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
headDim := opts.hiddenSize / opts.numHeads
query := sa.Query.Forward(ctx, hiddenState)
key := sa.Key.Forward(ctx, hiddenState)
value := sa.Value.Forward(ctx, hiddenState)
query = query.Reshape(ctx, headDim, opts.numHeads, query.Dim(1), lfm2VisionBatchSize)
key = key.Reshape(ctx, headDim, opts.numHeads, key.Dim(1), lfm2VisionBatchSize)
value = value.Reshape(ctx, headDim, opts.numHeads, value.Dim(1), lfm2VisionBatchSize)
attention := nn.Attention(ctx, query, key, value, 1.0/math.Sqrt(float64(headDim)), nil)
attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), lfm2VisionBatchSize)
return sa.Output.Forward(ctx, attention)
}
type VisionMLP struct {
Up *nn.Linear `gguf:"ffn_up"`
Down *nn.Linear `gguf:"ffn_down"`
}
func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenState ml.Tensor) ml.Tensor {
return mlp.Down.Forward(ctx, mlp.Up.Forward(ctx, hiddenState).GELU(ctx))
}
type VisionEncoderLayer struct {
LayerNorm1 *nn.LayerNorm `gguf:"ln1"`
SelfAttention *VisionSelfAttention
LayerNorm2 *nn.LayerNorm `gguf:"ln2"`
MLP *VisionMLP
}
func (l *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
residual := hiddenState
hiddenState = l.LayerNorm1.Forward(ctx, hiddenState, opts.eps)
hiddenState = l.SelfAttention.Forward(ctx, hiddenState, opts)
hiddenState = hiddenState.Add(ctx, residual)
residual = hiddenState
hiddenState = l.LayerNorm2.Forward(ctx, hiddenState, opts.eps)
hiddenState = l.MLP.Forward(ctx, hiddenState)
return hiddenState.Add(ctx, residual)
}
type VisionModelOptions struct {
hiddenSize, numHeads int
imageSize, patchSize int
eps float32
}
type VisionModel struct {
PatchEmbedding *nn.Conv2D `gguf:"patch_embd"`
PositionEmbedding *nn.Embedding `gguf:"position_embd"`
PostLayerNorm *nn.LayerNorm `gguf:"post_ln"`
Layers []VisionEncoderLayer `gguf:"blk"`
*VisionModelOptions
}
func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor, patches visionPatchGrid) ml.Tensor {
numPatches := patches.Width * patches.Height
hiddenState := m.PatchEmbedding.Forward(ctx, pixelValues, m.patchSize, m.patchSize, 0, 0, 1, 1)
hiddenState = hiddenState.Reshape(ctx, numPatches, m.hiddenSize)
hiddenState = hiddenState.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
if m.PositionEmbedding != nil {
posTokens := m.PositionEmbedding.Weight.Dim(1)
source := int(math.Sqrt(float64(posTokens)))
var positionEmbeddings ml.Tensor
if source > 0 && source*source == posTokens && (source != patches.Width || source != patches.Height) {
// SigLIP2 NAFlex-style position interpolation for variable image sizes.
positionIDs := ctx.Arange(0, float32(posTokens), 1, ml.DTypeI32)
positionEmbeddings = m.PositionEmbedding.Forward(ctx, positionIDs)
positionEmbeddings = positionEmbeddings.Reshape(ctx, -1, source, source)
positionEmbeddings = positionEmbeddings.Permute(ctx, 2, 0, 1, 3).Contiguous(ctx)
positionEmbeddings = positionEmbeddings.Interpolate(ctx, [4]int{
patches.Width,
patches.Height,
hiddenState.Dim(0),
1,
}, ml.SamplingModeBilinear)
positionEmbeddings = positionEmbeddings.Permute(ctx, 1, 2, 0, 3)
positionEmbeddings = positionEmbeddings.Contiguous(ctx, -1, patches.Width*patches.Height)
} else {
positionIDs := ctx.Arange(0, float32(numPatches), 1, ml.DTypeI32)
positionEmbeddings = m.PositionEmbedding.Forward(ctx, positionIDs)
}
hiddenState = hiddenState.Add(ctx, positionEmbeddings)
}
for _, layer := range m.Layers {
hiddenState = layer.Forward(ctx, hiddenState, m.VisionModelOptions)
}
return m.PostLayerNorm.Forward(ctx, hiddenState, m.eps)
}
func newVisionModel(c fs.Config) *VisionModel {
return &VisionModel{
Layers: make([]VisionEncoderLayer, c.Uint("vision.block_count")),
VisionModelOptions: &VisionModelOptions{
hiddenSize: int(c.Uint("vision.embedding_length", 1152)),
numHeads: int(c.Uint("vision.attention.head_count", 16)),
imageSize: int(c.Uint("vision.image_size", 256)),
patchSize: int(c.Uint("vision.patch_size", 16)),
eps: c.Float("vision.attention.layer_norm_epsilon", 1e-6),
},
}
}
type VisionProjector struct {
LayerNorm *nn.LayerNorm `gguf:"layer_norm"`
Linear1 *nn.Linear `gguf:"1"`
Linear2 *nn.Linear `gguf:"2"`
}
type VisionProjectorOptions struct {
scaleFactor int
useLayerNorm bool
}
func (p *VisionProjector) Forward(ctx ml.Context, visionOutputs ml.Tensor, patches visionPatchGrid, opts VisionProjectorOptions) ml.Tensor {
hiddenSize := visionOutputs.Dim(0)
featureMap := visionOutputs
merge := max(opts.scaleFactor, 1)
if merge > 1 {
width := patches.Width
height := patches.Height
featureMap = featureMap.Reshape(ctx, hiddenSize, width, height)
// Match llama.cpp patch merger: pad spatial dims up to a multiple of the merge factor.
padWidth := (merge - width%merge) % merge
padHeight := (merge - height%merge) % merge
if padWidth != 0 || padHeight != 0 {
featureMap = featureMap.Pad(ctx, 0, padWidth, padHeight, 0)
width += padWidth
height += padHeight
}
featureMap = featureMap.Reshape(ctx, hiddenSize*merge, width/merge, height)
featureMap = featureMap.Permute(ctx, 0, 2, 1).Contiguous(ctx, hiddenSize*merge*merge, height/merge, width/merge)
featureMap = featureMap.Permute(ctx, 0, 2, 1).Contiguous(ctx)
featureMap = featureMap.Contiguous(ctx, featureMap.Dim(0), featureMap.Dim(1)*featureMap.Dim(2))
}
if opts.useLayerNorm && p.LayerNorm != nil {
featureMap = p.LayerNorm.Forward(ctx, featureMap, 1e-5)
}
featureMap = p.Linear1.Forward(ctx, featureMap).GELU(ctx)
return p.Linear2.Forward(ctx, featureMap)
}
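The padding and reshape sequence above implements a pixel-shuffle style patch merger: spatial dims are padded up to a multiple of the merge factor, then each merge×merge block of patches collapses into one token whose feature width grows by merge². A minimal sketch of the resulting token count (the helper name `mergedGrid` is hypothetical, not part of the model code):

```go
package main

import "fmt"

// mergedGrid returns the post-merge token count and the feature-width
// multiplier for a pixel-shuffle projector, mirroring the padding math above.
func mergedGrid(width, height, merge int) (tokens, featMul int) {
	padW := (merge - width%merge) % merge
	padH := (merge - height%merge) % merge
	w := (width + padW) / merge
	h := (height + padH) / merge
	return w * h, merge * merge
}

func main() {
	// A 16×16 patch grid with scale factor 2 merges to 8×8 = 64 tokens,
	// each carrying 4× the hidden size going into the projector MLP.
	tokens, featMul := mergedGrid(16, 16, 2)
	fmt.Println(tokens, featMul) // 64 4

	// Odd dims are padded first: 15×15 → 16×16 → still 64 tokens.
	tokens, _ = mergedGrid(15, 15, 2)
	fmt.Println(tokens) // 64
}
```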


@@ -0,0 +1,260 @@
package lfm2
import (
"image"
stdimage "image/draw"
"math"
"slices"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/model/imageproc"
)
type ImageProcessor struct {
imageSize, patchSize, numChannels int
downsampleFactor int
imageMean, imageStd [3]float32
doImageSplitting bool
minTiles int
maxTiles int
useThumbnail bool
tileSize int
minImageTokens int
maxImageTokens int
maxPixelsTolerance float64
}
type processedVisionImage struct {
data []float32
size image.Point
row int
col int
thumbnail bool
}
type processedVisionLayout struct {
rows int
cols int
hasThumbnail bool
}
func newImageProcessor(c fs.Config) ImageProcessor {
mean := c.Floats("vision.image_mean")
std := c.Floats("vision.image_std")
processor := ImageProcessor{
imageSize: int(c.Uint("vision.image_size", 256)),
patchSize: int(c.Uint("vision.patch_size", 16)),
numChannels: int(c.Uint("vision.num_channels", 3)),
downsampleFactor: int(c.Uint("vision.projector.scale_factor", 2)),
imageMean: [3]float32{0.5, 0.5, 0.5},
imageStd: [3]float32{0.5, 0.5, 0.5},
doImageSplitting: c.Bool("vision.do_image_splitting", true),
minTiles: int(c.Uint("vision.min_tiles", 2)),
maxTiles: int(c.Uint("vision.max_tiles", 10)),
useThumbnail: c.Bool("vision.use_thumbnail", true),
tileSize: int(c.Uint("vision.tile_size", 512)),
minImageTokens: int(c.Uint("vision.min_image_tokens", 64)),
maxImageTokens: int(c.Uint("vision.max_image_tokens", 256)),
maxPixelsTolerance: float64(c.Float("vision.max_pixels_tolerance", 2.0)),
}
if len(mean) >= 3 {
processor.imageMean = [3]float32{mean[0], mean[1], mean[2]}
}
if len(std) >= 3 {
processor.imageStd = [3]float32{std[0], std[1], std[2]}
}
// Keep defaults aligned with HF unless explicitly configured.
if processor.downsampleFactor <= 0 {
processor.downsampleFactor = 2
}
if processor.patchSize <= 0 {
processor.patchSize = 16
}
if processor.tileSize <= 0 {
processor.tileSize = 512
}
if processor.minTiles <= 0 {
processor.minTiles = 2
}
if processor.maxTiles < processor.minTiles {
processor.maxTiles = processor.minTiles
}
if processor.minImageTokens <= 0 {
processor.minImageTokens = 64
}
if processor.maxImageTokens < processor.minImageTokens {
processor.maxImageTokens = processor.minImageTokens
}
if processor.maxPixelsTolerance <= 0 {
processor.maxPixelsTolerance = 2.0
}
return processor
}
func (p ImageProcessor) ProcessImage(img image.Image) ([]processedVisionImage, processedVisionLayout, error) {
img = imageproc.Composite(img)
orig := img.Bounds().Size()
resizedWidth, resizedHeight := p.smartResize(orig.Y, orig.X)
layout := processedVisionLayout{rows: 1, cols: 1}
if p.shouldSplit(orig.Y, orig.X) {
gridWidth, gridHeight, targetWidth, targetHeight := p.gridLayout(orig.Y, orig.X)
layout.rows = gridHeight
layout.cols = gridWidth
layout.hasThumbnail = p.useThumbnail && gridWidth*gridHeight != 1
resized := imageproc.Resize(img, image.Point{X: targetWidth, Y: targetHeight}, imageproc.ResizeBilinear)
images := make([]processedVisionImage, 0, gridWidth*gridHeight+1)
for row := range gridHeight {
for col := range gridWidth {
rect := image.Rect(
col*p.tileSize,
row*p.tileSize,
(col+1)*p.tileSize,
(row+1)*p.tileSize,
)
tile := cropImage(resized, rect)
images = append(images, processedVisionImage{
data: imageproc.Normalize(tile, p.imageMean, p.imageStd, true, true),
size: tile.Bounds().Size(),
row: row + 1,
col: col + 1,
})
}
}
if layout.hasThumbnail {
thumbnail := imageproc.Resize(img, image.Point{X: resizedWidth, Y: resizedHeight}, imageproc.ResizeBilinear)
images = append(images, processedVisionImage{
data: imageproc.Normalize(thumbnail, p.imageMean, p.imageStd, true, true),
size: thumbnail.Bounds().Size(),
thumbnail: true,
})
}
return images, layout, nil
}
single := imageproc.Resize(img, image.Point{X: resizedWidth, Y: resizedHeight}, imageproc.ResizeBilinear)
return []processedVisionImage{{
data: imageproc.Normalize(single, p.imageMean, p.imageStd, true, true),
size: single.Bounds().Size(),
}}, layout, nil
}
func (p ImageProcessor) shouldSplit(height, width int) bool {
if !p.doImageSplitting || (p.minTiles == 1 && p.maxTiles == 1) {
return false
}
totalFactor := p.patchSize * p.downsampleFactor
hBar := max(p.patchSize, roundByFactor(height, totalFactor))
wBar := max(p.patchSize, roundByFactor(width, totalFactor))
limit := float64(p.maxImageTokens * p.patchSize * p.patchSize * p.downsampleFactor * p.downsampleFactor)
limit *= p.maxPixelsTolerance
return float64(hBar*wBar) > limit
}
func (p ImageProcessor) smartResize(height, width int) (int, int) {
totalFactor := p.patchSize * p.downsampleFactor
minPixels := p.minImageTokens * p.patchSize * p.patchSize * p.downsampleFactor * p.downsampleFactor
maxPixels := p.maxImageTokens * p.patchSize * p.patchSize * p.downsampleFactor * p.downsampleFactor
hBar := max(totalFactor, roundByFactor(height, totalFactor))
wBar := max(totalFactor, roundByFactor(width, totalFactor))
if hBar*wBar > maxPixels {
beta := math.Sqrt(float64(height*width) / float64(maxPixels))
hBar = max(totalFactor, int(math.Floor(float64(height)/beta/float64(totalFactor)))*totalFactor)
wBar = max(totalFactor, int(math.Floor(float64(width)/beta/float64(totalFactor)))*totalFactor)
} else if hBar*wBar < minPixels {
beta := math.Sqrt(float64(minPixels) / float64(height*width))
hBar = int(math.Ceil(float64(height)*beta/float64(totalFactor))) * totalFactor
wBar = int(math.Ceil(float64(width)*beta/float64(totalFactor))) * totalFactor
}
return wBar, hBar
}
func (p ImageProcessor) gridLayout(height, width int) (gridWidth, gridHeight, targetWidth, targetHeight int) {
aspectRatio := float64(width) / float64(height)
targetRatios := p.targetRatios()
bestRatio := clipImageSize{width: 1, height: 1}
bestRatioDiff := math.MaxFloat64
area := float64(width * height)
for _, ratio := range targetRatios {
targetAspect := float64(ratio.width) / float64(ratio.height)
ratioDiff := math.Abs(aspectRatio - targetAspect)
if ratioDiff < bestRatioDiff {
bestRatioDiff = ratioDiff
bestRatio = ratio
continue
}
if ratioDiff == bestRatioDiff {
targetArea := float64(p.tileSize * p.tileSize * ratio.width * ratio.height)
if area > 0.5*targetArea {
bestRatio = ratio
}
}
}
return bestRatio.width, bestRatio.height, p.tileSize * bestRatio.width, p.tileSize * bestRatio.height
}
type clipImageSize struct {
width int
height int
}
func (p ImageProcessor) targetRatios() []clipImageSize {
targetRatios := make([]clipImageSize, 0, p.maxTiles*p.maxTiles)
for n := p.minTiles; n <= p.maxTiles; n++ {
for w := 1; w <= n; w++ {
for h := 1; h <= n; h++ {
if w*h < p.minTiles || w*h > p.maxTiles {
continue
}
targetRatios = append(targetRatios, clipImageSize{width: w, height: h})
}
}
}
unique := targetRatios[:0]
for _, ratio := range targetRatios {
if slices.Contains(unique, ratio) {
continue
}
unique = append(unique, ratio)
}
slices.SortFunc(unique, func(a, b clipImageSize) int {
return a.width*a.height - b.width*b.height
})
return unique
}
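The triple loop above enumerates every w×h tile layout whose tile count falls in [minTiles, maxTiles], then deduplicates and sorts by area. A self-contained sketch of that enumeration (the `tileGrids` helper is illustrative, not the repository API):

```go
package main

import (
	"fmt"
	"slices"
)

type tileGrid struct{ w, h int }

// tileGrids lists the distinct w×h layouts with minTiles ≤ w·h ≤ maxTiles,
// sorted by tile count, mirroring targetRatios above.
func tileGrids(minTiles, maxTiles int) []tileGrid {
	var out []tileGrid
	for n := minTiles; n <= maxTiles; n++ {
		for w := 1; w <= n; w++ {
			for h := 1; h <= n; h++ {
				g := tileGrid{w, h}
				if w*h < minTiles || w*h > maxTiles || slices.Contains(out, g) {
					continue
				}
				out = append(out, g)
			}
		}
	}
	slices.SortFunc(out, func(a, b tileGrid) int { return a.w*a.h - b.w*b.h })
	return out
}

func main() {
	// minTiles=2, maxTiles=4 admits 1×2, 2×1, 1×3, 3×1, 2×2, 1×4, 4×1.
	fmt.Println(len(tileGrids(2, 4))) // 7
}
```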
func roundByFactor(number, factor int) int {
if factor <= 0 {
return number
}
return int(math.RoundToEven(float64(number)/float64(factor))) * factor
}
func cropImage(img image.Image, rect image.Rectangle) image.Image {
dst := image.NewRGBA(image.Rect(0, 0, rect.Dx(), rect.Dy()))
stdimage.Draw(dst, dst.Bounds(), img, rect.Min, stdimage.Src)
return dst
}
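Because `roundByFactor` uses `math.RoundToEven`, resized dims snap to the nearest multiple of `patchSize * downsampleFactor`, with exact ties going to the even multiple. A small demonstration of that behavior under the default factor of 16 × 2 = 32:

```go
package main

import (
	"fmt"
	"math"
)

// roundByFactor mirrors the helper above: round to the nearest multiple of
// factor, ties resolved toward the even multiple.
func roundByFactor(number, factor int) int {
	if factor <= 0 {
		return number
	}
	return int(math.RoundToEven(float64(number)/float64(factor))) * factor
}

func main() {
	// 300/32 = 9.375, which rounds to 9, so 300px snaps down to 288px.
	fmt.Println(roundByFactor(300, 32)) // 288
	// 48/32 = 1.5 is a tie; RoundToEven picks 2, so 48px snaps up to 64px.
	fmt.Println(roundByFactor(48, 32)) // 64
	// A non-positive factor leaves the value untouched.
	fmt.Println(roundByFactor(10, 0)) // 10
}
```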


@@ -0,0 +1,105 @@
package lfm2
import (
"image"
"image/color"
"testing"
)
func TestProcessImageSingleTile(t *testing.T) {
p := ImageProcessor{
patchSize: 16,
downsampleFactor: 2,
numChannels: 3,
imageMean: [3]float32{0.5, 0.5, 0.5},
imageStd: [3]float32{0.5, 0.5, 0.5},
doImageSplitting: true,
minTiles: 2,
maxTiles: 10,
useThumbnail: true,
tileSize: 512,
minImageTokens: 64,
maxImageTokens: 256,
maxPixelsTolerance: 2.0,
}
img := image.NewRGBA(image.Rect(0, 0, 320, 320))
out, layout, err := p.ProcessImage(img)
if err != nil {
t.Fatalf("ProcessImage returned error: %v", err)
}
if layout.rows != 1 || layout.cols != 1 || layout.hasThumbnail {
t.Fatalf("layout = %+v, want rows=1 cols=1 hasThumbnail=false", layout)
}
if len(out) != 1 {
t.Fatalf("len(out) = %d, want 1", len(out))
}
if out[0].size != (image.Point{X: 320, Y: 320}) {
t.Fatalf("single image size = %+v, want 320x320", out[0].size)
}
if out[0].thumbnail {
t.Fatalf("single image should not be marked as thumbnail")
}
}
func TestProcessImageDynamicTiling(t *testing.T) {
p := ImageProcessor{
patchSize: 16,
downsampleFactor: 2,
numChannels: 3,
imageMean: [3]float32{0.5, 0.5, 0.5},
imageStd: [3]float32{0.5, 0.5, 0.5},
doImageSplitting: true,
minTiles: 2,
maxTiles: 10,
useThumbnail: true,
tileSize: 512,
minImageTokens: 64,
maxImageTokens: 256,
maxPixelsTolerance: 2.0,
}
// Wide image that should trigger multi-tile splitting.
img := image.NewRGBA(image.Rect(0, 0, 3000, 1000))
fill := color.RGBA{R: 120, G: 90, B: 60, A: 255}
for y := range 1000 {
for x := range 3000 {
img.Set(x, y, fill)
}
}
out, layout, err := p.ProcessImage(img)
if err != nil {
t.Fatalf("ProcessImage returned error: %v", err)
}
if layout.rows*layout.cols <= 1 {
t.Fatalf("expected multi-tile layout, got %+v", layout)
}
if !layout.hasThumbnail {
t.Fatalf("expected thumbnail for multi-tile layout")
}
wantLen := layout.rows*layout.cols + 1
if len(out) != wantLen {
t.Fatalf("len(out) = %d, want %d", len(out), wantLen)
}
for i := range layout.rows * layout.cols {
if out[i].size != (image.Point{X: 512, Y: 512}) {
t.Fatalf("tile[%d] size = %+v, want 512x512", i, out[i].size)
}
if out[i].thumbnail {
t.Fatalf("tile[%d] should not be marked as thumbnail", i)
}
}
thumb := out[len(out)-1]
if !thumb.thumbnail {
t.Fatalf("last chunk should be thumbnail")
}
if thumb.size.X%32 != 0 || thumb.size.Y%32 != 0 {
t.Fatalf("thumbnail size = %+v, want dimensions aligned to 32", thumb.size)
}
}


@@ -15,6 +15,7 @@ import (
_ "github.com/ollama/ollama/model/models/llama4"
_ "github.com/ollama/ollama/model/models/mistral3"
_ "github.com/ollama/ollama/model/models/mllama"
_ "github.com/ollama/ollama/model/models/nemotronh"
_ "github.com/ollama/ollama/model/models/nomicbert"
_ "github.com/ollama/ollama/model/models/olmo3"
_ "github.com/ollama/ollama/model/models/qwen2"


@@ -0,0 +1,88 @@
package nemotronh
import (
"fmt"
"math"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
)
// Attention implements simple attention without RoPE for Nemotron-H.
// Unlike Qwen3Next, Nemotron-H attention has:
// - No RoPE (position info comes from Mamba2 layers)
// - Standard scaled dot-product attention
type Attention struct {
Query *nn.Linear `gguf:"attn_q"`
Key *nn.Linear `gguf:"attn_k"`
Value *nn.Linear `gguf:"attn_v"`
Output *nn.Linear `gguf:"attn_output"`
}
func (a *Attention) Forward(ctx ml.Context, hiddenStates ml.Tensor, cache *HybridCache, opts *Options) (ml.Tensor, error) {
hiddenDim := hiddenStates.Dim(0)
nSeqTokens := hiddenStates.Dim(1)
switch hiddenStates.Dim(2) {
case 0:
hiddenStates = hiddenStates.Reshape(ctx, hiddenDim, nSeqTokens, 1)
case 1:
default:
return nil, ErrUnsupportedBatchLayout
}
// Nemotron-H is currently clamped to num_parallel=1.
if cache != nil && cache.IsSupportedForBatch() {
if cache.numSeqs() != 1 {
return nil, ErrUnsupportedBatchLayout
}
if seqTokens := cache.seqTokens(); seqTokens > 0 && nSeqTokens != seqTokens {
return nil, ErrUnsupportedBatchLayout
}
}
batchSize := nSeqTokens
hiddenStates = hiddenStates.Reshape(ctx, hiddenDim, batchSize)
headDim := opts.getHeadDim()
if headDim <= 0 {
return nil, fmt.Errorf("nemotronh: invalid attention head dimension %d", headDim)
}
// Q projection
query := a.Query.Forward(ctx, hiddenStates)
if query.Dim(0)%headDim != 0 {
return nil, fmt.Errorf("nemotronh: query dim %d not divisible by head dim %d", query.Dim(0), headDim)
}
numHeads := query.Dim(0) / headDim
query = query.Reshape(ctx, headDim, numHeads, batchSize)
// K projection
key := a.Key.Forward(ctx, hiddenStates)
if key.Dim(0)%headDim != 0 {
return nil, fmt.Errorf("nemotronh: key dim %d not divisible by head dim %d", key.Dim(0), headDim)
}
numKVHeads := key.Dim(0) / headDim
key = key.Reshape(ctx, headDim, numKVHeads, batchSize)
// V projection
value := a.Value.Forward(ctx, hiddenStates)
if value.Dim(0)%headDim != 0 {
return nil, fmt.Errorf("nemotronh: value dim %d not divisible by head dim %d", value.Dim(0), headDim)
}
if value.Dim(0)/headDim != numKVHeads {
return nil, fmt.Errorf("nemotronh: key heads %d and value heads %d do not match", numKVHeads, value.Dim(0)/headDim)
}
value = value.Reshape(ctx, headDim, numKVHeads, batchSize)
// Standard attention computation (no RoPE)
scale := opts.attentionScale
if scale == 0 {
scale = 1.0 / math.Sqrt(float64(headDim))
}
attention := nn.Attention(ctx, query, key, value, scale, cache)
// Flatten heads
attention = attention.Reshape(ctx, headDim*numHeads, batchSize)
// Output projection
return a.Output.Forward(ctx, attention), nil
}
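Forward above derives per-layer head counts by dividing each projection's output width by the head dimension, which also handles grouped-query layers where K/V are narrower than Q. A sketch of that arithmetic (the `headCounts` helper is hypothetical):

```go
package main

import "fmt"

// headCounts derives query and KV head counts from projection output widths
// and a head dimension, returning ok=false when a width is not divisible by
// headDim — the same validity check Forward performs per projection.
func headCounts(qDim, kvDim, headDim int) (numHeads, numKVHeads int, ok bool) {
	if headDim <= 0 || qDim%headDim != 0 || kvDim%headDim != 0 {
		return 0, 0, false
	}
	return qDim / headDim, kvDim / headDim, true
}

func main() {
	// E.g. a GQA layer with head_dim=128, a 4096-wide Q projection, and a
	// 1024-wide K/V projection gives 32 query heads sharing 8 KV heads.
	nh, nkv, ok := headCounts(4096, 1024, 128)
	fmt.Println(nh, nkv, ok) // 32 8 true
}
```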


@@ -0,0 +1,55 @@
package nemotronh
import (
"errors"
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
)
// ErrUnsupportedBatchLayout is returned when the batch layout is incompatible
// with the layer requirements.
var ErrUnsupportedBatchLayout = errors.New("nemotronh: unsupported batch layout")
var (
_ kvcache.Cache = (*HybridCache)(nil)
_ kvcache.CheckpointCache = (*HybridCache)(nil)
)
// HybridCache adapts the shared recurrent cache base for Nemotron-H naming.
type HybridCache struct {
*kvcache.Recurrent
}
func NewHybridCache(convDim, convChannels, ssmStateSize int) *HybridCache {
base := kvcache.NewRecurrentCache(kvcache.RecurrentConfig{
Shift: Shift,
ConvDim: convDim,
ConvChannels: convChannels,
RecurrentStateSize: ssmStateSize,
CheckpointLogPrefix: "nemotronh",
})
return &HybridCache{Recurrent: base}
}
// SSMState returns the SSM state for current batch sequences.
func (c *HybridCache) SSMState(ctx ml.Context, layer int, dState, headDim, nHead int) (ml.Tensor, error) {
return c.RecurrentState4D(ctx, layer, dState, headDim, nHead)
}
// UpdateSSMState writes a new SSM state for current batch sequences.
func (c *HybridCache) UpdateSSMState(ctx ml.Context, layer int, newState ml.Tensor) {
c.UpdateRecurrentState(ctx, layer, newState)
}
func (c *HybridCache) slotsTensor() ml.Tensor {
return c.SlotsTensor()
}
func (c *HybridCache) seqTokens() int {
return c.SeqTokens()
}
func (c *HybridCache) numSeqs() int {
return c.NumSeqs()
}


@@ -0,0 +1,197 @@
package nemotronh
import (
"log/slog"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
)
// convKernel wraps the 1D convolution kernel tensor
type convKernel struct {
Weight ml.Tensor `gguf:"weight"`
}
// Mamba2 implements the Mamba2 SSM layer for Nemotron-H.
// The forward pass follows llama.cpp's build_mamba2_layer:
// 1. Input projection: zxBCdt = SSMIn @ hidden
// 2. Split: z, xBC, dt
// 3. Concat with conv state, apply SSMConv, save new conv state
// 4. Apply SiLU to convolved xBC
// 5. Split: x, B, C
// 6. Add dt bias
// 7. SSMScan: y = SSMScan(state, x, dt, A, B, C, ids)
// 8. D skip: y = y + x * D
// 9. Swiglu with z: y = z * silu(y)
// 10. Group RMSNorm
// 11. Output projection
type Mamba2 struct {
SSMIn *nn.Linear `gguf:"ssm_in"` // n_embd → d_in_proj (2*d_inner + 2*n_group*d_state + n_head)
SSMConv1D *convKernel `gguf:"ssm_conv1d"` // conv kernel
SSMConv1DB ml.Tensor `gguf:"ssm_conv1d.bias"`
SSMDtB ml.Tensor `gguf:"ssm_dt.bias"` // dt bias [n_head]
SSMA ml.Tensor `gguf:"ssm_a"` // A parameter [1, n_head]
SSMD ml.Tensor `gguf:"ssm_d"` // D skip connection [1, n_head]
SSMNorm *nn.RMSNorm `gguf:"ssm_norm"` // group norm
SSMOut *nn.Linear `gguf:"ssm_out"` // output projection
Layer int
}
func (m *Mamba2) Forward(ctx ml.Context, hiddenStates ml.Tensor, cache *HybridCache, opts *Options) (ml.Tensor, error) {
layer := m.Layer
hiddenDim := hiddenStates.Dim(0)
nSeqTokens := hiddenStates.Dim(1)
switch hiddenStates.Dim(2) {
case 0:
hiddenStates = hiddenStates.Reshape(ctx, hiddenDim, nSeqTokens, 1)
case 1:
default:
return nil, ErrUnsupportedBatchLayout
}
// Nemotron-H is currently clamped to num_parallel=1.
if cache != nil && cache.IsSupportedForBatch() {
if cache.numSeqs() != 1 {
return nil, ErrUnsupportedBatchLayout
}
if seqTokens := cache.seqTokens(); seqTokens > 0 && nSeqTokens != seqTokens {
return nil, ErrUnsupportedBatchLayout
}
}
nSeqs := 1
dConv := opts.ssmDConv
dInner := opts.ssmDInner
dState := opts.ssmDState
nHead := opts.ssmNHead
headDim := dInner / nHead
nGroup := opts.ssmNGroup
// {n_embd, n_seq_tokens, n_seqs} => {d_in_proj, n_seq_tokens, n_seqs}
// d_in_proj = 2*d_inner + 2*n_group*d_state + n_head
zxBCdt := m.SSMIn.Forward(ctx, hiddenStates)
// Split into z, xBC, dt
// z: [head_dim, n_head, n_seq_tokens, n_seqs]
z := zxBCdt.Slice(ctx, 0, 0, dInner, 1)
z = z.Reshape(ctx, headDim, nHead, nSeqTokens, nSeqs)
// xBC: [d_inner + 2*n_group*d_state, n_seq_tokens, n_seqs]
xBCSize := dInner + 2*nGroup*dState
xBC := zxBCdt.Slice(ctx, 0, dInner, dInner+xBCSize, 1)
if nSeqTokens == 1 {
xBC = xBC.Reshape(ctx, xBCSize, 1, nSeqs)
}
// dt: [n_head, n_seq_tokens, n_seqs]
dt := zxBCdt.Slice(ctx, 0, 2*dInner+2*nGroup*dState, 2*dInner+2*nGroup*dState+nHead, 1)
if nSeqTokens == 1 {
dt = dt.Reshape(ctx, nHead, 1, nSeqs)
} else {
dt = dt.Contiguous(ctx, nHead, nSeqTokens, nSeqs)
}
// Get conv state from cache
convStates, err := cache.ConvState(ctx, layer)
if err != nil {
slog.Warn("nemotronh: failed to get conv state, using zeros", "layer", layer, "error", err)
convStates = ctx.Input().Zeros(ml.DTypeF32, dConv-1, xBCSize, nSeqs)
}
// Reshape conv states: [d_conv-1, xBCSize, n_seqs]
convStates = convStates.Reshape(ctx, dConv-1, xBCSize, nSeqs)
// For decode (n_seq_tokens == 1), reshape avoids a transpose/contiguous pair.
var xBCT ml.Tensor
if nSeqTokens == 1 {
xBCT = xBC.Reshape(ctx, 1, xBCSize, nSeqs)
} else {
// Prefill path: [xBCSize, n_seq_tokens, n_seqs] -> [n_seq_tokens, xBCSize, n_seqs]
xBCT = xBC.Permute(ctx, 1, 0, 2, 3)
}
// Concatenate with conv state: [d_conv-1 + n_seq_tokens, xBCSize, n_seqs]
convInput := convStates.Concat(ctx, xBCT, 0)
// Save new conv state (last d_conv-1 columns)
lastConvStates := convInput.Slice(ctx, 0, nSeqTokens, nSeqTokens+dConv-1, 1)
cache.UpdateConvState(ctx, layer, lastConvStates)
// Apply SSM convolution
xBC = convInput.SSMConv(ctx, m.SSMConv1D.Weight)
// Add conv bias
if m.SSMConv1DB != nil {
xBC = xBC.Add(ctx, m.SSMConv1DB)
}
// Apply SiLU
xBC = xBC.SILU(ctx)
// Split xBC into x, B, C
// x: [head_dim, n_head, n_seq_tokens, n_seqs]
x := xBC.Slice(ctx, 0, 0, dInner, 1)
x = x.Reshape(ctx, headDim, nHead, nSeqTokens, nSeqs)
// B: [d_state, n_group, n_seq_tokens, n_seqs]
B := xBC.Slice(ctx, 0, dInner, dInner+nGroup*dState, 1)
B = B.Reshape(ctx, dState, nGroup, nSeqTokens, nSeqs)
// C: [d_state, n_group, n_seq_tokens, n_seqs]
C := xBC.Slice(ctx, 0, dInner+nGroup*dState, dInner+2*nGroup*dState, 1)
C = C.Reshape(ctx, dState, nGroup, nSeqTokens, nSeqs)
// Add dt bias
dt = dt.Add(ctx, m.SSMDtB)
// Get SSM state from cache
state, err := cache.SSMState(ctx, layer, dState, headDim, nHead)
if err != nil {
slog.Warn("nemotronh: failed to get SSM state, using zeros", "layer", layer, "error", err)
state = ctx.Input().Zeros(ml.DTypeF32, dState, headDim, nHead, nSeqs)
}
// SSMScan
// state: [d_state, head_dim, n_head, n_seqs]
// returns: [head_dim, n_head, n_seq_tokens, n_seqs] concatenated with new state
ySsm := state.SSMScan(ctx, x, dt, m.SSMA, B, C, cache.slotsTensor())
// ySsm is a packed 1D buffer: [y (nSeqTokens*headDim*nHead*nSeqs), newState]
yElems := headDim * nHead * nSeqTokens * nSeqs
y := ySsm.View(ctx, 0, yElems).Reshape(ctx, headDim, nHead, nSeqTokens, nSeqs)
stateOffsetBytes := yElems * x.Stride(0)
stateElems := dState * headDim * nHead * nSeqs
newState := ySsm.View(ctx, stateOffsetBytes, stateElems)
newState = newState.Reshape(ctx, dState, headDim, nHead, nSeqs)
// Update SSM state in cache
cache.UpdateSSMState(ctx, layer, newState)
// D skip connection: y = y + x * D
if m.SSMD != nil {
// SSMD shape: [1, n_head] -> broadcast to [head_dim, n_head, n_seq_tokens, n_seqs]
xD := x.Mul(ctx, m.SSMD)
y = y.Add(ctx, xD)
}
// Swiglu with z: y = z * silu(y)
y = z.SILU(ctx, y)
// Group RMSNorm
if m.SSMNorm != nil {
// Reshape for group norm: [d_inner/n_group, n_group, n_seq_tokens, n_seqs]
innerPerGroup := dInner / nGroup
y = y.Reshape(ctx, innerPerGroup, nGroup, nSeqTokens, nSeqs)
y = m.SSMNorm.Forward(ctx, y, opts.eps)
}
// Reshape back to [d_inner, n_seq_tokens, n_seqs]
y = y.Reshape(ctx, dInner, nSeqTokens, nSeqs)
// Output projection
out := m.SSMOut.Forward(ctx, y)
// Reshape to 2D for consistency with attention output
return out.Reshape(ctx, out.Dim(0), nSeqTokens*nSeqs), nil
}
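The SSMScan result is consumed as one packed buffer: the y output (headDim·nHead·nTokens·nSeqs elements) followed by the updated recurrent state (dState·headDim·nHead·nSeqs elements), with the byte offset of the state computed from the element count and stride. A sketch of that layout arithmetic in elements (the `ssmScanLayout` helper is illustrative only):

```go
package main

import "fmt"

// ssmScanLayout computes the element counts for the packed SSMScan buffer
// used above: y first, then the new state; Forward converts the element
// offset to bytes via the tensor stride.
func ssmScanLayout(dState, headDim, nHead, nTokens, nSeqs int) (yElems, stateElems, total int) {
	yElems = headDim * nHead * nTokens * nSeqs
	stateElems = dState * headDim * nHead * nSeqs
	return yElems, stateElems, yElems + stateElems
}

func main() {
	// Single-token decode with dState=128, headDim=64, nHead=80:
	// y is tiny (one token's activations) while the state dominates.
	y, st, total := ssmScanLayout(128, 64, 80, 1, 1)
	fmt.Println(y, st, total) // 5120 655360 660480
}
```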


@@ -0,0 +1,417 @@
package nemotronh
import (
"fmt"
"math"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/model"
"github.com/ollama/ollama/model/input"
"github.com/ollama/ollama/tokenizer"
)
// Options contains model configuration
type Options struct {
hiddenSize int
numHeads int // attention heads
numKVHeads int // KV heads for attention layers
headDim int
eps float32
// Mamba2 SSM config
ssmDConv int // conv kernel size
ssmDInner int // inner dimension (d_inner)
ssmDState int // state dimension
ssmNHead int // number of SSM heads (dt_rank)
ssmNGroup int // number of groups for B, C
// Per-layer configuration
isRecurrent []bool // true = Mamba2, false = attention or FFN
nFF []int // n_ff per layer (0 = attention-only)
// Attention scale
attentionScale float64
// MoE config
numExperts int
numExpertsUsed int
expertWeightsNorm bool
expertWeightsScale float32
expertWeightsNormClip float32
}
func (o Options) getHeadDim() int {
if o.headDim > 0 {
return o.headDim
}
if o.numHeads <= 0 {
return 0
}
return o.hiddenSize / o.numHeads
}
// Operator is the interface for layer operators (Mamba2 or Attention)
type Operator interface {
Forward(ctx ml.Context, hiddenStates ml.Tensor, cache *HybridCache, opts *Options) (ml.Tensor, error)
}
// MLP is the interface for feedforward networks
type MLP interface {
Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *Options) ml.Tensor
}
// Layer represents a single transformer layer
type Layer struct {
AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
Operator Operator // Mamba2, Attention, or nil (for FFN-only layers)
MLP MLP // Dense or MoE FFN, or nil
}
func (l *Layer) Forward(ctx ml.Context, layer int, hiddenStates, outputs ml.Tensor, cache *HybridCache, opts *Options) (ml.Tensor, error) {
residual := hiddenStates
// Pre-layer norm
hiddenStates = l.AttentionNorm.Forward(ctx, hiddenStates, opts.eps)
// Layer operator (Mamba2, Attention, or FFN)
if l.Operator != nil {
var err error
hiddenStates, err = l.Operator.Forward(ctx, hiddenStates, cache, opts)
if err != nil {
return nil, err
}
} else if l.MLP != nil {
// FFN-only layer
hiddenStates = l.MLP.Forward(ctx, hiddenStates, opts)
}
// On the last layer, gather only the rows needed for output
if outputs != nil {
hiddenStates = hiddenStates.Rows(ctx, outputs)
residual = residual.Rows(ctx, outputs)
}
// Residual connection
return hiddenStates.Add(ctx, residual), nil
}
// Model is the main Nemotron-H model
type Model struct {
model.Base
tokenizer.Tokenizer
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
OutputNorm *nn.RMSNorm `gguf:"output_norm"`
Output *nn.Linear `gguf:"output,alt:token_embd"`
Layers []Layer `gguf:"blk"`
*Options
}
// Shift is used for KV cache position shifting.
// Nemotron-H attention does not apply RoPE, so keys do not need to be transformed.
func Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
return key, nil
}
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
hiddenStates := m.TokenEmbedding.Forward(ctx, batch.Inputs)
cache := m.Cache.(*HybridCache)
for i, layer := range m.Layers {
cache.SetLayer(i)
var outputs ml.Tensor
if i == len(m.Layers)-1 {
outputs = batch.Outputs
}
var err error
hiddenStates, err = layer.Forward(ctx, i, hiddenStates, outputs, cache, m.Options)
if err != nil {
return nil, err
}
}
hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, m.eps)
return m.Output.Forward(ctx, hiddenStates), nil
}
func New(c fs.Config) (model.Model, error) {
numLayers := int(c.Uint("block_count"))
layers := make([]Layer, numLayers)
// Get per-layer configuration from GGUF metadata
// Use the same interface pattern as qwen3next
type perLayerConfig interface {
HeadCount() []uint64
HeadCountKV() []uint64
FFNLength() []uint64
}
var headCount []uint64
var headCountKV []uint64
var ffnLength []uint64
if plc, ok := c.(perLayerConfig); ok {
headCount = plc.HeadCount()
headCountKV = plc.HeadCountKV()
ffnLength = plc.FFNLength()
}
// Build per-layer arrays with defaults
isRecurrent := make([]bool, numLayers)
nFF := make([]int, numLayers)
for i := range numLayers {
// Get per-layer values
kvHeads := uint64(1) // Default non-zero
if i < len(headCountKV) {
kvHeads = headCountKV[i]
}
ff := uint64(0)
if i < len(ffnLength) {
ff = ffnLength[i]
}
nFF[i] = int(ff)
// A layer is recurrent IFF n_head_kv == 0 AND n_ff == 0
// This matches llama.cpp behavior for Nemotron-H
isRecurrent[i] = kvHeads == 0 && ff == 0
}
// Determine if MoE
isMoE := c.Uint("expert_count") > 0
for i := range layers {
if isRecurrent[i] {
// Mamba2 layer
layers[i].Operator = &Mamba2{Layer: i}
} else if nFF[i] == 0 {
// Attention-only layer (n_head_kv > 0, n_ff == 0)
layers[i].Operator = &Attention{}
} else {
// FFN layer (n_ff > 0)
if isMoE {
layers[i].MLP = &MoESparse{}
} else {
layers[i].MLP = &Dense{}
}
}
}
// Get attention head configuration
numHeads := int(c.Uint("attention.head_count"))
if numHeads == 0 {
for i := range numLayers {
if i < len(headCount) && i < len(headCountKV) && headCount[i] > 0 && headCountKV[i] > 0 {
numHeads = int(headCount[i])
break
}
}
}
numKVHeads := int(c.Uint("attention.head_count_kv"))
if numKVHeads == 0 {
for i := range numLayers {
if i < len(headCountKV) && i < len(ffnLength) && headCountKV[i] > 0 && ffnLength[i] == 0 {
numKVHeads = int(headCountKV[i])
break
}
}
if numKVHeads == 0 {
numKVHeads = numHeads
}
}
headDim := int(c.Uint("attention.head_dim"))
if headDim == 0 {
if keyLength := int(c.Uint("attention.key_length")); keyLength > 0 {
headDim = keyLength
} else if numHeads > 0 {
headDim = int(c.Uint("embedding_length")) / numHeads
}
}
if headDim <= 0 {
return nil, fmt.Errorf("nemotronh: invalid attention head dimension")
}
if numHeads <= 0 {
// Attention layers derive per-layer head counts from projection weights.
// Keep a non-zero default to avoid invalid option math.
numHeads = 1
}
numExperts := int(c.Uint("expert_count"))
numExpertsUsed := int(c.Uint("expert_used_count"))
if numExperts > 0 {
if numExpertsUsed <= 0 || numExpertsUsed > numExperts {
return nil, fmt.Errorf("nemotronh: invalid expert_used_count=%d for expert_count=%d", numExpertsUsed, numExperts)
}
}
opts := &Options{
hiddenSize: int(c.Uint("embedding_length")),
numHeads: numHeads,
numKVHeads: numKVHeads,
headDim: headDim,
eps: c.Float("attention.layer_norm_rms_epsilon"),
ssmDConv: int(c.Uint("ssm.conv_kernel")),
ssmDInner: int(c.Uint("ssm.inner_size")),
ssmDState: int(c.Uint("ssm.state_size")),
ssmNHead: int(c.Uint("ssm.time_step_rank")),
ssmNGroup: int(c.Uint("ssm.group_count")),
isRecurrent: isRecurrent,
nFF: nFF,
attentionScale: float64(c.Float("attention.scale")),
numExperts: numExperts,
numExpertsUsed: numExpertsUsed,
expertWeightsNorm: c.Bool("expert_weights_norm", false),
expertWeightsScale: c.Float("expert_weights_scale", 1.0),
expertWeightsNormClip: c.Float("expert_weights_norm_clip", 0),
}
// Calculate cache dimensions
convDim := max(0, opts.ssmDConv-1)
convChannels := opts.ssmDInner + 2*opts.ssmNGroup*opts.ssmDState
ssmHeadDim := 0
if opts.ssmNHead > 0 {
ssmHeadDim = opts.ssmDInner / opts.ssmNHead
}
ssmStateSize := opts.ssmDState * ssmHeadDim * opts.ssmNHead
m := Model{
Tokenizer: tokenizer.NewBytePairEncoding(
&tokenizer.Vocabulary{
Values: c.Strings("tokenizer.ggml.tokens"),
Types: c.Ints("tokenizer.ggml.token_type"),
Merges: c.Strings("tokenizer.ggml.merges"),
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", false),
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
EOS: append(
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
c.Ints("tokenizer.ggml.eos_token_ids")...,
),
},
`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
),
Layers: layers,
Options: opts,
}
m.Cache = NewHybridCache(convDim, convChannels, ssmStateSize)
return &m, nil
}
func init() {
model.Register("nemotron_h", New)
model.Register("nemotron_h_moe", New)
}
// Ensure Model implements model.Model
var _ model.Model = (*Model)(nil)
// Dense implements standard feedforward with ReLU-squared activation
type Dense struct {
Up *nn.Linear `gguf:"ffn_up"`
Down *nn.Linear `gguf:"ffn_down"`
}
func (d *Dense) Forward(ctx ml.Context, x ml.Tensor, opts *Options) ml.Tensor {
// up -> ReLU-squared -> down
up := d.Up.Forward(ctx, x)
up = up.RELU(ctx)
up = up.Mul(ctx, up) // Square
return d.Down.Forward(ctx, up)
}
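The ReLU-squared pattern above (RELU followed by an element-wise self-multiply) can be sketched on plain slices; `reluSquared` is an illustrative helper, not part of the model code:

```go
package main

import "fmt"

// reluSquared applies relu(x)^2 element-wise, mirroring the
// Up -> RELU -> Mul(self) -> Down pattern in Dense.Forward.
func reluSquared(xs []float32) []float32 {
	out := make([]float32, len(xs))
	for i, x := range xs {
		if x > 0 {
			out[i] = x * x // relu(x) == x, then squared
		} // else relu(x) == 0, so the square stays 0
	}
	return out
}

func main() {
	fmt.Println(reluSquared([]float32{-2, -0.5, 0, 1, 3})) // [0 0 0 1 9]
}
```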
// MoESparse implements MoE with shared experts for Nemotron-H-MoE
type MoESparse struct {
Router *nn.Linear `gguf:"ffn_gate_inp"`
Up *nn.LinearBatch `gguf:"ffn_up_exps"`
Down *nn.LinearBatch `gguf:"ffn_down_exps"`
Bias ml.Tensor `gguf:"exp_probs_b.bias,alt:exp_probs_b"`
LatentIn *nn.Linear `gguf:"ffn_latent_in"`
LatentOut *nn.Linear `gguf:"ffn_latent_out"`
// Shared experts
SharedUp *nn.Linear `gguf:"ffn_up_shexp"`
SharedDown *nn.Linear `gguf:"ffn_down_shexp"`
}
func (m *MoESparse) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *Options) ml.Tensor {
hiddenDim := hiddenStates.Dim(0)
seqLen := hiddenStates.Dim(1)
batchSize := hiddenStates.Dim(2)
if batchSize == 0 {
batchSize = 1
}
hiddenStates2D := hiddenStates.Reshape(ctx, hiddenDim, seqLen*batchSize)
// Router logits with sigmoid gating
routerLogits := m.Router.Forward(ctx, hiddenStates2D)
// Routing weights come from the unbiased sigmoid probabilities.
probs := routerLogits.Sigmoid(ctx)
// The optional bias steers expert selection only; it never changes the weights.
selectionProbs := probs
if m.Bias != nil {
selectionProbs = selectionProbs.Add(ctx, m.Bias)
}
// Select top-k experts
selectedExperts := selectionProbs.TopK(ctx, opts.numExpertsUsed)
routingWeights := probs.Reshape(ctx, 1, opts.numExperts, hiddenStates2D.Dim(1)).Rows(ctx, selectedExperts)
// Normalize routing weights
if opts.expertWeightsNorm {
routingWeights = routingWeights.Reshape(ctx, opts.numExpertsUsed, hiddenStates2D.Dim(1))
weightsSum := routingWeights.SumRows(ctx)
weightsSum = weightsSum.Clamp(ctx, 6.103515625e-5, float32(math.MaxFloat32))
routingWeights = routingWeights.Div(ctx, weightsSum)
routingWeights = routingWeights.Reshape(ctx, 1, opts.numExpertsUsed, hiddenStates2D.Dim(1))
}
// Scale routing weights
if opts.expertWeightsScale != 1.0 {
routingWeights = routingWeights.Scale(ctx, float64(opts.expertWeightsScale))
}
routedInput := hiddenStates2D
if m.LatentIn != nil {
routedInput = m.LatentIn.Forward(ctx, routedInput)
}
hiddenStates3D := routedInput.Reshape(ctx, routedInput.Dim(0), 1, routedInput.Dim(1))
// Expert computation with ReLU-squared activation
upOut := m.Up.Forward(ctx, hiddenStates3D, selectedExperts)
upOut = upOut.RELU(ctx)
upOut = upOut.Mul(ctx, upOut) // Square
experts := m.Down.Forward(ctx, upOut, selectedExperts)
experts = experts.Mul(ctx, routingWeights)
// Sum over experts
moeOut := experts.View(ctx, 0, experts.Dim(0), experts.Stride(2), experts.Dim(2))
for i := 1; i < opts.numExpertsUsed; i++ {
moeOut = moeOut.Add(ctx, experts.View(ctx, i*experts.Stride(1), experts.Dim(0), experts.Stride(2), experts.Dim(2)))
}
if m.LatentOut != nil {
moeOut = m.LatentOut.Forward(ctx, moeOut)
}
// Add shared experts if present
if m.SharedUp != nil {
sharedUp := m.SharedUp.Forward(ctx, hiddenStates2D)
sharedUp = sharedUp.RELU(ctx)
sharedUp = sharedUp.Mul(ctx, sharedUp) // Square
sharedOut := m.SharedDown.Forward(ctx, sharedUp)
moeOut = moeOut.Add(ctx, sharedOut)
}
return moeOut
}


@@ -2,595 +2,58 @@ package qwen3next
import (
"math"
"slices"
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/model/input"
)
var _ kvcache.Cache = (*HybridCache)(nil)
var (
_ kvcache.Cache = (*HybridCache)(nil)
_ kvcache.CheckpointCache = (*HybridCache)(nil)
)
// HybridCache stores:
// - a standard causal KV cache for full attention layers
// - per-sequence conv state for linear attention layers
// - per-sequence delta state for linear attention layers
//
// Conv state shape (per layer, per sequence): [convKernelSize-1, convChannels]
// Delta state shape (per layer, per sequence): [headVDim, headVDim * numVHeads]
// HybridCache adapts the shared recurrent cache base for Qwen3-Next naming.
type HybridCache struct {
kv *kvcache.Causal
backend ml.Backend
dtype ml.DType
maxSequences int
// Conv state dimensions
convDim int // convKernelSize - 1
convChannels int // d_inner + 2 * num_k_heads * head_k_dim
// Delta state dimensions
deltaStateSize int // headVDim * headVDim * numVHeads
// slot mapping for recurrent state (copy-on-write)
slotForSeq map[int]int
refCount []int
freeSlots []int
// per-layer conv state buffers (allocated lazily)
convCtxs map[int]ml.Context
convStates map[int]ml.Tensor // [convDim*convChannels, maxSlots]
// per-layer delta state buffers (allocated lazily)
deltaCtxs map[int]ml.Context
deltaStates map[int]ml.Tensor // [deltaStateSize, maxSlots]
// recurrent checkpoints (per slot)
checkpointCount int
checkpointMinPos int32
checkpointInterval int32
checkpointCtxSize int
checkpoints map[int]*slotCheckpointStore
pendingRestore map[int]checkpointRestore
curCheckpointPos []int32
curCheckpointSlots map[int]int
reserveCheckpoints bool
checkpointConvCtxs map[int]ml.Context
checkpointDeltaCtxs map[int]ml.Context
checkpointReserved map[int]struct{}
// current forward batch (derived in StartForward)
curSeqs []int
curSlots []int
curSlotsInput ml.Tensor
curSeqTokens int
// track if EnsureWritable has been called for this forward pass
writableEnsured bool
writableError error
*kvcache.Recurrent
}
func NewHybridCache(
shift func(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error),
convDim, convChannels, deltaStateSize int,
) *HybridCache {
return &HybridCache{
kv: kvcache.NewCausalCache(shift),
convDim: convDim,
convChannels: convChannels,
deltaStateSize: deltaStateSize,
slotForSeq: make(map[int]int),
convCtxs: make(map[int]ml.Context),
convStates: make(map[int]ml.Tensor),
deltaCtxs: make(map[int]ml.Context),
deltaStates: make(map[int]ml.Tensor),
checkpointCount: checkpointCountDefault,
checkpointMinPos: checkpointMinPosDefault,
checkpointInterval: checkpointIntervalDefault,
checkpoints: make(map[int]*slotCheckpointStore),
pendingRestore: make(map[int]checkpointRestore),
curCheckpointSlots: make(map[int]int),
checkpointConvCtxs: make(map[int]ml.Context),
checkpointDeltaCtxs: make(map[int]ml.Context),
checkpointReserved: make(map[int]struct{}),
}
base := kvcache.NewRecurrentCache(kvcache.RecurrentConfig{
Shift: shift,
ConvDim: convDim,
ConvChannels: convChannels,
RecurrentStateSize: deltaStateSize,
CheckpointLogPrefix: "qwen3next",
})
return &HybridCache{Recurrent: base}
}
func (c *HybridCache) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity, maxBatch int) {
c.backend = backend
c.dtype = dtype
c.maxSequences = maxSequences
c.checkpoints = make(map[int]*slotCheckpointStore)
c.pendingRestore = make(map[int]checkpointRestore)
c.curCheckpointPos = c.curCheckpointPos[:0]
c.curCheckpointSlots = make(map[int]int)
c.checkpointReserved = make(map[int]struct{})
c.checkpointCtxSize = c.checkpointCount * c.maxSequences
if c.checkpointCtxSize < 8 {
c.checkpointCtxSize = 8
}
// initialize slot allocator
c.refCount = make([]int, maxSequences)
c.freeSlots = c.freeSlots[:0]
for i := maxSequences - 1; i >= 0; i-- {
c.freeSlots = append(c.freeSlots, i)
}
c.kv.Init(backend, dtype, maxSequences, capacity, maxBatch)
}
func (c *HybridCache) Close() {
for _, ctx := range c.convCtxs {
ctx.Close()
}
for _, ctx := range c.deltaCtxs {
ctx.Close()
}
for _, ctx := range c.checkpointConvCtxs {
ctx.Close()
}
for _, ctx := range c.checkpointDeltaCtxs {
ctx.Close()
}
c.kv.Close()
}
func (c *HybridCache) SetConfig(config ml.CacheConfig) {
c.kv.SetConfig(config)
}
func (c *HybridCache) SetLayer(layer int) {
c.kv.SetLayer(layer)
}
func (c *HybridCache) Get(ctx ml.Context) (ml.Tensor, ml.Tensor, ml.Tensor) {
return c.kv.Get(ctx)
}
func (c *HybridCache) Put(ctx ml.Context, key, value ml.Tensor) {
c.kv.Put(ctx, key, value)
}
func (c *HybridCache) StartForward(ctx ml.Context, batch input.Batch, reserve bool) error {
if err := c.kv.StartForward(ctx, batch, reserve); err != nil {
return err
}
// Derive equal-length sequence layout for recurrent layers
seqCounts := make(map[int]int)
c.curSeqs = c.curSeqs[:0]
for _, s := range batch.Sequences {
if _, ok := seqCounts[s]; !ok {
c.curSeqs = append(c.curSeqs, s)
}
seqCounts[s]++
}
if len(c.curSeqs) == 0 {
return nil
}
nTokens := len(batch.Sequences)
nSeqs := len(c.curSeqs)
want := nTokens / nSeqs
for _, s := range c.curSeqs {
if seqCounts[s] != want {
return kvcache.ErrNotSupported
}
}
c.curSeqTokens = want
// When reserving memory for estimation, use fake slot assignments
if reserve {
c.curSlots = c.curSlots[:0]
slots := make([]int32, nSeqs)
for i := range nSeqs {
c.curSlots = append(c.curSlots, i)
slots[i] = int32(i)
}
c.curSlotsInput = ctx.Input().FromInts(slots, len(slots))
c.reserveCheckpoints = true
c.planCheckpoints(batch)
return nil
}
// Ensure slots exist for sequences in this batch
c.curSlots = c.curSlots[:0]
var newSlots []int
for _, s := range c.curSeqs {
slot, ok := c.slotForSeq[s]
if !ok {
var err error
slot, err = c.allocSlot()
if err != nil {
return err
}
c.slotForSeq[s] = slot
c.refCount[slot] = 1
newSlots = append(newSlots, slot)
}
c.curSlots = append(c.curSlots, slot)
}
// Zero state for newly allocated slots
if len(newSlots) > 0 {
c.zeroSlots(ctx, newSlots)
}
// Create a tensor for the current slots
slots := make([]int32, len(c.curSlots))
for i, v := range c.curSlots {
slots[i] = int32(v)
}
c.curSlotsInput = ctx.Input().FromInts(slots, len(slots))
// Reset writable state for new forward pass
c.writableEnsured = false
c.writableError = nil
c.reserveCheckpoints = false
c.planCheckpoints(batch)
return nil
}
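The equal-length layout check in StartForward can be sketched in isolation; `splitBatch` is a hypothetical helper mirroring the seqCounts logic above:

```go
package main

import (
	"errors"
	"fmt"
)

var errUnequal = errors.New("unequal sequence lengths not supported")

// splitBatch derives the ordered unique sequences of a batch and
// verifies every sequence contributes the same number of tokens,
// as the recurrent layers in HybridCache.StartForward require.
func splitBatch(seqIDs []int) (seqs []int, tokensPerSeq int, err error) {
	counts := map[int]int{}
	for _, s := range seqIDs {
		if counts[s] == 0 {
			seqs = append(seqs, s) // first occurrence keeps order
		}
		counts[s]++
	}
	if len(seqs) == 0 {
		return nil, 0, nil
	}
	tokensPerSeq = len(seqIDs) / len(seqs)
	for _, s := range seqs {
		if counts[s] != tokensPerSeq {
			return nil, 0, errUnequal
		}
	}
	return seqs, tokensPerSeq, nil
}

func main() {
	seqs, n, err := splitBatch([]int{7, 7, 9, 9})
	fmt.Println(seqs, n, err)
	_, _, err = splitBatch([]int{7, 7, 9})
	fmt.Println(err)
}
```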
func (c *HybridCache) allocSlot() (int, error) {
if len(c.freeSlots) == 0 {
return 0, kvcache.ErrKvCacheFull
}
slot := c.freeSlots[len(c.freeSlots)-1]
c.freeSlots = c.freeSlots[:len(c.freeSlots)-1]
return slot, nil
}
func (c *HybridCache) freeSlot(slot int) {
if slot >= 0 && slot < c.maxSequences {
c.freeSlots = append(c.freeSlots, slot)
}
}
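allocSlot and freeSlot form a LIFO free list over a fixed pool of recurrent-state slots. A standalone sketch under illustrative names:

```go
package main

import (
	"errors"
	"fmt"
)

var errPoolFull = errors.New("no free slots")

// slotPool is a LIFO free list over a fixed number of slots,
// matching the allocSlot/freeSlot pattern in HybridCache.
type slotPool struct {
	free []int
}

func newSlotPool(n int) *slotPool {
	p := &slotPool{}
	for i := n - 1; i >= 0; i-- { // push in reverse so slot 0 pops first
		p.free = append(p.free, i)
	}
	return p
}

func (p *slotPool) alloc() (int, error) {
	if len(p.free) == 0 {
		return 0, errPoolFull
	}
	s := p.free[len(p.free)-1]
	p.free = p.free[:len(p.free)-1]
	return s, nil
}

func (p *slotPool) release(s int) { p.free = append(p.free, s) }

func main() {
	p := newSlotPool(2)
	a, _ := p.alloc()
	b, _ := p.alloc()
	fmt.Println(a, b) // slot 0 first, then slot 1
	_, err := p.alloc()
	fmt.Println(err)
	p.release(a)
	c, _ := p.alloc() // freed slot is reused immediately
	fmt.Println(c)
}
```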
// zeroSlots zeros the recurrent state for the given slots across all layers.
func (c *HybridCache) zeroSlots(ctx ml.Context, slots []int) {
if len(slots) == 0 {
return
}
inputCtx := ctx.Input()
slotIndices := make([]int32, len(slots))
for i, s := range slots {
slotIndices[i] = int32(s)
}
slotsTensor := inputCtx.FromInts(slotIndices, len(slotIndices))
// Zero conv states
if len(c.convStates) > 0 {
zeros := inputCtx.Zeros(ml.DTypeF32, c.convDim*c.convChannels, len(slots))
for _, buf := range c.convStates {
ctx.Forward(buf.SetRows(ctx, zeros, slotsTensor))
}
}
// Zero delta states
if len(c.deltaStates) > 0 {
zeros := inputCtx.Zeros(ml.DTypeF32, c.deltaStateSize, len(slots))
for _, buf := range c.deltaStates {
ctx.Forward(buf.SetRows(ctx, zeros, slotsTensor))
}
}
}
// EnsureWritable ensures sequences have private slots (copy-on-write).
func (c *HybridCache) EnsureWritable(ctx ml.Context) error {
for i, seq := range c.curSeqs {
slot, ok := c.slotForSeq[seq]
if !ok {
continue
}
if slot < 0 || slot >= len(c.refCount) {
continue
}
if c.refCount[slot] <= 1 {
continue
}
newSlot, err := c.allocSlot()
if err != nil {
return err
}
c.refCount[slot]--
c.refCount[newSlot] = 1
c.slotForSeq[seq] = newSlot
c.curSlots[i] = newSlot
c.copyRecurrentState(ctx, slot, newSlot)
c.copyCheckpoints(ctx, slot, newSlot)
}
// Rebuild current slots tensor
slots := make([]int32, len(c.curSlots))
for i, v := range c.curSlots {
slots[i] = int32(v)
}
c.curSlotsInput = ctx.Input().FromInts(slots, len(slots))
return nil
}
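EnsureWritable only pays for a copy when a slot is actually shared (refcount > 1). The reference-count bookkeeping, with the tensor copies elided, can be sketched as:

```go
package main

import "fmt"

// cowSlots tracks which slot each sequence points at plus a per-slot
// reference count, as in HybridCache.EnsureWritable. The actual
// recurrent-state and checkpoint copies are omitted here.
type cowSlots struct {
	slotForSeq map[int]int
	refCount   []int
	next       int
}

func newCowSlots(n int) *cowSlots {
	return &cowSlots{slotForSeq: map[int]int{}, refCount: make([]int, n)}
}

// share points dst at src's slot without copying any state.
func (c *cowSlots) share(src, dst int) {
	s := c.slotForSeq[src]
	c.slotForSeq[dst] = s
	c.refCount[s]++
}

// ensureWritable detaches seq onto a fresh slot only if its slot is shared.
func (c *cowSlots) ensureWritable(seq int) int {
	s := c.slotForSeq[seq]
	if c.refCount[s] <= 1 {
		return s // already private, nothing to copy
	}
	c.refCount[s]--
	c.next++
	c.slotForSeq[seq] = c.next
	c.refCount[c.next] = 1
	// real code copies the recurrent state and checkpoints here
	return c.next
}

func main() {
	c := newCowSlots(4)
	c.slotForSeq[0] = 0
	c.refCount[0] = 1
	c.share(0, 1)
	fmt.Println(c.ensureWritable(1)) // seq 1 moves to a new slot
	fmt.Println(c.ensureWritable(0)) // seq 0 is now private again
}
```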
func (c *HybridCache) copyRecurrentState(ctx ml.Context, srcSlot, dstSlot int) {
src := ctx.Input().FromInts([]int32{int32(srcSlot)}, 1)
dst := ctx.Input().FromInts([]int32{int32(dstSlot)}, 1)
for _, buf := range c.convStates {
rows := buf.Rows(ctx, src)
rowsF32 := rows.Cast(ctx, ml.DTypeF32)
ctx.Forward(buf.SetRows(ctx, rowsF32, dst))
}
for _, buf := range c.deltaStates {
rows := buf.Rows(ctx, src)
rowsF32 := rows.Cast(ctx, ml.DTypeF32)
ctx.Forward(buf.SetRows(ctx, rowsF32, dst))
}
}
func (c *HybridCache) CopyPrefix(srcSeq, dstSeq int, prefixLen int32) {
c.kv.CopyPrefix(srcSeq, dstSeq, prefixLen)
// Copy-on-write for recurrent state
if dstSlot, ok := c.slotForSeq[dstSeq]; ok {
if c.validSlot(dstSlot) {
c.refCount[dstSlot]--
if c.refCount[dstSlot] <= 0 {
c.refCount[dstSlot] = 0
c.freeSlot(dstSlot)
}
}
delete(c.slotForSeq, dstSeq)
}
srcSlot, ok := c.slotForSeq[srcSeq]
if !ok {
return
}
if c.validSlot(srcSlot) {
c.slotForSeq[dstSeq] = srcSlot
c.refCount[srcSlot]++
}
}
func (c *HybridCache) CanResume(seq int, pos int32) bool {
if !c.kv.CanResume(seq, pos) {
return false
}
if pos == 0 {
return true
}
return c.hasCheckpoint(seq, pos)
}
func (c *HybridCache) Remove(seq int, beginIndex, endIndex int32) error {
if beginIndex > 0 && endIndex != math.MaxInt32 {
return kvcache.ErrNotSupported
}
if beginIndex > 0 {
restore, ok := c.pendingRestore[seq]
if !ok || restore.pos+1 != beginIndex {
return kvcache.ErrNotSupported
}
if !c.restoreComplete(restore) {
return kvcache.ErrNotSupported
}
// If the recurrent slot is shared, detach it before applying a restore.
if slot, ok := c.slotForSeq[seq]; ok && c.validSlot(slot) && c.refCount[slot] > 1 {
newSlot, err := c.allocSlot()
if err != nil {
return err
}
ctx := c.backend.NewContext()
c.copyRecurrentState(ctx, slot, newSlot)
c.copyCheckpoints(ctx, slot, newSlot)
if len(c.convStates) > 0 || len(c.deltaStates) > 0 {
ctx.Compute()
}
ctx.Close()
c.refCount[slot]--
c.refCount[newSlot] = 1
c.slotForSeq[seq] = newSlot
restore.slot = newSlot
c.pendingRestore[seq] = restore
}
}
if err := c.kv.Remove(seq, beginIndex, endIndex); err != nil {
return err
}
if beginIndex > 0 {
restore := c.pendingRestore[seq]
delete(c.pendingRestore, seq)
return c.applyCheckpointRestore(restore)
}
// Removal invalidates recurrent state
slot, ok := c.slotForSeq[seq]
delete(c.pendingRestore, seq)
if !ok {
return nil
}
if !c.validSlot(slot) {
delete(c.slotForSeq, seq)
return nil
}
c.refCount[slot]--
if c.refCount[slot] <= 0 {
c.refCount[slot] = 0
c.clearCheckpoints(slot)
c.freeSlot(slot)
}
delete(c.slotForSeq, seq)
return nil
}
func (c *HybridCache) validSlot(slot int) bool {
return slot >= 0 && slot < len(c.refCount)
}
func (c *HybridCache) slotsTensor() ml.Tensor {
return c.curSlotsInput
}
// contiguousSlots returns the starting slot if current slots are contiguous and ordered.
func (c *HybridCache) contiguousSlots() (int, bool) {
if len(c.curSlots) == 0 {
return 0, false
}
start := c.curSlots[0]
for i, s := range c.curSlots {
if s != start+i {
return 0, false
}
}
return start, true
}
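The contiguity test above enables the single view-plus-copy fast path in UpdateConvState and UpdateDeltaState; in isolation it is just an ascending-run check (illustrative sketch):

```go
package main

import "fmt"

// contiguous reports whether slots form an ascending run s, s+1, ...,
// which lets the caller use one strided view+copy instead of a
// per-row SetRows scatter.
func contiguous(slots []int) (start int, ok bool) {
	if len(slots) == 0 {
		return 0, false
	}
	start = slots[0]
	for i, s := range slots {
		if s != start+i {
			return 0, false
		}
	}
	return start, true
}

func main() {
	fmt.Println(contiguous([]int{3, 4, 5})) // 3 true
	fmt.Println(contiguous([]int{3, 5, 4})) // 0 false
}
```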
func (c *HybridCache) seqTokens() int {
return c.curSeqTokens
}
func (c *HybridCache) numSeqs() int {
return len(c.curSeqs)
}
func (c *HybridCache) convBuffer(ctx ml.Context, layer int) ml.Tensor {
if buf, ok := c.convStates[layer]; ok {
return buf
}
if _, ok := c.convCtxs[layer]; !ok {
c.convCtxs[layer] = c.backend.NewContextSize(1).Layer(layer)
}
// Recurrent state must stay in F32 (ssm_conv kernels are F32-only).
buf := c.convCtxs[layer].Zeros(ml.DTypeF32, c.convDim*c.convChannels, c.maxSequences)
c.convStates[layer] = buf
return buf
}
func (c *HybridCache) deltaBuffer(ctx ml.Context, layer int) ml.Tensor {
if buf, ok := c.deltaStates[layer]; ok {
return buf
}
if _, ok := c.deltaCtxs[layer]; !ok {
c.deltaCtxs[layer] = c.backend.NewContextSize(1).Layer(layer)
}
// Recurrent delta state must stay in F32.
buf := c.deltaCtxs[layer].Zeros(ml.DTypeF32, c.deltaStateSize, c.maxSequences)
c.deltaStates[layer] = buf
return buf
}
func (c *HybridCache) ensureWritableOnce(ctx ml.Context) {
if !c.writableEnsured {
needsWritable := false
for _, seq := range c.curSeqs {
slot, ok := c.slotForSeq[seq]
if !ok {
continue
}
if slot >= 0 && slot < len(c.refCount) && c.refCount[slot] > 1 {
needsWritable = true
break
}
}
if needsWritable {
if err := c.EnsureWritable(ctx); err != nil {
c.writableError = err
}
}
c.writableEnsured = true
}
}
// ConvState returns the conv state for current batch sequences as [convDim, convChannels, nSeqs].
func (c *HybridCache) ConvState(ctx ml.Context, layer int) (ml.Tensor, error) {
c.ensureWritableOnce(ctx)
if c.writableError != nil {
return nil, c.writableError
}
buf := c.convBuffer(ctx, layer)
cur := buf.Rows(ctx, c.slotsTensor())
return cur.Reshape(ctx, c.convDim, c.convChannels, c.numSeqs()), nil
}
// UpdateConvState writes a new conv state for current batch sequences.
func (c *HybridCache) UpdateConvState(ctx ml.Context, layer int, newState ml.Tensor) {
buf := c.convBuffer(ctx, layer)
src := newState.Reshape(ctx, c.convDim*c.convChannels, c.numSeqs())
srcF32 := src.Cast(ctx, ml.DTypeF32)
if start, ok := c.contiguousSlots(); ok {
// Fast path: contiguous slots allow a single view + copy
offset := start * buf.Stride(1)
view := buf.View(ctx, offset, c.convDim*c.convChannels, buf.Stride(1), c.numSeqs())
ctx.Forward(srcF32.Copy(ctx, view))
} else {
ctx.Forward(buf.SetRows(ctx, srcF32, c.slotsTensor()))
}
c.captureConvCheckpoint(ctx, layer, srcF32)
}
// DeltaState returns the delta state for current batch sequences as [headVDim, headVDim*numVHeads, nSeqs].
// DeltaState returns the delta state for current batch sequences as
// [headVDim, headVDim*numVHeads, nSeqs].
func (c *HybridCache) DeltaState(ctx ml.Context, layer int, headVDim, numVHeads int) (ml.Tensor, error) {
c.ensureWritableOnce(ctx)
if c.writableError != nil {
return nil, c.writableError
}
buf := c.deltaBuffer(ctx, layer)
cur := buf.Rows(ctx, c.slotsTensor())
return cur.Reshape(ctx, headVDim, headVDim*numVHeads, c.numSeqs()), nil
return c.RecurrentState(ctx, layer, headVDim, headVDim*numVHeads)
}
// UpdateDeltaState writes a new delta state for current batch sequences.
func (c *HybridCache) UpdateDeltaState(ctx ml.Context, layer int, newState ml.Tensor) {
buf := c.deltaBuffer(ctx, layer)
src := newState.Reshape(ctx, c.deltaStateSize, c.numSeqs())
srcF32 := src.Cast(ctx, ml.DTypeF32)
if start, ok := c.contiguousSlots(); ok {
// Fast path: contiguous slots allow a single view + copy
offset := start * buf.Stride(1)
view := buf.View(ctx, offset, c.deltaStateSize, buf.Stride(1), c.numSeqs())
ctx.Forward(srcF32.Copy(ctx, view))
} else {
ctx.Forward(buf.SetRows(ctx, srcF32, c.slotsTensor()))
c.UpdateRecurrentState(ctx, layer, newState)
}
func (c *HybridCache) seqTokens() int {
return c.SeqTokens()
}
func (c *HybridCache) numSeqs() int {
return c.NumSeqs()
}
// Keep qwen3next behavior for partial mid-sequence removals.
func (c *HybridCache) Remove(seq int, beginIndex, endIndex int32) error {
if beginIndex > 0 && endIndex != math.MaxInt32 {
return kvcache.ErrNotSupported
}
c.captureDeltaCheckpoint(ctx, layer, srcF32)
}
// IsSupportedForBatch returns true if the current batch layout supports recurrent layers.
func (c *HybridCache) IsSupportedForBatch() bool {
return c.curSeqTokens > 0 && len(c.curSeqs) > 0
}
// Seqs returns the ordered unique sequences for the current forward pass.
func (c *HybridCache) Seqs() []int {
return slices.Clone(c.curSeqs)
return c.Recurrent.Remove(seq, beginIndex, endIndex)
}


@@ -37,7 +37,9 @@ type GatedDeltaNet struct {
// Optimized path: pre-split QKV and gate
SSMQKV *nn.Linear `gguf:"attn_qkv"` // -> Q, K, V (concatenated)
SSMQKVGate *nn.Linear `gguf:"attn_gate"` // -> Z gate
SSMBetaAlpha *nn.Linear `gguf:"ssm_ba"` // -> beta, alpha
SSMBetaAlpha *nn.Linear `gguf:"ssm_ba"` // -> beta, alpha (legacy qwen3next)
SSMBeta *nn.Linear `gguf:"ssm_beta"` // -> beta (qwen35)
SSMAlpha *nn.Linear `gguf:"ssm_alpha"` // -> alpha (qwen35)
SSMConv1D *convKernel `gguf:"ssm_conv1d"`
SSMDT ml.Tensor `gguf:"ssm_dt"` // alpha bias
SSMA ml.Tensor `gguf:"ssm_a"` // -A_log.exp()
@@ -96,7 +98,6 @@ func (gdn *GatedDeltaNet) Forward(ctx ml.Context, hiddenStates, _ ml.Tensor, cac
headVDim := opts.ssmDInner / numVHeads
convKernelSize := opts.convKernelSize
mixedBA := gdn.SSMBetaAlpha.Forward(ctx, hiddenStates)
qkvDim := headKDim*numKHeads*2 + headVDim*numVHeads
if gdn.SSMQKV == nil || gdn.SSMQKVGate == nil {
@@ -106,24 +107,40 @@ func (gdn *GatedDeltaNet) Forward(ctx ml.Context, hiddenStates, _ ml.Tensor, cac
qkvMixed := gdn.SSMQKV.Forward(ctx, hiddenStates).Reshape(ctx, qkvDim, nSeqTokens, nSeqs)
z := gdn.SSMQKVGate.Forward(ctx, hiddenStates)
baNewDim := 2 * numVHeads / numKHeads
mixedBAReshaped := mixedBA.Reshape(ctx, baNewDim, numKHeads, nSeqTokens, nSeqs)
var beta ml.Tensor
var alpha ml.Tensor
switch {
case gdn.SSMBetaAlpha != nil:
// Legacy qwen3next path: in_proj_ba packs beta/alpha grouped by K-head.
mixedBA := gdn.SSMBetaAlpha.Forward(ctx, hiddenStates)
baNewDim := 2 * numVHeads / numKHeads
mixedBAReshaped := mixedBA.Reshape(ctx, baNewDim, numKHeads, nSeqTokens, nSeqs)
// Split beta and alpha
betaSize := numVHeads / numKHeads
alphaSize := numVHeads / numKHeads
betaSize := numVHeads / numKHeads
alphaSize := numVHeads / numKHeads
b := mixedBAReshaped.Slice(ctx, 0, 0, betaSize, 1)
a := mixedBAReshaped.Slice(ctx, 0, betaSize, betaSize+alphaSize, 1)
b := mixedBAReshaped.Slice(ctx, 0, 0, betaSize, 1)
a := mixedBAReshaped.Slice(ctx, 0, betaSize, betaSize+alphaSize, 1)
// Reshape to merge head dimensions
beta := b.Contiguous(ctx, numVHeads, 1, nSeqTokens, nSeqs)
alpha := a.Contiguous(ctx, numVHeads, nSeqTokens, nSeqs)
// Keep beta layout consistent with qwen35 and llama.cpp:
// [1, numVHeads, nSeqTokens, nSeqs]
beta = b.Contiguous(ctx, 1, numVHeads, nSeqTokens, nSeqs)
alpha = a.Contiguous(ctx, numVHeads, nSeqTokens, nSeqs)
case gdn.SSMBeta != nil && gdn.SSMAlpha != nil:
// qwen35 path: beta/alpha are separate projections.
beta = gdn.SSMBeta.Forward(ctx, hiddenStates).Reshape(ctx, 1, numVHeads, nSeqTokens, nSeqs)
alpha = gdn.SSMAlpha.Forward(ctx, hiddenStates).Reshape(ctx, numVHeads, nSeqTokens, nSeqs)
default:
return nil, errors.New("qwen3next: missing linear attention beta/alpha projections")
}
// Compute gate: softplus(alpha + dt_bias) * -A
alphaBiased := alpha.Add(ctx, gdn.SSMDT)
alphaSoftplus := alphaBiased.Softplus(ctx)
gate := alphaSoftplus.Mul(ctx, gdn.SSMA)
gate = gate.Reshape(ctx, 1, numVHeads, nSeqTokens, nSeqs)
qkvMixed = qkvMixed.Permute(ctx, 1, 0, 2, 3)
// Get conv state from cache
@@ -172,16 +189,20 @@ func (gdn *GatedDeltaNet) Forward(ctx ml.Context, hiddenStates, _ ml.Tensor, cac
// Repeat interleave Q and K if numKHeads != numVHeads
if numKHeads != numVHeads {
repeatFactor := numVHeads / numKHeads
if opts.vHeadReordered {
qConv = qConv.Repeat4D(ctx, headKDim, numVHeads, nSeqTokens, nSeqs)
kConv = kConv.Repeat4D(ctx, headKDim, numVHeads, nSeqTokens, nSeqs)
} else {
repeatFactor := numVHeads / numKHeads
qReshaped := qConv.Reshape(ctx, headKDim, 1, numKHeads*nSeqTokens*nSeqs)
kReshaped := kConv.Reshape(ctx, headKDim, 1, numKHeads*nSeqTokens*nSeqs)
qReshaped := qConv.Reshape(ctx, headKDim, 1, numKHeads*nSeqTokens*nSeqs)
kReshaped := kConv.Reshape(ctx, headKDim, 1, numKHeads*nSeqTokens*nSeqs)
qRepeated := qReshaped.Repeat4D(ctx, headKDim, repeatFactor, numKHeads*nSeqTokens*nSeqs, 1)
kRepeated := kReshaped.Repeat4D(ctx, headKDim, repeatFactor, numKHeads*nSeqTokens*nSeqs, 1)
qRepeated := qReshaped.Repeat4D(ctx, headKDim, repeatFactor, numKHeads*nSeqTokens*nSeqs, 1)
kRepeated := kReshaped.Repeat4D(ctx, headKDim, repeatFactor, numKHeads*nSeqTokens*nSeqs, 1)
qConv = qRepeated.Reshape(ctx, headKDim, numKHeads*repeatFactor, nSeqTokens, nSeqs)
kConv = kRepeated.Reshape(ctx, headKDim, numKHeads*repeatFactor, nSeqTokens, nSeqs)
qConv = qRepeated.Reshape(ctx, headKDim, numKHeads*repeatFactor, nSeqTokens, nSeqs)
kConv = kRepeated.Reshape(ctx, headKDim, numKHeads*repeatFactor, nSeqTokens, nSeqs)
}
}
// Choose computation mode based on sequence length
@@ -189,7 +210,9 @@ func (gdn *GatedDeltaNet) Forward(ctx ml.Context, hiddenStates, _ ml.Tensor, cac
if nSeqTokens == 1 {
attnOut = gdn.deltaNetAutoregressive(ctx, qConv, kConv, vConv, gate, beta, state, opts, layer, cache)
} else {
// Use pre-computed masks from opts (created once in Model.Forward)
if opts.masks == nil {
opts.masks = createMasks(ctx)
}
attnOut = gdn.deltaNetChunked(ctx, qConv, kConv, vConv, gate, beta, state, opts.masks, opts, layer, cache)
}
@@ -310,9 +333,10 @@ func (gdn *GatedDeltaNet) deltaNetChunked(
q = q.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx, headKDim, nTokens, numVHeads, nSeqs)
k = k.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx, headKDim, nTokens, numVHeads, nSeqs)
v = v.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx, headVDim, nTokens, numVHeads, nSeqs)
gate = gate.Permute(ctx, 2, 0, 3, 1).Contiguous(ctx, nTokens, 1, numVHeads, nSeqs)
beta = beta.Permute(ctx, 2, 0, 1, 3).Contiguous(ctx)
// Match llama.cpp delta-net-base layout:
// gate/beta: [1, numVHeads, nTokens, nSeqs] -> [1, nTokens, numVHeads, nSeqs]
gate = gate.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx, 1, nTokens, numVHeads, nSeqs)
beta = beta.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx, 1, nTokens, numVHeads, nSeqs)
state = state.Reshape(ctx, headVDim, headVDim, numVHeads, nSeqs)
// Compute padding
@@ -324,7 +348,7 @@ func (gdn *GatedDeltaNet) deltaNetChunked(
q = q.Pad(ctx, 0, pad, 0, 0)
k = k.Pad(ctx, 0, pad, 0, 0)
v = v.Pad(ctx, 0, pad, 0, 0)
gate = gate.Pad(ctx, pad, 0, 0, 0)
gate = gate.Pad(ctx, 0, pad, 0, 0)
beta = beta.Pad(ctx, 0, pad, 0, 0)
}
@@ -344,10 +368,12 @@ func (gdn *GatedDeltaNet) deltaNetChunked(
kBeta = kBeta.Reshape(ctx, headKDim, chunkSize, nChunks, numVHeads*nSeqs)
vBeta = vBeta.Reshape(ctx, headVDim, chunkSize, nChunks, numVHeads*nSeqs)
gate = gate.Reshape(ctx, chunkSize, 1, nChunks, numVHeads*nSeqs)
// Reshape gate and cumsum over chunk axis.
// [1, chunkSize, nChunks, H*nSeqs] -> transpose -> [chunkSize, 1, nChunks, H*nSeqs]
gate = gate.Reshape(ctx, 1, chunkSize, nChunks, numVHeads*nSeqs)
// g_cumsum = cumsum(gate)
gCumsum := gate.CumSum(ctx)
gCumsum := gate.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx, chunkSize, 1, nChunks, numVHeads*nSeqs).CumSum(ctx)
// Compute decay mask
gcsI := gCumsum.Reshape(ctx, chunkSize, 1, nChunks, numVHeads*nSeqs)


@@ -1,9 +1,12 @@
package qwen3next
import (
"bytes"
"cmp"
"fmt"
"image"
"math"
"slices"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/ml"
@@ -11,6 +14,7 @@ import (
"github.com/ollama/ollama/ml/nn/rope"
"github.com/ollama/ollama/model"
"github.com/ollama/ollama/model/input"
"github.com/ollama/ollama/model/models/qwen3vl"
"github.com/ollama/ollama/tokenizer"
)
@@ -41,10 +45,15 @@ type Options struct {
ssmNGroup int // num_k_heads
ssmDtRank int // num_v_heads
convKernelSize int // SSM conv kernel size
vHeadReordered bool
// Per-layer type from GGUF metadata
isRecurrent []bool
// RoPE mode config (used by qwen35/qwen35moe)
mropeSections []int
mropeInterleaved bool
// Pre-computed masks for chunked attention (created once per forward pass)
masks *Masks
}
@@ -54,7 +63,17 @@ func (o Options) headDim() int {
}
func (o Options) applyRotaryPositionEmbeddings(ctx ml.Context, states, positions ml.Tensor) ml.Tensor {
opts := []func(*rope.Options){rope.WithTypeNeoX()}
var opts []func(*rope.Options)
if len(o.mropeSections) > 0 {
if o.mropeInterleaved {
opts = append(opts, rope.WithInterleaveMRoPE(o.mropeSections))
} else {
opts = append(opts, rope.WithMRoPE(o.mropeSections))
}
} else {
opts = append(opts, rope.WithTypeNeoX())
}
if o.ropeType == "yarn" {
attnFactor := float32(1.0 / (1.0 + 0.1*math.Log(float64(o.ropeScale))))
opts = append(opts,
@@ -214,20 +233,190 @@ type Model struct {
OutputNorm *nn.RMSNorm `gguf:"output_norm"`
Output *nn.Linear `gguf:"output,alt:token_embd"`
Layers []Layer `gguf:"blk"`
Layers []Layer `gguf:"blk"`
Vision *qwen3vl.VisionModel `gguf:"v"`
ImageProcessor *qwen3vl.ImageProcessor
*Options
positionCache []int32
imageToken int32
visionStart int32
visionEnd int32
spatialMergeSize uint32
}
func (m *Model) mapPosition(id int32) int32 {
if id < int32(len(m.positionCache)) {
return m.positionCache[id]
}
if len(m.positionCache) > 0 {
return id - int32(len(m.positionCache)) + m.positionCache[len(m.positionCache)-1] + 1
}
return id
}
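mapPosition falls back to linear extrapolation past the cached range; a standalone sketch with an illustrative cache:

```go
package main

import "fmt"

// mapPos mirrors Model.mapPosition: ids inside the cache use their
// cached position; ids past the end extrapolate linearly from the
// last cached entry.
func mapPos(cache []int32, id int32) int32 {
	if id < int32(len(cache)) {
		return cache[id]
	}
	if len(cache) > 0 {
		return id - int32(len(cache)) + cache[len(cache)-1] + 1
	}
	return id
}

func main() {
	cache := []int32{0, 1, 2, 2, 2, 2, 4} // an image span repeats its position
	fmt.Println(mapPos(cache, 3)) // inside the cache
	fmt.Println(mapPos(cache, 9)) // past the end: 9 - 7 + 4 + 1 = 7
}
```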
func (m *Model) buildPositions(ctx ml.Context, batch input.Batch) ml.Tensor {
if len(m.mropeSections) == 0 {
return ctx.Input().FromInts(batch.Positions, len(batch.Positions))
}
// ggml MRoPE expects [time, height, width, extra] for each token.
positionSlice := [][]int32{
make([]int32, len(batch.Positions)),
make([]int32, len(batch.Positions)),
make([]int32, len(batch.Positions)),
make([]int32, len(batch.Positions)),
}
for i, id := range batch.Positions {
p := m.mapPosition(id)
positionSlice[0][i] = p
positionSlice[1][i] = p
positionSlice[2][i] = p
}
if m.Vision != nil {
for _, mi := range batch.Multimodal {
grid, ok := mi.Multimodal[0].Data.(*qwen3vl.Grid)
if !ok {
continue
}
w := max(1, grid.Width/int(m.spatialMergeSize))
for i := range mi.Multimodal[0].Tensor.Dim(1) {
positionSlice[1][mi.Index+i] += int32(i / w)
positionSlice[2][mi.Index+i] += int32(i % w)
}
}
}
return ctx.Input().FromInts(slices.Concat(positionSlice...), len(positionSlice[0])*len(positionSlice))
}
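buildPositions emits four parallel MRoPE streams (time, height, width, extra) and offsets the height/width streams inside an image grid. A minimal sketch, assuming a single image span and a pre-merged grid width; names are illustrative:

```go
package main

import "fmt"

// buildMRoPE returns four concatenated position streams
// [time, height, width, extra], as ggml MRoPE expects. Text tokens
// get the same value in t/h/w; image tokens starting at imgStart add
// row/column offsets within a merged grid of width gridW.
func buildMRoPE(positions []int32, imgStart, imgLen, gridW int) []int32 {
	streams := make([][]int32, 4)
	for s := range streams {
		streams[s] = make([]int32, len(positions))
	}
	for i, p := range positions {
		streams[0][i], streams[1][i], streams[2][i] = p, p, p
	}
	for i := 0; i < imgLen; i++ {
		streams[1][imgStart+i] += int32(i / gridW) // row offset
		streams[2][imgStart+i] += int32(i % gridW) // column offset
	}
	out := make([]int32, 0, 4*len(positions))
	for _, s := range streams {
		out = append(out, s...)
	}
	return out
}

func main() {
	// two text tokens, then a 2x2 image patch grid anchored at position 2
	fmt.Println(buildMRoPE([]int32{0, 1, 2, 2, 2, 2}, 2, 4, 2))
}
```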
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
if m.Vision == nil || m.ImageProcessor == nil || len(m.Vision.Layers) == 0 {
return nil, model.ErrNoVisionModel
}
img, _, err := image.Decode(bytes.NewReader(multimodalData))
if err != nil {
return nil, err
}
pixelValues, grid, err := m.ImageProcessor.ProcessImage(ctx, img)
if err != nil {
return nil, err
}
visionOutputs, deepstackVisualEmbeds := m.Vision.Forward(ctx, pixelValues, grid)
mm := []input.Multimodal{{Tensor: visionOutputs, Data: grid}}
for i := range deepstackVisualEmbeds {
mm = append(mm, input.Multimodal{Tensor: deepstackVisualEmbeds[i]})
}
return mm, nil
}
func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
m.positionCache = m.positionCache[:0]
var result []*input.Input
appendInput := func(inp *input.Input, position int32) {
result = append(result, inp)
m.positionCache = append(m.positionCache, position)
}
var p int32
for _, inp := range inputs {
if inp.Multimodal == nil {
appendInput(inp, p)
p++
continue
}
grid := inp.Multimodal[0].Data.(*qwen3vl.Grid)
tokensPerGrid := inp.Multimodal[0].Tensor.Dim(1)
appendInput(&input.Input{
Token: m.visionStart,
SameBatch: tokensPerGrid + 1,
}, p)
p++
appendInput(&input.Input{
Token: m.imageToken,
Multimodal: inp.Multimodal,
MultimodalHash: inp.MultimodalHash,
}, p)
for range tokensPerGrid - 1 {
appendInput(&input.Input{
Token: m.imageToken,
}, p)
}
gridSpan := max(grid.Width/int(m.spatialMergeSize), grid.Height/int(m.spatialMergeSize))
p = p + int32(gridSpan)
appendInput(&input.Input{
Token: m.visionEnd,
}, p)
p++
}
return result, nil
}
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
positions := ctx.Input().FromInts(batch.Positions, len(batch.Positions))
positions := m.buildPositions(ctx, batch)
hiddenStates := m.TokenEmbedding.Forward(ctx, batch.Inputs)
if len(batch.Multimodal) > 0 {
hiddenStates = hiddenStates.Duplicate(ctx)
var deepstackVisualEmbeds []ml.Tensor
for _, mi := range batch.Multimodal {
visionOutputs := mi.Multimodal[0].Tensor
ctx.Forward(visionOutputs.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), visionOutputs.Dim(0)*visionOutputs.Dim(1))))
if len(mi.Multimodal[1:]) > len(deepstackVisualEmbeds) {
deepstackVisualEmbeds = append(deepstackVisualEmbeds, make([]ml.Tensor, len(mi.Multimodal[1:])-len(deepstackVisualEmbeds))...)
}
for i, mm := range mi.Multimodal[1:] {
if deepstackVisualEmbeds[i] == nil {
deepstackVisualEmbeds[i] = ctx.Input().Zeros(mm.Tensor.DType(), hiddenStates.Shape()...)
}
ctx.Forward(mm.Tensor.Copy(ctx, deepstackVisualEmbeds[i].View(ctx, mi.Index*deepstackVisualEmbeds[i].Stride(1), mm.Tensor.Dim(0)*mm.Tensor.Dim(1))))
}
}
cache := m.Cache.(*HybridCache)
m.Options.masks = nil
for i, layer := range m.Layers {
cache.SetLayer(i)
var outputs ml.Tensor
if i == len(m.Layers)-1 {
outputs = batch.Outputs
}
var err error
hiddenStates, err = layer.Forward(ctx, i, hiddenStates, positions, outputs, cache, m.Options)
if err != nil {
return nil, err
}
if i < len(deepstackVisualEmbeds) {
hiddenStates = hiddenStates.Add(ctx, deepstackVisualEmbeds[i])
}
}
hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, m.eps)
return m.Output.Forward(ctx, hiddenStates), nil
}
cache := m.Cache.(*HybridCache)
// Create masks once per forward pass
m.Options.masks = createMasks(ctx)
// Masks are allocated lazily only for chunked recurrent prefill.
m.Options.masks = nil
for i, layer := range m.Layers {
cache.SetLayer(i)
@@ -249,10 +438,17 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
}
func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
m.positionCache = nil
if len(m.mropeSections) > 0 {
shift = shift.Repeat(ctx, 1, 4).Reshape(ctx, -1)
}
return m.applyRotaryPositionEmbeddings(ctx, key, shift), nil
}
var _ model.Model = (*Model)(nil)
var (
_ model.Model = (*Model)(nil)
_ model.MultimodalProcessor = (*Model)(nil)
)
func New(c fs.Config) (model.Model, error) {
numLayers := int(c.Uint("block_count"))
@@ -303,6 +499,22 @@ func New(c fs.Config) (model.Model, error) {
}
}
mropeSections := c.Ints("mrope_sections", nil)
if len(mropeSections) == 0 {
mropeSections = c.Ints("rope.mrope_section", nil)
}
if len(mropeSections) == 0 {
mropeSections = c.Ints("rope.dimension_sections", nil)
}
if len(mropeSections) > 4 {
mropeSections = mropeSections[:4]
}
ropeType := c.String("rope.scaling.type")
if ropeType == "" {
ropeType = c.String("rope.type")
}
opts := &Options{
hiddenSize: int(c.Uint("embedding_length")),
numHeads: int(c.Uint("attention.head_count")),
@@ -318,7 +530,7 @@ func New(c fs.Config) (model.Model, error) {
valueLength: int(c.Uint("attention.value_length")),
ropeDim: int(c.Uint("rope.dimension_count")),
eps: c.Float("attention.layer_norm_rms_epsilon"),
ropeType: c.String("rope.scaling.type"),
ropeType: ropeType,
ropeBase: c.Float("rope.freq_base"),
ropeScale: c.Float("rope.scaling.factor", 1),
originalContextLength: int(c.Uint("rope.scaling.original_context_length")),
@@ -331,7 +543,16 @@ func New(c fs.Config) (model.Model, error) {
ssmNGroup: int(c.Uint("ssm.group_count")),
ssmDtRank: int(c.Uint("ssm.time_step_rank")),
convKernelSize: int(c.Uint("ssm.conv_kernel")),
vHeadReordered: c.Bool("ssm.v_head_reordered", false),
isRecurrent: isRecurrent,
mropeSections: slices.Collect(func(yield func(int) bool) {
for _, section := range mropeSections {
if !yield(int(section)) {
return
}
}
}),
mropeInterleaved: c.Bool("rope.mrope_interleaved", c.Bool("mrope_interleaved", false)),
}
if opts.numKVHeads == 0 {
return nil, fmt.Errorf("qwen3next: attention.head_count_kv array must include at least one non-zero value")
@@ -353,6 +574,19 @@ func New(c fs.Config) (model.Model, error) {
return nil, fmt.Errorf("qwen3next: headKDim (%d) != headVDim (%d) not supported; state computations require equal dimensions", headKDim, headVDim)
}
var vision *qwen3vl.VisionModel
var imageProcessor *qwen3vl.ImageProcessor
if c.Uint("vision.block_count", 0) > 0 {
vision = qwen3vl.NewVisionModel(c)
processor := qwen3vl.NewImageProcessor(c)
imageProcessor = &processor
}
spatialMergeSize := c.Uint("vision.spatial_merge_size", 2)
if spatialMergeSize == 0 {
spatialMergeSize = 2
}
m := Model{
Tokenizer: tokenizer.NewBytePairEncoding(
&tokenizer.Vocabulary{
@@ -371,8 +605,14 @@ func New(c fs.Config) (model.Model, error) {
},
`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
),
Layers: layers,
Options: opts,
Layers: layers,
Vision: vision,
ImageProcessor: imageProcessor,
Options: opts,
imageToken: int32(c.Uint("image_token_id", 151655)),
visionStart: int32(c.Uint("vision_start_token_id", 151652)),
visionEnd: int32(c.Uint("vision_end_token_id", 151653)),
spatialMergeSize: spatialMergeSize,
}
m.Cache = NewHybridCache(m.Shift, convDim, convChannels, deltaStateSize)
@@ -380,5 +620,7 @@ func New(c fs.Config) (model.Model, error) {
}
func init() {
model.Register("qwen35", New)
model.Register("qwen35moe", New)
model.Register("qwen3next", New)
}


@@ -0,0 +1,101 @@
package qwen3next
import (
"testing"
"github.com/ollama/ollama/ml/backend/ggml"
"github.com/ollama/ollama/model/input"
"github.com/ollama/ollama/model/models/qwen3vl"
)
type fakeTensor struct {
*ggml.Tensor
dims []int
}
func (t *fakeTensor) Dim(i int) int {
return t.dims[i]
}
func makeImageInput(hash uint64, width, height, tokens int) *input.Input {
return &input.Input{
Multimodal: []input.Multimodal{{
Tensor: &fakeTensor{dims: []int{1, tokens, 1, 1}},
Data: &qwen3vl.Grid{Width: width, Height: height},
}},
MultimodalHash: hash,
}
}
func TestPostTokenizeMultiImageSpans(t *testing.T) {
m := &Model{
imageToken: 10,
visionStart: 11,
visionEnd: 12,
spatialMergeSize: 2,
}
inputs := []*input.Input{
{Token: 100},
makeImageInput(1, 8, 4, 4),
makeImageInput(2, 4, 8, 4),
{Token: 200},
}
got, err := m.PostTokenize(inputs)
if err != nil {
t.Fatalf("PostTokenize() error = %v", err)
}
want := []struct {
token int32
hash uint64
sameBatch int
hasMM bool
}{
{token: 100},
{token: 11, sameBatch: 5},
{token: 10, hash: 1, hasMM: true},
{token: 10},
{token: 10},
{token: 10},
{token: 12},
{token: 11, sameBatch: 5},
{token: 10, hash: 2, hasMM: true},
{token: 10},
{token: 10},
{token: 10},
{token: 12},
{token: 200},
}
if len(got) != len(want) {
t.Fatalf("len(got) = %d, want %d", len(got), len(want))
}
for i := range want {
if got[i].Token != want[i].token {
t.Fatalf("got[%d].Token = %d, want %d", i, got[i].Token, want[i].token)
}
if got[i].MultimodalHash != want[i].hash {
t.Fatalf("got[%d].MultimodalHash = %d, want %d", i, got[i].MultimodalHash, want[i].hash)
}
if got[i].SameBatch != want[i].sameBatch {
t.Fatalf("got[%d].SameBatch = %d, want %d", i, got[i].SameBatch, want[i].sameBatch)
}
hasMM := len(got[i].Multimodal) > 0
if hasMM != want[i].hasMM {
t.Fatalf("got[%d].hasMM = %v, want %v", i, hasMM, want[i].hasMM)
}
}
wantPositions := []int32{0, 1, 2, 2, 2, 2, 6, 7, 8, 8, 8, 8, 12, 13}
if len(m.positionCache) != len(wantPositions) {
t.Fatalf("len(positionCache) = %d, want %d", len(m.positionCache), len(wantPositions))
}
for i := range wantPositions {
if m.positionCache[i] != wantPositions[i] {
t.Fatalf("positionCache[%d] = %d, want %d", i, m.positionCache[i], wantPositions[i])
}
}
}


@@ -24,8 +24,8 @@ type ImageProcessor struct {
imageStd []float32
}
// newImageProcessor creates a new image processor with default values
func newImageProcessor(c fs.Config) ImageProcessor {
// NewImageProcessor creates a new image processor with default values.
func NewImageProcessor(c fs.Config) ImageProcessor {
patchSize := int(c.Uint("vision.patch_size", 14))
mergeSize := int(c.Uint("vision.spatial_merge_size", 2))


@@ -56,60 +56,46 @@ var (
tokenVisionEnd int32 = 151653
)
type modelInput struct {
*input.Input
position int32
}
// PostTokenize arranges Qwen 3 VL's inputs for the forward pass
func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
m.positionCache = m.positionCache[:0]
return slices.Collect(func(yield func(*input.Input) bool) {
for i := range inputs {
s := []modelInput{{Input: inputs[i]}}
if mm := inputs[i].Multimodal; mm != nil {
t := mm[0].Tensor
s = slices.Repeat([]modelInput{
{
position: int32(i + 1),
Input: &input.Input{Token: tokenVision},
},
}, t.Dim(1)+1+1)
var result []*input.Input
appendInput := func(inp *input.Input, position int32) {
result = append(result, inp)
m.positionCache = append(m.positionCache, position)
}
s[0] = modelInput{
Input: &input.Input{Token: tokenVisionStart},
position: int32(i),
}
s[len(s)-1] = modelInput{
Input: &input.Input{Token: tokenVisionEnd},
position: int32(i + mm[0].Data.(*Grid).Width/m.spatialMergeSize + 1),
}
s[1] = modelInput{
Input: &input.Input{
Token: tokenVision,
Multimodal: inputs[i].Multimodal,
MultimodalHash: inputs[i].MultimodalHash,
SameBatch: t.Dim(1),
},
position: int32(i + 1),
}
}
for _, e := range s {
position := e.position
if position == 0 && len(m.positionCache) > 0 {
position = m.positionCache[len(m.positionCache)-1] + 1
}
m.positionCache = append(m.positionCache, position)
if !yield(e.Input) {
return
}
}
var p int32
for _, inp := range inputs {
if inp.Multimodal == nil {
appendInput(inp, p)
p++
continue
}
}), nil
grid := inp.Multimodal[0].Data.(*Grid)
tokensPerGrid := inp.Multimodal[0].Tensor.Dim(1)
appendInput(&input.Input{Token: tokenVisionStart}, p)
p++
appendInput(&input.Input{
Token: tokenVision,
Multimodal: inp.Multimodal,
MultimodalHash: inp.MultimodalHash,
SameBatch: tokensPerGrid,
}, p)
for range tokensPerGrid - 1 {
appendInput(&input.Input{Token: tokenVision}, p)
}
p = p + int32(grid.Width/m.spatialMergeSize)
appendInput(&input.Input{Token: tokenVisionEnd}, p)
p++
}
return result, nil
}
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
@@ -143,9 +129,13 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
}
}
deepstackVisualEmbeds = make([]ml.Tensor, len(mi.Multimodal[1:]))
if len(mi.Multimodal[1:]) > len(deepstackVisualEmbeds) {
deepstackVisualEmbeds = append(deepstackVisualEmbeds, make([]ml.Tensor, len(mi.Multimodal[1:])-len(deepstackVisualEmbeds))...)
}
for i, mm := range mi.Multimodal[1:] {
deepstackVisualEmbeds[i] = ctx.Input().Zeros(mm.Tensor.DType(), hiddenStates.Shape()...)
if deepstackVisualEmbeds[i] == nil {
deepstackVisualEmbeds[i] = ctx.Input().Zeros(mm.Tensor.DType(), hiddenStates.Shape()...)
}
ctx.Forward(mm.Tensor.Copy(ctx, deepstackVisualEmbeds[i].View(ctx, mi.Index*deepstackVisualEmbeds[i].Stride(1), mm.Tensor.Dim(0)*mm.Tensor.Dim(1))))
}
}
@@ -189,8 +179,8 @@ func New(c fs.Config) (model.Model, error) {
`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
),
TextModel: newTextModel(c),
VisionModel: newVisionModel(c),
ImageProcessor: newImageProcessor(c),
VisionModel: NewVisionModel(c),
ImageProcessor: NewImageProcessor(c),
}
m.Cache = kvcache.NewCausalCache(func(ctx ml.Context, layer int, key, positions ml.Tensor) (ml.Tensor, error) {


@@ -238,8 +238,8 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor, grid *Grid)
return hiddenStates, deepstackStates
}
// newVisionModel creates a new instance of the Qwen vision model
func newVisionModel(c fs.Config) *VisionModel {
// NewVisionModel creates a new instance of the Qwen vision model.
func NewVisionModel(c fs.Config) *VisionModel {
deepstackVisualIndexes := c.Ints("vision.deepstack_visual_indexes")
model := &VisionModel{
Layers: make([]VisionEncoderLayer, c.Uint("vision.block_count", 32)),


@@ -32,6 +32,8 @@ type LFM2Parser struct {
hasThinkingSupport bool
needsThinkingLeadingTrim bool // trim leading whitespace after <think> tag
needsContentLeadingTrim bool // trim leading whitespace after </think> tag
toolNames map[string]struct{}
hasTools bool
}
func (p *LFM2Parser) HasToolSupport() bool {
@@ -63,6 +65,13 @@ func (p *LFM2Parser) setInitialState(lastMessage *api.Message, thinkValue *api.T
}
func (p *LFM2Parser) Init(tools []api.Tool, lastMessage *api.Message, thinkValue *api.ThinkValue) []api.Tool {
p.toolNames = make(map[string]struct{}, len(tools))
p.hasTools = len(tools) > 0
for _, tool := range tools {
if tool.Function.Name != "" {
p.toolNames[tool.Function.Name] = struct{}{}
}
}
p.setInitialState(lastMessage, thinkValue)
return tools
}
@@ -105,9 +114,33 @@ func (p *LFM2Parser) Add(s string, done bool) (content string, thinking string,
}
}
// Fallback for models that emit bare tool calls without <|tool_call_*|> wrappers.
if done && len(toolCalls) == 0 && p.hasTools {
candidate := strings.TrimSpace(contentSb.String())
if fallbackCalls, parseErr := p.parseToolCallsContent(candidate); parseErr == nil && p.toolCallsAllowed(fallbackCalls) {
contentSb.Reset()
toolCalls = append(toolCalls, fallbackCalls...)
}
}
return contentSb.String(), thinkingSb.String(), toolCalls, nil
}
func (p *LFM2Parser) toolCallsAllowed(calls []api.ToolCall) bool {
if len(calls) == 0 {
return false
}
if len(p.toolNames) == 0 {
return true
}
for _, call := range calls {
if _, ok := p.toolNames[call.Function.Name]; !ok {
return false
}
}
return true
}
func (p *LFM2Parser) parseEvents() []lfm2Event {
var all []lfm2Event
@@ -269,36 +302,16 @@ func (p *LFM2Parser) eat() ([]lfm2Event, bool) {
return events, false
}
// parseToolCallsContent parses one or more tool calls from content
// Supports JSON format and Python-style format including multiple calls: [func1(...),func2(...)]
// parseToolCallsContent parses one or more Python-style tool calls.
// Example: [func1(arg='v'), func2(x=1)]
func (p *LFM2Parser) parseToolCallsContent(content string) ([]api.ToolCall, error) {
content = strings.TrimSpace(content)
// Try JSON format first: {"name": "func", "arguments": {...}}
var parsed struct {
Name string `json:"name"`
Arguments json.RawMessage `json:"arguments"`
}
// Be tolerant of malformed outputs that include wrapper tags without proper pairing.
content = strings.TrimSpace(strings.TrimPrefix(content, lfm2ToolCallStartTag))
content = strings.TrimSpace(strings.TrimSuffix(content, lfm2ToolCallEndTag))
if err := json.Unmarshal([]byte(content), &parsed); err == nil && parsed.Name != "" {
var args api.ToolCallFunctionArguments
if len(parsed.Arguments) > 0 {
if err := json.Unmarshal(parsed.Arguments, &args); err != nil {
return nil, err
}
} else {
args = api.NewToolCallFunctionArguments()
}
return []api.ToolCall{{
Function: api.ToolCallFunction{
Name: parsed.Name,
Arguments: args,
},
}}, nil
}
// Try Python-style format: [func(arg1='val1'),func2(arg2='val2')] or func(arg1='val1')
// Parse Python-style format: [func(arg1='val1'),func2(arg2='val2')] or func(arg1='val1')
return p.parsePythonStyleToolCalls(content)
}
@@ -417,21 +430,16 @@ func (p *LFM2Parser) parseToolCallContent(content string) (api.ToolCall, error)
// parsePythonArgs parses Python-style keyword arguments: key='value', key2="value2"
func parsePythonArgs(argsStr string, args *api.ToolCallFunctionArguments) error {
// Simple state machine to parse key='value' pairs
// Handles: command='ls', flag="-la", count=42, enabled=true
var key string
i := 0
for i < len(argsStr) {
// Skip whitespace
for i < len(argsStr) && (argsStr[i] == ' ' || argsStr[i] == '\t' || argsStr[i] == '\n') {
// Skip separators and whitespace.
for i < len(argsStr) && (argsStr[i] == ',' || unicode.IsSpace(rune(argsStr[i]))) {
i++
}
if i >= len(argsStr) {
break
}
// Parse key
keyStart := i
for i < len(argsStr) && argsStr[i] != '=' && argsStr[i] != ',' {
i++
@@ -439,60 +447,238 @@ func parsePythonArgs(argsStr string, args *api.ToolCallFunctionArguments) error
if i >= len(argsStr) || argsStr[i] != '=' {
return errors.New("invalid argument: expected '='")
}
key = strings.TrimSpace(argsStr[keyStart:i])
key := strings.TrimSpace(argsStr[keyStart:i])
if key == "" {
return errors.New("invalid argument: empty key")
}
i++ // skip '='
// Skip whitespace after =
for i < len(argsStr) && (argsStr[i] == ' ' || argsStr[i] == '\t') {
for i < len(argsStr) && unicode.IsSpace(rune(argsStr[i])) {
i++
}
// Parse value
var value string
if i < len(argsStr) && (argsStr[i] == '\'' || argsStr[i] == '"') {
// Quoted string
quote := argsStr[i]
i++
valueStart := i
for i < len(argsStr) && argsStr[i] != quote {
if argsStr[i] == '\\' && i+1 < len(argsStr) {
i += 2 // skip escaped char
} else {
i++
}
}
value = argsStr[valueStart:i]
if i < len(argsStr) {
i++ // skip closing quote
}
args.Set(key, value)
} else {
// Unquoted value (number, bool, etc)
valueStart := i
for i < len(argsStr) && argsStr[i] != ',' {
i++
}
value = strings.TrimSpace(argsStr[valueStart:i])
// Try to parse as number or bool
if v, err := strconv.ParseInt(value, 10, 64); err == nil {
args.Set(key, v)
} else if v, err := strconv.ParseFloat(value, 64); err == nil {
args.Set(key, v)
} else if value == "true" {
args.Set(key, true)
} else if value == "false" {
args.Set(key, false)
} else {
args.Set(key, value)
}
if i >= len(argsStr) {
return errors.New("invalid argument: missing value")
}
// Skip comma and whitespace
for i < len(argsStr) && (argsStr[i] == ',' || argsStr[i] == ' ' || argsStr[i] == '\t' || argsStr[i] == '\n') {
value, next, err := parsePythonArgValue(argsStr, i)
if err != nil {
return err
}
args.Set(key, value)
i = next
// Optional trailing comma before next key/value.
if i < len(argsStr) && argsStr[i] == ',' {
i++
}
}
return nil
}
func parsePythonArgValue(s string, i int) (any, int, error) {
if i >= len(s) {
return nil, i, errors.New("invalid argument: missing value")
}
// Quoted string literal.
if s[i] == '\'' || s[i] == '"' {
quote := s[i]
i++
start := i
for i < len(s) {
if s[i] == '\\' && i+1 < len(s) {
i += 2
continue
}
if s[i] == quote {
value := s[start:i]
i++
return value, i, nil
}
i++
}
return nil, i, errors.New("invalid argument: unterminated string")
}
// Unquoted literal. Consume until top-level comma.
start := i
depthParen, depthSquare, depthCurly := 0, 0, 0
inString := false
var quote byte
escaped := false
for i < len(s) {
ch := s[i]
if inString {
if escaped {
escaped = false
} else if ch == '\\' {
escaped = true
} else if ch == quote {
inString = false
}
i++
continue
}
switch ch {
case '\'', '"':
inString = true
quote = ch
case '(':
depthParen++
case ')':
if depthParen > 0 {
depthParen--
}
case '[':
depthSquare++
case ']':
if depthSquare > 0 {
depthSquare--
}
case '{':
depthCurly++
case '}':
if depthCurly > 0 {
depthCurly--
}
case ',':
if depthParen == 0 && depthSquare == 0 && depthCurly == 0 {
token := strings.TrimSpace(s[start:i])
value, err := parsePythonLiteral(token)
return value, i, err
}
}
i++
}
token := strings.TrimSpace(s[start:i])
value, err := parsePythonLiteral(token)
return value, i, err
}
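The depth-tracking scan above (consume until a comma at bracket depth zero, skipping string literals) can be sketched on its own. `splitTopLevel` is a hypothetical helper name, not part of the parser:

```go
package main

import (
	"fmt"
	"strings"
)

// splitTopLevel splits s on commas, but only commas at bracket depth zero
// and outside quoted strings, so nested lists, dicts, and calls stay
// intact. It mirrors the scanning loop in parsePythonArgValue above.
func splitTopLevel(s string) []string {
	var parts []string
	depth, start := 0, 0
	inStr := false
	var quote byte
	for i := 0; i < len(s); i++ {
		ch := s[i]
		if inStr {
			if ch == '\\' {
				i++ // skip escaped character
			} else if ch == quote {
				inStr = false
			}
			continue
		}
		switch ch {
		case '\'', '"':
			inStr, quote = true, ch
		case '(', '[', '{':
			depth++
		case ')', ']', '}':
			if depth > 0 {
				depth--
			}
		case ',':
			if depth == 0 {
				parts = append(parts, strings.TrimSpace(s[start:i]))
				start = i + 1
			}
		}
	}
	return append(parts, strings.TrimSpace(s[start:]))
}

func main() {
	for _, part := range splitTopLevel(`items=['a','b'], config={'x': 1}, note='a,b'`) {
		fmt.Println(part)
	}
	// items=['a','b']
	// config={'x': 1}
	// note='a,b'
}
```

Commas inside the list, the dict, and the quoted string are all skipped; only the two top-level separators split.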
func parsePythonLiteral(token string) (any, error) {
switch token {
case "":
return "", nil
case "true", "True":
return true, nil
case "false", "False":
return false, nil
case "null", "None":
return nil, nil
}
if v, err := strconv.ParseInt(token, 10, 64); err == nil {
return v, nil
}
if v, err := strconv.ParseFloat(token, 64); err == nil {
return v, nil
}
if strings.HasPrefix(token, "[") || strings.HasPrefix(token, "{") {
var parsed any
if err := json.Unmarshal([]byte(token), &parsed); err == nil {
return parsed, nil
}
if converted, err := pythonLiteralToJSON(token); err == nil {
if err := json.Unmarshal([]byte(converted), &parsed); err == nil {
return parsed, nil
}
}
}
return token, nil
}
func pythonLiteralToJSON(s string) (string, error) {
var out strings.Builder
out.Grow(len(s) + len(s)/8)
inString := false
var quote byte
escaped := false
for i := 0; i < len(s); i++ {
ch := s[i]
if inString {
if escaped {
out.WriteByte(ch)
escaped = false
continue
}
if ch == '\\' {
out.WriteByte(ch)
escaped = true
continue
}
if ch == quote {
out.WriteByte('"')
inString = false
continue
}
if quote == '\'' && ch == '"' {
out.WriteString(`\"`)
continue
}
out.WriteByte(ch)
continue
}
if ch == '\'' || ch == '"' {
inString = true
quote = ch
escaped = false
out.WriteByte('"')
continue
}
// Replace Python identifiers with JSON equivalents when outside strings.
if isIdentStart(ch) {
j := i + 1
for j < len(s) && isIdentPart(s[j]) {
j++
}
ident := s[i:j]
switch ident {
case "True":
out.WriteString("true")
case "False":
out.WriteString("false")
case "None":
out.WriteString("null")
default:
out.WriteString(ident)
}
i = j - 1
continue
}
out.WriteByte(ch)
}
if inString {
return "", errors.New("unterminated string")
}
return out.String(), nil
}
func isIdentStart(b byte) bool {
return (b >= 'A' && b <= 'Z') || (b >= 'a' && b <= 'z') || b == '_'
}
func isIdentPart(b byte) bool {
return isIdentStart(b) || (b >= '0' && b <= '9')
}
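The Python-to-JSON normalization above can be exercised with a trimmed standalone clone. `toJSON` here is a hypothetical simplified version: it requotes single-quoted strings and rewrites the bare identifiers `True`/`False`/`None`, omitting the unterminated-string error handling of the full function:

```go
package main

import "strings"

// toJSON is a simplified standalone sketch of pythonLiteralToJSON:
// single-quoted strings become double-quoted (with inner double quotes
// escaped), and bare True/False/None become true/false/null outside
// string literals. Other bytes pass through unchanged.
func toJSON(s string) string {
	var out strings.Builder
	inStr := false
	var quote byte
	for i := 0; i < len(s); i++ {
		ch := s[i]
		if inStr {
			if ch == '\\' && i+1 < len(s) {
				out.WriteByte(ch)
				i++
				out.WriteByte(s[i])
				continue
			}
			if ch == quote {
				out.WriteByte('"')
				inStr = false
				continue
			}
			if quote == '\'' && ch == '"' {
				out.WriteString(`\"`) // escape double quotes inside single-quoted strings
				continue
			}
			out.WriteByte(ch)
			continue
		}
		if ch == '\'' || ch == '"' {
			inStr, quote = true, ch
			out.WriteByte('"')
			continue
		}
		if isIdentStart(ch) {
			j := i + 1
			for j < len(s) && isIdentPart(s[j]) {
				j++
			}
			switch ident := s[i:j]; ident {
			case "True":
				out.WriteString("true")
			case "False":
				out.WriteString("false")
			case "None":
				out.WriteString("null")
			default:
				out.WriteString(ident)
			}
			i = j - 1
			continue
		}
		out.WriteByte(ch)
	}
	return out.String()
}

func isIdentStart(b byte) bool {
	return (b >= 'A' && b <= 'Z') || (b >= 'a' && b <= 'z') || b == '_'
}

func isIdentPart(b byte) bool { return isIdentStart(b) || (b >= '0' && b <= '9') }
```

With this, `{'enabled': True, 'note': None}` becomes valid JSON that `json.Unmarshal` accepts.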


@@ -39,7 +39,7 @@ func TestLFM2Parser(t *testing.T) {
},
{
name: "tool_call_simple",
input: "I'll check the weather.<|tool_call_start|>{\"name\":\"get_weather\",\"arguments\":{\"location\":\"Paris\"}}<|tool_call_end|>",
input: "I'll check the weather.<|tool_call_start|>[get_weather(location=\"Paris\")]<|tool_call_end|>",
expectedContent: "I'll check the weather.",
expectedCalls: []api.ToolCall{
{
@@ -55,7 +55,7 @@ func TestLFM2Parser(t *testing.T) {
},
{
name: "multiple_tool_calls",
input: "Getting weather for both cities.<|tool_call_start|>{\"name\":\"get_weather\",\"arguments\":{\"location\":\"Paris\"}}<|tool_call_end|><|tool_call_start|>{\"name\":\"get_weather\",\"arguments\":{\"location\":\"London\"}}<|tool_call_end|>",
input: "Getting weather for both cities.<|tool_call_start|>[get_weather(location=\"Paris\")]<|tool_call_end|><|tool_call_start|>[get_weather(location=\"London\")]<|tool_call_end|>",
expectedContent: "Getting weather for both cities.",
expectedCalls: []api.ToolCall{
{
@@ -79,7 +79,7 @@ func TestLFM2Parser(t *testing.T) {
},
{
name: "complex_tool_arguments",
input: "Processing data.<|tool_call_start|>{\"name\":\"process_data\",\"arguments\":{\"items\":[\"item1\",\"item2\"],\"config\":{\"enabled\":true,\"threshold\":0.95}}}<|tool_call_end|>",
input: "Processing data.<|tool_call_start|>[process_data(items=['item1','item2'], config={'enabled': True, 'threshold': 0.95})]<|tool_call_end|>",
expectedContent: "Processing data.",
expectedCalls: []api.ToolCall{
{
@@ -96,7 +96,7 @@ func TestLFM2Parser(t *testing.T) {
},
{
name: "thinking_with_tool_call",
input: "Let me check the weather...</think>I'll get that for you.<|tool_call_start|>{\"name\":\"get_weather\",\"arguments\":{\"location\":\"Paris\"}}<|tool_call_end|>",
input: "Let me check the weather...</think>I'll get that for you.<|tool_call_start|>[get_weather(location=\"Paris\")]<|tool_call_end|>",
expectedThinking: "Let me check the weather...",
expectedContent: "I'll get that for you.",
expectedCalls: []api.ToolCall{
@@ -144,16 +144,16 @@ func TestLFM2Parser(t *testing.T) {
hasThinking: true,
},
{
name: "tool_call_with_unicode_args",
input: "Searching for information.<|tool_call_start|>{\"name\":\"search\",\"arguments\":{\"query\":\"北京天气\",\"language\":\"中文\"}}<|tool_call_end|>",
name: "tool_call_with_text_args",
input: "Searching for information.<|tool_call_start|>[search(query='beijing weather', language='zh')]<|tool_call_end|>",
expectedContent: "Searching for information.",
expectedCalls: []api.ToolCall{
{
Function: api.ToolCallFunction{
Name: "search",
Arguments: testArgs(map[string]any{
"query": "北京天气",
"language": "中文",
"query": "beijing weather",
"language": "zh",
}),
},
},
@@ -169,7 +169,7 @@ func TestLFM2Parser(t *testing.T) {
},
{
name: "empty_tool_call_args",
input: "Pinging server.<|tool_call_start|>{\"name\":\"ping\",\"arguments\":{}}<|tool_call_end|>",
input: "Pinging server.<|tool_call_start|>[ping()]<|tool_call_end|>",
expectedContent: "Pinging server.",
expectedCalls: []api.ToolCall{
{
@@ -353,7 +353,7 @@ func TestLFM2Parser_Streaming(t *testing.T) {
},
{
name: "streaming_tool_call",
chunks: []string{"I'll check weather.", "<|tool_call_start|>", "{\"name\":\"get_weather\",", "\"arguments\":{\"location\":\"Paris\"}}", "<|tool_call_end|>"},
chunks: []string{"I'll check weather.", "<|tool_call_start|>", "[get_weather(", "location=\"Paris\")]", "<|tool_call_end|>"},
expectedContent: "I'll check weather.",
expectedCalls: []api.ToolCall{
{
@@ -381,16 +381,16 @@ func TestLFM2Parser_Streaming(t *testing.T) {
hasThinking: false,
},
{
name: "streaming_tool_call_with_split_json",
chunks: []string{"Processing.", "<|tool_call_start|>{\"name\":\"calc\",\"arguments\":{\"x\":", "42,\"y\":", "24}}<|tool_call_end|>"},
name: "streaming_tool_call_with_split_python",
chunks: []string{"Processing.", "<|tool_call_start|>", "[calc(", "x=42, ", "y=24)]", "<|tool_call_end|>"},
expectedContent: "Processing.",
expectedCalls: []api.ToolCall{
{
Function: api.ToolCallFunction{
Name: "calc",
Arguments: testArgs(map[string]any{
"x": float64(42),
"y": float64(24),
"x": int64(42),
"y": int64(24),
}),
},
},
@@ -516,8 +516,8 @@ func TestLFM2Parser_parseToolCallContent(t *testing.T) {
expectError bool
}{
{
name: "valid_tool_call",
content: `{"name":"get_weather","arguments":{"location":"Paris"}}`,
name: "python_style_single_call",
content: `get_weather(location="Paris")`,
expected: api.ToolCall{
Function: api.ToolCallFunction{
Name: "get_weather",
@@ -528,21 +528,33 @@ func TestLFM2Parser_parseToolCallContent(t *testing.T) {
},
},
{
name: "complex_arguments",
content: `{"name":"process_data","arguments":{"items":["a","b"],"config":{"enabled":true}}}`,
name: "python_style_with_brackets",
content: `[get_weather(location="Paris")]`,
expected: api.ToolCall{
Function: api.ToolCallFunction{
Name: "process_data",
Name: "get_weather",
Arguments: testArgs(map[string]any{
"items": []interface{}{"a", "b"},
"config": map[string]interface{}{"enabled": true},
"location": "Paris",
}),
},
},
},
{
name: "empty_arguments",
content: `{"name":"ping","arguments":{}}`,
name: "python_style_complex_arguments",
content: `process(items=['a', 'b'], config={'enabled': True})`,
expected: api.ToolCall{
Function: api.ToolCallFunction{
Name: "process",
Arguments: testArgs(map[string]any{
"items": []any{"a", "b"},
"config": map[string]any{"enabled": true},
}),
},
},
},
{
name: "python_style_empty_arguments",
content: `ping()`,
expected: api.ToolCall{
Function: api.ToolCallFunction{
Name: "ping",
@@ -551,44 +563,13 @@ func TestLFM2Parser_parseToolCallContent(t *testing.T) {
},
},
{
name: "unicode_in_tool_name",
content: `{"name":"获取天气","arguments":{"城市":"北京"}}`,
expected: api.ToolCall{
Function: api.ToolCallFunction{
Name: "获取天气",
Arguments: testArgs(map[string]any{
"城市": "北京",
}),
},
},
},
{
name: "numeric_arguments",
content: `{"name":"calculate","arguments":{"x":3.14,"y":42,"enabled":true}}`,
expected: api.ToolCall{
Function: api.ToolCallFunction{
Name: "calculate",
Arguments: testArgs(map[string]any{
"x": 3.14,
"y": float64(42),
"enabled": true,
}),
},
},
},
{
name: "invalid_json",
content: `{invalid json}`,
name: "missing_parenthesis",
content: `get_weather location="Paris")`,
expectError: true,
},
{
name: "missing_name",
content: `{"arguments":{"arg":"value"}}`,
expectError: true,
},
{
name: "empty_name",
content: `{"name":"","arguments":{"arg":"value"}}`,
name: "invalid_argument_format",
content: `bash(command)`,
expectError: true,
},
}
@@ -645,6 +626,24 @@ func TestLFM2Parser_parseToolCallsContent(t *testing.T) {
},
},
},
{
name: "python_style_complex_literals",
content: `[AskUserQuestion(question="What's up?", headers=['Hello!', 'How can I help you?'], options=['Debugging help', 'Code writing assistance'], multiSelect=False, metadata={'priority': 1, 'active': True})]`,
expected: []api.ToolCall{
{
Function: api.ToolCallFunction{
Name: "AskUserQuestion",
Arguments: testArgs(map[string]any{
"question": "What's up?",
"headers": []any{"Hello!", "How can I help you?"},
"options": []any{"Debugging help", "Code writing assistance"},
"multiSelect": false,
"metadata": map[string]any{"priority": float64(1), "active": true},
}),
},
},
},
},
{
name: "single_python_style_call",
content: `bash(command='ls -la')`,
@@ -673,6 +672,34 @@ func TestLFM2Parser_parseToolCallsContent(t *testing.T) {
},
},
},
{
name: "single_call_with_orphan_end_tag",
content: `[bash(command='ls')]<|tool_call_end|>`,
expected: []api.ToolCall{
{
Function: api.ToolCallFunction{
Name: "bash",
Arguments: testArgs(map[string]any{
"command": "ls",
}),
},
},
},
},
{
name: "single_call_with_wrapper_tags",
content: `<|tool_call_start|>[bash(command='pwd')]<|tool_call_end|>`,
expected: []api.ToolCall{
{
Function: api.ToolCallFunction{
Name: "bash",
Arguments: testArgs(map[string]any{
"command": "pwd",
}),
},
},
},
},
{
name: "multiple_different_functions",
content: `[get_weather(location='Paris'),search(query='news')]`,
@@ -1086,3 +1113,106 @@ func TestLFM2Parser_EdgeCases(t *testing.T) {
})
}
}
func TestLFM2Parser_BareToolCallFallback(t *testing.T) {
parser := &LFM2Parser{}
tools := []api.Tool{
{
Type: "function",
Function: api.ToolFunction{
Name: "get_weather",
},
},
}
parser.Init(tools, nil, &api.ThinkValue{Value: false})
content, thinking, calls, err := parser.Add(`[get_weather(location="Paris")]`, true)
if err != nil {
t.Fatalf("Add() error = %v", err)
}
if content != "" {
t.Fatalf("expected empty content, got %q", content)
}
if thinking != "" {
t.Fatalf("expected empty thinking, got %q", thinking)
}
if len(calls) != 1 {
t.Fatalf("expected 1 tool call, got %d", len(calls))
}
if calls[0].Function.Name != "get_weather" {
t.Fatalf("expected tool name get_weather, got %q", calls[0].Function.Name)
}
}
func TestLFM2Parser_BareUnknownToolCallDoesNotParse(t *testing.T) {
parser := &LFM2Parser{}
tools := []api.Tool{
{
Type: "function",
Function: api.ToolFunction{
Name: "get_weather",
},
},
}
parser.Init(tools, nil, &api.ThinkValue{Value: false})
input := `[unknown_tool(location="Paris")]`
content, _, calls, err := parser.Add(input, true)
if err != nil {
t.Fatalf("Add() error = %v", err)
}
if content != input {
t.Fatalf("expected content to be preserved, got %q", content)
}
if len(calls) != 0 {
t.Fatalf("expected no tool calls, got %d", len(calls))
}
}
func TestLFM2Parser_ImagePlaceholdersPreserved(t *testing.T) {
tests := []struct {
name string
input string
}{
{
name: "indexed_img_placeholder",
input: "[img-0]describe this image",
},
{
name: "template_image_placeholder",
input: "<image>describe this image",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
parser := &LFM2Parser{}
tools := []api.Tool{
{
Type: "function",
Function: api.ToolFunction{
Name: "bash",
},
},
}
parser.Init(tools, nil, &api.ThinkValue{Value: false})
content, thinking, calls, err := parser.Add(tt.input, true)
if err != nil {
t.Fatalf("Add() error = %v", err)
}
if content != tt.input {
t.Fatalf("expected content %q, got %q", tt.input, content)
}
if thinking != "" {
t.Fatalf("expected empty thinking, got %q", thinking)
}
if len(calls) != 0 {
t.Fatalf("expected no tool calls, got %d", len(calls))
}
})
}
}


@@ -49,6 +49,8 @@ func ParserForName(name string) Parser {
p = &Qwen3Parser{hasThinkingSupport: false, defaultThinking: false}
case "qwen3-thinking":
p = &Qwen3Parser{hasThinkingSupport: true, defaultThinking: true}
case "qwen3.5":
p = &Qwen3Parser{hasThinkingSupport: true, defaultThinking: true}
case "qwen3-coder":
p = &Qwen3CoderParser{}
case "qwen3-vl-instruct":


@@ -57,6 +57,9 @@ func TestBuiltInParsersStillWork(t *testing.T) {
{"qwen3"},
{"qwen3-thinking"},
{"qwen3-coder"},
{"lfm2"},
{"lfm2-thinking"},
{"qwen3.5"},
{"harmony"},
}


@@ -145,3 +145,26 @@ func TestQwen3ParserToolCall(t *testing.T) {
t.Fatalf("expected unit %q, got %v", "celsius", unit)
}
}
func TestQwen35ParserRespectsNoThink(t *testing.T) {
parser := ParserForName("qwen3.5")
if parser == nil {
t.Fatal("expected qwen3.5 parser")
}
parser.Init(nil, nil, &api.ThinkValue{Value: false})
content, thinking, calls, err := parser.Add("Hello! How can I help you today?", true)
if err != nil {
t.Fatalf("parse failed: %v", err)
}
if thinking != "" {
t.Fatalf("expected no thinking, got %q", thinking)
}
if content != "Hello! How can I help you today?" {
t.Fatalf("expected content %q, got %q", "Hello! How can I help you today?", content)
}
if len(calls) != 0 {
t.Fatalf("expected no tool calls, got %d", len(calls))
}
}


@@ -1,7 +1,10 @@
package renderers
import (
"bytes"
"encoding/json"
"fmt"
"sort"
"strings"
"github.com/ollama/ollama/api"
@@ -9,18 +12,222 @@ import (
type LFM2Renderer struct {
IsThinking bool
useImgTags bool
}
const lfm2BOSToken = "<|startoftext|>"
const (
lfm2ToolListStartTag = "<|tool_list_start|>"
lfm2ToolListEndTag = "<|tool_list_end|>"
lfm2ToolCallStartTag = "<|tool_call_start|>"
lfm2ToolCallEndTag = "<|tool_call_end|>"
lfm2ToolResponseStartTag = "<|tool_response_start|>"
lfm2ToolResponseEndTag = "<|tool_response_end|>"
)
func lfm2RenderSystemContent(content any) string {
switch v := content.(type) {
case string:
return v
case []any:
var sb strings.Builder
for _, item := range v {
obj, ok := item.(map[string]any)
if !ok {
continue
}
if itemType, _ := obj["type"].(string); itemType == "text" {
if text, ok := obj["text"].(string); ok {
sb.WriteString(text)
}
}
}
return sb.String()
default:
return ""
}
}
func lfm2JSON(v any) string {
var buf bytes.Buffer
enc := json.NewEncoder(&buf)
enc.SetEscapeHTML(false)
if err := enc.Encode(v); err != nil {
fallback, _ := json.Marshal(v)
return string(fallback)
}
encoded := bytes.TrimSuffix(buf.Bytes(), []byte{'\n'})
// HF `tojson` defaults to `json.dumps(..., separators=None)`, which inserts
// a space after commas and colons.
var out strings.Builder
out.Grow(len(encoded) + len(encoded)/8)
inString := false
escaped := false
for i, b := range encoded {
out.WriteByte(b)
if inString {
if escaped {
escaped = false
continue
}
if b == '\\' {
escaped = true
continue
}
if b == '"' {
inString = false
}
continue
}
if b == '"' {
inString = true
continue
}
if (b == ':' || b == ',') && i+1 < len(encoded) {
next := encoded[i+1]
if next != ' ' && next != '\n' && next != '\r' && next != '\t' {
out.WriteByte(' ')
}
}
}
return out.String()
}
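As the comment above notes, Go's compact JSON encoding differs from HF's `tojson` filter, which follows Python's `json.dumps` defaults (a space after each comma and colon). The post-processing idea can be exercised as a standalone sketch; `addJSONSpaces` is a hypothetical helper name, not part of this diff:

```go
package main

import "fmt"

// addJSONSpaces inserts a space after ':' and ',' outside of string
// literals, mimicking Python json.dumps default separators (", ", ": ").
func addJSONSpaces(encoded string) string {
	out := make([]byte, 0, len(encoded)+len(encoded)/8)
	inString, escaped := false, false
	for i := 0; i < len(encoded); i++ {
		b := encoded[i]
		out = append(out, b)
		if inString {
			if escaped {
				escaped = false
			} else if b == '\\' {
				escaped = true
			} else if b == '"' {
				inString = false
			}
			continue
		}
		if b == '"' {
			inString = true
			continue
		}
		if (b == ':' || b == ',') && i+1 < len(encoded) && encoded[i+1] != ' ' {
			out = append(out, ' ')
		}
	}
	return string(out)
}

func main() {
	// Separators inside string values are left untouched.
	fmt.Println(addJSONSpaces(`{"a":1,"b":"x,y:z"}`))
}
```

Tracking the escape state matters: without it, a `\"` inside a value would be mistaken for the end of the string.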
func lfm2ImagePlaceholder(useImgTags bool) string {
if useImgTags {
return "[img]"
}
return "<image>"
}
func lfm2RenderContent(content any, useImgTags bool) string {
switch v := content.(type) {
case string:
return v
case []any:
var sb strings.Builder
for _, item := range v {
obj, ok := item.(map[string]any)
if !ok {
sb.WriteString(lfm2JSON(item))
continue
}
itemType, _ := obj["type"].(string)
switch itemType {
case "image":
sb.WriteString(lfm2ImagePlaceholder(useImgTags))
case "text":
if text, ok := obj["text"].(string); ok {
sb.WriteString(text)
} else {
sb.WriteString(lfm2JSON(item))
}
default:
sb.WriteString(lfm2JSON(item))
}
}
return sb.String()
default:
return lfm2JSON(content)
}
}
func lfm2ToolSchema(tool api.Tool) any {
if tool.Function.Name == "" {
return tool
}
// LFM2 templates are typically fed function-schema objects (name/description/parameters).
return tool.Function
}
func lfm2ToolCallArgument(v any) string {
return lfm2JSON(v)
}
func lfm2RenderToolCalls(calls []api.ToolCall) string {
var sb strings.Builder
sb.WriteString(lfm2ToolCallStartTag)
sb.WriteString("[")
for i, tc := range calls {
if i > 0 {
sb.WriteString(",")
}
sb.WriteString(tc.Function.Name)
sb.WriteString("(")
keys := make([]string, 0, tc.Function.Arguments.Len())
for key := range tc.Function.Arguments.All() {
keys = append(keys, key)
}
sort.Strings(keys)
for j, key := range keys {
if j > 0 {
sb.WriteString(",")
}
value, _ := tc.Function.Arguments.Get(key)
sb.WriteString(key)
sb.WriteString("=")
sb.WriteString(lfm2ToolCallArgument(value))
}
sb.WriteString(")")
}
sb.WriteString("]")
sb.WriteString(lfm2ToolCallEndTag)
return sb.String()
}
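The tool-call block above is rendered in a Pythonic call syntax rather than as JSON objects, with arguments sorted by key for deterministic output. A self-contained sketch of that formatting (hypothetical `renderCall` helper, simplified from the renderer above):

```go
package main

import (
	"encoding/json"
	"fmt"
	"sort"
	"strings"
)

// renderCall formats one tool call as name(key=<json>,...) with keys
// sorted so the rendered prompt is stable across runs.
func renderCall(name string, args map[string]any) string {
	keys := make([]string, 0, len(args))
	for k := range args {
		keys = append(keys, k)
	}
	sort.Strings(keys)
	parts := make([]string, 0, len(keys))
	for _, k := range keys {
		v, _ := json.Marshal(args[k]) // values are JSON-encoded, e.g. "Paris"
		parts = append(parts, k+"="+string(v))
	}
	return name + "(" + strings.Join(parts, ",") + ")"
}

func main() {
	call := renderCall("get_weather", map[string]any{"location": "Paris", "unit": "celsius"})
	fmt.Println("<|tool_call_start|>[" + call + "]<|tool_call_end|>")
}
```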
func (r *LFM2Renderer) renderMessageContent(message api.Message, imageOffset int) string {
content := lfm2RenderContent(message.Content, r.useImgTags)
if len(message.Images) == 0 {
return content
}
var sb strings.Builder
if r.useImgTags {
for i := range message.Images {
sb.WriteString(fmt.Sprintf("[img-%d]", imageOffset+i))
}
} else {
placeholder := lfm2ImagePlaceholder(false)
if strings.Contains(content, placeholder) {
return content
}
for range message.Images {
sb.WriteString(placeholder)
}
}
sb.WriteString(content)
return sb.String()
}
func (r *LFM2Renderer) Render(messages []api.Message, tools []api.Tool, thinkValue *api.ThinkValue) (string, error) {
var sb strings.Builder
// Follow Liquid tool-use formatting for LFM2 tool wrappers.
sb.WriteString(lfm2BOSToken)
// Extract first system message if present (to combine with tools)
var firstSystemContent string
startIdx := 0
if len(messages) > 0 && messages[0].Role == "system" {
firstSystemContent = lfm2RenderSystemContent(messages[0].Content)
startIdx = 1
}
@@ -29,18 +236,17 @@ func (r *LFM2Renderer) Render(messages []api.Message, tools []api.Tool, thinkVal
if firstSystemContent != "" {
firstSystemContent += "\n"
}
firstSystemContent += "List of tools: "
firstSystemContent += lfm2ToolListStartTag
firstSystemContent += "["
for i, tool := range tools {
firstSystemContent += lfm2JSON(lfm2ToolSchema(tool))
if i < len(tools)-1 {
firstSystemContent += ", "
}
}
firstSystemContent += "]"
firstSystemContent += lfm2ToolListEndTag
}
// Output first system block if it has content
@@ -50,6 +256,8 @@ func (r *LFM2Renderer) Render(messages []api.Message, tools []api.Tool, thinkVal
sb.WriteString("<|im_end|>\n")
}
keepPastThinking := r.IsThinking && (thinkValue != nil && thinkValue.Bool())
// Find the index of the last assistant message for thinking stripping
lastAssistantIndex := -1
for i := len(messages) - 1; i >= startIdx; i-- {
@@ -59,85 +267,53 @@ func (r *LFM2Renderer) Render(messages []api.Message, tools []api.Tool, thinkVal
}
}
imageOffset := 0
for i := range startIdx {
imageOffset += len(messages[i].Images)
}
for i := startIdx; i < len(messages); i++ {
message := messages[i]
lastMessage := i == len(messages)-1
prefill := lastMessage && message.Role == "assistant"
sb.WriteString("<|im_start|>")
sb.WriteString(message.Role)
sb.WriteString("\n")
content := r.renderMessageContent(message, imageOffset)
imageOffset += len(message.Images)
if message.Role == "assistant" && !keepPastThinking && i != lastAssistantIndex {
if idx := strings.LastIndex(content, "</think>"); idx >= 0 {
content = strings.TrimSpace(content[idx+len("</think>"):])
}
}
if message.Role == "assistant" && len(message.ToolCalls) > 0 && !strings.Contains(content, lfm2ToolCallStartTag) {
if strings.TrimSpace(content) == "" {
content = lfm2RenderToolCalls(message.ToolCalls) + content
} else {
content = lfm2RenderToolCalls(message.ToolCalls) + "\n" + content
}
}
if message.Role == "tool" && !strings.Contains(content, lfm2ToolResponseStartTag) {
content = lfm2ToolResponseStartTag + content + lfm2ToolResponseEndTag
}
sb.WriteString(content)
if !prefill {
sb.WriteString("<|im_end|>\n")
}
}
needsGenerationPrompt := true
if len(messages) > 0 && messages[len(messages)-1].Role == "assistant" {
needsGenerationPrompt = false
}
if needsGenerationPrompt {
// RenderWithRenderer uses add_generation_prompt=true for chat rendering,
// unless we're prefilling a trailing assistant message.
sb.WriteString("<|im_start|>assistant\n")
}
return sb.String(), nil

View File

@@ -8,73 +8,136 @@ import (
"github.com/ollama/ollama/api"
)
func TestLFM2Renderer(t *testing.T) {
func TestLFM2Renderer_ChatTemplateParity(t *testing.T) {
tests := []struct {
name string
renderer *LFM2Renderer
messages []api.Message
tools []api.Tool
thinkValue *api.ThinkValue
expected string
}{
{
name: "user_only",
renderer: &LFM2Renderer{IsThinking: false},
messages: []api.Message{
{Role: "user", Content: "Hello"},
},
thinkValue: &api.ThinkValue{Value: false},
expected: "<|startoftext|><|im_start|>user\nHello<|im_end|>\n<|im_start|>assistant\n",
},
{
name: "system_and_user",
renderer: &LFM2Renderer{IsThinking: false},
messages: []api.Message{
{Role: "system", Content: "You are helpful."},
{Role: "user", Content: "Hi"},
},
thinkValue: &api.ThinkValue{Value: false},
expected: "<|startoftext|><|im_start|>system\nYou are helpful.<|im_end|>\n<|im_start|>user\nHi<|im_end|>\n<|im_start|>assistant\n",
},
{
name: "tools_without_system",
renderer: &LFM2Renderer{IsThinking: false},
messages: []api.Message{
{Role: "user", Content: "Use tools"},
},
tools: []api.Tool{
{
Type: "function",
Function: api.ToolFunction{
Name: "get_weather",
Parameters: api.ToolFunctionParameters{
Type: "object",
},
},
},
},
thinkValue: &api.ThinkValue{Value: false},
expected: "<|startoftext|><|im_start|>system\nList of tools: <|tool_list_start|>[{\"name\": \"get_weather\", \"parameters\": {\"type\": \"object\", \"properties\": null}}]<|tool_list_end|><|im_end|>\n" +
"<|im_start|>user\nUse tools<|im_end|>\n<|im_start|>assistant\n",
},
{
name: "first_system_combined_with_tools",
renderer: &LFM2Renderer{IsThinking: false},
messages: []api.Message{
{Role: "system", Content: "Follow instructions."},
{Role: "user", Content: "Do work"},
},
tools: []api.Tool{
{
Type: "function",
Function: api.ToolFunction{
Name: "tool_a",
Parameters: api.ToolFunctionParameters{
Type: "object",
},
},
},
{
Type: "function",
Function: api.ToolFunction{
Name: "tool_b",
Parameters: api.ToolFunctionParameters{
Type: "object",
},
},
},
},
thinkValue: &api.ThinkValue{Value: false},
expected: "<|startoftext|><|im_start|>system\nFollow instructions.\nList of tools: <|tool_list_start|>[{\"name\": \"tool_a\", \"parameters\": {\"type\": \"object\", \"properties\": null}}, {\"name\": \"tool_b\", \"parameters\": {\"type\": \"object\", \"properties\": null}}]<|tool_list_end|><|im_end|>\n" +
"<|im_start|>user\nDo work<|im_end|>\n<|im_start|>assistant\n",
},
{
name: "assistant_tool_calls_and_tool_responses_are_rendered",
renderer: &LFM2Renderer{IsThinking: false},
messages: []api.Message{
{Role: "user", Content: "Call a tool"},
{
Role: "assistant",
Content: "",
ToolCalls: []api.ToolCall{
{
Function: api.ToolCallFunction{
Name: "get_weather",
Arguments: testArgs(map[string]any{
"location": "Paris",
}),
},
},
},
},
{Role: "tool", Content: "22C"},
},
thinkValue: &api.ThinkValue{Value: false},
expected: "<|startoftext|><|im_start|>user\nCall a tool<|im_end|>\n<|im_start|>assistant\n<|tool_call_start|>[get_weather(location=\"Paris\")]<|tool_call_end|><|im_end|>\n<|im_start|>tool\n<|tool_response_start|>22C<|tool_response_end|><|im_end|>\n<|im_start|>assistant\n",
},
{
name: "assistant_tool_calls_with_content_preserves_both",
renderer: &LFM2Renderer{IsThinking: false},
messages: []api.Message{
{Role: "user", Content: "Call a tool"},
{
Role: "assistant",
Content: "Checking now.",
ToolCalls: []api.ToolCall{
{
Function: api.ToolCallFunction{
Name: "get_weather",
Arguments: testArgs(map[string]any{
"location": "Paris",
}),
},
},
},
},
},
thinkValue: &api.ThinkValue{Value: false},
expected: "<|startoftext|><|im_start|>user\nCall a tool<|im_end|>\n<|im_start|>assistant\n<|tool_call_start|>[get_weather(location=\"Paris\")]<|tool_call_end|>\nChecking now.",
},
{
name: "thinking_strips_non_last_assistant_when_disabled",
renderer: &LFM2Renderer{IsThinking: true},
messages: []api.Message{
{Role: "user", Content: "Q1"},
{Role: "assistant", Content: "<think>reason1</think>A1"},
@@ -82,11 +145,11 @@ func TestLFM2Renderer(t *testing.T) {
{Role: "assistant", Content: "<think>reason2</think>A2"},
},
thinkValue: &api.ThinkValue{Value: false},
expected: "<|startoftext|><|im_start|>user\nQ1<|im_end|>\n<|im_start|>assistant\nA1<|im_end|>\n<|im_start|>user\nQ2<|im_end|>\n<|im_start|>assistant\n<think>reason2</think>A2",
},
{
name: "thinking_preserves_past_assistant_when_enabled",
renderer: &LFM2Renderer{IsThinking: true},
messages: []api.Message{
{Role: "user", Content: "Q1"},
{Role: "assistant", Content: "<think>reason1</think>A1"},
@@ -94,334 +157,127 @@ func TestLFM2Renderer(t *testing.T) {
{Role: "assistant", Content: "<think>reason2</think>A2"},
},
thinkValue: &api.ThinkValue{Value: true},
expected: "<|startoftext|><|im_start|>user\nQ1<|im_end|>\n<|im_start|>assistant\n<think>reason1</think>A1<|im_end|>\n<|im_start|>user\nQ2<|im_end|>\n<|im_start|>assistant\n<think>reason2</think>A2",
},
{
name: "arbitrary_roles_are_rendered_verbatim",
renderer: &LFM2Renderer{IsThinking: false},
messages: []api.Message{
{Role: "developer", Content: "Do X"},
{Role: "user", Content: "Hi"},
},
thinkValue: &api.ThinkValue{Value: false},
expected: "<|startoftext|><|im_start|>developer\nDo X<|im_end|>\n<|im_start|>user\nHi<|im_end|>\n<|im_start|>assistant\n",
},
{
name: "empty_messages_still_add_generation_prompt",
renderer: &LFM2Renderer{IsThinking: false},
messages: nil,
thinkValue: &api.ThinkValue{Value: false},
expected: "<|startoftext|><|im_start|>assistant\n",
},
{
name: "assistant_prefill_no_generation_prompt",
renderer: &LFM2Renderer{IsThinking: false},
messages: []api.Message{
{Role: "user", Content: "Hi"},
{Role: "assistant", Content: "Hello"},
},
thinkValue: &api.ThinkValue{Value: false},
expected: "<|startoftext|><|im_start|>user\nHi<|im_end|>\n<|im_start|>assistant\nHello",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
rendered, err := tt.renderer.Render(tt.messages, tt.tools, tt.thinkValue)
if err != nil {
t.Fatalf("Render() error = %v", err)
}
if diff := cmp.Diff(tt.expected, rendered); diff != "" {
t.Fatalf("Render() mismatch (-want +got):\n%s", diff)
}
})
}
}
func TestLFM2Renderer_Images(t *testing.T) {
tests := []struct {
name string
renderer *LFM2Renderer
message api.Message
expected string
}{
{
name: "single_image_default_placeholder",
renderer: &LFM2Renderer{},
message: api.Message{
Role: "user",
Content: "Describe this image.",
Images: []api.ImageData{api.ImageData("img1")},
},
expected: "<|startoftext|><|im_start|>user\n<image>Describe this image.<|im_end|>\n<|im_start|>assistant\n",
},
{
name: "multiple_images_default_placeholder",
renderer: &LFM2Renderer{},
message: api.Message{
Role: "user",
Content: "Describe these images.",
Images: []api.ImageData{api.ImageData("img1"), api.ImageData("img2")},
},
expected: "<|startoftext|><|im_start|>user\n<image><image>Describe these images.<|im_end|>\n<|im_start|>assistant\n",
},
{
name: "single_image_img_tag_placeholder",
renderer: &LFM2Renderer{useImgTags: true},
message: api.Message{
Role: "user",
Content: "Describe this image.",
Images: []api.ImageData{api.ImageData("img1")},
},
expected: "<|startoftext|><|im_start|>user\n[img-0]Describe this image.<|im_end|>\n<|im_start|>assistant\n",
},
{
name: "existing_template_image_placeholder_not_duplicated",
renderer: &LFM2Renderer{},
message: api.Message{
Role: "user",
Content: "<image>Describe this image.",
Images: []api.ImageData{api.ImageData("img1")},
},
expected: "<|startoftext|><|im_start|>user\n<image>Describe this image.<|im_end|>\n<|im_start|>assistant\n",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got, err := tt.renderer.Render([]api.Message{tt.message}, nil, nil)
if err != nil {
t.Fatalf("Render() error = %v", err)
}
if diff := cmp.Diff(tt.expected, got); diff != "" {
t.Fatalf("Render() mismatch (-want +got):\n%s", diff)
}
})
}
}
func TestLFM2Renderer_JSONFormatting(t *testing.T) {
tool := api.Tool{
Type: "function",
Function: api.ToolFunction{
Name: "echo",
Description: "<html>",
Parameters: api.ToolFunctionParameters{
Type: "object",
},
},
}
got := lfm2JSON(tool)
want := "{\"type\": \"function\", \"function\": {\"name\": \"echo\", \"description\": \"<html>\", \"parameters\": {\"type\": \"object\", \"properties\": null}}}"
if diff := cmp.Diff(want, got); diff != "" {
t.Fatalf("lfm2JSON mismatch (-want +got):\n%s", diff)
}
}

View File

@@ -1,6 +1,7 @@
package renderers
import (
"fmt"
"strings"
"github.com/ollama/ollama/api"
@@ -9,10 +10,11 @@ import (
type Qwen3VLRenderer struct {
isThinking bool
emitEmptyThinkOnNoThink bool
useImgTags bool
}
func (r *Qwen3VLRenderer) renderContent(content api.Message, imageOffset int) (string, int) {
// This assumes all images are at the front of the message - same assumption as ollama/ollama/runner.go
var subSb strings.Builder
for range content.Images {
@@ -20,7 +22,8 @@ func (r *Qwen3VLRenderer) renderContent(content api.Message) string {
// model backends, and so we should eventually parameterize this or
// only output a placeholder such as [img]
if r.useImgTags {
subSb.WriteString(fmt.Sprintf("[img-%d]", imageOffset))
imageOffset++
} else {
subSb.WriteString("<|vision_start|><|image_pad|><|vision_end|>")
}
@@ -28,12 +31,17 @@ func (r *Qwen3VLRenderer) renderContent(content api.Message) string {
// TODO: support videos
subSb.WriteString(content.Content)
return subSb.String(), imageOffset
}
func (r *Qwen3VLRenderer) Render(messages []api.Message, tools []api.Tool, think *api.ThinkValue) (string, error) {
var sb strings.Builder
isThinking := r.isThinking
if think != nil {
isThinking = think.Bool()
}
if len(tools) > 0 {
sb.WriteString(imStartTag + "system\n")
if len(messages) > 0 && messages[0].Role == "system" {
@@ -57,7 +65,7 @@ func (r *Qwen3VLRenderer) Render(messages []api.Message, tools []api.Tool, _ *ap
message := messages[i]
if multiStepTool && message.Role == "user" {
// Check if content starts with <tool_response> and ends with </tool_response>
content, _ := r.renderContent(message, 0)
if !(strings.HasPrefix(content, "<tool_response>") && strings.HasSuffix(content, "</tool_response>")) {
multiStepTool = false
lastQueryIndex = i
@@ -65,8 +73,10 @@ func (r *Qwen3VLRenderer) Render(messages []api.Message, tools []api.Tool, _ *ap
}
}
imageOffset := 0
for i, message := range messages {
content, nextImageOffset := r.renderContent(message, imageOffset)
imageOffset = nextImageOffset
lastMessage := i == len(messages)-1
prefill := lastMessage && message.Role == "assistant"
@@ -76,13 +86,13 @@ func (r *Qwen3VLRenderer) Render(messages []api.Message, tools []api.Tool, _ *ap
} else if message.Role == "assistant" {
contentReasoning := ""
-			if r.isThinking {
+			if isThinking {
if message.Thinking != "" {
contentReasoning = message.Thinking
}
}
-			if r.isThinking && i > lastQueryIndex {
+			if isThinking && i > lastQueryIndex {
if i == len(messages)-1 || contentReasoning != "" {
sb.WriteString("<|im_start|>" + message.Role + "\n<think>\n" + strings.Trim(contentReasoning, "\n")) // do we want to add a new line here?
if content != "" {
@@ -125,8 +135,10 @@ func (r *Qwen3VLRenderer) Render(messages []api.Message, tools []api.Tool, _ *ap
// prefill at the end
if lastMessage && !prefill {
sb.WriteString("<|im_start|>assistant\n")
-		if r.isThinking {
+		if isThinking {
sb.WriteString("<think>\n")
} else if r.emitEmptyThinkOnNoThink {
sb.WriteString("<think>\n\n</think>\n\n")
}
}
}
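The changes above thread a running image offset through the messages so every placeholder in a conversation gets a globally unique index. A minimal sketch of that threading (hypothetical helper name, not the actual renderer code):

```go
package main

import (
	"fmt"
	"strings"
)

// renderImagePlaceholders emits one indexed [img-N] tag per image in a
// message, starting at offset, and returns the next free offset so the
// caller can thread it through subsequent messages.
func renderImagePlaceholders(numImages, offset int) (string, int) {
	var sb strings.Builder
	for i := 0; i < numImages; i++ {
		sb.WriteString(fmt.Sprintf("[img-%d]", offset))
		offset++
	}
	return sb.String(), offset
}

func main() {
	offset := 0
	var out string
	// two messages: the first with 2 images, the second with 1
	for _, n := range []int{2, 1} {
		s, next := renderImagePlaceholders(n, offset)
		out += s
		offset = next
	}
	fmt.Println(out) // [img-0][img-1][img-2]
}
```

Returning the next offset instead of mutating shared state keeps renderContent pure, which is what lets the Render loop carry the counter across messages.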


@@ -101,7 +101,7 @@ Let me analyze this image.`,
},
useImgTags: true,
expected: `<|im_start|>user
-[img]Describe this image.<|im_end|>
+[img-0]Describe this image.<|im_end|>
<|im_start|>assistant
Let me analyze this image.`,
},
@@ -123,7 +123,7 @@ Let me analyze this image.`,
},
useImgTags: true,
expected: `<|im_start|>user
-[img][img]Describe these images.<|im_end|>
+[img-0][img-1]Describe these images.<|im_end|>
<|im_start|>assistant
Let me analyze this image.`,
},


@@ -1,6 +1,7 @@
package renderers
import (
+	"strings"
"testing"
"github.com/google/go-cmp/cmp"
@@ -370,3 +371,74 @@ func TestFormatToolCallArgumentThinkingVL(t *testing.T) {
})
}
}
func TestQwen3VLRendererThinkOverride(t *testing.T) {
msgs := []api.Message{
{Role: "user", Content: "Hello"},
}
renderThinking, err := (&Qwen3VLRenderer{isThinking: true}).Render(msgs, nil, nil)
if err != nil {
t.Fatal(err)
}
if !strings.Contains(renderThinking, "<|im_start|>assistant\n<think>\n") {
t.Fatalf("expected default thinking renderer to emit <think>, got:\n%s", renderThinking)
}
renderNonThinking, err := (&Qwen3VLRenderer{isThinking: true}).Render(msgs, nil, &api.ThinkValue{Value: false})
if err != nil {
t.Fatal(err)
}
if strings.Contains(renderNonThinking, "<think>") {
t.Fatalf("expected think=false override to suppress <think>, got:\n%s", renderNonThinking)
}
renderForcedThinking, err := (&Qwen3VLRenderer{isThinking: false}).Render(msgs, nil, &api.ThinkValue{Value: true})
if err != nil {
t.Fatal(err)
}
if !strings.Contains(renderForcedThinking, "<|im_start|>assistant\n<think>\n") {
t.Fatalf("expected think=true override to emit <think>, got:\n%s", renderForcedThinking)
}
}
func TestQwen3VLRendererThinkOverrideWithExplicitNoThinkPrefill(t *testing.T) {
msgs := []api.Message{
{Role: "user", Content: "Hello"},
}
renderNonThinking, err := (&Qwen3VLRenderer{
isThinking: true,
emitEmptyThinkOnNoThink: true,
}).Render(msgs, nil, &api.ThinkValue{Value: false})
if err != nil {
t.Fatal(err)
}
if !strings.Contains(renderNonThinking, "<|im_start|>assistant\n<think>\n\n</think>\n\n") {
t.Fatalf("expected explicit think=false prefill block, got:\n%s", renderNonThinking)
}
}
func TestQwenRendererNameNoThinkBehaviorSplit(t *testing.T) {
msgs := []api.Message{
{Role: "user", Content: "Hello"},
}
thinkFalse := &api.ThinkValue{Value: false}
qwen35Rendered, err := RenderWithRenderer("qwen3.5", msgs, nil, thinkFalse)
if err != nil {
t.Fatal(err)
}
if !strings.Contains(qwen35Rendered, "<|im_start|>assistant\n<think>\n\n</think>\n\n") {
t.Fatalf("expected qwen3.5 renderer to emit explicit no-think prefill, got:\n%s", qwen35Rendered)
}
qwen3VLRendered, err := RenderWithRenderer("qwen3-vl-thinking", msgs, nil, thinkFalse)
if err != nil {
t.Fatal(err)
}
if strings.Contains(qwen3VLRendered, "<|im_start|>assistant\n<think>\n\n</think>\n\n") {
t.Fatalf("expected qwen3-vl-thinking renderer to keep legacy non-empty no-think behavior, got:\n%s", qwen3VLRendered)
}
}
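The new tests pin down a simple precedence rule: an explicit per-request think value, when present, overrides the renderer's compiled-in default. A sketch of that rule under the assumption that the request value resolves to a plain bool (the real code goes through api.ThinkValue.Bool()):

```go
package main

import "fmt"

// resolveThinking applies the precedence the tests above exercise: an
// explicit request-level think value, when present, overrides the
// renderer's default; otherwise the default stands.
func resolveThinking(rendererDefault bool, requestThink *bool) bool {
	if requestThink != nil {
		return *requestThink
	}
	return rendererDefault
}

func main() {
	off, on := false, true
	fmt.Println(resolveThinking(true, nil))  // true: renderer default wins
	fmt.Println(resolveThinking(true, &off)) // false: override suppresses thinking
	fmt.Println(resolveThinking(false, &on)) // true: override forces thinking
}
```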


@@ -56,6 +56,9 @@ func rendererForName(name string) Renderer {
case "qwen3-vl-thinking":
renderer := &Qwen3VLRenderer{isThinking: true, useImgTags: RenderImgTags}
return renderer
+	case "qwen3.5":
+		renderer := &Qwen3VLRenderer{isThinking: true, emitEmptyThinkOnNoThink: true, useImgTags: RenderImgTags}
+		return renderer
case "cogito":
renderer := &CogitoRenderer{isThinking: true}
return renderer
@@ -85,9 +88,9 @@ func rendererForName(name string) Renderer {
case "glm-ocr":
return &GlmOcrRenderer{}
case "lfm2":
-		return &LFM2Renderer{IsThinking: false}
+		return &LFM2Renderer{IsThinking: false, useImgTags: RenderImgTags}
	case "lfm2-thinking":
-		return &LFM2Renderer{IsThinking: true}
+		return &LFM2Renderer{IsThinking: true, useImgTags: RenderImgTags}
default:
return nil
}
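The new qwen3.5 case differs from qwen3-vl-thinking only in emitEmptyThinkOnNoThink, which changes what is appended after the final `<|im_start|>assistant\n`. A sketch of that tail selection (hypothetical helper, distilled from the prefill branch above):

```go
package main

import "fmt"

// assistantPrefill sketches the tail appended after "<|im_start|>assistant\n":
// thinking mode opens a <think> block; with thinking disabled, renderers
// configured with emitEmptyThinkOnNoThink (qwen3.5) emit an explicit empty
// think block, while legacy renderers (qwen3-vl) emit nothing.
func assistantPrefill(isThinking, emitEmptyThinkOnNoThink bool) string {
	switch {
	case isThinking:
		return "<think>\n"
	case emitEmptyThinkOnNoThink:
		return "<think>\n\n</think>\n\n"
	default:
		return ""
	}
}

func main() {
	fmt.Printf("%q\n", assistantPrefill(true, false))  // thinking enabled
	fmt.Printf("%q\n", assistantPrefill(false, true))  // qwen3.5 no-think case
	fmt.Printf("%q\n", assistantPrefill(false, false)) // legacy qwen3-vl case
}
```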


@@ -29,17 +29,27 @@ func TestRegisterCustomRenderer(t *testing.T) {
}
func TestBuiltInRendererStillWorks(t *testing.T) {
// Test that qwen3-coder still works
+	tests := []struct {
+		name string
+	}{
+		{name: "qwen3-coder"},
+		{name: "qwen3.5"},
+	}
messages := []api.Message{
{Role: "user", Content: "Hello"},
}
-	result, err := RenderWithRenderer("qwen3-coder", messages, nil, nil)
-	if err != nil {
-		t.Fatalf("unexpected error: %v", err)
-	}
-	if result == "" {
-		t.Error("expected non-empty result from qwen3-coder renderer")
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result, err := RenderWithRenderer(tt.name, messages, nil, nil)
+			if err != nil {
+				t.Fatalf("unexpected error: %v", err)
+			}
+			if result == "" {
+				t.Fatalf("expected non-empty result from %s renderer", tt.name)
+			}
+		})
+	}
}


@@ -2,6 +2,10 @@
# This script installs Ollama on Linux and macOS.
# It detects the current operating system architecture and installs the appropriate version of Ollama.
+# Wrap script in main function so that a truncated partial download doesn't end
+# up executing half a script.
+main() {
set -eu
red="$( (/usr/bin/tput bold || :; /usr/bin/tput setaf 1 || :) 2>&-)"
@@ -446,3 +450,6 @@ fi
status "NVIDIA GPU ready."
install_success
+}
+main


@@ -86,6 +86,11 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
ID: len(images),
Data: i,
}
+			images = append(images, imgData)
+			if m.Config.Renderer != "" {
+				continue
+			}
imgTag := fmt.Sprintf("[img-%d]", imgData.ID)
if !strings.Contains(prompt, "[img]") {
@@ -93,8 +98,6 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
} else {
prompt = strings.Replace(prompt, "[img]", imgTag, 1)
}
-			images = append(images, imgData)
}
msgs[currMsgIdx+cnt].Content = prefix + prompt
}
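This change makes renderer-backed models skip the legacy placeholder rewriting entirely (the renderer emits its own indexed tags), while template-backed models keep the old behavior. A sketch of that legacy path, with a hypothetical helper name; the exact prepend formatting when no literal `[img]` remains is an assumption:

```go
package main

import (
	"fmt"
	"strings"
)

// tagPrompt sketches the legacy template path: each image either replaces
// the first remaining literal "[img]" placeholder or, when none is left,
// is prepended to the prompt as an indexed tag.
func tagPrompt(prompt string, numImages int) string {
	for id := 0; id < numImages; id++ {
		tag := fmt.Sprintf("[img-%d]", id)
		if strings.Contains(prompt, "[img]") {
			prompt = strings.Replace(prompt, "[img]", tag, 1)
		} else {
			prompt = tag + " " + prompt
		}
	}
	return prompt
}

func main() {
	fmt.Println(tagPrompt("compare [img] and [img]", 2)) // compare [img-0] and [img-1]
	fmt.Println(tagPrompt("describe this", 1))           // [img-0] describe this
}
```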


@@ -9,6 +9,7 @@ import (
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/template"
+	"github.com/ollama/ollama/types/model"
)
func TestChatPrompt(t *testing.T) {
@@ -330,3 +331,38 @@ func TestChatPromptTokenizeCalls(t *testing.T) {
})
}
}
func TestChatPromptRendererDoesNotRewriteMessageContent(t *testing.T) {
msgs := []api.Message{
{
Role: "user",
Content: "what do these photos have in common?",
Images: []api.ImageData{[]byte("img-1"), []byte("img-2"), []byte("img-3")},
},
}
originalContent := msgs[0].Content
m := Model{
Config: model.ConfigV2{Renderer: "qwen3-vl-instruct"},
ProjectorPaths: []string{"vision"},
}
opts := api.Options{Runner: api.Runner{NumCtx: 8192}}
think := false
prompt, images, err := chatPrompt(t.Context(), &m, mockRunner{}.Tokenize, &opts, msgs, nil, &api.ThinkValue{Value: think}, true)
if err != nil {
t.Fatal(err)
}
if msgs[0].Content != originalContent {
t.Fatalf("renderer path should not mutate message content: got %q, want %q", msgs[0].Content, originalContent)
}
if got, want := len(images), 3; got != want {
t.Fatalf("len(images) = %d, want %d", got, want)
}
if prompt == "" {
t.Fatal("prompt is empty")
}
}


@@ -6,6 +6,7 @@ import (
"log/slog"
"maps"
"os"
+	"slices"
"strings"
"unsafe"
@@ -33,6 +34,9 @@ func (q quantizer) WriteTo(w io.Writer) (int64, error) {
slog.Warn("file read error", "tensor", q.from.Name, "file", q.Name(), "error", err)
return 0, fmt.Errorf("unable to read tensor %s from %s: %s", q.from.Name, q.Name(), err)
}
+	if uint64(len(data)) < q.from.Size() {
+		return 0, fmt.Errorf("tensor %s data size %d is less than expected %d from shape %v", q.from.Name, len(data), q.from.Size(), q.from.Shape)
+	}
var f32s []float32
newType := fsggml.TensorType(q.to.Kind)
if fsggml.TensorType(q.from.Kind) == fsggml.TensorTypeF32 {
@@ -58,7 +62,7 @@ func useMoreBits(iLayer, nLayers int) bool {
return iLayer < (nLayers/8) || iLayer >= 7*nLayers/8 || (iLayer-nLayers/8)%3 == 2
}
-func qwen3nextQuantType(name string) (fsggml.TensorType, bool) {
+func qwen3LinearAttnQuantType(name string) (fsggml.TensorType, bool) {
switch {
// Full attention
case strings.HasSuffix(name, ".attn_q.weight"):
@@ -79,6 +83,10 @@ func qwen3nextQuantType(name string) (fsggml.TensorType, bool) {
// SSM
case strings.HasSuffix(name, ".ssm_ba.weight"):
return fsggml.TensorTypeQ4_K, true
+	case strings.HasSuffix(name, ".ssm_beta.weight"):
+		return fsggml.TensorTypeQ4_K, true
+	case strings.HasSuffix(name, ".ssm_alpha.weight"):
+		return fsggml.TensorTypeQ4_K, true
case strings.HasSuffix(name, ".ssm_out.weight"):
return fsggml.TensorTypeQ4_K, true
@@ -287,8 +295,8 @@ func newType(t *fsggml.Tensor, kv fsggml.KV, qs *quantizeState, ftype fsggml.Fil
newType := fsggml.TensorType(t.Kind)
if quantize {
-		if kv.Architecture() == "qwen3next" && (ftype == fsggml.FileTypeQ4_K_M || ftype == fsggml.FileTypeQ4_K_S) {
-			if qt, ok := qwen3nextQuantType(name); ok {
+		if slices.Contains([]string{"qwen3next", "qwen35", "qwen35moe"}, kv.Architecture()) && (ftype == fsggml.FileTypeQ4_K_M || ftype == fsggml.FileTypeQ4_K_S) {
+			if qt, ok := qwen3LinearAttnQuantType(name); ok {
return qt
}
}
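The override extends the qwen3next suffix table to the qwen35/qwen35moe architectures, pinning linear-attention (SSM) and attention projection weights to Q4_K under Q4_K_M/Q4_K_S instead of the generic per-layer heuristics. A sketch of that suffix-driven lookup, using plain strings in place of fsggml.TensorType:

```go
package main

import (
	"fmt"
	"strings"
)

// quantTypeForTensor sketches the suffix-driven override: tensors whose
// names end in one of the pinned suffixes get a fixed Q4_K quantization;
// everything else falls back to the generic heuristics (ok == false).
func quantTypeForTensor(name string) (string, bool) {
	for _, suffix := range []string{
		".attn_q.weight", ".ssm_ba.weight",
		".ssm_beta.weight", ".ssm_alpha.weight", ".ssm_out.weight",
	} {
		if strings.HasSuffix(name, suffix) {
			return "Q4_K", true
		}
	}
	return "", false
}

func main() {
	fmt.Println(quantTypeForTensor("blk.0.ssm_alpha.weight")) // pinned
	fmt.Println(quantTypeForTensor("blk.0.ffn_gate.weight"))  // falls back
}
```

Matching on name suffixes keeps the override independent of layer indices, so the same table covers every block of the model.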


@@ -166,6 +166,60 @@ func TestGetTensorNewType(t *testing.T) {
}
}
func TestQwen3LinearAttentionQuantOverride(t *testing.T) {
cases := []struct {
name string
arch string
tensor string
fileType fsggml.FileType
expected fsggml.TensorType
}{
{
name: "qwen35_beta",
arch: "qwen35",
tensor: "blk.0.ssm_beta.weight",
fileType: fsggml.FileTypeQ4_K_M,
expected: fsggml.TensorTypeQ4_K,
},
{
name: "qwen35_alpha",
arch: "qwen35",
tensor: "blk.0.ssm_alpha.weight",
fileType: fsggml.FileTypeQ4_K_M,
expected: fsggml.TensorTypeQ4_K,
},
{
name: "qwen35moe_attn_qkv",
arch: "qwen35moe",
tensor: "blk.0.attn_qkv.weight",
fileType: fsggml.FileTypeQ4_K_M,
expected: fsggml.TensorTypeQ4_K,
},
{
name: "non_qwen35_falls_back",
arch: "foo",
tensor: "blk.0.attn_qkv.weight",
fileType: fsggml.FileTypeQ4_K_M,
expected: fsggml.TensorTypeQ5_K,
},
}
for _, tt := range cases {
t.Run(tt.name, func(t *testing.T) {
kv := fsggml.KV{"general.architecture": tt.arch}
got := newType(&fsggml.Tensor{
Name: tt.tensor,
Shape: []uint64{256, 256},
Kind: uint32(fsggml.TensorTypeF16),
}, kv, &quantizeState{}, tt.fileType)
if got != tt.expected {
t.Fatalf("unexpected tensor type for %s (%s): got %s want %s", tt.tensor, tt.arch, got, tt.expected)
}
})
}
}
func TestQuantizeModel(t *testing.T) {
cases := []struct {
name string
@@ -173,6 +227,7 @@ func TestQuantizeModel(t *testing.T) {
tensors []*fsggml.Tensor
newType string
expectedTensorTypes map[string]fsggml.TensorType
+		expectErr           bool
}{
{
name: "f16_q4_k",
@@ -253,6 +308,36 @@ func TestQuantizeModel(t *testing.T) {
"output.weight": fsggml.TensorTypeQ8_0,
},
},
{
name: "f32_short_data",
kv: map[string]any{
"general.architecture": "foo",
},
tensors: []*fsggml.Tensor{
{
Name: "blk.0.attn.weight", Kind: uint32(fsggml.TensorTypeF32),
Offset: uint64(0), Shape: []uint64{512, 2},
WriterTo: bytes.NewReader(make([]byte, 32)),
},
},
newType: "Q4_K",
expectErr: true,
},
{
name: "f16_short_data",
kv: map[string]any{
"general.architecture": "foo",
},
tensors: []*fsggml.Tensor{
{
Name: "blk.0.attn.weight", Kind: uint32(fsggml.TensorTypeF16),
Offset: uint64(0), Shape: []uint64{512, 2},
WriterTo: bytes.NewReader(make([]byte, 32)),
},
},
newType: "Q4_K",
expectErr: true,
},
}
for _, tt := range cases {
@@ -264,6 +349,9 @@ func TestQuantizeModel(t *testing.T) {
}
defer fp.Close()
meta, err := fsggml.Decode(fp, -1)
+			if tt.expectErr && err != nil {
+				return
+			}
if err != nil {
t.Fatal(err.Error())
}
@@ -283,6 +371,12 @@ func TestQuantizeModel(t *testing.T) {
}
err = quantize(fp, tmp, meta, ftype, progress)
+			if tt.expectErr {
+				if err == nil {
+					t.Fatal("expected quantize to return an error")
+				}
+				return
+			}
if err != nil {
t.Fatalf("error during quantize: %s", err)
}
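The short-data test cases supply 32 bytes for a 512x2 tensor, which the new size check rejects. A sketch of the arithmetic behind that check: the expected byte size of an unquantized tensor is the element count from its shape times the bytes per element (4 for F32, 2 for F16):

```go
package main

import "fmt"

// expectedBytes computes the minimum raw data size for an unquantized
// tensor: the product of its shape dimensions times bytes per element.
// Buffers shorter than this now fail validation instead of being read
// out of bounds during quantization.
func expectedBytes(shape []uint64, bytesPerElem uint64) uint64 {
	n := uint64(1)
	for _, d := range shape {
		n *= d
	}
	return n * bytesPerElem
}

func main() {
	// the short-data cases above: shape 512x2, but only 32 bytes supplied
	fmt.Println(expectedBytes([]uint64{512, 2}, 4))              // F32: 4096 bytes expected
	fmt.Println(expectedBytes([]uint64{512, 2}, 2))              // F16: 2048 bytes expected
	fmt.Println(uint64(32) < expectedBytes([]uint64{512, 2}, 4)) // true, so quantize errors
}
```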

Some files were not shown because too many files have changed in this diff.