Compare commits

..

18 Commits

Author SHA1 Message Date
Patrick Devine
857cffd22a bugfix: fix crash bug in token cache logic
This change fixes a problem in the token cache logic to avoid panics caused by empty token arrays
by ensuring at least one token remains on full cache hits in the relevant function. This happens
when there is an exact match in the cache on subsequent generations.
2026-02-26 18:35:44 -08:00
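The guard described above can be sketched as follows (illustrative names and a simplified lookup, not Ollama's actual code): on a full cache hit, drop the last cached token so at least one token is always left for the model to process.

```go
package main

import "fmt"

// longestPrefixHit returns how many prompt tokens are covered by the cache.
// Hypothetical helper; the real function lives in Ollama's runner.
func longestPrefixHit(cached, prompt []int) int {
	n := 0
	for n < len(cached) && n < len(prompt) && cached[n] == prompt[n] {
		n++
	}
	// Guard: on a full cache hit, keep at least one token to process.
	// Without this, an exact match on a subsequent generation hands the
	// model an empty token array and the runner panics.
	if n == len(prompt) && n > 0 {
		n--
	}
	return n
}

func main() {
	fmt.Println(longestPrefixHit([]int{1, 2, 3}, []int{1, 2, 3})) // full hit: keep one token
	fmt.Println(longestPrefixHit([]int{1, 2}, []int{1, 2, 3}))    // partial hit: unchanged
}
```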
Jeffrey Morgan
d98dda4676 model: fix qwen3 tool calling in thinking (#14477)
Align Qwen parser behavior with Transformers serve by allowing <tool_call> parsing while still collecting thinking content.

Changes:

- qwen3vl: detect <tool_call> before </think> in thinking state and transition to tool parsing

- qwen3: same thinking-state tool detection and partial-tag overlap handling

- tests: update qwen3vl thinking/tool interleaving expectations

- tests: add qwen3 cases for tool call before </think> and split <tool_call> streaming
2026-02-26 16:13:18 -08:00
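The state transition this commit adds can be sketched roughly as below (a heavily simplified model with made-up names; the real qwen3 parser also handles partial tags split across streamed chunks): while in the thinking state, a `<tool_call>` tag takes precedence even if `</think>` has not been seen yet.

```go
package main

import (
	"fmt"
	"strings"
)

type state int

const (
	thinking state = iota
	toolCall
	content
)

// step inspects buffered model output while in the thinking state and
// returns the next parser state plus the thinking text consumed so far.
func step(buf string) (state, string) {
	// Detect <tool_call> before </think>: a tool call may begin while
	// the model is still inside its thinking block.
	if i := strings.Index(buf, "<tool_call>"); i >= 0 {
		return toolCall, buf[:i]
	}
	if i := strings.Index(buf, "</think>"); i >= 0 {
		return content, buf[:i]
	}
	return thinking, buf
}

func main() {
	s, text := step(`let me call a tool <tool_call>{"name":"search"}`)
	fmt.Println(s == toolCall, text)
}
```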
Eva H
d69ddc1edc fix: window app crash on startup when update is pending (#14451) 2026-02-26 16:47:12 -05:00
Eva H
9bf41969f0 app: fix first update check delayed by 1 hour (#14427) 2026-02-25 18:29:55 -05:00
Jesse Gross
0f23b7bff5 mlxrunner: Cancel in-flight requests when the client disconnects
Currently, a canceled request can result in computation continuing
in the background to completion. It can also trigger a deadlock
when there is nobody to read the output tokens and the pipeline
cannot continue to the next request.
2026-02-25 14:00:42 -08:00
Jesse Gross
4e57d2094e mlxrunner: Simplify pipeline memory and cache management
Particularly in error cases, it can be difficult to ensure that
all pinned memory is unpinned, MLX buffers are released and cache
state is consistent. This encapsulates those pieces and sets up
proper deferrals so that this happens automatically on exit.
2026-02-25 14:00:42 -08:00
Jeffrey Morgan
7f9efd53df model: add support for qwen3.5-27b model (#14415) 2026-02-25 01:09:58 -08:00
Jeffrey Morgan
da70c3222e model: support for qwen3.5 architecture (#14378) 2026-02-24 20:08:05 -08:00
Bruce MacDonald
9d902d63ce ggml: ensure tensor size is valid (#14406)
When quantizing tensors during model creation validate that the resulting sizes match what is expected based on the shape.
2026-02-24 21:52:44 -04:00
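The shape-based size check can be sketched as follows (illustrative function and parameter names, not ggml's actual API): compute the expected byte size from the element count and the quantization block layout, and reject tensors whose element count doesn't divide evenly into blocks.

```go
package main

import "fmt"

// expectedSize returns the byte size a quantized tensor should occupy given
// its shape, elements per quantization block, and bytes per block.
func expectedSize(shape []int, blockSize, bytesPerBlock int) (int, error) {
	n := 1
	for _, d := range shape {
		n *= d
	}
	if n%blockSize != 0 {
		return 0, fmt.Errorf("element count %d not divisible by block size %d", n, blockSize)
	}
	return n / blockSize * bytesPerBlock, nil
}

func main() {
	// Q4_0-style layout: 32 elements per block, 18 bytes per block.
	size, err := expectedSize([]int{4096, 4096}, 32, 18)
	fmt.Println(size, err)
}
```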
Daniel Hiltgen
f4f0a4a471 update mlx-c bindings to 0.5.0 (#14380)
* chore: update mlx-c bindings to 0.5.0 (#14303)

* linux: use gcc 11

---------

Co-authored-by: Patrick Devine <patrick@infrahq.com>
2026-02-23 16:44:29 -08:00
Eva H
3323c1d319 app: add upgrade configuration to settings page (#13512) 2026-02-23 18:08:52 -05:00
Jesse Gross
f20dc6b698 mlx: don't default to affine quantization for unquantized models
Otherwise the BF16 version of models trigger segfaults when they
call into quantized kernels.
2026-02-23 15:03:53 -08:00
Jeffrey Morgan
4b2ac1f369 model: improvements to LFM architectures (#14368) 2026-02-23 14:38:10 -08:00
Jesse Gross
8daf47fb3a mlxrunner: Fix duplicate log prefixes and reduce log noise
Pass subprocess stdout/stderr through to the parent's stderr directly
instead of re-wrapping each line with slog. The subprocess already
writes structured slog output, so the re-wrapping produced nested
timestamps, levels, and message fields that were hard to read.

Also downgrade verbose KV cache debug logs to trace level.
2026-02-23 14:09:20 -08:00
Eva H
6c980579cd ui: use capability-based detection for web search (#14336) 2026-02-23 15:00:09 -05:00
Jesse Gross
5c73c4e2ee mlxrunner: Simplify KV cache to single-entry prefix matching
The KV cache previously used a tree structure which could
store multiple divergent sequences, which is good for cache
reuse. However, this is typically used in conjunction with
paged attention so each node in the tree can store just a
chunk of the KV cache and they can be stitched together later.
We don't currently do this, so the cache was storing copies of
the full cache for each past sequence.

This redundancy, combined with the lack of resource limits, caused significant
memory use as a conversation grew. Instead, this change stores
a single entry for the cache, which can be prefix matched. Although
it is less ideal for multiple users, it largely matches Ollama's
current behavior. It can be improved as additional pieces are fleshed
out.
2026-02-23 09:50:07 -08:00
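The simplified design reduces to a single stored sequence plus a prefix-length lookup, which can be sketched as below (illustrative types, not the runner's actual code). Storing overwrites the one entry, so there are no per-sequence copies of the full cache.

```go
package main

import "fmt"

// prefixCache holds a single past sequence. Lookups report how many leading
// tokens of a new prompt match it, so that much KV state can be reused.
type prefixCache struct {
	tokens []int
}

func (c *prefixCache) match(prompt []int) int {
	n := 0
	for n < len(c.tokens) && n < len(prompt) && c.tokens[n] == prompt[n] {
		n++
	}
	return n
}

func (c *prefixCache) store(tokens []int) {
	// Overwrite the single entry in place; no tree of divergent sequences.
	c.tokens = append(c.tokens[:0], tokens...)
}

func main() {
	var c prefixCache
	c.store([]int{1, 2, 3, 4})
	fmt.Println(c.match([]int{1, 2, 3, 9})) // reuse the first three tokens
}
```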
Jesse Gross
5daf59cc66 mlxrunner: Fix memory leaks with pin/sweep lifecycle management
The previous approach tracked array lifecycles through reference
counting, where each array recorded its inputs and a reference count
that was decremented as dependents were freed. This is not really
necessary as MLX tracks references internally. It is also error
prone as it is easy to create new arrays and forget to free them
when the Go variable goes out of scope.

Instead, we can pin just the arrays we want (typically outputs and
specific intermediates, like the cache). All other arrays are freed
by default when we run sweep. This avoids most causes of memory leaks
while still giving the freedom to save what we want.
2026-02-23 09:50:07 -08:00
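The pin/sweep lifecycle can be modeled as a small arena (an illustrative model, not MLX's actual API): allocations are tracked, a few arrays are pinned explicitly, and `sweep` frees everything else, so a forgotten free cannot leak.

```go
package main

import "fmt"

// arena tracks live arrays. pin marks the ones to keep (outputs and specific
// intermediates like the cache); sweep frees every unpinned array.
type arena struct {
	live   map[string]bool
	pinned map[string]bool
}

func newArena() *arena {
	return &arena{live: map[string]bool{}, pinned: map[string]bool{}}
}

func (a *arena) alloc(name string) { a.live[name] = true }
func (a *arena) pin(name string)   { a.pinned[name] = true }

// sweep frees all unpinned arrays by default, avoiding the reference-counting
// bookkeeping where each forgotten decrement was a leak.
func (a *arena) sweep() (freed []string) {
	for name := range a.live {
		if !a.pinned[name] {
			delete(a.live, name) // deleting during range is safe in Go
			freed = append(freed, name)
		}
	}
	return freed
}

func main() {
	a := newArena()
	a.alloc("logits")
	a.alloc("scratch")
	a.pin("logits")
	fmt.Println(a.sweep()) // only the unpinned "scratch" is freed
}
```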
Jeffrey Morgan
0ade9205cc models: add nemotronh architecture support (#14356) 2026-02-22 15:09:14 -08:00
108 changed files with 10039 additions and 3637 deletions

View File

@@ -9,15 +9,10 @@ ARG JETPACK6VERSION=r36.4.0
ARG CMAKEVERSION=3.31.2
ARG VULKANVERSION=1.4.321.1
# We require gcc v10 minimum. v10.3 has regressions, so the rockylinux 8.5 AppStream has the latest compatible version
FROM --platform=linux/amd64 rocm/dev-almalinux-8:${ROCMVERSION}-complete AS base-amd64
RUN yum install -y yum-utils \
&& yum-config-manager --add-repo https://dl.rockylinux.org/vault/rocky/8.5/AppStream/\$basearch/os/ \
&& rpm --import https://dl.rockylinux.org/pub/rocky/RPM-GPG-KEY-Rocky-8 \
&& dnf install -y yum-utils ccache gcc-toolset-10-gcc-10.2.1-8.2.el8 gcc-toolset-10-gcc-c++-10.2.1-8.2.el8 gcc-toolset-10-binutils-2.35-11.el8 \
&& dnf install -y ccache \
RUN dnf install -y yum-utils ccache gcc-toolset-11-gcc gcc-toolset-11-gcc-c++ gcc-toolset-11-binutils \
&& yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo
ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH
ENV PATH=/opt/rh/gcc-toolset-11/root/usr/bin:$PATH
ARG VULKANVERSION
RUN wget https://sdk.lunarg.com/sdk/download/${VULKANVERSION}/linux/vulkansdk-linux-x86_64-${VULKANVERSION}.tar.xz -O /tmp/vulkansdk-linux-x86_64-${VULKANVERSION}.tar.xz \
&& tar xvf /tmp/vulkansdk-linux-x86_64-${VULKANVERSION}.tar.xz \

View File

@@ -1 +1 @@
v0.4.1
v0.5.0

View File

@@ -35,6 +35,7 @@ import (
var (
wv = &Webview{}
uiServerPort int
appStore *store.Store
)
var debug = strings.EqualFold(os.Getenv("OLLAMA_DEBUG"), "true") || os.Getenv("OLLAMA_DEBUG") == "1"
@@ -208,6 +209,7 @@ func main() {
uiServerPort = port
st := &store.Store{}
appStore = st
// Enable CORS in development mode
if devMode {
@@ -253,6 +255,8 @@ func main() {
done <- osrv.Run(octx)
}()
upd := &updater.Updater{Store: st}
uiServer := ui.Server{
Token: token,
Restart: func() {
@@ -267,6 +271,10 @@ func main() {
ToolRegistry: toolRegistry,
Dev: devMode,
Logger: slog.Default(),
Updater: upd,
UpdateAvailableFunc: func() {
UpdateAvailable("")
},
}
srv := &http.Server{
@@ -284,8 +292,20 @@ func main() {
slog.Debug("background desktop server done")
}()
updater := &updater.Updater{Store: st}
updater.StartBackgroundUpdaterChecker(ctx, UpdateAvailable)
upd.StartBackgroundUpdaterChecker(ctx, UpdateAvailable)
// Check for pending updates on startup (show tray notification if update is ready)
if updater.IsUpdatePending() {
// On Windows, the tray is initialized in osRun(). Calling UpdateAvailable
// before that would dereference a nil tray callback.
// TODO: refactor so the update check runs after platform init on all platforms.
if runtime.GOOS == "windows" {
slog.Debug("update pending on startup, deferring tray notification until tray initialization")
} else {
slog.Debug("update pending on startup, showing tray notification")
UpdateAvailable("")
}
}
hasCompletedFirstRun, err := st.HasCompletedFirstRun()
if err != nil {
@@ -348,6 +368,17 @@ func startHiddenTasks() {
// CLI triggered app startup use-case
slog.Info("deferring pending update for fast startup")
} else {
// Check if auto-update is enabled before automatically upgrading
settings, err := appStore.Settings()
if err != nil {
slog.Warn("failed to load settings for upgrade check", "error", err)
} else if !settings.AutoUpdateEnabled {
slog.Info("auto-update disabled, skipping automatic upgrade at startup")
// Still show tray notification so user knows update is ready
UpdateAvailable("")
return
}
if err := updater.DoUpgradeAtStartup(); err != nil {
slog.Info("unable to perform upgrade at startup", "error", err)
// Make sure the restart to upgrade menu shows so we can attempt an interactive upgrade to get authorization

View File

@@ -154,6 +154,10 @@ func handleURLSchemeRequest(urlScheme string) {
}
func UpdateAvailable(ver string) error {
if app.t == nil {
slog.Debug("tray not yet initialized, skipping update notification")
return nil
}
return app.t.UpdateAvailable(ver)
}
@@ -165,6 +169,14 @@ func osRun(shutdown func(), hasCompletedFirstRun, startHidden bool) {
log.Fatalf("Failed to start: %s", err)
}
// Check for pending updates now that the tray is initialized.
// The platform-independent check in app.go fires before osRun,
// when app.t is still nil, so we must re-check here.
if updater.IsUpdatePending() {
slog.Debug("update pending on startup, showing tray notification")
UpdateAvailable("")
}
signals := make(chan os.Signal, 1)
signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM)

View File

@@ -9,12 +9,12 @@ import (
"strings"
"time"
sqlite3 "github.com/mattn/go-sqlite3"
_ "github.com/mattn/go-sqlite3"
)
// currentSchemaVersion defines the current database schema version.
// Increment this when making schema changes that require migrations.
const currentSchemaVersion = 14
const currentSchemaVersion = 15
// database wraps the SQLite connection.
// SQLite handles its own locking for concurrent access:
@@ -86,6 +86,7 @@ func (db *database) init() error {
think_level TEXT NOT NULL DEFAULT '',
cloud_setting_migrated BOOLEAN NOT NULL DEFAULT 0,
remote TEXT NOT NULL DEFAULT '', -- deprecated
auto_update_enabled BOOLEAN NOT NULL DEFAULT 1,
schema_version INTEGER NOT NULL DEFAULT %d
);
@@ -257,6 +258,12 @@ func (db *database) migrate() error {
return fmt.Errorf("migrate v13 to v14: %w", err)
}
version = 14
case 14:
// add auto_update_enabled column to settings table
if err := db.migrateV14ToV15(); err != nil {
return fmt.Errorf("migrate v14 to v15: %w", err)
}
version = 15
default:
// If we have a version we don't recognize, just set it to current
// This might happen during development
@@ -496,6 +503,21 @@ func (db *database) migrateV13ToV14() error {
return nil
}
// migrateV14ToV15 adds the auto_update_enabled column to the settings table
func (db *database) migrateV14ToV15() error {
_, err := db.conn.Exec(`ALTER TABLE settings ADD COLUMN auto_update_enabled BOOLEAN NOT NULL DEFAULT 1`)
if err != nil && !duplicateColumnError(err) {
return fmt.Errorf("add auto_update_enabled column: %w", err)
}
_, err = db.conn.Exec(`UPDATE settings SET schema_version = 15`)
if err != nil {
return fmt.Errorf("update schema version: %w", err)
}
return nil
}
// cleanupOrphanedData removes orphaned records that may exist due to the foreign key bug
func (db *database) cleanupOrphanedData() error {
_, err := db.conn.Exec(`
@@ -526,19 +548,11 @@ func (db *database) cleanupOrphanedData() error {
}
func duplicateColumnError(err error) bool {
if sqlite3Err, ok := err.(sqlite3.Error); ok {
return sqlite3Err.Code == sqlite3.ErrError &&
strings.Contains(sqlite3Err.Error(), "duplicate column name")
}
return false
return err != nil && strings.Contains(err.Error(), "duplicate column name")
}
func columnNotExists(err error) bool {
if sqlite3Err, ok := err.(sqlite3.Error); ok {
return sqlite3Err.Code == sqlite3.ErrError &&
strings.Contains(sqlite3Err.Error(), "no such column")
}
return false
return err != nil && strings.Contains(err.Error(), "no such column")
}
func (db *database) getAllChats() ([]Chat, error) {
@@ -1152,9 +1166,9 @@ func (db *database) getSettings() (Settings, error) {
var s Settings
err := db.conn.QueryRow(`
SELECT expose, survey, browser, models, agent, tools, working_dir, context_length, turbo_enabled, websearch_enabled, selected_model, sidebar_open, think_enabled, think_level
SELECT expose, survey, browser, models, agent, tools, working_dir, context_length, turbo_enabled, websearch_enabled, selected_model, sidebar_open, think_enabled, think_level, auto_update_enabled
FROM settings
`).Scan(&s.Expose, &s.Survey, &s.Browser, &s.Models, &s.Agent, &s.Tools, &s.WorkingDir, &s.ContextLength, &s.TurboEnabled, &s.WebSearchEnabled, &s.SelectedModel, &s.SidebarOpen, &s.ThinkEnabled, &s.ThinkLevel)
`).Scan(&s.Expose, &s.Survey, &s.Browser, &s.Models, &s.Agent, &s.Tools, &s.WorkingDir, &s.ContextLength, &s.TurboEnabled, &s.WebSearchEnabled, &s.SelectedModel, &s.SidebarOpen, &s.ThinkEnabled, &s.ThinkLevel, &s.AutoUpdateEnabled)
if err != nil {
return Settings{}, fmt.Errorf("get settings: %w", err)
}
@@ -1164,9 +1178,9 @@ func (db *database) getSettings() (Settings, error) {
func (db *database) setSettings(s Settings) error {
_, err := db.conn.Exec(`
UPDATE settings
SET expose = ?, survey = ?, browser = ?, models = ?, agent = ?, tools = ?, working_dir = ?, context_length = ?, turbo_enabled = ?, websearch_enabled = ?, selected_model = ?, sidebar_open = ?, think_enabled = ?, think_level = ?
`, s.Expose, s.Survey, s.Browser, s.Models, s.Agent, s.Tools, s.WorkingDir, s.ContextLength, s.TurboEnabled, s.WebSearchEnabled, s.SelectedModel, s.SidebarOpen, s.ThinkEnabled, s.ThinkLevel)
UPDATE settings
SET expose = ?, survey = ?, browser = ?, models = ?, agent = ?, tools = ?, working_dir = ?, context_length = ?, turbo_enabled = ?, websearch_enabled = ?, selected_model = ?, sidebar_open = ?, think_enabled = ?, think_level = ?, auto_update_enabled = ?
`, s.Expose, s.Survey, s.Browser, s.Models, s.Agent, s.Tools, s.WorkingDir, s.ContextLength, s.TurboEnabled, s.WebSearchEnabled, s.SelectedModel, s.SidebarOpen, s.ThinkEnabled, s.ThinkLevel, s.AutoUpdateEnabled)
if err != nil {
return fmt.Errorf("set settings: %w", err)
}

View File

@@ -166,6 +166,9 @@ type Settings struct {
// SidebarOpen indicates if the chat sidebar is open
SidebarOpen bool
// AutoUpdateEnabled indicates if automatic updates should be downloaded
AutoUpdateEnabled bool
}
type Store struct {

View File

@@ -414,6 +414,7 @@ export class Settings {
ThinkLevel: string;
SelectedModel: string;
SidebarOpen: boolean;
AutoUpdateEnabled: boolean;
constructor(source: any = {}) {
if ('string' === typeof source) source = JSON.parse(source);
@@ -431,6 +432,7 @@ export class Settings {
this.ThinkLevel = source["ThinkLevel"];
this.SelectedModel = source["SelectedModel"];
this.SidebarOpen = source["SidebarOpen"];
this.AutoUpdateEnabled = source["AutoUpdateEnabled"];
}
}
export class SettingsResponse {

View File

@@ -17,7 +17,10 @@ import {
} from "@/hooks/useChats";
import { useNavigate } from "@tanstack/react-router";
import { useSelectedModel } from "@/hooks/useSelectedModel";
import { useHasVisionCapability } from "@/hooks/useModelCapabilities";
import {
useHasVisionCapability,
useHasToolsCapability,
} from "@/hooks/useModelCapabilities";
import { useUser } from "@/hooks/useUser";
import { DisplayLogin } from "@/components/DisplayLogin";
import { ErrorEvent, Message } from "@/gotypes";
@@ -149,12 +152,7 @@ function ChatForm({
} = useSettings();
const { cloudDisabled } = useCloudStatus();
// current supported models for web search
const modelLower = selectedModel?.model.toLowerCase() || "";
const supportsWebSearch =
modelLower.startsWith("gpt-oss") ||
modelLower.startsWith("qwen3") ||
modelLower.startsWith("deepseek-v3");
const supportsWebSearch = useHasToolsCapability(selectedModel?.model);
// Use per-chat thinking level instead of global
const thinkLevel: ThinkingLevel =
settingsThinkLevel === "none" || !settingsThinkLevel

View File

@@ -15,6 +15,7 @@ import {
XMarkIcon,
CogIcon,
ArrowLeftIcon,
ArrowDownTrayIcon,
} from "@heroicons/react/20/solid";
import { Settings as SettingsType } from "@/gotypes";
import { useNavigate } from "@tanstack/react-router";
@@ -440,6 +441,29 @@ export default function Settings() {
</div>
</Field>
{/* Auto Update */}
<Field>
<div className="flex items-start justify-between gap-4">
<div className="flex items-start space-x-3 flex-1">
<ArrowDownTrayIcon className="mt-1 h-5 w-5 flex-shrink-0 text-black dark:text-neutral-100" />
<div>
<Label>Auto-download updates</Label>
<Description>
{settings.AutoUpdateEnabled
? "Automatically download updates when available."
: "Updates will not be downloaded automatically."}
</Description>
</div>
</div>
<div className="flex-shrink-0">
<Switch
checked={settings.AutoUpdateEnabled}
onChange={(checked) => handleChange("AutoUpdateEnabled", checked)}
/>
</div>
</div>
</Field>
{/* Expose Ollama */}
<Field>
<div className="flex items-start justify-between gap-4">

View File

@@ -20,3 +20,8 @@ export function useHasVisionCapability(modelName: string | undefined) {
const { data: capabilitiesResponse } = useModelCapabilities(modelName);
return capabilitiesResponse?.capabilities?.includes("vision") ?? false;
}
export function useHasToolsCapability(modelName: string | undefined) {
const { data: capabilitiesResponse } = useModelCapabilities(modelName);
return capabilitiesResponse?.capabilities?.includes("tools") ?? false;
}

View File

@@ -28,6 +28,7 @@ import (
"github.com/ollama/ollama/app/tools"
"github.com/ollama/ollama/app/types/not"
"github.com/ollama/ollama/app/ui/responses"
"github.com/ollama/ollama/app/updater"
"github.com/ollama/ollama/app/version"
ollamaAuth "github.com/ollama/ollama/auth"
"github.com/ollama/ollama/envconfig"
@@ -106,6 +107,10 @@ type Server struct {
// Dev is true if the server is running in development mode
Dev bool
// Updater for checking and downloading updates
Updater *updater.Updater
UpdateAvailableFunc func()
}
func (s *Server) log() *slog.Logger {
@@ -829,8 +834,9 @@ func (s *Server) chat(w http.ResponseWriter, r *http.Request) error {
if !hasAttachments {
WebSearchEnabled := req.WebSearch != nil && *req.WebSearch
hasToolsCapability := slices.Contains(details.Capabilities, model.CapabilityTools)
if WebSearchEnabled {
if WebSearchEnabled && hasToolsCapability {
if supportsBrowserTools(req.Model) {
browserState, ok := s.browserState(chat)
if !ok {
@@ -840,7 +846,7 @@ func (s *Server) chat(w http.ResponseWriter, r *http.Request) error {
registry.Register(tools.NewBrowserSearch(browser))
registry.Register(tools.NewBrowserOpen(browser))
registry.Register(tools.NewBrowserFind(browser))
} else if supportsWebSearchTools(req.Model) {
} else {
registry.Register(&tools.WebSearch{})
registry.Register(&tools.WebFetch{})
}
@@ -1446,6 +1452,24 @@ func (s *Server) settings(w http.ResponseWriter, r *http.Request) error {
return fmt.Errorf("failed to save settings: %w", err)
}
// Handle auto-update toggle changes
if old.AutoUpdateEnabled != settings.AutoUpdateEnabled {
if !settings.AutoUpdateEnabled {
// Auto-update disabled: cancel any ongoing download
if s.Updater != nil {
s.Updater.CancelOngoingDownload()
}
} else {
// Auto-update re-enabled: show notification if update is already staged, or trigger immediate check
if (updater.IsUpdatePending() || updater.UpdateDownloaded) && s.UpdateAvailableFunc != nil {
s.UpdateAvailableFunc()
} else if s.Updater != nil {
// Trigger the background checker to run immediately
s.Updater.TriggerImmediateCheck()
}
}
}
if old.ContextLength != settings.ContextLength ||
old.Models != settings.Models ||
old.Expose != settings.Expose {
@@ -1648,17 +1672,6 @@ func supportsBrowserTools(model string) bool {
return strings.HasPrefix(strings.ToLower(model), "gpt-oss")
}
// Web search tools are simpler, providing only basic web search and fetch capabilities (e.g., "web_search", "web_fetch") without simulating a browser. Currently only qwen3 and deepseek-v3 support web search tools.
func supportsWebSearchTools(model string) bool {
model = strings.ToLower(model)
prefixes := []string{"qwen3", "deepseek-v3"}
for _, p := range prefixes {
if strings.HasPrefix(model, p) {
return true
}
}
return false
}
// buildChatRequest converts store.Chat to api.ChatRequest
func (s *Server) buildChatRequest(chat *store.Chat, model string, think any, availableTools []map[string]any) (*api.ChatRequest, error) {

View File

@@ -4,6 +4,7 @@ package ui
import (
"bytes"
"context"
"encoding/json"
"io"
"net/http"
@@ -11,9 +12,11 @@ import (
"path/filepath"
"runtime"
"strings"
"sync/atomic"
"testing"
"github.com/ollama/ollama/app/store"
"github.com/ollama/ollama/app/updater"
)
func TestHandlePostApiSettings(t *testing.T) {
@@ -522,3 +525,290 @@ func TestUserAgentTransport(t *testing.T) {
t.Logf("User-Agent transport successfully set: %s", receivedUA)
}
func TestSupportsBrowserTools(t *testing.T) {
tests := []struct {
model string
want bool
}{
{"gpt-oss", true},
{"gpt-oss-latest", true},
{"GPT-OSS", true},
{"Gpt-Oss-v2", true},
{"qwen3", false},
{"deepseek-v3", false},
{"llama3.3", false},
{"", false},
}
for _, tt := range tests {
t.Run(tt.model, func(t *testing.T) {
if got := supportsBrowserTools(tt.model); got != tt.want {
t.Errorf("supportsBrowserTools(%q) = %v, want %v", tt.model, got, tt.want)
}
})
}
}
func TestWebSearchToolRegistration(t *testing.T) {
// Validates that the capability-gating logic in chat() correctly
// decides which tools to register based on model capabilities and
// the web search flag.
tests := []struct {
name string
webSearchEnabled bool
hasToolsCap bool
model string
wantBrowser bool // expects browser tools (gpt-oss)
wantWebSearch bool // expects basic web search/fetch tools
wantNone bool // expects no tools registered
}{
{
name: "web search enabled with tools capability - browser model",
webSearchEnabled: true,
hasToolsCap: true,
model: "gpt-oss-latest",
wantBrowser: true,
},
{
name: "web search enabled with tools capability - non-browser model",
webSearchEnabled: true,
hasToolsCap: true,
model: "qwen3",
wantWebSearch: true,
},
{
name: "web search enabled without tools capability",
webSearchEnabled: true,
hasToolsCap: false,
model: "llama3.3",
wantNone: true,
},
{
name: "web search disabled with tools capability",
webSearchEnabled: false,
hasToolsCap: true,
model: "qwen3",
wantNone: true,
},
{
name: "web search disabled without tools capability",
webSearchEnabled: false,
hasToolsCap: false,
model: "llama3.3",
wantNone: true,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
// Replicate the decision logic from chat() handler
gotBrowser := false
gotWebSearch := false
if tt.webSearchEnabled && tt.hasToolsCap {
if supportsBrowserTools(tt.model) {
gotBrowser = true
} else {
gotWebSearch = true
}
}
if tt.wantBrowser && !gotBrowser {
t.Error("expected browser tools to be registered")
}
if tt.wantWebSearch && !gotWebSearch {
t.Error("expected web search tools to be registered")
}
if tt.wantNone && (gotBrowser || gotWebSearch) {
t.Error("expected no tools to be registered")
}
if !tt.wantBrowser && gotBrowser {
t.Error("unexpected browser tools registered")
}
if !tt.wantWebSearch && gotWebSearch {
t.Error("unexpected web search tools registered")
}
})
}
}
func TestSettingsToggleAutoUpdateOff_CancelsDownload(t *testing.T) {
testStore := &store.Store{
DBPath: filepath.Join(t.TempDir(), "db.sqlite"),
}
defer testStore.Close()
// Start with auto-update enabled
settings, err := testStore.Settings()
if err != nil {
t.Fatal(err)
}
settings.AutoUpdateEnabled = true
if err := testStore.SetSettings(settings); err != nil {
t.Fatal(err)
}
upd := &updater.Updater{Store: &store.Store{
DBPath: filepath.Join(t.TempDir(), "db2.sqlite"),
}}
defer upd.Store.Close()
// We can't easily mock CancelOngoingDownload, but we can verify
// the full settings handler flow works without error
server := &Server{
Store: testStore,
Restart: func() {},
Updater: upd,
}
// Disable auto-update via settings API
settings.AutoUpdateEnabled = false
body, err := json.Marshal(settings)
if err != nil {
t.Fatal(err)
}
req := httptest.NewRequest("POST", "/api/v1/settings", bytes.NewReader(body))
req.Header.Set("Content-Type", "application/json")
rr := httptest.NewRecorder()
if err := server.settings(rr, req); err != nil {
t.Fatalf("settings() error = %v", err)
}
if rr.Code != http.StatusOK {
t.Fatalf("settings() status = %d, want %d", rr.Code, http.StatusOK)
}
// Verify settings were saved with auto-update disabled
saved, err := testStore.Settings()
if err != nil {
t.Fatal(err)
}
if saved.AutoUpdateEnabled {
t.Fatal("expected AutoUpdateEnabled to be false after toggle off")
}
}
func TestSettingsToggleAutoUpdateOn_WithPendingUpdate_ShowsNotification(t *testing.T) {
testStore := &store.Store{
DBPath: filepath.Join(t.TempDir(), "db.sqlite"),
}
defer testStore.Close()
// Start with auto-update disabled
settings, err := testStore.Settings()
if err != nil {
t.Fatal(err)
}
settings.AutoUpdateEnabled = false
if err := testStore.SetSettings(settings); err != nil {
t.Fatal(err)
}
// Simulate that an update was previously downloaded
oldVal := updater.UpdateDownloaded
updater.UpdateDownloaded = true
defer func() { updater.UpdateDownloaded = oldVal }()
var notificationCalled atomic.Bool
server := &Server{
Store: testStore,
Restart: func() {},
UpdateAvailableFunc: func() {
notificationCalled.Store(true)
},
}
// Re-enable auto-update via settings API
settings.AutoUpdateEnabled = true
body, err := json.Marshal(settings)
if err != nil {
t.Fatal(err)
}
req := httptest.NewRequest("POST", "/api/v1/settings", bytes.NewReader(body))
req.Header.Set("Content-Type", "application/json")
rr := httptest.NewRecorder()
if err := server.settings(rr, req); err != nil {
t.Fatalf("settings() error = %v", err)
}
if rr.Code != http.StatusOK {
t.Fatalf("settings() status = %d, want %d", rr.Code, http.StatusOK)
}
if !notificationCalled.Load() {
t.Fatal("expected UpdateAvailableFunc to be called when re-enabling with a downloaded update")
}
}
func TestSettingsToggleAutoUpdateOn_NoPendingUpdate_TriggersCheck(t *testing.T) {
testStore := &store.Store{
DBPath: filepath.Join(t.TempDir(), "db.sqlite"),
}
defer testStore.Close()
// Start with auto-update disabled
settings, err := testStore.Settings()
if err != nil {
t.Fatal(err)
}
settings.AutoUpdateEnabled = false
if err := testStore.SetSettings(settings); err != nil {
t.Fatal(err)
}
// Ensure no pending update - clear both the downloaded flag and the stage dir
oldVal := updater.UpdateDownloaded
updater.UpdateDownloaded = false
defer func() { updater.UpdateDownloaded = oldVal }()
oldStageDir := updater.UpdateStageDir
updater.UpdateStageDir = t.TempDir() // empty dir means IsUpdatePending() returns false
defer func() { updater.UpdateStageDir = oldStageDir }()
upd := &updater.Updater{Store: &store.Store{
DBPath: filepath.Join(t.TempDir(), "db2.sqlite"),
}}
defer upd.Store.Close()
// Initialize the checkNow channel by starting (and immediately stopping) the checker
// so TriggerImmediateCheck doesn't panic on nil channel
ctx, cancel := context.WithCancel(t.Context())
upd.StartBackgroundUpdaterChecker(ctx, func(string) error { return nil })
defer cancel()
var notificationCalled atomic.Bool
server := &Server{
Store: testStore,
Restart: func() {},
Updater: upd,
UpdateAvailableFunc: func() {
notificationCalled.Store(true)
},
}
// Re-enable auto-update via settings API
settings.AutoUpdateEnabled = true
body, err := json.Marshal(settings)
if err != nil {
t.Fatal(err)
}
req := httptest.NewRequest("POST", "/api/v1/settings", bytes.NewReader(body))
req.Header.Set("Content-Type", "application/json")
rr := httptest.NewRecorder()
if err := server.settings(rr, req); err != nil {
t.Fatalf("settings() error = %v", err)
}
if rr.Code != http.StatusOK {
t.Fatalf("settings() status = %d, want %d", rr.Code, http.StatusOK)
}
// UpdateAvailableFunc should NOT be called since there's no pending update
if notificationCalled.Load() {
t.Fatal("UpdateAvailableFunc should not be called when there is no pending update")
}
}

View File

@@ -19,6 +19,7 @@ import (
"runtime"
"strconv"
"strings"
"sync"
"time"
"github.com/ollama/ollama/app/store"
@@ -58,7 +59,8 @@ func (u *Updater) checkForUpdate(ctx context.Context) (bool, UpdateResponse) {
query := requestURL.Query()
query.Add("os", runtime.GOOS)
query.Add("arch", runtime.GOARCH)
query.Add("version", version.Version)
currentVersion := version.Version
query.Add("version", currentVersion)
query.Add("ts", strconv.FormatInt(time.Now().Unix(), 10))
// The original macOS app used to use the device ID
@@ -131,15 +133,27 @@ func (u *Updater) checkForUpdate(ctx context.Context) (bool, UpdateResponse) {
}
func (u *Updater) DownloadNewRelease(ctx context.Context, updateResp UpdateResponse) error {
// Create a cancellable context for this download
downloadCtx, cancel := context.WithCancel(ctx)
u.cancelDownloadLock.Lock()
u.cancelDownload = cancel
u.cancelDownloadLock.Unlock()
defer func() {
u.cancelDownloadLock.Lock()
u.cancelDownload = nil
u.cancelDownloadLock.Unlock()
cancel()
}()
// Do a head first to check etag info
req, err := http.NewRequestWithContext(ctx, http.MethodHead, updateResp.UpdateURL, nil)
req, err := http.NewRequestWithContext(downloadCtx, http.MethodHead, updateResp.UpdateURL, nil)
if err != nil {
return err
}
// In case of slow downloads, continue the update check in the background
bgctx, cancel := context.WithCancel(ctx)
defer cancel()
bgctx, bgcancel := context.WithCancel(downloadCtx)
defer bgcancel()
go func() {
for {
select {
@@ -176,6 +190,7 @@ func (u *Updater) DownloadNewRelease(ctx context.Context, updateResp UpdateRespo
_, err = os.Stat(stageFilename)
if err == nil {
slog.Info("update already downloaded", "bundle", stageFilename)
UpdateDownloaded = true
return nil
}
@@ -244,33 +259,85 @@ func cleanupOldDownloads(stageDir string) {
}
type Updater struct {
Store *store.Store
Store *store.Store
cancelDownload context.CancelFunc
cancelDownloadLock sync.Mutex
checkNow chan struct{}
}
// CancelOngoingDownload cancels any currently running download
func (u *Updater) CancelOngoingDownload() {
u.cancelDownloadLock.Lock()
defer u.cancelDownloadLock.Unlock()
if u.cancelDownload != nil {
slog.Info("cancelling ongoing update download")
u.cancelDownload()
u.cancelDownload = nil
}
}
// TriggerImmediateCheck signals the background checker to check for updates immediately
func (u *Updater) TriggerImmediateCheck() {
if u.checkNow != nil {
select {
case u.checkNow <- struct{}{}:
default:
// Check already pending, no need to queue another
}
}
}
func (u *Updater) StartBackgroundUpdaterChecker(ctx context.Context, cb func(string) error) {
u.checkNow = make(chan struct{}, 1)
u.checkNow <- struct{}{} // Trigger first check after initial delay
go func() {
// Don't blast an update message immediately after startup
time.Sleep(UpdateCheckInitialDelay)
slog.Info("beginning update checker", "interval", UpdateCheckInterval)
ticker := time.NewTicker(UpdateCheckInterval)
defer ticker.Stop()
for {
available, resp := u.checkForUpdate(ctx)
if available {
err := u.DownloadNewRelease(ctx, resp)
if err != nil {
slog.Error(fmt.Sprintf("failed to download new release: %s", err))
} else {
err = cb(resp.UpdateVersion)
if err != nil {
slog.Warn(fmt.Sprintf("failed to register update available with tray: %s", err))
}
}
}
select {
case <-ctx.Done():
slog.Debug("stopping background update checker")
return
default:
time.Sleep(UpdateCheckInterval)
case <-u.checkNow:
// Immediate check triggered
case <-ticker.C:
// Regular interval check
}
// Always check for updates
available, resp := u.checkForUpdate(ctx)
if !available {
continue
}
// Update is available - check if auto-update is enabled for downloading
settings, err := u.Store.Settings()
if err != nil {
slog.Error("failed to load settings", "error", err)
continue
}
if !settings.AutoUpdateEnabled {
// Auto-update disabled - don't download, just log
slog.Debug("update available but auto-update disabled", "version", resp.UpdateVersion)
continue
}
// Auto-update is enabled - download
err = u.DownloadNewRelease(ctx, resp)
if err != nil {
slog.Error("failed to download new release", "error", err)
continue
}
// Download successful - show tray notification
err = cb(resp.UpdateVersion)
if err != nil {
slog.Warn("failed to register update available with tray", "error", err)
}
}
}()
}


@@ -11,6 +11,8 @@ import (
"log/slog"
"net/http"
"net/http/httptest"
"path/filepath"
"sync/atomic"
"testing"
"time"
@@ -33,7 +35,7 @@ func TestIsNewReleaseAvailable(t *testing.T) {
defer server.Close()
slog.Debug("server", "url", server.URL)
updater := &Updater{Store: &store.Store{DBPath: filepath.Join(t.TempDir(), "test.db")}}
defer updater.Store.Close() // Ensure database is closed
UpdateCheckURLBase = server.URL + "/update.json"
updatePresent, resp := updater.checkForUpdate(t.Context())
@@ -84,8 +86,18 @@ func TestBackgoundChecker(t *testing.T) {
defer server.Close()
UpdateCheckURLBase = server.URL + "/update.json"
updater := &Updater{Store: &store.Store{DBPath: filepath.Join(t.TempDir(), "test.db")}}
defer updater.Store.Close()
settings, err := updater.Store.Settings()
if err != nil {
t.Fatal(err)
}
settings.AutoUpdateEnabled = true
if err := updater.Store.SetSettings(settings); err != nil {
t.Fatal(err)
}
updater.StartBackgroundUpdaterChecker(ctx, cb)
select {
case <-stallTimer.C:
@@ -99,3 +111,267 @@ func TestBackgoundChecker(t *testing.T) {
}
}
}
func TestAutoUpdateDisabledSkipsDownload(t *testing.T) {
UpdateStageDir = t.TempDir()
var downloadAttempted atomic.Bool
done := make(chan struct{})
ctx, cancel := context.WithCancel(t.Context())
defer cancel()
UpdateCheckInitialDelay = 5 * time.Millisecond
UpdateCheckInterval = 5 * time.Millisecond
VerifyDownload = func() error {
return nil
}
var server *httptest.Server
server = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if r.URL.Path == "/update.json" {
w.Write([]byte(
fmt.Sprintf(`{"version": "9.9.9", "url": "%s"}`,
server.URL+"/9.9.9/"+Installer)))
} else if r.URL.Path == "/9.9.9/"+Installer {
downloadAttempted.Store(true)
buf := &bytes.Buffer{}
zw := zip.NewWriter(buf)
zw.Close()
io.Copy(w, buf)
}
}))
defer server.Close()
UpdateCheckURLBase = server.URL + "/update.json"
updater := &Updater{Store: &store.Store{DBPath: filepath.Join(t.TempDir(), "test.db")}}
defer updater.Store.Close()
// Ensure auto-update is disabled
settings, err := updater.Store.Settings()
if err != nil {
t.Fatal(err)
}
settings.AutoUpdateEnabled = false
if err := updater.Store.SetSettings(settings); err != nil {
t.Fatal(err)
}
cb := func(ver string) error {
t.Fatal("callback should not be called when auto-update is disabled")
return nil
}
updater.StartBackgroundUpdaterChecker(ctx, cb)
// Wait enough time for multiple check cycles
time.Sleep(50 * time.Millisecond)
close(done)
if downloadAttempted.Load() {
t.Fatal("download should not be attempted when auto-update is disabled")
}
}
func TestAutoUpdateReenabledDownloadsUpdate(t *testing.T) {
UpdateStageDir = t.TempDir()
var downloadAttempted atomic.Bool
callbackCalled := make(chan struct{}, 1)
ctx, cancel := context.WithCancel(t.Context())
defer cancel()
UpdateCheckInitialDelay = 5 * time.Millisecond
UpdateCheckInterval = 5 * time.Millisecond
VerifyDownload = func() error {
return nil
}
var server *httptest.Server
server = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if r.URL.Path == "/update.json" {
w.Write([]byte(
fmt.Sprintf(`{"version": "9.9.9", "url": "%s"}`,
server.URL+"/9.9.9/"+Installer)))
} else if r.URL.Path == "/9.9.9/"+Installer {
downloadAttempted.Store(true)
buf := &bytes.Buffer{}
zw := zip.NewWriter(buf)
zw.Close()
io.Copy(w, buf)
}
}))
defer server.Close()
UpdateCheckURLBase = server.URL + "/update.json"
upd := &Updater{Store: &store.Store{DBPath: filepath.Join(t.TempDir(), "test.db")}}
defer upd.Store.Close()
// Start with auto-update disabled
settings, err := upd.Store.Settings()
if err != nil {
t.Fatal(err)
}
settings.AutoUpdateEnabled = false
if err := upd.Store.SetSettings(settings); err != nil {
t.Fatal(err)
}
cb := func(ver string) error {
select {
case callbackCalled <- struct{}{}:
default:
}
return nil
}
upd.StartBackgroundUpdaterChecker(ctx, cb)
// Wait for a few cycles with auto-update disabled - no download should happen
time.Sleep(50 * time.Millisecond)
if downloadAttempted.Load() {
t.Fatal("download should not happen while auto-update is disabled")
}
// Re-enable auto-update
settings.AutoUpdateEnabled = true
if err := upd.Store.SetSettings(settings); err != nil {
t.Fatal(err)
}
// Wait for the checker to pick it up and download
select {
case <-callbackCalled:
// Success: download happened and callback was called after re-enabling
if !downloadAttempted.Load() {
t.Fatal("expected download to be attempted after re-enabling")
}
case <-time.After(5 * time.Second):
t.Fatal("expected download and callback after re-enabling auto-update")
}
}
func TestCancelOngoingDownload(t *testing.T) {
UpdateStageDir = t.TempDir()
downloadStarted := make(chan struct{})
downloadCancelled := make(chan struct{})
ctx := t.Context()
VerifyDownload = func() error {
return nil
}
var server *httptest.Server
server = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if r.URL.Path == "/update.json" {
w.Write([]byte(
fmt.Sprintf(`{"version": "9.9.9", "url": "%s"}`,
server.URL+"/9.9.9/"+Installer)))
} else if r.URL.Path == "/9.9.9/"+Installer {
if r.Method == http.MethodHead {
w.Header().Set("Content-Length", "1000000")
w.WriteHeader(http.StatusOK)
return
}
// Signal that download has started
close(downloadStarted)
// Wait for cancellation or timeout
select {
case <-r.Context().Done():
close(downloadCancelled)
return
case <-time.After(5 * time.Second):
t.Error("download was not cancelled in time")
}
}
}))
defer server.Close()
UpdateCheckURLBase = server.URL + "/update.json"
updater := &Updater{Store: &store.Store{DBPath: filepath.Join(t.TempDir(), "test.db")}}
defer updater.Store.Close()
_, resp := updater.checkForUpdate(ctx)
// Start download in goroutine
go func() {
_ = updater.DownloadNewRelease(ctx, resp)
}()
// Wait for download to start
select {
case <-downloadStarted:
case <-time.After(2 * time.Second):
t.Fatal("download did not start in time")
}
// Cancel the download
updater.CancelOngoingDownload()
// Verify cancellation was received
select {
case <-downloadCancelled:
// Success
case <-time.After(2 * time.Second):
t.Fatal("download cancellation was not received by server")
}
}
func TestTriggerImmediateCheck(t *testing.T) {
UpdateStageDir = t.TempDir()
checkCount := atomic.Int32{}
checkDone := make(chan struct{}, 10)
ctx, cancel := context.WithCancel(t.Context())
defer cancel()
// Set a very long interval so only TriggerImmediateCheck causes checks
UpdateCheckInitialDelay = 1 * time.Millisecond
UpdateCheckInterval = 1 * time.Hour
VerifyDownload = func() error {
return nil
}
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if r.URL.Path == "/update.json" {
checkCount.Add(1)
select {
case checkDone <- struct{}{}:
default:
}
// Return no update available
w.WriteHeader(http.StatusNoContent)
}
}))
defer server.Close()
UpdateCheckURLBase = server.URL + "/update.json"
updater := &Updater{Store: &store.Store{DBPath: filepath.Join(t.TempDir(), "test.db")}}
defer updater.Store.Close()
cb := func(ver string) error {
return nil
}
updater.StartBackgroundUpdaterChecker(ctx, cb)
// Wait for the initial check that fires after the initial delay
select {
case <-checkDone:
case <-time.After(2 * time.Second):
t.Fatal("initial check did not happen")
}
initialCount := checkCount.Load()
// Trigger immediate check
updater.TriggerImmediateCheck()
// Wait for the triggered check
select {
case <-checkDone:
case <-time.After(2 * time.Second):
t.Fatal("triggered check did not happen")
}
finalCount := checkCount.Load()
if finalCount <= initialCount {
t.Fatalf("TriggerImmediateCheck did not cause additional check: initial=%d, final=%d", initialCount, finalCount)
}
}


@@ -369,25 +369,6 @@ func (t *winTray) addSeparatorMenuItem(menuItemId, parentId uint32) error {
return nil
}
// func (t *winTray) hideMenuItem(menuItemId, parentId uint32) error {
// const ERROR_SUCCESS syscall.Errno = 0
// t.muMenus.RLock()
// menu := uintptr(t.menus[parentId])
// t.muMenus.RUnlock()
// res, _, err := pRemoveMenu.Call(
// menu,
// uintptr(menuItemId),
// MF_BYCOMMAND,
// )
// if res == 0 && err.(syscall.Errno) != ERROR_SUCCESS {
// return err
// }
// t.delFromVisibleItems(parentId, menuItemId)
// return nil
// }
func (t *winTray) showMenu() error {
p := point{}
boolRet, _, err := pGetCursorPos.Call(uintptr(unsafe.Pointer(&p)))


@@ -51,7 +51,6 @@ const (
IMAGE_ICON = 1 // Loads an icon
LR_DEFAULTSIZE = 0x00000040 // Loads default-size icon for windows(SM_CXICON x SM_CYICON) if cx, cy are set to zero
LR_LOADFROMFILE = 0x00000010 // Loads the stand-alone image from the file
MF_BYCOMMAND = 0x00000000
MFS_DISABLED = 0x00000003
MFT_SEPARATOR = 0x00000800
MFT_STRING = 0x00000000


@@ -257,10 +257,11 @@ func LoadModelMetadata(fsys fs.FS) (ModelKV, *Tokenizer, error) {
if err != nil {
return nil, nil, err
}
bts = sanitizeNonFiniteJSON(bts)
var p ModelParameters
if err := json.Unmarshal(bts, &p); err != nil {
return nil, nil, fmt.Errorf("parse config.json: %w", err)
}
if len(p.Architectures) < 1 {
@@ -315,16 +316,20 @@ func LoadModelMetadata(fsys fs.FS) (ModelKV, *Tokenizer, error) {
conv = &glm4MoeLiteModel{}
case "GlmOcrForConditionalGeneration":
conv = &glmOcrModel{}
case "Lfm2ForCausalLM", "Lfm2MoeForCausalLM":
conv = &lfm2Model{}
case "Lfm2VlForConditionalGeneration":
conv = &lfm2VLTextModel{}
case "Qwen3NextForCausalLM", "Qwen3_5ForConditionalGeneration", "Qwen3_5MoeForConditionalGeneration":
conv = &qwen3NextModel{}
case "NemotronHForCausalLM":
conv = &nemotronHModel{}
default:
return nil, nil, fmt.Errorf("unsupported architecture %q", p.Architectures[0])
}
if err := json.Unmarshal(bts, conv); err != nil {
return nil, nil, fmt.Errorf("parse config.json for %q: %w", p.Architectures[0], err)
}
if t, ok := conv.(moreParser); ok {


@@ -1,6 +1,8 @@
package convert
import (
"cmp"
"fmt"
"slices"
"strings"
@@ -13,42 +15,149 @@ type lfm2Model struct {
NumHiddenLayers uint32 `json:"num_hidden_layers"`
MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
IntermediateSize uint32 `json:"intermediate_size"`
BlockFFDim uint32 `json:"block_ff_dim"`
BlockMultipleOf uint32 `json:"block_multiple_of"`
BlockAutoAdjustFFDim bool `json:"block_auto_adjust_ff_dim"`
BlockFFNDimMultiplier float32 `json:"block_ffn_dim_multiplier"`
NumAttentionHeads uint32 `json:"num_attention_heads"`
NumKeyValueHeads uint32 `json:"num_key_value_heads"`
RopeTheta float32 `json:"rope_theta"`
NormEps float32 `json:"norm_eps"`
ConvLCache uint32 `json:"conv_L_cache"`
MoEIntermediateSize uint32 `json:"moe_intermediate_size"`
NumExperts uint32 `json:"num_experts"`
NumLocalExperts uint32 `json:"num_local_experts"`
NumExpertsPerToken uint32 `json:"num_experts_per_tok"`
NumDenseLayers uint32 `json:"num_dense_layers"`
RoutedScalingFactor float32 `json:"routed_scaling_factor"`
LayerTypes []string `json:"layer_types"`
TieEmbedding bool `json:"tie_embedding"`
RopeParameters struct {
RopeTheta float32 `json:"rope_theta"`
} `json:"rope_parameters"`
}
var _ ModelConverter = (*lfm2Model)(nil)
const (
defaultMaxPositionEmbeddings = uint32(128_000)
fallbackContextLength = uint32(32_768)
)
func (p *lfm2Model) isMoE() bool {
return p.ModelType == "lfm2_moe" || p.expertCount() > 0
}
func (p *lfm2Model) ropeFreqBase() float32 {
if p.RopeTheta != 0 {
return p.RopeTheta
}
return p.RopeParameters.RopeTheta
}
func (p *lfm2Model) expertCount() uint32 {
if p.NumLocalExperts > 0 {
return p.NumLocalExperts
}
return p.NumExperts
}
func (p *lfm2Model) feedForwardLength() uint32 {
ff := p.IntermediateSize
if p.BlockFFDim != 0 {
ff = p.BlockFFDim
}
if !p.BlockAutoAdjustFFDim || p.BlockMultipleOf == 0 {
return ff
}
ff = (2 * ff) / 3
// Keep default multiplier behavior consistent with llama.cpp conversion.
if p.BlockFFNDimMultiplier != 0 {
ff = uint32(float32(ff) * p.BlockFFNDimMultiplier)
}
m := p.BlockMultipleOf
return m * ((ff + m - 1) / m)
}
func (p *lfm2Model) hasKnownContextLengthFallbackSignature() bool {
return p.isMoE() &&
p.VocabSize == 65536 &&
p.HiddenSize == 2048 &&
p.NumHiddenLayers == 40 &&
p.IntermediateSize == 11776 &&
p.NumAttentionHeads == 32 &&
p.NumKeyValueHeads == 8 &&
p.NumDenseLayers == 2 &&
p.expertCount() == 64 &&
p.NumExpertsPerToken == 4 &&
p.MoEIntermediateSize == 1536
}
func (p *lfm2Model) contextLength() uint32 {
if p.MaxPositionEmbeddings == defaultMaxPositionEmbeddings && p.hasKnownContextLengthFallbackSignature() {
return fallbackContextLength
}
return p.MaxPositionEmbeddings
}
func (p *lfm2Model) KV(t *Tokenizer) KV {
architecture := "lfm2"
if p.isMoE() {
architecture = "lfm2moe"
}
kv := p.ModelParameters.KV(t)
kv["general.architecture"] = architecture
kv["tokenizer.ggml.pre"] = "lfm2"
kv["vocab_size"] = p.VocabSize
kv["block_count"] = p.NumHiddenLayers
kv["embedding_length"] = p.HiddenSize
kv["feed_forward_length"] = p.feedForwardLength()
kv["context_length"] = p.contextLength()
// Build per-layer KV head count array based on layer_types
// (0 = shortconv layer, non-zero = attention layer with that many KV heads).
//
// Dense LFM2 in HF defaults to all attention layers when layer_types is absent.
// Preserve that behavior to avoid accidentally emitting all-conv metadata.
kvHeadCounts := make([]uint32, p.NumHiddenLayers)
if len(p.LayerTypes) == 0 {
for i := range p.NumHiddenLayers {
kvHeadCounts[i] = p.NumKeyValueHeads
}
} else {
for i := range p.NumHiddenLayers {
if int(i) < len(p.LayerTypes) && p.LayerTypes[i] == "full_attention" {
kvHeadCounts[i] = p.NumKeyValueHeads
}
}
}
kv["attention.head_count"] = p.NumAttentionHeads
kv["attention.head_count_kv"] = kvHeadCounts
kv["attention.key_length"] = p.HiddenSize / p.NumAttentionHeads
kv["attention.value_length"] = p.HiddenSize / p.NumAttentionHeads
kv["attention.layer_norm_rms_epsilon"] = p.NormEps
kv["shortconv.l_cache"] = p.ConvLCache
if ropeFreqBase := p.ropeFreqBase(); ropeFreqBase != 0 {
kv["rope.freq_base"] = ropeFreqBase
}
if p.isMoE() {
kv["expert_count"] = p.expertCount()
kv["expert_used_count"] = p.NumExpertsPerToken
kv["expert_feed_forward_length"] = p.MoEIntermediateSize
kv["leading_dense_block_count"] = p.NumDenseLayers
kv["expert_gating_func"] = uint32(2) // sigmoid
kv["expert_weights_scale"] = cmp.Or(p.RoutedScalingFactor, float32(1.0))
}
return kv
}
@@ -56,6 +165,30 @@ func (p *lfm2Model) KV(t *Tokenizer) KV {
func (p *lfm2Model) Tensors(ts []Tensor) []*ggml.Tensor {
var out []*ggml.Tensor
if p.isMoE() {
merges := make([]merge, 0, p.NumHiddenLayers*3)
for i := range p.NumHiddenLayers {
if i < p.NumDenseLayers {
continue
}
merges = append(merges, merge{
fmt.Sprintf("blk.%d.feed_forward.experts.*.w1.weight", i),
fmt.Sprintf("blk.%d.ffn_gate_exps.weight", i),
}, merge{
fmt.Sprintf("blk.%d.feed_forward.experts.*.w2.weight", i),
fmt.Sprintf("blk.%d.ffn_down_exps.weight", i),
}, merge{
fmt.Sprintf("blk.%d.feed_forward.experts.*.w3.weight", i),
fmt.Sprintf("blk.%d.ffn_up_exps.weight", i),
})
}
merged, remaining := mergeTensors(ts, merges...)
out = append(out, merged...)
ts = remaining
}
for _, t := range ts {
shape := t.Shape()
@@ -80,7 +213,7 @@ func (p *lfm2Model) Tensors(ts []Tensor) []*ggml.Tensor {
func (p *lfm2Model) Replacements() []string {
return []string{
"model.embed_tokens", "token_embd",
"model.embedding_norm", "token_embd_norm",
"model.layers", "blk",
"operator_norm", "attn_norm",
"self_attn.q_proj", "attn_q",
@@ -92,6 +225,8 @@ func (p *lfm2Model) Replacements() []string {
"conv.conv", "shortconv.conv",
"conv.in_proj", "shortconv.in_proj",
"conv.out_proj", "shortconv.out_proj",
"feed_forward.gate", "ffn_gate_inp",
"feed_forward.expert_bias", "exp_probs_b.bias",
"feed_forward.w1", "ffn_gate",
"feed_forward.w2", "ffn_down",
"feed_forward.w3", "ffn_up",


@@ -0,0 +1,271 @@
package convert
import (
"io"
"slices"
"strings"
"testing"
)
type lfm2StubTensor struct {
tensorBase
}
func newLFM2StubTensor(name string, shape []uint64) *lfm2StubTensor {
return &lfm2StubTensor{
tensorBase: tensorBase{
name: name,
shape: shape,
},
}
}
func (t *lfm2StubTensor) WriteTo(io.Writer) (int64, error) {
return 0, nil
}
func (t *lfm2StubTensor) Clone() Tensor {
return &lfm2StubTensor{
tensorBase: tensorBase{
name: t.name,
shape: slices.Clone(t.shape),
},
}
}
func TestLFM2MoEKV(t *testing.T) {
var p lfm2Model
p.ModelParameters.ModelType = "lfm2_moe"
p.VocabSize = 65536
p.HiddenSize = 2048
p.NumHiddenLayers = 4
p.MaxPositionEmbeddings = 128000
p.IntermediateSize = 11776
p.NumAttentionHeads = 32
p.NumKeyValueHeads = 8
p.LayerTypes = []string{"conv", "full_attention", "conv", "full_attention"}
p.NormEps = 1e-5
p.ConvLCache = 3
p.MoEIntermediateSize = 1536
p.NumExperts = 64
p.NumExpertsPerToken = 4
p.NumDenseLayers = 2
p.RopeParameters.RopeTheta = 1_000_000
kv := p.KV(&Tokenizer{Vocabulary: &Vocabulary{Model: "gpt2"}})
if got, want := kv["general.architecture"], "lfm2moe"; got != want {
t.Fatalf("general.architecture = %v, want %v", got, want)
}
if got, want := kv["tokenizer.ggml.pre"], "lfm2"; got != want {
t.Fatalf("tokenizer.ggml.pre = %v, want %v", got, want)
}
if got, want := kv["expert_count"], uint32(64); got != want {
t.Fatalf("expert_count = %v, want %v", got, want)
}
if got, want := kv["expert_used_count"], uint32(4); got != want {
t.Fatalf("expert_used_count = %v, want %v", got, want)
}
if got, want := kv["expert_feed_forward_length"], uint32(1536); got != want {
t.Fatalf("expert_feed_forward_length = %v, want %v", got, want)
}
if got, want := kv["leading_dense_block_count"], uint32(2); got != want {
t.Fatalf("leading_dense_block_count = %v, want %v", got, want)
}
if got, want := kv["expert_gating_func"], uint32(2); got != want {
t.Fatalf("expert_gating_func = %v, want %v", got, want)
}
gotHeadCounts, ok := kv["attention.head_count_kv"].([]uint32)
if !ok {
t.Fatalf("attention.head_count_kv has unexpected type %T", kv["attention.head_count_kv"])
}
wantHeadCounts := []uint32{0, 8, 0, 8}
if !slices.Equal(gotHeadCounts, wantHeadCounts) {
t.Fatalf("attention.head_count_kv = %v, want %v", gotHeadCounts, wantHeadCounts)
}
if got, want := kv["rope.freq_base"], float32(1_000_000); got != want {
t.Fatalf("rope.freq_base = %v, want %v", got, want)
}
}
func TestLFM2DenseKV(t *testing.T) {
p := lfm2Model{
ModelParameters: ModelParameters{ModelType: "lfm2", VocabSize: 32000},
HiddenSize: 1024,
NumHiddenLayers: 2,
MaxPositionEmbeddings: 32768,
IntermediateSize: 4096,
NumAttentionHeads: 16,
NumKeyValueHeads: 4,
LayerTypes: []string{"conv", "full_attention"},
NormEps: 1e-5,
ConvLCache: 3,
RopeTheta: 10000,
}
kv := p.KV(&Tokenizer{Vocabulary: &Vocabulary{Model: "gpt2"}})
if got, want := kv["general.architecture"], "lfm2"; got != want {
t.Fatalf("general.architecture = %v, want %v", got, want)
}
if got, want := kv["tokenizer.ggml.pre"], "lfm2"; got != want {
t.Fatalf("tokenizer.ggml.pre = %v, want %v", got, want)
}
if _, ok := kv["expert_count"]; ok {
t.Fatalf("expert_count should not be set for dense lfm2")
}
}
func TestLFM2MoETensors(t *testing.T) {
p := lfm2Model{
ModelParameters: ModelParameters{ModelType: "lfm2_moe"},
NumHiddenLayers: 4,
NumDenseLayers: 2,
}
in := []Tensor{
newLFM2StubTensor("blk.2.feed_forward.experts.0.w1.weight", []uint64{1536, 2048}),
newLFM2StubTensor("blk.2.feed_forward.experts.1.w1.weight", []uint64{1536, 2048}),
newLFM2StubTensor("blk.2.feed_forward.experts.0.w2.weight", []uint64{2048, 1536}),
newLFM2StubTensor("blk.2.feed_forward.experts.1.w2.weight", []uint64{2048, 1536}),
newLFM2StubTensor("blk.2.feed_forward.experts.0.w3.weight", []uint64{1536, 2048}),
newLFM2StubTensor("blk.2.feed_forward.experts.1.w3.weight", []uint64{1536, 2048}),
newLFM2StubTensor("blk.0.shortconv.conv.weight", []uint64{2048, 1, 3}),
}
out := p.Tensors(in)
byName := make(map[string][]uint64, len(out))
for _, tns := range out {
byName[tns.Name] = tns.Shape
}
if got, ok := byName["blk.2.ffn_gate_exps.weight"]; !ok {
t.Fatalf("missing merged tensor blk.2.ffn_gate_exps.weight")
} else if !slices.Equal(got, []uint64{2, 1536, 2048}) {
t.Fatalf("blk.2.ffn_gate_exps.weight shape = %v, want [2 1536 2048]", got)
}
if got, ok := byName["blk.2.ffn_down_exps.weight"]; !ok {
t.Fatalf("missing merged tensor blk.2.ffn_down_exps.weight")
} else if !slices.Equal(got, []uint64{2, 2048, 1536}) {
t.Fatalf("blk.2.ffn_down_exps.weight shape = %v, want [2 2048 1536]", got)
}
if got, ok := byName["blk.2.ffn_up_exps.weight"]; !ok {
t.Fatalf("missing merged tensor blk.2.ffn_up_exps.weight")
} else if !slices.Equal(got, []uint64{2, 1536, 2048}) {
t.Fatalf("blk.2.ffn_up_exps.weight shape = %v, want [2 1536 2048]", got)
}
if got, ok := byName["blk.0.shortconv.conv.weight"]; !ok {
t.Fatalf("missing shortconv tensor")
} else if !slices.Equal(got, []uint64{2048, 3}) {
t.Fatalf("blk.0.shortconv.conv.weight shape = %v, want [2048 3]", got)
}
if _, ok := byName["blk.2.feed_forward.experts.0.w1.weight"]; ok {
t.Fatalf("unmerged expert tensor should not be present")
}
}
func TestLFM2MoEReplacements(t *testing.T) {
p := lfm2Model{}
replacer := strings.NewReplacer(p.Replacements()...)
if got, want := replacer.Replace("model.layers.2.feed_forward.expert_bias"), "blk.2.exp_probs_b.bias"; got != want {
t.Fatalf("expert bias replacement = %q, want %q", got, want)
}
if got, want := replacer.Replace("model.layers.2.feed_forward.gate.weight"), "blk.2.ffn_gate_inp.weight"; got != want {
t.Fatalf("gate replacement = %q, want %q", got, want)
}
}
func TestLFM2KVContextLengthEdgeCaseFallbackOverride(t *testing.T) {
p := lfm2Model{
ModelParameters: ModelParameters{ModelType: "lfm2_moe", VocabSize: 65536},
HiddenSize: 2048,
NumHiddenLayers: 40,
MaxPositionEmbeddings: 128000,
IntermediateSize: 11776,
NumAttentionHeads: 32,
NumKeyValueHeads: 8,
LayerTypes: make([]string, 40),
NormEps: 1e-5,
ConvLCache: 3,
MoEIntermediateSize: 1536,
NumExperts: 64,
NumExpertsPerToken: 4,
NumDenseLayers: 2,
}
for i := 0; i < len(p.LayerTypes); i++ {
p.LayerTypes[i] = "conv"
}
p.LayerTypes[2] = "full_attention"
kv := p.KV(&Tokenizer{Vocabulary: &Vocabulary{Model: "gpt2"}})
if got, want := kv["context_length"], uint32(32768); got != want {
t.Fatalf("context_length = %v, want %v", got, want)
}
}
func TestLFM2KVContextLengthNoOverride(t *testing.T) {
p := lfm2Model{
ModelParameters: ModelParameters{ModelType: "lfm2_moe", VocabSize: 65536},
HiddenSize: 2048,
NumHiddenLayers: 39, // mismatch: should not trigger edge case
MaxPositionEmbeddings: 128000,
IntermediateSize: 11776,
NumAttentionHeads: 32,
NumKeyValueHeads: 8,
LayerTypes: []string{"conv", "full_attention"},
NormEps: 1e-5,
ConvLCache: 3,
MoEIntermediateSize: 1536,
NumExperts: 64,
NumExpertsPerToken: 4,
NumDenseLayers: 2,
}
kv := p.KV(&Tokenizer{Vocabulary: &Vocabulary{Model: "gpt2"}})
if got, want := kv["context_length"], uint32(128000); got != want {
t.Fatalf("context_length = %v, want %v", got, want)
}
}
func TestLFM2KVFeedForwardLengthAutoAdjust(t *testing.T) {
p := lfm2Model{
ModelParameters: ModelParameters{ModelType: "lfm2", VocabSize: 65536},
HiddenSize: 2048,
NumHiddenLayers: 16,
MaxPositionEmbeddings: 128000,
IntermediateSize: 12288, // should be ignored when block_ff_dim is set
BlockFFDim: 12288,
BlockAutoAdjustFFDim: true,
BlockMultipleOf: 256,
BlockFFNDimMultiplier: 1.0,
NumAttentionHeads: 32,
NumKeyValueHeads: 8,
LayerTypes: []string{"conv", "full_attention"},
NormEps: 1e-5,
ConvLCache: 3,
}
kv := p.KV(&Tokenizer{Vocabulary: &Vocabulary{Model: "gpt2"}})
if got, want := kv["feed_forward_length"], uint32(8192); got != want {
t.Fatalf("feed_forward_length = %v, want %v", got, want)
}
}

convert/convert_lfm2_vl.go (new file)

@@ -0,0 +1,417 @@
package convert
import (
"cmp"
"encoding/json"
"errors"
"fmt"
"io/fs"
"slices"
"strings"
"github.com/ollama/ollama/fs/ggml"
)
// lfm2VLTextModel converts the language model component of LFM2 VL checkpoints.
type lfm2VLTextModel struct {
TextConfig lfm2Model `json:"text_config"`
DoImageSplitting *bool `json:"do_image_splitting"`
DownsampleFactor uint32 `json:"downsample_factor"`
EncoderPatchSize uint32 `json:"encoder_patch_size"`
ImageTokenID uint32 `json:"image_token_id"`
MaxImageTokens uint32 `json:"max_image_tokens"`
MinImageTokens uint32 `json:"min_image_tokens"`
MaxTiles uint32 `json:"max_tiles"`
MinTiles uint32 `json:"min_tiles"`
TileSize uint32 `json:"tile_size"`
MaxPixelsTolerance float32 `json:"max_pixels_tolerance"`
ProjectorUseLayernorm bool `json:"projector_use_layernorm"`
ProjectorHiddenSize uint32 `json:"projector_hidden_size"`
ProjectorHiddenAct string `json:"projector_hidden_act"`
UseImageSpecialTokens *bool `json:"use_image_special_tokens"`
UseThumbnail *bool `json:"use_thumbnail"`
VisionConfig struct {
HiddenSize uint32 `json:"hidden_size"`
IntermediateSize uint32 `json:"intermediate_size"`
NumAttentionHeads uint32 `json:"num_attention_heads"`
NumHiddenLayers uint32 `json:"num_hidden_layers"`
NumChannels uint32 `json:"num_channels"`
PatchSize uint32 `json:"patch_size"`
LayerNormEpsilon float32 `json:"layer_norm_eps"`
} `json:"vision_config"`
Processor struct {
ImageProcessor struct {
DoImageSplitting *bool `json:"do_image_splitting"`
DownsampleFactor uint32 `json:"downsample_factor"`
MaxImageTokens uint32 `json:"max_image_tokens"`
MinImageTokens uint32 `json:"min_image_tokens"`
MaxTiles uint32 `json:"max_tiles"`
MinTiles uint32 `json:"min_tiles"`
MaxPixelsTol float32 `json:"max_pixels_tolerance"`
TileSize uint32 `json:"tile_size"`
UseThumbnail *bool `json:"use_thumbnail"`
ImageMean []float32 `json:"image_mean"`
ImageStd []float32 `json:"image_std"`
Size struct {
Height uint32 `json:"height"`
Width uint32 `json:"width"`
} `json:"size"`
} `json:"image_processor"`
}
}
func (p *lfm2VLTextModel) textModel() *lfm2Model {
return &p.TextConfig
}
func (p *lfm2VLTextModel) specialTokenTypes() []string {
return p.textModel().specialTokenTypes()
}
func (p *lfm2VLTextModel) parseMore(fsys fs.FS) error {
bts, err := fs.ReadFile(fsys, "processor_config.json")
if err != nil {
if errors.Is(err, fs.ErrNotExist) {
return nil
}
return err
}
return json.Unmarshal(bts, &p.Processor)
}
func (p *lfm2VLTextModel) visionImageSize() uint32 {
// LFM2-VL image processor operates on 512 tiles and downsamples by factor 2
// before projection. Keep a fixed square image size compatible with position
// embeddings and the simplified runtime image pipeline.
tile := cmp.Or(
p.Processor.ImageProcessor.TileSize,
p.Processor.ImageProcessor.Size.Height,
p.Processor.ImageProcessor.Size.Width,
uint32(512),
)
downsample := cmp.Or(p.DownsampleFactor, p.Processor.ImageProcessor.DownsampleFactor, uint32(2))
if downsample == 0 {
return tile
}
return max(uint32(1), tile/downsample)
}
func (p *lfm2VLTextModel) KV(t *Tokenizer) KV {
kv := p.textModel().KV(t)
boolOr := func(defaultValue bool, values ...*bool) bool {
for _, v := range values {
if v != nil {
return *v
}
}
return defaultValue
}
kv["vision.block_count"] = cmp.Or(p.VisionConfig.NumHiddenLayers, uint32(27))
kv["vision.embedding_length"] = cmp.Or(p.VisionConfig.HiddenSize, uint32(1152))
kv["vision.feed_forward_length"] = cmp.Or(p.VisionConfig.IntermediateSize, uint32(4304))
kv["vision.attention.head_count"] = cmp.Or(p.VisionConfig.NumAttentionHeads, uint32(16))
kv["vision.attention.layer_norm_epsilon"] = cmp.Or(p.VisionConfig.LayerNormEpsilon, float32(1e-6))
kv["vision.patch_size"] = cmp.Or(p.VisionConfig.PatchSize, p.EncoderPatchSize, uint32(16))
kv["vision.num_channels"] = cmp.Or(p.VisionConfig.NumChannels, uint32(3))
kv["vision.image_size"] = p.visionImageSize()
kv["vision.projector.scale_factor"] = cmp.Or(p.DownsampleFactor, p.Processor.ImageProcessor.DownsampleFactor, uint32(2))
kv["vision.projector.use_layernorm"] = p.ProjectorUseLayernorm
kv["vision.do_image_splitting"] = boolOr(true, p.DoImageSplitting, p.Processor.ImageProcessor.DoImageSplitting)
kv["vision.min_tiles"] = cmp.Or(p.MinTiles, p.Processor.ImageProcessor.MinTiles, uint32(2))
kv["vision.max_tiles"] = cmp.Or(p.MaxTiles, p.Processor.ImageProcessor.MaxTiles, uint32(10))
kv["vision.tile_size"] = cmp.Or(p.TileSize, p.Processor.ImageProcessor.TileSize, uint32(512))
kv["vision.min_image_tokens"] = cmp.Or(p.MinImageTokens, p.Processor.ImageProcessor.MinImageTokens, uint32(64))
kv["vision.max_image_tokens"] = cmp.Or(p.MaxImageTokens, p.Processor.ImageProcessor.MaxImageTokens, uint32(256))
kv["vision.max_pixels_tolerance"] = cmp.Or(p.MaxPixelsTolerance, p.Processor.ImageProcessor.MaxPixelsTol, float32(2.0))
kv["vision.use_thumbnail"] = boolOr(true, p.UseThumbnail, p.Processor.ImageProcessor.UseThumbnail)
kv["vision.use_image_special_tokens"] = boolOr(true, p.UseImageSpecialTokens)
kv["vision.image_mean"] = slices.Clone(defaultFloat32Slice(p.Processor.ImageProcessor.ImageMean, []float32{0.5, 0.5, 0.5}))
kv["vision.image_std"] = slices.Clone(defaultFloat32Slice(p.Processor.ImageProcessor.ImageStd, []float32{0.5, 0.5, 0.5}))
kv["vision.image_token_id"] = cmp.Or(p.ImageTokenID, uint32(396))
setVisionTokenID := func(k, token string) {
if t == nil || t.Vocabulary == nil {
return
}
for i, v := range t.Vocabulary.Tokens {
if v == token {
kv[k] = uint32(i)
return
}
}
}
setVisionTokenID("vision.image_start_token_id", "<|image_start|>")
setVisionTokenID("vision.image_end_token_id", "<|image_end|>")
setVisionTokenID("vision.image_thumbnail_token_id", "<|img_thumbnail|>")
return kv
}
func (p *lfm2VLTextModel) Tensors(ts []Tensor) []*ggml.Tensor {
patchSize := int(cmp.Or(p.VisionConfig.PatchSize, p.EncoderPatchSize, uint32(16)))
numChannels := int(cmp.Or(p.VisionConfig.NumChannels, uint32(3)))
for _, t := range ts {
if t.Name() == "v.patch_embd.weight" {
shape := t.Shape()
if len(shape) == 2 {
inputDim := uint64(numChannels * patchSize * patchSize)
if shape[1] == inputDim {
channels := numChannels
patch := patchSize
t.SetRepacker(func(_ string, data []float32, srcShape []uint64) ([]float32, error) {
return repackPatchEmbeddingWeight(data, srcShape, channels, patch)
})
}
}
}
}
out := p.textModel().Tensors(ts)
for _, t := range out {
if t.Name == "v.patch_embd.weight" && len(t.Shape) == 2 {
t.Shape = []uint64{t.Shape[0], uint64(numChannels), uint64(patchSize), uint64(patchSize)}
}
}
return out
}
func (p *lfm2VLTextModel) Replacements() []string {
out := make([]string, 0, 96)
addText := func(from, to string) {
out = append(out, from, to)
if strings.HasPrefix(from, "model.") {
suffix := strings.TrimPrefix(from, "model.")
out = append(out,
"model.language_model."+suffix, to,
"model.language_model.model."+suffix, to,
)
}
}
base := p.textModel().Replacements()
for i := 0; i+1 < len(base); i += 2 {
addText(base[i], base[i+1])
}
// Vision tower + multimodal projector tensors (single-file conversion).
out = append(out,
"model.vision_tower.vision_model.embeddings.patch_embedding", "v.patch_embd",
"model.vision_tower.vision_model.embeddings.position_embedding", "v.position_embd",
"model.vision_tower.vision_model.encoder.layers", "v.blk",
"model.vision_tower.vision_model.post_layernorm", "v.post_ln",
"model.multi_modal_projector.layer_norm", "mm.layer_norm",
"model.multi_modal_projector.linear_1", "mm.1",
"model.multi_modal_projector.linear_2", "mm.2",
"self_attn.q_proj", "attn_q",
"self_attn.k_proj", "attn_k",
"self_attn.v_proj", "attn_v",
"self_attn.out_proj", "attn_out",
"layer_norm1", "ln1",
"layer_norm2", "ln2",
"mlp.fc1", "ffn_up",
"mlp.fc2", "ffn_down",
)
return out
}
// lfm2VLProjectorModel converts the vision encoder + projector component of LFM2 VL checkpoints.
type lfm2VLProjectorModel struct {
ModelParameters
DownsampleFactor uint32 `json:"downsample_factor"`
ProjectorHiddenDim uint32 `json:"projector_hidden_size"`
VisionModel struct {
HiddenSize uint32 `json:"hidden_size"`
IntermediateSize uint32 `json:"intermediate_size"`
NumAttentionHeads uint32 `json:"num_attention_heads"`
NumHiddenLayers uint32 `json:"num_hidden_layers"`
NumChannels uint32 `json:"num_channels"`
PatchSize uint32 `json:"patch_size"`
LayerNormEpsilon float32 `json:"layer_norm_eps"`
ImageSize uint32 `json:"image_size"`
} `json:"vision_config"`
Processor struct {
ImageProcessor struct {
DownsampleFactor uint32 `json:"downsample_factor"`
TileSize uint32 `json:"tile_size"`
ImageMean []float32 `json:"image_mean"`
ImageStd []float32 `json:"image_std"`
Size struct {
Height uint32 `json:"height"`
Width uint32 `json:"width"`
} `json:"size"`
} `json:"image_processor"`
}
}
var (
_ ModelConverter = (*lfm2VLTextModel)(nil)
_ ModelConverter = (*lfm2VLProjectorModel)(nil)
_ moreParser = (*lfm2VLTextModel)(nil)
_ moreParser = (*lfm2VLProjectorModel)(nil)
)
func (p *lfm2VLProjectorModel) parseMore(fsys fs.FS) error {
bts, err := fs.ReadFile(fsys, "processor_config.json")
if err != nil {
if errors.Is(err, fs.ErrNotExist) {
return nil
}
return err
}
return json.Unmarshal(bts, &p.Processor)
}
func (p *lfm2VLProjectorModel) imageSize() uint32 {
if p.VisionModel.ImageSize > 0 {
return p.VisionModel.ImageSize
}
downsample := cmp.Or(p.DownsampleFactor, p.Processor.ImageProcessor.DownsampleFactor, uint32(2))
baseSize := cmp.Or(
p.Processor.ImageProcessor.TileSize,
p.Processor.ImageProcessor.Size.Height,
p.Processor.ImageProcessor.Size.Width,
uint32(256),
)
if downsample == 0 {
return baseSize
}
return max(uint32(1), baseSize/downsample)
}
func (p *lfm2VLProjectorModel) KV(_ *Tokenizer) KV {
kv := KV{
"general.architecture": "clip",
"general.type": "mmproj",
"general.file_type": uint32(1),
"general.quantization_version": uint32(2),
"clip.has_vision_encoder": true,
"clip.projector_type": "lfm2",
"clip.use_gelu": true,
}
kv["clip.vision.block_count"] = cmp.Or(p.VisionModel.NumHiddenLayers, uint32(27))
kv["clip.vision.embedding_length"] = cmp.Or(p.VisionModel.HiddenSize, uint32(1152))
kv["clip.vision.feed_forward_length"] = cmp.Or(p.VisionModel.IntermediateSize, uint32(4304))
kv["clip.vision.attention.head_count"] = cmp.Or(p.VisionModel.NumAttentionHeads, uint32(16))
kv["clip.vision.attention.layer_norm_epsilon"] = cmp.Or(p.VisionModel.LayerNormEpsilon, float32(1e-6))
kv["clip.vision.patch_size"] = cmp.Or(p.VisionModel.PatchSize, uint32(16))
kv["clip.vision.image_size"] = p.imageSize()
kv["clip.vision.projection_dim"] = cmp.Or(p.ProjectorHiddenDim, uint32(2048))
kv["clip.vision.projector.scale_factor"] = cmp.Or(p.DownsampleFactor, p.Processor.ImageProcessor.DownsampleFactor, uint32(2))
kv["clip.vision.image_mean"] = slices.Clone(defaultFloat32Slice(p.Processor.ImageProcessor.ImageMean, []float32{0.5, 0.5, 0.5}))
kv["clip.vision.image_std"] = slices.Clone(defaultFloat32Slice(p.Processor.ImageProcessor.ImageStd, []float32{0.5, 0.5, 0.5}))
return kv
}
func defaultFloat32Slice(v, fallback []float32) []float32 {
if len(v) > 0 {
return v
}
return fallback
}
func (p *lfm2VLProjectorModel) Tensors(ts []Tensor) []*ggml.Tensor {
var out []*ggml.Tensor
numChannels := cmp.Or(p.VisionModel.NumChannels, uint32(3))
patchSize := cmp.Or(p.VisionModel.PatchSize, uint32(16))
for _, t := range ts {
name := t.Name()
if !(strings.HasPrefix(name, "v.") || strings.HasPrefix(name, "mm.")) {
continue
}
shape := t.Shape()
if name == "v.patch_embd.weight" && len(shape) == 2 {
inputDim := uint64(numChannels * patchSize * patchSize)
if shape[1] == inputDim {
shape = []uint64{shape[0], uint64(numChannels), uint64(patchSize), uint64(patchSize)}
channels := int(numChannels)
patch := int(patchSize)
t.SetRepacker(func(_ string, data []float32, srcShape []uint64) ([]float32, error) {
return repackPatchEmbeddingWeight(data, srcShape, channels, patch)
})
}
}
out = append(out, &ggml.Tensor{
Name: name,
Kind: t.Kind(),
Shape: slices.Clone(shape),
WriterTo: t,
})
}
return out
}
func (p *lfm2VLProjectorModel) Replacements() []string {
return []string{
"model.multi_modal_projector.linear_1", "mm.1",
"model.multi_modal_projector.linear_2", "mm.2",
"model.vision_tower.vision_model.embeddings.patch_embedding", "v.patch_embd",
"model.vision_tower.vision_model.embeddings.position_embedding", "v.position_embd",
"model.vision_tower.vision_model.encoder.layers", "v.blk",
"self_attn.q_proj", "attn_q",
"self_attn.k_proj", "attn_k",
"self_attn.v_proj", "attn_v",
"self_attn.out_proj", "attn_out",
"layer_norm1", "ln1",
"layer_norm2", "ln2",
"mlp.fc1", "ffn_up",
"mlp.fc2", "ffn_down",
"model.vision_tower.vision_model.post_layernorm", "v.post_ln",
}
}
// repackPatchEmbeddingWeight converts a flattened patch embedding weight from
// HWC-interleaved input order (y, x, channel) to the CHW order (channel, y, x)
// expected by GGML.
func repackPatchEmbeddingWeight(data []float32, srcShape []uint64, channels, patch int) ([]float32, error) {
if len(srcShape) != 2 {
return nil, fmt.Errorf("invalid patch embedding shape rank: %d", len(srcShape))
}
outDim := int(srcShape[0])
flatInputDim := int(srcShape[1])
expectedInputDim := channels * patch * patch
if flatInputDim != expectedInputDim {
return nil, fmt.Errorf("invalid patch embedding input dim: got %d, want %d", flatInputDim, expectedInputDim)
}
expectedSize := outDim * flatInputDim
if len(data) != expectedSize {
return nil, fmt.Errorf("invalid patch embedding data size: got %d, want %d", len(data), expectedSize)
}
repacked := make([]float32, len(data))
perChannel := patch * patch
for o := range outDim {
inBase := o * flatInputDim
outBase := o * flatInputDim
for y := range patch {
for x := range patch {
inPixelBase := inBase + (y*patch+x)*channels
for c := range channels {
src := inPixelBase + c
dst := outBase + c*perChannel + y*patch + x
repacked[dst] = data[src]
}
}
}
}
return repacked, nil
}


@@ -0,0 +1,249 @@
package convert
import (
"slices"
"strings"
"testing"
)
func TestLFM2VLTextModelKVUsesTextConfig(t *testing.T) {
p := lfm2VLTextModel{
TextConfig: lfm2Model{
ModelParameters: ModelParameters{ModelType: "lfm2", VocabSize: 65536},
HiddenSize: 2048,
NumHiddenLayers: 16,
MaxPositionEmbeddings: 128000,
IntermediateSize: 12288,
BlockFFDim: 12288,
BlockAutoAdjustFFDim: true,
BlockMultipleOf: 256,
BlockFFNDimMultiplier: 1.0,
NumAttentionHeads: 32,
NumKeyValueHeads: 8,
LayerTypes: []string{"conv", "full_attention"},
NormEps: 1e-5,
ConvLCache: 3,
},
DownsampleFactor: 2,
VisionConfig: struct {
HiddenSize uint32 `json:"hidden_size"`
IntermediateSize uint32 `json:"intermediate_size"`
NumAttentionHeads uint32 `json:"num_attention_heads"`
NumHiddenLayers uint32 `json:"num_hidden_layers"`
NumChannels uint32 `json:"num_channels"`
PatchSize uint32 `json:"patch_size"`
LayerNormEpsilon float32 `json:"layer_norm_eps"`
}{
HiddenSize: 1152,
IntermediateSize: 4304,
NumAttentionHeads: 16,
NumHiddenLayers: 27,
NumChannels: 3,
PatchSize: 16,
LayerNormEpsilon: 1e-6,
},
}
p.Processor.ImageProcessor.TileSize = 512
p.Processor.ImageProcessor.ImageMean = []float32{0.5, 0.5, 0.5}
p.Processor.ImageProcessor.ImageStd = []float32{0.5, 0.5, 0.5}
kv := p.KV(&Tokenizer{
Vocabulary: &Vocabulary{
Model: "gpt2",
Tokens: []string{"<|pad|>", "<image>", "<|image_start|>", "<|image_end|>", "<|img_thumbnail|>"},
},
})
if got, want := kv["general.architecture"], "lfm2"; got != want {
t.Fatalf("general.architecture = %v, want %v", got, want)
}
if got, want := kv["feed_forward_length"], uint32(8192); got != want {
t.Fatalf("feed_forward_length = %v, want %v", got, want)
}
if got, want := kv["vision.block_count"], uint32(27); got != want {
t.Fatalf("vision.block_count = %v, want %v", got, want)
}
if got, want := kv["vision.image_size"], uint32(256); got != want {
t.Fatalf("vision.image_size = %v, want %v", got, want)
}
if got, want := kv["vision.image_token_id"], uint32(396); got != want {
t.Fatalf("vision.image_token_id = %v, want %v", got, want)
}
if got, want := kv["vision.image_start_token_id"], uint32(2); got != want {
t.Fatalf("vision.image_start_token_id = %v, want %v", got, want)
}
if got, want := kv["vision.do_image_splitting"], true; got != want {
t.Fatalf("vision.do_image_splitting = %v, want %v", got, want)
}
if got, want := kv["vision.min_tiles"], uint32(2); got != want {
t.Fatalf("vision.min_tiles = %v, want %v", got, want)
}
if got, want := kv["vision.max_tiles"], uint32(10); got != want {
t.Fatalf("vision.max_tiles = %v, want %v", got, want)
}
if got, want := kv["vision.tile_size"], uint32(512); got != want {
t.Fatalf("vision.tile_size = %v, want %v", got, want)
}
if got, want := kv["vision.use_thumbnail"], true; got != want {
t.Fatalf("vision.use_thumbnail = %v, want %v", got, want)
}
if got, want := kv["vision.use_image_special_tokens"], true; got != want {
t.Fatalf("vision.use_image_special_tokens = %v, want %v", got, want)
}
}
func TestLFM2VLTextModelTensorsIncludeVision(t *testing.T) {
p := lfm2VLTextModel{}
p.VisionConfig.PatchSize = 16
p.VisionConfig.NumChannels = 3
input := []Tensor{
newLFM2StubTensor("model.embed_tokens.weight", []uint64{65536, 2048}),
newLFM2StubTensor("model.layers.0.ffn_norm.weight", []uint64{2048}),
newLFM2StubTensor("v.patch_embd.weight", []uint64{1152, 768}),
newLFM2StubTensor("v.blk.0.attn_q.weight", []uint64{1152, 1152}),
newLFM2StubTensor("mm.1.weight", []uint64{2048, 4608}),
}
out := p.Tensors(input)
if len(out) == 0 {
t.Fatal("expected non-empty tensor list")
}
foundPatch := false
foundVision := false
for _, tns := range out {
if tns.Name == "v.patch_embd.weight" {
foundPatch = true
if !slices.Equal(tns.Shape, []uint64{1152, 3, 16, 16}) {
t.Fatalf("v.patch_embd.weight shape = %v, want [1152 3 16 16]", tns.Shape)
}
}
if strings.HasPrefix(tns.Name, "v.") || strings.HasPrefix(tns.Name, "mm.") {
foundVision = true
}
}
if !foundPatch {
t.Fatal("expected v.patch_embd.weight in output tensors")
}
if !foundVision {
t.Fatal("expected at least one vision/projector tensor in output")
}
}
func TestLFM2VLTextModelReplacements(t *testing.T) {
p := lfm2VLTextModel{}
r := strings.NewReplacer(p.Replacements()...)
tests := []struct {
name string
in string
want string
}{
{
name: "language_model_embed_tokens",
in: "model.language_model.embed_tokens.weight",
want: "token_embd.weight",
},
{
name: "language_model_layers",
in: "model.language_model.layers.2.self_attn.q_proj.weight",
want: "blk.2.attn_q.weight",
},
{
name: "nested_language_model_prefix",
in: "model.language_model.model.embedding_norm.weight",
want: "token_embd_norm.weight",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
if got := r.Replace(tt.in); got != tt.want {
t.Fatalf("replacement(%q) = %q, want %q", tt.in, got, tt.want)
}
})
}
}
func TestLFM2VLProjectorKV(t *testing.T) {
p := lfm2VLProjectorModel{
DownsampleFactor: 2,
ProjectorHiddenDim: 2048,
}
p.VisionModel.NumHiddenLayers = 27
p.VisionModel.HiddenSize = 1152
p.VisionModel.IntermediateSize = 4304
p.VisionModel.NumAttentionHeads = 16
p.VisionModel.PatchSize = 16
p.VisionModel.LayerNormEpsilon = 1e-6
p.Processor.ImageProcessor.TileSize = 512
p.Processor.ImageProcessor.ImageMean = []float32{0.5, 0.5, 0.5}
p.Processor.ImageProcessor.ImageStd = []float32{0.5, 0.5, 0.5}
kv := p.KV(nil)
if got, want := kv["general.architecture"], "clip"; got != want {
t.Fatalf("general.architecture = %v, want %v", got, want)
}
if got, want := kv["clip.projector_type"], "lfm2"; got != want {
t.Fatalf("clip.projector_type = %v, want %v", got, want)
}
if got, want := kv["clip.vision.image_size"], uint32(256); got != want {
t.Fatalf("clip.vision.image_size = %v, want %v", got, want)
}
}
func TestLFM2VLProjectorTensorsPatchReshape(t *testing.T) {
p := lfm2VLProjectorModel{}
p.VisionModel.NumChannels = 3
p.VisionModel.PatchSize = 16
input := []Tensor{
newLFM2StubTensor("v.patch_embd.weight", []uint64{1152, 768}),
newLFM2StubTensor("mm.1.weight", []uint64{2048, 4608}),
newLFM2StubTensor("model.embed_tokens.weight", []uint64{65536, 2048}),
}
out := p.Tensors(input)
if len(out) != 2 {
t.Fatalf("expected 2 tensors, got %d", len(out))
}
var patchShape []uint64
for _, tns := range out {
if tns.Name == "v.patch_embd.weight" {
patchShape = tns.Shape
break
}
}
if !slices.Equal(patchShape, []uint64{1152, 3, 16, 16}) {
t.Fatalf("v.patch_embd.weight shape = %v, want [1152 3 16 16]", patchShape)
}
}
func TestRepackPatchEmbeddingWeight(t *testing.T) {
data := []float32{
0, 1, // y=0,x=0
2, 3, // y=0,x=1
4, 5, // y=1,x=0
6, 7, // y=1,x=1
}
got, err := repackPatchEmbeddingWeight(data, []uint64{1, 8}, 2, 2)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
want := []float32{0, 2, 4, 6, 1, 3, 5, 7}
if !slices.Equal(got, want) {
t.Fatalf("repacked data = %v, want %v", got, want)
}
}


@@ -0,0 +1,385 @@
package convert
import (
"cmp"
"encoding/json"
"fmt"
"io/fs"
"math"
"slices"
"strings"
"github.com/ollama/ollama/fs/ggml"
)
// hybridPattern accepts hybrid_override_pattern as either a single string
// (e.g. "MEM*E") or an array of per-layer strings, normalizing both forms to
// one concatenated string.
type hybridPattern string
func (p *hybridPattern) UnmarshalJSON(data []byte) error {
if string(data) == "null" {
*p = ""
return nil
}
var single string
if err := json.Unmarshal(data, &single); err == nil {
*p = hybridPattern(strings.TrimSpace(single))
return nil
}
var parts []string
if err := json.Unmarshal(data, &parts); err == nil {
*p = hybridPattern(strings.Join(parts, ""))
return nil
}
return fmt.Errorf("hybrid_override_pattern must be a string or string array")
}
type nemotronHModel struct {
ModelParameters
MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
HiddenSize uint32 `json:"hidden_size"`
NumHiddenLayers uint32 `json:"num_hidden_layers"`
NumAttentionHeads uint32 `json:"num_attention_heads"`
NumKeyValueHeads uint32 `json:"num_key_value_heads"`
HeadDim uint32 `json:"head_dim"`
LayerNormEpsilon float32 `json:"layer_norm_epsilon"`
NormEpsilon float32 `json:"norm_eps"`
RopeTheta float32 `json:"rope_theta"`
PartialRotaryFactor float32 `json:"partial_rotary_factor"`
ConvKernel uint32 `json:"conv_kernel"`
SSMStateSize uint32 `json:"ssm_state_size"`
MambaNumHeads uint32 `json:"mamba_num_heads"`
MambaHeadDim uint32 `json:"mamba_head_dim"`
NGroups uint32 `json:"n_groups"`
IntermediateSize uint32 `json:"intermediate_size"`
HybridOverridePattern hybridPattern `json:"hybrid_override_pattern"`
// MoE
NumExperts uint32 `json:"num_experts"`
NumSharedExperts uint32 `json:"num_shared_experts"`
NRoutedExperts uint32 `json:"n_routed_experts"`
NSharedExperts uint32 `json:"n_shared_experts"`
NumExpertsPerTok uint32 `json:"num_experts_per_tok"`
MoEIntermediateSize uint32 `json:"moe_intermediate_size"`
MoESharedExpertIntermediate uint32 `json:"moe_shared_expert_intermediate_size"`
NormTopKProb bool `json:"norm_topk_prob"`
RoutedScalingFactor float32 `json:"routed_scaling_factor"`
ExpertGroupCount uint32 `json:"n_group"`
ExpertGroupUsedCount uint32 `json:"topk_group"`
}
var _ ModelConverter = (*nemotronHModel)(nil)
func (n *nemotronHModel) parseMore(_ fs.FS) error {
if n.NumHiddenLayers == 0 {
return fmt.Errorf("nemotron_h: num_hidden_layers must be set")
}
if n.HiddenSize == 0 {
return fmt.Errorf("nemotron_h: hidden_size must be set")
}
if n.NumAttentionHeads == 0 {
return fmt.Errorf("nemotron_h: num_attention_heads must be set")
}
if n.HeadDim == 0 {
if n.HiddenSize%n.NumAttentionHeads != 0 {
return fmt.Errorf("nemotron_h: hidden_size (%d) must be divisible by num_attention_heads (%d)", n.HiddenSize, n.NumAttentionHeads)
}
n.HeadDim = n.HiddenSize / n.NumAttentionHeads
}
if n.NumKeyValueHeads == 0 {
n.NumKeyValueHeads = n.NumAttentionHeads
}
if n.ConvKernel == 0 {
return fmt.Errorf("nemotron_h: conv_kernel must be set")
}
if n.SSMStateSize == 0 {
return fmt.Errorf("nemotron_h: ssm_state_size must be set")
}
if n.ssmHeadCount() == 0 {
return fmt.Errorf("nemotron_h: mamba_num_heads must be set")
}
if n.MambaHeadDim == 0 {
return fmt.Errorf("nemotron_h: mamba_head_dim must be set")
}
if n.NGroups == 0 {
n.NGroups = 1
}
if _, _, err := n.layerArrays(); err != nil {
return err
}
if n.isMoE() {
if n.routedExpertCount() == 0 {
return fmt.Errorf("nemotron_h: routed expert count must be set for MoE models")
}
if n.NumExpertsPerTok == 0 {
return fmt.Errorf("nemotron_h: num_experts_per_tok must be set for MoE models")
}
if n.NumExpertsPerTok > n.routedExpertCount() {
return fmt.Errorf("nemotron_h: num_experts_per_tok (%d) cannot exceed expert_count (%d)", n.NumExpertsPerTok, n.routedExpertCount())
}
if n.moeIntermediateSize() == 0 {
return fmt.Errorf("nemotron_h: moe_intermediate_size must be set for MoE models")
}
}
return nil
}
func (n *nemotronHModel) isMoE() bool {
return cmp.Or(n.routedExpertCount(), n.NumExpertsPerTok, n.MoEIntermediateSize) > 0
}
func (n *nemotronHModel) routedExpertCount() uint32 {
return cmp.Or(n.NRoutedExperts, n.NumExperts)
}
func (n *nemotronHModel) sharedExpertCount() uint32 {
return cmp.Or(n.NSharedExperts, n.NumSharedExperts)
}
func (n *nemotronHModel) ssmHeadCount() uint32 {
return n.MambaNumHeads
}
func (n *nemotronHModel) ssmInnerSize() uint32 {
return n.MambaHeadDim * n.ssmHeadCount()
}
func (n *nemotronHModel) epsilon() float32 {
return cmp.Or(n.NormEpsilon, n.LayerNormEpsilon, float32(1e-5))
}
func (n *nemotronHModel) moeIntermediateSize() uint32 {
return cmp.Or(n.MoEIntermediateSize, n.IntermediateSize)
}
func (n *nemotronHModel) denseIntermediateSize() uint32 {
return cmp.Or(n.IntermediateSize, n.MoEIntermediateSize)
}
// layerArrays expands hybrid_override_pattern into per-layer KV head counts
// and feed-forward lengths; recurrent ('M') layers get zeros in both arrays.
func (n *nemotronHModel) layerArrays() (headCountKV []uint32, ffnLengths []uint32, err error) {
pattern := strings.TrimSpace(string(n.HybridOverridePattern))
if pattern == "" {
return nil, nil, fmt.Errorf("nemotron_h: hybrid_override_pattern must be set")
}
runes := []rune(pattern)
if len(runes) != int(n.NumHiddenLayers) {
return nil, nil, fmt.Errorf("nemotron_h: hybrid_override_pattern length (%d) must match num_hidden_layers (%d)", len(runes), n.NumHiddenLayers)
}
headCountKV = make([]uint32, n.NumHiddenLayers)
ffnLengths = make([]uint32, n.NumHiddenLayers)
attnKVHeads := cmp.Or(n.NumKeyValueHeads, n.NumAttentionHeads)
moeFFN := n.moeIntermediateSize()
denseFFN := n.denseIntermediateSize()
for i, layerType := range runes {
switch layerType {
case 'M':
// Recurrent layer: no KV heads and no FFN.
case '*', 'A':
// Attention-only layer.
headCountKV[i] = attnKVHeads
case 'E':
// MoE layer.
if moeFFN == 0 {
return nil, nil, fmt.Errorf("nemotron_h: moe layer at index %d but moe_intermediate_size is zero", i)
}
ffnLengths[i] = moeFFN
case '-':
// Dense FFN layer.
if denseFFN == 0 {
return nil, nil, fmt.Errorf("nemotron_h: dense FFN layer at index %d but intermediate_size is zero", i)
}
ffnLengths[i] = denseFFN
default:
return nil, nil, fmt.Errorf("nemotron_h: unsupported layer type %q in hybrid_override_pattern at index %d", layerType, i)
}
}
return headCountKV, ffnLengths, nil
}
func (n *nemotronHModel) KV(t *Tokenizer) KV {
kv := n.ModelParameters.KV(t)
arch := "nemotron_h"
if n.isMoE() {
arch = "nemotron_h_moe"
}
kv["general.architecture"] = arch
kv["block_count"] = n.NumHiddenLayers
kv["context_length"] = n.MaxPositionEmbeddings
kv["embedding_length"] = n.HiddenSize
kv["attention.head_count"] = n.NumAttentionHeads
kv["attention.key_length"] = n.HeadDim
kv["attention.value_length"] = n.HeadDim
kv["attention.layer_norm_epsilon"] = n.epsilon()
kv["attention.layer_norm_rms_epsilon"] = n.epsilon()
kv["rope.freq_base"] = cmp.Or(n.RopeTheta, float32(10000))
if n.PartialRotaryFactor > 0 && n.PartialRotaryFactor <= 1 {
kv["rope.dimension_count"] = uint32(float32(n.HeadDim) * n.PartialRotaryFactor)
}
if headCountKV, ffnLengths, err := n.layerArrays(); err == nil {
kv["attention.head_count_kv"] = headCountKV
kv["feed_forward_length"] = ffnLengths
}
kv["ssm.conv_kernel"] = n.ConvKernel
kv["ssm.inner_size"] = n.ssmInnerSize()
kv["ssm.state_size"] = n.SSMStateSize
kv["ssm.group_count"] = n.NGroups
kv["ssm.time_step_rank"] = n.ssmHeadCount()
if n.isMoE() {
kv["expert_count"] = n.routedExpertCount()
kv["expert_used_count"] = n.NumExpertsPerTok
kv["expert_feed_forward_length"] = n.moeIntermediateSize()
if n.sharedExpertCount() > 0 {
kv["expert_shared_count"] = n.sharedExpertCount()
}
if n.MoESharedExpertIntermediate > 0 {
kv["expert_shared_feed_forward_length"] = n.MoESharedExpertIntermediate
}
kv["expert_weights_norm"] = n.NormTopKProb
kv["expert_weights_scale"] = n.RoutedScalingFactor
if n.ExpertGroupCount > 0 {
kv["expert_group_count"] = n.ExpertGroupCount
}
if n.ExpertGroupUsedCount > 0 {
kv["expert_group_used_count"] = n.ExpertGroupUsedCount
}
}
return kv
}
// normalizeVectorShapeToColumn reshapes 1-D vectors and degenerate row or
// column 2-D shapes into the canonical {n, 1} column form.
func normalizeVectorShapeToColumn(shape []uint64) []uint64 {
switch len(shape) {
case 1:
return []uint64{shape[0], 1}
case 2:
if shape[0] == 1 && shape[1] > 1 {
return []uint64{shape[1], 1}
}
if shape[1] == 1 && shape[0] > 1 {
return []uint64{shape[0], 1}
}
}
return slices.Clone(shape)
}
func (n *nemotronHModel) Tensors(ts []Tensor) []*ggml.Tensor {
var out []*ggml.Tensor
remaining := ts
if n.isMoE() {
merges := make([]merge, 0, n.NumHiddenLayers*2)
for i := range n.NumHiddenLayers {
merges = append(merges, merge{
fmt.Sprintf("blk.%d.mixer.experts.*.up_proj.weight", i),
fmt.Sprintf("blk.%d.ffn_up_exps.weight", i),
}, merge{
fmt.Sprintf("blk.%d.mixer.experts.*.down_proj.weight", i),
fmt.Sprintf("blk.%d.ffn_down_exps.weight", i),
})
}
merged, rest := mergeTensors(ts, merges...)
out = append(out, merged...)
remaining = rest
}
nGroups := uint64(cmp.Or(n.NGroups, uint32(1)))
for _, t := range remaining {
name := t.Name()
shape := slices.Clone(t.Shape())
switch {
case strings.HasSuffix(name, ".ssm_a"):
// Store A = -exp(A_log) so inference can use the decay value directly.
shape = normalizeVectorShapeToColumn(shape)
t.SetRepacker(func(_ string, data []float32, _ []uint64) ([]float32, error) {
out := make([]float32, len(data))
for i, v := range data {
out[i] = -float32(math.Exp(float64(v)))
}
return out, nil
})
case strings.HasSuffix(name, ".ssm_d"):
shape = normalizeVectorShapeToColumn(shape)
case strings.HasSuffix(name, ".ssm_norm.weight"):
switch len(shape) {
case 1:
if nGroups > 0 && shape[0]%nGroups == 0 {
shape = []uint64{nGroups, shape[0] / nGroups}
}
case 2:
if shape[0] == 1 && nGroups > 0 && shape[1]%nGroups == 0 {
shape = []uint64{nGroups, shape[1] / nGroups}
}
}
case strings.HasSuffix(name, ".ssm_conv1d.weight"):
// Squeeze the singleton dimension out of the 3-D conv weight.
if len(shape) == 3 {
if shape[0] == 1 {
shape = []uint64{shape[1], shape[2]}
} else if shape[1] == 1 {
shape = []uint64{shape[0], shape[2]}
}
}
}
out = append(out, &ggml.Tensor{
Name: name,
Kind: t.Kind(),
Shape: shape,
WriterTo: t,
})
}
return out
}
func (n *nemotronHModel) Replacements() []string {
return []string{
// Embedding and output
"lm_head", "output",
"backbone.embeddings", "token_embd",
"backbone.norm_f", "output_norm",
"backbone.layers", "blk",
// Recurrent (Mamba2) tensors
"mixer.in_proj", "ssm_in",
"mixer.out_proj", "ssm_out",
"mixer.dt_bias", "ssm_dt.bias",
"mixer.A_log", "ssm_a",
"mixer.D", "ssm_d",
"mixer.conv1d", "ssm_conv1d",
"mixer.norm.weight", "ssm_norm.weight",
// Attention tensors
"mixer.q_proj", "attn_q",
"mixer.k_proj", "attn_k",
"mixer.v_proj", "attn_v",
"mixer.o_proj", "attn_output",
// FFN / MoE tensors
"mixer.gate.e_score_correction_bias", "exp_probs_b.bias",
"mixer.gate", "ffn_gate_inp",
"mixer.fc1_latent_proj", "ffn_latent_in",
"mixer.fc2_latent_proj", "ffn_latent_out",
"mixer.shared_experts.up_proj", "ffn_up_shexp",
"mixer.shared_experts.down_proj", "ffn_down_shexp",
"mixer.up_proj", "ffn_up",
"mixer.down_proj", "ffn_down",
// Per-layer pre-norm
".norm.weight", ".attn_norm.weight",
}
}


@@ -0,0 +1,230 @@
package convert
import (
"bytes"
"encoding/binary"
"encoding/json"
"io"
"os"
"path/filepath"
"slices"
"strings"
"testing"
)
func TestHybridPatternUnmarshal(t *testing.T) {
t.Run("string", func(t *testing.T) {
var p hybridPattern
if err := json.Unmarshal([]byte(`"MEM*"`), &p); err != nil {
t.Fatal(err)
}
if got, want := string(p), "MEM*"; got != want {
t.Fatalf("unexpected pattern: got %q want %q", got, want)
}
})
t.Run("array", func(t *testing.T) {
var p hybridPattern
if err := json.Unmarshal([]byte(`["M","E","M","*"]`), &p); err != nil {
t.Fatal(err)
}
if got, want := string(p), "MEM*"; got != want {
t.Fatalf("unexpected pattern: got %q want %q", got, want)
}
})
}
func TestNemotronHLayerArrays(t *testing.T) {
m := &nemotronHModel{
NumHiddenLayers: 5,
NumAttentionHeads: 32,
NumKeyValueHeads: 8,
HybridOverridePattern: "MEM*E",
NRoutedExperts: 128,
NumExpertsPerTok: 6,
MoEIntermediateSize: 1856,
}
headsKV, ffn, err := m.layerArrays()
if err != nil {
t.Fatal(err)
}
if got, want := headsKV, []uint32{0, 0, 0, 8, 0}; !slices.Equal(got, want) {
t.Fatalf("unexpected head_count_kv: got %v want %v", got, want)
}
if got, want := ffn, []uint32{0, 1856, 0, 0, 1856}; !slices.Equal(got, want) {
t.Fatalf("unexpected feed_forward_length: got %v want %v", got, want)
}
}
func TestNemotronHKV(t *testing.T) {
m := &nemotronHModel{
MaxPositionEmbeddings: 1048576,
HiddenSize: 2688,
NumHiddenLayers: 5,
NumAttentionHeads: 32,
NumKeyValueHeads: 2,
HeadDim: 128,
LayerNormEpsilon: 1e-5,
RopeTheta: 10000,
PartialRotaryFactor: 0.5,
ConvKernel: 4,
SSMStateSize: 128,
MambaNumHeads: 64,
MambaHeadDim: 64,
NGroups: 8,
HybridOverridePattern: "MEM*E",
NRoutedExperts: 128,
NSharedExperts: 1,
NumExpertsPerTok: 6,
MoEIntermediateSize: 1856,
MoESharedExpertIntermediate: 3712,
NormTopKProb: true,
RoutedScalingFactor: 2.5,
}
if err := m.parseMore(nil); err != nil {
t.Fatal(err)
}
kv := m.KV(&Tokenizer{Vocabulary: &Vocabulary{}})
if got, want := kv["general.architecture"], "nemotron_h_moe"; got != want {
t.Fatalf("unexpected architecture: got %v want %v", got, want)
}
headCountKV, ok := kv["attention.head_count_kv"].([]uint32)
if !ok {
t.Fatalf("attention.head_count_kv has unexpected type: %T", kv["attention.head_count_kv"])
}
if got, want := headCountKV, []uint32{0, 0, 0, 2, 0}; !slices.Equal(got, want) {
t.Fatalf("unexpected attention.head_count_kv: got %v want %v", got, want)
}
ffnLength, ok := kv["feed_forward_length"].([]uint32)
if !ok {
t.Fatalf("feed_forward_length has unexpected type: %T", kv["feed_forward_length"])
}
if got, want := ffnLength, []uint32{0, 1856, 0, 0, 1856}; !slices.Equal(got, want) {
t.Fatalf("unexpected feed_forward_length: got %v want %v", got, want)
}
}
func TestNemotronHTensorsTransforms(t *testing.T) {
m := &nemotronHModel{NGroups: 8}
in := []Tensor{
&fakeTensor{
name: "blk.0.ssm_a",
shape: []uint64{4},
data: []float32{0, 1, 2, 3},
},
&fakeTensor{
name: "blk.0.ssm_d",
shape: []uint64{4},
data: []float32{0, 1, 2, 3},
},
&fakeTensor{
name: "blk.0.ssm_norm.weight",
shape: []uint64{16},
data: make([]float32, 16),
},
&fakeTensor{
name: "blk.0.ssm_conv1d.weight",
shape: []uint64{10, 1, 4},
data: make([]float32, 40),
},
}
out := m.Tensors(in)
if len(out) != len(in) {
t.Fatalf("unexpected output tensor count: got %d want %d", len(out), len(in))
}
got := map[string]struct {
shape []uint64
writer io.WriterTo
}{}
for _, t := range out {
got[t.Name] = struct {
shape []uint64
writer io.WriterTo
}{shape: t.Shape, writer: t.WriterTo}
}
if shape := got["blk.0.ssm_a"].shape; !slices.Equal(shape, []uint64{4, 1}) {
t.Fatalf("unexpected ssm_a shape: %v", shape)
}
if shape := got["blk.0.ssm_d"].shape; !slices.Equal(shape, []uint64{4, 1}) {
t.Fatalf("unexpected ssm_d shape: %v", shape)
}
if shape := got["blk.0.ssm_norm.weight"].shape; !slices.Equal(shape, []uint64{8, 2}) {
t.Fatalf("unexpected ssm_norm shape: %v", shape)
}
if shape := got["blk.0.ssm_conv1d.weight"].shape; !slices.Equal(shape, []uint64{10, 4}) {
t.Fatalf("unexpected ssm_conv1d shape: %v", shape)
}
var b bytes.Buffer
if _, err := got["blk.0.ssm_a"].writer.WriteTo(&b); err != nil {
t.Fatal(err)
}
values := make([]float32, 4)
if err := binary.Read(&b, binary.LittleEndian, &values); err != nil {
t.Fatal(err)
}
// 0 -> -exp(0) == -1
if values[0] != -1 {
t.Fatalf("unexpected transformed ssm_a[0]: got %v want -1", values[0])
}
}
func TestNemotronHLoadModelMetadata(t *testing.T) {
tempDir := t.TempDir()
config := `{
"architectures": ["NemotronHForCausalLM"],
"model_type": "nemotron_h",
"num_hidden_layers": 4,
"hidden_size": 512,
"max_position_embeddings": 32768,
"num_attention_heads": 8,
"num_key_value_heads": 2,
"head_dim": 64,
"layer_norm_epsilon": 1e-5,
"conv_kernel": 4,
"ssm_state_size": 128,
"mamba_num_heads": 16,
"mamba_head_dim": 32,
"n_groups": 8,
"hybrid_override_pattern": "ME*M",
"n_routed_experts": 16,
"num_experts_per_tok": 4,
"moe_intermediate_size": 256
}`
if err := os.WriteFile(filepath.Join(tempDir, "config.json"), []byte(config), 0o644); err != nil {
t.Fatal(err)
}
if err := os.WriteFile(filepath.Join(tempDir, "tokenizer.json"), []byte(`{}`), 0o644); err != nil {
t.Fatal(err)
}
kv, _, err := LoadModelMetadata(os.DirFS(tempDir))
if err != nil {
t.Fatal(err)
}
if _, ok := kv.(*nemotronHModel); !ok {
t.Fatalf("unexpected converter type: %T", kv)
}
}
func TestNemotronHReplacementsLatentProjections(t *testing.T) {
m := &nemotronHModel{}
r := strings.NewReplacer(m.Replacements()...)
if got, want := r.Replace("backbone.layers.1.mixer.fc1_latent_proj.weight"), "blk.1.ffn_latent_in.weight"; got != want {
t.Fatalf("unexpected fc1 replacement: got %q want %q", got, want)
}
if got, want := r.Replace("backbone.layers.1.mixer.fc2_latent_proj.weight"), "blk.1.ffn_latent_out.weight"; got != want {
t.Fatalf("unexpected fc2 replacement: got %q want %q", got, want)
}
}


@@ -1,6 +1,7 @@
package convert
import (
"encoding/json"
"fmt"
"io/fs"
"math"
@@ -13,8 +14,21 @@ import (
"github.com/ollama/ollama/fs/ggml"
)
type qwen3NextModel struct {
ModelParameters
type qwen3NextRopeScaling struct {
Type string `json:"type"`
Factor ropeFactor `json:"factor"`
MropeSection []int32 `json:"mrope_section"`
}
type qwen3NextRopeParams struct {
MRopeInterleaved bool `json:"mrope_interleaved"`
MropeSection []int32 `json:"mrope_section"`
RopeType string `json:"rope_type"`
RopeTheta float32 `json:"rope_theta"`
PartialRotaryFactor float32 `json:"partial_rotary_factor"`
}
type qwen3NextTextConfig struct {
MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
HiddenSize uint32 `json:"hidden_size"`
NumHiddenLayers uint32 `json:"num_hidden_layers"`
@@ -28,12 +42,13 @@ type qwen3NextModel struct {
// MoE config
NumExperts uint32 `json:"num_experts"`
NumExpertsPerToken uint32 `json:"num_experts_per_tok"`
NormTopkProb bool `json:"norm_topk_prob"`
NormTopkProb *bool `json:"norm_topk_prob"`
MoEIntermediateSize uint32 `json:"moe_intermediate_size"`
SharedExpertIntermSize uint32 `json:"shared_expert_intermediate_size"`
// Hybrid attention config
FullAttentionInterval uint32 `json:"full_attention_interval"`
FullAttentionInterval uint32 `json:"full_attention_interval"`
LayerTypes []string `json:"layer_types"`
// Linear attention (Gated Delta Net) config
LinearConvKernelDim uint32 `json:"linear_conv_kernel_dim"`
@@ -43,16 +58,102 @@ type qwen3NextModel struct {
LinearValueHeadDim uint32 `json:"linear_value_head_dim"`
// RoPE config
PartialRotaryFactor float32 `json:"partial_rotary_factor"`
RopeScaling struct {
Type string `json:"type"`
Factor ropeFactor `json:"factor"`
} `json:"rope_scaling"`
PartialRotaryFactor float32 `json:"partial_rotary_factor"`
RopeScaling qwen3NextRopeScaling `json:"rope_scaling"`
RopeParameters qwen3NextRopeParams `json:"rope_parameters"`
}
type qwen3NextVisionConfig struct {
Depth uint32 `json:"depth"`
HiddenSize uint32 `json:"hidden_size"`
NumHeads uint32 `json:"num_heads"`
InChannels uint32 `json:"in_channels"`
PatchSize uint32 `json:"patch_size"`
SpatialMergeSize uint32 `json:"spatial_merge_size"`
RMSNormEps float32 `json:"layer_norm_epsilon"`
RopeTheta float32 `json:"rope_theta"`
TemporalPatchSize uint32 `json:"temporal_patch_size"`
DeepstackVisualIndexes []int32 `json:"deepstack_visual_indexes"`
Size struct {
ShortestEdge uint32 `json:"shortest_edge"`
LongestEdge uint32 `json:"longest_edge"`
} `json:"size"`
ImageMean []float32 `json:"image_mean"`
ImageStd []float32 `json:"image_std"`
}
type qwen3NextModel struct {
ModelParameters
qwen3NextTextConfig
TextConfig *qwen3NextTextConfig `json:"text_config"`
VisionModel qwen3NextVisionConfig `json:"vision_config"`
ImageTokenID uint32 `json:"image_token_id"`
VisionStartTokenID uint32 `json:"vision_start_token_id"`
VisionEndTokenID uint32 `json:"vision_end_token_id"`
}
var _ ModelConverter = (*qwen3NextModel)(nil)
func (q *qwen3NextModel) parseMore(_ fs.FS) error {
func (q *qwen3NextModel) parseMore(fsys fs.FS) error {
if q.TextConfig != nil {
q.qwen3NextTextConfig = *q.TextConfig
}
if q.RopeTheta == 0 {
q.RopeTheta = q.RopeParameters.RopeTheta
}
if q.PartialRotaryFactor == 0 {
q.PartialRotaryFactor = q.RopeParameters.PartialRotaryFactor
}
if q.RopeScaling.Type == "" && q.RopeParameters.RopeType != "" {
q.RopeScaling.Type = q.RopeParameters.RopeType
}
// Pull vision preprocessing fields when present.
if q.VisionModel.Depth > 0 {
if bts, err := fs.ReadFile(fsys, "preprocessor_config.json"); err == nil {
var pre struct {
Size struct {
ShortestEdge uint32 `json:"shortest_edge"`
LongestEdge uint32 `json:"longest_edge"`
} `json:"size"`
PatchSize uint32 `json:"patch_size"`
TemporalPatchSize uint32 `json:"temporal_patch_size"`
MergeSize uint32 `json:"merge_size"`
ImageMean []float32 `json:"image_mean"`
ImageStd []float32 `json:"image_std"`
}
if json.Unmarshal(bts, &pre) == nil {
if q.VisionModel.PatchSize == 0 {
q.VisionModel.PatchSize = pre.PatchSize
}
if q.VisionModel.TemporalPatchSize == 0 {
q.VisionModel.TemporalPatchSize = pre.TemporalPatchSize
}
if q.VisionModel.SpatialMergeSize == 0 {
q.VisionModel.SpatialMergeSize = pre.MergeSize
}
if q.VisionModel.Size.ShortestEdge == 0 {
q.VisionModel.Size.ShortestEdge = pre.Size.ShortestEdge
}
if q.VisionModel.Size.LongestEdge == 0 {
q.VisionModel.Size.LongestEdge = pre.Size.LongestEdge
}
if len(q.VisionModel.ImageMean) == 0 {
q.VisionModel.ImageMean = pre.ImageMean
}
if len(q.VisionModel.ImageStd) == 0 {
q.VisionModel.ImageStd = pre.ImageStd
}
}
}
}
if q.NumHiddenLayers == 0 {
return fmt.Errorf("qwen3next: num_hidden_layers must be set")
}
@@ -74,36 +175,96 @@ func (q *qwen3NextModel) parseMore(_ fs.FS) error {
if q.LinearNumKeyHeads == 0 || q.LinearNumValueHeads == 0 || q.LinearKeyHeadDim == 0 || q.LinearValueHeadDim == 0 {
return fmt.Errorf("qwen3next: linear attention config must be set (linear_num_key_heads, linear_num_value_heads, linear_key_head_dim, linear_value_head_dim)")
}
if q.FullAttentionInterval == 0 {
return fmt.Errorf("qwen3next: full_attention_interval must be set")
}
if q.FullAttentionInterval > q.NumHiddenLayers {
return fmt.Errorf("qwen3next: full_attention_interval (%d) exceeds num_hidden_layers (%d)", q.FullAttentionInterval, q.NumHiddenLayers)
}
hasFull := false
for i := range q.NumHiddenLayers {
if (i+1)%q.FullAttentionInterval == 0 {
hasFull = true
break
}
}
if !hasFull {
return fmt.Errorf("qwen3next: head_count_kv would be all zeros (full_attention_interval=%d, num_hidden_layers=%d)", q.FullAttentionInterval, q.NumHiddenLayers)
if _, err := q.kvHeadCounts(); err != nil {
return err
}
return nil
}
func (q *qwen3NextModel) kvHeadCounts() ([]uint32, error) {
if len(q.LayerTypes) > 0 {
kv := make([]uint32, q.NumHiddenLayers)
hasFull := false
hasRecurrent := false
for i := range q.NumHiddenLayers {
layerType := ""
if i < uint32(len(q.LayerTypes)) {
layerType = q.LayerTypes[i]
}
if layerType == "full_attention" {
kv[i] = q.NumKeyValueHeads
hasFull = true
} else {
hasRecurrent = true
}
}
if !hasFull || !hasRecurrent {
return nil, fmt.Errorf("qwen3next: layer_types must include both full_attention and linear_attention")
}
return kv, nil
}
if q.FullAttentionInterval == 0 {
return nil, fmt.Errorf("qwen3next: full_attention_interval must be set")
}
if q.FullAttentionInterval > q.NumHiddenLayers {
return nil, fmt.Errorf("qwen3next: full_attention_interval (%d) exceeds num_hidden_layers (%d)", q.FullAttentionInterval, q.NumHiddenLayers)
}
kv := make([]uint32, q.NumHiddenLayers)
hasFull := false
for i := range q.NumHiddenLayers {
if (i+1)%q.FullAttentionInterval == 0 {
kv[i] = q.NumKeyValueHeads
hasFull = true
}
}
if !hasFull {
return nil, fmt.Errorf("qwen3next: head_count_kv would be all zeros (full_attention_interval=%d, num_hidden_layers=%d)", q.FullAttentionInterval, q.NumHiddenLayers)
}
return kv, nil
}
func (q *qwen3NextModel) ropeSections() []int32 {
if len(q.RopeParameters.MropeSection) > 0 {
return q.RopeParameters.MropeSection
}
return q.RopeScaling.MropeSection
}
func (q *qwen3NextModel) shouldReorderVHeads() bool {
modelType := strings.ToLower(q.ModelType)
if strings.Contains(modelType, "qwen3_next") || strings.Contains(modelType, "qwen3next") {
return false
}
for _, arch := range q.Architectures {
arch = strings.ToLower(arch)
if strings.Contains(arch, "qwen3next") || strings.Contains(arch, "qwen3_next") {
return false
}
}
// Default to qwen3.5 layout for all other qwen3next-family imports.
return true
}
func (q *qwen3NextModel) KV(t *Tokenizer) KV {
kv := q.ModelParameters.KV(t)
kv["general.architecture"] = "qwen3next"
kv["tokenizer.ggml.pre"] = "qwen2"
arch := "qwen35"
if q.NumExperts > 0 {
arch = "qwen35moe"
}
kv["general.architecture"] = arch
kv["tokenizer.ggml.pre"] = "qwen35"
kv["block_count"] = q.NumHiddenLayers
kv["context_length"] = q.MaxPositionEmbeddings
kv["embedding_length"] = q.HiddenSize
kv["feed_forward_length"] = q.IntermediateSize
kv["attention.head_count"] = q.NumAttentionHeads
headDim := q.HeadDim
if headDim == 0 && q.NumAttentionHeads > 0 {
headDim = q.HiddenSize / q.NumAttentionHeads
@@ -113,18 +274,31 @@ func (q *qwen3NextModel) KV(t *Tokenizer) KV {
kv["attention.layer_norm_rms_epsilon"] = q.RMSNormEPS
kv["rope.freq_base"] = q.RopeTheta
// RoPE dimension count (partial rotary)
// partial_rotary_factor = 0.25 means only 25% of head_dim uses RoPE
partialRotary := q.PartialRotaryFactor
if partialRotary > 0 && partialRotary <= 1 {
kv["rope.dimension_count"] = uint32(float32(headDim) * partialRotary)
}
// MoE config
if sections := q.ropeSections(); len(sections) > 0 {
kv["mrope_sections"] = sections
kv["rope.mrope_section"] = sections
kv["rope.dimension_sections"] = sections
}
if q.RopeParameters.MRopeInterleaved {
kv["rope.mrope_interleaved"] = true
}
if q.RopeScaling.Type != "" && q.RopeScaling.Type != "default" {
kv["rope.scaling.type"] = q.RopeScaling.Type
kv["rope.scaling.factor"] = q.RopeScaling.Factor
}
if q.NumExperts > 0 {
kv["expert_count"] = q.NumExperts
kv["expert_used_count"] = q.NumExpertsPerToken
kv["norm_top_k_prob"] = q.NormTopkProb
if q.NormTopkProb != nil {
kv["norm_top_k_prob"] = *q.NormTopkProb
}
if q.MoEIntermediateSize > 0 {
kv["expert_feed_forward_length"] = q.MoEIntermediateSize
}
@@ -133,33 +307,66 @@ func (q *qwen3NextModel) KV(t *Tokenizer) KV {
}
}
// SSM/Linear attention config
// d_inner = linear_value_head_dim * linear_num_value_heads
dInner := q.LinearValueHeadDim * q.LinearNumValueHeads
kv["ssm.inner_size"] = dInner
kv["ssm.state_size"] = q.LinearKeyHeadDim // head_k_dim
kv["ssm.group_count"] = q.LinearNumKeyHeads // num_k_heads
kv["ssm.time_step_rank"] = q.LinearNumValueHeads // num_v_heads
kv["ssm.state_size"] = q.LinearKeyHeadDim
kv["ssm.group_count"] = q.LinearNumKeyHeads
kv["ssm.time_step_rank"] = q.LinearNumValueHeads
kv["ssm.conv_kernel"] = q.LinearConvKernelDim
interval := q.FullAttentionInterval
kv["full_attention_interval"] = interval
// Build per-layer KV head count array to identify layer types
// 0 = recurrent (linear attention), non-zero = full attention
kvHeadCounts := make([]uint32, q.NumHiddenLayers)
for i := range q.NumHiddenLayers {
// Full attention every full_attention_interval layers (starting at interval-1)
if interval > 0 && (i+1)%interval == 0 {
kvHeadCounts[i] = q.NumKeyValueHeads
}
// else stays 0 (recurrent layer)
if q.shouldReorderVHeads() {
kv["ssm.v_head_reordered"] = true
}
if q.FullAttentionInterval > 0 {
kv["full_attention_interval"] = q.FullAttentionInterval
}
kv["attention.head_count_kv"] = kvHeadCounts
// RoPE scaling
if q.RopeScaling.Type != "" {
kv["rope.scaling.type"] = q.RopeScaling.Type
kv["rope.scaling.factor"] = q.RopeScaling.Factor
if headCounts, err := q.kvHeadCounts(); err == nil {
kv["attention.head_count_kv"] = headCounts
}
if q.VisionModel.Depth > 0 {
kv["vision.block_count"] = q.VisionModel.Depth
kv["vision.embedding_length"] = q.VisionModel.HiddenSize
kv["vision.attention.head_count"] = q.VisionModel.NumHeads
kv["vision.num_channels"] = q.VisionModel.InChannels
if q.VisionModel.PatchSize > 0 {
kv["vision.patch_size"] = q.VisionModel.PatchSize
}
if q.VisionModel.SpatialMergeSize > 0 {
kv["vision.spatial_merge_size"] = q.VisionModel.SpatialMergeSize
}
if q.VisionModel.RMSNormEps > 0 {
kv["vision.attention.layer_norm_epsilon"] = q.VisionModel.RMSNormEps
}
if q.VisionModel.RopeTheta > 0 {
kv["vision.rope.freq_base"] = q.VisionModel.RopeTheta
}
if q.VisionModel.TemporalPatchSize > 0 {
kv["vision.temporal_patch_size"] = q.VisionModel.TemporalPatchSize
}
kv["vision.deepstack_visual_indexes"] = q.VisionModel.DeepstackVisualIndexes
if q.VisionModel.Size.ShortestEdge > 0 {
kv["vision.shortest_edge"] = q.VisionModel.Size.ShortestEdge
}
if q.VisionModel.Size.LongestEdge > 0 {
kv["vision.longest_edge"] = q.VisionModel.Size.LongestEdge
}
if len(q.VisionModel.ImageMean) > 0 {
kv["vision.image_mean"] = q.VisionModel.ImageMean
}
if len(q.VisionModel.ImageStd) > 0 {
kv["vision.image_std"] = q.VisionModel.ImageStd
}
}
if q.ImageTokenID > 0 {
kv["image_token_id"] = q.ImageTokenID
}
if q.VisionStartTokenID > 0 {
kv["vision_start_token_id"] = q.VisionStartTokenID
}
if q.VisionEndTokenID > 0 {
kv["vision_end_token_id"] = q.VisionEndTokenID
}
return kv
@@ -168,7 +375,6 @@ func (q *qwen3NextModel) KV(t *Tokenizer) KV {
func (q *qwen3NextModel) Tensors(ts []Tensor) []*ggml.Tensor {
var out []*ggml.Tensor
// Create merges for expert tensors - stack individual experts into batched tensors
merges := make([]merge, q.NumHiddenLayers*3)
for i := range q.NumHiddenLayers {
merges[i*3+0] = merge{
@@ -185,16 +391,13 @@ func (q *qwen3NextModel) Tensors(ts []Tensor) []*ggml.Tensor {
}
}
// Merge expert tensors
merged, remaining := mergeTensors(ts, merges...)
out = append(out, merged...)
// Process remaining tensors
for _, t := range remaining {
name := t.Name()
shape := t.Shape()
// Split linear_attn.in_proj_qkvz (ssm_in) into attn_qkv + attn_gate when possible
if strings.HasSuffix(name, ".ssm_in.weight") {
if qkv, gate, ok := q.splitQKVZTensor(t); ok {
out = append(out, qkv, gate)
@@ -204,84 +407,299 @@ func (q *qwen3NextModel) Tensors(ts []Tensor) []*ggml.Tensor {
}
switch {
// Add 1 to norm weights (except ssm_norm which is linear_attn.norm)
// This matches the Python converter behavior for qwen3next
case strings.Contains(name, ".mlp.experts.gate_up_proj"):
out = append(out, slices.Collect(splitDim(t, 1,
split{Replacer: strings.NewReplacer(".mlp.experts.gate_up_proj", ".ffn_gate_exps.weight")},
split{Replacer: strings.NewReplacer(".mlp.experts.gate_up_proj", ".ffn_up_exps.weight")},
))...)
case strings.Contains(name, ".mlp.experts.down_proj"):
out = append(out, &ggml.Tensor{
Name: strings.NewReplacer(".mlp.experts.down_proj", ".ffn_down_exps.weight").Replace(name),
Kind: t.Kind(),
Shape: slices.Clone(shape),
WriterTo: t,
})
case strings.HasPrefix(name, "v.blk.") && strings.Contains(name, ".attn_qkv"):
out = append(out, slices.Collect(splitDim(t, 0,
split{Replacer: strings.NewReplacer("attn_qkv", "attn_q")},
split{Replacer: strings.NewReplacer("attn_qkv", "attn_k")},
split{Replacer: strings.NewReplacer("attn_qkv", "attn_v")},
))...)
case strings.Contains(name, "patch_embed") && strings.HasSuffix(name, "weight"):
out = append(out, &ggml.Tensor{
Name: name,
Kind: t.Kind(),
Shape: append([]uint64{shape[0] * shape[1]}, shape[2:]...),
WriterTo: t,
})
case strings.HasSuffix(name, "_norm.weight") && !strings.HasSuffix(name, ".ssm_norm.weight"):
t.SetRepacker(q.addOne)
out = append(out, &ggml.Tensor{
Name: name,
Kind: t.Kind(),
Shape: slices.Clone(shape),
WriterTo: t,
})
out = append(out, &ggml.Tensor{Name: name, Kind: t.Kind(), Shape: slices.Clone(shape), WriterTo: t})
// Handle linear attention A_log -> ssm_a (negate and exp)
// Note: name has already been transformed by Replacements at this point
case strings.HasSuffix(name, ".ssm_a"):
t.SetRepacker(func(_ string, data []float32, shape []uint64) ([]float32, error) {
// Compute -exp(A_log)
result := make([]float32, len(data))
for i, v := range data {
// -exp(v)
result[i] = -float32(math.Exp(float64(v)))
}
return result, nil
})
out = append(out, &ggml.Tensor{
Name: name,
Kind: t.Kind(),
Shape: slices.Clone(shape),
WriterTo: t,
})
t.SetRepacker(q.repackSSMA())
out = append(out, &ggml.Tensor{Name: name, Kind: t.Kind(), Shape: slices.Clone(shape), WriterTo: t})
case strings.HasSuffix(name, ".attn_qkv.weight"):
if q.shouldReorderVHeads() {
t.SetRepacker(q.repackAttnQKV())
}
out = append(out, &ggml.Tensor{Name: name, Kind: t.Kind(), Shape: slices.Clone(shape), WriterTo: t})
case strings.HasSuffix(name, ".attn_gate.weight"):
if q.shouldReorderVHeads() {
// HF tensor layout is [out_features, in_features]; reorder rows.
t.SetRepacker(q.repackReorderDim(0, int(q.LinearValueHeadDim)))
}
out = append(out, &ggml.Tensor{Name: name, Kind: t.Kind(), Shape: slices.Clone(shape), WriterTo: t})
case strings.HasSuffix(name, ".ssm_beta.weight"), strings.HasSuffix(name, ".ssm_alpha.weight"):
if q.shouldReorderVHeads() {
// HF tensor layout is [out_features, in_features]; reorder rows.
t.SetRepacker(q.repackReorderDim(0, 1))
}
out = append(out, &ggml.Tensor{Name: name, Kind: t.Kind(), Shape: slices.Clone(shape), WriterTo: t})
case strings.HasSuffix(name, ".ssm_dt"):
if q.shouldReorderVHeads() {
t.SetRepacker(q.repackReorderDim(0, 1))
}
out = append(out, &ggml.Tensor{Name: name, Kind: t.Kind(), Shape: slices.Clone(shape), WriterTo: t})
case strings.HasSuffix(name, ".ssm_out.weight"):
if q.shouldReorderVHeads() {
// HF out_proj layout is [out_features, in_features]; reorder columns.
t.SetRepacker(q.repackReorderDim(1, int(q.LinearValueHeadDim)))
}
out = append(out, &ggml.Tensor{Name: name, Kind: t.Kind(), Shape: slices.Clone(shape), WriterTo: t})
// Squeeze conv1d weights: [1, D, K] or [D, 1, K] -> [D, K]
case strings.HasSuffix(name, ".ssm_conv1d.weight"):
newShape := slices.Clone(shape)
if len(shape) == 3 {
if shape[0] == 1 {
// [1, D, K] -> [D, K]
newShape = []uint64{shape[1], shape[2]}
} else if shape[1] == 1 {
// [D, 1, K] -> [D, K]
newShape = []uint64{shape[0], shape[2]}
}
}
out = append(out, &ggml.Tensor{
Name: name,
Kind: t.Kind(),
Shape: newShape,
WriterTo: t,
})
// Squeeze shared expert gate: [D, 1] or [1, D] -> [D]
case strings.HasSuffix(name, ".ffn_gate_inp_shexp.weight"):
newShape := slices.Clone(shape)
if len(shape) == 2 {
if shape[0] == 1 && shape[1] > 1 {
newShape = []uint64{shape[1]}
} else if shape[1] == 1 && shape[0] > 1 {
newShape = []uint64{shape[0]}
}
if q.shouldReorderVHeads() {
t.SetRepacker(q.repackConv1D())
}
out = append(out, &ggml.Tensor{
Name: name,
Kind: t.Kind(),
Shape: newShape,
WriterTo: t,
})
out = append(out, &ggml.Tensor{Name: name, Kind: t.Kind(), Shape: newShape, WriterTo: t})
default:
out = append(out, &ggml.Tensor{
Name: name,
Kind: t.Kind(),
Shape: slices.Clone(shape),
WriterTo: t,
})
out = append(out, &ggml.Tensor{Name: name, Kind: t.Kind(), Shape: slices.Clone(shape), WriterTo: t})
}
}
return out
}
func (q *qwen3NextModel) repackReorderDim(dim, headDim int) Repacker {
return func(_ string, data []float32, shape []uint64) ([]float32, error) {
if !q.shouldReorderVHeads() {
return data, nil
}
numK := int(q.LinearNumKeyHeads)
numVPerK := int(q.LinearNumValueHeads / q.LinearNumKeyHeads)
return reorderHeadLayout(data, shape, dim, numK, numVPerK, headDim)
}
}
func (q *qwen3NextModel) repackAttnQKV() Repacker {
return func(_ string, data []float32, shape []uint64) ([]float32, error) {
if !q.shouldReorderVHeads() || len(shape) != 2 {
return data, nil
}
rows := int(shape[0])
cols := int(shape[1])
numK := int(q.LinearNumKeyHeads)
numV := int(q.LinearNumValueHeads)
headK := int(q.LinearKeyHeadDim)
headV := int(q.LinearValueHeadDim)
qDim := headK * numK
kDim := headK * numK
vDim := headV * numV
qkvDim := qDim + kDim + vDim
switch {
case rows == qkvDim:
// HF layout: [out_features, in_features]. Keep Q/K rows unchanged and
// reorder only V rows from grouped -> tiled head layout.
out := make([]float32, len(data))
qkRows := qDim + kDim
qkSize := qkRows * cols
copy(out[:qkSize], data[:qkSize])
vStart := qkSize
vEnd := vStart + vDim*cols
reorderedV, err := reorderHeadLayout(data[vStart:vEnd], []uint64{uint64(vDim), uint64(cols)}, 0, numK, numV/numK, headV)
if err != nil {
return nil, err
}
copy(out[vStart:vEnd], reorderedV)
copy(out[vEnd:], data[vEnd:])
return out, nil
case cols == qkvDim:
// Fallback for already-transposed [in_features, out_features] tensors.
out := make([]float32, len(data))
copy(out, data)
for r := range rows {
base := r * cols
vStart := base + qDim + kDim
vEnd := vStart + vDim
reorderedV, err := reorderHeadLayout(out[vStart:vEnd], []uint64{uint64(vDim)}, 0, numK, numV/numK, headV)
if err != nil {
return nil, err
}
copy(out[vStart:vEnd], reorderedV)
}
return out, nil
default:
return data, nil
}
}
}
func (q *qwen3NextModel) repackConv1D() Repacker {
return func(_ string, data []float32, shape []uint64) ([]float32, error) {
if !q.shouldReorderVHeads() {
return data, nil
}
normShape := slices.Clone(shape)
if len(shape) == 3 {
if shape[0] == 1 {
normShape = []uint64{shape[1], shape[2]}
} else if shape[1] == 1 {
normShape = []uint64{shape[0], shape[2]}
}
}
if len(normShape) != 2 {
return data, nil
}
rows := int(normShape[0])
cols := int(normShape[1])
numK := int(q.LinearNumKeyHeads)
numV := int(q.LinearNumValueHeads)
headK := int(q.LinearKeyHeadDim)
headV := int(q.LinearValueHeadDim)
qkChannels := 2 * headK * numK
totalChannels := qkChannels + headV*numV
if qkChannels <= 0 {
return data, nil
}
switch {
case rows == totalChannels:
// HF layout after squeeze: [channels, kernel]
out := make([]float32, len(data))
prefix := qkChannels * cols
copy(out[:prefix], data[:prefix])
reorderedV, err := reorderHeadLayout(data[prefix:], []uint64{uint64(totalChannels - qkChannels), uint64(cols)}, 0, numK, numV/numK, headV)
if err != nil {
return nil, err
}
copy(out[prefix:], reorderedV)
return out, nil
case cols == totalChannels:
// Fallback for transposed [kernel, channels]
out := make([]float32, len(data))
copy(out, data)
vChannels := totalChannels - qkChannels
for r := range rows {
base := r * cols
vStart := base + qkChannels
vEnd := vStart + vChannels
reorderedV, err := reorderHeadLayout(out[vStart:vEnd], []uint64{uint64(vChannels)}, 0, numK, numV/numK, headV)
if err != nil {
return nil, err
}
copy(out[vStart:vEnd], reorderedV)
}
return out, nil
default:
return data, nil
}
}
}
func (q *qwen3NextModel) repackSSMA() Repacker {
return func(_ string, data []float32, shape []uint64) ([]float32, error) {
result := make([]float32, len(data))
for i, v := range data {
result[i] = -float32(math.Exp(float64(v)))
}
if !q.shouldReorderVHeads() {
return result, nil
}
numK := int(q.LinearNumKeyHeads)
numVPerK := int(q.LinearNumValueHeads / q.LinearNumKeyHeads)
return reorderHeadLayout(result, shape, 0, numK, numVPerK, 1)
}
}
func reorderHeadLayout(data []float32, shape []uint64, dim int, numKHeads, numVPerK, headDim int) ([]float32, error) {
if len(shape) == 0 || numKHeads <= 0 || numVPerK <= 0 || headDim <= 0 {
return data, nil
}
dims := make([]int, len(shape))
for i := range shape {
dims[i] = int(shape[i])
}
if dim < 0 {
dim += len(dims)
}
if dim < 0 || dim >= len(dims) {
return data, nil
}
expected := numKHeads * numVPerK * headDim
if dims[dim] != expected {
return data, nil
}
newShape := make([]int, 0, len(dims)+2)
newShape = append(newShape, dims[:dim]...)
newShape = append(newShape, numKHeads, numVPerK, headDim)
newShape = append(newShape, dims[dim+1:]...)
var tt tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
if err := tt.Reshape(newShape...); err != nil {
return nil, err
}
perm := make([]int, len(newShape))
for i := range perm {
perm[i] = i
}
perm[dim], perm[dim+1] = perm[dim+1], perm[dim]
tt, err := tensor.Transpose(tt, perm...)
if err != nil {
return nil, err
}
tt = tensor.Materialize(tt)
total := 1
for _, d := range dims {
total *= d
}
if err := tt.Reshape(total); err != nil {
return nil, err
}
return native.VectorF32(tt.(*tensor.Dense))
}
type qkvzSplitSpec struct {
hidden int
headKDim int
@@ -369,7 +787,6 @@ func (q *qwen3NextModel) repackQKVZ(spec qkvzSplitSpec, extractGate bool) Repack
var tt tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
var err error
// Convert to [hidden, out_features] layout for slicing
tt, err = tensor.Transpose(tt, 1, 0)
if err != nil {
return nil, err
@@ -444,7 +861,6 @@ func (q *qwen3NextModel) repackQKVZ(spec qkvzSplitSpec, extractGate bool) Repack
}
}
// addOne adds 1.0 to all elements in the tensor (for norm weights)
func (*qwen3NextModel) addOne(_ string, data []float32, shape []uint64) ([]float32, error) {
n := tensor.New(tensor.WithShape(int(shape[0])), tensor.WithBacking(data))
ones := tensor.Ones(tensor.Float32, int(shape[0]))
@@ -471,10 +887,21 @@ func (q *qwen3NextModel) Replacements() []string {
return []string{
// Embeddings and output
"lm_head", "output",
"model.language_model.embed_tokens", "token_embd",
"model.language_model.norm", "output_norm",
"model.language_model.layers", "blk",
"model.embed_tokens", "token_embd",
"model.norm", "output_norm",
"model.layers", "blk",
// Vision
"model.visual", "v",
"patch_embed.proj", "patch_embed",
"blocks", "blk",
"attn.qkv", "attn_qkv",
"attn.proj", "attn_out",
"deepstack_merger_list", "deepstack_merger",
// Layer norms
"input_layernorm", "attn_norm",
"post_attention_layernorm", "post_attention_norm",
@@ -487,9 +914,16 @@ func (q *qwen3NextModel) Replacements() []string {
"self_attn.v_proj", "attn_v",
"self_attn.o_proj", "attn_output",
// Linear attention (Gated Delta Net)
// Linear attention (legacy qwen3next)
"linear_attn.in_proj_qkvz", "ssm_in",
"linear_attn.in_proj_ba", "ssm_ba",
// Linear attention (qwen35)
"linear_attn.in_proj_qkv", "attn_qkv",
"linear_attn.in_proj_z", "attn_gate",
"linear_attn.in_proj_a", "ssm_alpha",
"linear_attn.in_proj_b", "ssm_beta",
"linear_attn.conv1d", "ssm_conv1d",
"linear_attn.dt_bias", "ssm_dt",
"linear_attn.dt_proj", "ssm_dt",
@@ -497,14 +931,14 @@ func (q *qwen3NextModel) Replacements() []string {
"linear_attn.norm", "ssm_norm",
"linear_attn.out_proj", "ssm_out",
// MoE (experts are stacked via mergeTensors, not replaced here)
// MoE
"mlp.gate.weight", "ffn_gate_inp.weight",
"mlp.shared_expert.down_proj", "ffn_down_shexp",
"mlp.shared_expert.gate_proj", "ffn_gate_shexp",
"mlp.shared_expert.up_proj", "ffn_up_shexp",
"mlp.shared_expert_gate", "ffn_gate_inp_shexp",
// Dense FFN (if any layers use it)
// Dense FFN
"mlp.down_proj", "ffn_down",
"mlp.gate_proj", "ffn_gate",
"mlp.up_proj", "ffn_up",


@@ -0,0 +1,563 @@
package convert
import (
"bytes"
"encoding/binary"
"os"
"slices"
"strings"
"testing"
"github.com/ollama/ollama/fs/ggml"
)
func boolPtr(v bool) *bool {
return &v
}
func readTensorData(t *testing.T, tensor *ggml.Tensor) []float32 {
t.Helper()
var b bytes.Buffer
if _, err := tensor.WriteTo(&b); err != nil {
t.Fatal(err)
}
numel := 1
for _, d := range tensor.Shape {
numel *= int(d)
}
values := make([]float32, numel)
if err := binary.Read(&b, binary.LittleEndian, &values); err != nil {
t.Fatal(err)
}
return values
}
func TestQwen3NextLegacyModelTypeDisablesReorder(t *testing.T) {
m := &qwen3NextModel{
ModelParameters: ModelParameters{
ModelType: "qwen3_next",
},
}
if m.shouldReorderVHeads() {
t.Fatalf("legacy qwen3_next model_type should not reorder v-head layout")
}
}
func TestQwen3NextLegacyArchitectureDisablesReorder(t *testing.T) {
m := &qwen3NextModel{
ModelParameters: ModelParameters{
Architectures: []string{"Qwen3NextForCausalLM"},
},
}
if m.shouldReorderVHeads() {
t.Fatalf("legacy Qwen3Next architecture should not reorder v-head layout")
}
}
func TestQwen3NextKVLegacyConfig(t *testing.T) {
m := &qwen3NextModel{
ModelParameters: ModelParameters{
ModelType: "qwen3_next",
},
qwen3NextTextConfig: qwen3NextTextConfig{
MaxPositionEmbeddings: 8192,
HiddenSize: 512,
NumHiddenLayers: 4,
IntermediateSize: 2048,
NumAttentionHeads: 8,
NumKeyValueHeads: 2,
HeadDim: 64,
RopeTheta: 1_000_000,
RMSNormEPS: 1e-6,
NumExperts: 8,
NumExpertsPerToken: 2,
NormTopkProb: boolPtr(true),
MoEIntermediateSize: 256,
SharedExpertIntermSize: 512,
FullAttentionInterval: 2,
LinearConvKernelDim: 4,
LinearKeyHeadDim: 64,
LinearNumKeyHeads: 2,
LinearNumValueHeads: 4,
LinearValueHeadDim: 64,
PartialRotaryFactor: 0.25,
},
}
if err := m.parseMore(os.DirFS(t.TempDir())); err != nil {
t.Fatal(err)
}
kv := m.KV(&Tokenizer{Vocabulary: &Vocabulary{}})
if got, want := kv["general.architecture"], "qwen35moe"; got != want {
t.Fatalf("unexpected architecture: got %v want %v", got, want)
}
if got, want := kv["tokenizer.ggml.pre"], "qwen35"; got != want {
t.Fatalf("unexpected tokenizer pre: got %v want %v", got, want)
}
headCountKV, ok := kv["attention.head_count_kv"].([]uint32)
if !ok {
t.Fatalf("attention.head_count_kv has unexpected type: %T", kv["attention.head_count_kv"])
}
if got, want := headCountKV, []uint32{0, 2, 0, 2}; !slices.Equal(got, want) {
t.Fatalf("unexpected attention.head_count_kv: got %v want %v", got, want)
}
if _, ok := kv["ssm.v_head_reordered"]; ok {
t.Fatalf("legacy qwen3next should not enable ssm.v_head_reordered")
}
if got, want := kv["norm_top_k_prob"], true; got != want {
t.Fatalf("unexpected norm_top_k_prob: got %v want %v", got, want)
}
}
func TestQwen35MoeOmitsNormTopKProbWhenUnset(t *testing.T) {
m := &qwen3NextModel{
ModelParameters: ModelParameters{
ModelType: "qwen3_5",
},
qwen3NextTextConfig: qwen3NextTextConfig{
MaxPositionEmbeddings: 4096,
HiddenSize: 512,
NumHiddenLayers: 4,
IntermediateSize: 2048,
NumAttentionHeads: 8,
NumKeyValueHeads: 2,
HeadDim: 64,
RopeTheta: 1_000_000,
RMSNormEPS: 1e-6,
NumExperts: 8,
NumExpertsPerToken: 2,
FullAttentionInterval: 2,
LinearConvKernelDim: 4,
LinearKeyHeadDim: 64,
LinearNumKeyHeads: 2,
LinearNumValueHeads: 4,
LinearValueHeadDim: 64,
PartialRotaryFactor: 0.25,
},
}
if err := m.parseMore(os.DirFS(t.TempDir())); err != nil {
t.Fatal(err)
}
kv := m.KV(&Tokenizer{Vocabulary: &Vocabulary{}})
if _, ok := kv["norm_top_k_prob"]; ok {
t.Fatalf("expected norm_top_k_prob to be omitted when not set in config")
}
}
func TestQwen35KVFromTextConfig(t *testing.T) {
m := &qwen3NextModel{
ModelParameters: ModelParameters{
ModelType: "qwen3_5",
},
TextConfig: &qwen3NextTextConfig{
MaxPositionEmbeddings: 16384,
HiddenSize: 1024,
NumHiddenLayers: 4,
IntermediateSize: 4096,
NumAttentionHeads: 8,
NumKeyValueHeads: 4,
HeadDim: 128,
RMSNormEPS: 1e-6,
LayerTypes: []string{
"linear_attention",
"full_attention",
"linear_attention",
"full_attention",
},
LinearConvKernelDim: 4,
LinearKeyHeadDim: 128,
LinearNumKeyHeads: 2,
LinearNumValueHeads: 4,
LinearValueHeadDim: 128,
RopeParameters: qwen3NextRopeParams{
MRopeInterleaved: true,
MropeSection: []int32{11, 11, 10},
RopeType: "default",
RopeTheta: 10_000_000,
PartialRotaryFactor: 0.25,
},
},
VisionModel: qwen3NextVisionConfig{
Depth: 2,
HiddenSize: 128,
NumHeads: 4,
InChannels: 3,
PatchSize: 16,
SpatialMergeSize: 2,
RMSNormEps: 1e-6,
RopeTheta: 10_000,
TemporalPatchSize: 2,
DeepstackVisualIndexes: []int32{1},
},
ImageTokenID: 1001,
VisionStartTokenID: 1002,
VisionEndTokenID: 1003,
}
m.VisionModel.Size.ShortestEdge = 224
m.VisionModel.Size.LongestEdge = 4096
m.VisionModel.ImageMean = []float32{0.5, 0.5, 0.5}
m.VisionModel.ImageStd = []float32{0.2, 0.2, 0.2}
if err := m.parseMore(os.DirFS(t.TempDir())); err != nil {
t.Fatal(err)
}
kv := m.KV(&Tokenizer{Vocabulary: &Vocabulary{}})
if got, want := kv["general.architecture"], "qwen35"; got != want {
t.Fatalf("unexpected architecture: got %v want %v", got, want)
}
headCountKV, ok := kv["attention.head_count_kv"].([]uint32)
if !ok {
t.Fatalf("attention.head_count_kv has unexpected type: %T", kv["attention.head_count_kv"])
}
if got, want := headCountKV, []uint32{0, 4, 0, 4}; !slices.Equal(got, want) {
t.Fatalf("unexpected attention.head_count_kv: got %v want %v", got, want)
}
if got, ok := kv["ssm.v_head_reordered"].(bool); !ok || !got {
t.Fatalf("expected ssm.v_head_reordered=true, got %v (%T)", kv["ssm.v_head_reordered"], kv["ssm.v_head_reordered"])
}
mrope, ok := kv["mrope_sections"].([]int32)
if !ok {
t.Fatalf("mrope_sections has unexpected type: %T", kv["mrope_sections"])
}
if got, want := mrope, []int32{11, 11, 10}; !slices.Equal(got, want) {
t.Fatalf("unexpected mrope_sections: got %v want %v", got, want)
}
ropeSections, ok := kv["rope.dimension_sections"].([]int32)
if !ok {
t.Fatalf("rope.dimension_sections has unexpected type: %T", kv["rope.dimension_sections"])
}
if got, want := ropeSections, []int32{11, 11, 10}; !slices.Equal(got, want) {
t.Fatalf("unexpected rope.dimension_sections: got %v want %v", got, want)
}
if got, ok := kv["rope.mrope_interleaved"].(bool); !ok || !got {
t.Fatalf("expected rope.mrope_interleaved=true, got %v (%T)", kv["rope.mrope_interleaved"], kv["rope.mrope_interleaved"])
}
if got, want := kv["vision.block_count"], uint32(2); got != want {
t.Fatalf("unexpected vision.block_count: got %v want %v", got, want)
}
}
func TestQwen3NextReplacements(t *testing.T) {
r := strings.NewReplacer((&qwen3NextModel{}).Replacements()...)
if got, want := r.Replace("model.language_model.layers.1.linear_attn.in_proj_qkv.weight"), "blk.1.attn_qkv.weight"; got != want {
t.Fatalf("unexpected language-model replacement: got %q want %q", got, want)
}
if got, want := r.Replace("model.visual.blocks.0.attn.qkv.weight"), "v.blk.0.attn_qkv.weight"; got != want {
t.Fatalf("unexpected vision replacement: got %q want %q", got, want)
}
if got, want := r.Replace("model.layers.1.linear_attn.in_proj_qkvz.weight"), "blk.1.ssm_in.weight"; got != want {
t.Fatalf("unexpected legacy replacement: got %q want %q", got, want)
}
}
func TestQwen35ReordersVHeads(t *testing.T) {
m := &qwen3NextModel{
ModelParameters: ModelParameters{
ModelType: "qwen3_5",
},
qwen3NextTextConfig: qwen3NextTextConfig{
LinearNumKeyHeads: 2,
LinearNumValueHeads: 4,
LinearValueHeadDim: 1,
},
}
out := m.Tensors([]Tensor{
&fakeTensor{
name: "blk.0.attn_gate.weight",
shape: []uint64{4, 2},
data: []float32{0, 1, 2, 3, 4, 5, 6, 7},
},
})
if len(out) != 1 {
t.Fatalf("unexpected output tensor count: got %d want 1", len(out))
}
if got, want := readTensorData(t, out[0]), []float32{0, 1, 4, 5, 2, 3, 6, 7}; !slices.Equal(got, want) {
t.Fatalf("unexpected data: got %v want %v", got, want)
}
}
func TestQwen35ReordersAttnQKVOutputDim(t *testing.T) {
m := &qwen3NextModel{
ModelParameters: ModelParameters{
ModelType: "qwen3_5",
},
qwen3NextTextConfig: qwen3NextTextConfig{
LinearNumKeyHeads: 2,
LinearNumValueHeads: 4,
LinearKeyHeadDim: 1,
LinearValueHeadDim: 1,
},
}
out := m.Tensors([]Tensor{
&fakeTensor{
name: "blk.0.attn_qkv.weight",
shape: []uint64{8, 2}, // [out_features, in_features] (HF layout)
data: []float32{
0, 1, // q0
2, 3, // q1
4, 5, // k0
6, 7, // k1
10, 11, // v(k0,v0)
12, 13, // v(k0,v1)
20, 21, // v(k1,v0)
22, 23, // v(k1,v1)
},
},
})
if len(out) != 1 {
t.Fatalf("unexpected output tensor count: got %d want 1", len(out))
}
if got, want := readTensorData(t, out[0]), []float32{
0, 1, 2, 3, 4, 5, 6, 7,
10, 11, 20, 21, 12, 13, 22, 23,
}; !slices.Equal(got, want) {
t.Fatalf("unexpected qkv data: got %v want %v", got, want)
}
}
func TestQwen35ReordersSsmOutInputDim(t *testing.T) {
m := &qwen3NextModel{
ModelParameters: ModelParameters{
ModelType: "qwen3_5",
},
qwen3NextTextConfig: qwen3NextTextConfig{
LinearNumKeyHeads: 2,
LinearNumValueHeads: 4,
LinearValueHeadDim: 1,
},
}
out := m.Tensors([]Tensor{
&fakeTensor{
name: "blk.0.ssm_out.weight",
shape: []uint64{2, 4},
data: []float32{0, 1, 2, 3, 4, 5, 6, 7},
},
})
if len(out) != 1 {
t.Fatalf("unexpected output tensor count: got %d want 1", len(out))
}
if got, want := readTensorData(t, out[0]), []float32{0, 2, 1, 3, 4, 6, 5, 7}; !slices.Equal(got, want) {
t.Fatalf("unexpected ssm_out data: got %v want %v", got, want)
}
}
func TestQwen35ReordersSsmBetaRows(t *testing.T) {
m := &qwen3NextModel{
ModelParameters: ModelParameters{
ModelType: "qwen3_5",
},
qwen3NextTextConfig: qwen3NextTextConfig{
LinearNumKeyHeads: 2,
LinearNumValueHeads: 4,
},
}
out := m.Tensors([]Tensor{
&fakeTensor{
name: "blk.0.ssm_beta.weight",
shape: []uint64{4, 2},
data: []float32{0, 1, 2, 3, 4, 5, 6, 7},
},
})
if len(out) != 1 {
t.Fatalf("unexpected output tensor count: got %d want 1", len(out))
}
if got, want := readTensorData(t, out[0]), []float32{0, 1, 4, 5, 2, 3, 6, 7}; !slices.Equal(got, want) {
t.Fatalf("unexpected ssm_beta data: got %v want %v", got, want)
}
}
func TestQwen35ReordersConv1DChannelDim(t *testing.T) {
m := &qwen3NextModel{
ModelParameters: ModelParameters{
ModelType: "qwen3_5",
},
qwen3NextTextConfig: qwen3NextTextConfig{
LinearNumKeyHeads: 2,
LinearNumValueHeads: 4,
LinearKeyHeadDim: 1,
LinearValueHeadDim: 1,
},
}
out := m.Tensors([]Tensor{
&fakeTensor{
name: "blk.0.ssm_conv1d.weight",
shape: []uint64{8, 2}, // [channels, kernel] after squeeze
data: []float32{
0, 1, // q0
2, 3, // q1
4, 5, // k0
6, 7, // k1
10, 11, // v(k0,v0)
12, 13, // v(k0,v1)
20, 21, // v(k1,v0)
22, 23, // v(k1,v1)
},
},
})
if len(out) != 1 {
t.Fatalf("unexpected output tensor count: got %d want 1", len(out))
}
if got, want := readTensorData(t, out[0]), []float32{
0, 1, 2, 3, 4, 5, 6, 7,
10, 11, 20, 21, 12, 13, 22, 23,
}; !slices.Equal(got, want) {
t.Fatalf("unexpected conv1d data: got %v want %v", got, want)
}
}
func TestLegacyQwen3NextDoesNotReorderVHeads(t *testing.T) {
m := &qwen3NextModel{
ModelParameters: ModelParameters{
ModelType: "qwen3_next",
},
qwen3NextTextConfig: qwen3NextTextConfig{
LinearNumKeyHeads: 2,
LinearNumValueHeads: 4,
LinearValueHeadDim: 1,
},
}
out := m.Tensors([]Tensor{
&fakeTensor{
name: "blk.0.attn_gate.weight",
shape: []uint64{4, 1},
data: []float32{0, 1, 2, 3},
},
})
if len(out) != 1 {
t.Fatalf("unexpected output tensor count: got %d want 1", len(out))
}
if got, want := readTensorData(t, out[0]), []float32{0, 1, 2, 3}; !slices.Equal(got, want) {
t.Fatalf("unexpected data for legacy qwen3next: got %v want %v", got, want)
}
}
func TestQwen35MoePackedExperts(t *testing.T) {
m := &qwen3NextModel{
qwen3NextTextConfig: qwen3NextTextConfig{
NumHiddenLayers: 1,
},
}
out := m.Tensors([]Tensor{
&fakeTensor{
name: "blk.0.mlp.experts.gate_up_proj",
shape: []uint64{2, 4, 3},
data: []float32{
0, 1, 2,
3, 4, 5,
6, 7, 8,
9, 10, 11,
12, 13, 14,
15, 16, 17,
18, 19, 20,
21, 22, 23,
},
},
&fakeTensor{
name: "blk.0.mlp.experts.down_proj",
shape: []uint64{2, 5, 3},
data: make([]float32, 2*5*3),
},
})
get := func(name string) *ggml.Tensor {
for _, tensor := range out {
if tensor.Name == name {
return tensor
}
}
return nil
}
gate := get("blk.0.ffn_gate_exps.weight")
if gate == nil {
t.Fatalf("missing tensor %q", "blk.0.ffn_gate_exps.weight")
}
if got, want := gate.Shape, []uint64{2, 2, 3}; !slices.Equal(got, want) {
t.Fatalf("unexpected gate shape: got %v want %v", got, want)
}
if got, want := readTensorData(t, gate), []float32{
0, 1, 2, 3, 4, 5,
12, 13, 14, 15, 16, 17,
}; !slices.Equal(got, want) {
t.Fatalf("unexpected gate values: got %v want %v", got, want)
}
up := get("blk.0.ffn_up_exps.weight")
if up == nil {
t.Fatalf("missing tensor %q", "blk.0.ffn_up_exps.weight")
}
if got, want := up.Shape, []uint64{2, 2, 3}; !slices.Equal(got, want) {
t.Fatalf("unexpected up shape: got %v want %v", got, want)
}
if got, want := readTensorData(t, up), []float32{
6, 7, 8, 9, 10, 11,
18, 19, 20, 21, 22, 23,
}; !slices.Equal(got, want) {
t.Fatalf("unexpected up values: got %v want %v", got, want)
}
down := get("blk.0.ffn_down_exps.weight")
if down == nil {
t.Fatalf("missing tensor %q", "blk.0.ffn_down_exps.weight")
}
if got, want := down.Shape, []uint64{2, 5, 3}; !slices.Equal(got, want) {
t.Fatalf("unexpected down shape: got %v want %v", got, want)
}
}
func TestQwen35SharedExpertGateKeepsMatrixShape(t *testing.T) {
m := &qwen3NextModel{}
out := m.Tensors([]Tensor{
&fakeTensor{
name: "blk.0.ffn_gate_inp_shexp.weight",
shape: []uint64{1, 4},
data: []float32{0, 1, 2, 3},
},
})
if len(out) != 1 {
t.Fatalf("unexpected output tensor count: got %d want 1", len(out))
}
if got, want := out[0].Shape, []uint64{1, 4}; !slices.Equal(got, want) {
t.Fatalf("unexpected shared gate shape: got %v want %v", got, want)
}
}

convert/json_compat.go Normal file

@@ -0,0 +1,97 @@
package convert
// sanitizeNonFiniteJSON rewrites non-standard JSON numeric tokens that some
// HF configs emit (Infinity, -Infinity, NaN) into standard JSON numbers.
//
// This is intentionally conservative:
// - only runs outside quoted strings
// - only rewrites full tokens
//
// We map these values to 0 because encoding/json rejects non-finite values,
// and these fields are typically model-side metadata not consumed by the
// converter.
func sanitizeNonFiniteJSON(in []byte) []byte {
if len(in) == 0 {
return in
}
out := make([]byte, 0, len(in))
inString := false
escape := false
for i := 0; i < len(in); {
c := in[i]
if inString {
out = append(out, c)
if escape {
escape = false
} else if c == '\\' {
escape = true
} else if c == '"' {
inString = false
}
i++
continue
}
if c == '"' {
inString = true
out = append(out, c)
i++
continue
}
if hasToken(in, i, "-Infinity") {
out = append(out, '0')
i += len("-Infinity")
continue
}
if hasToken(in, i, "Infinity") {
out = append(out, '0')
i += len("Infinity")
continue
}
if hasToken(in, i, "NaN") {
out = append(out, '0')
i += len("NaN")
continue
}
out = append(out, c)
i++
}
return out
}
func hasToken(in []byte, at int, tok string) bool {
end := at + len(tok)
if at < 0 || end > len(in) {
return false
}
if string(in[at:end]) != tok {
return false
}
if at > 0 && !isJSONValuePrefixBoundary(in[at-1]) {
return false
}
if end < len(in) && !isJSONValueSuffixBoundary(in[end]) {
return false
}
return true
}
func isJSONWhitespace(b byte) bool {
return b == ' ' || b == '\t' || b == '\n' || b == '\r'
}
func isJSONValuePrefixBoundary(b byte) bool {
return isJSONWhitespace(b) || b == ':' || b == ',' || b == '['
}
func isJSONValueSuffixBoundary(b byte) bool {
return isJSONWhitespace(b) || b == ',' || b == ']' || b == '}'
}


@@ -0,0 +1,46 @@
package convert
import "testing"
func TestSanitizeNonFiniteJSON(t *testing.T) {
tests := []struct {
name string
in string
want string
}{
{
name: "infinity token",
in: `{"a":[0,Infinity,1]}`,
want: `{"a":[0,0,1]}`,
},
{
name: "negative infinity token",
in: `{"a":-Infinity}`,
want: `{"a":0}`,
},
{
name: "nan token",
in: `{"a":NaN}`,
want: `{"a":0}`,
},
{
name: "tokens inside strings untouched",
in: `{"a":"Infinity -Infinity NaN","b":Infinity}`,
want: `{"a":"Infinity -Infinity NaN","b":0}`,
},
{
name: "identifier-like token untouched",
in: `{"a":InfinityValue}`,
want: `{"a":InfinityValue}`,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got := string(sanitizeNonFiniteJSON([]byte(tt.in)))
if got != tt.want {
t.Fatalf("sanitizeNonFiniteJSON() = %q, want %q", got, tt.want)
}
})
}
}


@@ -101,6 +101,8 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
t.Pre = "deepseek-coder"
case "1ff7f41064896984db5d1bb6ff64fa4bc29007d08c1b439e505b7392777a319e":
t.Pre = "qwen2"
case "00431aed57e696b747435f734d1e3b9b1bfd931a121fb5cac7129e97c181e9ba":
t.Pre = "qwen35"
case "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855":
// noop, empty pretokenizer
default:
@@ -212,8 +214,13 @@ type tokenizer struct {
PreTokenizer struct {
PreTokenizers []struct {
Type string `json:"type"`
Behavior string `json:"behavior"`
Invert bool `json:"invert"`
AddPrefixSpace bool `json:"add_prefix_space"`
TrimOffsets bool `json:"trim_offsets"`
UseRegex bool `json:"use_regex"`
Pattern struct {
Regex string `json:"Regex"`
} `json:"pattern"`
} `json:"pretokenizers"`


@@ -191,6 +191,84 @@ func TestParseTokenizer(t *testing.T) {
Pre: "default",
},
},
{
name: "llama-bpe pretokenizer and control tokens",
fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
"tokenizer.json": strings.NewReader(`{
"added_tokens": [
{"id": 1, "content": "<|startoftext|>", "special": true},
{"id": 6, "content": "<|im_start|>", "special": true},
{"id": 7, "content": "<|im_end|>", "special": true},
{"id": 8, "content": "<|tool_list_start|>", "special": true},
{"id": 9, "content": "<|tool_list_end|>", "special": true},
{"id": 10, "content": "<|tool_call_start|>", "special": true},
{"id": 11, "content": "<|tool_call_end|>", "special": true},
{"id": 12, "content": "<|tool_response_start|>", "special": true},
{"id": 13, "content": "<|tool_response_end|>", "special": true},
{"id": 396, "content": "<image>", "special": true},
{"id": 64400, "content": "<think>", "special": true},
{"id": 64401, "content": "</think>", "special": true}
],
"model": {
"vocab": {
"<|startoftext|>": 1,
"<|im_start|>": 6,
"<|im_end|>": 7,
"<|tool_list_start|>": 8,
"<|tool_list_end|>": 9,
"<|tool_call_start|>": 10,
"<|tool_call_end|>": 11,
"<|tool_response_start|>": 12,
"<|tool_response_end|>": 13,
"<image>": 396,
"<think>": 64400,
"</think>": 64401
}
},
"pre_tokenizer": {
"type": "Sequence",
"pretokenizers": [
{
"type": "Split",
"pattern": {
"Regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
},
"behavior": "Isolated",
"invert": false
},
{
"type": "ByteLevel",
"add_prefix_space": false,
"trim_offsets": true,
"use_regex": false
}
]
}
}`),
}),
want: &Tokenizer{
Vocabulary: &Vocabulary{
Model: "gpt2",
Tokens: []string{
"<|startoftext|>",
"<|im_start|>",
"<|im_end|>",
"<|tool_list_start|>",
"<|tool_list_end|>",
"<|tool_call_start|>",
"<|tool_call_end|>",
"<|tool_response_start|>",
"<|tool_response_end|>",
"<image>",
"<think>",
"</think>",
},
Scores: []float32{1, 6, 7, 8, 9, 10, 11, 12, 13, 396, 64400, 64401},
Types: []int32{3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3},
},
Pre: "llama-bpe",
},
},
{
name: "list string merges",
fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
@@ -308,6 +386,28 @@ func TestParseTokenizer(t *testing.T) {
Pre: "default",
},
},
{
name: "qwen35 pretokenizer",
fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
"tokenizer.json": strings.NewReader(`{
"pre_tokenizer": {
"type": "Sequence",
"pretokenizers": [
{
"type": "Split",
"pattern": {
"Regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?[\\p{L}\\p{M}]+|\\p{N}| ?[^\\s\\p{L}\\p{M}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
}
}
]
}
}`),
}),
want: &Tokenizer{
Vocabulary: &Vocabulary{Model: "gpt2"},
Pre: "qwen35",
},
},
}
for _, tt := range cases {


@@ -160,6 +160,27 @@ func (kv KV) SSMGroupCount() uint64 {
return uint64(kv.Uint("ssm.group_count"))
}
func (kv KV) FFNLength() []uint64 {
ffnLengthDefault := uint32(0)
ffnLength := kv.UintOrArrayValueAsArray("feed_forward_length", ffnLengthDefault)
if len(ffnLength) == 1 {
ffnLengthDefault = ffnLength[0]
}
nLayers := int(kv.BlockCount())
if len(ffnLength) > nLayers {
slog.Warn("got more elements of feed_forward_length than layers", "len(ffnLength)", len(ffnLength), "layers", nLayers)
}
out := make([]uint64, nLayers)
for i := range nLayers {
if i >= len(ffnLength) {
out[i] = uint64(ffnLengthDefault)
} else {
out[i] = uint64(ffnLength[i])
}
}
return out
}
// general types
func (kv KV) String(key string, defaultValue ...string) string {
@@ -264,15 +285,18 @@ func (kv KV) OllamaEngineRequired() bool {
"llama4",
"mistral3",
"mllama",
"nemotron_h", "nemotron_h_moe",
"nomic-bert",
"olmo3",
"qwen25vl",
"qwen3", "qwen3moe",
"qwen35", "qwen35moe",
"qwen3next",
"qwen3vl", "qwen3vlmoe",
"glm4moelite",
"glmocr",
"lfm2",
"lfm2moe",
}, kv.Architecture())
}
@@ -845,7 +869,12 @@ func (f GGML) SupportsFlashAttention() bool {
return false
}
arch := f.KV().Architecture()
if slices.Contains([]string{"qwen35", "qwen35moe", "qwen3next"}, arch) {
return true
}
if slices.Contains([]string{"gemma2"}, arch) {
return false
}
@@ -864,9 +893,12 @@ func (f GGML) FlashAttention() bool {
"glmocr",
"gptoss", "gpt-oss",
"lfm2",
"lfm2moe",
"mistral3",
"nemotron_h", "nemotron_h_moe",
"olmo3",
"qwen3", "qwen3moe",
"qwen35", "qwen35moe",
"qwen3next",
"qwen3vl", "qwen3vlmoe",
}, f.KV().String("general.architecture"))


@@ -245,7 +245,22 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
padding := ggufPadding(offset, int64(alignment))
llm.tensorOffset = uint64(offset + padding)
// get file size to validate tensor bounds
fileSize, err := rs.Seek(0, io.SeekEnd)
if err != nil {
return fmt.Errorf("failed to determine file size: %w", err)
}
if _, err := rs.Seek(offset, io.SeekStart); err != nil {
return fmt.Errorf("failed to seek back after size check: %w", err)
}
for _, tensor := range llm.tensors {
tensorEnd := llm.tensorOffset + tensor.Offset + tensor.Size()
if tensorEnd > uint64(fileSize) {
return fmt.Errorf("tensor %q offset+size (%d) exceeds file size (%d)", tensor.Name, tensorEnd, fileSize)
}
offset, err := rs.Seek(0, io.SeekCurrent)
if err != nil {
return fmt.Errorf("failed to get current offset: %w", err)


@@ -11,21 +11,21 @@ import (
)
func TestWriteGGUF(t *testing.T) {
tensorData := make([]byte, 2*3*4) // 6 F32 elements = 24 bytes
for range 8 {
t.Run("shuffle", func(t *testing.T) {
t.Parallel()
ts := []*Tensor{
{Name: "token_embd.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewReader(tensorData)},
{Name: "blk.0.ffn_norm.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewReader(tensorData)},
{Name: "blk.0.attn_norm.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewReader(tensorData)},
{Name: "blk.1.ffn_up.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewReader(tensorData)},
{Name: "blk.2.ffn_norm.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewReader(tensorData)},
{Name: "blk.1.ffn_down.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewReader(tensorData)},
{Name: "blk.0.attn_k.weight", Shape: []uint64{2, 3}, WriterTo: bytes.NewReader(tensorData)},
{Name: "output_norm.weight", Shape: []uint64{3, 2}, WriterTo: bytes.NewReader(tensorData)},
{Name: "output.weight", Shape: []uint64{3, 2}, WriterTo: bytes.NewReader(tensorData)},
}
rand.Shuffle(len(ts), func(i, j int) {
@@ -98,4 +98,32 @@ func TestWriteGGUF(t *testing.T) {
}
})
}
t.Run("truncated_tensor_data", func(t *testing.T) {
t.Parallel()
ts := []*Tensor{
{Name: "blk.0.attn.weight", Kind: 0, Shape: []uint64{512, 2}, WriterTo: bytes.NewBuffer(make([]byte, 32))},
}
w, err := os.CreateTemp(t.TempDir(), "truncated_*.bin")
if err != nil {
t.Fatal(err)
}
defer w.Close()
if err := WriteGGUF(w, KV{"general.architecture": "test"}, ts); err != nil {
t.Fatal(err)
}
r, err := os.Open(w.Name())
if err != nil {
t.Fatal(err)
}
defer r.Close()
if _, err := Decode(r, -1); err == nil {
t.Error("Decode should reject GGUF files where tensor data extends beyond file size")
}
})
}

kvcache/recurrent.go Normal file

@@ -0,0 +1,752 @@
package kvcache
import (
"errors"
"fmt"
"math"
"slices"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/model/input"
)
const (
DefaultCheckpointCount = 24
DefaultCheckpointMinPos = int32(16)
DefaultCheckpointInterval = int32(1664)
)
var ErrInvalidRecurrentShape = errors.New("kvcache: invalid recurrent state shape")
// RecurrentConfig configures a shared hybrid recurrent cache.
type RecurrentConfig struct {
Shift func(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error)
ConvDim int
ConvChannels int
RecurrentStateSize int
CheckpointLogPrefix string
}
var (
_ Cache = (*Recurrent)(nil)
_ CheckpointCache = (*Recurrent)(nil)
)
// Recurrent stores:
// - a standard causal KV cache
// - per-sequence conv state for recurrent operators
// - per-sequence recurrent state for recurrent operators
//
// Conv state shape (per layer, per sequence): [convDim, convChannels]
// Recurrent state shape (per layer, per sequence): [recurrentStateSize]
type Recurrent struct {
kv *Causal
backend ml.Backend
dtype ml.DType
maxSequences int
// Conv state dimensions
convDim int
convChannels int
// Recurrent state dimensions
recurrentStateSize int
logPrefix string
// slot mapping for recurrent state (copy-on-write)
slotForSeq map[int]int
refCount []int
freeSlots []int
seqCounts map[int]int
slotScratch [1]int32
// per-layer conv state buffers (allocated lazily)
convCtxs map[int]ml.Context
convStates map[int]ml.Tensor // [convDim*convChannels, maxSlots]
// per-layer recurrent state buffers (allocated lazily)
recurrentCtxs map[int]ml.Context
recurrentStates map[int]ml.Tensor // [recurrentStateSize, maxSlots]
// recurrent checkpoints (per slot)
checkpointCount int
checkpointMinPos int32
checkpointInterval int32
checkpointCtxSize int
checkpoints map[int]*slotCheckpointStore
pendingRestore map[int]checkpointRestore
curCheckpointPos []int32
curCheckpointSlots map[int]int
reserveCheckpoints bool
checkpointConvCtxs map[int]ml.Context
checkpointRecurCtxs map[int]ml.Context
checkpointReserved map[int]struct{}
// current forward batch (derived in StartForward)
curSeqs []int
curSlots []int
curSlotsInput ml.Tensor
curSeqTokens int
// track if EnsureWritable has been called for this forward pass
writableEnsured bool
writableError error
}
func NewRecurrentCache(config RecurrentConfig) *Recurrent {
return &Recurrent{
kv: NewCausalCache(config.Shift),
convDim: config.ConvDim,
convChannels: config.ConvChannels,
recurrentStateSize: config.RecurrentStateSize,
logPrefix: config.CheckpointLogPrefix,
slotForSeq: make(map[int]int),
seqCounts: make(map[int]int),
convCtxs: make(map[int]ml.Context),
convStates: make(map[int]ml.Tensor),
recurrentCtxs: make(map[int]ml.Context),
recurrentStates: make(map[int]ml.Tensor),
checkpointCount: DefaultCheckpointCount,
checkpointMinPos: DefaultCheckpointMinPos,
checkpointInterval: DefaultCheckpointInterval,
checkpoints: make(map[int]*slotCheckpointStore),
pendingRestore: make(map[int]checkpointRestore),
curCheckpointSlots: make(map[int]int),
checkpointConvCtxs: make(map[int]ml.Context),
checkpointRecurCtxs: make(map[int]ml.Context),
checkpointReserved: make(map[int]struct{}),
}
}
func (c *Recurrent) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity, maxBatch int) {
c.backend = backend
c.dtype = dtype
c.maxSequences = maxSequences
c.checkpoints = make(map[int]*slotCheckpointStore)
c.pendingRestore = make(map[int]checkpointRestore)
c.curCheckpointPos = c.curCheckpointPos[:0]
c.curCheckpointSlots = make(map[int]int)
c.checkpointReserved = make(map[int]struct{})
c.checkpointCtxSize = c.checkpointCount * c.maxSequences
if c.checkpointCtxSize < 8 {
c.checkpointCtxSize = 8
}
// initialize slot allocator
c.refCount = make([]int, maxSequences)
c.freeSlots = c.freeSlots[:0]
for i := maxSequences - 1; i >= 0; i-- {
c.freeSlots = append(c.freeSlots, i)
}
c.kv.Init(backend, dtype, maxSequences, capacity, maxBatch)
}
func (c *Recurrent) Close() {
for _, ctx := range c.convCtxs {
ctx.Close()
}
for _, ctx := range c.recurrentCtxs {
ctx.Close()
}
for _, ctx := range c.checkpointConvCtxs {
ctx.Close()
}
for _, ctx := range c.checkpointRecurCtxs {
ctx.Close()
}
c.kv.Close()
}
func (c *Recurrent) SetConfig(config ml.CacheConfig) {
c.kv.SetConfig(config)
}
func (c *Recurrent) SetLayer(layer int) {
c.kv.SetLayer(layer)
}
func (c *Recurrent) Get(ctx ml.Context) (ml.Tensor, ml.Tensor, ml.Tensor) {
return c.kv.Get(ctx)
}
func (c *Recurrent) Put(ctx ml.Context, key, value ml.Tensor) {
c.kv.Put(ctx, key, value)
}
func (c *Recurrent) StartForward(ctx ml.Context, batch input.Batch, reserve bool) error {
if err := c.kv.StartForward(ctx, batch, reserve); err != nil {
return err
}
nTokens := len(batch.Sequences)
if nTokens == 0 {
c.curSeqs = c.curSeqs[:0]
c.curSlots = c.curSlots[:0]
c.curSlotsInput = nil
c.curSeqTokens = 0
c.reserveCheckpoints = false
c.writableEnsured = false
c.writableError = nil
return nil
}
// Fast path for single-sequence batches (common during decode and prefill).
firstSeq := batch.Sequences[0]
singleSeq := true
for _, s := range batch.Sequences[1:] {
if s != firstSeq {
singleSeq = false
break
}
}
if singleSeq {
return c.startForwardSingleSeq(ctx, firstSeq, nTokens, batch, reserve)
}
// Derive equal-length sequence layout for recurrent layers.
seqCounts := c.seqCounts
for s := range seqCounts {
delete(seqCounts, s)
}
c.curSeqs = c.curSeqs[:0]
for _, s := range batch.Sequences {
if seqCounts[s] == 0 {
c.curSeqs = append(c.curSeqs, s)
}
seqCounts[s]++
}
nSeqs := len(c.curSeqs)
want := nTokens / nSeqs
for _, s := range c.curSeqs {
if seqCounts[s] != want {
return ErrNotSupported
}
}
c.curSeqTokens = want
if reserve {
c.curSlots = c.curSlots[:0]
for i := range nSeqs {
c.curSlots = append(c.curSlots, i)
}
c.finalizeStartForward(ctx, batch, true)
return nil
}
// Ensure slots exist for sequences in this batch.
c.curSlots = c.curSlots[:0]
var newSlots []int
for _, s := range c.curSeqs {
slot, ok := c.slotForSeq[s]
if !ok {
var err error
slot, err = c.allocSlot()
if err != nil {
return err
}
c.slotForSeq[s] = slot
c.refCount[slot] = 1
newSlots = append(newSlots, slot)
}
c.curSlots = append(c.curSlots, slot)
}
if len(newSlots) > 0 {
c.zeroSlots(ctx, newSlots)
}
c.finalizeStartForward(ctx, batch, false)
return nil
}
func (c *Recurrent) startForwardSingleSeq(ctx ml.Context, seq, seqTokens int, batch input.Batch, reserve bool) error {
c.curSeqs = append(c.curSeqs[:0], seq)
c.curSeqTokens = seqTokens
if reserve {
c.curSlots = append(c.curSlots[:0], 0)
c.finalizeStartForward(ctx, batch, true)
return nil
}
slot, ok := c.slotForSeq[seq]
if !ok {
var err error
slot, err = c.allocSlot()
if err != nil {
return err
}
c.slotForSeq[seq] = slot
c.refCount[slot] = 1
slotList := [1]int{slot}
c.zeroSlots(ctx, slotList[:])
}
c.curSlots = append(c.curSlots[:0], slot)
c.finalizeStartForward(ctx, batch, false)
return nil
}
func (c *Recurrent) finalizeStartForward(ctx ml.Context, batch input.Batch, reserve bool) {
c.setCurSlotsInput(ctx)
c.writableEnsured = false
c.writableError = nil
c.reserveCheckpoints = reserve
c.planCheckpoints(batch)
}
func (c *Recurrent) setCurSlotsInput(ctx ml.Context) {
c.curSlotsInput = c.slotsInput(ctx, c.curSlots)
}
func (c *Recurrent) slotsInput(ctx ml.Context, slots []int) ml.Tensor {
switch len(slots) {
case 0:
return nil
case 1:
c.slotScratch[0] = int32(slots[0])
return ctx.Input().FromInts(c.slotScratch[:], 1)
default:
slotIndices := make([]int32, len(slots))
for i, v := range slots {
slotIndices[i] = int32(v)
}
return ctx.Input().FromInts(slotIndices, len(slotIndices))
}
}
func (c *Recurrent) allocSlot() (int, error) {
if len(c.freeSlots) == 0 {
return 0, ErrKvCacheFull
}
slot := c.freeSlots[len(c.freeSlots)-1]
c.freeSlots = c.freeSlots[:len(c.freeSlots)-1]
return slot, nil
}
func (c *Recurrent) freeSlot(slot int) {
if slot >= 0 && slot < c.maxSequences {
c.freeSlots = append(c.freeSlots, slot)
}
}
// zeroSlots zeros recurrent state for the given slots across all cached layers.
func (c *Recurrent) zeroSlots(ctx ml.Context, slots []int) {
if len(slots) == 0 {
return
}
inputCtx := ctx.Input()
slotsTensor := c.slotsInput(ctx, slots)
if len(c.convStates) > 0 {
zeros := inputCtx.Zeros(ml.DTypeF32, c.convDim*c.convChannels, len(slots))
for _, buf := range c.convStates {
ctx.Forward(buf.SetRows(ctx, zeros, slotsTensor))
}
}
if len(c.recurrentStates) > 0 {
zeros := inputCtx.Zeros(ml.DTypeF32, c.recurrentStateSize, len(slots))
for _, buf := range c.recurrentStates {
ctx.Forward(buf.SetRows(ctx, zeros, slotsTensor))
}
}
}
// EnsureWritable ensures sequences have private slots (copy-on-write).
func (c *Recurrent) EnsureWritable(ctx ml.Context) error {
for i, seq := range c.curSeqs {
slot, ok := c.slotForSeq[seq]
if !ok {
continue
}
if slot < 0 || slot >= len(c.refCount) {
continue
}
if c.refCount[slot] <= 1 {
continue
}
newSlot, err := c.allocSlot()
if err != nil {
return err
}
c.refCount[slot]--
c.refCount[newSlot] = 1
c.slotForSeq[seq] = newSlot
c.curSlots[i] = newSlot
c.copyRecurrentState(ctx, slot, newSlot)
c.copyCheckpoints(ctx, slot, newSlot)
}
c.setCurSlotsInput(ctx)
return nil
}
func (c *Recurrent) copyRecurrentState(ctx ml.Context, srcSlot, dstSlot int) {
src := ctx.Input().FromInts([]int32{int32(srcSlot)}, 1)
dst := ctx.Input().FromInts([]int32{int32(dstSlot)}, 1)
for _, buf := range c.convStates {
rows := buf.Rows(ctx, src)
if rows.DType() != ml.DTypeF32 {
rows = rows.Cast(ctx, ml.DTypeF32)
}
ctx.Forward(buf.SetRows(ctx, rows, dst))
}
for _, buf := range c.recurrentStates {
rows := buf.Rows(ctx, src)
if rows.DType() != ml.DTypeF32 {
rows = rows.Cast(ctx, ml.DTypeF32)
}
ctx.Forward(buf.SetRows(ctx, rows, dst))
}
}
func (c *Recurrent) CopyPrefix(srcSeq, dstSeq int, prefixLen int32) {
c.kv.CopyPrefix(srcSeq, dstSeq, prefixLen)
if dstSlot, ok := c.slotForSeq[dstSeq]; ok {
if c.validSlot(dstSlot) {
c.refCount[dstSlot]--
if c.refCount[dstSlot] <= 0 {
c.refCount[dstSlot] = 0
c.freeSlot(dstSlot)
}
}
delete(c.slotForSeq, dstSeq)
}
srcSlot, ok := c.slotForSeq[srcSeq]
if !ok {
return
}
if c.validSlot(srcSlot) {
c.slotForSeq[dstSeq] = srcSlot
c.refCount[srcSlot]++
}
}
func (c *Recurrent) CanResume(seq int, pos int32) bool {
if !c.kv.CanResume(seq, pos) {
return false
}
if pos == 0 {
return true
}
return c.hasCheckpoint(seq, pos)
}
func (c *Recurrent) Remove(seq int, beginIndex, endIndex int32) error {
if beginIndex > 0 && endIndex != math.MaxInt32 {
if err := c.kv.Remove(seq, beginIndex, endIndex); err != nil {
return err
}
delete(c.pendingRestore, seq)
slot, ok := c.slotForSeq[seq]
if !ok || !c.validSlot(slot) {
return nil
}
// Detach shared recurrent state/checkpoints before mutating checkpoint positions.
if c.refCount[slot] > 1 {
newSlot, err := c.allocSlot()
if err != nil {
return err
}
ctx := c.backend.NewContext()
c.copyRecurrentState(ctx, slot, newSlot)
c.copyCheckpoints(ctx, slot, newSlot)
if len(c.convStates) > 0 || len(c.recurrentStates) > 0 {
ctx.Compute()
}
ctx.Close()
c.refCount[slot]--
c.refCount[newSlot] = 1
c.slotForSeq[seq] = newSlot
slot = newSlot
}
c.shiftCheckpoints(slot, beginIndex, endIndex)
return nil
}
if beginIndex > 0 {
restore, ok := c.pendingRestore[seq]
if !ok || restore.pos+1 != beginIndex {
return ErrNotSupported
}
if !c.restoreComplete(restore) {
return ErrNotSupported
}
if slot, ok := c.slotForSeq[seq]; ok && c.validSlot(slot) && c.refCount[slot] > 1 {
newSlot, err := c.allocSlot()
if err != nil {
return err
}
ctx := c.backend.NewContext()
c.copyRecurrentState(ctx, slot, newSlot)
c.copyCheckpoints(ctx, slot, newSlot)
if len(c.convStates) > 0 || len(c.recurrentStates) > 0 {
ctx.Compute()
}
ctx.Close()
c.refCount[slot]--
c.refCount[newSlot] = 1
c.slotForSeq[seq] = newSlot
restore.slot = newSlot
c.pendingRestore[seq] = restore
}
}
if err := c.kv.Remove(seq, beginIndex, endIndex); err != nil {
return err
}
if beginIndex > 0 {
restore := c.pendingRestore[seq]
delete(c.pendingRestore, seq)
return c.applyCheckpointRestore(restore)
}
slot, ok := c.slotForSeq[seq]
delete(c.pendingRestore, seq)
if !ok {
return nil
}
if !c.validSlot(slot) {
delete(c.slotForSeq, seq)
return nil
}
c.refCount[slot]--
if c.refCount[slot] <= 0 {
c.refCount[slot] = 0
c.clearCheckpoints(slot)
c.freeSlot(slot)
}
delete(c.slotForSeq, seq)
return nil
}
func (c *Recurrent) validSlot(slot int) bool {
return slot >= 0 && slot < len(c.refCount)
}
func (c *Recurrent) SlotsTensor() ml.Tensor {
return c.curSlotsInput
}
// contiguousSlots returns the starting slot if current slots are contiguous and ordered.
func (c *Recurrent) contiguousSlots() (int, bool) {
if len(c.curSlots) == 0 {
return 0, false
}
start := c.curSlots[0]
for i, s := range c.curSlots {
if s != start+i {
return 0, false
}
}
return start, true
}
func (c *Recurrent) SeqTokens() int {
return c.curSeqTokens
}
func (c *Recurrent) NumSeqs() int {
return len(c.curSeqs)
}
func (c *Recurrent) convBuffer(layer int) ml.Tensor {
if buf, ok := c.convStates[layer]; ok {
return buf
}
if _, ok := c.convCtxs[layer]; !ok {
c.convCtxs[layer] = c.backend.NewContextSize(1).Layer(layer)
}
buf := c.convCtxs[layer].Zeros(ml.DTypeF32, c.convDim*c.convChannels, c.maxSequences)
c.convStates[layer] = buf
return buf
}
func (c *Recurrent) recurrentBuffer(layer int) ml.Tensor {
if buf, ok := c.recurrentStates[layer]; ok {
return buf
}
if _, ok := c.recurrentCtxs[layer]; !ok {
c.recurrentCtxs[layer] = c.backend.NewContextSize(1).Layer(layer)
}
buf := c.recurrentCtxs[layer].Zeros(ml.DTypeF32, c.recurrentStateSize, c.maxSequences)
c.recurrentStates[layer] = buf
return buf
}
func (c *Recurrent) ensureWritable(ctx ml.Context) error {
c.ensureWritableOnce(ctx)
return c.writableError
}
func (c *Recurrent) currentSlotRows(ctx ml.Context, buf ml.Tensor, rowSize int) ml.Tensor {
if start, ok := c.contiguousSlots(); ok {
offset := start * buf.Stride(1)
return buf.View(ctx, offset, rowSize, buf.Stride(1), c.NumSeqs())
}
return buf.Rows(ctx, c.SlotsTensor())
}
func (c *Recurrent) writeCurrentSlotRows(ctx ml.Context, buf ml.Tensor, rowSize int, src ml.Tensor) {
if start, ok := c.contiguousSlots(); ok {
offset := start * buf.Stride(1)
view := buf.View(ctx, offset, rowSize, buf.Stride(1), c.NumSeqs())
ctx.Forward(src.Copy(ctx, view))
return
}
ctx.Forward(buf.SetRows(ctx, src, c.SlotsTensor()))
}
func (c *Recurrent) ensureWritableOnce(ctx ml.Context) {
if !c.writableEnsured {
needsWritable := false
for _, seq := range c.curSeqs {
slot, ok := c.slotForSeq[seq]
if !ok {
continue
}
if slot >= 0 && slot < len(c.refCount) && c.refCount[slot] > 1 {
needsWritable = true
break
}
}
if needsWritable {
if err := c.EnsureWritable(ctx); err != nil {
c.writableError = err
}
}
c.writableEnsured = true
}
}
// ConvState returns conv state for current batch sequences as [convDim, convChannels, nSeqs].
func (c *Recurrent) ConvState(ctx ml.Context, layer int) (ml.Tensor, error) {
if err := c.ensureWritable(ctx); err != nil {
return nil, err
}
buf := c.convBuffer(layer)
cur := c.currentSlotRows(ctx, buf, c.convDim*c.convChannels)
return cur.Reshape(ctx, c.convDim, c.convChannels, c.NumSeqs()), nil
}
// UpdateConvState writes new conv state for current batch sequences.
func (c *Recurrent) UpdateConvState(ctx ml.Context, layer int, newState ml.Tensor) {
buf := c.convBuffer(layer)
src := newState.Reshape(ctx, c.convDim*c.convChannels, c.NumSeqs())
srcF32 := src
if src.DType() != ml.DTypeF32 {
srcF32 = src.Cast(ctx, ml.DTypeF32)
}
c.writeCurrentSlotRows(ctx, buf, c.convDim*c.convChannels, srcF32)
c.captureConvCheckpoint(ctx, layer, srcF32)
}
// RecurrentState returns recurrent state for current batch sequences with shape [dims..., nSeqs].
func (c *Recurrent) RecurrentState(ctx ml.Context, layer int, dims ...int) (ml.Tensor, error) {
if err := c.ensureWritable(ctx); err != nil {
return nil, err
}
if len(dims) == 0 {
return nil, ErrInvalidRecurrentShape
}
size := 1
for _, d := range dims {
if d <= 0 {
return nil, ErrInvalidRecurrentShape
}
size *= d
}
if size != c.recurrentStateSize {
return nil, fmt.Errorf("%w: got %v (size %d), want size %d", ErrInvalidRecurrentShape, dims, size, c.recurrentStateSize)
}
buf := c.recurrentBuffer(layer)
cur := c.currentSlotRows(ctx, buf, c.recurrentStateSize)
shape := make([]int, 0, len(dims)+1)
shape = append(shape, dims...)
shape = append(shape, c.NumSeqs())
return cur.Reshape(ctx, shape...), nil
}
// RecurrentState4D returns recurrent state as [dim0, dim1, dim2, nSeqs].
func (c *Recurrent) RecurrentState4D(ctx ml.Context, layer int, dim0, dim1, dim2 int) (ml.Tensor, error) {
if err := c.ensureWritable(ctx); err != nil {
return nil, err
}
if dim0 <= 0 || dim1 <= 0 || dim2 <= 0 {
return nil, ErrInvalidRecurrentShape
}
size := dim0 * dim1 * dim2
if size != c.recurrentStateSize {
return nil, fmt.Errorf("%w: got [%d %d %d] (size %d), want size %d", ErrInvalidRecurrentShape, dim0, dim1, dim2, size, c.recurrentStateSize)
}
buf := c.recurrentBuffer(layer)
cur := c.currentSlotRows(ctx, buf, c.recurrentStateSize)
return cur.Reshape(ctx, dim0, dim1, dim2, c.NumSeqs()), nil
}
// UpdateRecurrentState writes new recurrent state for current batch sequences.
func (c *Recurrent) UpdateRecurrentState(ctx ml.Context, layer int, newState ml.Tensor) {
buf := c.recurrentBuffer(layer)
src := newState.Reshape(ctx, c.recurrentStateSize, c.NumSeqs())
srcF32 := src
if src.DType() != ml.DTypeF32 {
srcF32 = src.Cast(ctx, ml.DTypeF32)
}
c.writeCurrentSlotRows(ctx, buf, c.recurrentStateSize, srcF32)
c.captureRecurrentCheckpoint(ctx, layer, srcF32)
}
// IsSupportedForBatch returns true if the current batch layout supports recurrent layers.
func (c *Recurrent) IsSupportedForBatch() bool {
return c.curSeqTokens > 0 && len(c.curSeqs) > 0
}
// Seqs returns the ordered unique sequences for the current forward pass.
func (c *Recurrent) Seqs() []int {
return slices.Clone(c.curSeqs)
}

View File

@@ -1,27 +1,20 @@
package qwen3next
package kvcache
import (
"log/slog"
"math"
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/model/input"
)
const (
checkpointCountDefault = 32
checkpointMinPosDefault = int32(16)
checkpointIntervalDefault = int32(1280)
)
// TODO(jmorganca): Add byte-serialized host-RAM checkpoints to reduce GPU
// memory usage while preserving prefix reuse for recurrent state.
type checkpointEntry struct {
pos int32
conv map[int]ml.Tensor
delta map[int]ml.Tensor
pos int32
conv map[int]ml.Tensor
recurrent map[int]ml.Tensor
}
type slotCheckpointStore struct {
@@ -132,6 +125,63 @@ func (s *slotCheckpointStore) pruneAfter(pos int32) {
s.lastPos = pos
}
func (s *slotCheckpointStore) shiftRange(beginIndex, endIndex int32) {
if len(s.entries) == 0 {
s.size = 0
s.next = 0
s.lastPos = -1
return
}
offset := beginIndex - endIndex
size := 0
next := -1
minPos := int32(math.MaxInt32)
maxPos := int32(-1)
minIdx := 0
for i := range s.entries {
pos := s.entries[i].pos
if pos >= 0 {
if pos >= beginIndex && pos < endIndex {
s.entries[i].pos = -1
} else if pos >= endIndex {
s.entries[i].pos = pos + offset
}
}
pos = s.entries[i].pos
if pos >= 0 {
size++
if pos < minPos {
minPos = pos
minIdx = i
}
if pos > maxPos {
maxPos = pos
}
} else if next == -1 {
next = i
}
}
s.size = size
if size == 0 {
s.next = 0
s.lastPos = -1
return
}
if next != -1 {
s.next = next
} else {
// Full ring: overwrite the oldest checkpoint next.
s.next = minIdx
}
s.lastPos = maxPos
}
func (s *slotCheckpointStore) window() (size int, minPos, maxPos, lastPos int32) {
minPos = int32(math.MaxInt32)
maxPos = int32(-1)
@@ -155,7 +205,14 @@ func (s *slotCheckpointStore) window() (size int, minPos, maxPos, lastPos int32)
return size, minPos, maxPos, s.lastPos
}
func (c *HybridCache) planCheckpoints(batch input.Batch) {
func (c *Recurrent) checkpointTag() string {
if c.logPrefix == "" {
return "kvcache.recurrent"
}
return c.logPrefix
}
func (c *Recurrent) planCheckpoints(batch input.Batch) {
if c.checkpointCount == 0 || len(c.curSeqs) == 0 {
c.curCheckpointPos = c.curCheckpointPos[:0]
for k := range c.curCheckpointSlots {
@@ -201,7 +258,7 @@ func (c *HybridCache) planCheckpoints(batch input.Batch) {
}
}
func (c *HybridCache) checkpointStore(slot int) *slotCheckpointStore {
func (c *Recurrent) checkpointStore(slot int) *slotCheckpointStore {
store, ok := c.checkpoints[slot]
if ok {
return store
@@ -211,7 +268,7 @@ func (c *HybridCache) checkpointStore(slot int) *slotCheckpointStore {
return store
}
func (c *HybridCache) checkpointIndexForSlot(slot int, pos int32) int {
func (c *Recurrent) checkpointIndexForSlot(slot int, pos int32) int {
if c.checkpointCount == 0 {
return -1
}
@@ -226,7 +283,7 @@ func (c *HybridCache) checkpointIndexForSlot(slot int, pos int32) int {
return idx
}
func (c *HybridCache) hasCheckpoint(seq int, pos int32) bool {
func (c *Recurrent) hasCheckpoint(seq int, pos int32) bool {
if pos <= 0 {
return false
}
@@ -242,7 +299,7 @@ func (c *HybridCache) hasCheckpoint(seq int, pos int32) bool {
return ok
}
func (c *HybridCache) PrepareRestore(seq int, targetPos int32) (int32, bool) {
func (c *Recurrent) PrepareRestore(seq int, targetPos int32) (int32, bool) {
if targetPos <= 0 {
return 0, false
}
@@ -252,13 +309,13 @@ func (c *HybridCache) PrepareRestore(seq int, targetPos int32) (int32, bool) {
}
store, ok := c.checkpoints[slot]
if !ok {
slog.Debug("qwen3next: checkpoint miss", "seq", seq, "slot", slot, "target", targetPos, "size", 0)
slog.Debug(c.checkpointTag()+": checkpoint miss", "seq", seq, "slot", slot, "target", targetPos, "size", 0)
return 0, false
}
idx, pos, ok := store.bestIndex(targetPos)
if !ok {
size, minPos, maxPos, lastPos := store.window()
slog.Debug("qwen3next: checkpoint miss", "seq", seq, "slot", slot, "target", targetPos, "size", size,
slog.Debug(c.checkpointTag()+": checkpoint miss", "seq", seq, "slot", slot, "target", targetPos, "size", size,
"min", minPos, "max", maxPos, "last", lastPos)
return 0, false
}
@@ -270,10 +327,10 @@ func (c *HybridCache) PrepareRestore(seq int, targetPos int32) (int32, bool) {
return pos + 1, true
}
func (c *HybridCache) applyCheckpointRestore(restore checkpointRestore) error {
func (c *Recurrent) applyCheckpointRestore(restore checkpointRestore) error {
entry, ok := c.restoreEntry(restore)
if !ok {
return kvcache.ErrNotSupported
return ErrNotSupported
}
ctx := c.backend.NewContext()
@@ -281,15 +338,15 @@ func (c *HybridCache) applyCheckpointRestore(restore checkpointRestore) error {
slotIdx := ctx.Input().FromInts([]int32{int32(restore.slot)}, 1)
for layer, src := range entry.conv {
buf := c.convBuffer(ctx, layer)
buf := c.convBuffer(layer)
ctx.Forward(buf.SetRows(ctx, src, slotIdx))
}
for layer, src := range entry.delta {
buf := c.deltaBuffer(ctx, layer)
for layer, src := range entry.recurrent {
buf := c.recurrentBuffer(layer)
ctx.Forward(buf.SetRows(ctx, src, slotIdx))
}
if len(entry.conv) > 0 || len(entry.delta) > 0 {
if len(entry.conv) > 0 || len(entry.recurrent) > 0 {
ctx.Compute()
}
store := c.checkpoints[restore.slot]
@@ -297,12 +354,12 @@ func (c *HybridCache) applyCheckpointRestore(restore checkpointRestore) error {
return nil
}
func (c *HybridCache) restoreComplete(restore checkpointRestore) bool {
func (c *Recurrent) restoreComplete(restore checkpointRestore) bool {
_, ok := c.restoreEntry(restore)
return ok
}
func (c *HybridCache) restoreEntry(restore checkpointRestore) (*checkpointEntry, bool) {
func (c *Recurrent) restoreEntry(restore checkpointRestore) (*checkpointEntry, bool) {
store, ok := c.checkpoints[restore.slot]
if !ok || restore.idx < 0 || restore.idx >= len(store.entries) {
return nil, false
@@ -317,27 +374,33 @@ func (c *HybridCache) restoreEntry(restore checkpointRestore) (*checkpointEntry,
return entry, true
}
func (c *HybridCache) entryComplete(entry *checkpointEntry) bool {
func (c *Recurrent) entryComplete(entry *checkpointEntry) bool {
for layer := range c.convStates {
if entry.conv == nil || entry.conv[layer] == nil {
return false
}
}
for layer := range c.deltaStates {
if entry.delta == nil || entry.delta[layer] == nil {
for layer := range c.recurrentStates {
if entry.recurrent == nil || entry.recurrent[layer] == nil {
return false
}
}
return true
}
func (c *HybridCache) clearCheckpoints(slot int) {
func (c *Recurrent) clearCheckpoints(slot int) {
if store, ok := c.checkpoints[slot]; ok {
store.reset()
}
}
func (c *HybridCache) copyCheckpoints(ctx ml.Context, srcSlot, dstSlot int) {
func (c *Recurrent) shiftCheckpoints(slot int, beginIndex, endIndex int32) {
if store, ok := c.checkpoints[slot]; ok {
store.shiftRange(beginIndex, endIndex)
}
}
func (c *Recurrent) copyCheckpoints(ctx ml.Context, srcSlot, dstSlot int) {
if c.checkpointCount == 0 {
return
}
@@ -363,19 +426,19 @@ func (c *HybridCache) copyCheckpoints(ctx ml.Context, srcSlot, dstSlot int) {
ctx.Forward(src.Copy(ctx, dst))
}
}
if srcEntry.delta != nil {
if dstEntry.delta == nil {
dstEntry.delta = make(map[int]ml.Tensor)
if srcEntry.recurrent != nil {
if dstEntry.recurrent == nil {
dstEntry.recurrent = make(map[int]ml.Tensor)
}
for layer, src := range srcEntry.delta {
dst := c.ensureCheckpointDelta(layer, dstEntry)
for layer, src := range srcEntry.recurrent {
dst := c.ensureCheckpointRecurrent(layer, dstEntry)
ctx.Forward(src.Copy(ctx, dst))
}
}
}
}
func (c *HybridCache) captureConvCheckpoint(ctx ml.Context, layer int, src ml.Tensor) {
func (c *Recurrent) captureConvCheckpoint(ctx ml.Context, layer int, src ml.Tensor) {
if c.checkpointCount == 0 {
return
}
@@ -402,12 +465,12 @@ func (c *HybridCache) captureConvCheckpoint(ctx ml.Context, layer int, src ml.Te
}
}
func (c *HybridCache) captureDeltaCheckpoint(ctx ml.Context, layer int, src ml.Tensor) {
func (c *Recurrent) captureRecurrentCheckpoint(ctx ml.Context, layer int, src ml.Tensor) {
if c.checkpointCount == 0 {
return
}
if c.reserveCheckpoints {
c.reserveCheckpointDelta(layer)
c.reserveCheckpointRecurrent(layer)
return
}
if len(c.curCheckpointPos) == 0 {
@@ -423,13 +486,13 @@ func (c *HybridCache) captureDeltaCheckpoint(ctx ml.Context, layer int, src ml.T
continue
}
entry := &c.checkpoints[slot].entries[idx]
dst := c.ensureCheckpointDelta(layer, entry)
dst := c.ensureCheckpointRecurrent(layer, entry)
seqSlice := src.Slice(ctx, 1, i, i+1, 1)
ctx.Forward(seqSlice.Copy(ctx, dst))
}
}
func (c *HybridCache) ensureCheckpointConv(layer int, entry *checkpointEntry) ml.Tensor {
func (c *Recurrent) ensureCheckpointConv(layer int, entry *checkpointEntry) ml.Tensor {
if entry.conv == nil {
entry.conv = make(map[int]ml.Tensor)
}
@@ -446,24 +509,24 @@ func (c *HybridCache) ensureCheckpointConv(layer int, entry *checkpointEntry) ml
return t
}
func (c *HybridCache) ensureCheckpointDelta(layer int, entry *checkpointEntry) ml.Tensor {
if entry.delta == nil {
entry.delta = make(map[int]ml.Tensor)
func (c *Recurrent) ensureCheckpointRecurrent(layer int, entry *checkpointEntry) ml.Tensor {
if entry.recurrent == nil {
entry.recurrent = make(map[int]ml.Tensor)
}
if t, ok := entry.delta[layer]; ok {
if t, ok := entry.recurrent[layer]; ok {
return t
}
ctx, ok := c.checkpointDeltaCtxs[layer]
ctx, ok := c.checkpointRecurCtxs[layer]
if !ok {
ctx = c.backend.NewContextSize(c.checkpointCtxSize).Layer(layer)
c.checkpointDeltaCtxs[layer] = ctx
c.checkpointRecurCtxs[layer] = ctx
}
t := ctx.Zeros(ml.DTypeF32, c.deltaStateSize, 1)
entry.delta[layer] = t
t := ctx.Zeros(ml.DTypeF32, c.recurrentStateSize, 1)
entry.recurrent[layer] = t
return t
}
func (c *HybridCache) reserveCheckpointConv(layer int) {
func (c *Recurrent) reserveCheckpointConv(layer int) {
key := checkpointReserveKey(layer, 0)
if _, ok := c.checkpointReserved[key]; ok {
return
@@ -478,7 +541,7 @@ func (c *HybridCache) reserveCheckpointConv(layer int) {
c.checkpointReserved[key] = struct{}{}
}
func (c *HybridCache) reserveCheckpointDelta(layer int) {
func (c *Recurrent) reserveCheckpointRecurrent(layer int) {
key := checkpointReserveKey(layer, 1)
if _, ok := c.checkpointReserved[key]; ok {
return
@@ -487,7 +550,7 @@ func (c *HybridCache) reserveCheckpointDelta(layer int) {
store := c.checkpointStore(slot)
for i := range store.entries {
entry := &store.entries[i]
_ = c.ensureCheckpointDelta(layer, entry)
_ = c.ensureCheckpointRecurrent(layer, entry)
}
}
c.checkpointReserved[key] = struct{}{}

View File

@@ -1,40 +1,16 @@
package qwen3next
package kvcache
import (
"errors"
"math"
"os"
"slices"
"testing"
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
)
func newTestBackend(tb testing.TB) ml.Backend {
tb.Helper()
f, err := os.CreateTemp(tb.TempDir(), "*.gguf")
if err != nil {
tb.Fatal(err)
}
if err := ggml.WriteGGUF(f, ggml.KV{"general.architecture": "test"}, nil); err != nil {
_ = f.Close()
tb.Fatal(err)
}
if err := f.Close(); err != nil {
tb.Fatal(err)
}
b, err := ml.NewBackend(f.Name(), ml.BackendParams{AllocMemory: true})
if err != nil {
tb.Fatal(err)
}
tb.Cleanup(func() {
b.Close()
})
return b
func newTestCache() *Recurrent {
return NewRecurrentCache(RecurrentConfig{ConvDim: 1, ConvChannels: 2, RecurrentStateSize: 2})
}
func TestSlotCheckpointStoreBestIndex(t *testing.T) {
@@ -59,8 +35,8 @@ func TestSlotCheckpointStoreBestIndex(t *testing.T) {
}
}
func TestHybridCachePrepareRestore(t *testing.T) {
cache := NewHybridCache(nil, 1, 1, 1)
func TestCachePrepareRestore(t *testing.T) {
cache := newTestCache()
cache.checkpointCount = 3
cache.checkpoints = make(map[int]*slotCheckpointStore)
cache.pendingRestore = make(map[int]checkpointRestore)
@@ -110,45 +86,8 @@ func TestSlotCheckpointStorePruneAfter(t *testing.T) {
}
}
func TestHybridCacheRestoreDetachesSharedSlot(t *testing.T) {
backend := newTestBackend(t)
cache := NewHybridCache(nil, 1, 2, 2)
cache.Init(backend, ml.DTypeF16, 2, 8, 2)
cache.slotForSeq[1] = 0
cache.slotForSeq[2] = 0
cache.refCount[0] = 2
cache.refCount[1] = 0
cache.freeSlots = []int{1}
store := cache.checkpointStore(0)
idx := store.record(9)
cache.pendingRestore[1] = checkpointRestore{slot: 0, idx: idx, pos: 9}
if err := cache.Remove(1, 10, math.MaxInt32); err != nil {
t.Fatalf("Remove failed: %v", err)
}
if cache.slotForSeq[1] == cache.slotForSeq[2] {
t.Fatalf("expected restore to detach shared slot, got same slot %d", cache.slotForSeq[1])
}
if cache.slotForSeq[1] != 1 {
t.Fatalf("expected seq 1 to move to slot 1, got %d", cache.slotForSeq[1])
}
if cache.slotForSeq[2] != 0 {
t.Fatalf("expected seq 2 to remain on slot 0, got %d", cache.slotForSeq[2])
}
if cache.refCount[0] != 1 || cache.refCount[1] != 1 {
t.Fatalf("unexpected refCounts: slot0=%d slot1=%d", cache.refCount[0], cache.refCount[1])
}
if _, ok := cache.pendingRestore[1]; ok {
t.Fatalf("expected pending restore to be cleared")
}
}
func TestHybridCacheRestoreRejectsIncompleteCheckpoint(t *testing.T) {
cache := NewHybridCache(nil, 1, 2, 2)
func TestCacheRestoreRejectsIncompleteCheckpoint(t *testing.T) {
cache := newTestCache()
cache.checkpointCount = 3
cache.checkpoints = make(map[int]*slotCheckpointStore)
cache.pendingRestore = make(map[int]checkpointRestore)
@@ -157,27 +96,26 @@ func TestHybridCacheRestoreRejectsIncompleteCheckpoint(t *testing.T) {
cache.refCount = []int{1}
cache.freeSlots = nil
// Simulate that layer 0 has both conv and delta state (so entryComplete expects both)
cache.convStates[0] = nil // placeholder to indicate layer 0 exists
cache.deltaStates[0] = nil // placeholder to indicate layer 0 exists
// Simulate layer 0 requires both conv and recurrent checkpoints.
cache.convStates[0] = nil
cache.recurrentStates[0] = nil
store := cache.checkpointStore(0)
idx := store.record(9)
entry := &store.entries[idx]
// Only set conv checkpoint, not delta - making it incomplete
entry.conv = map[int]ml.Tensor{0: nil}
// entry.delta is not set, so checkpoint is incomplete
// entry.recurrent intentionally missing
cache.pendingRestore[1] = checkpointRestore{slot: 0, idx: idx, pos: 9}
err := cache.Remove(1, 10, math.MaxInt32)
if !errors.Is(err, kvcache.ErrNotSupported) {
if !errors.Is(err, ErrNotSupported) {
t.Fatalf("expected ErrNotSupported for incomplete checkpoint, got %v", err)
}
}
func TestHybridCacheRestoreAcceptsCompleteCheckpoint(t *testing.T) {
cache := NewHybridCache(nil, 1, 2, 2)
func TestCacheRestoreAcceptsCompleteCheckpoint(t *testing.T) {
cache := newTestCache()
cache.checkpointCount = 3
cache.checkpoints = make(map[int]*slotCheckpointStore)
cache.pendingRestore = make(map[int]checkpointRestore)
@@ -186,55 +124,111 @@ func TestHybridCacheRestoreAcceptsCompleteCheckpoint(t *testing.T) {
cache.refCount = []int{1}
cache.freeSlots = nil
// Don't set convStates/recurrentStates: with no layers to check,
// entryComplete will return true as long as entry.pos >= 0
store := cache.checkpointStore(0)
idx := store.record(9)
cache.pendingRestore[1] = checkpointRestore{slot: 0, idx: idx, pos: 9}
// Test that restoreComplete returns true when no layers need checkpoints
restore := cache.pendingRestore[1]
if !cache.restoreComplete(restore) {
t.Fatalf("expected restoreComplete to return true for complete checkpoint")
}
}
func TestCacheRecurrentStateShapeValidation(t *testing.T) {
cache := newTestCache()
_, err := cache.RecurrentState(nil, 0, 3)
if !errors.Is(err, ErrInvalidRecurrentShape) {
t.Fatalf("expected ErrInvalidRecurrentShape, got %v", err)
}
}
func TestSlotCheckpointStoreShiftRange(t *testing.T) {
store := newSlotCheckpointStore(5)
store.record(1)
store.record(4)
store.record(7)
store.record(10)
store.shiftRange(2, 6)
var positions []int32
for i := range store.entries {
if store.entries[i].pos >= 0 {
positions = append(positions, store.entries[i].pos)
}
}
slices.Sort(positions)
want := []int32{1, 3, 6}
if !slices.Equal(positions, want) {
t.Fatalf("unexpected shifted positions: got=%v want=%v", positions, want)
}
if store.lastPos != 6 {
t.Fatalf("expected lastPos 6, got %d", store.lastPos)
}
}
func TestCacheRemoveMiddleShiftsCheckpoints(t *testing.T) {
cache := newTestCache()
cache.slotForSeq[1] = 0
cache.refCount = []int{1}
cache.pendingRestore[1] = checkpointRestore{slot: 0, idx: 0, pos: 1}
store := cache.checkpointStore(0)
store.record(1)
store.record(4)
store.record(7)
store.record(10)
if err := cache.Remove(1, 2, 6); err != nil {
t.Fatalf("expected middle remove to succeed, got %v", err)
}
if _, ok := cache.pendingRestore[1]; ok {
t.Fatalf("expected pending restore to be cleared after middle remove")
}
var positions []int32
for i := range store.entries {
if store.entries[i].pos >= 0 {
positions = append(positions, store.entries[i].pos)
}
}
slices.Sort(positions)
want := []int32{1, 3, 6}
if !slices.Equal(positions, want) {
t.Fatalf("unexpected checkpoint positions after remove: got=%v want=%v", positions, want)
}
}
func TestSlotCheckpointStoreRingBufferWrapAround(t *testing.T) {
// Test that ring buffer wrap-around reuses entries without clearing maps.
store := newSlotCheckpointStore(3)
// Fill the buffer
store.record(10)
store.record(20)
store.record(30)
// Create fake tensor data in the first entry's maps
store.entries[0].conv = make(map[int]ml.Tensor)
store.entries[0].conv[0] = nil // Simulated tensor reference
store.entries[0].delta = make(map[int]ml.Tensor)
store.entries[0].delta[0] = nil // Simulated tensor reference
store.entries[0].conv[0] = nil
store.entries[0].recurrent = make(map[int]ml.Tensor)
store.entries[0].recurrent[0] = nil
// Record another entry, which should wrap around and overwrite entry 0
store.record(40)
// Verify the maps are still present (we reuse tensors)
if store.entries[0].conv == nil {
t.Fatalf("expected conv map to be preserved on reuse")
}
if store.entries[0].delta == nil {
t.Fatalf("expected delta map to be preserved on reuse")
if store.entries[0].recurrent == nil {
t.Fatalf("expected recurrent map to be preserved on reuse")
}
// Verify the new position was recorded
if store.entries[0].pos != 40 {
t.Fatalf("expected entry 0 pos to be 40, got %d", store.entries[0].pos)
}
}
func TestSlotCheckpointStoreFullCapacity(t *testing.T) {
// Test behavior when buffer is exactly at capacity
store := newSlotCheckpointStore(2)
idx1 := store.record(10)
@@ -243,12 +237,10 @@ func TestSlotCheckpointStoreFullCapacity(t *testing.T) {
if idx1 != 0 || idx2 != 1 {
t.Fatalf("expected indices 0, 1, got %d, %d", idx1, idx2)
}
if store.size != 2 {
t.Fatalf("expected size 2, got %d", store.size)
}
// Verify both checkpoints are accessible
_, pos1, ok1 := store.bestIndex(15)
_, pos2, ok2 := store.bestIndex(25)
@@ -261,7 +253,6 @@ func TestSlotCheckpointStoreFullCapacity(t *testing.T) {
}
func TestSlotCheckpointStoreEmptyBuffer(t *testing.T) {
// Test behavior with zero-size buffer
store := newSlotCheckpointStore(0)
idx := store.record(10)
@@ -276,19 +267,16 @@ func TestSlotCheckpointStoreEmptyBuffer(t *testing.T) {
}
func TestSlotCheckpointStorePruneAfterAll(t *testing.T) {
// Test pruning that removes all checkpoints
store := newSlotCheckpointStore(3)
store.record(10)
store.record(20)
store.record(30)
// Prune everything by setting threshold below all positions
store.pruneAfter(5)
if store.size != 0 {
t.Fatalf("expected size 0 after pruning all, got %d", store.size)
}
// When all checkpoints are pruned, lastPos is reset to -1
if store.lastPos != -1 {
t.Fatalf("expected lastPos -1 after pruning all, got %d", store.lastPos)
}

View File

@@ -0,0 +1,37 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Sun, 22 Feb 2026 14:12:30 -0800
Subject: [PATCH] ggml-metal: guard mul_mat_id map0 and add ne20=22
specialization
---
ggml/src/ggml-metal/ggml-metal-ops.cpp | 3 ++-
ggml/src/ggml-metal/ggml-metal.metal | 1 +
2 files changed, 3 insertions(+), 1 deletion(-)
diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp
index 4ac135603..ac5ad53db 100644
--- a/ggml/src/ggml-metal/ggml-metal-ops.cpp
+++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp
@@ -1961,7 +1961,8 @@ int ggml_metal_op_mul_mat_id(ggml_metal_op_t ctx, int idx) {
// ne21 = n_rows (batch size)
const int ne21_mm_id_min = 32;
- if (props_dev->has_simdgroup_mm && ne00 >= 64 && (ne21 >= ne21_mm_id_min)) {
+ if (props_dev->has_simdgroup_mm && ne00 >= 64 && (ne21 >= ne21_mm_id_min) &&
+ (ne20 == 1 || ne20 == 2 || ne20 == 4 || ne20 == 6 || ne20 == 8 || ne20 == 10 || ne20 == 16 || ne20 == 22)) {
// some Metal matrix data types require aligned pointers
// ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (Table 2.5)
//switch (op->src[0]->type) {
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index c37447a10..4f338aa13 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -9427,6 +9427,7 @@ template [[host_name("kernel_mul_mm_id_map0_ne20_6" )]] kernel kernel_mul_mm_id_
template [[host_name("kernel_mul_mm_id_map0_ne20_8" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<8>;
template [[host_name("kernel_mul_mm_id_map0_ne20_10")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<10>;
template [[host_name("kernel_mul_mm_id_map0_ne20_16")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<16>;
+template [[host_name("kernel_mul_mm_id_map0_ne20_22")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<22>;
template<typename S0, typename S0_4x4, typename S0_8x8, typename S1, typename S1_2x4, typename S1_8x8, typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread S0_4x4 &), typename T0, typename T0_4x4, typename T1, typename T1_2x4>
kernel void kernel_mul_mm_id(

View File

@@ -163,6 +163,7 @@ type Tensor interface {
Conv2D(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
Conv3D(ctx Context, weight Tensor, c, s0, s1, s2, p0, p1, p2, d0, d1, d2 int) Tensor
SSMConv(ctx Context, kernel Tensor) Tensor
SSMScan(ctx Context, x, dt, A, B, C, ids Tensor) Tensor
IM2Col(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
@@ -194,6 +195,7 @@ type Tensor interface {
Concat(ctx Context, t2 Tensor, dim int) Tensor
Rows(ctx Context, t2 Tensor) Tensor
SetRows(ctx Context, src Tensor, idxs Tensor) Tensor
SetInplace(ctx Context, src Tensor, nb1, nb2, nb3, offset int) Tensor
Copy(ctx Context, t2 Tensor) Tensor
Duplicate(ctx Context) Tensor

View File

@@ -1345,6 +1345,21 @@ func (t *Tensor) SetRows(ctx ml.Context, src ml.Tensor, idxs ml.Tensor) ml.Tenso
}
}
func (t *Tensor) SetInplace(ctx ml.Context, src ml.Tensor, nb1, nb2, nb3, offset int) ml.Tensor {
return &Tensor{
b: t.b,
t: C.ggml_set_inplace(
ctx.(*Context).ctx,
t.t,
src.(*Tensor).t,
C.size_t(nb1),
C.size_t(nb2),
C.size_t(nb3),
C.size_t(offset),
),
}
}
func (t *Tensor) Copy(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
return &Tensor{
b: t.b,
@@ -1662,6 +1677,13 @@ func (t *Tensor) SSMConv(ctx ml.Context, kernel ml.Tensor) ml.Tensor {
}
}
func (t *Tensor) SSMScan(ctx ml.Context, x, dt, A, B, C, ids ml.Tensor) ml.Tensor {
return &Tensor{
b: t.b,
t: C.ggml_ssm_scan(ctx.(*Context).ctx, t.t, x.(*Tensor).t, dt.(*Tensor).t, A.(*Tensor).t, B.(*Tensor).t, C.(*Tensor).t, ids.(*Tensor).t),
}
}
func (t *Tensor) AvgPool2D(ctx ml.Context, k, s int, p float32) ml.Tensor {
return &Tensor{
b: t.b,

View File

@@ -12249,6 +12249,7 @@ template [[host_name("kernel_mul_mm_id_map0_ne20_6" )]] kernel kernel_mul_mm_id_
template [[host_name("kernel_mul_mm_id_map0_ne20_8" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<8>;
template [[host_name("kernel_mul_mm_id_map0_ne20_10")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<10>;
template [[host_name("kernel_mul_mm_id_map0_ne20_16")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<16>;
template [[host_name("kernel_mul_mm_id_map0_ne20_22")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<22>;
template<typename S0, typename S0_4x4, typename S0_8x8, typename S1, typename S1_2x4, typename S1_8x8, typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread S0_4x4 &), typename T0, typename T0_4x4, typename T1, typename T1_2x4>
kernel void kernel_mul_mm_id(

View File

@@ -1961,7 +1961,8 @@ int ggml_metal_op_mul_mat_id(ggml_metal_op_t ctx, int idx) {
// ne21 = n_rows (batch size)
const int ne21_mm_id_min = 32;
if (props_dev->has_simdgroup_mm && ne00 >= 64 && (ne21 >= ne21_mm_id_min)) {
if (props_dev->has_simdgroup_mm && ne00 >= 64 && (ne21 >= ne21_mm_id_min) &&
(ne20 == 1 || ne20 == 2 || ne20 == 4 || ne20 == 6 || ne20 == 8 || ne20 == 10 || ne20 == 16 || ne20 == 22)) {
// some Metal matrix data types require aligned pointers
// ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (Table 2.5)
//switch (op->src[0]->type) {

View File

@@ -9427,6 +9427,7 @@ template [[host_name("kernel_mul_mm_id_map0_ne20_6" )]] kernel kernel_mul_mm_id_
template [[host_name("kernel_mul_mm_id_map0_ne20_8" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<8>;
template [[host_name("kernel_mul_mm_id_map0_ne20_10")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<10>;
template [[host_name("kernel_mul_mm_id_map0_ne20_16")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<16>;
template [[host_name("kernel_mul_mm_id_map0_ne20_22")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<22>;
template<typename S0, typename S0_4x4, typename S0_8x8, typename S1, typename S1_2x4, typename S1_8x8, typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread S0_4x4 &), typename T0, typename T0_4x4, typename T1, typename T1_2x4>
kernel void kernel_mul_mm_id(

View File

@@ -67,6 +67,7 @@ func (f *fakeTensor) Tri(ctx ml.Context, _ int) ml.Tensor
func (f *fakeTensor) Fill(ctx ml.Context, _ float32) ml.Tensor { return f }
func (f *fakeTensor) Repeat4D(ctx ml.Context, _, _, _, _ int) ml.Tensor { return f }
func (f *fakeTensor) SolveTri(ctx ml.Context, _ ml.Tensor, _, _, _ bool) ml.Tensor { return f }
func (f *fakeTensor) SSMScan(ctx ml.Context, _, _, _, _, _, _ ml.Tensor) ml.Tensor { return f }
func (m *fakeBackend) Get(name string) ml.Tensor {
if slices.Contains(m.names, name) {

View File

@@ -1,410 +1,44 @@
package lfm2
import (
"slices"
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/model/input"
)
var _ kvcache.Cache = (*HybridCache)(nil)
var (
_ kvcache.Cache = (*HybridCache)(nil)
_ kvcache.CheckpointCache = (*HybridCache)(nil)
)
// HybridCache stores:
// - a standard causal KV cache for attention layers
// - a per-sequence recurrent conv state for shortconv layers
// HybridCache adapts the shared recurrent cache for LFM2:
// - KV attention cache is handled by the embedded causal cache
// - shortconv recurrent state uses conv slots [dConv, hiddenSize]
//
// Conv state shape (per layer, per sequence): [dConv, hiddenSize] where dConv = L_cache - 1.
// Stored internally as a tensor of shape [dConv * hiddenSize, maxSlots].
// This reuses shared checkpoint/restore logic for prefix mismatch recovery.
type HybridCache struct {
kv *kvcache.Causal
backend ml.Backend
dtype ml.DType
maxSequences int
hiddenSize int
dConv int
// slot mapping for recurrent state
slotForSeq map[int]int
refCount []int
freeSlots []int
// per-layer conv state buffers (allocated lazily)
convCtxs map[int]ml.Context
convStates map[int]ml.Tensor // [dConv*hiddenSize, maxSlots]
// current forward batch (derived in StartForward)
curSeqs []int
curSlots []int
curSlotsInput ml.Tensor
curSeqTokens int
// track if EnsureWritable has been called for this forward pass
writableEnsured bool
// track any error from EnsureWritable to propagate later
writableError error
*kvcache.Recurrent
}
func NewHybridCache(shift func(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error), hiddenSize, dConv int) *HybridCache {
return &HybridCache{
kv: kvcache.NewCausalCache(shift),
hiddenSize: hiddenSize,
dConv: dConv,
slotForSeq: make(map[int]int),
convCtxs: make(map[int]ml.Context),
convStates: make(map[int]ml.Tensor),
}
}
base := kvcache.NewRecurrentCache(kvcache.RecurrentConfig{
Shift: shift,
ConvDim: dConv,
ConvChannels: hiddenSize,
RecurrentStateSize: 1, // LFM2 uses only conv state; keep a minimal recurrent buffer size.
CheckpointLogPrefix: "lfm2",
})
func (c *HybridCache) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity, maxBatch int) {
c.backend = backend
c.dtype = dtype
c.maxSequences = maxSequences
// initialize slot allocator
c.refCount = make([]int, maxSequences)
c.freeSlots = c.freeSlots[:0]
for i := maxSequences - 1; i >= 0; i-- {
c.freeSlots = append(c.freeSlots, i)
}
c.kv.Init(backend, dtype, maxSequences, capacity, maxBatch)
}
func (c *HybridCache) Close() {
for _, ctx := range c.convCtxs {
ctx.Close()
}
c.kv.Close()
}
func (c *HybridCache) SetConfig(config ml.CacheConfig) {
c.kv.SetConfig(config)
}
func (c *HybridCache) SetLayer(layer int) {
c.kv.SetLayer(layer)
}
func (c *HybridCache) Get(ctx ml.Context) (ml.Tensor, ml.Tensor, ml.Tensor) {
return c.kv.Get(ctx)
}
func (c *HybridCache) Put(ctx ml.Context, key, value ml.Tensor) {
c.kv.Put(ctx, key, value)
}
func (c *HybridCache) StartForward(ctx ml.Context, batch input.Batch, reserve bool) error {
if err := c.kv.StartForward(ctx, batch, reserve); err != nil {
return err
}
// Derive equal-length sequence layout for shortconv.
// LFM2 shortconv assumes tokens form a [seq_tokens, seqs] grid.
seqCounts := make(map[int]int)
c.curSeqs = c.curSeqs[:0]
for _, s := range batch.Sequences {
if _, ok := seqCounts[s]; !ok {
c.curSeqs = append(c.curSeqs, s)
}
seqCounts[s]++
}
if len(c.curSeqs) == 0 {
return nil
}
nTokens := len(batch.Sequences)
nSeqs := len(c.curSeqs)
want := nTokens / nSeqs
for _, s := range c.curSeqs {
if seqCounts[s] != want {
return kvcache.ErrNotSupported
}
}
c.curSeqTokens = want
// When reserving memory for estimation, use fake slot assignments
// without modifying permanent state (slotForSeq, refCount)
if reserve {
c.curSlots = c.curSlots[:0]
slots := make([]int32, nSeqs)
for i := range nSeqs {
c.curSlots = append(c.curSlots, i)
slots[i] = int32(i)
}
c.curSlotsInput = ctx.Input().FromInts(slots, len(slots))
return nil
}
// Ensure slots exist for sequences in this batch
c.curSlots = c.curSlots[:0]
var newSlots []int // track newly allocated slots that need zeroing
for _, s := range c.curSeqs {
slot, ok := c.slotForSeq[s]
if !ok {
var err error
slot, err = c.allocSlot()
if err != nil {
return err
}
c.slotForSeq[s] = slot
c.refCount[slot] = 1
newSlots = append(newSlots, slot)
}
c.curSlots = append(c.curSlots, slot)
}
// Zero conv state for newly allocated slots to clear stale data from previous sequences
if len(newSlots) > 0 {
c.zeroConvSlots(ctx, newSlots)
}
// Create a tensor for the current slots
slots := make([]int32, len(c.curSlots))
for i, v := range c.curSlots {
slots[i] = int32(v)
}
c.curSlotsInput = ctx.Input().FromInts(slots, len(slots))
// Reset writable state for new forward pass
c.writableEnsured = false
c.writableError = nil
return nil
}
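The layout derivation in StartForward can be illustrated in isolation. This is a hypothetical standalone sketch (not the ollama code; `deriveGrid` is an invented name): collect unique sequence IDs in batch order, then require every sequence to contribute the same number of tokens so the batch forms a [seqTokens, seqs] grid.

```go
package main

import "fmt"

// deriveGrid returns the unique sequences in first-appearance order and the
// per-sequence token count, or ok=false if the batch is ragged (which in the
// cache above maps to kvcache.ErrNotSupported).
func deriveGrid(batchSeqs []int) (seqs []int, seqTokens int, ok bool) {
	counts := make(map[int]int)
	for _, s := range batchSeqs {
		if _, seen := counts[s]; !seen {
			seqs = append(seqs, s)
		}
		counts[s]++
	}
	if len(seqs) == 0 {
		return nil, 0, false
	}
	want := len(batchSeqs) / len(seqs)
	for _, s := range seqs {
		if counts[s] != want {
			return nil, 0, false // ragged batch: shortconv grid unsupported
		}
	}
	return seqs, want, true
}

func main() {
	_, n, ok := deriveGrid([]int{0, 0, 1, 1}) // 2 seqs x 2 tokens each
	fmt.Println(n, ok)
	_, _, ok = deriveGrid([]int{0, 0, 1}) // ragged
	fmt.Println(ok)
}
```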
func (c *HybridCache) allocSlot() (int, error) {
if len(c.freeSlots) == 0 {
return 0, kvcache.ErrKvCacheFull
}
slot := c.freeSlots[len(c.freeSlots)-1]
c.freeSlots = c.freeSlots[:len(c.freeSlots)-1]
return slot, nil
}
func (c *HybridCache) freeSlot(slot int) {
// Bounds check before freeing
if slot >= 0 && slot < c.maxSequences {
c.freeSlots = append(c.freeSlots, slot)
}
}
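allocSlot and freeSlot above implement a LIFO free list. A minimal standalone sketch of the same pattern (`slotPool` is an invented name, not the ollama type): slots are pushed onto the free list at init and popped from the end on allocation, so the most recently freed slot is reused first.

```go
package main

import "fmt"

type slotPool struct {
	free []int
}

func newSlotPool(n int) *slotPool {
	p := &slotPool{free: make([]int, 0, n)}
	for i := n - 1; i >= 0; i-- {
		p.free = append(p.free, i) // slot 0 ends up on top of the stack
	}
	return p
}

func (p *slotPool) alloc() (int, bool) {
	if len(p.free) == 0 {
		return 0, false // pool exhausted (ErrKvCacheFull in the cache above)
	}
	s := p.free[len(p.free)-1]
	p.free = p.free[:len(p.free)-1]
	return s, true
}

func (p *slotPool) release(s int) { p.free = append(p.free, s) }

func main() {
	p := newSlotPool(2)
	a, _ := p.alloc()
	b, _ := p.alloc()
	_, ok := p.alloc()
	fmt.Println(a, b, ok) // 0 1 false
	p.release(a)
	c, _ := p.alloc()
	fmt.Println(c) // 0: LIFO reuse of the freed slot
}
```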
// zeroConvSlots zeros the conv state for the given slots across all layers.
// This must be called when recycling slots to prevent stale state from affecting new sequences.
func (c *HybridCache) zeroConvSlots(ctx ml.Context, slots []int) {
if len(slots) == 0 || len(c.convStates) == 0 {
return
}
// Use input context for creating tensors
inputCtx := ctx.Input()
// Create slot indices tensor
slotIndices := make([]int32, len(slots))
for i, s := range slots {
slotIndices[i] = int32(s)
}
slotsTensor := inputCtx.FromInts(slotIndices, len(slotIndices))
// Create zero tensor for the slots (SetRows requires F32 source)
zeros := inputCtx.Zeros(ml.DTypeF32, c.dConv*c.hiddenSize, len(slots))
// Zero each layer's conv state for these slots
for _, buf := range c.convStates {
ctx.Forward(buf.SetRows(ctx, zeros, slotsTensor))
}
}
// EnsureWritable ensures that sequences in the current batch have private (non-shared) conv slots.
// Returns an error if slot allocation fails.
func (c *HybridCache) EnsureWritable(ctx ml.Context) error {
for i, seq := range c.curSeqs {
slot, ok := c.slotForSeq[seq]
if !ok {
continue
}
// Bounds check
if slot < 0 || slot >= len(c.refCount) {
continue
}
if c.refCount[slot] <= 1 {
continue
}
newSlot, err := c.allocSlot()
if err != nil {
return err
}
c.refCount[slot]--
c.refCount[newSlot] = 1
c.slotForSeq[seq] = newSlot
c.curSlots[i] = newSlot
// Copy existing conv state for all initialized layers
for _, buf := range c.convStates {
// buf: [dConv*hiddenSize, maxSlots]
src := buf.Rows(ctx, ctx.Input().FromInts([]int32{int32(slot)}, 1))
// SetRows requires F32 source
srcF32 := src.Cast(ctx, ml.DTypeF32)
ctx.Forward(buf.SetRows(ctx, srcF32, ctx.Input().FromInts([]int32{int32(newSlot)}, 1)))
}
}
// Rebuild current slots tensor
slots := make([]int32, len(c.curSlots))
for i, v := range c.curSlots {
slots[i] = int32(v)
}
c.curSlotsInput = ctx.Input().FromInts(slots, len(slots))
return nil
}
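The write side of the copy-on-write scheme can be sketched without the tensor copies. This is a hypothetical simplification of EnsureWritable above (`ensureWritable` here is an invented standalone function): if a sequence's slot is shared (refCount > 1), pop a fresh slot from the free list and move one reference over.

```go
package main

import "fmt"

func ensureWritable(slotForSeq map[int]int, refCount []int, free *[]int, seq int) int {
	slot := slotForSeq[seq]
	if refCount[slot] <= 1 {
		return slot // already private, nothing to do
	}
	// pop a fresh slot from the free list (LIFO)
	n := len(*free)
	newSlot := (*free)[n-1]
	*free = (*free)[:n-1]
	// split the reference: seq moves to its own slot
	refCount[slot]--
	refCount[newSlot] = 1
	slotForSeq[seq] = newSlot
	return newSlot
}

func main() {
	slotForSeq := map[int]int{1: 0, 2: 0} // seqs 1 and 2 share slot 0
	refCount := []int{2, 0}
	free := []int{1}
	got := ensureWritable(slotForSeq, refCount, &free, 2)
	fmt.Println(got, refCount[0], refCount[1]) // seq 2 now owns slot 1 privately
}
```

In the real method the conv state is also copied from the old slot to the new one; this sketch shows only the bookkeeping.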
func (c *HybridCache) CopyPrefix(srcSeq, dstSeq int, prefixLen int32) {
// KV cache shares prefix metadata (no copy) which is correct for prefix reuse.
c.kv.CopyPrefix(srcSeq, dstSeq, prefixLen)
// For shortconv state we implement copy-on-write: dst shares the same slot as src.
// On the first write to dst, EnsureWritable will create a private slot.
if dstSlot, ok := c.slotForSeq[dstSeq]; ok {
// Bounds check before decrementing
if dstSlot >= 0 && dstSlot < len(c.refCount) {
c.refCount[dstSlot]--
if c.refCount[dstSlot] <= 0 {
c.refCount[dstSlot] = 0
c.freeSlot(dstSlot)
}
}
delete(c.slotForSeq, dstSeq)
}
srcSlot, ok := c.slotForSeq[srcSeq]
if !ok {
// src may not have a slot yet; dst will allocate on demand
return
}
// Bounds check before incrementing
if srcSlot >= 0 && srcSlot < len(c.refCount) {
c.slotForSeq[dstSeq] = srcSlot
c.refCount[srcSlot]++
}
}
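The fork side of the copy-on-write sharing used by CopyPrefix above can be sketched in a few lines (hypothetical standalone types, not the ollama code): forking dst from src just points dst at src's slot and bumps a reference count; the state is only duplicated later, on first write.

```go
package main

import "fmt"

type cowSlots struct {
	slotForSeq map[int]int
	refCount   map[int]int
}

// fork makes dstSeq share srcSeq's slot, releasing any slot dstSeq held.
func (c *cowSlots) fork(srcSeq, dstSeq int) {
	if old, ok := c.slotForSeq[dstSeq]; ok {
		c.refCount[old]--
		delete(c.slotForSeq, dstSeq)
	}
	if slot, ok := c.slotForSeq[srcSeq]; ok {
		c.slotForSeq[dstSeq] = slot
		c.refCount[slot]++
	}
	// if srcSeq has no slot yet, dstSeq simply allocates on demand later
}

func main() {
	c := &cowSlots{
		slotForSeq: map[int]int{1: 0},
		refCount:   map[int]int{0: 1},
	}
	c.fork(1, 2) // seq 2 now shares slot 0 with seq 1
	fmt.Println(c.slotForSeq[2], c.refCount[0])
}
```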
func (c *HybridCache) CanResume(seq int, pos int32) bool {
return c.kv.CanResume(seq, pos)
}
func (c *HybridCache) Remove(seq int, beginIndex, endIndex int32) error {
if err := c.kv.Remove(seq, beginIndex, endIndex); err != nil {
return err
}
// For recurrent state, any removal invalidates the state because
// the state at position N depends on all previous positions.
// Drop the slot mapping so it resets on next use.
slot, ok := c.slotForSeq[seq]
if !ok {
return nil
}
// Bounds check
if slot < 0 || slot >= len(c.refCount) {
delete(c.slotForSeq, seq)
return nil
}
c.refCount[slot]--
if c.refCount[slot] <= 0 {
c.refCount[slot] = 0
c.freeSlot(slot)
}
delete(c.slotForSeq, seq)
return nil
}
func (c *HybridCache) slotsTensor() ml.Tensor {
return c.curSlotsInput
}
func (c *HybridCache) seqTokens() int {
return c.curSeqTokens
}
func (c *HybridCache) numSeqs() int {
return len(c.curSeqs)
}
func (c *HybridCache) convBuffer(ctx ml.Context, layer int) ml.Tensor {
if buf, ok := c.convStates[layer]; ok {
return buf
}
if _, ok := c.convCtxs[layer]; !ok {
c.convCtxs[layer] = c.backend.NewContextSize(1).Layer(layer)
}
buf := c.convCtxs[layer].Zeros(c.dtype, c.dConv*c.hiddenSize, c.maxSequences)
c.convStates[layer] = buf
return buf
}
// ConvState returns the conv state for current batch sequences as shape [dConv, hiddenSize, nSeqs].
// Returns an error if copy-on-write allocation fails.
func (c *HybridCache) ConvState(ctx ml.Context, layer int) (ml.Tensor, error) {
if !c.writableEnsured {
needsWritable := false
for _, seq := range c.curSeqs {
slot, ok := c.slotForSeq[seq]
if !ok {
continue
}
if slot >= 0 && slot < len(c.refCount) && c.refCount[slot] > 1 {
needsWritable = true
break
}
}
if needsWritable {
if err := c.EnsureWritable(ctx); err != nil {
c.writableError = err
}
}
c.writableEnsured = true
}
if c.writableError != nil {
return nil, c.writableError
}
buf := c.convBuffer(ctx, layer)
cur := buf.Rows(ctx, c.slotsTensor())
return cur.Reshape(ctx, c.dConv, c.hiddenSize, c.numSeqs()), nil
}
// UpdateConvState writes a new conv state for current batch sequences.
// newState must have shape [dConv, hiddenSize, nSeqs].
func (c *HybridCache) UpdateConvState(ctx ml.Context, layer int, newState ml.Tensor) {
buf := c.convBuffer(ctx, layer)
src := newState.Reshape(ctx, c.dConv*c.hiddenSize, c.numSeqs())
// SetRows requires F32 source
srcF32 := src.Cast(ctx, ml.DTypeF32)
ctx.Forward(buf.SetRows(ctx, srcF32, c.slotsTensor()))
}
// IsSupportedForBatch returns true if the current batch layout supports shortconv.
func (c *HybridCache) IsSupportedForBatch() bool {
return c.curSeqTokens > 0 && len(c.curSeqs) > 0
}
// Seqs returns the ordered unique sequences for the current forward pass.
func (c *HybridCache) Seqs() []int {
return slices.Clone(c.curSeqs)
}

View File

@@ -4,441 +4,39 @@ import (
"testing"
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
)
// TestHybridCache tests verify the slot management logic of HybridCache.
// These tests focus on the recurrent state slot allocation, reference counting,
// and copy-on-write semantics without requiring a full ML backend.
func TestHybridCache_New(t *testing.T) {
cache := NewHybridCache(nil, 512, 2)
if cache == nil {
t.Fatal("expected cache to be created")
}
if cache.Recurrent == nil {
t.Fatal("expected embedded recurrent cache to be created")
}
}
// createSlotOnlyCache creates a HybridCache with only the slot management
// fields initialized. Used to test slot logic in isolation.
func createSlotOnlyCache(maxSequences int) *HybridCache {
return &HybridCache{
hiddenSize:   256,
dConv:        3,
maxSequences: maxSequences,
refCount:     make([]int, maxSequences),
freeSlots:    initFreeSlots(maxSequences),
slotForSeq:   make(map[int]int),
convCtxs:     make(map[int]ml.Context),
convStates:   make(map[int]ml.Tensor),
}
}
func initFreeSlots(n int) []int {
slots := make([]int, 0, n)
for i := n - 1; i >= 0; i-- {
slots = append(slots, i)
}
return slots
}
func TestHybridCache_ImplementsCheckpointCache(t *testing.T) {
cache := NewHybridCache(nil, 512, 2)
if _, ok := any(cache).(kvcache.CheckpointCache); !ok {
t.Fatal("expected HybridCache to implement CheckpointCache")
}
}
func TestHybridCache_SlotAllocation(t *testing.T) {
cache := createSlotOnlyCache(4)
// Verify initial state
if len(cache.freeSlots) != 4 {
t.Errorf("expected 4 free slots, got %d", len(cache.freeSlots))
}
// Allocate all slots
for range 4 {
slot, err := cache.allocSlot()
if err != nil {
t.Fatalf("allocSlot failed: %v", err)
}
cache.refCount[slot] = 1
}
// Should be full now
if len(cache.freeSlots) != 0 {
t.Errorf("expected 0 free slots, got %d", len(cache.freeSlots))
}
// Trying to allocate another should fail
_, err := cache.allocSlot()
if err != kvcache.ErrKvCacheFull {
t.Errorf("expected ErrKvCacheFull, got %v", err)
}
}
func TestHybridCache_DefaultBatchState(t *testing.T) {
cache := NewHybridCache(nil, 512, 2)
if got := cache.numSeqs(); got != 0 {
t.Fatalf("expected 0 sequences before StartForward, got %d", got)
}
if got := cache.seqTokens(); got != 0 {
t.Fatalf("expected 0 sequence tokens before StartForward, got %d", got)
}
if cache.IsSupportedForBatch() {
t.Fatal("expected unsupported batch layout before StartForward")
}
}
func TestHybridCache_SlotReuse(t *testing.T) {
cache := createSlotOnlyCache(4)
// Allocate a slot
slot1, _ := cache.allocSlot()
cache.refCount[slot1] = 1
// Free it
cache.refCount[slot1] = 0
cache.freeSlot(slot1)
// Allocate again - should get the same slot back (LIFO)
slot2, _ := cache.allocSlot()
if slot2 != slot1 {
t.Errorf("expected slot %d to be reused, got %d", slot1, slot2)
}
}
func TestHybridCache_SlotRefCounting_ShareSlot(t *testing.T) {
cache := createSlotOnlyCache(4)
// Allocate slot for seq 1
slot1, _ := cache.allocSlot()
cache.slotForSeq[1] = slot1
cache.refCount[slot1] = 1
// Simulate sharing slot with seq 2 (copy-on-write style)
cache.slotForSeq[2] = slot1
cache.refCount[slot1]++
// Should share the same slot
if cache.slotForSeq[2] != slot1 {
t.Errorf("expected seq 2 to share slot %d, got %d", slot1, cache.slotForSeq[2])
}
// Ref count should be 2
if cache.refCount[slot1] != 2 {
t.Errorf("expected refCount 2, got %d", cache.refCount[slot1])
}
}
func TestHybridCache_SlotRefCounting_DecRef(t *testing.T) {
cache := createSlotOnlyCache(4)
// Allocate slot for seq 1
slot1, _ := cache.allocSlot()
cache.slotForSeq[1] = slot1
cache.refCount[slot1] = 1
// Share with seq 2
cache.slotForSeq[2] = slot1
cache.refCount[slot1]++
// Unshare seq 2
cache.refCount[slot1]--
delete(cache.slotForSeq, 2)
// Ref count should be back to 1
if cache.refCount[slot1] != 1 {
t.Errorf("expected refCount 1 after unshare, got %d", cache.refCount[slot1])
}
// Seq 2 should no longer have a slot
if _, ok := cache.slotForSeq[2]; ok {
t.Error("seq 2 should not have a slot after unshare")
}
}
func TestHybridCache_SlotFreeWhenUnused(t *testing.T) {
cache := createSlotOnlyCache(4)
initialFreeSlots := len(cache.freeSlots)
// Allocate slot for seq 1
slot1, _ := cache.allocSlot()
cache.slotForSeq[1] = slot1
cache.refCount[slot1] = 1
// Free the slot when refCount drops to 0
cache.refCount[slot1]--
if cache.refCount[slot1] <= 0 {
cache.refCount[slot1] = 0
cache.freeSlot(slot1)
}
delete(cache.slotForSeq, 1)
// Slot should be freed
if len(cache.freeSlots) != initialFreeSlots {
t.Errorf("expected %d free slots, got %d", initialFreeSlots, len(cache.freeSlots))
}
// Ref count should be 0
if cache.refCount[slot1] != 0 {
t.Errorf("expected refCount 0, got %d", cache.refCount[slot1])
}
}
func TestHybridCache_SlotOverwrite(t *testing.T) {
cache := createSlotOnlyCache(4)
// Allocate slots for seq 1 and seq 2
slot1, _ := cache.allocSlot()
cache.slotForSeq[1] = slot1
cache.refCount[slot1] = 1
slot2, _ := cache.allocSlot()
cache.slotForSeq[2] = slot2
cache.refCount[slot2] = 1
initialFreeSlots := len(cache.freeSlots)
// Simulate overwriting seq 2's slot with slot1 (sharing)
// First free the old slot
cache.refCount[slot2]--
if cache.refCount[slot2] <= 0 {
cache.refCount[slot2] = 0
cache.freeSlot(slot2)
}
// Then share slot1
cache.slotForSeq[2] = slot1
cache.refCount[slot1]++
// Seq 2 should now share slot1
if cache.slotForSeq[2] != slot1 {
t.Errorf("expected seq 2 to share slot %d, got %d", slot1, cache.slotForSeq[2])
}
// Old slot2 should be freed
if len(cache.freeSlots) != initialFreeSlots+1 {
t.Errorf("expected %d free slots, got %d", initialFreeSlots+1, len(cache.freeSlots))
}
}
func TestHybridCache_BoundsChecking(t *testing.T) {
cache := createSlotOnlyCache(4)
// Test freeing invalid slot (should not panic)
cache.freeSlot(-1)
cache.freeSlot(100) // out of bounds
// freeSlot does bounds checking, so invalid slots should be ignored
if len(cache.freeSlots) != 4 {
t.Errorf("invalid slots should not affect free list, got %d slots", len(cache.freeSlots))
}
}
func TestHybridCache_MultipleSequences_RefCounting(t *testing.T) {
cache := createSlotOnlyCache(8)
// Allocate slot for seq 1
slot1, _ := cache.allocSlot()
cache.slotForSeq[1] = slot1
cache.refCount[slot1] = 1
// Fork to seq 2, 3, 4 (all share slot1)
for _, seq := range []int{2, 3, 4} {
cache.slotForSeq[seq] = slot1
cache.refCount[slot1]++
}
// Ref count should be 4
if cache.refCount[slot1] != 4 {
t.Errorf("expected refCount 4, got %d", cache.refCount[slot1])
}
// Remove seq 2, 3
for _, seq := range []int{2, 3} {
delete(cache.slotForSeq, seq)
cache.refCount[slot1]--
}
if cache.refCount[slot1] != 2 {
t.Errorf("expected refCount 2, got %d", cache.refCount[slot1])
}
// Slot should still be allocated (not in free list)
found := false
for _, s := range cache.freeSlots {
if s == slot1 {
found = true
break
}
}
if found {
t.Error("slot1 should not be in free list yet")
}
// Remove remaining sequences
for _, seq := range []int{1, 4} {
delete(cache.slotForSeq, seq)
cache.refCount[slot1]--
}
if cache.refCount[slot1] != 0 {
t.Errorf("expected refCount 0, got %d", cache.refCount[slot1])
}
}
func TestHybridCache_ChainedSharing(t *testing.T) {
cache := createSlotOnlyCache(8)
// Create seq 1
slot1, _ := cache.allocSlot()
cache.slotForSeq[1] = slot1
cache.refCount[slot1] = 1
// Share 1 -> 2
cache.slotForSeq[2] = slot1
cache.refCount[slot1]++
// Share 2 -> 3 (should still share slot1)
cache.slotForSeq[3] = cache.slotForSeq[2] // which is slot1
cache.refCount[slot1]++
// All should share slot1
if cache.slotForSeq[1] != slot1 || cache.slotForSeq[2] != slot1 || cache.slotForSeq[3] != slot1 {
t.Error("all sequences should share slot1")
}
if cache.refCount[slot1] != 3 {
t.Errorf("expected refCount 3, got %d", cache.refCount[slot1])
}
}
func TestHybridCache_CacheParameters(t *testing.T) {
cache := NewHybridCache(nil, 512, 5) // hiddenSize=512, dConv=5
if cache.hiddenSize != 512 {
t.Errorf("expected hiddenSize 512, got %d", cache.hiddenSize)
}
if cache.dConv != 5 {
t.Errorf("expected dConv 5, got %d", cache.dConv)
}
}
func TestHybridCache_NumSeqs(t *testing.T) {
cache := createSlotOnlyCache(4)
// Initially no sequences
if cache.numSeqs() != 0 {
t.Errorf("expected 0 seqs, got %d", cache.numSeqs())
}
// Manually set up current batch state
cache.curSeqs = []int{1, 2, 3}
if cache.numSeqs() != 3 {
t.Errorf("expected 3 seqs, got %d", cache.numSeqs())
}
}
func TestHybridCache_SeqTokens(t *testing.T) {
cache := createSlotOnlyCache(4)
// Initially 0
if cache.seqTokens() != 0 {
t.Errorf("expected 0 seqTokens, got %d", cache.seqTokens())
}
// Manually set up current batch state
cache.curSeqTokens = 16
if cache.seqTokens() != 16 {
t.Errorf("expected 16 seqTokens, got %d", cache.seqTokens())
}
}
// Test that Seqs returns a clone of curSeqs
func TestHybridCache_Seqs_ReturnsClone(t *testing.T) {
cache := createSlotOnlyCache(4)
cache.curSeqs = []int{1, 2, 3}
seqs := cache.Seqs()
// Modify returned slice
seqs[0] = 999
// Original should be unchanged
if cache.curSeqs[0] != 1 {
t.Error("Seqs should return a clone, not the original slice")
}
}
func TestHybridCache_IsSupportedForBatch(t *testing.T) {
cache := createSlotOnlyCache(4)
// Initially not supported (no batch set up)
if cache.IsSupportedForBatch() {
t.Error("expected IsSupportedForBatch to be false initially")
}
// Set up a valid batch
cache.curSeqTokens = 1
cache.curSeqs = []int{1}
if !cache.IsSupportedForBatch() {
t.Error("expected IsSupportedForBatch to be true with valid batch")
}
}
func TestHybridCache_ZeroConvSlots_EmptyInputs(t *testing.T) {
cache := createSlotOnlyCache(4)
// zeroConvSlots should handle empty slots without panicking
cache.zeroConvSlots(nil, nil)
cache.zeroConvSlots(nil, []int{})
// zeroConvSlots should handle empty convStates without panicking
cache.zeroConvSlots(nil, []int{0, 1, 2})
}
func TestHybridCache_SlotRecycling_TracksNewSlots(t *testing.T) {
cache := createSlotOnlyCache(4)
// Allocate slot for seq 1
slot1, _ := cache.allocSlot()
cache.slotForSeq[1] = slot1
cache.refCount[slot1] = 1
// Free the slot (simulating sequence removal)
cache.refCount[slot1]--
cache.freeSlot(slot1)
delete(cache.slotForSeq, 1)
// Verify slot is in free list
if len(cache.freeSlots) != 4 {
t.Errorf("expected 4 free slots after freeing, got %d", len(cache.freeSlots))
}
// Allocate for new seq 2 - should get recycled slot
slot2, _ := cache.allocSlot()
if slot2 != slot1 {
t.Errorf("expected recycled slot %d, got %d", slot1, slot2)
}
// This recycled slot would need zeroing in the real implementation
// The actual zeroing is tested via integration tests since it requires ML context
}
func TestHybridCache_NewSequence_GetsTrackedForZeroing(t *testing.T) {
cache := createSlotOnlyCache(4)
// Simulate the slot allocation flow from StartForward
// When a sequence doesn't have a slot, it gets allocated and tracked as "new"
newSlots := []int{}
// Seq 1 doesn't have a slot - allocate and track
seq := 1
if _, ok := cache.slotForSeq[seq]; !ok {
slot, err := cache.allocSlot()
if err != nil {
t.Fatalf("allocSlot failed: %v", err)
}
cache.slotForSeq[seq] = slot
cache.refCount[slot] = 1
newSlots = append(newSlots, slot)
}
// Verify newSlots contains the allocated slot
if len(newSlots) != 1 {
t.Errorf("expected 1 new slot, got %d", len(newSlots))
}
// Seq 1 already has a slot - should NOT be tracked as new
newSlots2 := []int{}
if _, ok := cache.slotForSeq[seq]; !ok {
slot, _ := cache.allocSlot()
cache.slotForSeq[seq] = slot
cache.refCount[slot] = 1
newSlots2 = append(newSlots2, slot)
}
// Verify no new slots for existing sequence
if len(newSlots2) != 0 {
t.Errorf("expected 0 new slots for existing sequence, got %d", len(newSlots2))
}
}

View File

@@ -1,7 +1,11 @@
package lfm2
import (
"bytes"
"cmp"
"errors"
"fmt"
"image"
"math"
"github.com/ollama/ollama/fs"
@@ -25,8 +29,20 @@ type Options struct {
// per-layer head counts (LFM2 alternates attention and recurrent layers)
numHeadsByLayer []int
numKVHeadsByLayer []int
// MoE config
numExperts int
numExpertsUsed int
normTopKProb bool
expertWeightsScale float32
expertGatingFunc uint32
}
const (
expertGatingFuncSoftmax = uint32(0)
expertGatingFuncSigmoid = uint32(2)
)
func (o Options) headDimValue() int {
// Head dim is shared across layers; fall back to first attention layer head count.
for _, h := range o.numHeadsByLayer {
@@ -67,18 +83,138 @@ type Model struct {
OutputNorm *nn.RMSNorm `gguf:"output_norm,alt:token_embd_norm"`
Output *nn.Linear `gguf:"output,alt:token_embd"`
VisionModel *VisionModel `gguf:"v"`
VisionProjector *VisionProjector `gguf:"mm"`
ImageProcessor ImageProcessor
imageTokenID int32
imageStartToken int32
imageEndToken int32
imageThumbnailID int32
imageRowColIDs map[imageGridPos]int32
useSpecialTokens bool
projectorOptions VisionProjectorOptions
Options
}
var _ model.MultimodalProcessor = (*Model)(nil)
type imageGridPos struct {
row int
col int
}
type visionEmbeddingLayout struct {
rows int
cols int
hasThumbnail bool
}
type visionChunkData struct {
tokens int
row int
col int
thumbnail bool
layout *visionEmbeddingLayout
}
func (m *Model) Validate() error {
if m.TokenEmbedding == nil {
return errors.New("lfm2: missing token_embd tensor")
}
if m.OutputNorm == nil {
return errors.New("lfm2: missing output_norm tensor")
}
if m.Output == nil {
return errors.New("lfm2: missing output tensor")
}
for i, layer := range m.Layers {
if layer.AttentionNorm == nil {
return fmt.Errorf("lfm2: missing blk.%d.attn_norm tensor", i)
}
if layer.MLPNorm == nil {
return fmt.Errorf("lfm2: missing blk.%d.ffn_norm tensor", i)
}
switch ff := layer.MLP.(type) {
case nil:
return fmt.Errorf("lfm2: missing blk.%d feed-forward tensors", i)
case *denseMLP:
if ff.Up == nil || ff.Down == nil || ff.Gate == nil {
return fmt.Errorf("lfm2: missing blk.%d dense feed-forward tensors", i)
}
case *sparseMLP:
if ff.Router == nil || ff.Gate == nil || ff.Up == nil || ff.Down == nil {
return fmt.Errorf("lfm2: missing blk.%d sparse feed-forward tensors", i)
}
default:
return fmt.Errorf("lfm2: unsupported feed-forward type at blk.%d", i)
}
switch op := layer.Operator.(type) {
case *Attention:
if op == nil || op.Query == nil || op.Key == nil || op.Value == nil || op.Output == nil || op.QueryNorm == nil || op.KeyNorm == nil {
return fmt.Errorf("lfm2: missing blk.%d attention tensors", i)
}
case *ShortConv:
if op == nil || op.Conv == nil || op.Conv.Weight == nil || op.InProj == nil || op.OutProj == nil {
return fmt.Errorf("lfm2: missing blk.%d shortconv tensors", i)
}
default:
return fmt.Errorf("lfm2: unsupported operator at blk.%d", i)
}
}
if m.VisionModel != nil {
if m.VisionModel.PatchEmbedding == nil {
return errors.New("lfm2: missing vision patch embedding tensors")
}
if m.VisionModel.PositionEmbedding == nil {
return errors.New("lfm2: missing vision position embedding tensors")
}
if m.VisionModel.PostLayerNorm == nil {
return errors.New("lfm2: missing vision post layer norm tensors")
}
if len(m.VisionModel.Layers) == 0 {
return errors.New("lfm2: missing vision encoder layers")
}
for i, layer := range m.VisionModel.Layers {
if layer.LayerNorm1 == nil || layer.LayerNorm2 == nil || layer.SelfAttention == nil || layer.MLP == nil {
return fmt.Errorf("lfm2: missing vision layer tensors at v.blk.%d", i)
}
if layer.SelfAttention.Query == nil || layer.SelfAttention.Key == nil || layer.SelfAttention.Value == nil || layer.SelfAttention.Output == nil {
return fmt.Errorf("lfm2: missing vision attention tensors at v.blk.%d", i)
}
if layer.MLP.Up == nil || layer.MLP.Down == nil {
return fmt.Errorf("lfm2: missing vision feed-forward tensors at v.blk.%d", i)
}
}
if m.VisionProjector == nil || m.VisionProjector.Linear1 == nil || m.VisionProjector.Linear2 == nil {
return errors.New("lfm2: missing multimodal projector tensors")
}
}
return nil
}
func New(c fs.Config) (model.Model, error) {
if c.String("tokenizer.ggml.model") != "gpt2" {
return nil, model.ErrUnsupportedTokenizer
}
numExperts := int(c.Uint("expert_count"))
isMoE := numExperts > 0
numExpertsUsed := int(c.Uint("expert_used_count"))
if isMoE {
if numExperts <= 0 {
return nil, fmt.Errorf("lfm2: invalid expert_count=%d", numExperts)
}
if numExpertsUsed <= 0 || numExpertsUsed > numExperts {
return nil, fmt.Errorf("lfm2: invalid expert_used_count=%d for expert_count=%d", numExpertsUsed, numExperts)
}
}
vocabulary := tokenizer.Vocabulary{
Values: c.Strings("tokenizer.ggml.tokens"),
Scores: c.Floats("tokenizer.ggml.scores"),
@@ -105,8 +241,16 @@ func New(c fs.Config) (model.Model, error) {
}
m := Model{
Tokenizer: tokenizer.NewBytePairEncoding(&vocabulary, pretokenizers...),
Layers: make([]Layer, c.Uint("block_count")),
ImageProcessor: newImageProcessor(c),
VisionModel: newVisionModel(c),
VisionProjector: &VisionProjector{},
imageRowColIDs: make(map[imageGridPos]int32),
projectorOptions: VisionProjectorOptions{
scaleFactor: int(c.Uint("vision.projector.scale_factor", 2)),
useLayerNorm: c.Bool("vision.projector.use_layernorm", false),
},
Options: Options{
hiddenSize: int(c.Uint("embedding_length")),
headDim: int(c.Uint("attention.key_length")),
@@ -116,9 +260,66 @@ func New(c fs.Config) (model.Model, error) {
ropeBase: c.Float("rope.freq_base"),
ropeScale: c.Float("rope.scaling.factor", 1),
originalContextLength: int(c.Uint("rope.scaling.original_context_length")),
numExperts: numExperts,
numExpertsUsed: numExpertsUsed,
normTopKProb: c.Bool("norm_top_k_prob", true),
expertWeightsScale: c.Float("expert_weights_scale", 1.0),
expertGatingFunc: c.Uint("expert_gating_func", expertGatingFuncSoftmax),
},
}
lookupTokenID := func(token string) int32 {
for i, t := range vocabulary.Values {
if t == token {
return int32(i)
}
}
return 0
}
resolveTokenID := func(explicitKey, token string, fallback uint32) int32 {
if explicitKey != "" {
if id := c.Uint(explicitKey); id != 0 {
return int32(id)
}
}
if tokenID := lookupTokenID(token); tokenID != 0 {
return tokenID
}
return int32(fallback)
}
m.imageTokenID = resolveTokenID("vision.image_token_id", "<image>", 396)
m.imageStartToken = resolveTokenID("vision.image_start_token_id", "<|image_start|>", 0)
m.imageEndToken = resolveTokenID("vision.image_end_token_id", "<|image_end|>", 0)
m.imageThumbnailID = resolveTokenID("vision.image_thumbnail_token_id", "<|img_thumbnail|>", 0)
m.useSpecialTokens = c.Bool("vision.use_image_special_tokens", true)
maxGridTokens := int(c.Uint("vision.max_tiles", 10))
if maxGridTokens <= 0 {
maxGridTokens = 10
}
for row := 1; row <= maxGridTokens; row++ {
for col := 1; col <= maxGridTokens; col++ {
token := fmt.Sprintf("<|img_row_%d_col_%d|>", row, col)
if tokenID := lookupTokenID(token); tokenID > 0 {
m.imageRowColIDs[imageGridPos{row: row, col: col}] = tokenID
}
}
}
if !m.useSpecialTokens {
m.imageStartToken = 0
m.imageEndToken = 0
m.imageThumbnailID = 0
m.imageRowColIDs = map[imageGridPos]int32{}
}
if c.Uint("vision.block_count") == 0 {
m.VisionModel = nil
m.VisionProjector = nil
}
type headCounts interface {
HeadCount() []uint64
HeadCountKV() []uint64
@@ -133,6 +334,14 @@ func New(c fs.Config) (model.Model, error) {
m.numHeadsByLayer = make([]int, len(m.Layers))
m.numKVHeadsByLayer = make([]int, len(m.Layers))
leadingDenseBlockCount := int(c.Uint("leading_dense_block_count"))
if leadingDenseBlockCount < 0 {
leadingDenseBlockCount = 0
}
if leadingDenseBlockCount > len(m.Layers) {
leadingDenseBlockCount = len(m.Layers)
}
for i := range m.Layers {
m.numHeadsByLayer[i] = int(headCount[i])
m.numKVHeadsByLayer[i] = int(headCountKV[i])
@@ -142,6 +351,12 @@ func New(c fs.Config) (model.Model, error) {
} else {
m.Layers[i].Operator = &Attention{}
}
if isMoE && i >= leadingDenseBlockCount {
m.Layers[i].MLP = &sparseMLP{}
} else {
m.Layers[i].MLP = &denseMLP{}
}
}
lCache := int(c.Uint("shortconv.l_cache"))
@@ -188,22 +403,77 @@ func (sa *Attention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor,
return sa.Output.Forward(ctx, attention)
}
type FeedForward interface {
Forward(ml.Context, ml.Tensor, *Options) ml.Tensor
}
type denseMLP struct {
Up *nn.Linear `gguf:"ffn_up"`
Down *nn.Linear `gguf:"ffn_down"`
Gate *nn.Linear `gguf:"ffn_gate"`
}
func (mlp *denseMLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *Options) ml.Tensor {
hiddenState = mlp.Gate.Forward(ctx, hiddenState).SILU(ctx, mlp.Up.Forward(ctx, hiddenState))
return mlp.Down.Forward(ctx, hiddenState)
}
type sparseMLP struct {
Router *nn.Linear `gguf:"ffn_gate_inp"`
Gate *nn.LinearBatch `gguf:"ffn_gate_exps"`
Up *nn.LinearBatch `gguf:"ffn_up_exps"`
Down *nn.LinearBatch `gguf:"ffn_down_exps"`
Bias ml.Tensor `gguf:"exp_probs_b.bias,alt:exp_probs_b"`
}
func (mlp *sparseMLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *Options) ml.Tensor {
// hiddenState: [hidden, tokens]
routerLogits := mlp.Router.Forward(ctx, hiddenState)
probs := routerLogits.Softmax(ctx)
if opts.expertGatingFunc == expertGatingFuncSigmoid {
probs = routerLogits.Sigmoid(ctx)
}
selectionProbs := probs
if mlp.Bias != nil {
selectionProbs = selectionProbs.Add(ctx, mlp.Bias)
}
selectedExperts := selectionProbs.TopK(ctx, opts.numExpertsUsed)
routingWeights := probs.Reshape(ctx, 1, opts.numExperts, hiddenState.Dim(1)).Rows(ctx, selectedExperts)
if opts.normTopKProb {
routingWeights = routingWeights.Reshape(ctx, opts.numExpertsUsed, hiddenState.Dim(1))
weightsSum := routingWeights.SumRows(ctx)
weightsSum = weightsSum.Clamp(ctx, 1e-6, float32(math.Inf(1)))
routingWeights = routingWeights.Div(ctx, weightsSum)
routingWeights = routingWeights.Reshape(ctx, 1, opts.numExpertsUsed, hiddenState.Dim(1))
}
if opts.expertWeightsScale != 1 {
routingWeights = routingWeights.Scale(ctx, float64(opts.expertWeightsScale))
}
// Build routing-weights branch early to enable topk-MoE fusion.
ctx.Forward(routingWeights)
hiddenState3D := hiddenState.Reshape(ctx, hiddenState.Dim(0), 1, hiddenState.Dim(1))
experts := mlp.Gate.Forward(ctx, hiddenState3D, selectedExperts).SILU(ctx, mlp.Up.Forward(ctx, hiddenState3D, selectedExperts))
experts = mlp.Down.Forward(ctx, experts, selectedExperts)
experts = experts.Mul(ctx, routingWeights)
nextState := experts.View(ctx, 0, experts.Dim(0), experts.Stride(2), experts.Dim(2))
for i := 1; i < opts.numExpertsUsed; i++ {
nextState = nextState.Add(ctx, experts.View(ctx, i*experts.Stride(1), experts.Dim(0), experts.Stride(2), experts.Dim(2)))
}
return nextState
}
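The routing performed by sparseMLP.Forward can be illustrated for a single token in plain scalar Go. This hypothetical sketch (`topKRouting` is an invented name; it covers only the softmax path, not the sigmoid gating or bias) softmaxes the router logits, keeps the top-k experts, and, as with normTopKProb, renormalizes the kept weights to sum to one.

```go
package main

import (
	"fmt"
	"math"
	"sort"
)

func topKRouting(logits []float64, k int) (experts []int, weights []float64) {
	// softmax over all experts
	probs := make([]float64, len(logits))
	var sum float64
	for i, l := range logits {
		probs[i] = math.Exp(l)
		sum += probs[i]
	}
	for i := range probs {
		probs[i] /= sum
	}
	// pick the k largest probabilities
	idx := make([]int, len(probs))
	for i := range idx {
		idx[i] = i
	}
	sort.Slice(idx, func(a, b int) bool { return probs[idx[a]] > probs[idx[b]] })
	experts = idx[:k]
	// renormalize the kept weights (the normTopKProb branch above)
	var kept float64
	for _, e := range experts {
		kept += probs[e]
	}
	for _, e := range experts {
		weights = append(weights, probs[e]/kept)
	}
	return experts, weights
}

func main() {
	experts, weights := topKRouting([]float64{0, 2, 1, 2}, 2)
	var total float64
	for _, w := range weights {
		total += w
	}
	fmt.Println(len(experts), total) // 2 experts kept, weights sum to 1
}
```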
type Layer struct {
AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
Operator Operator
MLPNorm *nn.RMSNorm `gguf:"ffn_norm"`
MLP FeedForward
}
func (l *Layer) Forward(ctx ml.Context, layer int, hiddenState, positions, outputs ml.Tensor, cache *HybridCache, opts *Options) ml.Tensor {
@@ -229,10 +499,233 @@ func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tenso
return m.applyRotaryPositionEmbeddings(ctx, key, shift), nil
}
func multimodalTokenCount(mm input.Multimodal) int {
if mm.Tensor != nil {
return mm.Tensor.Dim(1)
}
switch data := mm.Data.(type) {
case int:
return data
case int32:
return int(data)
case visionChunkData:
return data.tokens
case *visionChunkData:
if data != nil {
return data.tokens
}
}
return 0
}
func multimodalChunkInfo(mm input.Multimodal) visionChunkData {
switch data := mm.Data.(type) {
case visionChunkData:
return data
case *visionChunkData:
if data != nil {
return *data
}
}
return visionChunkData{
tokens: multimodalTokenCount(mm),
}
}
func multimodalLayout(mm []input.Multimodal) visionEmbeddingLayout {
layout := visionEmbeddingLayout{rows: 1, cols: 1}
if len(mm) == 0 {
return layout
}
first := multimodalChunkInfo(mm[0])
if first.layout != nil {
return *first.layout
}
return layout
}
func (m *Model) imageRowColToken(row, col int) int32 {
if row <= 0 || col <= 0 {
return 0
}
return m.imageRowColIDs[imageGridPos{row: row, col: col}]
}
func (m *Model) appendImageChunk(result []*input.Input, chunk input.Multimodal, imageToken int32, hash uint64) ([]*input.Input, error) {
tokenCount := multimodalTokenCount(chunk)
if tokenCount <= 0 {
return nil, errors.New("lfm2: multimodal input has no tokens")
}
result = append(result, &input.Input{
Token: imageToken,
Multimodal: []input.Multimodal{chunk},
MultimodalHash: hash,
SameBatch: tokenCount - 1,
})
for range tokenCount - 1 {
result = append(result, &input.Input{Token: imageToken})
}
return result, nil
}
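The placeholder expansion done by appendImageChunk above can be sketched without the input package (hypothetical standalone types; `inputTok` and `expandImage` are invented names): an image occupying n embedding positions becomes one carrier token, holding the multimodal payload and SameBatch = n-1, followed by n-1 plain placeholder tokens.

```go
package main

import "fmt"

type inputTok struct {
	token     int32
	payload   bool // carries the multimodal tensor
	sameBatch int  // trailing tokens that must stay in the same batch
}

func expandImage(imageToken int32, tokens int) []inputTok {
	out := []inputTok{{token: imageToken, payload: true, sameBatch: tokens - 1}}
	for i := 1; i < tokens; i++ {
		out = append(out, inputTok{token: imageToken})
	}
	return out
}

func main() {
	seq := expandImage(396, 4)
	fmt.Println(len(seq), seq[0].sameBatch, seq[1].payload) // 4 3 false
}
```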
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
if m.VisionModel == nil || m.VisionProjector == nil || len(m.VisionModel.Layers) == 0 {
return nil, model.ErrNoVisionModel
}
img, _, err := image.Decode(bytes.NewReader(multimodalData))
if err != nil {
return nil, err
}
processedImages, layout, err := m.ImageProcessor.ProcessImage(img)
if err != nil {
return nil, err
}
if m.ImageProcessor.patchSize <= 0 {
return nil, errors.New("lfm2: invalid vision patch size")
}
layoutInfo := &visionEmbeddingLayout{
rows: layout.rows,
cols: layout.cols,
hasThumbnail: layout.hasThumbnail,
}
mm := make([]input.Multimodal, 0, len(processedImages))
for i, processed := range processedImages {
patches := visionPatchGrid{
Width: processed.size.X / m.ImageProcessor.patchSize,
Height: processed.size.Y / m.ImageProcessor.patchSize,
}
if patches.Width == 0 || patches.Height == 0 {
return nil, errors.New("lfm2: invalid resized image dimensions")
}
pixelValues := ctx.Input().FromFloats(processed.data, processed.size.X, processed.size.Y, m.ImageProcessor.numChannels)
visionOutputs := m.VisionModel.Forward(ctx, pixelValues, patches)
projected := m.VisionProjector.Forward(ctx, visionOutputs, patches, m.projectorOptions)
chunk := visionChunkData{
tokens: projected.Dim(1),
row: processed.row,
col: processed.col,
thumbnail: processed.thumbnail,
}
if i == 0 {
chunk.layout = layoutInfo
}
mm = append(mm, input.Multimodal{
Tensor: projected,
Data: chunk,
})
}
return mm, nil
}
func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
var result []*input.Input
imageToken := m.imageTokenID
if imageToken == 0 {
imageToken = 396
}
useSpecialTokens := m.useSpecialTokens || m.imageStartToken > 0 || m.imageEndToken > 0 || m.imageThumbnailID > 0 || len(m.imageRowColIDs) > 0
for _, inp := range inputs {
if len(inp.Multimodal) == 0 {
result = append(result, inp)
continue
}
layout := multimodalLayout(inp.Multimodal)
if layout.rows <= 0 {
layout.rows = 1
}
if layout.cols <= 0 {
layout.cols = 1
}
tiles := layout.rows * layout.cols
multitile := tiles > 1
if useSpecialTokens && m.imageStartToken > 0 {
result = append(result, &input.Input{Token: m.imageStartToken})
}
for i, mm := range inp.Multimodal {
chunk := multimodalChunkInfo(mm)
if chunk.tokens <= 0 {
chunk.tokens = multimodalTokenCount(mm)
}
if multitile && !chunk.thumbnail && chunk.row == 0 && chunk.col == 0 && i < tiles {
chunk.row = i/layout.cols + 1
chunk.col = i%layout.cols + 1
}
if multitile && layout.hasThumbnail && i == tiles {
chunk.thumbnail = true
}
if useSpecialTokens && multitile {
if chunk.thumbnail {
if m.imageThumbnailID > 0 {
result = append(result, &input.Input{Token: m.imageThumbnailID})
}
} else if marker := m.imageRowColToken(chunk.row, chunk.col); marker > 0 {
result = append(result, &input.Input{Token: marker})
}
}
var err error
result, err = m.appendImageChunk(result, input.Multimodal{
Tensor: mm.Tensor,
Data: chunk,
}, imageToken, inp.MultimodalHash)
if err != nil {
return nil, err
}
}
if useSpecialTokens && m.imageEndToken > 0 {
result = append(result, &input.Input{Token: m.imageEndToken})
}
}
return result, nil
}
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
positions := ctx.Input().FromInts(batch.Positions, len(batch.Positions))
hiddenState := m.TokenEmbedding.Forward(ctx, batch.Inputs)
if len(batch.Multimodal) > 0 {
// We splice vision embeddings into token embeddings in-place; duplicate to
// avoid aliasing the raw embedding output graph.
hiddenState = hiddenState.Duplicate(ctx)
}
for _, mm := range batch.Multimodal {
offset := mm.Index
for _, multimodal := range mm.Multimodal {
if multimodal.Tensor == nil {
continue
}
visionOutputs := multimodal.Tensor
ctx.Forward(visionOutputs.Copy(ctx, hiddenState.View(ctx, offset*hiddenState.Stride(1), visionOutputs.Dim(0)*visionOutputs.Dim(1))))
offset += visionOutputs.Dim(1)
}
}
for i, layer := range m.Layers {
m.Cache.SetLayer(i)
@@ -251,4 +744,5 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
func init() {
model.Register("lfm2", New)
model.Register("lfm2moe", New)
}

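As a quick sanity check on the expansion above, the number of inputs PostTokenize emits for an image can be computed from the chunk token counts alone. A minimal standalone sketch (the helper name `expandedLen` is hypothetical, not part of the model code):

```go
package main

import "fmt"

// expandedLen mirrors the accounting in PostTokenize/appendImageChunk:
// an optional start token, then per chunk an optional marker token plus
// one image token per embedding, and an optional end token.
func expandedLen(chunkTokens []int, markers, startEnd bool) int {
	n := 0
	if startEnd {
		n += 2 // <|image_start|> and <|image_end|>
	}
	for _, t := range chunkTokens {
		if markers {
			n++ // row/col or thumbnail marker token
		}
		n += t // one placeholder input per embedding token
	}
	return n
}

func main() {
	// Two 3-token tiles plus a 2-token thumbnail, with special tokens:
	// matches the 13-token want sequence in the multi-tile layout test.
	fmt.Println(expandedLen([]int{3, 3, 2}, true, true))
	// A single 64-token chunk with start/end but no tile markers.
	fmt.Println(expandedLen([]int{64}, false, true))
}
```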

@@ -0,0 +1,160 @@
package lfm2
import (
"testing"
"github.com/ollama/ollama/model/input"
)
func TestPostTokenizeWithSpecialImageTokens(t *testing.T) {
m := &Model{
imageTokenID: 396,
imageStartToken: 2,
imageEndToken: 3,
useSpecialTokens: true,
}
in := []*input.Input{
{Token: 11},
{Multimodal: []input.Multimodal{{Data: 64}}, MultimodalHash: 123},
{Token: 12},
}
out, err := m.PostTokenize(in)
if err != nil {
t.Fatalf("PostTokenize returned error: %v", err)
}
if len(out) != 68 {
t.Fatalf("expected 68 tokens, got %d", len(out))
}
if out[0].Token != 11 {
t.Fatalf("out[0].Token = %d, want 11", out[0].Token)
}
if out[1].Token != 2 {
t.Fatalf("out[1].Token = %d, want 2", out[1].Token)
}
firstImage := out[2]
if firstImage.Token != 396 {
t.Fatalf("out[2].Token = %d, want 396", firstImage.Token)
}
if len(firstImage.Multimodal) != 1 {
t.Fatalf("expected multimodal payload on first image token")
}
if firstImage.MultimodalHash != 123 {
t.Fatalf("out[2].MultimodalHash = %d, want 123", firstImage.MultimodalHash)
}
if firstImage.SameBatch != 63 {
t.Fatalf("out[2].SameBatch = %d, want 63", firstImage.SameBatch)
}
for i := 3; i < 66; i++ {
if out[i].Token != 396 {
t.Fatalf("out[%d].Token = %d, want 396", i, out[i].Token)
}
if len(out[i].Multimodal) != 0 {
t.Fatalf("out[%d] should not carry multimodal payload", i)
}
}
if out[66].Token != 3 {
t.Fatalf("out[66].Token = %d, want 3", out[66].Token)
}
if out[67].Token != 12 {
t.Fatalf("out[67].Token = %d, want 12", out[67].Token)
}
}
func TestPostTokenizeWithoutSpecialImageTokens(t *testing.T) {
m := &Model{
imageTokenID: 777,
useSpecialTokens: false,
}
in := []*input.Input{
{Multimodal: []input.Multimodal{{Data: 5}}, MultimodalHash: 9},
}
out, err := m.PostTokenize(in)
if err != nil {
t.Fatalf("PostTokenize returned error: %v", err)
}
if len(out) != 5 {
t.Fatalf("expected 5 tokens, got %d", len(out))
}
if out[0].Token != 777 || out[0].SameBatch != 4 || len(out[0].Multimodal) != 1 {
t.Fatalf("unexpected first token: %+v", *out[0])
}
for i := 1; i < 5; i++ {
if out[i].Token != 777 {
t.Fatalf("out[%d].Token = %d, want 777", i, out[i].Token)
}
if len(out[i].Multimodal) != 0 {
t.Fatalf("out[%d] should not carry multimodal payload", i)
}
}
}
func TestPostTokenizeMultiTileLayoutTokens(t *testing.T) {
m := &Model{
imageTokenID: 396,
imageStartToken: 498,
imageEndToken: 499,
imageThumbnailID: 497,
imageRowColIDs: map[imageGridPos]int32{
{row: 1, col: 1}: 397,
{row: 1, col: 2}: 398,
},
useSpecialTokens: true,
}
layout := &visionEmbeddingLayout{rows: 1, cols: 2, hasThumbnail: true}
in := []*input.Input{{
Multimodal: []input.Multimodal{
{Data: visionChunkData{tokens: 3, row: 1, col: 1, layout: layout}},
{Data: visionChunkData{tokens: 3, row: 1, col: 2}},
{Data: visionChunkData{tokens: 2, thumbnail: true}},
},
MultimodalHash: 1,
}}
out, err := m.PostTokenize(in)
if err != nil {
t.Fatalf("PostTokenize returned error: %v", err)
}
got := make([]int32, len(out))
for i := range out {
got[i] = out[i].Token
}
want := []int32{
498, // <|image_start|>
397, // <|img_row_1_col_1|>
396, 396, 396,
398, // <|img_row_1_col_2|>
396, 396, 396,
497, // <|img_thumbnail|>
396, 396,
499, // <|image_end|>
}
if len(got) != len(want) {
t.Fatalf("len(out) = %d, want %d", len(got), len(want))
}
for i := range want {
if got[i] != want[i] {
t.Fatalf("out[%d].Token = %d, want %d", i, got[i], want[i])
}
}
if len(out[2].Multimodal) != 1 || len(out[6].Multimodal) != 1 || len(out[10].Multimodal) != 1 {
t.Fatalf("expected multimodal payload on first token of each chunk")
}
if out[2].SameBatch != 2 || out[6].SameBatch != 2 || out[10].SameBatch != 1 {
t.Fatalf("unexpected SameBatch values: [%d %d %d]", out[2].SameBatch, out[6].SameBatch, out[10].SameBatch)
}
}


@@ -0,0 +1,184 @@
package lfm2
import (
"math"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
)
const lfm2VisionBatchSize = 1
type visionPatchGrid struct {
Width int
Height int
}
type VisionSelfAttention struct {
Query *nn.Linear `gguf:"attn_q"`
Key *nn.Linear `gguf:"attn_k"`
Value *nn.Linear `gguf:"attn_v"`
Output *nn.Linear `gguf:"attn_output,alt:attn_out"`
}
func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
headDim := opts.hiddenSize / opts.numHeads
query := sa.Query.Forward(ctx, hiddenState)
key := sa.Key.Forward(ctx, hiddenState)
value := sa.Value.Forward(ctx, hiddenState)
query = query.Reshape(ctx, headDim, opts.numHeads, query.Dim(1), lfm2VisionBatchSize)
key = key.Reshape(ctx, headDim, opts.numHeads, key.Dim(1), lfm2VisionBatchSize)
value = value.Reshape(ctx, headDim, opts.numHeads, value.Dim(1), lfm2VisionBatchSize)
attention := nn.Attention(ctx, query, key, value, 1.0/math.Sqrt(float64(headDim)), nil)
attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), lfm2VisionBatchSize)
return sa.Output.Forward(ctx, attention)
}
type VisionMLP struct {
Up *nn.Linear `gguf:"ffn_up"`
Down *nn.Linear `gguf:"ffn_down"`
}
func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenState ml.Tensor) ml.Tensor {
return mlp.Down.Forward(ctx, mlp.Up.Forward(ctx, hiddenState).GELU(ctx))
}
type VisionEncoderLayer struct {
LayerNorm1 *nn.LayerNorm `gguf:"ln1"`
SelfAttention *VisionSelfAttention
LayerNorm2 *nn.LayerNorm `gguf:"ln2"`
MLP *VisionMLP
}
func (l *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
residual := hiddenState
hiddenState = l.LayerNorm1.Forward(ctx, hiddenState, opts.eps)
hiddenState = l.SelfAttention.Forward(ctx, hiddenState, opts)
hiddenState = hiddenState.Add(ctx, residual)
residual = hiddenState
hiddenState = l.LayerNorm2.Forward(ctx, hiddenState, opts.eps)
hiddenState = l.MLP.Forward(ctx, hiddenState)
return hiddenState.Add(ctx, residual)
}
type VisionModelOptions struct {
hiddenSize, numHeads int
imageSize, patchSize int
eps float32
}
type VisionModel struct {
PatchEmbedding *nn.Conv2D `gguf:"patch_embd"`
PositionEmbedding *nn.Embedding `gguf:"position_embd"`
PostLayerNorm *nn.LayerNorm `gguf:"post_ln"`
Layers []VisionEncoderLayer `gguf:"blk"`
*VisionModelOptions
}
func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor, patches visionPatchGrid) ml.Tensor {
numPatches := patches.Width * patches.Height
hiddenState := m.PatchEmbedding.Forward(ctx, pixelValues, m.patchSize, m.patchSize, 0, 0, 1, 1)
hiddenState = hiddenState.Reshape(ctx, numPatches, m.hiddenSize)
hiddenState = hiddenState.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
if m.PositionEmbedding != nil {
posTokens := m.PositionEmbedding.Weight.Dim(1)
source := int(math.Sqrt(float64(posTokens)))
var positionEmbeddings ml.Tensor
if source > 0 && source*source == posTokens && (source != patches.Width || source != patches.Height) {
// SigLIP2 NAFlex-style position interpolation for variable image sizes.
positionIDs := ctx.Arange(0, float32(posTokens), 1, ml.DTypeI32)
positionEmbeddings = m.PositionEmbedding.Forward(ctx, positionIDs)
positionEmbeddings = positionEmbeddings.Reshape(ctx, -1, source, source)
positionEmbeddings = positionEmbeddings.Permute(ctx, 2, 0, 1, 3).Contiguous(ctx)
positionEmbeddings = positionEmbeddings.Interpolate(ctx, [4]int{
patches.Width,
patches.Height,
hiddenState.Dim(0),
1,
}, ml.SamplingModeBilinear)
positionEmbeddings = positionEmbeddings.Permute(ctx, 1, 2, 0, 3)
positionEmbeddings = positionEmbeddings.Contiguous(ctx, -1, patches.Width*patches.Height)
} else {
positionIDs := ctx.Arange(0, float32(numPatches), 1, ml.DTypeI32)
positionEmbeddings = m.PositionEmbedding.Forward(ctx, positionIDs)
}
hiddenState = hiddenState.Add(ctx, positionEmbeddings)
}
for _, layer := range m.Layers {
hiddenState = layer.Forward(ctx, hiddenState, m.VisionModelOptions)
}
return m.PostLayerNorm.Forward(ctx, hiddenState, m.eps)
}
func newVisionModel(c fs.Config) *VisionModel {
return &VisionModel{
Layers: make([]VisionEncoderLayer, c.Uint("vision.block_count")),
VisionModelOptions: &VisionModelOptions{
hiddenSize: int(c.Uint("vision.embedding_length", 1152)),
numHeads: int(c.Uint("vision.attention.head_count", 16)),
imageSize: int(c.Uint("vision.image_size", 256)),
patchSize: int(c.Uint("vision.patch_size", 16)),
eps: c.Float("vision.attention.layer_norm_epsilon", 1e-6),
},
}
}
type VisionProjector struct {
LayerNorm *nn.LayerNorm `gguf:"layer_norm"`
Linear1 *nn.Linear `gguf:"1"`
Linear2 *nn.Linear `gguf:"2"`
}
type VisionProjectorOptions struct {
scaleFactor int
useLayerNorm bool
}
func (p *VisionProjector) Forward(ctx ml.Context, visionOutputs ml.Tensor, patches visionPatchGrid, opts VisionProjectorOptions) ml.Tensor {
hiddenSize := visionOutputs.Dim(0)
featureMap := visionOutputs
merge := max(opts.scaleFactor, 1)
if merge > 1 {
width := patches.Width
height := patches.Height
featureMap = featureMap.Reshape(ctx, hiddenSize, width, height)
// Match llama.cpp patch merger: pad spatial dims to merge factor.
padWidth := (merge - width%merge) % merge
padHeight := (merge - height%merge) % merge
if padWidth != 0 || padHeight != 0 {
featureMap = featureMap.Pad(ctx, 0, padWidth, padHeight, 0)
width += padWidth
height += padHeight
}
featureMap = featureMap.Reshape(ctx, hiddenSize*merge, width/merge, height)
featureMap = featureMap.Permute(ctx, 0, 2, 1).Contiguous(ctx, hiddenSize*merge*merge, height/merge, width/merge)
featureMap = featureMap.Permute(ctx, 0, 2, 1).Contiguous(ctx)
featureMap = featureMap.Contiguous(ctx, featureMap.Dim(0), featureMap.Dim(1)*featureMap.Dim(2))
}
if opts.useLayerNorm && p.LayerNorm != nil {
featureMap = p.LayerNorm.Forward(ctx, featureMap, 1e-5)
}
featureMap = p.Linear1.Forward(ctx, featureMap).GELU(ctx)
return p.Linear2.Forward(ctx, featureMap)
}

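VisionProjector's patch merger reduces token count by roughly scaleFactor² after padding each spatial dimension up to a multiple of the merge factor. A small sketch of that arithmetic (`mergedTokens` is a hypothetical helper, not in the projector code):

```go
package main

import "fmt"

// mergedTokens computes how many tokens survive the patch merger:
// pad width and height up to multiples of merge, then divide each by merge.
func mergedTokens(width, height, merge int) int {
	if merge <= 1 {
		return width * height
	}
	padW := (merge - width%merge) % merge
	padH := (merge - height%merge) % merge
	return (width + padW) / merge * ((height + padH) / merge)
}

func main() {
	// A 20x20 patch grid with a 2x merge keeps 100 tokens.
	fmt.Println(mergedTokens(20, 20, 2))
	// Odd grids are padded first: 9x7 -> 10x8 -> 5x4 = 20 tokens.
	fmt.Println(mergedTokens(9, 7, 2))
}
```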

@@ -0,0 +1,260 @@
package lfm2
import (
"image"
stdimage "image/draw"
"math"
"slices"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/model/imageproc"
)
type ImageProcessor struct {
imageSize, patchSize, numChannels int
downsampleFactor int
imageMean, imageStd [3]float32
doImageSplitting bool
minTiles int
maxTiles int
useThumbnail bool
tileSize int
minImageTokens int
maxImageTokens int
maxPixelsTolerance float64
}
type processedVisionImage struct {
data []float32
size image.Point
row int
col int
thumbnail bool
}
type processedVisionLayout struct {
rows int
cols int
hasThumbnail bool
}
func newImageProcessor(c fs.Config) ImageProcessor {
mean := c.Floats("vision.image_mean")
std := c.Floats("vision.image_std")
processor := ImageProcessor{
imageSize: int(c.Uint("vision.image_size", 256)),
patchSize: int(c.Uint("vision.patch_size", 16)),
numChannels: int(c.Uint("vision.num_channels", 3)),
downsampleFactor: int(c.Uint("vision.projector.scale_factor", 2)),
imageMean: [3]float32{0.5, 0.5, 0.5},
imageStd: [3]float32{0.5, 0.5, 0.5},
doImageSplitting: c.Bool("vision.do_image_splitting", true),
minTiles: int(c.Uint("vision.min_tiles", 2)),
maxTiles: int(c.Uint("vision.max_tiles", 10)),
useThumbnail: c.Bool("vision.use_thumbnail", true),
tileSize: int(c.Uint("vision.tile_size", 512)),
minImageTokens: int(c.Uint("vision.min_image_tokens", 64)),
maxImageTokens: int(c.Uint("vision.max_image_tokens", 256)),
maxPixelsTolerance: float64(c.Float("vision.max_pixels_tolerance", 2.0)),
}
if len(mean) >= 3 {
processor.imageMean = [3]float32{mean[0], mean[1], mean[2]}
}
if len(std) >= 3 {
processor.imageStd = [3]float32{std[0], std[1], std[2]}
}
// Keep defaults aligned with HF unless explicitly configured.
if processor.downsampleFactor <= 0 {
processor.downsampleFactor = 2
}
if processor.patchSize <= 0 {
processor.patchSize = 16
}
if processor.tileSize <= 0 {
processor.tileSize = 512
}
if processor.minTiles <= 0 {
processor.minTiles = 2
}
if processor.maxTiles < processor.minTiles {
processor.maxTiles = processor.minTiles
}
if processor.minImageTokens <= 0 {
processor.minImageTokens = 64
}
if processor.maxImageTokens < processor.minImageTokens {
processor.maxImageTokens = processor.minImageTokens
}
if processor.maxPixelsTolerance <= 0 {
processor.maxPixelsTolerance = 2.0
}
return processor
}
func (p ImageProcessor) ProcessImage(img image.Image) ([]processedVisionImage, processedVisionLayout, error) {
img = imageproc.Composite(img)
orig := img.Bounds().Size()
resizedWidth, resizedHeight := p.smartResize(orig.Y, orig.X)
layout := processedVisionLayout{rows: 1, cols: 1}
if p.shouldSplit(orig.Y, orig.X) {
gridWidth, gridHeight, targetWidth, targetHeight := p.gridLayout(orig.Y, orig.X)
layout.rows = gridHeight
layout.cols = gridWidth
layout.hasThumbnail = p.useThumbnail && gridWidth*gridHeight != 1
resized := imageproc.Resize(img, image.Point{X: targetWidth, Y: targetHeight}, imageproc.ResizeBilinear)
images := make([]processedVisionImage, 0, gridWidth*gridHeight+1)
for row := range gridHeight {
for col := range gridWidth {
rect := image.Rect(
col*p.tileSize,
row*p.tileSize,
(col+1)*p.tileSize,
(row+1)*p.tileSize,
)
tile := cropImage(resized, rect)
images = append(images, processedVisionImage{
data: imageproc.Normalize(tile, p.imageMean, p.imageStd, true, true),
size: tile.Bounds().Size(),
row: row + 1,
col: col + 1,
})
}
}
if layout.hasThumbnail {
thumbnail := imageproc.Resize(img, image.Point{X: resizedWidth, Y: resizedHeight}, imageproc.ResizeBilinear)
images = append(images, processedVisionImage{
data: imageproc.Normalize(thumbnail, p.imageMean, p.imageStd, true, true),
size: thumbnail.Bounds().Size(),
thumbnail: true,
})
}
return images, layout, nil
}
single := imageproc.Resize(img, image.Point{X: resizedWidth, Y: resizedHeight}, imageproc.ResizeBilinear)
return []processedVisionImage{{
data: imageproc.Normalize(single, p.imageMean, p.imageStd, true, true),
size: single.Bounds().Size(),
}}, layout, nil
}
func (p ImageProcessor) shouldSplit(height, width int) bool {
if !p.doImageSplitting || (p.minTiles == 1 && p.maxTiles == 1) {
return false
}
totalFactor := p.patchSize * p.downsampleFactor
hBar := max(p.patchSize, roundByFactor(height, totalFactor))
wBar := max(p.patchSize, roundByFactor(width, totalFactor))
limit := float64(p.maxImageTokens * p.patchSize * p.patchSize * p.downsampleFactor * p.downsampleFactor)
limit *= p.maxPixelsTolerance
return float64(hBar*wBar) > limit
}
func (p ImageProcessor) smartResize(height, width int) (int, int) {
totalFactor := p.patchSize * p.downsampleFactor
minPixels := p.minImageTokens * p.patchSize * p.patchSize * p.downsampleFactor * p.downsampleFactor
maxPixels := p.maxImageTokens * p.patchSize * p.patchSize * p.downsampleFactor * p.downsampleFactor
hBar := max(totalFactor, roundByFactor(height, totalFactor))
wBar := max(totalFactor, roundByFactor(width, totalFactor))
if hBar*wBar > maxPixels {
beta := math.Sqrt(float64(height*width) / float64(maxPixels))
hBar = max(totalFactor, int(math.Floor(float64(height)/beta/float64(totalFactor)))*totalFactor)
wBar = max(totalFactor, int(math.Floor(float64(width)/beta/float64(totalFactor)))*totalFactor)
} else if hBar*wBar < minPixels {
beta := math.Sqrt(float64(minPixels) / float64(height*width))
hBar = int(math.Ceil(float64(height)*beta/float64(totalFactor))) * totalFactor
wBar = int(math.Ceil(float64(width)*beta/float64(totalFactor))) * totalFactor
}
return wBar, hBar
}
func (p ImageProcessor) gridLayout(height, width int) (gridWidth, gridHeight, targetWidth, targetHeight int) {
aspectRatio := float64(width) / float64(height)
targetRatios := p.targetRatios()
bestRatio := clipImageSize{width: 1, height: 1}
bestRatioDiff := math.MaxFloat64
area := float64(width * height)
for _, ratio := range targetRatios {
targetAspect := float64(ratio.width) / float64(ratio.height)
ratioDiff := math.Abs(aspectRatio - targetAspect)
if ratioDiff < bestRatioDiff {
bestRatioDiff = ratioDiff
bestRatio = ratio
continue
}
if ratioDiff == bestRatioDiff {
targetArea := float64(p.tileSize * p.tileSize * ratio.width * ratio.height)
if area > 0.5*targetArea {
bestRatio = ratio
}
}
}
return bestRatio.width, bestRatio.height, p.tileSize * bestRatio.width, p.tileSize * bestRatio.height
}
type clipImageSize struct {
width int
height int
}
func (p ImageProcessor) targetRatios() []clipImageSize {
targetRatios := make([]clipImageSize, 0, p.maxTiles*p.maxTiles)
for n := p.minTiles; n <= p.maxTiles; n++ {
for w := 1; w <= n; w++ {
for h := 1; h <= n; h++ {
if w*h < p.minTiles || w*h > p.maxTiles {
continue
}
targetRatios = append(targetRatios, clipImageSize{width: w, height: h})
}
}
}
unique := targetRatios[:0]
for _, ratio := range targetRatios {
if slices.Contains(unique, ratio) {
continue
}
unique = append(unique, ratio)
}
slices.SortFunc(unique, func(a, b clipImageSize) int {
return a.width*a.height - b.width*b.height
})
return unique
}
func roundByFactor(number, factor int) int {
if factor <= 0 {
return number
}
return int(math.RoundToEven(float64(number)/float64(factor))) * factor
}
func cropImage(img image.Image, rect image.Rectangle) image.Image {
dst := image.NewRGBA(image.Rect(0, 0, rect.Dx(), rect.Dy()))
stdimage.Draw(dst, dst.Bounds(), img, rect.Min, stdimage.Src)
return dst
}

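roundByFactor uses math.RoundToEven, so exact .5 multiples round toward the even quotient; smartResize then clamps the rounded area into the [minPixels, maxPixels] token budget. A quick illustration of the rounding behavior (factor values chosen for illustration):

```go
package main

import (
	"fmt"
	"math"
)

// roundByFactor rounds number to the nearest multiple of factor,
// breaking .5 ties toward the even quotient (banker's rounding).
func roundByFactor(number, factor int) int {
	if factor <= 0 {
		return number
	}
	return int(math.RoundToEven(float64(number)/float64(factor))) * factor
}

func main() {
	fmt.Println(roundByFactor(100, 32)) // 3.125 -> 3  -> 96
	fmt.Println(roundByFactor(112, 32)) // 3.5 ties to even 4 -> 128
	fmt.Println(roundByFactor(80, 32))  // 2.5 ties to even 2 -> 64
}
```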

@@ -0,0 +1,105 @@
package lfm2
import (
"image"
"image/color"
"testing"
)
func TestProcessImageSingleTile(t *testing.T) {
p := ImageProcessor{
patchSize: 16,
downsampleFactor: 2,
numChannels: 3,
imageMean: [3]float32{0.5, 0.5, 0.5},
imageStd: [3]float32{0.5, 0.5, 0.5},
doImageSplitting: true,
minTiles: 2,
maxTiles: 10,
useThumbnail: true,
tileSize: 512,
minImageTokens: 64,
maxImageTokens: 256,
maxPixelsTolerance: 2.0,
}
img := image.NewRGBA(image.Rect(0, 0, 320, 320))
out, layout, err := p.ProcessImage(img)
if err != nil {
t.Fatalf("ProcessImage returned error: %v", err)
}
if layout.rows != 1 || layout.cols != 1 || layout.hasThumbnail {
t.Fatalf("layout = %+v, want rows=1 cols=1 hasThumbnail=false", layout)
}
if len(out) != 1 {
t.Fatalf("len(out) = %d, want 1", len(out))
}
if out[0].size != (image.Point{X: 320, Y: 320}) {
t.Fatalf("single image size = %+v, want 320x320", out[0].size)
}
if out[0].thumbnail {
t.Fatalf("single image should not be marked as thumbnail")
}
}
func TestProcessImageDynamicTiling(t *testing.T) {
p := ImageProcessor{
patchSize: 16,
downsampleFactor: 2,
numChannels: 3,
imageMean: [3]float32{0.5, 0.5, 0.5},
imageStd: [3]float32{0.5, 0.5, 0.5},
doImageSplitting: true,
minTiles: 2,
maxTiles: 10,
useThumbnail: true,
tileSize: 512,
minImageTokens: 64,
maxImageTokens: 256,
maxPixelsTolerance: 2.0,
}
// Wide image that should trigger multi-tile splitting.
img := image.NewRGBA(image.Rect(0, 0, 3000, 1000))
fill := color.RGBA{R: 120, G: 90, B: 60, A: 255}
for y := range 1000 {
for x := range 3000 {
img.Set(x, y, fill)
}
}
out, layout, err := p.ProcessImage(img)
if err != nil {
t.Fatalf("ProcessImage returned error: %v", err)
}
if layout.rows*layout.cols <= 1 {
t.Fatalf("expected multi-tile layout, got %+v", layout)
}
if !layout.hasThumbnail {
t.Fatalf("expected thumbnail for multi-tile layout")
}
wantLen := layout.rows*layout.cols + 1
if len(out) != wantLen {
t.Fatalf("len(out) = %d, want %d", len(out), wantLen)
}
for i := range layout.rows * layout.cols {
if out[i].size != (image.Point{X: 512, Y: 512}) {
t.Fatalf("tile[%d] size = %+v, want 512x512", i, out[i].size)
}
if out[i].thumbnail {
t.Fatalf("tile[%d] should not be marked as thumbnail", i)
}
}
thumb := out[len(out)-1]
if !thumb.thumbnail {
t.Fatalf("last chunk should be thumbnail")
}
if thumb.size.X%32 != 0 || thumb.size.Y%32 != 0 {
t.Fatalf("thumbnail size = %+v, want dimensions aligned to 32", thumb.size)
}
}

View File

@@ -15,6 +15,7 @@ import (
_ "github.com/ollama/ollama/model/models/llama4"
_ "github.com/ollama/ollama/model/models/mistral3"
_ "github.com/ollama/ollama/model/models/mllama"
_ "github.com/ollama/ollama/model/models/nemotronh"
_ "github.com/ollama/ollama/model/models/nomicbert"
_ "github.com/ollama/ollama/model/models/olmo3"
_ "github.com/ollama/ollama/model/models/qwen2"


@@ -0,0 +1,88 @@
package nemotronh
import (
"fmt"
"math"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
)
// Attention implements simple attention without RoPE for Nemotron-H.
// Unlike Qwen3Next, Nemotron-H attention has:
// - No RoPE (position info comes from Mamba2 layers)
// - Standard scaled dot-product attention
type Attention struct {
Query *nn.Linear `gguf:"attn_q"`
Key *nn.Linear `gguf:"attn_k"`
Value *nn.Linear `gguf:"attn_v"`
Output *nn.Linear `gguf:"attn_output"`
}
func (a *Attention) Forward(ctx ml.Context, hiddenStates ml.Tensor, cache *HybridCache, opts *Options) (ml.Tensor, error) {
hiddenDim := hiddenStates.Dim(0)
nSeqTokens := hiddenStates.Dim(1)
switch hiddenStates.Dim(2) {
case 0:
hiddenStates = hiddenStates.Reshape(ctx, hiddenDim, nSeqTokens, 1)
case 1:
default:
return nil, ErrUnsupportedBatchLayout
}
// Nemotron-H is currently clamped to num_parallel=1.
if cache != nil && cache.IsSupportedForBatch() {
if cache.numSeqs() != 1 {
return nil, ErrUnsupportedBatchLayout
}
if seqTokens := cache.seqTokens(); seqTokens > 0 && nSeqTokens != seqTokens {
return nil, ErrUnsupportedBatchLayout
}
}
batchSize := nSeqTokens
hiddenStates = hiddenStates.Reshape(ctx, hiddenDim, batchSize)
headDim := opts.getHeadDim()
if headDim <= 0 {
return nil, fmt.Errorf("nemotronh: invalid attention head dimension %d", headDim)
}
// Q projection
query := a.Query.Forward(ctx, hiddenStates)
if query.Dim(0)%headDim != 0 {
return nil, fmt.Errorf("nemotronh: query dim %d not divisible by head dim %d", query.Dim(0), headDim)
}
numHeads := query.Dim(0) / headDim
query = query.Reshape(ctx, headDim, numHeads, batchSize)
// K projection
key := a.Key.Forward(ctx, hiddenStates)
if key.Dim(0)%headDim != 0 {
return nil, fmt.Errorf("nemotronh: key dim %d not divisible by head dim %d", key.Dim(0), headDim)
}
numKVHeads := key.Dim(0) / headDim
key = key.Reshape(ctx, headDim, numKVHeads, batchSize)
// V projection
value := a.Value.Forward(ctx, hiddenStates)
if value.Dim(0)%headDim != 0 {
return nil, fmt.Errorf("nemotronh: value dim %d not divisible by head dim %d", value.Dim(0), headDim)
}
if value.Dim(0)/headDim != numKVHeads {
return nil, fmt.Errorf("nemotronh: key heads %d and value heads %d do not match", numKVHeads, value.Dim(0)/headDim)
}
value = value.Reshape(ctx, headDim, numKVHeads, batchSize)
// Standard attention computation (no RoPE)
scale := opts.attentionScale
if scale == 0 {
scale = 1.0 / math.Sqrt(float64(headDim))
}
attention := nn.Attention(ctx, query, key, value, scale, cache)
// Flatten heads
attention = attention.Reshape(ctx, headDim*numHeads, batchSize)
// Output projection
return a.Output.Forward(ctx, attention), nil
}

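The attention layer derives head counts from projection widths rather than from config fields, which is what lets grouped-query attention fall out naturally. A tiny sketch of that derivation (dimensions are illustrative, and `headCounts` is a hypothetical helper):

```go
package main

import "fmt"

// headCounts recovers the number of query and KV heads from the
// projection output widths, as Attention.Forward does, returning
// (-1, -1) when a width is not divisible by headDim.
func headCounts(qDim, kDim, headDim int) (numHeads, numKVHeads int) {
	if headDim <= 0 || qDim%headDim != 0 || kDim%headDim != 0 {
		return -1, -1
	}
	return qDim / headDim, kDim / headDim
}

func main() {
	// Illustrative GQA shape: 32 query heads sharing 8 KV heads at headDim 128.
	q, kv := headCounts(32*128, 8*128, 128)
	fmt.Println(q, kv) // 32 8
}
```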

@@ -0,0 +1,55 @@
package nemotronh
import (
"errors"
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
)
// ErrUnsupportedBatchLayout is returned when the batch layout is incompatible
// with the layer requirements.
var ErrUnsupportedBatchLayout = errors.New("nemotronh: unsupported batch layout")
var (
_ kvcache.Cache = (*HybridCache)(nil)
_ kvcache.CheckpointCache = (*HybridCache)(nil)
)
// HybridCache adapts the shared recurrent cache base for Nemotron-H naming.
type HybridCache struct {
*kvcache.Recurrent
}
func NewHybridCache(convDim, convChannels, ssmStateSize int) *HybridCache {
base := kvcache.NewRecurrentCache(kvcache.RecurrentConfig{
Shift: Shift,
ConvDim: convDim,
ConvChannels: convChannels,
RecurrentStateSize: ssmStateSize,
CheckpointLogPrefix: "nemotronh",
})
return &HybridCache{Recurrent: base}
}
// SSMState returns the SSM state for current batch sequences.
func (c *HybridCache) SSMState(ctx ml.Context, layer int, dState, headDim, nHead int) (ml.Tensor, error) {
return c.RecurrentState4D(ctx, layer, dState, headDim, nHead)
}
// UpdateSSMState writes a new SSM state for current batch sequences.
func (c *HybridCache) UpdateSSMState(ctx ml.Context, layer int, newState ml.Tensor) {
c.UpdateRecurrentState(ctx, layer, newState)
}
func (c *HybridCache) slotsTensor() ml.Tensor {
return c.SlotsTensor()
}
func (c *HybridCache) seqTokens() int {
return c.SeqTokens()
}
func (c *HybridCache) numSeqs() int {
return c.NumSeqs()
}


@@ -0,0 +1,197 @@
package nemotronh
import (
"log/slog"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
)
// convKernel wraps the 1D convolution kernel tensor
type convKernel struct {
Weight ml.Tensor `gguf:"weight"`
}
// Mamba2 implements the Mamba2 SSM layer for Nemotron-H.
// The forward pass follows llama.cpp's build_mamba2_layer:
// 1. Input projection: zxBCdt = SSMIn @ hidden
// 2. Split: z, xBC, dt
// 3. Concat with conv state, apply SSMConv, save new conv state
// 4. Apply SiLU to convolved xBC
// 5. Split: x, B, C
// 6. Add dt bias
// 7. SSMScan: y = SSMScan(state, x, dt, A, B, C, ids)
// 8. D skip: y = y + x * D
// 9. Swiglu with z: y = z * silu(y)
// 10. Group RMSNorm
// 11. Output projection
type Mamba2 struct {
SSMIn *nn.Linear `gguf:"ssm_in"` // n_embd → d_in_proj (2*d_inner + 2*n_group*d_state + n_head)
SSMConv1D *convKernel `gguf:"ssm_conv1d"` // conv kernel
SSMConv1DB ml.Tensor `gguf:"ssm_conv1d.bias"`
SSMDtB ml.Tensor `gguf:"ssm_dt.bias"` // dt bias [n_head]
SSMA ml.Tensor `gguf:"ssm_a"` // A parameter [1, n_head]
SSMD ml.Tensor `gguf:"ssm_d"` // D skip connection [1, n_head]
SSMNorm *nn.RMSNorm `gguf:"ssm_norm"` // group norm
SSMOut *nn.Linear `gguf:"ssm_out"` // output projection
Layer int
}
func (m *Mamba2) Forward(ctx ml.Context, hiddenStates ml.Tensor, cache *HybridCache, opts *Options) (ml.Tensor, error) {
layer := m.Layer
hiddenDim := hiddenStates.Dim(0)
nSeqTokens := hiddenStates.Dim(1)
switch hiddenStates.Dim(2) {
case 0:
hiddenStates = hiddenStates.Reshape(ctx, hiddenDim, nSeqTokens, 1)
case 1:
default:
return nil, ErrUnsupportedBatchLayout
}
// Nemotron-H is currently clamped to num_parallel=1.
if cache != nil && cache.IsSupportedForBatch() {
if cache.numSeqs() != 1 {
return nil, ErrUnsupportedBatchLayout
}
if seqTokens := cache.seqTokens(); seqTokens > 0 && nSeqTokens != seqTokens {
return nil, ErrUnsupportedBatchLayout
}
}
nSeqs := 1
dConv := opts.ssmDConv
dInner := opts.ssmDInner
dState := opts.ssmDState
nHead := opts.ssmNHead
headDim := dInner / nHead
nGroup := opts.ssmNGroup
// {n_embd, n_seq_tokens, n_seqs} => {d_in_proj, n_seq_tokens, n_seqs}
// d_in_proj = 2*d_inner + 2*n_group*d_state + n_head
zxBCdt := m.SSMIn.Forward(ctx, hiddenStates)
// Split into z, xBC, dt
// z: [head_dim, n_head, n_seq_tokens, n_seqs]
z := zxBCdt.Slice(ctx, 0, 0, dInner, 1)
z = z.Reshape(ctx, headDim, nHead, nSeqTokens, nSeqs)
// xBC: [d_inner + 2*n_group*d_state, n_seq_tokens, n_seqs]
xBCSize := dInner + 2*nGroup*dState
xBC := zxBCdt.Slice(ctx, 0, dInner, dInner+xBCSize, 1)
if nSeqTokens == 1 {
xBC = xBC.Reshape(ctx, xBCSize, 1, nSeqs)
}
// dt: [n_head, n_seq_tokens, n_seqs]
dt := zxBCdt.Slice(ctx, 0, 2*dInner+2*nGroup*dState, 2*dInner+2*nGroup*dState+nHead, 1)
if nSeqTokens == 1 {
dt = dt.Reshape(ctx, nHead, 1, nSeqs)
} else {
dt = dt.Contiguous(ctx, nHead, nSeqTokens, nSeqs)
}
// Get conv state from cache
convStates, err := cache.ConvState(ctx, layer)
if err != nil {
slog.Warn("nemotronh: failed to get conv state, using zeros", "layer", layer, "error", err)
convStates = ctx.Input().Zeros(ml.DTypeF32, dConv-1, xBCSize, nSeqs)
}
// Reshape conv states: [d_conv-1, xBCSize, n_seqs]
convStates = convStates.Reshape(ctx, dConv-1, xBCSize, nSeqs)
// For decode (n_seq_tokens == 1), reshape avoids a transpose/contiguous pair.
var xBCT ml.Tensor
if nSeqTokens == 1 {
xBCT = xBC.Reshape(ctx, 1, xBCSize, nSeqs)
} else {
// Prefill path: [xBCSize, n_seq_tokens, n_seqs] -> [n_seq_tokens, xBCSize, n_seqs]
xBCT = xBC.Permute(ctx, 1, 0, 2, 3)
}
// Concatenate with conv state: [d_conv-1 + n_seq_tokens, xBCSize, n_seqs]
convInput := convStates.Concat(ctx, xBCT, 0)
// Save new conv state (last d_conv-1 columns)
lastConvStates := convInput.Slice(ctx, 0, nSeqTokens, nSeqTokens+dConv-1, 1)
cache.UpdateConvState(ctx, layer, lastConvStates)
// Apply SSM convolution
xBC = convInput.SSMConv(ctx, m.SSMConv1D.Weight)
// Add conv bias
if m.SSMConv1DB != nil {
xBC = xBC.Add(ctx, m.SSMConv1DB)
}
// Apply SiLU
xBC = xBC.SILU(ctx)
// Split xBC into x, B, C
// x: [head_dim, n_head, n_seq_tokens, n_seqs]
x := xBC.Slice(ctx, 0, 0, dInner, 1)
x = x.Reshape(ctx, headDim, nHead, nSeqTokens, nSeqs)
// B: [d_state, n_group, n_seq_tokens, n_seqs]
B := xBC.Slice(ctx, 0, dInner, dInner+nGroup*dState, 1)
B = B.Reshape(ctx, dState, nGroup, nSeqTokens, nSeqs)
// C: [d_state, n_group, n_seq_tokens, n_seqs]
C := xBC.Slice(ctx, 0, dInner+nGroup*dState, dInner+2*nGroup*dState, 1)
C = C.Reshape(ctx, dState, nGroup, nSeqTokens, nSeqs)
// Add dt bias
dt = dt.Add(ctx, m.SSMDtB)
// Get SSM state from cache
state, err := cache.SSMState(ctx, layer, dState, headDim, nHead)
if err != nil {
slog.Warn("nemotronh: failed to get SSM state, using zeros", "layer", layer, "error", err)
state = ctx.Input().Zeros(ml.DTypeF32, dState, headDim, nHead, nSeqs)
}
// SSMScan
// state: [d_state, head_dim, n_head, n_seqs]
// returns: [head_dim, n_head, n_seq_tokens, n_seqs] concatenated with new state
ySsm := state.SSMScan(ctx, x, dt, m.SSMA, B, C, cache.slotsTensor())
// ySsm is a packed 1D buffer: [y (nSeqTokens*headDim*nHead*nSeqs), newState]
yElems := headDim * nHead * nSeqTokens * nSeqs
y := ySsm.View(ctx, 0, yElems).Reshape(ctx, headDim, nHead, nSeqTokens, nSeqs)
stateOffsetBytes := yElems * x.Stride(0)
stateElems := dState * headDim * nHead * nSeqs
newState := ySsm.View(ctx, stateOffsetBytes, stateElems)
newState = newState.Reshape(ctx, dState, headDim, nHead, nSeqs)
// Update SSM state in cache
cache.UpdateSSMState(ctx, layer, newState)
// D skip connection: y = y + x * D
if m.SSMD != nil {
// SSMD shape: [1, n_head] -> broadcast to [head_dim, n_head, n_seq_tokens, n_seqs]
xD := x.Mul(ctx, m.SSMD)
y = y.Add(ctx, xD)
}
// Swiglu with z: y = z * silu(y)
y = z.SILU(ctx, y)
// Group RMSNorm
if m.SSMNorm != nil {
// Reshape for group norm: [d_inner/n_group, n_group, n_seq_tokens, n_seqs]
innerPerGroup := dInner / nGroup
y = y.Reshape(ctx, innerPerGroup, nGroup, nSeqTokens, nSeqs)
y = m.SSMNorm.Forward(ctx, y, opts.eps)
}
// Reshape back to [d_inner, n_seq_tokens, n_seqs]
y = y.Reshape(ctx, dInner, nSeqTokens, nSeqs)
// Output projection
out := m.SSMOut.Forward(ctx, y)
// Reshape to 2D for consistency with attention output
return out.Reshape(ctx, out.Dim(0), nSeqTokens*nSeqs), nil
}

View File

@@ -0,0 +1,417 @@
package nemotronh
import (
"fmt"
"math"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/model"
"github.com/ollama/ollama/model/input"
"github.com/ollama/ollama/tokenizer"
)
// Options contains model configuration
type Options struct {
hiddenSize int
numHeads int // attention heads
numKVHeads int // KV heads for attention layers
headDim int
eps float32
// Mamba2 SSM config
ssmDConv int // conv kernel size
ssmDInner int // inner dimension (d_inner)
ssmDState int // state dimension
ssmNHead int // number of SSM heads (dt_rank)
ssmNGroup int // number of groups for B, C
// Per-layer configuration
isRecurrent []bool // true = Mamba2, false = attention or FFN
nFF []int // n_ff per layer (0 = attention-only)
// Attention scale
attentionScale float64
// MoE config
numExperts int
numExpertsUsed int
expertWeightsNorm bool
expertWeightsScale float32
expertWeightsNormClip float32
}
func (o Options) getHeadDim() int {
if o.headDim > 0 {
return o.headDim
}
if o.numHeads <= 0 {
return 0
}
return o.hiddenSize / o.numHeads
}
// Operator is the interface for layer operators (Mamba2 or Attention)
type Operator interface {
Forward(ctx ml.Context, hiddenStates ml.Tensor, cache *HybridCache, opts *Options) (ml.Tensor, error)
}
// MLP is the interface for feedforward networks
type MLP interface {
Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *Options) ml.Tensor
}
// Layer represents a single transformer layer
type Layer struct {
AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
Operator Operator // Mamba2, Attention, or nil (for FFN-only layers)
MLP MLP // Dense or MoE FFN, or nil
}
func (l *Layer) Forward(ctx ml.Context, layer int, hiddenStates, outputs ml.Tensor, cache *HybridCache, opts *Options) (ml.Tensor, error) {
residual := hiddenStates
// Pre-layer norm
hiddenStates = l.AttentionNorm.Forward(ctx, hiddenStates, opts.eps)
// Layer operator (Mamba2, Attention, or FFN)
if l.Operator != nil {
var err error
hiddenStates, err = l.Operator.Forward(ctx, hiddenStates, cache, opts)
if err != nil {
return nil, err
}
} else if l.MLP != nil {
// FFN-only layer
hiddenStates = l.MLP.Forward(ctx, hiddenStates, opts)
}
// Output projection for last layer
if outputs != nil {
hiddenStates = hiddenStates.Rows(ctx, outputs)
residual = residual.Rows(ctx, outputs)
}
// Residual connection
return hiddenStates.Add(ctx, residual), nil
}
// Model is the main Nemotron-H model
type Model struct {
model.Base
tokenizer.Tokenizer
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
OutputNorm *nn.RMSNorm `gguf:"output_norm"`
Output *nn.Linear `gguf:"output,alt:token_embd"`
Layers []Layer `gguf:"blk"`
*Options
}
// Shift is used for KV cache position shifting.
// Nemotron-H attention does not apply RoPE, so keys do not need to be transformed.
func Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
return key, nil
}
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
hiddenStates := m.TokenEmbedding.Forward(ctx, batch.Inputs)
cache := m.Cache.(*HybridCache)
for i, layer := range m.Layers {
cache.SetLayer(i)
var outputs ml.Tensor
if i == len(m.Layers)-1 {
outputs = batch.Outputs
}
var err error
hiddenStates, err = layer.Forward(ctx, i, hiddenStates, outputs, cache, m.Options)
if err != nil {
return nil, err
}
}
hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, m.eps)
return m.Output.Forward(ctx, hiddenStates), nil
}
func New(c fs.Config) (model.Model, error) {
numLayers := int(c.Uint("block_count"))
layers := make([]Layer, numLayers)
// Get per-layer configuration from GGUF metadata
// Use the same interface pattern as qwen3next
type perLayerConfig interface {
HeadCount() []uint64
HeadCountKV() []uint64
FFNLength() []uint64
}
var headCount []uint64
var headCountKV []uint64
var ffnLength []uint64
if plc, ok := c.(perLayerConfig); ok {
headCount = plc.HeadCount()
headCountKV = plc.HeadCountKV()
ffnLength = plc.FFNLength()
}
// Build per-layer arrays with defaults
isRecurrent := make([]bool, numLayers)
nFF := make([]int, numLayers)
for i := range numLayers {
// Get per-layer values
kvHeads := uint64(1) // Default non-zero
if i < len(headCountKV) {
kvHeads = headCountKV[i]
}
ff := uint64(0)
if i < len(ffnLength) {
ff = ffnLength[i]
}
nFF[i] = int(ff)
// A layer is recurrent IFF n_head_kv == 0 AND n_ff == 0
// This matches llama.cpp behavior for Nemotron-H
isRecurrent[i] = kvHeads == 0 && ff == 0
}
// Determine if MoE
isMoE := c.Uint("expert_count") > 0
for i := range layers {
if isRecurrent[i] {
// Mamba2 layer
layers[i].Operator = &Mamba2{Layer: i}
} else if nFF[i] == 0 {
// Attention-only layer (n_head_kv > 0, n_ff == 0)
layers[i].Operator = &Attention{}
} else {
// FFN layer (n_ff > 0)
if isMoE {
layers[i].MLP = &MoESparse{}
} else {
layers[i].MLP = &Dense{}
}
}
}
// Get attention head configuration
numHeads := int(c.Uint("attention.head_count"))
if numHeads == 0 {
for i := range numLayers {
if i < len(headCount) && i < len(headCountKV) && headCount[i] > 0 && headCountKV[i] > 0 {
numHeads = int(headCount[i])
break
}
}
}
numKVHeads := int(c.Uint("attention.head_count_kv"))
if numKVHeads == 0 {
for i := range numLayers {
if i < len(headCountKV) && i < len(ffnLength) && headCountKV[i] > 0 && ffnLength[i] == 0 {
numKVHeads = int(headCountKV[i])
break
}
}
if numKVHeads == 0 {
numKVHeads = numHeads
}
}
headDim := int(c.Uint("attention.head_dim"))
if headDim == 0 {
if keyLength := int(c.Uint("attention.key_length")); keyLength > 0 {
headDim = keyLength
} else if numHeads > 0 {
headDim = int(c.Uint("embedding_length")) / numHeads
}
}
if headDim <= 0 {
return nil, fmt.Errorf("nemotronh: invalid attention head dimension")
}
if numHeads <= 0 {
// Attention layers derive per-layer head counts from projection weights.
// Keep a non-zero default to avoid invalid option math.
numHeads = 1
}
numExperts := int(c.Uint("expert_count"))
numExpertsUsed := int(c.Uint("expert_used_count"))
if numExperts > 0 {
if numExpertsUsed <= 0 || numExpertsUsed > numExperts {
return nil, fmt.Errorf("nemotronh: invalid expert_used_count=%d for expert_count=%d", numExpertsUsed, numExperts)
}
}
opts := &Options{
hiddenSize: int(c.Uint("embedding_length")),
numHeads: numHeads,
numKVHeads: numKVHeads,
headDim: headDim,
eps: c.Float("attention.layer_norm_rms_epsilon"),
ssmDConv: int(c.Uint("ssm.conv_kernel")),
ssmDInner: int(c.Uint("ssm.inner_size")),
ssmDState: int(c.Uint("ssm.state_size")),
ssmNHead: int(c.Uint("ssm.time_step_rank")),
ssmNGroup: int(c.Uint("ssm.group_count")),
isRecurrent: isRecurrent,
nFF: nFF,
attentionScale: float64(c.Float("attention.scale")),
numExperts: numExperts,
numExpertsUsed: numExpertsUsed,
expertWeightsNorm: c.Bool("expert_weights_norm", false),
expertWeightsScale: c.Float("expert_weights_scale", 1.0),
expertWeightsNormClip: c.Float("expert_weights_norm_clip", 0),
}
// Calculate cache dimensions
convDim := max(0, opts.ssmDConv-1)
convChannels := opts.ssmDInner + 2*opts.ssmNGroup*opts.ssmDState
ssmHeadDim := 0
if opts.ssmNHead > 0 {
ssmHeadDim = opts.ssmDInner / opts.ssmNHead
}
ssmStateSize := opts.ssmDState * ssmHeadDim * opts.ssmNHead
m := Model{
Tokenizer: tokenizer.NewBytePairEncoding(
&tokenizer.Vocabulary{
Values: c.Strings("tokenizer.ggml.tokens"),
Types: c.Ints("tokenizer.ggml.token_type"),
Merges: c.Strings("tokenizer.ggml.merges"),
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", false),
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
EOS: append(
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
c.Ints("tokenizer.ggml.eos_token_ids")...,
),
},
`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
),
Layers: layers,
Options: opts,
}
m.Cache = NewHybridCache(convDim, convChannels, ssmStateSize)
return &m, nil
}
func init() {
model.Register("nemotron_h", New)
model.Register("nemotron_h_moe", New)
}
// Ensure Model implements model.Model
var _ model.Model = (*Model)(nil)
// Dense implements standard feedforward with ReLU-squared activation
type Dense struct {
Up *nn.Linear `gguf:"ffn_up"`
Down *nn.Linear `gguf:"ffn_down"`
}
func (d *Dense) Forward(ctx ml.Context, x ml.Tensor, opts *Options) ml.Tensor {
// up -> ReLU-squared -> down
up := d.Up.Forward(ctx, x)
up = up.RELU(ctx)
up = up.Mul(ctx, up) // Square
return d.Down.Forward(ctx, up)
}
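The dense FFN above applies ReLU-squared: project up, clamp negatives to zero, square elementwise, then project down. A scalar sketch of the activation (the `relu2` helper is illustrative, not part of the model code):

```go
package main

import "fmt"

// relu2 is the ReLU-squared activation used by the dense FFN above:
// max(x, 0)^2, written as ReLU followed by an elementwise square.
func relu2(x float32) float32 {
	if x < 0 {
		return 0
	}
	return x * x
}

func main() {
	for _, x := range []float32{-2, 0, 3} {
		fmt.Println(relu2(x)) // prints 0, 0, 9
	}
}
```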
// MoESparse implements MoE with shared experts for Nemotron-H-MoE
type MoESparse struct {
Router *nn.Linear `gguf:"ffn_gate_inp"`
Up *nn.LinearBatch `gguf:"ffn_up_exps"`
Down *nn.LinearBatch `gguf:"ffn_down_exps"`
Bias ml.Tensor `gguf:"exp_probs_b.bias,alt:exp_probs_b"`
LatentIn *nn.Linear `gguf:"ffn_latent_in"`
LatentOut *nn.Linear `gguf:"ffn_latent_out"`
// Shared experts
SharedUp *nn.Linear `gguf:"ffn_up_shexp"`
SharedDown *nn.Linear `gguf:"ffn_down_shexp"`
}
func (m *MoESparse) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *Options) ml.Tensor {
hiddenDim := hiddenStates.Dim(0)
seqLen := hiddenStates.Dim(1)
batchSize := hiddenStates.Dim(2)
if batchSize == 0 {
batchSize = 1
}
hiddenStates2D := hiddenStates.Reshape(ctx, hiddenDim, seqLen*batchSize)
// Router logits with sigmoid gating
routerLogits := m.Router.Forward(ctx, hiddenStates2D)
// Weights come from unbiased sigmoid probabilities.
probs := routerLogits.Sigmoid(ctx)
// Selection uses optional bias.
selectionProbs := probs
if m.Bias != nil {
selectionProbs = selectionProbs.Add(ctx, m.Bias)
}
// Select top-k experts
selectedExperts := selectionProbs.TopK(ctx, opts.numExpertsUsed)
routingWeights := probs.Reshape(ctx, 1, opts.numExperts, hiddenStates2D.Dim(1)).Rows(ctx, selectedExperts)
// Normalize routing weights
if opts.expertWeightsNorm {
routingWeights = routingWeights.Reshape(ctx, opts.numExpertsUsed, hiddenStates2D.Dim(1))
weightsSum := routingWeights.SumRows(ctx)
weightsSum = weightsSum.Clamp(ctx, 6.103515625e-5, float32(math.MaxFloat32))
routingWeights = routingWeights.Div(ctx, weightsSum)
routingWeights = routingWeights.Reshape(ctx, 1, opts.numExpertsUsed, hiddenStates2D.Dim(1))
}
// Scale routing weights
if opts.expertWeightsScale != 1.0 {
routingWeights = routingWeights.Scale(ctx, float64(opts.expertWeightsScale))
}
routedInput := hiddenStates2D
if m.LatentIn != nil {
routedInput = m.LatentIn.Forward(ctx, routedInput)
}
hiddenStates3D := routedInput.Reshape(ctx, routedInput.Dim(0), 1, routedInput.Dim(1))
// Expert computation with ReLU-squared activation
upOut := m.Up.Forward(ctx, hiddenStates3D, selectedExperts)
upOut = upOut.RELU(ctx)
upOut = upOut.Mul(ctx, upOut) // Square
experts := m.Down.Forward(ctx, upOut, selectedExperts)
experts = experts.Mul(ctx, routingWeights)
// Sum over experts
moeOut := experts.View(ctx, 0, experts.Dim(0), experts.Stride(2), experts.Dim(2))
for i := 1; i < opts.numExpertsUsed; i++ {
moeOut = moeOut.Add(ctx, experts.View(ctx, i*experts.Stride(1), experts.Dim(0), experts.Stride(2), experts.Dim(2)))
}
if m.LatentOut != nil {
moeOut = m.LatentOut.Forward(ctx, moeOut)
}
// Add shared experts if present
if m.SharedUp != nil {
sharedUp := m.SharedUp.Forward(ctx, hiddenStates2D)
sharedUp = sharedUp.RELU(ctx)
sharedUp = sharedUp.Mul(ctx, sharedUp) // Square
sharedOut := m.SharedDown.Forward(ctx, sharedUp)
moeOut = moeOut.Add(ctx, sharedOut)
}
return moeOut
}
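The routing split above is worth calling out: the optional bias influences only which experts are *selected*, while the mixing weights come from the unbiased sigmoid probabilities, optionally renormalized to sum to one. A minimal sketch under those assumptions, with a hypothetical `route` helper in place of the tensor ops:

```go
package main

import (
	"fmt"
	"sort"
)

// route selects topK experts by biased score, but mixes them with
// unbiased, sum-normalized probabilities, mirroring the
// selection/weighting split in MoESparse.Forward above.
func route(probs, bias []float64, topK int) (idx []int, weights []float64) {
	idx = make([]int, len(probs))
	for i := range idx {
		idx[i] = i
	}
	// Sort expert indices by biased score, descending.
	sort.Slice(idx, func(a, b int) bool {
		return probs[idx[a]]+bias[idx[a]] > probs[idx[b]]+bias[idx[b]]
	})
	idx = idx[:topK]
	// Normalize the unbiased probabilities of the selected experts.
	var sum float64
	for _, i := range idx {
		sum += probs[i]
	}
	for _, i := range idx {
		weights = append(weights, probs[i]/sum)
	}
	return idx, weights
}

func main() {
	probs := []float64{0.9, 0.2, 0.6}
	bias := []float64{0, 0.8, 0} // bias promotes expert 1 for selection only
	idx, w := route(probs, bias, 2)
	fmt.Println(idx) // biased scores 0.9, 1.0, 0.6 -> experts [1 0]
	fmt.Println(w)   // but expert 0 still gets the larger mixing weight
}
```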

View File

@@ -2,595 +2,58 @@ package qwen3next
import (
"math"
"slices"
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/model/input"
)
var (
_ kvcache.Cache = (*HybridCache)(nil)
_ kvcache.CheckpointCache = (*HybridCache)(nil)
)
// HybridCache stores:
// - a standard causal KV cache for full attention layers
// - per-sequence conv state for linear attention layers
// - per-sequence delta state for linear attention layers
//
// Conv state shape (per layer, per sequence): [convKernelSize-1, convChannels]
// Delta state shape (per layer, per sequence): [headVDim, headVDim * numVHeads]
// HybridCache adapts the shared recurrent cache base for Qwen3-Next naming.
type HybridCache struct {
kv *kvcache.Causal
backend ml.Backend
dtype ml.DType
maxSequences int
// Conv state dimensions
convDim int // convKernelSize - 1
convChannels int // d_inner + 2 * num_k_heads * head_k_dim
// Delta state dimensions
deltaStateSize int // headVDim * headVDim * numVHeads
// slot mapping for recurrent state (copy-on-write)
slotForSeq map[int]int
refCount []int
freeSlots []int
// per-layer conv state buffers (allocated lazily)
convCtxs map[int]ml.Context
convStates map[int]ml.Tensor // [convDim*convChannels, maxSlots]
// per-layer delta state buffers (allocated lazily)
deltaCtxs map[int]ml.Context
deltaStates map[int]ml.Tensor // [deltaStateSize, maxSlots]
// recurrent checkpoints (per slot)
checkpointCount int
checkpointMinPos int32
checkpointInterval int32
checkpointCtxSize int
checkpoints map[int]*slotCheckpointStore
pendingRestore map[int]checkpointRestore
curCheckpointPos []int32
curCheckpointSlots map[int]int
reserveCheckpoints bool
checkpointConvCtxs map[int]ml.Context
checkpointDeltaCtxs map[int]ml.Context
checkpointReserved map[int]struct{}
// current forward batch (derived in StartForward)
curSeqs []int
curSlots []int
curSlotsInput ml.Tensor
curSeqTokens int
// track if EnsureWritable has been called for this forward pass
writableEnsured bool
writableError error
*kvcache.Recurrent
}
func NewHybridCache(
shift func(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error),
convDim, convChannels, deltaStateSize int,
) *HybridCache {
return &HybridCache{
kv: kvcache.NewCausalCache(shift),
convDim: convDim,
convChannels: convChannels,
deltaStateSize: deltaStateSize,
slotForSeq: make(map[int]int),
convCtxs: make(map[int]ml.Context),
convStates: make(map[int]ml.Tensor),
deltaCtxs: make(map[int]ml.Context),
deltaStates: make(map[int]ml.Tensor),
checkpointCount: checkpointCountDefault,
checkpointMinPos: checkpointMinPosDefault,
checkpointInterval: checkpointIntervalDefault,
checkpoints: make(map[int]*slotCheckpointStore),
pendingRestore: make(map[int]checkpointRestore),
curCheckpointSlots: make(map[int]int),
checkpointConvCtxs: make(map[int]ml.Context),
checkpointDeltaCtxs: make(map[int]ml.Context),
checkpointReserved: make(map[int]struct{}),
}
base := kvcache.NewRecurrentCache(kvcache.RecurrentConfig{
Shift: shift,
ConvDim: convDim,
ConvChannels: convChannels,
RecurrentStateSize: deltaStateSize,
CheckpointLogPrefix: "qwen3next",
})
return &HybridCache{Recurrent: base}
}
func (c *HybridCache) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity, maxBatch int) {
c.backend = backend
c.dtype = dtype
c.maxSequences = maxSequences
c.checkpoints = make(map[int]*slotCheckpointStore)
c.pendingRestore = make(map[int]checkpointRestore)
c.curCheckpointPos = c.curCheckpointPos[:0]
c.curCheckpointSlots = make(map[int]int)
c.checkpointReserved = make(map[int]struct{})
c.checkpointCtxSize = c.checkpointCount * c.maxSequences
if c.checkpointCtxSize < 8 {
c.checkpointCtxSize = 8
}
// initialize slot allocator
c.refCount = make([]int, maxSequences)
c.freeSlots = c.freeSlots[:0]
for i := maxSequences - 1; i >= 0; i-- {
c.freeSlots = append(c.freeSlots, i)
}
c.kv.Init(backend, dtype, maxSequences, capacity, maxBatch)
}
func (c *HybridCache) Close() {
for _, ctx := range c.convCtxs {
ctx.Close()
}
for _, ctx := range c.deltaCtxs {
ctx.Close()
}
for _, ctx := range c.checkpointConvCtxs {
ctx.Close()
}
for _, ctx := range c.checkpointDeltaCtxs {
ctx.Close()
}
c.kv.Close()
}
func (c *HybridCache) SetConfig(config ml.CacheConfig) {
c.kv.SetConfig(config)
}
func (c *HybridCache) SetLayer(layer int) {
c.kv.SetLayer(layer)
}
func (c *HybridCache) Get(ctx ml.Context) (ml.Tensor, ml.Tensor, ml.Tensor) {
return c.kv.Get(ctx)
}
func (c *HybridCache) Put(ctx ml.Context, key, value ml.Tensor) {
c.kv.Put(ctx, key, value)
}
func (c *HybridCache) StartForward(ctx ml.Context, batch input.Batch, reserve bool) error {
if err := c.kv.StartForward(ctx, batch, reserve); err != nil {
return err
}
// Derive equal-length sequence layout for recurrent layers
seqCounts := make(map[int]int)
c.curSeqs = c.curSeqs[:0]
for _, s := range batch.Sequences {
if _, ok := seqCounts[s]; !ok {
c.curSeqs = append(c.curSeqs, s)
}
seqCounts[s]++
}
if len(c.curSeqs) == 0 {
return nil
}
nTokens := len(batch.Sequences)
nSeqs := len(c.curSeqs)
want := nTokens / nSeqs
for _, s := range c.curSeqs {
if seqCounts[s] != want {
return kvcache.ErrNotSupported
}
}
c.curSeqTokens = want
// When reserving memory for estimation, use fake slot assignments
if reserve {
c.curSlots = c.curSlots[:0]
slots := make([]int32, nSeqs)
for i := range nSeqs {
c.curSlots = append(c.curSlots, i)
slots[i] = int32(i)
}
c.curSlotsInput = ctx.Input().FromInts(slots, len(slots))
c.reserveCheckpoints = true
c.planCheckpoints(batch)
return nil
}
// Ensure slots exist for sequences in this batch
c.curSlots = c.curSlots[:0]
var newSlots []int
for _, s := range c.curSeqs {
slot, ok := c.slotForSeq[s]
if !ok {
var err error
slot, err = c.allocSlot()
if err != nil {
return err
}
c.slotForSeq[s] = slot
c.refCount[slot] = 1
newSlots = append(newSlots, slot)
}
c.curSlots = append(c.curSlots, slot)
}
// Zero state for newly allocated slots
if len(newSlots) > 0 {
c.zeroSlots(ctx, newSlots)
}
// Create a tensor for the current slots
slots := make([]int32, len(c.curSlots))
for i, v := range c.curSlots {
slots[i] = int32(v)
}
c.curSlotsInput = ctx.Input().FromInts(slots, len(slots))
// Reset writable state for new forward pass
c.writableEnsured = false
c.writableError = nil
c.reserveCheckpoints = false
c.planCheckpoints(batch)
return nil
}
func (c *HybridCache) allocSlot() (int, error) {
if len(c.freeSlots) == 0 {
return 0, kvcache.ErrKvCacheFull
}
slot := c.freeSlots[len(c.freeSlots)-1]
c.freeSlots = c.freeSlots[:len(c.freeSlots)-1]
return slot, nil
}
func (c *HybridCache) freeSlot(slot int) {
if slot >= 0 && slot < c.maxSequences {
c.freeSlots = append(c.freeSlots, slot)
}
}
// zeroSlots zeros the recurrent state for the given slots across all layers.
func (c *HybridCache) zeroSlots(ctx ml.Context, slots []int) {
if len(slots) == 0 {
return
}
inputCtx := ctx.Input()
slotIndices := make([]int32, len(slots))
for i, s := range slots {
slotIndices[i] = int32(s)
}
slotsTensor := inputCtx.FromInts(slotIndices, len(slotIndices))
// Zero conv states
if len(c.convStates) > 0 {
zeros := inputCtx.Zeros(ml.DTypeF32, c.convDim*c.convChannels, len(slots))
for _, buf := range c.convStates {
ctx.Forward(buf.SetRows(ctx, zeros, slotsTensor))
}
}
// Zero delta states
if len(c.deltaStates) > 0 {
zeros := inputCtx.Zeros(ml.DTypeF32, c.deltaStateSize, len(slots))
for _, buf := range c.deltaStates {
ctx.Forward(buf.SetRows(ctx, zeros, slotsTensor))
}
}
}
// EnsureWritable ensures sequences have private slots (copy-on-write).
func (c *HybridCache) EnsureWritable(ctx ml.Context) error {
for i, seq := range c.curSeqs {
slot, ok := c.slotForSeq[seq]
if !ok {
continue
}
if slot < 0 || slot >= len(c.refCount) {
continue
}
if c.refCount[slot] <= 1 {
continue
}
newSlot, err := c.allocSlot()
if err != nil {
return err
}
c.refCount[slot]--
c.refCount[newSlot] = 1
c.slotForSeq[seq] = newSlot
c.curSlots[i] = newSlot
c.copyRecurrentState(ctx, slot, newSlot)
c.copyCheckpoints(ctx, slot, newSlot)
}
// Rebuild current slots tensor
slots := make([]int32, len(c.curSlots))
for i, v := range c.curSlots {
slots[i] = int32(v)
}
c.curSlotsInput = ctx.Input().FromInts(slots, len(slots))
return nil
}
func (c *HybridCache) copyRecurrentState(ctx ml.Context, srcSlot, dstSlot int) {
src := ctx.Input().FromInts([]int32{int32(srcSlot)}, 1)
dst := ctx.Input().FromInts([]int32{int32(dstSlot)}, 1)
for _, buf := range c.convStates {
rows := buf.Rows(ctx, src)
rowsF32 := rows.Cast(ctx, ml.DTypeF32)
ctx.Forward(buf.SetRows(ctx, rowsF32, dst))
}
for _, buf := range c.deltaStates {
rows := buf.Rows(ctx, src)
rowsF32 := rows.Cast(ctx, ml.DTypeF32)
ctx.Forward(buf.SetRows(ctx, rowsF32, dst))
}
}
func (c *HybridCache) CopyPrefix(srcSeq, dstSeq int, prefixLen int32) {
c.kv.CopyPrefix(srcSeq, dstSeq, prefixLen)
// Copy-on-write for recurrent state
if dstSlot, ok := c.slotForSeq[dstSeq]; ok {
if c.validSlot(dstSlot) {
c.refCount[dstSlot]--
if c.refCount[dstSlot] <= 0 {
c.refCount[dstSlot] = 0
c.freeSlot(dstSlot)
}
}
delete(c.slotForSeq, dstSeq)
}
srcSlot, ok := c.slotForSeq[srcSeq]
if !ok {
return
}
if c.validSlot(srcSlot) {
c.slotForSeq[dstSeq] = srcSlot
c.refCount[srcSlot]++
}
}
func (c *HybridCache) CanResume(seq int, pos int32) bool {
if !c.kv.CanResume(seq, pos) {
return false
}
if pos == 0 {
return true
}
return c.hasCheckpoint(seq, pos)
}
func (c *HybridCache) Remove(seq int, beginIndex, endIndex int32) error {
if beginIndex > 0 && endIndex != math.MaxInt32 {
return kvcache.ErrNotSupported
}
if beginIndex > 0 {
restore, ok := c.pendingRestore[seq]
if !ok || restore.pos+1 != beginIndex {
return kvcache.ErrNotSupported
}
if !c.restoreComplete(restore) {
return kvcache.ErrNotSupported
}
// If the recurrent slot is shared, detach it before applying a restore.
if slot, ok := c.slotForSeq[seq]; ok && c.validSlot(slot) && c.refCount[slot] > 1 {
newSlot, err := c.allocSlot()
if err != nil {
return err
}
ctx := c.backend.NewContext()
c.copyRecurrentState(ctx, slot, newSlot)
c.copyCheckpoints(ctx, slot, newSlot)
if len(c.convStates) > 0 || len(c.deltaStates) > 0 {
ctx.Compute()
}
ctx.Close()
c.refCount[slot]--
c.refCount[newSlot] = 1
c.slotForSeq[seq] = newSlot
restore.slot = newSlot
c.pendingRestore[seq] = restore
}
}
if err := c.kv.Remove(seq, beginIndex, endIndex); err != nil {
return err
}
if beginIndex > 0 {
restore := c.pendingRestore[seq]
delete(c.pendingRestore, seq)
return c.applyCheckpointRestore(restore)
}
// Removal invalidates recurrent state
slot, ok := c.slotForSeq[seq]
delete(c.pendingRestore, seq)
if !ok {
return nil
}
if !c.validSlot(slot) {
delete(c.slotForSeq, seq)
return nil
}
c.refCount[slot]--
if c.refCount[slot] <= 0 {
c.refCount[slot] = 0
c.clearCheckpoints(slot)
c.freeSlot(slot)
}
delete(c.slotForSeq, seq)
return nil
}
func (c *HybridCache) validSlot(slot int) bool {
return slot >= 0 && slot < len(c.refCount)
}
func (c *HybridCache) slotsTensor() ml.Tensor {
return c.curSlotsInput
}
// contiguousSlots returns the starting slot if current slots are contiguous and ordered.
func (c *HybridCache) contiguousSlots() (int, bool) {
if len(c.curSlots) == 0 {
return 0, false
}
start := c.curSlots[0]
for i, s := range c.curSlots {
if s != start+i {
return 0, false
}
}
return start, true
}
func (c *HybridCache) seqTokens() int {
return c.curSeqTokens
}
func (c *HybridCache) numSeqs() int {
return len(c.curSeqs)
}
func (c *HybridCache) convBuffer(ctx ml.Context, layer int) ml.Tensor {
if buf, ok := c.convStates[layer]; ok {
return buf
}
if _, ok := c.convCtxs[layer]; !ok {
c.convCtxs[layer] = c.backend.NewContextSize(1).Layer(layer)
}
// Recurrent state must stay in F32 (ssm_conv kernels are F32-only).
buf := c.convCtxs[layer].Zeros(ml.DTypeF32, c.convDim*c.convChannels, c.maxSequences)
c.convStates[layer] = buf
return buf
}
func (c *HybridCache) deltaBuffer(ctx ml.Context, layer int) ml.Tensor {
if buf, ok := c.deltaStates[layer]; ok {
return buf
}
if _, ok := c.deltaCtxs[layer]; !ok {
c.deltaCtxs[layer] = c.backend.NewContextSize(1).Layer(layer)
}
// Recurrent delta state must stay in F32.
buf := c.deltaCtxs[layer].Zeros(ml.DTypeF32, c.deltaStateSize, c.maxSequences)
c.deltaStates[layer] = buf
return buf
}
func (c *HybridCache) ensureWritableOnce(ctx ml.Context) {
if !c.writableEnsured {
needsWritable := false
for _, seq := range c.curSeqs {
slot, ok := c.slotForSeq[seq]
if !ok {
continue
}
if slot >= 0 && slot < len(c.refCount) && c.refCount[slot] > 1 {
needsWritable = true
break
}
}
if needsWritable {
if err := c.EnsureWritable(ctx); err != nil {
c.writableError = err
}
}
c.writableEnsured = true
}
}
// ConvState returns the conv state for current batch sequences as [convDim, convChannels, nSeqs].
func (c *HybridCache) ConvState(ctx ml.Context, layer int) (ml.Tensor, error) {
c.ensureWritableOnce(ctx)
if c.writableError != nil {
return nil, c.writableError
}
buf := c.convBuffer(ctx, layer)
cur := buf.Rows(ctx, c.slotsTensor())
return cur.Reshape(ctx, c.convDim, c.convChannels, c.numSeqs()), nil
}
// UpdateConvState writes a new conv state for current batch sequences.
func (c *HybridCache) UpdateConvState(ctx ml.Context, layer int, newState ml.Tensor) {
buf := c.convBuffer(ctx, layer)
src := newState.Reshape(ctx, c.convDim*c.convChannels, c.numSeqs())
srcF32 := src.Cast(ctx, ml.DTypeF32)
if start, ok := c.contiguousSlots(); ok {
// Fast path: contiguous slots allow a single view + copy
offset := start * buf.Stride(1)
view := buf.View(ctx, offset, c.convDim*c.convChannels, buf.Stride(1), c.numSeqs())
ctx.Forward(srcF32.Copy(ctx, view))
} else {
ctx.Forward(buf.SetRows(ctx, srcF32, c.slotsTensor()))
}
c.captureConvCheckpoint(ctx, layer, srcF32)
}
// DeltaState returns the delta state for current batch sequences as
// [headVDim, headVDim*numVHeads, nSeqs].
func (c *HybridCache) DeltaState(ctx ml.Context, layer int, headVDim, numVHeads int) (ml.Tensor, error) {
c.ensureWritableOnce(ctx)
if c.writableError != nil {
return nil, c.writableError
}
buf := c.deltaBuffer(ctx, layer)
cur := buf.Rows(ctx, c.slotsTensor())
return cur.Reshape(ctx, headVDim, headVDim*numVHeads, c.numSeqs()), nil
return c.RecurrentState(ctx, layer, headVDim, headVDim*numVHeads)
}
// UpdateDeltaState writes a new delta state for current batch sequences.
func (c *HybridCache) UpdateDeltaState(ctx ml.Context, layer int, newState ml.Tensor) {
buf := c.deltaBuffer(ctx, layer)
src := newState.Reshape(ctx, c.deltaStateSize, c.numSeqs())
srcF32 := src.Cast(ctx, ml.DTypeF32)
if start, ok := c.contiguousSlots(); ok {
// Fast path: contiguous slots allow a single view + copy
offset := start * buf.Stride(1)
view := buf.View(ctx, offset, c.deltaStateSize, buf.Stride(1), c.numSeqs())
ctx.Forward(srcF32.Copy(ctx, view))
} else {
ctx.Forward(buf.SetRows(ctx, srcF32, c.slotsTensor()))
}
c.captureDeltaCheckpoint(ctx, layer, srcF32)
c.UpdateRecurrentState(ctx, layer, newState)
}
// IsSupportedForBatch returns true if the current batch layout supports recurrent layers.
func (c *HybridCache) IsSupportedForBatch() bool {
return c.curSeqTokens > 0 && len(c.curSeqs) > 0
}
// Seqs returns the ordered unique sequences for the current forward pass.
func (c *HybridCache) Seqs() []int {
return slices.Clone(c.curSeqs)
}
func (c *HybridCache) seqTokens() int {
return c.SeqTokens()
}
func (c *HybridCache) numSeqs() int {
return c.NumSeqs()
}
// Keep qwen3next behavior for partial mid-sequence removals.
func (c *HybridCache) Remove(seq int, beginIndex, endIndex int32) error {
if beginIndex > 0 && endIndex != math.MaxInt32 {
return kvcache.ErrNotSupported
}
return c.Recurrent.Remove(seq, beginIndex, endIndex)
}

View File

@@ -37,7 +37,9 @@ type GatedDeltaNet struct {
// Optimized path: pre-split QKV and gate
SSMQKV *nn.Linear `gguf:"attn_qkv"` // -> Q, K, V (concatenated)
SSMQKVGate *nn.Linear `gguf:"attn_gate"` // -> Z gate
SSMBetaAlpha *nn.Linear `gguf:"ssm_ba"` // -> beta, alpha
SSMBetaAlpha *nn.Linear `gguf:"ssm_ba"` // -> beta, alpha (legacy qwen3next)
SSMBeta *nn.Linear `gguf:"ssm_beta"` // -> beta (qwen35)
SSMAlpha *nn.Linear `gguf:"ssm_alpha"` // -> alpha (qwen35)
SSMConv1D *convKernel `gguf:"ssm_conv1d"`
SSMDT ml.Tensor `gguf:"ssm_dt"` // alpha bias
SSMA ml.Tensor `gguf:"ssm_a"` // -A_log.exp()
@@ -96,7 +98,6 @@ func (gdn *GatedDeltaNet) Forward(ctx ml.Context, hiddenStates, _ ml.Tensor, cac
headVDim := opts.ssmDInner / numVHeads
convKernelSize := opts.convKernelSize
mixedBA := gdn.SSMBetaAlpha.Forward(ctx, hiddenStates)
qkvDim := headKDim*numKHeads*2 + headVDim*numVHeads
if gdn.SSMQKV == nil || gdn.SSMQKVGate == nil {
@@ -106,24 +107,40 @@ func (gdn *GatedDeltaNet) Forward(ctx ml.Context, hiddenStates, _ ml.Tensor, cac
qkvMixed := gdn.SSMQKV.Forward(ctx, hiddenStates).Reshape(ctx, qkvDim, nSeqTokens, nSeqs)
z := gdn.SSMQKVGate.Forward(ctx, hiddenStates)
var beta ml.Tensor
var alpha ml.Tensor
switch {
case gdn.SSMBetaAlpha != nil:
// Legacy qwen3next path: in_proj_ba packs beta/alpha grouped by K-head.
mixedBA := gdn.SSMBetaAlpha.Forward(ctx, hiddenStates)
baNewDim := 2 * numVHeads / numKHeads
mixedBAReshaped := mixedBA.Reshape(ctx, baNewDim, numKHeads, nSeqTokens, nSeqs)
// Split beta and alpha
betaSize := numVHeads / numKHeads
alphaSize := numVHeads / numKHeads
b := mixedBAReshaped.Slice(ctx, 0, 0, betaSize, 1)
a := mixedBAReshaped.Slice(ctx, 0, betaSize, betaSize+alphaSize, 1)
// Reshape to merge head dimensions
beta := b.Contiguous(ctx, numVHeads, 1, nSeqTokens, nSeqs)
alpha := a.Contiguous(ctx, numVHeads, nSeqTokens, nSeqs)
// Keep beta layout consistent with qwen35.
// [1, numVHeads, nSeqTokens, nSeqs]
beta = b.Contiguous(ctx, 1, numVHeads, nSeqTokens, nSeqs)
alpha = a.Contiguous(ctx, numVHeads, nSeqTokens, nSeqs)
case gdn.SSMBeta != nil && gdn.SSMAlpha != nil:
// qwen35 path: beta/alpha are separate projections.
beta = gdn.SSMBeta.Forward(ctx, hiddenStates).Reshape(ctx, 1, numVHeads, nSeqTokens, nSeqs)
alpha = gdn.SSMAlpha.Forward(ctx, hiddenStates).Reshape(ctx, numVHeads, nSeqTokens, nSeqs)
default:
return nil, errors.New("qwen3next: missing linear attention beta/alpha projections")
}
// Compute gate: softplus(alpha + dt_bias) * -A
alphaBiased := alpha.Add(ctx, gdn.SSMDT)
alphaSoftplus := alphaBiased.Softplus(ctx)
gate := alphaSoftplus.Mul(ctx, gdn.SSMA)
gate = gate.Reshape(ctx, 1, numVHeads, nSeqTokens, nSeqs)
qkvMixed = qkvMixed.Permute(ctx, 1, 0, 2, 3)
// Get conv state from cache
@@ -172,16 +189,20 @@ func (gdn *GatedDeltaNet) Forward(ctx ml.Context, hiddenStates, _ ml.Tensor, cac
// Repeat interleave Q and K if numKHeads != numVHeads
if numKHeads != numVHeads {
if opts.vHeadReordered {
qConv = qConv.Repeat4D(ctx, headKDim, numVHeads, nSeqTokens, nSeqs)
kConv = kConv.Repeat4D(ctx, headKDim, numVHeads, nSeqTokens, nSeqs)
} else {
repeatFactor := numVHeads / numKHeads
qReshaped := qConv.Reshape(ctx, headKDim, 1, numKHeads*nSeqTokens*nSeqs)
kReshaped := kConv.Reshape(ctx, headKDim, 1, numKHeads*nSeqTokens*nSeqs)
qRepeated := qReshaped.Repeat4D(ctx, headKDim, repeatFactor, numKHeads*nSeqTokens*nSeqs, 1)
kRepeated := kReshaped.Repeat4D(ctx, headKDim, repeatFactor, numKHeads*nSeqTokens*nSeqs, 1)
qConv = qRepeated.Reshape(ctx, headKDim, numKHeads*repeatFactor, nSeqTokens, nSeqs)
kConv = kRepeated.Reshape(ctx, headKDim, numKHeads*repeatFactor, nSeqTokens, nSeqs)
}
}
// Choose computation mode based on sequence length
@@ -189,7 +210,9 @@ func (gdn *GatedDeltaNet) Forward(ctx ml.Context, hiddenStates, _ ml.Tensor, cac
if nSeqTokens == 1 {
attnOut = gdn.deltaNetAutoregressive(ctx, qConv, kConv, vConv, gate, beta, state, opts, layer, cache)
} else {
// Use pre-computed masks from opts (created once in Model.Forward)
if opts.masks == nil {
opts.masks = createMasks(ctx)
}
attnOut = gdn.deltaNetChunked(ctx, qConv, kConv, vConv, gate, beta, state, opts.masks, opts, layer, cache)
}
@@ -310,9 +333,9 @@ func (gdn *GatedDeltaNet) deltaNetChunked(
q = q.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx, headKDim, nTokens, numVHeads, nSeqs)
k = k.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx, headKDim, nTokens, numVHeads, nSeqs)
v = v.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx, headVDim, nTokens, numVHeads, nSeqs)
gate = gate.Permute(ctx, 2, 0, 3, 1).Contiguous(ctx, nTokens, 1, numVHeads, nSeqs)
beta = beta.Permute(ctx, 2, 0, 1, 3).Contiguous(ctx)
// gate/beta: [1, numVHeads, nTokens, nSeqs] -> [1, nTokens, numVHeads, nSeqs]
gate = gate.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx, 1, nTokens, numVHeads, nSeqs)
beta = beta.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx, 1, nTokens, numVHeads, nSeqs)
state = state.Reshape(ctx, headVDim, headVDim, numVHeads, nSeqs)
// Compute padding
@@ -324,7 +347,7 @@ func (gdn *GatedDeltaNet) deltaNetChunked(
q = q.Pad(ctx, 0, pad, 0, 0)
k = k.Pad(ctx, 0, pad, 0, 0)
v = v.Pad(ctx, 0, pad, 0, 0)
gate = gate.Pad(ctx, pad, 0, 0, 0)
gate = gate.Pad(ctx, 0, pad, 0, 0)
beta = beta.Pad(ctx, 0, pad, 0, 0)
}
@@ -344,10 +367,12 @@ func (gdn *GatedDeltaNet) deltaNetChunked(
kBeta = kBeta.Reshape(ctx, headKDim, chunkSize, nChunks, numVHeads*nSeqs)
vBeta = vBeta.Reshape(ctx, headVDim, chunkSize, nChunks, numVHeads*nSeqs)
gate = gate.Reshape(ctx, chunkSize, 1, nChunks, numVHeads*nSeqs)
// Reshape gate and cumsum over chunk axis.
// [1, chunkSize, nChunks, H*nSeqs] -> transpose -> [chunkSize, 1, nChunks, H*nSeqs]
gate = gate.Reshape(ctx, 1, chunkSize, nChunks, numVHeads*nSeqs)
// g_cumsum = cumsum(gate)
gCumsum := gate.CumSum(ctx)
gCumsum := gate.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx, chunkSize, 1, nChunks, numVHeads*nSeqs).CumSum(ctx)
// Compute decay mask
gcsI := gCumsum.Reshape(ctx, chunkSize, 1, nChunks, numVHeads*nSeqs)
@@ -411,60 +436,64 @@ func (gdn *GatedDeltaNet) deltaNetChunked(
keyGDiff := k.Mul(ctx, gDiffExpReshaped)
keyGDiffT := keyGDiff.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
// Process chunks and update state
var coreAttnOut ml.Tensor
newState := state
// Process chunks and update state.
// Keep a transposed view of v and recurrent state across chunks so the
// chunk loop does not need extra transpose+contiguous nodes.
vT := v.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx, chunkSize, headVDim, nChunks, numVHeads*nSeqs)
stateT := state.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx, headVDim, headVDim, 1, numVHeads*nSeqs)
for chunk := range nChunks {
qChunk := q.Slice(ctx, 2, chunk, chunk+1, 1)
vChunk := v.Slice(ctx, 2, chunk, chunk+1, 1)
vTChunk := vT.Slice(ctx, 2, chunk, chunk+1, 1)
gExpChunk := gExp.Slice(ctx, 2, chunk, chunk+1, 1)
kCumdecayChunk := kCumdecay.Slice(ctx, 2, chunk, chunk+1, 1)
attnChunk := attnKQ.Slice(ctx, 2, chunk, chunk+1, 1) // attnKQ is pre-computed once before the chunk loop
// state^T - permute is needed but Contiguous creates a copy
stateT := newState.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx, headVDim, headVDim, 1, numVHeads*nSeqs)
// v'_t = k_cumdecay @ state_t
vTPrime := kCumdecayChunk.Mulmat(ctx, stateT)
// v_prime = k_cumdecay @ state
vPrime := stateT.Mulmat(ctx, kCumdecayChunk)
// v_new = v - v_prime
vNew := vChunk.Sub(ctx, vPrime)
vNewT := vNew.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
// v_t_new = v_t - v'_t
vTNewChunk := vTChunk.Sub(ctx, vTPrime)
// attn_inter = (q * g_exp) @ state
qGExp := qChunk.Mul(ctx, gExpChunk)
attnInter := stateT.Mulmat(ctx, qGExp)
// core_attn_out = attn_inter + attn @ v_new
vAttn := vNewT.Mulmat(ctx, attnChunk)
vAttn := vTNewChunk.Mulmat(ctx, attnChunk)
coreAttnOutChunk := attnInter.Add(ctx, vAttn)
if coreAttnOut == nil {
coreAttnOut = coreAttnOutChunk
} else {
coreAttnOut = coreAttnOut.Concat(ctx, coreAttnOutChunk, 1)
}
v = v.SetInplace(
ctx,
coreAttnOutChunk,
v.Stride(1),
v.Stride(2),
v.Stride(3),
chunk*v.Stride(2),
)
// Update state for next chunk
gExpLastChunk := gLastExp.Slice(ctx, 2, chunk, chunk+1, 1)
kGDiffChunkT := keyGDiffT.Slice(ctx, 2, chunk, chunk+1, 1)
kgdMulVNew := vNewT.Mulmat(ctx, kGDiffChunkT)
// kgdmulvnew = key_gdiff_t @ v_new_t
kgdMulVNew := kGDiffChunkT.Mulmat(ctx, vTNewChunk)
// state = state * g_last + kgdmulvnew
gExpLastReshaped := gExpLastChunk.Contiguous(ctx).Reshape(ctx, 1, 1, numVHeads, nSeqs)
newState = newState.Mul(ctx, gExpLastReshaped)
newState = newState.Add(ctx, kgdMulVNew.Reshape(ctx, headVDim, headVDim, numVHeads, nSeqs))
// stateT = stateT * g_last + kgdmulvnew
stateT = stateT.Mul(ctx, gExpLastChunk)
stateT = stateT.Add(ctx, kgdMulVNew)
}
// Final reshape
coreAttnOut = coreAttnOut.Contiguous(ctx, headVDim, chunkSize*nChunks, numVHeads, nSeqs)
coreAttnOut := v.Contiguous(ctx, headVDim, chunkSize*nChunks, numVHeads, nSeqs)
// Slice to remove padding
if pad > 0 {
coreAttnOut = coreAttnOut.Slice(ctx, 1, 0, nTokens, 1)
}
// Convert stateT back to cache layout [S_v, S_v, H_v, nSeqs]
newState := stateT.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx, headVDim, headVDim, numVHeads, nSeqs)
// Update delta state in cache
cache.UpdateDeltaState(ctx, layer, newState.Reshape(ctx, headVDim, headVDim*numVHeads, nSeqs))
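The chunk loop above streams the gated delta rule across chunks while keeping a transposed state. As a minimal sketch of the underlying recurrence with plain float64 slices (hypothetical `deltaStep` helper, no ggml graph): each step reads from the state with q, predicts v from k, and writes the decayed correction back.

```go
package main

import "fmt"

// deltaStep applies one gated delta-rule step on a tiny d_k x d_v state.
// Readout: out[j] = sum_i q[i]*S[i][j]
// Update:  S[i][j] = g*S[i][j] + beta*k[i]*(v[j] - pred[j]),
// where pred[j] = sum_i k[i]*S[i][j] is the state's current prediction of v.
func deltaStep(S [][]float64, q, k, v []float64, g, beta float64) []float64 {
	dk, dv := len(S), len(S[0])
	out := make([]float64, dv)
	pred := make([]float64, dv)
	for j := 0; j < dv; j++ {
		for i := 0; i < dk; i++ {
			out[j] += q[i] * S[i][j]
			pred[j] += k[i] * S[i][j]
		}
	}
	for i := 0; i < dk; i++ {
		for j := 0; j < dv; j++ {
			S[i][j] = g*S[i][j] + beta*k[i]*(v[j]-pred[j])
		}
	}
	return out
}

func main() {
	S := [][]float64{{0, 0}, {0, 0}}
	k := []float64{1, 0}
	v := []float64{3, 4}
	// Writing (k, v) with beta=1 into an empty state makes a later
	// read with q=k reproduce v; a second write is then a no-op.
	deltaStep(S, k, k, v, 1.0, 1.0)
	out := deltaStep(S, k, k, v, 1.0, 1.0)
	fmt.Println(out) // [3 4]
}
```

The chunked path computes the same recurrence, but batches the (v - prediction) correction per chunk so that the intra-chunk work becomes matrix multiplies.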



@@ -1,9 +1,12 @@
package qwen3next
import (
"bytes"
"cmp"
"fmt"
"image"
"math"
"slices"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/ml"
@@ -11,6 +14,7 @@ import (
"github.com/ollama/ollama/ml/nn/rope"
"github.com/ollama/ollama/model"
"github.com/ollama/ollama/model/input"
"github.com/ollama/ollama/model/models/qwen3vl"
"github.com/ollama/ollama/tokenizer"
)
@@ -41,10 +45,15 @@ type Options struct {
ssmNGroup int // num_k_heads
ssmDtRank int // num_v_heads
convKernelSize int // SSM conv kernel size
vHeadReordered bool
// Per-layer type from GGUF metadata
isRecurrent []bool
// RoPE mode config (used by qwen35/qwen35moe)
mropeSections []int
mropeInterleaved bool
// Pre-computed masks for chunked attention (created once per forward pass)
masks *Masks
}
@@ -54,7 +63,17 @@ func (o Options) headDim() int {
}
func (o Options) applyRotaryPositionEmbeddings(ctx ml.Context, states, positions ml.Tensor) ml.Tensor {
opts := []func(*rope.Options){rope.WithTypeNeoX()}
var opts []func(*rope.Options)
if len(o.mropeSections) > 0 {
if o.mropeInterleaved {
opts = append(opts, rope.WithInterleaveMRoPE(o.mropeSections))
} else {
opts = append(opts, rope.WithMRoPE(o.mropeSections))
}
} else {
opts = append(opts, rope.WithTypeNeoX())
}
if o.ropeType == "yarn" {
attnFactor := float32(1.0 / (1.0 + 0.1*math.Log(float64(o.ropeScale))))
opts = append(opts,
@@ -214,20 +233,190 @@ type Model struct {
OutputNorm *nn.RMSNorm `gguf:"output_norm"`
Output *nn.Linear `gguf:"output,alt:token_embd"`
Layers []Layer `gguf:"blk"`
Vision *qwen3vl.VisionModel `gguf:"v"`
ImageProcessor *qwen3vl.ImageProcessor
*Options
positionCache []int32
imageToken int32
visionStart int32
visionEnd int32
spatialMergeSize uint32
}
func (m *Model) mapPosition(id int32) int32 {
if id < int32(len(m.positionCache)) {
return m.positionCache[id]
}
if len(m.positionCache) > 0 {
return id - int32(len(m.positionCache)) + m.positionCache[len(m.positionCache)-1] + 1
}
return id
}
func (m *Model) buildPositions(ctx ml.Context, batch input.Batch) ml.Tensor {
if len(m.mropeSections) == 0 {
return ctx.Input().FromInts(batch.Positions, len(batch.Positions))
}
// ggml MRoPE expects [time, height, width, extra] for each token.
positionSlice := [][]int32{
make([]int32, len(batch.Positions)),
make([]int32, len(batch.Positions)),
make([]int32, len(batch.Positions)),
make([]int32, len(batch.Positions)),
}
for i, id := range batch.Positions {
p := m.mapPosition(id)
positionSlice[0][i] = p
positionSlice[1][i] = p
positionSlice[2][i] = p
}
if m.Vision != nil {
for _, mi := range batch.Multimodal {
grid, ok := mi.Multimodal[0].Data.(*qwen3vl.Grid)
if !ok {
continue
}
w := max(1, grid.Width/int(m.spatialMergeSize))
for i := range mi.Multimodal[0].Tensor.Dim(1) {
positionSlice[1][mi.Index+i] += int32(i / w)
positionSlice[2][mi.Index+i] += int32(i % w)
}
}
}
return ctx.Input().FromInts(slices.Concat(positionSlice...), len(positionSlice[0])*len(positionSlice))
}
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
if m.Vision == nil || m.ImageProcessor == nil || len(m.Vision.Layers) == 0 {
return nil, model.ErrNoVisionModel
}
img, _, err := image.Decode(bytes.NewReader(multimodalData))
if err != nil {
return nil, err
}
pixelValues, grid, err := m.ImageProcessor.ProcessImage(ctx, img)
if err != nil {
return nil, err
}
visionOutputs, deepstackVisualEmbeds := m.Vision.Forward(ctx, pixelValues, grid)
mm := []input.Multimodal{{Tensor: visionOutputs, Data: grid}}
for i := range deepstackVisualEmbeds {
mm = append(mm, input.Multimodal{Tensor: deepstackVisualEmbeds[i]})
}
return mm, nil
}
func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
m.positionCache = m.positionCache[:0]
var result []*input.Input
appendInput := func(inp *input.Input, position int32) {
result = append(result, inp)
m.positionCache = append(m.positionCache, position)
}
var p int32
for _, inp := range inputs {
if inp.Multimodal == nil {
appendInput(inp, p)
p++
continue
}
grid := inp.Multimodal[0].Data.(*qwen3vl.Grid)
tokensPerGrid := inp.Multimodal[0].Tensor.Dim(1)
appendInput(&input.Input{
Token: m.visionStart,
SameBatch: tokensPerGrid + 1,
}, p)
p++
appendInput(&input.Input{
Token: m.imageToken,
Multimodal: inp.Multimodal,
MultimodalHash: inp.MultimodalHash,
}, p)
for range tokensPerGrid - 1 {
appendInput(&input.Input{
Token: m.imageToken,
}, p)
}
gridSpan := max(grid.Width/int(m.spatialMergeSize), grid.Height/int(m.spatialMergeSize))
p = p + int32(gridSpan)
appendInput(&input.Input{
Token: m.visionEnd,
}, p)
p++
}
return result, nil
}
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
positions := ctx.Input().FromInts(batch.Positions, len(batch.Positions))
positions := m.buildPositions(ctx, batch)
hiddenStates := m.TokenEmbedding.Forward(ctx, batch.Inputs)
if len(batch.Multimodal) > 0 {
hiddenStates = hiddenStates.Duplicate(ctx)
var deepstackVisualEmbeds []ml.Tensor
for _, mi := range batch.Multimodal {
visionOutputs := mi.Multimodal[0].Tensor
ctx.Forward(visionOutputs.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), visionOutputs.Dim(0)*visionOutputs.Dim(1))))
if len(mi.Multimodal[1:]) > len(deepstackVisualEmbeds) {
deepstackVisualEmbeds = append(deepstackVisualEmbeds, make([]ml.Tensor, len(mi.Multimodal[1:])-len(deepstackVisualEmbeds))...)
}
for i, mm := range mi.Multimodal[1:] {
if deepstackVisualEmbeds[i] == nil {
deepstackVisualEmbeds[i] = ctx.Input().Zeros(mm.Tensor.DType(), hiddenStates.Shape()...)
}
ctx.Forward(mm.Tensor.Copy(ctx, deepstackVisualEmbeds[i].View(ctx, mi.Index*deepstackVisualEmbeds[i].Stride(1), mm.Tensor.Dim(0)*mm.Tensor.Dim(1))))
}
}
cache := m.Cache.(*HybridCache)
// Masks are allocated lazily only for chunked recurrent prefill.
m.Options.masks = nil
for i, layer := range m.Layers {
cache.SetLayer(i)
var outputs ml.Tensor
if i == len(m.Layers)-1 {
outputs = batch.Outputs
}
var err error
hiddenStates, err = layer.Forward(ctx, i, hiddenStates, positions, outputs, cache, m.Options)
if err != nil {
return nil, err
}
if i < len(deepstackVisualEmbeds) {
hiddenStates = hiddenStates.Add(ctx, deepstackVisualEmbeds[i])
}
}
hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, m.eps)
return m.Output.Forward(ctx, hiddenStates), nil
}
@@ -249,10 +438,17 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
}
func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
m.positionCache = nil
if len(m.mropeSections) > 0 {
shift = shift.Repeat(ctx, 1, 4).Reshape(ctx, -1)
}
return m.applyRotaryPositionEmbeddings(ctx, key, shift), nil
}
var _ model.Model = (*Model)(nil)
var (
_ model.Model = (*Model)(nil)
_ model.MultimodalProcessor = (*Model)(nil)
)
func New(c fs.Config) (model.Model, error) {
numLayers := int(c.Uint("block_count"))
@@ -303,6 +499,22 @@ func New(c fs.Config) (model.Model, error) {
}
}
mropeSections := c.Ints("mrope_sections", nil)
if len(mropeSections) == 0 {
mropeSections = c.Ints("rope.mrope_section", nil)
}
if len(mropeSections) == 0 {
mropeSections = c.Ints("rope.dimension_sections", nil)
}
if len(mropeSections) > 4 {
mropeSections = mropeSections[:4]
}
ropeType := c.String("rope.scaling.type")
if ropeType == "" {
ropeType = c.String("rope.type")
}
opts := &Options{
hiddenSize: int(c.Uint("embedding_length")),
numHeads: int(c.Uint("attention.head_count")),
@@ -318,7 +530,7 @@ func New(c fs.Config) (model.Model, error) {
valueLength: int(c.Uint("attention.value_length")),
ropeDim: int(c.Uint("rope.dimension_count")),
eps: c.Float("attention.layer_norm_rms_epsilon"),
ropeType: c.String("rope.scaling.type"),
ropeType: ropeType,
ropeBase: c.Float("rope.freq_base"),
ropeScale: c.Float("rope.scaling.factor", 1),
originalContextLength: int(c.Uint("rope.scaling.original_context_length")),
@@ -331,7 +543,16 @@ func New(c fs.Config) (model.Model, error) {
ssmNGroup: int(c.Uint("ssm.group_count")),
ssmDtRank: int(c.Uint("ssm.time_step_rank")),
convKernelSize: int(c.Uint("ssm.conv_kernel")),
vHeadReordered: c.Bool("ssm.v_head_reordered", false),
isRecurrent: isRecurrent,
mropeSections: slices.Collect(func(yield func(int) bool) {
for _, section := range mropeSections {
if !yield(int(section)) {
return
}
}
}),
mropeInterleaved: c.Bool("rope.mrope_interleaved", c.Bool("mrope_interleaved", false)),
}
if opts.numKVHeads == 0 {
return nil, fmt.Errorf("qwen3next: attention.head_count_kv array must include at least one non-zero value")
@@ -353,6 +574,19 @@ func New(c fs.Config) (model.Model, error) {
return nil, fmt.Errorf("qwen3next: headKDim (%d) != headVDim (%d) not supported; state computations require equal dimensions", headKDim, headVDim)
}
var vision *qwen3vl.VisionModel
var imageProcessor *qwen3vl.ImageProcessor
if c.Uint("vision.block_count", 0) > 0 {
vision = qwen3vl.NewVisionModel(c)
processor := qwen3vl.NewImageProcessor(c)
imageProcessor = &processor
}
spatialMergeSize := c.Uint("vision.spatial_merge_size", 2)
if spatialMergeSize == 0 {
spatialMergeSize = 2
}
m := Model{
Tokenizer: tokenizer.NewBytePairEncoding(
&tokenizer.Vocabulary{
@@ -371,8 +605,14 @@ func New(c fs.Config) (model.Model, error) {
},
`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
),
Layers: layers,
Options: opts,
Layers: layers,
Vision: vision,
ImageProcessor: imageProcessor,
Options: opts,
imageToken: int32(c.Uint("image_token_id", 151655)),
visionStart: int32(c.Uint("vision_start_token_id", 151652)),
visionEnd: int32(c.Uint("vision_end_token_id", 151653)),
spatialMergeSize: spatialMergeSize,
}
m.Cache = NewHybridCache(m.Shift, convDim, convChannels, deltaStateSize)
@@ -380,5 +620,7 @@ func New(c fs.Config) (model.Model, error) {
}
func init() {
model.Register("qwen35", New)
model.Register("qwen35moe", New)
model.Register("qwen3next", New)
}


@@ -0,0 +1,101 @@
package qwen3next
import (
"testing"
"github.com/ollama/ollama/ml/backend/ggml"
"github.com/ollama/ollama/model/input"
"github.com/ollama/ollama/model/models/qwen3vl"
)
type fakeTensor struct {
*ggml.Tensor
dims []int
}
func (t *fakeTensor) Dim(i int) int {
return t.dims[i]
}
func makeImageInput(hash uint64, width, height, tokens int) *input.Input {
return &input.Input{
Multimodal: []input.Multimodal{{
Tensor: &fakeTensor{dims: []int{1, tokens, 1, 1}},
Data: &qwen3vl.Grid{Width: width, Height: height},
}},
MultimodalHash: hash,
}
}
func TestPostTokenizeMultiImageSpans(t *testing.T) {
m := &Model{
imageToken: 10,
visionStart: 11,
visionEnd: 12,
spatialMergeSize: 2,
}
inputs := []*input.Input{
{Token: 100},
makeImageInput(1, 8, 4, 4),
makeImageInput(2, 4, 8, 4),
{Token: 200},
}
got, err := m.PostTokenize(inputs)
if err != nil {
t.Fatalf("PostTokenize() error = %v", err)
}
want := []struct {
token int32
hash uint64
sameBatch int
hasMM bool
}{
{token: 100},
{token: 11, sameBatch: 5},
{token: 10, hash: 1, hasMM: true},
{token: 10},
{token: 10},
{token: 10},
{token: 12},
{token: 11, sameBatch: 5},
{token: 10, hash: 2, hasMM: true},
{token: 10},
{token: 10},
{token: 10},
{token: 12},
{token: 200},
}
if len(got) != len(want) {
t.Fatalf("len(got) = %d, want %d", len(got), len(want))
}
for i := range want {
if got[i].Token != want[i].token {
t.Fatalf("got[%d].Token = %d, want %d", i, got[i].Token, want[i].token)
}
if got[i].MultimodalHash != want[i].hash {
t.Fatalf("got[%d].MultimodalHash = %d, want %d", i, got[i].MultimodalHash, want[i].hash)
}
if got[i].SameBatch != want[i].sameBatch {
t.Fatalf("got[%d].SameBatch = %d, want %d", i, got[i].SameBatch, want[i].sameBatch)
}
hasMM := len(got[i].Multimodal) > 0
if hasMM != want[i].hasMM {
t.Fatalf("got[%d].hasMM = %v, want %v", i, hasMM, want[i].hasMM)
}
}
wantPositions := []int32{0, 1, 2, 2, 2, 2, 6, 7, 8, 8, 8, 8, 12, 13}
if len(m.positionCache) != len(wantPositions) {
t.Fatalf("len(positionCache) = %d, want %d", len(m.positionCache), len(wantPositions))
}
for i := range wantPositions {
if m.positionCache[i] != wantPositions[i] {
t.Fatalf("positionCache[%d] = %d, want %d", i, m.positionCache[i], wantPositions[i])
}
}
}


@@ -24,8 +24,8 @@ type ImageProcessor struct {
imageStd []float32
}
// newImageProcessor creates a new image processor with default values
func newImageProcessor(c fs.Config) ImageProcessor {
// NewImageProcessor creates a new image processor with default values.
func NewImageProcessor(c fs.Config) ImageProcessor {
patchSize := int(c.Uint("vision.patch_size", 14))
mergeSize := int(c.Uint("vision.spatial_merge_size", 2))


@@ -56,60 +56,46 @@ var (
tokenVisionEnd int32 = 151653
)
type modelInput struct {
*input.Input
position int32
}
// PostTokenize arranges Qwen 3 VL's inputs for the forward pass
func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
m.positionCache = m.positionCache[:0]
return slices.Collect(func(yield func(*input.Input) bool) {
for i := range inputs {
s := []modelInput{{Input: inputs[i]}}
if mm := inputs[i].Multimodal; mm != nil {
t := mm[0].Tensor
s = slices.Repeat([]modelInput{
{
position: int32(i + 1),
Input: &input.Input{Token: tokenVision},
},
}, t.Dim(1)+1+1)
var result []*input.Input
appendInput := func(inp *input.Input, position int32) {
result = append(result, inp)
m.positionCache = append(m.positionCache, position)
}
s[0] = modelInput{
Input: &input.Input{Token: tokenVisionStart},
position: int32(i),
}
s[len(s)-1] = modelInput{
Input: &input.Input{Token: tokenVisionEnd},
position: int32(i + mm[0].Data.(*Grid).Width/m.spatialMergeSize + 1),
}
s[1] = modelInput{
Input: &input.Input{
Token: tokenVision,
Multimodal: inputs[i].Multimodal,
MultimodalHash: inputs[i].MultimodalHash,
SameBatch: t.Dim(1),
},
position: int32(i + 1),
}
}
for _, e := range s {
position := e.position
if position == 0 && len(m.positionCache) > 0 {
position = m.positionCache[len(m.positionCache)-1] + 1
}
m.positionCache = append(m.positionCache, position)
if !yield(e.Input) {
return
}
}
var p int32
for _, inp := range inputs {
if inp.Multimodal == nil {
appendInput(inp, p)
p++
continue
}
}), nil
grid := inp.Multimodal[0].Data.(*Grid)
tokensPerGrid := inp.Multimodal[0].Tensor.Dim(1)
appendInput(&input.Input{Token: tokenVisionStart}, p)
p++
appendInput(&input.Input{
Token: tokenVision,
Multimodal: inp.Multimodal,
MultimodalHash: inp.MultimodalHash,
SameBatch: tokensPerGrid,
}, p)
for range tokensPerGrid - 1 {
appendInput(&input.Input{Token: tokenVision}, p)
}
p = p + int32(grid.Width/m.spatialMergeSize)
appendInput(&input.Input{Token: tokenVisionEnd}, p)
p++
}
return result, nil
}
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
@@ -143,9 +129,13 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
}
}
deepstackVisualEmbeds = make([]ml.Tensor, len(mi.Multimodal[1:]))
if len(mi.Multimodal[1:]) > len(deepstackVisualEmbeds) {
deepstackVisualEmbeds = append(deepstackVisualEmbeds, make([]ml.Tensor, len(mi.Multimodal[1:])-len(deepstackVisualEmbeds))...)
}
for i, mm := range mi.Multimodal[1:] {
deepstackVisualEmbeds[i] = ctx.Input().Zeros(mm.Tensor.DType(), hiddenStates.Shape()...)
if deepstackVisualEmbeds[i] == nil {
deepstackVisualEmbeds[i] = ctx.Input().Zeros(mm.Tensor.DType(), hiddenStates.Shape()...)
}
ctx.Forward(mm.Tensor.Copy(ctx, deepstackVisualEmbeds[i].View(ctx, mi.Index*deepstackVisualEmbeds[i].Stride(1), mm.Tensor.Dim(0)*mm.Tensor.Dim(1))))
}
}
@@ -189,8 +179,8 @@ func New(c fs.Config) (model.Model, error) {
`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
),
TextModel: newTextModel(c),
VisionModel: newVisionModel(c),
ImageProcessor: newImageProcessor(c),
VisionModel: NewVisionModel(c),
ImageProcessor: NewImageProcessor(c),
}
m.Cache = kvcache.NewCausalCache(func(ctx ml.Context, layer int, key, positions ml.Tensor) (ml.Tensor, error) {


@@ -238,8 +238,8 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor, grid *Grid)
return hiddenStates, deepstackStates
}
// newVisionModel creates a new instance of the Qwen vision model
func newVisionModel(c fs.Config) *VisionModel {
// NewVisionModel creates a new instance of the Qwen vision model.
func NewVisionModel(c fs.Config) *VisionModel {
deepstackVisualIndexes := c.Ints("vision.deepstack_visual_indexes")
model := &VisionModel{
Layers: make([]VisionEncoderLayer, c.Uint("vision.block_count", 32)),


@@ -32,6 +32,8 @@ type LFM2Parser struct {
hasThinkingSupport bool
needsThinkingLeadingTrim bool // trim leading whitespace after <think> tag
needsContentLeadingTrim bool // trim leading whitespace after </think> tag
toolNames map[string]struct{}
hasTools bool
}
func (p *LFM2Parser) HasToolSupport() bool {
@@ -63,6 +65,13 @@ func (p *LFM2Parser) setInitialState(lastMessage *api.Message, thinkValue *api.T
}
func (p *LFM2Parser) Init(tools []api.Tool, lastMessage *api.Message, thinkValue *api.ThinkValue) []api.Tool {
p.toolNames = make(map[string]struct{}, len(tools))
p.hasTools = len(tools) > 0
for _, tool := range tools {
if tool.Function.Name != "" {
p.toolNames[tool.Function.Name] = struct{}{}
}
}
p.setInitialState(lastMessage, thinkValue)
return tools
}
@@ -105,9 +114,33 @@ func (p *LFM2Parser) Add(s string, done bool) (content string, thinking string,
}
}
// Fallback for models that emit bare tool calls without <|tool_call_*|> wrappers.
if done && len(toolCalls) == 0 && p.hasTools {
candidate := strings.TrimSpace(contentSb.String())
if fallbackCalls, parseErr := p.parseToolCallsContent(candidate); parseErr == nil && p.toolCallsAllowed(fallbackCalls) {
contentSb.Reset()
toolCalls = append(toolCalls, fallbackCalls...)
}
}
return contentSb.String(), thinkingSb.String(), toolCalls, nil
}
func (p *LFM2Parser) toolCallsAllowed(calls []api.ToolCall) bool {
if len(calls) == 0 {
return false
}
if len(p.toolNames) == 0 {
return true
}
for _, call := range calls {
if _, ok := p.toolNames[call.Function.Name]; !ok {
return false
}
}
return true
}
func (p *LFM2Parser) parseEvents() []lfm2Event {
var all []lfm2Event
@@ -269,36 +302,16 @@ func (p *LFM2Parser) eat() ([]lfm2Event, bool) {
return events, false
}
// parseToolCallsContent parses one or more tool calls from content
// Supports JSON format and Python-style format including multiple calls: [func1(...),func2(...)]
// parseToolCallsContent parses one or more Python-style tool calls.
// Example: [func1(arg='v'), func2(x=1)]
func (p *LFM2Parser) parseToolCallsContent(content string) ([]api.ToolCall, error) {
content = strings.TrimSpace(content)
// Try JSON format first: {"name": "func", "arguments": {...}}
var parsed struct {
Name string `json:"name"`
Arguments json.RawMessage `json:"arguments"`
}
// Be tolerant of malformed outputs that include wrapper tags without proper pairing.
content = strings.TrimSpace(strings.TrimPrefix(content, lfm2ToolCallStartTag))
content = strings.TrimSpace(strings.TrimSuffix(content, lfm2ToolCallEndTag))
if err := json.Unmarshal([]byte(content), &parsed); err == nil && parsed.Name != "" {
var args api.ToolCallFunctionArguments
if len(parsed.Arguments) > 0 {
if err := json.Unmarshal(parsed.Arguments, &args); err != nil {
return nil, err
}
} else {
args = api.NewToolCallFunctionArguments()
}
return []api.ToolCall{{
Function: api.ToolCallFunction{
Name: parsed.Name,
Arguments: args,
},
}}, nil
}
// Try Python-style format: [func(arg1='val1'),func2(arg2='val2')] or func(arg1='val1')
// Parse Python-style format: [func(arg1='val1'),func2(arg2='val2')] or func(arg1='val1')
return p.parsePythonStyleToolCalls(content)
}
@@ -417,21 +430,16 @@ func (p *LFM2Parser) parseToolCallContent(content string) (api.ToolCall, error)
// parsePythonArgs parses Python-style keyword arguments: key='value', key2="value2"
func parsePythonArgs(argsStr string, args *api.ToolCallFunctionArguments) error {
// Simple state machine to parse key='value' pairs
// Handles: command='ls', flag="-la", count=42, enabled=true
var key string
i := 0
for i < len(argsStr) {
// Skip whitespace
for i < len(argsStr) && (argsStr[i] == ' ' || argsStr[i] == '\t' || argsStr[i] == '\n') {
// Skip separators and whitespace.
for i < len(argsStr) && (argsStr[i] == ',' || unicode.IsSpace(rune(argsStr[i]))) {
i++
}
if i >= len(argsStr) {
break
}
// Parse key
keyStart := i
for i < len(argsStr) && argsStr[i] != '=' && argsStr[i] != ',' {
i++
@@ -439,60 +447,238 @@ func parsePythonArgs(argsStr string, args *api.ToolCallFunctionArguments) error
if i >= len(argsStr) || argsStr[i] != '=' {
return errors.New("invalid argument: expected '='")
}
key = strings.TrimSpace(argsStr[keyStart:i])
key := strings.TrimSpace(argsStr[keyStart:i])
if key == "" {
return errors.New("invalid argument: empty key")
}
i++ // skip '='
// Skip whitespace after =
for i < len(argsStr) && (argsStr[i] == ' ' || argsStr[i] == '\t') {
for i < len(argsStr) && unicode.IsSpace(rune(argsStr[i])) {
i++
}
// Parse value
var value string
if i < len(argsStr) && (argsStr[i] == '\'' || argsStr[i] == '"') {
// Quoted string
quote := argsStr[i]
i++
valueStart := i
for i < len(argsStr) && argsStr[i] != quote {
if argsStr[i] == '\\' && i+1 < len(argsStr) {
i += 2 // skip escaped char
} else {
i++
}
}
value = argsStr[valueStart:i]
if i < len(argsStr) {
i++ // skip closing quote
}
args.Set(key, value)
} else {
// Unquoted value (number, bool, etc)
valueStart := i
for i < len(argsStr) && argsStr[i] != ',' {
i++
}
value = strings.TrimSpace(argsStr[valueStart:i])
// Try to parse as number or bool
if v, err := strconv.ParseInt(value, 10, 64); err == nil {
args.Set(key, v)
} else if v, err := strconv.ParseFloat(value, 64); err == nil {
args.Set(key, v)
} else if value == "true" {
args.Set(key, true)
} else if value == "false" {
args.Set(key, false)
} else {
args.Set(key, value)
}
if i >= len(argsStr) {
return errors.New("invalid argument: missing value")
}
// Skip comma and whitespace
for i < len(argsStr) && (argsStr[i] == ',' || argsStr[i] == ' ' || argsStr[i] == '\t' || argsStr[i] == '\n') {
value, next, err := parsePythonArgValue(argsStr, i)
if err != nil {
return err
}
args.Set(key, value)
i = next
// Optional trailing comma before next key/value.
if i < len(argsStr) && argsStr[i] == ',' {
i++
}
}
return nil
}
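parsePythonArgs above walks the string with an index-based state machine. A reduced, self-contained sketch of the same skip-separator / read-key / read-value loop (hypothetical `parseKwargs` helper; quoted strings and ints only, no escape handling):

```go
package main

import (
	"fmt"
	"strconv"
	"strings"
)

// parseKwargs parses inputs like "command='ls, -la', count=42".
// Commas inside quoted values are preserved; bare values are tried as ints.
func parseKwargs(s string) map[string]any {
	args := map[string]any{}
	i := 0
	for i < len(s) {
		// Skip separators and whitespace between pairs.
		for i < len(s) && (s[i] == ',' || s[i] == ' ') {
			i++
		}
		if i >= len(s) {
			break
		}
		eq := strings.IndexByte(s[i:], '=')
		if eq < 0 {
			break
		}
		key := strings.TrimSpace(s[i : i+eq])
		i += eq + 1
		if i < len(s) && (s[i] == '\'' || s[i] == '"') {
			// Quoted string value: consume to the matching quote.
			quote := s[i]
			i++
			start := i
			for i < len(s) && s[i] != quote {
				i++
			}
			args[key] = s[start:i]
			i++ // skip closing quote
		} else {
			// Bare value: consume to the next comma.
			start := i
			for i < len(s) && s[i] != ',' {
				i++
			}
			tok := strings.TrimSpace(s[start:i])
			if n, err := strconv.ParseInt(tok, 10, 64); err == nil {
				args[key] = n
			} else {
				args[key] = tok
			}
		}
	}
	return args
}

func main() {
	got := parseKwargs("command='ls, -la', count=42")
	fmt.Println(got["command"], got["count"]) // ls, -la 42
}
```

The production parser layers nesting-aware scanning and richer literals on top of this same loop.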
func parsePythonArgValue(s string, i int) (any, int, error) {
if i >= len(s) {
return nil, i, errors.New("invalid argument: missing value")
}
// Quoted string literal.
if s[i] == '\'' || s[i] == '"' {
quote := s[i]
i++
start := i
for i < len(s) {
if s[i] == '\\' && i+1 < len(s) {
i += 2
continue
}
if s[i] == quote {
value := s[start:i]
i++
return value, i, nil
}
i++
}
return nil, i, errors.New("invalid argument: unterminated string")
}
// Unquoted literal. Consume until top-level comma.
start := i
depthParen, depthSquare, depthCurly := 0, 0, 0
inString := false
var quote byte
escaped := false
for i < len(s) {
ch := s[i]
if inString {
if escaped {
escaped = false
} else if ch == '\\' {
escaped = true
} else if ch == quote {
inString = false
}
i++
continue
}
switch ch {
case '\'', '"':
inString = true
quote = ch
case '(':
depthParen++
case ')':
if depthParen > 0 {
depthParen--
}
case '[':
depthSquare++
case ']':
if depthSquare > 0 {
depthSquare--
}
case '{':
depthCurly++
case '}':
if depthCurly > 0 {
depthCurly--
}
case ',':
if depthParen == 0 && depthSquare == 0 && depthCurly == 0 {
token := strings.TrimSpace(s[start:i])
value, err := parsePythonLiteral(token)
return value, i, err
}
}
i++
}
token := strings.TrimSpace(s[start:i])
value, err := parsePythonLiteral(token)
return value, i, err
}
func parsePythonLiteral(token string) (any, error) {
switch token {
case "":
return "", nil
case "true", "True":
return true, nil
case "false", "False":
return false, nil
case "null", "None":
return nil, nil
}
if v, err := strconv.ParseInt(token, 10, 64); err == nil {
return v, nil
}
if v, err := strconv.ParseFloat(token, 64); err == nil {
return v, nil
}
if strings.HasPrefix(token, "[") || strings.HasPrefix(token, "{") {
var parsed any
if err := json.Unmarshal([]byte(token), &parsed); err == nil {
return parsed, nil
}
if converted, err := pythonLiteralToJSON(token); err == nil {
if err := json.Unmarshal([]byte(converted), &parsed); err == nil {
return parsed, nil
}
}
}
return token, nil
}
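parsePythonLiteral above resolves a bare token by precedence: named constants first, then integer, then float, then containers, falling back to the raw string. A stripped-down sketch of that precedence (containers omitted, hypothetical `classify` helper):

```go
package main

import (
	"fmt"
	"strconv"
)

// classify mirrors the literal precedence: named constants, then int,
// then float, then the raw string as a fallback.
func classify(tok string) any {
	switch tok {
	case "True", "true":
		return true
	case "False", "false":
		return false
	case "None", "null":
		return nil
	}
	if v, err := strconv.ParseInt(tok, 10, 64); err == nil {
		return v
	}
	if v, err := strconv.ParseFloat(tok, 64); err == nil {
		return v
	}
	return tok
}

func main() {
	fmt.Println(classify("True"), classify("42"), classify("0.95"), classify("ls"))
	// true 42 0.95 ls
}
```

Trying ParseInt before ParseFloat matters: "42" should surface as an integer, not as 42.0.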
func pythonLiteralToJSON(s string) (string, error) {
var out strings.Builder
out.Grow(len(s) + len(s)/8)
inString := false
var quote byte
escaped := false
for i := 0; i < len(s); i++ {
ch := s[i]
if inString {
if escaped {
out.WriteByte(ch)
escaped = false
continue
}
if ch == '\\' {
out.WriteByte(ch)
escaped = true
continue
}
if ch == quote {
out.WriteByte('"')
inString = false
continue
}
if quote == '\'' && ch == '"' {
out.WriteString(`\"`)
continue
}
out.WriteByte(ch)
continue
}
if ch == '\'' || ch == '"' {
inString = true
quote = ch
escaped = false
out.WriteByte('"')
continue
}
// Replace Python identifiers with JSON equivalents when outside strings.
if isIdentStart(ch) {
j := i + 1
for j < len(s) && isIdentPart(s[j]) {
j++
}
ident := s[i:j]
switch ident {
case "True":
out.WriteString("true")
case "False":
out.WriteString("false")
case "None":
out.WriteString("null")
default:
out.WriteString(ident)
}
i = j - 1
continue
}
out.WriteByte(ch)
}
if inString {
return "", errors.New("unterminated string")
}
return out.String(), nil
}
func isIdentStart(b byte) bool {
return (b >= 'A' && b <= 'Z') || (b >= 'a' && b <= 'z') || b == '_'
}
func isIdentPart(b byte) bool {
return isIdentStart(b) || (b >= '0' && b <= '9')
}
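
The helpers above normalize Python-style literals (`True`, `False`, `None`, single-quoted strings) into JSON before handing the result to `encoding/json`. A minimal, self-contained sketch of the same idea — simplified relative to the full converter above in that it ignores escape sequences and embedded opposite quotes:

```go
package main

import "fmt"

// pyToJSON is a simplified sketch: it converts single-quoted strings to
// double-quoted ones and rewrites the bare identifiers True/False/None to
// their JSON forms, leaving identifiers inside strings untouched. Unlike
// the full converter above, it does not handle escape sequences.
func pyToJSON(s string) string {
	out := make([]byte, 0, len(s))
	inString := false
	var quote byte
	for i := 0; i < len(s); i++ {
		ch := s[i]
		if inString {
			if ch == quote {
				out = append(out, '"')
				inString = false
			} else {
				out = append(out, ch)
			}
			continue
		}
		if ch == '\'' || ch == '"' {
			inString = true
			quote = ch
			out = append(out, '"')
			continue
		}
		if isStart(ch) {
			j := i + 1
			for j < len(s) && isPart(s[j]) {
				j++
			}
			switch ident := s[i:j]; ident {
			case "True":
				out = append(out, "true"...)
			case "False":
				out = append(out, "false"...)
			case "None":
				out = append(out, "null"...)
			default:
				out = append(out, ident...)
			}
			i = j - 1
			continue
		}
		out = append(out, ch)
	}
	return string(out)
}

func isStart(b byte) bool {
	return b == '_' || (b >= 'A' && b <= 'Z') || (b >= 'a' && b <= 'z')
}

func isPart(b byte) bool { return isStart(b) || (b >= '0' && b <= '9') }

func main() {
	// "None" inside the string stays as-is; the bare True/None are rewritten.
	fmt.Println(pyToJSON("{'enabled': True, 'name': 'None of the above', 'x': None}"))
}
```

Note the string-aware scan: a naive `strings.Replacer` would also rewrite `None` inside `'None of the above'`, which is why both the sketch and the full converter track quoting state.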


@@ -39,7 +39,7 @@ func TestLFM2Parser(t *testing.T) {
},
{
name: "tool_call_simple",
input: "I'll check the weather.<|tool_call_start|>{\"name\":\"get_weather\",\"arguments\":{\"location\":\"Paris\"}}<|tool_call_end|>",
input: "I'll check the weather.<|tool_call_start|>[get_weather(location=\"Paris\")]<|tool_call_end|>",
expectedContent: "I'll check the weather.",
expectedCalls: []api.ToolCall{
{
@@ -55,7 +55,7 @@ func TestLFM2Parser(t *testing.T) {
},
{
name: "multiple_tool_calls",
input: "Getting weather for both cities.<|tool_call_start|>{\"name\":\"get_weather\",\"arguments\":{\"location\":\"Paris\"}}<|tool_call_end|><|tool_call_start|>{\"name\":\"get_weather\",\"arguments\":{\"location\":\"London\"}}<|tool_call_end|>",
input: "Getting weather for both cities.<|tool_call_start|>[get_weather(location=\"Paris\")]<|tool_call_end|><|tool_call_start|>[get_weather(location=\"London\")]<|tool_call_end|>",
expectedContent: "Getting weather for both cities.",
expectedCalls: []api.ToolCall{
{
@@ -79,7 +79,7 @@ func TestLFM2Parser(t *testing.T) {
},
{
name: "complex_tool_arguments",
input: "Processing data.<|tool_call_start|>{\"name\":\"process_data\",\"arguments\":{\"items\":[\"item1\",\"item2\"],\"config\":{\"enabled\":true,\"threshold\":0.95}}}<|tool_call_end|>",
input: "Processing data.<|tool_call_start|>[process_data(items=['item1','item2'], config={'enabled': True, 'threshold': 0.95})]<|tool_call_end|>",
expectedContent: "Processing data.",
expectedCalls: []api.ToolCall{
{
@@ -96,7 +96,7 @@ func TestLFM2Parser(t *testing.T) {
},
{
name: "thinking_with_tool_call",
input: "Let me check the weather...</think>I'll get that for you.<|tool_call_start|>{\"name\":\"get_weather\",\"arguments\":{\"location\":\"Paris\"}}<|tool_call_end|>",
input: "Let me check the weather...</think>I'll get that for you.<|tool_call_start|>[get_weather(location=\"Paris\")]<|tool_call_end|>",
expectedThinking: "Let me check the weather...",
expectedContent: "I'll get that for you.",
expectedCalls: []api.ToolCall{
@@ -144,16 +144,16 @@ func TestLFM2Parser(t *testing.T) {
hasThinking: true,
},
{
name: "tool_call_with_unicode_args",
input: "Searching for information.<|tool_call_start|>{\"name\":\"search\",\"arguments\":{\"query\":\"北京天气\",\"language\":\"中文\"}}<|tool_call_end|>",
name: "tool_call_with_text_args",
input: "Searching for information.<|tool_call_start|>[search(query='beijing weather', language='zh')]<|tool_call_end|>",
expectedContent: "Searching for information.",
expectedCalls: []api.ToolCall{
{
Function: api.ToolCallFunction{
Name: "search",
Arguments: testArgs(map[string]any{
"query": "北京天气",
"language": "中文",
"query": "beijing weather",
"language": "zh",
}),
},
},
@@ -169,7 +169,7 @@ func TestLFM2Parser(t *testing.T) {
},
{
name: "empty_tool_call_args",
input: "Pinging server.<|tool_call_start|>{\"name\":\"ping\",\"arguments\":{}}<|tool_call_end|>",
input: "Pinging server.<|tool_call_start|>[ping()]<|tool_call_end|>",
expectedContent: "Pinging server.",
expectedCalls: []api.ToolCall{
{
@@ -353,7 +353,7 @@ func TestLFM2Parser_Streaming(t *testing.T) {
},
{
name: "streaming_tool_call",
chunks: []string{"I'll check weather.", "<|tool_call_start|>", "{\"name\":\"get_weather\",", "\"arguments\":{\"location\":\"Paris\"}}", "<|tool_call_end|>"},
chunks: []string{"I'll check weather.", "<|tool_call_start|>", "[get_weather(", "location=\"Paris\")]", "<|tool_call_end|>"},
expectedContent: "I'll check weather.",
expectedCalls: []api.ToolCall{
{
@@ -381,16 +381,16 @@ func TestLFM2Parser_Streaming(t *testing.T) {
hasThinking: false,
},
{
name: "streaming_tool_call_with_split_json",
chunks: []string{"Processing.", "<|tool_call_start|>{\"name\":\"calc\",\"arguments\":{\"x\":", "42,\"y\":", "24}}<|tool_call_end|>"},
name: "streaming_tool_call_with_split_python",
chunks: []string{"Processing.", "<|tool_call_start|>", "[calc(", "x=42, ", "y=24)]", "<|tool_call_end|>"},
expectedContent: "Processing.",
expectedCalls: []api.ToolCall{
{
Function: api.ToolCallFunction{
Name: "calc",
Arguments: testArgs(map[string]any{
"x": float64(42),
"y": float64(24),
"x": int64(42),
"y": int64(24),
}),
},
},
@@ -516,8 +516,8 @@ func TestLFM2Parser_parseToolCallContent(t *testing.T) {
expectError bool
}{
{
name: "valid_tool_call",
content: `{"name":"get_weather","arguments":{"location":"Paris"}}`,
name: "python_style_single_call",
content: `get_weather(location="Paris")`,
expected: api.ToolCall{
Function: api.ToolCallFunction{
Name: "get_weather",
@@ -528,21 +528,33 @@ func TestLFM2Parser_parseToolCallContent(t *testing.T) {
},
},
{
name: "complex_arguments",
content: `{"name":"process_data","arguments":{"items":["a","b"],"config":{"enabled":true}}}`,
name: "python_style_with_brackets",
content: `[get_weather(location="Paris")]`,
expected: api.ToolCall{
Function: api.ToolCallFunction{
Name: "process_data",
Name: "get_weather",
Arguments: testArgs(map[string]any{
"items": []interface{}{"a", "b"},
"config": map[string]interface{}{"enabled": true},
"location": "Paris",
}),
},
},
},
{
name: "empty_arguments",
content: `{"name":"ping","arguments":{}}`,
name: "python_style_complex_arguments",
content: `process(items=['a', 'b'], config={'enabled': True})`,
expected: api.ToolCall{
Function: api.ToolCallFunction{
Name: "process",
Arguments: testArgs(map[string]any{
"items": []any{"a", "b"},
"config": map[string]any{"enabled": true},
}),
},
},
},
{
name: "python_style_empty_arguments",
content: `ping()`,
expected: api.ToolCall{
Function: api.ToolCallFunction{
Name: "ping",
@@ -551,44 +563,13 @@ func TestLFM2Parser_parseToolCallContent(t *testing.T) {
},
},
{
name: "unicode_in_tool_name",
content: `{"name":"获取天气","arguments":{"城市":"北京"}}`,
expected: api.ToolCall{
Function: api.ToolCallFunction{
Name: "获取天气",
Arguments: testArgs(map[string]any{
"城市": "北京",
}),
},
},
},
{
name: "numeric_arguments",
content: `{"name":"calculate","arguments":{"x":3.14,"y":42,"enabled":true}}`,
expected: api.ToolCall{
Function: api.ToolCallFunction{
Name: "calculate",
Arguments: testArgs(map[string]any{
"x": 3.14,
"y": float64(42),
"enabled": true,
}),
},
},
},
{
name: "invalid_json",
content: `{invalid json}`,
name: "missing_parenthesis",
content: `get_weather location="Paris")`,
expectError: true,
},
{
name: "missing_name",
content: `{"arguments":{"arg":"value"}}`,
expectError: true,
},
{
name: "empty_name",
content: `{"name":"","arguments":{"arg":"value"}}`,
name: "invalid_argument_format",
content: `bash(command)`,
expectError: true,
},
}
@@ -645,6 +626,24 @@ func TestLFM2Parser_parseToolCallsContent(t *testing.T) {
},
},
},
{
name: "python_style_complex_literals",
content: `[AskUserQuestion(question="What's up?", headers=['Hello!', 'How can I help you?'], options=['Debugging help', 'Code writing assistance'], multiSelect=False, metadata={'priority': 1, 'active': True})]`,
expected: []api.ToolCall{
{
Function: api.ToolCallFunction{
Name: "AskUserQuestion",
Arguments: testArgs(map[string]any{
"question": "What's up?",
"headers": []any{"Hello!", "How can I help you?"},
"options": []any{"Debugging help", "Code writing assistance"},
"multiSelect": false,
"metadata": map[string]any{"priority": float64(1), "active": true},
}),
},
},
},
},
{
name: "single_python_style_call",
content: `bash(command='ls -la')`,
@@ -673,6 +672,34 @@ func TestLFM2Parser_parseToolCallsContent(t *testing.T) {
},
},
},
{
name: "single_call_with_orphan_end_tag",
content: `[bash(command='ls')]<|tool_call_end|>`,
expected: []api.ToolCall{
{
Function: api.ToolCallFunction{
Name: "bash",
Arguments: testArgs(map[string]any{
"command": "ls",
}),
},
},
},
},
{
name: "single_call_with_wrapper_tags",
content: `<|tool_call_start|>[bash(command='pwd')]<|tool_call_end|>`,
expected: []api.ToolCall{
{
Function: api.ToolCallFunction{
Name: "bash",
Arguments: testArgs(map[string]any{
"command": "pwd",
}),
},
},
},
},
{
name: "multiple_different_functions",
content: `[get_weather(location='Paris'),search(query='news')]`,
@@ -1086,3 +1113,106 @@ func TestLFM2Parser_EdgeCases(t *testing.T) {
})
}
}
func TestLFM2Parser_BareToolCallFallback(t *testing.T) {
parser := &LFM2Parser{}
tools := []api.Tool{
{
Type: "function",
Function: api.ToolFunction{
Name: "get_weather",
},
},
}
parser.Init(tools, nil, &api.ThinkValue{Value: false})
content, thinking, calls, err := parser.Add(`[get_weather(location="Paris")]`, true)
if err != nil {
t.Fatalf("Add() error = %v", err)
}
if content != "" {
t.Fatalf("expected empty content, got %q", content)
}
if thinking != "" {
t.Fatalf("expected empty thinking, got %q", thinking)
}
if len(calls) != 1 {
t.Fatalf("expected 1 tool call, got %d", len(calls))
}
if calls[0].Function.Name != "get_weather" {
t.Fatalf("expected tool name get_weather, got %q", calls[0].Function.Name)
}
}
func TestLFM2Parser_BareUnknownToolCallDoesNotParse(t *testing.T) {
parser := &LFM2Parser{}
tools := []api.Tool{
{
Type: "function",
Function: api.ToolFunction{
Name: "get_weather",
},
},
}
parser.Init(tools, nil, &api.ThinkValue{Value: false})
input := `[unknown_tool(location="Paris")]`
content, _, calls, err := parser.Add(input, true)
if err != nil {
t.Fatalf("Add() error = %v", err)
}
if content != input {
t.Fatalf("expected content to be preserved, got %q", content)
}
if len(calls) != 0 {
t.Fatalf("expected no tool calls, got %d", len(calls))
}
}
func TestLFM2Parser_ImagePlaceholdersPreserved(t *testing.T) {
tests := []struct {
name string
input string
}{
{
name: "indexed_img_placeholder",
input: "[img-0]describe this image",
},
{
name: "template_image_placeholder",
input: "<image>describe this image",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
parser := &LFM2Parser{}
tools := []api.Tool{
{
Type: "function",
Function: api.ToolFunction{
Name: "bash",
},
},
}
parser.Init(tools, nil, &api.ThinkValue{Value: false})
content, thinking, calls, err := parser.Add(tt.input, true)
if err != nil {
t.Fatalf("Add() error = %v", err)
}
if content != tt.input {
t.Fatalf("expected content %q, got %q", tt.input, content)
}
if thinking != "" {
t.Fatalf("expected empty thinking, got %q", thinking)
}
if len(calls) != 0 {
t.Fatalf("expected no tool calls, got %d", len(calls))
}
})
}
}


@@ -49,6 +49,8 @@ func ParserForName(name string) Parser {
p = &Qwen3Parser{hasThinkingSupport: false, defaultThinking: false}
case "qwen3-thinking":
p = &Qwen3Parser{hasThinkingSupport: true, defaultThinking: true}
case "qwen3.5":
p = &Qwen3Parser{hasThinkingSupport: true, defaultThinking: true}
case "qwen3-coder":
p = &Qwen3CoderParser{}
case "qwen3-vl-instruct":


@@ -57,6 +57,9 @@ func TestBuiltInParsersStillWork(t *testing.T) {
{"qwen3"},
{"qwen3-thinking"},
{"qwen3-coder"},
{"lfm2"},
{"lfm2-thinking"},
{"qwen3.5"},
{"harmony"},
}


@@ -204,6 +204,24 @@ func (p *Qwen3Parser) eat() ([]qwen3Event, bool) {
p.maybeThinkingOpenAtBOL = false
}
thinkingCloseIdx := strings.Index(acc, qwen3ThinkingCloseTag)
toolOpenIdx := strings.Index(acc, qwen3ToolOpenTag)
// If a tool call starts before </think>, treat that as the end of thinking
// for parsing purposes and continue in tool-call mode.
if toolOpenIdx != -1 && (thinkingCloseIdx == -1 || toolOpenIdx < thinkingCloseIdx) {
before, after := p.splitAtTag(qwen3ToolOpenTag, true)
if len(before) > 0 {
events = append(events, qwen3EventThinkingContent{content: before})
}
if after == "" {
p.state = qwen3ParserStateToolStartedEatingWhitespace
} else {
p.state = qwen3ParserStateCollectingToolContent
}
return events, true
}
if strings.Contains(acc, qwen3ThinkingCloseTag) {
thinking, remaining := p.splitAtTag(qwen3ThinkingCloseTag, true)
if len(thinking) > 0 {
@@ -215,7 +233,7 @@ func (p *Qwen3Parser) eat() ([]qwen3Event, bool) {
p.state = qwen3ParserStateCollectingContent
}
return events, true
} else if overlapLen := overlap(acc, qwen3ThinkingCloseTag); overlapLen > 0 {
} else if overlapLen := max(overlap(acc, qwen3ThinkingCloseTag), overlap(acc, qwen3ToolOpenTag)); overlapLen > 0 {
beforePartialTag := acc[:len(acc)-overlapLen]
trailingWsLen := trailingWhitespaceLen(beforePartialTag)
ambiguousStart := len(beforePartialTag) - trailingWsLen


@@ -145,3 +145,88 @@ func TestQwen3ParserToolCall(t *testing.T) {
t.Fatalf("expected unit %q, got %v", "celsius", unit)
}
}
func TestQwen3ParserThinkingWithToolCallBeforeThinkingClose(t *testing.T) {
parser := &Qwen3Parser{hasThinkingSupport: true, defaultThinking: true}
parser.Init(nil, nil, &api.ThinkValue{Value: true})
input := "Let me think<tool_call>{\"name\":\"get_weather\",\"arguments\":{\"location\":\"San Francisco\",\"unit\":\"celsius\"}}</tool_call>"
content, thinking, calls, err := parser.Add(input, true)
if err != nil {
t.Fatalf("parse failed: %v", err)
}
if content != "" {
t.Fatalf("expected empty content, got %q", content)
}
if thinking != "Let me think" {
t.Fatalf("expected thinking %q, got %q", "Let me think", thinking)
}
if len(calls) != 1 {
t.Fatalf("expected 1 tool call, got %d", len(calls))
}
if calls[0].Function.Name != "get_weather" {
t.Fatalf("expected tool name %q, got %q", "get_weather", calls[0].Function.Name)
}
}
func TestQwen3ParserThinkingWithSplitToolOpenTag(t *testing.T) {
parser := &Qwen3Parser{hasThinkingSupport: true, defaultThinking: true}
parser.Init(nil, nil, &api.ThinkValue{Value: true})
content, thinking, calls, err := parser.Add("Let me think<tool_ca", false)
if err != nil {
t.Fatalf("parse failed on first chunk: %v", err)
}
if content != "" || thinking != "Let me think" || len(calls) != 0 {
t.Fatalf(
"expected content=%q thinking=%q calls=%d, got content=%q thinking=%q calls=%d",
"",
"Let me think",
0,
content,
thinking,
len(calls),
)
}
content, thinking, calls, err = parser.Add("ll>{\"name\":\"get_weather\",\"arguments\":{\"location\":\"SF\"}}</tool_call>", true)
if err != nil {
t.Fatalf("parse failed on second chunk: %v", err)
}
if content != "" {
t.Fatalf("expected empty content, got %q", content)
}
if thinking != "" {
t.Fatalf("expected no additional thinking on second chunk, got %q", thinking)
}
if len(calls) != 1 {
t.Fatalf("expected 1 tool call, got %d", len(calls))
}
if calls[0].Function.Name != "get_weather" {
t.Fatalf("expected tool name %q, got %q", "get_weather", calls[0].Function.Name)
}
}
func TestQwen35ParserRespectsNoThink(t *testing.T) {
parser := ParserForName("qwen3.5")
if parser == nil {
t.Fatal("expected qwen3.5 parser")
}
parser.Init(nil, nil, &api.ThinkValue{Value: false})
content, thinking, calls, err := parser.Add("Hello! How can I help you today?", true)
if err != nil {
t.Fatalf("parse failed: %v", err)
}
if thinking != "" {
t.Fatalf("expected no thinking, got %q", thinking)
}
if content != "Hello! How can I help you today?" {
t.Fatalf("expected content %q, got %q", "Hello! How can I help you today?", content)
}
if len(calls) != 0 {
t.Fatalf("expected no tool calls, got %d", len(calls))
}
}


@@ -180,7 +180,22 @@ func (p *Qwen3VLParser) eat() ([]qwenEvent, bool) {
return events, false
}
case CollectingThinkingContent:
if strings.Contains(p.buffer.String(), thinkingCloseTag) {
acc := p.buffer.String()
thinkingCloseIdx := strings.Index(acc, thinkingCloseTag)
toolOpenIdx := strings.Index(acc, toolOpenTag)
// If a tool call starts before </think>, treat that as the end of thinking
// for parsing purposes and continue in tool-call mode.
if toolOpenIdx != -1 && (thinkingCloseIdx == -1 || toolOpenIdx < thinkingCloseIdx) {
before, _ := splitAtTag(&p.buffer, toolOpenTag, false)
if len(before) > 0 {
events = append(events, qwenEventThinkingContent{content: before})
}
p.state = CollectingToolContent
return events, true
}
if strings.Contains(acc, thinkingCloseTag) {
thinking, remaining := splitAtTag(&p.buffer, thinkingCloseTag, true)
if len(thinking) > 0 {
events = append(events, qwenEventThinkingContent{content: thinking})
@@ -191,13 +206,13 @@ func (p *Qwen3VLParser) eat() ([]qwenEvent, bool) {
p.state = CollectingContent
}
return events, true
} else if overlapLen := overlap(p.buffer.String(), thinkingCloseTag); overlapLen > 0 {
beforePartialTag := p.buffer.String()[:len(p.buffer.String())-overlapLen]
} else if overlapLen := max(overlap(acc, thinkingCloseTag), overlap(acc, toolOpenTag)); overlapLen > 0 {
beforePartialTag := acc[:len(acc)-overlapLen]
trailingWhitespaceLen := trailingWhitespaceLen(beforePartialTag)
ambiguousStart := len(beforePartialTag) - trailingWhitespaceLen
unambiguous := p.buffer.String()[:ambiguousStart]
ambiguous := p.buffer.String()[ambiguousStart:]
unambiguous := acc[:ambiguousStart]
ambiguous := acc[ambiguousStart:]
p.buffer.Reset()
p.buffer.WriteString(ambiguous)
if len(unambiguous) > 0 {
@@ -205,11 +220,11 @@ func (p *Qwen3VLParser) eat() ([]qwenEvent, bool) {
}
return events, false
} else {
whitespaceLen := trailingWhitespaceLen(p.buffer.String())
ambiguousStart := len(p.buffer.String()) - whitespaceLen
whitespaceLen := trailingWhitespaceLen(acc)
ambiguousStart := len(acc) - whitespaceLen
unambiguous := p.buffer.String()[:ambiguousStart]
ambiguous := p.buffer.String()[ambiguousStart:]
unambiguous := acc[:ambiguousStart]
ambiguous := acc[ambiguousStart:]
p.buffer.Reset()
p.buffer.WriteString(ambiguous)
if len(unambiguous) > 0 {


@@ -98,8 +98,12 @@ func TestQwen3VLThinkingParserStreaming(t *testing.T) {
desc: "nested thinking and tool call (outside thinking, inside tool call)",
steps: []step{
{
input: "I'm thinking<tool_call>I'm nested tool call</tool_call></think>",
wantEvents: []qwenEvent{qwenEventThinkingContent{content: "I'm thinking<tool_call>I'm nested tool call</tool_call>"}},
input: "I'm thinking<tool_call>I'm nested tool call</tool_call></think>",
wantEvents: []qwenEvent{
qwenEventThinkingContent{content: "I'm thinking"},
qwenEventRawToolCall{raw: "I'm nested tool call"},
qwenEventContent{content: "</think>"},
},
},
},
},
@@ -109,8 +113,7 @@ func TestQwen3VLThinkingParserStreaming(t *testing.T) {
{
input: "<tool_call>I'm nested tool call<think>I'm thinking</think></tool_call>",
wantEvents: []qwenEvent{
qwenEventThinkingContent{content: "<tool_call>I'm nested tool call<think>I'm thinking"},
qwenEventContent{content: "</tool_call>"},
qwenEventRawToolCall{raw: "I'm nested tool call<think>I'm thinking</think>"},
},
},
},
@@ -121,8 +124,8 @@ func TestQwen3VLThinkingParserStreaming(t *testing.T) {
{
input: "I'm thinking<tool_call>I'm NOT a nested tool call</think></tool_call><tool_call>I'm nested tool call 2<think></tool_call></think>",
wantEvents: []qwenEvent{
qwenEventThinkingContent{content: "I'm thinking<tool_call>I'm NOT a nested tool call"},
qwenEventContent{content: "</tool_call>"},
qwenEventThinkingContent{content: "I'm thinking"},
qwenEventRawToolCall{raw: "I'm NOT a nested tool call</think>"},
qwenEventRawToolCall{raw: "I'm nested tool call 2<think>"},
qwenEventContent{content: "</think>"},
},


@@ -1,7 +1,10 @@
package renderers
import (
"bytes"
"encoding/json"
"fmt"
"sort"
"strings"
"github.com/ollama/ollama/api"
@@ -9,18 +12,222 @@ import (
type LFM2Renderer struct {
IsThinking bool
useImgTags bool
}
const lfm2BOSToken = "<|startoftext|>"
const (
lfm2ToolListStartTag = "<|tool_list_start|>"
lfm2ToolListEndTag = "<|tool_list_end|>"
lfm2ToolCallStartTag = "<|tool_call_start|>"
lfm2ToolCallEndTag = "<|tool_call_end|>"
lfm2ToolResponseStartTag = "<|tool_response_start|>"
lfm2ToolResponseEndTag = "<|tool_response_end|>"
)
func lfm2RenderSystemContent(content any) string {
switch v := content.(type) {
case string:
return v
case []any:
var sb strings.Builder
for _, item := range v {
obj, ok := item.(map[string]any)
if !ok {
continue
}
if itemType, _ := obj["type"].(string); itemType == "text" {
if text, ok := obj["text"].(string); ok {
sb.WriteString(text)
}
}
}
return sb.String()
default:
return ""
}
}
func lfm2JSON(v any) string {
var buf bytes.Buffer
enc := json.NewEncoder(&buf)
enc.SetEscapeHTML(false)
if err := enc.Encode(v); err != nil {
fallback, _ := json.Marshal(v)
return string(fallback)
}
encoded := bytes.TrimSuffix(buf.Bytes(), []byte{'\n'})
// HF `tojson` defaults to `json.dumps(..., separators=None)`, which inserts
// a space after commas and colons.
var out strings.Builder
out.Grow(len(encoded) + len(encoded)/8)
inString := false
escaped := false
for i, b := range encoded {
out.WriteByte(b)
if inString {
if escaped {
escaped = false
continue
}
if b == '\\' {
escaped = true
continue
}
if b == '"' {
inString = false
}
continue
}
if b == '"' {
inString = true
continue
}
if (b == ':' || b == ',') && i+1 < len(encoded) {
next := encoded[i+1]
if next != ' ' && next != '\n' && next != '\r' && next != '\t' {
out.WriteByte(' ')
}
}
}
return out.String()
}
func lfm2ImagePlaceholder(useImgTags bool) string {
if useImgTags {
return "[img]"
}
return "<image>"
}
func lfm2RenderContent(content any, useImgTags bool) string {
switch v := content.(type) {
case string:
return v
case []any:
var sb strings.Builder
for _, item := range v {
obj, ok := item.(map[string]any)
if !ok {
sb.WriteString(lfm2JSON(item))
continue
}
itemType, _ := obj["type"].(string)
switch itemType {
case "image":
sb.WriteString(lfm2ImagePlaceholder(useImgTags))
case "text":
if text, ok := obj["text"].(string); ok {
sb.WriteString(text)
} else {
sb.WriteString(lfm2JSON(item))
}
default:
sb.WriteString(lfm2JSON(item))
}
}
return sb.String()
default:
return lfm2JSON(content)
}
}
func lfm2ToolSchema(tool api.Tool) any {
if tool.Function.Name == "" {
return tool
}
// LFM2 templates are typically fed function-schema objects (name/description/parameters).
return tool.Function
}
func lfm2ToolCallArgument(v any) string {
return lfm2JSON(v)
}
func lfm2RenderToolCalls(calls []api.ToolCall) string {
var sb strings.Builder
sb.WriteString(lfm2ToolCallStartTag)
sb.WriteString("[")
for i, tc := range calls {
if i > 0 {
sb.WriteString(",")
}
sb.WriteString(tc.Function.Name)
sb.WriteString("(")
keys := make([]string, 0, tc.Function.Arguments.Len())
for key := range tc.Function.Arguments.All() {
keys = append(keys, key)
}
sort.Strings(keys)
for j, key := range keys {
if j > 0 {
sb.WriteString(",")
}
value, _ := tc.Function.Arguments.Get(key)
sb.WriteString(key)
sb.WriteString("=")
sb.WriteString(lfm2ToolCallArgument(value))
}
sb.WriteString(")")
}
sb.WriteString("]")
sb.WriteString(lfm2ToolCallEndTag)
return sb.String()
}
func (r *LFM2Renderer) renderMessageContent(message api.Message, imageOffset int) string {
content := lfm2RenderContent(message.Content, r.useImgTags)
if len(message.Images) == 0 {
return content
}
var sb strings.Builder
if r.useImgTags {
for i := range message.Images {
sb.WriteString(fmt.Sprintf("[img-%d]", imageOffset+i))
}
} else {
placeholder := lfm2ImagePlaceholder(false)
if strings.Contains(content, placeholder) {
return content
}
for range message.Images {
sb.WriteString(placeholder)
}
}
sb.WriteString(content)
return sb.String()
}
func (r *LFM2Renderer) Render(messages []api.Message, tools []api.Tool, thinkValue *api.ThinkValue) (string, error) {
var sb strings.Builder
// Note: BOS token is added by the tokenizer (add_bos_token: true), not the renderer
// Follow Liquid tool-use formatting for LFM2 tool wrappers.
sb.WriteString(lfm2BOSToken)
// Extract first system message if present (to combine with tools)
var firstSystemContent string
startIdx := 0
if len(messages) > 0 && messages[0].Role == "system" {
firstSystemContent = messages[0].Content
firstSystemContent = lfm2RenderSystemContent(messages[0].Content)
startIdx = 1
}
@@ -29,18 +236,17 @@ func (r *LFM2Renderer) Render(messages []api.Message, tools []api.Tool, thinkVal
if firstSystemContent != "" {
firstSystemContent += "\n"
}
firstSystemContent += "List of tools: ["
firstSystemContent += "List of tools: "
firstSystemContent += lfm2ToolListStartTag
firstSystemContent += "["
for i, tool := range tools {
toolJSON, err := json.Marshal(tool)
if err != nil {
return "", err
}
firstSystemContent += string(toolJSON)
firstSystemContent += lfm2JSON(lfm2ToolSchema(tool))
if i < len(tools)-1 {
firstSystemContent += ", "
}
}
firstSystemContent += "]"
firstSystemContent += lfm2ToolListEndTag
}
// Output first system block if it has content
@@ -50,6 +256,8 @@ func (r *LFM2Renderer) Render(messages []api.Message, tools []api.Tool, thinkVal
sb.WriteString("<|im_end|>\n")
}
keepPastThinking := r.IsThinking && (thinkValue != nil && thinkValue.Bool())
// Find the index of the last assistant message for thinking stripping
lastAssistantIndex := -1
for i := len(messages) - 1; i >= startIdx; i-- {
@@ -59,85 +267,53 @@ func (r *LFM2Renderer) Render(messages []api.Message, tools []api.Tool, thinkVal
}
}
// Track whether we need to add generation prompt
needsGenerationPrompt := len(messages) > 0
imageOffset := 0
for i := range startIdx {
imageOffset += len(messages[i].Images)
}
for i := startIdx; i < len(messages); i++ {
message := messages[i]
switch message.Role {
case "system":
// Additional system messages (after the first) are rendered normally
sb.WriteString("<|im_start|>system\n")
sb.WriteString(message.Content)
sb.WriteString("<|im_end|>\n")
lastMessage := i == len(messages)-1
prefill := lastMessage && message.Role == "assistant"
case "user":
sb.WriteString("<|im_start|>user\n")
sb.WriteString(message.Content)
sb.WriteString("<|im_end|>\n")
needsGenerationPrompt = true
sb.WriteString("<|im_start|>")
sb.WriteString(message.Role)
sb.WriteString("\n")
case "assistant":
sb.WriteString("<|im_start|>assistant\n")
// Check if this is the last assistant message
isLastAssistant := i == lastAssistantIndex
// Process content (may need thinking stripped)
content := message.Content
// Handle thinking tags in assistant content
keepPastThinking := r.IsThinking && (thinkValue != nil && thinkValue.Bool())
if strings.Contains(content, "</think>") {
parts := strings.SplitN(content, "</think>", 2)
if len(parts) > 1 {
if !isLastAssistant && !keepPastThinking {
// Strip thinking entirely for past assistant messages
content = strings.TrimSpace(parts[1])
} else {
// Preserve thinking but trim whitespace after </think>
content = parts[0] + "</think>" + strings.TrimLeft(parts[1], " \t\n\r")
}
}
content := r.renderMessageContent(message, imageOffset)
imageOffset += len(message.Images)
if message.Role == "assistant" && !keepPastThinking && i != lastAssistantIndex {
if idx := strings.LastIndex(content, "</think>"); idx >= 0 {
content = strings.TrimSpace(content[idx+len("</think>"):])
}
if len(message.ToolCalls) > 0 {
// Assistant with tool calls - write content first (if any after stripping)
if content != "" {
sb.WriteString(content)
}
for _, toolCall := range message.ToolCalls {
sb.WriteString("<|tool_call_start|>")
toolCallJSON := map[string]any{
"name": toolCall.Function.Name,
"arguments": toolCall.Function.Arguments,
}
callJSON, _ := json.Marshal(toolCallJSON)
sb.WriteString(string(callJSON))
sb.WriteString("<|tool_call_end|>")
}
}
if message.Role == "assistant" && len(message.ToolCalls) > 0 && !strings.Contains(content, lfm2ToolCallStartTag) {
if strings.TrimSpace(content) == "" {
content = lfm2RenderToolCalls(message.ToolCalls) + content
} else {
sb.WriteString(content)
content = lfm2RenderToolCalls(message.ToolCalls) + "\n" + content
}
}
if message.Role == "tool" && !strings.Contains(content, lfm2ToolResponseStartTag) {
content = lfm2ToolResponseStartTag + content + lfm2ToolResponseEndTag
}
sb.WriteString(content)
if !prefill {
sb.WriteString("<|im_end|>\n")
needsGenerationPrompt = true // Always add gen prompt after assistant when add_generation_prompt=true
case "tool":
// Tool responses are rendered as plain messages per the chat template
sb.WriteString("<|im_start|>tool\n")
sb.WriteString(message.Content)
sb.WriteString("<|im_end|>\n")
needsGenerationPrompt = true
}
}
// Add generation prompt
needsGenerationPrompt := true
if len(messages) > 0 && messages[len(messages)-1].Role == "assistant" {
needsGenerationPrompt = false
}
if needsGenerationPrompt {
// RenderWithRenderer uses add_generation_prompt=true for chat rendering,
// unless we're prefilling a trailing assistant message.
sb.WriteString("<|im_start|>assistant\n")
// Note: Model is a "thinking-only" model - it will output <think> itself
// We don't add <think> tag to the prompt
}
return sb.String(), nil


@@ -8,73 +8,136 @@ import (
"github.com/ollama/ollama/api"
)
func TestLFM2Renderer(t *testing.T) {
func TestLFM2Renderer_ChatTemplateParity(t *testing.T) {
tests := []struct {
name string
renderer *LFM2Renderer
messages []api.Message
tools []api.Tool
thinkValue *api.ThinkValue
expected string
}{
{
name: "basic user message",
name: "user_only",
renderer: &LFM2Renderer{IsThinking: false},
messages: []api.Message{
{Role: "user", Content: "Hello!"},
{Role: "user", Content: "Hello"},
},
thinkValue: &api.ThinkValue{Value: false},
expected: "<|im_start|>user\nHello!<|im_end|>\n<|im_start|>assistant\n",
expected: "<|startoftext|><|im_start|>user\nHello<|im_end|>\n<|im_start|>assistant\n",
},
{
name: "basic with system message",
messages: []api.Message{
{Role: "system", Content: "You are a helpful assistant."},
{Role: "user", Content: "Hello!"},
},
thinkValue: &api.ThinkValue{Value: false},
expected: "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nHello!<|im_end|>\n<|im_start|>assistant\n",
},
{
name: "multiple system messages rendered separately",
messages: []api.Message{
{Role: "system", Content: "First instruction."},
{Role: "system", Content: "Second instruction."},
{Role: "user", Content: "Hello!"},
},
thinkValue: &api.ThinkValue{Value: false},
expected: "<|im_start|>system\nFirst instruction.<|im_end|>\n<|im_start|>system\nSecond instruction.<|im_end|>\n<|im_start|>user\nHello!<|im_end|>\n<|im_start|>assistant\n",
},
{
name: "multi-turn conversation",
messages: []api.Message{
{Role: "user", Content: "What is 2+2?"},
{Role: "assistant", Content: "The answer is 4."},
{Role: "user", Content: "Thanks!"},
},
thinkValue: &api.ThinkValue{Value: false},
expected: "<|im_start|>user\nWhat is 2+2?<|im_end|>\n<|im_start|>assistant\nThe answer is 4.<|im_end|>\n<|im_start|>user\nThanks!<|im_end|>\n<|im_start|>assistant\n",
},
{
name: "only system message",
name: "system_and_user",
renderer: &LFM2Renderer{IsThinking: false},
messages: []api.Message{
{Role: "system", Content: "You are helpful."},
{Role: "user", Content: "Hi"},
},
thinkValue: &api.ThinkValue{Value: false},
expected: "<|im_start|>system\nYou are helpful.<|im_end|>\n<|im_start|>assistant\n",
expected: "<|startoftext|><|im_start|>system\nYou are helpful.<|im_end|>\n<|im_start|>user\nHi<|im_end|>\n<|im_start|>assistant\n",
},
{
// When assistant is the LAST assistant, thinking is preserved (even with keep_past_thinking=false)
name: "user-assistant-user: last assistant preserves thinking",
name: "tools_without_system",
renderer: &LFM2Renderer{IsThinking: false},
messages: []api.Message{
{Role: "user", Content: "Q1"},
{Role: "assistant", Content: "<think>reasoning</think>A1"},
{Role: "user", Content: "Q2"},
{Role: "user", Content: "Use tools"},
},
tools: []api.Tool{
{
Type: "function",
Function: api.ToolFunction{
Name: "get_weather",
Parameters: api.ToolFunctionParameters{
Type: "object",
},
},
},
},
thinkValue: &api.ThinkValue{Value: false},
expected: "<|im_start|>user\nQ1<|im_end|>\n<|im_start|>assistant\n<think>reasoning</think>A1<|im_end|>\n<|im_start|>user\nQ2<|im_end|>\n<|im_start|>assistant\n",
expected: "<|startoftext|><|im_start|>system\nList of tools: <|tool_list_start|>[{\"name\": \"get_weather\", \"parameters\": {\"type\": \"object\", \"properties\": null}}]<|tool_list_end|><|im_end|>\n" +
"<|im_start|>user\nUse tools<|im_end|>\n<|im_start|>assistant\n",
},
{
// With two assistants, first is stripped (not last), second preserved (is last)
name: "multi-turn thinking: first stripped, second preserved",
name: "first_system_combined_with_tools",
renderer: &LFM2Renderer{IsThinking: false},
messages: []api.Message{
{Role: "system", Content: "Follow instructions."},
{Role: "user", Content: "Do work"},
},
tools: []api.Tool{
{
Type: "function",
Function: api.ToolFunction{
Name: "tool_a",
Parameters: api.ToolFunctionParameters{
Type: "object",
},
},
},
{
Type: "function",
Function: api.ToolFunction{
Name: "tool_b",
Parameters: api.ToolFunctionParameters{
Type: "object",
},
},
},
},
thinkValue: &api.ThinkValue{Value: false},
expected: "<|startoftext|><|im_start|>system\nFollow instructions.\nList of tools: <|tool_list_start|>[{\"name\": \"tool_a\", \"parameters\": {\"type\": \"object\", \"properties\": null}}, {\"name\": \"tool_b\", \"parameters\": {\"type\": \"object\", \"properties\": null}}]<|tool_list_end|><|im_end|>\n" +
"<|im_start|>user\nDo work<|im_end|>\n<|im_start|>assistant\n",
},
{
name: "assistant_tool_calls_and_tool_responses_are_rendered",
renderer: &LFM2Renderer{IsThinking: false},
messages: []api.Message{
{Role: "user", Content: "Call a tool"},
{
Role: "assistant",
Content: "",
ToolCalls: []api.ToolCall{
{
Function: api.ToolCallFunction{
Name: "get_weather",
Arguments: testArgs(map[string]any{
"location": "Paris",
}),
},
},
},
},
{Role: "tool", Content: "22C"},
},
thinkValue: &api.ThinkValue{Value: false},
expected: "<|startoftext|><|im_start|>user\nCall a tool<|im_end|>\n<|im_start|>assistant\n<|tool_call_start|>[get_weather(location=\"Paris\")]<|tool_call_end|><|im_end|>\n<|im_start|>tool\n<|tool_response_start|>22C<|tool_response_end|><|im_end|>\n<|im_start|>assistant\n",
},
{
name: "assistant_tool_calls_with_content_preserves_both",
renderer: &LFM2Renderer{IsThinking: false},
messages: []api.Message{
{Role: "user", Content: "Call a tool"},
{
Role: "assistant",
Content: "Checking now.",
ToolCalls: []api.ToolCall{
{
Function: api.ToolCallFunction{
Name: "get_weather",
Arguments: testArgs(map[string]any{
"location": "Paris",
}),
},
},
},
},
},
thinkValue: &api.ThinkValue{Value: false},
expected: "<|startoftext|><|im_start|>user\nCall a tool<|im_end|>\n<|im_start|>assistant\n<|tool_call_start|>[get_weather(location=\"Paris\")]<|tool_call_end|>\nChecking now.",
},
{
name: "thinking_strips_non_last_assistant_when_disabled",
renderer: &LFM2Renderer{IsThinking: true},
messages: []api.Message{
{Role: "user", Content: "Q1"},
{Role: "assistant", Content: "<think>reason1</think>A1"},
@@ -82,11 +145,11 @@ func TestLFM2Renderer(t *testing.T) {
{Role: "assistant", Content: "<think>reason2</think>A2"},
},
thinkValue: &api.ThinkValue{Value: false},
expected: "<|startoftext|><|im_start|>user\nQ1<|im_end|>\n<|im_start|>assistant\nA1<|im_end|>\n<|im_start|>user\nQ2<|im_end|>\n<|im_start|>assistant\n<think>reason2</think>A2",
},
{
name: "thinking_preserves_past_assistant_when_enabled",
renderer: &LFM2Renderer{IsThinking: true},
messages: []api.Message{
{Role: "user", Content: "Q1"},
{Role: "assistant", Content: "<think>reason1</think>A1"},
@@ -94,334 +157,127 @@ func TestLFM2Renderer(t *testing.T) {
{Role: "assistant", Content: "<think>reason2</think>A2"},
},
thinkValue: &api.ThinkValue{Value: true},
expected: "<|startoftext|><|im_start|>user\nQ1<|im_end|>\n<|im_start|>assistant\n<think>reason1</think>A1<|im_end|>\n<|im_start|>user\nQ2<|im_end|>\n<|im_start|>assistant\n<think>reason2</think>A2",
},
{
name: "arbitrary_roles_are_rendered_verbatim",
renderer: &LFM2Renderer{IsThinking: false},
messages: []api.Message{
{Role: "developer", Content: "Do X"},
{Role: "user", Content: "Hi"},
},
thinkValue: &api.ThinkValue{Value: false},
expected: "<|startoftext|><|im_start|>developer\nDo X<|im_end|>\n<|im_start|>user\nHi<|im_end|>\n<|im_start|>assistant\n",
},
{
name: "empty_messages_still_add_generation_prompt",
renderer: &LFM2Renderer{IsThinking: false},
messages: nil,
thinkValue: &api.ThinkValue{Value: false},
expected: "<|startoftext|><|im_start|>assistant\n",
},
{
name: "assistant_prefill_no_generation_prompt",
renderer: &LFM2Renderer{IsThinking: false},
messages: []api.Message{
{Role: "user", Content: "Hi"},
{Role: "assistant", Content: "Hello"},
},
thinkValue: &api.ThinkValue{Value: false},
expected: "<|startoftext|><|im_start|>user\nHi<|im_end|>\n<|im_start|>assistant\nHello",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
rendered, err := tt.renderer.Render(tt.messages, tt.tools, tt.thinkValue)
if err != nil {
t.Fatalf("Render() error = %v", err)
}
if diff := cmp.Diff(tt.expected, rendered); diff != "" {
t.Fatalf("Render() mismatch (-want +got):\n%s", diff)
}
})
}
}
func TestLFM2Renderer_Images(t *testing.T) {
tests := []struct {
name string
renderer *LFM2Renderer
message api.Message
expected string
}{
{
name: "single_image_default_placeholder",
renderer: &LFM2Renderer{},
message: api.Message{
Role: "user",
Content: "Describe this image.",
Images: []api.ImageData{api.ImageData("img1")},
},
expected: "<|startoftext|><|im_start|>user\n<image>Describe this image.<|im_end|>\n<|im_start|>assistant\n",
},
{
name: "multiple_images_default_placeholder",
renderer: &LFM2Renderer{},
message: api.Message{
Role: "user",
Content: "Describe these images.",
Images: []api.ImageData{api.ImageData("img1"), api.ImageData("img2")},
},
expected: "<|startoftext|><|im_start|>user\n<image><image>Describe these images.<|im_end|>\n<|im_start|>assistant\n",
},
{
name: "single_image_img_tag_placeholder",
renderer: &LFM2Renderer{useImgTags: true},
message: api.Message{
Role: "user",
Content: "Describe this image.",
Images: []api.ImageData{api.ImageData("img1")},
},
expected: "<|startoftext|><|im_start|>user\n[img-0]Describe this image.<|im_end|>\n<|im_start|>assistant\n",
},
{
name: "existing_template_image_placeholder_not_duplicated",
renderer: &LFM2Renderer{},
message: api.Message{
Role: "user",
Content: "<image>Describe this image.",
Images: []api.ImageData{api.ImageData("img1")},
},
expected: "<|startoftext|><|im_start|>user\n<image>Describe this image.<|im_end|>\n<|im_start|>assistant\n",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got, err := tt.renderer.Render([]api.Message{tt.message}, nil, nil)
if err != nil {
t.Fatalf("Render() error = %v", err)
}
if diff := cmp.Diff(tt.expected, got); diff != "" {
t.Fatalf("Render() mismatch (-want +got):\n%s", diff)
}
})
}
}
func TestLFM2Renderer_JSONFormatting(t *testing.T) {
tool := api.Tool{
Type: "function",
Function: api.ToolFunction{
Name: "echo",
Description: "<html>",
Parameters: api.ToolFunctionParameters{
Type: "object",
},
},
}
got := lfm2JSON(tool)
want := "{\"type\": \"function\", \"function\": {\"name\": \"echo\", \"description\": \"<html>\", \"parameters\": {\"type\": \"object\", \"properties\": null}}}"
if diff := cmp.Diff(want, got); diff != "" {
t.Fatalf("lfm2JSON mismatch (-want +got):\n%s", diff)
}
}
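The pythonic tool-call string these expectations encode (for example `<|tool_call_start|>[get_weather(location="Paris")]<|tool_call_end|>`) can be sketched standalone. `renderToolCall` below is an illustrative helper under assumed names, not the actual `LFM2Renderer` implementation:

```go
package main

import (
	"fmt"
	"sort"
	"strings"
)

// renderToolCall sketches the pythonic tool-call format the tests above
// expect. Argument order is sorted here for determinism; the real renderer
// may order arguments differently.
func renderToolCall(name string, args map[string]string) string {
	keys := make([]string, 0, len(args))
	for k := range args {
		keys = append(keys, k)
	}
	sort.Strings(keys)
	parts := make([]string, 0, len(keys))
	for _, k := range keys {
		parts = append(parts, fmt.Sprintf("%s=%q", k, args[k]))
	}
	return fmt.Sprintf("<|tool_call_start|>[%s(%s)]<|tool_call_end|>", name, strings.Join(parts, ", "))
}

func main() {
	fmt.Println(renderToolCall("get_weather", map[string]string{"location": "Paris"}))
}
```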

View File

@@ -1,6 +1,7 @@
package renderers
import (
"fmt"
"strings"
"github.com/ollama/ollama/api"
@@ -9,10 +10,11 @@ import (
type Qwen3VLRenderer struct {
isThinking bool
emitEmptyThinkOnNoThink bool
useImgTags bool
}
func (r *Qwen3VLRenderer) renderContent(content api.Message, imageOffset int) (string, int) {
// This assumes all images are at the front of the message - same assumption as ollama/ollama/runner.go
var subSb strings.Builder
for range content.Images {
@@ -20,7 +22,8 @@ func (r *Qwen3VLRenderer) renderContent(content api.Message) string {
// model backends, and so we should eventually parameterize this or
// only output a placeholder such as [img]
if r.useImgTags {
subSb.WriteString(fmt.Sprintf("[img-%d]", imageOffset))
imageOffset++
} else {
subSb.WriteString("<|vision_start|><|image_pad|><|vision_end|>")
}
@@ -28,12 +31,17 @@ func (r *Qwen3VLRenderer) renderContent(content api.Message) string {
// TODO: support videos
subSb.WriteString(content.Content)
return subSb.String(), imageOffset
}
func (r *Qwen3VLRenderer) Render(messages []api.Message, tools []api.Tool, think *api.ThinkValue) (string, error) {
var sb strings.Builder
isThinking := r.isThinking
if think != nil {
isThinking = think.Bool()
}
if len(tools) > 0 {
sb.WriteString(imStartTag + "system\n")
if len(messages) > 0 && messages[0].Role == "system" {
@@ -57,7 +65,7 @@ func (r *Qwen3VLRenderer) Render(messages []api.Message, tools []api.Tool, _ *ap
message := messages[i]
if multiStepTool && message.Role == "user" {
// Check if content starts with <tool_response> and ends with </tool_response>
content, _ := r.renderContent(message, 0)
if !(strings.HasPrefix(content, "<tool_response>") && strings.HasSuffix(content, "</tool_response>")) {
multiStepTool = false
lastQueryIndex = i
@@ -65,8 +73,10 @@ func (r *Qwen3VLRenderer) Render(messages []api.Message, tools []api.Tool, _ *ap
}
}
imageOffset := 0
for i, message := range messages {
content, nextImageOffset := r.renderContent(message, imageOffset)
imageOffset = nextImageOffset
lastMessage := i == len(messages)-1
prefill := lastMessage && message.Role == "assistant"
@@ -76,13 +86,13 @@ func (r *Qwen3VLRenderer) Render(messages []api.Message, tools []api.Tool, _ *ap
} else if message.Role == "assistant" {
contentReasoning := ""
if isThinking {
if message.Thinking != "" {
contentReasoning = message.Thinking
}
}
if isThinking && i > lastQueryIndex {
if i == len(messages)-1 || contentReasoning != "" {
sb.WriteString("<|im_start|>" + message.Role + "\n<think>\n" + strings.Trim(contentReasoning, "\n")) // do we want to add a new line here?
if content != "" {
@@ -125,8 +135,10 @@ func (r *Qwen3VLRenderer) Render(messages []api.Message, tools []api.Tool, _ *ap
// prefill at the end
if lastMessage && !prefill {
sb.WriteString("<|im_start|>assistant\n")
if isThinking {
sb.WriteString("<think>\n")
} else if r.emitEmptyThinkOnNoThink {
sb.WriteString("<think>\n\n</think>\n\n")
}
}
}

View File

@@ -101,7 +101,7 @@ Let me analyze this image.`,
},
useImgTags: true,
expected: `<|im_start|>user
[img-0]Describe this image.<|im_end|>
<|im_start|>assistant
Let me analyze this image.`,
},
@@ -123,7 +123,7 @@ Let me analyze this image.`,
},
useImgTags: true,
expected: `<|im_start|>user
[img-0][img-1]Describe these images.<|im_end|>
<|im_start|>assistant
Let me analyze this image.`,
},

View File

@@ -1,6 +1,7 @@
package renderers
import (
"strings"
"testing"
"github.com/google/go-cmp/cmp"
@@ -370,3 +371,74 @@ func TestFormatToolCallArgumentThinkingVL(t *testing.T) {
})
}
}
func TestQwen3VLRendererThinkOverride(t *testing.T) {
msgs := []api.Message{
{Role: "user", Content: "Hello"},
}
renderThinking, err := (&Qwen3VLRenderer{isThinking: true}).Render(msgs, nil, nil)
if err != nil {
t.Fatal(err)
}
if !strings.Contains(renderThinking, "<|im_start|>assistant\n<think>\n") {
t.Fatalf("expected default thinking renderer to emit <think>, got:\n%s", renderThinking)
}
renderNonThinking, err := (&Qwen3VLRenderer{isThinking: true}).Render(msgs, nil, &api.ThinkValue{Value: false})
if err != nil {
t.Fatal(err)
}
if strings.Contains(renderNonThinking, "<think>") {
t.Fatalf("expected think=false override to suppress <think>, got:\n%s", renderNonThinking)
}
renderForcedThinking, err := (&Qwen3VLRenderer{isThinking: false}).Render(msgs, nil, &api.ThinkValue{Value: true})
if err != nil {
t.Fatal(err)
}
if !strings.Contains(renderForcedThinking, "<|im_start|>assistant\n<think>\n") {
t.Fatalf("expected think=true override to emit <think>, got:\n%s", renderForcedThinking)
}
}
func TestQwen3VLRendererThinkOverrideWithExplicitNoThinkPrefill(t *testing.T) {
msgs := []api.Message{
{Role: "user", Content: "Hello"},
}
renderNonThinking, err := (&Qwen3VLRenderer{
isThinking: true,
emitEmptyThinkOnNoThink: true,
}).Render(msgs, nil, &api.ThinkValue{Value: false})
if err != nil {
t.Fatal(err)
}
if !strings.Contains(renderNonThinking, "<|im_start|>assistant\n<think>\n\n</think>\n\n") {
t.Fatalf("expected explicit think=false prefill block, got:\n%s", renderNonThinking)
}
}
func TestQwenRendererNameNoThinkBehaviorSplit(t *testing.T) {
msgs := []api.Message{
{Role: "user", Content: "Hello"},
}
thinkFalse := &api.ThinkValue{Value: false}
qwen35Rendered, err := RenderWithRenderer("qwen3.5", msgs, nil, thinkFalse)
if err != nil {
t.Fatal(err)
}
if !strings.Contains(qwen35Rendered, "<|im_start|>assistant\n<think>\n\n</think>\n\n") {
t.Fatalf("expected qwen3.5 renderer to emit explicit no-think prefill, got:\n%s", qwen35Rendered)
}
qwen3VLRendered, err := RenderWithRenderer("qwen3-vl-thinking", msgs, nil, thinkFalse)
if err != nil {
t.Fatal(err)
}
if strings.Contains(qwen3VLRendered, "<|im_start|>assistant\n<think>\n\n</think>\n\n") {
t.Fatalf("expected qwen3-vl-thinking renderer to keep legacy non-empty no-think behavior, got:\n%s", qwen3VLRendered)
}
}
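The override precedence these tests exercise — the renderer's compiled-in thinking default applies unless the request carries an explicit `ThinkValue` — can be sketched in isolation. The function and parameter names below are illustrative, not part of the ollama API:

```go
package main

import "fmt"

// effectiveThinking sketches the precedence checked above: a non-nil
// per-request override wins; otherwise the renderer's default is used.
func effectiveThinking(rendererDefault bool, override *bool) bool {
	if override != nil {
		return *override
	}
	return rendererDefault
}

func main() {
	off := false
	fmt.Println(effectiveThinking(true, nil))  // no override: renderer default wins
	fmt.Println(effectiveThinking(true, &off)) // explicit think=false suppresses <think>
}
```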

View File

@@ -56,6 +56,9 @@ func rendererForName(name string) Renderer {
case "qwen3-vl-thinking":
renderer := &Qwen3VLRenderer{isThinking: true, useImgTags: RenderImgTags}
return renderer
case "qwen3.5":
renderer := &Qwen3VLRenderer{isThinking: true, emitEmptyThinkOnNoThink: true, useImgTags: RenderImgTags}
return renderer
case "cogito":
renderer := &CogitoRenderer{isThinking: true}
return renderer
@@ -85,9 +88,9 @@ func rendererForName(name string) Renderer {
case "glm-ocr":
return &GlmOcrRenderer{}
case "lfm2":
return &LFM2Renderer{IsThinking: false, useImgTags: RenderImgTags}
case "lfm2-thinking":
return &LFM2Renderer{IsThinking: true, useImgTags: RenderImgTags}
default:
return nil
}

View File

@@ -29,17 +29,27 @@ func TestRegisterCustomRenderer(t *testing.T) {
}
func TestBuiltInRendererStillWorks(t *testing.T) {
// Test that built-in renderers still work
tests := []struct {
name string
}{
{name: "qwen3-coder"},
{name: "qwen3.5"},
}
messages := []api.Message{
{Role: "user", Content: "Hello"},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result, err := RenderWithRenderer(tt.name, messages, nil, nil)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if result == "" {
t.Fatalf("expected non-empty result from %s renderer", tt.name)
}
})
}
}

View File

@@ -86,6 +86,11 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
ID: len(images),
Data: i,
}
images = append(images, imgData)
if m.Config.Renderer != "" {
continue
}
imgTag := fmt.Sprintf("[img-%d]", imgData.ID)
if !strings.Contains(prompt, "[img]") {
@@ -93,8 +98,6 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
} else {
prompt = strings.Replace(prompt, "[img]", imgTag, 1)
}
}
msgs[currMsgIdx+cnt].Content = prefix + prompt
}

View File

@@ -9,6 +9,7 @@ import (
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/template"
"github.com/ollama/ollama/types/model"
)
func TestChatPrompt(t *testing.T) {
@@ -330,3 +331,38 @@ func TestChatPromptTokenizeCalls(t *testing.T) {
})
}
}
func TestChatPromptRendererDoesNotRewriteMessageContent(t *testing.T) {
msgs := []api.Message{
{
Role: "user",
Content: "what do these photos have in common?",
Images: []api.ImageData{[]byte("img-1"), []byte("img-2"), []byte("img-3")},
},
}
originalContent := msgs[0].Content
m := Model{
Config: model.ConfigV2{Renderer: "qwen3-vl-instruct"},
ProjectorPaths: []string{"vision"},
}
opts := api.Options{Runner: api.Runner{NumCtx: 8192}}
think := false
prompt, images, err := chatPrompt(t.Context(), &m, mockRunner{}.Tokenize, &opts, msgs, nil, &api.ThinkValue{Value: think}, true)
if err != nil {
t.Fatal(err)
}
if msgs[0].Content != originalContent {
t.Fatalf("renderer path should not mutate message content: got %q, want %q", msgs[0].Content, originalContent)
}
if got, want := len(images), 3; got != want {
t.Fatalf("len(images) = %d, want %d", got, want)
}
if prompt == "" {
t.Fatal("prompt is empty")
}
}
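The legacy template path in `chatPrompt` gives each image an `[img-N]` tag, replacing the first bare `[img]` placeholder when one exists; renderer-backed models now skip this rewriting entirely. A minimal sketch, with the prefix form assumed for illustration:

```go
package main

import (
	"fmt"
	"strings"
)

// tagImage sketches the legacy placeholder substitution: replace the first
// bare [img] with [img-N], or (assumed here) prepend the tag when the
// prompt has no placeholder.
func tagImage(prompt string, id int) string {
	tag := fmt.Sprintf("[img-%d]", id)
	if strings.Contains(prompt, "[img]") {
		return strings.Replace(prompt, "[img]", tag, 1)
	}
	return tag + " " + prompt
}

func main() {
	fmt.Println(tagImage("look at [img] please", 0))
	fmt.Println(tagImage("no placeholder", 1))
}
```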

View File

@@ -6,6 +6,7 @@ import (
"log/slog"
"maps"
"os"
"slices"
"strings"
"unsafe"
@@ -33,6 +34,9 @@ func (q quantizer) WriteTo(w io.Writer) (int64, error) {
slog.Warn("file read error", "tensor", q.from.Name, "file", q.Name(), "error", err)
return 0, fmt.Errorf("unable to read tensor %s from %s: %s", q.from.Name, q.Name(), err)
}
if uint64(len(data)) < q.from.Size() {
return 0, fmt.Errorf("tensor %s data size %d is less than expected %d from shape %v", q.from.Name, len(data), q.from.Size(), q.from.Shape)
}
var f32s []float32
newType := fsggml.TensorType(q.to.Kind)
if fsggml.TensorType(q.from.Kind) == fsggml.TensorTypeF32 {
@@ -58,7 +62,7 @@ func useMoreBits(iLayer, nLayers int) bool {
return iLayer < (nLayers/8) || iLayer >= 7*nLayers/8 || (iLayer-nLayers/8)%3 == 2
}
func qwen3LinearAttnQuantType(name string) (fsggml.TensorType, bool) {
switch {
// Full attention
case strings.HasSuffix(name, ".attn_q.weight"):
@@ -79,6 +83,10 @@ func qwen3nextQuantType(name string) (fsggml.TensorType, bool) {
// SSM
case strings.HasSuffix(name, ".ssm_ba.weight"):
return fsggml.TensorTypeQ4_K, true
case strings.HasSuffix(name, ".ssm_beta.weight"):
return fsggml.TensorTypeQ4_K, true
case strings.HasSuffix(name, ".ssm_alpha.weight"):
return fsggml.TensorTypeQ4_K, true
case strings.HasSuffix(name, ".ssm_out.weight"):
return fsggml.TensorTypeQ4_K, true
@@ -287,8 +295,8 @@ func newType(t *fsggml.Tensor, kv fsggml.KV, qs *quantizeState, ftype fsggml.Fil
newType := fsggml.TensorType(t.Kind)
if quantize {
if slices.Contains([]string{"qwen3next", "qwen35", "qwen35moe"}, kv.Architecture()) && (ftype == fsggml.FileTypeQ4_K_M || ftype == fsggml.FileTypeQ4_K_S) {
if qt, ok := qwen3LinearAttnQuantType(name); ok {
return qt
}
}

View File

@@ -166,6 +166,60 @@ func TestGetTensorNewType(t *testing.T) {
}
}
func TestQwen3LinearAttentionQuantOverride(t *testing.T) {
cases := []struct {
name string
arch string
tensor string
fileType fsggml.FileType
expected fsggml.TensorType
}{
{
name: "qwen35_beta",
arch: "qwen35",
tensor: "blk.0.ssm_beta.weight",
fileType: fsggml.FileTypeQ4_K_M,
expected: fsggml.TensorTypeQ4_K,
},
{
name: "qwen35_alpha",
arch: "qwen35",
tensor: "blk.0.ssm_alpha.weight",
fileType: fsggml.FileTypeQ4_K_M,
expected: fsggml.TensorTypeQ4_K,
},
{
name: "qwen35moe_attn_qkv",
arch: "qwen35moe",
tensor: "blk.0.attn_qkv.weight",
fileType: fsggml.FileTypeQ4_K_M,
expected: fsggml.TensorTypeQ4_K,
},
{
name: "non_qwen35_falls_back",
arch: "foo",
tensor: "blk.0.attn_qkv.weight",
fileType: fsggml.FileTypeQ4_K_M,
expected: fsggml.TensorTypeQ5_K,
},
}
for _, tt := range cases {
t.Run(tt.name, func(t *testing.T) {
kv := fsggml.KV{"general.architecture": tt.arch}
got := newType(&fsggml.Tensor{
Name: tt.tensor,
Shape: []uint64{256, 256},
Kind: uint32(fsggml.TensorTypeF16),
}, kv, &quantizeState{}, tt.fileType)
if got != tt.expected {
t.Fatalf("unexpected tensor type for %s (%s): got %s want %s", tt.tensor, tt.arch, got, tt.expected)
}
})
}
}
func TestQuantizeModel(t *testing.T) {
cases := []struct {
name string
@@ -173,6 +227,7 @@ func TestQuantizeModel(t *testing.T) {
tensors []*fsggml.Tensor
newType string
expectedTensorTypes map[string]fsggml.TensorType
expectErr bool
}{
{
name: "f16_q4_k",
@@ -253,6 +308,36 @@ func TestQuantizeModel(t *testing.T) {
"output.weight": fsggml.TensorTypeQ8_0,
},
},
{
name: "f32_short_data",
kv: map[string]any{
"general.architecture": "foo",
},
tensors: []*fsggml.Tensor{
{
Name: "blk.0.attn.weight", Kind: uint32(fsggml.TensorTypeF32),
Offset: uint64(0), Shape: []uint64{512, 2},
WriterTo: bytes.NewReader(make([]byte, 32)),
},
},
newType: "Q4_K",
expectErr: true,
},
{
name: "f16_short_data",
kv: map[string]any{
"general.architecture": "foo",
},
tensors: []*fsggml.Tensor{
{
Name: "blk.0.attn.weight", Kind: uint32(fsggml.TensorTypeF16),
Offset: uint64(0), Shape: []uint64{512, 2},
WriterTo: bytes.NewReader(make([]byte, 32)),
},
},
newType: "Q4_K",
expectErr: true,
},
}
for _, tt := range cases {
@@ -264,6 +349,9 @@ func TestQuantizeModel(t *testing.T) {
}
defer fp.Close()
meta, err := fsggml.Decode(fp, -1)
if tt.expectErr && err != nil {
return
}
if err != nil {
t.Fatal(err.Error())
}
@@ -283,6 +371,12 @@ func TestQuantizeModel(t *testing.T) {
}
err = quantize(fp, tmp, meta, ftype, progress)
if tt.expectErr {
if err == nil {
t.Fatal("expected quantize to return an error")
}
return
}
if err != nil {
t.Fatalf("error during quantize: %s", err)
}

View File

@@ -447,7 +447,7 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo
// Some architectures are not safe with num_parallel > 1.
// ref: https://github.com/ollama/ollama/issues/4165
if slices.Contains([]string{"mllama", "qwen3vl", "qwen3vlmoe", "qwen3next", "lfm2", "lfm2moe"}, req.model.Config.ModelFamily) && numParallel != 1 {
if slices.Contains([]string{"mllama", "qwen3vl", "qwen3vlmoe", "qwen35", "qwen35moe", "qwen3next", "lfm2", "lfm2moe", "nemotron_h", "nemotron_h_moe"}, req.model.Config.ModelFamily) && numParallel != 1 {
numParallel = 1
slog.Warn("model architecture does not currently support parallel requests", "architecture", req.model.Config.ModelFamily)
}
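The guard in this hunk is a simple membership check on the model family; a standalone sketch of the pattern (the helper name is hypothetical and the family list is an abbreviated, illustrative subset):

```go
package main

import (
	"fmt"
	"slices"
)

// clampParallel mirrors the scheduler guard: some model families are
// not safe with num_parallel > 1, so the value is forced to 1.
func clampParallel(family string, numParallel int) int {
	unsafe := []string{"mllama", "qwen3vl", "qwen35", "nemotron_h"}
	if slices.Contains(unsafe, family) && numParallel != 1 {
		return 1
	}
	return numParallel
}

func main() {
	fmt.Println(clampParallel("qwen35", 4)) // clamped to 1
	fmt.Println(clampParallel("llama", 4))  // left unchanged
}
```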


@@ -16,10 +16,10 @@ import (
)
type Function struct {
Name string
ReturnType string
Params string
ParamNames []string
Name string
ReturnType string
Params string
ParamNames []string
NeedsARM64Guard bool
}
@@ -29,6 +29,11 @@ func findHeaders(directory string) ([]string, error) {
if err != nil {
return err
}
// Private headers contain C++ implementation helpers and are not part of
// the C API surface; parsing them can produce invalid wrapper signatures.
if d.IsDir() && d.Name() == "private" {
return fs.SkipDir
}
if !d.IsDir() && strings.HasSuffix(path, ".h") {
headers = append(headers, path)
}
@@ -194,10 +199,10 @@ func parseFunctions(content string) []Function {
needsGuard := needsARM64Guard(funcName, returnType, params)
functions = append(functions, Function{
Name: funcName,
ReturnType: returnType,
Params: params,
ParamNames: paramNames,
Name: funcName,
ReturnType: returnType,
Params: params,
ParamNames: paramNames,
NeedsARM64Guard: needsGuard,
})
}


@@ -20,6 +20,8 @@ mlx_array (*mlx_array_new_float64_ptr)(double val) = NULL;
mlx_array (*mlx_array_new_double_ptr)(double val) = NULL;
mlx_array (*mlx_array_new_complex_ptr)(float real_val, float imag_val) = NULL;
mlx_array (*mlx_array_new_data_ptr)(const void* data, const int* shape, int dim, mlx_dtype dtype) = NULL;
mlx_array (*mlx_array_new_data_managed_ptr)(void* data, const int* shape, int dim, mlx_dtype dtype, void (*dtor)(void*)) = NULL;
mlx_array (*mlx_array_new_data_managed_payload_ptr)(void* data, const int* shape, int dim, mlx_dtype dtype, void* payload, void (*dtor)(void*)) = NULL;
int (*mlx_array_set_ptr)(mlx_array* arr, const mlx_array src) = NULL;
int (*mlx_array_set_bool_ptr)(mlx_array* arr, bool val) = NULL;
int (*mlx_array_set_int_ptr)(mlx_array* arr, int val) = NULL;
@@ -49,7 +51,7 @@ int (*mlx_array_item_int32_ptr)(int32_t* res, const mlx_array arr) = NULL;
int (*mlx_array_item_int64_ptr)(int64_t* res, const mlx_array arr) = NULL;
int (*mlx_array_item_float32_ptr)(float* res, const mlx_array arr) = NULL;
int (*mlx_array_item_float64_ptr)(double* res, const mlx_array arr) = NULL;
int (*mlx_array_item_complex64_ptr)(float _Complex* res, const mlx_array arr) = NULL;
int (*mlx_array_item_complex64_ptr)(mlx_complex64_t* res, const mlx_array arr) = NULL;
#if defined(__aarch64__) || defined(_M_ARM64)
int (*mlx_array_item_float16_ptr)(float16_t* res, const mlx_array arr) = NULL;
#endif
@@ -67,7 +69,7 @@ const int32_t* (*mlx_array_data_int32_ptr)(const mlx_array arr) = NULL;
const int64_t* (*mlx_array_data_int64_ptr)(const mlx_array arr) = NULL;
const float* (*mlx_array_data_float32_ptr)(const mlx_array arr) = NULL;
const double* (*mlx_array_data_float64_ptr)(const mlx_array arr) = NULL;
const float _Complex* (*mlx_array_data_complex64_ptr)(const mlx_array arr) = NULL;
const mlx_complex64_t* (*mlx_array_data_complex64_ptr)(const mlx_array arr) = NULL;
#if defined(__aarch64__) || defined(_M_ARM64)
const float16_t* (*mlx_array_data_float16_ptr)(const mlx_array arr) = NULL;
#endif
@@ -123,6 +125,7 @@ int (*mlx_detail_compile_erase_ptr)(uintptr_t fun_id) = NULL;
int (*mlx_disable_compile_ptr)(void) = NULL;
int (*mlx_enable_compile_ptr)(void) = NULL;
int (*mlx_set_compile_mode_ptr)(mlx_compile_mode mode) = NULL;
int (*mlx_cuda_is_available_ptr)(bool* res) = NULL;
mlx_device (*mlx_device_new_ptr)(void) = NULL;
mlx_device (*mlx_device_new_type_ptr)(mlx_device_type type, int index) = NULL;
int (*mlx_device_free_ptr)(mlx_device dev) = NULL;
@@ -133,6 +136,16 @@ int (*mlx_device_get_index_ptr)(int* index, mlx_device dev) = NULL;
int (*mlx_device_get_type_ptr)(mlx_device_type* type, mlx_device dev) = NULL;
int (*mlx_get_default_device_ptr)(mlx_device* dev) = NULL;
int (*mlx_set_default_device_ptr)(mlx_device dev) = NULL;
int (*mlx_device_is_available_ptr)(bool* avail, mlx_device dev) = NULL;
int (*mlx_device_count_ptr)(int* count, mlx_device_type type) = NULL;
mlx_device_info (*mlx_device_info_new_ptr)(void) = NULL;
int (*mlx_device_info_get_ptr)(mlx_device_info* info, mlx_device dev) = NULL;
int (*mlx_device_info_free_ptr)(mlx_device_info info) = NULL;
int (*mlx_device_info_has_key_ptr)(bool* exists, mlx_device_info info, const char* key) = NULL;
int (*mlx_device_info_is_string_ptr)(bool* is_string, mlx_device_info info, const char* key) = NULL;
int (*mlx_device_info_get_string_ptr)(const char** value, mlx_device_info info, const char* key) = NULL;
int (*mlx_device_info_get_size_ptr)(size_t* value, mlx_device_info info, const char* key) = NULL;
int (*mlx_device_info_get_keys_ptr)(mlx_vector_string* keys, mlx_device_info info) = NULL;
int (*mlx_distributed_all_gather_ptr)(mlx_array* res, const mlx_array x, const mlx_distributed_group group , const mlx_stream S) = NULL;
int (*mlx_distributed_all_max_ptr)(mlx_array* res, const mlx_array x, const mlx_distributed_group group , const mlx_stream s) = NULL;
int (*mlx_distributed_all_min_ptr)(mlx_array* res, const mlx_array x, const mlx_distributed_group group , const mlx_stream s) = NULL;
@@ -263,7 +276,6 @@ int (*mlx_reset_peak_memory_ptr)(void) = NULL;
int (*mlx_set_cache_limit_ptr)(size_t* res, size_t limit) = NULL;
int (*mlx_set_memory_limit_ptr)(size_t* res, size_t limit) = NULL;
int (*mlx_set_wired_limit_ptr)(size_t* res, size_t limit) = NULL;
mlx_metal_device_info_t (*mlx_metal_device_info_ptr)(void) = NULL;
int (*mlx_metal_is_available_ptr)(bool* res) = NULL;
int (*mlx_metal_start_capture_ptr)(const char* path) = NULL;
int (*mlx_metal_stop_capture_ptr)(void) = NULL;
@@ -658,6 +670,16 @@ int mlx_load_functions(void* handle) {
fprintf(stderr, "MLX: Failed to load symbol: mlx_array_new_data\n");
return -1;
}
mlx_array_new_data_managed_ptr = dlsym(handle, "mlx_array_new_data_managed");
if (mlx_array_new_data_managed_ptr == NULL) {
fprintf(stderr, "MLX: Failed to load symbol: mlx_array_new_data_managed\n");
return -1;
}
mlx_array_new_data_managed_payload_ptr = dlsym(handle, "mlx_array_new_data_managed_payload");
if (mlx_array_new_data_managed_payload_ptr == NULL) {
fprintf(stderr, "MLX: Failed to load symbol: mlx_array_new_data_managed_payload\n");
return -1;
}
mlx_array_set_ptr = dlsym(handle, "mlx_array_set");
if (mlx_array_set_ptr == NULL) {
fprintf(stderr, "MLX: Failed to load symbol: mlx_array_set\n");
@@ -1141,6 +1163,11 @@ int mlx_load_functions(void* handle) {
fprintf(stderr, "MLX: Failed to load symbol: mlx_set_compile_mode\n");
return -1;
}
mlx_cuda_is_available_ptr = dlsym(handle, "mlx_cuda_is_available");
if (mlx_cuda_is_available_ptr == NULL) {
fprintf(stderr, "MLX: Failed to load symbol: mlx_cuda_is_available\n");
return -1;
}
mlx_device_new_ptr = dlsym(handle, "mlx_device_new");
if (mlx_device_new_ptr == NULL) {
fprintf(stderr, "MLX: Failed to load symbol: mlx_device_new\n");
@@ -1191,6 +1218,56 @@ int mlx_load_functions(void* handle) {
fprintf(stderr, "MLX: Failed to load symbol: mlx_set_default_device\n");
return -1;
}
mlx_device_is_available_ptr = dlsym(handle, "mlx_device_is_available");
if (mlx_device_is_available_ptr == NULL) {
fprintf(stderr, "MLX: Failed to load symbol: mlx_device_is_available\n");
return -1;
}
mlx_device_count_ptr = dlsym(handle, "mlx_device_count");
if (mlx_device_count_ptr == NULL) {
fprintf(stderr, "MLX: Failed to load symbol: mlx_device_count\n");
return -1;
}
mlx_device_info_new_ptr = dlsym(handle, "mlx_device_info_new");
if (mlx_device_info_new_ptr == NULL) {
fprintf(stderr, "MLX: Failed to load symbol: mlx_device_info_new\n");
return -1;
}
mlx_device_info_get_ptr = dlsym(handle, "mlx_device_info_get");
if (mlx_device_info_get_ptr == NULL) {
fprintf(stderr, "MLX: Failed to load symbol: mlx_device_info_get\n");
return -1;
}
mlx_device_info_free_ptr = dlsym(handle, "mlx_device_info_free");
if (mlx_device_info_free_ptr == NULL) {
fprintf(stderr, "MLX: Failed to load symbol: mlx_device_info_free\n");
return -1;
}
mlx_device_info_has_key_ptr = dlsym(handle, "mlx_device_info_has_key");
if (mlx_device_info_has_key_ptr == NULL) {
fprintf(stderr, "MLX: Failed to load symbol: mlx_device_info_has_key\n");
return -1;
}
mlx_device_info_is_string_ptr = dlsym(handle, "mlx_device_info_is_string");
if (mlx_device_info_is_string_ptr == NULL) {
fprintf(stderr, "MLX: Failed to load symbol: mlx_device_info_is_string\n");
return -1;
}
mlx_device_info_get_string_ptr = dlsym(handle, "mlx_device_info_get_string");
if (mlx_device_info_get_string_ptr == NULL) {
fprintf(stderr, "MLX: Failed to load symbol: mlx_device_info_get_string\n");
return -1;
}
mlx_device_info_get_size_ptr = dlsym(handle, "mlx_device_info_get_size");
if (mlx_device_info_get_size_ptr == NULL) {
fprintf(stderr, "MLX: Failed to load symbol: mlx_device_info_get_size\n");
return -1;
}
mlx_device_info_get_keys_ptr = dlsym(handle, "mlx_device_info_get_keys");
if (mlx_device_info_get_keys_ptr == NULL) {
fprintf(stderr, "MLX: Failed to load symbol: mlx_device_info_get_keys\n");
return -1;
}
mlx_distributed_all_gather_ptr = dlsym(handle, "mlx_distributed_all_gather");
if (mlx_distributed_all_gather_ptr == NULL) {
fprintf(stderr, "MLX: Failed to load symbol: mlx_distributed_all_gather\n");
@@ -1841,11 +1918,6 @@ int mlx_load_functions(void* handle) {
fprintf(stderr, "MLX: Failed to load symbol: mlx_set_wired_limit\n");
return -1;
}
mlx_metal_device_info_ptr = dlsym(handle, "mlx_metal_device_info");
if (mlx_metal_device_info_ptr == NULL) {
fprintf(stderr, "MLX: Failed to load symbol: mlx_metal_device_info\n");
return -1;
}
mlx_metal_is_available_ptr = dlsym(handle, "mlx_metal_is_available");
if (mlx_metal_is_available_ptr == NULL) {
fprintf(stderr, "MLX: Failed to load symbol: mlx_metal_is_available\n");
@@ -3528,6 +3600,14 @@ mlx_array mlx_array_new_data(const void* data, const int* shape, int dim, mlx_dt
return mlx_array_new_data_ptr(data, shape, dim, dtype);
}
mlx_array mlx_array_new_data_managed(void* data, const int* shape, int dim, mlx_dtype dtype, void (*dtor)(void*)) {
return mlx_array_new_data_managed_ptr(data, shape, dim, dtype, dtor);
}
mlx_array mlx_array_new_data_managed_payload(void* data, const int* shape, int dim, mlx_dtype dtype, void* payload, void (*dtor)(void*)) {
return mlx_array_new_data_managed_payload_ptr(data, shape, dim, dtype, payload, dtor);
}
int mlx_array_set(mlx_array* arr, const mlx_array src) {
return mlx_array_set_ptr(arr, src);
}
@@ -3644,7 +3724,7 @@ int mlx_array_item_float64(double* res, const mlx_array arr) {
return mlx_array_item_float64_ptr(res, arr);
}
int mlx_array_item_complex64(float _Complex* res, const mlx_array arr) {
int mlx_array_item_complex64(mlx_complex64_t* res, const mlx_array arr) {
return mlx_array_item_complex64_ptr(res, arr);
}
@@ -3704,7 +3784,7 @@ const double* mlx_array_data_float64(const mlx_array arr) {
return mlx_array_data_float64_ptr(arr);
}
const float _Complex* mlx_array_data_complex64(const mlx_array arr) {
const mlx_complex64_t* mlx_array_data_complex64(const mlx_array arr) {
return mlx_array_data_complex64_ptr(arr);
}
@@ -3916,6 +3996,10 @@ int mlx_set_compile_mode(mlx_compile_mode mode) {
return mlx_set_compile_mode_ptr(mode);
}
int mlx_cuda_is_available(bool* res) {
return mlx_cuda_is_available_ptr(res);
}
mlx_device mlx_device_new(void) {
return mlx_device_new_ptr();
}
@@ -3956,6 +4040,46 @@ int mlx_set_default_device(mlx_device dev) {
return mlx_set_default_device_ptr(dev);
}
int mlx_device_is_available(bool* avail, mlx_device dev) {
return mlx_device_is_available_ptr(avail, dev);
}
int mlx_device_count(int* count, mlx_device_type type) {
return mlx_device_count_ptr(count, type);
}
mlx_device_info mlx_device_info_new(void) {
return mlx_device_info_new_ptr();
}
int mlx_device_info_get(mlx_device_info* info, mlx_device dev) {
return mlx_device_info_get_ptr(info, dev);
}
int mlx_device_info_free(mlx_device_info info) {
return mlx_device_info_free_ptr(info);
}
int mlx_device_info_has_key(bool* exists, mlx_device_info info, const char* key) {
return mlx_device_info_has_key_ptr(exists, info, key);
}
int mlx_device_info_is_string(bool* is_string, mlx_device_info info, const char* key) {
return mlx_device_info_is_string_ptr(is_string, info, key);
}
int mlx_device_info_get_string(const char** value, mlx_device_info info, const char* key) {
return mlx_device_info_get_string_ptr(value, info, key);
}
int mlx_device_info_get_size(size_t* value, mlx_device_info info, const char* key) {
return mlx_device_info_get_size_ptr(value, info, key);
}
int mlx_device_info_get_keys(mlx_vector_string* keys, mlx_device_info info) {
return mlx_device_info_get_keys_ptr(keys, info);
}
int mlx_distributed_all_gather(mlx_array* res, const mlx_array x, const mlx_distributed_group group , const mlx_stream S) {
return mlx_distributed_all_gather_ptr(res, x, group, S);
}
@@ -4476,10 +4600,6 @@ int mlx_set_wired_limit(size_t* res, size_t limit) {
return mlx_set_wired_limit_ptr(res, limit);
}
mlx_metal_device_info_t mlx_metal_device_info(void) {
return mlx_metal_device_info_ptr();
}
int mlx_metal_is_available(bool* res) {
return mlx_metal_is_available_ptr(res);
}


@@ -26,6 +26,8 @@
#undef mlx_array_new_double
#undef mlx_array_new_complex
#undef mlx_array_new_data
#undef mlx_array_new_data_managed
#undef mlx_array_new_data_managed_payload
#undef mlx_array_set
#undef mlx_array_set_bool
#undef mlx_array_set_int
@@ -121,6 +123,7 @@
#undef mlx_disable_compile
#undef mlx_enable_compile
#undef mlx_set_compile_mode
#undef mlx_cuda_is_available
#undef mlx_device_new
#undef mlx_device_new_type
#undef mlx_device_free
@@ -131,6 +134,16 @@
#undef mlx_device_get_type
#undef mlx_get_default_device
#undef mlx_set_default_device
#undef mlx_device_is_available
#undef mlx_device_count
#undef mlx_device_info_new
#undef mlx_device_info_get
#undef mlx_device_info_free
#undef mlx_device_info_has_key
#undef mlx_device_info_is_string
#undef mlx_device_info_get_string
#undef mlx_device_info_get_size
#undef mlx_device_info_get_keys
#undef mlx_distributed_all_gather
#undef mlx_distributed_all_max
#undef mlx_distributed_all_min
@@ -261,7 +274,6 @@
#undef mlx_set_cache_limit
#undef mlx_set_memory_limit
#undef mlx_set_wired_limit
#undef mlx_metal_device_info
#undef mlx_metal_is_available
#undef mlx_metal_start_capture
#undef mlx_metal_stop_capture
@@ -602,6 +614,8 @@ extern mlx_array (*mlx_array_new_float64_ptr)(double val);
extern mlx_array (*mlx_array_new_double_ptr)(double val);
extern mlx_array (*mlx_array_new_complex_ptr)(float real_val, float imag_val);
extern mlx_array (*mlx_array_new_data_ptr)(const void* data, const int* shape, int dim, mlx_dtype dtype);
extern mlx_array (*mlx_array_new_data_managed_ptr)(void* data, const int* shape, int dim, mlx_dtype dtype, void (*dtor)(void*));
extern mlx_array (*mlx_array_new_data_managed_payload_ptr)(void* data, const int* shape, int dim, mlx_dtype dtype, void* payload, void (*dtor)(void*));
extern int (*mlx_array_set_ptr)(mlx_array* arr, const mlx_array src);
extern int (*mlx_array_set_bool_ptr)(mlx_array* arr, bool val);
extern int (*mlx_array_set_int_ptr)(mlx_array* arr, int val);
@@ -631,7 +645,7 @@ extern int (*mlx_array_item_int32_ptr)(int32_t* res, const mlx_array arr);
extern int (*mlx_array_item_int64_ptr)(int64_t* res, const mlx_array arr);
extern int (*mlx_array_item_float32_ptr)(float* res, const mlx_array arr);
extern int (*mlx_array_item_float64_ptr)(double* res, const mlx_array arr);
extern int (*mlx_array_item_complex64_ptr)(float _Complex* res, const mlx_array arr);
extern int (*mlx_array_item_complex64_ptr)(mlx_complex64_t* res, const mlx_array arr);
#if defined(__aarch64__) || defined(_M_ARM64)
extern int (*mlx_array_item_float16_ptr)(float16_t* res, const mlx_array arr);
#endif
@@ -649,7 +663,7 @@ extern const int32_t* (*mlx_array_data_int32_ptr)(const mlx_array arr);
extern const int64_t* (*mlx_array_data_int64_ptr)(const mlx_array arr);
extern const float* (*mlx_array_data_float32_ptr)(const mlx_array arr);
extern const double* (*mlx_array_data_float64_ptr)(const mlx_array arr);
extern const float _Complex* (*mlx_array_data_complex64_ptr)(const mlx_array arr);
extern const mlx_complex64_t* (*mlx_array_data_complex64_ptr)(const mlx_array arr);
#if defined(__aarch64__) || defined(_M_ARM64)
extern const float16_t* (*mlx_array_data_float16_ptr)(const mlx_array arr);
#endif
@@ -705,6 +719,7 @@ extern int (*mlx_detail_compile_erase_ptr)(uintptr_t fun_id);
extern int (*mlx_disable_compile_ptr)(void);
extern int (*mlx_enable_compile_ptr)(void);
extern int (*mlx_set_compile_mode_ptr)(mlx_compile_mode mode);
extern int (*mlx_cuda_is_available_ptr)(bool* res);
extern mlx_device (*mlx_device_new_ptr)(void);
extern mlx_device (*mlx_device_new_type_ptr)(mlx_device_type type, int index);
extern int (*mlx_device_free_ptr)(mlx_device dev);
@@ -715,6 +730,16 @@ extern int (*mlx_device_get_index_ptr)(int* index, mlx_device dev);
extern int (*mlx_device_get_type_ptr)(mlx_device_type* type, mlx_device dev);
extern int (*mlx_get_default_device_ptr)(mlx_device* dev);
extern int (*mlx_set_default_device_ptr)(mlx_device dev);
extern int (*mlx_device_is_available_ptr)(bool* avail, mlx_device dev);
extern int (*mlx_device_count_ptr)(int* count, mlx_device_type type);
extern mlx_device_info (*mlx_device_info_new_ptr)(void);
extern int (*mlx_device_info_get_ptr)(mlx_device_info* info, mlx_device dev);
extern int (*mlx_device_info_free_ptr)(mlx_device_info info);
extern int (*mlx_device_info_has_key_ptr)(bool* exists, mlx_device_info info, const char* key);
extern int (*mlx_device_info_is_string_ptr)(bool* is_string, mlx_device_info info, const char* key);
extern int (*mlx_device_info_get_string_ptr)(const char** value, mlx_device_info info, const char* key);
extern int (*mlx_device_info_get_size_ptr)(size_t* value, mlx_device_info info, const char* key);
extern int (*mlx_device_info_get_keys_ptr)(mlx_vector_string* keys, mlx_device_info info);
extern int (*mlx_distributed_all_gather_ptr)(mlx_array* res, const mlx_array x, const mlx_distributed_group group , const mlx_stream S);
extern int (*mlx_distributed_all_max_ptr)(mlx_array* res, const mlx_array x, const mlx_distributed_group group , const mlx_stream s);
extern int (*mlx_distributed_all_min_ptr)(mlx_array* res, const mlx_array x, const mlx_distributed_group group , const mlx_stream s);
@@ -845,7 +870,6 @@ extern int (*mlx_reset_peak_memory_ptr)(void);
extern int (*mlx_set_cache_limit_ptr)(size_t* res, size_t limit);
extern int (*mlx_set_memory_limit_ptr)(size_t* res, size_t limit);
extern int (*mlx_set_wired_limit_ptr)(size_t* res, size_t limit);
extern mlx_metal_device_info_t (*mlx_metal_device_info_ptr)(void);
extern int (*mlx_metal_is_available_ptr)(bool* res);
extern int (*mlx_metal_start_capture_ptr)(const char* path);
extern int (*mlx_metal_stop_capture_ptr)(void);
@@ -1202,6 +1226,10 @@ mlx_array mlx_array_new_complex(float real_val, float imag_val);
mlx_array mlx_array_new_data(const void* data, const int* shape, int dim, mlx_dtype dtype);
mlx_array mlx_array_new_data_managed(void* data, const int* shape, int dim, mlx_dtype dtype, void (*dtor)(void*));
mlx_array mlx_array_new_data_managed_payload(void* data, const int* shape, int dim, mlx_dtype dtype, void* payload, void (*dtor)(void*));
int mlx_array_set(mlx_array* arr, const mlx_array src);
int mlx_array_set_bool(mlx_array* arr, bool val);
@@ -1260,7 +1288,7 @@ int mlx_array_item_float32(float* res, const mlx_array arr);
int mlx_array_item_float64(double* res, const mlx_array arr);
int mlx_array_item_complex64(float _Complex* res, const mlx_array arr);
int mlx_array_item_complex64(mlx_complex64_t* res, const mlx_array arr);
#if defined(__aarch64__) || defined(_M_ARM64)
int mlx_array_item_float16(float16_t* res, const mlx_array arr);
@@ -1292,7 +1320,7 @@ const float* mlx_array_data_float32(const mlx_array arr);
const double* mlx_array_data_float64(const mlx_array arr);
const float _Complex* mlx_array_data_complex64(const mlx_array arr);
const mlx_complex64_t* mlx_array_data_complex64(const mlx_array arr);
#if defined(__aarch64__) || defined(_M_ARM64)
const float16_t* mlx_array_data_float16(const mlx_array arr);
@@ -1400,6 +1428,8 @@ int mlx_enable_compile(void);
int mlx_set_compile_mode(mlx_compile_mode mode);
int mlx_cuda_is_available(bool* res);
mlx_device mlx_device_new(void);
mlx_device mlx_device_new_type(mlx_device_type type, int index);
@@ -1420,6 +1450,26 @@ int mlx_get_default_device(mlx_device* dev);
int mlx_set_default_device(mlx_device dev);
int mlx_device_is_available(bool* avail, mlx_device dev);
int mlx_device_count(int* count, mlx_device_type type);
mlx_device_info mlx_device_info_new(void);
int mlx_device_info_get(mlx_device_info* info, mlx_device dev);
int mlx_device_info_free(mlx_device_info info);
int mlx_device_info_has_key(bool* exists, mlx_device_info info, const char* key);
int mlx_device_info_is_string(bool* is_string, mlx_device_info info, const char* key);
int mlx_device_info_get_string(const char** value, mlx_device_info info, const char* key);
int mlx_device_info_get_size(size_t* value, mlx_device_info info, const char* key);
int mlx_device_info_get_keys(mlx_vector_string* keys, mlx_device_info info);
int mlx_distributed_all_gather(mlx_array* res, const mlx_array x, const mlx_distributed_group group , const mlx_stream S);
int mlx_distributed_all_max(mlx_array* res, const mlx_array x, const mlx_distributed_group group , const mlx_stream s);
@@ -1680,8 +1730,6 @@ int mlx_set_memory_limit(size_t* res, size_t limit);
int mlx_set_wired_limit(size_t* res, size_t limit);
mlx_metal_device_info_t mlx_metal_device_info(void);
int mlx_metal_is_available(bool* res);
int mlx_metal_start_capture(const char* path);


@@ -37,9 +37,11 @@ func QuantizationParams(quantization string) (groupSize, bits int, mode string)
case "MXFP8":
// Microsoft MX FP8: group_size=32, bits=8, E4M3 scales (no qbias)
return 32, 8, "mxfp8"
case "FP8", "Q8", "INT8", "":
case "FP8", "Q8", "INT8":
// 8-bit quantization with affine mode (default for quantized models)
return 64, 8, "affine"
case "":
return 0, 0, ""
default:
return 32, 8, "affine" // Default to affine
}
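With this hunk, an empty quantization string returns zero values instead of falling through to the 8-bit affine default; a standalone sketch of the resulting switch (mirroring only the hunk, not the full function):

```go
package main

import "fmt"

// quantizationParams mirrors the switch in the hunk: an empty
// quantization string now means "no quantization" (zero values)
// instead of falling through to the 8-bit affine default.
func quantizationParams(q string) (groupSize, bits int, mode string) {
	switch q {
	case "MXFP8":
		return 32, 8, "mxfp8"
	case "FP8", "Q8", "INT8":
		return 64, 8, "affine"
	case "":
		return 0, 0, ""
	default:
		return 32, 8, "affine"
	}
}

func main() {
	g, b, m := quantizationParams("")
	fmt.Printf("%d %d %q\n", g, b, m)
}
```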


@@ -3,94 +3,110 @@
package mlxrunner
import (
"fmt"
"log/slog"
"github.com/ollama/ollama/logutil"
"github.com/ollama/ollama/x/mlxrunner/cache"
"github.com/ollama/ollama/x/mlxrunner/mlx"
"github.com/ollama/ollama/x/mlxrunner/model/base"
)
type CacheEntry struct {
Caches []cache.Cache
Count int
Entries map[int32]*CacheEntry
type kvCache struct {
// For now we only support a single entry, so this is just one sequence
tokens []int32
caches []cache.Cache
}
func (s Runner) FindNearestCache(tokens []int32) ([]cache.Cache, []int32) {
current := &CacheEntry{Entries: s.CacheEntries}
index, cacheIndex := 0, -1
for _, token := range tokens {
if _, ok := current.Entries[token]; !ok {
break
}
// cacheSession manages caches for a single pipeline run.
// Callers should append generated tokens to outputs and
// defer close to save the cache state.
type cacheSession struct {
cache *kvCache
inputs []int32
outputs []int32
current = current.Entries[token]
if len(current.Caches) > 0 {
cacheIndex = index
}
index += 1
}
if cacheIndex == len(tokens)-1 {
slog.Info("Cache hit", "type", "exact", "total", len(tokens), "cached", len(tokens), "left", len(tokens))
return current.Caches, []int32{}
} else if cacheIndex > 1 {
slog.Info("Cache hit", "type", "partial", "total", len(tokens), "cached", cacheIndex+1, "left", len(tokens[cacheIndex+1:]))
return current.Caches, tokens[cacheIndex+1:]
} else if index > 0 && cacheIndex < 0 {
type stackItem struct {
entry *CacheEntry
tokens []int32
}
var best, item stackItem
stack := []stackItem{{entry: current, tokens: []int32{}}}
for len(stack) > 0 {
item, stack = stack[len(stack)-1], stack[:len(stack)-1]
if len(item.entry.Caches) > 0 {
if len(best.tokens) == 0 || len(item.tokens) < len(best.tokens) {
best = item
}
} else {
for token, entry := range item.entry.Entries {
stack = append(stack, stackItem{
entry: entry,
tokens: append(item.tokens, token),
})
}
}
}
prefix := min(len(tokens)-1, index)
caches := make([]cache.Cache, len(best.entry.Caches))
trim := len(best.tokens) + 1
for i := range caches {
caches[i] = best.entry.Caches[i].Clone()
caches[i].Trim(trim)
}
slog.Info("Cache hit", "type", "prefix", "total", len(tokens), "cached", prefix, "left", len(tokens[prefix:]), "trimmed", trim)
return caches, tokens[prefix:]
}
slog.Info("Cache miss", "left", len(tokens))
return nil, tokens
caches []cache.Cache
remaining []int32
}
func (s *Runner) InsertCache(tokens []int32, caches []cache.Cache) {
current := &CacheEntry{Entries: s.CacheEntries}
for _, token := range tokens {
if _, ok := current.Entries[token]; !ok {
current.Entries[token] = &CacheEntry{
Entries: make(map[int32]*CacheEntry),
// begin prepares caches for a new request. It finds the nearest
// matching cache or creates new caches if none match.
func (c *kvCache) begin(m base.Model, inputs []int32) *cacheSession {
if len(c.caches) == 0 {
if cacheFactory, ok := m.(interface{ NewCaches() []cache.Cache }); ok {
c.caches = cacheFactory.NewCaches()
} else {
c.caches = make([]cache.Cache, m.NumLayers())
for i := range c.caches {
c.caches[i] = cache.NewKVCache()
}
}
current = current.Entries[token]
}
if len(current.Caches) > 0 {
current.Count += 1
remaining := c.findRemaining(inputs)
return &cacheSession{
cache: c,
inputs: inputs,
caches: c.caches,
remaining: remaining,
}
}
// close saves the token state if the forward pass ran.
func (s *cacheSession) close() {
if offset := s.caches[0].Offset(); offset > 0 {
// Ensure that if we have run the forward pass and set the metadata
// that we also actually have the data
arrays := make([]*mlx.Array, 0, 2*len(s.caches))
for _, c := range s.caches {
k, v := c.State()
arrays = append(arrays, k, v)
}
mlx.AsyncEval(arrays...)
s.cache.tokens = append(s.inputs, s.outputs...)[:offset]
}
}
// findRemaining finds the longest common prefix between tokens and the cached
// sequence, trims stale cache entries, and returns the remaining tokens.
func (c *kvCache) findRemaining(tokens []int32) []int32 {
prefix := 0
for prefix < len(tokens) && prefix < len(c.tokens) && tokens[prefix] == c.tokens[prefix] {
prefix++
}
if prefix == len(tokens) && prefix > 0 {
// Leave one token to run through the model so we can sample a response.
prefix--
}
if prefix < len(c.tokens) {
trim := len(c.tokens) - prefix
for _, kv := range c.caches {
kv.Trim(trim)
}
c.tokens = c.tokens[:prefix]
}
if prefix == 0 {
slog.Info("Cache miss", "left", len(tokens))
} else {
current.Caches = caches
slog.Info("Cache hit", "total", len(tokens), "cached", prefix, "left", len(tokens[prefix:]))
}
return tokens[prefix:]
}
func (c *kvCache) log() {
if len(c.caches) == 0 {
return
}
var totalBytes int
for _, kv := range c.caches {
k, v := kv.State()
totalBytes += k.NumBytes() + v.NumBytes()
}
logutil.Trace(fmt.Sprintf("kv cache tokens: %d, size: %s", c.caches[0].Offset(), mlx.PrettyBytes(totalBytes)))
}
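The prefix-matching logic above, including the crash fix that leaves one token behind on an exact cache hit, can be sketched without the MLX cache types (the trimming of stale cache entries is omitted here):

```go
package main

import "fmt"

// findRemaining mirrors the prefix matching above, minus the actual
// cache trimming: it returns the matched prefix length and the tokens
// still to be processed. On an exact hit it deliberately leaves one
// token so the forward pass has something to run (the crash fix).
func findRemaining(tokens, cached []int32) (prefix int, remaining []int32) {
	for prefix < len(tokens) && prefix < len(cached) && tokens[prefix] == cached[prefix] {
		prefix++
	}
	if prefix == len(tokens) && prefix > 0 {
		prefix-- // keep at least one token to sample from
	}
	return prefix, tokens[prefix:]
}

func main() {
	// Exact match: one token is still left to run.
	p, rem := findRemaining([]int32{1, 2, 3}, []int32{1, 2, 3})
	fmt.Println(p, rem)
	// Divergence after two tokens: the tail is reprocessed.
	p, rem = findRemaining([]int32{1, 2, 9}, []int32{1, 2, 3})
	fmt.Println(p, rem)
}
```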


@@ -3,8 +3,7 @@
package cache
import (
"log/slog"
"github.com/ollama/ollama/logutil"
"github.com/ollama/ollama/x/mlxrunner/mlx"
)
@@ -13,6 +12,7 @@ type Cache interface {
State() (keys, values *mlx.Array)
Trim(int) int
Clone() Cache
Free()
Offset() int
Len() int
}
@@ -47,6 +47,7 @@ func (c *KVCache) Update(keys, values *mlx.Array) (*mlx.Array, *mlx.Array) {
c.values.Set(c.values.Concatenate(2, newValues))
} else {
c.keys, c.values = newKeys, newValues
mlx.Pin(c.keys, c.values)
}
}
@@ -73,12 +74,19 @@ func (c *KVCache) Trim(n int) int {
}
func (c *KVCache) Clone() Cache {
return &KVCache{
clone := &KVCache{
keys: c.keys.Clone(),
values: c.values.Clone(),
offset: c.offset,
step: c.step,
}
mlx.Pin(clone.keys, clone.values)
return clone
}
func (c *KVCache) Free() {
mlx.Unpin(c.keys, c.values)
c.keys, c.values = nil, nil
}
func (c *KVCache) Offset() int { return c.offset }
@@ -104,9 +112,10 @@ func (c *RotatingKVCache) Update(keys, values *mlx.Array) (*mlx.Array, *mlx.Arra
}
func (c *RotatingKVCache) concat(keys, values *mlx.Array) (newK *mlx.Array, newV *mlx.Array) {
slog.Debug("(*RotatingKVCache).concat", "keys_dim", keys.Dims(), "values_dim", values.Dims(), "offset", c.offset, "idx", c.idx, "max_size", c.maxSize)
logutil.Trace("(*RotatingKVCache).concat", "keys_dim", keys.Dims(), "values_dim", values.Dims(), "offset", c.offset, "idx", c.idx, "max_size", c.maxSize)
if c.keys == nil {
c.keys, c.values = keys, values
c.keys, c.values = keys.Clone(), values.Clone()
mlx.Pin(c.keys, c.values)
} else {
if c.idx < c.keys.Dim(2) {
c.keys.Set(c.keys.Slice(mlx.Slice(), mlx.Slice(), mlx.Slice(0, c.idx), mlx.Slice()))
@@ -130,7 +139,7 @@ func (c *RotatingKVCache) concat(keys, values *mlx.Array) (newK *mlx.Array, newV
}
func (c *RotatingKVCache) update(keys, values *mlx.Array) (*mlx.Array, *mlx.Array) {
slog.Debug("(*RotatingKVCache).update", "keys_dim", keys.Dims(), "values_dim", values.Dims(), "offset", c.offset, "idx", c.idx, "max_size", c.maxSize)
logutil.Trace("(*RotatingKVCache).update", "keys_dim", keys.Dims(), "values_dim", values.Dims(), "offset", c.offset, "idx", c.idx, "max_size", c.maxSize)
B, H, L, Dk, Dv := keys.Dim(0), keys.Dim(1), keys.Dim(2), keys.Dim(3), values.Dim(3)
prev := c.offset
@@ -145,6 +154,7 @@ func (c *RotatingKVCache) update(keys, values *mlx.Array) (*mlx.Array, *mlx.Arra
c.values.Set(c.values.Concatenate(2, newValues))
} else {
c.keys, c.values = newKeys, newValues
mlx.Pin(c.keys, c.values)
}
c.idx = prev
}


@@ -119,16 +119,13 @@ func NewClient(modelName string) (*Client, error) {
stdout, _ := cmd.StdoutPipe()
stderr, _ := cmd.StderrPipe()
go func() {
scanner := bufio.NewScanner(stdout)
for scanner.Scan() {
slog.Info("mlx-runner", "msg", scanner.Text())
}
io.Copy(os.Stderr, stdout) //nolint:errcheck
}()
go func() {
scanner := bufio.NewScanner(stderr)
for scanner.Scan() {
line := scanner.Text()
slog.Warn("mlx-runner", "msg", line)
fmt.Fprintln(os.Stderr, line)
c.lastErrLock.Lock()
c.lastErr = line
c.lastErrLock.Unlock()


@@ -15,7 +15,7 @@ set(CMAKE_INSTALL_RPATH "@loader_path")
include(FetchContent)
set(MLX_C_GIT_TAG "v0.4.1" CACHE STRING "")
set(MLX_C_GIT_TAG "v0.5.0" CACHE STRING "")
FetchContent_Declare(
mlx-c


@@ -7,48 +7,29 @@ import "C"
import (
"encoding/binary"
"fmt"
"log/slog"
"reflect"
"sort"
"strings"
"time"
"unsafe"
"github.com/ollama/ollama/logutil"
)
type tensorDesc struct {
name string
inputs []*Array
numRefs int
}
func (d tensorDesc) LogValue() slog.Value {
return slog.GroupValue(
slog.String("name", d.name),
slog.Int("inputs", len(d.inputs)),
slog.Int("num_refs", d.numRefs),
)
}
type Array struct {
ctx C.mlx_array
desc tensorDesc
ctx C.mlx_array
name string
pinned bool
}
var arrays []*Array
// constructor utilities
func New(name string, inputs ...*Array) *Array {
t := &Array{
desc: tensorDesc{
name: name,
inputs: inputs,
},
}
for _, input := range inputs {
input.desc.numRefs++
}
logutil.Trace("New", "t", t)
func New(name string) *Array {
t := &Array{name: name}
arrays = append(arrays, t)
return t
}
@@ -133,18 +114,51 @@ func FromValues[S ~[]E, E arrayTypes](s S, shape ...int) *Array {
}
func (t *Array) Set(other *Array) {
Free(t.desc.inputs...)
other.desc.numRefs++
t.desc.inputs = []*Array{other}
C.mlx_array_set(&t.ctx, other.ctx)
}
func (t *Array) Clone() *Array {
tt := New(t.desc.name, t.desc.inputs...)
tt := New(t.name)
C.mlx_array_set(&tt.ctx, t.ctx)
return tt
}
// lifecycle utilities
// Pin marks arrays as in-use so they are retained during Sweep.
func Pin(s ...*Array) {
for _, t := range s {
if t != nil {
t.pinned = true
}
}
}
// Unpin marks arrays as no longer in-use, allowing Sweep to free them.
func Unpin(s ...*Array) {
for _, t := range s {
if t != nil {
t.pinned = false
}
}
}
// Sweep releases all unpinned arrays, primarily intermediate tensors. MLX will truly
// free them when there are no other references, including dependencies in the graph.
func Sweep() {
n := 0
for _, t := range arrays {
if t.pinned && t.Valid() {
arrays[n] = t
n++
} else if t.Valid() {
C.mlx_array_free(t.ctx)
t.ctx.ctx = nil
}
}
arrays = arrays[:n]
}
// misc. utilities
func (t *Array) Valid() bool {
@@ -159,7 +173,10 @@ func (t *Array) String() string {
}
func (t *Array) LogValue() slog.Value {
attrs := []slog.Attr{slog.Any("", t.desc)}
attrs := []slog.Attr{
slog.String("name", t.name),
slog.Bool("pinned", t.pinned),
}
if t.Valid() {
attrs = append(attrs,
slog.Any("dtype", t.DType()),
@@ -238,37 +255,15 @@ func (t Array) Save(name string) error {
return nil
}
func Free(s ...*Array) (n int) {
now := time.Now()
defer func() {
if n > 0 {
logutil.Trace("Freed tensors", "num_bytes", PrettyBytes(n), "took", time.Since(now))
}
}()
// LogArrays logs all live arrays, sorted by size
func LogArrays() {
sort.Slice(arrays, func(i, j int) bool {
return arrays[i].NumBytes() > arrays[j].NumBytes()
})
free := make([]*Array, 0, 8192)
fn := func(t *Array) {
if t.Valid() {
t.desc.numRefs--
if t.desc.numRefs <= 0 {
free = append(free, t.desc.inputs...)
logutil.Trace("Free", "t", t)
n += t.NumBytes()
C.mlx_array_free(t.ctx)
t.ctx.ctx = nil
}
}
for _, t := range arrays {
nb := t.NumBytes()
logutil.Trace(fmt.Sprintf("tensor %-60s %5s %5s %v", t.name, t.DType(), PrettyBytes(nb), t.Dims()))
}
for _, t := range s {
fn(t)
}
for len(free) > 0 {
tail := free[len(free)-1]
free = free[:len(free)-1]
fn(tail)
}
return n
logutil.Trace(fmt.Sprintf("tensors total: %d, size: %s", len(arrays), PrettyBytes(ActiveMemory())))
}


@@ -20,7 +20,7 @@ func ScaledDotProductAttention(query, key, value, mask *Array, scale float32) *A
cMode := C.CString(mode)
defer C.free(unsafe.Pointer(cMode))
out := New("FAST_SDPA", query, key, value, mask, sinks)
out := New("FAST_SDPA")
C.mlx_fast_scaled_dot_product_attention(&out.ctx, query.ctx, key.ctx, value.ctx, C.float(scale), cMode, mask.ctx, sinks.ctx, DefaultStream().ctx)
return out
}
@@ -31,7 +31,7 @@ type LayerNorm struct {
}
func (r *LayerNorm) Forward(x *Array, eps float32) *Array {
out := New("FAST_LAYERNORM", x)
out := New("FAST_LAYERNORM")
C.mlx_fast_layer_norm(&out.ctx, x.ctx, r.Weight.ctx, r.Bias.ctx, C.float(eps), DefaultStream().ctx)
return out
}
@@ -41,7 +41,7 @@ type RMSNorm struct {
}
func (r RMSNorm) Forward(x *Array, eps float32) *Array {
out := New("FAST_RMSNORM", x)
out := New("FAST_RMSNORM")
C.mlx_fast_rms_norm(&out.ctx, x.ctx, r.Weight.ctx, C.float(eps), DefaultStream().ctx)
return out
}
@@ -55,7 +55,7 @@ type RoPE struct {
func (r RoPE) Forward(t *Array, offset int) *Array {
freqs := New("")
out := New("FAST_ROPE", t, freqs)
out := New("FAST_ROPE")
C.mlx_fast_rope(
&out.ctx,
t.ctx,


@@ -22,6 +22,19 @@ mlx_array (*mlx_array_new_data_)(
const int* shape,
int dim,
mlx_dtype dtype) = NULL;
mlx_array (*mlx_array_new_data_managed_)(
void* data,
const int* shape,
int dim,
mlx_dtype dtype,
void (*dtor)(void*)) = NULL;
mlx_array (*mlx_array_new_data_managed_payload_)(
void* data,
const int* shape,
int dim,
mlx_dtype dtype,
void* payload,
void (*dtor)(void*)) = NULL;
int (*mlx_array_set_)(mlx_array* arr, const mlx_array src) = NULL;
int (*mlx_array_set_bool_)(mlx_array* arr, bool val) = NULL;
int (*mlx_array_set_int_)(mlx_array* arr, int val) = NULL;
@@ -56,7 +69,7 @@ int (*mlx_array_item_int32_)(int32_t* res, const mlx_array arr) = NULL;
int (*mlx_array_item_int64_)(int64_t* res, const mlx_array arr) = NULL;
int (*mlx_array_item_float32_)(float* res, const mlx_array arr) = NULL;
int (*mlx_array_item_float64_)(double* res, const mlx_array arr) = NULL;
int (*mlx_array_item_complex64_)(float _Complex* res, const mlx_array arr) = NULL;
int (*mlx_array_item_complex64_)(mlx_complex64_t* res, const mlx_array arr) = NULL;
int (*mlx_array_item_float16_)(float16_t* res, const mlx_array arr) = NULL;
int (*mlx_array_item_bfloat16_)(bfloat16_t* res, const mlx_array arr) = NULL;
const bool * (*mlx_array_data_bool_)(const mlx_array arr) = NULL;
@@ -70,7 +83,7 @@ const int32_t * (*mlx_array_data_int32_)(const mlx_array arr) = NULL;
const int64_t * (*mlx_array_data_int64_)(const mlx_array arr) = NULL;
const float * (*mlx_array_data_float32_)(const mlx_array arr) = NULL;
const double * (*mlx_array_data_float64_)(const mlx_array arr) = NULL;
const float _Complex * (*mlx_array_data_complex64_)(const mlx_array arr) = NULL;
const mlx_complex64_t * (*mlx_array_data_complex64_)(const mlx_array arr) = NULL;
const float16_t * (*mlx_array_data_float16_)(const mlx_array arr) = NULL;
const bfloat16_t * (*mlx_array_data_bfloat16_)(const mlx_array arr) = NULL;
int (*_mlx_array_is_available_)(bool* res, const mlx_array arr) = NULL;
@@ -94,10 +107,11 @@ int (*mlx_closure_apply_)(
mlx_closure (*mlx_closure_new_unary_)(int (*fun)(mlx_array*, const mlx_array)) = NULL;
mlx_closure_kwargs (*mlx_closure_kwargs_new_)(void) = NULL;
int (*mlx_closure_kwargs_free_)(mlx_closure_kwargs cls) = NULL;
mlx_closure_kwargs (*mlx_closure_kwargs_new_func_)(int (*fun)(
mlx_vector_array*,
const mlx_vector_array,
const mlx_map_string_to_array)) = NULL;
mlx_closure_kwargs (*mlx_closure_kwargs_new_func_)(
int (*fun)(
mlx_vector_array*,
const mlx_vector_array,
const mlx_map_string_to_array)) = NULL;
mlx_closure_kwargs (*mlx_closure_kwargs_new_func_payload_)(
int (*fun)(
mlx_vector_array*,
@@ -136,11 +150,12 @@ int (*mlx_closure_value_and_grad_apply_)(
const mlx_vector_array input) = NULL;
mlx_closure_custom (*mlx_closure_custom_new_)(void) = NULL;
int (*mlx_closure_custom_free_)(mlx_closure_custom cls) = NULL;
mlx_closure_custom (*mlx_closure_custom_new_func_)(int (*fun)(
mlx_vector_array*,
const mlx_vector_array,
const mlx_vector_array,
const mlx_vector_array)) = NULL;
mlx_closure_custom (*mlx_closure_custom_new_func_)(
int (*fun)(
mlx_vector_array*,
const mlx_vector_array,
const mlx_vector_array,
const mlx_vector_array)) = NULL;
mlx_closure_custom (*mlx_closure_custom_new_func_payload_)(
int (*fun)(
mlx_vector_array*,
@@ -161,12 +176,13 @@ int (*mlx_closure_custom_apply_)(
const mlx_vector_array input_2) = NULL;
mlx_closure_custom_jvp (*mlx_closure_custom_jvp_new_)(void) = NULL;
int (*mlx_closure_custom_jvp_free_)(mlx_closure_custom_jvp cls) = NULL;
mlx_closure_custom_jvp (*mlx_closure_custom_jvp_new_func_)(int (*fun)(
mlx_vector_array*,
const mlx_vector_array,
const mlx_vector_array,
const int*,
size_t _num)) = NULL;
mlx_closure_custom_jvp (*mlx_closure_custom_jvp_new_func_)(
int (*fun)(
mlx_vector_array*,
const mlx_vector_array,
const mlx_vector_array,
const int*,
size_t _num)) = NULL;
mlx_closure_custom_jvp (*mlx_closure_custom_jvp_new_func_payload_)(
int (*fun)(
mlx_vector_array*,
@@ -189,12 +205,13 @@ int (*mlx_closure_custom_jvp_apply_)(
size_t input_2_num) = NULL;
mlx_closure_custom_vmap (*mlx_closure_custom_vmap_new_)(void) = NULL;
int (*mlx_closure_custom_vmap_free_)(mlx_closure_custom_vmap cls) = NULL;
mlx_closure_custom_vmap (*mlx_closure_custom_vmap_new_func_)(int (*fun)(
mlx_vector_array*,
mlx_vector_int*,
const mlx_vector_array,
const int*,
size_t _num)) = NULL;
mlx_closure_custom_vmap (*mlx_closure_custom_vmap_new_func_)(
int (*fun)(
mlx_vector_array*,
mlx_vector_int*,
const mlx_vector_array,
const int*,
size_t _num)) = NULL;
mlx_closure_custom_vmap (*mlx_closure_custom_vmap_new_func_payload_)(
int (*fun)(
mlx_vector_array*,
@@ -228,6 +245,7 @@ int (*mlx_detail_compile_erase_)(uintptr_t fun_id) = NULL;
int (*mlx_disable_compile_)(void) = NULL;
int (*mlx_enable_compile_)(void) = NULL;
int (*mlx_set_compile_mode_)(mlx_compile_mode mode) = NULL;
int (*mlx_cuda_is_available_)(bool* res) = NULL;
mlx_device (*mlx_device_new_)(void) = NULL;
mlx_device (*mlx_device_new_type_)(mlx_device_type type, int index) = NULL;
int (*mlx_device_free_)(mlx_device dev) = NULL;
@@ -238,11 +256,28 @@ int (*mlx_device_get_index_)(int* index, mlx_device dev) = NULL;
int (*mlx_device_get_type_)(mlx_device_type* type, mlx_device dev) = NULL;
int (*mlx_get_default_device_)(mlx_device* dev) = NULL;
int (*mlx_set_default_device_)(mlx_device dev) = NULL;
int (*mlx_distributed_group_rank_)(mlx_distributed_group group) = NULL;
int (*mlx_distributed_group_size_)(mlx_distributed_group group) = NULL;
mlx_distributed_group (*mlx_distributed_group_split_)(mlx_distributed_group group, int color, int key) = NULL;
bool (*mlx_distributed_is_available_)(void) = NULL;
mlx_distributed_group (*mlx_distributed_init_)(bool strict) = NULL;
int (*mlx_device_is_available_)(bool* avail, mlx_device dev) = NULL;
int (*mlx_device_count_)(int* count, mlx_device_type type) = NULL;
mlx_device_info (*mlx_device_info_new_)(void) = NULL;
int (*mlx_device_info_get_)(mlx_device_info* info, mlx_device dev) = NULL;
int (*mlx_device_info_free_)(mlx_device_info info) = NULL;
int (*mlx_device_info_has_key_)(
bool* exists,
mlx_device_info info,
const char* key) = NULL;
int (*mlx_device_info_is_string_)(
bool* is_string,
mlx_device_info info,
const char* key) = NULL;
int (*mlx_device_info_get_string_)(
const char** value,
mlx_device_info info,
const char* key) = NULL;
int (*mlx_device_info_get_size_)(
size_t* value,
mlx_device_info info,
const char* key) = NULL;
int (*mlx_device_info_get_keys_)(mlx_vector_string* keys, mlx_device_info info) = NULL;
int (*mlx_distributed_all_gather_)(
mlx_array* res,
const mlx_array x,
@@ -288,6 +323,11 @@ int (*mlx_distributed_sum_scatter_)(
const mlx_array x,
const mlx_distributed_group group /* may be null */,
const mlx_stream s) = NULL;
int (*mlx_distributed_group_rank_)(mlx_distributed_group group) = NULL;
int (*mlx_distributed_group_size_)(mlx_distributed_group group) = NULL;
mlx_distributed_group (*mlx_distributed_group_split_)(mlx_distributed_group group, int color, int key) = NULL;
bool (*mlx_distributed_is_available_)(void) = NULL;
mlx_distributed_group (*mlx_distributed_init_)(bool strict) = NULL;
void (*mlx_set_error_handler_)(
mlx_error_handler_func handler,
void* data,
@@ -450,6 +490,16 @@ int (*mlx_fast_rope_)(
int offset,
const mlx_array freqs /* may be null */,
const mlx_stream s) = NULL;
int (*mlx_fast_rope_dynamic_)(
mlx_array* res,
const mlx_array x,
int dims,
bool traditional,
mlx_optional_float base,
float scale,
const mlx_array offset,
const mlx_array freqs /* may be null */,
const mlx_stream s) = NULL;
int (*mlx_fast_scaled_dot_product_attention_)(
mlx_array* res,
const mlx_array queries,
@@ -560,14 +610,6 @@ int (*mlx_fft_rfftn_)(
const int* axes,
size_t axes_num,
const mlx_stream s) = NULL;
mlx_io_reader (*mlx_io_reader_new_)(void* desc, mlx_io_vtable vtable) = NULL;
int (*mlx_io_reader_descriptor_)(void** desc_, mlx_io_reader io) = NULL;
int (*mlx_io_reader_tostring_)(mlx_string* str_, mlx_io_reader io) = NULL;
int (*mlx_io_reader_free_)(mlx_io_reader io) = NULL;
mlx_io_writer (*mlx_io_writer_new_)(void* desc, mlx_io_vtable vtable) = NULL;
int (*mlx_io_writer_descriptor_)(void** desc_, mlx_io_writer io) = NULL;
int (*mlx_io_writer_tostring_)(mlx_string* str_, mlx_io_writer io) = NULL;
int (*mlx_io_writer_free_)(mlx_io_writer io) = NULL;
int (*mlx_load_reader_)(
mlx_array* res,
mlx_io_reader in_stream,
@@ -593,6 +635,14 @@ int (*mlx_save_safetensors_)(
const char* file,
const mlx_map_string_to_array param,
const mlx_map_string_to_string metadata) = NULL;
mlx_io_reader (*mlx_io_reader_new_)(void* desc, mlx_io_vtable vtable) = NULL;
int (*mlx_io_reader_descriptor_)(void** desc_, mlx_io_reader io) = NULL;
int (*mlx_io_reader_tostring_)(mlx_string* str_, mlx_io_reader io) = NULL;
int (*mlx_io_reader_free_)(mlx_io_reader io) = NULL;
mlx_io_writer (*mlx_io_writer_new_)(void* desc, mlx_io_vtable vtable) = NULL;
int (*mlx_io_writer_descriptor_)(void** desc_, mlx_io_writer io) = NULL;
int (*mlx_io_writer_tostring_)(mlx_string* str_, mlx_io_writer io) = NULL;
int (*mlx_io_writer_free_)(mlx_io_writer io) = NULL;
int (*mlx_linalg_cholesky_)(
mlx_array* res,
const mlx_array a,
@@ -733,7 +783,6 @@ int (*mlx_reset_peak_memory_)(void) = NULL;
int (*mlx_set_cache_limit_)(size_t* res, size_t limit) = NULL;
int (*mlx_set_memory_limit_)(size_t* res, size_t limit) = NULL;
int (*mlx_set_wired_limit_)(size_t* res, size_t limit) = NULL;
mlx_metal_device_info_t (*mlx_metal_device_info_)(void) = NULL;
int (*mlx_metal_is_available_)(bool* res) = NULL;
int (*mlx_metal_start_capture_)(const char* path) = NULL;
int (*mlx_metal_stop_capture_)(void) = NULL;
@@ -1162,6 +1211,14 @@ int (*mlx_gather_)(
const int* slice_sizes,
size_t slice_sizes_num,
const mlx_stream s) = NULL;
int (*mlx_gather_single_)(
mlx_array* res,
const mlx_array a,
const mlx_array indices,
int axis,
const int* slice_sizes,
size_t slice_sizes_num,
const mlx_stream s) = NULL;
int (*mlx_gather_mm_)(
mlx_array* res,
const mlx_array a,
@@ -1483,6 +1540,15 @@ int (*mlx_put_along_axis_)(
const mlx_array values,
int axis,
const mlx_stream s) = NULL;
int (*mlx_qqmm_)(
mlx_array* res,
const mlx_array x,
const mlx_array w,
const mlx_array w_scales /* may be null */,
mlx_optional_int group_size,
mlx_optional_int bits,
const char* mode,
const mlx_stream s) = NULL;
int (*mlx_quantize_)(
mlx_vector_array* res,
const mlx_array w,
@@ -1566,6 +1632,13 @@ int (*mlx_scatter_)(
const int* axes,
size_t axes_num,
const mlx_stream s) = NULL;
int (*mlx_scatter_single_)(
mlx_array* res,
const mlx_array a,
const mlx_array indices,
const mlx_array updates,
int axis,
const mlx_stream s) = NULL;
int (*mlx_scatter_add_)(
mlx_array* res,
const mlx_array a,
@@ -1574,6 +1647,13 @@ int (*mlx_scatter_add_)(
const int* axes,
size_t axes_num,
const mlx_stream s) = NULL;
int (*mlx_scatter_add_single_)(
mlx_array* res,
const mlx_array a,
const mlx_array indices,
const mlx_array updates,
int axis,
const mlx_stream s) = NULL;
int (*mlx_scatter_add_axis_)(
mlx_array* res,
const mlx_array a,
@@ -1589,6 +1669,13 @@ int (*mlx_scatter_max_)(
const int* axes,
size_t axes_num,
const mlx_stream s) = NULL;
int (*mlx_scatter_max_single_)(
mlx_array* res,
const mlx_array a,
const mlx_array indices,
const mlx_array updates,
int axis,
const mlx_stream s) = NULL;
int (*mlx_scatter_min_)(
mlx_array* res,
const mlx_array a,
@@ -1597,6 +1684,13 @@ int (*mlx_scatter_min_)(
const int* axes,
size_t axes_num,
const mlx_stream s) = NULL;
int (*mlx_scatter_min_single_)(
mlx_array* res,
const mlx_array a,
const mlx_array indices,
const mlx_array updates,
int axis,
const mlx_stream s) = NULL;
int (*mlx_scatter_prod_)(
mlx_array* res,
const mlx_array a,
@@ -1605,6 +1699,13 @@ int (*mlx_scatter_prod_)(
const int* axes,
size_t axes_num,
const mlx_stream s) = NULL;
int (*mlx_scatter_prod_single_)(
mlx_array* res,
const mlx_array a,
const mlx_array indices,
const mlx_array updates,
int axis,
const mlx_stream s) = NULL;
int (*mlx_segmented_mm_)(
mlx_array* res,
const mlx_array a,
@@ -2028,22 +2129,6 @@ mlx_string (*mlx_string_new_data_)(const char* str) = NULL;
int (*mlx_string_set_)(mlx_string* str, const mlx_string src) = NULL;
const char * (*mlx_string_data_)(mlx_string str) = NULL;
int (*mlx_string_free_)(mlx_string str) = NULL;
int (*mlx_detail_vmap_replace_)(
mlx_vector_array* res,
const mlx_vector_array inputs,
const mlx_vector_array s_inputs,
const mlx_vector_array s_outputs,
const int* in_axes,
size_t in_axes_num,
const int* out_axes,
size_t out_axes_num) = NULL;
int (*mlx_detail_vmap_trace_)(
mlx_vector_array* res_0,
mlx_vector_array* res_1,
const mlx_closure fun,
const mlx_vector_array inputs,
const int* in_axes,
size_t in_axes_num) = NULL;
int (*mlx_async_eval_)(const mlx_vector_array outputs) = NULL;
int (*mlx_checkpoint_)(mlx_closure* res, const mlx_closure fun) = NULL;
int (*mlx_custom_function_)(
@@ -2074,6 +2159,22 @@ int (*mlx_vjp_)(
const mlx_closure fun,
const mlx_vector_array primals,
const mlx_vector_array cotangents) = NULL;
int (*mlx_detail_vmap_replace_)(
mlx_vector_array* res,
const mlx_vector_array inputs,
const mlx_vector_array s_inputs,
const mlx_vector_array s_outputs,
const int* in_axes,
size_t in_axes_num,
const int* out_axes,
size_t out_axes_num) = NULL;
int (*mlx_detail_vmap_trace_)(
mlx_vector_array* res_0,
mlx_vector_array* res_1,
const mlx_closure fun,
const mlx_vector_array inputs,
const int* in_axes,
size_t in_axes_num) = NULL;
mlx_vector_array (*mlx_vector_array_new_)(void) = NULL;
int (*mlx_vector_array_set_)(mlx_vector_array* vec, const mlx_vector_array src) = NULL;
int (*mlx_vector_array_free_)(mlx_vector_array vec) = NULL;
@@ -2166,6 +2267,8 @@ int mlx_dynamic_load_symbols(mlx_dynamic_handle handle) {
CHECK_LOAD(handle, mlx_array_new_double);
CHECK_LOAD(handle, mlx_array_new_complex);
CHECK_LOAD(handle, mlx_array_new_data);
CHECK_LOAD(handle, mlx_array_new_data_managed);
CHECK_LOAD(handle, mlx_array_new_data_managed_payload);
CHECK_LOAD(handle, mlx_array_set);
CHECK_LOAD(handle, mlx_array_set_bool);
CHECK_LOAD(handle, mlx_array_set_int);
@@ -2261,6 +2364,7 @@ int mlx_dynamic_load_symbols(mlx_dynamic_handle handle) {
CHECK_LOAD(handle, mlx_disable_compile);
CHECK_LOAD(handle, mlx_enable_compile);
CHECK_LOAD(handle, mlx_set_compile_mode);
CHECK_LOAD(handle, mlx_cuda_is_available);
CHECK_LOAD(handle, mlx_device_new);
CHECK_LOAD(handle, mlx_device_new_type);
CHECK_LOAD(handle, mlx_device_free);
@@ -2271,11 +2375,16 @@ int mlx_dynamic_load_symbols(mlx_dynamic_handle handle) {
CHECK_LOAD(handle, mlx_device_get_type);
CHECK_LOAD(handle, mlx_get_default_device);
CHECK_LOAD(handle, mlx_set_default_device);
CHECK_LOAD(handle, mlx_distributed_group_rank);
CHECK_LOAD(handle, mlx_distributed_group_size);
CHECK_LOAD(handle, mlx_distributed_group_split);
CHECK_LOAD(handle, mlx_distributed_is_available);
CHECK_LOAD(handle, mlx_distributed_init);
CHECK_LOAD(handle, mlx_device_is_available);
CHECK_LOAD(handle, mlx_device_count);
CHECK_LOAD(handle, mlx_device_info_new);
CHECK_LOAD(handle, mlx_device_info_get);
CHECK_LOAD(handle, mlx_device_info_free);
CHECK_LOAD(handle, mlx_device_info_has_key);
CHECK_LOAD(handle, mlx_device_info_is_string);
CHECK_LOAD(handle, mlx_device_info_get_string);
CHECK_LOAD(handle, mlx_device_info_get_size);
CHECK_LOAD(handle, mlx_device_info_get_keys);
CHECK_LOAD(handle, mlx_distributed_all_gather);
CHECK_LOAD(handle, mlx_distributed_all_max);
CHECK_LOAD(handle, mlx_distributed_all_min);
@@ -2284,6 +2393,11 @@ int mlx_dynamic_load_symbols(mlx_dynamic_handle handle) {
CHECK_LOAD(handle, mlx_distributed_recv_like);
CHECK_LOAD(handle, mlx_distributed_send);
CHECK_LOAD(handle, mlx_distributed_sum_scatter);
CHECK_LOAD(handle, mlx_distributed_group_rank);
CHECK_LOAD(handle, mlx_distributed_group_size);
CHECK_LOAD(handle, mlx_distributed_group_split);
CHECK_LOAD(handle, mlx_distributed_is_available);
CHECK_LOAD(handle, mlx_distributed_init);
CHECK_LOAD(handle, mlx_set_error_handler);
CHECK_LOAD(handle, _mlx_error);
CHECK_LOAD(handle, mlx_export_function);
@@ -2325,6 +2439,7 @@ int mlx_dynamic_load_symbols(mlx_dynamic_handle handle) {
CHECK_LOAD(handle, mlx_fast_metal_kernel_apply);
CHECK_LOAD(handle, mlx_fast_rms_norm);
CHECK_LOAD(handle, mlx_fast_rope);
CHECK_LOAD(handle, mlx_fast_rope_dynamic);
CHECK_LOAD(handle, mlx_fast_scaled_dot_product_attention);
CHECK_LOAD(handle, mlx_fft_fft);
CHECK_LOAD(handle, mlx_fft_fft2);
@@ -2340,14 +2455,6 @@ int mlx_dynamic_load_symbols(mlx_dynamic_handle handle) {
CHECK_LOAD(handle, mlx_fft_rfft);
CHECK_LOAD(handle, mlx_fft_rfft2);
CHECK_LOAD(handle, mlx_fft_rfftn);
CHECK_LOAD(handle, mlx_io_reader_new);
CHECK_LOAD(handle, mlx_io_reader_descriptor);
CHECK_LOAD(handle, mlx_io_reader_tostring);
CHECK_LOAD(handle, mlx_io_reader_free);
CHECK_LOAD(handle, mlx_io_writer_new);
CHECK_LOAD(handle, mlx_io_writer_descriptor);
CHECK_LOAD(handle, mlx_io_writer_tostring);
CHECK_LOAD(handle, mlx_io_writer_free);
CHECK_LOAD(handle, mlx_load_reader);
CHECK_LOAD(handle, mlx_load);
CHECK_LOAD(handle, mlx_load_safetensors_reader);
@@ -2356,6 +2463,14 @@ int mlx_dynamic_load_symbols(mlx_dynamic_handle handle) {
CHECK_LOAD(handle, mlx_save);
CHECK_LOAD(handle, mlx_save_safetensors_writer);
CHECK_LOAD(handle, mlx_save_safetensors);
CHECK_LOAD(handle, mlx_io_reader_new);
CHECK_LOAD(handle, mlx_io_reader_descriptor);
CHECK_LOAD(handle, mlx_io_reader_tostring);
CHECK_LOAD(handle, mlx_io_reader_free);
CHECK_LOAD(handle, mlx_io_writer_new);
CHECK_LOAD(handle, mlx_io_writer_descriptor);
CHECK_LOAD(handle, mlx_io_writer_tostring);
CHECK_LOAD(handle, mlx_io_writer_free);
CHECK_LOAD(handle, mlx_linalg_cholesky);
CHECK_LOAD(handle, mlx_linalg_cholesky_inv);
CHECK_LOAD(handle, mlx_linalg_cross);
@@ -2400,7 +2515,6 @@ int mlx_dynamic_load_symbols(mlx_dynamic_handle handle) {
CHECK_LOAD(handle, mlx_set_cache_limit);
CHECK_LOAD(handle, mlx_set_memory_limit);
CHECK_LOAD(handle, mlx_set_wired_limit);
CHECK_LOAD(handle, mlx_metal_device_info);
CHECK_LOAD(handle, mlx_metal_is_available);
CHECK_LOAD(handle, mlx_metal_start_capture);
CHECK_LOAD(handle, mlx_metal_stop_capture);
@@ -2486,6 +2600,7 @@ int mlx_dynamic_load_symbols(mlx_dynamic_handle handle) {
CHECK_LOAD(handle, mlx_full);
CHECK_LOAD(handle, mlx_full_like);
CHECK_LOAD(handle, mlx_gather);
CHECK_LOAD(handle, mlx_gather_single);
CHECK_LOAD(handle, mlx_gather_mm);
CHECK_LOAD(handle, mlx_gather_qmm);
CHECK_LOAD(handle, mlx_greater);
@@ -2550,6 +2665,7 @@ int mlx_dynamic_load_symbols(mlx_dynamic_handle handle) {
CHECK_LOAD(handle, mlx_prod_axis);
CHECK_LOAD(handle, mlx_prod);
CHECK_LOAD(handle, mlx_put_along_axis);
CHECK_LOAD(handle, mlx_qqmm);
CHECK_LOAD(handle, mlx_quantize);
CHECK_LOAD(handle, mlx_quantized_matmul);
CHECK_LOAD(handle, mlx_radians);
@@ -2566,11 +2682,16 @@ int mlx_dynamic_load_symbols(mlx_dynamic_handle handle) {
CHECK_LOAD(handle, mlx_round);
CHECK_LOAD(handle, mlx_rsqrt);
CHECK_LOAD(handle, mlx_scatter);
CHECK_LOAD(handle, mlx_scatter_single);
CHECK_LOAD(handle, mlx_scatter_add);
CHECK_LOAD(handle, mlx_scatter_add_single);
CHECK_LOAD(handle, mlx_scatter_add_axis);
CHECK_LOAD(handle, mlx_scatter_max);
CHECK_LOAD(handle, mlx_scatter_max_single);
CHECK_LOAD(handle, mlx_scatter_min);
CHECK_LOAD(handle, mlx_scatter_min_single);
CHECK_LOAD(handle, mlx_scatter_prod);
CHECK_LOAD(handle, mlx_scatter_prod_single);
CHECK_LOAD(handle, mlx_segmented_mm);
CHECK_LOAD(handle, mlx_sigmoid);
CHECK_LOAD(handle, mlx_sign);
@@ -2665,8 +2786,6 @@ int mlx_dynamic_load_symbols(mlx_dynamic_handle handle) {
CHECK_LOAD(handle, mlx_string_set);
CHECK_LOAD(handle, mlx_string_data);
CHECK_LOAD(handle, mlx_string_free);
CHECK_LOAD(handle, mlx_detail_vmap_replace);
CHECK_LOAD(handle, mlx_detail_vmap_trace);
CHECK_LOAD(handle, mlx_async_eval);
CHECK_LOAD(handle, mlx_checkpoint);
CHECK_LOAD(handle, mlx_custom_function);
@@ -2675,6 +2794,8 @@ int mlx_dynamic_load_symbols(mlx_dynamic_handle handle) {
CHECK_LOAD(handle, mlx_jvp);
CHECK_LOAD(handle, mlx_value_and_grad);
CHECK_LOAD(handle, mlx_vjp);
CHECK_LOAD(handle, mlx_detail_vmap_replace);
CHECK_LOAD(handle, mlx_detail_vmap_trace);
CHECK_LOAD(handle, mlx_vector_array_new);
CHECK_LOAD(handle, mlx_vector_array_set);
CHECK_LOAD(handle, mlx_vector_array_free);


File diff suppressed because it is too large


@@ -4,6 +4,10 @@
#define MLX_GENERATED_H
#include "dynamic.h"
{{ range .Functions }}
#define {{ .Name }} {{ .Name }}_mlx_gen_orig_
{{- end }}
#include "mlx/c/mlx.h"
{{ range .Functions }}
#undef {{ .Name }}


@@ -37,7 +37,9 @@ func Load(path string) iter.Seq2[string, *Array] {
}
name := C.GoString(key)
if !yield(name, &Array{ctx: value, desc: tensorDesc{name: name, numRefs: 1000}}) {
arr := New(name)
arr.ctx = value
if !yield(name, arr) {
break
}
}


@@ -10,43 +10,43 @@ import (
)
func (t *Array) Abs() *Array {
out := New("ABS", t)
out := New("ABS")
C.mlx_abs(&out.ctx, t.ctx, DefaultStream().ctx)
return out
}
func (t *Array) Add(other *Array) *Array {
out := New("ADD", t, other)
out := New("ADD")
C.mlx_add(&out.ctx, t.ctx, other.ctx, DefaultStream().ctx)
return out
}
func (t *Array) Addmm(a, b *Array, alpha, beta float32) *Array {
out := New("ADDMM", t, a, b)
out := New("ADDMM")
C.mlx_addmm(&out.ctx, t.ctx, a.ctx, b.ctx, C.float(alpha), C.float(beta), DefaultStream().ctx)
return out
}
func (t *Array) Argmax(axis int, keepDims bool) *Array {
out := New("ARGMAX", t)
out := New("ARGMAX")
C.mlx_argmax_axis(&out.ctx, t.ctx, C.int(axis), C.bool(keepDims), DefaultStream().ctx)
return out
}
func (t *Array) ArgpartitionAxis(kth int, axis int) *Array {
out := New("ARGPARTITION", t)
out := New("ARGPARTITION")
C.mlx_argpartition_axis(&out.ctx, t.ctx, C.int(kth), C.int(axis), DefaultStream().ctx)
return out
}
func (t *Array) ArgsortAxis(axis int) *Array {
out := New("ARGSORT_AXIS", t)
out := New("ARGSORT_AXIS")
C.mlx_argsort_axis(&out.ctx, t.ctx, C.int(axis), DefaultStream().ctx)
return out
}
func (t *Array) AsType(dtype DType) *Array {
out := New("AS_TYPE", t)
out := New("AS_TYPE")
C.mlx_astype(&out.ctx, t.ctx, C.mlx_dtype(dtype), DefaultStream().ctx)
return out
}
@@ -62,7 +62,7 @@ func (t *Array) AsStrided(shape []int, strides []int, offset int) *Array {
cStrides[i] = C.int64_t(s)
}
out := New("AS_STRIDED", t)
out := New("AS_STRIDED")
C.mlx_as_strided(
&out.ctx, t.ctx,
unsafe.SliceData(cShape), C.size_t(len(shape)),
@@ -82,31 +82,31 @@ func (t *Array) Concatenate(axis int, others ...*Array) *Array {
C.mlx_vector_array_append_value(vector, other.ctx)
}
out := New("CONCATENATE", s...)
out := New("CONCATENATE")
C.mlx_concatenate_axis(&out.ctx, vector, C.int(axis), DefaultStream().ctx)
return out
}
func (t *Array) Divide(other *Array) *Array {
out := New("DIVIDE", t, other)
out := New("DIVIDE")
C.mlx_divide(&out.ctx, t.ctx, other.ctx, DefaultStream().ctx)
return out
}
func (t *Array) ExpandDims(axis int) *Array {
out := New("EXPAND_DIMS", t)
out := New("EXPAND_DIMS")
C.mlx_expand_dims(&out.ctx, t.ctx, C.int(axis), DefaultStream().ctx)
return out
}
func (t *Array) Flatten(startAxis, endAxis int) *Array {
out := New("FLATTEN", t)
out := New("FLATTEN")
C.mlx_flatten(&out.ctx, t.ctx, C.int(startAxis), C.int(endAxis), DefaultStream().ctx)
return out
}
func (t *Array) FloorDivide(other *Array) *Array {
out := New("FLOOR_DIVIDE", t, other)
out := New("FLOOR_DIVIDE")
C.mlx_floor_divide(&out.ctx, t.ctx, other.ctx, DefaultStream().ctx)
return out
}
@@ -118,43 +118,43 @@ func (t *Array) GatherMM(other, lhs, rhs *Array, sorted bool) *Array {
if rhs == nil {
rhs = New("")
}
out := New("GATHER_MM", t, other, lhs, rhs)
out := New("GATHER_MM")
C.mlx_gather_mm(&out.ctx, t.ctx, other.ctx, lhs.ctx, rhs.ctx, C.bool(sorted), DefaultStream().ctx)
return out
}
func (t *Array) Logsumexp(keepDims bool) *Array {
out := New("LOGSUMEXP", t)
out := New("LOGSUMEXP")
C.mlx_logsumexp(&out.ctx, t.ctx, C.bool(keepDims), DefaultStream().ctx)
return out
}
func (t *Array) Matmul(other *Array) *Array {
out := New("MATMUL", t, other)
out := New("MATMUL")
C.mlx_matmul(&out.ctx, t.ctx, other.ctx, DefaultStream().ctx)
return out
}
func (t *Array) Multiply(other *Array) *Array {
out := New("MULTIPLY", t, other)
out := New("MULTIPLY")
C.mlx_multiply(&out.ctx, t.ctx, other.ctx, DefaultStream().ctx)
return out
}
func (t *Array) Negative() *Array {
out := New("NEGATIVE", t)
out := New("NEGATIVE")
C.mlx_negative(&out.ctx, t.ctx, DefaultStream().ctx)
return out
}
func (t *Array) Power(exponent *Array) *Array {
out := New("POWER", t, exponent)
out := New("POWER")
C.mlx_power(&out.ctx, t.ctx, exponent.ctx, DefaultStream().ctx)
return out
}
func (t *Array) PutAlongAxis(indices, values *Array, axis int) *Array {
out := New("PUT_ALONG_AXIS", t, indices, values)
out := New("PUT_ALONG_AXIS")
C.mlx_put_along_axis(&out.ctx, t.ctx, indices.ctx, values.ctx, C.int(axis), DefaultStream().ctx)
return out
}
@@ -165,25 +165,25 @@ func (t *Array) Reshape(axes ...int) *Array {
cAxes[i] = C.int(axes[i])
}
out := New("RESHAPE", t)
out := New("RESHAPE")
C.mlx_reshape(&out.ctx, t.ctx, unsafe.SliceData(cAxes), C.size_t(len(cAxes)), DefaultStream().ctx)
return out
}
func (t *Array) Sigmoid() *Array {
out := New("SIGMOID", t)
out := New("SIGMOID")
C.mlx_sigmoid(&out.ctx, t.ctx, DefaultStream().ctx)
return out
}
func (t *Array) Sqrt() *Array {
out := New("SQRT", t)
out := New("SQRT")
C.mlx_sqrt(&out.ctx, t.ctx, DefaultStream().ctx)
return out
}
func (t *Array) Squeeze(axis int) *Array {
out := New("SQUEEZE", t)
out := New("SQUEEZE")
C.mlx_squeeze_axis(&out.ctx, t.ctx, C.int(axis), DefaultStream().ctx)
return out
}
@@ -198,37 +198,37 @@ func (t *Array) StackAxis(axis int, others ...*Array) *Array {
vector := C.mlx_vector_array_new_data(unsafe.SliceData(vectorData), C.size_t(len(vectorData)))
defer C.mlx_vector_array_free(vector)
out := New("STACK_AXIS", append(others, t)...)
out := New("STACK_AXIS")
C.mlx_stack_axis(&out.ctx, vector, C.int(axis), DefaultStream().ctx)
return out
}
func (t *Array) Subtract(other *Array) *Array {
out := New("SUBTRACT", t, other)
out := New("SUBTRACT")
C.mlx_subtract(&out.ctx, t.ctx, other.ctx, DefaultStream().ctx)
return out
}
func (t *Array) SumAxis(axis int, keepDims bool) *Array {
out := New("SUM_AXIS", t)
out := New("SUM_AXIS")
C.mlx_sum_axis(&out.ctx, t.ctx, C.int(axis), C.bool(keepDims), DefaultStream().ctx)
return out
}
func (t *Array) TakeAxis(indices *Array, axis int) *Array {
out := New("TAKE_AXIS", t, indices)
out := New("TAKE_AXIS")
C.mlx_take_axis(&out.ctx, t.ctx, indices.ctx, C.int(axis), DefaultStream().ctx)
return out
}
func (t *Array) TakeAlongAxis(indices *Array, axis int) *Array {
out := New("TAKE_ALONG_AXIS", t, indices)
out := New("TAKE_ALONG_AXIS")
C.mlx_take_along_axis(&out.ctx, t.ctx, indices.ctx, C.int(axis), DefaultStream().ctx)
return out
}
func (t *Array) Tanh() *Array {
out := New("TANH", t)
out := New("TANH")
C.mlx_tanh(&out.ctx, t.ctx, DefaultStream().ctx)
return out
}
@@ -239,7 +239,7 @@ func (t *Array) Transpose(axes ...int) *Array {
cAxes[i] = C.int(axis)
}
out := New("TRANSPOSE", t)
out := New("TRANSPOSE")
C.mlx_transpose_axes(&out.ctx, t.ctx, unsafe.SliceData(cAxes), C.size_t(len(cAxes)), DefaultStream().ctx)
return out
}


@@ -41,14 +41,12 @@ func Dequantize(w, scales, biases *Array, groupSize, bits int, mode string) *Arr
optBits := C.mlx_optional_int{value: C.int(bits), has_value: true}
optDtype := C.mlx_optional_dtype{has_value: false}
inputs := []*Array{w, scales}
var b C.mlx_array
if biases != nil {
b = biases.ctx
inputs = append(inputs, biases)
}
out := New("DEQUANTIZE", inputs...)
out := New("DEQUANTIZE")
C.mlx_dequantize(&out.ctx, w.ctx, scales.ctx, b, optGroupSize, optBits, cMode, optDtype, DefaultStream().ctx)
return out
}
@@ -59,14 +57,12 @@ func QuantizedMatmul(x, w, scales, biases *Array, transpose bool, groupSize, bit
optGroupSize := C.mlx_optional_int{value: C.int(groupSize), has_value: true}
optBits := C.mlx_optional_int{value: C.int(bits), has_value: true}
inputs := []*Array{x, w, scales}
var b C.mlx_array
if biases != nil {
b = biases.ctx
inputs = append(inputs, biases)
}
out := New("QUANTIZED_MATMUL", inputs...)
out := New("QUANTIZED_MATMUL")
C.mlx_quantized_matmul(&out.ctx, x.ctx, w.ctx, scales.ctx, b, C.bool(transpose), optGroupSize, optBits, cMode, DefaultStream().ctx)
return out
}
@@ -77,22 +73,18 @@ func GatherQMM(x, w, scales *Array, biases, lhsIndices, rhsIndices *Array, trans
optGroupSize := C.mlx_optional_int{value: C.int(groupSize), has_value: true}
optBits := C.mlx_optional_int{value: C.int(bits), has_value: true}
inputs := []*Array{x, w, scales}
var b, lhs, rhs C.mlx_array
if biases != nil {
b = biases.ctx
inputs = append(inputs, biases)
}
if lhsIndices != nil {
lhs = lhsIndices.ctx
inputs = append(inputs, lhsIndices)
}
if rhsIndices != nil {
rhs = rhsIndices.ctx
inputs = append(inputs, rhsIndices)
}
out := New("GATHER_QMM", inputs...)
out := New("GATHER_QMM")
C.mlx_gather_qmm(&out.ctx, x.ctx, w.ctx, scales.ctx, b, lhs, rhs, C.bool(transpose), optGroupSize, optBits, cMode, C.bool(sortedIndices), DefaultStream().ctx)
return out
}
@@ -104,7 +96,7 @@ func Tile(a *Array, reps []int32) *Array {
for i, r := range reps {
cReps[i] = C.int(r)
}
-out := New("TILE", a)
+out := New("TILE")
C.mlx_tile(&out.ctx, a.ctx, unsafe.SliceData(cReps), C.size_t(len(reps)), DefaultStream().ctx)
return out
}
@@ -116,7 +108,7 @@ func Tri(n, m int32, k int) *Array {
}
func Where(condition, a, b *Array) *Array {
-out := New("WHERE", condition, a, b)
+out := New("WHERE")
C.mlx_where(&out.ctx, condition.ctx, a.ctx, b.ctx, DefaultStream().ctx)
return out
}
@@ -131,7 +123,7 @@ func Stack(arrays []*Array, axis int) *Array {
vector := C.mlx_vector_array_new_data(unsafe.SliceData(vectorData), C.size_t(len(vectorData)))
defer C.mlx_vector_array_free(vector)
-out := New("STACK", arrays...)
+out := New("STACK")
C.mlx_stack_axis(&out.ctx, vector, C.int(axis), DefaultStream().ctx)
return out
}
@@ -153,13 +145,13 @@ func Take(a *Array, indices *Array, axis int) *Array {
}
func RSqrt(a *Array) *Array {
-out := New("RSQRT", a)
+out := New("RSQRT")
C.mlx_rsqrt(&out.ctx, a.ctx, DefaultStream().ctx)
return out
}
func Mean(a *Array, axis int, keepDims bool) *Array {
-out := New("MEAN_AXIS", a)
+out := New("MEAN_AXIS")
C.mlx_mean_axis(&out.ctx, a.ctx, C.int(axis), C.bool(keepDims), DefaultStream().ctx)
return out
}
@@ -235,7 +227,7 @@ func SliceStartStop(a *Array, start, stop []int32) *Array {
cStop[i] = C.int(stop[i])
cStrides[i] = 1
}
-out := New("SLICE", a)
+out := New("SLICE")
C.mlx_slice(&out.ctx, a.ctx, unsafe.SliceData(cStart), C.size_t(n), unsafe.SliceData(cStop), C.size_t(n), unsafe.SliceData(cStrides), C.size_t(n), DefaultStream().ctx)
return out
}
@@ -257,7 +249,7 @@ func SiLU(a *Array) *Array {
func RoPEWithBase(x *Array, dims int, traditional bool, base, scale float32, offset int) *Array {
freqs := New("")
-out := New("FAST_ROPE", x, freqs)
+out := New("FAST_ROPE")
C.mlx_fast_rope(
&out.ctx,
x.ctx,
@@ -289,13 +281,13 @@ func ScaledDotProductAttentionCausal(q, k, v *Array, scale float32, causalMask b
cMode := C.CString(mode)
defer C.free(unsafe.Pointer(cMode))
-out := New("FAST_SDPA", q, k, v, mask, sinks)
+out := New("FAST_SDPA")
C.mlx_fast_scaled_dot_product_attention(&out.ctx, q.ctx, k.ctx, v.ctx, C.float(scale), cMode, mask.ctx, sinks.ctx, DefaultStream().ctx)
return out
}
func RMSNormFn(x, weight *Array, eps float32) *Array {
-out := New("FAST_RMSNORM", x)
+out := New("FAST_RMSNORM")
C.mlx_fast_rms_norm(&out.ctx, x.ctx, weight.ctx, C.float(eps), DefaultStream().ctx)
return out
}
@@ -322,7 +314,7 @@ func scalarWithDtype(s float32, a *Array) C.mlx_array {
func AddScalar(a *Array, s float32) *Array {
scalar := scalarWithDtype(s, a)
-out := New("ADD_SCALAR", a)
+out := New("ADD_SCALAR")
C.mlx_add(&out.ctx, a.ctx, scalar, DefaultStream().ctx)
C.mlx_array_free(scalar)
return out
@@ -330,7 +322,7 @@ func AddScalar(a *Array, s float32) *Array {
func MulScalar(a *Array, s float32) *Array {
scalar := scalarWithDtype(s, a)
-out := New("MUL_SCALAR", a)
+out := New("MUL_SCALAR")
C.mlx_multiply(&out.ctx, a.ctx, scalar, DefaultStream().ctx)
C.mlx_array_free(scalar)
return out
@@ -338,7 +330,7 @@ func MulScalar(a *Array, s float32) *Array {
func DivScalar(a *Array, s float32) *Array {
scalar := scalarWithDtype(s, a)
-out := New("DIV_SCALAR", a)
+out := New("DIV_SCALAR")
C.mlx_divide(&out.ctx, a.ctx, scalar, DefaultStream().ctx)
C.mlx_array_free(scalar)
return out

View File

@@ -7,7 +7,7 @@ import "C"
func (t *Array) Categorical(axis int) *Array {
key := New("")
-out := New("", t, key)
+out := New("")
C.mlx_random_categorical(&out.ctx, t.ctx, C.int(axis), key.ctx, DefaultStream().ctx)
return out
}

View File

@@ -61,7 +61,7 @@ func makeSlices(dims []int, slices ...slice) (starts, stops, strides []C.int) {
func (t *Array) Slice(slices ...slice) *Array {
starts, stops, strides := makeSlices(t.Dims(), slices...)
-out := New("SLICE", t)
+out := New("SLICE")
C.mlx_slice(
&out.ctx, t.ctx,
unsafe.SliceData(starts), C.size_t(len(starts)),
@@ -74,7 +74,7 @@ func (t *Array) Slice(slices ...slice) *Array {
func (t *Array) SliceUpdate(other *Array, slices ...slice) *Array {
starts, stops, strides := makeSlices(t.Dims(), slices...)
-out := New("SLICE_UPDATE", t, other)
+out := New("SLICE_UPDATE")
C.mlx_slice_update(
&out.ctx, t.ctx, other.ctx,
unsafe.SliceData(starts), C.size_t(len(starts)),

View File

@@ -78,8 +78,21 @@ func New(root *model.Root) (Model, error) {
return fn(root)
}
-// Weights returns the model's LoadWeights method, which encapsulates all
-// weight assignment and post-processing (MLA absorption, expert stacking).
+// Weights returns a function that loads model weights, then pins all
+// arrays reachable from the model struct and sweeps everything else.
func Weights(m Model) func(map[string]*mlx.Array) error {
-return m.LoadWeights
+return func(tensors map[string]*mlx.Array) error {
+if err := m.LoadWeights(tensors); err != nil {
+return err
+}
+collected := mlx.Collect(m)
+for _, arr := range collected {
+mlx.Pin(arr)
+}
+mlx.Sweep()
+mlx.Eval(collected...)
+return nil
+}
}
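The wrapper added above follows a collect-pin-sweep pattern: gather every array reachable from the model, pin those, then free everything left over from loading. A minimal sketch of that pattern in plain Go, with a hypothetical `registry` standing in for the allocator state behind `mlx.Collect`/`mlx.Pin`/`mlx.Sweep`:

```go
package main

import "fmt"

// registry tracks every live buffer and which ones are pinned,
// standing in for the MLX allocator state.
type registry struct {
	live   map[string]bool
	pinned map[string]bool
}

func newRegistry() *registry {
	return &registry{live: map[string]bool{}, pinned: map[string]bool{}}
}

func (r *registry) alloc(name string) { r.live[name] = true }
func (r *registry) pin(name string)   { r.pinned[name] = true }

// sweep frees every live buffer that was not pinned, mirroring
// mlx.Sweep after mlx.Pin has marked the weights reachable from
// the model struct.
func (r *registry) sweep() {
	for name := range r.live {
		if !r.pinned[name] {
			delete(r.live, name)
		}
	}
}

func main() {
	r := newRegistry()
	r.alloc("weight.0")    // reachable from the model: keep
	r.alloc("scratch.tmp") // intermediate from loading: free
	r.pin("weight.0")
	r.sweep()
	fmt.Println(len(r.live)) // prints 1
}
```

Doing the pin before the sweep is the ordering that matters: anything loading produced but the model does not reference is reclaimed in one pass, so error paths cannot leak it.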

Some files were not shown because too many files have changed in this diff.