fix lint

fix tests
server: usage api
2026-02-04 04:33:30 -05:00 · 2026-01-28 13:16:52 -08:00 · 2026-01-28 13:07:48 -08:00 · 2026-01-27 17:01:18 -08:00
330 changed files with 8071 additions and 29630 deletions
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -92,7 +92,7 @@ jobs:
            flags: ''
          - os: windows
            arch: amd64
-            preset: 'CUDA 13 Windows'
+            preset: 'CUDA 13'
            install: https://developer.download.nvidia.com/compute/cuda/13.0.0/local_installers/cuda_13.0.0_windows.exe
            cuda-components:
              - '"cudart"'
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -40,17 +40,7 @@
      "name": "CUDA 13",
      "inherits": [ "CUDA" ],
      "cacheVariables": {
-        "CMAKE_CUDA_ARCHITECTURES": "75-virtual;80-virtual;86-virtual;89-virtual;90-virtual;90a-virtual;100-virtual;103-virtual;110-virtual;120-virtual;121-virtual",
-        "CMAKE_CUDA_FLAGS": "-t 4",
-        "OLLAMA_RUNNER_DIR": "cuda_v13"
-      }
-    },
-    {
-      "name": "CUDA 13 Windows",
-      "inherits": [ "CUDA" ],
-      "description": "Reduced architecture set for Windows to avoid MSVC template compilation issues",
-      "cacheVariables": {
-        "CMAKE_CUDA_ARCHITECTURES": "75-virtual;89-virtual;100-virtual;120-virtual",
+        "CMAKE_CUDA_ARCHITECTURES": "75-virtual;80-virtual;86-virtual;87-virtual;89-virtual;90-virtual;90a-virtual;100-virtual;103-virtual;110-virtual;120-virtual;121-virtual",
        "CMAKE_CUDA_FLAGS": "-t 4",
        "OLLAMA_RUNNER_DIR": "cuda_v13"
      }
@@ -148,11 +138,6 @@
      "inherits": [ "CUDA" ],
      "configurePreset": "CUDA 13"
    },
-    {
-      "name": "CUDA 13 Windows",
-      "inherits": [ "CUDA" ],
-      "configurePreset": "CUDA 13 Windows"
-    },
    {
      "name": "JetPack 5",
      "inherits": [ "CUDA" ],
--- a/Makefile.sync
+++ b/Makefile.sync
@@ -1,6 +1,6 @@
 UPSTREAM=https://github.com/ggml-org/llama.cpp.git
 WORKDIR=llama/vendor
-FETCH_HEAD=a5bb8ba4c50257437630c136210396810741bbf7
+FETCH_HEAD=ec98e2002

 .PHONY: help
 help:
--- a/README.md
+++ b/README.md
@@ -358,7 +358,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Odin Runes](https://github.com/leonid20000/OdinRunes)
 - [LLM-X](https://github.com/mrdjohnson/llm-x) (Progressive Web App)
 - [AnythingLLM (Docker + MacOs/Windows/Linux native app)](https://github.com/Mintplex-Labs/anything-llm)
- [Screenpipe](https://github.com/mediar-ai/screenpipe) (24/7 screen & mic recording with AI-powered search, uses Ollama for local LLM features)
 - [Ollama Basic Chat: Uses HyperDiv Reactive UI](https://github.com/rapidarchitect/ollama_basic_chat)
 - [Ollama-chats RPG](https://github.com/drazdra/ollama-chats)
 - [IntelliBar](https://intellibar.app/) (AI-powered assistant for macOS)
@@ -466,7 +465,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Clueless](https://github.com/KashyapTan/clueless) (Open Source & Local Cluely: A desktop application LLM assistant to help you talk to anything on your screen using locally served Ollama models. Also undetectable to screenshare)
 - [ollama-co2](https://github.com/carbonatedWaterOrg/ollama-co2) (FastAPI web interface for monitoring and managing local and remote Ollama servers with real-time model monitoring and concurrent downloads)
 - [Hillnote](https://hillnote.com) (A Markdown-first workspace designed to supercharge your AI workflow. Create documents ready to integrate with Claude, ChatGPT, Gemini, Cursor, and more - all while keeping your work on your device.)
- [Stakpak](https://github.com/stakpak/agent) (An open source, vendor neutral DevOps agent that works with any model, and any stack, for teams who just want to ship)

 ### Cloud

--- a/anthropic/anthropic.go
+++ b/anthropic/anthropic.go
@@ -211,7 +211,6 @@ type MessageDelta struct {

 // DeltaUsage contains cumulative token usage
 type DeltaUsage struct {
-	InputTokens  int `json:"input_tokens"`
 	OutputTokens int `json:"output_tokens"`
 }

@@ -722,7 +721,6 @@ func (c *StreamConverter) Process(r api.ChatResponse) []StreamEvent {
 			})
 		}

-		c.inputTokens = r.Metrics.PromptEvalCount
 		c.outputTokens = r.Metrics.EvalCount
 		stopReason := mapStopReason(r.DoneReason, len(c.toolCallsSent) > 0)

@@ -734,7 +732,6 @@ func (c *StreamConverter) Process(r api.ChatResponse) []StreamEvent {
 					StopReason: stopReason,
 				},
 				Usage: DeltaUsage{
-					InputTokens:  c.inputTokens,
 					OutputTokens: c.outputTokens,
 				},
 			},
--- a/anthropic/anthropic_test.go
+++ b/anthropic/anthropic_test.go
@@ -642,7 +642,7 @@ func TestStreamConverter_Basic(t *testing.T) {
 		},
 		Done:       true,
 		DoneReason: "stop",
-		Metrics:    api.Metrics{PromptEvalCount: 10, EvalCount: 5},
+		Metrics:    api.Metrics{EvalCount: 5},
 	}

 	events2 := conv.Process(resp2)
@@ -650,24 +650,6 @@ func TestStreamConverter_Basic(t *testing.T) {
 	// Should have content_block_delta, content_block_stop, message_delta, message_stop
 	hasStop := false
 	for _, e := range events2 {
-		if e.Event == "message_delta" {
-			if data, ok := e.Data.(MessageDeltaEvent); ok {
-				if data.Type != "message_delta" {
-					t.Errorf("unexpected data type: %+v", data)
-				}
-
-				if data.Delta.StopReason != "end_turn" {
-					t.Errorf("unexpected stop reason: %+v", data.Delta.StopReason)
-				}
-
-				if data.Usage.InputTokens != 10 || data.Usage.OutputTokens != 5 {
-					t.Errorf("unexpected usage: %+v", data.Usage)
-				}
-			} else {
-				t.Errorf("unexpected data: %+v", e.Data)
-			}
-		}
-
 		if e.Event == "message_stop" {
 			hasStop = true
 		}
--- a/api/types.go
+++ b/api/types.go
@@ -912,6 +912,19 @@ type UserResponse struct {
 	Plan      string    `json:"plan,omitempty"`
 }

+type UsageResponse struct {
+	// Start is the time the server started tracking usage (UTC, RFC 3339).
+	Start time.Time        `json:"start"`
+	Usage []ModelUsageData `json:"usage"`
+}
+
+type ModelUsageData struct {
+	Model            string `json:"model"`
+	Requests         int64  `json:"requests"`
+	PromptTokens     int64  `json:"prompt_tokens"`
+	CompletionTokens int64  `json:"completion_tokens"`
+}
+
 // Tensor describes the metadata for a given tensor.
 type Tensor struct {
 	Name  string   `json:"name"`
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -36,7 +36,6 @@ import (

 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/cmd/config"
-	"github.com/ollama/ollama/cmd/tui"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/parser"
@@ -1792,134 +1791,6 @@ Environment Variables:
 	cmd.SetUsageTemplate(cmd.UsageTemplate() + envUsage)
 }

-// runInteractiveTUI runs the main interactive TUI menu.
-func runInteractiveTUI(cmd *cobra.Command) {
-	// errSelectionCancelled is returned when user cancels model selection
-	errSelectionCancelled := errors.New("cancelled")
-
-	// Selector adapters for tui
-	singleSelector := func(title string, items []config.ModelItem) (string, error) {
-		tuiItems := make([]tui.SelectItem, len(items))
-		for i, item := range items {
-			tuiItems[i] = tui.SelectItem{Name: item.Name, Description: item.Description}
-		}
-		result, err := tui.SelectSingle(title, tuiItems)
-		if errors.Is(err, tui.ErrCancelled) {
-			return "", errSelectionCancelled
-		}
-		return result, err
-	}
-
-	multiSelector := func(title string, items []config.ModelItem, preChecked []string) ([]string, error) {
-		tuiItems := make([]tui.SelectItem, len(items))
-		for i, item := range items {
-			tuiItems[i] = tui.SelectItem{Name: item.Name, Description: item.Description}
-		}
-		result, err := tui.SelectMultiple(title, tuiItems, preChecked)
-		if errors.Is(err, tui.ErrCancelled) {
-			return nil, errSelectionCancelled
-		}
-		return result, err
-	}
-
-	for {
-		result, err := tui.Run()
-		if err != nil {
-			fmt.Fprintf(os.Stderr, "Error: %v\n", err)
-			return
-		}
-
-		runModel := func(modelName string) {
-			_ = config.SetLastModel(modelName)
-			opts := runOptions{
-				Model:       modelName,
-				WordWrap:    os.Getenv("TERM") == "xterm-256color",
-				Options:     map[string]any{},
-				ShowConnect: true,
-			}
-			if err := loadOrUnloadModel(cmd, &opts); err != nil {
-				fmt.Fprintf(os.Stderr, "Error loading model: %v\n", err)
-				return
-			}
-			if err := generateInteractive(cmd, opts); err != nil {
-				fmt.Fprintf(os.Stderr, "Error running model: %v\n", err)
-			}
-		}
-
-		launchIntegration := func(name string) bool {
-			// If not configured or model no longer exists, prompt for model selection
-			configuredModel := config.IntegrationModel(name)
-			if configuredModel == "" || !config.ModelExists(cmd.Context(), configuredModel) {
-				err := config.ConfigureIntegrationWithSelectors(cmd.Context(), name, singleSelector, multiSelector)
-				if errors.Is(err, errSelectionCancelled) {
-					return false // Return to main menu
-				}
-				if err != nil {
-					fmt.Fprintf(os.Stderr, "Error configuring %s: %v\n", name, err)
-					return true
-				}
-			}
-			if err := config.LaunchIntegration(name); err != nil {
-				fmt.Fprintf(os.Stderr, "Error launching %s: %v\n", name, err)
-			}
-			return true
-		}
-
-		switch result.Selection {
-		case tui.SelectionNone:
-			// User quit
-			return
-		case tui.SelectionRunModel:
-			_ = config.SetLastSelection("run")
-			// Run last model directly if configured and still exists
-			if modelName := config.LastModel(); modelName != "" && config.ModelExists(cmd.Context(), modelName) {
-				runModel(modelName)
-			} else {
-				// No last model or model no longer exists, show picker
-				modelName, err := config.SelectModelWithSelector(cmd.Context(), singleSelector)
-				if errors.Is(err, errSelectionCancelled) {
-					continue // Return to main menu
-				}
-				if err != nil {
-					fmt.Fprintf(os.Stderr, "Error selecting model: %v\n", err)
-					continue
-				}
-				runModel(modelName)
-			}
-		case tui.SelectionChangeRunModel:
-			_ = config.SetLastSelection("run")
-			// Always show picker
-			modelName, err := config.SelectModelWithSelector(cmd.Context(), singleSelector)
-			if errors.Is(err, errSelectionCancelled) {
-				continue // Return to main menu
-			}
-			if err != nil {
-				fmt.Fprintf(os.Stderr, "Error selecting model: %v\n", err)
-				continue
-			}
-			runModel(modelName)
-		case tui.SelectionIntegration:
-			_ = config.SetLastSelection(result.Integration)
-			if !launchIntegration(result.Integration) {
-				continue // Return to main menu
-			}
-		case tui.SelectionChangeIntegration:
-			_ = config.SetLastSelection(result.Integration)
-			err := config.ConfigureIntegrationWithSelectors(cmd.Context(), result.Integration, singleSelector, multiSelector)
-			if errors.Is(err, errSelectionCancelled) {
-				continue // Return to main menu
-			}
-			if err != nil {
-				fmt.Fprintf(os.Stderr, "Error configuring %s: %v\n", result.Integration, err)
-				continue
-			}
-			if err := config.LaunchIntegration(result.Integration); err != nil {
-				fmt.Fprintf(os.Stderr, "Error launching %s: %v\n", result.Integration, err)
-			}
-		}
-	}
-}
-
 func NewCLI() *cobra.Command {
 	log.SetFlags(log.LstdFlags | log.Lshortfile)
 	cobra.EnableCommandSorting = false
@@ -1942,13 +1813,11 @@ func NewCLI() *cobra.Command {
 				return
 			}

-			runInteractiveTUI(cmd)
+			cmd.Print(cmd.UsageString())
 		},
 	}

 	rootCmd.Flags().BoolP("version", "v", false, "Show version information")
-	rootCmd.Flags().Bool("verbose", false, "Show timings for response")
-	rootCmd.Flags().Bool("nowordwrap", false, "Don't wrap words to the next line automatically")

 	createCmd := &cobra.Command{
 		Use:   "create MODEL",
@@ -2019,7 +1888,7 @@ func NewCLI() *cobra.Command {
 	serveCmd := &cobra.Command{
 		Use:     "serve",
 		Aliases: []string{"start"},
-		Short:   "Start Ollama",
+		Short:   "Start ollama",
 		Args:    cobra.ExactArgs(0),
 		RunE:    RunServer,
 	}
@@ -2162,7 +2031,7 @@ func NewCLI() *cobra.Command {
 		copyCmd,
 		deleteCmd,
 		runnerCmd,
-		config.LaunchCmd(checkServerHeartbeat, runInteractiveTUI),
+		config.LaunchCmd(checkServerHeartbeat),
 	)

 	return rootCmd
--- a/cmd/cmd_test.go
+++ b/cmd/cmd_test.go
@@ -1553,7 +1553,7 @@ func TestShowInfoImageGen(t *testing.T) {
 		Details: api.ModelDetails{
 			Family:            "ZImagePipeline",
 			ParameterSize:     "10.3B",
-			QuantizationLevel: "Q8",
+			QuantizationLevel: "FP8",
 		},
 		Capabilities: []model.Capability{model.CapabilityImage},
 		Requires:     "0.14.0",
@@ -1565,7 +1565,7 @@ func TestShowInfoImageGen(t *testing.T) {
 	expect := "  Model\n" +
 		"    architecture    ZImagePipeline    \n" +
 		"    parameters      10.3B             \n" +
-		"    quantization    Q8                \n" +
+		"    quantization    FP8               \n" +
 		"    requires        0.14.0            \n" +
 		"\n" +
 		"  Capabilities\n" +
--- a/cmd/config/claude.go
+++ b/cmd/config/claude.go
@@ -15,13 +15,11 @@ type Claude struct{}

 func (c *Claude) String() string { return "Claude Code" }

-func (c *Claude) args(model string, extra []string) []string {
-	var args []string
+func (c *Claude) args(model string) []string {
 	if model != "" {
-		args = append(args, "--model", model)
+		return []string{"--model", model}
 	}
-	args = append(args, extra...)
-	return args
+	return nil
 }

 func (c *Claude) findPath() (string, error) {
@@ -43,13 +41,13 @@ func (c *Claude) findPath() (string, error) {
 	return fallback, nil
 }

-func (c *Claude) Run(model string, args []string) error {
+func (c *Claude) Run(model string) error {
 	claudePath, err := c.findPath()
 	if err != nil {
 		return fmt.Errorf("claude is not installed, install from https://code.claude.com/docs/en/quickstart")
 	}

-	cmd := exec.Command(claudePath, c.args(model, args)...)
+	cmd := exec.Command(claudePath, c.args(model)...)
 	cmd.Stdin = os.Stdin
 	cmd.Stdout = os.Stdout
 	cmd.Stderr = os.Stderr
--- a/cmd/config/claude_test.go
+++ b/cmd/config/claude_test.go
@@ -84,21 +84,17 @@ func TestClaudeArgs(t *testing.T) {
 	tests := []struct {
 		name  string
 		model string
-		args  []string
 		want  []string
 	}{
-		{"with model", "llama3.2", nil, []string{"--model", "llama3.2"}},
-		{"empty model", "", nil, nil},
-		{"with model and verbose", "llama3.2", []string{"--verbose"}, []string{"--model", "llama3.2", "--verbose"}},
-		{"empty model with help", "", []string{"--help"}, []string{"--help"}},
-		{"with allowed tools", "llama3.2", []string{"--allowedTools", "Read,Write,Bash"}, []string{"--model", "llama3.2", "--allowedTools", "Read,Write,Bash"}},
+		{"with model", "llama3.2", []string{"--model", "llama3.2"}},
+		{"empty model", "", nil},
 	}

 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
-			got := c.args(tt.model, tt.args)
+			got := c.args(tt.model)
 			if !slices.Equal(got, tt.want) {
-				t.Errorf("args(%q, %v) = %v, want %v", tt.model, tt.args, got, tt.want)
+				t.Errorf("args(%q) = %v, want %v", tt.model, got, tt.want)
 			}
 		})
 	}
--- a/cmd/config/clawdbot.go
+++ b/cmd/config/clawdbot.go
@@ -13,46 +13,26 @@ import (
 	"github.com/ollama/ollama/envconfig"
 )

-type Openclaw struct{}
+type Clawdbot struct{}

-func (c *Openclaw) String() string { return "OpenClaw" }
+func (c *Clawdbot) String() string { return "Clawdbot" }

 const ansiGreen = "\033[32m"

-func (c *Openclaw) Run(model string, args []string) error {
-	bin := "openclaw"
-	if _, err := exec.LookPath(bin); err != nil {
-		bin = "clawdbot"
-		if _, err := exec.LookPath(bin); err != nil {
-			return fmt.Errorf("openclaw is not installed, install from https://docs.openclaw.ai")
-		}
+func (c *Clawdbot) Run(model string) error {
+	if _, err := exec.LookPath("clawdbot"); err != nil {
+		return fmt.Errorf("clawdbot is not installed, install from https://docs.clawd.bot")
 	}

 	models := []string{model}
-	if config, err := loadIntegration("openclaw"); err == nil && len(config.Models) > 0 {
-		models = config.Models
-	} else if config, err := loadIntegration("clawdbot"); err == nil && len(config.Models) > 0 {
+	if config, err := loadIntegration("clawdbot"); err == nil && len(config.Models) > 0 {
 		models = config.Models
 	}
 	if err := c.Edit(models); err != nil {
 		return fmt.Errorf("setup failed: %w", err)
 	}

-	if !c.onboarded() {
-		// Onboarding not completed: run it (model already set via Edit)
-		// Use "ollama" as gateway token for simple local access
-		cmd := exec.Command(bin, "onboard",
-			"--auth-choice", "skip",
-			"--gateway-token", "ollama",
-		)
-		cmd.Stdin = os.Stdin
-		cmd.Stdout = os.Stdout
-		cmd.Stderr = os.Stderr
-		return cmd.Run()
-	}
-
-	// Onboarding completed: run gateway
-	cmd := exec.Command(bin, append([]string{"gateway"}, args...)...)
+	cmd := exec.Command("clawdbot", "gateway")
 	cmd.Stdin = os.Stdin

 	// Capture output to detect "already running" message
@@ -62,55 +42,22 @@ func (c *Openclaw) Run(model string, args []string) error {

 	err := cmd.Run()
 	if err != nil && strings.Contains(outputBuf.String(), "Gateway already running") {
-		fmt.Fprintf(os.Stderr, "%sOpenClaw has been configured with Ollama. Gateway is already running.%s\n", ansiGreen, ansiReset)
+		fmt.Fprintf(os.Stderr, "%sClawdbot has been configured with Ollama. Gateway is already running.%s\n", ansiGreen, ansiReset)
 		return nil
 	}
 	return err
 }

-// onboarded checks if OpenClaw onboarding wizard was completed
-// by looking for the wizard.lastRunAt marker in the config
-func (c *Openclaw) onboarded() bool {
-	home, err := os.UserHomeDir()
-	if err != nil {
-		return false
-	}
-
-	configPath := filepath.Join(home, ".openclaw", "openclaw.json")
-	legacyPath := filepath.Join(home, ".clawdbot", "clawdbot.json")
-
-	config := make(map[string]any)
-	if data, err := os.ReadFile(configPath); err == nil {
-		_ = json.Unmarshal(data, &config)
-	} else if data, err := os.ReadFile(legacyPath); err == nil {
-		_ = json.Unmarshal(data, &config)
-	} else {
-		return false
-	}
-
-	// Check for wizard.lastRunAt marker (set when onboarding completes)
-	wizard, _ := config["wizard"].(map[string]any)
-	if wizard == nil {
-		return false
-	}
-	lastRunAt, _ := wizard["lastRunAt"].(string)
-	return lastRunAt != ""
-}
-
-func (c *Openclaw) Paths() []string {
+func (c *Clawdbot) Paths() []string {
 	home, _ := os.UserHomeDir()
-	p := filepath.Join(home, ".openclaw", "openclaw.json")
+	p := filepath.Join(home, ".clawdbot", "clawdbot.json")
 	if _, err := os.Stat(p); err == nil {
 		return []string{p}
 	}
-	legacy := filepath.Join(home, ".clawdbot", "clawdbot.json")
-	if _, err := os.Stat(legacy); err == nil {
-		return []string{legacy}
-	}
 	return nil
 }

-func (c *Openclaw) Edit(models []string) error {
+func (c *Clawdbot) Edit(models []string) error {
 	if len(models) == 0 {
 		return nil
 	}
@@ -120,8 +67,7 @@ func (c *Openclaw) Edit(models []string) error {
 		return err
 	}

-	configPath := filepath.Join(home, ".openclaw", "openclaw.json")
-	legacyPath := filepath.Join(home, ".clawdbot", "clawdbot.json")
+	configPath := filepath.Join(home, ".clawdbot", "clawdbot.json")
 	if err := os.MkdirAll(filepath.Dir(configPath), 0o755); err != nil {
 		return err
 	}
@@ -130,8 +76,6 @@ func (c *Openclaw) Edit(models []string) error {
 	config := make(map[string]any)
 	if data, err := os.ReadFile(configPath); err == nil {
 		_ = json.Unmarshal(data, &config)
-	} else if data, err := os.ReadFile(legacyPath); err == nil {
-		_ = json.Unmarshal(data, &config)
 	}

 	// Navigate/create: models.providers.ollama (preserving other providers)
@@ -223,18 +167,15 @@ func (c *Openclaw) Edit(models []string) error {
 	return writeWithBackup(configPath, data)
 }

-func (c *Openclaw) Models() []string {
+func (c *Clawdbot) Models() []string {
 	home, err := os.UserHomeDir()
 	if err != nil {
 		return nil
 	}

-	config, err := readJSONFile(filepath.Join(home, ".openclaw", "openclaw.json"))
+	config, err := readJSONFile(filepath.Join(home, ".clawdbot", "clawdbot.json"))
 	if err != nil {
-		config, err = readJSONFile(filepath.Join(home, ".clawdbot", "clawdbot.json"))
-		if err != nil {
-			return nil
-		}
+		return nil
 	}

 	modelsSection, _ := config["models"].(map[string]any)
--- a/cmd/config/clawdbot_test.go
+++ b/cmd/config/clawdbot_test.go
@@ -8,12 +8,12 @@ import (
 	"testing"
 )

-func TestOpenclawIntegration(t *testing.T) {
-	c := &Openclaw{}
+func TestClawdbotIntegration(t *testing.T) {
+	c := &Clawdbot{}

 	t.Run("String", func(t *testing.T) {
-		if got := c.String(); got != "OpenClaw" {
-			t.Errorf("String() = %q, want %q", got, "OpenClaw")
+		if got := c.String(); got != "Clawdbot" {
+			t.Errorf("String() = %q, want %q", got, "Clawdbot")
 		}
 	})

@@ -26,13 +26,13 @@ func TestOpenclawIntegration(t *testing.T) {
 	})
 }

-func TestOpenclawEdit(t *testing.T) {
-	c := &Openclaw{}
+func TestClawdbotEdit(t *testing.T) {
+	c := &Clawdbot{}
 	tmpDir := t.TempDir()
 	setTestHome(t, tmpDir)

-	configDir := filepath.Join(tmpDir, ".openclaw")
-	configPath := filepath.Join(configDir, "openclaw.json")
+	configDir := filepath.Join(tmpDir, ".clawdbot")
+	configPath := filepath.Join(configDir, "clawdbot.json")

 	cleanup := func() { os.RemoveAll(configDir) }

@@ -41,8 +41,8 @@ func TestOpenclawEdit(t *testing.T) {
 		if err := c.Edit([]string{"llama3.2"}); err != nil {
 			t.Fatal(err)
 		}
-		assertOpenclawModelExists(t, configPath, "llama3.2")
-		assertOpenclawPrimaryModel(t, configPath, "ollama/llama3.2")
+		assertClawdbotModelExists(t, configPath, "llama3.2")
+		assertClawdbotPrimaryModel(t, configPath, "ollama/llama3.2")
 	})

 	t.Run("multiple models - first is primary", func(t *testing.T) {
@@ -50,9 +50,9 @@ func TestOpenclawEdit(t *testing.T) {
 		if err := c.Edit([]string{"llama3.2", "mistral"}); err != nil {
 			t.Fatal(err)
 		}
-		assertOpenclawModelExists(t, configPath, "llama3.2")
-		assertOpenclawModelExists(t, configPath, "mistral")
-		assertOpenclawPrimaryModel(t, configPath, "ollama/llama3.2")
+		assertClawdbotModelExists(t, configPath, "llama3.2")
+		assertClawdbotModelExists(t, configPath, "mistral")
+		assertClawdbotPrimaryModel(t, configPath, "ollama/llama3.2")
 	})

 	t.Run("preserve other providers", func(t *testing.T) {
@@ -127,8 +127,8 @@ func TestOpenclawEdit(t *testing.T) {
 		c.Edit([]string{"llama3.2", "mistral"})
 		c.Edit([]string{"llama3.2"})

-		assertOpenclawModelExists(t, configPath, "llama3.2")
-		assertOpenclawModelNotExists(t, configPath, "mistral")
+		assertClawdbotModelExists(t, configPath, "llama3.2")
+		assertClawdbotModelNotExists(t, configPath, "mistral")
 	})

 	t.Run("empty models is no-op", func(t *testing.T) {
@@ -169,12 +169,12 @@ func TestOpenclawEdit(t *testing.T) {
 		if err := c.Edit([]string{"llama3.2"}); err != nil {
 			t.Fatal(err)
 		}
-		assertOpenclawModelExists(t, configPath, "llama3.2")
+		assertClawdbotModelExists(t, configPath, "llama3.2")
 	})
 }

-func TestOpenclawModels(t *testing.T) {
-	c := &Openclaw{}
+func TestClawdbotModels(t *testing.T) {
+	c := &Clawdbot{}
 	tmpDir := t.TempDir()
 	setTestHome(t, tmpDir)

@@ -185,9 +185,9 @@ func TestOpenclawModels(t *testing.T) {
 	})

 	t.Run("returns all ollama models", func(t *testing.T) {
-		configDir := filepath.Join(tmpDir, ".openclaw")
+		configDir := filepath.Join(tmpDir, ".clawdbot")
 		os.MkdirAll(configDir, 0o755)
-		os.WriteFile(filepath.Join(configDir, "openclaw.json"), []byte(`{
+		os.WriteFile(filepath.Join(configDir, "clawdbot.json"), []byte(`{
 			"models":{"providers":{"ollama":{"models":[
 				{"id":"llama3.2"},
 				{"id":"mistral"}
@@ -202,7 +202,7 @@ func TestOpenclawModels(t *testing.T) {
 }

 // Helper functions
-func assertOpenclawModelExists(t *testing.T, path, model string) {
+func assertClawdbotModelExists(t *testing.T, path, model string) {
 	t.Helper()
 	data, _ := os.ReadFile(path)
 	var cfg map[string]any
@@ -221,7 +221,7 @@ func assertOpenclawModelExists(t *testing.T, path, model string) {
 	t.Errorf("model %s not found", model)
 }

-func assertOpenclawModelNotExists(t *testing.T, path, model string) {
+func assertClawdbotModelNotExists(t *testing.T, path, model string) {
 	t.Helper()
 	data, _ := os.ReadFile(path)
 	var cfg map[string]any
@@ -239,7 +239,7 @@ func assertOpenclawModelNotExists(t *testing.T, path, model string) {
 	}
 }

-func assertOpenclawPrimaryModel(t *testing.T, path, expected string) {
+func assertClawdbotPrimaryModel(t *testing.T, path, expected string) {
 	t.Helper()
 	data, _ := os.ReadFile(path)
 	var cfg map[string]any
@@ -252,15 +252,15 @@ func assertOpenclawPrimaryModel(t *testing.T, path, expected string) {
 	}
 }

-func TestOpenclawPaths(t *testing.T) {
-	c := &Openclaw{}
+func TestClawdbotPaths(t *testing.T) {
+	c := &Clawdbot{}

 	t.Run("returns path when config exists", func(t *testing.T) {
 		tmpDir := t.TempDir()
 		setTestHome(t, tmpDir)
-		configDir := filepath.Join(tmpDir, ".openclaw")
+		configDir := filepath.Join(tmpDir, ".clawdbot")
 		os.MkdirAll(configDir, 0o755)
-		os.WriteFile(filepath.Join(configDir, "openclaw.json"), []byte(`{}`), 0o644)
+		os.WriteFile(filepath.Join(configDir, "clawdbot.json"), []byte(`{}`), 0o644)

 		paths := c.Paths()
 		if len(paths) != 1 {
@@ -277,12 +277,12 @@ func TestOpenclawPaths(t *testing.T) {
 	})
 }

-func TestOpenclawModelsEdgeCases(t *testing.T) {
-	c := &Openclaw{}
+func TestClawdbotModelsEdgeCases(t *testing.T) {
+	c := &Clawdbot{}
 	tmpDir := t.TempDir()
 	setTestHome(t, tmpDir)
-	configDir := filepath.Join(tmpDir, ".openclaw")
-	configPath := filepath.Join(configDir, "openclaw.json")
+	configDir := filepath.Join(tmpDir, ".clawdbot")
+	configPath := filepath.Join(configDir, "clawdbot.json")
 	cleanup := func() { os.RemoveAll(configDir) }

 	t.Run("corrupted JSON returns nil", func(t *testing.T) {
@@ -340,11 +340,11 @@ func TestOpenclawModelsEdgeCases(t *testing.T) {
 	})
 }

-func TestOpenclawEditSchemaFields(t *testing.T) {
-	c := &Openclaw{}
+func TestClawdbotEditSchemaFields(t *testing.T) {
+	c := &Clawdbot{}
 	tmpDir := t.TempDir()
 	setTestHome(t, tmpDir)
-	configPath := filepath.Join(tmpDir, ".openclaw", "openclaw.json")
+	configPath := filepath.Join(tmpDir, ".clawdbot", "clawdbot.json")

 	if err := c.Edit([]string{"llama3.2"}); err != nil {
 		t.Fatal(err)
@@ -381,20 +381,20 @@ func TestOpenclawEditSchemaFields(t *testing.T) {
 	}
 }

-func TestOpenclawEditModelNames(t *testing.T) {
-	c := &Openclaw{}
+func TestClawdbotEditModelNames(t *testing.T) {
+	c := &Clawdbot{}
 	tmpDir := t.TempDir()
 	setTestHome(t, tmpDir)
-	configPath := filepath.Join(tmpDir, ".openclaw", "openclaw.json")
-	cleanup := func() { os.RemoveAll(filepath.Join(tmpDir, ".openclaw")) }
+	configPath := filepath.Join(tmpDir, ".clawdbot", "clawdbot.json")
+	cleanup := func() { os.RemoveAll(filepath.Join(tmpDir, ".clawdbot")) }

 	t.Run("model with colon tag", func(t *testing.T) {
 		cleanup()
 		if err := c.Edit([]string{"llama3.2:70b"}); err != nil {
 			t.Fatal(err)
 		}
-		assertOpenclawModelExists(t, configPath, "llama3.2:70b")
-		assertOpenclawPrimaryModel(t, configPath, "ollama/llama3.2:70b")
+		assertClawdbotModelExists(t, configPath, "llama3.2:70b")
+		assertClawdbotPrimaryModel(t, configPath, "ollama/llama3.2:70b")
 	})

 	t.Run("model with slash", func(t *testing.T) {
@@ -402,8 +402,8 @@ func TestOpenclawEditModelNames(t *testing.T) {
 		if err := c.Edit([]string{"library/model:tag"}); err != nil {
 			t.Fatal(err)
 		}
-		assertOpenclawModelExists(t, configPath, "library/model:tag")
-		assertOpenclawPrimaryModel(t, configPath, "ollama/library/model:tag")
+		assertClawdbotModelExists(t, configPath, "library/model:tag")
+		assertClawdbotPrimaryModel(t, configPath, "ollama/library/model:tag")
 	})

 	t.Run("model with hyphen", func(t *testing.T) {
@@ -411,16 +411,16 @@ func TestOpenclawEditModelNames(t *testing.T) {
 		if err := c.Edit([]string{"test-model"}); err != nil {
 			t.Fatal(err)
 		}
-		assertOpenclawModelExists(t, configPath, "test-model")
+		assertClawdbotModelExists(t, configPath, "test-model")
 	})
 }

-func TestOpenclawEditAgentsPreservation(t *testing.T) {
-	c := &Openclaw{}
+func TestClawdbotEditAgentsPreservation(t *testing.T) {
+	c := &Clawdbot{}
 	tmpDir := t.TempDir()
 	setTestHome(t, tmpDir)
-	configDir := filepath.Join(tmpDir, ".openclaw")
-	configPath := filepath.Join(configDir, "openclaw.json")
+	configDir := filepath.Join(tmpDir, ".clawdbot")
+	configPath := filepath.Join(configDir, "clawdbot.json")
 	cleanup := func() { os.RemoveAll(configDir) }

 	t.Run("preserve other agent defaults", func(t *testing.T) {
@@ -457,7 +457,7 @@ func TestOpenclawEditAgentsPreservation(t *testing.T) {
 	})
 }

-const testOpenclawFixture = `{
+const testClawdbotFixture = `{
  "theme": "dark",
  "mcp": {"servers": {"custom": {"enabled": true}}},
  "models": {
@@ -475,15 +475,15 @@ const testOpenclawFixture = `{
  }
 }`

-func TestOpenclawEdit_RoundTrip(t *testing.T) {
-	c := &Openclaw{}
+func TestClawdbotEdit_RoundTrip(t *testing.T) {
+	c := &Clawdbot{}
 	tmpDir := t.TempDir()
 	setTestHome(t, tmpDir)
-	configDir := filepath.Join(tmpDir, ".openclaw")
-	configPath := filepath.Join(configDir, "openclaw.json")
+	configDir := filepath.Join(tmpDir, ".clawdbot")
+	configPath := filepath.Join(configDir, "clawdbot.json")

 	os.MkdirAll(configDir, 0o755)
-	os.WriteFile(configPath, []byte(testOpenclawFixture), 0o644)
+	os.WriteFile(configPath, []byte(testClawdbotFixture), 0o644)

 	if err := c.Edit([]string{"llama3.2", "mistral"}); err != nil {
 		t.Fatal(err)
@@ -521,15 +521,15 @@ func TestOpenclawEdit_RoundTrip(t *testing.T) {
 	}
 }

-func TestOpenclawEdit_Idempotent(t *testing.T) {
-	c := &Openclaw{}
+func TestClawdbotEdit_Idempotent(t *testing.T) {
+	c := &Clawdbot{}
 	tmpDir := t.TempDir()
 	setTestHome(t, tmpDir)
-	configDir := filepath.Join(tmpDir, ".openclaw")
-	configPath := filepath.Join(configDir, "openclaw.json")
+	configDir := filepath.Join(tmpDir, ".clawdbot")
+	configPath := filepath.Join(configDir, "clawdbot.json")

 	os.MkdirAll(configDir, 0o755)
-	os.WriteFile(configPath, []byte(testOpenclawFixture), 0o644)
+	os.WriteFile(configPath, []byte(testClawdbotFixture), 0o644)

 	c.Edit([]string{"llama3.2", "mistral"})
 	firstData, _ := os.ReadFile(configPath)
@@ -542,15 +542,15 @@ func TestOpenclawEdit_Idempotent(t *testing.T) {
 	}
 }

-func TestOpenclawEdit_MultipleConsecutiveEdits(t *testing.T) {
-	c := &Openclaw{}
+func TestClawdbotEdit_MultipleConsecutiveEdits(t *testing.T) {
+	c := &Clawdbot{}
 	tmpDir := t.TempDir()
 	setTestHome(t, tmpDir)
-	configDir := filepath.Join(tmpDir, ".openclaw")
-	configPath := filepath.Join(configDir, "openclaw.json")
+	configDir := filepath.Join(tmpDir, ".clawdbot")
+	configPath := filepath.Join(configDir, "clawdbot.json")

 	os.MkdirAll(configDir, 0o755)
-	os.WriteFile(configPath, []byte(testOpenclawFixture), 0o644)
+	os.WriteFile(configPath, []byte(testClawdbotFixture), 0o644)

 	for i := range 10 {
 		models := []string{"model-a", "model-b"}
@@ -573,12 +573,12 @@ func TestOpenclawEdit_MultipleConsecutiveEdits(t *testing.T) {
 	}
 }

-func TestOpenclawEdit_BackupCreated(t *testing.T) {
-	c := &Openclaw{}
+func TestClawdbotEdit_BackupCreated(t *testing.T) {
+	c := &Clawdbot{}
 	tmpDir := t.TempDir()
 	setTestHome(t, tmpDir)
-	configDir := filepath.Join(tmpDir, ".openclaw")
-	configPath := filepath.Join(configDir, "openclaw.json")
+	configDir := filepath.Join(tmpDir, ".clawdbot")
+	configPath := filepath.Join(configDir, "clawdbot.json")
 	backupDir := filepath.Join(os.TempDir(), "ollama-backups")

 	os.MkdirAll(configDir, 0o755)
@@ -590,7 +590,7 @@ func TestOpenclawEdit_BackupCreated(t *testing.T) {
 		t.Fatal(err)
 	}

-	backups, _ := filepath.Glob(filepath.Join(backupDir, "openclaw.json.*"))
+	backups, _ := filepath.Glob(filepath.Join(backupDir, "clawdbot.json.*"))
 	foundBackup := false
 	for _, backup := range backups {
 		data, _ := os.ReadFile(backup)
@@ -605,151 +605,11 @@ func TestOpenclawEdit_BackupCreated(t *testing.T) {
 	}
 }

-func TestOpenclawClawdbotAlias(t *testing.T) {
-	for _, alias := range []string{"clawdbot", "moltbot"} {
-		t.Run(alias+" alias resolves to Openclaw runner", func(t *testing.T) {
-			r, ok := integrations[alias]
-			if !ok {
-				t.Fatalf("%s not found in integrations", alias)
-			}
-			if _, ok := r.(*Openclaw); !ok {
-				t.Errorf("%s integration is %T, want *Openclaw", alias, r)
-			}
-		})
-
-		t.Run(alias+" is hidden from selector", func(t *testing.T) {
-			if !integrationAliases[alias] {
-				t.Errorf("%s should be in integrationAliases", alias)
-			}
-		})
-	}
-}
-
-func TestOpenclawLegacyPaths(t *testing.T) {
-	c := &Openclaw{}
-
-	t.Run("falls back to legacy clawdbot path", func(t *testing.T) {
-		tmpDir := t.TempDir()
-		setTestHome(t, tmpDir)
-		legacyDir := filepath.Join(tmpDir, ".clawdbot")
-		os.MkdirAll(legacyDir, 0o755)
-		os.WriteFile(filepath.Join(legacyDir, "clawdbot.json"), []byte(`{}`), 0o644)
-
-		paths := c.Paths()
-		if len(paths) != 1 {
-			t.Fatalf("expected 1 path, got %d", len(paths))
-		}
-		if paths[0] != filepath.Join(legacyDir, "clawdbot.json") {
-			t.Errorf("expected legacy path, got %s", paths[0])
-		}
-	})
-
-	t.Run("prefers new path over legacy", func(t *testing.T) {
-		tmpDir := t.TempDir()
-		setTestHome(t, tmpDir)
-		newDir := filepath.Join(tmpDir, ".openclaw")
-		legacyDir := filepath.Join(tmpDir, ".clawdbot")
-		os.MkdirAll(newDir, 0o755)
-		os.MkdirAll(legacyDir, 0o755)
-		os.WriteFile(filepath.Join(newDir, "openclaw.json"), []byte(`{}`), 0o644)
-		os.WriteFile(filepath.Join(legacyDir, "clawdbot.json"), []byte(`{}`), 0o644)
-
-		paths := c.Paths()
-		if len(paths) != 1 {
-			t.Fatalf("expected 1 path, got %d", len(paths))
-		}
-		if paths[0] != filepath.Join(newDir, "openclaw.json") {
-			t.Errorf("expected new path, got %s", paths[0])
-		}
-	})
-
-	t.Run("Models reads from legacy path", func(t *testing.T) {
-		tmpDir := t.TempDir()
-		setTestHome(t, tmpDir)
-		legacyDir := filepath.Join(tmpDir, ".clawdbot")
-		os.MkdirAll(legacyDir, 0o755)
-		os.WriteFile(filepath.Join(legacyDir, "clawdbot.json"), []byte(`{
-			"models":{"providers":{"ollama":{"models":[{"id":"llama3.2"}]}}}
-		}`), 0o644)
-
-		models := c.Models()
-		if len(models) != 1 || models[0] != "llama3.2" {
-			t.Errorf("expected [llama3.2], got %v", models)
-		}
-	})
-
-	t.Run("Models prefers new path over legacy", func(t *testing.T) {
-		tmpDir := t.TempDir()
-		setTestHome(t, tmpDir)
-		newDir := filepath.Join(tmpDir, ".openclaw")
-		legacyDir := filepath.Join(tmpDir, ".clawdbot")
-		os.MkdirAll(newDir, 0o755)
-		os.MkdirAll(legacyDir, 0o755)
-		os.WriteFile(filepath.Join(newDir, "openclaw.json"), []byte(`{
-			"models":{"providers":{"ollama":{"models":[{"id":"new-model"}]}}}
-		}`), 0o644)
-		os.WriteFile(filepath.Join(legacyDir, "clawdbot.json"), []byte(`{
-			"models":{"providers":{"ollama":{"models":[{"id":"legacy-model"}]}}}
-		}`), 0o644)
-
-		models := c.Models()
-		if len(models) != 1 || models[0] != "new-model" {
-			t.Errorf("expected [new-model], got %v", models)
-		}
-	})
-
-	t.Run("Edit reads new path over legacy when both exist", func(t *testing.T) {
-		tmpDir := t.TempDir()
-		setTestHome(t, tmpDir)
-		newDir := filepath.Join(tmpDir, ".openclaw")
-		legacyDir := filepath.Join(tmpDir, ".clawdbot")
-		os.MkdirAll(newDir, 0o755)
-		os.MkdirAll(legacyDir, 0o755)
-		os.WriteFile(filepath.Join(newDir, "openclaw.json"), []byte(`{"theme":"new"}`), 0o644)
-		os.WriteFile(filepath.Join(legacyDir, "clawdbot.json"), []byte(`{"theme":"legacy"}`), 0o644)
-
-		if err := c.Edit([]string{"llama3.2"}); err != nil {
-			t.Fatal(err)
-		}
-
-		data, _ := os.ReadFile(filepath.Join(newDir, "openclaw.json"))
-		var cfg map[string]any
-		json.Unmarshal(data, &cfg)
-		if cfg["theme"] != "new" {
-			t.Errorf("expected theme from new config, got %v", cfg["theme"])
-		}
-	})
-
-	t.Run("Edit migrates from legacy config", func(t *testing.T) {
-		tmpDir := t.TempDir()
-		setTestHome(t, tmpDir)
-		legacyDir := filepath.Join(tmpDir, ".clawdbot")
-		os.MkdirAll(legacyDir, 0o755)
-		os.WriteFile(filepath.Join(legacyDir, "clawdbot.json"), []byte(`{"theme":"dark"}`), 0o644)
-
-		if err := c.Edit([]string{"llama3.2"}); err != nil {
-			t.Fatal(err)
-		}
-
-		// Should write to new path
-		newPath := filepath.Join(tmpDir, ".openclaw", "openclaw.json")
-		data, err := os.ReadFile(newPath)
-		if err != nil {
-			t.Fatal("expected new config file to be created")
-		}
-		var cfg map[string]any
-		json.Unmarshal(data, &cfg)
-		if cfg["theme"] != "dark" {
-			t.Error("legacy theme setting was not migrated")
-		}
-	})
-}
-
-func TestOpenclawEdit_CreatesDirectoryIfMissing(t *testing.T) {
-	c := &Openclaw{}
+func TestClawdbotEdit_CreatesDirectoryIfMissing(t *testing.T) {
+	c := &Clawdbot{}
 	tmpDir := t.TempDir()
 	setTestHome(t, tmpDir)
-	configDir := filepath.Join(tmpDir, ".openclaw")
+	configDir := filepath.Join(tmpDir, ".clawdbot")

 	if _, err := os.Stat(configDir); !os.IsNotExist(err) {
 		t.Fatal("directory should not exist before test")
@@ -763,116 +623,3 @@ func TestOpenclawEdit_CreatesDirectoryIfMissing(t *testing.T) {
 		t.Fatal("directory was not created")
 	}
 }
-
-func TestOpenclawOnboarded(t *testing.T) {
-	c := &Openclaw{}
-
-	t.Run("returns false when no config exists", func(t *testing.T) {
-		tmpDir := t.TempDir()
-		setTestHome(t, tmpDir)
-		if c.onboarded() {
-			t.Error("expected false when no config exists")
-		}
-	})
-
-	t.Run("returns false when config exists but no wizard section", func(t *testing.T) {
-		tmpDir := t.TempDir()
-		setTestHome(t, tmpDir)
-		configDir := filepath.Join(tmpDir, ".openclaw")
-		os.MkdirAll(configDir, 0o755)
-		os.WriteFile(filepath.Join(configDir, "openclaw.json"), []byte(`{"theme":"dark"}`), 0o644)
-
-		if c.onboarded() {
-			t.Error("expected false when no wizard section")
-		}
-	})
-
-	t.Run("returns false when wizard section exists but no lastRunAt", func(t *testing.T) {
-		tmpDir := t.TempDir()
-		setTestHome(t, tmpDir)
-		configDir := filepath.Join(tmpDir, ".openclaw")
-		os.MkdirAll(configDir, 0o755)
-		os.WriteFile(filepath.Join(configDir, "openclaw.json"), []byte(`{"wizard":{}}`), 0o644)
-
-		if c.onboarded() {
-			t.Error("expected false when wizard.lastRunAt is missing")
-		}
-	})
-
-	t.Run("returns false when wizard.lastRunAt is empty string", func(t *testing.T) {
-		tmpDir := t.TempDir()
-		setTestHome(t, tmpDir)
-		configDir := filepath.Join(tmpDir, ".openclaw")
-		os.MkdirAll(configDir, 0o755)
-		os.WriteFile(filepath.Join(configDir, "openclaw.json"), []byte(`{"wizard":{"lastRunAt":""}}`), 0o644)
-
-		if c.onboarded() {
-			t.Error("expected false when wizard.lastRunAt is empty")
-		}
-	})
-
-	t.Run("returns true when wizard.lastRunAt is set", func(t *testing.T) {
-		tmpDir := t.TempDir()
-		setTestHome(t, tmpDir)
-		configDir := filepath.Join(tmpDir, ".openclaw")
-		os.MkdirAll(configDir, 0o755)
-		os.WriteFile(filepath.Join(configDir, "openclaw.json"), []byte(`{"wizard":{"lastRunAt":"2024-01-01T00:00:00Z"}}`), 0o644)
-
-		if !c.onboarded() {
-			t.Error("expected true when wizard.lastRunAt is set")
-		}
-	})
-
-	t.Run("checks legacy clawdbot path", func(t *testing.T) {
-		tmpDir := t.TempDir()
-		setTestHome(t, tmpDir)
-		legacyDir := filepath.Join(tmpDir, ".clawdbot")
-		os.MkdirAll(legacyDir, 0o755)
-		os.WriteFile(filepath.Join(legacyDir, "clawdbot.json"), []byte(`{"wizard":{"lastRunAt":"2024-01-01T00:00:00Z"}}`), 0o644)
-
-		if !c.onboarded() {
-			t.Error("expected true when legacy config has wizard.lastRunAt")
-		}
-	})
-
-	t.Run("prefers new path over legacy", func(t *testing.T) {
-		tmpDir := t.TempDir()
-		setTestHome(t, tmpDir)
-		newDir := filepath.Join(tmpDir, ".openclaw")
-		legacyDir := filepath.Join(tmpDir, ".clawdbot")
-		os.MkdirAll(newDir, 0o755)
-		os.MkdirAll(legacyDir, 0o755)
-		// New path has no wizard marker
-		os.WriteFile(filepath.Join(newDir, "openclaw.json"), []byte(`{}`), 0o644)
-		// Legacy has wizard marker
-		os.WriteFile(filepath.Join(legacyDir, "clawdbot.json"), []byte(`{"wizard":{"lastRunAt":"2024-01-01T00:00:00Z"}}`), 0o644)
-
-		if c.onboarded() {
-			t.Error("expected false - should prefer new path which has no wizard marker")
-		}
-	})
-
-	t.Run("handles corrupted JSON gracefully", func(t *testing.T) {
-		tmpDir := t.TempDir()
-		setTestHome(t, tmpDir)
-		configDir := filepath.Join(tmpDir, ".openclaw")
-		os.MkdirAll(configDir, 0o755)
-		os.WriteFile(filepath.Join(configDir, "openclaw.json"), []byte(`{corrupted`), 0o644)
-
-		if c.onboarded() {
-			t.Error("expected false for corrupted JSON")
-		}
-	})
-
-	t.Run("handles wrong type for wizard section", func(t *testing.T) {
-		tmpDir := t.TempDir()
-		setTestHome(t, tmpDir)
-		configDir := filepath.Join(tmpDir, ".openclaw")
-		os.MkdirAll(configDir, 0o755)
-		os.WriteFile(filepath.Join(configDir, "openclaw.json"), []byte(`{"wizard":"not a map"}`), 0o644)
-
-		if c.onboarded() {
-			t.Error("expected false when wizard is wrong type")
-		}
-	})
-}
--- a/cmd/config/codex.go
+++ b/cmd/config/codex.go
@@ -14,21 +14,20 @@ type Codex struct{}

 func (c *Codex) String() string { return "Codex" }

-func (c *Codex) args(model string, extra []string) []string {
+func (c *Codex) args(model string) []string {
 	args := []string{"--oss"}
 	if model != "" {
 		args = append(args, "-m", model)
 	}
-	args = append(args, extra...)
 	return args
 }

-func (c *Codex) Run(model string, args []string) error {
+func (c *Codex) Run(model string) error {
 	if err := checkCodexVersion(); err != nil {
 		return err
 	}

-	cmd := exec.Command("codex", c.args(model, args)...)
+	cmd := exec.Command("codex", c.args(model)...)
 	cmd.Stdin = os.Stdin
 	cmd.Stdout = os.Stdout
 	cmd.Stderr = os.Stderr
--- a/cmd/config/codex_test.go
+++ b/cmd/config/codex_test.go
@@ -11,20 +11,17 @@ func TestCodexArgs(t *testing.T) {
 	tests := []struct {
 		name  string
 		model string
-		args  []string
 		want  []string
 	}{
-		{"with model", "llama3.2", nil, []string{"--oss", "-m", "llama3.2"}},
-		{"empty model", "", nil, []string{"--oss"}},
-		{"with model and profile", "qwen3-coder", []string{"-p", "myprofile"}, []string{"--oss", "-m", "qwen3-coder", "-p", "myprofile"}},
-		{"with sandbox flag", "llama3.2", []string{"--sandbox", "workspace-write"}, []string{"--oss", "-m", "llama3.2", "--sandbox", "workspace-write"}},
+		{"with model", "llama3.2", []string{"--oss", "-m", "llama3.2"}},
+		{"empty model", "", []string{"--oss"}},
 	}

 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
-			got := c.args(tt.model, tt.args)
+			got := c.args(tt.model)
 			if !slices.Equal(got, tt.want) {
-				t.Errorf("args(%q, %v) = %v, want %v", tt.model, tt.args, got, tt.want)
+				t.Errorf("args(%q) = %v, want %v", tt.model, got, tt.want)
 			}
 		})
 	}
--- a/cmd/config/config.go
+++ b/cmd/config/config.go
@@ -3,16 +3,12 @@
 package config

 import (
-	"context"
 	"encoding/json"
 	"errors"
 	"fmt"
-	"log/slog"
 	"os"
 	"path/filepath"
 	"strings"
-
-	"github.com/ollama/ollama/api"
 )

 type integration struct {
@@ -20,20 +16,10 @@ type integration struct {
 }

 type config struct {
-	Integrations  map[string]*integration `json:"integrations"`
-	LastModel     string                  `json:"last_model,omitempty"`
-	LastSelection string                  `json:"last_selection,omitempty"` // "run" or integration name
+	Integrations map[string]*integration `json:"integrations"`
 }

 func configPath() (string, error) {
-	home, err := os.UserHomeDir()
-	if err != nil {
-		return "", err
-	}
-	return filepath.Join(home, ".ollama", "config.json"), nil
-}
-
-func legacyConfigPath() (string, error) {
 	home, err := os.UserHomeDir()
 	if err != nil {
 		return "", err
@@ -41,46 +27,6 @@ func legacyConfigPath() (string, error) {
 	return filepath.Join(home, ".ollama", "config", "config.json"), nil
 }

-// migrateConfig moves the config from the legacy path to ~/.ollama/config.json
-func migrateConfig() (bool, error) {
-	oldPath, err := legacyConfigPath()
-	if err != nil {
-		return false, err
-	}
-
-	oldData, err := os.ReadFile(oldPath)
-	if err != nil {
-		if os.IsNotExist(err) {
-			return false, nil
-		}
-		return false, err
-	}
-
-	var js json.RawMessage
-	if err := json.Unmarshal(oldData, &js); err != nil {
-		slog.Warn("legacy config has invalid JSON, skipping migration", "path", oldPath, "error", err)
-		return false, nil
-	}
-
-	newPath, err := configPath()
-	if err != nil {
-		return false, err
-	}
-
-	if err := os.MkdirAll(filepath.Dir(newPath), 0o755); err != nil {
-		return false, err
-	}
-	if err := os.WriteFile(newPath, oldData, 0o644); err != nil {
-		return false, fmt.Errorf("write new config: %w", err)
-	}
-
-	_ = os.Remove(oldPath)
-	_ = os.Remove(filepath.Dir(oldPath)) // clean up empty directory
-
-	slog.Info("migrated config", "from", oldPath, "to", newPath)
-	return true, nil
-}
-
 func load() (*config, error) {
 	path, err := configPath()
 	if err != nil {
@@ -88,11 +34,6 @@ func load() (*config, error) {
 	}

 	data, err := os.ReadFile(path)
-	if err != nil && os.IsNotExist(err) {
-		if migrated, merr := migrateConfig(); merr == nil && migrated {
-			data, err = os.ReadFile(path)
-		}
-	}
 	if err != nil {
 		if os.IsNotExist(err) {
 			return &config{Integrations: make(map[string]*integration)}, nil
@@ -145,74 +86,6 @@ func saveIntegration(appName string, models []string) error {
 	return save(cfg)
 }

-// IntegrationModel returns the first configured model for an integration, or empty string if not configured.
-func IntegrationModel(appName string) string {
-	ic, err := loadIntegration(appName)
-	if err != nil || len(ic.Models) == 0 {
-		return ""
-	}
-	return ic.Models[0]
-}
-
-// LastModel returns the last model that was run, or empty string if none.
-func LastModel() string {
-	cfg, err := load()
-	if err != nil {
-		return ""
-	}
-	return cfg.LastModel
-}
-
-// SetLastModel saves the last model that was run.
-func SetLastModel(model string) error {
-	cfg, err := load()
-	if err != nil {
-		return err
-	}
-	cfg.LastModel = model
-	return save(cfg)
-}
-
-// LastSelection returns the last menu selection ("run" or integration name), or empty string if none.
-func LastSelection() string {
-	cfg, err := load()
-	if err != nil {
-		return ""
-	}
-	return cfg.LastSelection
-}
-
-// SetLastSelection saves the last menu selection ("run" or integration name).
-func SetLastSelection(selection string) error {
-	cfg, err := load()
-	if err != nil {
-		return err
-	}
-	cfg.LastSelection = selection
-	return save(cfg)
-}
-
-// ModelExists checks if a model exists on the Ollama server.
-func ModelExists(ctx context.Context, name string) bool {
-	if name == "" {
-		return false
-	}
-	client, err := api.ClientFromEnvironment()
-	if err != nil {
-		return false
-	}
-	models, err := client.List(ctx)
-	if err != nil {
-		return false
-	}
-	for _, m := range models.Models {
-		if m.Name == name || strings.HasPrefix(m.Name, name+":") {
-			return true
-		}
-	}
-	return false
-}
-
 func loadIntegration(appName string) (*integration, error) {
 	cfg, err := load()
 	if err != nil {
--- a/cmd/config/config_test.go
+++ b/cmd/config/config_test.go
@@ -200,10 +200,12 @@ func TestLoadIntegration_CorruptedJSON(t *testing.T) {
 	tmpDir := t.TempDir()
 	setTestHome(t, tmpDir)

-	dir := filepath.Join(tmpDir, ".ollama")
+	// Create corrupted config.json file
+	dir := filepath.Join(tmpDir, ".ollama", "config")
 	os.MkdirAll(dir, 0o755)
 	os.WriteFile(filepath.Join(dir, "config.json"), []byte(`{corrupted json`), 0o644)

+	// Corrupted file is treated as empty, so loadIntegration returns not found
 	_, err := loadIntegration("test")
 	if err == nil {
 		t.Error("expected error for nonexistent integration in corrupted file")
@@ -265,7 +267,7 @@ func TestConfigPath(t *testing.T) {
 		t.Fatal(err)
 	}

-	expected := filepath.Join(tmpDir, ".ollama", "config.json")
+	expected := filepath.Join(tmpDir, ".ollama", "config", "config.json")
 	if path != expected {
 		t.Errorf("expected %s, got %s", expected, path)
 	}
@@ -320,183 +322,6 @@ func TestLoad(t *testing.T) {
 	})
 }

-func TestMigrateConfig(t *testing.T) {
-	t.Run("migrates legacy file to new location", func(t *testing.T) {
-		tmpDir := t.TempDir()
-		setTestHome(t, tmpDir)
-
-		legacyDir := filepath.Join(tmpDir, ".ollama", "config")
-		os.MkdirAll(legacyDir, 0o755)
-		data := []byte(`{"integrations":{"claude":{"models":["llama3.2"]}}}`)
-		os.WriteFile(filepath.Join(legacyDir, "config.json"), data, 0o644)
-
-		migrated, err := migrateConfig()
-		if err != nil {
-			t.Fatal(err)
-		}
-		if !migrated {
-			t.Fatal("expected migration to occur")
-		}
-
-		newPath, _ := configPath()
-		got, err := os.ReadFile(newPath)
-		if err != nil {
-			t.Fatalf("new config not found: %v", err)
-		}
-		if string(got) != string(data) {
-			t.Errorf("content mismatch: got %s", got)
-		}
-
-		if _, err := os.Stat(filepath.Join(legacyDir, "config.json")); !os.IsNotExist(err) {
-			t.Error("legacy file should have been removed")
-		}
-
-		if _, err := os.Stat(legacyDir); !os.IsNotExist(err) {
-			t.Error("legacy directory should have been removed")
-		}
-	})
-
-	t.Run("no-op when no legacy file exists", func(t *testing.T) {
-		tmpDir := t.TempDir()
-		setTestHome(t, tmpDir)
-
-		migrated, err := migrateConfig()
-		if err != nil {
-			t.Fatal(err)
-		}
-		if migrated {
-			t.Error("expected no migration")
-		}
-	})
-
-	t.Run("skips corrupt legacy file", func(t *testing.T) {
-		tmpDir := t.TempDir()
-		setTestHome(t, tmpDir)
-
-		legacyDir := filepath.Join(tmpDir, ".ollama", "config")
-		os.MkdirAll(legacyDir, 0o755)
-		os.WriteFile(filepath.Join(legacyDir, "config.json"), []byte(`{corrupt`), 0o644)
-
-		migrated, err := migrateConfig()
-		if err != nil {
-			t.Fatal(err)
-		}
-		if migrated {
-			t.Error("should not migrate corrupt file")
-		}
-
-		if _, err := os.Stat(filepath.Join(legacyDir, "config.json")); os.IsNotExist(err) {
-			t.Error("corrupt legacy file should not have been deleted")
-		}
-	})
-
-	t.Run("new path takes precedence over legacy", func(t *testing.T) {
-		tmpDir := t.TempDir()
-		setTestHome(t, tmpDir)
-
-		legacyDir := filepath.Join(tmpDir, ".ollama", "config")
-		os.MkdirAll(legacyDir, 0o755)
-		os.WriteFile(filepath.Join(legacyDir, "config.json"), []byte(`{"integrations":{"old":{"models":["old-model"]}}}`), 0o644)
-
-		newDir := filepath.Join(tmpDir, ".ollama")
-		os.WriteFile(filepath.Join(newDir, "config.json"), []byte(`{"integrations":{"new":{"models":["new-model"]}}}`), 0o644)
-
-		cfg, err := load()
-		if err != nil {
-			t.Fatal(err)
-		}
-		if _, ok := cfg.Integrations["new"]; !ok {
-			t.Error("expected new-path integration to be loaded")
-		}
-		if _, ok := cfg.Integrations["old"]; ok {
-			t.Error("legacy integration should not have been loaded")
-		}
-	})
-
-	t.Run("idempotent when called twice", func(t *testing.T) {
-		tmpDir := t.TempDir()
-		setTestHome(t, tmpDir)
-
-		legacyDir := filepath.Join(tmpDir, ".ollama", "config")
-		os.MkdirAll(legacyDir, 0o755)
-		os.WriteFile(filepath.Join(legacyDir, "config.json"), []byte(`{"integrations":{}}`), 0o644)
-
-		if _, err := migrateConfig(); err != nil {
-			t.Fatal(err)
-		}
-
-		migrated, err := migrateConfig()
-		if err != nil {
-			t.Fatal(err)
-		}
-		if migrated {
-			t.Error("second migration should be a no-op")
-		}
-	})
-
-	t.Run("legacy directory preserved if not empty", func(t *testing.T) {
-		tmpDir := t.TempDir()
-		setTestHome(t, tmpDir)
-
-		legacyDir := filepath.Join(tmpDir, ".ollama", "config")
-		os.MkdirAll(legacyDir, 0o755)
-		os.WriteFile(filepath.Join(legacyDir, "config.json"), []byte(`{"integrations":{}}`), 0o644)
-		os.WriteFile(filepath.Join(legacyDir, "other-file.txt"), []byte("keep me"), 0o644)
-
-		if _, err := migrateConfig(); err != nil {
-			t.Fatal(err)
-		}
-
-		if _, err := os.Stat(legacyDir); os.IsNotExist(err) {
-			t.Error("directory with other files should not have been removed")
-		}
-		if _, err := os.Stat(filepath.Join(legacyDir, "other-file.txt")); os.IsNotExist(err) {
-			t.Error("other files in legacy directory should be untouched")
-		}
-	})
-
-	t.Run("save writes to new path after migration", func(t *testing.T) {
-		tmpDir := t.TempDir()
-		setTestHome(t, tmpDir)
-
-		legacyDir := filepath.Join(tmpDir, ".ollama", "config")
-		os.MkdirAll(legacyDir, 0o755)
-		os.WriteFile(filepath.Join(legacyDir, "config.json"), []byte(`{"integrations":{"claude":{"models":["llama3.2"]}}}`), 0o644)
-
-		// load triggers migration, then save should write to new path
-		if err := saveIntegration("codex", []string{"qwen2.5"}); err != nil {
-			t.Fatal(err)
-		}
-
-		newPath := filepath.Join(tmpDir, ".ollama", "config.json")
-		if _, err := os.Stat(newPath); os.IsNotExist(err) {
-			t.Error("save should write to new path")
-		}
-
-		// old path should not be recreated
-		if _, err := os.Stat(filepath.Join(legacyDir, "config.json")); !os.IsNotExist(err) {
-			t.Error("save should not recreate legacy path")
-		}
-	})
-
-	t.Run("load triggers migration transparently", func(t *testing.T) {
-		tmpDir := t.TempDir()
-		setTestHome(t, tmpDir)
-
-		legacyDir := filepath.Join(tmpDir, ".ollama", "config")
-		os.MkdirAll(legacyDir, 0o755)
-		os.WriteFile(filepath.Join(legacyDir, "config.json"), []byte(`{"integrations":{"claude":{"models":["llama3.2"]}}}`), 0o644)
-
-		cfg, err := load()
-		if err != nil {
-			t.Fatal(err)
-		}
-		if cfg.Integrations["claude"] == nil || cfg.Integrations["claude"].Models[0] != "llama3.2" {
-			t.Error("migration via load() did not preserve data")
-		}
-	})
-}
-
 func TestSave(t *testing.T) {
 	tmpDir := t.TempDir()
 	setTestHome(t, tmpDir)
--- a/cmd/config/droid.go
+++ b/cmd/config/droid.go
@@ -39,7 +39,7 @@ type modelEntry struct {

 func (d *Droid) String() string { return "Droid" }

-func (d *Droid) Run(model string, args []string) error {
+func (d *Droid) Run(model string) error {
 	if _, err := exec.LookPath("droid"); err != nil {
 		return fmt.Errorf("droid is not installed, install from https://docs.factory.ai/cli/getting-started/quickstart")
 	}
@@ -53,7 +53,7 @@ func (d *Droid) Run(model string, args []string) error {
 		return fmt.Errorf("setup failed: %w", err)
 	}

-	cmd := exec.Command("droid", args...)
+	cmd := exec.Command("droid")
 	cmd.Stdin = os.Stdin
 	cmd.Stdout = os.Stdout
 	cmd.Stderr = os.Stderr
--- a/cmd/config/integrations.go
+++ b/cmd/config/integrations.go
@@ -4,19 +4,15 @@ import (
 	"context"
 	"errors"
 	"fmt"
-	"io"
 	"maps"
-	"net/http"
 	"os"
 	"os/exec"
-	"path/filepath"
 	"runtime"
 	"slices"
 	"strings"
 	"time"

 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/progress"
 	"github.com/spf13/cobra"
 )

@@ -26,7 +22,7 @@ import (
 // Runner can run an integration with a model.

 type Runner interface {
-	Run(model string, args []string) error
+	Run(model string) error
 	// String returns the human-readable name of the integration
 	String() string
 }
@@ -45,186 +41,10 @@ type Editor interface {
 // integrations is the registry of available integrations.
 var integrations = map[string]Runner{
 	"claude":   &Claude{},
-	"clawdbot": &Openclaw{},
+	"clawdbot": &Clawdbot{},
 	"codex":    &Codex{},
-	"moltbot":  &Openclaw{},
 	"droid":    &Droid{},
 	"opencode": &OpenCode{},
-	"openclaw": &Openclaw{},
-}
-
-// recommendedModels are shown when the user has no models or as suggestions.
-// Order matters: local models first, then cloud models.
-var recommendedModels = []ModelItem{
-	{Name: "glm-4.7-flash", Description: "Recommended (requires ~25GB VRAM)"},
-	{Name: "qwen3:8b", Description: "Recommended (requires ~11GB VRAM)"},
-	{Name: "glm-4.7:cloud", Description: "Recommended"},
-	{Name: "kimi-k2.5:cloud", Description: "Recommended"},
-}
-
-// integrationAliases are hidden from the interactive selector but work as CLI arguments.
-var integrationAliases = map[string]bool{
-	"clawdbot": true,
-	"moltbot":  true,
-}
-
-// integrationInstallURLs maps integration names to their install script URLs.
-var integrationInstallURLs = map[string]string{
-	"claude":   "https://claude.ai/install.sh",
-	"openclaw": "https://openclaw.ai/install.sh",
-	"droid":    "https://app.factory.ai/cli",
-	"opencode": "https://opencode.ai/install",
-}
-
-// CanInstallIntegration returns true if we have an install script for this integration.
-func CanInstallIntegration(name string) bool {
-	_, ok := integrationInstallURLs[name]
-	return ok
-}
-
-// IsIntegrationInstalled checks if an integration binary is installed.
-func IsIntegrationInstalled(name string) bool {
-	switch name {
-	case "claude":
-		c := &Claude{}
-		_, err := c.findPath()
-		return err == nil
-	case "openclaw":
-		if _, err := exec.LookPath("openclaw"); err == nil {
-			return true
-		}
-		if _, err := exec.LookPath("clawdbot"); err == nil {
-			return true
-		}
-		return false
-	case "codex":
-		_, err := exec.LookPath("codex")
-		return err == nil
-	case "droid":
-		_, err := exec.LookPath("droid")
-		return err == nil
-	case "opencode":
-		_, err := exec.LookPath("opencode")
-		return err == nil
-	default:
-		return true // Assume installed for unknown integrations
-	}
-}
-
-// InstallIntegration downloads and runs the install script for an integration.
-func InstallIntegration(name string) error {
-	url, ok := integrationInstallURLs[name]
-	if !ok {
-		return fmt.Errorf("no install script available for %s", name)
-	}
-
-	// Download the install script
-	resp, err := http.Get(url)
-	if err != nil {
-		return fmt.Errorf("failed to download install script: %w", err)
-	}
-	defer resp.Body.Close()
-
-	if resp.StatusCode != http.StatusOK {
-		return fmt.Errorf("failed to download install script: HTTP %d", resp.StatusCode)
-	}
-
-	script, err := io.ReadAll(resp.Body)
-	if err != nil {
-		return fmt.Errorf("failed to read install script: %w", err)
-	}
-
-	// Create a temporary file for the script
-	tmpDir := os.TempDir()
-	scriptPath := filepath.Join(tmpDir, fmt.Sprintf("install-%s.sh", name))
-	if err := os.WriteFile(scriptPath, script, 0o700); err != nil {
-		return fmt.Errorf("failed to write install script: %w", err)
-	}
-	defer os.Remove(scriptPath)
-
-	// Execute the script with bash
-	cmd := exec.Command("bash", scriptPath)
-	cmd.Stdin = os.Stdin
-	cmd.Stdout = os.Stdout
-	cmd.Stderr = os.Stderr
-
-	if err := cmd.Run(); err != nil {
-		return fmt.Errorf("install script failed: %w", err)
-	}
-
-	return nil
-}
-
-// SelectModel lets the user select a model to run.
-// ModelItem represents a model for selection.
-type ModelItem struct {
-	Name        string
-	Description string
-}
-
-// SingleSelector is a function type for single item selection.
-type SingleSelector func(title string, items []ModelItem) (string, error)
-
-// MultiSelector is a function type for multi item selection.
-type MultiSelector func(title string, items []ModelItem, preChecked []string) ([]string, error)
-
-// SelectModelWithSelector prompts the user to select a model using the provided selector.
-func SelectModelWithSelector(ctx context.Context, selector SingleSelector) (string, error) {
-	client, err := api.ClientFromEnvironment()
-	if err != nil {
-		return "", err
-	}
-
-	models, err := client.List(ctx)
-	if err != nil {
-		return "", err
-	}
-
-	if len(models.Models) == 0 {
-		return "", fmt.Errorf("no models available, run 'ollama pull <model>' first")
-	}
-
-	lastModel := LastModel()
-
-	var items []ModelItem
-	for _, m := range models.Models {
-		items = append(items, ModelItem{Name: m.Name})
-	}
-
-	// Sort with last model first, then alphabetically
-	slices.SortFunc(items, func(a, b ModelItem) int {
-		aIsLast := a.Name == lastModel
-		bIsLast := b.Name == lastModel
-		if aIsLast != bIsLast {
-			if aIsLast {
-				return -1
-			}
-			return 1
-		}
-		return strings.Compare(strings.ToLower(a.Name), strings.ToLower(b.Name))
-	})
-
-	return selector("Select model to run:", items)
-}
-
-func SelectModel(ctx context.Context) (string, error) {
-	return SelectModelWithSelector(ctx, defaultSingleSelector)
-}
-
-func defaultSingleSelector(title string, items []ModelItem) (string, error) {
-	selectItems := make([]selectItem, len(items))
-	for i, item := range items {
-		selectItems[i] = selectItem(item)
-	}
-	return selectPrompt(title, selectItems)
-}
-
-func defaultMultiSelector(title string, items []ModelItem, preChecked []string) ([]string, error) {
-	selectItems := make([]selectItem, len(items))
-	for i, item := range items {
-		selectItems[i] = selectItem(item)
-	}
-	return multiSelectPrompt(title, selectItems, preChecked)
 }

 func selectIntegration() (string, error) {
@@ -235,9 +55,6 @@ func selectIntegration() (string, error) {
 	names := slices.Sorted(maps.Keys(integrations))
 	var items []selectItem
 	for _, name := range names {
-		if integrationAliases[name] {
-			continue
-		}
 		r := integrations[name]
 		description := r.String()
 		if conn, err := loadIntegration(name); err == nil && len(conn.Models) > 0 {
@@ -249,8 +66,8 @@ func selectIntegration() (string, error) {
 	return selectPrompt("Select integration:", items)
 }

-// selectModelsWithSelectors lets the user select models for an integration using provided selectors.
-func selectModelsWithSelectors(ctx context.Context, name, current string, single SingleSelector, multi MultiSelector) ([]string, error) {
+// selectModels lets the user select models for an integration
+func selectModels(ctx context.Context, name, current string) ([]string, error) {
 	r, ok := integrations[name]
 	if !ok {
 		return nil, fmt.Errorf("unknown integration: %s", name)
@@ -266,59 +83,76 @@ func selectModelsWithSelectors(ctx context.Context, name, current string, single
 		return nil, err
 	}

-	var existing []modelInfo
-	for _, m := range models.Models {
-		existing = append(existing, modelInfo{Name: m.Name, Remote: m.RemoteModel != ""})
+	if len(models.Models) == 0 {
+		return nil, fmt.Errorf("no models available, run 'ollama pull <model>' first")
 	}

+	var items []selectItem
+	cloudModels := make(map[string]bool)
+	for _, m := range models.Models {
+		if m.RemoteModel != "" {
+			cloudModels[m.Name] = true
+		}
+		items = append(items, selectItem{Name: m.Name})
+	}
+
+	if len(items) == 0 {
+		return nil, fmt.Errorf("no local models available, run 'ollama pull <model>' first")
+	}
+
+	// Get previously configured models (saved config takes precedence)
 	var preChecked []string
 	if saved, err := loadIntegration(name); err == nil {
 		preChecked = saved.Models
 	} else if editor, ok := r.(Editor); ok {
 		preChecked = editor.Models()
 	}
-
-	items, preChecked, existingModels, cloudModels := buildModelList(existing, preChecked, current)
-
-	if len(items) == 0 {
-		return nil, fmt.Errorf("no models available")
+	checked := make(map[string]bool, len(preChecked))
+	for _, n := range preChecked {
+		checked[n] = true
 	}

+	// Resolve current to full name (e.g., "llama3.2" -> "llama3.2:latest")
+	for _, item := range items {
+		if item.Name == current || strings.HasPrefix(item.Name, current+":") {
+			current = item.Name
+			break
+		}
+	}
+
+	// If current model is configured, move to front of preChecked
+	if checked[current] {
+		preChecked = append([]string{current}, slices.DeleteFunc(preChecked, func(m string) bool { return m == current })...)
+	}
+
+	// Sort: checked first, then alphabetical
+	slices.SortFunc(items, func(a, b selectItem) int {
+		ac, bc := checked[a.Name], checked[b.Name]
+		if ac != bc {
+			if ac {
+				return -1
+			}
+			return 1
+		}
+		return strings.Compare(strings.ToLower(a.Name), strings.ToLower(b.Name))
+	})
+
 	var selected []string
+	// only editors support multi-model selection
 	if _, ok := r.(Editor); ok {
-		selected, err = multi(fmt.Sprintf("Select models for %s:", r), items, preChecked)
+		selected, err = multiSelectPrompt(fmt.Sprintf("Select models for %s:", r), items, preChecked)
 		if err != nil {
 			return nil, err
 		}
 	} else {
-		model, err := single(fmt.Sprintf("Select model for %s:", r), items)
+		model, err := selectPrompt(fmt.Sprintf("Select model for %s:", r), items)
 		if err != nil {
 			return nil, err
 		}
 		selected = []string{model}
 	}

-	var toPull []string
-	for _, m := range selected {
-		if !existingModels[m] {
-			toPull = append(toPull, m)
-		}
-	}
-	if len(toPull) > 0 {
-		msg := fmt.Sprintf("Download %s?", strings.Join(toPull, ", "))
-		if ok, err := confirmPrompt(msg); err != nil {
-			return nil, err
-		} else if !ok {
-			return nil, errCancelled
-		}
-		for _, m := range toPull {
-			fmt.Fprintf(os.Stderr, "\n")
-			if err := pullModel(ctx, client, m); err != nil {
-				return nil, fmt.Errorf("failed to pull %s: %w", m, err)
-			}
-		}
-	}
-
+	// if any model in selected is a cloud model, ensure signed in
 	var selectedCloudModels []string
 	for _, m := range selected {
 		if cloudModels[m] {
@@ -388,149 +222,44 @@ func selectModelsWithSelectors(ctx context.Context, name, current string, single
 	return selected, nil
 }

-// selectModels lets the user select models for an integration using default selectors.
-func selectModels(ctx context.Context, name, current string) ([]string, error) {
-	return selectModelsWithSelectors(ctx, name, current, defaultSingleSelector, defaultMultiSelector)
-}
-
-func runIntegration(name, modelName string, args []string) error {
+func runIntegration(name, modelName string) error {
 	r, ok := integrations[name]
 	if !ok {
 		return fmt.Errorf("unknown integration: %s", name)
 	}
 	fmt.Fprintf(os.Stderr, "\nLaunching %s with %s...\n", r, modelName)
-	return r.Run(modelName, args)
-}
-
-// LaunchIntegration launches the named integration using saved config or prompts for setup.
-func LaunchIntegration(name string) error {
-	r, ok := integrations[name]
-	if !ok {
-		return fmt.Errorf("unknown integration: %s", name)
-	}
-
-	// Try to use saved config
-	if config, err := loadIntegration(name); err == nil && len(config.Models) > 0 {
-		return runIntegration(name, config.Models[0], nil)
-	}
-
-	// No saved config - prompt user to run setup
-	return fmt.Errorf("%s is not configured. Run 'ollama launch %s' to set it up", r, name)
-}
-
-// ConfigureIntegrationWithSelectors allows the user to select/change the model for an integration using custom selectors.
-func ConfigureIntegrationWithSelectors(ctx context.Context, name string, single SingleSelector, multi MultiSelector) error {
-	r, ok := integrations[name]
-	if !ok {
-		return fmt.Errorf("unknown integration: %s", name)
-	}
-
-	models, err := selectModelsWithSelectors(ctx, name, "", single, multi)
-	if errors.Is(err, errCancelled) {
-		return nil
-	}
-	if err != nil {
-		return err
-	}
-
-	if editor, isEditor := r.(Editor); isEditor {
-		paths := editor.Paths()
-		if len(paths) > 0 {
-			fmt.Fprintf(os.Stderr, "This will modify your %s configuration:\n", r)
-			for _, p := range paths {
-				fmt.Fprintf(os.Stderr, "  %s\n", p)
-			}
-			fmt.Fprintf(os.Stderr, "Backups will be saved to %s/\n\n", backupDir())
-
-			if ok, _ := confirmPrompt("Proceed?"); !ok {
-				return nil
-			}
-		}
-
-		if err := editor.Edit(models); err != nil {
-			return fmt.Errorf("setup failed: %w", err)
-		}
-	}
-
-	if err := saveIntegration(name, models); err != nil {
-		return fmt.Errorf("failed to save: %w", err)
-	}
-
-	if len(models) == 1 {
-		fmt.Fprintf(os.Stderr, "Configured %s with %s\n", r, models[0])
-	} else {
-		fmt.Fprintf(os.Stderr, "Configured %s with %d models (default: %s)\n", r, len(models), models[0])
-	}
-
-	return nil
-}
-
-// ConfigureIntegration allows the user to select/change the model for an integration.
-func ConfigureIntegration(ctx context.Context, name string) error {
-	return ConfigureIntegrationWithSelectors(ctx, name, defaultSingleSelector, defaultMultiSelector)
+	return r.Run(modelName)
 }

 // LaunchCmd returns the cobra command for launching integrations.
-// The runTUI callback is called when no arguments are provided (alias for main TUI).
-func LaunchCmd(checkServerHeartbeat func(cmd *cobra.Command, args []string) error, runTUI func(cmd *cobra.Command)) *cobra.Command {
+func LaunchCmd(checkServerHeartbeat func(cmd *cobra.Command, args []string) error) *cobra.Command {
 	var modelFlag string
 	var configFlag bool

 	cmd := &cobra.Command{
-		Use:   "launch [INTEGRATION] [-- [EXTRA_ARGS...]]",
-		Short: "Launch the Ollama menu or an integration",
-		Long: `Launch the Ollama interactive menu, or directly launch a specific integration.
-
-Without arguments, this is equivalent to running 'ollama' directly.
+		Use:   "launch [INTEGRATION]",
+		Short: "Launch an integration with Ollama",
+		Long: `Launch an integration configured with Ollama models.

 Supported integrations:
  claude    Claude Code
+  clawdbot  Clawdbot
  codex     Codex
  droid     Droid
  opencode  OpenCode
-  openclaw  OpenClaw (aliases: clawdbot, moltbot)

 Examples:
  ollama launch
  ollama launch claude
  ollama launch claude --model <model>
-  ollama launch droid --config (does not auto-launch)
-  ollama launch codex -- -p myprofile (pass extra args to integration)
-  ollama launch codex -- --sandbox workspace-write`,
-		Args:    cobra.ArbitraryArgs,
+  ollama launch droid --config (does not auto-launch)`,
+		Args:    cobra.MaximumNArgs(1),
 		PreRunE: checkServerHeartbeat,
 		RunE: func(cmd *cobra.Command, args []string) error {
-			// No args - run the main TUI (same as 'ollama')
-			if len(args) == 0 && modelFlag == "" && !configFlag {
-				runTUI(cmd)
-				return nil
-			}
-
-			// Extract integration name and args to pass through using -- separator
 			var name string
-			var passArgs []string
-			dashIdx := cmd.ArgsLenAtDash()
-
-			if dashIdx == -1 {
-				// No "--" separator: only allow 0 or 1 args (integration name)
-				if len(args) > 1 {
-					return fmt.Errorf("unexpected arguments: %v\nUse '--' to pass extra arguments to the integration", args[1:])
-				}
-				if len(args) == 1 {
-					name = args[0]
-				}
+			if len(args) > 0 {
+				name = args[0]
 			} else {
-				// "--" was used: args before it = integration name, args after = passthrough
-				if dashIdx > 1 {
-					return fmt.Errorf("expected at most 1 integration name before '--', got %d", dashIdx)
-				}
-				if dashIdx == 1 {
-					name = args[0]
-				}
-				passArgs = args[dashIdx:]
-			}
-
-			if name == "" {
 				var err error
 				name, err = selectIntegration()
 				if errors.Is(err, errCancelled) {
@@ -546,14 +275,16 @@ Examples:
 				return fmt.Errorf("unknown integration: %s", name)
 			}

+			// If launching without --model or --config, use saved config if available
 			if !configFlag && modelFlag == "" {
 				if config, err := loadIntegration(name); err == nil && len(config.Models) > 0 {
-					return runIntegration(name, config.Models[0], passArgs)
+					return runIntegration(name, config.Models[0])
 				}
 			}

 			var models []string
 			if modelFlag != "" {
+				// When --model is specified, merge with existing models (new model becomes default)
 				models = []string{modelFlag}
 				if existing, err := loadIntegration(name); err == nil && len(existing.Models) > 0 {
 					for _, m := range existing.Models {
@@ -608,13 +339,13 @@ Examples:

 			if configFlag {
 				if launch, _ := confirmPrompt(fmt.Sprintf("\nLaunch %s now?", r)); launch {
-					return runIntegration(name, models[0], passArgs)
+					return runIntegration(name, models[0])
 				}
 				fmt.Fprintf(os.Stderr, "Run 'ollama launch %s' to start with %s\n", strings.ToLower(name), models[0])
 				return nil
 			}

-			return runIntegration(name, models[0], passArgs)
+			return runIntegration(name, models[0])
 		},
 	}

@@ -622,154 +353,3 @@ Examples:
 	cmd.Flags().BoolVar(&configFlag, "config", false, "Configure without launching")
 	return cmd
 }
-
-type modelInfo struct {
-	Name   string
-	Remote bool
-}
-
-// buildModelList merges existing models with recommendations, sorts them, and returns
-// the ordered items along with maps of existing and cloud model names.
-func buildModelList(existing []modelInfo, preChecked []string, current string) (items []ModelItem, orderedChecked []string, existingModels, cloudModels map[string]bool) {
-	existingModels = make(map[string]bool)
-	cloudModels = make(map[string]bool)
-	recommended := make(map[string]bool)
-	var hasLocalModel, hasCloudModel bool
-
-	for _, rec := range recommendedModels {
-		recommended[rec.Name] = true
-	}
-
-	for _, m := range existing {
-		existingModels[m.Name] = true
-		if m.Remote {
-			cloudModels[m.Name] = true
-			hasCloudModel = true
-		} else {
-			hasLocalModel = true
-		}
-		displayName := strings.TrimSuffix(m.Name, ":latest")
-		existingModels[displayName] = true
-		item := ModelItem{Name: displayName}
-		if recommended[displayName] {
-			item.Description = "recommended"
-		}
-		items = append(items, item)
-	}
-
-	for _, rec := range recommendedModels {
-		if existingModels[rec.Name] || existingModels[rec.Name+":latest"] {
-			continue
-		}
-		items = append(items, rec)
-		if isCloudModel(rec.Name) {
-			cloudModels[rec.Name] = true
-		}
-	}
-
-	checked := make(map[string]bool, len(preChecked))
-	for _, n := range preChecked {
-		checked[n] = true
-	}
-
-	// Resolve current to full name (e.g., "llama3.2" -> "llama3.2:latest")
-	for _, item := range items {
-		if item.Name == current || strings.HasPrefix(item.Name, current+":") {
-			current = item.Name
-			break
-		}
-	}
-
-	if checked[current] {
-		preChecked = append([]string{current}, slices.DeleteFunc(preChecked, func(m string) bool { return m == current })...)
-	}
-
-	// Non-existing models get "install?" suffix and are pushed to the bottom.
-	// When user has no models, preserve recommended order.
-	notInstalled := make(map[string]bool)
-	for i := range items {
-		if !existingModels[items[i].Name] {
-			notInstalled[items[i].Name] = true
-			if items[i].Description != "" {
-				items[i].Description += ", install?"
-			} else {
-				items[i].Description = "install?"
-			}
-		}
-	}
-
-	if hasLocalModel || hasCloudModel {
-		slices.SortStableFunc(items, func(a, b ModelItem) int {
-			ac, bc := checked[a.Name], checked[b.Name]
-			aNew, bNew := notInstalled[a.Name], notInstalled[b.Name]
-
-			if ac != bc {
-				if ac {
-					return -1
-				}
-				return 1
-			}
-			if !ac && !bc && aNew != bNew {
-				if aNew {
-					return 1
-				}
-				return -1
-			}
-			return strings.Compare(strings.ToLower(a.Name), strings.ToLower(b.Name))
-		})
-	}
-
-	return items, preChecked, existingModels, cloudModels
-}
-
-func isCloudModel(name string) bool {
-	return strings.HasSuffix(name, ":cloud")
-}
-
-func pullModel(ctx context.Context, client *api.Client, model string) error {
-	p := progress.NewProgress(os.Stderr)
-	defer p.Stop()
-
-	bars := make(map[string]*progress.Bar)
-	var status string
-	var spinner *progress.Spinner
-
-	fn := func(resp api.ProgressResponse) error {
-		if resp.Digest != "" {
-			if resp.Completed == 0 {
-				return nil
-			}
-
-			if spinner != nil {
-				spinner.Stop()
-			}
-
-			bar, ok := bars[resp.Digest]
-			if !ok {
-				name, isDigest := strings.CutPrefix(resp.Digest, "sha256:")
-				name = strings.TrimSpace(name)
-				if isDigest {
-					name = name[:min(12, len(name))]
-				}
-				bar = progress.NewBar(fmt.Sprintf("pulling %s:", name), resp.Total, resp.Completed)
-				bars[resp.Digest] = bar
-				p.Add(resp.Digest, bar)
-			}
-
-			bar.Set(resp.Completed)
-		} else if status != resp.Status {
-			if spinner != nil {
-				spinner.Stop()
-			}
-
-			status = resp.Status
-			spinner = progress.NewSpinner(status)
-			p.Add(status, spinner)
-		}
-
-		return nil
-	}
-
-	request := api.PullRequest{Name: model}
-	return client.Pull(ctx, &request, fn)
-}
--- a/cmd/config/integrations_test.go
+++ b/cmd/config/integrations_test.go
@@ -1,12 +1,10 @@
 package config

 import (
-	"fmt"
 	"slices"
 	"strings"
 	"testing"

-	"github.com/google/go-cmp/cmp"
 	"github.com/spf13/cobra"
 )

@@ -88,14 +86,12 @@ func TestLaunchCmd(t *testing.T) {
 	mockCheck := func(cmd *cobra.Command, args []string) error {
 		return nil
 	}
-	// Mock TUI function (not called in these tests)
-	mockTUI := func(cmd *cobra.Command) {}

-	cmd := LaunchCmd(mockCheck, mockTUI)
+	cmd := LaunchCmd(mockCheck)

 	t.Run("command structure", func(t *testing.T) {
-		if cmd.Use != "launch [INTEGRATION] [-- [EXTRA_ARGS...]]" {
-			t.Errorf("Use = %q, want %q", cmd.Use, "launch [INTEGRATION] [-- [EXTRA_ARGS...]]")
+		if cmd.Use != "launch [INTEGRATION]" {
+			t.Errorf("Use = %q, want %q", cmd.Use, "launch [INTEGRATION]")
 		}
 		if cmd.Short == "" {
 			t.Error("Short description should not be empty")
@@ -124,77 +120,8 @@ func TestLaunchCmd(t *testing.T) {
 	})
 }

-func TestLaunchCmd_TUICallback(t *testing.T) {
-	mockCheck := func(cmd *cobra.Command, args []string) error {
-		return nil
-	}
-
-	t.Run("no args calls TUI", func(t *testing.T) {
-		tuiCalled := false
-		mockTUI := func(cmd *cobra.Command) {
-			tuiCalled = true
-		}
-
-		cmd := LaunchCmd(mockCheck, mockTUI)
-		cmd.SetArgs([]string{})
-		_ = cmd.Execute()
-
-		if !tuiCalled {
-			t.Error("TUI callback should be called when no args provided")
-		}
-	})
-
-	t.Run("integration arg bypasses TUI", func(t *testing.T) {
-		tuiCalled := false
-		mockTUI := func(cmd *cobra.Command) {
-			tuiCalled = true
-		}
-
-		cmd := LaunchCmd(mockCheck, mockTUI)
-		cmd.SetArgs([]string{"claude"})
-		// Will error because claude isn't configured, but that's OK
-		_ = cmd.Execute()
-
-		if tuiCalled {
-			t.Error("TUI callback should NOT be called when integration arg provided")
-		}
-	})
-
-	t.Run("--model flag bypasses TUI", func(t *testing.T) {
-		tuiCalled := false
-		mockTUI := func(cmd *cobra.Command) {
-			tuiCalled = true
-		}
-
-		cmd := LaunchCmd(mockCheck, mockTUI)
-		cmd.SetArgs([]string{"--model", "test-model"})
-		// Will error because no integration specified, but that's OK
-		_ = cmd.Execute()
-
-		if tuiCalled {
-			t.Error("TUI callback should NOT be called when --model flag provided")
-		}
-	})
-
-	t.Run("--config flag bypasses TUI", func(t *testing.T) {
-		tuiCalled := false
-		mockTUI := func(cmd *cobra.Command) {
-			tuiCalled = true
-		}
-
-		cmd := LaunchCmd(mockCheck, mockTUI)
-		cmd.SetArgs([]string{"--config"})
-		// Will error because no integration specified, but that's OK
-		_ = cmd.Execute()
-
-		if tuiCalled {
-			t.Error("TUI callback should NOT be called when --config flag provided")
-		}
-	})
-}
-
 func TestRunIntegration_UnknownIntegration(t *testing.T) {
-	err := runIntegration("unknown-integration", "model", nil)
+	err := runIntegration("unknown-integration", "model")
 	if err == nil {
 		t.Error("expected error for unknown integration, got nil")
 	}
@@ -233,7 +160,7 @@ func TestHasLocalModel_DocumentsHeuristic(t *testing.T) {

 func TestLaunchCmd_NilHeartbeat(t *testing.T) {
 	// This should not panic - cmd creation should work even with nil
-	cmd := LaunchCmd(nil, nil)
+	cmd := LaunchCmd(nil)
 	if cmd == nil {
 		t.Fatal("LaunchCmd returned nil")
 	}
@@ -247,336 +174,15 @@ func TestLaunchCmd_NilHeartbeat(t *testing.T) {
 func TestAllIntegrations_HaveRequiredMethods(t *testing.T) {
 	for name, r := range integrations {
 		t.Run(name, func(t *testing.T) {
+			// Test String() doesn't panic and returns non-empty
 			displayName := r.String()
 			if displayName == "" {
 				t.Error("String() should not return empty")
 			}
-			var _ func(string, []string) error = r.Run
+
+			// Test Run() exists (we can't call it without actually running the command)
+			// Just verify the method is available
+			var _ func(string) error = r.Run
 		})
 	}
 }
-
-func TestParseArgs(t *testing.T) {
-	// Tests reflect cobra's ArgsLenAtDash() semantics:
-	// - cobra strips "--" from args
-	// - ArgsLenAtDash() returns the index where "--" was, or -1
-	tests := []struct {
-		name     string
-		args     []string // args as cobra delivers them (no "--")
-		dashIdx  int      // what ArgsLenAtDash() returns
-		wantName string
-		wantArgs []string
-		wantErr  bool
-	}{
-		{
-			name:     "no extra args, no dash",
-			args:     []string{"claude"},
-			dashIdx:  -1,
-			wantName: "claude",
-		},
-		{
-			name:     "with extra args after --",
-			args:     []string{"codex", "-p", "myprofile"},
-			dashIdx:  1,
-			wantName: "codex",
-			wantArgs: []string{"-p", "myprofile"},
-		},
-		{
-			name:     "extra args only after --",
-			args:     []string{"codex", "--sandbox", "workspace-write"},
-			dashIdx:  1,
-			wantName: "codex",
-			wantArgs: []string{"--sandbox", "workspace-write"},
-		},
-		{
-			name:     "-- at end with no args after",
-			args:     []string{"claude"},
-			dashIdx:  1,
-			wantName: "claude",
-		},
-		{
-			name:     "-- with no integration name",
-			args:     []string{"--verbose"},
-			dashIdx:  0,
-			wantName: "",
-			wantArgs: []string{"--verbose"},
-		},
-		{
-			name:    "multiple args before -- is error",
-			args:    []string{"claude", "codex", "--verbose"},
-			dashIdx: 2,
-			wantErr: true,
-		},
-		{
-			name:    "multiple args without -- is error",
-			args:    []string{"claude", "codex"},
-			dashIdx: -1,
-			wantErr: true,
-		},
-		{
-			name:     "no args, no dash",
-			args:     []string{},
-			dashIdx:  -1,
-			wantName: "",
-		},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			// Simulate the parsing logic from LaunchCmd using dashIdx
-			var name string
-			var parsedArgs []string
-			var err error
-
-			dashIdx := tt.dashIdx
-			args := tt.args
-
-			if dashIdx == -1 {
-				if len(args) > 1 {
-					err = fmt.Errorf("unexpected arguments: %v", args[1:])
-				} else if len(args) == 1 {
-					name = args[0]
-				}
-			} else {
-				if dashIdx > 1 {
-					err = fmt.Errorf("expected at most 1 integration name before '--', got %d", dashIdx)
-				} else {
-					if dashIdx == 1 {
-						name = args[0]
-					}
-					parsedArgs = args[dashIdx:]
-				}
-			}
-
-			if tt.wantErr {
-				if err == nil {
-					t.Fatal("expected error, got nil")
-				}
-				return
-			}
-			if err != nil {
-				t.Fatalf("unexpected error: %v", err)
-			}
-			if name != tt.wantName {
-				t.Errorf("name = %q, want %q", name, tt.wantName)
-			}
-			if !slices.Equal(parsedArgs, tt.wantArgs) {
-				t.Errorf("args = %v, want %v", parsedArgs, tt.wantArgs)
-			}
-		})
-	}
-}
-
-func TestIsCloudModel(t *testing.T) {
-	tests := []struct {
-		name string
-		want bool
-	}{
-		{"glm-4.7:cloud", true},
-		{"kimi-k2.5:cloud", true},
-		{"glm-4.7-flash", false},
-		{"glm-4.7-flash:latest", false},
-		{"cloud-model", false},
-		{"model:cloudish", false},
-	}
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			if got := isCloudModel(tt.name); got != tt.want {
-				t.Errorf("isCloudModel(%q) = %v, want %v", tt.name, got, tt.want)
-			}
-		})
-	}
-}
-
-func names(items []ModelItem) []string {
-	var out []string
-	for _, item := range items {
-		out = append(out, item.Name)
-	}
-	return out
-}
-
-func TestBuildModelList_NoExistingModels(t *testing.T) {
-	items, _, _, _ := buildModelList(nil, nil, "")
-
-	want := []string{"glm-4.7-flash", "qwen3:8b", "glm-4.7:cloud", "kimi-k2.5:cloud"}
-	if diff := cmp.Diff(want, names(items)); diff != "" {
-		t.Errorf("with no existing models, items should be recommended in order (-want +got):\n%s", diff)
-	}
-
-	for _, item := range items {
-		if !strings.HasSuffix(item.Description, "install?") {
-			t.Errorf("item %q should have description ending with 'install?', got %q", item.Name, item.Description)
-		}
-	}
-}
-
-func TestBuildModelList_OnlyLocalModels_CloudRecsAtBottom(t *testing.T) {
-	existing := []modelInfo{
-		{Name: "llama3.2:latest", Remote: false},
-		{Name: "qwen2.5:latest", Remote: false},
-	}
-
-	items, _, _, _ := buildModelList(existing, nil, "")
-	got := names(items)
-
-	want := []string{"llama3.2", "qwen2.5", "glm-4.7-flash", "glm-4.7:cloud", "kimi-k2.5:cloud", "qwen3:8b"}
-	if diff := cmp.Diff(want, got); diff != "" {
-		t.Errorf("cloud recs should be at bottom (-want +got):\n%s", diff)
-	}
-}
-
-func TestBuildModelList_BothCloudAndLocal_RegularSort(t *testing.T) {
-	existing := []modelInfo{
-		{Name: "llama3.2:latest", Remote: false},
-		{Name: "glm-4.7:cloud", Remote: true},
-	}
-
-	items, _, _, _ := buildModelList(existing, nil, "")
-	got := names(items)
-
-	want := []string{"glm-4.7:cloud", "llama3.2", "glm-4.7-flash", "kimi-k2.5:cloud", "qwen3:8b"}
-	if diff := cmp.Diff(want, got); diff != "" {
-		t.Errorf("mixed models should be alphabetical (-want +got):\n%s", diff)
-	}
-}
-
-func TestBuildModelList_PreCheckedFirst(t *testing.T) {
-	existing := []modelInfo{
-		{Name: "llama3.2:latest", Remote: false},
-		{Name: "glm-4.7:cloud", Remote: true},
-	}
-
-	items, _, _, _ := buildModelList(existing, []string{"llama3.2"}, "")
-	got := names(items)
-
-	if got[0] != "llama3.2" {
-		t.Errorf("pre-checked model should be first, got %v", got)
-	}
-}
-
-func TestBuildModelList_ExistingRecommendedMarked(t *testing.T) {
-	existing := []modelInfo{
-		{Name: "glm-4.7-flash", Remote: false},
-		{Name: "glm-4.7:cloud", Remote: true},
-	}
-
-	items, _, _, _ := buildModelList(existing, nil, "")
-
-	for _, item := range items {
-		switch item.Name {
-		case "glm-4.7-flash", "glm-4.7:cloud":
-			if strings.HasSuffix(item.Description, "install?") {
-				t.Errorf("installed recommended %q should not have 'install?' suffix, got %q", item.Name, item.Description)
-			}
-		case "kimi-k2.5:cloud", "qwen3:8b":
-			if !strings.HasSuffix(item.Description, "install?") {
-				t.Errorf("non-installed recommended %q should have 'install?' suffix, got %q", item.Name, item.Description)
-			}
-		}
-	}
-}
-
-func TestBuildModelList_ExistingCloudModelsNotPushedToBottom(t *testing.T) {
-	existing := []modelInfo{
-		{Name: "glm-4.7-flash", Remote: false},
-		{Name: "glm-4.7:cloud", Remote: true},
-	}
-
-	items, _, _, _ := buildModelList(existing, nil, "")
-	got := names(items)
-
-	// glm-4.7-flash and glm-4.7:cloud are installed so they sort normally;
-	// kimi-k2.5:cloud and qwen3:8b are not installed so they go to the bottom
-	want := []string{"glm-4.7-flash", "glm-4.7:cloud", "kimi-k2.5:cloud", "qwen3:8b"}
-	if diff := cmp.Diff(want, got); diff != "" {
-		t.Errorf("existing cloud models should sort normally (-want +got):\n%s", diff)
-	}
-}
-
-func TestBuildModelList_HasRecommendedCloudModel_OnlyNonInstalledAtBottom(t *testing.T) {
-	existing := []modelInfo{
-		{Name: "llama3.2:latest", Remote: false},
-		{Name: "kimi-k2.5:cloud", Remote: true},
-	}
-
-	items, _, _, _ := buildModelList(existing, nil, "")
-	got := names(items)
-
-	// kimi-k2.5:cloud is installed so it sorts normally;
-	// the rest of the recommendations are not installed so they go to the bottom
-	want := []string{"kimi-k2.5:cloud", "llama3.2", "glm-4.7-flash", "glm-4.7:cloud", "qwen3:8b"}
-	if diff := cmp.Diff(want, got); diff != "" {
-		t.Errorf("only non-installed models should be at bottom (-want +got):\n%s", diff)
-	}
-
-	for _, item := range items {
-		if !slices.Contains([]string{"kimi-k2.5:cloud", "llama3.2"}, item.Name) {
-			if !strings.HasSuffix(item.Description, "install?") {
-				t.Errorf("non-installed %q should have 'install?' suffix, got %q", item.Name, item.Description)
-			}
-		}
-	}
-}
-
-func TestBuildModelList_LatestTagStripped(t *testing.T) {
-	existing := []modelInfo{
-		{Name: "glm-4.7-flash:latest", Remote: false},
-		{Name: "llama3.2:latest", Remote: false},
-	}
-
-	items, _, existingModels, _ := buildModelList(existing, nil, "")
-	got := names(items)
-
-	// :latest should be stripped from display names
-	for _, name := range got {
-		if strings.HasSuffix(name, ":latest") {
-			t.Errorf("name %q should not have :latest suffix", name)
-		}
-	}
-
-	// glm-4.7-flash should not be duplicated (existing :latest matches the recommendation)
-	count := 0
-	for _, name := range got {
-		if name == "glm-4.7-flash" {
-			count++
-		}
-	}
-	if count != 1 {
-		t.Errorf("glm-4.7-flash should appear exactly once, got %d in %v", count, got)
-	}
-
-	// Stripped name should be in existingModels so it won't be pulled
-	if !existingModels["glm-4.7-flash"] {
-		t.Error("glm-4.7-flash should be in existingModels")
-	}
-}
-
-func TestBuildModelList_ReturnsExistingAndCloudMaps(t *testing.T) {
-	existing := []modelInfo{
-		{Name: "llama3.2:latest", Remote: false},
-		{Name: "glm-4.7:cloud", Remote: true},
-	}
-
-	_, _, existingModels, cloudModels := buildModelList(existing, nil, "")
-
-	if !existingModels["llama3.2"] {
-		t.Error("llama3.2 should be in existingModels")
-	}
-	if !existingModels["glm-4.7:cloud"] {
-		t.Error("glm-4.7:cloud should be in existingModels")
-	}
-	if existingModels["glm-4.7-flash"] {
-		t.Error("glm-4.7-flash should not be in existingModels (it's a recommendation)")
-	}
-
-	if !cloudModels["glm-4.7:cloud"] {
-		t.Error("glm-4.7:cloud should be in cloudModels")
-	}
-	if !cloudModels["kimi-k2.5:cloud"] {
-		t.Error("kimi-k2.5:cloud should be in cloudModels (recommended cloud)")
-	}
-	if cloudModels["llama3.2"] {
-		t.Error("llama3.2 should not be in cloudModels")
-	}
-}
--- a/cmd/config/opencode.go
+++ b/cmd/config/opencode.go
@@ -18,7 +18,7 @@ type OpenCode struct{}

 func (o *OpenCode) String() string { return "OpenCode" }

-func (o *OpenCode) Run(model string, args []string) error {
+func (o *OpenCode) Run(model string) error {
 	if _, err := exec.LookPath("opencode"); err != nil {
 		return fmt.Errorf("opencode is not installed, install from https://opencode.ai")
 	}
@@ -32,7 +32,7 @@ func (o *OpenCode) Run(model string, args []string) error {
 		return fmt.Errorf("setup failed: %w", err)
 	}

-	cmd := exec.Command("opencode", args...)
+	cmd := exec.Command("opencode")
 	cmd.Stdin = os.Stdin
 	cmd.Stdout = os.Stdout
 	cmd.Stderr = os.Stderr
--- a/cmd/config/selector.go
+++ b/cmd/config/selector.go
@@ -275,11 +275,7 @@ func parseInput(r io.Reader) (inputEvent, byte, error) {
 func renderSelect(w io.Writer, prompt string, s *selectState) int {
 	filtered := s.filtered()

-	if s.filter == "" {
-		fmt.Fprintf(w, "%s %sType to filter...%s\r\n", prompt, ansiGray, ansiReset)
-	} else {
-		fmt.Fprintf(w, "%s %s\r\n", prompt, s.filter)
-	}
+	fmt.Fprintf(w, "%s %s\r\n", prompt, s.filter)
 	lineCount := 1

 	if len(filtered) == 0 {
@@ -318,11 +314,7 @@ func renderSelect(w io.Writer, prompt string, s *selectState) int {
 func renderMultiSelect(w io.Writer, prompt string, s *multiSelectState) int {
 	filtered := s.filtered()

-	if s.filter == "" {
-		fmt.Fprintf(w, "%s %sType to filter...%s\r\n", prompt, ansiGray, ansiReset)
-	} else {
-		fmt.Fprintf(w, "%s %s\r\n", prompt, s.filter)
-	}
+	fmt.Fprintf(w, "%s %s\r\n", prompt, s.filter)
 	lineCount := 1

 	if len(filtered) == 0 {
@@ -353,15 +345,10 @@ func renderMultiSelect(w io.Writer, prompt string, s *multiSelectState) int {
 				suffix = " " + ansiGray + "(default)" + ansiReset
 			}

-			desc := ""
-			if item.Description != "" {
-				desc = " " + ansiGray + "- " + item.Description + ansiReset
-			}
-
 			if idx == s.highlighted && !s.focusOnButton {
-				fmt.Fprintf(w, "  %s%s %s %s%s%s%s\r\n", ansiBold, prefix, checkbox, item.Name, ansiReset, desc, suffix)
+				fmt.Fprintf(w, "  %s%s %s %s%s%s\r\n", ansiBold, prefix, checkbox, item.Name, ansiReset, suffix)
 			} else {
-				fmt.Fprintf(w, "  %s %s %s%s%s\r\n", prefix, checkbox, item.Name, desc, suffix)
+				fmt.Fprintf(w, "  %s %s %s%s\r\n", prefix, checkbox, item.Name, suffix)
 			}
 			lineCount++
 		}
--- a/cmd/tui/selector.go
+++ b/cmd/tui/selector.go
@@ -1,471 +0,0 @@
-package tui
-
-import (
-	"errors"
-	"fmt"
-	"strings"
-
-	tea "github.com/charmbracelet/bubbletea"
-	"github.com/charmbracelet/lipgloss"
-)
-
-var (
-	selectorTitleStyle = lipgloss.NewStyle().
-				Bold(true).
-				Foreground(lipgloss.Color("147"))
-
-	selectorItemStyle = lipgloss.NewStyle().
-				PaddingLeft(4)
-
-	selectorSelectedItemStyle = lipgloss.NewStyle().
-					PaddingLeft(2).
-					Foreground(lipgloss.Color("147")).
-					Bold(true)
-
-	selectorDescStyle = lipgloss.NewStyle().
-				Foreground(lipgloss.Color("241"))
-
-	selectorFilterStyle = lipgloss.NewStyle().
-				Foreground(lipgloss.Color("241")).
-				Italic(true)
-
-	selectorInputStyle = lipgloss.NewStyle().
-				Foreground(lipgloss.Color("252"))
-
-	selectorCheckboxStyle = lipgloss.NewStyle().
-				Foreground(lipgloss.Color("241"))
-
-	selectorCheckboxCheckedStyle = lipgloss.NewStyle().
-					Foreground(lipgloss.Color("147"))
-
-	selectorDefaultTagStyle = lipgloss.NewStyle().
-				Foreground(lipgloss.Color("241")).
-				Italic(true)
-
-	selectorHelpStyle = lipgloss.NewStyle().
-				Foreground(lipgloss.Color("241"))
-
-	selectorMoreStyle = lipgloss.NewStyle().
-				PaddingLeft(4).
-				Foreground(lipgloss.Color("241")).
-				Italic(true)
-)
-
-const maxSelectorItems = 10
-
-// ErrCancelled is returned when the user cancels the selection.
-var ErrCancelled = errors.New("cancelled")
-
-// SelectItem represents an item that can be selected.
-type SelectItem struct {
-	Name        string
-	Description string
-}
-
-// selectorModel is the bubbletea model for single selection.
-type selectorModel struct {
-	title        string
-	items        []SelectItem
-	filter       string
-	cursor       int
-	scrollOffset int
-	selected     string
-	cancelled    bool
-}
-
-func (m selectorModel) filteredItems() []SelectItem {
-	if m.filter == "" {
-		return m.items
-	}
-	filterLower := strings.ToLower(m.filter)
-	var result []SelectItem
-	for _, item := range m.items {
-		if strings.Contains(strings.ToLower(item.Name), filterLower) {
-			result = append(result, item)
-		}
-	}
-	return result
-}
-
-func (m selectorModel) Init() tea.Cmd {
-	return nil
-}
-
-func (m selectorModel) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
-	switch msg := msg.(type) {
-	case tea.KeyMsg:
-		filtered := m.filteredItems()
-
-		switch msg.Type {
-		case tea.KeyCtrlC, tea.KeyEsc:
-			m.cancelled = true
-			return m, tea.Quit
-
-		case tea.KeyEnter:
-			if len(filtered) > 0 && m.cursor < len(filtered) {
-				m.selected = filtered[m.cursor].Name
-			}
-			return m, tea.Quit
-
-		case tea.KeyUp:
-			if m.cursor > 0 {
-				m.cursor--
-				if m.cursor < m.scrollOffset {
-					m.scrollOffset = m.cursor
-				}
-			}
-
-		case tea.KeyDown:
-			if m.cursor < len(filtered)-1 {
-				m.cursor++
-				if m.cursor >= m.scrollOffset+maxSelectorItems {
-					m.scrollOffset = m.cursor - maxSelectorItems + 1
-				}
-			}
-
-		case tea.KeyBackspace:
-			if len(m.filter) > 0 {
-				m.filter = m.filter[:len(m.filter)-1]
-				m.cursor = 0
-				m.scrollOffset = 0
-			}
-
-		case tea.KeyRunes:
-			m.filter += string(msg.Runes)
-			m.cursor = 0
-			m.scrollOffset = 0
-		}
-	}
-
-	return m, nil
-}
-
-func (m selectorModel) View() string {
-	// Clear screen when exiting
-	if m.cancelled || m.selected != "" {
-		return ""
-	}
-
-	var s strings.Builder
-
-	// Title with filter
-	s.WriteString(selectorTitleStyle.Render(m.title))
-	s.WriteString(" ")
-	if m.filter == "" {
-		s.WriteString(selectorFilterStyle.Render("Type to filter..."))
-	} else {
-		s.WriteString(selectorInputStyle.Render(m.filter))
-	}
-	s.WriteString("\n\n")
-
-	filtered := m.filteredItems()
-
-	if len(filtered) == 0 {
-		s.WriteString(selectorItemStyle.Render(selectorDescStyle.Render("(no matches)")))
-		s.WriteString("\n")
-	} else {
-		displayCount := min(len(filtered), maxSelectorItems)
-
-		for i := range displayCount {
-			idx := m.scrollOffset + i
-			if idx >= len(filtered) {
-				break
-			}
-			item := filtered[idx]
-
-			if idx == m.cursor {
-				s.WriteString(selectorSelectedItemStyle.Render("▸ " + item.Name))
-			} else {
-				s.WriteString(selectorItemStyle.Render(item.Name))
-			}
-
-			if item.Description != "" {
-				s.WriteString(" ")
-				s.WriteString(selectorDescStyle.Render("- " + item.Description))
-			}
-			s.WriteString("\n")
-		}
-
-		if remaining := len(filtered) - m.scrollOffset - displayCount; remaining > 0 {
-			s.WriteString(selectorMoreStyle.Render(fmt.Sprintf("... and %d more", remaining)))
-			s.WriteString("\n")
-		}
-	}
-
-	s.WriteString("\n")
-	s.WriteString(selectorHelpStyle.Render("↑/↓ navigate • enter select • esc cancel"))
-
-	return s.String()
-}
-
-// SelectSingle prompts the user to select a single item from a list.
-func SelectSingle(title string, items []SelectItem) (string, error) {
-	if len(items) == 0 {
-		return "", fmt.Errorf("no items to select from")
-	}
-
-	m := selectorModel{
-		title: title,
-		items: items,
-	}
-
-	p := tea.NewProgram(m)
-	finalModel, err := p.Run()
-	if err != nil {
-		return "", fmt.Errorf("error running selector: %w", err)
-	}
-
-	fm := finalModel.(selectorModel)
-	if fm.cancelled {
-		return "", ErrCancelled
-	}
-
-	return fm.selected, nil
-}
-
-// multiSelectorModel is the bubbletea model for multi selection.
-type multiSelectorModel struct {
-	title        string
-	items        []SelectItem
-	itemIndex    map[string]int
-	filter       string
-	cursor       int
-	scrollOffset int
-	checked      map[int]bool
-	checkOrder   []int
-	cancelled    bool
-	confirmed    bool
-}
-
-func newMultiSelectorModel(title string, items []SelectItem, preChecked []string) multiSelectorModel {
-	m := multiSelectorModel{
-		title:     title,
-		items:     items,
-		itemIndex: make(map[string]int, len(items)),
-		checked:   make(map[int]bool),
-	}
-
-	for i, item := range items {
-		m.itemIndex[item.Name] = i
-	}
-
-	for _, name := range preChecked {
-		if idx, ok := m.itemIndex[name]; ok {
-			m.checked[idx] = true
-			m.checkOrder = append(m.checkOrder, idx)
-		}
-	}
-
-	return m
-}
-
-func (m multiSelectorModel) filteredItems() []SelectItem {
-	if m.filter == "" {
-		return m.items
-	}
-	filterLower := strings.ToLower(m.filter)
-	var result []SelectItem
-	for _, item := range m.items {
-		if strings.Contains(strings.ToLower(item.Name), filterLower) {
-			result = append(result, item)
-		}
-	}
-	return result
-}
-
-func (m *multiSelectorModel) toggleItem() {
-	filtered := m.filteredItems()
-	if len(filtered) == 0 || m.cursor >= len(filtered) {
-		return
-	}
-
-	item := filtered[m.cursor]
-	origIdx := m.itemIndex[item.Name]
-
-	if m.checked[origIdx] {
-		delete(m.checked, origIdx)
-		for i, idx := range m.checkOrder {
-			if idx == origIdx {
-				m.checkOrder = append(m.checkOrder[:i], m.checkOrder[i+1:]...)
-				break
-			}
-		}
-	} else {
-		m.checked[origIdx] = true
-		m.checkOrder = append(m.checkOrder, origIdx)
-	}
-}
-
-func (m multiSelectorModel) selectedCount() int {
-	return len(m.checkOrder)
-}
-
-func (m multiSelectorModel) Init() tea.Cmd {
-	return nil
-}
-
-func (m multiSelectorModel) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
-	switch msg := msg.(type) {
-	case tea.KeyMsg:
-		filtered := m.filteredItems()
-
-		switch msg.Type {
-		case tea.KeyCtrlC, tea.KeyEsc:
-			m.cancelled = true
-			return m, tea.Quit
-
-		case tea.KeyEnter:
-			// Enter confirms if at least one item is selected
-			if len(m.checkOrder) > 0 {
-				m.confirmed = true
-				return m, tea.Quit
-			}
-
-		case tea.KeySpace:
-			// Space always toggles selection
-			m.toggleItem()
-
-		case tea.KeyUp:
-			if m.cursor > 0 {
-				m.cursor--
-				if m.cursor < m.scrollOffset {
-					m.scrollOffset = m.cursor
-				}
-			}
-
-		case tea.KeyDown:
-			if m.cursor < len(filtered)-1 {
-				m.cursor++
-				if m.cursor >= m.scrollOffset+maxSelectorItems {
-					m.scrollOffset = m.cursor - maxSelectorItems + 1
-				}
-			}
-
-		case tea.KeyBackspace:
-			if len(m.filter) > 0 {
-				m.filter = m.filter[:len(m.filter)-1]
-				m.cursor = 0
-				m.scrollOffset = 0
-			}
-
-		case tea.KeyRunes:
-			m.filter += string(msg.Runes)
-			m.cursor = 0
-			m.scrollOffset = 0
-		}
-	}
-
-	return m, nil
-}
-
-func (m multiSelectorModel) View() string {
-	// Clear screen when exiting
-	if m.cancelled || m.confirmed {
-		return ""
-	}
-
-	var s strings.Builder
-
-	// Title with filter
-	s.WriteString(selectorTitleStyle.Render(m.title))
-	s.WriteString(" ")
-	if m.filter == "" {
-		s.WriteString(selectorFilterStyle.Render("Type to filter..."))
-	} else {
-		s.WriteString(selectorInputStyle.Render(m.filter))
-	}
-	s.WriteString("\n\n")
-
-	filtered := m.filteredItems()
-
-	if len(filtered) == 0 {
-		s.WriteString(selectorItemStyle.Render(selectorDescStyle.Render("(no matches)")))
-		s.WriteString("\n")
-	} else {
-		displayCount := min(len(filtered), maxSelectorItems)
-
-		for i := range displayCount {
-			idx := m.scrollOffset + i
-			if idx >= len(filtered) {
-				break
-			}
-			item := filtered[idx]
-			origIdx := m.itemIndex[item.Name]
-
-			// Checkbox
-			var checkbox string
-			if m.checked[origIdx] {
-				checkbox = selectorCheckboxCheckedStyle.Render("[x]")
-			} else {
-				checkbox = selectorCheckboxStyle.Render("[ ]")
-			}
-
-			// Cursor and name
-			var line string
-			if idx == m.cursor {
-				line = selectorSelectedItemStyle.Render("▸ ") + checkbox + " " + selectorSelectedItemStyle.Render(item.Name)
-			} else {
-				line = "  " + checkbox + " " + item.Name
-			}
-
-			// Default tag
-			if len(m.checkOrder) > 0 && m.checkOrder[0] == origIdx {
-				line += " " + selectorDefaultTagStyle.Render("(default)")
-			}
-
-			s.WriteString(line)
-			s.WriteString("\n")
-		}
-
-		if remaining := len(filtered) - m.scrollOffset - displayCount; remaining > 0 {
-			s.WriteString(selectorMoreStyle.Render(fmt.Sprintf("... and %d more", remaining)))
-			s.WriteString("\n")
-		}
-	}
-
-	s.WriteString("\n")
-
-	// Status line
-	count := m.selectedCount()
-	if count == 0 {
-		s.WriteString(selectorDescStyle.Render("  Select at least one model."))
-	} else {
-		s.WriteString(selectorDescStyle.Render(fmt.Sprintf("  %d selected - press enter to continue", count)))
-	}
-	s.WriteString("\n\n")
-
-	s.WriteString(selectorHelpStyle.Render("↑/↓ navigate • space toggle • enter confirm • esc cancel"))
-
-	return s.String()
-}
-
-// SelectMultiple prompts the user to select multiple items from a list.
-func SelectMultiple(title string, items []SelectItem, preChecked []string) ([]string, error) {
-	if len(items) == 0 {
-		return nil, fmt.Errorf("no items to select from")
-	}
-
-	m := newMultiSelectorModel(title, items, preChecked)
-
-	p := tea.NewProgram(m)
-	finalModel, err := p.Run()
-	if err != nil {
-		return nil, fmt.Errorf("error running selector: %w", err)
-	}
-
-	fm := finalModel.(multiSelectorModel)
-	if fm.cancelled {
-		return nil, ErrCancelled
-	}
-
-	if !fm.confirmed {
-		return nil, ErrCancelled
-	}
-
-	var result []string
-	for _, idx := range fm.checkOrder {
-		result = append(result, fm.items[idx].Name)
-	}
-
-	return result, nil
-}
--- a/cmd/tui/tui.go
+++ b/cmd/tui/tui.go
@@ -1,442 +0,0 @@
-package tui
-
-import (
-	"context"
-	"fmt"
-	"strings"
-	"time"
-
-	tea "github.com/charmbracelet/bubbletea"
-	"github.com/charmbracelet/lipgloss"
-	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/cmd/config"
-	"github.com/ollama/ollama/version"
-)
-
-const (
-	logoNormal = ` ▆▁▂▃▂▁▆
-▟███████▙
-█▙▛▄ ▄▜▟█
-▟█▙▀▀▀▟█▙
-█████████
-▟███████▙
-▀▀▀▀▀▀▀▀▀`
-
-	logoBlink = ` ▆▁▂▃▂▁▆
-▟███████▙
-██▛▄ ▄▜██
-▟█▙▀▀▀▟█▙
-█████████
-▟███████▙
-▀▀▀▀▀▀▀▀▀`
-
-	blinkInterval = 15 * time.Second
-	blinkDuration = 250 * time.Millisecond
-)
-
-type (
-	blinkMsg   struct{}
-	unblinkMsg struct{}
-)
-
-var (
-	logoStyle = lipgloss.NewStyle().
-			Foreground(lipgloss.Color("255")).
-			Background(lipgloss.Color("0"))
-
-	titleStyle = lipgloss.NewStyle().
-			Bold(true).
-			MarginBottom(1)
-
-	versionStyle = lipgloss.NewStyle().
-			Foreground(lipgloss.Color("245"))
-
-	itemStyle = lipgloss.NewStyle().
-			PaddingLeft(2)
-
-	selectedStyle = lipgloss.NewStyle().
-			PaddingLeft(2).
-			Foreground(lipgloss.Color("147")).
-			Bold(true)
-
-	greyedStyle = lipgloss.NewStyle().
-			PaddingLeft(2).
-			Foreground(lipgloss.Color("241"))
-
-	greyedSelectedStyle = lipgloss.NewStyle().
-				PaddingLeft(2).
-				Foreground(lipgloss.Color("243"))
-
-	descStyle = lipgloss.NewStyle().
-			PaddingLeft(4).
-			Foreground(lipgloss.Color("241"))
-
-	modelStyle = lipgloss.NewStyle().
-			Foreground(lipgloss.Color("245"))
-
-	notInstalledStyle = lipgloss.NewStyle().
-				Foreground(lipgloss.Color("241")).
-				Italic(true)
-)
-
-type menuItem struct {
-	title       string
-	description string
-	integration string // integration name for loading model config, empty if not an integration
-	isRunModel  bool   // true for the "Run a model" option
-	isOthers    bool   // true for the "Others..." toggle item
-}
-
-var mainMenuItems = []menuItem{
-	{
-		title:       "Run a model",
-		description: "Start an interactive chat with a local model",
-		isRunModel:  true,
-	},
-	{
-		title:       "Launch Claude Code",
-		description: "Open Claude Code AI assistant",
-		integration: "claude",
-	},
-	{
-		title:       "Launch Open Claw",
-		description: "Open the Open Claw integration",
-		integration: "openclaw",
-	},
-}
-
-var othersMenuItem = menuItem{
-	title:       "Others...",
-	description: "Show additional integrations",
-	isOthers:    true,
-}
-
-// getOtherIntegrations returns the list of other integrations, filtering out
-// Codex if it's not installed (since it requires npm install).
-func getOtherIntegrations() []menuItem {
-	items := []menuItem{
-		{
-			title:       "Launch Droid",
-			description: "Open Droid integration",
-			integration: "droid",
-		},
-		{
-			title:       "Launch Open Code",
-			description: "Open Open Code integration",
-			integration: "opencode",
-		},
-	}
-
-	// Only show Codex if it's already installed
-	if config.IsIntegrationInstalled("codex") {
-		items = append([]menuItem{{
-			title:       "Launch Codex",
-			description: "Open Codex CLI",
-			integration: "codex",
-		}}, items...)
-	}
-
-	return items
-}
-
-type model struct {
-	items           []menuItem
-	cursor          int
-	quitting        bool
-	selected        bool            // true if user made a selection (enter/space)
-	changeModel     bool            // true if user pressed 'm' to change model
-	showOthers      bool            // true if "Others..." is expanded
-	availableModels map[string]bool // cache of available model names
-	blinking        bool            // true when showing blink logo
-	err             error
-}
-
-// modelExists checks if a model exists in the cached available models.
-func (m *model) modelExists(name string) bool {
-	if m.availableModels == nil || name == "" {
-		return false
-	}
-	if m.availableModels[name] {
-		return true
-	}
-	// Check for prefix match (e.g., "llama2" matches "llama2:latest")
-	for modelName := range m.availableModels {
-		if strings.HasPrefix(modelName, name+":") {
-			return true
-		}
-	}
-	return false
-}
-
-// loadAvailableModels fetches and caches the list of available models.
-func (m *model) loadAvailableModels() {
-	m.availableModels = make(map[string]bool)
-	client, err := api.ClientFromEnvironment()
-	if err != nil {
-		return
-	}
-	models, err := client.List(context.Background())
-	if err != nil {
-		return
-	}
-	for _, mdl := range models.Models {
-		m.availableModels[mdl.Name] = true
-	}
-}
-
-func (m *model) buildItems() {
-	others := getOtherIntegrations()
-	m.items = make([]menuItem, 0, len(mainMenuItems)+1+len(others))
-	m.items = append(m.items, mainMenuItems...)
-
-	if m.showOthers {
-		// Change "Others..." to "Hide others..."
-		hideItem := menuItem{
-			title:       "Hide others...",
-			description: "Hide additional integrations",
-			isOthers:    true,
-		}
-		m.items = append(m.items, hideItem)
-		m.items = append(m.items, others...)
-	} else {
-		m.items = append(m.items, othersMenuItem)
-	}
-}
-
-// isOthersIntegration returns true if the integration is in the "Others" menu
-func isOthersIntegration(name string) bool {
-	switch name {
-	case "codex", "droid", "opencode":
-		return true
-	}
-	return false
-}
-
-func initialModel() model {
-	m := model{
-		cursor: 0,
-	}
-	m.loadAvailableModels()
-
-	// Check last selection to determine if we need to expand "Others"
-	lastSelection := config.LastSelection()
-	if isOthersIntegration(lastSelection) {
-		m.showOthers = true
-	}
-
-	m.buildItems()
-
-	// Position cursor on last selection
-	if lastSelection != "" {
-		for i, item := range m.items {
-			if lastSelection == "run" && item.isRunModel {
-				m.cursor = i
-				break
-			} else if item.integration == lastSelection {
-				m.cursor = i
-				break
-			}
-		}
-	}
-
-	return m
-}
-
-func (m model) Init() tea.Cmd {
-	return tea.Tick(blinkInterval, func(t time.Time) tea.Msg {
-		return blinkMsg{}
-	})
-}
-
-func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
-	switch msg := msg.(type) {
-	case blinkMsg:
-		m.blinking = true
-		return m, tea.Tick(blinkDuration, func(t time.Time) tea.Msg {
-			return unblinkMsg{}
-		})
-
-	case unblinkMsg:
-		m.blinking = false
-		return m, tea.Tick(blinkInterval, func(t time.Time) tea.Msg {
-			return blinkMsg{}
-		})
-
-	case tea.KeyMsg:
-		switch msg.String() {
-		case "ctrl+c", "q", "esc":
-			m.quitting = true
-			return m, tea.Quit
-
-		case "up", "k":
-			if m.cursor > 0 {
-				m.cursor--
-			}
-
-		case "down", "j":
-			if m.cursor < len(m.items)-1 {
-				m.cursor++
-			}
-
-		case "enter", " ":
-			item := m.items[m.cursor]
-
-			// Handle "Others..." toggle
-			if item.isOthers {
-				m.showOthers = !m.showOthers
-				m.buildItems()
-				// Keep cursor on the Others/Hide item
-				if m.cursor >= len(m.items) {
-					m.cursor = len(m.items) - 1
-				}
-				return m, nil
-			}
-
-			// Don't allow selecting uninstalled integrations
-			if item.integration != "" && !config.IsIntegrationInstalled(item.integration) {
-				return m, nil
-			}
-
-			m.selected = true
-			m.quitting = true
-			return m, tea.Quit
-
-		case "m":
-			// Allow model change for integrations and run model
-			item := m.items[m.cursor]
-			if item.integration != "" || item.isRunModel {
-				// Don't allow for uninstalled integrations
-				if item.integration != "" && !config.IsIntegrationInstalled(item.integration) {
-					return m, nil
-				}
-				m.changeModel = true
-				m.quitting = true
-				return m, tea.Quit
-			}
-		}
-	}
-
-	return m, nil
-}
-
-func (m model) View() string {
-	if m.quitting {
-		return ""
-	}
-
-	logo := logoNormal
-	if m.blinking {
-		logo = logoBlink
-	}
-
-	versionText := "\n\n  Ollama " + versionStyle.Render("v"+version.Version)
-
-	logoRendered := logoStyle.Render(logo)
-	logoBlock := lipgloss.NewStyle().Padding(0, 1).MarginLeft(2).Background(lipgloss.Color("0")).Render(logoRendered)
-	versionBlock := titleStyle.Render(versionText)
-	header := lipgloss.JoinHorizontal(lipgloss.Top, logoBlock, versionBlock)
-
-	s := header + "\n\n"
-
-	for i, item := range m.items {
-		cursor := "  "
-		style := itemStyle
-		isInstalled := true
-
-		if item.integration != "" {
-			isInstalled = config.IsIntegrationInstalled(item.integration)
-		}
-
-		if m.cursor == i {
-			cursor = "▸ "
-			if isInstalled {
-				style = selectedStyle
-			} else {
-				style = greyedSelectedStyle
-			}
-		} else if !isInstalled && item.integration != "" {
-			style = greyedStyle
-		}
-
-		title := item.title
-		if item.integration != "" {
-			if !isInstalled {
-				title += " " + notInstalledStyle.Render("(not installed)")
-			} else if mdl := config.IntegrationModel(item.integration); mdl != "" && m.modelExists(mdl) {
-				title += " " + modelStyle.Render("("+mdl+")")
-			}
-		} else if item.isRunModel {
-			if mdl := config.LastModel(); mdl != "" && m.modelExists(mdl) {
-				title += " " + modelStyle.Render("("+mdl+")")
-			}
-		}
-
-		s += style.Render(cursor+title) + "\n"
-		s += descStyle.Render(item.description) + "\n\n"
-	}
-
-	s += "\n" + lipgloss.NewStyle().Foreground(lipgloss.Color("241")).Render("↑/↓ navigate • enter select • m change model • esc quit")
-
-	return s
-}
-
-// Selection represents what the user selected
-type Selection int
-
-const (
-	SelectionNone Selection = iota
-	SelectionRunModel
-	SelectionChangeRunModel
-	SelectionIntegration       // Generic integration selection
-	SelectionChangeIntegration // Generic change model for integration
-)
-
-// Result contains the selection and any associated data
-type Result struct {
-	Selection   Selection
-	Integration string // integration name if applicable
-}
-
-// Run starts the TUI and returns the user's selection
-func Run() (Result, error) {
-	m := initialModel()
-	p := tea.NewProgram(m)
-
-	finalModel, err := p.Run()
-	if err != nil {
-		return Result{Selection: SelectionNone}, fmt.Errorf("error running TUI: %w", err)
-	}
-
-	fm := finalModel.(model)
-	if fm.err != nil {
-		return Result{Selection: SelectionNone}, fm.err
-	}
-
-	// User quit without selecting
-	if !fm.selected && !fm.changeModel {
-		return Result{Selection: SelectionNone}, nil
-	}
-
-	item := fm.items[fm.cursor]
-
-	// Handle model change request
-	if fm.changeModel {
-		if item.isRunModel {
-			return Result{Selection: SelectionChangeRunModel}, nil
-		}
-		return Result{
-			Selection:   SelectionChangeIntegration,
-			Integration: item.integration,
-		}, nil
-	}
-
-	// Handle selection
-	if item.isRunModel {
-		return Result{Selection: SelectionRunModel}, nil
-	}
-
-	return Result{
-		Selection:   SelectionIntegration,
-		Integration: item.integration,
-	}, nil
-}
--- a/convert/convert.go
+++ b/convert/convert.go
@@ -313,8 +313,6 @@ func LoadModelMetadata(fsys fs.FS) (ModelKV, *Tokenizer, error) {
 		conv = &deepseek2Model{}
 	case "Glm4MoeLiteForCausalLM":
 		conv = &glm4MoeLiteModel{}
-	case "GlmOcrForConditionalGeneration":
-		conv = &glmOcrModel{}
 	case "Lfm2ForCausalLM":
 		conv = &lfm2Model{}
 	default:
--- a/convert/convert_glmocr.go
+++ b/convert/convert_glmocr.go
@@ -1,455 +0,0 @@
-package convert
-
-import (
-	"cmp"
-	"encoding/json"
-	"io/fs"
-	"log/slog"
-	"regexp"
-	"strconv"
-	"strings"
-
-	"github.com/ollama/ollama/fs/ggml"
-	"github.com/pdevine/tensor"
-	"github.com/pdevine/tensor/native"
-)
-
-// normalToNeoXRepacker creates a repacker that permutes Q/K weights from interleaved (LLaMA)
-// to NeoX ordering for compatibility with GGML's M-RoPE kernel.
-//
-// For weights: reshape [out, in] -> [n_heads, head_dim, in], permute rotary dims, reshape back
-// For biases: reshape [out] -> [n_heads, head_dim], permute rotary dims, reshape back
-func normalToNeoXRepacker(nHeads, headDim int, partialRotaryFactor float32) func(string, []float32, []uint64) ([]float32, error) {
-	return func(_ string, data []float32, shape []uint64) ([]float32, error) {
-		rotaryDim := int(float32(headDim) * partialRotaryFactor)
-		if rotaryDim%2 != 0 {
-			rotaryDim = (rotaryDim / 2) * 2 // Round down to even
-		}
-
-		// Handle 1D (bias) or 2D (weight) tensors
-		is1D := len(shape) == 1
-		var inFeatures int
-		if is1D {
-			inFeatures = 1
-		} else {
-			inFeatures = int(shape[1])
-		}
-		outFeatures := int(shape[0])
-		nEffectiveHeads := outFeatures / headDim
-
-		if nEffectiveHeads != nHeads {
-			slog.Warn("normalToNeoX: unexpected head count", "effective", nEffectiveHeads, "expected", nHeads)
-		}
-
-		// Reshape to [n_heads, head_dim, in_features]
-		reshaped := make([]float32, len(data))
-		copy(reshaped, data)
-
-		// Permute the rotary dimensions: even indices first, then odd
-		// For each head, reorder [0,1,2,3,4,5...] to [0,2,4...,1,3,5...]
-		result := make([]float32, len(data))
-		halfRotary := rotaryDim / 2
-
-		for h := range nEffectiveHeads {
-			for f := range inFeatures {
-				for i := range halfRotary {
-					// Even dim (0, 2, 4, ...) -> position i
-					srcIdx := h*headDim*inFeatures + (2*i)*inFeatures + f
-					dstIdx := h*headDim*inFeatures + i*inFeatures + f
-					result[dstIdx] = reshaped[srcIdx]
-
-					// Odd dim (1, 3, 5, ...) -> position halfRotary + i
-					srcIdx = h*headDim*inFeatures + (2*i+1)*inFeatures + f
-					dstIdx = h*headDim*inFeatures + (halfRotary+i)*inFeatures + f
-					result[dstIdx] = reshaped[srcIdx]
-				}
-
-				// Non-rotary part: copy as-is
-				for i := rotaryDim; i < headDim; i++ {
-					srcIdx := h*headDim*inFeatures + i*inFeatures + f
-					result[srcIdx] = reshaped[srcIdx]
-				}
-			}
-		}
-
-		return result, nil
-	}
-}
-
-type glmOcrModel struct {
-	ModelParameters
-
-	TextConfig struct {
-		HiddenSize          uint32  `json:"hidden_size"`
-		IntermediateSize    uint32  `json:"intermediate_size"`
-		NumHiddenLayers     uint32  `json:"num_hidden_layers"`
-		NumAttentionHeads   uint32  `json:"num_attention_heads"`
-		NumKeyValueHeads    uint32  `json:"num_key_value_heads"`
-		HeadDim             uint32  `json:"head_dim"`
-		MaxPositionEmbed    uint32  `json:"max_position_embeddings"`
-		RMSNormEps          float32 `json:"rms_norm_eps"`
-		PartialRotaryFactor float32 `json:"partial_rotary_factor"`
-		RopeParameters      struct {
-			RopeType            string  `json:"rope_type"`
-			MRopeSection        []int32 `json:"mrope_section"`
-			RopeTheta           float32 `json:"rope_theta"`
-			PartialRotaryFactor float32 `json:"partial_rotary_factor"`
-		} `json:"rope_parameters"`
-	} `json:"text_config"`
-
-	VisionConfig struct {
-		HiddenSize        uint32  `json:"hidden_size"`
-		IntermediateSize  uint32  `json:"intermediate_size"`
-		Depth             uint32  `json:"depth"`
-		NumHeads          uint32  `json:"num_heads"`
-		ImageSize         uint32  `json:"image_size"`
-		PatchSize         uint32  `json:"patch_size"`
-		OutHiddenSize     uint32  `json:"out_hidden_size"`
-		RMSNormEps        float32 `json:"rms_norm_eps"`
-		SpatialMergeSize  uint32  `json:"spatial_merge_size"`
-		TemporalPatchSize uint32  `json:"temporal_patch_size"`
-	} `json:"vision_config"`
-
-	ImageStartTokenID uint32 `json:"image_start_token_id"`
-	ImageEndTokenID   uint32 `json:"image_end_token_id"`
-	VideoStartTokenID uint32 `json:"video_start_token_id"`
-	VideoEndTokenID   uint32 `json:"video_end_token_id"`
-	ImageTokenID      uint32 `json:"image_token_id"`
-	VideoTokenID      uint32 `json:"video_token_id"`
-
-	// Preprocessor config (preprocessor_config.json)
-	Preprocessor struct {
-		Size struct {
-			ShortestEdge uint32 `json:"shortest_edge"`
-			LongestEdge  uint32 `json:"longest_edge"`
-		} `json:"size"`
-		PatchSize         uint32    `json:"patch_size"`
-		TemporalPatchSize uint32    `json:"temporal_patch_size"`
-		MergeSize         uint32    `json:"merge_size"`
-		ImageMean         []float32 `json:"image_mean"`
-		ImageStd          []float32 `json:"image_std"`
-	} `json:"-"`
-}
-
-var _ ModelConverter = (*glmOcrModel)(nil)
-
-func (m *glmOcrModel) parseMore(fsys fs.FS) error {
-	bts, err := fs.ReadFile(fsys, "preprocessor_config.json")
-	if err != nil {
-		return err
-	}
-
-	return json.Unmarshal(bts, &m.Preprocessor)
-}
-
-func (m *glmOcrModel) KV(t *Tokenizer) KV {
-	kv := m.ModelParameters.KV(t)
-	kv["general.architecture"] = "glmocr"
-
-	// Text model parameters
-	kv["glmocr.block_count"] = cmp.Or(m.TextConfig.NumHiddenLayers, 16)
-	kv["glmocr.embedding_length"] = cmp.Or(m.TextConfig.HiddenSize, 1536)
-	kv["glmocr.attention.head_count"] = cmp.Or(m.TextConfig.NumAttentionHeads, 16)
-	kv["glmocr.attention.head_count_kv"] = cmp.Or(m.TextConfig.NumKeyValueHeads, 8)
-	headDim := cmp.Or(m.TextConfig.HeadDim, m.TextConfig.HiddenSize/m.TextConfig.NumAttentionHeads)
-	kv["glmocr.attention.key_length"] = headDim
-	kv["glmocr.attention.value_length"] = headDim
-	kv["glmocr.feed_forward_length"] = cmp.Or(m.TextConfig.IntermediateSize, 4608)
-	kv["glmocr.attention.layer_norm_rms_epsilon"] = cmp.Or(m.TextConfig.RMSNormEps, 1e-5)
-	kv["glmocr.context_length"] = cmp.Or(m.TextConfig.MaxPositionEmbed, 131072)
-	kv["glmocr.rope.freq_base"] = cmp.Or(m.TextConfig.RopeParameters.RopeTheta, float32(10000))
-	kv["glmocr.rope.partial_rotary_factor"] = cmp.Or(m.TextConfig.RopeParameters.PartialRotaryFactor, m.TextConfig.PartialRotaryFactor, float32(1.0))
-	if len(m.TextConfig.RopeParameters.MRopeSection) > 0 {
-		kv["glmocr.rope.mrope_section"] = m.TextConfig.RopeParameters.MRopeSection
-	}
-
-	// Vision model parameters
-	kv["glmocr.vision.block_count"] = cmp.Or(m.VisionConfig.Depth, 24)
-	kv["glmocr.vision.embedding_length"] = cmp.Or(m.VisionConfig.HiddenSize, 1024)
-	kv["glmocr.vision.attention.head_count"] = cmp.Or(m.VisionConfig.NumHeads, 16)
-	kv["glmocr.vision.image_size"] = cmp.Or(m.VisionConfig.ImageSize, 336)
-	kv["glmocr.vision.patch_size"] = cmp.Or(m.VisionConfig.PatchSize, m.Preprocessor.PatchSize, 14)
-	kv["glmocr.vision.spatial_merge_size"] = cmp.Or(m.VisionConfig.SpatialMergeSize, m.Preprocessor.MergeSize, 2)
-	kv["glmocr.vision.temporal_patch_size"] = cmp.Or(m.VisionConfig.TemporalPatchSize, m.Preprocessor.TemporalPatchSize, 2)
-	kv["glmocr.vision.out_hidden_size"] = cmp.Or(m.VisionConfig.OutHiddenSize, 1536)
-	kv["glmocr.vision.intermediate_size"] = cmp.Or(m.VisionConfig.IntermediateSize, 4096)
-	kv["glmocr.vision.attention.layer_norm_rms_epsilon"] = cmp.Or(m.VisionConfig.RMSNormEps, 1e-5)
-
-	// Preprocessor-derived image settings (min/max pixels and normalization)
-	// Note: fs.Config.keyValue() auto-prepends architecture prefix, so use full key
-	if m.Preprocessor.Size.ShortestEdge > 0 {
-		kv["glmocr.vision.min_pixels"] = m.Preprocessor.Size.ShortestEdge
-	}
-	if m.Preprocessor.Size.LongestEdge > 0 {
-		kv["glmocr.vision.max_pixels"] = m.Preprocessor.Size.LongestEdge
-	}
-	if len(m.Preprocessor.ImageMean) == 3 {
-		kv["glmocr.vision.image_mean"] = m.Preprocessor.ImageMean
-	}
-	if len(m.Preprocessor.ImageStd) == 3 {
-		kv["glmocr.vision.image_std"] = m.Preprocessor.ImageStd
-	}
-
-	// Special tokens
-	kv["glmocr.image_token_id"] = m.ImageTokenID
-	kv["glmocr.image_start_token_id"] = m.ImageStartTokenID
-	kv["glmocr.image_end_token_id"] = m.ImageEndTokenID
-	kv["glmocr.video_token_id"] = m.VideoTokenID
-	kv["glmocr.video_start_token_id"] = m.VideoStartTokenID
-	kv["glmocr.video_end_token_id"] = m.VideoEndTokenID
-
-	return kv
-}
-
-func (m *glmOcrModel) Tensors(ts []Tensor) []*ggml.Tensor {
-	var out []*ggml.Tensor
-
-	// Skip layers >= num_hidden_layers (Multi-Token Prediction layers not needed for basic inference)
-	numLayers := int(cmp.Or(m.TextConfig.NumHiddenLayers, 16))
-	skipLayer := func(name string) bool {
-		// Tensor names are already replaced to "blk.N.xxx" format
-		re := regexp.MustCompile(`^blk\.(\d+)`)
-		matches := re.FindStringSubmatch(name)
-		if matches == nil {
-			return false
-		}
-		blkNum, err := strconv.Atoi(matches[1])
-		if err != nil {
-			return false
-		}
-		return blkNum >= numLayers
-	}
-
-	for _, t := range ts {
-		name := t.Name()
-
-		// Skip next-n prediction layers (layers >= num_hidden_layers)
-		if skipLayer(name) {
-			continue
-		}
-
-		// Split ffn_gate_up into separate gate and up projections
-		if strings.Contains(name, "ffn_gate_up") {
-			for t := range splitDim(t, 0,
-				split{Replacer: strings.NewReplacer("ffn_gate_up", "ffn_gate")},
-				split{Replacer: strings.NewReplacer("ffn_gate_up", "ffn_up")},
-			) {
-				out = append(out, t)
-			}
-			continue
-		}
-
-		if strings.HasSuffix(name, "patch_embd.weight") {
-			shape := t.Shape()
-			if len(shape) == 5 && shape[2] == 2 {
-				newShape := []uint64{shape[0], shape[1], shape[3], shape[4]}
-
-				t0 := t.Clone()
-				t0.SetRepacker(func(_ string, data []float32, shape []uint64) ([]float32, error) {
-					dims := make([]int, len(shape))
-					for i := range shape {
-						dims[i] = int(shape[i])
-					}
-					var tt tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
-					tt, err := tt.Slice(nil, nil, tensor.S(0, 1), nil, nil)
-					if err != nil {
-						return nil, err
-					}
-					tt = tensor.Materialize(tt)
-					newDims := []int{int(shape[0]), int(shape[1]), int(shape[3]), int(shape[4])}
-					if err := tt.Reshape(newDims...); err != nil {
-						return nil, err
-					}
-					if err := tt.Reshape(tt.Shape().TotalSize()); err != nil {
-						return nil, err
-					}
-					return native.VectorF32(tt.(*tensor.Dense))
-				})
-				out = append(out, &ggml.Tensor{
-					Name:     strings.Replace(name, "patch_embd.weight", "patch_embd_0.weight", 1),
-					Kind:     t.Kind(),
-					Shape:    newShape,
-					WriterTo: t0,
-				})
-
-				t1 := t.Clone()
-				t1.SetRepacker(func(_ string, data []float32, shape []uint64) ([]float32, error) {
-					dims := make([]int, len(shape))
-					for i := range shape {
-						dims[i] = int(shape[i])
-					}
-					var tt tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
-					tt, err := tt.Slice(nil, nil, tensor.S(1, 2), nil, nil)
-					if err != nil {
-						return nil, err
-					}
-					tt = tensor.Materialize(tt)
-					newDims := []int{int(shape[0]), int(shape[1]), int(shape[3]), int(shape[4])}
-					if err := tt.Reshape(newDims...); err != nil {
-						return nil, err
-					}
-					if err := tt.Reshape(tt.Shape().TotalSize()); err != nil {
-						return nil, err
-					}
-					return native.VectorF32(tt.(*tensor.Dense))
-				})
-				out = append(out, &ggml.Tensor{
-					Name:     strings.Replace(name, "patch_embd.weight", "patch_embd_1.weight", 1),
-					Kind:     t.Kind(),
-					Shape:    newShape,
-					WriterTo: t1,
-				})
-
-				continue
-			}
-
-			if len(shape) == 4 {
-				out = append(out, &ggml.Tensor{
-					Name:     strings.Replace(name, "patch_embd.weight", "patch_embd_0.weight", 1),
-					Kind:     t.Kind(),
-					Shape:    t.Shape(),
-					WriterTo: t,
-				})
-				continue
-			}
-
-			slog.Warn("glmocr: patch_embed weight has unexpected shape - not splitting", "shape", shape)
-			// Fall through to default handling
-		}
-
-		// Handle pre-split patch embedding weights
-		// Pattern 1: v.patch_embd.0.weight, v.patch_embd.1.weight -> patch_embd_0.weight, patch_embd_1.weight
-		// Pattern 2: v.patch_embd.weight.0, v.patch_embd.weight.1 -> patch_embd_0.weight, patch_embd_1.weight
-		if strings.Contains(name, "patch_embd.0.") {
-			out = append(out, &ggml.Tensor{
-				Name:     strings.Replace(name, "patch_embd.0.", "patch_embd_0.", 1),
-				Kind:     t.Kind(),
-				Shape:    t.Shape(),
-				WriterTo: t,
-			})
-			continue
-		}
-		if strings.Contains(name, "patch_embd.1.") {
-			out = append(out, &ggml.Tensor{
-				Name:     strings.Replace(name, "patch_embd.1.", "patch_embd_1.", 1),
-				Kind:     t.Kind(),
-				Shape:    t.Shape(),
-				WriterTo: t,
-			})
-			continue
-		}
-		// Handle .weight.0 and .weight.1 suffix patterns
-		if strings.HasSuffix(name, "patch_embd.weight.0") {
-			out = append(out, &ggml.Tensor{
-				Name:     strings.Replace(name, "patch_embd.weight.0", "patch_embd_0.weight", 1),
-				Kind:     t.Kind(),
-				Shape:    t.Shape(),
-				WriterTo: t,
-			})
-			continue
-		}
-		if strings.HasSuffix(name, "patch_embd.weight.1") {
-			out = append(out, &ggml.Tensor{
-				Name:     strings.Replace(name, "patch_embd.weight.1", "patch_embd_1.weight", 1),
-				Kind:     t.Kind(),
-				Shape:    t.Shape(),
-				WriterTo: t,
-			})
-			continue
-		}
-
-		// Permute Q/K weights for M-RoPE compatibility (interleaved -> NeoX ordering)
-		// GGML's M-RoPE kernel uses NeoX-style rotation, but GLM-OCR uses interleaved (LLaMA-style)
-		// We permute at conversion time so the weights work correctly with GGML's kernel
-		// This aligns Q/K rotary dimensions with GGML's NeoX-style rotation
-		if len(m.TextConfig.RopeParameters.MRopeSection) > 0 &&
-			strings.Contains(name, "blk.") && (strings.Contains(name, "attn_q.") || strings.Contains(name, "attn_k.")) {
-			// Get config values for permutation
-			nHeads := int(cmp.Or(m.TextConfig.NumAttentionHeads, 16))
-			nKVHeads := int(cmp.Or(m.TextConfig.NumKeyValueHeads, 8))
-			hiddenSize := int(cmp.Or(m.TextConfig.HiddenSize, 1536))
-			headDim := int(cmp.Or(m.TextConfig.HeadDim, uint32(hiddenSize/nHeads)))
-			partialRotaryFactor := cmp.Or(m.TextConfig.PartialRotaryFactor, m.TextConfig.RopeParameters.PartialRotaryFactor, float32(1.0))
-
-			// Use appropriate head count: nHeads for Q, nKVHeads for K
-			effectiveHeads := nHeads
-			if strings.Contains(name, "attn_k.") {
-				effectiveHeads = nKVHeads
-			}
-
-			permutedT := t.Clone()
-			permutedT.SetRepacker(normalToNeoXRepacker(effectiveHeads, headDim, partialRotaryFactor))
-			out = append(out, &ggml.Tensor{
-				Name:     name,
-				Kind:     t.Kind(),
-				Shape:    t.Shape(),
-				WriterTo: permutedT,
-			})
-			continue
-		}
-
-		out = append(out, &ggml.Tensor{
-			Name:     name,
-			Kind:     t.Kind(),
-			Shape:    t.Shape(),
-			WriterTo: t,
-		})
-	}
-
-	return out
-}
-
-func (m *glmOcrModel) Replacements() []string {
-	return []string{
-		// Vision encoder
-		"model.visual.patch_embed.proj_1", "v.patch_embd_1", // Second temporal split
-		"model.visual.patch_embed.proj", "v.patch_embd",
-		"model.visual.blocks", "v.blk",
-		"model.visual.post_layernorm", "v.post_ln",
-		"model.visual.downsample", "mm.patch_merger",
-
-		// Vision attention
-		"attn.qkv", "attn_qkv",
-		"attn.proj", "attn_out",
-		"attn.q_norm", "attn_q_norm",
-		"attn.k_norm", "attn_k_norm",
-
-		// Vision norms
-		"norm1", "ln1",
-		"norm2", "ln2",
-
-		// Vision MLP
-		"mlp.gate_proj", "ffn_gate",
-		"mlp.up_proj", "ffn_up",
-		"mlp.down_proj", "ffn_down",
-
-		// Merger (multimodal projector)
-		"model.visual.merger.proj", "mm.model.fc",
-		"model.visual.merger.post_projection_norm", "mm.post_norm",
-		"model.visual.merger.gate_proj", "mm.gate",
-		"model.visual.merger.up_proj", "mm.up",
-		"model.visual.merger.down_proj", "mm.down",
-
-		// Language model
-		"model.language_model.embed_tokens", "token_embd",
-		"model.language_model.layers", "blk",
-		"model.language_model.norm", "output_norm",
-		"lm_head", "output",
-
-		// Language model attention
-		"self_attn.q_proj", "attn_q",
-		"self_attn.k_proj", "attn_k",
-		"self_attn.v_proj", "attn_v",
-		"self_attn.o_proj", "attn_out",
-
-		// Language model norms
-		"input_layernorm", "attn_norm",
-		"post_attention_layernorm", "ffn_norm",
-		"post_self_attn_layernorm", "post_attn_norm",
-		"post_mlp_layernorm", "post_ffn_norm",
-
-		// Language model MLP (remove mlp. prefix so ffn_* names work)
-		"mlp.gate_up_proj", "ffn_gate_up",
-		"mlp.down_proj", "ffn_down",
-	}
-}
--- a/convert/reader_safetensors.go
+++ b/convert/reader_safetensors.go
@@ -99,7 +99,6 @@ func (st safetensor) Kind() uint32 {
 	if st.dtype == "BF16" &&
 		!strings.HasPrefix(st.name, "v.") &&
 		!strings.HasPrefix(st.name, "s.") &&
-		!strings.HasPrefix(st.name, "mm.") &&
 		kind != tensorKindFP32 {
 		kind = tensorKindBF16
 	}
--- a/docs/api.md
+++ b/docs/api.md
@@ -15,6 +15,7 @@
 - [Push a Model](#push-a-model)
 - [Generate Embeddings](#generate-embeddings)
 - [List Running Models](#list-running-models)
+- [Usage](#usage)
 - [Version](#version)
 - [Experimental: Image Generation](#image-generation-experimental)

@@ -1854,6 +1855,53 @@ curl http://localhost:11434/api/embeddings -d '{
 }
 ```

+## Usage
+
+```
+GET /api/usage
+```
+
+Show aggregate usage statistics per model since the server started. All timestamps are UTC in RFC 3339 format.
+
+### Examples
+
+#### Request
+
+```shell
+curl http://localhost:11434/api/usage
+```
+
+#### Response
+
+```json
+{
+  "start": "2025-01-27T20:00:00Z",
+  "usage": [
+    {
+      "model": "llama3.2",
+      "requests": 5,
+      "prompt_tokens": 130,
+      "completion_tokens": 890
+    },
+    {
+      "model": "deepseek-r1",
+      "requests": 2,
+      "prompt_tokens": 48,
+      "completion_tokens": 312
+    }
+  ]
+}
+```
+
+#### Response fields
+
+- `start`: when the server started tracking usage (UTC, RFC 3339)
+- `usage`: list of per-model usage statistics
+  - `model`: model name
+  - `requests`: total number of completed requests
+  - `prompt_tokens`: total prompt tokens evaluated
+  - `completion_tokens`: total completion tokens generated
+
 ## Version

 ```
--- a/docs/docs.json
+++ b/docs/docs.json
@@ -71,10 +71,6 @@
    {
      "source": "/api",
      "destination": "/api/introduction"
-    },
-    {
-      "source": "/integrations/clawdbot",
-      "destination": "/integrations/openclaw"
    }
  ],
  "navigation": {
@@ -106,8 +102,8 @@
            "group": "Integrations",
            "pages": [
              "/integrations/claude-code",
+              "/integrations/clawdbot",
              "/integrations/cline",
-              "/integrations/openclaw",
              "/integrations/codex",
              "/integrations/droid",
              "/integrations/goose",
--- a/docs/gpu.mdx
+++ b/docs/gpu.mdx
@@ -10,7 +10,6 @@ Check your compute compatibility to see if your card is supported:

 | Compute Capability | Family              | Cards                                                                                                                          |
 | ------------------ | ------------------- | ------------------------------------------------------------------------------------------------------------------------------ |
-| 12.1               | NVIDIA              | `GB10 (DGX Spark)`                                                                                                             |
 | 12.0               | GeForce RTX 50xx    | `RTX 5060` `RTX 5060 Ti` `RTX 5070` `RTX 5070 Ti` `RTX 5080` `RTX 5090`                                                        |
 |                    | NVIDIA Professional | `RTX PRO 4000 Blackwell` `RTX PRO 4500 Blackwell` `RTX PRO 5000 Blackwell` `RTX PRO 6000 Blackwell`                            |
 | 9.0                | NVIDIA              | `H200` `H100`                                                                                                                  |
@@ -164,4 +163,4 @@ To select specific Vulkan GPU(s), you can set the environment variable
 `GGML_VK_VISIBLE_DEVICES` to one or more numeric IDs on the Ollama server as
 described in the [FAQ](faq#how-do-i-configure-ollama-server). If you
 encounter any problems with Vulkan based GPUs, you can disable all Vulkan GPUs
-by setting `GGML_VK_VISIBLE_DEVICES=-1` 
+by setting `GGML_VK_VISIBLE_DEVICES=-1` 
--- a/docs/import.mdx
+++ b/docs/import.mdx
@@ -134,12 +134,22 @@ success

 ### Supported Quantizations

+- `q4_0`
+- `q4_1`
+- `q5_0`
+- `q5_1`
 - `q8_0`

 #### K-means Quantizations

+- `q3_K_S`
+- `q3_K_M`
+- `q3_K_L`
 - `q4_K_S`
 - `q4_K_M`
+- `q5_K_S`
+- `q5_K_M`
+- `q6_K`

 ## Sharing your model on ollama.com

--- a/docs/integrations/clawdbot.mdx
+++ b/docs/integrations/clawdbot.mdx
@@ -1,43 +1,41 @@
 ---
-title: OpenClaw
+title: Clawdbot
 ---

-OpenClaw is a personal AI assistant that runs on your own devices. It bridges messaging services (WhatsApp, Telegram, Slack, Discord, iMessage, and more) to AI coding agents through a centralized gateway.
+Clawdbot is a personal AI assistant that runs on your own devices. It bridges messaging services (WhatsApp, Telegram, Slack, Discord, iMessage, and more) to AI coding agents through a centralized gateway.

 ## Install

-Install [OpenClaw](https://openclaw.ai/) 
+Install [Clawdbot](https://clawd.bot/) 

 ```bash
-npm install -g openclaw@latest
+npm install -g clawdbot@latest
 ```

 Then run the onboarding wizard:

 ```bash
-openclaw onboard --install-daemon
+clawdbot onboard --install-daemon
 ```

-<Note>OpenClaw requires a larger context window. It is recommended to use a context window of at least 64k tokens. See [Context length](/context-length) for more information.</Note>
+<Note>Clawdbot requires a larger context window. It is recommended to use a context window of at least 64k tokens. See [Context length](/context-length) for more information.</Note>

 ## Usage with Ollama

 ### Quick setup

 ```bash
-ollama launch openclaw
+ollama launch clawdbot
 ```

-<Note>Previously known as Clawdbot. `ollama launch clawdbot` still works as an alias.</Note>
-
-This configures OpenClaw to use Ollama and starts the gateway.
+This configures Clawdbot to use Ollama and starts the gateway.
 If the gateway is already running, no changes need to be made as the gateway will auto-reload the changes. 


 To configure without launching:

 ```shell
-ollama launch openclaw --config
+ollama launch clawdbot --config
 ```

 ## Recommended Models
--- a/docs/integrations/opencode.mdx
+++ b/docs/integrations/opencode.mdx
@@ -9,7 +9,7 @@ OpenCode is an open-source AI coding assistant that runs in your terminal.
 Install the [OpenCode CLI](https://opencode.ai):

 ```bash
-curl -fsSL https://opencode.ai/install | bash
+curl -fsSL https://opencode.ai/install.sh | bash
 ```

 <Note>OpenCode requires a larger context window. It is recommended to use a context window of at least 64k tokens. See [Context length](/context-length) for more information.</Note>
--- a/envconfig/config.go
+++ b/envconfig/config.go
@@ -201,7 +201,7 @@ var (
 	// Enable the new Ollama engine
 	NewEngine = Bool("OLLAMA_NEW_ENGINE")
 	// ContextLength sets the default context length
-	ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 0)
+	ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 4096)
 	// Auth enables authentication between the Ollama client and server
 	UseAuth = Bool("OLLAMA_AUTH")
 	// Enable Vulkan backend
@@ -290,7 +290,7 @@ func AsMap() map[string]EnvVar {
 		"OLLAMA_ORIGINS":           {"OLLAMA_ORIGINS", AllowedOrigins(), "A comma separated list of allowed origins"},
 		"OLLAMA_SCHED_SPREAD":      {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
 		"OLLAMA_MULTIUSER_CACHE":   {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"},
-		"OLLAMA_CONTEXT_LENGTH":    {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default: 4k/32k/256k based on VRAM)"},
+		"OLLAMA_CONTEXT_LENGTH":    {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default: 4096)"},
 		"OLLAMA_NEW_ENGINE":        {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"},
 		"OLLAMA_REMOTES":           {"OLLAMA_REMOTES", Remotes(), "Allowed hosts for remote models (default \"ollama.com\")"},

--- a/envconfig/config_test.go
+++ b/envconfig/config_test.go
@@ -282,7 +282,7 @@ func TestVar(t *testing.T) {

 func TestContextLength(t *testing.T) {
 	cases := map[string]uint{
-		"":     0,
+		"":     4096,
 		"2048": 2048,
 	}

--- a/fs/ggml/ggml.go
+++ b/fs/ggml/ggml.go
@@ -270,7 +270,6 @@ func (kv KV) OllamaEngineRequired() bool {
 		"qwen3", "qwen3moe",
 		"qwen3vl", "qwen3vlmoe",
 		"glm4moelite",
-		"glmocr",
 		"lfm2",
 	}, kv.Architecture())
 }
@@ -860,7 +859,6 @@ func (f GGML) FlashAttention() bool {
 		"bert",
 		"gemma3",
 		"glm4moelite",
-		"glmocr",
 		"gptoss", "gpt-oss",
 		"lfm2",
 		"mistral3",
--- a/go.mod
+++ b/go.mod
@@ -21,12 +21,10 @@ require (

 require (
 	github.com/agnivade/levenshtein v1.1.1
-	github.com/charmbracelet/bubbletea v1.3.10
-	github.com/charmbracelet/lipgloss v1.1.0
 	github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1
 	github.com/dlclark/regexp2 v1.11.4
 	github.com/emirpasic/gods/v2 v2.0.0-alpha
-	github.com/mattn/go-runewidth v0.0.16
+	github.com/mattn/go-runewidth v0.0.14
 	github.com/nlpodyssey/gopickle v0.3.0
 	github.com/pdevine/tensor v0.0.0-20240510204454-f88f4562727c
 	github.com/tkrajina/typescriptify-golang-structs v0.2.0
@@ -39,34 +37,22 @@ require (

 require (
 	github.com/apache/arrow/go/arrow v0.0.0-20211112161151-bc219186db40 // indirect
-	github.com/aymanbagabas/go-osc52/v2 v2.0.1 // indirect
 	github.com/bahlo/generic-list-go v0.2.0 // indirect
 	github.com/buger/jsonparser v1.1.1 // indirect
 	github.com/bytedance/sonic/loader v0.1.1 // indirect
-	github.com/charmbracelet/colorprofile v0.2.3-0.20250311203215-f60798e515dc // indirect
-	github.com/charmbracelet/x/ansi v0.10.1 // indirect
-	github.com/charmbracelet/x/cellbuf v0.0.13-0.20250311204145-2c3ea96c31dd // indirect
-	github.com/charmbracelet/x/term v0.2.1 // indirect
 	github.com/chewxy/hm v1.0.0 // indirect
 	github.com/chewxy/math32 v1.11.0 // indirect
 	github.com/cloudwego/base64x v0.1.4 // indirect
 	github.com/cloudwego/iasm v0.2.0 // indirect
 	github.com/davecgh/go-spew v1.1.1 // indirect
-	github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f // indirect
 	github.com/gogo/protobuf v1.3.2 // indirect
 	github.com/google/flatbuffers v24.3.25+incompatible // indirect
 	github.com/kr/text v0.2.0 // indirect
-	github.com/lucasb-eyer/go-colorful v1.2.0 // indirect
 	github.com/mailru/easyjson v0.7.7 // indirect
-	github.com/mattn/go-localereader v0.0.1 // indirect
-	github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 // indirect
-	github.com/muesli/cancelreader v0.2.2 // indirect
-	github.com/muesli/termenv v0.16.0 // indirect
 	github.com/pkg/errors v0.9.1 // indirect
 	github.com/pmezard/go-difflib v1.0.0 // indirect
-	github.com/rivo/uniseg v0.4.7 // indirect
+	github.com/rivo/uniseg v0.2.0 // indirect
 	github.com/tkrajina/go-reflector v0.5.5 // indirect
-	github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e // indirect
 	github.com/xtgo/set v1.0.0 // indirect
 	go4.org/unsafe/assume-no-moving-gc v0.0.0-20231121144256-b99613f794b6 // indirect
 	golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect
--- a/go.sum
+++ b/go.sum
@@ -14,8 +14,6 @@ github.com/apache/arrow/go/arrow v0.0.0-20211112161151-bc219186db40 h1:q4dksr6IC
 github.com/apache/arrow/go/arrow v0.0.0-20211112161151-bc219186db40/go.mod h1:Q7yQnSMnLvcXlZ8RV+jwz/6y1rQTqbX6C82SndT52Zs=
 github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0 h1:jfIu9sQUG6Ig+0+Ap1h4unLjW6YQJpKZVmUzxsD4E/Q=
 github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0/go.mod h1:t2tdKJDJF9BV14lnkjHmOQgcvEKgtqs5a1N3LNdJhGE=
-github.com/aymanbagabas/go-osc52/v2 v2.0.1 h1:HwpRHbFMcZLEVr42D4p7XBqjyuxQH5SMiErDT4WkJ2k=
-github.com/aymanbagabas/go-osc52/v2 v2.0.1/go.mod h1:uYgXzlJ7ZpABp8OJ+exZzJJhRNQ2ASbcXHWsFqH8hp8=
 github.com/bahlo/generic-list-go v0.2.0 h1:5sz/EEAK+ls5wF+NeqDpk5+iNdMDXrh3z3nPnH1Wvgk=
 github.com/bahlo/generic-list-go v0.2.0/go.mod h1:2KvAjgMlE5NNynlg/5iLrrCCZ2+5xWbdbCW3pNTGyYg=
 github.com/boombuler/barcode v1.0.0/go.mod h1:paBWMcWSl3LHKBqUq+rly7CNSldXjb2rDl3JlRe0mD8=
@@ -26,18 +24,6 @@ github.com/bytedance/sonic v1.11.6/go.mod h1:LysEHSvpvDySVdC2f87zGWf6CIKJcAvqab1
 github.com/bytedance/sonic/loader v0.1.1 h1:c+e5Pt1k/cy5wMveRDyk2X4B9hF4g7an8N3zCYjJFNM=
 github.com/bytedance/sonic/loader v0.1.1/go.mod h1:ncP89zfokxS5LZrJxl5z0UJcsk4M4yY2JpfqGeCtNLU=
 github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
-github.com/charmbracelet/bubbletea v1.3.10 h1:otUDHWMMzQSB0Pkc87rm691KZ3SWa4KUlvF9nRvCICw=
-github.com/charmbracelet/bubbletea v1.3.10/go.mod h1:ORQfo0fk8U+po9VaNvnV95UPWA1BitP1E0N6xJPlHr4=
-github.com/charmbracelet/colorprofile v0.2.3-0.20250311203215-f60798e515dc h1:4pZI35227imm7yK2bGPcfpFEmuY1gc2YSTShr4iJBfs=
-github.com/charmbracelet/colorprofile v0.2.3-0.20250311203215-f60798e515dc/go.mod h1:X4/0JoqgTIPSFcRA/P6INZzIuyqdFY5rm8tb41s9okk=
-github.com/charmbracelet/lipgloss v1.1.0 h1:vYXsiLHVkK7fp74RkV7b2kq9+zDLoEU4MZoFqR/noCY=
-github.com/charmbracelet/lipgloss v1.1.0/go.mod h1:/6Q8FR2o+kj8rz4Dq0zQc3vYf7X+B0binUUBwA0aL30=
-github.com/charmbracelet/x/ansi v0.10.1 h1:rL3Koar5XvX0pHGfovN03f5cxLbCF2YvLeyz7D2jVDQ=
-github.com/charmbracelet/x/ansi v0.10.1/go.mod h1:3RQDQ6lDnROptfpWuUVIUG64bD2g2BgntdxH0Ya5TeE=
-github.com/charmbracelet/x/cellbuf v0.0.13-0.20250311204145-2c3ea96c31dd h1:vy0GVL4jeHEwG5YOXDmi86oYw2yuYUGqz6a8sLwg0X8=
-github.com/charmbracelet/x/cellbuf v0.0.13-0.20250311204145-2c3ea96c31dd/go.mod h1:xe0nKWGd3eJgtqZRaN9RjMtK7xUYchjzPr7q6kcvCCs=
-github.com/charmbracelet/x/term v0.2.1 h1:AQeHeLZ1OqSXhrAWpYUtZyX1T3zVxfpZuEQMIQaGIAQ=
-github.com/charmbracelet/x/term v0.2.1/go.mod h1:oQ4enTYFV7QN4m0i9mzHrViD7TQKvNEEkHUMCmsxdUg=
 github.com/chewxy/hm v1.0.0 h1:zy/TSv3LV2nD3dwUEQL2VhXeoXbb9QkpmdRAVUFiA6k=
 github.com/chewxy/hm v1.0.0/go.mod h1:qg9YI4q6Fkj/whwHR1D+bOGeF7SniIP40VweVepLjg0=
 github.com/chewxy/math32 v1.0.0/go.mod h1:Miac6hA1ohdDUTagnvJy/q+aNnEk16qWUdb8ZVhvCN0=
@@ -73,8 +59,6 @@ github.com/envoyproxy/go-control-plane v0.9.9-0.20201210154907-fd9021fe5dad/go.m
 github.com/envoyproxy/go-control-plane v0.9.9-0.20210217033140-668b12f5399d/go.mod h1:cXg6YxExXjJnVBQHBLXeUAgxn2UodCpnH306RInaBQk=
 github.com/envoyproxy/go-control-plane v0.9.9-0.20210512163311-63b5d3c536b0/go.mod h1:hliV/p42l8fGbc6Y9bQ70uLwIvmJyVE5k4iMKlh8wCQ=
 github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c=
-github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f h1:Y/CXytFA4m6baUTXGLOoWe4PQhGxaX0KpnayAqC48p4=
-github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f/go.mod h1:vw97MGsxSvLiUE2X8qFplwetxpGLQrlU1Q9AUEIzCaM=
 github.com/fogleman/gg v1.2.1-0.20190220221249-0403632d5b90/go.mod h1:R/bRT+9gY/C5z7JzPU0zXsXHKM4/ayA+zqcVNZzPa1k=
 github.com/fogleman/gg v1.3.0/go.mod h1:R/bRT+9gY/C5z7JzPU0zXsXHKM4/ayA+zqcVNZzPa1k=
 github.com/gabriel-vasile/mimetype v1.4.3 h1:in2uUcidCuFcDKtdcBxlR0rJ1+fsokWf+uqxgUFjbI0=
@@ -164,17 +148,13 @@ github.com/ledongthuc/pdf v0.0.0-20250511090121-5959a4027728 h1:QwWKgMY28TAXaDl+
 github.com/ledongthuc/pdf v0.0.0-20250511090121-5959a4027728/go.mod h1:1fEHWurg7pvf5SG6XNE5Q8UZmOwex51Mkx3SLhrW5B4=
 github.com/leodido/go-urn v1.4.0 h1:WT9HwE9SGECu3lg4d/dIA+jxlljEa1/ffXKmRjqdmIQ=
 github.com/leodido/go-urn v1.4.0/go.mod h1:bvxc+MVxLKB4z00jd1z+Dvzr47oO32F/QSNjSBOlFxI=
-github.com/lucasb-eyer/go-colorful v1.2.0 h1:1nnpGOrhyZZuNyfu1QjKiUICQ74+3FNCN69Aj6K7nkY=
-github.com/lucasb-eyer/go-colorful v1.2.0/go.mod h1:R4dSotOR9KMtayYi1e77YzuveK+i7ruzyGqttikkLy0=
 github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0=
 github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc=
 github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
 github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
-github.com/mattn/go-localereader v0.0.1 h1:ygSAOl7ZXTx4RdPYinUpg6W99U8jWvWi9Ye2JC/oIi4=
-github.com/mattn/go-localereader v0.0.1/go.mod h1:8fBrzywKY7BI3czFoHkuzRoWE9C+EiG4R1k4Cjx5p88=
 github.com/mattn/go-runewidth v0.0.9/go.mod h1:H031xJmbD/WCDINGzjvQ9THkh0rPKHF+m2gUSrubnMI=
-github.com/mattn/go-runewidth v0.0.16 h1:E5ScNMtiwvlvB5paMFdw9p4kSQzbXFikJ5SQO6TULQc=
-github.com/mattn/go-runewidth v0.0.16/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
+github.com/mattn/go-runewidth v0.0.14 h1:+xnbZSEeDbOIg5/mE6JF0w6n9duR1l3/WmbinWVwUuU=
+github.com/mattn/go-runewidth v0.0.14/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
 github.com/mattn/go-sqlite3 v1.14.24 h1:tpSp2G2KyMnnQu99ngJ47EIkWVmliIizyZBfPrBWDRM=
 github.com/mattn/go-sqlite3 v1.14.24/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y=
 github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
@@ -182,12 +162,6 @@ github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w
 github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
 github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M=
 github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
-github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 h1:ZK8zHtRHOkbHy6Mmr5D264iyp3TiX5OmNcI5cIARiQI=
-github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6/go.mod h1:CJlz5H+gyd6CUWT45Oy4q24RdLyn7Md9Vj2/ldJBSIo=
-github.com/muesli/cancelreader v0.2.2 h1:3I4Kt4BQjOR54NavqnDogx/MIoWBFa0StPA8ELUXHmA=
-github.com/muesli/cancelreader v0.2.2/go.mod h1:3XuTXfFS2VjM+HTLZY9Ak0l6eUKfijIfMUZ4EgX0QYo=
-github.com/muesli/termenv v0.16.0 h1:S5AlUN9dENB57rsbnkPyfdGuWIlkmzJjbFf0Tf5FWUc=
-github.com/muesli/termenv v0.16.0/go.mod h1:ZRfOIKPFDYQoDFF4Olj7/QJbW60Ol/kL1pU3VfY/Cnk=
 github.com/nlpodyssey/gopickle v0.3.0 h1:BLUE5gxFLyyNOPzlXxt6GoHEMMxD0qhsE4p0CIQyoLw=
 github.com/nlpodyssey/gopickle v0.3.0/go.mod h1:f070HJ/yR+eLi5WmM1OXJEGaTpuJEUiib19olXgYha0=
 github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N7AbDhec=
@@ -206,9 +180,8 @@ github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINE
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
+github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY=
 github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
-github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ=
-github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
 github.com/rogpeppe/fastuuid v1.2.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ=
 github.com/rogpeppe/go-internal v1.8.0 h1:FCbCCtXNOY3UtUuHUYaghJg4y7Fd14rXifAYUAtL9R8=
 github.com/rogpeppe/go-internal v1.8.0/go.mod h1:WmiCO8CzOY8rg0OYDC4/i/2WRWAB6poM+XZ2dLUbcbE=
@@ -245,8 +218,6 @@ github.com/wk8/go-ordered-map/v2 v2.1.8 h1:5h/BUHu93oj4gIdvHHHGsScSTMijfx5PeYkE/
 github.com/wk8/go-ordered-map/v2 v2.1.8/go.mod h1:5nJHM5DyteebpVlHnWMV0rPz6Zp7+xBAnxjb1X5vnTw=
 github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM=
 github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg=
-github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e h1:JVG44RsyaB9T2KIHavMF/ppJZNG9ZpyihvCd0w101no=
-github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e/go.mod h1:RbqR21r5mrJuqunuUZ/Dhy/avygyECGrLceyNeo4LiM=
 github.com/xtgo/set v1.0.0 h1:6BCNBRv3ORNDQ7fyoJXRv+tstJz3m1JVFQErfeZz2pY=
 github.com/xtgo/set v1.0.0/go.mod h1:d3NHzGzSa0NmB2NhFyECA+QdRp29oEn2xbT+TpeFoM8=
 github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
@@ -333,7 +304,6 @@ golang.org/x/sys v0.0.0-20210330210617-4fbd30eecc44/go.mod h1:h1NjWce9XRLGQEsW7w
 golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20210510120138-977fb7262007/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.0.0-20210809222454-d867a43fc93e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.37.0 h1:fdNQudmxPjkdUTPnLn5mdQv7Zwvbvpaxqs831goi9kQ=
--- a/integration/embed_test.go
+++ b/integration/embed_test.go
@@ -73,18 +73,13 @@ func manhattanDistance[V float32 | float64](v1, v2 []V) V {
 }

 func TestEmbedCosineDistanceCorrelation(t *testing.T) {
-	softTimeout, hardTimeout := getTimeouts(t)
-	ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
+	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
 	defer cancel()
 	client, _, cleanup := InitServerConnection(ctx, t)
 	defer cleanup()

-	started := time.Now()
 	for _, model := range libraryEmbedModels {
 		t.Run(model, func(t *testing.T) {
-			if time.Since(started) > softTimeout {
-				t.Skip("skipping - soft timeout exceeded")
-			}
 			testCases := []struct {
 				a string
 				b string
@@ -494,19 +489,14 @@ func TestEmbedTruncation(t *testing.T) {

 // TestEmbedLargeInput tests that embedding models can handle large inputs that would exceed typical batch sizes.
 func TestEmbedLargeInput(t *testing.T) {
-	softTimeout, hardTimeout := getTimeouts(t)
-	ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
+	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
 	defer cancel()
 	client, _, cleanup := InitServerConnection(ctx, t)
 	defer cleanup()

-	started := time.Now()
 	for _, model := range libraryEmbedModels {
 		model := model
 		t.Run(model, func(t *testing.T) {
-			if time.Since(started) > softTimeout {
-				t.Skip("skipping - soft timeout exceeded")
-			}
 			mctx, mcancel := context.WithTimeout(ctx, 2*time.Minute)
 			defer mcancel()

--- a/integration/tools_test.go
+++ b/integration/tools_test.go
@@ -21,10 +21,9 @@ func testPropsMap(m map[string]api.ToolProperty) *api.ToolPropertiesMap {
 }

 func TestAPIToolCalling(t *testing.T) {
-	initialTimeout := 90 * time.Second
-	streamTimeout := 90 * time.Second
-	softTimeout, hardTimeout := getTimeouts(t)
-	ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
+	initialTimeout := 60 * time.Second
+	streamTimeout := 60 * time.Second
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
 	defer cancel()

 	client, _, cleanup := InitServerConnection(ctx, t)
@@ -48,12 +47,8 @@ func TestAPIToolCalling(t *testing.T) {
 		"granite3.3":    7,
 	}

-	started := time.Now()
 	for _, model := range libraryToolsModels {
 		t.Run(model, func(t *testing.T) {
-			if time.Since(started) > softTimeout {
-				t.Skip("skipping - soft timeout exceeded")
-			}
 			if v, ok := minVRAM[model]; ok {
 				skipUnderMinVRAM(t, v)
 			}
--- a/llama/README.md
+++ b/llama/README.md
@@ -14,28 +14,25 @@ make -f Makefile.sync apply-patches

 ### Updating Base Commit

-To update to a new base commit:
+**Pin to new base commit**

-1. **Update FETCH_HEAD** in `Makefile.sync` to the new commit hash.
+To change the base commit, update `FETCH_HEAD` in `Makefile.sync`.

-2. **Check for upstreamed patches**: Before applying, review if any patches have been merged upstream. Remove those patches from `./patches/` to avoid conflicts.
+When updating to a newer base commit, the existing patches may not apply cleanly and may require manual merge resolution.

-3. **Apply patches**:
-   ```shell
-   make -f Makefile.sync apply-patches
-   ```
+Start by applying the patches. If any patch has conflicts, `git am` stops at the first failure.

-4. **Resolve conflicts** (if any): When `git am` fails on a patch:
-   - Fix conflicts in `./vendor/`
-   - Stage the resolved files: `git -C llama/vendor add <file>`
-   - Continue: `git -C llama/vendor am --continue`
-   - Re-run: `make -f Makefile.sync apply-patches`
-   - Repeat until all patches are applied.
+```shell
+make -f Makefile.sync apply-patches
+```

-5. **Regenerate patches and sync**:
-   ```shell
-   make -f Makefile.sync format-patches sync
-   ```
+If there are conflicts, you will see an error message. Resolve the conflicts in `./vendor/`, continue the patch series with `git am --continue`, and then rerun `make -f Makefile.sync apply-patches`. Repeat until all patches are successfully applied.
+
+Once all patches are applied, commit the changes to the tracking repository.
+
+```shell
+make -f Makefile.sync format-patches sync
+```

 ### Generating Patches

--- a/llama/build-info.cpp
+++ b/llama/build-info.cpp
@@ -1,4 +1,4 @@
 int LLAMA_BUILD_NUMBER = 0;
-char const *LLAMA_COMMIT = "a5bb8ba4c50257437630c136210396810741bbf7";
+char const *LLAMA_COMMIT = "ec98e2002";
 char const *LLAMA_COMPILER = "";
 char const *LLAMA_BUILD_TARGET = "";
--- a/llama/llama.cpp/common/common.cpp
+++ b/llama/llama.cpp/common/common.cpp
@@ -251,7 +251,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {
        case GGML_SCHED_PRIO_REALTIME: p = -20; break;
    }

-    if (setpriority(PRIO_PROCESS, 0, p) != 0) {
+    if (!setpriority(PRIO_PROCESS, 0, p)) {
        LOG_WRN("failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
        return false;
    }
@@ -1078,15 +1078,12 @@ struct common_init_result::impl {
    impl() = default;
    ~impl() = default;

-    // note: the order in which model, context, etc. are declared matters because their destructors will be called bottom-to-top
-
    llama_model_ptr   model;
    llama_context_ptr context;

    std::vector<llama_adapter_lora_ptr> lora;

    std::vector<common_sampler_ptr> samplers;
-    std::vector<llama_sampler_seq_config> samplers_seq_config;
 };

 common_init_result::common_init_result(common_params & params) :
@@ -1095,9 +1092,9 @@ common_init_result::common_init_result(common_params & params) :
    auto cparams = common_context_params_to_llama(params);

    if (params.fit_params) {
-        LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
+        LOG_INF("%s: fitting params to device memory, to report bugs during this step use -fit off (or --verbose if you can't)\n", __func__);
        llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
-            params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target.data(), params.fit_params_min_ctx,
+            params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
            params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
    }

@@ -1110,25 +1107,6 @@ common_init_result::common_init_result(common_params & params) :

    const llama_vocab * vocab = llama_model_get_vocab(model);

-    // load and optionally apply lora adapters (must be loaded before context creation)
-    for (auto & la : params.lora_adapters) {
-        llama_adapter_lora_ptr lora;
-        lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
-        if (lora == nullptr) {
-            LOG_ERR("%s: failed to load lora adapter '%s'\n", __func__, la.path.c_str());
-            pimpl->model.reset(model);
-            return;
-        }
-
-        char buf[1024];
-        la.ptr = lora.get();
-        llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf));
-        la.task_name = buf;
-        llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
-        la.prompt_prefix = buf;
-        pimpl->lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
-    }
-
    // updates params.sampling
    // TODO: fix naming
    common_init_sampler_from_model(model, params.sampling);
@@ -1163,18 +1141,10 @@ common_init_result::common_init_result(common_params & params) :
    //    params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
    //}

-    // init the backend samplers as part of the context creation
    pimpl->samplers.resize(cparams.n_seq_max);
-    pimpl->samplers_seq_config.resize(cparams.n_seq_max);

    for (int i = 0; i < (int) cparams.n_seq_max; ++i) {
        pimpl->samplers[i].reset(common_sampler_init(model, params.sampling));
-        pimpl->samplers_seq_config[i] = { i, common_sampler_get(pimpl->samplers[i].get()) };
-    }
-
-    if (params.sampling.backend_sampling) {
-        cparams.samplers   = pimpl->samplers_seq_config.data();
-        cparams.n_samplers = pimpl->samplers_seq_config.size();
    }

    llama_context * lctx = llama_init_from_model(model, cparams);
@@ -1198,12 +1168,6 @@ common_sampler * common_init_result::sampler(llama_seq_id seq_id) {
    return pimpl->samplers[seq_id].get();
 }

-void common_init_result::reset_samplers() {
-    for (int i = 0; i < (int) pimpl->samplers.size(); ++i) {
-        llama_sampler_reset(common_sampler_get(pimpl->samplers[i].get()));
-    }
-}
-
 std::vector<llama_adapter_lora_ptr> & common_init_result::lora() {
    return pimpl->lora;
 }
@@ -1279,6 +1243,24 @@ common_init_result_ptr common_init_from_params(common_params & params) {
        }
    }

+    // load and optionally apply lora adapters
+    for (auto & la : params.lora_adapters) {
+        llama_adapter_lora_ptr lora;
+        lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
+        if (lora == nullptr) {
+            LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
+            return res;
+        }
+
+        char buf[1024];
+        la.ptr = lora.get();
+        llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf));
+        la.task_name = buf;
+        llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
+        la.prompt_prefix = buf;
+        res->lora().emplace_back(std::move(lora)); // copy to list of loaded adapters
+    }
+
    if (!params.lora_init_without_apply) {
        common_set_adapter_lora(lctx, params.lora_adapters);
    }
@@ -1319,9 +1301,6 @@ common_init_result_ptr common_init_from_params(common_params & params) {
        llama_synchronize(lctx);
        llama_perf_context_reset(lctx);
        llama_set_warmup(lctx, false);
-
-        // reset samplers to reset RNG state after warmup to the seeded state
-        res->reset_samplers();
    }

    return res;
@@ -1360,12 +1339,14 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
        mparams.devices = params.devices.data();
    }

-    mparams.n_gpu_layers    = params.n_gpu_layers;
+    if (params.n_gpu_layers != -1) {
+        mparams.n_gpu_layers = params.n_gpu_layers;
+    }
+
    mparams.main_gpu        = params.main_gpu;
    mparams.split_mode      = params.split_mode;
    mparams.tensor_split    = params.tensor_split;
    mparams.use_mmap        = params.use_mmap;
-    mparams.use_direct_io   = params.use_direct_io;
    mparams.use_mlock       = params.use_mlock;
    mparams.check_tensors   = params.check_tensors;
    mparams.use_extra_bufts = !params.no_extra_bufts;
--- a/llama/llama.cpp/common/common.h
+++ b/llama/llama.cpp/common/common.h
@@ -57,8 +57,6 @@ extern const char * LLAMA_COMMIT;
 extern const char * LLAMA_COMPILER;
 extern const char * LLAMA_BUILD_TARGET;

-const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);
-
 struct common_control_vector_load_info;

 //
@@ -82,8 +80,6 @@ int32_t cpu_get_num_math();
 //

 enum llama_example {
-    LLAMA_EXAMPLE_BATCHED,
-    LLAMA_EXAMPLE_DEBUG,
    LLAMA_EXAMPLE_COMMON,
    LLAMA_EXAMPLE_SPECULATIVE,
    LLAMA_EXAMPLE_COMPLETION,
@@ -121,7 +117,6 @@ enum common_sampler_type {
    COMMON_SAMPLER_TYPE_INFILL      = 9,
    COMMON_SAMPLER_TYPE_PENALTIES   = 10,
    COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
-    COMMON_SAMPLER_TYPE_ADAPTIVE_P  = 12,
 };

 // dimensionality reduction methods, used by cvector-generator
@@ -169,34 +164,32 @@ enum common_params_sampling_config : uint64_t {
 struct common_params_sampling {
    uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler

-    int32_t n_prev             = 64;     // number of previous tokens to remember
-    int32_t n_probs            = 0;      // if greater than 0, output the probabilities of top n_probs tokens.
-    int32_t min_keep           = 0;      // 0 = disabled, otherwise samplers should return at least min_keep tokens
-    int32_t top_k              = 40;     // <= 0 to use vocab size
-    float   top_p              = 0.95f;  // 1.0 = disabled
-    float   min_p              = 0.05f;  // 0.0 = disabled
-    float   xtc_probability    = 0.00f;  // 0.0 = disabled
-    float   xtc_threshold      = 0.10f;  // > 0.5 disables XTC
-    float   typ_p              = 1.00f;  // typical_p, 1.0 = disabled
-    float   temp               = 0.80f;  // <= 0.0 to sample greedily, 0.0 to not output probabilities
-    float   dynatemp_range     = 0.00f;  // 0.0 = disabled
-    float   dynatemp_exponent  = 1.00f;  // controls how entropy maps to temperature in dynamic temperature sampler
-    int32_t penalty_last_n     = 64;     // last n tokens to penalize (0 = disable penalty, -1 = context size)
-    float   penalty_repeat     = 1.00f;  // 1.0 = disabled
-    float   penalty_freq       = 0.00f;  // 0.0 = disabled
-    float   penalty_present    = 0.00f;  // 0.0 = disabled
-    float   dry_multiplier     = 0.0f;   // 0.0 = disabled;      DRY repetition penalty for tokens extending repetition:
-    float   dry_base           = 1.75f;  // 0.0 = disabled;      multiplier * base ^ (length of sequence before token - allowed length)
-    int32_t dry_allowed_length = 2;      // tokens extending repetitions beyond this receive penalty
-    int32_t dry_penalty_last_n = -1;     // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
-    float   adaptive_target    = -1.0f;  // select tokens near this probability (valid range 0.0 to 1.0; negative = disabled)
-    float   adaptive_decay     = 0.90f;  // EMA decay for adaptation; history ≈ 1/(1-decay) tokens (0.0 - 0.99)
-    int32_t mirostat           = 0;      // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
-    float   top_n_sigma        = -1.00f; // -1.0 = disabled
-    float   mirostat_tau       = 5.00f;  // target entropy
-    float   mirostat_eta       = 0.10f;  // learning rate
+    int32_t n_prev             = 64;    // number of previous tokens to remember
+    int32_t n_probs            = 0;     // if greater than 0, output the probabilities of top n_probs tokens.
+    int32_t min_keep           = 0;     // 0 = disabled, otherwise samplers should return at least min_keep tokens
+    int32_t top_k              = 40;    // <= 0 to use vocab size
+    float   top_p              = 0.95f; // 1.0 = disabled
+    float   min_p              = 0.05f; // 0.0 = disabled
+    float   xtc_probability    = 0.00f; // 0.0 = disabled
+    float   xtc_threshold      = 0.10f; // > 0.5 disables XTC
+    float   typ_p              = 1.00f; // typical_p, 1.0 = disabled
+    float   temp               = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
+    float   dynatemp_range     = 0.00f; // 0.0 = disabled
+    float   dynatemp_exponent  = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
+    int32_t penalty_last_n     = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
+    float   penalty_repeat     = 1.00f; // 1.0 = disabled
+    float   penalty_freq       = 0.00f; // 0.0 = disabled
+    float   penalty_present    = 0.00f; // 0.0 = disabled
+    float   dry_multiplier     = 0.0f;  // 0.0 = disabled;      DRY repetition penalty for tokens extending repetition:
+    float   dry_base           = 1.75f; // 0.0 = disabled;      multiplier * base ^ (length of sequence before token - allowed length)
+    int32_t dry_allowed_length = 2;     // tokens extending repetitions beyond this receive penalty
+    int32_t dry_penalty_last_n = -1;    // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
+    int32_t mirostat           = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+    float   top_n_sigma        = -1.00f;// -1.0 = disabled
+    float   mirostat_tau       = 5.00f; // target entropy
+    float   mirostat_eta       = 0.10f; // learning rate
    bool    ignore_eos         = false;
-    bool    no_perf            = false;  // disable performance metrics
+    bool    no_perf            = false; // disable performance metrics
    bool    timing_per_token   = false;

    uint64_t user_sampling_config = 0; // bitfield to track user-specified samplers
@@ -223,8 +216,6 @@ struct common_params_sampling {
    std::vector<llama_logit_bias> logit_bias;     // logit biases to apply
    std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens

-    bool backend_sampling = false;
-
    bool has_logit_bias() const {
        return !logit_bias.empty();
    }
@@ -286,7 +277,6 @@ struct common_params_diffusion {
 };

 // reasoning API response format (not to be confused as chat template's reasoning format)
-// only used by server
 enum common_reasoning_format {
    COMMON_REASONING_FORMAT_NONE,
    COMMON_REASONING_FORMAT_AUTO,            // Same as deepseek, using `message.reasoning_content`
@@ -339,14 +329,12 @@ struct common_params {
    // offload params
    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading

-    int32_t n_gpu_layers       = -1;   // number of layers to store in VRAM, -1 is auto, <= -2 is all
-    int32_t main_gpu           = 0;    // the GPU that is used for scratch and small tensors
-    float   tensor_split[128]  = {0};  // how split tensors should be distributed across GPUs
-    bool    fit_params         = true; // whether to fit unset model/context parameters to free device memory
-    int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
-
-    // margin per device in bytes for fitting parameters to free memory:
-    std::vector<size_t> fit_params_target = std::vector<size_t>(llama_max_devices(), 1024 * 1024*1024);
+    int32_t n_gpu_layers       = -1;               // number of layers to store in VRAM (-1 - use default)
+    int32_t main_gpu           = 0;                // the GPU that is used for scratch and small tensors
+    float   tensor_split[128]  = {0};              // how split tensors should be distributed across GPUs
+    bool    fit_params         = true;             // whether to fit unset model/context parameters to free device memory
+    size_t  fit_params_target  = 1024 * 1024*1024; // margin per device in bytes for fitting parameters to free memory
+    int32_t fit_params_min_ctx = 4096;             // minimum context size to set when trying to reduce memory use

    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs

@@ -382,11 +370,6 @@ struct common_params {
    std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding          // NOLINT
    std::string logits_file          = ""; // file for saving *all* logits                                  // NOLINT

-    // llama-debug specific options
-    std::string logits_output_dir = "data"; // directory for saving logits output files                     // NOLINT
-    bool        save_logits       = false;  // whether to save logits to files                              // NOLINT
-    std::vector<std::string> tensor_filter; // filter tensor names for debug output (regex)                 // NOLINT
-
    std::vector<std::string> in_files;   // all input files
    std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
    std::vector<llama_model_kv_override> kv_overrides;
@@ -437,8 +420,7 @@ struct common_params {
    bool kv_unified        = false; // enable unified KV cache

    bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
-    bool use_mmap          = true;  // enable mmap to use filesystem cache
-    bool use_direct_io     = true;  // read from disk without buffering for faster model loading
+    bool use_mmap          = true;  // use mmap for faster loads
    bool use_mlock         = false; // use mlock to keep model in memory
    bool verbose_prompt    = false; // print prompt tokens before generation
    bool display_prompt    = true;  // print prompt before generation
@@ -482,7 +464,6 @@ struct common_params {
    int32_t timeout_write     = timeout_read; // http write timeout in seconds
    int32_t n_threads_http    = -1;           // number of threads to process HTTP requests (TODO: support threadpool)
    int32_t n_cache_reuse     = 0;            // min chunk size to reuse from the cache via KV shifting
-    bool    cache_prompt      = true;         // whether to enable prompt caching
    int32_t n_ctx_checkpoints = 8;            // max number of context checkpoints per slot
    int32_t cache_ram_mib     = 8192;         // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.

@@ -494,8 +475,7 @@ struct common_params {
    bool enable_chat_template = true;
    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
    int reasoning_budget = -1;
-    bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
-    int sleep_idle_seconds = -1;   // if >0, server will sleep after this many seconds of idle time
+    bool prefill_assistant = true;                                                                          // if true, any trailing assistant message will be prefilled into the response

    std::vector<std::string> api_keys;

@@ -504,11 +484,8 @@ struct common_params {

    std::map<std::string, std::string> default_template_kwargs;

-    // webui configs
-    bool webui = true;
-    std::string webui_config_json;
-
    // "advanced" endpoints are disabled by default for better security
+    bool webui            = true;
    bool endpoint_slots   = true;
    bool endpoint_props   = false; // only control POST requests, not GET
    bool endpoint_metrics = false;
@@ -708,9 +685,7 @@ struct common_init_result {

    llama_model * model();
    llama_context * context();
-
    common_sampler * sampler(llama_seq_id seq_id);
-    void reset_samplers();

    std::vector<llama_adapter_lora_ptr> & lora();

--- a/llama/llama.cpp/common/sampling.cpp
+++ b/llama/llama.cpp/common/sampling.cpp
@@ -104,9 +104,10 @@ struct ring_buffer {
 struct common_sampler {
    common_params_sampling params;

-    struct llama_sampler * grmr;
    struct llama_sampler * chain;

+    bool grammar;
+
    ring_buffer<llama_token> prev;

    std::vector<llama_token_data> cur;
@@ -120,34 +121,17 @@ struct common_sampler {
    }

    void set_logits(struct llama_context * ctx, int idx) {
-        const float *       sampled_probs  = llama_get_sampled_probs_ith     (ctx, idx);
-        const float *       sampled_logits = llama_get_sampled_logits_ith    (ctx, idx);
-        const llama_token * sampled_ids    = llama_get_sampled_candidates_ith(ctx, idx);
+        const auto * logits = llama_get_logits_ith(ctx, idx);

        const llama_model * model = llama_get_model(ctx);
        const llama_vocab * vocab = llama_model_get_vocab(model);

        const int n_vocab = llama_vocab_n_tokens(vocab);

-        if (sampled_probs) {
-            const uint32_t sampled_probs_count = llama_get_sampled_probs_count_ith(ctx, idx);
-            cur.resize(sampled_probs_count);
-            for (uint32_t i = 0; i < sampled_probs_count; ++i) {
-                cur[i] = llama_token_data{sampled_ids[i], sampled_logits[i], sampled_probs[i]};
-            }
-        } else if (sampled_logits) {
-            const uint32_t sampled_logits_count = llama_get_sampled_logits_count_ith(ctx, idx);
-            cur.resize(sampled_logits_count);
-            for (uint32_t i = 0; i < sampled_logits_count; i++) {
-                cur[i] = llama_token_data{sampled_ids[i], sampled_logits[i], 0.0f};
-            }
-        } else {
-            const auto * logits = llama_get_logits_ith(ctx, idx);
-            GGML_ASSERT(logits != nullptr);
-            cur.resize(n_vocab);
-            for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-                cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
-            }
+        cur.resize(n_vocab);
+
+        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+            cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
        }

        cur_p = { cur.data(), cur.size(), -1, false };
@@ -167,59 +151,54 @@ std::string common_params_sampling::print() const {
            "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
            "\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n"
            "\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, top_n_sigma = %.3f, temp = %.3f\n"
-            "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f, adaptive_target = %.3f, adaptive_decay = %.3f",
+            "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
            penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
            dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n,
            top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, top_n_sigma, temp,
-            mirostat, mirostat_eta, mirostat_tau, adaptive_target, adaptive_decay);
+            mirostat, mirostat_eta, mirostat_tau);

    return std::string(result);
 }

-struct common_sampler * common_sampler_init(const struct llama_model * model, struct common_params_sampling & params) {
+struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params) {
    const llama_vocab * vocab = llama_model_get_vocab(model);

    llama_sampler_chain_params lparams = llama_sampler_chain_default_params();

    lparams.no_perf = params.no_perf;

-    llama_sampler * grmr = nullptr;
    llama_sampler * chain = llama_sampler_chain_init(lparams);

+    bool grammar = false;
    std::vector<llama_sampler *> samplers;

    if (params.grammar.compare(0, 11, "%llguidance") == 0) {
 #ifdef LLAMA_USE_LLGUIDANCE
-        grmr = llama_sampler_init_llg(vocab, "lark", params.grammar.c_str());
+        samplers.push_back(llama_sampler_init_llg(vocab, "lark", params.grammar.c_str()));
+        grammar = true;
 #else
        GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
 #endif // LLAMA_USE_LLGUIDANCE
    } else {
        std::vector<std::string> trigger_patterns;
+        std::vector<std::string> patterns_anywhere;
        std::vector<llama_token> trigger_tokens;
        for (const auto & trigger : params.grammar_triggers) {
            switch (trigger.type) {
                case COMMON_GRAMMAR_TRIGGER_TYPE_WORD:
                {
                    const auto & word = trigger.value;
-                    trigger_patterns.push_back(regex_escape(word));
+                    patterns_anywhere.push_back(regex_escape(word));
                    break;
                }
                case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN:
                {
-                    trigger_patterns.push_back(trigger.value);
+                    patterns_anywhere.push_back(trigger.value);
                    break;
                }
                case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL:
                {
-                    const auto & pattern = trigger.value;
-                    std::string anchored = "^$";
-                    if (!pattern.empty()) {
-                        anchored = (pattern.front() != '^' ? "^" : "")
-                            + pattern
-                            + (pattern.back() != '$' ? "$" : "");
-                    }
-                    trigger_patterns.push_back(anchored);
+                    trigger_patterns.push_back(trigger.value);
                    break;
                }
                case COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN:
@@ -233,6 +212,10 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
            }
        }

+        if (!patterns_anywhere.empty()) {
+            trigger_patterns.push_back("^[\\s\\S]*?(" + string_join(patterns_anywhere, "|") + ")[\\s\\S]*");
+        }
+
        std::vector<const char *> trigger_patterns_c;
        trigger_patterns_c.reserve(trigger_patterns.size());
        for (const auto & regex : trigger_patterns) {
@@ -241,12 +224,15 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st

        if (!params.grammar.empty()) {
             if (params.grammar_lazy) {
-                 grmr = llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
-                         trigger_patterns_c.data(), trigger_patterns_c.size(),
-                         trigger_tokens.data(), trigger_tokens.size());
+                 samplers.push_back(
+                         llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
+                             trigger_patterns_c.data(), trigger_patterns_c.size(),
+                             trigger_tokens.data(),     trigger_tokens.size()));
             } else {
-                 grmr = llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
+                 samplers.push_back(llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root"));
             }
+
+             grammar = true;
        }
    }

@@ -255,9 +241,6 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
    }

    if (params.mirostat == 0) {
-
-        bool use_adaptive_p = false; // see below
-
        for (const auto & cnstr : params.samplers) {
            switch (cnstr) {
                case COMMON_SAMPLER_TYPE_DRY:
@@ -267,54 +250,43 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
                        for (const auto & str : params.dry_sequence_breakers) {
                            c_breakers.push_back(str.c_str());
                        }
-                        samplers.push_back(llama_sampler_init_dry(vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
+
+                        samplers.push_back(llama_sampler_init_dry    (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
                    }
                    break;
                case COMMON_SAMPLER_TYPE_TOP_K:
-                    samplers.push_back(llama_sampler_init_top_k(params.top_k));
+                    samplers.push_back(llama_sampler_init_top_k      (params.top_k));
                    break;
                case COMMON_SAMPLER_TYPE_TOP_P:
-                    samplers.push_back(llama_sampler_init_top_p(params.top_p, params.min_keep));
+                    samplers.push_back(llama_sampler_init_top_p      (params.top_p, params.min_keep));
                    break;
                case COMMON_SAMPLER_TYPE_TOP_N_SIGMA:
                    samplers.push_back(llama_sampler_init_top_n_sigma(params.top_n_sigma));
                    break;
                case COMMON_SAMPLER_TYPE_MIN_P:
-                    samplers.push_back(llama_sampler_init_min_p(params.min_p, params.min_keep));
+                    samplers.push_back(llama_sampler_init_min_p      (params.min_p, params.min_keep));
                    break;
                case COMMON_SAMPLER_TYPE_XTC:
-                    samplers.push_back(llama_sampler_init_xtc(params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
+                    samplers.push_back(llama_sampler_init_xtc        (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
                    break;
                case COMMON_SAMPLER_TYPE_TYPICAL_P:
-                    samplers.push_back(llama_sampler_init_typical(params.typ_p, params.min_keep));
+                    samplers.push_back(llama_sampler_init_typical    (params.typ_p, params.min_keep));
                    break;
                case COMMON_SAMPLER_TYPE_TEMPERATURE:
-                    samplers.push_back(llama_sampler_init_temp_ext(params.temp, params.dynatemp_range, params.dynatemp_exponent));
+                    samplers.push_back(llama_sampler_init_temp_ext   (params.temp, params.dynatemp_range, params.dynatemp_exponent));
                    break;
                case COMMON_SAMPLER_TYPE_INFILL:
-                    samplers.push_back(llama_sampler_init_infill(vocab));
+                    samplers.push_back(llama_sampler_init_infill     (vocab));
                    break;
                case COMMON_SAMPLER_TYPE_PENALTIES:
-                    samplers.push_back(llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
-                    break;
-                case COMMON_SAMPLER_TYPE_ADAPTIVE_P:
-                    // the `adaptive-p` sampler is like `dist` and `mirostat` in that it selects
-                    // a single token, so we will add `dist` at the end of the chain by default,
-                    // unless the user specifically included `adaptive-p`. we set this flag here
-                    // so we know to add the sampler at the very end.
-                    use_adaptive_p = true;
+                    samplers.push_back(llama_sampler_init_penalties  (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
                    break;
                default:
                    GGML_ASSERT(false && "unknown sampler type");
            }
        }
-        if (use_adaptive_p) {
-            // only if user explicitly included adaptive-p sampler
-            samplers.push_back(llama_sampler_init_adaptive_p(params.adaptive_target, params.adaptive_decay, params.seed));
-        } else {
-            // default: sample from distribution
-            samplers.push_back(llama_sampler_init_dist(params.seed));
-        }
+
+        samplers.push_back(llama_sampler_init_dist(params.seed));
    } else if (params.mirostat == 1) {
        samplers.push_back(llama_sampler_init_temp(params.temp));
        samplers.push_back(llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
@@ -329,16 +301,10 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
        llama_sampler_chain_add(chain, smpl);
    }

-    if (grmr && params.backend_sampling) {
-        LOG_WRN("%s: backend sampling is not compatible with grammar, disabling\n", __func__);
-
-        params.backend_sampling = false;
-    }
-
    auto * result = new common_sampler {
        /* .params  = */ params,
-        /* .grmr    = */ grmr,
        /* .chain   = */ chain,
+        /* .grammar = */ grammar,
        /* .prev    = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
        /* .cur     = */ {},
        /* .cur_p   = */ {},
@@ -348,45 +314,47 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
 }

 void common_sampler_free(struct common_sampler * gsmpl) {
-    if (!gsmpl) {
-        return;
+    if (gsmpl) {
+        llama_sampler_free(gsmpl->chain);
+
+        delete gsmpl;
    }
-
-    llama_sampler_free(gsmpl->grmr);
-    llama_sampler_free(gsmpl->chain);
-
-    delete gsmpl;
 }

 void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
-    if (!gsmpl) {
-        return;
-    }
-
    const auto tm = gsmpl->tm();

-    if (gsmpl->grmr && accept_grammar) {
-        llama_sampler_accept(gsmpl->grmr, token);
-    }
+    if (gsmpl->grammar) {
+        const int n_smpl = llama_sampler_chain_n(gsmpl->chain);

-    llama_sampler_accept(gsmpl->chain, token);
+        for (int i = 0; i < n_smpl; i++) {
+            auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
+
+            // the grammar sampler is always the first one
+            if (i == 0) {
+                if (accept_grammar) {
+                    llama_sampler_accept(smpl, token);
+                }
+            } else {
+                llama_sampler_accept(smpl, token);
+            }
+        }
+    } else {
+        llama_sampler_accept(gsmpl->chain, token);
+    }

    gsmpl->prev.push_back(token);
 }

 void common_sampler_reset(struct common_sampler * gsmpl) {
-    if (!gsmpl) {
-        return;
-    }
-
    gsmpl->reset();
 }

 struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
    return new common_sampler {
        /* .params  = */ gsmpl->params,
-        /* .grmr    = */ llama_sampler_clone(gsmpl->grmr),
        /* .chain   = */ llama_sampler_clone(gsmpl->chain),
+        /* .grammar = */ gsmpl->grammar,
        /* .prev    = */ gsmpl->prev,
        /* .cur     = */ gsmpl->cur,
        /* .cur_p   = */ gsmpl->cur_p,
@@ -439,14 +407,10 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
 }

 struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl) {
-    if (!gsmpl) {
-        return nullptr;
-    }
-
    return gsmpl->chain;
 }

-llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
+llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx) {
    llama_synchronize(ctx);

    // start measuring sampling time after the llama_context synchronization in order to not measure any ongoing async operations
@@ -454,61 +418,11 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co

    llama_token id = LLAMA_TOKEN_NULL;

-    auto & grmr  = gsmpl->grmr;
    auto & chain = gsmpl->chain;
    auto & cur_p = gsmpl->cur_p; // initialized by set_logits

-    // Check if a backend sampler has already sampled a token in which case we
-    // return that token id directly.
-    {
-        id = llama_get_sampled_token_ith(ctx, idx);
-
-        if (id != LLAMA_TOKEN_NULL) {
-            LOG_DBG("%s: Backend sampler selected token: '%d'. Will not run any CPU samplers\n", __func__, id);
-
-            GGML_ASSERT(!gsmpl->grmr && "using grammar in combination with backend sampling is not supported");
-
-            // TODO: simplify
-            gsmpl->cur.resize(1);
-            gsmpl->cur[0] = { id, 0.0f, 1.0f };
-            cur_p = { gsmpl->cur.data(), gsmpl->cur.size(), 0, true };
-
-            return id;
-        }
-    }
-
    gsmpl->set_logits(ctx, idx);

-    if (grammar_first) {
-        llama_sampler_apply(grmr, &cur_p);
-    }
-
-    llama_sampler_apply(chain, &cur_p);
-
-    id = cur_p.data[cur_p.selected].id;
-
-    if (grammar_first) {
-        return id;
-    }
-
-    // check if it the sampled token fits the grammar (grammar-based rejection sampling)
-    {
-        llama_token_data       single_token_data       = { id, 1.0f, 0.0f };
-        llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };
-
-        llama_sampler_apply(grmr, &single_token_data_array);
-
-        const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
-        if (is_valid) {
-            return id;
-        }
-    }
-
-    // resampling:
-    // if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
-    gsmpl->set_logits(ctx, idx);
-
-    llama_sampler_apply(grmr,  &cur_p);
    llama_sampler_apply(chain, &cur_p);

    GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");
@@ -518,7 +432,7 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
    return id;
 }

-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first) {
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft) {
    GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1");

    std::vector<llama_token> result;
@@ -526,7 +440,7 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample

    size_t i = 0;
    for (; i < draft.size(); i++) {
-        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
+        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]);

        common_sampler_accept(gsmpl, id, true);

@@ -538,7 +452,7 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
    }

    if (i == draft.size()) {
-        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
+        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]);

        common_sampler_accept(gsmpl, id, true);

@@ -548,13 +462,13 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
    return result;
 }

-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) {
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft) {
    std::vector<int> idxs(draft.size() + 1);
    for (size_t i = 0; i < idxs.size(); ++i) {
        idxs[i] = i;
    }

-    return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft, grammar_first);
+    return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft);
 }

 uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
@@ -639,7 +553,6 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
        case COMMON_SAMPLER_TYPE_XTC:         return 'x';
        case COMMON_SAMPLER_TYPE_INFILL:      return 'i';
        case COMMON_SAMPLER_TYPE_PENALTIES:   return 'e';
-        case COMMON_SAMPLER_TYPE_ADAPTIVE_P:  return 'a';
        default : return '?';
    }
 }
@@ -656,7 +569,6 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
        case COMMON_SAMPLER_TYPE_XTC:         return "xtc";
        case COMMON_SAMPLER_TYPE_INFILL:      return "infill";
        case COMMON_SAMPLER_TYPE_PENALTIES:   return "penalties";
-        case COMMON_SAMPLER_TYPE_ADAPTIVE_P:  return "adaptive_p";
        default : return "";
    }
 }
@@ -673,7 +585,6 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
        { "xtc",         COMMON_SAMPLER_TYPE_XTC },
        { "infill",      COMMON_SAMPLER_TYPE_INFILL },
        { "penalties",   COMMON_SAMPLER_TYPE_PENALTIES },
-        { "adaptive_p",  COMMON_SAMPLER_TYPE_ADAPTIVE_P },
    };

    // since samplers names are written multiple ways
@@ -689,7 +600,6 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
        { "typ",         COMMON_SAMPLER_TYPE_TYPICAL_P },
        { "min-p",       COMMON_SAMPLER_TYPE_MIN_P },
        { "temp",        COMMON_SAMPLER_TYPE_TEMPERATURE },
-        { "adaptive-p",  COMMON_SAMPLER_TYPE_ADAPTIVE_P },
    };

    std::vector<common_sampler_type> samplers;
@@ -726,7 +636,6 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC),         COMMON_SAMPLER_TYPE_XTC },
        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_INFILL),      COMMON_SAMPLER_TYPE_INFILL },
        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_PENALTIES),   COMMON_SAMPLER_TYPE_PENALTIES },
-        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_ADAPTIVE_P),  COMMON_SAMPLER_TYPE_ADAPTIVE_P },
    };

    std::vector<common_sampler_type> samplers;
--- a/llama/llama.cpp/common/sampling.h
+++ b/llama/llama.cpp/common/sampling.h
@@ -36,8 +36,7 @@ struct common_sampler;

 // llama_sampler API overloads

-// note: can mutate params in some cases
-struct common_sampler * common_sampler_init(const struct llama_model * model, struct common_params_sampling & params);
+struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params);

 void common_sampler_free(struct common_sampler * gsmpl);

@@ -49,7 +48,6 @@ struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
 // arguments can be nullptr to skip printing
 void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl);

-// get the underlying llama_sampler_chain
 struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl);

 // extended sampling implementation:
@@ -59,10 +57,7 @@ struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl);
 // - check if the token fits the grammar (if any)
 // - if not: resample by first applying the grammar constraints and then sampling again (slower path)
 //
-// if grammar_first is true, the grammar is applied before the samplers (slower)
-// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
-//
-llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
+llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx);

 // generalized version of common_sampler_sample
 //
@@ -80,10 +75,10 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
 //
 // returns at least 1 token, up to idxs.size()
 //
-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first = false);
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft);

 // assume idxs == [ 0, 1, 2, ..., draft.size() ]
-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first = false);
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft);

 uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);

--- a/llama/llama.cpp/include/llama-cpp.h
+++ b/llama/llama.cpp/include/llama-cpp.h
@@ -21,9 +21,7 @@ struct llama_sampler_deleter {
 };

 struct llama_adapter_lora_deleter {
-    void operator()(llama_adapter_lora *) {
-        // llama_adapter_lora_free is deprecated
-    }
+    void operator()(llama_adapter_lora * adapter) { llama_adapter_lora_free(adapter); }
 };

 typedef std::unique_ptr<llama_model, llama_model_deleter> llama_model_ptr;
--- a/llama/llama.cpp/include/llama.h
+++ b/llama/llama.cpp/include/llama.h
@@ -286,7 +286,7 @@ extern "C" {
        // NULL-terminated list of buffer types to use for tensors that match a pattern
        const struct llama_model_tensor_buft_override * tensor_buft_overrides;

-        int32_t n_gpu_layers; // number of layers to store in VRAM, a negative value means all layers
+        int32_t n_gpu_layers; // number of layers to store in VRAM
        enum llama_split_mode split_mode; // how to split the model across multiple GPUs

        // the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
@@ -309,7 +309,6 @@ extern "C" {
        // Keep the booleans together to avoid misalignment during copy-by-value.
        bool vocab_only;      // only load the vocabulary, no weights
        bool use_mmap;        // use mmap if possible
-        bool use_direct_io;   // use direct io, takes precedence over use_mmap
        bool use_mlock;       // force system to keep model in RAM
        bool check_tensors;   // validate model tensor data
        bool use_extra_bufts; // use extra buffer types (used for weight repacking)
@@ -317,11 +316,6 @@ extern "C" {
        bool no_alloc;        // only load metadata and simulate memory allocations
    };

-    struct llama_sampler_seq_config {
-        llama_seq_id           seq_id;
-        struct llama_sampler * sampler;
-    };
-
    // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
    //       https://github.com/ggml-org/llama.cpp/pull/7544
    struct llama_context_params {
@@ -370,12 +364,6 @@ extern "C" {
        bool kv_unified;  // use a unified buffer across the input sequences when computing the attention
                          // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
                          // ref: https://github.com/ggml-org/llama.cpp/pull/14363
-
-        // [EXPERIMENTAL]
-        // backend sampler chain configuration (make sure the caller keeps the sampler chains alive)
-        // note: the samplers must be sampler chains (i.e. use llama_sampler_chain_init)
-        struct llama_sampler_seq_config * samplers;
-        size_t                            n_samplers;
    };

    // model quantization parameters
@@ -479,24 +467,16 @@ extern "C" {
    // Frees all allocated memory
    LLAMA_API void llama_free(struct llama_context * ctx);

-    enum llama_params_fit_status {
-        LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0, // found allocations that are projected to fit
-        LLAMA_PARAMS_FIT_STATUS_FAILURE = 1, // could not find allocations that are projected to fit
-        LLAMA_PARAMS_FIT_STATUS_ERROR   = 2, // a hard error occured, e.g. because no model could be found at the specified path
-    };
-
    // fits mparams and cparams to free device memory (assumes system memory is unlimited)
-    //   - returns true if the parameters could be successfully modified to fit device memory
-    //   - this function is NOT thread safe because it modifies the global llama logger state
-    //   - only parameters that have the same value as in llama_default_model_params are modified
-    //     with the exception of the context size which is modified if and only if equal to 0
-    LLAMA_API enum llama_params_fit_status llama_params_fit(
+    // returns true if the parameters could be successfully modified to fit device memory
+    // this function is NOT thread safe because it modifies the global llama logger state
+    LLAMA_API bool llama_params_fit(
                                   const char   * path_model,
                    struct llama_model_params   * mparams,
                    struct llama_context_params * cparams,
                                          float * tensor_split,          // writable buffer for tensor split, needs at least llama_max_devices elements
        struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
-                                         size_t * margins,               // margins of memory to leave per device in bytes
+                                         size_t   margin,                // margin of memory to leave per device in bytes
                                       uint32_t   n_ctx_min,             // minimum context size to set when trying to reduce memory use
                            enum ggml_log_level   log_level);            // minimum log level to print during fitting, lower levels go to debug log

@@ -537,7 +517,6 @@ extern "C" {
    LLAMA_API int32_t llama_model_n_ctx_train(const struct llama_model * model);
    LLAMA_API int32_t llama_model_n_embd     (const struct llama_model * model);
    LLAMA_API int32_t llama_model_n_embd_inp (const struct llama_model * model);
-    LLAMA_API int32_t llama_model_n_embd_out (const struct llama_model * model);
    LLAMA_API int32_t llama_model_n_layer    (const struct llama_model * model);
    LLAMA_API int32_t llama_model_n_head     (const struct llama_model * model);
    LLAMA_API int32_t llama_model_n_head_kv  (const struct llama_model * model);
@@ -621,8 +600,6 @@ extern "C" {
    //

    // Load a LoRA adapter from file
-    // The adapter is valid as long as the associated model is not freed
-    // All adapters must be loaded before context creation
    LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init(
            struct llama_model * model,
            const char * path_lora);
@@ -647,8 +624,7 @@ extern "C" {

    // Manually free a LoRA adapter
    // NOTE: loaded adapters will be free when the associated model is deleted
-    LLAMA_API DEPRECATED(void llama_adapter_lora_free(struct llama_adapter_lora * adapter),
-            "adapters are now freed together with the associated model");
+    LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);

    // Get the invocation tokens if the current lora is an alora
    LLAMA_API uint64_t            llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter);
@@ -1007,32 +983,6 @@ extern "C" {
    // otherwise: float[n_embd] (1-dimensional)
    LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);

-    //
-    // backend sampling API [EXPERIMENTAL]
-    // note: use only if the llama_context was created with at least one llama_sampler_seq_config
-    //
-
-    // Get the backend sampled token for the ith token.
-    // Returns LLAMA_TOKEN_NULL if no token was sampled.
-    LLAMA_API llama_token llama_get_sampled_token_ith(struct llama_context * ctx, int32_t i);
-
-    // Get the backend sampled probabilites for the ith token
-    // The index matches llama_get_sampled_token_ith().
-    // Returns NULL if no probabilites were generated.
-    LLAMA_API float *  llama_get_sampled_probs_ith      (struct llama_context * ctx, int32_t i);
-    LLAMA_API uint32_t llama_get_sampled_probs_count_ith(struct llama_context * ctx, int32_t i);
-
-    // Get the backend sampled logits for the ith token
-    // Returns NULL if no logits were sampled.
-    LLAMA_API float *  llama_get_sampled_logits_ith      (struct llama_context * ctx, int32_t i);
-    LLAMA_API uint32_t llama_get_sampled_logits_count_ith(struct llama_context * ctx, int32_t i);
-
-    // Get the backend sampled candidates (token ids) for the ith token
-    // These are needed to map probability/logit indices to vocab token ids.
-    // Returns NULL if no candidates were sampled.
-    LLAMA_API llama_token * llama_get_sampled_candidates_ith      (struct llama_context * ctx, int32_t i);
-    LLAMA_API uint32_t      llama_get_sampled_candidates_count_ith(struct llama_context * ctx, int32_t i);
-
    //
    // Vocab
    //
@@ -1204,16 +1154,11 @@ extern "C" {
    //
    //    llama_sampler_free(smpl);
    //
+    // TODO: In the future, llama_sampler will be utilized to offload the sampling to the backends (e.g. GPU).
+    //

    typedef void * llama_sampler_context_t;

-    struct llama_sampler_data {
-        struct ggml_tensor * logits;
-        struct ggml_tensor * probs;
-        struct ggml_tensor * sampled;
-        struct ggml_tensor * candidates;
-    };
-
    // user code can implement the interface below in order to create custom llama_sampler
    struct llama_sampler_i {
        const char *           (*name)  (const struct llama_sampler * smpl);                                 // can be NULL
@@ -1223,44 +1168,17 @@ extern "C" {
        struct llama_sampler * (*clone) (const struct llama_sampler * smpl);                                 // can be NULL if ctx is NULL
        void                   (*free)  (      struct llama_sampler * smpl);                                 // can be NULL if ctx is NULL

-        // [EXPERIMENTAL]
-        // backend sampling interface:
-
-        // return true if the backend supports all ops needed by the sampler
-        // note: call once per sampler
-        bool (*backend_init)(struct llama_sampler * smpl, ggml_backend_buffer_type_t buft);
-
-        // call after .backend_apply()
-        void (*backend_accept)(
-                struct llama_sampler * smpl,
-                struct ggml_context  * ctx,
-                struct ggml_cgraph   * gf,
-                struct ggml_tensor   * selected_token);
-
-        // call after .backend_init()
-        void (*backend_apply)(
-                struct llama_sampler      * smpl,
-                struct ggml_context       * ctx,
-                struct ggml_cgraph        * gf,
-                struct llama_sampler_data * data);
-
-        // called before graph execution to set inputs for the current ubatch
-        void (*backend_set_input)(struct llama_sampler * smpl);
+        // TODO: API for internal libllama usage for appending the sampling to an existing ggml_cgraph
+        //void (*apply_ggml) (struct llama_sampler * smpl, ...);
    };

    struct llama_sampler {
-        struct llama_sampler_i * iface;
-
-        llama_sampler_context_t ctx;
+        const struct llama_sampler_i * iface;
+        llama_sampler_context_t        ctx;
    };

-    // [EXPERIMENTAL]
-    // attach a sampler to the context
-    // note: prefer initializing the context with llama_context_params.samplers when possible
-    LLAMA_API bool llama_set_sampler(struct llama_context * ctx, llama_seq_id seq_id, struct llama_sampler * smpl);
-
    // mirror of llama_sampler_i:
-    LLAMA_API struct llama_sampler * llama_sampler_init  (      struct llama_sampler_i * iface, llama_sampler_context_t ctx);
+    LLAMA_API struct llama_sampler * llama_sampler_init  (const struct llama_sampler_i * iface, llama_sampler_context_t ctx);
    LLAMA_API const char *           llama_sampler_name  (const struct llama_sampler * smpl);
    LLAMA_API void                   llama_sampler_accept(      struct llama_sampler * smpl, llama_token token);
    LLAMA_API void                   llama_sampler_apply (      struct llama_sampler * smpl, llama_token_data_array * cur_p);
@@ -1276,15 +1194,7 @@ extern "C" {

    // important: takes ownership of the sampler object and will free it when llama_sampler_free is called
    LLAMA_API void                   llama_sampler_chain_add(      struct llama_sampler * chain, struct llama_sampler * smpl);
-
-    // return NULL if:
-    //   - the sampler is NULL
-    //   - the sampler is not a llama_sampler_chain
-    //   - the index is out of bounds, unless i == -1
-    //   - if i == -1, returns the chain itself (can be used to check if the sampler is a chain)
-    LLAMA_API struct llama_sampler * llama_sampler_chain_get(      struct llama_sampler * chain, int32_t i);
-
-    // the total number of samplers in the chain
+    LLAMA_API struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i);
    LLAMA_API int                    llama_sampler_chain_n  (const struct llama_sampler * chain);

    // after removing a sampler, the chain will no longer own it, and it will not be freed when the chain is freed
@@ -1293,9 +1203,7 @@ extern "C" {
    // available samplers:

    LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void);
-
-    /// seed == LLAMA_DEFAULT_SEED to use a random seed.
-    LLAMA_API struct llama_sampler * llama_sampler_init_dist(uint32_t seed);
+    LLAMA_API struct llama_sampler * llama_sampler_init_dist  (uint32_t seed);

    /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
    /// Setting k <= 0 makes this a noop
@@ -1396,33 +1304,6 @@ extern "C" {
                          const char ** seq_breakers,
                              size_t    num_breakers);

-    /// adaptive-p: select tokens near a configurable target probability over time.
-    ///
-    /// the adaptive-p sampler transforms the token probability distribution to favor tokens
-    /// that fall near a user-configurable probability target.
-    ///
-    /// internally, the sampler maintains an exponential moving average of the *ORIGINAL*
-    /// probabilities of selected tokens at each sampling step. it uses this EMA to compute an
-    /// adapted target probability at each sampling step, thus maintaining the desired target
-    /// probability over time.
-    ///
-    /// adaptive-p selects a token ID rather than just mutating candidates, so it must be last
-    /// in the sampler chain (like mirostat, dist, greedy).
-    ///
-    /// only mild truncation before this sampler is recommended. we suggest applying min-p
-    /// before adaptive-p as the only other active sampler in the chain.
-    ///
-    /// @param target select tokens near this probability (valid range 0.0 to 1.0; negative = disabled)
-    /// @param decay  EMA decay for adaptation; history ≈ 1/(1-decay) tokens (valid range 0.0 - 0.99)
-    /// @param seed   RNG seed
-    ///
-    /// ref: https://github.com/ggml-org/llama.cpp/pull/17927
-    ///
-    LLAMA_API struct llama_sampler * llama_sampler_init_adaptive_p(
-                               float   target,
-                               float   decay,
-                            uint32_t   seed);
-
    LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias(
                             int32_t   n_vocab,
                             int32_t   n_logit_bias,
@@ -1476,12 +1357,12 @@ extern "C" {
    /// @details Build a split GGUF final path for this chunk.
    ///          llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
    //  Returns the split_path length.
-    LLAMA_API int32_t llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int32_t split_no, int32_t split_count);
+    LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count);

    /// @details Extract the path prefix from the split_path if and only if the split_no and split_count match.
    ///          llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0"
    //  Returns the split_prefix length.
-    LLAMA_API int32_t llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int32_t split_no, int32_t split_count);
+    LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count);

    // Print system information
    LLAMA_API const char * llama_print_system_info(void);
--- a/llama/llama.cpp/src/llama-adapter.cpp
+++ b/llama/llama.cpp/src/llama-adapter.cpp
@@ -411,9 +411,6 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
        }
    }

-    // register adapter with model
-    model.loras.insert(&adapter);
-
    LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
 }

@@ -471,8 +468,8 @@ int32_t llama_adapter_meta_val_str_by_index(const llama_adapter_lora * adapter,
    return snprintf(buf, buf_size, "%s", it->second.c_str());
 }

-void llama_adapter_lora_free(llama_adapter_lora *) {
-    // deprecated: adapters are freed by llama_model's destructor
+void llama_adapter_lora_free(llama_adapter_lora * adapter) {
+    delete adapter;
 }

 uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter) {
--- a/llama/llama.cpp/src/llama-adapter.h
+++ b/llama/llama.cpp/src/llama-adapter.h
@@ -77,10 +77,6 @@ struct llama_adapter_lora {
    ~llama_adapter_lora() = default;

    llama_adapter_lora_weight * get_weight(ggml_tensor * w);
-
-    uint32_t get_n_nodes() const {
-        return ab_map.size() * 6u; // a, b, scale, add, 2 x mul_mat
-    }
 };

 using llama_adapter_loras = std::unordered_map<llama_adapter_lora *, float>;
--- a/llama/llama.cpp/src/llama-arch.cpp
+++ b/llama/llama.cpp/src/llama-arch.cpp
@@ -20,7 +20,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_STARCODER,        "starcoder"        },
    { LLM_ARCH_REFACT,           "refact"           },
    { LLM_ARCH_BERT,             "bert"             },
-    { LLM_ARCH_MODERN_BERT,      "modern-bert"      },
    { LLM_ARCH_NOMIC_BERT,       "nomic-bert"       },
    { LLM_ARCH_NOMIC_BERT_MOE,   "nomic-bert-moe"   },
    { LLM_ARCH_NEO_BERT,         "neo-bert"         },
@@ -42,7 +41,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_PHIMOE,           "phimoe"           },
    { LLM_ARCH_PLAMO,            "plamo"            },
    { LLM_ARCH_PLAMO2,           "plamo2"           },
-    { LLM_ARCH_PLAMO3,           "plamo3"           },
    { LLM_ARCH_CODESHELL,        "codeshell"        },
    { LLM_ARCH_ORION,            "orion"            },
    { LLM_ARCH_INTERNLM2,        "internlm2"        },
@@ -81,7 +79,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_NEMOTRON_H_MOE,   "nemotron_h_moe"   },
    { LLM_ARCH_EXAONE,           "exaone"           },
    { LLM_ARCH_EXAONE4,          "exaone4"          },
-    { LLM_ARCH_EXAONE_MOE,       "exaone-moe"       },
    { LLM_ARCH_RWKV6,            "rwkv6"            },
    { LLM_ARCH_RWKV6QWEN2,       "rwkv6qwen2"       },
    { LLM_ARCH_RWKV7,            "rwkv7"            },
@@ -118,9 +115,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_RND1,             "rnd1"             },
    { LLM_ARCH_PANGU_EMBED,      "pangu-embedded"   },
    { LLM_ARCH_MISTRAL3,         "mistral3"         },
-    { LLM_ARCH_MIMO2,            "mimo2"           },
-    { LLM_ARCH_LLAMA_EMBED,      "llama-embed"      },
-    { LLM_ARCH_MAINCODER,        "maincoder"        },
    { LLM_ARCH_UNKNOWN,          "(unknown)"        },
 };

@@ -154,7 +148,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
    { LLM_KV_VOCAB_SIZE,                        "%s.vocab_size"                        },
    { LLM_KV_CONTEXT_LENGTH,                    "%s.context_length"                    },
    { LLM_KV_EMBEDDING_LENGTH,                  "%s.embedding_length"                  },
-    { LLM_KV_EMBEDDING_LENGTH_OUT,              "%s.embedding_length_out"              },
    { LLM_KV_FEATURES_LENGTH,                   "%s.features_length"                   },
    { LLM_KV_BLOCK_COUNT,                       "%s.block_count"                       },
    { LLM_KV_LEADING_DENSE_BLOCK_COUNT,         "%s.leading_dense_block_count"         },
@@ -212,7 +205,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
    { LLM_KV_ATTENTION_GATE_LORA_RANK,               "%s.attention.gate_lora_rank"               },
    { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,       "%s.attention.relative_buckets_count"       },
    { LLM_KV_ATTENTION_SLIDING_WINDOW,               "%s.attention.sliding_window"               },
-    { LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN,       "%s.attention.sliding_window_pattern"       },
    { LLM_KV_ATTENTION_SCALE,                        "%s.attention.scale"                        },
    { LLM_KV_ATTENTION_OUTPUT_SCALE,                 "%s.attention.output_scale"                 },
    { LLM_KV_ATTENTION_TEMPERATURE_LENGTH,           "%s.attention.temperature_length"           },
@@ -224,7 +216,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
    { LLM_KV_ROPE_DIMENSION_COUNT,          "%s.rope.dimension_count"                 },
    { LLM_KV_ROPE_DIMENSION_SECTIONS,       "%s.rope.dimension_sections"              },
    { LLM_KV_ROPE_FREQ_BASE,                "%s.rope.freq_base"                       },
-    { LLM_KV_ROPE_FREQ_BASE_SWA,            "%s.rope.freq_base_swa"                   },
    { LLM_KV_ROPE_SCALE_LINEAR,             "%s.rope.scale_linear"                    },
    { LLM_KV_ROPE_SCALING_TYPE,             "%s.rope.scaling.type"                    },
    { LLM_KV_ROPE_SCALING_FACTOR,           "%s.rope.scaling.factor"                  },
@@ -509,7 +500,6 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
        case LLM_ARCH_LLAMA:
        case LLM_ARCH_DECI:
        case LLM_ARCH_MISTRAL3:
-        case LLM_ARCH_LLAMA_EMBED:
            return {
                LLM_TENSOR_TOKEN_EMBD,
                LLM_TENSOR_OUTPUT_NORM,
@@ -791,20 +781,6 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
                LLM_TENSOR_CLS,
                LLM_TENSOR_CLS_OUT,
            };
-        case LLM_ARCH_MODERN_BERT:
-            return {
-                LLM_TENSOR_TOKEN_EMBD,
-                LLM_TENSOR_TOKEN_EMBD_NORM,
-                LLM_TENSOR_OUTPUT_NORM,
-                LLM_TENSOR_ATTN_NORM,
-                LLM_TENSOR_ATTN_OUT,
-                LLM_TENSOR_ATTN_QKV,
-                LLM_TENSOR_FFN_DOWN,
-                LLM_TENSOR_FFN_UP,
-                LLM_TENSOR_FFN_NORM,
-                LLM_TENSOR_CLS,
-                LLM_TENSOR_CLS_OUT,
-            };
        case LLM_ARCH_JINA_BERT_V2:
            return {
                LLM_TENSOR_TOKEN_EMBD,
@@ -954,8 +930,6 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
                LLM_TENSOR_ATTN_K_NORM,
                LLM_TENSOR_ATTN_V,
                LLM_TENSOR_ATTN_OUT,
-                LLM_TENSOR_ATTN_QKV,
-                LLM_TENSOR_ATTN_GATE,
                LLM_TENSOR_FFN_NORM,
                LLM_TENSOR_FFN_GATE_INP,
                LLM_TENSOR_FFN_GATE_EXPS,
@@ -1086,22 +1060,6 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
                LLM_TENSOR_ATTN_POST_NORM,
                LLM_TENSOR_FFN_POST_NORM,
            };
-        case LLM_ARCH_PLAMO3:
-            return {
-                LLM_TENSOR_TOKEN_EMBD,
-                LLM_TENSOR_OUTPUT_NORM,
-                LLM_TENSOR_OUTPUT,
-                LLM_TENSOR_ATTN_NORM,
-                LLM_TENSOR_ATTN_QKV,
-                LLM_TENSOR_ATTN_Q_NORM,
-                LLM_TENSOR_ATTN_K_NORM,
-                LLM_TENSOR_ATTN_OUT,
-                LLM_TENSOR_ATTN_POST_NORM,
-                LLM_TENSOR_FFN_NORM,
-                LLM_TENSOR_FFN_POST_NORM,
-                LLM_TENSOR_FFN_DOWN,
-                LLM_TENSOR_FFN_UP,
-            };
        case LLM_ARCH_CODESHELL:
            return {
                LLM_TENSOR_TOKEN_EMBD,
@@ -1732,38 +1690,6 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
                LLM_TENSOR_FFN_UP,
                LLM_TENSOR_FFN_POST_NORM,
            };
-        case LLM_ARCH_EXAONE_MOE:
-            return {
-                LLM_TENSOR_TOKEN_EMBD,
-                LLM_TENSOR_OUTPUT_NORM,
-                LLM_TENSOR_OUTPUT,
-                LLM_TENSOR_ROPE_FREQS,
-                LLM_TENSOR_ATTN_NORM,
-                LLM_TENSOR_ATTN_Q,
-                LLM_TENSOR_ATTN_Q_NORM,
-                LLM_TENSOR_ATTN_K,
-                LLM_TENSOR_ATTN_K_NORM,
-                LLM_TENSOR_ATTN_V,
-                LLM_TENSOR_ATTN_OUT,
-                LLM_TENSOR_FFN_NORM,
-                LLM_TENSOR_FFN_GATE,
-                LLM_TENSOR_FFN_DOWN,
-                LLM_TENSOR_FFN_UP,
-                LLM_TENSOR_FFN_GATE_INP,
-                LLM_TENSOR_FFN_GATE_EXPS,
-                LLM_TENSOR_FFN_DOWN_EXPS,
-                LLM_TENSOR_FFN_UP_EXPS,
-                LLM_TENSOR_FFN_GATE_SHEXP,
-                LLM_TENSOR_FFN_UP_SHEXP,
-                LLM_TENSOR_FFN_DOWN_SHEXP,
-                LLM_TENSOR_FFN_EXP_PROBS_B,
-                LLM_TENSOR_NEXTN_EH_PROJ,
-                LLM_TENSOR_NEXTN_EMBED_TOKENS,
-                LLM_TENSOR_NEXTN_ENORM,
-                LLM_TENSOR_NEXTN_HNORM,
-                LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,
-                LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,
-            };
        case LLM_ARCH_RWKV6:
            return {
                LLM_TENSOR_TOKEN_EMBD,
@@ -2114,7 +2040,6 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
                LLM_TENSOR_TOKEN_EMBD,
                LLM_TENSOR_OUTPUT_NORM_LFM2,
                LLM_TENSOR_OUTPUT,
-                LLM_TENSOR_DENSE_2_OUT,
            };
        case LLM_ARCH_LFM2MOE:
            return {
@@ -2133,7 +2058,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
                LLM_TENSOR_SHORTCONV_INPROJ,
                LLM_TENSOR_SHORTCONV_OUTPROJ,
                LLM_TENSOR_TOKEN_EMBD,
-                LLM_TENSOR_OUTPUT_NORM_LFM2,
+                LLM_TENSOR_OUTPUT_NORM,
                LLM_TENSOR_FFN_GATE_INP,
                LLM_TENSOR_FFN_GATE_EXPS,
                LLM_TENSOR_FFN_DOWN_EXPS,
@@ -2249,49 +2174,11 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
                LLM_TENSOR_VISEXP_FFN_DOWN,
                LLM_TENSOR_VISEXP_FFN_UP,
            };
-        case LLM_ARCH_MIMO2:
-            return {
-                LLM_TENSOR_TOKEN_EMBD,
-                LLM_TENSOR_OUTPUT_NORM,
-                LLM_TENSOR_OUTPUT,
-                LLM_TENSOR_ATTN_NORM,
-                LLM_TENSOR_ATTN_Q,
-                LLM_TENSOR_ATTN_K,
-                LLM_TENSOR_ATTN_V,
-                LLM_TENSOR_ATTN_SINKS,
-                LLM_TENSOR_ATTN_OUT,
-                LLM_TENSOR_FFN_NORM,
-                LLM_TENSOR_FFN_GATE,
-                LLM_TENSOR_FFN_DOWN,
-                LLM_TENSOR_FFN_UP,
-                LLM_TENSOR_FFN_GATE_INP,
-                LLM_TENSOR_FFN_GATE_EXPS,
-                LLM_TENSOR_FFN_DOWN_EXPS,
-                LLM_TENSOR_FFN_UP_EXPS,
-                LLM_TENSOR_FFN_EXP_PROBS_B,
-            };
        case LLM_ARCH_GPTJ:
        case LLM_ARCH_UNKNOWN:
            return {
                LLM_TENSOR_TOKEN_EMBD,
            };
-        case LLM_ARCH_MAINCODER:
-            return {
-                LLM_TENSOR_TOKEN_EMBD,
-                LLM_TENSOR_OUTPUT_NORM,
-                LLM_TENSOR_OUTPUT,
-                LLM_TENSOR_ATTN_NORM,
-                LLM_TENSOR_ATTN_Q,
-                LLM_TENSOR_ATTN_Q_NORM,
-                LLM_TENSOR_ATTN_K,
-                LLM_TENSOR_ATTN_K_NORM,
-                LLM_TENSOR_ATTN_V,
-                LLM_TENSOR_ATTN_OUT,
-                LLM_TENSOR_FFN_NORM,
-                LLM_TENSOR_FFN_GATE,
-                LLM_TENSOR_FFN_DOWN,
-                LLM_TENSOR_FFN_UP,
-            };
        case LLM_ARCH_SOLAR:
            return {
                LLM_TENSOR_TOKEN_EMBD,
--- a/llama/llama.cpp/src/llama-arch.h
+++ b/llama/llama.cpp/src/llama-arch.h
@@ -24,7 +24,6 @@ enum llm_arch {
    LLM_ARCH_STARCODER,
    LLM_ARCH_REFACT,
    LLM_ARCH_BERT,
-    LLM_ARCH_MODERN_BERT,
    LLM_ARCH_NOMIC_BERT,
    LLM_ARCH_NOMIC_BERT_MOE,
    LLM_ARCH_NEO_BERT,
@@ -46,7 +45,6 @@ enum llm_arch {
    LLM_ARCH_PHIMOE,
    LLM_ARCH_PLAMO,
    LLM_ARCH_PLAMO2,
-    LLM_ARCH_PLAMO3,
    LLM_ARCH_CODESHELL,
    LLM_ARCH_ORION,
    LLM_ARCH_INTERNLM2,
@@ -85,7 +83,6 @@ enum llm_arch {
    LLM_ARCH_NEMOTRON_H_MOE,
    LLM_ARCH_EXAONE,
    LLM_ARCH_EXAONE4,
-    LLM_ARCH_EXAONE_MOE,
    LLM_ARCH_RWKV6,
    LLM_ARCH_RWKV6QWEN2,
    LLM_ARCH_RWKV7,
@@ -122,9 +119,6 @@ enum llm_arch {
    LLM_ARCH_RND1,
    LLM_ARCH_PANGU_EMBED,
    LLM_ARCH_MISTRAL3,
-    LLM_ARCH_MIMO2,
-    LLM_ARCH_LLAMA_EMBED,
-    LLM_ARCH_MAINCODER,
    LLM_ARCH_UNKNOWN,
 };

@@ -158,7 +152,6 @@ enum llm_kv {
    LLM_KV_VOCAB_SIZE,
    LLM_KV_CONTEXT_LENGTH,
    LLM_KV_EMBEDDING_LENGTH,
-    LLM_KV_EMBEDDING_LENGTH_OUT,
    LLM_KV_FEATURES_LENGTH,
    LLM_KV_BLOCK_COUNT,
    LLM_KV_LEADING_DENSE_BLOCK_COUNT,
@@ -216,7 +209,6 @@ enum llm_kv {
    LLM_KV_ATTENTION_GATE_LORA_RANK,
    LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
    LLM_KV_ATTENTION_SLIDING_WINDOW,
-    LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN,
    LLM_KV_ATTENTION_SCALE,
    LLM_KV_ATTENTION_OUTPUT_SCALE,
    LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
@@ -228,7 +220,6 @@ enum llm_kv {
    LLM_KV_ROPE_DIMENSION_COUNT,
    LLM_KV_ROPE_DIMENSION_SECTIONS,
    LLM_KV_ROPE_FREQ_BASE,
-    LLM_KV_ROPE_FREQ_BASE_SWA,
    LLM_KV_ROPE_SCALE_LINEAR,
    LLM_KV_ROPE_SCALING_TYPE,
    LLM_KV_ROPE_SCALING_FACTOR,
--- a/llama/llama.cpp/src/llama-chat.cpp
+++ b/llama/llama.cpp/src/llama-chat.cpp
@@ -57,7 +57,6 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
    { "minicpm",           LLM_CHAT_TEMPLATE_MINICPM           },
    { "exaone3",           LLM_CHAT_TEMPLATE_EXAONE_3          },
    { "exaone4",           LLM_CHAT_TEMPLATE_EXAONE_4          },
-    { "exaone-moe",        LLM_CHAT_TEMPLATE_EXAONE_MOE        },
    { "rwkv-world",        LLM_CHAT_TEMPLATE_RWKV_WORLD        },
    { "granite",           LLM_CHAT_TEMPLATE_GRANITE           },
    { "gigachat",          LLM_CHAT_TEMPLATE_GIGACHAT          },
@@ -75,7 +74,6 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
    { "seed_oss",          LLM_CHAT_TEMPLATE_SEED_OSS          },
    { "grok-2",            LLM_CHAT_TEMPLATE_GROK_2            },
    { "pangu-embedded",    LLM_CHAT_TEMPLATE_PANGU_EMBED       },
-    { "solar-open",        LLM_CHAT_TEMPLATE_SOLAR_OPEN        },
 };

 llm_chat_template llm_chat_template_from_str(const std::string & name) {
@@ -138,9 +136,6 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
    } else if (tmpl_contains("[gMASK]<sop>")) {
        return LLM_CHAT_TEMPLATE_CHATGLM_4;
    } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
-        if (tmpl_contains("<|tool_declare|>")) {
-            return LLM_CHAT_TEMPLATE_EXAONE_MOE;
-        }
        return tmpl_contains("</s>") ? LLM_CHAT_TEMPLATE_FALCON_3 : LLM_CHAT_TEMPLATE_GLMEDGE;
    } else if (tmpl_contains("<|{{ item['role'] }}|>") && tmpl_contains("<|begin_of_image|>")) {
        return LLM_CHAT_TEMPLATE_GLMEDGE;
@@ -221,8 +216,6 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
        return LLM_CHAT_TEMPLATE_GROK_2;
    } else if (tmpl_contains(LU8("[unused9]系统：[unused10]"))) {
        return LLM_CHAT_TEMPLATE_PANGU_EMBED;
-    } else if (tmpl_contains("<|begin|>") && tmpl_contains("<|end|>") && tmpl_contains("<|content|>")) {
-        return LLM_CHAT_TEMPLATE_SOLAR_OPEN;
    }
    return LLM_CHAT_TEMPLATE_UNKNOWN;
 }
@@ -580,22 +573,6 @@ int32_t llm_chat_apply_template(
        if (add_ass) {
            ss << "[|assistant|]";
        }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_EXAONE_MOE) {
-        for (auto message : chat) {
-            std::string role(message->role);
-            if (role == "system") {
-                ss << "<|system|>\n" << trim(message->content) << "<|endofturn|>\n";
-            } else if (role == "user") {
-                ss << "<|user|>\n" << trim(message->content) << "<|endofturn|>\n";
-            } else if (role == "assistant") {
-                ss << "<|assistant|>\n" << trim(message->content) << "<|endofturn|>\n";
-            } else if (role == "tool") {
-                ss << "<|tool|>\n" << trim(message->content) << "<|endofturn|>\n";
-            }
-        }
-        if (add_ass) {
-            ss << "<|assistant|>\n";
-        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_RWKV_WORLD) {
        // this template requires the model to have "\n\n" as EOT token
        for (size_t i = 0; i < chat.size(); i++) {
@@ -868,14 +845,6 @@ int32_t llm_chat_apply_template(
        if (add_ass) {
            ss << "[unused9]助手：";
        }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_SOLAR_OPEN) {
-        for (auto message : chat) {
-            std::string role(message->role);
-            ss << "<|begin|>" << role << "<|content|>" << message->content << "<|end|>";
-        }
-        if (add_ass) {
-            ss << "<|begin|>assistant";
-        }
    } else {
        // template not supported
        return -1;
--- a/llama/llama.cpp/src/llama-chat.h
+++ b/llama/llama.cpp/src/llama-chat.h
@@ -36,7 +36,6 @@ enum llm_chat_template {
    LLM_CHAT_TEMPLATE_MINICPM,
    LLM_CHAT_TEMPLATE_EXAONE_3,
    LLM_CHAT_TEMPLATE_EXAONE_4,
-    LLM_CHAT_TEMPLATE_EXAONE_MOE,
    LLM_CHAT_TEMPLATE_RWKV_WORLD,
    LLM_CHAT_TEMPLATE_GRANITE,
    LLM_CHAT_TEMPLATE_GIGACHAT,
@@ -55,7 +54,6 @@ enum llm_chat_template {
    LLM_CHAT_TEMPLATE_SEED_OSS,
    LLM_CHAT_TEMPLATE_GROK_2,
    LLM_CHAT_TEMPLATE_PANGU_EMBED,
-    LLM_CHAT_TEMPLATE_SOLAR_OPEN,
    LLM_CHAT_TEMPLATE_UNKNOWN,
 };

--- a/llama/llama.cpp/src/llama-context.cpp
+++ b/llama/llama.cpp/src/llama-context.cpp
--- a/llama/llama.cpp/src/llama-context.h
+++ b/llama/llama.cpp/src/llama-context.h
@@ -40,14 +40,6 @@ struct llama_context {

    ~llama_context();

-    // reserve a new backend scheduler (if needed)
-    // for example, when:
-    //   - changing loras
-    //   - changing samplers
-    //   - changing attention type
-    //   - etc.
-    void sched_reserve();
-
    void synchronize();

    const llama_model   & get_model()   const;
@@ -78,18 +70,6 @@ struct llama_context {
    float * get_embeddings_ith(int32_t i);
    float * get_embeddings_seq(llama_seq_id seq_id);

-    llama_token * get_sampled_tokens() const;
-    llama_token   get_sampled_token_ith(int32_t idx);
-
-    float * get_sampled_logits_ith(int32_t idx);
-    size_t  get_sampled_logits_count(int32_t idx);
-
-    float * get_sampled_probs_ith(int32_t idx);
-    size_t  get_sampled_probs_count(int32_t idx);
-
-    const llama_token * get_sampled_candidates_ith(int32_t idx);
-    size_t get_sampled_candidates_count(int32_t idx);
-
    void attach_threadpool(
            ggml_threadpool_t threadpool,
            ggml_threadpool_t threadpool_batch);
@@ -212,13 +192,10 @@ private:

    // Make sure enough space is available for outputs.
    // Returns max number of outputs for which space was reserved.
-    uint32_t output_reserve(int32_t n_outputs, const llama_batch & batch);
+    uint32_t output_reserve(int32_t n_outputs);

    void output_reorder();

-    // map the output row index `i` to batch index
-    int64_t output_resolve_row(int32_t i) const;
-
    //
    // graph
    //
@@ -236,8 +213,6 @@ public:
    ggml_cgraph * graph_reserve(
        uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false, size_t * sizes = nullptr);

-    bool set_sampler(llama_seq_id seq_id, llama_sampler * sampler);
-
 private:
    llm_graph_params graph_params(
                        llm_graph_result * res,
@@ -277,31 +252,6 @@ private:
    size_t  embd_size = 0; // capacity (of floats) for embeddings
    float * embd      = nullptr;

-    // TODO: simplify
-    struct sampling_info {
-        std::map<llama_seq_id, llama_sampler *> samplers;
-
-        float       * logits      = nullptr;
-        size_t        logits_size = 0;
-
-        llama_token * sampled      = nullptr;
-        size_t        sampled_size = 0;
-
-        float       * probs        = nullptr;
-        size_t        probs_size   = 0;
-
-        llama_token * candidates   = nullptr;
-        size_t        candidates_size = 0;
-
-        std::vector<uint32_t> logits_count;
-        std::vector<uint32_t> probs_count;
-        std::vector<uint32_t> candidates_count;
-
-        std::vector<llama_token> token_ids_full_vocab;
-    };
-
-    sampling_info sampling;
-
    // sequence embeddings output (map of [n_embd] vectors)
    // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
    std::map<llama_seq_id, std::vector<float>> embd_seq;
@@ -322,8 +272,6 @@ private:

    ggml_backend_sched_ptr sched;

-    bool sched_need_reserve = true;
-
    ggml_backend_t backend_cpu = nullptr;
    std::vector<ggml_backend_ptr> backends;

--- a/llama/llama.cpp/src/llama-cparams.h
+++ b/llama/llama.cpp/src/llama-cparams.h
@@ -30,12 +30,10 @@ struct llama_cparams {
    bool causal_attn;
    bool offload_kqv;
    bool flash_attn;
-    bool auto_fa;
    bool no_perf;
    bool warmup;
    bool op_offload;
    bool kv_unified;
-    bool pipeline_parallel;

    enum llama_pooling_type pooling_type;

--- a/llama/llama.cpp/src/llama-grammar.cpp
+++ b/llama/llama.cpp/src/llama-grammar.cpp
@@ -369,44 +369,6 @@ static void print_rule(
    fprintf(file, "\n");
 }

-//
-// Regex utilities
-//
-
-size_t llama_grammar_trigger_pattern::find(const std::string & input) const {
-    auto find_start_pos = [](const std::smatch & match) {
-        // get from the first matched capturing group to the end of the string
-        size_t start = std::string::npos;
-        for (auto i = 1u; i < match.size(); i++) {
-            if (match.length(i) > 0) {
-                start = match.position(i);
-                break;
-            }
-        }
-        if (start == std::string::npos) {
-            start = match.position(0);
-        }
-        return start;
-    };
-
-    if (!pattern.empty() && pattern.front() == '^' && pattern.back() == '$') {
-        // match against the entire input
-        std::smatch match;
-        if (std::regex_match(input, match, regex)) {
-            return find_start_pos(match);
-        }
-    }
-
-    // search anywhere
-    std::smatch match;
-    if (std::regex_search(input, match, regex)) {
-        return find_start_pos(match);
-    }
-
-    return std::string::npos;
-}
-
-
 //
 // implementation
 //
@@ -1359,10 +1321,21 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
            grammar.trigger_buffer_positions.push_back(std::make_pair(token, position));
            grammar.trigger_buffer += piece;

+            std::smatch match;
            for (const auto & trigger_pattern : grammar.trigger_patterns) {
-                auto start = trigger_pattern.find(grammar.trigger_buffer);
-                if (start != std::string::npos) {
+                if (std::regex_match(grammar.trigger_buffer, match, trigger_pattern.regex)) {
                    grammar.awaiting_trigger = false;
+                    // get from the first matched capturing group to the end of the string
+                    size_t start = std::string::npos;
+                    for (auto i = 1u; i < match.size(); i++) {
+                        if (match.length(i) > 0) {
+                            start = match.position(i);
+                            break;
+                        }
+                    }
+                    if (start == std::string::npos) {
+                        start = match.position(0);
+                    }

                    // replay tokens that overlap with [start, end)
                    for (const auto & [tok, tok_pos] : grammar.trigger_buffer_positions) {
--- a/llama/llama.cpp/src/llama-grammar.h
+++ b/llama/llama.cpp/src/llama-grammar.h
@@ -130,8 +130,6 @@ struct llama_grammar_parser {
 struct llama_grammar_trigger_pattern {
    std::string pattern;
    std::regex  regex;
-
-    size_t find(const std::string & input) const;
 };

 struct llama_grammar {
--- a/llama/llama.cpp/src/llama-graph.cpp
+++ b/llama/llama.cpp/src/llama-graph.cpp
@@ -7,13 +7,11 @@
 #include "llama-kv-cache.h"
 #include "llama-kv-cache-iswa.h"
 #include "llama-memory-hybrid.h"
-#include "llama-memory-hybrid-iswa.h"
 #include "llama-memory-recurrent.h"

 #include <cassert>
 #include <cmath>
 #include <cstring>
-#include <unordered_set>

 void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
    if (ubatch->token) {
@@ -23,8 +21,7 @@ void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
    }

    if (ubatch->embd) {
-        GGML_ASSERT(n_embd == embd->ne[0]);
-
+        const int64_t n_embd   = embd->ne[0];
        const int64_t n_tokens = ubatch->n_tokens;

        ggml_backend_tensor_set(embd, ubatch->embd, 0, n_tokens*n_embd*ggml_element_size(embd));
@@ -34,8 +31,8 @@ void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
 bool llm_graph_input_embd::can_reuse(const llm_graph_params & params) {
    bool res = true;

-    res &= (!params.ubatch.token) || (tokens && tokens->ne[0] == params.ubatch.n_tokens);
-    res &= (!params.ubatch.embd)  || (embd   &&   embd->ne[1] == params.ubatch.n_tokens);
+    res &= (!tokens && !params.ubatch.token) || (tokens && tokens->ne[0] == params.ubatch.n_tokens);
+    res &= (!embd   && !params.ubatch.embd)  || (embd   &&   embd->ne[0] == params.ubatch.n_tokens);

    return res;
 }
@@ -65,7 +62,7 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
 bool llm_graph_input_pos::can_reuse(const llm_graph_params & params) {
    bool res = true;

-    res &= pos->ne[0] == params.ubatch.n_tokens*n_pos_per_embd;
+    res &= pos->ne[0] == params.ubatch.n_tokens;

    return res;
 }
@@ -98,9 +95,11 @@ void llm_graph_input_pos_bucket::set_input(const llama_ubatch * ubatch) {

        int32_t * data = (int32_t *) pos_bucket->data;

-        for (int j = 0; j < n_tokens; ++j) {
-            for (int i = 0; i < n_tokens; ++i) {
-                data[j*n_tokens + i] = llama_relative_position_bucket(ubatch->pos[i], ubatch->pos[j], hparams.n_rel_attn_bkts, true);
+        for (int h = 0; h < 1; ++h) {
+            for (int j = 0; j < n_tokens; ++j) {
+                for (int i = 0; i < n_tokens; ++i) {
+                    data[h*(n_tokens*n_tokens) + j*n_tokens + i] = llama_relative_position_bucket(ubatch->pos[i], ubatch->pos[j], hparams.n_rel_attn_bkts, true);
+                }
            }
        }
    }
@@ -323,32 +322,34 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
    const int64_t n_tokens = ubatch->n_tokens;

    const auto fill_mask = [&](float * data, int n_swa, llama_swa_type swa_type) {
-        for (int i1 = 0; i1 < n_tokens; ++i1) {
-            const llama_seq_id s1 = ubatch->seq_id[i1][0];
-            const llama_pos    p1 = ubatch->pos[i1];
+        for (int h = 0; h < 1; ++h) {
+            for (int i1 = 0; i1 < n_tokens; ++i1) {
+                const llama_seq_id s1 = ubatch->seq_id[i1][0];
+                const llama_pos    p1 = ubatch->pos[i1];

-            const uint64_t idst = i1*n_kv;
+                const uint64_t idst = h*(n_kv*n_tokens) + i1*n_kv;

-            for (int i0 = 0; i0 < n_tokens; ++i0) {
-                const llama_seq_id s0 = ubatch->seq_id[i0][0];
-                const llama_pos p0    = ubatch->pos[i0];
+                for (int i0 = 0; i0 < n_tokens; ++i0) {
+                    const llama_seq_id s0 = ubatch->seq_id[i0][0];
+                    const llama_pos p0    = ubatch->pos[i0];

-                // mask different sequences
-                if (s0 != s1) {
-                    continue;
+                    // mask different sequences
+                    if (s0 != s1) {
+                        continue;
+                    }
+
+                    // mask future tokens
+                    if (cparams.causal_attn && p0 > p1) {
+                        continue;
+                    }
+
+                    // apply SWA if any
+                    if (llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1)) {
+                        continue;
+                    }
+
+                    data[idst + i0] = hparams.use_alibi ? -std::abs(p0 - p1) : 0.0f;
                }
-
-                // mask future tokens
-                if (cparams.causal_attn && p0 > p1) {
-                    continue;
-                }
-
-                // apply SWA if any
-                if (llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1)) {
-                    continue;
-                }
-
-                data[idst + i0] = hparams.use_alibi ? -std::abs(p0 - p1) : 0.0f;
            }
        }
    };
@@ -407,27 +408,6 @@ bool llm_graph_input_attn_kv::can_reuse(const llm_graph_params & params) {
    return res;
 }

-void llm_graph_input_attn_k::set_input(const llama_ubatch * ubatch) {
-    mctx->set_input_k_idxs(self_k_idxs, ubatch);
-
-    mctx->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
-}
-
-bool llm_graph_input_attn_k::can_reuse(const llm_graph_params & params) {
-    const auto * mctx = static_cast<const llama_kv_cache_context *>(params.mctx);
-
-    this->mctx = mctx;
-
-    bool res = true;
-
-    res &= self_k_idxs->ne[0] == params.ubatch.n_tokens;
-
-    res &= self_kq_mask->ne[0] == mctx->get_n_kv();
-    res &= self_kq_mask->ne[1] == params.ubatch.n_tokens;
-
-    return res;
-}
-
 void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) {
    mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch);
    mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch);
@@ -473,19 +453,27 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {

    float * data = (float *) cross_kq_mask->data;

-    for (int i = 0; i < n_tokens; ++i) {
-        for (int j = 0; j < n_enc; ++j) {
-            float f = -INFINITY;
+    for (int h = 0; h < 1; ++h) {
+        for (int i = 0; i < n_tokens; ++i) {
+            for (int j = 0; j < n_enc; ++j) {
+                float f = -INFINITY;

-            for (int s = 0; s < ubatch->n_seq_id[i]; ++s) {
-                const llama_seq_id seq_id = ubatch->seq_id[i][s];
+                for (int s = 0; s < ubatch->n_seq_id[i]; ++s) {
+                    const llama_seq_id seq_id = ubatch->seq_id[i][s];

-                if (cross->seq_ids_enc[j].find(seq_id) != cross->seq_ids_enc[j].end()) {
-                    f = 0.0f;
+                    if (cross->seq_ids_enc[j].find(seq_id) != cross->seq_ids_enc[j].end()) {
+                        f = 0.0f;
+                    }
                }
-            }

-            data[i*n_enc + j] = f;
+                data[h*(n_enc*n_tokens) + i*n_enc + j] = f;
+            }
+        }
+
+        for (int i = n_tokens; i < n_tokens; ++i) {
+            for (int j = 0; j < n_enc; ++j) {
+                data[h*(n_enc*n_tokens) + i*n_enc + j] = -INFINITY;
+            }
        }
    }
 }
@@ -533,113 +521,6 @@ bool llm_graph_input_mem_hybrid::can_reuse(const llm_graph_params & params) {
    return res;
 }

-void llm_graph_input_mem_hybrid_iswa::set_input(const llama_ubatch * ubatch) {
-    const auto * attn_ctx = mctx->get_attn();
-
-    // base tensors may not be allocated if there are no non-SWA attention layers
-    if (inp_attn->self_k_idxs && inp_attn->self_k_idxs->buffer) {
-        attn_ctx->get_base()->set_input_k_idxs(inp_attn->self_k_idxs, ubatch);
-        attn_ctx->get_base()->set_input_v_idxs(inp_attn->self_v_idxs, ubatch);
-
-        attn_ctx->get_base()->set_input_kq_mask(inp_attn->self_kq_mask, ubatch, cparams.causal_attn);
-    }
-
-    // swa tensors may not be allocated if there are no SWA attention layers
-    if (inp_attn->self_k_idxs_swa && inp_attn->self_k_idxs_swa->buffer) {
-        attn_ctx->get_swa()->set_input_k_idxs(inp_attn->self_k_idxs_swa, ubatch);
-        attn_ctx->get_swa()->set_input_v_idxs(inp_attn->self_v_idxs_swa, ubatch);
-
-        attn_ctx->get_swa()->set_input_kq_mask(inp_attn->self_kq_mask_swa, ubatch, cparams.causal_attn);
-    }
-
-    const int64_t n_rs = mctx->get_recr()->get_n_rs();
-
-    if (inp_rs->s_copy) {
-        GGML_ASSERT(ggml_backend_buffer_is_host(inp_rs->s_copy->buffer));
-        int32_t * data = (int32_t *) inp_rs->s_copy->data;
-
-        // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n
-        for (uint32_t i = 0; i < n_rs; ++i) {
-            data[i] = mctx->get_recr()->s_copy(i);
-        }
-    }
-}
-
-bool llm_graph_input_mem_hybrid_iswa::can_reuse(const llm_graph_params & params) {
-    const auto * mctx = static_cast<const llama_memory_hybrid_iswa_context *>(params.mctx);
-
-    this->mctx = mctx;
-
-    bool res = true;
-
-    const auto * attn_ctx = mctx->get_attn();
-
-    // base tensors may not be allocated if there are no non-SWA attention layers
-    if (inp_attn->self_k_idxs && inp_attn->self_k_idxs->buffer) {
-        res &= inp_attn->self_k_idxs->ne[0] == params.ubatch.n_tokens;
-      //res &= inp_attn->self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
-
-        res &= inp_attn->self_kq_mask->ne[0] == attn_ctx->get_base()->get_n_kv();
-        res &= inp_attn->self_kq_mask->ne[1] == params.ubatch.n_tokens;
-    }
-
-    // swa tensors may not be allocated if there are no SWA attention layers
-    if (inp_attn->self_k_idxs_swa && inp_attn->self_k_idxs_swa->buffer) {
-        res &= inp_attn->self_k_idxs_swa->ne[0] == params.ubatch.n_tokens;
-      //res &= inp_attn->self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
-
-        res &= inp_attn->self_kq_mask_swa->ne[0] == attn_ctx->get_swa()->get_n_kv();
-        res &= inp_attn->self_kq_mask_swa->ne[1] == params.ubatch.n_tokens;
-    }
-
-    res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs();
-
-    res &= inp_rs->s_copy_main->ne[0]  == params.ubatch.n_seqs;
-    res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs;
-
-    res &= inp_rs->head == mctx->get_recr()->get_head();
-    res &= inp_rs->rs_z == mctx->get_recr()->get_rs_z();
-
-    return res;
-}
-
-void llm_graph_input_sampling::set_input(const llama_ubatch * ubatch) {
-    // set the inputs only for the active samplers in the current ubatch
-    std::unordered_set<llama_seq_id> active_samplers;
-    for (uint32_t i = 0; i < ubatch->n_tokens; i++) {
-        if (ubatch->output[i]) {
-            llama_seq_id seq_id = ubatch->seq_id[i][0];
-            active_samplers.insert(seq_id);
-        }
-    }
-
-    for (auto seq_id : active_samplers) {
-        if (samplers.find(seq_id) == samplers.end()) {
-            continue;
-        }
-
-        auto & sampler = samplers[seq_id];
-
-        if (sampler->iface->backend_set_input) {
-            sampler->iface->backend_set_input(sampler);
-        }
-    }
-}
-
-bool llm_graph_input_sampling::can_reuse(const llm_graph_params & params) {
-    if (samplers.size() != params.samplers.size()) {
-        return false;
-    }
-
-    for (const auto & [seq_id, sampler] : params.samplers) {
-        if (samplers[seq_id] != sampler) {
-            return false;
-        }
-    }
-
-    return true;
-}
-
 //
 // llm_graph_result
 //
@@ -656,15 +537,10 @@ int64_t llm_graph_result::get_max_nodes() const {
 }

 void llm_graph_result::reset() {
-    t_inp_tokens  = nullptr;
-    t_inp_embd    = nullptr;
+    t_tokens      = nullptr;
    t_logits      = nullptr;
    t_embd        = nullptr;
    t_embd_pooled = nullptr;
-    t_sampled.clear();
-    t_sampled_probs.clear();
-    t_sampled_logits.clear();
-    t_candidates.clear();

    params = {};

@@ -689,38 +565,6 @@ void llm_graph_result::set_inputs(const llama_ubatch * ubatch) {
    }
 }

-void llm_graph_result::set_outputs() {
-    if (t_logits != nullptr) {
-        ggml_set_output(t_logits);
-    }
-    if (t_embd != nullptr) {
-        ggml_set_output(t_embd);
-    }
-    if (t_embd_pooled != nullptr) {
-        ggml_set_output(t_embd_pooled);
-    }
-    for (auto & [seq_id, t] : t_sampled) {
-        if (t != nullptr) {
-            ggml_set_output(t);
-        }
-    }
-    for (auto & [seq_id, t] : t_sampled_probs) {
-        if (t != nullptr) {
-            ggml_set_output(t);
-        }
-    }
-    for (auto & [seq_id, t] : t_sampled_logits) {
-        if (t != nullptr) {
-            ggml_set_output(t);
-        }
-    }
-    for (auto & [seq_id, t] : t_candidates) {
-        if (t != nullptr) {
-            ggml_set_output(t);
-        }
-    }
-}
-
 bool llm_graph_result::can_reuse(const llm_graph_params & params) {
    if (!this->params.allow_reuse(params)) {
        if (debug > 1) {
@@ -802,7 +646,6 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
    loras            (params.loras),
    mctx             (params.mctx),
    cross            (params.cross),
-    samplers         (params.samplers),
    cb_func          (params.cb),
    res              (params.res),
    ctx0             (res->get_ctx()),
@@ -1361,29 +1204,17 @@ ggml_tensor * llm_graph_context::build_moe_ffn(

 // input embeddings with optional lora
 ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
-    const int64_t n_embd_inp = hparams.n_embd_inp();
-    const int64_t n_embd     = hparams.n_embd;
+    const int64_t n_embd = hparams.n_embd_inp();

-    assert(n_embd_inp >= n_embd);
+    auto inp = std::make_unique<llm_graph_input_embd>();

-    auto inp = std::make_unique<llm_graph_input_embd>(n_embd_inp);
+    ggml_tensor * cur = nullptr;

-    inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
-    cb(inp->tokens, "inp_tokens", -1);
-    ggml_set_input(inp->tokens);
-    res->t_inp_tokens = inp->tokens;
-
-    inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd_inp, ubatch.n_tokens);
-    cb(inp->embd, "inp_embd", -1);
-    ggml_set_input(inp->embd);
-
-    // select one of the 2 inputs, based on the batch contents
-    // ref: https://github.com/ggml-org/llama.cpp/pull/18550
-    std::array<ggml_tensor *, 2> inps;
-
-    // token embeddings path (ubatch.token != nullptr)
-    {
-        auto & cur = inps[0];
+    if (ubatch.token) {
+        inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
+        //cb(inp->tokens, "inp_tokens", -1);
+        ggml_set_input(inp->tokens);
+        res->t_tokens = inp->tokens;

        cur = ggml_get_rows(ctx0, tok_embd, inp->tokens);

@@ -1404,43 +1235,22 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {

            cur = ggml_add(ctx0, cur, inpL_delta);
        }
-
-        if (n_embd_inp != n_embd) {
-            cur = ggml_pad(ctx0, cur, hparams.n_embd_inp() - n_embd, 0, 0, 0);
-        }
-    }
-
-    // vector embeddings path (ubatch.embd != nullptr)
-    {
-        auto & cur = inps[1];
+    } else {
+        inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ubatch.n_tokens);
+        ggml_set_input(inp->embd);

        cur = inp->embd;
    }

-    assert(ggml_are_same_shape (inps[0], inps[1]));
-    assert(ggml_are_same_stride(inps[0], inps[1]));
-
-    ggml_tensor * cur = ggml_build_forward_select(gf, inps.data(), inps.size(), ubatch.token ? 0 : 1);
-
-    if (n_embd_inp != n_embd) {
-        cur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0);
-    }
-
-    res->t_inp_embd = cur;
-
    // For Granite architecture
    if (hparams.f_embedding_scale != 0.0f) {
        cur = ggml_scale(ctx0, cur, hparams.f_embedding_scale);
    }

-    cb(cur, "embd", -1);
+    cb(cur, "inp_embd", -1);

    res->add_input(std::move(inp));

-    // make sure the produced embeddings are immediately materialized in the ggml graph
-    // ref: https://github.com/ggml-org/llama.cpp/pull/18599
-    ggml_build_forward_expand(gf, cur);
-
    return cur;
 }

@@ -1532,7 +1342,7 @@ ggml_tensor * llm_graph_context::build_inp_cross_embd() const {
    //}

    const auto n_embd = !cross->v_embd.empty() ? cross->n_embd : hparams.n_embd_inp();
-    const auto n_enc  = !cross->v_embd.empty() ? cross->n_enc  : hparams.n_ctx_train;
+    const auto n_enc  = !cross->v_embd.empty() ? cross->n_enc : hparams.n_ctx_train;

    cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_enc);
    ggml_set_input(cur);
@@ -1630,11 +1440,6 @@ ggml_tensor * llm_graph_context::build_attn_mha(
                                  hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f);
        cb(cur, LLAMA_TENSOR_NAME_FATTN, il);

-        if (!cparams.offload_kqv) {
-            // all nodes between the KV store and the attention output are run on the CPU
-            ggml_backend_sched_set_tensor_backend(sched, cur, backend_cpu);
-        }
-
        ggml_flash_attn_ext_add_sinks(cur, sinks);
        ggml_flash_attn_ext_set_prec (cur, GGML_PREC_F32);

@@ -1844,11 +1649,9 @@ ggml_tensor * llm_graph_context::build_attn(
        ggml_tensor * v_cur,
        ggml_tensor * kq_b,
        ggml_tensor * sinks,
-        ggml_tensor * v_mla, // TODO: remove
+        ggml_tensor * v_mla,
            float     kq_scale,
            int       il) const {
-    GGML_ASSERT(v_mla == nullptr);
-
    // these nodes are added to the graph together so that they are not reordered
    // by doing so, the number of splits in the graph is reduced
    // expand k later to enable rope fusion which directly writes into k-v cache
@@ -1891,93 +1694,6 @@ ggml_tensor * llm_graph_context::build_attn(
    return cur;
 }

-static std::unique_ptr<llm_graph_input_attn_k> build_attn_inp_k_impl(
-           ggml_context * ctx0,
-     const llama_ubatch & ubatch,
-    const llama_hparams & hparams,
-    const llama_cparams & cparams,
-    const llama_kv_cache_context * mctx_cur) {
-
-    auto inp = std::make_unique<llm_graph_input_attn_k>(hparams, cparams, mctx_cur);
-
-    {
-        GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_iswa for SWA");
-
-        const auto n_kv     = mctx_cur->get_n_kv();
-        const auto n_tokens = ubatch.n_tokens;
-        const auto n_stream = cparams.kv_unified ? 1 : ubatch.n_seqs_unq;
-
-        inp->self_k_idxs = mctx_cur->build_input_k_idxs(ctx0, ubatch);
-
-        inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
-        ggml_set_input(inp->self_kq_mask);
-
-        inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
-    }
-
-    return inp;
-}
-
-llm_graph_input_attn_k * llm_graph_context::build_attn_inp_k() const {
-    const auto * mctx_cur = static_cast<const llama_kv_cache_context *>(mctx);
-
-    auto inp = build_attn_inp_k_impl(ctx0, ubatch, hparams, cparams, mctx_cur);
-
-    return (llm_graph_input_attn_k *) res->add_input(std::move(inp));
-}
-
-ggml_tensor * llm_graph_context::build_attn(
-        llm_graph_input_attn_k * inp,
-        ggml_tensor * wo,
-        ggml_tensor * wo_b,
-        ggml_tensor * q_cur,
-        ggml_tensor * k_cur,
-        ggml_tensor * v_cur,
-        ggml_tensor * kq_b,
-        ggml_tensor * sinks,
-        ggml_tensor * v_mla,
-            float     kq_scale,
-            int       il) const {
-    // these nodes are added to the graph together so that they are not reordered
-    // by doing so, the number of splits in the graph is reduced
-    // expand k later to enable rope fusion which directly writes into k-v cache
-    ggml_build_forward_expand(gf, q_cur);
-    ggml_build_forward_expand(gf, v_cur);
-    ggml_build_forward_expand(gf, k_cur);
-
-    const auto * mctx_cur = inp->mctx;
-
-    // store to KV cache
-    {
-        const auto & k_idxs = inp->get_k_idxs();
-
-        ggml_build_forward_expand(gf, mctx_cur->cpy_k(ctx0, k_cur, k_idxs, il));
-    }
-
-    const auto & kq_mask = inp->get_kq_mask();
-
-    ggml_tensor * q = q_cur;
-    ggml_tensor * k = mctx_cur->get_k(ctx0, il);
-    ggml_tensor * v = ggml_view_4d(ctx0, k, v_cur->ne[0], k->ne[1], k->ne[2], k->ne[3], k->nb[1], k->nb[2], k->nb[3], 0);
-
-    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
-    cb(cur, "kqv_out", il);
-
-    if (wo) {
-        cur = build_lora_mm(wo, cur);
-        if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) {
-            // GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators
-            ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
-        }
-    }
-
-    if (wo_b) {
-        cur = ggml_add(ctx0, cur, wo_b);
-    }
-
-    return cur;
-}
-
 ggml_tensor * llm_graph_context::build_attn(
        llm_graph_input_attn_kv_iswa * inp,
        ggml_tensor * wo,
@@ -2118,10 +1834,8 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const

        inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
        ggml_set_input(inp->self_kq_mask);
-        ggml_set_name(inp->self_kq_mask, "self_kq_mask");

        inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
-        ggml_set_name(inp->self_kq_mask_cnv, "self_kq_mask_cnv");
    }

    {
@@ -2134,10 +1848,8 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const

        inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
        ggml_set_input(inp->self_kq_mask_swa);
-        ggml_set_name(inp->self_kq_mask_swa, "self_kq_mask_swa");

        inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
-        ggml_set_name(inp->self_kq_mask_swa_cnv, "self_kq_mask_swa_cnv");
    }

    return (llm_graph_input_attn_kv_iswa *) res->add_input(std::move(inp));
@@ -2273,62 +1985,17 @@ llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
    return (llm_graph_input_mem_hybrid *) res->add_input(std::move(inp));
 }

-llm_graph_input_mem_hybrid_iswa * llm_graph_context::build_inp_mem_hybrid_iswa() const {
-    const auto * mctx_cur = static_cast<const llama_memory_hybrid_iswa_context *>(mctx);
-
-    auto inp_rs = build_rs_inp_impl(ctx0, ubatch, mctx_cur->get_recr());
-
-    // build iswa attention input
-    const auto * attn_ctx = mctx_cur->get_attn();
-
-    auto inp_attn = std::make_unique<llm_graph_input_attn_kv_iswa>(hparams, cparams, attn_ctx);
-
-    const auto n_stream = cparams.kv_unified ? 1 : ubatch.n_seqs_unq;
-
-    {
-        const auto n_kv = attn_ctx->get_base()->get_n_kv();
-
-        inp_attn->self_k_idxs = attn_ctx->get_base()->build_input_k_idxs(ctx0, ubatch);
-        inp_attn->self_v_idxs = attn_ctx->get_base()->build_input_v_idxs(ctx0, ubatch);
-
-        inp_attn->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
-        ggml_set_input(inp_attn->self_kq_mask);
-
-        inp_attn->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp_attn->self_kq_mask, GGML_TYPE_F16) : inp_attn->self_kq_mask;
-    }
-
-    {
-        const auto n_kv = attn_ctx->get_swa()->get_n_kv();
-
-        inp_attn->self_k_idxs_swa = attn_ctx->get_swa()->build_input_k_idxs(ctx0, ubatch);
-        inp_attn->self_v_idxs_swa = attn_ctx->get_swa()->build_input_v_idxs(ctx0, ubatch);
-
-        inp_attn->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
-        ggml_set_input(inp_attn->self_kq_mask_swa);
-
-        inp_attn->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp_attn->self_kq_mask_swa, GGML_TYPE_F16) : inp_attn->self_kq_mask_swa;
-    }
-
-    auto inp = std::make_unique<llm_graph_input_mem_hybrid_iswa>(cparams, std::move(inp_attn), std::move(inp_rs), mctx_cur);
-
-    return (llm_graph_input_mem_hybrid_iswa *) res->add_input(std::move(inp));
-}
-
 void llm_graph_context::build_dense_out(
    ggml_tensor * dense_2,
    ggml_tensor * dense_3) const {
-    if (!cparams.embeddings || !(dense_2 || dense_3)) {
+    if (!cparams.embeddings || dense_2 == nullptr || dense_3 == nullptr) {
        return;
    }
    ggml_tensor * cur = res->t_embd_pooled != nullptr ? res->t_embd_pooled : res->t_embd;
    GGML_ASSERT(cur != nullptr && "missing t_embd_pooled/t_embd");

-    if (dense_2) {
-        cur = ggml_mul_mat(ctx0, dense_2, cur);
-    }
-    if (dense_3) {
-        cur = ggml_mul_mat(ctx0, dense_3, cur);
-    }
+    cur = ggml_mul_mat(ctx0, dense_2, cur);
+    cur = ggml_mul_mat(ctx0, dense_3, cur);
    cb(cur, "result_embd_pooled", -1);
    res->t_embd_pooled = cur;
    ggml_build_forward_expand(gf, cur);
@@ -2419,87 +2086,6 @@ void llm_graph_context::build_pooling(
    ggml_build_forward_expand(gf, cur);
 }

-void llm_graph_context::build_sampling() const {
-    if (samplers.empty() || !res->t_logits) {
-        return;
-    }
-
-    auto inp_sampling = std::make_unique<llm_graph_input_sampling>(samplers);
-    res->add_input(std::move(inp_sampling));
-
-    std::map<llama_seq_id, int32_t> seq_to_logit_row;
-    int32_t logit_row_idx = 0;
-
-    for (uint32_t i = 0; i < ubatch.n_tokens; i++) {
-        if (ubatch.output[i]) {
-            llama_seq_id seq_id = ubatch.seq_id[i][0];
-            seq_to_logit_row[seq_id] = logit_row_idx;
-            logit_row_idx++;
-        }
-    }
-
-    // res->t_logits will contain logits for all tokens that want the logits calculated (logits=1 or output=1)
-    GGML_ASSERT(res->t_logits != nullptr && "missing t_logits tensor");
-
-    // add a dummy row of logits
-    // this trick makes the graph static, regardless of which samplers are activated
-    // this is important in order to minimize graph reallocations
-    // TODO: use `ggml_build_forward_select()` when available (https://github.com/ggml-org/llama.cpp/pull/18550)
-    ggml_tensor * logits_t = ggml_pad(ctx0, res->t_logits, 0, 1, 0, 0);
-
-    for (const auto & [seq_id, sampler] : samplers) {
-        const auto it = seq_to_logit_row.find(seq_id);
-
-        // inactive samplers always work on the first row
-        const auto row_idx = seq_to_logit_row.find(seq_id) != seq_to_logit_row.end() ? it->second : 0;
-
-        ggml_tensor * logits_seq = ggml_view_1d(ctx0, logits_t, logits_t->ne[0], row_idx * logits_t->nb[1]);
-        ggml_format_name(logits_seq, "logits_seq_%d", seq_id);
-
-        struct llama_sampler_data data = {
-            /*.logits      =*/ logits_seq,
-            /*.probs       =*/ nullptr,
-            /*.sampled     =*/ nullptr,
-            /*.candidates  =*/ nullptr,
-        };
-
-        assert(sampler->iface->backend_apply);
-        sampler->iface->backend_apply(sampler, ctx0, gf, &data);
-
-        if (data.sampled != nullptr) {
-            res->t_sampled[seq_id] = data.sampled;
-            ggml_build_forward_expand(gf, data.sampled);
-        }
-
-        if (data.probs != nullptr) {
-            res->t_sampled_probs[seq_id] = data.probs;
-            ggml_build_forward_expand(gf, data.probs);
-        }
-
-        if (data.logits != nullptr) {
-            res->t_sampled_logits[seq_id] = data.logits;
-            ggml_build_forward_expand(gf, data.logits);
-        }
-
-        if (data.candidates != nullptr) {
-            res->t_candidates[seq_id] = data.candidates;
-            ggml_build_forward_expand(gf, data.candidates);
-        }
-    }
-
-    // TODO: Call llama_sampler_accept_ggml after all samplers have been applied.
-    /*
-    for (const auto & [seq_id, sampler] : samplers) {
-        if (auto it = res->t_sampled.find(seq_id); it != res->t_sampled.end()) {
-            ggml_tensor * selected_token = it->second;
-            if (selected_token != nullptr) {
-                llama_sampler_accept_ggml(sampler, ctx0, gf, selected_token);
-            }
-        }
-    }
-    */
-}
-
 int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) {
    // TODO move to hparams if a T5 variant appears that uses a different value
    const int64_t max_distance = 128;
--- a/llama/llama.cpp/src/llama-graph.h
+++ b/llama/llama.cpp/src/llama-graph.h
@@ -10,7 +10,6 @@
 #include <memory>
 #include <set>
 #include <functional>
-#include <map>

 struct ggml_cgraph;
 struct ggml_context;
@@ -24,7 +23,6 @@ class llama_kv_cache_context;
 class llama_kv_cache_iswa_context;
 class llama_memory_recurrent_context;
 class llama_memory_hybrid_context;
-class llama_memory_hybrid_iswa_context;

 // certain models (typically multi-modal) can produce different types of graphs
 enum llm_graph_type {
@@ -106,7 +104,7 @@ using llm_graph_input_ptr = std::unique_ptr<llm_graph_input_i>;

 class llm_graph_input_embd : public llm_graph_input_i {
 public:
-    llm_graph_input_embd(int64_t n_embd) : n_embd(n_embd) {}
+    llm_graph_input_embd()          = default;
    virtual ~llm_graph_input_embd() = default;

    void set_input(const llama_ubatch * ubatch) override;
@@ -115,8 +113,6 @@ public:

    ggml_tensor * tokens = nullptr; // I32 [n_batch]
    ggml_tensor * embd   = nullptr; // F32 [n_embd, n_batch]
-
-    const int64_t n_embd = 0;
 };

 class llm_graph_input_pos : public llm_graph_input_i {
@@ -317,39 +313,6 @@ public:
    const llama_kv_cache_context * mctx;
 };

-// V-less input for the KV cache
-// ref: https://github.com/ggml-org/llama.cpp/pull/19067
-class llm_graph_input_attn_k : public llm_graph_input_i {
-public:
-    llm_graph_input_attn_k(
-            const llama_hparams & hparams,
-            const llama_cparams & cparams,
-            const llama_kv_cache_context * mctx) :
-        hparams(hparams),
-        cparams(cparams),
-        mctx(mctx) {
-    }
-    ~llm_graph_input_attn_k() = default;
-
-    void set_input(const llama_ubatch * ubatch) override;
-
-    bool can_reuse(const llm_graph_params & params) override;
-
-    ggml_tensor * get_k_idxs() const { return self_k_idxs; }
-
-    ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
-
-    ggml_tensor * self_k_idxs = nullptr; // I64 [n_batch]
-
-    ggml_tensor * self_kq_mask     = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
-    ggml_tensor * self_kq_mask_cnv = nullptr; //     [n_kv, n_batch/n_stream, 1, n_stream]
-
-    const llama_hparams hparams;
-    const llama_cparams cparams;
-
-    const llama_kv_cache_context * mctx;
-};
-
 class llm_graph_input_attn_kv_iswa : public llm_graph_input_i {
 public:
    llm_graph_input_attn_kv_iswa(
@@ -433,46 +396,6 @@ public:
    const llama_memory_hybrid_context * mctx;
 };

-class llm_graph_input_mem_hybrid_iswa : public llm_graph_input_i {
-public:
-    llm_graph_input_mem_hybrid_iswa(
-            const llama_cparams & cparams,
-            std::unique_ptr<llm_graph_input_attn_kv_iswa> inp_attn,
-            std::unique_ptr<llm_graph_input_rs>          inp_rs,
-            const llama_memory_hybrid_iswa_context *     mctx) :
-        inp_attn(std::move(inp_attn)),
-        inp_rs(std::move(inp_rs)),
-        cparams(cparams),
-        mctx(mctx) { }
-    virtual ~llm_graph_input_mem_hybrid_iswa() = default;
-
-    void set_input(const llama_ubatch * ubatch) override;
-
-    bool can_reuse(const llm_graph_params & params) override;
-
-    std::unique_ptr<llm_graph_input_attn_kv_iswa> inp_attn;
-    std::unique_ptr<llm_graph_input_rs>          inp_rs;
-
-    llm_graph_input_attn_kv_iswa * get_attn() const { return inp_attn.get(); }
-    llm_graph_input_rs           * get_recr() const { return inp_rs.get(); }
-
-    const llama_cparams cparams;
-
-    const llama_memory_hybrid_iswa_context * mctx;
-};
-
-class llm_graph_input_sampling : public llm_graph_input_i {
-public:
-    llm_graph_input_sampling(std::map<llama_seq_id, llama_sampler *> samplers) :
-        samplers(std::move(samplers)) { }
-    virtual ~llm_graph_input_sampling() = default;
-
-    void set_input(const llama_ubatch * ubatch) override;
-    bool can_reuse(const llm_graph_params & params) override;
-
-    std::map<llama_seq_id, llama_sampler *> samplers;
-};
-
 //
 // llm_graph_result
 //
@@ -506,23 +429,6 @@ struct llm_graph_params {
    const llama_memory_context_i * mctx;
    const llama_cross            * cross;

-    std::map<llama_seq_id, llama_sampler *> samplers;
-
-    static bool samplers_equal(
-          const std::map<llama_seq_id, llama_sampler *> & lhs,
-          const std::map<llama_seq_id, llama_sampler *> & rhs) {
-        if (lhs.size() != rhs.size()) {
-            return false;
-        }
-        for (const auto & [seq_id, sampler] : lhs) {
-            auto it = rhs.find(seq_id);
-            if (it == rhs.end() || it->second != sampler) {
-                return false;
-            }
-        }
-        return true;
-    }
-
    uint32_t n_outputs;

    llm_graph_cb cb;
@@ -562,36 +468,15 @@ struct llm_graph_params {
            return false;
        }

-        if (n_outputs != other.n_outputs) {
-            return false;
-        }
-
-        if (!samplers_equal(samplers, other.samplers)) {
-            return false;
-        }
-
-        if (samplers.size() > 0) {
-            if (!ubatch.data || !other.ubatch.data) {
-                return false;
-            }
-
-            // check that the outputs are the same for all samplers
-            for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
-                if (ubatch.output[i]    != other.ubatch.output[i] ||
-                    ubatch.seq_id[i][0] != other.ubatch.seq_id[i][0]) {
-                    return false;
-                }
-            }
-        }
-
        return
            cparams.embeddings  == other.cparams.embeddings  &&
            cparams.causal_attn == other.cparams.causal_attn &&
-            arch  == other.arch  &&
-            gtype == other.gtype &&
-            cvec  == other.cvec  &&
-            loras == other.loras &&
-            cross == other.cross;
+            arch      == other.arch  &&
+            gtype     == other.gtype &&
+            cvec      == other.cvec  &&
+            loras     == other.loras &&
+            cross     == other.cross &&
+            n_outputs == other.n_outputs;
    }
 };

@@ -601,7 +486,7 @@ public:

    virtual ~llm_graph_result() = default;

-    ggml_tensor * get_inp_tokens()  const { return t_inp_tokens; }
+    ggml_tensor * get_tokens()      const { return t_tokens; }
    ggml_tensor * get_logits()      const { return t_logits; }
    ggml_tensor * get_embd()        const { return t_embd; }
    ggml_tensor * get_embd_pooled() const { return t_embd_pooled; }
@@ -614,7 +499,6 @@ public:
    void reset();

    void set_inputs(const llama_ubatch * ubatch);
-    void set_outputs();

    // try to update the existing graph result using the new graph parameters in order to reuse it
    // this can only be done if we determine that the resulting graph using the new graph parameters
@@ -628,17 +512,11 @@ public:
    void set_params(const llm_graph_params & params);

    // important graph nodes
-    ggml_tensor * t_inp_tokens  = nullptr;
-    ggml_tensor * t_inp_embd    = nullptr; // [n_embd_inp, n_tokens]
+    ggml_tensor * t_tokens      = nullptr;
    ggml_tensor * t_logits      = nullptr;
    ggml_tensor * t_embd        = nullptr;
    ggml_tensor * t_embd_pooled = nullptr;

-    std::map<llama_seq_id, ggml_tensor*> t_sampled_logits;
-    std::map<llama_seq_id, ggml_tensor*> t_candidates;
-    std::map<llama_seq_id, ggml_tensor*> t_sampled;
-    std::map<llama_seq_id, ggml_tensor*> t_sampled_probs;
-
    std::vector<llm_graph_input_ptr> inputs;

    ggml_context_ptr ctx_compute;
@@ -714,8 +592,6 @@ struct llm_graph_context {
    const llama_memory_context_i * mctx;
    const llama_cross            * cross;

-    std::map<llama_seq_id, llama_sampler *> samplers;
-
    const llm_graph_cb & cb_func;

    llm_graph_result * res;
@@ -866,21 +742,6 @@ struct llm_graph_context {
            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
            ggml_tensor * kq_b,
            ggml_tensor * sinks, // [n_head_q]
-            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v] // TODO: remove
-                  float   kq_scale,
-                    int   il) const;
-
-    llm_graph_input_attn_k  * build_attn_inp_k() const;
-
-    ggml_tensor * build_attn(
-            llm_graph_input_attn_k * inp,
-            ggml_tensor * wo,
-            ggml_tensor * wo_b,
-            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
-            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
-            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
-            ggml_tensor * kq_b,
-            ggml_tensor * sinks, // [n_head_q]
            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
                  float   kq_scale,
                    int   il) const;
@@ -961,8 +822,6 @@ struct llm_graph_context {

    llm_graph_input_mem_hybrid * build_inp_mem_hybrid() const;

-    llm_graph_input_mem_hybrid_iswa * build_inp_mem_hybrid_iswa() const;
-
    //
    // pooling
    //
@@ -973,12 +832,6 @@ struct llm_graph_context {
            ggml_tensor * cls_out,
            ggml_tensor * cls_out_b) const;

-    //
-    // sampling (backend sampling)
-    //
-
-    void build_sampling() const;
-
    //
    // dense (out)
    //
--- a/llama/llama.cpp/src/llama-hparams.cpp
+++ b/llama/llama.cpp/src/llama-hparams.cpp
@@ -72,10 +72,6 @@ uint32_t llama_hparams::n_embd_inp() const {
    return n_embd_inp;
 }

-uint32_t llama_hparams::n_embd_out() const {
-    return n_embd_out_impl > 0 ? n_embd_out_impl : n_embd;
-}
-
 uint32_t llama_hparams::n_embd_k_gqa(uint32_t il) const {
    const uint32_t n_head_kv = this->n_head_kv(il);

@@ -183,21 +179,6 @@ bool llama_hparams::is_swa(uint32_t il) const {
    GGML_ABORT("fatal error");
 }

-bool llama_hparams::is_mla() const {
-    assert((n_embd_head_k_mla_impl == 0 && n_embd_head_v_mla_impl == 0) ||
-           (n_embd_head_k_mla_impl != 0 && n_embd_head_v_mla_impl != 0));
-
-    return n_embd_head_k_mla_impl != 0 && n_embd_head_v_mla_impl != 0;
-}
-
-uint32_t llama_hparams::n_embd_head_k_mla() const {
-    return is_mla() ? n_embd_head_k_mla_impl : n_embd_head_k;
-}
-
-uint32_t llama_hparams::n_embd_head_v_mla() const {
-    return is_mla() ? n_embd_head_v_mla_impl : n_embd_head_v;
-}
-
 bool llama_hparams::has_kv(uint32_t il) const {
    if (n_layer_kv_from_start >= 0) {
        if (il < (uint32_t) n_layer_kv_from_start) {
@@ -223,6 +204,42 @@ uint32_t llama_hparams::n_layer_kv() const {
    return res;
 }

+bool llama_hparams::is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1) { // returns true if position p0 is masked for position p1 under the given SWA type and window size
+    assert(p0 >= 0 && p1 >= 0); // both positions must be non-negative
+
+    switch (swa_type) {
+        case LLAMA_SWA_TYPE_NONE: // no SWA: nothing is masked by this function
+            {
+            } break;
+        case LLAMA_SWA_TYPE_STANDARD: // mask p0 when it is n_swa or more positions behind p1
+            {
+                if (p1 - p0 >= (int32_t) n_swa) {
+                    return true;
+                }
+            } break;
+        case LLAMA_SWA_TYPE_CHUNKED: // mask p0 when it lies before the n_swa-sized chunk that contains p1
+            {
+                const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa; // NOTE(review): divides by n_swa - assumes n_swa > 0 for this type; confirm callers
+
+                if (p0 < pos_chunk_start) {
+                    return true;
+                }
+            } break;
+        case LLAMA_SWA_TYPE_SYMMETRIC: // mask p0 when it is more than n_swa/2 positions away from p1 in either direction
+            {
+                const int32_t half_n_swa = (int32_t) n_swa / 2; // integer division: odd n_swa rounds the half-window down
+                const int32_t pos_diff = p1 - p0;
+
+                // Mask if outside the symmetric window
+                if (pos_diff < -half_n_swa || pos_diff > half_n_swa) {
+                    return true;
+                }
+            } break;
+    }
+
+    return false; // no case above requested masking
+}
+
 bool llama_hparams::use_mrope() const {
    return rope_sections[0] > 0 && rope_sections[1] > 0;
 }
--- a/llama/llama.cpp/src/llama-hparams.h
+++ b/llama/llama.cpp/src/llama-hparams.h
@@ -3,7 +3,6 @@
 #include "llama.h"

 #include <array>
-#include <cassert>

 // bump if necessary
 #define LLAMA_MAX_LAYERS  512
@@ -53,8 +52,8 @@ struct llama_hparams {
    uint32_t n_rel_attn_bkts = 0;

    // note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
-    uint32_t n_embd_head_k_mla_impl = 0;
-    uint32_t n_embd_head_v_mla_impl = 0;
+    uint32_t n_embd_head_k_mla = 0;
+    uint32_t n_embd_head_v_mla = 0;

    // for WavTokenizer
    struct llama_hparams_posnet   posnet;
@@ -108,9 +107,9 @@ struct llama_hparams {

    float    rope_attn_factor = 1.0f;
    float    rope_freq_base_train;
-    float    rope_freq_base_train_swa  = 10000.0f;
+    float    rope_freq_base_train_swa;
    float    rope_freq_scale_train;
-    float    rope_freq_scale_train_swa = 1.0f;
+    float    rope_freq_scale_train_swa;

    uint32_t n_ctx_orig_yarn;
    float    rope_yarn_log_mul = 0.0f;
@@ -126,11 +125,10 @@ struct llama_hparams {
    llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
    // the size of the sliding window (0 - no SWA)
    uint32_t n_swa = 0;
-    // if swa_layers[il] == 1, then layer il is SWA
-    // if swa_layers[il] == 0, then layer il is dense (i.e. non-SWA)
+    // if swa_layers[il] == true, then layer il is SWA
+    // if swa_layers[il] == false, then layer il is dense (i.e. non-SWA)
    // by default, all layers are dense
-    // note: using uint32_t type for compatibility reason
-    std::array<uint32_t, LLAMA_MAX_LAYERS> swa_layers;
+    std::array<bool, LLAMA_MAX_LAYERS> swa_layers;

    // for State Space Models
    uint32_t ssm_d_conv  = 0;
@@ -165,9 +163,6 @@ struct llama_hparams {
    // for Classifiers
    uint32_t n_cls_out = 1;

-    // output embedding dimension (0 = use n_embd)
-    uint32_t n_embd_out_impl = 0;
-
    // llama4 smallthinker
    uint32_t n_moe_layer_step        = 0;
    uint32_t n_no_rope_layer_step    = 4;
@@ -240,9 +235,6 @@ struct llama_hparams {
    // dimension of main + auxiliary input embeddings
    uint32_t n_embd_inp() const;

-    // dimension of output embeddings
-    uint32_t n_embd_out() const;
-
    // dimension of key embeddings across all k-v heads
    uint32_t n_embd_k_gqa(uint32_t il = 0) const;

@@ -274,57 +266,15 @@ struct llama_hparams {

    bool is_swa(uint32_t il) const;

-    // note: currently only support if either all or none of the layers are MLA
-    bool is_mla() const;
-
-    uint32_t n_embd_head_k_mla() const;
-    uint32_t n_embd_head_v_mla() const;
-
    bool has_kv(uint32_t il) const;

    // number of layers for which has_kv() returns true
    uint32_t n_layer_kv() const;

    // note that this function uses different SWA parameters from those in the hparams
-    // note: inlined on purpose for performance reasons
    // TODO: think of a better place for this function
    // TODO: pack the SWA params in a struct?
-    static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1) {
-        assert(p0 >= 0 && p1 >= 0);
-
-        switch (swa_type) {
-            case LLAMA_SWA_TYPE_NONE:
-                {
-                } break;
-            case LLAMA_SWA_TYPE_STANDARD:
-                {
-                    if (p1 - p0 >= (int32_t) n_swa) {
-                        return true;
-                    }
-                } break;
-            case LLAMA_SWA_TYPE_CHUNKED:
-                {
-                    const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa;
-
-                    if (p0 < pos_chunk_start) {
-                        return true;
-                    }
-                } break;
-            case LLAMA_SWA_TYPE_SYMMETRIC:
-                {
-                    const int32_t half_n_swa = (int32_t) n_swa / 2;
-                    const int32_t pos_diff = p1 - p0;
-
-                    // Mask if outside the symmetric window
-                    if (pos_diff < -half_n_swa || pos_diff > half_n_swa) {
-                        return true;
-                    }
-                } break;
-        }
-
-        return false;
-    }
-
+    static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1);

    bool use_mrope() const;
 };
--- a/llama/llama.cpp/src/llama-kv-cache.cpp
+++ b/llama/llama.cpp/src/llama-kv-cache.cpp
@@ -97,8 +97,6 @@ llama_kv_cache::llama_kv_cache(
                __func__, hparams.n_embd_v_gqa_max());
    }

-    const bool is_mla = hparams.is_mla();
-
    for (uint32_t il = 0; il < hparams.n_layer; il++) {
        if (!hparams.has_kv(il)) {
            LLAMA_LOG_DEBUG("%s: layer %3d: does not have KV cache\n", __func__, il);
@@ -132,21 +130,18 @@ llama_kv_cache::llama_kv_cache(
            throw std::runtime_error("failed to create ggml context for kv cache");
        }

-        const bool has_k = true;
-        const bool has_v = !is_mla;
+        ggml_tensor * k = ggml_new_tensor_3d(ctx, type_k, n_embd_k_gqa, kv_size, n_stream);
+        ggml_tensor * v = ggml_new_tensor_3d(ctx, type_v, n_embd_v_gqa, kv_size, n_stream);

-        ggml_tensor * k = has_k ? ggml_new_tensor_3d(ctx, type_k, n_embd_k_gqa, kv_size, n_stream) : nullptr;
-        ggml_tensor * v = has_v ? ggml_new_tensor_3d(ctx, type_v, n_embd_v_gqa, kv_size, n_stream) : nullptr;
-
-        has_k && ggml_format_name(k, "cache_k_l%d", il);
-        has_v && ggml_format_name(v, "cache_v_l%d", il);
+        ggml_format_name(k, "cache_k_l%d", il);
+        ggml_format_name(v, "cache_v_l%d", il);

        std::vector<ggml_tensor *> k_stream;
        std::vector<ggml_tensor *> v_stream;

        for (uint32_t s = 0; s < n_stream; ++s) {
-            k_stream.push_back(has_k ? ggml_view_2d(ctx, k, n_embd_k_gqa, kv_size, k->nb[1], s*k->nb[2]) : nullptr);
-            v_stream.push_back(has_v ? ggml_view_2d(ctx, v, n_embd_v_gqa, kv_size, v->nb[1], s*v->nb[2]) : nullptr);
+            k_stream.push_back(ggml_view_2d(ctx, k, n_embd_k_gqa, kv_size, k->nb[1], s*k->nb[2]));
+            v_stream.push_back(ggml_view_2d(ctx, v, n_embd_v_gqa, kv_size, v->nb[1], s*v->nb[2]));
        }

        map_layer_ids[il] = layers.size();
@@ -652,10 +647,7 @@ bool llama_kv_cache::update(llama_context * lctx, bool do_shift, const stream_co
                const auto & layer = layers[il];

                ggml_backend_tensor_copy(layer.k_stream[ssrc], layer.k_stream[sdst]);
-
-                if (layer.v_stream[ssrc]) {
-                    ggml_backend_tensor_copy(layer.v_stream[ssrc], layer.v_stream[sdst]);
-                }
+                ggml_backend_tensor_copy(layer.v_stream[ssrc], layer.v_stream[sdst]);
            }
        }
    }
@@ -860,7 +852,7 @@ llama_kv_cache::slot_info llama_kv_cache::find_slot(const llama_ubatch & ubatch,
                        const llama_seq_id seq_id_cell = cells.seq_get(idx);

                        // SWA mask
-                        if (llama_hparams::is_masked_swa(n_swa, swa_type, pos_cell, cells.seq_pos_max(seq_id_cell) + 1)) {
+                        if (is_masked_swa(pos_cell, cells.seq_pos_max(seq_id_cell) + 1)) {
                            can_use = true;
                        }
                    }
@@ -1245,197 +1237,6 @@ void llama_kv_cache::set_input_k_shift(ggml_tensor * dst) const {
    }
 }

-struct args_set_input_kq_mask {
-    const llama_hparams & hparams;
-    const llama_ubatch  * ubatch;
-
-    const std::vector<llama_kv_cells> & v_cells;
-    const std::vector<uint32_t>       & seq_to_stream;
-
-    uint32_t       n_swa;
-    llama_swa_type swa_type;
-
-    int64_t n_kv;
-    int64_t n_stream;
-    int64_t n_tps;
-};
-
-template<bool causal, bool swa, bool is_2d, bool alibi>
-static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
-  //const auto & hparams = args.hparams;
-    const auto & ubatch  = args.ubatch;
-
-    const auto & v_cells       = args.v_cells;
-    const auto & seq_to_stream = args.seq_to_stream;
-
-    const uint32_t       n_swa    = args.n_swa;
-    const llama_swa_type swa_type = args.swa_type;
-
-    const int64_t n_kv     = args.n_kv;
-    const int64_t n_stream = args.n_stream;
-    const int64_t n_tps    = args.n_tps;
-
-    // the min position in the batch for each sequence
-    llama_pos seq_pos_min[LLAMA_MAX_SEQ];
-    std::fill(seq_pos_min, seq_pos_min + LLAMA_MAX_SEQ, INT32_MAX);
-
-    for (uint32_t i = 0; i < ubatch->n_tokens; ++i) {
-        const llama_seq_id seq_id = ubatch->seq_id[i][0];
-
-        seq_pos_min[seq_id] = std::min(seq_pos_min[seq_id], ubatch->pos[i]);
-    }
-
-    for (uint32_t s = 0; s < n_stream; ++s) {
-        // bookeeping of the KQ mask cells that could change for other tokens of the same sequence
-        std::unordered_map<llama_seq_id, uint32_t>              seq_srct;
-        std::unordered_map<llama_seq_id, std::vector<uint32_t>> seq_idxs;
-
-        for (uint32_t ii = 0; ii < n_tps; ++ii) {
-            const uint32_t i = s*n_tps + ii;
-
-            const llama_seq_id seq_id = ubatch->seq_id[i][0];
-
-            const auto & cells = v_cells.at(seq_to_stream[seq_id]);
-
-                  llama_pos p0 = -1;
-            const llama_pos p1 = ubatch->pos[i];
-
-            // for M-RoPE
-            const llama_pos p1_x = is_2d ? ubatch->pos[i + ubatch->n_tokens*2] : 0;
-            const llama_pos p1_y = is_2d ? ubatch->pos[i + ubatch->n_tokens]   : 0;
-
-            const uint64_t idst = n_kv*i;
-
-            // for tokens of the same sequence, the mask is mostly the same, so we can reuse it
-            // the only cells that could change are the ones that are with similar positions as the
-            //   ones in the batch (i.e. due to causal masking, SWA, etc.)
-            // keep track of those cells and shortcut the loop to save time
-            // note: this optimization is not compatible with Alibi position encoding
-            // ref:  https://github.com/ggml-org/llama.cpp/pull/18842
-            bool prev = false;
-
-            auto & idxs = seq_idxs[seq_id];
-
-            if (!alibi) {
-                if (seq_srct.find(seq_id) != seq_srct.end()) {
-                    const uint32_t srct = seq_srct[seq_id];
-
-                    const uint64_t idst_prev = n_kv*srct;
-
-                    std::copy(data + idst_prev, data + idst_prev + n_kv, data + idst);
-
-                    prev = true;
-                } else {
-                    idxs.clear();
-                    idxs.reserve(ubatch->n_tokens + n_swa + 32);
-
-                    seq_srct[seq_id] = i;
-                }
-            }
-
-            for (uint32_t jj = 0; jj < n_kv; ++jj) {
-                uint32_t j = jj;
-
-                // we have an exiting mask for this sequence -> update just seq_idxs
-                if (!alibi) {
-                    if (prev) {
-                        if (jj >= idxs.size()) {
-                            break;
-                        }
-
-                        j = idxs[jj];
-                    }
-                }
-
-                if (cells.is_empty(j)) {
-                    goto skip;
-                }
-
-                // mask the token if not the same sequence
-                if (!cells.seq_has(j, seq_id)) {
-                    goto skip;
-                }
-
-                p0 = cells.pos_get(j);
-
-                if (!alibi) {
-                    if (!prev) {
-                        // record all cells for which: p0 >= seq_pos_min[seq_id] - n_swa - 32
-                        if (p0 + (int32_t) (n_swa + 32) >= seq_pos_min[seq_id]) {
-                            idxs.push_back(j);
-                        }
-                    }
-                }
-
-                if (causal) {
-                    // mask future tokens
-                    if (p0 > p1) {
-                        goto skip;
-                    }
-
-                    // M-RoPE causal mask
-                    if (is_2d) {
-                        if (p0 == p1) {
-                            const auto & p0_ext = cells.ext_get(j);
-
-                            if (p0_ext.is_2d_gt(p1_x, p1_y)) {
-                                goto skip;
-                            }
-                        }
-                    }
-                }
-
-                // apply SWA if any
-                if (swa) {
-                    if (llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1)) {
-                        goto skip;
-                    }
-                }
-
-                if (alibi) {
-                    data[idst + j] = -std::abs(p0 - p1);
-                } else {
-                    data[idst + j] = 0.0f;
-                }
-
-                continue;
-skip:
-                data[idst + j] = -INFINITY;
-            }
-        }
-    }
-}
-
-template<bool causal, bool swa, bool is_2d>
-static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
-    const bool alibi = args.hparams.use_alibi;
-    if (alibi) {
-        set_input_kq_mask_impl<causal, swa, is_2d, true> (args, data);
-    } else {
-        set_input_kq_mask_impl<causal, swa, is_2d, false>(args, data);
-    }
-}
-
-template<bool causal, bool swa>
-static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
-    const bool is_2d = args.ubatch->is_pos_2d();
-    if (is_2d) {
-        set_input_kq_mask_impl<causal, swa, true> (args, data);
-    } else {
-        set_input_kq_mask_impl<causal, swa, false>(args, data);
-    }
-}
-
-template<bool causal>
-static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
-    const bool swa = args.swa_type != LLAMA_SWA_TYPE_NONE;
-    if (swa) {
-        set_input_kq_mask_impl<causal, true> (args, data);
-    } else {
-        set_input_kq_mask_impl<causal, false>(args, data);
-    }
-}
-
 void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
    const uint32_t n_tokens = ubatch->n_tokens;

@@ -1450,29 +1251,74 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u
    // n_tps == n_tokens_per_stream
    const int64_t n_tps = n_tokens/n_stream;

-    //const int64_t t_start = ggml_time_us();
+    std::fill(data, data + ggml_nelements(dst), -INFINITY);

-    const args_set_input_kq_mask args = {
-        /*.hparams          =*/ hparams,
-        /*.ubatch           =*/ ubatch,
-        /*.v_cells          =*/ v_cells,
-        /*.seq_to_stream    =*/ seq_to_stream,
-        /*.n_swa            =*/ n_swa,
-        /*.swa_type         =*/ swa_type,
-        /*.n_kv             =*/ n_kv,
-        /*.n_stream         =*/ n_stream,
-        /*.n_tps            =*/ n_tps,
-    };
+    // Use only the previous KV cells of the correct sequence for each token of the ubatch.
+    // It's assumed that if a token in the batch has multiple sequences, they are equivalent.
+    // Example with a cache of 10 tokens, 2 tokens populated in cache and 3 tokens in batch:
+    //   Causal mask:
+    //      xxx-------
+    //      xxxx------
+    //      xxxxx-----
+    //   Non-causal mask:
+    //      xxxxx-----
+    //      xxxxx-----
+    //      xxxxx-----
+    // To visualize the mask, see https://github.com/ggml-org/llama.cpp/pull/12615
+    // TODO: optimize this section
+    for (uint32_t h = 0; h < 1; ++h) {
+        for (uint32_t s = 0; s < n_stream; ++s) {
+            for (uint32_t ii = 0; ii < n_tps; ++ii) {
+                const uint32_t i = s*n_tps + ii;

-    if (causal_attn) {
-        set_input_kq_mask_impl<true> (args, data);
-    } else {
-        set_input_kq_mask_impl<false>(args, data);
+                const llama_seq_id seq_id = ubatch->seq_id[i][0];
+
+                const auto & cells = v_cells[seq_to_stream[seq_id]];
+
+                const llama_pos p1 = ubatch->pos[i];
+
+                // for M-RoPE
+                const bool is_2d = ubatch->is_pos_2d();
+                const llama_pos p1_x = is_2d ? ubatch->pos[i + ubatch->n_tokens*2] : 0;
+                const llama_pos p1_y = is_2d ? ubatch->pos[i + ubatch->n_tokens]   : 0;
+
+                const uint64_t idst = n_kv*(h*n_stream*n_tps + s*n_tps + ii);
+
+                for (uint32_t j = 0; j < n_kv; ++j) {
+                    if (cells.is_empty(j)) {
+                        continue;
+                    }
+
+                    // mask the token if not the same sequence
+                    if (!cells.seq_has(j, seq_id)) {
+                        continue;
+                    }
+
+                    const llama_pos p0 = cells.pos_get(j);
+
+                    // mask future tokens
+                    if (causal_attn && p0 > p1) {
+                        continue;
+                    }
+
+                    // M-RoPE causal mask
+                    if (causal_attn && is_2d && p0 == p1) {
+                        const auto & p0_ext = cells.ext_get(j);
+                        if (p0_ext.is_2d_gt(p1_x, p1_y)) {
+                            continue;
+                        }
+                    }
+
+                    // apply SWA if any
+                    if (is_masked_swa(p0, p1)) {
+                        continue;
+                    }
+
+                    data[idst + j] = hparams.use_alibi ? -std::abs(p0 - p1) : 0.0f;
+                }
+            }
+        }
    }
-
-    //const int64_t t_end = ggml_time_us();
-
-    //LLAMA_LOG_ERROR("%s: kq mask time: %0.3f ms\n", __func__, (t_end - t_start)/1000.0);
 }

 void llama_kv_cache::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const {
@@ -1524,7 +1370,7 @@ size_t llama_kv_cache::size_v_bytes() const {
    size_t size_v_bytes = 0;

    for (const auto & layer : layers) {
-        size_v_bytes += layer.v ? ggml_nbytes(layer.v) : 0;
+        size_v_bytes += ggml_nbytes(layer.v);
    }

    return size_v_bytes;
@@ -1602,10 +1448,6 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co
    const auto & n_embd_head_k = hparams.n_embd_head_k;
  //const auto & n_embd_head_v = hparams.n_embd_head_v;

-    const auto & n_rot = hparams.n_rot;
-
-    const auto n_embd_nope = hparams.n_lora_kv > 0 ? n_embd_head_k - n_rot : 0;
-
    auto inp = std::make_unique<llm_graph_input_k_shift>(this);

    inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, (int64_t) get_size()*n_stream);
@@ -1626,10 +1468,10 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co

        ggml_tensor * k =
            ggml_view_3d(ctx, layer.k,
-                n_rot, n_head_kv, get_size()*n_stream,
+                n_embd_head_k, n_head_kv, get_size()*n_stream,
                ggml_row_size(layer.k->type, n_embd_head_k),
                ggml_row_size(layer.k->type, n_embd_k_gqa),
-                ggml_row_size(layer.k->type, n_embd_nope));
+                0);

        ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l);

@@ -1641,6 +1483,10 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co
    return gf;
 }

+bool llama_kv_cache::is_masked_swa(llama_pos p0, llama_pos p1) const {
+    return llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1);
+}
+
 void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
    GGML_UNUSED(flags);

@@ -1806,9 +1652,6 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t
            const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);

            auto * v = layer.v_stream[cr.strm];
-            if (!v) {
-                continue;
-            }

            // Write value type
            const int32_t v_type_i = (int32_t) v->type;
@@ -1835,9 +1678,6 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t
            const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);

            auto * v = layer.v_stream[cr.strm];
-            if (!v) {
-                continue;
-            }

            // Write value type
            const int32_t v_type_i = (int32_t) v->type;
@@ -2041,9 +1881,6 @@ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32
            const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);

            auto * v = layer.v_stream[strm];
-            if (!v) {
-                continue;
-            }

            // Read type of value
            int32_t v_type_i_ref;
@@ -2085,9 +1922,6 @@ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32
            const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);

            auto * v = layer.v_stream[strm];
-            if (!v) {
-                continue;
-            }

            // Read type of value
            int32_t v_type_i_ref;
--- a/llama/llama.cpp/src/llama-kv-cache.h
+++ b/llama/llama.cpp/src/llama-kv-cache.h
@@ -257,6 +257,8 @@ private:
    size_t size_k_bytes() const;
    size_t size_v_bytes() const;

+    bool is_masked_swa(llama_pos p0, llama_pos p1) const;
+
    ggml_tensor * build_rope_shift(
            const llama_cparams & cparams,
                   ggml_context * ctx,
@@ -303,7 +305,7 @@ public:
            bool do_shift,
            stream_copy_info sc_info);

-    // used to create a batch processing context from a batch
+    // used to create a batch processing context from a batch
    llama_kv_cache_context(
            llama_kv_cache * kv,
            slot_info_vec_t sinfos,
--- a/llama/llama.cpp/src/llama-memory-hybrid-iswa.cpp
+++ b/llama/llama.cpp/src/llama-memory-hybrid-iswa.cpp
@@ -1,275 +0,0 @@
-#include "llama-memory-hybrid-iswa.h"
-
-#include "llama-impl.h"
-#include "llama-model.h"
-#include "llama-context.h"
-
-//
-// llama_memory_hybrid_iswa
-//
-
-llama_memory_hybrid_iswa::llama_memory_hybrid_iswa(
-        const llama_model & model,
-                            /* attn */
-                ggml_type   type_k,
-                ggml_type   type_v,
-                     bool   v_trans,
-                     bool   swa_full,
-                 uint32_t   kv_size,
-                 uint32_t   n_ubatch,
-                 uint32_t   n_pad,
-                            /* recurrent */
-                ggml_type   type_r,
-                ggml_type   type_s,
-                 uint32_t   rs_size,
-                            /* common */
-                 uint32_t   n_seq_max,
-                     bool   offload,
-                     bool   unified,
-                            /* layer filters */
-    const layer_filter_cb & filter_attn,
-    const layer_filter_cb & filter_recr) :
-    hparams(model.hparams),
-    mem_attn(new llama_kv_cache_iswa(
-        model,
-        type_k,
-        type_v,
-        v_trans,
-        offload,
-        swa_full,
-        unified,
-        kv_size,
-        n_seq_max,
-        n_ubatch,
-        n_pad,
-        filter_attn == nullptr ?
-            [&](int32_t il) { return !hparams.is_recurrent(il); }
-            : filter_attn,
-        nullptr
-    )),
-    mem_recr(new llama_memory_recurrent(
-        model,
-        type_r,
-        type_s,
-        offload,
-        rs_size,
-        n_seq_max,
-        filter_recr == nullptr ?
-            [&](int32_t il) { return hparams.is_recurrent(il); }
-            : filter_recr
-    )) {}
-
-llama_memory_context_ptr llama_memory_hybrid_iswa::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
-    do {
-        balloc.split_reset();
-
-        // follow the recurrent pattern for creating the ubatch splits
-        std::vector<llama_ubatch> ubatches;
-
-        while (true) {
-            llama_ubatch ubatch;
-
-            if (embd_all) {
-                // if all tokens are output, split by sequence
-                ubatch = balloc.split_seq(n_ubatch);
-            } else {
-                // TODO: non-sequential equal split can be done if using unified KV cache
-                //       for simplicity, we always use sequential equal split for now
-                ubatch = balloc.split_equal(n_ubatch, true);
-            }
-
-            if (ubatch.n_tokens == 0) {
-                break;
-            }
-
-            ubatches.push_back(std::move(ubatch)); // NOLINT
-        }
-
-        if (balloc.get_n_used() < balloc.get_n_tokens()) {
-            // failed to find a suitable split
-            break;
-        }
-
-        // prepare the recurrent batches first
-        if (!mem_recr->prepare(ubatches)) {
-            // TODO: will the recurrent cache be in an undefined context at this point?
-            LLAMA_LOG_ERROR("%s: failed to prepare recurrent ubatches\n", __func__);
-            return std::make_unique<llama_memory_hybrid_iswa_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
-        }
-
-        // prepare the attention cache (iswa version returns both base and swa slot infos)
-        auto sinfos_base = mem_attn->get_base()->prepare(ubatches);
-        if (sinfos_base.empty()) {
-            LLAMA_LOG_ERROR("%s: failed to prepare attention base ubatches\n", __func__);
-            return std::make_unique<llama_memory_hybrid_iswa_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
-        }
-
-        auto sinfos_swa = mem_attn->get_swa()->prepare(ubatches);
-        if (sinfos_swa.empty()) {
-            LLAMA_LOG_ERROR("%s: failed to prepare attention swa ubatches\n", __func__);
-            return std::make_unique<llama_memory_hybrid_iswa_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
-        }
-
-        return std::make_unique<llama_memory_hybrid_iswa_context>(
-                this, std::move(sinfos_base), std::move(sinfos_swa), std::move(ubatches));
-    } while(false);
-
-    return std::make_unique<llama_memory_hybrid_iswa_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
-}
-
-llama_memory_context_ptr llama_memory_hybrid_iswa::init_full() {
-    return std::make_unique<llama_memory_hybrid_iswa_context>(this);
-}
-
-llama_memory_context_ptr llama_memory_hybrid_iswa::init_update(llama_context * lctx, bool optimize) {
-    return std::make_unique<llama_memory_hybrid_iswa_context>(this, lctx, optimize);
-}
-
-bool llama_memory_hybrid_iswa::get_can_shift() const {
-    // Shifting is trivially supported for recurrent
-    return mem_attn->get_can_shift();
-}
-
-void llama_memory_hybrid_iswa::clear(bool data) {
-    mem_attn->clear(data);
-    mem_recr->clear(data);
-}
-
-bool llama_memory_hybrid_iswa::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
-    // Try removing from the recurrent cache first since it may fail. If it does
-    // fail, the cache will not have been mutated.
-    if (!mem_recr->seq_rm(seq_id, p0, p1)) {
-        return false;
-    }
-    return mem_attn->seq_rm(seq_id, p0, p1);
-}
-
-void llama_memory_hybrid_iswa::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
-    mem_attn->seq_cp(seq_id_src, seq_id_dst, p0, p1);
-    mem_recr->seq_cp(seq_id_src, seq_id_dst, p0, p1);
-}
-
-void llama_memory_hybrid_iswa::seq_keep(llama_seq_id seq_id) {
-    mem_attn->seq_keep(seq_id);
-    mem_recr->seq_keep(seq_id);
-}
-
-void llama_memory_hybrid_iswa::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
-    mem_attn->seq_add(seq_id, p0, p1, shift);
-    mem_recr->seq_add(seq_id, p0, p1, shift);
-}
-
-void llama_memory_hybrid_iswa::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
-    mem_attn->seq_div(seq_id, p0, p1, d);
-    mem_recr->seq_div(seq_id, p0, p1, d);
-}
-
-llama_pos llama_memory_hybrid_iswa::seq_pos_min(llama_seq_id seq_id) const {
-    // the min of the total cache is the max of the two caches' min values
-    return std::max(mem_attn->seq_pos_min(seq_id), mem_recr->seq_pos_min(seq_id));
-}
-
-llama_pos llama_memory_hybrid_iswa::seq_pos_max(llama_seq_id seq_id) const {
-    // the max of the total cache is the min of the two caches' max values
-    return std::min(mem_attn->seq_pos_max(seq_id), mem_recr->seq_pos_max(seq_id));
-}
-
-std::map<ggml_backend_buffer_type_t, size_t> llama_memory_hybrid_iswa::memory_breakdown() const {
-    std::map<ggml_backend_buffer_type_t, size_t> mb = mem_attn->memory_breakdown();
-    for (const auto & buft_size : mem_recr->memory_breakdown()) {
-        mb[buft_size.first] += buft_size.second;
-    }
-    return mb;
-}
-
-void llama_memory_hybrid_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
-    mem_attn->state_write(io, seq_id, flags);
-    mem_recr->state_write(io, seq_id, flags);
-}
-
-void llama_memory_hybrid_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
-    mem_attn->state_read(io, seq_id, flags);
-    mem_recr->state_read(io, seq_id, flags);
-}
-
-llama_kv_cache_iswa * llama_memory_hybrid_iswa::get_mem_attn() const {
-    return mem_attn.get();
-}
-
-llama_memory_recurrent * llama_memory_hybrid_iswa::get_mem_recr() const {
-    return mem_recr.get();
-}
-
-//
-// llama_memory_hybrid_iswa_context
-//
-
-llama_memory_hybrid_iswa_context::llama_memory_hybrid_iswa_context(llama_memory_status status) : status(status) {}
-
-llama_memory_hybrid_iswa_context::llama_memory_hybrid_iswa_context(llama_memory_hybrid_iswa * mem) :
-    ctx_attn(mem->get_mem_attn()->init_full()),
-    ctx_recr(mem->get_mem_recr()->init_full()),
-    status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
-}
-
-llama_memory_hybrid_iswa_context::llama_memory_hybrid_iswa_context(
-        llama_memory_hybrid_iswa * mem,
-                   llama_context * lctx,
-                            bool   optimize) :
-    ctx_attn(mem->get_mem_attn()->init_update(lctx, optimize)),
-    ctx_recr(mem->get_mem_recr()->init_update(lctx, optimize)),
-    status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
-}
-
-llama_memory_hybrid_iswa_context::llama_memory_hybrid_iswa_context(
-           llama_memory_hybrid_iswa * mem,
-                    slot_info_vec_t   sinfos_base,
-                    slot_info_vec_t   sinfos_swa,
-          std::vector<llama_ubatch>   ubatches) :
-    ubatches(std::move(ubatches)),
-    // note: here we copy the ubatches. not sure if this is ideal
-    ctx_attn(new llama_kv_cache_iswa_context(mem->get_mem_attn(), std::move(sinfos_base), std::move(sinfos_swa), this->ubatches)),
-    ctx_recr(new llama_memory_recurrent_context(mem->get_mem_recr(), this->ubatches)),
-    status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
-}
-
-bool llama_memory_hybrid_iswa_context::next() {
-    assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
-
-    ctx_attn->next();
-    ctx_recr->next();
-
-    if (++i_next >= ubatches.size()) {
-        return false;
-    }
-
-    return true;
-}
-
-bool llama_memory_hybrid_iswa_context::apply() {
-    assert(!llama_memory_status_is_fail(status));
-
-    bool res = true;
-
-    res = res & ctx_attn->apply();
-    res = res & ctx_recr->apply();
-
-    return res;
-}
-
-llama_memory_status llama_memory_hybrid_iswa_context::get_status() const {
-    return status;
-}
-
-const llama_ubatch & llama_memory_hybrid_iswa_context::get_ubatch() const {
-    assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
-    return ubatches[i_next];
-}
-
-const llama_kv_cache_iswa_context * llama_memory_hybrid_iswa_context::get_attn() const {
-    return static_cast<const llama_kv_cache_iswa_context *>(ctx_attn.get());
-}
-
-const llama_memory_recurrent_context * llama_memory_hybrid_iswa_context::get_recr() const {
-    return static_cast<const llama_memory_recurrent_context *>(ctx_recr.get());
-}
--- a/llama/llama.cpp/src/llama-memory-hybrid-iswa.h
+++ b/llama/llama.cpp/src/llama-memory-hybrid-iswa.h
@@ -1,140 +0,0 @@
-#pragma once
-
-#include "llama-batch.h"
-#include "llama-graph.h"
-#include "llama-kv-cache-iswa.h"
-#include "llama-memory.h"
-#include "llama-memory-recurrent.h"
-
-#include <memory>
-#include <vector>
-
-//
-// llama_memory_hybrid_iswa
-//
-
-// utilizes instances of llama_memory_recurrent and llama_kv_cache_iswa to
-//   support models where each layer may be either attention-based (with SWA support) or recurrent
-
-class llama_memory_hybrid_iswa : public llama_memory_i {
-public:
-    llama_memory_hybrid_iswa(
-        const llama_model & model,
-                            /* attn */
-                ggml_type   type_k,
-                ggml_type   type_v,
-                     bool   v_trans,
-                     bool   swa_full,
-                 uint32_t   kv_size,
-                 uint32_t   n_ubatch,
-                 uint32_t   n_pad,
-                            /* recurrent */
-                ggml_type   type_r,
-                ggml_type   type_s,
-                 uint32_t   rs_size,
-                            /* common */
-                 uint32_t   n_seq_max,
-                     bool   offload,
-                     bool   unified,
-                            /* layer filters */
-    const layer_filter_cb & filter_attn = nullptr,
-    const layer_filter_cb & filter_recr = nullptr);
-
-    ~llama_memory_hybrid_iswa() = default;
-
-    //
-    // llama_memory_i
-    //
-
-    llama_memory_context_ptr init_batch(
-            llama_batch_allocr & balloc,
-            uint32_t n_ubatch,
-            bool embd_all) override;
-
-    llama_memory_context_ptr init_full() override;
-
-    llama_memory_context_ptr init_update(llama_context * lctx, bool optimize) override;
-
-    bool get_can_shift() const override;
-
-    void clear(bool data) override;
-
-    bool seq_rm  (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1) override;
-    void seq_cp  (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
-    void seq_keep(llama_seq_id seq_id)                                                          override;
-    void seq_add (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, llama_pos shift) override;
-    void seq_div (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, int d) override;
-
-    llama_pos seq_pos_min(llama_seq_id seq_id) const override;
-    llama_pos seq_pos_max(llama_seq_id seq_id) const override;
-
-    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
-
-    // state write/load
-
-    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
-    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0)       override;
-
-    //
-    // llama_memory_hybrid_iswa specific API
-    //
-
-    llama_kv_cache_iswa * get_mem_attn() const;
-    llama_memory_recurrent * get_mem_recr() const;
-
-private:
-    const llama_hparams & hparams;
-
-    const std::unique_ptr<llama_kv_cache_iswa> mem_attn;
-    const std::unique_ptr<llama_memory_recurrent> mem_recr;
-};
-
-class llama_memory_hybrid_iswa_context : public llama_memory_context_i {
-public:
-    using slot_info_vec_t = llama_kv_cache::slot_info_vec_t;
-
-    // init failure
-    explicit llama_memory_hybrid_iswa_context(llama_memory_status status);
-
-    // init full
-    explicit llama_memory_hybrid_iswa_context(llama_memory_hybrid_iswa * mem);
-
-    // init update
-    explicit llama_memory_hybrid_iswa_context(
-        llama_memory_hybrid_iswa * mem,
-                   llama_context * lctx,
-                            bool   optimize);
-
-    // init success
-    llama_memory_hybrid_iswa_context(
-           llama_memory_hybrid_iswa * mem,
-                    slot_info_vec_t   sinfos_base,
-                    slot_info_vec_t   sinfos_swa,
-          std::vector<llama_ubatch>   ubatches);
-
-    ~llama_memory_hybrid_iswa_context() = default;
-
-    bool next()  override;
-    bool apply() override;
-
-    llama_memory_status  get_status() const override;
-    const llama_ubatch & get_ubatch() const override;
-
-    //
-    // llama_memory_hybrid_iswa_context
-    //
-
-    const llama_kv_cache_iswa_context * get_attn() const;
-    const llama_memory_recurrent_context * get_recr() const;
-
-private:
-    // the index of the next ubatch to process
-    size_t i_next = 0;
-
-    std::vector<llama_ubatch> ubatches;
-
-    const llama_memory_context_ptr ctx_attn;
-    const llama_memory_context_ptr ctx_recr;
-
-    const llama_memory_status status;
-};
--- a/llama/llama.cpp/src/llama-mmap.cpp
+++ b/llama/llama.cpp/src/llama-mmap.cpp
@@ -13,10 +13,9 @@
 #ifdef __has_include
    #if __has_include(<unistd.h>)
        #include <unistd.h>
-        #include <fcntl.h>
-        #include <sys/stat.h>
        #if defined(_POSIX_MAPPED_FILES)
            #include <sys/mman.h>
+            #include <fcntl.h>
        #endif
        #if defined(_POSIX_MEMLOCK_RANGE)
            #include <sys/resource.h>
@@ -75,7 +74,7 @@ struct llama_file::impl {
        return ret;
    }

-    impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) {
+    impl(const char * fname, const char * mode) {
        fp = ggml_fopen(fname, mode);
        if (fp == NULL) {
            throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
@@ -110,7 +109,7 @@ struct llama_file::impl {
        }
    }

-    void read_raw(void * ptr, size_t len) {
+    void read_raw(void * ptr, size_t len) const {
        size_t bytes_read = 0;
        while (bytes_read < len) {
            size_t chunk_size = std::min<size_t>(len - bytes_read, 64*1024*1024);
@@ -127,7 +126,7 @@ struct llama_file::impl {
        }
    }

-    uint32_t read_u32() {
+    uint32_t read_u32() const {
        uint32_t val;
        read_raw(&val, sizeof(val));
        return val;
@@ -154,55 +153,16 @@ struct llama_file::impl {
        write_raw(&val, sizeof(val));
    }

-    bool has_direct_io() const {
-        return true;
-    }
-
    ~impl() {
        if (fp) {
            std::fclose(fp);
        }
    }
 #else
-    impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) : fname(fname) {
-#ifdef __linux__
-        // Try unbuffered I/O for read only
-        if (use_direct_io && std::strcmp(mode, "rb") == 0) {
-            if (init_fd()) {
-                return;
-            }
-            LLAMA_LOG_WARN("Failed to open file '%s' with error: %s. Falling back to buffered I/O",
-                           fname, strerror(errno));
-        }
-#endif
-        init_fp(mode);
-    }
-
-#ifdef __linux__
-    bool init_fd() {
-        fd = open(fname.c_str(), O_RDONLY | O_DIRECT);
-
-        if (fd != -1) {
-            struct stat file_stats{};
-            fstat(fd, &file_stats);
-
-            size = file_stats.st_size;
-            alignment = file_stats.st_blksize;
-
-            off_t ret = lseek(fd, 0, SEEK_SET);
-            if (ret == -1) {
-                throw std::runtime_error(format("seek error: %s", strerror(errno)));
-            }
-            return true;
-        }
-        return false;
-    }
-#endif
-
-    void init_fp(const char * mode) {
-        fp = ggml_fopen(fname.c_str(), mode);
+    impl(const char * fname, const char * mode) {
+        fp = ggml_fopen(fname, mode);
        if (fp == NULL) {
-            throw std::runtime_error(format("failed to open %s: %s", fname.c_str(), strerror(errno)));
+            throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
        }
        seek(0, SEEK_END);
        size = tell();
@@ -210,122 +170,46 @@ struct llama_file::impl {
    }

    size_t tell() const {
-        if (fd == -1) {
-            long ret = std::ftell(fp);
-            if (ret == -1) {
-                throw std::runtime_error(format("ftell error: %s", strerror(errno)));
-            }
-
-            return (size_t) ret;
+// TODO: this ifdef is never true?
+#ifdef _WIN32
+        __int64 ret = _ftelli64(fp);
+#else
+        long ret = std::ftell(fp);
+#endif
+        if (ret == -1) {
+            throw std::runtime_error(format("ftell error: %s", strerror(errno)));
        }

-        off_t pos = lseek(fd, 0, SEEK_CUR);
-        if (pos == -1) {
-            throw std::runtime_error(format("lseek error: %s", strerror(errno)));
-        }
-        return (size_t) pos;
+        return (size_t) ret;
    }

    void seek(size_t offset, int whence) const {
-        off_t ret = 0;
-        if (fd == -1) {
-            ret = std::fseek(fp, (long) offset, whence);
-        } else {
-            ret = lseek(fd, offset, whence);
-        }
-        if (ret == -1) {
+// TODO: this ifdef is never true?
+#ifdef _WIN32
+        int ret = _fseeki64(fp, (__int64) offset, whence);
+#else
+        int ret = std::fseek(fp, (long) offset, whence);
+#endif
+        if (ret != 0) {
            throw std::runtime_error(format("seek error: %s", strerror(errno)));
        }
    }

-    void read_raw_unsafe(void * ptr, size_t len) {
+    void read_raw(void * ptr, size_t len) const {
        if (len == 0) {
            return;
        }
        errno = 0;
-        if (fd == -1) {
-            const size_t curr_off = tell();
-            const size_t to_read = std::min(len, size - curr_off);
-
-            std::size_t ret = std::fread(ptr, to_read, 1, fp);
-            if (ferror(fp)) {
-                throw std::runtime_error(format("read error: %s", strerror(errno)));
-            }
-            if (to_read > 0 && ret != 1) {
-                throw std::runtime_error("unexpectedly reached end of file");
-            }
-        } else {
-            size_t bytes_read = 0;
-            while (bytes_read < len) {
-                const size_t to_read = len - bytes_read;
-                ssize_t ret = ::read(fd, reinterpret_cast<char *>(ptr) + bytes_read, to_read);
-
-                if (ret == -1) {
-                    if (errno == EINTR) {
-                        continue;  // Interrupted by signal, retry
-                    }
-                    // Fallback to std::fread in case the DMA controller cannot access the buffer
-                    if (errno == EFAULT || errno == EINVAL) {
-                        LLAMA_LOG_WARN("%s: Falling back to buffered IO due to %s\n", __func__, strerror(errno));
-                        auto curr_off = tell();
-                        close(fd);
-                        fd = -1;
-                        alignment = 1;
-                        init_fp("rb");
-                        seek(curr_off, SEEK_SET);
-                        read_raw_unsafe(ptr, len);
-                        return;
-                    }
-                    throw std::runtime_error(format("read error: %s", strerror(errno)));
-                }
-                if (ret == 0) {
-                    // EOF: allow if this read was only pulling alignment padding past file end
-                    off_t pos = lseek(fd, 0, SEEK_CUR);
-                    if (pos != -1 && (size_t) pos == size) {
-                        std::memset(reinterpret_cast<char *>(ptr) + bytes_read, 0, len - bytes_read);
-                        return;
-                    }
-                    throw std::runtime_error("unexpectedly reached end of file");
-                }
-
-                bytes_read += (size_t) ret;
-            }
+        std::size_t ret = std::fread(ptr, len, 1, fp);
+        if (ferror(fp)) {
+            throw std::runtime_error(format("read error: %s", strerror(errno)));
+        }
+        if (ret != 1) {
+            throw std::runtime_error("unexpectedly reached end of file");
        }
    }

-    void read_aligned_chunk(void * dest, size_t size) {
-        size_t offset = tell();
-        off_t aligned_offset = offset & ~(alignment - 1);
-        off_t offset_from_alignment = offset - aligned_offset;
-        size_t bytes_to_read = (offset_from_alignment + size + alignment - 1) & ~(alignment - 1);
-
-        void * raw_buffer = nullptr;
-        int ret = posix_memalign(&raw_buffer, alignment, bytes_to_read);
-        if (ret != 0) {
-            throw std::runtime_error(format("posix_memalign failed with error %d", ret));
-        }
-
-        struct aligned_buffer_deleter {
-            void operator()(void * p) const { free(p); }
-        };
-        std::unique_ptr<void, aligned_buffer_deleter> buffer(raw_buffer);
-
-        seek(aligned_offset, SEEK_SET);
-        read_raw_unsafe(buffer.get(), bytes_to_read);
-
-        uintptr_t actual_data = reinterpret_cast<uintptr_t>(buffer.get()) + offset_from_alignment;
-        memcpy(dest, reinterpret_cast<void *>(actual_data), size);
-    }
-
-    void read_raw(void * ptr, size_t len) {
-        if (has_direct_io()) {
-            read_aligned_chunk(ptr, len);
-        } else {
-            read_raw_unsafe(ptr, len);
-        }
-    }
-
-    uint32_t read_u32() {
+    uint32_t read_u32() const {
        uint32_t ret;
        read_raw(&ret, sizeof(ret));
        return ret;
@@ -346,48 +230,27 @@ struct llama_file::impl {
        write_raw(&val, sizeof(val));
    }

-    bool has_direct_io() const {
-        return fd != -1 && alignment > 1;
-    }
-
    ~impl() {
-        if (fd != -1) {
-            close(fd);
-        } else {
+        if (fp) {
            std::fclose(fp);
        }
    }
-    int fd = -1;
-    std::string fname;
 #endif

-    size_t read_alignment() const {
-        return alignment;
-    }
-
-    size_t alignment = 1;
-
-    FILE * fp{};
-    size_t size{};
+    FILE * fp;
+    size_t size;
 };

-llama_file::llama_file(const char * fname, const char * mode, const bool use_direct_io) :
-    pimpl(std::make_unique<impl>(fname, mode, use_direct_io)) {}
+llama_file::llama_file(const char * fname, const char * mode) : pimpl(std::make_unique<impl>(fname, mode)) {}
 llama_file::~llama_file() = default;

 size_t llama_file::tell() const { return pimpl->tell(); }
 size_t llama_file::size() const { return pimpl->size; }

-size_t llama_file::read_alignment() const { return pimpl->read_alignment(); }
-bool llama_file::has_direct_io() const { return pimpl->has_direct_io(); }
-
 int llama_file::file_id() const {
 #ifdef _WIN32
    return _fileno(pimpl->fp);
 #else
-    if (pimpl->fd != -1) {
-        return pimpl->fd;
-    }
 #if defined(fileno)
    return fileno(pimpl->fp);
 #else
@@ -397,14 +260,9 @@ int llama_file::file_id() const {
 }

 void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); }
-void llama_file::read_raw(void * ptr, size_t len) { pimpl->read_raw(ptr, len); }
-#ifdef _WIN32
-void llama_file::read_raw_unsafe(void * ptr, size_t len) { pimpl->read_raw(ptr, len); }
-#else
-void llama_file::read_raw_unsafe(void * ptr, size_t len) { pimpl->read_raw_unsafe(ptr, len); }
-#endif
+void llama_file::read_raw(void * ptr, size_t len) const { pimpl->read_raw(ptr, len); }

-uint32_t llama_file::read_u32() { return pimpl->read_u32(); }
+uint32_t llama_file::read_u32() const { return pimpl->read_u32(); }

 void llama_file::write_raw(const void * ptr, size_t len) const { pimpl->write_raw(ptr, len); }
 void llama_file::write_u32(uint32_t val) const { pimpl->write_u32(val); }
@@ -618,9 +476,9 @@ struct llama_mlock::impl {

        char* errmsg = std::strerror(errno);
        bool suggest = (errno == ENOMEM);
-#if defined(TARGET_OS_VISION) || defined(TARGET_OS_TV) || defined(_AIX) || defined(__HAIKU__)
-        // visionOS/tvOS/Haiku don't support RLIMIT_MEMLOCK
-        // Skip resource limit checks on these platforms
+#if defined(TARGET_OS_VISION) || defined(TARGET_OS_TV) || defined(_AIX)
+        // visionOS/tvOS don't support RLIMIT_MEMLOCK
+        // Skip resource limit checks on visionOS/tvOS
        suggest = false;
 #else
        struct rlimit lock_limit;
--- a/llama/llama.cpp/src/llama-mmap.h
+++ b/llama/llama.cpp/src/llama-mmap.h
@@ -3,7 +3,6 @@
 #include <cstdint>
 #include <memory>
 #include <vector>
-#include <cstdio>

 struct llama_file;
 struct llama_mmap;
@@ -14,7 +13,7 @@ using llama_mmaps  = std::vector<std::unique_ptr<llama_mmap>>;
 using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;

 struct llama_file {
-    llama_file(const char * fname, const char * mode, bool use_direct_io = false);
+    llama_file(const char * fname, const char * mode);
    ~llama_file();

    size_t tell() const;
@@ -24,16 +23,12 @@ struct llama_file {

    void seek(size_t offset, int whence) const;

-    void read_raw(void * ptr, size_t len);
-    void read_raw_unsafe(void * ptr, size_t len);
-    void read_aligned_chunk(void * dest, size_t size);
-    uint32_t read_u32();
+    void read_raw(void * ptr, size_t len) const;
+    uint32_t read_u32() const;

    void write_raw(const void * ptr, size_t len) const;
    void write_u32(uint32_t val) const;

-    size_t read_alignment() const;
-    bool has_direct_io() const;
 private:
    struct impl;
    std::unique_ptr<impl> pimpl;
--- a/llama/llama.cpp/src/llama-model-loader.cpp
+++ b/llama/llama.cpp/src/llama-model-loader.cpp
@@ -2,7 +2,6 @@

 #include "ggml.h"

-#include <algorithm>
 #include <array>
 #include <cinttypes>
 #include <cstring>
@@ -345,7 +344,6 @@ namespace GGUFMeta {
            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx, kid);

        switch (arr_info.gt) {
-            case GGUF_TYPE_BOOL:
            case GGUF_TYPE_UINT32:
            case GGUF_TYPE_INT32:   GGML_ASSERT((std::is_same<T,     int32_t>::value) ||
                                                (std::is_same<T,    uint32_t>::value)); break;
@@ -367,13 +365,7 @@ namespace GGUFMeta {
                result[i] = value;
            }
        } else {
-            if (arr_info.gt == GGUF_TYPE_BOOL) {
-                std::transform((const bool *)arr_info.data, (const bool *)arr_info.data + arr_info.length, result.begin(), [](bool x) {
-                    return static_cast<T>(x);
-                });
-            } else {
-                std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin());
-            }
+            std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin());
        }

        return true;
@@ -470,29 +462,6 @@ namespace GGUFMeta {
        return get_key_or_arr(llm_kv(kid), result, n, required);
    }

-    bool llama_model_loader::get_key_or_arr(enum llm_kv kid, uint32_t & result, bool required) {
-        const std::string key = llm_kv(kid);
-
-        const int id = gguf_find_key(meta.get(), key.c_str());
-
-        if (id < 0) {
-            if (required) {
-                throw std::runtime_error(format("key not found in model: %s", key.c_str()));
-            }
-            return false;
-        }
-
-        // throw and error if type is an array
-        if (gguf_get_kv_type(meta.get(), id) == GGUF_TYPE_ARRAY) {
-            if (required) {
-                throw std::runtime_error(format("expected scalar, found array for key: %s", key.c_str()));
-            }
-            return false;
-        }
-
-        return get_key(key, result, required);
-    }
-
    // TODO: this is not very clever - figure out something better
    template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
    template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
@@ -503,7 +472,6 @@ llama_model_loader::llama_model_loader(
        const std::string & fname,
        std::vector<std::string> & splits,
        bool use_mmap,
-        bool use_direct_io,
        bool check_tensors,
        bool no_alloc,
        const llama_model_kv_override * param_overrides_p,
@@ -536,23 +504,9 @@ llama_model_loader::llama_model_loader(
    get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
    llm_kv = LLM_KV(llm_arch_from_string(arch_name));

-    files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
+    files.emplace_back(new llama_file(fname.c_str(), "rb"));
    contexts.emplace_back(ctx);

-    if (use_mmap && use_direct_io) {
-        if (files.back()->has_direct_io()) {
-            // Disable mmap, as DirectIO is available
-            use_mmap = false;
-            LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
-        } else {
-            // Disable DirectIO and reopen file using std::fopen for mmap
-            use_direct_io = false;
-            files.pop_back();
-            files.emplace_back(new llama_file(fname.c_str(), "rb", false));
-            LLAMA_LOG_WARN("%s: direct I/O is not available, using mmap\n", __func__);
-        }
-    }
-
    // Save tensors data offset of the main file.
    // For subsidiary files, `meta` tensor data offset must not be used,
    // so we build a unified tensors index for weights.
@@ -618,7 +572,7 @@ llama_model_loader::llama_model_loader(
                }
            }

-            files.emplace_back(new llama_file(fname_split, "rb", use_direct_io));
+            files.emplace_back(new llama_file(fname_split, "rb"));
            contexts.emplace_back(ctx);

            // Save tensors data offset info of the shard.
@@ -762,7 +716,6 @@ llama_model_loader::llama_model_loader(
    }

    this->use_mmap = use_mmap;
-    this->use_direct_io = use_direct_io;
    this->check_tensors = check_tensors;
    this->no_alloc = no_alloc;
 }
@@ -982,15 +935,7 @@ bool llama_model_loader::load_all_data(
    // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
    // NVMe raid configurations might require more / larger buffers.
    constexpr size_t n_buffers = 4;
-
-    size_t alignment = 1;
-    for (const auto & file : files) {
-        alignment = std::max(file->read_alignment(), alignment);
-    }
-
-    // Buffer size: balance between memory usage and I/O efficiency
-    // 64MB works well for NVMe drives
-    const size_t buffer_size = alignment != 1 ? 64 * 1024 * 1024 + 2 * alignment : 1 * 1024 * 1024;
+    constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB

    std::vector<ggml_backend_buffer_t> host_buffers;
    std::vector<ggml_backend_event_t> events;
@@ -1040,7 +985,6 @@ bool llama_model_loader::load_all_data(
        // If the backend is supported, create pinned memory buffers and events for synchronisation.
        for (size_t idx = 0; idx < n_buffers; ++idx) {
            auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size);
-
            if (!buf) {
                LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func,
                    ggml_backend_dev_name(dev));
@@ -1122,7 +1066,6 @@ bool llama_model_loader::load_all_data(
            }
        } else {
            const auto & file = files.at(weight->idx);
-
            if (ggml_backend_buffer_is_host(cur->buffer)) {
                file->seek(weight->offs, SEEK_SET);
                file->read_raw(cur->data, n_size);
@@ -1134,54 +1077,19 @@ bool llama_model_loader::load_all_data(
            } else {
                // If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
                if (upload_backend) {
-                    size_t offset = weight->offs;
-                    alignment = file->read_alignment();
-                    size_t aligned_offset = offset & ~(alignment - 1);
-                    size_t offset_from_alignment = offset - aligned_offset;
-                    file->seek(aligned_offset, SEEK_SET);
-
-                    // Calculate aligned read boundaries
-                    size_t read_start = aligned_offset;
-                    size_t read_end = (offset + n_size + alignment - 1) & ~(alignment - 1);
+                    file->seek(weight->offs, SEEK_SET);

                    size_t bytes_read = 0;
-                    size_t data_read = 0;  // Actual tensor data copied (excluding padding)

-                    while (bytes_read < read_end - read_start) {
-                        size_t read_size = std::min<size_t>(buffer_size, read_end - read_start - bytes_read);
+                    while (bytes_read < n_size) {
+                        size_t read_iteration = std::min<size_t>(buffer_size, n_size - bytes_read);

-                        // Align the destination pointer within the pinned buffer
-                        uintptr_t ptr_dest_aligned = (reinterpret_cast<uintptr_t>(host_ptrs[buffer_idx]) + alignment - 1) & ~(alignment - 1);
-
-                        // Wait for previous upload to complete before reusing buffer
                        ggml_backend_event_synchronize(events[buffer_idx]);
-
-                        // Read aligned chunk from file
-                        file->read_raw_unsafe(reinterpret_cast<void *>(ptr_dest_aligned), read_size);
-
-                        // Calculate actual data portion (excluding alignment padding)
-                        uintptr_t ptr_data = ptr_dest_aligned;
-                        size_t data_to_copy = read_size;
-
-                        // Skip alignment padding at start of first chunk
-                        if (bytes_read == 0) {
-                            ptr_data += offset_from_alignment;
-                            data_to_copy -= offset_from_alignment;
-                        }
-
-                        // Trim alignment padding at end of last chunk
-                        if (aligned_offset + bytes_read + read_size > offset + n_size) {
-                            data_to_copy -= (read_end - (offset + n_size));
-                        }
-
-                        // Async upload actual data to GPU
-                        ggml_backend_tensor_set_async(upload_backend, cur,
-                                                      reinterpret_cast<void *>(ptr_data), data_read, data_to_copy);
+                        file->read_raw(host_ptrs[buffer_idx], read_iteration);
+                        ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
                        ggml_backend_event_record(events[buffer_idx], upload_backend);

-                        data_read += data_to_copy;
-                        bytes_read += read_size;
-
+                        bytes_read += read_iteration;
                        ++buffer_idx;
                        buffer_idx %= n_buffers;
                    }
--- a/llama/llama.cpp/src/llama-model-loader.h
+++ b/llama/llama.cpp/src/llama-model-loader.h
@@ -70,7 +70,6 @@ struct llama_model_loader {
    size_t   n_bytes    = 0;

    bool use_mmap = false;
-    bool use_direct_io = false;
    bool check_tensors;
    bool no_alloc;

@@ -98,7 +97,6 @@ struct llama_model_loader {
        const std::string & fname,
        std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
        bool use_mmap,
-        bool use_direct_io,
        bool check_tensors,
        bool no_alloc,
        const llama_model_kv_override * param_overrides_p,
@@ -133,8 +131,6 @@ struct llama_model_loader {
    template<typename T>
    bool get_key_or_arr(enum llm_kv kid, T & result, uint32_t n, bool required = true);

-    bool get_key_or_arr(enum llm_kv kid, uint32_t & result, bool required = true);
-
    std::string get_arch_name() const;

    enum llm_arch get_arch() const;
--- a/llama/llama.cpp/src/llama-model-saver.cpp
+++ b/llama/llama.cpp/src/llama-model-saver.cpp
@@ -146,9 +146,6 @@ void llama_model_saver::add_kv_from_model() {
    add_kv(LLM_KV_VOCAB_SIZE,                        vocab.n_tokens());
    add_kv(LLM_KV_CONTEXT_LENGTH,                    hparams.n_ctx_train);
    add_kv(LLM_KV_EMBEDDING_LENGTH,                  hparams.n_embd);
-    if (hparams.n_embd_out_impl > 0) {
-        add_kv(LLM_KV_EMBEDDING_LENGTH_OUT,          hparams.n_embd_out_impl);
-    }
    add_kv(LLM_KV_BLOCK_COUNT,                       hparams.n_layer);
    add_kv(LLM_KV_LEADING_DENSE_BLOCK_COUNT,         hparams.n_layer_dense_lead);
    add_kv(LLM_KV_FEED_FORWARD_LENGTH,               hparams.n_ff_arr, true);
--- a/llama/llama.cpp/src/llama-model.cpp
+++ b/llama/llama.cpp/src/llama-model.cpp
--- a/llama/llama.cpp/src/llama-model.h
+++ b/llama/llama.cpp/src/llama-model.h
@@ -11,7 +11,6 @@
 #include <memory>
 #include <string>
 #include <unordered_map>
-#include <unordered_set>
 #include <vector>

 struct llama_cparams;
@@ -25,14 +24,12 @@ enum llm_type {
    LLM_TYPE_17M,
    LLM_TYPE_22M,
    LLM_TYPE_33M,
-    LLM_TYPE_47M,
    LLM_TYPE_60M,
    LLM_TYPE_70M,
    LLM_TYPE_80M,
    LLM_TYPE_109M,
    LLM_TYPE_137M,
    LLM_TYPE_140M,
-    LLM_TYPE_149M,
    LLM_TYPE_160M,
    LLM_TYPE_190M,
    LLM_TYPE_220M,
@@ -42,7 +39,6 @@ enum llm_type {
    LLM_TYPE_335M,
    LLM_TYPE_350M,
    LLM_TYPE_360M,
-    LLM_TYPE_395M,
    LLM_TYPE_410M,
    LLM_TYPE_450M,
    LLM_TYPE_475M,
@@ -121,12 +117,10 @@ enum llm_type {
    LLM_TYPE_31B_A3_5B,
    LLM_TYPE_80B_A3B, // Qwen3 Next
    LLM_TYPE_100B_A6B,
-    LLM_TYPE_102B_A12B, // Solar-Open
    LLM_TYPE_106B_A12B, // GLM-4.5-Air
    LLM_TYPE_230B_A10B, // Minimax M2
    LLM_TYPE_235B_A22B,
    LLM_TYPE_300B_A47B, // Ernie MoE big
-    LLM_TYPE_310B_A15B, // /MiMo-V2-Flash
    LLM_TYPE_355B_A32B, // GLM-4.5
    LLM_TYPE_E2B,
    LLM_TYPE_E4B,
@@ -471,6 +465,8 @@ struct llama_model {
    struct ggml_tensor * dense_2_out_layers = nullptr;
    struct ggml_tensor * dense_3_out_layers = nullptr;

+    llama_model_params params;
+
    // gguf metadata
    std::unordered_map<std::string, std::string> gguf_kv;

@@ -480,9 +476,6 @@ struct llama_model {
    // for quantize-stats only
    std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;

-    // for keeping track of associated LoRA adapters
-    std::unordered_set<llama_adapter_lora *> loras;
-
    int64_t t_load_us  = 0;
    int64_t t_start_us = 0;

@@ -504,9 +497,6 @@ struct llama_model {
    size_t n_tensors() const;
    size_t n_devices() const;

-    uint32_t n_gpu_layers() const;
-    llama_split_mode split_mode() const;
-
    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const;

    // total number of parameters in the model
@@ -535,8 +525,6 @@ struct llama_model {
    ggml_cgraph * build_graph(const llm_graph_params & params) const;

 private:
-    llama_model_params params;
-
    struct impl;
    std::unique_ptr<impl> pimpl;
 };
--- a/llama/llama.cpp/src/llama-quant.cpp
+++ b/llama/llama.cpp/src/llama-quant.cpp
@@ -422,6 +422,57 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
        ++qs.i_ffn_up;
    }

+    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+    //}
+    // IK: let's remove this, else Q2_K is almost the same as Q3_K_S
+    //else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
+    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+    //}
+    // This can be used to reduce the size of the Q5_K_S model.
+    // The associated PPL increase is fully in line with the size reduction
+    //else {
+    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
+    //}
+    bool convert_incompatible_tensor = false;
+    {
+        const int64_t nx = tensor->ne[0];
+        const int64_t ny = tensor->ne[1];
+        const int64_t qk_k = ggml_blck_size(new_type);
+
+        if (nx % qk_k != 0) {
+            LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type));
+            convert_incompatible_tensor = true;
+        } else {
+            ++qs.n_k_quantized;
+        }
+    }
+
+    if (convert_incompatible_tensor) {
+        switch (new_type) {
+            case GGML_TYPE_TQ1_0:
+            case GGML_TYPE_TQ2_0:  new_type = GGML_TYPE_Q4_0; break;  // TODO: use a symmetric type instead
+            case GGML_TYPE_IQ2_XXS:
+            case GGML_TYPE_IQ2_XS:
+            case GGML_TYPE_IQ2_S:
+            case GGML_TYPE_IQ3_XXS:
+            case GGML_TYPE_IQ3_S:
+            case GGML_TYPE_IQ1_S:
+            case GGML_TYPE_IQ1_M:
+            case GGML_TYPE_Q2_K:
+            case GGML_TYPE_Q3_K:
+            case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
+            case GGML_TYPE_Q4_K:   new_type = GGML_TYPE_Q5_0;   break;
+            case GGML_TYPE_Q5_K:   new_type = GGML_TYPE_Q5_1;   break;
+            case GGML_TYPE_Q6_K:   new_type = GGML_TYPE_Q8_0;   break;
+            default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
+        }
+        if (tensor->ne[0] % ggml_blck_size(new_type) != 0) {
+            new_type = GGML_TYPE_F16;
+        }
+        LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
+        ++qs.n_fallback;
+    }
+
    return new_type;
 }

@@ -545,7 +596,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
    }

    std::vector<std::string> splits = {};
-    llama_model_loader ml(fname_inp, splits, use_mmap, /*use_direct_io*/ true, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
+    llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
    ml.init_mappings(false); // no prefetching

    llama_model model(llama_model_default_params());
@@ -824,69 +875,21 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::

            // get more optimal quantization type based on the tensor shape, layer, etc.
            if (!params->pure && ggml_is_quantized(default_type)) {
-                // if the user provided tensor types - use those
-                bool manual = false;
-                if (params->tensor_types) {
+                int fallback = qs.n_fallback;
+                new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
+                // unless the user specifies a type, and the tensor geometry will not require fallback quantisation
+                if (params->tensor_types && qs.n_fallback - fallback == 0) {
                    const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
                    const std::string tensor_name(tensor->name);
                    for (const auto & [tname, qtype] : tensor_types) {
                        if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
                            if  (qtype != new_type) {
-                                LLAMA_LOG_WARN("(manual override: %s -> %s) ", ggml_type_name(new_type), ggml_type_name(qtype));
+                                LLAMA_LOG_DEBUG("(overriding %s) ", ggml_type_name(new_type));
                                new_type = qtype; // if two or more types are specified for the same tensor, the last match wins
-                                manual = true;
-                                break;
                            }
                        }
                    }
                }
-
-                // if not manual - use the standard logic for choosing the quantization type based on the selected mixture
-                if (!manual) {
-                    new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
-                }
-
-                // incompatible tensor shapes are handled here - fallback to a compatible type
-                {
-                    bool convert_incompatible_tensor = false;
-
-                    const int64_t nx = tensor->ne[0];
-                    const int64_t ny = tensor->ne[1];
-                    const int64_t qk_k = ggml_blck_size(new_type);
-
-                    if (nx % qk_k != 0) {
-                        LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type));
-                        convert_incompatible_tensor = true;
-                    } else {
-                        ++qs.n_k_quantized;
-                    }
-
-                    if (convert_incompatible_tensor) {
-                        switch (new_type) {
-                            case GGML_TYPE_TQ1_0:
-                            case GGML_TYPE_TQ2_0:  new_type = GGML_TYPE_Q4_0; break;  // TODO: use a symmetric type instead
-                            case GGML_TYPE_IQ2_XXS:
-                            case GGML_TYPE_IQ2_XS:
-                            case GGML_TYPE_IQ2_S:
-                            case GGML_TYPE_IQ3_XXS:
-                            case GGML_TYPE_IQ3_S:
-                            case GGML_TYPE_IQ1_S:
-                            case GGML_TYPE_IQ1_M:
-                            case GGML_TYPE_Q2_K:
-                            case GGML_TYPE_Q3_K:
-                            case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
-                            case GGML_TYPE_Q4_K:   new_type = GGML_TYPE_Q5_0;   break;
-                            case GGML_TYPE_Q5_K:   new_type = GGML_TYPE_Q5_1;   break;
-                            case GGML_TYPE_Q6_K:   new_type = GGML_TYPE_Q8_0;   break;
-                            default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
-                        }
-                        if (tensor->ne[0] % ggml_blck_size(new_type) != 0) {
-                            new_type = GGML_TYPE_F16;
-                        }
-                        LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
-                        ++qs.n_fallback;
-                    }
-                }
            }
            if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
                new_type = params->token_embedding_type;
--- a/llama/llama.cpp/src/llama-sampling.cpp
+++ b/llama/llama.cpp/src/llama-sampling.cpp
--- a/llama/llama.cpp/src/llama-sampling.h
+++ b/llama/llama.cpp/src/llama-sampling.h
@@ -14,19 +14,7 @@ struct llama_grammar;
 struct llama_sampler_chain {
    llama_sampler_chain_params params;

-    // has .backend_init() been called?
-    bool is_init = false;
-
-    struct info {
-        bool is_backend;
-
-        llama_sampler * ptr;
-    };
-
-    std::vector<info> samplers;
-
-    // pre-allocated buffer for llama_sampler_sample to avoid repeated allocations
-    std::vector<llama_token_data> cur;
+    std::vector<struct llama_sampler *> samplers;

    // timing

@@ -36,9 +24,9 @@ struct llama_sampler_chain {
 };

 struct llama_sampler * llama_sampler_init_dry_testing(
-        int32_t context_size,
-        float   dry_multiplier,
-        float   dry_base,
-        int32_t dry_allowed_length,
-        int32_t dry_penalty_last_n,
-        const std::vector<std::vector<llama_token>> & seq_breakers);
+                         int32_t   context_size,
+                           float   dry_multiplier,
+                           float   dry_base,
+                         int32_t   dry_allowed_length,
+                         int32_t   dry_penalty_last_n,
+  const std::vector<std::vector<llama_token>>& seq_breakers);
--- a/llama/llama.cpp/src/llama-vocab.cpp
+++ b/llama/llama.cpp/src/llama-vocab.cpp
@@ -314,12 +314,6 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                    "[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\r\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\r\n]*|\\s*[\r\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
-            case LLAMA_VOCAB_PRE_TYPE_YOUTU:
-                regex_exprs = {
-                    "[가-힣ㄱ-ㆎ]+|[！…“”‘’—：；，、-〿︰-﹏]+|[ㄅ-ㄯ]+|[一-龥぀-ゟ゠-ヿ]+",
-                    "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
-                };
-                break;
            case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
                regex_exprs = {
                    "[\r\n]",
@@ -361,7 +355,6 @@ struct llm_tokenizer_bpe : llm_tokenizer {
            case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
            case LLAMA_VOCAB_PRE_TYPE_QWEN2:
            case LLAMA_VOCAB_PRE_TYPE_HUNYUAN:
-            case LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN:
                regex_exprs = {
                    // original regex from tokenizer.json
                    // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
@@ -461,13 +454,6 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                    "[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\\r\\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
-            case LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE:
-                regex_exprs = {
-                    // original regex from tokenizer.json
-                    // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+"
-                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
-                };
-                break;
            default:
                // default regex for BPE tokenization pre-processing
                regex_exprs = {
@@ -1863,11 +1849,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                    tokenizer_pre == "deepseek-v3") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM;
                clean_spaces = false;
-            } else if (
-                    tokenizer_pre == "youtu") {
-                pre_type = LLAMA_VOCAB_PRE_TYPE_YOUTU;
-                clean_spaces = false;
-                ignore_merges = true;
            } else if (
                    tokenizer_pre == "falcon") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_FALCON;
@@ -1886,8 +1867,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                    tokenizer_pre == "jina-v2-es" ||
                    tokenizer_pre == "jina-v2-de" ||
                    tokenizer_pre == "a.x-4.0" ||
-                    tokenizer_pre == "mellum"  ||
-                    tokenizer_pre == "modern-bert" ) {
+                    tokenizer_pre == "mellum") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
            } else if (
                    tokenizer_pre == "jina-v1-en" ||
@@ -1961,9 +1941,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
            } else if (
                tokenizer_pre == "exaone4") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
-            } else if (
-                tokenizer_pre == "exaone-moe") {
-                pre_type = LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE;
            } else if (
                tokenizer_pre == "chameleon") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
@@ -2026,10 +2003,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                tokenizer_pre == "minimax-m2") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2;
                clean_spaces = false;
-            } else if (
-                tokenizer_pre == "solar-open") {
-                pre_type = LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN;
-                clean_spaces = false;
            } else {
                LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
                pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
@@ -2076,7 +2049,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
        scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
    }

-    const uint32_t n_scores = score_idx != -1 ? gguf_get_arr_n(ctx, score_idx) : 0;
    const int * toktypes = nullptr;
    const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
    if (toktype_idx != -1) {
@@ -2098,7 +2070,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {

        auto & token_data = id_to_token[i];
        token_data.text  = std::move(word);
-        token_data.score = (scores && i < n_scores) ? scores[i] : 0.0f;
+        token_data.score = scores ? scores[i] : 0.0f;
        token_data.attr  = LLAMA_TOKEN_ATTR_NORMAL;

        if (toktypes) {  //TODO: remove, required until per token attributes are available from GGUF file
@@ -2204,8 +2176,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
        //       for now, we apply this workaround to find the tokens based on their text

        for (const auto & t : token_to_id) {
-            auto & attr = id_to_token[t.second].attr;
-
            // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
            if (special_eot_id == LLAMA_TOKEN_NULL) {
                if (false
@@ -2221,10 +2191,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                        || t.first == "<end_of_utterance>" // smoldocling
                   ) {
                    special_eot_id = t.second;
-                    if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                                __func__, t.second, t.first.c_str());
-                        attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
+                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                    }
                }
            }
@@ -2235,10 +2205,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                        || t.first == "<|eom_id|>"
                        ) {
                    special_eom_id = t.second;
-                    if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                                __func__, t.second, t.first.c_str());
-                        attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
+                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                    }
                }
            }
@@ -2255,10 +2225,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                        || t.first == "<|code_prefix|>" // GLM-4.5
                        ) {
                    special_fim_pre_id = t.second;
-                    if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                                __func__, t.second, t.first.c_str());
-                        attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
+                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                    }
                }
            }
@@ -2275,10 +2245,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                        || t.first == "<|code_suffix|>" // GLM-4.5
                        ) {
                    special_fim_suf_id = t.second;
-                    if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                                __func__, t.second, t.first.c_str());
-                        attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
+                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                    }
                }
            }
@@ -2295,10 +2265,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                        || t.first == "<|code_middle|>" // GLM-4.5
                        ) {
                    special_fim_mid_id = t.second;
-                    if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                                __func__, t.second, t.first.c_str());
-                        attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
+                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                    }
                }
            }
@@ -2312,10 +2282,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                        || t.first == "<PAD>"
                        ) {
                    special_fim_pad_id = t.second;
-                    if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                                __func__, t.second, t.first.c_str());
-                        attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
+                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                    }
                }
            }
@@ -2330,10 +2300,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                        || t.first == "<reponame>"    // Granite
                        ) {
                    special_fim_rep_id = t.second;
-                    if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                                __func__, t.second, t.first.c_str());
-                        attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
+                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                    }
                }
            }
@@ -2344,41 +2314,15 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                        || t.first == "<|file_sep|>" // Qwen
                        ) {
                    special_fim_sep_id = t.second;
-                    if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                                __func__, t.second, t.first.c_str());
-                        attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
+                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                    }
                }
            }
        }

-        // auto-detect unused tokens: e.g. control tokens with the word "unused"
-        // ideally, these tokens should be marked as unused during conversion
-        {
-            uint32_t n_unused = 0;
-
-            for (const auto & t : token_to_id) {
-                auto & attr = id_to_token[t.second].attr;
-
-                if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
-                    continue;
-                }
-
-                if ((attr & LLAMA_TOKEN_ATTR_UNUSED) == 0) {
-                    if (strstr(t.first.c_str(), "unused") != NULL) {
-                        attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_UNUSED);
-                    }
-                }
-
-                if (attr & LLAMA_TOKEN_ATTR_UNUSED) {
-                    n_unused++;
-                }
-            }
-
-            LLAMA_LOG_INFO("%s: %u unused tokens\n", __func__, n_unused);
-        }
-
        // maintain a list of tokens that cause end-of-generation
        // this is currently determined based on the token text, which is obviously not ideal
        // ref: https://github.com/ggerganov/llama.cpp/issues/9606
@@ -2397,16 +2341,12 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
        }

        for (const auto & t : token_to_id) {
-            auto & attr = id_to_token[t.second].attr;
-
            if (false
                    || t.first == "<|eot_id|>"
                    || t.first == "<|im_end|>"
                    || t.first == "<|end|>"
                    || t.first == "<|return|>" // o200k_harmony
                    || t.first == "<|call|>"   // o200k_harmony
-                    || t.first == "<|flush|>"  // solar-open
-                    || t.first == "<|calls|>"  // solar-open
                    || t.first == "<end_of_turn>"
                    || t.first == "<|endoftext|>"
                    || t.first == "<|eom_id|>"
@@ -2416,31 +2356,24 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                    || t.first == "<end_of_utterance>" // smoldocling
               ) {
                special_eog_ids.insert(t.second);
-                if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                    LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                            __func__, t.second, t.first.c_str());
-                    attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
+                    id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                }
            } else {
-                if (attr & LLAMA_TOKEN_ATTR_CONTROL && !(attr & LLAMA_TOKEN_ATTR_UNUSED)) {
-                    // token is control, but not marked as EOG -> print a debug log
-                    if (special_eog_ids.count(t.second) == 0) {
-                        LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
-                                __func__, t.second, t.first.c_str());
-                    }
+                // token is control, but not marked as EOG -> print a debug log
+                if (id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && special_eog_ids.count(t.second) == 0) {
+                    LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
+                            __func__, t.second, t.first.c_str());
                }
            }
        }

        // @ngxson : quick hack for gpt-oss, always render these tokens
        for (const auto & t : token_to_id) {
-            auto & attr = id_to_token[t.second].attr;
-
            if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>" || t.first == "<|constrain|>") {
-                LLAMA_LOG_WARN("%s: setting token '%s' (%d) attribute to USER_DEFINED (%u), old attributes: %u\n",
-                        __func__, t.first.c_str(), t.second, LLAMA_TOKEN_ATTR_USER_DEFINED, attr);
-
-                attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
+                id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
            }
        }

@@ -2460,42 +2393,34 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
            LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
        }

-        // TODO: workaround for o200k_harmony and solar-open tokenizer: the "<|end|>" token should not be EOG
-        //       we don't have a good way to detect this, so for now, if we have "<|return|>" and "<|call|>" tokens ("<|calls|>" and "<|flush|>" for solar-open),
+        // TODO: workaround for o200k_harmony tokenizer: the "<|end|>" token should not be EOG
+        //       we don't have a good way to detect this, so for now, if we have "<|return|>" and "<|call|>" tokens,
        //       we remove the "<|end|>" token from the EOG list
        {
            bool has_return = false;
            bool has_call   = false;
            bool has_end    = false;
-            bool has_flush  = false;

            llama_token end_id = LLAMA_TOKEN_NULL;

            LLAMA_LOG_INFO("%s: printing all EOG tokens:\n", __func__);
            for (auto tid : special_eog_ids) {
-                auto & text = id_to_token[tid].text;
+                LLAMA_LOG_INFO("%s:   - %d ('%s')\n", __func__, tid, id_to_token[tid].text.c_str());

-                LLAMA_LOG_INFO("%s:   - %d ('%s')\n", __func__, tid, text.c_str());
-
-                if (text == "<|return|>") {
+                if (id_to_token[tid].text == "<|return|>") {
                    has_return = true;
-                } else if (text == "<|call|>" || text == "<|calls|>") {
+                } else if (id_to_token[tid].text == "<|call|>") {
                    has_call = true;
-                } else if (text == "<|flush|>") {
-                    has_flush = true;
-                } else if (text == "<|end|>") {
+                } else if (id_to_token[tid].text == "<|end|>") {
                    has_end = true;
                    end_id = tid;
                }
            }

-            if ((has_return && has_call && has_end) || (has_call && has_flush && has_end)) {
+            if (has_return && has_call && has_end) {
                special_eog_ids.erase(end_id);
-
-                auto & attr = id_to_token[end_id].attr;
-                attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
-
-                LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>', or '<|calls|>' and '<|flush|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
+                id_to_token[end_id].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
+                LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
            }
        }
    }
@@ -2593,13 +2518,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
            for (const auto * token : {"<unk>", "<s>", "<|endoftext|>"}) {
                _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
            }
-        } else if (_contains_any(model_name, {"modern-bert"})) {
-            if (token_to_id.count("[MASK]") == 0 ) {
-                LLAMA_LOG_WARN("%s: Mask token missing in vocab!\n", __func__);
-            }
-            else {
-                _set_token_attr("[MASK]", LLAMA_TOKEN_ATTR_LSTRIP, true);
-            }
        }
    }
 }
@@ -3293,34 +3211,34 @@ int32_t llama_vocab::impl::detokenize(
 }

 void llama_vocab::impl::print_info() const {
-    LLAMA_LOG_INFO("%s: vocab type            = %s\n",     __func__, type_name().c_str());
-    LLAMA_LOG_INFO("%s: n_vocab               = %u\n",     __func__, vocab.n_tokens());
-    LLAMA_LOG_INFO("%s: n_merges              = %u\n",     __func__, (uint32_t) bpe_ranks.size());
+    LLAMA_LOG_INFO("%s: vocab type       = %s\n",     __func__, type_name().c_str());
+    LLAMA_LOG_INFO("%s: n_vocab          = %u\n",     __func__, vocab.n_tokens());
+    LLAMA_LOG_INFO("%s: n_merges         = %u\n",     __func__, (uint32_t) bpe_ranks.size());

    // special tokens
-    if (special_bos_id  != LLAMA_TOKEN_NULL)    { LLAMA_LOG_INFO( "%s: BOS token             = %d '%s'\n", __func__, special_bos_id,     id_to_token.at(special_bos_id).text.c_str() );  }
-    if (special_eos_id  != LLAMA_TOKEN_NULL)    { LLAMA_LOG_INFO( "%s: EOS token             = %d '%s'\n", __func__, special_eos_id,     id_to_token.at(special_eos_id).text.c_str() );  }
-    if (special_eot_id  != LLAMA_TOKEN_NULL)    { LLAMA_LOG_INFO( "%s: EOT token             = %d '%s'\n", __func__, special_eot_id,     id_to_token.at(special_eot_id).text.c_str() );  }
-    if (special_eom_id  != LLAMA_TOKEN_NULL)    { LLAMA_LOG_INFO( "%s: EOM token             = %d '%s'\n", __func__, special_eom_id,     id_to_token.at(special_eom_id).text.c_str() );  }
-    if (special_unk_id  != LLAMA_TOKEN_NULL)    { LLAMA_LOG_INFO( "%s: UNK token             = %d '%s'\n", __func__, special_unk_id,     id_to_token.at(special_unk_id).text.c_str() );  }
-    if (special_sep_id  != LLAMA_TOKEN_NULL)    { LLAMA_LOG_INFO( "%s: SEP token             = %d '%s'\n", __func__, special_sep_id,     id_to_token.at(special_sep_id).text.c_str() );  }
-    if (special_pad_id  != LLAMA_TOKEN_NULL)    { LLAMA_LOG_INFO( "%s: PAD token             = %d '%s'\n", __func__, special_pad_id,     id_to_token.at(special_pad_id).text.c_str() );  }
-    if (special_mask_id != LLAMA_TOKEN_NULL)    { LLAMA_LOG_INFO( "%s: MASK token            = %d '%s'\n", __func__, special_mask_id,    id_to_token.at(special_mask_id).text.c_str() ); }
+    if (special_bos_id  != LLAMA_TOKEN_NULL)    { LLAMA_LOG_INFO( "%s: BOS token        = %d '%s'\n", __func__, special_bos_id,     id_to_token.at(special_bos_id).text.c_str() );  }
+    if (special_eos_id  != LLAMA_TOKEN_NULL)    { LLAMA_LOG_INFO( "%s: EOS token        = %d '%s'\n", __func__, special_eos_id,     id_to_token.at(special_eos_id).text.c_str() );  }
+    if (special_eot_id  != LLAMA_TOKEN_NULL)    { LLAMA_LOG_INFO( "%s: EOT token        = %d '%s'\n", __func__, special_eot_id,     id_to_token.at(special_eot_id).text.c_str() );  }
+    if (special_eom_id  != LLAMA_TOKEN_NULL)    { LLAMA_LOG_INFO( "%s: EOM token        = %d '%s'\n", __func__, special_eom_id,     id_to_token.at(special_eom_id).text.c_str() );  }
+    if (special_unk_id  != LLAMA_TOKEN_NULL)    { LLAMA_LOG_INFO( "%s: UNK token        = %d '%s'\n", __func__, special_unk_id,     id_to_token.at(special_unk_id).text.c_str() );  }
+    if (special_sep_id  != LLAMA_TOKEN_NULL)    { LLAMA_LOG_INFO( "%s: SEP token        = %d '%s'\n", __func__, special_sep_id,     id_to_token.at(special_sep_id).text.c_str() );  }
+    if (special_pad_id  != LLAMA_TOKEN_NULL)    { LLAMA_LOG_INFO( "%s: PAD token        = %d '%s'\n", __func__, special_pad_id,     id_to_token.at(special_pad_id).text.c_str() );  }
+    if (special_mask_id != LLAMA_TOKEN_NULL)    { LLAMA_LOG_INFO( "%s: MASK token       = %d '%s'\n", __func__, special_mask_id,    id_to_token.at(special_mask_id).text.c_str() ); }

-    if (linefeed_id != LLAMA_TOKEN_NULL)        { LLAMA_LOG_INFO( "%s: LF token              = %d '%s'\n", __func__, linefeed_id,        id_to_token.at(linefeed_id).text.c_str() ); }
+    if (linefeed_id != LLAMA_TOKEN_NULL)        { LLAMA_LOG_INFO( "%s: LF token         = %d '%s'\n", __func__, linefeed_id,        id_to_token.at(linefeed_id).text.c_str() ); }

-    if (special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token         = %d '%s'\n", __func__, special_fim_pre_id, id_to_token.at(special_fim_pre_id).text.c_str() ); }
-    if (special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token         = %d '%s'\n", __func__, special_fim_suf_id, id_to_token.at(special_fim_suf_id).text.c_str() ); }
-    if (special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token         = %d '%s'\n", __func__, special_fim_mid_id, id_to_token.at(special_fim_mid_id).text.c_str() ); }
-    if (special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token         = %d '%s'\n", __func__, special_fim_pad_id, id_to_token.at(special_fim_pad_id).text.c_str() ); }
-    if (special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token         = %d '%s'\n", __func__, special_fim_rep_id, id_to_token.at(special_fim_rep_id).text.c_str() ); }
-    if (special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token         = %d '%s'\n", __func__, special_fim_sep_id, id_to_token.at(special_fim_sep_id).text.c_str() ); }
+    if (special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token    = %d '%s'\n", __func__, special_fim_pre_id, id_to_token.at(special_fim_pre_id).text.c_str() ); }
+    if (special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token    = %d '%s'\n", __func__, special_fim_suf_id, id_to_token.at(special_fim_suf_id).text.c_str() ); }
+    if (special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token    = %d '%s'\n", __func__, special_fim_mid_id, id_to_token.at(special_fim_mid_id).text.c_str() ); }
+    if (special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token    = %d '%s'\n", __func__, special_fim_pad_id, id_to_token.at(special_fim_pad_id).text.c_str() ); }
+    if (special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token    = %d '%s'\n", __func__, special_fim_rep_id, id_to_token.at(special_fim_rep_id).text.c_str() ); }
+    if (special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token    = %d '%s'\n", __func__, special_fim_sep_id, id_to_token.at(special_fim_sep_id).text.c_str() ); }

    for (const auto & id : special_eog_ids) {
-        LLAMA_LOG_INFO( "%s: EOG token             = %d '%s'\n", __func__, id, id_to_token.at(id).text.c_str() );
+        LLAMA_LOG_INFO( "%s: EOG token        = %d '%s'\n", __func__, id, id_to_token.at(id).text.c_str() );
    }

-    LLAMA_LOG_INFO("%s: max token length      = %d\n", __func__, max_token_len);
+    LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, max_token_len);
 }

 llama_vocab::llama_vocab() : pimpl(new impl(*this)) {
--- a/llama/llama.cpp/src/llama-vocab.h
+++ b/llama/llama.cpp/src/llama-vocab.h
@@ -51,9 +51,6 @@ enum llama_vocab_pre_type {
    LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 40,
    LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2      = 41,
    LLAMA_VOCAB_PRE_TYPE_AFMOE           = 42,
-    LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN      = 43,
-    LLAMA_VOCAB_PRE_TYPE_YOUTU           = 44,
-    LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE      = 45,
 };

 struct LLM_KV;
--- a/llama/llama.cpp/src/llama.cpp
+++ b/llama/llama.cpp/src/llama.cpp
@@ -71,9 +71,8 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
    }, &ud);

    llama_model_params mparams_copy = *mparams;
-    mparams_copy.no_alloc  = true;
-    mparams_copy.use_mmap  = false;
-    mparams_copy.use_mlock = false;
+    mparams_copy.no_alloc = true;
+    mparams_copy.use_mmap = false;

    llama_model * model = llama_model_load_from_file(path_model, mparams_copy);
    if (model == nullptr) {
@@ -111,20 +110,8 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
        }
    }
    for (size_t i = 0; i < ret.size(); i++) {
-        size_t free;
-        size_t total;
+        size_t free, total;
        ggml_backend_dev_memory(model->devices[i], &free, &total);
-
-        // devices can return 0 bytes for free and total memory if they do not
-        // have any to report. in this case, we will use the host memory as a fallback
-        // fixes: https://github.com/ggml-org/llama.cpp/issues/18577
-        if (free == 0 && total == 0) {
-            ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-            if (cpu_dev == nullptr) {
-                throw std::runtime_error(format("%s: no CPU backend found", __func__));
-            }
-            ggml_backend_dev_memory(cpu_dev, &free, &total);
-        }
        ret[i].free  = free;
        ret[i].total = total;
    }
@@ -152,15 +139,12 @@ enum layer_fraction_t {
 };
 // this enum is only used in llama_params_fit_impl but needs to be defined outside of it to fix a Windows compilation issue

-class llama_params_fit_exception : public std::runtime_error {
-    using std::runtime_error::runtime_error;
-};
-
 static void llama_params_fit_impl(
        const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
        float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
-        size_t * margins_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
+        size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
    constexpr int64_t MiB = 1024*1024;
+    const int64_t margin = margin_s; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
    typedef std::vector<llama_device_memory_data> dmds_t;
    const llama_model_params default_mparams = llama_model_default_params();

@@ -179,12 +163,6 @@ static void llama_params_fit_impl(
        return;
    }

-    std::vector<int64_t> margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
-    margins.reserve(nd);
-    for (size_t id = 0; id < nd; id++) {
-        margins.push_back(margins_s[id]);
-    }
-
    std::vector<std::string> dev_names;
    {
        dev_names.reserve(nd);
@@ -202,12 +180,11 @@ static void llama_params_fit_impl(
        }
    }

-    int64_t sum_free            = 0;
-    int64_t sum_projected_free  = 0;
-    int64_t sum_projected_used  = 0;
-    int64_t sum_projected_model = 0;
-    std::vector<int64_t> projected_free_per_device;
-    projected_free_per_device.reserve(nd);
+    int64_t sum_total          = 0;
+    int64_t sum_projected_free = 0;
+    int64_t min_projected_free = INT64_MAX;
+    int64_t sum_projected_used = 0;
+    int64_t sum_projected_ctx  = 0;

    if (nd > 1) {
        LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);
@@ -217,106 +194,63 @@ static void llama_params_fit_impl(

        const int64_t projected_used = dmd.mb.total();
        const int64_t projected_free = dmd.free - projected_used;
-        projected_free_per_device.push_back(projected_free);

-        sum_free            += dmd.free;
-        sum_projected_used  += projected_used;
-        sum_projected_free  += projected_free;
-        sum_projected_model += dmd.mb.model;
+        sum_total          += dmd.total;
+        sum_projected_used += projected_used;
+        sum_projected_free += projected_free;
+        min_projected_free  = std::min(min_projected_free, projected_free);
+        sum_projected_ctx  += dmd.mb.context;

        if (nd > 1) {
-            LLAMA_LOG_INFO("%s:   - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
-                __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
+            LLAMA_LOG_INFO("%s:   - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " %s\n",
+                __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, std::abs(projected_free)/MiB,
+                projected_free >= 0 ? "surplus" : "deficit");
        }
    }
-    assert(sum_free >= 0 && sum_projected_used >= 0);
+    assert(sum_total >= 0 && sum_projected_used >= 0 && sum_projected_ctx >= 0);
+    assert(sum_projected_used >= sum_projected_ctx);
    LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
-        __func__, sum_projected_used/MiB, sum_free/MiB);
-    if (nd == 1) {
-        if (projected_free_per_device[0] >= margins[0]) {
+        __func__, sum_projected_used/MiB, sum_total/MiB);
+    if (min_projected_free >= margin) {
+        if (nd == 1) {
            LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
-                __func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
-            return;
-        }
-    } else {
-        bool changes_needed = false;
-        for (size_t id = 0; id < nd; id++) {
-            if (projected_free_per_device[id] < margins[id]) {
-                changes_needed = true;
-                break;
-            }
-        }
-        if (!changes_needed) {
-            LLAMA_LOG_INFO("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
+                __func__, min_projected_free/MiB, margin/MiB);
            return;
        }
+        LLAMA_LOG_INFO("%s: will leave at least %" PRId64 " >= %" PRId64 " MiB of free memory on all devices, no changes needed\n",
+            __func__, min_projected_free/MiB, margin/MiB);
+        return;
    }

    // step 2: try reducing memory use by reducing the context size

    {
-        int64_t global_surplus = sum_projected_free;
-        for (size_t id = 0; id < nd; id++) {
-            global_surplus -= margins[id];
-        }
+        int64_t global_surplus = sum_projected_free - int64_t(nd)*margin;
        if (global_surplus < 0) {
-            if (nd == 1) {
-                LLAMA_LOG_INFO("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n",
-                    __func__, margins[0]/MiB, -global_surplus/MiB);
-            } else {
-                LLAMA_LOG_INFO(
-                    "%s: cannot meet free memory targets on all devices, need to use %" PRId64 " MiB less in total\n",
-                    __func__, -global_surplus/MiB);
-            }
+            LLAMA_LOG_INFO(nd == 1 ?
+                "%s: cannot fulfill margin of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n" :
+                "%s: cannot fulfill margin of %" PRId64 " MiB on all devices, need to use %" PRId64 " MiB less in total\n",
+                __func__, margin/MiB, -global_surplus/MiB);
            if (cparams->n_ctx == 0) {
                if (hp_nct > n_ctx_min) {
-                    int64_t sum_used_target = sum_free;
-                    for (size_t id = 0; id < nd; id++) {
-                        sum_used_target -= margins[id];
-                    }
-                    if (nd > 1) {
-                        // for multiple devices we need to be more conservative in terms of how much context we think can fit:
-                        //   - for dense models only whole layers can be assigned to devices
-                        //   - for MoE models only whole tensors can be assigned to devices, which we estimate to be <= 1/3 of a layer
-                        //   - on average we expect a waste of 0.5 layers/tensors per device
-                        //   - use slightly more than the expected average for nd devices to be safe
-                        const int64_t model_per_layer = sum_projected_model / std::min(uint32_t(mparams->n_gpu_layers), hp_ngl);
-                        sum_used_target -= (nd + 1) * model_per_layer / (hp_nex == 0 ? 2 : 6);
-                    }
-
-                    int64_t sum_projected_used_min_ctx = 0;
-                    cparams->n_ctx = n_ctx_min;
-                    const dmds_t dmds_min_ctx = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
-                    for (const auto & dmd : dmds_min_ctx) {
-                        sum_projected_used_min_ctx += dmd.mb.total();
-                    }
-                    if (sum_used_target > sum_projected_used_min_ctx) {
-                        // linear interpolation between minimum and maximum context size:
-                        cparams->n_ctx += (hp_nct - n_ctx_min) * (sum_used_target - sum_projected_used_min_ctx)
-                            / (sum_projected_used - sum_projected_used_min_ctx);
-                        cparams->n_ctx = std::max(cparams->n_ctx - cparams->n_ctx % 256, n_ctx_min); // round down context for CUDA backend
-
-                        const int64_t bytes_per_ctx = (sum_projected_used - sum_projected_used_min_ctx) / (hp_nct - n_ctx_min);
-                        const int64_t memory_reduction = (hp_nct - cparams->n_ctx) * bytes_per_ctx;
-                        LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
-                            __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
+                    const int64_t bytes_per_ctx = sum_projected_ctx / hp_nct;
+                    const uint32_t ctx_reduction = std::min(
+                        uint32_t((-global_surplus + bytes_per_ctx - 1) / bytes_per_ctx), hp_nct - n_ctx_min);
+                    cparams->n_ctx = hp_nct - ctx_reduction;
+                    const int64_t memory_reduction = ctx_reduction * bytes_per_ctx;
+                    global_surplus += memory_reduction;
+                    LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
+                        __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
+                    if (global_surplus >= 0) {
                        if (nd == 1) {
                            LLAMA_LOG_INFO("%s: entire model can be fit by reducing context\n", __func__);
                            return;
                        }
                        LLAMA_LOG_INFO("%s: entire model should be fit across devices by reducing context\n", __func__);
-                    } else {
-                        const int64_t memory_reduction = sum_projected_used - sum_projected_used_min_ctx;
-                        LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
-                            __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
                    }
                } else {
-                    if (n_ctx_min == UINT32_MAX) {
-                        LLAMA_LOG_INFO("%s: user has requested full context size of %" PRIu32 " -> no change\n", __func__, hp_nct);
-                    } else {
-                        LLAMA_LOG_INFO("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
-                            __func__, hp_nct, n_ctx_min);
-                    }
+                    LLAMA_LOG_INFO("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
+                        __func__, hp_nct, n_ctx_min);
                }
            } else {
                LLAMA_LOG_INFO("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx);
@@ -325,28 +259,32 @@ static void llama_params_fit_impl(
    }

    if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {
-        throw llama_params_fit_exception("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
+        throw std::runtime_error("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
    }
    if (nd > 1) {
        if (!tensor_split) {
-            throw llama_params_fit_exception("did not provide a buffer to write the tensor_split to, abort");
+            throw std::runtime_error("did not provide a buffer to write the tensor_split to, abort");
        }
        if (mparams->tensor_split) {
            for (size_t id = 0; id < nd; id++) {
                if (mparams->tensor_split[id] != 0.0f) {
-                    throw llama_params_fit_exception("model_params::tensor_split already set by user, abort");
+                    throw std::runtime_error("model_params::tensor_split already set by user, abort");
                }
            }
        }
        if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) {
-            throw llama_params_fit_exception("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
+            throw std::runtime_error("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
+        }
+        if (hp_ngl < 2*nd) {
+            throw std::runtime_error("model has only " + std::to_string(hp_ngl) + " layers but need at least "
+                + std::to_string(2*nd) + " to fit memory for " + std::to_string(nd) + " devices, abort");
        }
    }
    if (!tensor_buft_overrides) {
-        throw llama_params_fit_exception("did not provide buffer to set tensor_buft_overrides, abort");
+        throw std::runtime_error("did not provide buffer to set tensor_buft_overrides, abort");
    }
    if (mparams->tensor_buft_overrides && (mparams->tensor_buft_overrides->pattern || mparams->tensor_buft_overrides->buft)) {
-        throw llama_params_fit_exception("model_params::tensor_buft_overrides already set by user, abort");
+        throw std::runtime_error("model_params::tensor_buft_overrides already set by user, abort");
    }

    // step 3: iteratively fill the back to front with "dense" layers
@@ -399,11 +337,6 @@ static void llama_params_fit_impl(

        // for the first partial layer varying parts can overflow, all further layers use LAYER_FRACTION_MOE:
        layer_fraction_t overflow_type = LAYER_FRACTION_MOE;
-
-        uint32_t n_full() const {
-            assert(n_layer >= n_part);
-            return n_layer - n_part;
-        }
    };

    const size_t ntbo = llama_max_tensor_buft_overrides();
@@ -412,7 +345,8 @@ static void llama_params_fit_impl(
    auto set_ngl_tensor_split_tbo = [&](
            const std::vector<ngl_t> & ngl_per_device,
            const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
-            llama_model_params & mparams) {
+            llama_model_params & mparams,
+            const bool add_nonrepeating) {
        mparams.n_gpu_layers = 0;
        for (size_t id = 0; id < nd; id++) {
            mparams.n_gpu_layers += ngl_per_device[id].n_layer;
@@ -420,25 +354,29 @@ static void llama_params_fit_impl(
                tensor_split[id] = ngl_per_device[id].n_layer;
            }
        }
-        assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl + 1);
-        uint32_t il0 = hp_ngl + 1 - mparams.n_gpu_layers; // start index for tensor buft overrides
+        assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl);
+        uint32_t il0 = hp_ngl - mparams.n_gpu_layers; // start index for tensor buft overrides

+        if (add_nonrepeating) {
+            mparams.n_gpu_layers += 1;
+            tensor_split[nd - 1] += 1;
+        }
        mparams.tensor_split = tensor_split;

        size_t itbo = 0;
        for (size_t id = 0; id < nd; id++) {
-            il0 += ngl_per_device[id].n_full();
+            il0 += ngl_per_device[id].n_layer - ngl_per_device[id].n_part;
            for (uint32_t il = il0; il < il0 + ngl_per_device[id].n_part; il++) {
                if (itbo + 1 >= ntbo) {
                    tensor_buft_overrides[itbo].pattern = nullptr;
                    tensor_buft_overrides[itbo].buft    = nullptr;
                    itbo++;
                    mparams.tensor_buft_overrides = tensor_buft_overrides;
-                    throw llama_params_fit_exception("llama_max_tensor_buft_overrides() == "
-                        + std::to_string(ntbo) + " is insufficient for model");
+                    throw std::runtime_error("llama_params_fit_n_tensor_buft_overrides() == "
+                        + std::to_string(ntbo) + " is insufficient for model\n");
                }
                tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE);
-                tensor_buft_overrides[itbo].buft = il == il0 ? overflow_bufts[id] : ggml_backend_cpu_buffer_type();
+                tensor_buft_overrides[itbo].buft = overflow_bufts[id];
                itbo++;
            }
            il0 += ngl_per_device[id].n_part;
@@ -453,9 +391,10 @@ static void llama_params_fit_impl(
    auto get_memory_for_layers = [&](
            const char * func_name,
            const std::vector<ngl_t> & ngl_per_device,
-            const std::vector<ggml_backend_buffer_type_t> & overflow_bufts) -> std::vector<int64_t> {
+            const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
+            const bool add_nonrepeating) -> std::vector<int64_t> {
        llama_model_params mparams_copy = *mparams;
-        set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy);
+        set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy, add_nonrepeating);

        const dmds_t dmd_nl = llama_get_device_memory_data(
            path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
@@ -488,9 +427,9 @@ static void llama_params_fit_impl(
        const dmds_t dmds_cpu_moe = llama_get_device_memory_data(
            path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);

-        for (size_t id = 0; id < nd; id++) {
-            global_surplus_cpu_moe += dmds_cpu_moe[id].free;
-            global_surplus_cpu_moe -= int64_t(dmds_cpu_moe[id].mb.total()) + margins[id];
+        for (const llama_device_memory_data & dmd : dmds_cpu_moe) {
+            global_surplus_cpu_moe += dmd.free;
+            global_surplus_cpu_moe -= int64_t(dmd.mb.total()) + margin;
        }

        if (global_surplus_cpu_moe > 0) {
@@ -509,18 +448,27 @@ static void llama_params_fit_impl(
    std::vector<int64_t> targets; // maximum acceptable memory use per device
    targets.reserve(nd);
    for (size_t id = 0; id < nd; id++) {
-        targets.push_back(dmds_full[id].free - margins[id]);
+        targets.push_back(dmds_full[id].free - margin);
        LLAMA_LOG_DEBUG("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
    }

-    std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the first partial layer of a device overflows to:
+    // whether for the optimal memory use we expect to load at least some MoE tensors:
+    const bool partial_moe = hp_nex > 0 && global_surplus_cpu_moe > 0;
+
+    std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the partial layers of a device overflow to:
    overflow_bufts.reserve(nd);
-    for (size_t id = 0; id < nd; id++) {
-        overflow_bufts.push_back(ggml_backend_cpu_buffer_type());
+    for (size_t id = 0; id < nd - 1; ++id) {
+        overflow_bufts.push_back(ggml_backend_dev_buffer_type(devs[id + 1]));
    }
+    overflow_bufts.push_back(ggml_backend_cpu_buffer_type());

    std::vector<ngl_t> ngl_per_device(nd);
-    std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts);
+    std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts, partial_moe);
+    if (hp_nex > 0) {
+        for (size_t id = 0; id < nd; id++) {
+            ngl_per_device[id].overflow_type = LAYER_FRACTION_MOE;
+        }
+    }

    // optimize the number of layers per device using the method of false position:
    //   - ngl_per_device has 0 layers for each device, lower bound
@@ -528,30 +476,22 @@ static void llama_params_fit_impl(
    //   - interpolate the memory use / layer between low and high linearly to get a guess where it meets our target
    //   - check memory use of our guess, replace either the low or high bound
    //   - once we only have a difference of a single layer, stop and return the lower bound that just barely still fits
-    //   - the last device has the output layer, which cannot be a partial layer
    if (hp_nex == 0) {
        LLAMA_LOG_INFO("%s: filling dense layers back-to-front:\n", __func__);
    } else {
        LLAMA_LOG_INFO("%s: filling dense-only layers back-to-front:\n", __func__);
    }
+    uint32_t n_unassigned = hp_ngl;
    for (int id = nd - 1; id >= 0; id--) {
-        uint32_t n_unassigned = hp_ngl + 1;
-        for (size_t jd = id + 1; jd < nd; ++jd) {
-            assert(n_unassigned >= ngl_per_device[jd].n_layer);
-            n_unassigned -= ngl_per_device[jd].n_layer;
-        }
-
        std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
        ngl_per_device_high[id].n_layer = n_unassigned;
        if (hp_nex > 0) {
-            ngl_per_device_high[id].n_part = size_t(id) < nd - 1 ? ngl_per_device_high[id].n_layer : ngl_per_device_high[id].n_layer - 1;
+            ngl_per_device_high[id].n_part = ngl_per_device_high[id].n_layer;
        }
        if (ngl_per_device_high[id].n_layer > 0) {
-            std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
+            std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts, partial_moe);
            if (mem_high[id] > targets[id]) {
-                assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer);
                uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
-                LLAMA_LOG_DEBUG("%s: start filling device %" PRIu32 ", delta=%" PRIu32 "\n", __func__, id, delta);
                while (delta > 1) {
                    uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
                    step_size = std::max(step_size, uint32_t(1));
@@ -560,26 +500,25 @@ static void llama_params_fit_impl(
                    std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
                    ngl_per_device_test[id].n_layer += step_size;
                    if (hp_nex) {
-                        ngl_per_device_test[id].n_part += size_t(id) == nd - 1 && ngl_per_device_test[id].n_part == 0 ?
-                            step_size - 1 : step_size; // the first layer is the output layer which must always be full
+                        ngl_per_device_test[id].n_part += step_size;
                    }
-                    const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
+                    const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);

                    if (mem_test[id] <= targets[id]) {
-                        ngl_per_device = ngl_per_device_test;
-                        mem            = mem_test;
+                        ngl_per_device  = ngl_per_device_test;
+                        mem             = mem_test;
+                        n_unassigned   -= ngl_per_device[id].n_layer;
                        LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
                    } else {
                        ngl_per_device_high = ngl_per_device_test;
                        mem_high            = mem_test;
-                        LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device_high[id].n_layer);
+                        LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
                    }
                    delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
                }
            } else {
-                assert(ngl_per_device_high[id].n_layer == n_unassigned);
-                ngl_per_device = ngl_per_device_high;
-                mem            = mem_high;
+                ngl_per_device  = ngl_per_device_high;
+                n_unassigned   -= ngl_per_device[id].n_layer;
                LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
            }
        }
@@ -590,7 +529,7 @@ static void llama_params_fit_impl(
            __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, mem[id]/MiB, projected_margin/MiB);
    }
    if (hp_nex == 0 || global_surplus_cpu_moe <= 0) {
-        set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
+        set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams, partial_moe);
        return;
    }

@@ -610,20 +549,24 @@ static void llama_params_fit_impl(
    assert(id_dense_start < nd);

    LLAMA_LOG_INFO("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__);
-    for (size_t id = 0; id <= id_dense_start && id_dense_start < nd; id++) {
+    for (size_t id = 0; id <= id_dense_start; id++) {
        std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
        for (size_t jd = id_dense_start; jd < nd; jd++) {
-            const uint32_t n_layer_move = jd < nd - 1 ? ngl_per_device_high[jd].n_layer : ngl_per_device_high[jd].n_layer - 1;
+            const uint32_t n_layer_move = ngl_per_device_high[jd].n_layer;
            ngl_per_device_high[id].n_layer += n_layer_move;
            ngl_per_device_high[jd].n_layer -= n_layer_move;
            ngl_per_device_high[jd].n_part = 0;
        }
        size_t id_dense_start_high = nd - 1;
-        std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
+        std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts, partial_moe);

        if (mem_high[id] > targets[id]) {
-            assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
-            uint32_t delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
+            assert(ngl_per_device_high[id].n_layer >= ngl_per_device_high[id].n_part);
+            assert(ngl_per_device[id].n_layer >= ngl_per_device[id].n_part);
+            assert((ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
+                   >= ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
+            uint32_t delta = (ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
+                - (ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
            while (delta > 1) {
                uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
                step_size = std::max(step_size, uint32_t(1));
@@ -639,11 +582,11 @@ static void llama_params_fit_impl(
                    ngl_per_device_test[id].n_layer += n_convert_jd;
                    n_converted_test += n_convert_jd;

-                    if (ngl_per_device_test[id_dense_start_test].n_part > 0) {
+                    if (ngl_per_device_test[id_dense_start_test].n_layer > 0) {
                        break;
                    }
                }
-                const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
+                const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);

                if (mem_test[id] <= targets[id]) {
                    ngl_per_device = ngl_per_device_test;
@@ -658,38 +601,32 @@ static void llama_params_fit_impl(
                    LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n",
                        __func__, id, ngl_per_device_high[id].n_layer, ngl_per_device_high[id].n_part, id_dense_start_high);
                }
-                assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
-                delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
+                delta = (ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
+                    - (ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
            }
        } else {
            ngl_per_device = ngl_per_device_high;
-            mem            = mem_high;
            id_dense_start = id_dense_start_high;
            LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
                __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
        }

        // try to fit at least part of one more layer
-        if (ngl_per_device[id_dense_start].n_layer > (id < nd - 1 ? 0 : 1)) {
+        if (ngl_per_device[id_dense_start].n_layer > 0) {
            std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
            size_t id_dense_start_test = id_dense_start;
            ngl_per_device_test[id_dense_start_test].n_layer--;
            ngl_per_device_test[id_dense_start_test].n_part--;
            ngl_per_device_test[id].n_layer++;
            ngl_per_device_test[id].n_part++;
-            if (ngl_per_device_test[id_dense_start_test].n_part == 0) {
+            if (ngl_per_device_test[id_dense_start_test].n_layer == 0) {
                id_dense_start_test++;
            }
            ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP;
-            std::vector<ggml_backend_buffer_type_t> overflow_bufts_test = overflow_bufts;
-            if (id < nd - 1) {
-                overflow_bufts_test[id] = ggml_backend_dev_buffer_type(devs[id + 1]);
-            }
            LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
-            std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
-            if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
+            std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
+            if (mem_test[id] < targets[id]) {
                ngl_per_device = ngl_per_device_test;
-                overflow_bufts = overflow_bufts_test;
                mem            = mem_test;
                id_dense_start = id_dense_start_test;
                LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n",
@@ -697,10 +634,9 @@ static void llama_params_fit_impl(

                ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
                LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
-                mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
-                if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
+                mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
+                if (mem_test[id] < targets[id]) {
                    ngl_per_device = ngl_per_device_test;
-                    overflow_bufts = overflow_bufts_test;
                    mem            = mem_test;
                    id_dense_start = id_dense_start_test;
                    LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n",
@@ -709,10 +645,9 @@ static void llama_params_fit_impl(
            } else {
                ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
                LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
-                mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
-                if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
+                mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
+                if (mem_test[id] < targets[id]) {
                    ngl_per_device = ngl_per_device_test;
-                    overflow_bufts = overflow_bufts_test;
                    mem            = mem_test;
                    id_dense_start = id_dense_start_test;
                    LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n",
@@ -727,41 +662,30 @@ static void llama_params_fit_impl(
            __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
    }

-    // print info for devices that were not changed during the conversion from dense only to full layers:
-    for (size_t id = id_dense_start + 1; id < nd; id++) {
-        const int64_t projected_margin = dmds_full[id].free - mem[id];
-        LLAMA_LOG_INFO(
-            "%s:   - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
-            __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
-    }
-
-    set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
+    set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams, partial_moe);
 }

-enum llama_params_fit_status llama_params_fit(
+bool llama_params_fit(
        const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
        float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
-        size_t * margins, uint32_t n_ctx_min, enum ggml_log_level log_level) {
+        size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
    const int64_t t0_us = llama_time_us();
-    llama_params_fit_status status = LLAMA_PARAMS_FIT_STATUS_SUCCESS;
+    bool ok = true;
    try {
-        llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margins, n_ctx_min, log_level);
+        llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margin_s, n_ctx_min, log_level);
        LLAMA_LOG_INFO("%s: successfully fit params to free device memory\n", __func__);
-    } catch (const llama_params_fit_exception & e) {
-        LLAMA_LOG_WARN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
-        status = LLAMA_PARAMS_FIT_STATUS_FAILURE;
    } catch (const std::runtime_error & e) {
-        LLAMA_LOG_ERROR("%s: encountered an error while trying to fit params to free device memory: %s\n", __func__, e.what());
-        status = LLAMA_PARAMS_FIT_STATUS_ERROR;
+        LLAMA_LOG_WARN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
+        ok = false;
    }
    const int64_t t1_us = llama_time_us();
    LLAMA_LOG_INFO("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6);
-    return status;
+    return ok;
 }

 struct llama_sampler_chain_params llama_sampler_chain_default_params() {
    struct llama_sampler_chain_params result = {
-        /*.no_perf =*/ true,
+        /*.no_perf                     =*/ true,
    };

    return result;
@@ -834,7 +758,7 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
    model.t_start_us = tm.t_start_us;

    try {
-        llama_model_loader ml(fname, splits, params.use_mmap, params.use_direct_io, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
+        llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);

        ml.print_info();

@@ -1097,55 +1021,25 @@ int32_t llama_chat_apply_template(
 // model split
 //

-int32_t llama_split_path(
-    char * split_path,
-    size_t maxlen,
-    const char * path_prefix,
-    int32_t split_no,
-    int32_t split_count) {
-
+int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) {
    static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf";
-
-    const int written = snprintf(
-        split_path,
-        maxlen,
-        SPLIT_PATH_FORMAT,
-        path_prefix,
-        split_no + 1,
-        split_count
-    );
-
-    if (written < 0 || (size_t) written >= maxlen) {
-        return 0;
+    if (snprintf(split_path, maxlen, SPLIT_PATH_FORMAT, path_prefix, split_no + 1, split_count)) {
+        return strlen(split_path);
    }
-
-    return (int32_t) written;
+    return 0;
 }

-int32_t llama_split_prefix(
-    char * split_prefix,
-    size_t maxlen,
-    const char * split_path,
-    int32_t split_no,
-    int32_t split_count) {
-
-    const std::string str_split_path(split_path);
-
+int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count) {
+    std::string str_split_path(split_path);
    char postfix[32];
-    snprintf(postfix, sizeof(postfix), "-%05d-of-%05d.gguf", split_no + 1, split_count);
+    snprintf(postfix, 32, "-%05d-of-%05d.gguf", split_no + 1, split_count);
+    std::string str_postfix(postfix);

-    const std::string str_postfix(postfix);
-    if (str_split_path.size() <= str_postfix.size()) {
-        return 0;
-    }
-
-    const size_t size_prefix = str_split_path.size() - str_postfix.size();
-
-    if (str_split_path.compare(size_prefix, std::string::npos, str_postfix) == 0) {
-        const size_t copy_len = std::min(size_prefix + 1, maxlen);
-        snprintf(split_prefix, copy_len, "%s", split_path);
-
-        return (int32_t) size_prefix;
+    // check if split_prefix ends with postfix
+    int size_prefix = str_split_path.size() - str_postfix.size();
+    if (size_prefix > 0 && str_split_path.find(str_postfix, size_prefix) != std::string::npos) {
+        snprintf(split_prefix, std::min((size_t) size_prefix + 1, maxlen), "%s", split_path);
+        return size_prefix;
    }

    return 0;
--- a/llama/llama.cpp/src/models/afmoe.cpp
+++ b/llama/llama.cpp/src/models/afmoe.cpp
@@ -22,15 +22,8 @@ llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_para
    const float kq_scale = 1.0f/sqrtf(float(n_embd_head));

    for (int il = 0; il < n_layer; ++il) {
-        const float freq_base_l  = model.get_rope_freq_base (cparams, il);
-        const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
-
        ggml_tensor * inpSA = inpL;

-        // This overlaps with SWA layers in current models, so get_rope_freq_base/scale may be superfluous
-        const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
-                              (il + 1) % hparams.n_no_rope_layer_step != 0;
-
        // dual attention normalization (pre)
        cur = build_norm(inpL,
                model.layers[il].attn_norm, NULL,
@@ -63,16 +56,19 @@ llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_para
            cb(Qcur, "Qcur_normed", il);
            cb(Kcur, "Kcur_normed", il);

+            // RoPE only for sliding_attention layers
+            const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
+                                ((il + 1) % hparams.n_no_rope_layer_step) != 0;
            if (use_rope) {
                Qcur = ggml_rope_ext(
                        ctx0, Qcur, inp_pos, nullptr,
-                        n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow);
                cb(Qcur, "Qcur_rope", il);

                Kcur = ggml_rope_ext(
                        ctx0, Kcur, inp_pos, nullptr,
-                        n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow);
                cb(Kcur, "Kcur_rope", il);
            }
--- a/llama/llama.cpp/src/models/bert.cpp
+++ b/llama/llama.cpp/src/models/bert.cpp
@@ -142,13 +142,11 @@ llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params
                    LLM_FFN_GELU, LLM_FFN_SEQ, il);
            cb(cur, "ffn_out", il);
        } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
-            const bool up_contains_gate = !model.layers[il].ffn_gate && model.layers[il].ffn_up->ne[1] != hparams.n_ff();
-            auto type_op = up_contains_gate ? LLM_FFN_GEGLU : LLM_FFN_GELU;
            cur = build_ffn(cur,
-                    model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+                    model.layers[il].ffn_up, NULL, NULL,
                    model.layers[il].ffn_gate, NULL, NULL,
                    model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL,
-                    type_op, LLM_FFN_PAR, il);
+                    model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_GEGLU, LLM_FFN_PAR, il);
            cb(cur, "ffn_out", il);
        } else {
            cur = build_ffn(cur,
--- a/llama/llama.cpp/src/models/cogvlm.cpp
+++ b/llama/llama.cpp/src/models/cogvlm.cpp
@@ -3,14 +3,12 @@
 llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_params & params) :
    llm_graph_context(params) {
    const int64_t n_embd_head = hparams.n_embd_head_v;
-    const float   kq_scale    = 1.0f / sqrtf(float(n_embd_head));
+    float         kq_scale    = 1.0f / sqrtf(float(n_embd_head));

    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
    GGML_ASSERT(n_embd_head == hparams.n_rot);

-    ggml_tensor * inpL;
-    ggml_tensor * cur;
-
+    ggml_tensor *inpL, *cur;
    inpL = build_inp_embd(model.tok_embd);

    ggml_tensor * inp_pos = build_inp_pos();
@@ -46,7 +44,7 @@ llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_pa
        }

        ggml_tensor * inpSA = inpL;
-        cur = build_norm(inpSA, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+        cur                 = build_norm(inpSA, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);

        // build self attention
        {
--- a/llama/llama.cpp/src/models/cohere2-iswa.cpp
+++ b/llama/llama.cpp/src/models/cohere2-iswa.cpp
@@ -21,9 +21,6 @@ llm_build_cohere2_iswa::llm_build_cohere2_iswa(const llama_model & model, const

    for (int il = 0; il < n_layer; ++il) {
        const bool is_swa = hparams.is_swa(il);
-        // UNUSED:
-        // const float freq_base_l  = model.get_rope_freq_base (cparams, il);
-        // const float freq_scale_l = model.get_rope_freq_scale(cparams, il);

        // norm
        cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM, il);
--- a/llama/llama.cpp/src/models/deepseek2.cpp
+++ b/llama/llama.cpp/src/models/deepseek2.cpp
@@ -2,11 +2,14 @@

 llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) :
    llm_graph_context(params) {
-    const bool is_mla = hparams.is_mla();
+    // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
+    bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26);
+
+    const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);

    // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
-    const int64_t n_embd_head_k = hparams.n_embd_head_k_mla();
-    const int64_t n_embd_head_v = hparams.n_embd_head_v_mla();
+    const int64_t n_embd_head_k = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
+    const int64_t n_embd_head_v = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;

    const int64_t n_embd_head_qk_rope = hparams.n_rot;
    const int64_t n_embd_head_qk_nope = n_embd_head_k - n_embd_head_qk_rope;
@@ -40,8 +43,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
    // inp_pos - contains the positions
    ggml_tensor * inp_pos = build_inp_pos();

-    auto * inp_attn_kv = !is_mla ? build_attn_inp_kv() : nullptr;
-    auto * inp_attn_k  =  is_mla ? build_attn_inp_k()  : nullptr;
+    auto * inp_attn = build_attn_inp_kv();

    ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -55,9 +57,6 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
        // self_attention
        {
            ggml_tensor * q = NULL;
-
-            const bool is_lite = model.layers[il].wq;
-
            if (!is_lite) {
                q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
                cb(q, "q", il);
@@ -125,14 +124,14 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr

                // {n_embd_head_qk_rope + kv_lora_rank, n_head, n_tokens}
                // note: rope must go first for in-place context shifting in build_rope_shift()
-                ggml_tensor * Qcur = ggml_concat(ctx0, q_nope_absorbed, q_pe, 0);
+                ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope_absorbed, 0);
                cb(Qcur, "Qcur", il);

                kv_cmpr = ggml_reshape_3d(ctx0, kv_cmpr, kv_lora_rank, 1, n_tokens);
                cb(kv_cmpr, "kv_cmpr_reshape", il);

                // {n_embd_head_qk_rope + kv_lora_rank, 1, n_tokens}
-                ggml_tensor * Kcur = ggml_concat(ctx0, kv_cmpr, k_pe, 0);
+                ggml_tensor * Kcur = ggml_concat(ctx0, k_pe, kv_cmpr, 0);
                cb(Kcur, "Kcur", il);

                // {kv_lora_rank, 1, n_tokens}
@@ -146,7 +145,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
                }

                // note: MLA with the absorption optimzation converts into MQA (ie: GQA with 1 group)
-                cur = build_attn(inp_attn_k,
+                cur = build_attn(inp_attn,
                        model.layers[il].wo, NULL,
                        Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, kq_scale, il);
            } else {
@@ -170,10 +169,11 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
                Vcur = ggml_cont(ctx0, Vcur);
                cb(Vcur, "Vcur_cont", il);

-                ggml_tensor * Qcur = ggml_concat(ctx0, q_nope, q_pe, 0);
+                // note: rope must go first for in-place context shifting in build_rope_shift()
+                ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope, 0);
                cb(Qcur, "Qcur", il);

-                ggml_tensor * Kcur = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
+                ggml_tensor * Kcur = ggml_concat(ctx0, ggml_repeat(ctx0, k_pe, q_pe), k_nope, 0);
                cb(Kcur, "Kcur", il);

                if (inp_attn_scale) {
@@ -183,7 +183,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
                }

                // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
-                cur = build_attn(inp_attn_kv,
+                cur = build_attn(inp_attn,
                            model.layers[il].wo, NULL,
                            Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
            }
@@ -215,7 +215,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
                model.layers[il].ffn_exp_probs_b,
                n_expert, n_expert_used,
                LLM_FFN_SILU, hparams.expert_weights_norm,
-                hparams.expert_weights_scale, hparams.expert_weights_scale,
+                true, hparams.expert_weights_scale,
                (llama_expert_gating_func_type) hparams.expert_gating_func,
                il);
            cb(moe_out, "ffn_moe_out", il);
--- a/llama/llama.cpp/src/models/exaone-moe.cpp
+++ b/llama/llama.cpp/src/models/exaone-moe.cpp
@@ -1,146 +0,0 @@
-#include "models.h"
-
-
-llm_build_exaone_moe::llm_build_exaone_moe(const llama_model & model, const llm_graph_params & params) :
-    llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_k;
-
-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_v);
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
-
-    ggml_tensor * cur;
-    ggml_tensor * inpL;
-
-    inpL = build_inp_embd(model.tok_embd);
-
-    // inp_pos - contains the positions
-    ggml_tensor * inp_pos = build_inp_pos();
-
-    auto * inp_attn_iswa = build_attn_inp_kv_iswa();
-
-    ggml_tensor * inp_out_ids = build_inp_out_ids();
-
-    const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
-    for (int il = 0; il < n_transformer_layers; ++il) {
-        ggml_tensor * inpSA = inpL;
-
-        // use RoPE for SWA layers
-        const bool is_local_layer = hparams.is_swa(il);
-
-        // norm
-        cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
-        cb(cur, "attn_norm", il);
-
-        // self-attention
-        {
-            ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
-
-            // compute Q and K and RoPE them
-            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
-            cb(Qcur, "Qcur", il);
-
-            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
-            cb(Kcur, "Kcur", il);
-
-            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
-            cb(Vcur, "Vcur", il);
-
-            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
-            Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
-            Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
-            cb(Qcur, "Qcur_normed", il);
-            cb(Kcur, "Kcur_normed", il);
-
-            if (is_local_layer) {
-                Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base,
-                                     freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
-
-                Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base,
-                                     freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
-            }
-            cb(Qcur, "Qcur", il);
-            cb(Kcur, "Kcur", il);
-            cb(Vcur, "Vcur", il);
-
-            cur = build_attn(inp_attn_iswa,
-                model.layers[il].wo, NULL,
-                Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
-            cb(cur, "attn_out", il);
-        }
-        if (il == n_transformer_layers - 1 && inp_out_ids) {
-            cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
-            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
-        }
-        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
-        cb(ffn_inp, "ffn_inp", il);
-
-        // norm
-        cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
-        cb(cur, "ffn_norm", il);
-
-        // feed-forward network
-        if (model.layers[il].ffn_gate_inp == nullptr) {
-            // dense branch
-            cur = build_ffn(cur,
-                    model.layers[il].ffn_up, NULL, NULL,
-                    model.layers[il].ffn_gate, NULL, NULL,
-                    model.layers[il].ffn_down, NULL, NULL, NULL,
-                    LLM_FFN_SILU, LLM_FFN_PAR, il);
-            cb(cur, "ffn_out", il);
-        } else {
-            // MoE branch
-            ggml_tensor * moe_out = build_moe_ffn(cur,
-                model.layers[il].ffn_gate_inp,
-                model.layers[il].ffn_up_exps,
-                model.layers[il].ffn_gate_exps,
-                model.layers[il].ffn_down_exps,
-                model.layers[il].ffn_exp_probs_b,
-                n_expert, n_expert_used,
-                LLM_FFN_SILU, hparams.expert_weights_norm,
-                true, hparams.expert_weights_scale,
-                (llama_expert_gating_func_type) hparams.expert_gating_func,
-                il);
-            cb(moe_out, "ffn_moe_out", il);
-
-            // FFN shared expert
-            {
-                ggml_tensor * ffn_shexp =
-                    build_ffn(cur,
-                        model.layers[il].ffn_up_shexp, NULL, NULL,
-                        model.layers[il].ffn_gate_shexp, NULL, NULL,
-                        model.layers[il].ffn_down_shexp, NULL, NULL,
-                        NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
-                cb(ffn_shexp, "ffn_shexp", il);
-
-                cur = ggml_add(ctx0, moe_out, ffn_shexp);
-                cb(cur, "ffn_out", il);
-            }
-        }
-
-        cur = ggml_add(ctx0, cur, ffn_inp);
-
-        cur = build_cvec(cur, il);
-        cb(cur, "l_out", il);
-
-        // input for next layer
-        inpL = cur;
-    }
-    cur = inpL;
-
-    // final norm
-    cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
-
-    cb(cur, "result_norm", -1);
-    res->t_embd = cur;
-
-    // lm_head
-    cur = build_lora_mm(model.output, cur);
-
-    cb(cur, "result_output", -1);
-    res->t_logits = cur;
-
-    ggml_build_forward_expand(gf, cur);
-}
--- a/llama/llama.cpp/src/models/gemma-embedding.cpp
+++ b/llama/llama.cpp/src/models/gemma-embedding.cpp
@@ -1,5 +1,7 @@
 #include "models.h"

+
+
 llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model, const llm_graph_params & params) :
    llm_graph_context(params) {
    const int64_t n_embd_head = hparams.n_embd_head_k;
@@ -10,8 +12,10 @@ llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model,
    inpL = build_inp_embd(model.tok_embd);

    // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
-    inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
-    cb(inpL, "inp_scaled", -1);
+    if (ubatch.token) {
+        inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
+        cb(inpL, "inp_scaled", -1);
+    }

    // inp_pos - contains the positions
    ggml_tensor * inp_pos = build_inp_pos();
--- a/llama/llama.cpp/src/models/gemma2-iswa.cpp
+++ b/llama/llama.cpp/src/models/gemma2-iswa.cpp
@@ -19,9 +19,6 @@ llm_build_gemma2_iswa::llm_build_gemma2_iswa(const llama_model & model, const ll
    ggml_tensor * inp_out_ids = build_inp_out_ids();

    for (int il = 0; il < n_layer; ++il) {
-        const float freq_base_l  = model.get_rope_freq_base (cparams, il);
-        const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
-
        // norm
        cur = build_norm(inpL,
                model.layers[il].attn_norm, NULL,
@@ -46,12 +43,12 @@ llm_build_gemma2_iswa::llm_build_gemma2_iswa(const llama_model & model, const ll

            Qcur = ggml_rope_ext(
                    ctx0, Qcur, inp_pos, nullptr,
-                    n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                    ext_factor, attn_factor, beta_fast, beta_slow);

            Kcur = ggml_rope_ext(
                    ctx0, Kcur, inp_pos, nullptr,
-                    n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                    ext_factor, attn_factor, beta_fast, beta_slow);

            cb(Qcur, "Qcur", il);
--- a/llama/llama.cpp/src/models/gemma3.cpp
+++ b/llama/llama.cpp/src/models/gemma3.cpp
@@ -10,9 +10,10 @@ llm_build_gemma3<iswa>::llm_build_gemma3(const llama_model & model, const llm_gr
    inpL = build_inp_embd(model.tok_embd);

    // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
-    inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
-    cb(inpL, "inp_scaled", -1);
-
+    if (ubatch.token) {
+        inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
+        cb(inpL, "inp_scaled", -1);
+    }
    // inp_pos - contains the positions
    ggml_tensor * inp_pos = build_inp_pos();

--- a/llama/llama.cpp/src/models/gemma3n-iswa.cpp
+++ b/llama/llama.cpp/src/models/gemma3n-iswa.cpp
@@ -1,5 +1,7 @@
 #include "models.h"

+
+
 llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params) :
    llm_graph_context(params),
    model(model),
@@ -13,9 +15,10 @@ llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const
    inpL = build_inp_embd(model.tok_embd);

    // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
-    inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
-    cb(inpL, "inp_scaled", -1);
-
+    if (ubatch.token) {
+        inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
+        cb(inpL, "inp_scaled", -1);
+    }
    // inp_pos - contains the positions
    ggml_tensor * inp_pos = build_inp_pos();

@@ -245,30 +248,20 @@ ggml_tensor * llm_build_gemma3n_iswa::view_2d_slice(ggml_tensor * x, int idx) {
 // equivalent to get_per_layer_inputs() in python code
 // output shape: [n_embd_altup, n_layer, n_tokens]
 ggml_tensor * llm_build_gemma3n_iswa::get_per_layer_inputs() {
-    auto inp = std::make_unique<llm_graph_input_embd>(n_embd);
+    auto          inp = std::make_unique<llm_graph_input_embd>();
    ggml_tensor * inp_per_layer;
    if (ubatch.token) {
        inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
        ggml_set_input(inp->tokens);
-        res->t_inp_tokens = inp->tokens;
+        res->t_tokens = inp->tokens;
        inp_per_layer = ggml_get_rows(ctx0, model.tok_embd_per_layer, inp->tokens);
        inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, n_tokens);
        inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float) n_embd_altup));
        cb(inp_per_layer, "inp_per_layer_selected", -1);
-        res->add_input(std::move(inp));
    } else {
-        // Vision embedding path: use padding token (ID=0) embedding
-        // TODO: verify if this is the correct behavior in transformers implementation
-        const int64_t embd_size = model.tok_embd_per_layer->ne[0];  // n_embd_altup * n_layer
-
-        // Extract and dequantize padding token embedding (row 0)
-        ggml_tensor * padding = ggml_view_1d(ctx0, model.tok_embd_per_layer, embd_size, 0);
-        inp_per_layer = ggml_cast(ctx0, padding, GGML_TYPE_F32);
-
-        // Reshape to [n_embd_altup, n_layer, 1]
-        inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, 1);
-        cb(inp_per_layer, "inp_per_layer_vision", -1);
+        GGML_ABORT("TODO: support embd input");
    }
+    res->add_input(std::move(inp));
    return inp_per_layer;
 }

@@ -286,7 +279,7 @@ ggml_tensor * llm_build_gemma3n_iswa::project_per_layer_inputs(ggml_tensor * inp
                                              -1);  // [n_embd_altup, n_layer, n_tokens]
    cb(per_layer_proj, "per_layer_proj", -1);

-    inp_per_layer = ggml_add(ctx0, per_layer_proj, inp_per_layer);
+    inp_per_layer = ggml_add(ctx0, inp_per_layer, per_layer_proj);
    inp_per_layer = ggml_scale(ctx0, inp_per_layer, per_layer_input_scale);
    cb(inp_per_layer, "inp_per_layer", -1);

--- a/llama/llama.cpp/src/models/llama-iswa.cpp
+++ b/llama/llama.cpp/src/models/llama-iswa.cpp
@@ -25,12 +25,8 @@ llm_build_llama_iswa::llm_build_llama_iswa(const llama_model & model, const llm_
    ggml_tensor * inp_out_ids = build_inp_out_ids();

    for (int il = 0; il < n_layer; ++il) {
-        const float freq_base_l  = model.get_rope_freq_base (cparams, il);
-        const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
-
        ggml_tensor * inpSA = inpL;

-        // This overlaps with SWA layers in current models, so get_rope_freq_base/scale may be superfluous
        const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
                              (il + 1) % hparams.n_no_rope_layer_step != 0;

@@ -71,13 +67,13 @@ llm_build_llama_iswa::llm_build_llama_iswa(const llama_model & model, const llm_
            if (use_rope) {
                Qcur = ggml_rope_ext(
                        ctx0, Qcur, inp_pos, rope_factors,
-                        n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow
                        );

                Kcur = ggml_rope_ext(
                        ctx0, Kcur, inp_pos, rope_factors,
-                        n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow
                        );
            } else if (inp_attn_scale) {
--- a/llama/llama.cpp/src/models/llama.cpp
+++ b/llama/llama.cpp/src/models/llama.cpp
@@ -1,7 +1,6 @@
 #include "models.h"

-template <bool embed>
-llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+llm_build_llama::llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
    const int64_t n_embd_head = hparams.n_embd_head_v;

    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -15,14 +14,7 @@ llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_gra
    // inp_pos - contains the positions
    ggml_tensor * inp_pos = build_inp_pos();

-    using inp_attn_type = std::conditional_t<embed, llm_graph_input_attn_no_cache, llm_graph_input_attn_kv>;
-
-    inp_attn_type * inp_attn = nullptr;
-    if constexpr (embed) {
-        inp_attn = build_attn_inp_no_cache();
-    } else {
-        inp_attn = build_attn_inp_kv();
-    }
+    auto * inp_attn = build_attn_inp_kv();

    const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;

@@ -153,16 +145,11 @@ llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_gra
    cb(cur, "result_norm", -1);
    res->t_embd = cur;

-    if constexpr (!embed) {
-        // lm_head
-        cur = build_lora_mm(model.output, cur);
+    // lm_head
+    cur = build_lora_mm(model.output, cur);

-        cb(cur, "result_output", -1);
-        res->t_logits = cur;
-    }
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;

    ggml_build_forward_expand(gf, cur);
 }
-
-template struct llm_build_llama<false>;
-template struct llm_build_llama<true>;
--- a/llama/llama.cpp/src/models/maincoder.cpp
+++ b/llama/llama.cpp/src/models/maincoder.cpp
@@ -1,117 +0,0 @@
-#include "models.h"
-
-llm_build_maincoder::llm_build_maincoder(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
-
-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
-
-    ggml_tensor * cur;
-    ggml_tensor * inpL;
-
-    inpL = build_inp_embd(model.tok_embd);
-
-    // inp_pos - contains the positions
-    ggml_tensor * inp_pos = build_inp_pos();
-
-    auto * inp_attn = build_attn_inp_kv();
-
-    ggml_tensor * inp_out_ids = build_inp_out_ids();
-
-    for (int il = 0; il < n_layer; ++il) {
-        ggml_tensor * inpSA = inpL;
-
-        // norm
-        cur = build_norm(inpL,
-                model.layers[il].attn_norm, NULL,
-                LLM_NORM_RMS, il);
-        cb(cur, "attn_norm", il);
-
-        // self-attention
-        {
-            // compute Q and K and RoPE them
-            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
-            cb(Qcur, "Qcur", il);
-
-            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
-            cb(Kcur, "Kcur", il);
-
-            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
-            cb(Vcur, "Vcur", il);
-
-            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
-            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
-            Qcur = ggml_rope_ext(
-                    ctx0, Qcur, inp_pos, nullptr,
-                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow
-                    );
-
-            Kcur = ggml_rope_ext(
-                    ctx0, Kcur, inp_pos, nullptr,
-                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow
-                    );
-
-            Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
-            cb(Qcur, "Qcur_normed", il);
-
-            Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
-            cb(Kcur, "Kcur_normed", il);
-
-            cb(Qcur, "Qcur", il);
-            cb(Kcur, "Kcur", il);
-            cb(Vcur, "Vcur", il);
-
-            cur = build_attn(inp_attn,
-                    model.layers[il].wo, model.layers[il].bo,
-                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
-        }
-        if (il == n_layer - 1 && inp_out_ids) {
-            cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
-            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
-        }
-        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
-        cb(ffn_inp, "ffn_inp", il);
-
-        // feed-forward network
-        cur = build_norm(ffn_inp,
-                model.layers[il].ffn_norm, NULL,
-                LLM_NORM_RMS, il);
-        cb(cur, "ffn_norm", il);
-
-        cur = build_ffn(cur,
-                model.layers[il].ffn_up,   NULL, NULL,
-                model.layers[il].ffn_gate, NULL, NULL,
-                model.layers[il].ffn_down, NULL, NULL,
-                NULL,
-                LLM_FFN_SILU, LLM_FFN_PAR, il);
-        cb(cur, "ffn_out", il);
-
-        cur = ggml_add(ctx0, cur, ffn_inp);
-
-        cur = build_cvec(cur, il);
-        cb(cur, "l_out", il);
-
-        // input for next layer
-        inpL = cur;
-    }
-    cur = inpL;
-
-    cur = build_norm(cur,
-            model.output_norm, NULL,
-            LLM_NORM_RMS, -1);
-
-    cb(cur, "result_norm", -1);
-    res->t_embd = cur;
-
-    // lm_head
-    cur = build_lora_mm(model.output, cur);
-
-    cb(cur, "result_output", -1);
-    res->t_logits = cur;
-
-    ggml_build_forward_expand(gf, cur);
-}
--- a/llama/llama.cpp/src/models/mimo2-iswa.cpp
+++ b/llama/llama.cpp/src/models/mimo2-iswa.cpp
@@ -1,123 +0,0 @@
-
-#include "models.h"
-
-llm_build_mimo2_iswa::llm_build_mimo2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    ggml_tensor * cur;
-    ggml_tensor * inpL;
-
-    inpL = build_inp_embd(model.tok_embd);
-
-    ggml_tensor * inp_pos = build_inp_pos();
-    auto * inp_attn = build_attn_inp_kv_iswa();
-    ggml_tensor * inp_out_ids = build_inp_out_ids();
-
-    for (int il = 0; il < n_layer; ++il) {
-        ggml_tensor * inpSA = inpL;
-
-        uint32_t n_head_l    = hparams.n_head(il);
-        uint32_t n_head_kv_l = hparams.n_head_kv(il);
-        const float freq_base_l  = model.get_rope_freq_base(cparams, il);
-        const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
-
-        cur = inpL;
-
-        // self_attention
-        {
-            cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
-            cb(cur, "attn_norm", il);
-
-            // compute Q and K and RoPE them
-            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
-            cb(Qcur, "Qcur", il);
-
-            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
-            cb(Kcur, "Kcur", il);
-
-            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
-            cb(Vcur, "Vcur", il);
-
-            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head_l,    n_tokens);
-            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv_l, n_tokens);
-            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head_v, n_head_kv_l, n_tokens);
-
-            Qcur = ggml_rope_ext(
-                ctx0, Qcur, inp_pos, nullptr,
-                n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
-                ext_factor, attn_factor, beta_fast, beta_slow
-                );
-
-            Kcur = ggml_rope_ext(
-                ctx0, Kcur, inp_pos, nullptr,
-                n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
-                ext_factor, attn_factor, beta_fast, beta_slow
-                );
-
-            cb(Qcur, "Qcur", il);
-            cb(Kcur, "Kcur", il);
-            cb(Vcur, "Vcur", il);
-
-            ggml_tensor * sinks = model.layers[il].attn_sinks;
-
-            cur = build_attn(inp_attn,
-                    model.layers[il].wo, NULL,
-                    Qcur, Kcur, Vcur, nullptr, sinks, nullptr, 1.0f/sqrtf(float(n_embd_head_k)), il);
-        }
-
-        if (il == n_layer - 1 && inp_out_ids) {
-            cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
-            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
-        }
-
-        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
-        cb(ffn_inp, "ffn_inp", il);
-
-        cur = build_norm(ffn_inp,
-                model.layers[il].ffn_norm, NULL,
-                LLM_NORM_RMS, il);
-        cb(cur, "ffn_norm", il);
-
-        // feed-forward network
-        if (model.layers[il].ffn_gate_inp == nullptr) {
-            // dense branch
-            cur = build_ffn(cur,
-                    model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
-                    model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
-                    model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
-                    NULL,
-                    LLM_FFN_SILU, LLM_FFN_PAR, il);
-            cb(cur, "ffn_out", il);
-        } else {
-            // MoE branch
-            cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps,
-                                model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
-                                model.layers[il].ffn_exp_probs_b, n_expert, n_expert_used, LLM_FFN_SILU, true, false,
-                                0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID, il);
-            cb(cur, "ffn_moe_out", il);
-        }
-
-        cur = ggml_add(ctx0, cur, ffn_inp);
-
-        cur = build_cvec(cur, il);
-        cb(cur, "l_out", il);
-
-        // input for next layer
-        inpL = cur;
-    }
-
-    cur = inpL;
-
-    cur = build_norm(cur,
-            model.output_norm, NULL,
-            LLM_NORM_RMS, -1);
-
-    cb(cur, "result_norm", -1);
-    res->t_embd = cur;
-
-    // lm_head
-    cur = build_lora_mm(model.output, cur);
-
-    cb(cur, "result_output", -1);
-    res->t_logits = cur;
-
-    ggml_build_forward_expand(gf, cur);
-}
--- a/llama/llama.cpp/src/models/minicpm3.cpp
+++ b/llama/llama.cpp/src/models/minicpm3.cpp
@@ -9,7 +9,6 @@ llm_build_minicpm3::llm_build_minicpm3(const llama_model & model, const llm_grap

    const uint32_t n_embd_head_qk_rope = hparams.n_rot;
    const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
-
    const uint32_t kv_lora_rank = hparams.n_lora_kv;

    ggml_tensor * cur;
--- a/llama/llama.cpp/src/models/models.h
+++ b/llama/llama.cpp/src/models/models.h
@@ -167,10 +167,6 @@ struct llm_build_exaone : public llm_graph_context {
    llm_build_exaone(const llama_model & model, const llm_graph_params & params);
 };

-struct llm_build_exaone_moe : public llm_graph_context {
-    llm_build_exaone_moe(const llama_model & model, const llm_graph_params & params);
-};
-
 struct llm_build_falcon : public llm_graph_context {
    llm_build_falcon(const llama_model & model, const llm_graph_params & params);
 };
@@ -307,7 +303,6 @@ struct llm_build_llada_moe : public llm_graph_context {
    llm_build_llada_moe(const llama_model & model, const llm_graph_params & params);
 };

-template <bool embed>
 struct llm_build_llama : public llm_graph_context {
    llm_build_llama(const llama_model & model, const llm_graph_params & params);
 };
@@ -316,18 +311,10 @@ struct llm_build_llama_iswa : public llm_graph_context {
    llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params);
 };

-struct llm_build_maincoder : public llm_graph_context {
-    llm_build_maincoder(const llama_model & model, const llm_graph_params & params);
-};
-
 struct llm_build_mamba : public llm_graph_context_mamba {
    llm_build_mamba(const llama_model & model, const llm_graph_params & params);
 };

-struct llm_build_mimo2_iswa : public llm_graph_context {
-    llm_build_mimo2_iswa(const llama_model & model, const llm_graph_params & params);
-};
-
 struct llm_build_minicpm3 : public llm_graph_context {
    llm_build_minicpm3(const llama_model & model, const llm_graph_params & params);
 };
@@ -340,10 +327,6 @@ struct llm_build_mistral3 : public llm_graph_context {
    llm_build_mistral3(const llama_model & model, const llm_graph_params & params);
 };

-struct llm_build_modern_bert : public llm_graph_context {
-    llm_build_modern_bert(const llama_model & model, const llm_graph_params & params);
-};
-
 struct llm_build_mpt : public llm_graph_context {
    llm_build_mpt(const llama_model & model, const llm_graph_params & params);
 };
@@ -413,11 +396,6 @@ struct llm_build_plamo : public llm_graph_context {
    llm_build_plamo(const llama_model & model, const llm_graph_params & params);
 };

-template <bool iswa>
-struct llm_build_plamo3 : public llm_graph_context {
-    llm_build_plamo3(const llama_model & model, const llm_graph_params & params);
-};
-
 struct llm_build_plm : public llm_graph_context {
    llm_build_plm(const llama_model & model, const llm_graph_params & params);
 };
@@ -470,8 +448,7 @@ private:
                ggml_tensor * cur,
                        int   il);

-    // returns pair of output and new state
-    std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_chunking(
+    ggml_tensor * build_delta_net_chunking(
                ggml_tensor * q,
                ggml_tensor * k,
                ggml_tensor * v,
@@ -483,8 +460,7 @@ private:
                ggml_tensor * diag_mask,
                        int   il);

-    // returns pair of output and new state
-    std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_autoregressive(
+    ggml_tensor * build_delta_net_autoregressive(
                ggml_tensor * q,
                ggml_tensor * k,
                ggml_tensor * v,
@@ -499,11 +475,6 @@ private:
                ggml_tensor * gate,
                        int   layer);

-    // returns pair of qkv, z
-    std::pair<ggml_tensor *, ggml_tensor *> build_qkvz(
-                ggml_tensor * input,
-                        int   il);
-
    const llama_model & model;
 };

--- a/llama/llama.cpp/src/models/modern-bert.cpp
+++ b/llama/llama.cpp/src/models/modern-bert.cpp
@@ -1,116 +0,0 @@
-#include "models.h"
-
-llm_build_modern_bert::llm_build_modern_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
-    const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
-
-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-
-    ggml_tensor * cur;
-    ggml_tensor * inpL;
-    ggml_tensor * inp_pos = build_inp_pos();
-
-    // construct input embeddings (token, type, position)
-    inpL = build_inp_embd(model.tok_embd);
-    cb(inpL, "inp_embd", -1);
-
-    // embed layer norm
-    inpL = build_norm(inpL, model.tok_norm, nullptr, LLM_NORM, -1);
-    cb(inpL, "inp_norm", -1);
-
-    ggml_tensor * inp_out_ids = build_inp_out_ids();
-
-    auto * inp_attn = build_attn_inp_no_cache();
-
-    for (int il = 0; il < n_layer; ++il) {
-        const float freq_base_l  = model.get_rope_freq_base(cparams, il);
-        const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
-
-        cur = inpL;
-
-        // attention layer norm
-        if (model.layers[il].attn_norm) {
-            cur = build_norm(inpL,
-                    model.layers[il].attn_norm, NULL,
-                    LLM_NORM, il);
-            cb(cur, "attn_norm", il);
-        }
-
-        // self attention
-        cur = build_lora_mm(model.layers[il].wqkv, cur);
-        cb(cur, "wqkv", il);
-
-        const size_t type_size = ggml_type_size(cur->type);
-
-        ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, n_embd_head*type_size, cur->nb[1], 0*type_size*(n_embd));
-        ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*type_size, cur->nb[1], 1*type_size*(n_embd));
-        ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*type_size, cur->nb[1], 1*type_size*(n_embd + n_embd_gqa));
-
-        // RoPE
-        Qcur = ggml_rope_ext(
-                ctx0, Qcur, inp_pos, nullptr,
-                n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
-                ext_factor, attn_factor, beta_fast, beta_slow
-                );
-
-        Kcur = ggml_rope_ext(
-                ctx0, Kcur, inp_pos, nullptr,
-                n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
-                ext_factor, attn_factor, beta_fast, beta_slow
-                );
-
-        cb(Qcur, "Qcur", il);
-        cb(Kcur, "Kcur", il);
-        cb(Vcur, "Vcur", il);
-
-        cur = build_attn(inp_attn,
-                    model.layers[il].wo, nullptr,
-                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
-        cb(cur, "kqv_out", il);
-
-        if (il == n_layer - 1 && inp_out_ids) {
-            cur  = ggml_get_rows(ctx0,  cur, inp_out_ids);
-            inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
-        }
-
-        // re-add the layer input
-        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
-        cb(ffn_inp, "ffn_inp", il);
-
-        // attention layer norm
-        cur = build_norm(ffn_inp,
-                model.layers[il].ffn_norm, NULL,
-                LLM_NORM, il);
-        cb(cur, "ffn_norm", il);
-
-        cur = build_ffn(cur,
-                model.layers[il].ffn_up,   NULL, NULL,
-                NULL,                      NULL, NULL,
-                model.layers[il].ffn_down, NULL, NULL,
-                NULL,
-                LLM_FFN_GEGLU, LLM_FFN_SEQ, il);
-
-        // attentions bypass the intermediate layer
-        cur = ggml_add(ctx0, cur, ffn_inp);
-
-        // input for next layer
-        inpL = cur;
-    }
-
-    cur = inpL;
-
-    cur = build_norm(cur,
-            model.output_norm, NULL,
-            LLM_NORM, -1);
-    cb(cur, "final_norm_out", -1);
-
-    if (hparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
-        // extracting cls token
-        cur = ggml_view_1d(ctx0, cur, hparams.n_embd, 0);
-        cb(cur, "cls_pooled_embd", -1);
-    }
-
-    cb(cur, "res_embd", -1);
-    res->t_embd = cur;
-    ggml_build_forward_expand(gf, cur);
-}
--- a/llama/llama.cpp/src/models/nemotron-h.cpp
+++ b/llama/llama.cpp/src/models/nemotron-h.cpp
@@ -67,7 +67,7 @@ ggml_tensor * llm_build_nemotron_h::build_attention_layer(ggml_tensor *
                                                          const llama_model &       model,
                                                          const int64_t             n_embd_head,
                                                          const int                 il) {
-    // compute Q and K
+    // compute Q and K and (optionally) RoPE them
    ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
    cb(Qcur, "Qcur", il);
    if (model.layers[il].bq) {
--- a/llama/llama.cpp/src/models/openai-moe-iswa.cpp
+++ b/llama/llama.cpp/src/models/openai-moe-iswa.cpp
@@ -14,9 +14,6 @@ llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model,
    ggml_tensor * inp_out_ids = build_inp_out_ids();

    for (int il = 0; il < n_layer; ++il) {
-        const float freq_base_l  = model.get_rope_freq_base (cparams, il);
-        const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
-
        ggml_tensor * inpSA = inpL;

        // norm
@@ -52,13 +49,13 @@ llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model,

            Qcur = ggml_rope_ext(
                    ctx0, Qcur, inp_pos, nullptr,
-                    n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                    ext_factor, attn_factor, beta_fast, beta_slow
                    );

            Kcur = ggml_rope_ext(
                    ctx0, Kcur, inp_pos, nullptr,
-                    n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                    ext_factor, attn_factor, beta_fast, beta_slow
                    );

--- a/llama/llama.cpp/src/models/plamo3.cpp
+++ b/llama/llama.cpp/src/models/plamo3.cpp
@@ -1,128 +0,0 @@
-#include "models.h"
-
-template <bool iswa>
-llm_build_plamo3<iswa>::llm_build_plamo3(const llama_model & model, const llm_graph_params & params) :
-    llm_graph_context(params) {
-    const int64_t head_dim_q = hparams.n_embd_head_k;
-    const int64_t head_dim_v = hparams.n_embd_head_v;
-
-    ggml_tensor * cur;
-    ggml_tensor * inpL = build_inp_embd(model.tok_embd);
-    ggml_tensor * inp_pos = build_inp_pos();
-
-    using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
-    inp_attn_type * inp_attn = nullptr;
-
-    if constexpr (iswa) {
-        inp_attn = build_attn_inp_kv_iswa();
-    } else {
-        inp_attn = build_attn_inp_kv();
-    }
-
-    ggml_tensor * inp_out_ids = build_inp_out_ids();
-
-    for (int il = 0; il < n_layer; ++il) {
-        ggml_tensor * residual = inpL;
-
-        float freq_base_l  = 0.0f;
-        float freq_scale_l = 0.0f;
-        if constexpr (iswa) {
-            freq_base_l  = model.get_rope_freq_base (cparams, il);
-            freq_scale_l = model.get_rope_freq_scale(cparams, il);
-        } else {
-            freq_base_l  = freq_base;
-            freq_scale_l = freq_scale;
-        }
-
-        cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
-        cb(cur, "attn_norm", il);
-
-        ggml_tensor * qkv = build_lora_mm(model.layers[il].wqkv, cur);
-        cb(cur, "wqkv", il);
-
-        const int32_t n_head    = hparams.n_head(il);
-        const int32_t n_head_kv = hparams.n_head_kv(il);
-
-        const int64_t q_offset = 0;
-        const int64_t k_offset = head_dim_q * n_head;
-        const int64_t v_offset = k_offset + head_dim_q * n_head_kv;
-
-        ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, head_dim_q, n_head, n_tokens,
-                head_dim_q * sizeof(float), qkv->nb[1], q_offset * ggml_element_size(qkv));
-        ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, head_dim_q, n_head_kv, n_tokens,
-                head_dim_q * sizeof(float), qkv->nb[1], k_offset * ggml_element_size(qkv));
-        ggml_tensor * Vcur = ggml_view_3d(ctx0, qkv, head_dim_v, n_head_kv, n_tokens,
-                head_dim_v * sizeof(float), qkv->nb[1], v_offset * ggml_element_size(qkv));
-
-        cb(Qcur, "Qcur", il);
-        cb(Kcur, "Kcur", il);
-        cb(Vcur, "Vcur", il);
-
-        Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
-        cb(Qcur, "attn_q_norm", il);
-        Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
-        cb(Kcur, "attn_k_norm", il);
-
-        Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr,
-                n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
-                ext_factor, attn_factor, beta_fast, beta_slow);
-        Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr,
-                n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
-                ext_factor, attn_factor, beta_fast, beta_slow);
-
-        const float attn_scale = 1.0f / sqrtf(float(head_dim_q));
-
-        cur = build_attn(inp_attn,
-                model.layers[il].wo, NULL,
-                Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, attn_scale, il);
-        cb(cur, "attn_out", il);
-
-        if (il == n_layer - 1 && inp_out_ids) {
-            cur      = ggml_get_rows(ctx0, cur, inp_out_ids);
-            residual = ggml_get_rows(ctx0, residual, inp_out_ids);
-        }
-
-        cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
-        cb(cur, "attn_post_norm", il);
-
-        cur = ggml_add(ctx0, cur, residual);
-        cb(cur, "attn_residual", il);
-
-        residual = cur;
-
-        cur = build_norm(cur, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
-        cb(cur, "ffn_norm", il);
-
-        cur = build_ffn(cur,
-                model.layers[il].ffn_up,   NULL, NULL,
-                NULL,                      NULL, NULL,
-                model.layers[il].ffn_down, NULL, NULL,
-                NULL,
-                LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
-        cb(cur, "ffn_out", il);
-
-        cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, LLM_NORM_RMS, il);
-        cb(cur, "ffn_post_norm", il);
-
-        cur = ggml_add(ctx0, cur, residual);
-        cb(cur, "ffn_residual", il);
-
-        cur = build_cvec(cur, il);
-        cb(cur, "l_out", il);
-        inpL = cur;
-    }
-
-    cur = inpL;
-
-    cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
-    res->t_embd = cur;
-
-    cur = build_lora_mm(model.output, cur);
-    res->t_logits = cur;
-
-    ggml_build_forward_expand(gf, cur);
-}
-
-// Explicit template instantiations
-template struct llm_build_plamo3<false>;
-template struct llm_build_plamo3<true>;
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Bruce MacDonald	c0496e6125	fix lint	2026-01-28 13:16:52 -08:00
Bruce MacDonald	2d57bcbc64	fix tests	2026-01-28 13:07:48 -08:00
Bruce MacDonald	060f9341c0	server: usage api Add a new /api/usage endpoint that shows aggregate usage statistics per model since the server started.	2026-01-27 17:01:18 -08:00