Compare commits

...

37 Commits

Author SHA1 Message Date
jmorganca
e23ddd84b8 x/grammar: add experimental GPU accelerated constrained decoding package 2026-01-11 00:50:11 -08:00
Jeffrey Morgan
7cc2a653f2 dockerfile: remove unused COPY command (#13664) 2026-01-09 23:07:15 -08:00
Jeffrey Morgan
2584940016 Add z-image image generation prototype (#13659) 2026-01-09 21:09:46 -08:00
Michael
c6d4c0c7f2 Documentation edits made through Mintlify web editor 2026-01-09 21:29:03 -05:00
Parth Sareen
1ef4241727 x: request access for all commands, add welcome message (#13662) 2026-01-09 18:20:39 -08:00
Parth Sareen
68fafd3002 x: improve approval selector with clearer labels (#13663) 2026-01-09 17:08:12 -08:00
Parth Sareen
2b2cda7a2b api: implement anthropic api (#13600)
* api: add Anthropic Messages API compatibility layer

Add middleware to support the Anthropic Messages API format at /v1/messages.
This enables tools like Claude Code to work with Ollama's local and cloud models through the
Anthropic API interface.
2026-01-09 11:53:36 -08:00
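(For context, a request to the new endpoint might look like the sketch below. It assumes a local server on the default port 11434; the model name is a placeholder. The model, max_tokens and messages fields mirror the MessagesRequest type added in anthropic/anthropic.go further down.)

  curl http://localhost:11434/v1/messages \
    -H "content-type: application/json" \
    -d '{
      "model": "some-local-model",
      "max_tokens": 256,
      "messages": [
        {"role": "user", "content": "Hello"}
      ]
    }'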
Daniel Hiltgen
3cfe9fe146 docker: add missing deps (#13654)
The new MLX library has extra dependencies.
2026-01-09 07:34:40 -08:00
Parth Sareen
a23b559b4c x: disable web search tool registration (#13656) 2026-01-09 01:42:20 -08:00
Daniel Hiltgen
33ee7168ba Add experimental MLX backend and engine with imagegen support (#13648)
* WIP - MLX backend with gemma3

* MLX: add cmake and go tag build toggles

To build the new MLX backend code:
  cmake --preset MLX
  cmake --build --preset MLX --parallel
  cmake --install build --component MLX
  go build -tags mlx .

Note: the main.go entrypoint for the MLX engine will change in a follow up commit.

* add experimental image generation runtime

* add experimental image generation runtime

* MLX: wire up cuda build for linux

* MLX: get dependencies correct and dedup

This is still too large for a unified github artifact, but is now "correct" for the mlx_cuda_v13
directory.

* fix relative link bug in dedup

* Add darwin build and readme

* add go build tag for mlx dependent code and wire up build_darwin.sh

* lint cleanup

* macos: build mlx for x86

This will be CPU only.

* cuda build instructions and fix drift from mlx bump

* stale comment

* Delete agent helper doc

* Clean up readme.md

* Revise README for tokenizer clarity and details

Updated README to clarify tokenizer functionality and removed correctness section.

---------

Co-authored-by: jmorganca <jmorganca@gmail.com>
2026-01-08 16:18:59 -08:00
Daniel Hiltgen
34d0c55ea5 Linux: switch to zstd compression (#13651)
With the upcoming addition of MLX, the linux bundle will exceed the
maximum github artifact size of 2G.  This change will bring the size
back down.

The install.sh changes support backwards compatibility with prior versions and
thus should be safe to merge concurrently with this change.
2026-01-08 15:47:32 -08:00
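(As a reference point, a .tar.zst bundle can be unpacked with a zstd-aware tar; the filename below is illustrative, and GNU tar 1.31+ with zstd installed is assumed.)

  sudo tar -C /usr -x --zstd -f ollama-linux-amd64.tar.zst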
Parth Sareen
53a5a9e9ae x: redesign agent UI with minimal styling (#13650) 2026-01-08 15:40:07 -08:00
Parth Sareen
e30e08a7d6 x: remove Ctrl+O tool output expansion feature (#13640) 2026-01-07 15:34:08 -08:00
Parth Sareen
12e2b3514a x: agent loop ux improvements (#13635) 2026-01-07 01:27:15 -08:00
Devon Rifkin
626af2d809 template: fix args-as-json rendering (#13636)
In #13525, I accidentally broke templates' ability to automatically
render tool call function arguments as JSON.

We do need these to be proper maps because we need templates to be able
to call range, which can't be done on custom types.
2026-01-06 18:33:57 -08:00
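(A small illustration of the constraint above, not the project's templates: text/template's range action iterates over maps, slices, arrays and channels, but not over arbitrary custom types, which is why arguments are handed to templates as plain maps.)

package main

import (
	"os"
	"text/template"
)

func main() {
	// range works on a plain map; a custom argument type would fail with
	// "range can't iterate over ...", hence the maps mentioned above.
	tmpl := template.Must(template.New("args").Parse(
		`{{range $k, $v := .}}{{$k}}={{$v}} {{end}}`))
	_ = tmpl.Execute(os.Stdout, map[string]any{"location": "Paris", "unit": "celsius"})
}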
Parth Sareen
76912c062a x: add experimental agent loop (#13628) 2026-01-05 23:38:40 -08:00
Devon Rifkin
6c3faafed2 olmo3: fix flaky test (#13629)
I introduced this in <https://github.com/ollama/ollama/pull/13525>
2026-01-05 22:37:20 -08:00
Devon Rifkin
e51dead636 preserve tool definition and call JSON ordering (#13525)
* preserve tool definition and call JSON ordering

This is another iteration of
<https://github.com/ollama/ollama/pull/12518>, but this time we've
simplified things by relaxing the competing requirements of being
compatible AND order-preserving with templates (vs. renderers). We
maintain backwards compatibility at the cost of not guaranteeing order
for templates. We plan on moving more and more models to renderers,
which have been updated to use these new data types, and additionally
we could add an opt-in way of templates getting an order-preserved list
(e.g., via sibling template vars)

* orderedmap_test: remove testify
2026-01-05 18:03:36 -08:00
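(A sketch of the intended behaviour using the new ordered-argument type that appears in the api/types.go portion of this diff; this is not the project's test code, and it assumes the internal orderedmap preserves insertion/document order as described above.)

package main

import (
	"encoding/json"
	"fmt"

	"github.com/ollama/ollama/api"
)

func main() {
	args := api.NewToolCallFunctionArguments()
	args.Set("location", "Paris")
	args.Set("unit", "celsius")

	// Marshalling is expected to keep insertion order rather than Go's
	// randomized map iteration order.
	b, _ := json.Marshal(args)
	fmt.Println(string(b)) // {"location":"Paris","unit":"celsius"}

	// Unmarshalling keeps the order found in the JSON document (Go 1.23+ range-over-func).
	var back api.ToolCallFunctionArguments
	_ = json.Unmarshal([]byte(`{"b":1,"a":2}`), &back)
	for k, v := range back.All() {
		fmt.Println(k, v)
	}
}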
Harry V. Kiselev
d087e46bd1 docs/capabilities/vision: fix curl related code snippet (#13615) 2026-01-03 17:27:46 -05:00
lif
37f6f3af24 server: return error when embedding contains NaN or Inf values (#13599)
The normalize function now checks for NaN and Inf values in the
embedding vector before processing. This prevents JSON encoding
failures when models produce invalid floating-point values.

Fixes #13572

Signed-off-by: majiayu000 <1835304752@qq.com>
2026-01-03 02:20:12 -05:00
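(A minimal sketch of that kind of guard; the helper name is hypothetical and this is not the server's actual normalize code.)

package main

import (
	"fmt"
	"math"
)

// validEmbedding reports whether every component is finite, so the vector can
// be normalized and JSON-encoded safely (JSON has no representation for NaN or Inf).
func validEmbedding(v []float32) bool {
	for _, x := range v {
		f := float64(x)
		if math.IsNaN(f) || math.IsInf(f, 0) {
			return false
		}
	}
	return true
}

func main() {
	fmt.Println(validEmbedding([]float32{0.1, 0.2}))                 // true
	fmt.Println(validEmbedding([]float32{float32(math.NaN()), 0.2})) // false
}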
Nhan Nguyen
e1bdc23dd2 docs: fix tool name mismatch and trailing commas in api.md example (#13559)
The tool calling example used "get_temperature" for tool_calls but
defined the tool as "get_weather". Also removed trailing commas that
made the JSON invalid.

Fixes #13031
2026-01-03 02:14:53 -05:00
lif
2e78653ff9 app/ui: add swift syntax highlighting support (#13574)
Fixes #13476

Signed-off-by: majiayu000 <1835304752@qq.com>
2026-01-03 02:12:08 -05:00
lif
f5f74e12c1 docs: add version note for /v1/responses API (#13596)
Signed-off-by: majiayu000 <1835304752@qq.com>
2026-01-03 01:58:20 -05:00
Vallabh Mahajan
18fdcc94e5 docs: fix broken .md links and render issues (#13550) 2025-12-23 12:44:55 -05:00
Daniel Hiltgen
7ad036992f amd: use GTT on iGPUs on linux (#13196)
On Linux, look at the GTT memory information for iGPUs.
2025-12-23 09:30:05 -08:00
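(A sketch of how GTT capacity can be read on Linux, assuming the amdgpu sysfs files mem_info_gtt_total/mem_info_gtt_used; this is not the project's detection code.)

package main

import (
	"fmt"
	"os"
	"strconv"
	"strings"
)

// readGTTTotal returns the GTT size in bytes (system memory addressable by the
// GPU) for a card such as "card0", read from the amdgpu sysfs interface.
func readGTTTotal(card string) (uint64, error) {
	b, err := os.ReadFile("/sys/class/drm/" + card + "/device/mem_info_gtt_total")
	if err != nil {
		return 0, err
	}
	return strconv.ParseUint(strings.TrimSpace(string(b)), 10, 64)
}

func main() {
	if total, err := readGTTTotal("card0"); err == nil {
		fmt.Printf("GTT total: %d bytes\n", total)
	}
}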
Jesse Gross
172b5924af llm: Avoid integer underflow on llama engine memory layout
On the llama engine, when we compute the memory layout, we reserve
a buffer to allow for some flexibility for incorrect estimates.
This is subtracted from GPU free memory and on GPUs with limited
memory, it may underflow.

Fixes #13494
2025-12-19 15:48:15 -08:00
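(The failure mode is easiest to see with unsigned sizes; below is a sketch of the guard with a hypothetical helper, not the engine's actual layout code.)

package main

import "fmt"

// availableAfterReserve subtracts a reserve buffer from free GPU memory without
// underflowing: with unsigned sizes, free-reserve wraps around to a huge value
// whenever reserve > free, so clamp to zero instead.
func availableAfterReserve(free, reserve uint64) uint64 {
	if reserve >= free {
		return 0
	}
	return free - reserve
}

func main() {
	fmt.Println(availableAfterReserve(8<<30, 512<<20))   // plenty of headroom
	fmt.Println(availableAfterReserve(256<<20, 512<<20)) // clamped to 0, no wrap-around
}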
Jeffrey Morgan
8852220f59 add REQUIRES command to Modelfile (#13361) 2025-12-18 13:21:29 -08:00
Parth Sareen
7325791599 parsers/renderers: functiongemma (#13521) 2025-12-18 07:55:37 -08:00
Grace
522c11a763 Revert "Omit args and params in tool function def and calls (#13516)" (#13518)
This reverts commit 0fadeffaee.
2025-12-17 19:06:56 -08:00
Grace
0fadeffaee Omit args and params in tool function def and calls (#13516) 2025-12-17 18:42:21 -08:00
Daniel Hiltgen
49a9c9ba6a GGML update to ec98e2002 (#13451)
* Revert "add support for NVIDIA Nemotron 3 Nano"

This reverts commit e7d2ae9d69.

* GGML update to 380b4c984

Remove MaskBatchPadding as GGML_KQ_MASK_PAD is no longer present (no
padding required)

* update to c45f89d55

* ec98e2002

solar pro needed more adjusting - needs verification

* review comments
2025-12-17 13:13:55 -08:00
Parth Sareen
1c094038bc types: add nested property support for tool definitions (#13508) 2025-12-17 11:54:09 -08:00
Grace
a013693f80 DeepseekV3 Family Parser (#13484) 2025-12-16 18:56:30 -08:00
Michael Yang
f6a016f49d revert granite-embedding (#13505) 2025-12-16 15:44:52 -08:00
Bruce MacDonald
45c4739374 types: ConfigV2 and RootFS (#13504)
Refactored the ConfigV2 and RootFS types from server/images.go to a new types/model/config.go file under the model package. Updated all references to use model.ConfigV2 and model.RootFS. This allows for use in other projects without worrying about compiling the C code in the llama package.
2025-12-16 15:18:17 -08:00
Michael Yang
2dd029de12 remove unnecessary code (#13502)
slog is already lazily evaluated so this code is completely redundant
2025-12-16 15:11:26 -08:00
Michael Yang
903b1fc97f use ollama engine for bert models (#13501)
register bpe tokenizer which enables granite-embedding
2025-12-16 11:29:19 -08:00
406 changed files with 58852 additions and 7491 deletions

View File

@@ -68,6 +68,7 @@ jobs:
name: bundles-darwin
path: |
dist/*.tgz
dist/*.tar.zst
dist/*.zip
dist/*.dmg
@@ -392,13 +393,13 @@ jobs:
done
- run: |
for ARCHIVE in dist/${{ matrix.os }}-${{ matrix.arch }}/*.tar.in; do
tar c -C dist/${{ matrix.os }}-${{ matrix.arch }} -T $ARCHIVE --owner 0 --group 0 | pigz -9vc >$(basename ${ARCHIVE//.*/}.tgz);
tar c -C dist/${{ matrix.os }}-${{ matrix.arch }} -T $ARCHIVE --owner 0 --group 0 | zstd --ultra -22 -T0 >$(basename ${ARCHIVE//.*/}.tar.zst);
done
- uses: actions/upload-artifact@v4
with:
name: bundles-${{ matrix.os }}-${{ matrix.arch }}-${{ matrix.target }}
path: |
*.tgz
*.tar.zst
# Build each Docker variant (OS, arch, and flavor) separately. Using QEMU is unreliable and slower.
docker-build-push:
@@ -531,7 +532,7 @@ jobs:
- name: Upload release artifacts
run: |
pids=()
for payload in dist/*.txt dist/*.zip dist/*.tgz dist/*.exe dist/*.dmg ; do
for payload in dist/*.txt dist/*.zip dist/*.tgz dist/*.tar.zst dist/*.exe dist/*.dmg ; do
echo "Uploading $payload"
gh release upload ${GITHUB_REF_NAME} $payload --clobber &
pids[$!]=$!

View File

@@ -2,6 +2,22 @@ cmake_minimum_required(VERSION 3.21)
project(Ollama C CXX)
# Handle cross-compilation on macOS: when CMAKE_OSX_ARCHITECTURES is set to a
# single architecture different from the host, override CMAKE_SYSTEM_PROCESSOR
# to match. This is necessary because CMAKE_SYSTEM_PROCESSOR defaults to the
# host architecture, but downstream projects (like MLX) use it to detect the
# target architecture.
if(CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_OSX_ARCHITECTURES MATCHES ";")
# Single architecture specified
if(CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" AND NOT CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
message(STATUS "Cross-compiling for x86_64: overriding CMAKE_SYSTEM_PROCESSOR from ${CMAKE_SYSTEM_PROCESSOR} to x86_64")
set(CMAKE_SYSTEM_PROCESSOR "x86_64")
elseif(CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" AND NOT CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
message(STATUS "Cross-compiling for arm64: overriding CMAKE_SYSTEM_PROCESSOR from ${CMAKE_SYSTEM_PROCESSOR} to arm64")
set(CMAKE_SYSTEM_PROCESSOR "arm64")
endif()
endif()
include(CheckLanguage)
include(GNUInstallDirs)
@@ -12,7 +28,7 @@ set(BUILD_SHARED_LIBS ON)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_CXX_EXTENSIONS ON) # Recent versions of MLX require gnu++17 extensions to compile properly
set(GGML_BUILD ON)
set(GGML_SHARED ON)
@@ -147,14 +163,48 @@ if(CMAKE_HIP_COMPILER)
endif()
endif()
find_package(Vulkan)
if(Vulkan_FOUND)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-vulkan)
install(TARGETS ggml-vulkan
RUNTIME_DEPENDENCIES
PRE_INCLUDE_REGEXES vulkan
PRE_EXCLUDE_REGEXES ".*"
RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT Vulkan
LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT Vulkan
)
if(NOT APPLE)
find_package(Vulkan)
if(Vulkan_FOUND)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-vulkan)
install(TARGETS ggml-vulkan
RUNTIME_DEPENDENCIES
PRE_INCLUDE_REGEXES vulkan
PRE_EXCLUDE_REGEXES ".*"
RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT Vulkan
LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT Vulkan
)
endif()
endif()
option(MLX_ENGINE "Enable MLX backend" OFF)
if(MLX_ENGINE)
message(STATUS "Setting up MLX (this takes a while...)")
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/x/ml/backend/mlx)
# Find CUDA toolkit if MLX is built with CUDA support
find_package(CUDAToolkit)
install(TARGETS mlx mlxc
RUNTIME_DEPENDENCIES
DIRECTORIES ${CUDAToolkit_BIN_DIR} ${CUDAToolkit_BIN_DIR}/x64 ${CUDAToolkit_LIBRARY_DIR}
PRE_INCLUDE_REGEXES cublas cublasLt cudart nvrtc cudnn nccl
PRE_EXCLUDE_REGEXES ".*"
RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT MLX
LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT MLX
FRAMEWORK DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT MLX
)
# Manually install cudart and cublas since they might not be picked up as direct dependencies
if(CUDAToolkit_FOUND)
file(GLOB CUDART_LIBS
"${CUDAToolkit_LIBRARY_DIR}/libcudart.so*"
"${CUDAToolkit_LIBRARY_DIR}/libcublas.so*")
if(CUDART_LIBS)
install(FILES ${CUDART_LIBS}
DESTINATION ${OLLAMA_INSTALL_DIR}
COMPONENT MLX)
endif()
endif()
endif()

View File

@@ -41,7 +41,7 @@
"inherits": [ "CUDA" ],
"cacheVariables": {
"CMAKE_CUDA_ARCHITECTURES": "75-virtual;80-virtual;86-virtual;87-virtual;89-virtual;90-virtual;90a-virtual;100-virtual;103-virtual;110-virtual;120-virtual;121-virtual",
"CMAKE_CUDA_FLAGS": "-t 2",
"CMAKE_CUDA_FLAGS": "-t 4",
"OLLAMA_RUNNER_DIR": "cuda_v13"
}
},
@@ -83,6 +83,28 @@
"cacheVariables": {
"OLLAMA_RUNNER_DIR": "vulkan"
}
},
{
"name": "MLX",
"inherits": [ "Default" ],
"cacheVariables": {
"MLX_ENGINE": "ON",
"OLLAMA_RUNNER_DIR": "mlx"
}
},
{
"name": "MLX CUDA 12",
"inherits": [ "MLX", "CUDA 12" ],
"cacheVariables": {
"OLLAMA_RUNNER_DIR": "mlx_cuda_v12"
}
},
{
"name": "MLX CUDA 13",
"inherits": [ "MLX", "CUDA 13" ],
"cacheVariables": {
"OLLAMA_RUNNER_DIR": "mlx_cuda_v13"
}
}
],
"buildPresets": [
@@ -140,6 +162,21 @@
"name": "Vulkan",
"targets": [ "ggml-vulkan" ],
"configurePreset": "Vulkan"
},
{
"name": "MLX",
"targets": [ "mlx", "mlxc" ],
"configurePreset": "MLX"
},
{
"name": "MLX CUDA 12",
"targets": [ "mlx", "mlxc" ],
"configurePreset": "MLX CUDA 12"
},
{
"name": "MLX CUDA 13",
"targets": [ "mlx", "mlxc" ],
"configurePreset": "MLX CUDA 13"
}
]
}

View File

@@ -131,8 +131,36 @@ COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
RUN --mount=type=cache,target=/root/.ccache \
cmake --preset 'Vulkan' \
&& cmake --build --parallel --preset 'Vulkan' \
&& cmake --install build --component Vulkan --strip --parallel 8
&& cmake --install build --component Vulkan --strip --parallel 8
FROM base AS mlx
ARG CUDA13VERSION=13.0
RUN dnf install -y cuda-toolkit-${CUDA13VERSION//./-} \
&& dnf install -y openblas-devel lapack-devel \
&& dnf install -y libcudnn9-cuda-13 libcudnn9-devel-cuda-13 \
&& dnf install -y libnccl libnccl-devel
ENV PATH=/usr/local/cuda-13/bin:$PATH
ENV BLAS_INCLUDE_DIRS=/usr/include/openblas
ENV LAPACK_INCLUDE_DIRS=/usr/include/openblas
ENV CGO_LDFLAGS="-L/usr/local/cuda-13/lib64 -L/usr/local/cuda-13/targets/x86_64-linux/lib/stubs"
ARG PARALLEL
WORKDIR /go/src/github.com/ollama/ollama
COPY CMakeLists.txt CMakePresets.json .
COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
COPY x/ml/backend/mlx x/ml/backend/mlx
COPY go.mod go.sum .
RUN curl -fsSL https://golang.org/dl/go$(awk '/^go/ { print $2 }' go.mod).linux-$(case $(uname -m) in x86_64) echo amd64 ;; aarch64) echo arm64 ;; esac).tar.gz | tar xz -C /usr/local
ENV PATH=/usr/local/go/bin:$PATH
RUN go mod download
RUN --mount=type=cache,target=/root/.ccache \
cmake --preset 'MLX CUDA 13' -DBLAS_INCLUDE_DIRS=/usr/include/openblas -DLAPACK_INCLUDE_DIRS=/usr/include/openblas \
&& cmake --build --parallel ${PARALLEL} --preset 'MLX CUDA 13' \
&& cmake --install build --component MLX --strip --parallel ${PARALLEL}
COPY . .
ARG GOFLAGS="'-ldflags=-w -s'"
ENV CGO_ENABLED=1
ARG CGO_CFLAGS
ARG CGO_CXXFLAGS
FROM base AS build
WORKDIR /go/src/github.com/ollama/ollama
@@ -153,6 +181,7 @@ FROM --platform=linux/amd64 scratch AS amd64
COPY --from=cuda-12 dist/lib/ollama /lib/ollama/
COPY --from=cuda-13 dist/lib/ollama /lib/ollama/
COPY --from=vulkan dist/lib/ollama /lib/ollama/
COPY --from=mlx /go/src/github.com/ollama/ollama/dist/lib/ollama /lib/ollama/
FROM --platform=linux/arm64 scratch AS arm64
# COPY --from=cuda-11 dist/lib/ollama/ /lib/ollama/
@@ -171,7 +200,7 @@ COPY --from=build /bin/ollama /bin/ollama
FROM ubuntu:24.04
RUN apt-get update \
&& apt-get install -y ca-certificates libvulkan1 \
&& apt-get install -y ca-certificates libvulkan1 libopenblas0 \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
COPY --from=archive /bin /usr/bin

View File

@@ -1,6 +1,6 @@
UPSTREAM=https://github.com/ggml-org/llama.cpp.git
WORKDIR=llama/vendor
FETCH_HEAD=17f7f4baad8b3a716ee139da7bb56ae984e8c0fa
FETCH_HEAD=ec98e2002
.PHONY: help
help:

anthropic/anthropic.go (new file, 778 lines)
View File

@@ -0,0 +1,778 @@
package anthropic
import (
"crypto/rand"
"encoding/base64"
"encoding/json"
"errors"
"fmt"
"log/slog"
"net/http"
"strings"
"time"
"github.com/ollama/ollama/api"
)
// Error types matching Anthropic API
type Error struct {
Type string `json:"type"`
Message string `json:"message"`
}
type ErrorResponse struct {
Type string `json:"type"` // always "error"
Error Error `json:"error"`
RequestID string `json:"request_id,omitempty"`
}
// NewError creates a new ErrorResponse with the appropriate error type based on HTTP status code
func NewError(code int, message string) ErrorResponse {
var etype string
switch code {
case http.StatusBadRequest:
etype = "invalid_request_error"
case http.StatusUnauthorized:
etype = "authentication_error"
case http.StatusForbidden:
etype = "permission_error"
case http.StatusNotFound:
etype = "not_found_error"
case http.StatusTooManyRequests:
etype = "rate_limit_error"
case http.StatusServiceUnavailable, 529:
etype = "overloaded_error"
default:
etype = "api_error"
}
return ErrorResponse{
Type: "error",
Error: Error{Type: etype, Message: message},
RequestID: generateID("req"),
}
}
// Request types
// MessagesRequest represents an Anthropic Messages API request
type MessagesRequest struct {
Model string `json:"model"`
MaxTokens int `json:"max_tokens"`
Messages []MessageParam `json:"messages"`
System any `json:"system,omitempty"` // string or []ContentBlock
Stream bool `json:"stream,omitempty"`
Temperature *float64 `json:"temperature,omitempty"`
TopP *float64 `json:"top_p,omitempty"`
TopK *int `json:"top_k,omitempty"`
StopSequences []string `json:"stop_sequences,omitempty"`
Tools []Tool `json:"tools,omitempty"`
ToolChoice *ToolChoice `json:"tool_choice,omitempty"`
Thinking *ThinkingConfig `json:"thinking,omitempty"`
Metadata *Metadata `json:"metadata,omitempty"`
}
// MessageParam represents a message in the request
type MessageParam struct {
Role string `json:"role"` // "user" or "assistant"
Content any `json:"content"` // string or []ContentBlock
}
// ContentBlock represents a content block in a message.
// Text and Thinking use pointers so they serialize as the field being present (even if empty)
// only when set, which is required for SDK streaming accumulation.
type ContentBlock struct {
Type string `json:"type"` // text, image, tool_use, tool_result, thinking
// For text blocks - pointer so field only appears when set (SDK requires it for accumulation)
Text *string `json:"text,omitempty"`
// For image blocks
Source *ImageSource `json:"source,omitempty"`
// For tool_use blocks
ID string `json:"id,omitempty"`
Name string `json:"name,omitempty"`
Input any `json:"input,omitempty"`
// For tool_result blocks
ToolUseID string `json:"tool_use_id,omitempty"`
Content any `json:"content,omitempty"` // string or []ContentBlock
IsError bool `json:"is_error,omitempty"`
// For thinking blocks - pointer so field only appears when set (SDK requires it for accumulation)
Thinking *string `json:"thinking,omitempty"`
Signature string `json:"signature,omitempty"`
}
// ImageSource represents the source of an image
type ImageSource struct {
Type string `json:"type"` // "base64" or "url"
MediaType string `json:"media_type,omitempty"`
Data string `json:"data,omitempty"`
URL string `json:"url,omitempty"`
}
// Tool represents a tool definition
type Tool struct {
Type string `json:"type,omitempty"` // "custom" for user-defined tools
Name string `json:"name"`
Description string `json:"description,omitempty"`
InputSchema json.RawMessage `json:"input_schema,omitempty"`
}
// ToolChoice controls how the model uses tools
type ToolChoice struct {
Type string `json:"type"` // "auto", "any", "tool", "none"
Name string `json:"name,omitempty"`
DisableParallelToolUse bool `json:"disable_parallel_tool_use,omitempty"`
}
// ThinkingConfig controls extended thinking
type ThinkingConfig struct {
Type string `json:"type"` // "enabled" or "disabled"
BudgetTokens int `json:"budget_tokens,omitempty"`
}
// Metadata for the request
type Metadata struct {
UserID string `json:"user_id,omitempty"`
}
// Response types
// MessagesResponse represents an Anthropic Messages API response
type MessagesResponse struct {
ID string `json:"id"`
Type string `json:"type"` // "message"
Role string `json:"role"` // "assistant"
Model string `json:"model"`
Content []ContentBlock `json:"content"`
StopReason string `json:"stop_reason,omitempty"`
StopSequence string `json:"stop_sequence,omitempty"`
Usage Usage `json:"usage"`
}
// Usage contains token usage information
type Usage struct {
InputTokens int `json:"input_tokens"`
OutputTokens int `json:"output_tokens"`
}
// Streaming event types
// MessageStartEvent is sent at the start of streaming
type MessageStartEvent struct {
Type string `json:"type"` // "message_start"
Message MessagesResponse `json:"message"`
}
// ContentBlockStartEvent signals the start of a content block
type ContentBlockStartEvent struct {
Type string `json:"type"` // "content_block_start"
Index int `json:"index"`
ContentBlock ContentBlock `json:"content_block"`
}
// ContentBlockDeltaEvent contains incremental content updates
type ContentBlockDeltaEvent struct {
Type string `json:"type"` // "content_block_delta"
Index int `json:"index"`
Delta Delta `json:"delta"`
}
// Delta represents an incremental update
type Delta struct {
Type string `json:"type"` // "text_delta", "input_json_delta", "thinking_delta", "signature_delta"
Text string `json:"text,omitempty"`
PartialJSON string `json:"partial_json,omitempty"`
Thinking string `json:"thinking,omitempty"`
Signature string `json:"signature,omitempty"`
}
// ContentBlockStopEvent signals the end of a content block
type ContentBlockStopEvent struct {
Type string `json:"type"` // "content_block_stop"
Index int `json:"index"`
}
// MessageDeltaEvent contains updates to the message
type MessageDeltaEvent struct {
Type string `json:"type"` // "message_delta"
Delta MessageDelta `json:"delta"`
Usage DeltaUsage `json:"usage"`
}
// MessageDelta contains stop information
type MessageDelta struct {
StopReason string `json:"stop_reason,omitempty"`
StopSequence string `json:"stop_sequence,omitempty"`
}
// DeltaUsage contains cumulative token usage
type DeltaUsage struct {
OutputTokens int `json:"output_tokens"`
}
// MessageStopEvent signals the end of the message
type MessageStopEvent struct {
Type string `json:"type"` // "message_stop"
}
// PingEvent is a keepalive event
type PingEvent struct {
Type string `json:"type"` // "ping"
}
// StreamErrorEvent is an error during streaming
type StreamErrorEvent struct {
Type string `json:"type"` // "error"
Error Error `json:"error"`
}
// FromMessagesRequest converts an Anthropic MessagesRequest to an Ollama api.ChatRequest
func FromMessagesRequest(r MessagesRequest) (*api.ChatRequest, error) {
var messages []api.Message
if r.System != nil {
switch sys := r.System.(type) {
case string:
if sys != "" {
messages = append(messages, api.Message{Role: "system", Content: sys})
}
case []any:
// System can be an array of content blocks
var content strings.Builder
for _, block := range sys {
if blockMap, ok := block.(map[string]any); ok {
if blockMap["type"] == "text" {
if text, ok := blockMap["text"].(string); ok {
content.WriteString(text)
}
}
}
}
if content.Len() > 0 {
messages = append(messages, api.Message{Role: "system", Content: content.String()})
}
}
}
for _, msg := range r.Messages {
converted, err := convertMessage(msg)
if err != nil {
return nil, err
}
messages = append(messages, converted...)
}
options := make(map[string]any)
options["num_predict"] = r.MaxTokens
if r.Temperature != nil {
options["temperature"] = *r.Temperature
}
if r.TopP != nil {
options["top_p"] = *r.TopP
}
if r.TopK != nil {
options["top_k"] = *r.TopK
}
if len(r.StopSequences) > 0 {
options["stop"] = r.StopSequences
}
var tools api.Tools
for _, t := range r.Tools {
tool, err := convertTool(t)
if err != nil {
return nil, err
}
tools = append(tools, tool)
}
var think *api.ThinkValue
if r.Thinking != nil && r.Thinking.Type == "enabled" {
think = &api.ThinkValue{Value: true}
}
stream := r.Stream
return &api.ChatRequest{
Model: r.Model,
Messages: messages,
Options: options,
Stream: &stream,
Tools: tools,
Think: think,
}, nil
}
// convertMessage converts an Anthropic MessageParam to Ollama api.Message(s)
func convertMessage(msg MessageParam) ([]api.Message, error) {
var messages []api.Message
role := strings.ToLower(msg.Role)
switch content := msg.Content.(type) {
case string:
messages = append(messages, api.Message{Role: role, Content: content})
case []any:
var textContent strings.Builder
var images []api.ImageData
var toolCalls []api.ToolCall
var thinking string
var toolResults []api.Message
for _, block := range content {
blockMap, ok := block.(map[string]any)
if !ok {
return nil, errors.New("invalid content block format")
}
blockType, _ := blockMap["type"].(string)
switch blockType {
case "text":
if text, ok := blockMap["text"].(string); ok {
textContent.WriteString(text)
}
case "image":
source, ok := blockMap["source"].(map[string]any)
if !ok {
return nil, errors.New("invalid image source")
}
sourceType, _ := source["type"].(string)
if sourceType == "base64" {
data, _ := source["data"].(string)
decoded, err := base64.StdEncoding.DecodeString(data)
if err != nil {
return nil, fmt.Errorf("invalid base64 image data: %w", err)
}
images = append(images, decoded)
} else {
return nil, fmt.Errorf("invalid image source type: %s. Only base64 images are supported.", sourceType)
}
// URL images would need to be fetched - skip for now
case "tool_use":
id, ok := blockMap["id"].(string)
if !ok {
return nil, errors.New("tool_use block missing required 'id' field")
}
name, ok := blockMap["name"].(string)
if !ok {
return nil, errors.New("tool_use block missing required 'name' field")
}
tc := api.ToolCall{
ID: id,
Function: api.ToolCallFunction{
Name: name,
},
}
if input, ok := blockMap["input"].(map[string]any); ok {
tc.Function.Arguments = mapToArgs(input)
}
toolCalls = append(toolCalls, tc)
case "tool_result":
toolUseID, _ := blockMap["tool_use_id"].(string)
var resultContent string
switch c := blockMap["content"].(type) {
case string:
resultContent = c
case []any:
for _, cb := range c {
if cbMap, ok := cb.(map[string]any); ok {
if cbMap["type"] == "text" {
if text, ok := cbMap["text"].(string); ok {
resultContent += text
}
}
}
}
}
toolResults = append(toolResults, api.Message{
Role: "tool",
Content: resultContent,
ToolCallID: toolUseID,
})
case "thinking":
if t, ok := blockMap["thinking"].(string); ok {
thinking = t
}
}
}
if textContent.Len() > 0 || len(images) > 0 || len(toolCalls) > 0 || thinking != "" {
m := api.Message{
Role: role,
Content: textContent.String(),
Images: images,
ToolCalls: toolCalls,
Thinking: thinking,
}
messages = append(messages, m)
}
// Add tool results as separate messages
messages = append(messages, toolResults...)
default:
return nil, fmt.Errorf("invalid message content type: %T", content)
}
return messages, nil
}
// convertTool converts an Anthropic Tool to an Ollama api.Tool
func convertTool(t Tool) (api.Tool, error) {
var params api.ToolFunctionParameters
if len(t.InputSchema) > 0 {
if err := json.Unmarshal(t.InputSchema, &params); err != nil {
return api.Tool{}, fmt.Errorf("invalid input_schema for tool %q: %w", t.Name, err)
}
}
return api.Tool{
Type: "function",
Function: api.ToolFunction{
Name: t.Name,
Description: t.Description,
Parameters: params,
},
}, nil
}
// ToMessagesResponse converts an Ollama api.ChatResponse to an Anthropic MessagesResponse
func ToMessagesResponse(id string, r api.ChatResponse) MessagesResponse {
var content []ContentBlock
if r.Message.Thinking != "" {
content = append(content, ContentBlock{
Type: "thinking",
Thinking: ptr(r.Message.Thinking),
})
}
if r.Message.Content != "" {
content = append(content, ContentBlock{
Type: "text",
Text: ptr(r.Message.Content),
})
}
for _, tc := range r.Message.ToolCalls {
content = append(content, ContentBlock{
Type: "tool_use",
ID: tc.ID,
Name: tc.Function.Name,
Input: tc.Function.Arguments,
})
}
stopReason := mapStopReason(r.DoneReason, len(r.Message.ToolCalls) > 0)
return MessagesResponse{
ID: id,
Type: "message",
Role: "assistant",
Model: r.Model,
Content: content,
StopReason: stopReason,
Usage: Usage{
InputTokens: r.Metrics.PromptEvalCount,
OutputTokens: r.Metrics.EvalCount,
},
}
}
// mapStopReason converts Ollama done_reason to Anthropic stop_reason
func mapStopReason(reason string, hasToolCalls bool) string {
if hasToolCalls {
return "tool_use"
}
switch reason {
case "stop":
return "end_turn"
case "length":
return "max_tokens"
default:
if reason != "" {
return "stop_sequence"
}
return ""
}
}
// StreamConverter manages state for converting Ollama streaming responses to Anthropic format
type StreamConverter struct {
ID string
Model string
firstWrite bool
contentIndex int
inputTokens int
outputTokens int
thinkingStarted bool
thinkingDone bool
textStarted bool
toolCallsSent map[string]bool
}
func NewStreamConverter(id, model string) *StreamConverter {
return &StreamConverter{
ID: id,
Model: model,
firstWrite: true,
toolCallsSent: make(map[string]bool),
}
}
// StreamEvent represents a streaming event to be sent to the client
type StreamEvent struct {
Event string
Data any
}
// Process converts an Ollama ChatResponse to Anthropic streaming events
func (c *StreamConverter) Process(r api.ChatResponse) []StreamEvent {
var events []StreamEvent
if c.firstWrite {
c.firstWrite = false
c.inputTokens = r.Metrics.PromptEvalCount
events = append(events, StreamEvent{
Event: "message_start",
Data: MessageStartEvent{
Type: "message_start",
Message: MessagesResponse{
ID: c.ID,
Type: "message",
Role: "assistant",
Model: c.Model,
Content: []ContentBlock{},
Usage: Usage{
InputTokens: c.inputTokens,
OutputTokens: 0,
},
},
},
})
}
if r.Message.Thinking != "" && !c.thinkingDone {
if !c.thinkingStarted {
c.thinkingStarted = true
events = append(events, StreamEvent{
Event: "content_block_start",
Data: ContentBlockStartEvent{
Type: "content_block_start",
Index: c.contentIndex,
ContentBlock: ContentBlock{
Type: "thinking",
Thinking: ptr(""),
},
},
})
}
events = append(events, StreamEvent{
Event: "content_block_delta",
Data: ContentBlockDeltaEvent{
Type: "content_block_delta",
Index: c.contentIndex,
Delta: Delta{
Type: "thinking_delta",
Thinking: r.Message.Thinking,
},
},
})
}
if r.Message.Content != "" {
if c.thinkingStarted && !c.thinkingDone {
c.thinkingDone = true
events = append(events, StreamEvent{
Event: "content_block_stop",
Data: ContentBlockStopEvent{
Type: "content_block_stop",
Index: c.contentIndex,
},
})
c.contentIndex++
}
if !c.textStarted {
c.textStarted = true
events = append(events, StreamEvent{
Event: "content_block_start",
Data: ContentBlockStartEvent{
Type: "content_block_start",
Index: c.contentIndex,
ContentBlock: ContentBlock{
Type: "text",
Text: ptr(""),
},
},
})
}
events = append(events, StreamEvent{
Event: "content_block_delta",
Data: ContentBlockDeltaEvent{
Type: "content_block_delta",
Index: c.contentIndex,
Delta: Delta{
Type: "text_delta",
Text: r.Message.Content,
},
},
})
}
for _, tc := range r.Message.ToolCalls {
if c.toolCallsSent[tc.ID] {
continue
}
if c.textStarted {
events = append(events, StreamEvent{
Event: "content_block_stop",
Data: ContentBlockStopEvent{
Type: "content_block_stop",
Index: c.contentIndex,
},
})
c.contentIndex++
c.textStarted = false
}
argsJSON, err := json.Marshal(tc.Function.Arguments)
if err != nil {
slog.Error("failed to marshal tool arguments", "error", err, "tool_id", tc.ID)
continue
}
events = append(events, StreamEvent{
Event: "content_block_start",
Data: ContentBlockStartEvent{
Type: "content_block_start",
Index: c.contentIndex,
ContentBlock: ContentBlock{
Type: "tool_use",
ID: tc.ID,
Name: tc.Function.Name,
Input: map[string]any{},
},
},
})
events = append(events, StreamEvent{
Event: "content_block_delta",
Data: ContentBlockDeltaEvent{
Type: "content_block_delta",
Index: c.contentIndex,
Delta: Delta{
Type: "input_json_delta",
PartialJSON: string(argsJSON),
},
},
})
events = append(events, StreamEvent{
Event: "content_block_stop",
Data: ContentBlockStopEvent{
Type: "content_block_stop",
Index: c.contentIndex,
},
})
c.toolCallsSent[tc.ID] = true
c.contentIndex++
}
if r.Done {
if c.textStarted {
events = append(events, StreamEvent{
Event: "content_block_stop",
Data: ContentBlockStopEvent{
Type: "content_block_stop",
Index: c.contentIndex,
},
})
} else if c.thinkingStarted && !c.thinkingDone {
events = append(events, StreamEvent{
Event: "content_block_stop",
Data: ContentBlockStopEvent{
Type: "content_block_stop",
Index: c.contentIndex,
},
})
}
c.outputTokens = r.Metrics.EvalCount
stopReason := mapStopReason(r.DoneReason, len(c.toolCallsSent) > 0)
events = append(events, StreamEvent{
Event: "message_delta",
Data: MessageDeltaEvent{
Type: "message_delta",
Delta: MessageDelta{
StopReason: stopReason,
},
Usage: DeltaUsage{
OutputTokens: c.outputTokens,
},
},
})
events = append(events, StreamEvent{
Event: "message_stop",
Data: MessageStopEvent{
Type: "message_stop",
},
})
}
return events
}
// generateID generates a unique ID with the given prefix using crypto/rand
func generateID(prefix string) string {
b := make([]byte, 12)
if _, err := rand.Read(b); err != nil {
// Fallback to time-based ID if crypto/rand fails
return fmt.Sprintf("%s_%d", prefix, time.Now().UnixNano())
}
return fmt.Sprintf("%s_%x", prefix, b)
}
// GenerateMessageID generates a unique message ID
func GenerateMessageID() string {
return generateID("msg")
}
// ptr returns a pointer to the given string value
func ptr(s string) *string {
return &s
}
// mapToArgs converts a map to ToolCallFunctionArguments
func mapToArgs(m map[string]any) api.ToolCallFunctionArguments {
args := api.NewToolCallFunctionArguments()
for k, v := range m {
args.Set(k, v)
}
return args
}
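(To make the conversion flow concrete, a minimal non-streaming round trip through this package might look like the sketch below; the model name and the canned api.ChatResponse stand in for a real call through the chat handler.)

package main

import (
	"fmt"

	"github.com/ollama/ollama/anthropic"
	"github.com/ollama/ollama/api"
)

func main() {
	req := anthropic.MessagesRequest{
		Model:     "some-local-model", // placeholder
		MaxTokens: 256,
		Messages:  []anthropic.MessageParam{{Role: "user", Content: "Hello"}},
	}

	chatReq, err := anthropic.FromMessagesRequest(req)
	if err != nil {
		panic(err)
	}

	// In the server this request goes through the normal chat path; a canned
	// response stands in here.
	resp := api.ChatResponse{
		Model:      chatReq.Model,
		Message:    api.Message{Role: "assistant", Content: "Hi there!"},
		Done:       true,
		DoneReason: "stop",
	}

	out := anthropic.ToMessagesResponse(anthropic.GenerateMessageID(), resp)
	fmt.Println(out.StopReason, *out.Content[0].Text) // end_turn Hi there!
}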

anthropic/anthropic_test.go (new file, 953 lines)
View File

@@ -0,0 +1,953 @@
package anthropic
import (
"encoding/base64"
"encoding/json"
"testing"
"github.com/google/go-cmp/cmp"
"github.com/ollama/ollama/api"
)
const (
testImage = `iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNk+A8AAQUBAScY42YAAAAASUVORK5CYII=`
)
// testArgs creates ToolCallFunctionArguments from a map (convenience function for tests)
func testArgs(m map[string]any) api.ToolCallFunctionArguments {
args := api.NewToolCallFunctionArguments()
for k, v := range m {
args.Set(k, v)
}
return args
}
func TestFromMessagesRequest_Basic(t *testing.T) {
req := MessagesRequest{
Model: "test-model",
MaxTokens: 1024,
Messages: []MessageParam{
{Role: "user", Content: "Hello"},
},
}
result, err := FromMessagesRequest(req)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if result.Model != "test-model" {
t.Errorf("expected model 'test-model', got %q", result.Model)
}
if len(result.Messages) != 1 {
t.Fatalf("expected 1 message, got %d", len(result.Messages))
}
if result.Messages[0].Role != "user" || result.Messages[0].Content != "Hello" {
t.Errorf("unexpected message: %+v", result.Messages[0])
}
if numPredict, ok := result.Options["num_predict"].(int); !ok || numPredict != 1024 {
t.Errorf("expected num_predict 1024, got %v", result.Options["num_predict"])
}
}
func TestFromMessagesRequest_WithSystemPrompt(t *testing.T) {
req := MessagesRequest{
Model: "test-model",
MaxTokens: 1024,
System: "You are a helpful assistant.",
Messages: []MessageParam{
{Role: "user", Content: "Hello"},
},
}
result, err := FromMessagesRequest(req)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if len(result.Messages) != 2 {
t.Fatalf("expected 2 messages, got %d", len(result.Messages))
}
if result.Messages[0].Role != "system" || result.Messages[0].Content != "You are a helpful assistant." {
t.Errorf("unexpected system message: %+v", result.Messages[0])
}
}
func TestFromMessagesRequest_WithSystemPromptArray(t *testing.T) {
req := MessagesRequest{
Model: "test-model",
MaxTokens: 1024,
System: []any{
map[string]any{"type": "text", "text": "You are helpful."},
map[string]any{"type": "text", "text": " Be concise."},
},
Messages: []MessageParam{
{Role: "user", Content: "Hello"},
},
}
result, err := FromMessagesRequest(req)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if len(result.Messages) != 2 {
t.Fatalf("expected 2 messages, got %d", len(result.Messages))
}
if result.Messages[0].Content != "You are helpful. Be concise." {
t.Errorf("unexpected system message content: %q", result.Messages[0].Content)
}
}
func TestFromMessagesRequest_WithOptions(t *testing.T) {
temp := 0.7
topP := 0.9
topK := 40
req := MessagesRequest{
Model: "test-model",
MaxTokens: 2048,
Messages: []MessageParam{{Role: "user", Content: "Hello"}},
Temperature: &temp,
TopP: &topP,
TopK: &topK,
StopSequences: []string{"\n", "END"},
}
result, err := FromMessagesRequest(req)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if result.Options["temperature"] != 0.7 {
t.Errorf("expected temperature 0.7, got %v", result.Options["temperature"])
}
if result.Options["top_p"] != 0.9 {
t.Errorf("expected top_p 0.9, got %v", result.Options["top_p"])
}
if result.Options["top_k"] != 40 {
t.Errorf("expected top_k 40, got %v", result.Options["top_k"])
}
if diff := cmp.Diff([]string{"\n", "END"}, result.Options["stop"]); diff != "" {
t.Errorf("stop sequences mismatch: %s", diff)
}
}
func TestFromMessagesRequest_WithImage(t *testing.T) {
imgData, _ := base64.StdEncoding.DecodeString(testImage)
req := MessagesRequest{
Model: "test-model",
MaxTokens: 1024,
Messages: []MessageParam{
{
Role: "user",
Content: []any{
map[string]any{"type": "text", "text": "What's in this image?"},
map[string]any{
"type": "image",
"source": map[string]any{
"type": "base64",
"media_type": "image/png",
"data": testImage,
},
},
},
},
},
}
result, err := FromMessagesRequest(req)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if len(result.Messages) != 1 {
t.Fatalf("expected 1 message, got %d", len(result.Messages))
}
if result.Messages[0].Content != "What's in this image?" {
t.Errorf("expected content 'What's in this image?', got %q", result.Messages[0].Content)
}
if len(result.Messages[0].Images) != 1 {
t.Fatalf("expected 1 image, got %d", len(result.Messages[0].Images))
}
if string(result.Messages[0].Images[0]) != string(imgData) {
t.Error("image data mismatch")
}
}
func TestFromMessagesRequest_WithToolUse(t *testing.T) {
req := MessagesRequest{
Model: "test-model",
MaxTokens: 1024,
Messages: []MessageParam{
{Role: "user", Content: "What's the weather in Paris?"},
{
Role: "assistant",
Content: []any{
map[string]any{
"type": "tool_use",
"id": "call_123",
"name": "get_weather",
"input": map[string]any{"location": "Paris"},
},
},
},
},
}
result, err := FromMessagesRequest(req)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if len(result.Messages) != 2 {
t.Fatalf("expected 2 messages, got %d", len(result.Messages))
}
if len(result.Messages[1].ToolCalls) != 1 {
t.Fatalf("expected 1 tool call, got %d", len(result.Messages[1].ToolCalls))
}
tc := result.Messages[1].ToolCalls[0]
if tc.ID != "call_123" {
t.Errorf("expected tool call ID 'call_123', got %q", tc.ID)
}
if tc.Function.Name != "get_weather" {
t.Errorf("expected tool name 'get_weather', got %q", tc.Function.Name)
}
}
func TestFromMessagesRequest_WithToolResult(t *testing.T) {
req := MessagesRequest{
Model: "test-model",
MaxTokens: 1024,
Messages: []MessageParam{
{
Role: "user",
Content: []any{
map[string]any{
"type": "tool_result",
"tool_use_id": "call_123",
"content": "The weather in Paris is sunny, 22°C",
},
},
},
},
}
result, err := FromMessagesRequest(req)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if len(result.Messages) != 1 {
t.Fatalf("expected 1 message, got %d", len(result.Messages))
}
msg := result.Messages[0]
if msg.Role != "tool" {
t.Errorf("expected role 'tool', got %q", msg.Role)
}
if msg.ToolCallID != "call_123" {
t.Errorf("expected tool_call_id 'call_123', got %q", msg.ToolCallID)
}
if msg.Content != "The weather in Paris is sunny, 22°C" {
t.Errorf("unexpected content: %q", msg.Content)
}
}
func TestFromMessagesRequest_WithTools(t *testing.T) {
req := MessagesRequest{
Model: "test-model",
MaxTokens: 1024,
Messages: []MessageParam{{Role: "user", Content: "Hello"}},
Tools: []Tool{
{
Name: "get_weather",
Description: "Get current weather",
InputSchema: json.RawMessage(`{"type":"object","properties":{"location":{"type":"string"}},"required":["location"]}`),
},
},
}
result, err := FromMessagesRequest(req)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if len(result.Tools) != 1 {
t.Fatalf("expected 1 tool, got %d", len(result.Tools))
}
tool := result.Tools[0]
if tool.Type != "function" {
t.Errorf("expected type 'function', got %q", tool.Type)
}
if tool.Function.Name != "get_weather" {
t.Errorf("expected name 'get_weather', got %q", tool.Function.Name)
}
if tool.Function.Description != "Get current weather" {
t.Errorf("expected description 'Get current weather', got %q", tool.Function.Description)
}
}
func TestFromMessagesRequest_WithThinking(t *testing.T) {
req := MessagesRequest{
Model: "test-model",
MaxTokens: 1024,
Messages: []MessageParam{{Role: "user", Content: "Hello"}},
Thinking: &ThinkingConfig{Type: "enabled", BudgetTokens: 1000},
}
result, err := FromMessagesRequest(req)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if result.Think == nil {
t.Fatal("expected Think to be set")
}
if v, ok := result.Think.Value.(bool); !ok || !v {
t.Errorf("expected Think.Value to be true, got %v", result.Think.Value)
}
}
// TestFromMessagesRequest_ThinkingOnlyBlock verifies that messages containing only
// a thinking block (no text, images, or tool calls) are preserved and not dropped.
func TestFromMessagesRequest_ThinkingOnlyBlock(t *testing.T) {
req := MessagesRequest{
Model: "test-model",
MaxTokens: 1024,
Messages: []MessageParam{
{Role: "user", Content: "Hello"},
{
Role: "assistant",
Content: []any{
map[string]any{
"type": "thinking",
"thinking": "Let me think about this...",
},
},
},
},
}
result, err := FromMessagesRequest(req)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if len(result.Messages) != 2 {
t.Fatalf("expected 2 messages, got %d", len(result.Messages))
}
assistantMsg := result.Messages[1]
if assistantMsg.Thinking != "Let me think about this..." {
t.Errorf("expected thinking content, got %q", assistantMsg.Thinking)
}
}
func TestFromMessagesRequest_ToolUseMissingID(t *testing.T) {
req := MessagesRequest{
Model: "test-model",
MaxTokens: 1024,
Messages: []MessageParam{
{
Role: "assistant",
Content: []any{
map[string]any{
"type": "tool_use",
"name": "get_weather",
},
},
},
},
}
_, err := FromMessagesRequest(req)
if err == nil {
t.Fatal("expected error for missing tool_use id")
}
if err.Error() != "tool_use block missing required 'id' field" {
t.Errorf("unexpected error message: %v", err)
}
}
func TestFromMessagesRequest_ToolUseMissingName(t *testing.T) {
req := MessagesRequest{
Model: "test-model",
MaxTokens: 1024,
Messages: []MessageParam{
{
Role: "assistant",
Content: []any{
map[string]any{
"type": "tool_use",
"id": "call_123",
},
},
},
},
}
_, err := FromMessagesRequest(req)
if err == nil {
t.Fatal("expected error for missing tool_use name")
}
if err.Error() != "tool_use block missing required 'name' field" {
t.Errorf("unexpected error message: %v", err)
}
}
func TestFromMessagesRequest_InvalidToolSchema(t *testing.T) {
req := MessagesRequest{
Model: "test-model",
MaxTokens: 1024,
Messages: []MessageParam{{Role: "user", Content: "Hello"}},
Tools: []Tool{
{
Name: "bad_tool",
InputSchema: json.RawMessage(`{invalid json`),
},
},
}
_, err := FromMessagesRequest(req)
if err == nil {
t.Fatal("expected error for invalid tool schema")
}
}
func TestToMessagesResponse_Basic(t *testing.T) {
resp := api.ChatResponse{
Model: "test-model",
Message: api.Message{
Role: "assistant",
Content: "Hello there!",
},
Done: true,
DoneReason: "stop",
Metrics: api.Metrics{
PromptEvalCount: 10,
EvalCount: 5,
},
}
result := ToMessagesResponse("msg_123", resp)
if result.ID != "msg_123" {
t.Errorf("expected ID 'msg_123', got %q", result.ID)
}
if result.Type != "message" {
t.Errorf("expected type 'message', got %q", result.Type)
}
if result.Role != "assistant" {
t.Errorf("expected role 'assistant', got %q", result.Role)
}
if len(result.Content) != 1 {
t.Fatalf("expected 1 content block, got %d", len(result.Content))
}
if result.Content[0].Type != "text" || result.Content[0].Text == nil || *result.Content[0].Text != "Hello there!" {
t.Errorf("unexpected content: %+v", result.Content[0])
}
if result.StopReason != "end_turn" {
t.Errorf("expected stop_reason 'end_turn', got %q", result.StopReason)
}
if result.Usage.InputTokens != 10 || result.Usage.OutputTokens != 5 {
t.Errorf("unexpected usage: %+v", result.Usage)
}
}
func TestToMessagesResponse_WithToolCalls(t *testing.T) {
resp := api.ChatResponse{
Model: "test-model",
Message: api.Message{
Role: "assistant",
ToolCalls: []api.ToolCall{
{
ID: "call_123",
Function: api.ToolCallFunction{
Name: "get_weather",
Arguments: testArgs(map[string]any{"location": "Paris"}),
},
},
},
},
Done: true,
DoneReason: "stop",
}
result := ToMessagesResponse("msg_123", resp)
if len(result.Content) != 1 {
t.Fatalf("expected 1 content block, got %d", len(result.Content))
}
if result.Content[0].Type != "tool_use" {
t.Errorf("expected type 'tool_use', got %q", result.Content[0].Type)
}
if result.Content[0].ID != "call_123" {
t.Errorf("expected ID 'call_123', got %q", result.Content[0].ID)
}
if result.Content[0].Name != "get_weather" {
t.Errorf("expected name 'get_weather', got %q", result.Content[0].Name)
}
if result.StopReason != "tool_use" {
t.Errorf("expected stop_reason 'tool_use', got %q", result.StopReason)
}
}
func TestToMessagesResponse_WithThinking(t *testing.T) {
resp := api.ChatResponse{
Model: "test-model",
Message: api.Message{
Role: "assistant",
Content: "The answer is 42.",
Thinking: "Let me think about this...",
},
Done: true,
DoneReason: "stop",
}
result := ToMessagesResponse("msg_123", resp)
if len(result.Content) != 2 {
t.Fatalf("expected 2 content blocks, got %d", len(result.Content))
}
if result.Content[0].Type != "thinking" {
t.Errorf("expected first block type 'thinking', got %q", result.Content[0].Type)
}
if result.Content[0].Thinking == nil || *result.Content[0].Thinking != "Let me think about this..." {
t.Errorf("unexpected thinking content: %v", result.Content[0].Thinking)
}
if result.Content[1].Type != "text" {
t.Errorf("expected second block type 'text', got %q", result.Content[1].Type)
}
}
func TestMapStopReason(t *testing.T) {
tests := []struct {
reason string
hasToolCalls bool
want string
}{
{"stop", false, "end_turn"},
{"length", false, "max_tokens"},
{"stop", true, "tool_use"},
{"other", false, "stop_sequence"},
{"", false, ""},
}
for _, tt := range tests {
got := mapStopReason(tt.reason, tt.hasToolCalls)
if got != tt.want {
t.Errorf("mapStopReason(%q, %v) = %q, want %q", tt.reason, tt.hasToolCalls, got, tt.want)
}
}
}
func TestNewError(t *testing.T) {
tests := []struct {
code int
want string
}{
{400, "invalid_request_error"},
{401, "authentication_error"},
{403, "permission_error"},
{404, "not_found_error"},
{429, "rate_limit_error"},
{500, "api_error"},
{503, "overloaded_error"},
{529, "overloaded_error"},
}
for _, tt := range tests {
result := NewError(tt.code, "test message")
if result.Type != "error" {
t.Errorf("NewError(%d) type = %q, want 'error'", tt.code, result.Type)
}
if result.Error.Type != tt.want {
t.Errorf("NewError(%d) error.type = %q, want %q", tt.code, result.Error.Type, tt.want)
}
if result.Error.Message != "test message" {
t.Errorf("NewError(%d) message = %q, want 'test message'", tt.code, result.Error.Message)
}
if result.RequestID == "" {
t.Errorf("NewError(%d) request_id should not be empty", tt.code)
}
}
}
func TestGenerateMessageID(t *testing.T) {
id1 := GenerateMessageID()
id2 := GenerateMessageID()
if id1 == "" {
t.Error("GenerateMessageID returned empty string")
}
if id1 == id2 {
t.Error("GenerateMessageID returned duplicate IDs")
}
if len(id1) < 10 {
t.Errorf("GenerateMessageID returned short ID: %q", id1)
}
if id1[:4] != "msg_" {
t.Errorf("GenerateMessageID should start with 'msg_', got %q", id1[:4])
}
}
func TestStreamConverter_Basic(t *testing.T) {
conv := NewStreamConverter("msg_123", "test-model")
// First chunk
resp1 := api.ChatResponse{
Model: "test-model",
Message: api.Message{
Role: "assistant",
Content: "Hello",
},
Metrics: api.Metrics{PromptEvalCount: 10},
}
events1 := conv.Process(resp1)
if len(events1) < 3 {
t.Fatalf("expected at least 3 events for first chunk, got %d", len(events1))
}
// Should have message_start, content_block_start, content_block_delta
if events1[0].Event != "message_start" {
t.Errorf("expected first event 'message_start', got %q", events1[0].Event)
}
if events1[1].Event != "content_block_start" {
t.Errorf("expected second event 'content_block_start', got %q", events1[1].Event)
}
if events1[2].Event != "content_block_delta" {
t.Errorf("expected third event 'content_block_delta', got %q", events1[2].Event)
}
// Final chunk
resp2 := api.ChatResponse{
Model: "test-model",
Message: api.Message{
Role: "assistant",
Content: " world!",
},
Done: true,
DoneReason: "stop",
Metrics: api.Metrics{EvalCount: 5},
}
events2 := conv.Process(resp2)
// Should have content_block_delta, content_block_stop, message_delta, message_stop
hasStop := false
for _, e := range events2 {
if e.Event == "message_stop" {
hasStop = true
}
}
if !hasStop {
t.Error("expected message_stop event in final chunk")
}
}
func TestStreamConverter_WithToolCalls(t *testing.T) {
conv := NewStreamConverter("msg_123", "test-model")
resp := api.ChatResponse{
Model: "test-model",
Message: api.Message{
Role: "assistant",
ToolCalls: []api.ToolCall{
{
ID: "call_123",
Function: api.ToolCallFunction{
Name: "get_weather",
Arguments: testArgs(map[string]any{"location": "Paris"}),
},
},
},
},
Done: true,
DoneReason: "stop",
Metrics: api.Metrics{PromptEvalCount: 10, EvalCount: 5},
}
events := conv.Process(resp)
hasToolStart := false
hasToolDelta := false
for _, e := range events {
if e.Event == "content_block_start" {
if start, ok := e.Data.(ContentBlockStartEvent); ok {
if start.ContentBlock.Type == "tool_use" {
hasToolStart = true
}
}
}
if e.Event == "content_block_delta" {
if delta, ok := e.Data.(ContentBlockDeltaEvent); ok {
if delta.Delta.Type == "input_json_delta" {
hasToolDelta = true
}
}
}
}
if !hasToolStart {
t.Error("expected tool_use content_block_start event")
}
if !hasToolDelta {
t.Error("expected input_json_delta event")
}
}
func TestStreamConverter_ToolCallWithUnmarshalableArgs(t *testing.T) {
// Test that unmarshalable arguments (like channels) are handled gracefully
// and don't cause a panic or corrupt stream
conv := NewStreamConverter("msg_123", "test-model")
// Create a channel which cannot be JSON marshaled
unmarshalable := make(chan int)
badArgs := api.NewToolCallFunctionArguments()
badArgs.Set("channel", unmarshalable)
resp := api.ChatResponse{
Model: "test-model",
Message: api.Message{
Role: "assistant",
ToolCalls: []api.ToolCall{
{
ID: "call_bad",
Function: api.ToolCallFunction{
Name: "bad_function",
Arguments: badArgs,
},
},
},
},
Done: true,
DoneReason: "stop",
}
// Should not panic and should skip the unmarshalable tool call
events := conv.Process(resp)
// Verify no tool_use block was started (since marshal failed before block start)
hasToolStart := false
for _, e := range events {
if e.Event == "content_block_start" {
if start, ok := e.Data.(ContentBlockStartEvent); ok {
if start.ContentBlock.Type == "tool_use" {
hasToolStart = true
}
}
}
}
if hasToolStart {
t.Error("expected no tool_use block when arguments cannot be marshaled")
}
}
func TestStreamConverter_MultipleToolCallsWithMixedValidity(t *testing.T) {
// Test that valid tool calls still work when mixed with invalid ones
conv := NewStreamConverter("msg_123", "test-model")
unmarshalable := make(chan int)
badArgs := api.NewToolCallFunctionArguments()
badArgs.Set("channel", unmarshalable)
resp := api.ChatResponse{
Model: "test-model",
Message: api.Message{
Role: "assistant",
ToolCalls: []api.ToolCall{
{
ID: "call_good",
Function: api.ToolCallFunction{
Name: "good_function",
Arguments: testArgs(map[string]any{"location": "Paris"}),
},
},
{
ID: "call_bad",
Function: api.ToolCallFunction{
Name: "bad_function",
Arguments: badArgs,
},
},
},
},
Done: true,
DoneReason: "stop",
}
events := conv.Process(resp)
// Count tool_use blocks - should only have 1 (the valid one)
toolStartCount := 0
toolDeltaCount := 0
for _, e := range events {
if e.Event == "content_block_start" {
if start, ok := e.Data.(ContentBlockStartEvent); ok {
if start.ContentBlock.Type == "tool_use" {
toolStartCount++
if start.ContentBlock.Name != "good_function" {
t.Errorf("expected tool name 'good_function', got %q", start.ContentBlock.Name)
}
}
}
}
if e.Event == "content_block_delta" {
if delta, ok := e.Data.(ContentBlockDeltaEvent); ok {
if delta.Delta.Type == "input_json_delta" {
toolDeltaCount++
}
}
}
}
if toolStartCount != 1 {
t.Errorf("expected 1 tool_use block, got %d", toolStartCount)
}
if toolDeltaCount != 1 {
t.Errorf("expected 1 input_json_delta, got %d", toolDeltaCount)
}
}
// TestContentBlockJSON_EmptyFieldsPresent verifies that empty text and thinking fields
// are serialized in JSON output. The Anthropic SDK requires these fields to be present
// (even when empty) in content_block_start events to properly accumulate streaming deltas.
// Without these fields, the SDK throws: "TypeError: unsupported operand type(s) for +=: 'NoneType' and 'str'"
func TestContentBlockJSON_EmptyFieldsPresent(t *testing.T) {
tests := []struct {
name string
block ContentBlock
wantKeys []string
}{
{
name: "text block includes empty text field",
block: ContentBlock{
Type: "text",
Text: ptr(""),
},
wantKeys: []string{"type", "text"},
},
{
name: "thinking block includes empty thinking field",
block: ContentBlock{
Type: "thinking",
Thinking: ptr(""),
},
wantKeys: []string{"type", "thinking"},
},
{
name: "text block with content",
block: ContentBlock{
Type: "text",
Text: ptr("hello"),
},
wantKeys: []string{"type", "text"},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
data, err := json.Marshal(tt.block)
if err != nil {
t.Fatalf("failed to marshal: %v", err)
}
var result map[string]any
if err := json.Unmarshal(data, &result); err != nil {
t.Fatalf("failed to unmarshal: %v", err)
}
for _, key := range tt.wantKeys {
if _, ok := result[key]; !ok {
t.Errorf("expected key %q to be present in JSON output, got: %s", key, string(data))
}
}
})
}
}
// TestStreamConverter_ContentBlockStartIncludesEmptyFields verifies that content_block_start
// events include the required empty fields for SDK compatibility.
func TestStreamConverter_ContentBlockStartIncludesEmptyFields(t *testing.T) {
t.Run("text block start includes empty text", func(t *testing.T) {
conv := NewStreamConverter("msg_123", "test-model")
resp := api.ChatResponse{
Model: "test-model",
Message: api.Message{Role: "assistant", Content: "hello"},
}
events := conv.Process(resp)
var foundTextStart bool
for _, e := range events {
if e.Event == "content_block_start" {
if start, ok := e.Data.(ContentBlockStartEvent); ok {
if start.ContentBlock.Type == "text" {
foundTextStart = true
// Marshal and verify the text field is present
data, _ := json.Marshal(start)
var result map[string]any
json.Unmarshal(data, &result)
cb := result["content_block"].(map[string]any)
if _, ok := cb["text"]; !ok {
t.Error("content_block_start for text should include 'text' field")
}
}
}
}
}
if !foundTextStart {
t.Error("expected text content_block_start event")
}
})
t.Run("thinking block start includes empty thinking", func(t *testing.T) {
conv := NewStreamConverter("msg_123", "test-model")
resp := api.ChatResponse{
Model: "test-model",
Message: api.Message{Role: "assistant", Thinking: "let me think..."},
}
events := conv.Process(resp)
var foundThinkingStart bool
for _, e := range events {
if e.Event == "content_block_start" {
if start, ok := e.Data.(ContentBlockStartEvent); ok {
if start.ContentBlock.Type == "thinking" {
foundThinkingStart = true
data, _ := json.Marshal(start)
var result map[string]any
json.Unmarshal(data, &result)
cb := result["content_block"].(map[string]any)
if _, ok := cb["thinking"]; !ok {
t.Error("content_block_start for thinking should include 'thinking' field")
}
}
}
}
}
if !foundThinkingStart {
t.Error("expected thinking content_block_start event")
}
})
}

View File

@@ -3,6 +3,7 @@ package api
import (
"encoding/json"
"fmt"
"iter"
"log/slog"
"math"
"os"
@@ -14,6 +15,7 @@ import (
"github.com/google/uuid"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/internal/orderedmap"
"github.com/ollama/ollama/types/model"
)
@@ -227,13 +229,79 @@ type ToolCallFunction struct {
Arguments ToolCallFunctionArguments `json:"arguments"`
}
type ToolCallFunctionArguments map[string]any
// ToolCallFunctionArguments holds tool call arguments in insertion order.
type ToolCallFunctionArguments struct {
om *orderedmap.Map[string, any]
}
// NewToolCallFunctionArguments creates a new empty ToolCallFunctionArguments.
func NewToolCallFunctionArguments() ToolCallFunctionArguments {
return ToolCallFunctionArguments{om: orderedmap.New[string, any]()}
}
// Get retrieves a value by key.
func (t *ToolCallFunctionArguments) Get(key string) (any, bool) {
if t == nil || t.om == nil {
return nil, false
}
return t.om.Get(key)
}
// Set sets a key-value pair, preserving insertion order.
func (t *ToolCallFunctionArguments) Set(key string, value any) {
if t == nil {
return
}
if t.om == nil {
t.om = orderedmap.New[string, any]()
}
t.om.Set(key, value)
}
// Len returns the number of arguments.
func (t *ToolCallFunctionArguments) Len() int {
if t == nil || t.om == nil {
return 0
}
return t.om.Len()
}
// All returns an iterator over all key-value pairs in insertion order.
func (t *ToolCallFunctionArguments) All() iter.Seq2[string, any] {
if t == nil || t.om == nil {
return func(yield func(string, any) bool) {}
}
return t.om.All()
}
// ToMap returns a regular map (order not preserved).
func (t *ToolCallFunctionArguments) ToMap() map[string]any {
if t == nil || t.om == nil {
return nil
}
return t.om.ToMap()
}
func (t *ToolCallFunctionArguments) String() string {
bts, _ := json.Marshal(t)
if t == nil || t.om == nil {
return "{}"
}
bts, _ := json.Marshal(t.om)
return string(bts)
}
func (t *ToolCallFunctionArguments) UnmarshalJSON(data []byte) error {
t.om = orderedmap.New[string, any]()
return json.Unmarshal(data, t.om)
}
func (t ToolCallFunctionArguments) MarshalJSON() ([]byte, error) {
if t.om == nil {
return []byte("{}"), nil
}
return json.Marshal(t.om)
}
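// Illustrative sketch (not part of this diff): how calling code might use the
// ordered-arguments type above; the key names here are hypothetical.
//
//	args := NewToolCallFunctionArguments()
//	args.Set("city", "Toronto")
//	args.Set("unit", "celsius")
//	for k, v := range args.All() { // iterates in insertion order
//	    fmt.Println(k, v)
//	}
//	data, _ := json.Marshal(args) // {"city":"Toronto","unit":"celsius"}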
type Tool struct {
Type string `json:"type"`
Items any `json:"items,omitempty"`
@@ -282,12 +350,78 @@ func (pt PropertyType) String() string {
return fmt.Sprintf("%v", []string(pt))
}
// ToolPropertiesMap holds tool properties in insertion order.
type ToolPropertiesMap struct {
om *orderedmap.Map[string, ToolProperty]
}
// NewToolPropertiesMap creates a new empty ToolPropertiesMap.
func NewToolPropertiesMap() *ToolPropertiesMap {
return &ToolPropertiesMap{om: orderedmap.New[string, ToolProperty]()}
}
// Get retrieves a property by name.
func (t *ToolPropertiesMap) Get(key string) (ToolProperty, bool) {
if t == nil || t.om == nil {
return ToolProperty{}, false
}
return t.om.Get(key)
}
// Set sets a property, preserving insertion order.
func (t *ToolPropertiesMap) Set(key string, value ToolProperty) {
if t == nil {
return
}
if t.om == nil {
t.om = orderedmap.New[string, ToolProperty]()
}
t.om.Set(key, value)
}
// Len returns the number of properties.
func (t *ToolPropertiesMap) Len() int {
if t == nil || t.om == nil {
return 0
}
return t.om.Len()
}
// All returns an iterator over all properties in insertion order.
func (t *ToolPropertiesMap) All() iter.Seq2[string, ToolProperty] {
if t == nil || t.om == nil {
return func(yield func(string, ToolProperty) bool) {}
}
return t.om.All()
}
// ToMap returns a regular map (order not preserved).
func (t *ToolPropertiesMap) ToMap() map[string]ToolProperty {
if t == nil || t.om == nil {
return nil
}
return t.om.ToMap()
}
func (t ToolPropertiesMap) MarshalJSON() ([]byte, error) {
if t.om == nil {
return []byte("null"), nil
}
return json.Marshal(t.om)
}
func (t *ToolPropertiesMap) UnmarshalJSON(data []byte) error {
t.om = orderedmap.New[string, ToolProperty]()
return json.Unmarshal(data, t.om)
}
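// Illustrative sketch (not part of this diff): building tool parameters whose
// properties keep their declared order; the property names are hypothetical.
//
//	props := NewToolPropertiesMap()
//	props.Set("location", ToolProperty{Type: PropertyType{"string"}, Description: "City name"})
//	props.Set("unit", ToolProperty{Type: PropertyType{"string"}})
//	params := ToolFunctionParameters{Type: "object", Required: []string{"location"}, Properties: props}
//	out, _ := json.Marshal(params) // "properties" serializes as {"location":...,"unit":...}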
type ToolProperty struct {
AnyOf []ToolProperty `json:"anyOf,omitempty"`
Type PropertyType `json:"type,omitempty"`
Items any `json:"items,omitempty"`
Description string `json:"description,omitempty"`
Enum []any `json:"enum,omitempty"`
AnyOf []ToolProperty `json:"anyOf,omitempty"`
Type PropertyType `json:"type,omitempty"`
Items any `json:"items,omitempty"`
Description string `json:"description,omitempty"`
Enum []any `json:"enum,omitempty"`
Properties *ToolPropertiesMap `json:"properties,omitempty"`
}
// ToTypeScriptType converts a ToolProperty to a TypeScript type string
@@ -336,11 +470,11 @@ func mapToTypeScriptType(jsonType string) string {
}
type ToolFunctionParameters struct {
Type string `json:"type"`
Defs any `json:"$defs,omitempty"`
Items any `json:"items,omitempty"`
Required []string `json:"required,omitempty"`
Properties map[string]ToolProperty `json:"properties"`
Type string `json:"type"`
Defs any `json:"$defs,omitempty"`
Items any `json:"items,omitempty"`
Required []string `json:"required,omitempty"`
Properties *ToolPropertiesMap `json:"properties"`
}
func (t *ToolFunctionParameters) String() string {
@@ -553,6 +687,9 @@ type CreateRequest struct {
Renderer string `json:"renderer,omitempty"`
Parser string `json:"parser,omitempty"`
// Requires is the minimum version of Ollama required by the model.
Requires string `json:"requires,omitempty"`
// Info is a map of additional information for the model
Info map[string]any `json:"info,omitempty"`
@@ -603,6 +740,7 @@ type ShowResponse struct {
Tensors []Tensor `json:"tensors,omitempty"`
Capabilities []model.Capability `json:"capabilities,omitempty"`
ModifiedAt time.Time `json:"modified_at,omitempty"`
Requires string `json:"requires,omitempty"`
}
// CopyRequest is the request passed to [Client.Copy].

View File

@@ -11,6 +11,24 @@ import (
"github.com/stretchr/testify/require"
)
// testPropsMap creates a ToolPropertiesMap from a map (convenience function for tests, order not preserved)
func testPropsMap(m map[string]ToolProperty) *ToolPropertiesMap {
props := NewToolPropertiesMap()
for k, v := range m {
props.Set(k, v)
}
return props
}
// testArgs creates ToolCallFunctionArguments from a map (convenience function for tests, order not preserved)
func testArgs(m map[string]any) ToolCallFunctionArguments {
args := NewToolCallFunctionArguments()
for k, v := range m {
args.Set(k, v)
}
return args
}
func TestKeepAliveParsingFromJSON(t *testing.T) {
tests := []struct {
name string
@@ -309,9 +327,9 @@ func TestToolFunctionParameters_MarshalJSON(t *testing.T) {
input: ToolFunctionParameters{
Type: "object",
Required: []string{"name"},
Properties: map[string]ToolProperty{
Properties: testPropsMap(map[string]ToolProperty{
"name": {Type: PropertyType{"string"}},
},
}),
},
expected: `{"type":"object","required":["name"],"properties":{"name":{"type":"string"}}}`,
},
@@ -319,9 +337,9 @@ func TestToolFunctionParameters_MarshalJSON(t *testing.T) {
name: "no required",
input: ToolFunctionParameters{
Type: "object",
Properties: map[string]ToolProperty{
Properties: testPropsMap(map[string]ToolProperty{
"name": {Type: PropertyType{"string"}},
},
}),
},
expected: `{"type":"object","properties":{"name":{"type":"string"}}}`,
},
@@ -339,7 +357,7 @@ func TestToolFunctionParameters_MarshalJSON(t *testing.T) {
func TestToolCallFunction_IndexAlwaysMarshals(t *testing.T) {
fn := ToolCallFunction{
Name: "echo",
Arguments: ToolCallFunctionArguments{"message": "hi"},
Arguments: testArgs(map[string]any{"message": "hi"}),
}
data, err := json.Marshal(fn)
@@ -504,6 +522,116 @@ func TestThinking_UnmarshalJSON(t *testing.T) {
}
}
func TestToolPropertyNestedProperties(t *testing.T) {
tests := []struct {
name string
input string
expected ToolProperty
}{
{
name: "nested object properties",
input: `{
"type": "object",
"description": "Location details",
"properties": {
"address": {
"type": "string",
"description": "Street address"
},
"city": {
"type": "string",
"description": "City name"
}
}
}`,
expected: ToolProperty{
Type: PropertyType{"object"},
Description: "Location details",
Properties: testPropsMap(map[string]ToolProperty{
"address": {
Type: PropertyType{"string"},
Description: "Street address",
},
"city": {
Type: PropertyType{"string"},
Description: "City name",
},
}),
},
},
{
name: "deeply nested properties",
input: `{
"type": "object",
"description": "Event",
"properties": {
"location": {
"type": "object",
"description": "Location",
"properties": {
"coordinates": {
"type": "object",
"description": "GPS coordinates",
"properties": {
"lat": {"type": "number", "description": "Latitude"},
"lng": {"type": "number", "description": "Longitude"}
}
}
}
}
}
}`,
expected: ToolProperty{
Type: PropertyType{"object"},
Description: "Event",
Properties: testPropsMap(map[string]ToolProperty{
"location": {
Type: PropertyType{"object"},
Description: "Location",
Properties: testPropsMap(map[string]ToolProperty{
"coordinates": {
Type: PropertyType{"object"},
Description: "GPS coordinates",
Properties: testPropsMap(map[string]ToolProperty{
"lat": {Type: PropertyType{"number"}, Description: "Latitude"},
"lng": {Type: PropertyType{"number"}, Description: "Longitude"},
}),
},
}),
},
}),
},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
var prop ToolProperty
err := json.Unmarshal([]byte(tt.input), &prop)
require.NoError(t, err)
// Compare JSON representations since pointer comparison doesn't work
expectedJSON, err := json.Marshal(tt.expected)
require.NoError(t, err)
actualJSON, err := json.Marshal(prop)
require.NoError(t, err)
assert.JSONEq(t, string(expectedJSON), string(actualJSON))
// Round-trip test: marshal and unmarshal again
data, err := json.Marshal(prop)
require.NoError(t, err)
var prop2 ToolProperty
err = json.Unmarshal(data, &prop2)
require.NoError(t, err)
prop2JSON, err := json.Marshal(prop2)
require.NoError(t, err)
assert.JSONEq(t, string(expectedJSON), string(prop2JSON))
})
}
}
func TestToolFunctionParameters_String(t *testing.T) {
tests := []struct {
name string
@@ -515,12 +643,12 @@ func TestToolFunctionParameters_String(t *testing.T) {
params: ToolFunctionParameters{
Type: "object",
Required: []string{"name"},
Properties: map[string]ToolProperty{
Properties: testPropsMap(map[string]ToolProperty{
"name": {
Type: PropertyType{"string"},
Description: "The name of the person",
},
},
}),
},
expected: `{"type":"object","required":["name"],"properties":{"name":{"type":"string","description":"The name of the person"}}}`,
},
@@ -537,7 +665,7 @@ func TestToolFunctionParameters_String(t *testing.T) {
s.Self = s
return s
}(),
Properties: map[string]ToolProperty{},
Properties: testPropsMap(map[string]ToolProperty{}),
},
expected: "",
},
@@ -550,3 +678,235 @@ func TestToolFunctionParameters_String(t *testing.T) {
})
}
}
func TestToolCallFunctionArguments_OrderPreservation(t *testing.T) {
t.Run("marshal preserves insertion order", func(t *testing.T) {
args := NewToolCallFunctionArguments()
args.Set("zebra", "z")
args.Set("apple", "a")
args.Set("mango", "m")
data, err := json.Marshal(args)
require.NoError(t, err)
// Should preserve insertion order, not alphabetical
assert.Equal(t, `{"zebra":"z","apple":"a","mango":"m"}`, string(data))
})
t.Run("unmarshal preserves JSON order", func(t *testing.T) {
jsonData := `{"zebra":"z","apple":"a","mango":"m"}`
var args ToolCallFunctionArguments
err := json.Unmarshal([]byte(jsonData), &args)
require.NoError(t, err)
// Verify iteration order matches JSON order
var keys []string
for k := range args.All() {
keys = append(keys, k)
}
assert.Equal(t, []string{"zebra", "apple", "mango"}, keys)
})
t.Run("round trip preserves order", func(t *testing.T) {
original := `{"z":1,"a":2,"m":3,"b":4}`
var args ToolCallFunctionArguments
err := json.Unmarshal([]byte(original), &args)
require.NoError(t, err)
data, err := json.Marshal(args)
require.NoError(t, err)
assert.Equal(t, original, string(data))
})
t.Run("String method returns ordered JSON", func(t *testing.T) {
args := NewToolCallFunctionArguments()
args.Set("c", 3)
args.Set("a", 1)
args.Set("b", 2)
assert.Equal(t, `{"c":3,"a":1,"b":2}`, args.String())
})
t.Run("Get retrieves correct values", func(t *testing.T) {
args := NewToolCallFunctionArguments()
args.Set("key1", "value1")
args.Set("key2", 42)
v, ok := args.Get("key1")
assert.True(t, ok)
assert.Equal(t, "value1", v)
v, ok = args.Get("key2")
assert.True(t, ok)
assert.Equal(t, 42, v)
_, ok = args.Get("nonexistent")
assert.False(t, ok)
})
t.Run("Len returns correct count", func(t *testing.T) {
args := NewToolCallFunctionArguments()
assert.Equal(t, 0, args.Len())
args.Set("a", 1)
assert.Equal(t, 1, args.Len())
args.Set("b", 2)
assert.Equal(t, 2, args.Len())
})
t.Run("empty args marshal to empty object", func(t *testing.T) {
args := NewToolCallFunctionArguments()
data, err := json.Marshal(args)
require.NoError(t, err)
assert.Equal(t, `{}`, string(data))
})
t.Run("zero value args marshal to empty object", func(t *testing.T) {
var args ToolCallFunctionArguments
assert.Equal(t, "{}", args.String())
})
}
func TestToolPropertiesMap_OrderPreservation(t *testing.T) {
t.Run("marshal preserves insertion order", func(t *testing.T) {
props := NewToolPropertiesMap()
props.Set("zebra", ToolProperty{Type: PropertyType{"string"}})
props.Set("apple", ToolProperty{Type: PropertyType{"number"}})
props.Set("mango", ToolProperty{Type: PropertyType{"boolean"}})
data, err := json.Marshal(props)
require.NoError(t, err)
// Should preserve insertion order, not alphabetical
expected := `{"zebra":{"type":"string"},"apple":{"type":"number"},"mango":{"type":"boolean"}}`
assert.Equal(t, expected, string(data))
})
t.Run("unmarshal preserves JSON order", func(t *testing.T) {
jsonData := `{"zebra":{"type":"string"},"apple":{"type":"number"},"mango":{"type":"boolean"}}`
var props ToolPropertiesMap
err := json.Unmarshal([]byte(jsonData), &props)
require.NoError(t, err)
// Verify iteration order matches JSON order
var keys []string
for k := range props.All() {
keys = append(keys, k)
}
assert.Equal(t, []string{"zebra", "apple", "mango"}, keys)
})
t.Run("round trip preserves order", func(t *testing.T) {
original := `{"z":{"type":"string"},"a":{"type":"number"},"m":{"type":"boolean"}}`
var props ToolPropertiesMap
err := json.Unmarshal([]byte(original), &props)
require.NoError(t, err)
data, err := json.Marshal(props)
require.NoError(t, err)
assert.Equal(t, original, string(data))
})
t.Run("Get retrieves correct values", func(t *testing.T) {
props := NewToolPropertiesMap()
props.Set("name", ToolProperty{Type: PropertyType{"string"}, Description: "The name"})
props.Set("age", ToolProperty{Type: PropertyType{"integer"}, Description: "The age"})
v, ok := props.Get("name")
assert.True(t, ok)
assert.Equal(t, "The name", v.Description)
v, ok = props.Get("age")
assert.True(t, ok)
assert.Equal(t, "The age", v.Description)
_, ok = props.Get("nonexistent")
assert.False(t, ok)
})
t.Run("Len returns correct count", func(t *testing.T) {
props := NewToolPropertiesMap()
assert.Equal(t, 0, props.Len())
props.Set("a", ToolProperty{})
assert.Equal(t, 1, props.Len())
props.Set("b", ToolProperty{})
assert.Equal(t, 2, props.Len())
})
t.Run("nil props marshal to null", func(t *testing.T) {
var props *ToolPropertiesMap
data, err := json.Marshal(props)
require.NoError(t, err)
assert.Equal(t, `null`, string(data))
})
t.Run("ToMap returns regular map", func(t *testing.T) {
props := NewToolPropertiesMap()
props.Set("a", ToolProperty{Type: PropertyType{"string"}})
props.Set("b", ToolProperty{Type: PropertyType{"number"}})
m := props.ToMap()
assert.Equal(t, 2, len(m))
assert.Equal(t, PropertyType{"string"}, m["a"].Type)
assert.Equal(t, PropertyType{"number"}, m["b"].Type)
})
}
func TestToolCallFunctionArguments_ComplexValues(t *testing.T) {
t.Run("nested objects preserve order", func(t *testing.T) {
jsonData := `{"outer":{"z":1,"a":2},"simple":"value"}`
var args ToolCallFunctionArguments
err := json.Unmarshal([]byte(jsonData), &args)
require.NoError(t, err)
// Outer keys should be in order
var keys []string
for k := range args.All() {
keys = append(keys, k)
}
assert.Equal(t, []string{"outer", "simple"}, keys)
})
t.Run("arrays as values", func(t *testing.T) {
args := NewToolCallFunctionArguments()
args.Set("items", []string{"a", "b", "c"})
args.Set("numbers", []int{1, 2, 3})
data, err := json.Marshal(args)
require.NoError(t, err)
assert.Equal(t, `{"items":["a","b","c"],"numbers":[1,2,3]}`, string(data))
})
}
func TestToolPropertiesMap_NestedProperties(t *testing.T) {
t.Run("nested properties preserve order", func(t *testing.T) {
props := NewToolPropertiesMap()
nestedProps := NewToolPropertiesMap()
nestedProps.Set("z_field", ToolProperty{Type: PropertyType{"string"}})
nestedProps.Set("a_field", ToolProperty{Type: PropertyType{"number"}})
props.Set("outer", ToolProperty{
Type: PropertyType{"object"},
Properties: nestedProps,
})
data, err := json.Marshal(props)
require.NoError(t, err)
// Both outer and inner should preserve order
expected := `{"outer":{"type":"object","properties":{"z_field":{"type":"string"},"a_field":{"type":"number"}}}}`
assert.Equal(t, expected, string(data))
})
}

View File

@@ -147,6 +147,7 @@ export const highlighterPromise = createHighlighter({
"c",
"cpp",
"sql",
"swift",
"yaml",
"markdown",
],

View File

@@ -997,7 +997,7 @@ func (s *Server) chat(w http.ResponseWriter, r *http.Request) error {
for _, toolCall := range res.Message.ToolCalls {
// continues loop as tools were executed
toolsExecuted = true
result, content, err := registry.Execute(ctx, toolCall.Function.Name, toolCall.Function.Arguments)
result, content, err := registry.Execute(ctx, toolCall.Function.Name, toolCall.Function.Arguments.ToMap())
if err != nil {
errContent := fmt.Sprintf("Error: %v", err)
toolErrMsg := store.NewMessage("tool", errContent, nil)
@@ -1558,13 +1558,13 @@ func convertToOllamaTool(toolSchema map[string]any) api.Tool {
tool.Function.Parameters.Type = "object"
tool.Function.Parameters.Required = []string{}
tool.Function.Parameters.Properties = make(map[string]api.ToolProperty)
tool.Function.Parameters.Properties = api.NewToolPropertiesMap()
if schemaProps, ok := toolSchema["schema"].(map[string]any); ok {
tool.Function.Parameters.Type = getStringFromMap(schemaProps, "type", "object")
if props, ok := schemaProps["properties"].(map[string]any); ok {
tool.Function.Parameters.Properties = make(map[string]api.ToolProperty)
tool.Function.Parameters.Properties = api.NewToolPropertiesMap()
for propName, propDef := range props {
if propMap, ok := propDef.(map[string]any); ok {
@@ -1572,7 +1572,7 @@ func convertToOllamaTool(toolSchema map[string]any) api.Tool {
Type: api.PropertyType{getStringFromMap(propMap, "type", "string")},
Description: getStringFromMap(propMap, "description", ""),
}
tool.Function.Parameters.Properties[propName] = prop
tool.Function.Parameters.Properties.Set(propName, prop)
}
}
}

View File

@@ -45,6 +45,9 @@ import (
"github.com/ollama/ollama/types/model"
"github.com/ollama/ollama/types/syncmap"
"github.com/ollama/ollama/version"
xcmd "github.com/ollama/ollama/x/cmd"
"github.com/ollama/ollama/x/imagegen"
imagegenclient "github.com/ollama/ollama/x/imagegen/client"
)
const ConnectInstructions = "To sign in, navigate to:\n %s\n\n"
@@ -95,6 +98,10 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
filename, err := getModelfileName(cmd)
if os.IsNotExist(err) {
if filename == "" {
// No Modelfile found - check if current directory is an image gen model
if imagegen.IsTensorModelDir(".") {
return imagegenclient.CreateModel(args[0], ".", p)
}
reader = strings.NewReader("FROM .\n")
} else {
return errModelfileNotFound
@@ -456,6 +463,15 @@ func RunHandler(cmd *cobra.Command, args []string) error {
}
name := args[0]
// Check if this is a known image generation model (skip Show/Pull)
if imagegen.HasTensorLayers(name) {
if opts.Prompt == "" && !interactive {
return errors.New("image generation models require a prompt. Usage: ollama run " + name + " \"your prompt here\"")
}
return imagegen.RunCLI(cmd, name, opts.Prompt, interactive, opts.KeepAlive)
}
info, err := func() (*api.ShowResponse, error) {
showReq := &api.ShowRequest{Name: name}
info, err := client.Show(cmd.Context(), showReq)
@@ -517,6 +533,10 @@ func RunHandler(cmd *cobra.Command, args []string) error {
return generateEmbedding(cmd, name, opts.Prompt, opts.KeepAlive, truncate, dimensions)
}
// Check for experimental flag
isExperimental, _ := cmd.Flags().GetBool("experimental")
yoloMode, _ := cmd.Flags().GetBool("experimental-yolo")
if interactive {
if err := loadOrUnloadModel(cmd, &opts); err != nil {
var sErr api.AuthorizationError
@@ -543,6 +563,11 @@ func RunHandler(cmd *cobra.Command, args []string) error {
}
}
// Use experimental agent loop with tools
if isExperimental {
return xcmd.GenerateInteractive(cmd, opts.Model, opts.WordWrap, opts.Options, opts.Think, opts.HideThinking, opts.KeepAlive, yoloMode)
}
return generateInteractive(cmd, opts)
}
return generate(cmd, opts)
@@ -812,6 +837,11 @@ func DeleteHandler(cmd *cobra.Command, args []string) error {
}
func ShowHandler(cmd *cobra.Command, args []string) error {
// Check if this is an image generation model
if imagegen.HasTensorLayers(args[0]) {
return imagegen.Show(args[0], os.Stdout)
}
client, err := api.ClientFromEnvironment()
if err != nil {
return err
@@ -943,6 +973,9 @@ func showInfo(resp *api.ShowResponse, verbose bool, w io.Writer) error {
rows = append(rows, []string{"", "parameters", resp.Details.ParameterSize})
}
rows = append(rows, []string{"", "quantization", resp.Details.QuantizationLevel})
if resp.Requires != "" {
rows = append(rows, []string{"", "requires", resp.Requires})
}
return
})
@@ -1751,6 +1784,11 @@ func NewCLI() *cobra.Command {
runCmd.Flags().Bool("hidethinking", false, "Hide thinking output (if provided)")
runCmd.Flags().Bool("truncate", false, "For embedding models: truncate inputs exceeding context length (default: true). Set --truncate=false to error instead")
runCmd.Flags().Int("dimensions", 0, "Truncate output embeddings to specified dimension (embedding models only)")
runCmd.Flags().Bool("experimental", false, "Enable experimental agent loop with tools")
runCmd.Flags().Bool("experimental-yolo", false, "Skip all tool approval prompts (use with caution)")
// Image generation flags (width, height, steps, seed, etc.)
imagegen.RegisterFlags(runCmd)
stopCmd := &cobra.Command{
Use: "stop MODEL",

View File

@@ -291,6 +291,31 @@ Weigh anchor!
t.Errorf("unexpected output (-want +got):\n%s", diff)
}
})
t.Run("min version", func(t *testing.T) {
var b bytes.Buffer
if err := showInfo(&api.ShowResponse{
Details: api.ModelDetails{
Family: "test",
ParameterSize: "7B",
QuantizationLevel: "FP16",
},
Requires: "0.14.0",
}, false, &b); err != nil {
t.Fatal(err)
}
expect := ` Model
architecture test
parameters 7B
quantization FP16
requires 0.14.0
`
if diff := cmp.Diff(expect, b.String()); diff != "" {
t.Errorf("unexpected output (-want +got):\n%s", diff)
}
})
}
func TestDeleteHandler(t *testing.T) {

View File

@@ -40,6 +40,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
fmt.Fprintln(os.Stderr, " /bye Exit")
fmt.Fprintln(os.Stderr, " /?, /help Help for a command")
fmt.Fprintln(os.Stderr, " /? shortcuts Help for keyboard shortcuts")
fmt.Fprintln(os.Stderr, "")
fmt.Fprintln(os.Stderr, "Use \"\"\" to begin a multi-line message.")

View File

@@ -6,11 +6,14 @@ import (
"errors"
"fmt"
"io/fs"
"iter"
"log/slog"
"maps"
"os"
"slices"
"strings"
ofs "github.com/ollama/ollama/fs"
"github.com/ollama/ollama/fs/ggml"
)
@@ -18,8 +21,13 @@ type ModelParameters struct {
Architectures []string `json:"architectures"`
VocabSize uint32 `json:"vocab_size"`
// TODO is this needed?
ModelType string `json:"model_type"`
TextModel struct {
VocabSize uint32 `json:"vocab_size"`
VocabSize uint32 `json:"vocab_size"`
HiddenSize uint32 `json:"hidden_size"`
ModelType string `json:"model_type"`
} `json:"text_config"`
}
@@ -33,8 +41,94 @@ type AdapterParameters struct {
} `json:"lora_parameters"`
}
func (ModelParameters) KV(t *Tokenizer) ggml.KV {
kv := ggml.KV{
type KV map[string]any
func (kv KV) Architecture() string {
return kv.String("general.architecture", "unknown")
}
type valueTypes interface {
uint8 | int8 | uint16 | int16 |
uint32 | int32 | uint64 | int64 |
string | float32 | float64 | bool
}
type arrayValueTypes interface {
[]uint8 | []int8 | []uint16 | []int16 |
[]uint32 | []int32 | []uint64 | []int64 |
[]string | []float32 | []float64 | []bool
}
func keyValue[T valueTypes | arrayValueTypes](kv KV, key string, defaultValue ...T) (T, bool) {
if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
key = kv.Architecture() + "." + key
}
if val, ok := kv[key].(T); ok {
return val, true
}
return defaultValue[0], false
}
func (kv KV) String(key string, defaultValue ...string) string {
val, _ := keyValue(kv, key, append(defaultValue, "")...)
return val
}
func (kv KV) Uint(key string, defaultValue ...uint32) uint32 {
val, _ := keyValue(kv, key, append(defaultValue, 0)...)
return val
}
func (kv KV) Float(key string, defaultValue ...float32) float32 {
val, _ := keyValue(kv, key, append(defaultValue, 0)...)
return val
}
func (kv KV) Bool(key string, defaultValue ...bool) bool {
val, _ := keyValue(kv, key, append(defaultValue, false)...)
return val
}
func (kv KV) Strings(key string, defaultValue ...[]string) []string {
val, _ := keyValue(kv, key, append(defaultValue, []string{""})...)
return val
}
func (kv KV) Ints(key string, defaultValue ...[]int32) []int32 {
val, _ := keyValue(kv, key, append(defaultValue, []int32{0})...)
return val
}
func (kv KV) Uints(key string, defaultValue ...[]uint32) []uint32 {
val, _ := keyValue(kv, key, append(defaultValue, []uint32{0})...)
return val
}
func (kv KV) Floats(key string, defaultValue ...[]float32) []float32 {
val, _ := keyValue(kv, key, append(defaultValue, []float32{0})...)
return val
}
func (kv KV) Bools(key string, defaultValue ...[]bool) []bool {
val, _ := keyValue(kv, key, append(defaultValue, []bool{false})...)
return val
}
func (kv KV) Len() int {
return len(kv)
}
func (kv KV) Keys() iter.Seq[string] {
return maps.Keys(kv)
}
func (kv KV) Value(key string) any {
return kv[key]
}
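// Illustrative sketch (not part of this diff): keyValue prefixes bare keys with
// the architecture, while "general." and "tokenizer." keys are looked up as-is.
// The block_count key below is a hypothetical example.
//
//	kv := KV{"general.architecture": "llama", "llama.block_count": uint32(32)}
//	_ = kv.Uint("block_count")            // resolves to "llama.block_count" -> 32
//	_ = kv.String("general.architecture") // used as-is -> "llama"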
func (ModelParameters) KV(t *Tokenizer) KV {
kv := KV{
"general.file_type": uint32(1),
"general.quantization_version": uint32(2),
"tokenizer.ggml.pre": t.Pre,
@@ -63,7 +157,7 @@ func (ModelParameters) KV(t *Tokenizer) ggml.KV {
return kv
}
func (p AdapterParameters) KV() ggml.KV {
func (p AdapterParameters) KV() KV {
var alpha float32
if p.LoraParameters.Alpha == 0 {
alpha = float32(p.Alpha)
@@ -71,7 +165,7 @@ func (p AdapterParameters) KV() ggml.KV {
alpha = p.LoraParameters.Alpha
}
kv := ggml.KV{
kv := KV{
"adapter.lora.alpha": alpha,
"adapter.type": "lora",
"general.file_type": uint32(1),
@@ -88,9 +182,14 @@ func (ModelParameters) specialTokenTypes() []string {
}
}
type ModelConverter interface {
type ModelKV interface {
// KV maps parameters to LLM key-values
KV(*Tokenizer) ggml.KV
KV(*Tokenizer) KV
}
type ModelConverter interface {
ModelKV
// Tensors maps input tensors to LLM tensors. Model specific modifications can be done here.
Tensors([]Tensor) []*ggml.Tensor
// Replacements returns a list of string pairs to replace in tensor names.
@@ -107,7 +206,7 @@ type moreParser interface {
type AdapterConverter interface {
// KV maps parameters to LLM key-values
KV(ggml.KV) ggml.KV
KV(ofs.Config) KV
// Tensors maps input tensors to LLM tensors. Adapter specific modifications can be done here.
Tensors([]Tensor) []*ggml.Tensor
// Replacements returns a list of string pairs to replace in tensor names.
@@ -115,7 +214,7 @@ type AdapterConverter interface {
Replacements() []string
}
func ConvertAdapter(fsys fs.FS, f *os.File, baseKV ggml.KV) error {
func ConvertAdapter(fsys fs.FS, f *os.File, baseKV ofs.Config) error {
bts, err := fs.ReadFile(fsys, "adapter_config.json")
if err != nil {
return err
@@ -126,8 +225,8 @@ func ConvertAdapter(fsys fs.FS, f *os.File, baseKV ggml.KV) error {
return err
}
arch, ok := baseKV["general.architecture"]
if !ok {
arch := baseKV.Architecture()
if arch == "" {
return errors.New("architecture not set for the base model")
}
@@ -153,23 +252,19 @@ func ConvertAdapter(fsys fs.FS, f *os.File, baseKV ggml.KV) error {
return writeFile(f, conv.KV(baseKV), conv.Tensors(ts))
}
// Convert writes an Ollama compatible model to the provided io.WriteSeeker based on configurations
// and files it finds in the input path.
// Supported input model formats include safetensors.
// Supported input tokenizers files include tokenizer.json (preferred) and tokenizer.model.
func ConvertModel(fsys fs.FS, f *os.File) error {
func LoadModelMetadata(fsys fs.FS) (ModelKV, *Tokenizer, error) {
bts, err := fs.ReadFile(fsys, "config.json")
if err != nil {
return err
return nil, nil, err
}
var p ModelParameters
if err := json.Unmarshal(bts, &p); err != nil {
return err
return nil, nil, err
}
if len(p.Architectures) < 1 {
return errors.New("unknown architecture")
return nil, nil, errors.New("unknown architecture")
}
var conv ModelConverter
@@ -217,22 +312,22 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
case "DeepseekV3ForCausalLM":
conv = &deepseek2Model{}
default:
return fmt.Errorf("unsupported architecture %q", p.Architectures[0])
return nil, nil, fmt.Errorf("unsupported architecture %q", p.Architectures[0])
}
if err := json.Unmarshal(bts, conv); err != nil {
return err
return nil, nil, err
}
if t, ok := conv.(moreParser); ok {
if err := t.parseMore(fsys); err != nil {
return err
return nil, nil, err
}
}
t, err := parseTokenizer(fsys, conv.specialTokenTypes())
if err != nil {
return err
return nil, nil, err
}
vocabSize := int(cmp.Or(p.VocabSize, p.TextModel.VocabSize))
@@ -254,6 +349,19 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
default:
slog.Debug("vocabulary", "size", len(t.Vocabulary.Tokens))
}
return conv, t, nil
}
// Convert writes an Ollama compatible model to the provided io.WriteSeeker based on configurations
// and files it finds in the input path.
// Supported input model formats include safetensors.
// Supported input tokenizers files include tokenizer.json (preferred) and tokenizer.model.
func ConvertModel(fsys fs.FS, f *os.File) error {
kv, t, err := LoadModelMetadata(fsys)
if err != nil {
return err
}
conv := kv.(ModelConverter)
ts, err := parseTensors(fsys, strings.NewReplacer(conv.Replacements()...))
if err != nil {
@@ -263,7 +371,7 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
return writeFile(f, conv.KV(t), conv.Tensors(ts))
}
func writeFile(f *os.File, kv ggml.KV, ts []*ggml.Tensor) error {
func writeFile(f *os.File, kv KV, ts []*ggml.Tensor) error {
for i := range ts {
ts[i].Shape = slices.Clone(ts[i].Shape)
slices.Reverse(ts[i].Shape)

View File

@@ -88,7 +88,7 @@ func (p *bertModel) parseMore(fsys fs.FS) error {
return nil
}
func (p *bertModel) KV(t *Tokenizer) ggml.KV {
func (p *bertModel) KV(t *Tokenizer) KV {
kv := p.ModelParameters.KV(t)
kv["general.architecture"] = "bert"
kv["bert.attention.causal"] = false

View File

@@ -24,7 +24,7 @@ type commandrModel struct {
var _ ModelConverter = (*commandrModel)(nil)
func (p *commandrModel) KV(t *Tokenizer) ggml.KV {
func (p *commandrModel) KV(t *Tokenizer) KV {
kv := p.ModelParameters.KV(t)
kv["general.architecture"] = "command-r"
kv["general.name"] = "command-r"

View File

@@ -47,7 +47,7 @@ type deepseek2Model struct {
Architecture string
}
func (p *deepseek2Model) KV(t *Tokenizer) ggml.KV {
func (p *deepseek2Model) KV(t *Tokenizer) KV {
kv := p.ModelParameters.KV(t)
kv["general.architecture"] = "deepseek2"
kv["general.type"] = "model"

View File

@@ -41,7 +41,7 @@ type deepseekocr struct {
} `json:"vision_config"`
}
func (m *deepseekocr) KV(t *Tokenizer) ggml.KV {
func (m *deepseekocr) KV(t *Tokenizer) KV {
kv := m.ModelParameters.KV(t)
kv["general.architecture"] = "deepseekocr"
kv["block_count"] = m.LanguageConfig.HiddenLayers

View File

@@ -23,7 +23,7 @@ type gemmaModel struct {
var _ ModelConverter = (*gemmaModel)(nil)
func (p *gemmaModel) KV(t *Tokenizer) ggml.KV {
func (p *gemmaModel) KV(t *Tokenizer) KV {
kv := p.ModelParameters.KV(t)
kv["general.architecture"] = "gemma"
kv["gemma.context_length"] = p.MaxPositionEmbeddings

View File

@@ -1,7 +1,5 @@
package convert
import "github.com/ollama/ollama/fs/ggml"
type gemma2Model struct {
gemmaModel
SlidingWindow uint32 `json:"sliding_window"`
@@ -9,7 +7,7 @@ type gemma2Model struct {
FinalLogitSoftcap float32 `json:"final_logit_softcapping"`
}
func (p *gemma2Model) KV(t *Tokenizer) ggml.KV {
func (p *gemma2Model) KV(t *Tokenizer) KV {
kv := p.ModelParameters.KV(t)
kv["general.architecture"] = "gemma2"
kv["gemma2.context_length"] = p.MaxPositionEmbeddings

View File

@@ -6,6 +6,7 @@ import (
"github.com/pdevine/tensor"
"github.com/pdevine/tensor/native"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/fs/ggml"
)
@@ -15,7 +16,7 @@ type gemma2Adapter struct {
var _ AdapterConverter = (*gemma2Adapter)(nil)
func (p *gemma2Adapter) KV(baseKV ggml.KV) ggml.KV {
func (p *gemma2Adapter) KV(baseKV fs.Config) KV {
kv := p.AdapterParameters.KV()
kv["general.architecture"] = "gemma2"
return kv

View File

@@ -3,8 +3,6 @@ package convert
import (
"cmp"
"slices"
"github.com/ollama/ollama/fs/ggml"
)
type gemma3Model struct {
@@ -55,7 +53,7 @@ const (
gemma27BLayerCount = 62
)
func (p *gemma3Model) KV(t *Tokenizer) ggml.KV {
func (p *gemma3Model) KV(t *Tokenizer) KV {
kv := p.ModelParameters.KV(t)
kv["general.architecture"] = "gemma3"

View File

@@ -38,7 +38,7 @@ type gemma3nModel struct {
VisionModel struct{} `json:"vision_config"`
}
func (m *gemma3nModel) KV(t *Tokenizer) ggml.KV {
func (m *gemma3nModel) KV(t *Tokenizer) KV {
kv := m.ModelParameters.KV(t)
kv["general.architecture"] = "gemma3n"
kv["gemma3n.activation_sparsity_scale"] = slices.Collect(func(yield func(float32) bool) {

View File

@@ -37,7 +37,7 @@ type gptossModel struct {
var _ ModelConverter = (*gptossModel)(nil)
func (m *gptossModel) KV(t *Tokenizer) ggml.KV {
func (m *gptossModel) KV(t *Tokenizer) KV {
kv := m.ModelParameters.KV(t)
kv["general.architecture"] = "gptoss"
kv["general.file_type"] = uint32(4)

View File

@@ -48,7 +48,7 @@ type llamaModel struct {
var _ ModelConverter = (*llamaModel)(nil)
func (p *llamaModel) KV(t *Tokenizer) ggml.KV {
func (p *llamaModel) KV(t *Tokenizer) KV {
kv := p.ModelParameters.KV(t)
kv["general.architecture"] = "llama"
kv["llama.vocab_size"] = p.VocabSize

View File

@@ -35,7 +35,7 @@ type llama4Model struct {
}
// KV implements ModelConverter.
func (p *llama4Model) KV(t *Tokenizer) ggml.KV {
func (p *llama4Model) KV(t *Tokenizer) KV {
kv := p.ModelParameters.KV(t)
kv["general.architecture"] = "llama4"

View File

@@ -7,6 +7,7 @@ import (
"github.com/pdevine/tensor"
"github.com/pdevine/tensor/native"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/fs/ggml"
)
@@ -18,13 +19,13 @@ type llamaAdapter struct {
var _ AdapterConverter = (*llamaAdapter)(nil)
func (p *llamaAdapter) KV(baseKV ggml.KV) ggml.KV {
func (p *llamaAdapter) KV(baseKV fs.Config) KV {
kv := p.AdapterParameters.KV()
kv["general.architecture"] = "llama"
kv["llama.attention.head_count"] = baseKV["llama.attention.head_count"]
kv["llama.attention.head_count_kv"] = baseKV["llama.attention.head_count_kv"]
kv["llama.attention.head_count"] = baseKV.Value("llama.attention.head_count")
kv["llama.attention.head_count_kv"] = baseKV.Value("llama.attention.head_count_kv")
p.NumAttentionHeads = baseKV["llama.attention.head_count"].(uint32)
p.NumAttentionHeads = baseKV.Value("llama.attention.head_count").(uint32)
return kv
}

View File

@@ -60,7 +60,7 @@ type mistral3Model struct {
ProjectorHiddenAct string `json:"projector_hidden_act"`
}
func (p *mistral3Model) KV(t *Tokenizer) ggml.KV {
func (p *mistral3Model) KV(t *Tokenizer) KV {
kv := p.ModelParameters.KV(t)
kv["general.architecture"] = "mistral3"
kv["mistral3.vocab_size"] = p.TextModel.VocabSize

View File

@@ -39,7 +39,7 @@ type mistral3CausalModel struct {
} `json:"rope_parameters"`
}
func (p *mistral3CausalModel) KV(t *Tokenizer) ggml.KV {
func (p *mistral3CausalModel) KV(t *Tokenizer) KV {
kv := p.ModelParameters.KV(t)
kv["general.architecture"] = "mistral3"
kv["mistral3.vocab_size"] = p.VocabSize

View File

@@ -12,7 +12,7 @@ type mixtralModel struct {
NumExpertsPerToken uint32 `json:"num_experts_per_tok"`
}
func (p *mixtralModel) KV(t *Tokenizer) ggml.KV {
func (p *mixtralModel) KV(t *Tokenizer) KV {
kv := p.llamaModel.KV(t)
if p.NumLocalExperts > 0 {

View File

@@ -34,7 +34,7 @@ type mllamaModel struct {
} `json:"vision_config"`
}
func (m *mllamaModel) KV(t *Tokenizer) ggml.KV {
func (m *mllamaModel) KV(t *Tokenizer) KV {
kv := m.ModelParameters.KV(t)
kv["general.architecture"] = "mllama"

View File

@@ -87,7 +87,7 @@ func (p *nomicbertModel) parseMore(fsys fs.FS) error {
return nil
}
func (p *nomicbertModel) KV(t *Tokenizer) ggml.KV {
func (p *nomicbertModel) KV(t *Tokenizer) KV {
kv := p.ModelParameters.KV(t)
// Determine architecture based on MoE parameters (following qwen3 pattern)

View File

@@ -34,7 +34,7 @@ type olmoModel struct {
var _ ModelConverter = (*olmoModel)(nil)
func (p *olmoModel) KV(t *Tokenizer) ggml.KV {
func (p *olmoModel) KV(t *Tokenizer) KV {
kv := p.ModelParameters.KV(t)
kv["general.architecture"] = "olmo3"
kv["olmo3.block_count"] = p.NumHiddenLayers

View File

@@ -37,7 +37,7 @@ type phi3Model struct {
var _ ModelConverter = (*phi3Model)(nil)
func (p *phi3Model) KV(t *Tokenizer) ggml.KV {
func (p *phi3Model) KV(t *Tokenizer) KV {
kv := p.ModelParameters.KV(t)
kv["general.architecture"] = "phi3"
kv["phi3.context_length"] = p.MaxPositionEmbeddings

View File

@@ -22,7 +22,7 @@ type qwen2Model struct {
var _ ModelConverter = (*qwen2Model)(nil)
func (q *qwen2Model) KV(t *Tokenizer) ggml.KV {
func (q *qwen2Model) KV(t *Tokenizer) KV {
kv := q.ModelParameters.KV(t)
kv["general.architecture"] = "qwen2"
kv["qwen2.block_count"] = q.HiddenLayers

View File

@@ -29,7 +29,7 @@ type qwen25VLModel struct {
var _ ModelConverter = (*qwen25VLModel)(nil)
func (q *qwen25VLModel) KV(t *Tokenizer) ggml.KV {
func (q *qwen25VLModel) KV(t *Tokenizer) KV {
kv := q.ModelParameters.KV(t)
kv["general.architecture"] = "qwen25vl"

View File

@@ -32,7 +32,7 @@ type qwen3Model struct {
}
// KV implements ModelConverter.
func (q *qwen3Model) KV(t *Tokenizer) ggml.KV {
func (q *qwen3Model) KV(t *Tokenizer) KV {
arch := "qwen3"
if q.NumExperts > 0 {
arch += "moe"

View File

@@ -45,7 +45,7 @@ func (m *qwen3VLModel) parseMore(fsys fs.FS) error {
return json.Unmarshal(bts, &m.VisionModel)
}
func (m *qwen3VLModel) KV(t *Tokenizer) ggml.KV {
func (m *qwen3VLModel) KV(t *Tokenizer) KV {
kv := m.qwen3Model.KV(t)
arch := "qwen3vl"

View File

@@ -19,6 +19,7 @@ import (
"testing"
"github.com/google/go-cmp/cmp"
fsc "github.com/ollama/ollama/fs"
"github.com/ollama/ollama/fs/ggml"
)
@@ -28,7 +29,7 @@ type tensorData struct {
Shape []int `json:"shape"`
}
func convertFull(t *testing.T, fsys fs.FS) (*os.File, ggml.KV, ggml.Tensors) {
func convertFull(t *testing.T, fsys fs.FS) (*os.File, fsc.Config, ggml.Tensors) {
t.Helper()
f, err := os.CreateTemp(t.TempDir(), "f16")
@@ -59,9 +60,10 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, ggml.KV, ggml.Tensors) {
return r, m.KV(), m.Tensors()
}
func generateResultsJSON(t *testing.T, f *os.File, kv ggml.KV, tensors ggml.Tensors) map[string]string {
func generateResultsJSON(t *testing.T, f *os.File, kv fsc.Config, tensors ggml.Tensors) map[string]string {
actual := make(map[string]string)
for k, v := range kv {
for k := range kv.Keys() {
v := kv.Value(k)
if s, ok := v.(json.Marshaler); !ok {
actual[k] = fmt.Sprintf("%v", v)
} else {
@@ -277,7 +279,7 @@ func generateSafetensorTestData(t *testing.T, tempDir string, tensorData map[str
func TestConvertAdapter(t *testing.T) {
type AdapterCase struct {
Name string
BaseKV map[string]any
BaseKV KV
Expected map[string]string
}

View File

@@ -49,7 +49,8 @@ func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) {
tt := int32(sentencepiece.ModelProto_SentencePiece_NORMAL)
// temporary fix to handle gemma3 broken configs
if slices.Contains([]string{"<end_of_turn>", "<start_of_turn>"}, piece.GetPiece()) {
// TODO(parthsareen): allow reading of tokenizer.json to allow managing special tokens when using spm
if slices.Contains([]string{"<end_of_turn>", "<start_of_turn>", "<start_function_declaration>", "<end_function_declaration>", "<start_function_call>", "<end_function_call>", "<start_function_response>", "<end_function_response>", "<escape>"}, piece.GetPiece()) {
tt = int32(sentencepiece.ModelProto_SentencePiece_CONTROL)
}

View File

@@ -14,6 +14,7 @@
* [API Reference](https://docs.ollama.com/api)
* [Modelfile Reference](https://docs.ollama.com/modelfile)
* [OpenAI Compatibility](https://docs.ollama.com/api/openai-compatibility)
* [Anthropic Compatibility](./api/anthropic-compatibility.mdx)
### Resources

View File

@@ -895,11 +895,11 @@ curl http://localhost:11434/api/chat -d '{
"tool_calls": [
{
"function": {
"name": "get_temperature",
"name": "get_weather",
"arguments": {
"city": "Toronto"
}
},
}
}
]
},
@@ -907,7 +907,7 @@ curl http://localhost:11434/api/chat -d '{
{
"role": "tool",
"content": "11 degrees celsius",
"tool_name": "get_temperature",
"tool_name": "get_weather"
}
],
"stream": false,

View File

@@ -0,0 +1,406 @@
---
title: Anthropic compatibility
---
Ollama provides compatibility with the [Anthropic Messages API](https://docs.anthropic.com/en/api/messages) to help connect existing applications to Ollama, including tools like Claude Code.
## Recommended models
For coding use cases, models like `glm-4.7:cloud`, `minimax-m2.1:cloud`, and `qwen3-coder` are recommended.
Pull a model before use:
```shell
ollama pull qwen3-coder
ollama pull glm-4.7:cloud
```
## Usage
### Environment variables
To use Ollama with tools that expect the Anthropic API (like Claude Code), set these environment variables:
```shell
export ANTHROPIC_BASE_URL=http://localhost:11434
export ANTHROPIC_API_KEY=ollama # required but ignored
```
### Simple `/v1/messages` example
<CodeGroup dropdown>
```python basic.py
import anthropic
client = anthropic.Anthropic(
base_url='http://localhost:11434',
api_key='ollama', # required but ignored
)
message = client.messages.create(
model='qwen3-coder',
max_tokens=1024,
messages=[
{'role': 'user', 'content': 'Hello, how are you?'}
]
)
print(message.content[0].text)
```
```javascript basic.js
import Anthropic from "@anthropic-ai/sdk";
const anthropic = new Anthropic({
baseURL: "http://localhost:11434",
apiKey: "ollama", // required but ignored
});
const message = await anthropic.messages.create({
model: "qwen3-coder",
max_tokens: 1024,
messages: [{ role: "user", content: "Hello, how are you?" }],
});
console.log(message.content[0].text);
```
```shell basic.sh
curl -X POST http://localhost:11434/v1/messages \
-H "Content-Type: application/json" \
-H "x-api-key: ollama" \
-H "anthropic-version: 2023-06-01" \
-d '{
"model": "qwen3-coder",
"max_tokens": 1024,
"messages": [{ "role": "user", "content": "Hello, how are you?" }]
}'
```
</CodeGroup>
### Streaming example
<CodeGroup dropdown>
```python streaming.py
import anthropic
client = anthropic.Anthropic(
base_url='http://localhost:11434',
api_key='ollama',
)
with client.messages.stream(
model='qwen3-coder',
max_tokens=1024,
messages=[{'role': 'user', 'content': 'Count from 1 to 10'}]
) as stream:
for text in stream.text_stream:
print(text, end='', flush=True)
```
```javascript streaming.js
import Anthropic from "@anthropic-ai/sdk";
const anthropic = new Anthropic({
baseURL: "http://localhost:11434",
apiKey: "ollama",
});
const stream = await anthropic.messages.stream({
model: "qwen3-coder",
max_tokens: 1024,
messages: [{ role: "user", content: "Count from 1 to 10" }],
});
for await (const event of stream) {
if (
event.type === "content_block_delta" &&
event.delta.type === "text_delta"
) {
process.stdout.write(event.delta.text);
}
}
```
```shell streaming.sh
curl -X POST http://localhost:11434/v1/messages \
-H "Content-Type: application/json" \
-d '{
"model": "qwen3-coder",
"max_tokens": 1024,
"stream": true,
"messages": [{ "role": "user", "content": "Count from 1 to 10" }]
}'
```
</CodeGroup>
### Tool calling example
<CodeGroup dropdown>
```python tools.py
import anthropic
client = anthropic.Anthropic(
base_url='http://localhost:11434',
api_key='ollama',
)
message = client.messages.create(
model='qwen3-coder',
max_tokens=1024,
tools=[
{
'name': 'get_weather',
'description': 'Get the current weather in a location',
'input_schema': {
'type': 'object',
'properties': {
'location': {
'type': 'string',
'description': 'The city and state, e.g. San Francisco, CA'
}
},
'required': ['location']
}
}
],
messages=[{'role': 'user', 'content': "What's the weather in San Francisco?"}]
)
for block in message.content:
if block.type == 'tool_use':
print(f'Tool: {block.name}')
print(f'Input: {block.input}')
```
```javascript tools.js
import Anthropic from "@anthropic-ai/sdk";
const anthropic = new Anthropic({
baseURL: "http://localhost:11434",
apiKey: "ollama",
});
const message = await anthropic.messages.create({
model: "qwen3-coder",
max_tokens: 1024,
tools: [
{
name: "get_weather",
description: "Get the current weather in a location",
input_schema: {
type: "object",
properties: {
location: {
type: "string",
description: "The city and state, e.g. San Francisco, CA",
},
},
required: ["location"],
},
},
],
messages: [{ role: "user", content: "What's the weather in San Francisco?" }],
});
for (const block of message.content) {
if (block.type === "tool_use") {
console.log("Tool:", block.name);
console.log("Input:", block.input);
}
}
```
```shell tools.sh
curl -X POST http://localhost:11434/v1/messages \
-H "Content-Type: application/json" \
-d '{
"model": "qwen3-coder",
"max_tokens": 1024,
"tools": [
{
"name": "get_weather",
"description": "Get the current weather in a location",
"input_schema": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state"
}
},
"required": ["location"]
}
}
],
"messages": [{ "role": "user", "content": "What is the weather in San Francisco?" }]
}'
```
</CodeGroup>
## Using with Claude Code
[Claude Code](https://code.claude.com/docs/en/overview) can be configured to use Ollama as its backend:
```shell
ANTHROPIC_BASE_URL=http://localhost:11434 ANTHROPIC_API_KEY=ollama claude --model qwen3-coder
```
Or set the environment variables in your shell profile:
```shell
export ANTHROPIC_BASE_URL=http://localhost:11434
export ANTHROPIC_API_KEY=ollama
```
Then run Claude Code with any Ollama model:
```shell
# Local models
claude --model qwen3-coder
claude --model gpt-oss:20b
# Cloud models
claude --model glm-4.7:cloud
claude --model minimax-m2.1:cloud
```
## Endpoints
### `/v1/messages`
#### Supported features
- [x] Messages
- [x] Streaming
- [x] System prompts
- [x] Multi-turn conversations
- [x] Vision (images)
- [x] Tools (function calling)
- [x] Tool results
- [x] Thinking/extended thinking
#### Supported request fields
- [x] `model`
- [x] `max_tokens`
- [x] `messages`
- [x] Text `content`
- [x] Image `content` (base64)
- [x] Array of content blocks
- [x] `tool_use` blocks
- [x] `tool_result` blocks
- [x] `thinking` blocks
- [x] `system` (string or array)
- [x] `stream`
- [x] `temperature`
- [x] `top_p`
- [x] `top_k`
- [x] `stop_sequences`
- [x] `tools`
- [x] `thinking`
- [ ] `tool_choice`
- [ ] `metadata`
#### Supported response fields
- [x] `id`
- [x] `type`
- [x] `role`
- [x] `model`
- [x] `content` (text, tool_use, thinking blocks)
- [x] `stop_reason` (end_turn, max_tokens, tool_use)
- [x] `usage` (input_tokens, output_tokens)
#### Streaming events
- [x] `message_start`
- [x] `content_block_start`
- [x] `content_block_delta` (text_delta, input_json_delta, thinking_delta)
- [x] `content_block_stop`
- [x] `message_delta`
- [x] `message_stop`
- [x] `ping`
- [x] `error`
## Models
Ollama supports both local and cloud models.
### Local models
Pull a local model before use:
```shell
ollama pull qwen3-coder
```
Recommended local models:
- `qwen3-coder` - Excellent for coding tasks
- `gpt-oss:20b` - Strong general-purpose model
### Cloud models
Cloud models are available immediately without pulling:
- `glm-4.7:cloud` - High-performance cloud model
- `minimax-m2.1:cloud` - Fast cloud model
### Default model names
For tooling that relies on default Anthropic model names such as `claude-3-5-sonnet`, use `ollama cp` to copy an existing model to that name:
```shell
ollama cp qwen3-coder claude-3-5-sonnet
```
Afterwards, this new model name can be specified in the `model` field:
```shell
curl http://localhost:11434/v1/messages \
-H "Content-Type: application/json" \
-d '{
"model": "claude-3-5-sonnet",
"max_tokens": 1024,
"messages": [
{
"role": "user",
"content": "Hello!"
}
]
}'
```
## Differences from the Anthropic API
### Behavior differences
- API key is accepted but not validated
- `anthropic-version` header is accepted but not used
- Token counts are approximations based on the underlying model's tokenizer
### Not supported
The following Anthropic API features are not currently supported:
| Feature | Description |
|---------|-------------|
| `/v1/messages/count_tokens` | Token counting endpoint |
| `tool_choice` | Forcing specific tool use or disabling tools |
| `metadata` | Request metadata (user_id) |
| Prompt caching | `cache_control` blocks for caching prefixes |
| Batches API | `/v1/messages/batches` for async batch processing |
| Citations | `citations` content blocks |
| PDF support | `document` content blocks with PDF files |
| Server-sent errors | `error` events during streaming (errors are returned via HTTP status codes instead) |
### Partial support
| Feature | Status |
|---------|--------|
| Image content | Base64 images supported; URL images not supported |
| Extended thinking | Basic support; `budget_tokens` accepted but not enforced |

View File

@@ -277,6 +277,8 @@ curl -X POST http://localhost:11434/v1/chat/completions \
### `/v1/responses`
> Note: Added in Ollama v0.13.3
Ollama supports the [OpenAI Responses API](https://platform.openai.com/docs/api-reference/responses). Only the non-stateful flavor is supported (i.e., there is no `previous_response_id` or `conversation` support).
#### Supported features

View File

@@ -36,7 +36,6 @@ Provide an `images` array. SDKs accept file paths, URLs or raw bytes while the R
}],
"stream": false
}'
"
```
</Tab>
<Tab title="Python">

View File

@@ -32,7 +32,9 @@
"codeblocks": "system"
},
"contextual": {
"options": ["copy"]
"options": [
"copy"
]
},
"navbar": {
"links": [
@@ -52,7 +54,9 @@
"display": "simple"
},
"examples": {
"languages": ["curl"]
"languages": [
"curl"
]
}
},
"redirects": [
@@ -97,6 +101,7 @@
{
"group": "Integrations",
"pages": [
"/integrations/claude-code",
"/integrations/vscode",
"/integrations/jetbrains",
"/integrations/codex",
@@ -139,7 +144,8 @@
"/api/streaming",
"/api/usage",
"/api/errors",
"/api/openai-compatibility"
"/api/openai-compatibility",
"/api/anthropic-compatibility"
]
},
{

View File

@@ -14,11 +14,11 @@ curl -fsSL https://ollama.com/install.sh | sh
## How can I view the logs?
Review the [Troubleshooting](./troubleshooting.md) docs for more about using logs.
Review the [Troubleshooting](./troubleshooting) docs for more about using logs.
## Is my GPU compatible with Ollama?
Please refer to the [GPU docs](./gpu.md).
Please refer to the [GPU docs](./gpu).
## How can I specify the context window size?

View File

@@ -33,7 +33,7 @@ Check your compute compatibility to see if your card is supported:
| 5.0 | GeForce GTX | `GTX 750 Ti` `GTX 750` `NVS 810` |
| | Quadro | `K2200` `K1200` `K620` `M1200` `M520` `M5000M` `M4000M` `M3000M` `M2000M` `M1000M` `K620M` `M600M` `M500M` |
For building locally to support older GPUs, see [developer.md](./development.md#linux-cuda-nvidia)
For building locally to support older GPUs, see [developer](./development#linux-cuda-nvidia)
### GPU Selection
@@ -54,7 +54,7 @@ sudo modprobe nvidia_uvm`
Ollama supports the following AMD GPUs via the ROCm library:
> [!NOTE]
> **NOTE:**
> Additional AMD GPU support is provided by the Vulkan Library - see below.
@@ -132,9 +132,9 @@ Ollama supports GPU acceleration on Apple devices via the Metal API.
## Vulkan GPU Support
> [!NOTE]
> **NOTE:**
> Vulkan is currently an Experimental feature. To enable, you must set OLLAMA_VULKAN=1 for the Ollama server as
described in the [FAQ](faq.md#how-do-i-configure-ollama-server)
described in the [FAQ](faq#how-do-i-configure-ollama-server)
Additional GPU support on Windows and Linux is provided via
[Vulkan](https://www.vulkan.org/). On Windows most GPU vendors drivers come
@@ -161,6 +161,6 @@ sudo setcap cap_perfmon+ep /usr/local/bin/ollama
To select specific Vulkan GPU(s), you can set the environment variable
`GGML_VK_VISIBLE_DEVICES` to one or more numeric IDs on the Ollama server as
described in the [FAQ](faq.md#how-do-i-configure-ollama-server). If you
described in the [FAQ](faq#how-do-i-configure-ollama-server). If you
encounter any problems with Vulkan based GPUs, you can disable all Vulkan GPUs
by setting `GGML_VK_VISIBLE_DEVICES=-1`

View File

@@ -0,0 +1,69 @@
---
title: Claude Code
---
## Install
Install [Claude Code](https://code.claude.com/docs/en/overview):
<CodeGroup>
```shell macOS / Linux
curl -fsSL https://claude.ai/install.sh | bash
```
```powershell Windows
irm https://claude.ai/install.ps1 | iex
```
</CodeGroup>
## Usage with Ollama
Claude Code connects to Ollama using the Anthropic-compatible API.
1. Set the environment variables:
```shell
export ANTHROPIC_BASE_URL=http://localhost:11434
export ANTHROPIC_API_KEY=ollama
```
2. Run Claude Code with an Ollama model:
```shell
claude --model qwen3-coder
```
Or run with environment variables inline:
```shell
ANTHROPIC_BASE_URL=http://localhost:11434 ANTHROPIC_API_KEY=ollama claude --model qwen3-coder
```
## Connecting to ollama.com
1. Create an [API key](https://ollama.com/settings/keys) on ollama.com
2. Set the environment variables:
```shell
export ANTHROPIC_BASE_URL=https://ollama.com
export ANTHROPIC_API_KEY=<your-api-key>
```
3. Run Claude Code with a cloud model:
```shell
claude --model glm-4.7:cloud
```
## Recommended Models
### Cloud models
- `glm-4.7:cloud` - High-performance cloud model
- `minimax-m2.1:cloud` - Fast cloud model
- `qwen3-coder:480b` - Large coding model
### Local models
- `qwen3-coder` - Excellent for coding tasks
- `gpt-oss:20b` - Strong general-purpose model

View File

@@ -1,5 +1,5 @@
---
title: Linux
title: "Linux"
---
## Install
@@ -13,8 +13,7 @@ curl -fsSL https://ollama.com/install.sh | sh
## Manual install
<Note>
If you are upgrading from a prior version, you should remove the old libraries
with `sudo rm -rf /usr/lib/ollama` first.
If you are upgrading from a prior version, you should remove the old libraries with `sudo rm -rf /usr/lib/ollama` first.
</Note>
Download and extract the package:
@@ -113,11 +112,7 @@ sudo systemctl status ollama
```
<Note>
While AMD has contributed the `amdgpu` driver upstream to the official linux
kernel source, the version is older and may not support all ROCm features. We
recommend you install the latest driver from
https://www.amd.com/en/support/linux-drivers for best support of your Radeon
GPU.
While AMD has contributed the `amdgpu` driver upstream to the official linux kernel source, the version is older and may not support all ROCm features. We recommend you install the latest driver from https://www.amd.com/en/support/linux-drivers for best support of your Radeon GPU.
</Note>
## Customizing
@@ -196,4 +191,4 @@ Remove the downloaded models and Ollama service user and group:
sudo userdel ollama
sudo groupdel ollama
sudo rm -r /usr/share/ollama
```
```

View File

@@ -41,6 +41,7 @@ INSTRUCTION arguments
| [`ADAPTER`](#adapter) | Defines the (Q)LoRA adapters to apply to the model. |
| [`LICENSE`](#license) | Specifies the legal license. |
| [`MESSAGE`](#message) | Specify message history. |
| [`REQUIRES`](#requires) | Specify the minimum version of Ollama required by the model. |
## Examples
@@ -248,6 +249,16 @@ MESSAGE user Is Ontario in Canada?
MESSAGE assistant yes
```
### REQUIRES
The `REQUIRES` instruction allows you to specify the minimum version of Ollama required by the model.
```
REQUIRES <version>
```
The version should be a valid Ollama version (e.g. 0.14.0).
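For example, a minimal (hypothetical) Modelfile that declares the minimum server version it needs:
```
FROM llama3.2
REQUIRES 0.14.0
```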
## Notes
- the **`Modelfile` is not case sensitive**. In the examples, uppercase instructions are used to make them easier to distinguish from arguments.

View File

@@ -87,7 +87,7 @@ When Ollama starts up, it takes inventory of the GPUs present in the system to d
### Linux NVIDIA Troubleshooting
If you are using a container to run Ollama, make sure you've set up the container runtime first as described in [docker.md](./docker.md)
If you are using a container to run Ollama, make sure you've set up the container runtime first as described in [docker](./docker)
Sometimes Ollama can have difficulty initializing the GPU. When you check the server logs, this can show up as various error codes, such as "3" (not initialized), "46" (device unavailable), "100" (no device), "999" (unknown), or others. The following troubleshooting techniques may help resolve the problem.

View File

@@ -1,5 +1,7 @@
package fs
import "iter"
type Config interface {
Architecture() string
String(string, ...string) string
@@ -11,4 +13,8 @@ type Config interface {
Ints(string, ...[]int32) []int32
Floats(string, ...[]float32) []float32
Bools(string, ...[]bool) []bool
Len() int
Keys() iter.Seq[string]
Value(key string) any
}
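The `Config` interface above gains `Len`, `Keys` (an `iter.Seq[string]`), and `Value`, so callers can enumerate key/value pairs without depending on the concrete type. The sketch below is illustrative only — it uses a hypothetical map-backed stand-in rather than the real `KV` type — to show the iteration pattern that the `WriteGGUF` change further down relies on (`slices.Sorted(kv.Keys())` plus `kv.Value(key)`).

```go
// Illustrative sketch only: a toy map-backed config showing how the new
// Len/Keys/Value methods let callers iterate key/value pairs generically.
package main

import (
	"fmt"
	"iter"
	"maps"
	"slices"
)

// mapConfig is a hypothetical stand-in for a map-backed Config implementation.
type mapConfig map[string]any

func (c mapConfig) Len() int               { return len(c) }
func (c mapConfig) Keys() iter.Seq[string] { return maps.Keys(c) }
func (c mapConfig) Value(key string) any   { return c[key] }

func main() {
	cfg := mapConfig{
		"general.architecture": "llama",
		"llama.block_count":    uint32(32),
	}

	// Callers no longer need the concrete map type: sort the keys from the
	// iterator and look each value up through the accessor.
	for _, key := range slices.Sorted(cfg.Keys()) {
		fmt.Println(key, "=", cfg.Value(key))
	}
	fmt.Println("entries:", cfg.Len())
}
```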

View File

@@ -6,7 +6,9 @@ import (
"errors"
"fmt"
"io"
"iter"
"log/slog"
"maps"
"math"
"slices"
"strings"
@@ -239,21 +241,34 @@ func (kv KV) Bools(key string, defaultValue ...[]bool) []bool {
return val.values
}
func (kv KV) Len() int {
return len(kv)
}
func (kv KV) Keys() iter.Seq[string] {
return maps.Keys(kv)
}
func (kv KV) Value(key string) any {
return kv[key]
}
func (kv KV) OllamaEngineRequired() bool {
return slices.Contains([]string{
"bert",
"deepseek2",
"deepseekocr",
"gemma3",
"gemma3n",
"gptoss", "gpt-oss",
"llama4",
"mistral3",
"mllama",
"nomic-bert",
"olmo3",
"qwen25vl",
"qwen3", "qwen3moe",
"qwen3vl", "qwen3vlmoe",
"deepseekocr",
"deepseek2",
"nomic-bert",
"olmo3",
}, kv.Architecture())
}
@@ -839,6 +854,7 @@ func (f GGML) SupportsFlashAttention() bool {
// FlashAttention checks if the model should enable flash attention
func (f GGML) FlashAttention() bool {
return slices.Contains([]string{
"bert",
"gemma3",
"gptoss", "gpt-oss",
"mistral3",

View File

@@ -8,12 +8,12 @@ import (
"fmt"
"io"
"log/slog"
"maps"
"os"
"runtime"
"slices"
"strings"
"github.com/ollama/ollama/fs"
"golang.org/x/sync/errgroup"
)
@@ -508,7 +508,7 @@ func writeGGUFArray[S ~[]E, E any](w io.Writer, t uint32, s S) error {
return binary.Write(w, binary.LittleEndian, s)
}
func WriteGGUF(f *os.File, kv KV, ts []*Tensor) error {
func WriteGGUF(f *os.File, kv fs.Config, ts []*Tensor) error {
arch := kv.String("general.architecture")
if arch == "" {
return fmt.Errorf("architecture not set")
@@ -526,12 +526,12 @@ func WriteGGUF(f *os.File, kv KV, ts []*Tensor) error {
return err
}
if err := binary.Write(f, binary.LittleEndian, uint64(len(kv))); err != nil {
if err := binary.Write(f, binary.LittleEndian, uint64(kv.Len())); err != nil {
return err
}
for _, key := range slices.Sorted(maps.Keys(kv)) {
if err := ggufWriteKV(f, arch, key, kv[key]); err != nil {
for _, key := range slices.Sorted(kv.Keys()) {
if err := ggufWriteKV(f, arch, key, kv.Value(key)); err != nil {
return err
}
}

go.mod
View File

@@ -15,8 +15,8 @@ require (
github.com/spf13/cobra v1.7.0
github.com/stretchr/testify v1.9.0
github.com/x448/float16 v0.8.4
golang.org/x/sync v0.12.0
golang.org/x/sys v0.36.0
golang.org/x/sync v0.19.0
golang.org/x/sys v0.39.0
)
require (
@@ -28,13 +28,17 @@ require (
github.com/nlpodyssey/gopickle v0.3.0
github.com/pdevine/tensor v0.0.0-20240510204454-f88f4562727c
github.com/tkrajina/typescriptify-golang-structs v0.2.0
github.com/wk8/go-ordered-map/v2 v2.1.8
golang.org/x/image v0.22.0
golang.org/x/tools v0.30.0
golang.org/x/mod v0.31.0
golang.org/x/tools v0.40.0
gonum.org/v1/gonum v0.15.0
)
require (
github.com/apache/arrow/go/arrow v0.0.0-20211112161151-bc219186db40 // indirect
github.com/bahlo/generic-list-go v0.2.0 // indirect
github.com/buger/jsonparser v1.1.1 // indirect
github.com/bytedance/sonic/loader v0.1.1 // indirect
github.com/chewxy/hm v1.0.0 // indirect
github.com/chewxy/math32 v1.11.0 // indirect
@@ -44,6 +48,7 @@ require (
github.com/gogo/protobuf v1.3.2 // indirect
github.com/google/flatbuffers v24.3.25+incompatible // indirect
github.com/kr/text v0.2.0 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/rivo/uniseg v0.2.0 // indirect
@@ -76,11 +81,11 @@ require (
github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
github.com/ugorji/go/codec v1.2.12 // indirect
golang.org/x/arch v0.8.0 // indirect
golang.org/x/crypto v0.36.0
golang.org/x/exp v0.0.0-20250218142911-aa4b98e5adaa // indirect
golang.org/x/net v0.38.0 // indirect
golang.org/x/term v0.30.0
golang.org/x/text v0.23.0
golang.org/x/crypto v0.46.0
golang.org/x/exp v0.0.0-20251219203646-944ab1f22d93
golang.org/x/net v0.48.0 // indirect
golang.org/x/term v0.38.0
golang.org/x/text v0.32.0
google.golang.org/protobuf v1.34.1
gopkg.in/yaml.v3 v3.0.1 // indirect
)

go.sum
View File

@@ -14,7 +14,11 @@ github.com/apache/arrow/go/arrow v0.0.0-20211112161151-bc219186db40 h1:q4dksr6IC
github.com/apache/arrow/go/arrow v0.0.0-20211112161151-bc219186db40/go.mod h1:Q7yQnSMnLvcXlZ8RV+jwz/6y1rQTqbX6C82SndT52Zs=
github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0 h1:jfIu9sQUG6Ig+0+Ap1h4unLjW6YQJpKZVmUzxsD4E/Q=
github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0/go.mod h1:t2tdKJDJF9BV14lnkjHmOQgcvEKgtqs5a1N3LNdJhGE=
github.com/bahlo/generic-list-go v0.2.0 h1:5sz/EEAK+ls5wF+NeqDpk5+iNdMDXrh3z3nPnH1Wvgk=
github.com/bahlo/generic-list-go v0.2.0/go.mod h1:2KvAjgMlE5NNynlg/5iLrrCCZ2+5xWbdbCW3pNTGyYg=
github.com/boombuler/barcode v1.0.0/go.mod h1:paBWMcWSl3LHKBqUq+rly7CNSldXjb2rDl3JlRe0mD8=
github.com/buger/jsonparser v1.1.1 h1:2PnMjfWD7wBILjqQbt530v576A/cAbQvEW9gGIpYMUs=
github.com/buger/jsonparser v1.1.1/go.mod h1:6RYKKt7H4d4+iWqouImQ9R2FZql3VbhNgx27UK13J/0=
github.com/bytedance/sonic v1.11.6 h1:oUp34TzMlL+OY1OUWxHqsdkgC/Zfc85zGqw9siXjrc0=
github.com/bytedance/sonic v1.11.6/go.mod h1:LysEHSvpvDySVdC2f87zGWf6CIKJcAvqab1ZaiQtds4=
github.com/bytedance/sonic/loader v0.1.1 h1:c+e5Pt1k/cy5wMveRDyk2X4B9hF4g7an8N3zCYjJFNM=
@@ -123,6 +127,7 @@ github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+
github.com/grpc-ecosystem/grpc-gateway v1.16.0/go.mod h1:BDjrQk3hbvj6Nolgz8mAMFbcEtjT1g+wF4CSlocrBnw=
github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y=
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
github.com/jung-kurt/gofpdf v1.0.0/go.mod h1:7Id9E/uU8ce6rXgefFLlgrJj/GYY22cpxn+r32jIOes=
@@ -143,6 +148,8 @@ github.com/ledongthuc/pdf v0.0.0-20250511090121-5959a4027728 h1:QwWKgMY28TAXaDl+
github.com/ledongthuc/pdf v0.0.0-20250511090121-5959a4027728/go.mod h1:1fEHWurg7pvf5SG6XNE5Q8UZmOwex51Mkx3SLhrW5B4=
github.com/leodido/go-urn v1.4.0 h1:WT9HwE9SGECu3lg4d/dIA+jxlljEa1/ffXKmRjqdmIQ=
github.com/leodido/go-urn v1.4.0/go.mod h1:bvxc+MVxLKB4z00jd1z+Dvzr47oO32F/QSNjSBOlFxI=
github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0=
github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc=
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/mattn/go-runewidth v0.0.9/go.mod h1:H031xJmbD/WCDINGzjvQ9THkh0rPKHF+m2gUSrubnMI=
@@ -207,6 +214,8 @@ github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS
github.com/twitchyliquid64/golang-asm v0.15.1/go.mod h1:a1lVb/DtPvCB8fslRZhAngC2+aY1QWCk3Cedj/Gdt08=
github.com/ugorji/go/codec v1.2.12 h1:9LC83zGrHhuUA9l16C9AHXAqEV/2wBQ4nkvumAE65EE=
github.com/ugorji/go/codec v1.2.12/go.mod h1:UNopzCgEMSXjBc6AOMqYvWC1ktqTAfzJZUZgYf6w6lg=
github.com/wk8/go-ordered-map/v2 v2.1.8 h1:5h/BUHu93oj4gIdvHHHGsScSTMijfx5PeYkE/fJgbpc=
github.com/wk8/go-ordered-map/v2 v2.1.8/go.mod h1:5nJHM5DyteebpVlHnWMV0rPz6Zp7+xBAnxjb1X5vnTw=
github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM=
github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg=
github.com/xtgo/set v1.0.0 h1:6BCNBRv3ORNDQ7fyoJXRv+tstJz3m1JVFQErfeZz2pY=
@@ -224,16 +233,16 @@ golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACk
golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/crypto v0.36.0 h1:AnAEvhDddvBdpY+uR+MyHmuZzzNqXSe/GvuDeob5L34=
golang.org/x/crypto v0.36.0/go.mod h1:Y4J0ReaxCR1IMaabaSMugxJES1EpwhBHhv2bDHklZvc=
golang.org/x/crypto v0.46.0 h1:cKRW/pmt1pKAfetfu+RCEvjvZkA9RimPbh7bhFjGVBU=
golang.org/x/crypto v0.46.0/go.mod h1:Evb/oLKmMraqjZ2iQTwDwvCtJkczlDuTmdJXoZVzqU0=
golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20190125153040-c74c464bbbf2/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20191002040644-a1355ae1e2c3/go.mod h1:NOZ3BPKG0ec/BKJQgnvsSFpcKLM5xXVWnvZS97DWHgE=
golang.org/x/exp v0.0.0-20250218142911-aa4b98e5adaa h1:t2QcU6V556bFjYgu4L6C+6VrCPyJZ+eyRsABUPs1mz4=
golang.org/x/exp v0.0.0-20250218142911-aa4b98e5adaa/go.mod h1:BHOTPb3L19zxehTsLoJXVaTktb06DFgmdW6Wb9s8jqk=
golang.org/x/exp v0.0.0-20251219203646-944ab1f22d93 h1:fQsdNF2N+/YewlRZiricy4P1iimyPKZ/xwniHj8Q2a0=
golang.org/x/exp v0.0.0-20251219203646-944ab1f22d93/go.mod h1:EPRbTFwzwjXj9NpYyyrvenVh9Y+GFeEvMNh7Xuz7xgU=
golang.org/x/image v0.0.0-20180708004352-c73c2afc3b81/go.mod h1:ux5Hcp/YLpHSI86hEcLt0YII63i6oz57MZXIpbrjZUs=
golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js=
golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=
@@ -255,6 +264,8 @@ golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzB
golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.31.0 h1:HaW9xtz0+kOcWKwli0ZXy79Ix+UW/vOfmWI5QVd2tgI=
golang.org/x/mod v0.31.0/go.mod h1:43JraMp9cGx1Rx3AqioxrbrhNsLl2l/iNAvuBkrezpg=
golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
@@ -267,8 +278,8 @@ golang.org/x/net v0.0.0-20200822124328-c89045814202/go.mod h1:/O7V0waA8r7cgGh81R
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96bSt6lcn1PtDYWL6XObtHCRCNQM=
golang.org/x/net v0.0.0-20210614182718-04defd469f4e/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/net v0.38.0 h1:vRMAPTMaeGqVhG5QyLJHqNDwecKTomGeqbnfZyKlBI8=
golang.org/x/net v0.38.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8=
golang.org/x/net v0.48.0 h1:zyQRTTrjc33Lhh0fBgT/H3oZq9WuvRR5gPC70xpDiQU=
golang.org/x/net v0.48.0/go.mod h1:+ndRgGjkh8FGtu1w1FGbEC31if4VrNVMuKTgcAAnQRY=
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
@@ -278,8 +289,8 @@ golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJ
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.12.0 h1:MHc5BpPuC30uJk597Ri8TV3CNZcTLu6B6z4lJy+g6Jw=
golang.org/x/sync v0.12.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA=
golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4=
golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
@@ -295,17 +306,17 @@ golang.org/x/sys v0.0.0-20210510120138-977fb7262007/go.mod h1:oPkhp1MJrh7nUepCBc
golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.36.0 h1:KVRy2GtZBrk1cBYA7MKu5bEZFxQk4NIDV6RLVcC8o0k=
golang.org/x/sys v0.36.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
golang.org/x/sys v0.39.0 h1:CvCKL8MeisomCi6qNZ+wbb0DN9E5AATixKsvNtMoMFk=
golang.org/x/sys v0.39.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.30.0 h1:PQ39fJZ+mfadBm0y5WlL4vlM7Sx1Hgf13sMIY2+QS9Y=
golang.org/x/term v0.30.0/go.mod h1:NYYFdzHoI5wRh/h5tDMdMqCqPJZEuNqVR5xJLd/n67g=
golang.org/x/term v0.38.0 h1:PQ5pkm/rLO6HnxFR7N2lJHOZX6Kez5Y1gDSJla6jo7Q=
golang.org/x/term v0.38.0/go.mod h1:bSEAKrOT1W+VSu9TSCMtoGEOUcKxOKgl3LE5QEF/xVg=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.23.0 h1:D71I7dUrlY+VX0gQShAThNGHFxZ13dGLBHQLVl1mJlY=
golang.org/x/text v0.23.0/go.mod h1:/BLNzu4aZCJ1+kcD0DNRotWKage4q2rGVAg4o22unh4=
golang.org/x/text v0.32.0 h1:ZD01bjUt1FQ9WJ0ClOL5vxgxOI/sVCNgX1YtKwcY0mU=
golang.org/x/text v0.32.0/go.mod h1:o/rUWzghvpD5TXrTIBuJU77MTaN0ljMWE47kxGJQ7jY=
golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
@@ -319,8 +330,8 @@ golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod h1:TB2adYChydJhpapK
golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
golang.org/x/tools v0.1.4/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk=
golang.org/x/tools v0.30.0 h1:BgcpHewrV5AUp2G9MebG4XPFI1E2W41zU1SaqVA9vJY=
golang.org/x/tools v0.30.0/go.mod h1:c347cR/OJfw5TI+GfX7RUPNMdDRRbjvYTS0jPyvsVtY=
golang.org/x/tools v0.40.0 h1:yLkxfA+Qnul4cs9QA3KnlFu0lVmd8JJfoq+E41uSutA=
golang.org/x/tools v0.40.0/go.mod h1:Ik/tzLRlbscWpqqMRjyWYDisX8bG13FrdXp3o4Sr9lc=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=

View File

@@ -11,6 +11,15 @@ import (
"github.com/ollama/ollama/api"
)
// testPropsMap creates a ToolPropertiesMap from a map (convenience function for tests)
func testPropsMap(m map[string]api.ToolProperty) *api.ToolPropertiesMap {
props := api.NewToolPropertiesMap()
for k, v := range m {
props.Set(k, v)
}
return props
}
func TestAPIToolCalling(t *testing.T) {
initialTimeout := 60 * time.Second
streamTimeout := 60 * time.Second
@@ -57,12 +66,12 @@ func TestAPIToolCalling(t *testing.T) {
Parameters: api.ToolFunctionParameters{
Type: "object",
Required: []string{"location"},
Properties: map[string]api.ToolProperty{
Properties: testPropsMap(map[string]api.ToolProperty{
"location": {
Type: api.PropertyType{"string"},
Description: "The city and state, e.g. San Francisco, CA",
},
},
}),
},
},
},

View File

@@ -0,0 +1,94 @@
// Package orderedmap provides a generic ordered map that maintains insertion order.
// It wraps github.com/wk8/go-ordered-map/v2 to encapsulate the dependency.
package orderedmap
import (
"encoding/json"
"iter"
orderedmap "github.com/wk8/go-ordered-map/v2"
)
// Map is a generic ordered map that maintains insertion order.
type Map[K comparable, V any] struct {
om *orderedmap.OrderedMap[K, V]
}
// New creates a new empty ordered map.
func New[K comparable, V any]() *Map[K, V] {
return &Map[K, V]{
om: orderedmap.New[K, V](),
}
}
// Get retrieves a value by key.
func (m *Map[K, V]) Get(key K) (V, bool) {
if m == nil || m.om == nil {
var zero V
return zero, false
}
return m.om.Get(key)
}
// Set sets a key-value pair. If the key already exists, its value is updated
// but its position in the iteration order is preserved. If the key is new,
// it is appended to the end.
func (m *Map[K, V]) Set(key K, value V) {
if m == nil {
return
}
if m.om == nil {
m.om = orderedmap.New[K, V]()
}
m.om.Set(key, value)
}
// Len returns the number of entries.
func (m *Map[K, V]) Len() int {
if m == nil || m.om == nil {
return 0
}
return m.om.Len()
}
// All returns an iterator over all key-value pairs in insertion order.
func (m *Map[K, V]) All() iter.Seq2[K, V] {
return func(yield func(K, V) bool) {
if m == nil || m.om == nil {
return
}
for pair := m.om.Oldest(); pair != nil; pair = pair.Next() {
if !yield(pair.Key, pair.Value) {
return
}
}
}
}
// ToMap converts to a regular Go map.
// Note: The resulting map does not preserve order.
func (m *Map[K, V]) ToMap() map[K]V {
if m == nil || m.om == nil {
return nil
}
result := make(map[K]V, m.om.Len())
for pair := m.om.Oldest(); pair != nil; pair = pair.Next() {
result[pair.Key] = pair.Value
}
return result
}
// MarshalJSON implements json.Marshaler. The JSON output preserves key order.
func (m *Map[K, V]) MarshalJSON() ([]byte, error) {
if m == nil || m.om == nil {
return []byte("null"), nil
}
return json.Marshal(m.om)
}
// UnmarshalJSON implements json.Unmarshaler. The insertion order matches the
// order of keys in the JSON input.
func (m *Map[K, V]) UnmarshalJSON(data []byte) error {
m.om = orderedmap.New[K, V]()
return json.Unmarshal(data, &m.om)
}
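A short usage sketch for the package above. The import path is an assumption (the diff does not show where `orderedmap` sits in the module tree); the behavior shown — insertion-ordered iteration and order-preserving JSON round-tripping — matches the tests that follow.

```go
// Usage sketch for the orderedmap package shown above.
package main

import (
	"encoding/json"
	"fmt"

	"github.com/ollama/ollama/x/orderedmap" // hypothetical import path
)

func main() {
	m := orderedmap.New[string, int]()
	m.Set("z", 1)
	m.Set("a", 2)
	m.Set("m", 3)

	// Iteration and JSON output both follow insertion order, not key order.
	for k, v := range m.All() {
		fmt.Println(k, v)
	}

	data, _ := json.Marshal(m) // {"z":1,"a":2,"m":3}

	// Unmarshaling preserves the key order found in the JSON input.
	decoded := orderedmap.New[string, int]()
	_ = json.Unmarshal(data, decoded)
	fmt.Println(decoded.Len()) // 3
}
```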

View File

@@ -0,0 +1,348 @@
package orderedmap
import (
"encoding/json"
"slices"
"testing"
)
func TestMap_BasicOperations(t *testing.T) {
m := New[string, int]()
// Test empty map
if m.Len() != 0 {
t.Errorf("expected Len() = 0, got %d", m.Len())
}
v, ok := m.Get("a")
if ok {
t.Error("expected Get on empty map to return false")
}
if v != 0 {
t.Errorf("expected zero value, got %d", v)
}
// Test Set and Get
m.Set("a", 1)
m.Set("b", 2)
m.Set("c", 3)
if m.Len() != 3 {
t.Errorf("expected Len() = 3, got %d", m.Len())
}
v, ok = m.Get("a")
if !ok || v != 1 {
t.Errorf("expected Get(a) = (1, true), got (%d, %v)", v, ok)
}
v, ok = m.Get("b")
if !ok || v != 2 {
t.Errorf("expected Get(b) = (2, true), got (%d, %v)", v, ok)
}
v, ok = m.Get("c")
if !ok || v != 3 {
t.Errorf("expected Get(c) = (3, true), got (%d, %v)", v, ok)
}
// Test updating existing key preserves position
m.Set("a", 10)
v, ok = m.Get("a")
if !ok || v != 10 {
t.Errorf("expected Get(a) = (10, true), got (%d, %v)", v, ok)
}
if m.Len() != 3 {
t.Errorf("expected Len() = 3 after update, got %d", m.Len())
}
}
func TestMap_InsertionOrderPreserved(t *testing.T) {
m := New[string, int]()
// Insert in non-alphabetical order
m.Set("z", 1)
m.Set("a", 2)
m.Set("m", 3)
m.Set("b", 4)
// Verify iteration order matches insertion order
var keys []string
var values []int
for k, v := range m.All() {
keys = append(keys, k)
values = append(values, v)
}
expectedKeys := []string{"z", "a", "m", "b"}
expectedValues := []int{1, 2, 3, 4}
if !slices.Equal(keys, expectedKeys) {
t.Errorf("expected keys %v, got %v", expectedKeys, keys)
}
if !slices.Equal(values, expectedValues) {
t.Errorf("expected values %v, got %v", expectedValues, values)
}
}
func TestMap_UpdatePreservesPosition(t *testing.T) {
m := New[string, int]()
m.Set("first", 1)
m.Set("second", 2)
m.Set("third", 3)
// Update middle element
m.Set("second", 20)
var keys []string
for k := range m.All() {
keys = append(keys, k)
}
// Order should still be first, second, third
expected := []string{"first", "second", "third"}
if !slices.Equal(keys, expected) {
t.Errorf("expected keys %v, got %v", expected, keys)
}
}
func TestMap_MarshalJSON_PreservesOrder(t *testing.T) {
m := New[string, int]()
// Insert in non-alphabetical order
m.Set("z", 1)
m.Set("a", 2)
m.Set("m", 3)
data, err := json.Marshal(m)
if err != nil {
t.Fatalf("Marshal failed: %v", err)
}
// JSON should preserve insertion order, not alphabetical
expected := `{"z":1,"a":2,"m":3}`
if string(data) != expected {
t.Errorf("expected %s, got %s", expected, string(data))
}
}
func TestMap_UnmarshalJSON_PreservesOrder(t *testing.T) {
// JSON with non-alphabetical key order
jsonData := `{"z":1,"a":2,"m":3}`
m := New[string, int]()
if err := json.Unmarshal([]byte(jsonData), m); err != nil {
t.Fatalf("Unmarshal failed: %v", err)
}
// Verify iteration order matches JSON order
var keys []string
for k := range m.All() {
keys = append(keys, k)
}
expected := []string{"z", "a", "m"}
if !slices.Equal(keys, expected) {
t.Errorf("expected keys %v, got %v", expected, keys)
}
}
func TestMap_JSONRoundTrip(t *testing.T) {
// Test that unmarshal -> marshal produces identical JSON
original := `{"zebra":"z","apple":"a","mango":"m","banana":"b"}`
m := New[string, string]()
if err := json.Unmarshal([]byte(original), m); err != nil {
t.Fatalf("Unmarshal failed: %v", err)
}
data, err := json.Marshal(m)
if err != nil {
t.Fatalf("Marshal failed: %v", err)
}
if string(data) != original {
t.Errorf("round trip failed: expected %s, got %s", original, string(data))
}
}
func TestMap_ToMap(t *testing.T) {
m := New[string, int]()
m.Set("a", 1)
m.Set("b", 2)
regular := m.ToMap()
if len(regular) != 2 {
t.Errorf("expected len 2, got %d", len(regular))
}
if regular["a"] != 1 {
t.Errorf("expected regular[a] = 1, got %d", regular["a"])
}
if regular["b"] != 2 {
t.Errorf("expected regular[b] = 2, got %d", regular["b"])
}
}
func TestMap_NilSafety(t *testing.T) {
var m *Map[string, int]
// All operations should be safe on nil
if m.Len() != 0 {
t.Errorf("expected Len() = 0 on nil map, got %d", m.Len())
}
v, ok := m.Get("a")
if ok {
t.Error("expected Get on nil map to return false")
}
if v != 0 {
t.Errorf("expected zero value from nil map, got %d", v)
}
// Set on nil is a no-op
m.Set("a", 1)
if m.Len() != 0 {
t.Errorf("expected Len() = 0 after Set on nil, got %d", m.Len())
}
// All returns empty iterator
var keys []string
for k := range m.All() {
keys = append(keys, k)
}
if len(keys) != 0 {
t.Errorf("expected empty iteration on nil map, got %v", keys)
}
// ToMap returns nil
if m.ToMap() != nil {
t.Error("expected ToMap to return nil on nil map")
}
// MarshalJSON returns null
data, err := json.Marshal(m)
if err != nil {
t.Fatalf("Marshal failed: %v", err)
}
if string(data) != "null" {
t.Errorf("expected null, got %s", string(data))
}
}
func TestMap_EmptyMapMarshal(t *testing.T) {
m := New[string, int]()
data, err := json.Marshal(m)
if err != nil {
t.Fatalf("Marshal failed: %v", err)
}
if string(data) != "{}" {
t.Errorf("expected {}, got %s", string(data))
}
}
func TestMap_NestedValues(t *testing.T) {
m := New[string, any]()
m.Set("string", "hello")
m.Set("number", 42)
m.Set("bool", true)
m.Set("nested", map[string]int{"x": 1})
data, err := json.Marshal(m)
if err != nil {
t.Fatalf("Marshal failed: %v", err)
}
expected := `{"string":"hello","number":42,"bool":true,"nested":{"x":1}}`
if string(data) != expected {
t.Errorf("expected %s, got %s", expected, string(data))
}
}
func TestMap_AllIteratorEarlyExit(t *testing.T) {
m := New[string, int]()
m.Set("a", 1)
m.Set("b", 2)
m.Set("c", 3)
m.Set("d", 4)
// Collect only first 2
var keys []string
for k := range m.All() {
keys = append(keys, k)
if len(keys) == 2 {
break
}
}
expected := []string{"a", "b"}
if !slices.Equal(keys, expected) {
t.Errorf("expected %v, got %v", expected, keys)
}
}
func TestMap_IntegerKeys(t *testing.T) {
m := New[int, string]()
m.Set(3, "three")
m.Set(1, "one")
m.Set(2, "two")
var keys []int
for k := range m.All() {
keys = append(keys, k)
}
// Should preserve insertion order, not numerical order
expected := []int{3, 1, 2}
if !slices.Equal(keys, expected) {
t.Errorf("expected %v, got %v", expected, keys)
}
}
func TestMap_UnmarshalIntoExisting(t *testing.T) {
m := New[string, int]()
m.Set("existing", 999)
// Unmarshal should replace contents
if err := json.Unmarshal([]byte(`{"new":1}`), m); err != nil {
t.Fatalf("Unmarshal failed: %v", err)
}
_, ok := m.Get("existing")
if ok {
t.Error("existing key should be gone after unmarshal")
}
v, ok := m.Get("new")
if !ok || v != 1 {
t.Errorf("expected Get(new) = (1, true), got (%d, %v)", v, ok)
}
}
func TestMap_LargeOrderPreservation(t *testing.T) {
m := New[string, int]()
// Create many keys in specific order
keys := make([]string, 100)
for i := range 100 {
keys[i] = string(rune('a' + (99 - i))) // reverse order: 'd', 'c', 'b', 'a' (extended)
if i >= 26 {
keys[i] = string(rune('A'+i-26)) + string(rune('a'+i%26))
}
}
for i, k := range keys {
m.Set(k, i)
}
// Verify order preserved
var resultKeys []string
for k := range m.All() {
resultKeys = append(resultKeys, k)
}
if !slices.Equal(keys, resultKeys) {
t.Error("large map should preserve insertion order")
}
}

View File

@@ -140,10 +140,6 @@ func (c *Causal) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity
c.config.CachePadding = 1
}
if c.config.MaskBatchPadding == 0 {
c.config.MaskBatchPadding = 1
}
if c.config.MaskDType == ml.DTypeOther {
c.config.MaskDType = ml.DTypeF32
}
@@ -364,15 +360,12 @@ func roundUp(length, pad int) int {
// token in the history should apply. This is based on both the sequence and causality (the
// position of the history is not ahead of the token in the batch).
func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {
// Align and pad the two dimensions as required by the backend
batchSize := roundUp(c.curBatchSize, c.config.MaskBatchPadding)
c.curCellRange.min = roundDown(c.curCellRange.min, c.config.CachePadding)
c.curCellRange.max = roundUp(c.curCellRange.max+1, c.config.CachePadding) - 1
length := c.curCellRange.max - c.curCellRange.min + 1
mask := make([]float32, batchSize*length)
mask := make([]float32, c.curBatchSize*length)
for i := range c.curBatchSize {
enabled := !slices.Contains(c.opts.Except, i)
@@ -386,13 +379,7 @@ func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {
}
}
// Mask out any padding tokens we added. For padding that we added to the cache history, this
// has already been masked out because the sequence doesn't match.
for i := c.curBatchSize * length; i < len(mask); i++ {
mask[i] = float32(math.Inf(-1))
}
maskTensor := ctx.Input().FromFloats(mask, length, batchSize)
maskTensor := ctx.Input().FromFloats(mask, length, c.curBatchSize)
if c.config.MaskDType != ml.DTypeF32 {
maskTensor = maskTensor.Cast(ctx, c.config.MaskDType)

llama/build-info.cpp
View File

@@ -1,4 +1,4 @@
int LLAMA_BUILD_NUMBER = 0;
char const *LLAMA_COMMIT = "17f7f4baad8b3a716ee139da7bb56ae984e8c0fa";
char const *LLAMA_COMMIT = "ec98e2002";
char const *LLAMA_COMPILER = "";
char const *LLAMA_BUILD_TARGET = "";

View File

@@ -17,6 +17,9 @@ include /tools/mtmd/clip.cpp
include /tools/mtmd/mtmd.cpp
include /tools/mtmd/mtmd-audio.cpp
include /tools/mtmd/mtmd-helper.cpp
include /tools/mtmd/models/
include /tools/mtmd/models/*.h
include /tools/mtmd/models/*.cpp
include /src/
include /src/llama.*
include /src/llama-*.*

View File

@@ -1013,31 +1013,40 @@ bool tty_can_use_colors() {
// Model utils
//
static inline void common_init_sampler_from_model(
// TODO: move to common/sampling
static void common_init_sampler_from_model(
const llama_model * model,
common_params_sampling & sparams) {
const uint64_t config = sparams.user_sampling_config;
auto get_int32 = [&](const char * key, int32_t & dst, uint64_t user_config) {
if (config & user_config) return;
if (config & user_config) {
return;
}
char buf[64] = {0};
if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
char * end = nullptr;
int32_t v = strtol(buf, &end, 10);
if (end && end != buf) dst = v;
if (end && end != buf) {
dst = v;
}
}
};
auto get_float = [&](const char * key, float & dst, uint64_t user_config) {
if (config & user_config) return;
if (config & user_config) {
return;
}
char buf[128] = {0};
if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
char * end = nullptr;
float v = strtof(buf, &end);
if (end && end != buf) dst = v;
if (end && end != buf) {
dst = v;
}
}
};
@@ -1065,31 +1074,125 @@ static inline void common_init_sampler_from_model(
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA), sparams.mirostat_eta, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA);
}
struct common_init_result common_init_from_params(common_params & params) {
common_init_result iparams;
struct common_init_result::impl {
impl() = default;
~impl() = default;
llama_model_ptr model;
llama_context_ptr context;
std::vector<llama_adapter_lora_ptr> lora;
std::vector<common_sampler_ptr> samplers;
};
common_init_result::common_init_result(common_params & params) :
pimpl(new impl{}) {
auto mparams = common_model_params_to_llama(params);
auto cparams = common_context_params_to_llama(params);
if (params.fit_params) {
LOG_INF("%s: fitting params to device memory, to report bugs during this step use -fit off (or --verbose if you can't)\n", __func__);
llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
}
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
if (model == NULL) {
LOG_ERR("%s: failed to load model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
__func__, params.model.path.c_str());
return iparams;
return;
}
common_init_sampler_from_model(model, params.sampling);
pimpl->model.reset(model);
const llama_vocab * vocab = llama_model_get_vocab(model);
auto cparams = common_context_params_to_llama(params);
// updates params.sampling
// TODO: fix naming
common_init_sampler_from_model(model, params.sampling);
if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
params.sampling.ignore_eos = false;
}
// initialize once
for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
if (llama_vocab_is_eog(vocab, i)) {
LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(vocab, i).c_str(), -INFINITY);
params.sampling.logit_bias_eog.push_back({i, -INFINITY});
}
}
if (params.sampling.ignore_eos) {
// add EOG biases to the active set of logit biases
params.sampling.logit_bias.insert(
params.sampling.logit_bias.end(),
params.sampling.logit_bias_eog.begin(), params.sampling.logit_bias_eog.end());
}
//if (params.sampling.penalty_last_n == -1) {
// LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
// params.sampling.penalty_last_n = llama_n_ctx(lctx);
//}
//if (params.sampling.dry_penalty_last_n == -1) {
// LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
// params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
//}
pimpl->samplers.resize(cparams.n_seq_max);
for (int i = 0; i < (int) cparams.n_seq_max; ++i) {
pimpl->samplers[i].reset(common_sampler_init(model, params.sampling));
}
llama_context * lctx = llama_init_from_model(model, cparams);
if (lctx == NULL) {
LOG_ERR("%s: failed to create context with model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
__func__, params.model.path.c_str());
llama_model_free(model);
return iparams;
LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
return;
}
pimpl->context.reset(lctx);
}
llama_model * common_init_result::model() {
return pimpl->model.get();
}
llama_context * common_init_result::context() {
return pimpl->context.get();
}
common_sampler * common_init_result::sampler(llama_seq_id seq_id) {
return pimpl->samplers[seq_id].get();
}
std::vector<llama_adapter_lora_ptr> & common_init_result::lora() {
return pimpl->lora;
}
void common_init_result::free_context() {
pimpl->context.reset();
}
common_init_result_ptr common_init_from_params(common_params & params) {
common_init_result_ptr res(new common_init_result(params));
llama_model * model = res->model();
if (model == NULL) {
LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
return res;
}
llama_context * lctx = res->context();
if (lctx == NULL) {
LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
return res;
}
const llama_vocab * vocab = llama_model_get_vocab(model);
if (params.ctx_shift && !llama_memory_can_shift(llama_get_memory(lctx))) {
LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
params.ctx_shift = false;
@@ -1101,10 +1204,7 @@ struct common_init_result common_init_from_params(common_params & params) {
const auto cvec = common_control_vector_load(params.control_vectors);
if (cvec.n_embd == -1) {
llama_free(lctx);
llama_model_free(model);
return iparams;
return res;
}
int err = llama_apply_adapter_cvec(
@@ -1115,10 +1215,7 @@ struct common_init_result common_init_from_params(common_params & params) {
params.control_vector_layer_start,
params.control_vector_layer_end);
if (err) {
llama_free(lctx);
llama_model_free(model);
return iparams;
return res;
}
}
@@ -1142,10 +1239,7 @@ struct common_init_result common_init_from_params(common_params & params) {
}
if (!ok) {
llama_free(lctx);
llama_model_free(model);
return iparams;
return res;
}
}
@@ -1155,9 +1249,7 @@ struct common_init_result common_init_from_params(common_params & params) {
lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
if (lora == nullptr) {
LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
llama_free(lctx);
llama_model_free(model);
return iparams;
return res;
}
char buf[1024];
@@ -1166,43 +1258,13 @@ struct common_init_result common_init_from_params(common_params & params) {
la.task_name = buf;
llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
la.prompt_prefix = buf;
iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
res->lora().emplace_back(std::move(lora)); // copy to list of loaded adapters
}
if (!params.lora_init_without_apply) {
common_set_adapter_lora(lctx, params.lora_adapters);
}
if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
params.sampling.ignore_eos = false;
}
// initialize once
for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
if (llama_vocab_is_eog(vocab, i)) {
LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
params.sampling.logit_bias_eog.push_back({i, -INFINITY});
}
}
if (params.sampling.ignore_eos) {
// add EOG biases to the active set of logit biases
params.sampling.logit_bias.insert(
params.sampling.logit_bias.end(),
params.sampling.logit_bias_eog.begin(), params.sampling.logit_bias_eog.end());
}
if (params.sampling.penalty_last_n == -1) {
LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
params.sampling.penalty_last_n = llama_n_ctx(lctx);
}
if (params.sampling.dry_penalty_last_n == -1) {
LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
}
if (params.warmup) {
LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
@@ -1241,12 +1303,11 @@ struct common_init_result common_init_from_params(common_params & params) {
llama_set_warmup(lctx, false);
}
iparams.model.reset(model);
iparams.context.reset(lctx);
return iparams;
return res;
}
common_init_result::~common_init_result() = default;
std::string get_model_endpoint() {
const char * model_endpoint_env = getenv("MODEL_ENDPOINT");
// We still respect the use of environment-variable "HF_ENDPOINT" for backward-compatibility.
@@ -1255,7 +1316,9 @@ std::string get_model_endpoint() {
std::string model_endpoint = "https://huggingface.co/";
if (endpoint_env) {
model_endpoint = endpoint_env;
if (model_endpoint.back() != '/') model_endpoint += '/';
if (model_endpoint.back() != '/') {
model_endpoint += '/';
}
}
return model_endpoint;
}

View File

@@ -82,7 +82,8 @@ int32_t cpu_get_num_math();
enum llama_example {
LLAMA_EXAMPLE_COMMON,
LLAMA_EXAMPLE_SPECULATIVE,
LLAMA_EXAMPLE_MAIN,
LLAMA_EXAMPLE_COMPLETION,
LLAMA_EXAMPLE_CLI,
LLAMA_EXAMPLE_EMBEDDING,
LLAMA_EXAMPLE_PERPLEXITY,
LLAMA_EXAMPLE_RETRIEVAL,
@@ -98,6 +99,7 @@ enum llama_example {
LLAMA_EXAMPLE_TTS,
LLAMA_EXAMPLE_DIFFUSION,
LLAMA_EXAMPLE_FINETUNE,
LLAMA_EXAMPLE_FIT_PARAMS,
LLAMA_EXAMPLE_COUNT,
};
@@ -194,7 +196,6 @@ struct common_params_sampling {
std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
std::vector<enum common_sampler_type> samplers = {
COMMON_SAMPLER_TYPE_PENALTIES,
COMMON_SAMPLER_TYPE_DRY,
@@ -215,6 +216,10 @@ struct common_params_sampling {
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens
bool has_logit_bias() const {
return !logit_bias.empty();
}
// print the parameters into a string
std::string print() const;
};
@@ -302,8 +307,8 @@ struct lr_opt {
struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);
struct common_params {
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 4096; // context size
int32_t n_predict = -1; // max. number of new tokens to predict, -1 == no limit
int32_t n_ctx = 0; // context size, 0 == context the model was trained with
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
@@ -324,9 +329,12 @@ struct common_params {
// offload params
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
bool fit_params = true; // whether to fit unset model/context parameters to free device memory
size_t fit_params_target = 1024 * 1024*1024; // margin per device in bytes for fitting parameters to free memory
int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
@@ -406,6 +414,7 @@ struct common_params {
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
bool cont_batching = true; // insert new sequences for decoding on-the-fly
bool no_perf = false; // disable performance metrics
bool show_timings = true; // show timing information on CLI
bool ctx_shift = false; // context shift on infinite text generation
bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
bool kv_unified = false; // enable unified KV cache
@@ -462,7 +471,7 @@ struct common_params {
std::string public_path = ""; // NOLINT
std::string api_prefix = ""; // NOLINT
std::string chat_template = ""; // NOLINT
bool use_jinja = false; // NOLINT
bool use_jinja = true; // NOLINT
bool enable_chat_template = true;
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
int reasoning_budget = -1;
@@ -482,9 +491,10 @@ struct common_params {
bool endpoint_metrics = false;
// router server configs
std::string models_dir = ""; // directory containing models for the router server
int models_max = 4; // maximum number of models to load simultaneously
bool models_autoload = true; // automatically load models when requested via the router server
std::string models_dir = ""; // directory containing models for the router server
std::string models_preset = ""; // directory containing model presets for the router server
int models_max = 4; // maximum number of models to load simultaneously
bool models_autoload = true; // automatically load models when requested via the router server
bool log_json = false;
@@ -666,15 +676,29 @@ bool tty_can_use_colors();
// Model utils
//
// note: defines object's lifetime
struct common_init_result {
llama_model_ptr model;
llama_context_ptr context;
struct common_sampler;
std::vector<llama_adapter_lora_ptr> lora;
// note: defines the model, context, samplers, etc. lifetimes
struct common_init_result {
common_init_result(common_params & params);
~common_init_result();
llama_model * model();
llama_context * context();
common_sampler * sampler(llama_seq_id seq_id);
std::vector<llama_adapter_lora_ptr> & lora();
void free_context();
private:
struct impl;
std::unique_ptr<impl> pimpl;
};
struct common_init_result common_init_from_params(common_params & params);
using common_init_result_ptr = std::unique_ptr<common_init_result>;
common_init_result_ptr common_init_from_params(common_params & params);
struct llama_model_params common_model_params_to_llama ( common_params & params);
struct llama_context_params common_context_params_to_llama(const common_params & params);

View File

@@ -305,8 +305,9 @@ static std::string format_literal(const std::string & literal) {
std::string gbnf_format_literal(const std::string & literal) { return format_literal(literal); }
class SchemaConverter {
class common_schema_converter {
private:
friend class common_schema_info;
friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
std::function<json(const std::string &)> _fetch_json;
bool _dotall;
@@ -729,7 +730,7 @@ private:
}
public:
SchemaConverter(
common_schema_converter(
const std::function<json(const std::string &)> & fetch_json,
bool dotall)
: _fetch_json(fetch_json), _dotall(dotall)
@@ -990,6 +991,134 @@ public:
}
};
// common_schema_info implementation (pimpl)
common_schema_info::common_schema_info()
: impl_(std::make_unique<common_schema_converter>(
[](const std::string &) { return json(); },
false)) {}
common_schema_info::~common_schema_info() = default;
common_schema_info::common_schema_info(common_schema_info &&) noexcept = default;
common_schema_info & common_schema_info::operator=(common_schema_info &&) noexcept = default;
void common_schema_info::resolve_refs(nlohmann::ordered_json & schema) {
impl_->resolve_refs(schema, "");
}
// Determines if a JSON schema can resolve to a string type through any path.
// Some models emit raw string values rather than JSON-encoded strings for string parameters.
// If any branch of the schema (via oneOf, anyOf, $ref, etc.) permits a string, this returns
// true, allowing callers to handle the value as a raw string for simplicity.
bool common_schema_info::resolves_to_string(const nlohmann::ordered_json & schema) {
std::unordered_set<std::string> visited_refs;
std::function<bool(const json &)> check = [&](const json & s) -> bool {
if (!s.is_object()) {
return false;
}
// Handle $ref
if (s.contains("$ref")) {
const std::string & ref = s["$ref"];
if (visited_refs.find(ref) != visited_refs.end()) {
// Circular reference, assume not a string to be safe
return false;
}
visited_refs.insert(ref);
auto it = impl_->_refs.find(ref);
if (it != impl_->_refs.end()) {
return check(it->second);
}
return false;
}
// Check type field
if (s.contains("type")) {
const json & schema_type = s["type"];
if (schema_type.is_string()) {
if (schema_type == "string") {
return true;
}
} else if (schema_type.is_array()) {
// Type can be an array like ["string", "null"]
for (const auto & t : schema_type) {
if (t == "string") {
return true;
}
}
}
}
// Check oneOf/anyOf - if any alternative can be a string
if (s.contains("oneOf")) {
for (const auto & alt : s["oneOf"]) {
if (check(alt)) {
return true;
}
}
}
if (s.contains("anyOf")) {
for (const auto & alt : s["anyOf"]) {
if (check(alt)) {
return true;
}
}
}
// Check allOf - all components must be compatible with string type
if (s.contains("allOf")) {
bool all_string = true;
for (const auto & component : s["allOf"]) {
if (!check(component)) {
all_string = false;
break;
}
}
if (all_string) {
return true;
}
}
// Check const - if the constant value is a string
if (s.contains("const")) {
if (s["const"].is_string()) {
return true;
}
}
// Check enum - if any enum value is a string
if (s.contains("enum")) {
for (const auto & val : s["enum"]) {
if (val.is_string()) {
return true;
}
}
}
// String-specific keywords imply string type
if (s.contains("pattern") || s.contains("minLength") || s.contains("maxLength")) {
return true;
}
// Check format - many formats imply string
if (s.contains("format")) {
const std::string & fmt = s["format"];
if (fmt == "date" || fmt == "time" || fmt == "date-time" ||
fmt == "uri" || fmt == "email" || fmt == "hostname" ||
fmt == "ipv4" || fmt == "ipv6" || fmt == "uuid" ||
fmt.find("uuid") == 0) {
return true;
}
}
return false;
};
return check(schema);
}
std::string json_schema_to_grammar(const json & schema, bool force_gbnf) {
#ifdef LLAMA_USE_LLGUIDANCE
if (!force_gbnf) {
@@ -1006,7 +1135,7 @@ std::string json_schema_to_grammar(const json & schema, bool force_gbnf) {
}
std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options) {
SchemaConverter converter([&](const std::string &) { return json(); }, options.dotall);
common_schema_converter converter([&](const std::string &) { return json(); }, options.dotall);
common_grammar_builder builder {
/* .add_rule = */ [&](const std::string & name, const std::string & rule) {
return converter._add_rule(name, rule);

View File

@@ -3,11 +3,31 @@
#include <nlohmann/json_fwd.hpp>
#include <functional>
#include <memory>
#include <string>
std::string json_schema_to_grammar(const nlohmann::ordered_json & schema,
bool force_gbnf = false);
class common_schema_converter;
// Probes a JSON schema to extract information about its structure and type constraints.
class common_schema_info {
std::unique_ptr<common_schema_converter> impl_;
public:
common_schema_info();
~common_schema_info();
common_schema_info(const common_schema_info &) = delete;
common_schema_info & operator=(const common_schema_info &) = delete;
common_schema_info(common_schema_info &&) noexcept;
common_schema_info & operator=(common_schema_info &&) noexcept;
void resolve_refs(nlohmann::ordered_json & schema);
bool resolves_to_string(const nlohmann::ordered_json & schema);
};
struct common_grammar_builder {
std::function<std::string(const std::string &, const std::string &)> add_rule;
std::function<std::string(const std::string &, const nlohmann::ordered_json &)> add_schema;

View File

@@ -420,6 +420,11 @@ void common_log_set_timestamps(struct common_log * log, bool timestamps) {
log->set_timestamps(timestamps);
}
void common_log_flush(struct common_log * log) {
log->pause();
log->resume();
}
static int common_get_verbosity(enum ggml_log_level level) {
switch (level) {
case GGML_LOG_LEVEL_DEBUG: return LOG_LEVEL_DEBUG;

View File

@@ -84,6 +84,7 @@ void common_log_set_file (struct common_log * log, const char * file); // n
void common_log_set_colors (struct common_log * log, log_colors colors); // not thread-safe
void common_log_set_prefix (struct common_log * log, bool prefix); // whether to output prefix to each log
void common_log_set_timestamps(struct common_log * log, bool timestamps); // whether to output timestamps in the prefix
void common_log_flush (struct common_log * log); // flush all pending log messages
// helper macros for logging
// use these to avoid computing log arguments if the verbosity of the log is higher than the threshold

View File

@@ -104,9 +104,10 @@ struct ring_buffer {
struct common_sampler {
common_params_sampling params;
struct llama_sampler * grmr;
struct llama_sampler * chain;
bool grammar;
ring_buffer<llama_token> prev;
std::vector<llama_token_data> cur;
@@ -116,7 +117,6 @@ struct common_sampler {
void reset() {
prev.clear();
llama_sampler_reset(grmr);
llama_sampler_reset(chain);
}
@@ -167,10 +167,15 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
lparams.no_perf = params.no_perf;
struct llama_sampler * grmr;
llama_sampler * chain = llama_sampler_chain_init(lparams);
bool grammar = false;
std::vector<llama_sampler *> samplers;
if (params.grammar.compare(0, 11, "%llguidance") == 0) {
#ifdef LLAMA_USE_LLGUIDANCE
grmr = llama_sampler_init_llg(vocab, "lark", params.grammar.c_str());
samplers.push_back(llama_sampler_init_llg(vocab, "lark", params.grammar.c_str()));
grammar = true;
#else
GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
#endif // LLAMA_USE_LLGUIDANCE
@@ -217,30 +222,23 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
trigger_patterns_c.push_back(regex.c_str());
}
grmr = params.grammar_lazy
? llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
trigger_patterns_c.data(), trigger_patterns_c.size(),
trigger_tokens.data(), trigger_tokens.size())
: llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
if (!grmr) {
return nullptr;
if (!params.grammar.empty()) {
if (params.grammar_lazy) {
samplers.push_back(
llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
trigger_patterns_c.data(), trigger_patterns_c.size(),
trigger_tokens.data(), trigger_tokens.size()));
} else {
samplers.push_back(llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root"));
}
grammar = true;
}
}
auto * result = new common_sampler {
/* .params = */ params,
/* .grmr = */ grmr,
/* .chain = */ llama_sampler_chain_init(lparams),
/* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
/* .cur = */ {},
/* .cur_p = */ {},
};
llama_sampler_chain_add(result->chain,
llama_sampler_init_logit_bias(
llama_vocab_n_tokens(vocab),
params.logit_bias.size(),
params.logit_bias.data()));
if (params.has_logit_bias()) {
samplers.push_back(llama_sampler_init_logit_bias(llama_vocab_n_tokens(vocab), params.logit_bias.size(), params.logit_bias.data()));
}
if (params.mirostat == 0) {
for (const auto & cnstr : params.samplers) {
@@ -253,58 +251,70 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
c_breakers.push_back(str.c_str());
}
llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
samplers.push_back(llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
}
break;
case COMMON_SAMPLER_TYPE_TOP_K:
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
samplers.push_back(llama_sampler_init_top_k (params.top_k));
break;
case COMMON_SAMPLER_TYPE_TOP_P:
llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
samplers.push_back(llama_sampler_init_top_p (params.top_p, params.min_keep));
break;
case COMMON_SAMPLER_TYPE_TOP_N_SIGMA:
llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
samplers.push_back(llama_sampler_init_top_n_sigma(params.top_n_sigma));
break;
case COMMON_SAMPLER_TYPE_MIN_P:
llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
samplers.push_back(llama_sampler_init_min_p (params.min_p, params.min_keep));
break;
case COMMON_SAMPLER_TYPE_XTC:
llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
samplers.push_back(llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
break;
case COMMON_SAMPLER_TYPE_TYPICAL_P:
llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
samplers.push_back(llama_sampler_init_typical (params.typ_p, params.min_keep));
break;
case COMMON_SAMPLER_TYPE_TEMPERATURE:
llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
samplers.push_back(llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
break;
case COMMON_SAMPLER_TYPE_INFILL:
llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab));
samplers.push_back(llama_sampler_init_infill (vocab));
break;
case COMMON_SAMPLER_TYPE_PENALTIES:
llama_sampler_chain_add(result->chain, llama_sampler_init_penalties (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
samplers.push_back(llama_sampler_init_penalties (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
break;
default:
GGML_ASSERT(false && "unknown sampler type");
}
}
llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
samplers.push_back(llama_sampler_init_dist(params.seed));
} else if (params.mirostat == 1) {
llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
samplers.push_back(llama_sampler_init_temp(params.temp));
samplers.push_back(llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
} else if (params.mirostat == 2) {
llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
samplers.push_back(llama_sampler_init_temp(params.temp));
samplers.push_back(llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
} else {
GGML_ASSERT(false && "unknown mirostat version");
}
for (auto * smpl : samplers) {
llama_sampler_chain_add(chain, smpl);
}
auto * result = new common_sampler {
/* .params = */ params,
/* .chain = */ chain,
/* .grammar = */ grammar,
/* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
/* .cur = */ {},
/* .cur_p = */ {},
};
return result;
}
void common_sampler_free(struct common_sampler * gsmpl) {
if (gsmpl) {
llama_sampler_free(gsmpl->grmr);
llama_sampler_free(gsmpl->chain);
delete gsmpl;
@@ -314,11 +324,24 @@ void common_sampler_free(struct common_sampler * gsmpl) {
void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
const auto tm = gsmpl->tm();
if (accept_grammar) {
llama_sampler_accept(gsmpl->grmr, token);
}
if (gsmpl->grammar) {
const int n_smpl = llama_sampler_chain_n(gsmpl->chain);
llama_sampler_accept(gsmpl->chain, token);
for (int i = 0; i < n_smpl; i++) {
auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
// the grammar sampler is always the first one
if (i == 0) {
if (accept_grammar) {
llama_sampler_accept(smpl, token);
}
} else {
llama_sampler_accept(smpl, token);
}
}
} else {
llama_sampler_accept(gsmpl->chain, token);
}
gsmpl->prev.push_back(token);
}
@@ -329,12 +352,12 @@ void common_sampler_reset(struct common_sampler * gsmpl) {
struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
return new common_sampler {
/* .params = */ gsmpl->params,
/* .grmr = */ llama_sampler_clone(gsmpl->grmr),
/* .chain = */ llama_sampler_clone(gsmpl->chain),
/* .prev = */ gsmpl->prev,
/* .cur = */ gsmpl->cur,
/* .cur_p = */ gsmpl->cur_p,
/* .params = */ gsmpl->params,
/* .chain = */ llama_sampler_clone(gsmpl->chain),
/* .grammar = */ gsmpl->grammar,
/* .prev = */ gsmpl->prev,
/* .cur = */ gsmpl->cur,
/* .cur_p = */ gsmpl->cur_p,
};
}
@@ -383,58 +406,33 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
}
}
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl) {
return gsmpl->chain;
}
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx) {
llama_synchronize(ctx);
// start measuring sampling time after the llama_context synchronization in order to not measure any ongoing async operations
const auto tm = gsmpl->tm();
gsmpl->set_logits(ctx, idx);
llama_token id = LLAMA_TOKEN_NULL;
auto & grmr = gsmpl->grmr;
auto & chain = gsmpl->chain;
auto & cur_p = gsmpl->cur_p; // initialized by set_logits
if (grammar_first) {
llama_sampler_apply(grmr, &cur_p);
}
gsmpl->set_logits(ctx, idx);
llama_sampler_apply(chain, &cur_p);
GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");
const llama_token id = cur_p.data[cur_p.selected].id;
id = cur_p.data[cur_p.selected].id;
if (grammar_first) {
return id;
}
// check if the sampled token fits the grammar
{
llama_token_data single_token_data = { id, 1.0f, 0.0f };
llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };
llama_sampler_apply(grmr, &single_token_data_array);
const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
if (is_valid) {
return id;
}
}
// resampling:
// if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
gsmpl->set_logits(ctx, idx);
llama_sampler_apply(grmr, &cur_p);
llama_sampler_apply(chain, &cur_p);
GGML_ASSERT(cur_p.selected != -1 && "no selected token during re-sampling - check your sampling configuration");
return cur_p.data[cur_p.selected].id;
return id;
}
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first) {
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft) {
GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1");
std::vector<llama_token> result;
@@ -442,7 +440,7 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
size_t i = 0;
for (; i < draft.size(); i++) {
const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]);
common_sampler_accept(gsmpl, id, true);
@@ -454,7 +452,7 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
}
if (i == draft.size()) {
const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]);
common_sampler_accept(gsmpl, id, true);
@@ -464,13 +462,13 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
return result;
}
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) {
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft) {
std::vector<int> idxs(draft.size() + 1);
for (size_t i = 0; i < idxs.size(); ++i) {
idxs[i] = i;
}
return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft, grammar_first);
return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft);
}
uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
@@ -515,7 +513,8 @@ std::string common_sampler_print(const struct common_sampler * gsmpl) {
for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
result += std::string("-> ") + llama_sampler_name(smpl) + " ";
result += std::string("-> ");
result += std::string(llama_sampler_name(smpl)) + " ";
}
return result;

View File

@@ -48,6 +48,8 @@ struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
// arguments can be nullptr to skip printing
void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl);
struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl);
// extended sampling implementation:
//
// - set logits
@@ -55,10 +57,7 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
// - check if the token fits the grammar (if any)
// - if not: resample by first applying the grammar constraints and then sampling again (slower path)
//
// if grammar_first is true, the grammar is applied before the samplers (slower)
// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
//
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx);
// generalized version of common_sampler_sample
//
@@ -76,10 +75,10 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
//
// returns at least 1 token, up to idxs.size()
//
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first = false);
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft);
// assume idxs == [ 0, 1, 2, ..., draft.size() ]
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first = false);
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft);
uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
@@ -107,3 +106,9 @@ std::vector<enum common_sampler_type> common_sampler_types_from_chars(const std:
llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab,
const char * grammar_kind, const char * grammar_data);
struct common_sampler_deleter {
void operator()(common_sampler * s) { common_sampler_free(s); }
};
typedef std::unique_ptr<common_sampler, common_sampler_deleter> common_sampler_ptr;
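For orientation, a minimal usage sketch of the reworked common sampling API: the grammar_first flag is gone from common_sampler_sample, and lifetime can be handled through the new common_sampler_ptr. The common_sampler_init, llama_* and common_token_to_piece calls below come from the existing common/ and llama.h headers and are assumptions for illustration, not part of this diff; common_sampler_sample_and_accept_n is used the same way when verifying a draft.

// Sketch only: assumes common_sampler_init(model, sparams) and the usual llama_* / common_* helpers.
#include <cstdio>
#include "common.h"
#include "sampling.h"

static void generate(llama_context * ctx, const llama_model * model,
                     const common_params_sampling & sparams, int n_predict) {
    common_sampler_ptr smpl(common_sampler_init(model, sparams)); // released via common_sampler_free()

    const llama_vocab * vocab = llama_model_get_vocab(model);

    for (int i = 0; i < n_predict; ++i) {
        // idx == -1: sample from the logits of the last output of the previous llama_decode()
        llama_token id = common_sampler_sample(smpl.get(), ctx, /*idx =*/ -1);
        if (llama_vocab_is_eog(vocab, id)) {
            break;
        }

        // accepted into the chain; the grammar sampler (first in the chain) advances as well
        common_sampler_accept(smpl.get(), id, /*accept_grammar =*/ true);

        printf("%s", common_token_to_piece(ctx, id).c_str());

        llama_batch batch = llama_batch_get_one(&id, 1);
        if (llama_decode(ctx, batch) != 0) {
            break;
        }
    }
}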

View File

@@ -313,6 +313,7 @@ extern "C" {
bool check_tensors; // validate model tensor data
bool use_extra_bufts; // use extra buffer types (used for weight repacking)
bool no_host; // bypass host buffer allowing extra buffers to be used
bool no_alloc; // only load metadata and simulate memory allocations
};
// NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
@@ -466,10 +467,24 @@ extern "C" {
// Frees all allocated memory
LLAMA_API void llama_free(struct llama_context * ctx);
// fits mparams and cparams to free device memory (assumes system memory is unlimited)
// returns true if the parameters could be successfully modified to fit device memory
// this function is NOT thread safe because it modifies the global llama logger state
LLAMA_API bool llama_params_fit(
const char * path_model,
struct llama_model_params * mparams,
struct llama_context_params * cparams,
float * tensor_split, // writable buffer for tensor split, needs at least llama_max_devices elements
struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
size_t margin, // margin of memory to leave per device in bytes
uint32_t n_ctx_min, // minimum context size to set when trying to reduce memory use
enum ggml_log_level log_level); // minimum log level to print during fitting, lower levels go to debug log
LLAMA_API int64_t llama_time_us(void);
LLAMA_API size_t llama_max_devices(void);
LLAMA_API size_t llama_max_parallel_sequences(void);
LLAMA_API size_t llama_max_tensor_buft_overrides(void);
LLAMA_API bool llama_supports_mmap (void);
LLAMA_API bool llama_supports_mlock (void);
@@ -1354,7 +1369,9 @@ extern "C" {
// Set callback for all future logging events.
// If this is not called, or NULL is supplied, everything is output on stderr.
LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);
// The logger state is global so these functions are NOT thread safe.
LLAMA_API void llama_log_get(ggml_log_callback * log_callback, void ** user_data);
LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);
//
// Performance utils
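Taken together with llama_max_devices() and the new llama_max_tensor_buft_overrides(), the llama_params_fit declaration above suggests a call pattern like the following before loading a model. This is a hedged sketch: the call shape is copied from the declaration, while the fitting semantics (and their apparent reliance on the new no_alloc metadata-only loading) are inferred from this diff rather than documented; llama_model_load_from_file and llama_init_from_model are the existing public loaders.

// Sketch only: sizes the writable buffers as required by the declaration above.
#include <vector>
#include "llama.h"

static bool load_with_fit(const char * path_model) {
    llama_model_params   mparams = llama_model_default_params();
    llama_context_params cparams = llama_context_default_params();

    // writable buffers that llama_params_fit may point mparams/cparams into - keep them alive
    std::vector<float> tensor_split(llama_max_devices(), 0.0f);
    std::vector<llama_model_tensor_buft_override> overrides(llama_max_tensor_buft_overrides());

    // leave ~1 GiB free per device and never shrink the context below 8192 tokens
    // note: modifies the global logger state (see llama_log_get/llama_log_set), so not thread safe
    const bool ok = llama_params_fit(path_model, &mparams, &cparams,
                                     tensor_split.data(), overrides.data(),
                                     /*margin    =*/ 1024ull * 1024 * 1024,
                                     /*n_ctx_min =*/ 8192,
                                     /*log_level =*/ GGML_LOG_LEVEL_INFO);
    if (!ok) {
        return false; // could not be made to fit in free device memory
    }

    llama_model   * model = llama_model_load_from_file(path_model, mparams);
    llama_context * lctx  = model ? llama_init_from_model(model, cparams) : nullptr;
    return lctx != nullptr;
}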

View File

File diff suppressed because it is too large

View File

@@ -3,6 +3,7 @@
#include "ggml.h" // ggml_op
#include <string>
#include <set>
//
// gguf constants (sync with gguf.py)
@@ -318,6 +319,7 @@ enum llm_tensor {
LLM_TENSOR_DENSE_3_OUT,
LLM_TENSOR_OUTPUT,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_OUTPUT_NORM_LFM2, // fix for wrong tensor name
LLM_TENSOR_ROPE_FREQS,
LLM_TENSOR_ROPE_FACTORS_LONG,
LLM_TENSOR_ROPE_FACTORS_SHORT,
@@ -529,6 +531,10 @@ struct LLM_TN_IMPL {
const int bid;
const int xid;
const std::set<llm_tensor> model_tensors;
LLM_TN_IMPL(llm_arch arch, llm_tensor tensor, const char * suffix, int bid, int xid);
std::string str() const;
operator std::string() const {
@@ -550,11 +556,11 @@ struct LLM_TN {
llm_arch arch;
LLM_TN_IMPL operator()(llm_tensor tensor, const char * suffix, int bid = -1, int xid = -1) const {
return { arch, tensor, suffix, bid, xid };
return LLM_TN_IMPL(arch, tensor, suffix, bid, xid);
}
LLM_TN_IMPL operator()(llm_tensor tensor, int bid = -1, int xid = -1) const {
return { arch, tensor, nullptr, bid, xid };
return LLM_TN_IMPL(arch, tensor, nullptr, bid, xid);
}
};

View File

@@ -695,6 +695,8 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
udata->seq_idx .resize(LLAMA_MAX_SEQ, -1);
udata->output .resize(n_tokens);
udata->seq_id_data.reserve(n_tokens);
seq_set_t seq_set_unq;
for (size_t i = 0; i < idxs.size(); ++i) {
@@ -716,11 +718,13 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
}
udata->n_seq_id[i] = batch.n_seq_id[idxs[i]];
udata->seq_id[i] = batch.seq_id[idxs[i]];
udata->output[i] = batch.logits[idxs[i]];
for (int s = 0; s < udata->n_seq_id[i]; ++s) {
seq_set_unq.set(udata->seq_id[i][s]);
const llama_seq_id seq_id = batch.seq_id[idxs[i]][s];
udata->seq_id_data.push_back(seq_id);
seq_set_unq.set(seq_id);
}
if (udata->output[i]) {
@@ -728,6 +732,12 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
}
}
llama_seq_id * seq_id_ptr = udata->seq_id_data.data();
for (size_t i = 0; i < idxs.size(); ++i) {
udata->seq_id[i] = seq_id_ptr;
seq_id_ptr += udata->n_seq_id[i];
}
for (uint32_t s = 0; s < n_seq_max; ++s) {
if (seq_set_unq.test(s)) {
udata->seq_idx[s] = udata->seq_id_unq.size();

View File

@@ -56,13 +56,15 @@ struct llama_ubatch {
std::vector<float> embd;
std::vector<llama_pos> pos;
std::vector<int32_t> n_seq_id;
std::vector<llama_seq_id *> seq_id;
std::vector<llama_seq_id *> seq_id; // these point into the seq_id_data below
std::vector<llama_seq_id> seq_id_unq;
std::vector<int32_t> seq_idx;
std::vector<int8_t> output;
std::vector<llama_seq_id> seq_id_data;
};
// the llama_ubatch pointers above point to this data if set. otherwise - points to non-owning data
// the llama_ubatch pointers above point to this data if set. otherwise - point to external non-owning data
std::shared_ptr<data_t> data;
};

View File

@@ -9,6 +9,7 @@
#include "llama-model.h"
#include <cinttypes>
#include <cmath>
#include <cstring>
#include <limits>
#include <stdexcept>
@@ -72,6 +73,43 @@ llama_context::llama_context(
cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
}
if (cparams.yarn_ext_factor != 0) {
static auto get_mscale = [](float scale, float mscale) {
return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f);
};
const float factor = 1.0f / cparams.rope_freq_scale;
// ref: https://github.com/huggingface/transformers/blob/6d00f6b0a5679c36510f203e4226e36f517c3032/src/transformers/modeling_rope_utils.py#L336-L348
if (hparams.rope_yarn_log_mul != 0.0f) {
// note: here we assume `mscale == 1.0f`
// TODO: start reading the actual value of mscale and handle the case where it is not 1.0f
float mscale = 1.0f;
const float mscale_all_dims = hparams.rope_yarn_log_mul;
// [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
// special-case DEEPSEEK v2:
// https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite-Chat/blob/main/config.json#L42-L43
if (model.arch == LLM_ARCH_DEEPSEEK2 && mscale_all_dims != 1.0f) {
mscale = mscale_all_dims;
}
cparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);
LLAMA_LOG_WARN("%s: setting new yarn_attn_factor = %.4f (mscale == %.1f, mscale_all_dim = %.1f)\n",
__func__, cparams.yarn_attn_factor, mscale, mscale_all_dims);
} else {
cparams.yarn_attn_factor = get_mscale(factor, 1.0f);
}
// when YARN is applied with yarn_ext_factor != 0.0f, we need to cancel this factor:
// https://github.com/ggml-org/llama.cpp/blob/a81a569577cc38b32558958b048228150be63eae/ggml/src/ggml-cpu/ops.cpp#L5541-L5544
//
// ref: https://github.com/ggml-org/llama.cpp/discussions/7416
// https://github.com/ggml-org/llama.cpp/pull/17945
cparams.yarn_attn_factor *= 1.0f / (1.0f + 0.1f * logf(factor));
}
cparams.yarn_attn_factor *= hparams.rope_attn_factor;
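Restated as a formula (a paraphrase of the block above, not an authoritative spec): the block runs only when yarn_ext_factor != 0, with s = 1 / rope_freq_scale and m_all = rope_yarn_log_mul as loaded by the hunks further down.

\[
  m(s, k) = \begin{cases} 1 & s \le 1 \\ 0.1\,k\,\ln s + 1 & s > 1 \end{cases}
\]
\[
  \text{yarn\_attn\_factor} =
  \begin{cases}
    \frac{m(s,\,k)}{m(s,\,m_{\text{all}})} \cdot \frac{1}{1 + 0.1 \ln s} \cdot \text{rope\_attn\_factor} & m_{\text{all}} \ne 0 \\
    m(s,\,1) \cdot \frac{1}{1 + 0.1 \ln s} \cdot \text{rope\_attn\_factor} & m_{\text{all}} = 0
  \end{cases}
\]

where k = 1 except for the DeepSeek-V2 special case, where k = m_all.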
if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
@@ -93,14 +131,6 @@ llama_context::llama_context(
// with causal attention, the batch size is limited by the context size
cparams.n_batch = cparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
// the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
// this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
// ref: https://github.com/ggerganov/llama.cpp/pull/5021
// TODO: this padding is not needed for the cache-less context so we should probably move it to llama_memory
if (cparams.n_batch < GGML_KQ_MASK_PAD) {
LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
cparams.n_batch = GGML_KQ_MASK_PAD;
}
cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
cparams.op_offload = params.op_offload;
@@ -228,6 +258,7 @@ llama_context::llama_context(
backend_buft.clear();
backend_ptrs.clear();
backend_buf_exp_size.clear();
for (auto & backend : backends) {
auto * buft = ggml_backend_get_default_buffer_type(backend.get());
@@ -244,6 +275,7 @@ llama_context::llama_context(
backend_buft.push_back(buft);
backend_ptrs.push_back(backend.get());
backend_buf_exp_size.push_back(0);
}
LLAMA_LOG_DEBUG("%s: backend_ptrs.size() = %zu\n", __func__, backend_ptrs.size());
@@ -359,7 +391,8 @@ llama_context::llama_context(
// reserve pp (prompt processing) graph first so that buffers are only allocated once
{
auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(),
model.hparams.no_alloc, model.hparams.no_alloc ? backend_buf_exp_size.data() : nullptr);
if (!gf) {
if (pipeline_parallel) {
LLAMA_LOG_WARN("%s: compute buffer allocation failed, retrying without pipeline parallelism\n", __func__);
@@ -377,7 +410,7 @@ llama_context::llama_context(
// reserve with tg (token generation) graph to get the number of splits and nodes
{
auto * gf = graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get());
auto * gf = graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get(), model.hparams.no_alloc);
if (!gf) {
throw std::runtime_error("failed to allocate compute tg buffers");
}
@@ -392,7 +425,7 @@ llama_context::llama_context(
//
// auto * gf = graph_reserve(n_tokens, 1, n_tokens, mctx.get());
//
auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(), model.hparams.no_alloc);
if (!gf) {
throw std::runtime_error("failed to allocate compute pp buffers");
}
@@ -401,11 +434,13 @@ llama_context::llama_context(
for (size_t i = 0; i < backend_ptrs.size(); ++i) {
ggml_backend_t backend = backend_ptrs[i];
ggml_backend_buffer_type_t buft = backend_buft[i];
size_t size = ggml_backend_sched_get_buffer_size(sched.get(), backend);
if (size > 1) {
if (!model.hparams.no_alloc) {
backend_buf_exp_size[i] = ggml_backend_sched_get_buffer_size(sched.get(), backend);
}
if (backend_buf_exp_size[i] > 1) {
LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
ggml_backend_buft_name(buft),
size / 1024.0 / 1024.0);
backend_buf_exp_size[i] / 1024.0 / 1024.0);
}
}
@@ -424,6 +459,23 @@ llama_context::llama_context(
}
llama_context::~llama_context() {
// FIXME this currently results in a use-after-free bug if the model is freed before the context
// if (!model.hparams.no_alloc) {
// for (size_t i = 0; i < backend_ptrs.size(); ++i) {
// ggml_backend_t backend = backend_ptrs[i];
// ggml_backend_buffer_type_t buft = backend_buft[i];
// const size_t size_exp = backend_buf_exp_size[i];
// const size_t size_act = ggml_backend_sched_get_buffer_size(sched.get(), backend);
// if (size_exp == size_act) {
// LLAMA_LOG_DEBUG("%s: %10s compute buffer size is %8.4f MiB, matches expectation of %8.4f MiB\n",
// __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
// } else {
// LLAMA_LOG_WARN("%s: %10s compute buffer size of %8.4f MiB, does not match expectation of %8.4f MiB\n",
// __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
// }
// }
// }
ggml_opt_free(opt_ctx);
}
@@ -1325,6 +1377,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
// This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark)
LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
#endif
synchronize();
buf_output = nullptr;
logits = nullptr;
embd = nullptr;
@@ -1396,7 +1449,8 @@ llm_graph_result * llama_context::get_gf_res_reserve() const {
return static_cast<llm_graph_result *>(gf_res_reserve.get());
}
ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only) {
ggml_cgraph * llama_context::graph_reserve(
uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only, size_t * sizes) {
LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs);
GGML_ASSERT(n_outputs >= 1);
@@ -1433,8 +1487,13 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
// initialize scheduler with the specified graph
if (split_only) {
ggml_backend_sched_split_graph(sched.get(), gf);
if (sizes) {
ggml_backend_sched_reserve_size(sched.get(), gf, sizes);
} else {
ggml_backend_sched_split_graph(sched.get(), gf);
}
} else if (!ggml_backend_sched_reserve(sched.get(), gf)) {
GGML_ASSERT(!sizes);
LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
return nullptr;
}
@@ -2056,15 +2115,26 @@ void llama_context::perf_reset() {
std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> llama_context::memory_breakdown() const {
std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> ret;
for (const auto & buft_size : model.memory_breakdown()) {
ret[buft_size.first].model += buft_size.second;
for (const auto & [buft, size] : model.memory_breakdown()) {
ret[buft].model += size;
}
for (const auto & buft_size : memory->memory_breakdown()) {
ret[buft_size.first].context += buft_size.second;
if (memory) {
for (const auto & [buft, size] : memory->memory_breakdown()) {
ret[buft].context += size;
}
}
for (const auto & backend_ptr : backends) {
ggml_backend_t backend = backend_ptr.get();
ret[ggml_backend_sched_get_buffer_type(sched.get(), backend)].compute += ggml_backend_sched_get_buffer_size(sched.get(), backend);
if (model.hparams.no_alloc) {
for (size_t i = 0; i < backends.size(); ++i) {
ggml_backend_t backend = backends[i].get();
ggml_backend_buffer_type_t buft = ggml_backend_sched_get_buffer_type(sched.get(), backend);
ret[buft].compute += backend_buf_exp_size[i];
}
} else {
for (const auto & backend_ptr : backends) {
ggml_backend_t backend = backend_ptr.get();
ggml_backend_buffer_type_t buft = ggml_backend_sched_get_buffer_type(sched.get(), backend);
ret[buft].compute += ggml_backend_sched_get_buffer_size(sched.get(), backend);
}
}
return ret;
}

View File

@@ -26,6 +26,10 @@ struct llama_memory_breakdown_data {
size_t model = 0; // memory allocated for the model
size_t context = 0; // memory allocated for the context
size_t compute = 0; // memory allocated for temporary compute buffers
size_t total() const {
return model + context + compute;
}
};
struct llama_context {
@@ -206,7 +210,8 @@ public:
ggml_status graph_compute(ggml_cgraph * gf, bool batched);
// reserve a graph with a dummy ubatch of the specified size
ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false);
ggml_cgraph * graph_reserve(
uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false, size_t * sizes = nullptr);
private:
llm_graph_params graph_params(
@@ -281,9 +286,10 @@ private:
std::vector<std::pair<ggml_backend_t, ggml_backend_set_n_threads_t>> set_n_threads_fns;
// buffer types used for the compute buffer of each backend
// pointers and buffer types used for the compute buffer of each backend
std::vector<ggml_backend_t> backend_ptrs;
std::vector<ggml_backend_buffer_type_t> backend_buft;
std::vector<size_t> backend_buf_exp_size; // expected buffer sizes
llm_graph_result_ptr gf_res_prev;
llm_graph_result_ptr gf_res_reserve;

View File

@@ -78,7 +78,7 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
for (int i = 0; i < n_tokens; ++i) {
const float pos = ubatch->pos[i];
attn_scale_data[i] = std::log(
std::floor((pos + 1.0f) / n_attn_temp_floor_scale) + 1.0
std::floor((pos + f_attn_temp_offset) / n_attn_temp_floor_scale) + 1.0
) * f_attn_temp_scale + 1.0;
}
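In other words (restating the loop above), with p_i the position of token i, the change generalizes the previously hard-coded +1 into the new f_attn_temp_offset hyperparameter:

\[
  \text{attn\_scale}_i = \log\!\left( \left\lfloor \frac{p_i + \text{f\_attn\_temp\_offset}}{\text{n\_attn\_temp\_floor\_scale}} \right\rfloor + 1 \right) \cdot \text{f\_attn\_temp\_scale} + 1
\]

The llama-model.cpp hunks below appear to set the offset to 1.0 for the Llama 4 path and to 0.0 for the paths that read LLM_KV_ATTENTION_TEMPERATURE_SCALE.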
@@ -254,6 +254,24 @@ void llm_graph_input_rs::set_input(const llama_ubatch * ubatch) {
}
}
bool llm_graph_input_rs::can_reuse(const llm_graph_params & params) {
const auto * mctx = static_cast<const llama_memory_recurrent_context *>(params.mctx);
this->mctx = mctx;
bool res = true;
res &= s_copy->ne[0] == mctx->get_n_rs();
res &= s_copy_main->ne[0] == params.ubatch.n_seqs;
res &= s_copy_extra->ne[0] == mctx->get_n_rs() - params.ubatch.n_seqs;
res &= head == mctx->get_head();
res &= rs_z == mctx->get_rs_z();
return res;
}
void llm_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) {
GGML_UNUSED(ubatch);
@@ -385,7 +403,7 @@ bool llm_graph_input_attn_kv::can_reuse(const llm_graph_params & params) {
//res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
res &= self_kq_mask->ne[0] == mctx->get_n_kv();
res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
res &= self_kq_mask->ne[1] == params.ubatch.n_tokens;
return res;
}
@@ -416,10 +434,10 @@ bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) {
//res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
res &= self_kq_mask->ne[0] == mctx->get_base()->get_n_kv();
res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
res &= self_kq_mask->ne[1] == params.ubatch.n_tokens;
res &= self_kq_mask_swa->ne[0] == mctx->get_swa()->get_n_kv();
res &= self_kq_mask_swa->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
res &= self_kq_mask_swa->ne[1] == params.ubatch.n_tokens;
return res;
}
@@ -452,7 +470,7 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
}
}
for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
for (int i = n_tokens; i < n_tokens; ++i) {
for (int j = 0; j < n_enc; ++j) {
data[h*(n_enc*n_tokens) + i*n_enc + j] = -INFINITY;
}
@@ -461,8 +479,46 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
}
void llm_graph_input_mem_hybrid::set_input(const llama_ubatch * ubatch) {
inp_attn->set_input(ubatch);
inp_rs->set_input(ubatch);
mctx->get_attn()->set_input_k_idxs(inp_attn->self_k_idxs, ubatch);
mctx->get_attn()->set_input_v_idxs(inp_attn->self_v_idxs, ubatch);
mctx->get_attn()->set_input_kq_mask(inp_attn->self_kq_mask, ubatch, cparams.causal_attn);
const int64_t n_rs = mctx->get_recr()->get_n_rs();
if (inp_rs->s_copy) {
GGML_ASSERT(ggml_backend_buffer_is_host(inp_rs->s_copy->buffer));
int32_t * data = (int32_t *) inp_rs->s_copy->data;
// assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n
for (uint32_t i = 0; i < n_rs; ++i) {
data[i] = mctx->get_recr()->s_copy(i);
}
}
}
bool llm_graph_input_mem_hybrid::can_reuse(const llm_graph_params & params) {
const auto * mctx = static_cast<const llama_memory_hybrid_context *>(params.mctx);
this->mctx = mctx;
bool res = true;
res &= inp_attn->self_k_idxs->ne[0] == params.ubatch.n_tokens;
//res &= inp_attn->self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
res &= inp_attn->self_kq_mask->ne[0] == mctx->get_attn()->get_n_kv();
res &= inp_attn->self_kq_mask->ne[1] == params.ubatch.n_tokens;
res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs();
res &= inp_rs->s_copy_main->ne[0] == params.ubatch.n_seqs;
res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs;
res &= inp_rs->head == mctx->get_recr()->get_head();
res &= inp_rs->rs_z == mctx->get_recr()->get_rs_z();
return res;
}
//
@@ -1097,8 +1153,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
cur = ggml_relu(ctx0, cur);
cur = ggml_sqr(ctx0, cur);
cb(cur, "ffn_moe_relu_sqr", il);
}
break;
} break;
default:
GGML_ABORT("fatal error");
}
@@ -1213,7 +1268,7 @@ ggml_tensor * llm_graph_context::build_inp_pos() const {
}
ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
auto inp = std::make_unique<llm_graph_input_attn_temp>(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
auto inp = std::make_unique<llm_graph_input_attn_temp>(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale, hparams.f_attn_temp_offset);
auto & cur = inp->attn_scale;
@@ -1480,13 +1535,13 @@ llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() con
auto inp = std::make_unique<llm_graph_input_attn_no_cache>(hparams, cparams);
// note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch
inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens, 1, 1);
ggml_set_input(inp->self_kq_mask);
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens, 1, 1);
ggml_set_input(inp->self_kq_mask_swa);
inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
@@ -1568,7 +1623,7 @@ static std::unique_ptr<llm_graph_input_attn_kv> build_attn_inp_kv_impl(
inp->self_k_idxs = mctx_cur->build_input_k_idxs(ctx0, ubatch);
inp->self_v_idxs = mctx_cur->build_input_v_idxs(ctx0, ubatch);
inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
ggml_set_input(inp->self_kq_mask);
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
@@ -1711,7 +1766,7 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
const int32_t n_enc = !cross->v_embd.empty() ? cross->n_enc : hparams.n_ctx_train;
inp->cross_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_enc, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
inp->cross_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_enc, n_tokens, 1, 1);
ggml_set_input(inp->cross_kq_mask);
inp->cross_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->cross_kq_mask, GGML_TYPE_F16) : inp->cross_kq_mask;
@@ -1777,7 +1832,7 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
inp->self_k_idxs = mctx_cur->get_base()->build_input_k_idxs(ctx0, ubatch);
inp->self_v_idxs = mctx_cur->get_base()->build_input_v_idxs(ctx0, ubatch);
inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
ggml_set_input(inp->self_kq_mask);
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
@@ -1791,7 +1846,7 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
inp->self_k_idxs_swa = mctx_cur->get_swa()->build_input_k_idxs(ctx0, ubatch);
inp->self_v_idxs_swa = mctx_cur->get_swa()->build_input_v_idxs(ctx0, ubatch);
inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
ggml_set_input(inp->self_kq_mask_swa);
inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
@@ -1851,6 +1906,9 @@ static std::unique_ptr<llm_graph_input_rs> build_rs_inp_impl(
inp->s_copy_main = ggml_view_1d(ctx0, inp->s_copy, n_seqs, 0);
inp->s_copy_extra = ggml_view_1d(ctx0, inp->s_copy, n_rs - n_seqs, n_seqs * inp->s_copy->nb[0]);
inp->head = mctx_cur->get_head();
inp->rs_z = mctx_cur->get_rs_z();
return inp;
}
@@ -1919,10 +1977,10 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_store(
llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
const auto * mctx_cur = static_cast<const llama_memory_hybrid_context *>(mctx);
auto inp_rs = build_rs_inp_impl(ctx0, ubatch, mctx_cur->get_recr());
auto inp_rs = build_rs_inp_impl (ctx0, ubatch, mctx_cur->get_recr());
auto inp_attn = build_attn_inp_kv_impl(ctx0, ubatch, hparams, cparams, mctx_cur->get_attn());
auto inp = std::make_unique<llm_graph_input_mem_hybrid>(std::move(inp_attn), std::move(inp_rs), mctx_cur);
auto inp = std::make_unique<llm_graph_input_mem_hybrid>(cparams, std::move(inp_attn), std::move(inp_rs), mctx_cur);
return (llm_graph_input_mem_hybrid *) res->add_input(std::move(inp));
}

View File

@@ -132,8 +132,8 @@ public:
// temperature tuning, used by llama4
class llm_graph_input_attn_temp : public llm_graph_input_i {
public:
llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
: n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale, float f_attn_temp_offset)
: n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale), f_attn_temp_offset(f_attn_temp_offset) {}
virtual ~llm_graph_input_attn_temp() = default;
void set_input(const llama_ubatch * ubatch) override;
@@ -142,6 +142,7 @@ public:
const uint32_t n_attn_temp_floor_scale;
const float f_attn_temp_scale;
const float f_attn_temp_offset;
};
class llm_graph_input_pos_bucket : public llm_graph_input_i {
@@ -224,6 +225,8 @@ public:
void set_input(const llama_ubatch * ubatch) override;
bool can_reuse(const llm_graph_params & params) override;
ggml_tensor * s_copy; // I32 [n_rs]
// views of s_copy, computed once per graph
@@ -232,6 +235,10 @@ public:
ggml_tensor * s_copy_extra; // I32 [n_rs - n_seqs]
const llama_memory_recurrent_context * mctx;
// used in view offsets, need to match for valid graph reuse
uint32_t head;
int32_t rs_z;
};
class llm_graph_input_cross_embd : public llm_graph_input_i {
@@ -364,22 +371,28 @@ public:
class llm_graph_input_mem_hybrid : public llm_graph_input_i {
public:
llm_graph_input_mem_hybrid(
const llama_cparams & cparams,
std::unique_ptr<llm_graph_input_attn_kv> inp_attn,
std::unique_ptr<llm_graph_input_rs> inp_rs,
const llama_memory_hybrid_context * mctx) :
std::unique_ptr<llm_graph_input_rs> inp_rs,
const llama_memory_hybrid_context * mctx) :
inp_attn(std::move(inp_attn)),
inp_rs(std::move(inp_rs)),
cparams(cparams),
mctx(mctx) { }
virtual ~llm_graph_input_mem_hybrid() = default;
void set_input(const llama_ubatch * ubatch) override;
bool can_reuse(const llm_graph_params & params) override;
std::unique_ptr<llm_graph_input_attn_kv> inp_attn;
std::unique_ptr<llm_graph_input_rs> inp_rs;
llm_graph_input_attn_kv * get_attn() const { return inp_attn.get(); }
llm_graph_input_rs * get_recr() const { return inp_rs.get(); }
const llama_cparams cparams;
const llama_memory_hybrid_context * mctx;
};

View File

@@ -1,6 +1,8 @@
#include "llama-hparams.h"
#include "ggml.h"
#include <algorithm>
#include <cassert>
void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) {
@@ -237,3 +239,7 @@ bool llama_hparams::is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama
return false;
}
bool llama_hparams::use_mrope() const {
return rope_sections[0] > 0 && rope_sections[1] > 0;
}

View File

@@ -34,6 +34,7 @@ struct llama_hparams_convnext {
struct llama_hparams {
bool vocab_only;
bool no_alloc;
bool rope_finetuned;
bool use_par_res;
bool swin_norm;
@@ -109,6 +110,7 @@ struct llama_hparams {
float rope_freq_base_train_swa;
float rope_freq_scale_train;
float rope_freq_scale_train_swa;
uint32_t n_ctx_orig_yarn;
float rope_yarn_log_mul = 0.0f;
@@ -166,6 +168,7 @@ struct llama_hparams {
uint32_t n_no_rope_layer_step = 4;
uint32_t n_attn_temp_floor_scale = 0;
float f_attn_temp_scale = 0.0f;
float f_attn_temp_offset = 0.0f; // offset position index
// gemma3n altup
uint32_t n_altup = 4; // altup_num_inputs
@@ -272,7 +275,8 @@ struct llama_hparams {
// TODO: think of a better place for this function
// TODO: pack the SWA params in a struct?
static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1);
bool use_mrope() const;
};
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");

View File

@@ -25,6 +25,10 @@ time_meas::~time_meas() {
}
}
void llama_log_get(ggml_log_callback * log_callback, void ** user_data) {
ggml_log_get(log_callback, user_data);
}
void llama_log_set(ggml_log_callback log_callback, void * user_data) {
ggml_log_set(log_callback, user_data);
g_logger_state.log_callback = log_callback ? log_callback : llama_log_callback_default;

View File

@@ -175,7 +175,15 @@ llama_kv_cache::llama_kv_cache(
// allocate tensors and initialize the buffers to avoid NaNs in the padding
for (auto & [buft, ctx] : ctx_map) {
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft);
ggml_backend_buffer_t buf;
if (model.hparams.no_alloc) {
buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); // dummy buffer
for (ggml_tensor * t = ggml_get_first_tensor(ctx.get()); t != nullptr; t = ggml_get_next_tensor(ctx.get(), t)) {
t->buffer = buf; // set dummy buffer for KV cache so that the backend scheduler won't try to allocate it
}
} else {
buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft); // real buffer
}
if (!buf) {
throw std::runtime_error("failed to allocate buffer for kv cache");
}
@@ -482,9 +490,18 @@ llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) const {
std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache::memory_breakdown() const {
std::map<ggml_backend_buffer_type_t, size_t> ret;
for (const auto & [_, buf] : ctxs_bufs) {
ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
for (const auto & [ctx, buf] : ctxs_bufs) {
ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(buf.get());
if (hparams.no_alloc) {
GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) == nullptr);
ret[buft] += ggml_backend_alloc_ctx_tensors_from_buft_size(ctx.get(), buft);
} else {
// GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) != nullptr); // multi_buffer does not have a defined base
ret[buft] += ggml_backend_buffer_get_size(buf.get());
}
}
return ret;
}
@@ -1232,8 +1249,7 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u
GGML_ASSERT(n_tokens%n_stream == 0);
// n_tps == n_tokens_per_stream
const int64_t n_tps = n_tokens/n_stream;
const int64_t n_tps_pad = GGML_PAD(n_tps, GGML_KQ_MASK_PAD);
const int64_t n_tps = n_tokens/n_stream;
std::fill(data, data + ggml_nelements(dst), -INFINITY);
@@ -1266,7 +1282,7 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u
const llama_pos p1_x = is_2d ? ubatch->pos[i + ubatch->n_tokens*2] : 0;
const llama_pos p1_y = is_2d ? ubatch->pos[i + ubatch->n_tokens] : 0;
const uint64_t idst = n_kv*(h*n_stream*n_tps_pad + s*n_tps_pad + ii);
const uint64_t idst = n_kv*(h*n_stream*n_tps + s*n_tps + ii);
for (uint32_t j = 0; j < n_kv; ++j) {
if (cells.is_empty(j)) {
@@ -1370,9 +1386,10 @@ ggml_tensor * llama_kv_cache::build_rope_shift(
float freq_scale) const {
const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;
const auto & yarn_ext_factor = cparams.yarn_ext_factor;
const auto & yarn_beta_fast = cparams.yarn_beta_fast;
const auto & yarn_beta_slow = cparams.yarn_beta_slow;
const auto & yarn_ext_factor = cparams.yarn_ext_factor;
const auto & yarn_beta_fast = cparams.yarn_beta_fast;
const auto & yarn_beta_slow = cparams.yarn_beta_slow;
const auto & yarn_attn_factor = cparams.yarn_attn_factor;
const auto & n_rot = hparams.n_rot;
const auto & rope_type = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE || hparams.rope_type == LLAMA_ROPE_TYPE_IMROPE
@@ -1383,12 +1400,6 @@ ggml_tensor * llama_kv_cache::build_rope_shift(
? LLAMA_ROPE_TYPE_NEOX
: hparams.rope_type;
// See llm_build_deepseek2() for why attn_factor has to be scaled for YaRN RoPE to work correctly.
// See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2
? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale))
: cparams.yarn_attn_factor;
ggml_tensor * tmp;
if (ggml_is_quantized(cur->type)) {
@@ -1550,9 +1561,11 @@ void llama_kv_cache::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama
const uint32_t strm = seq_id == -1 ? s : seq_to_stream[seq_id];
slot_info sinfo;
bool res = true;
res = res && state_read_meta(io, strm, cell_count, seq_id);
res = res && state_read_data(io, strm, cell_count);
res = res && state_read_meta(io, strm, cell_count, sinfo, seq_id);
res = res && state_read_data(io, strm, cell_count, sinfo);
if (!res) {
if (seq_id == -1) {
@@ -1691,7 +1704,7 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t
}
}
bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, llama_seq_id dest_seq_id) {
bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, slot_info & sinfo, llama_seq_id dest_seq_id) {
auto & cells = v_cells[strm];
auto & head = v_heads[strm];
@@ -1728,7 +1741,7 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32
ubatch.seq_id[i] = &dest_seq_id;
}
const auto sinfo = find_slot(ubatch, true);
sinfo = find_slot(ubatch, false);
if (sinfo.empty()) {
LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
return false;
@@ -1738,20 +1751,16 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32
// see: https://github.com/ggml-org/llama.cpp/pull/16825#issuecomment-3460868350
apply_ubatch(sinfo, ubatch);
const auto head_cur = sinfo.head();
LLAMA_LOG_DEBUG("%s: cell_count = %d, dest_seq_id = %d\n", __func__, cell_count, dest_seq_id);
// keep the head at the old position because we will read the KV data into it in state_read_data()
head = head_cur;
LLAMA_LOG_DEBUG("%s: head_cur = %d, head = %d, cell_count = %d, dest_seq_id = %d\n", __func__, head_cur, head, cell_count, dest_seq_id);
// DEBUG CHECK: head_cur should be our first cell, head_cur + cell_count - 1 should be our last cell (verify seq_id and pos values)
// Assume that this is one contiguous block of cells
GGML_ASSERT(head_cur + cell_count <= cells.size());
GGML_ASSERT(cells.pos_get(head_cur) == ubatch.pos[0]);
GGML_ASSERT(cells.pos_get(head_cur + cell_count - 1) == ubatch.pos[cell_count - 1]);
GGML_ASSERT(cells.seq_has(head_cur, dest_seq_id));
GGML_ASSERT(cells.seq_has(head_cur + cell_count - 1, dest_seq_id));
// DEBUG CHECK: verify that all cells were allocated and have correct seq_id and pos values
GGML_ASSERT(sinfo.n_stream() == 1);
GGML_ASSERT(sinfo.idxs[0].size() == cell_count);
for (uint32_t i = 0; i < cell_count; ++i) {
const uint32_t idx = sinfo.idxs[0][i];
GGML_ASSERT(cells.pos_get(idx) == ubatch.pos[i]);
GGML_ASSERT(cells.seq_has(idx, dest_seq_id));
}
} else {
// whole KV cache restore
@@ -1784,15 +1793,24 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32
}
}
// Create contiguous slot_info for whole cache restore
sinfo.s0 = strm;
sinfo.s1 = strm;
sinfo.resize(1);
sinfo.strm[0] = strm;
sinfo.idxs[0].resize(cell_count);
for (uint32_t i = 0; i < cell_count; ++i) {
sinfo.idxs[0][i] = i;
}
head = 0;
}
return true;
}
bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count) {
bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, const slot_info & sinfo) {
auto & cells = v_cells[strm];
auto & head = v_heads[strm];
uint32_t v_trans;
uint32_t n_layer;
@@ -1842,8 +1860,17 @@ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32
}
if (cell_count) {
// Read and set the keys for the whole cell range
ggml_backend_tensor_set(k, io.read(cell_count * k_size_row), head * k_size_row, cell_count * k_size_row);
if (sinfo.is_contiguous()) {
// Fast path: contiguous cells, single memcpy
ggml_backend_tensor_set(k, io.read(cell_count * k_size_row), sinfo.head() * k_size_row, cell_count * k_size_row);
} else {
// Slow path: scatter to non-contiguous positions
const void * src = io.read(cell_count * k_size_row);
for (uint32_t i = 0; i < cell_count; ++i) {
const size_t dst_offset = sinfo.idxs[0][i] * k_size_row;
ggml_backend_tensor_set(k, (const char*)src + i * k_size_row, dst_offset, k_size_row);
}
}
}
}
@@ -1874,8 +1901,17 @@ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32
}
if (cell_count) {
// Read and set the values for the whole cell range
ggml_backend_tensor_set(v, io.read(cell_count * v_size_row), head * v_size_row, cell_count * v_size_row);
if (sinfo.is_contiguous()) {
// Fast path: contiguous cells, single memcpy
ggml_backend_tensor_set(v, io.read(cell_count * v_size_row), sinfo.head() * v_size_row, cell_count * v_size_row);
} else {
// Slow path: scatter to non-contiguous positions
const void * src = io.read(cell_count * v_size_row);
for (uint32_t i = 0; i < cell_count; ++i) {
const size_t dst_offset = sinfo.idxs[0][i] * v_size_row;
ggml_backend_tensor_set(v, (const char*)src + i * v_size_row, dst_offset, v_size_row);
}
}
}
}
} else {
@@ -1914,10 +1950,22 @@ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32
}
if (cell_count) {
// For each row in the transposed matrix, read the values for the whole cell range
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
const size_t dst_offset = (head + j * cells.size()) * v_size_el;
ggml_backend_tensor_set(v, io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el);
if (sinfo.is_contiguous()) {
// Fast path: contiguous cells
const uint32_t h = sinfo.head();
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
const size_t dst_offset = (h + j * cells.size()) * v_size_el;
ggml_backend_tensor_set(v, io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el);
}
} else {
// Slow path: scatter to non-contiguous positions
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
const void * src = io.read(cell_count * v_size_el);
for (uint32_t i = 0; i < cell_count; ++i) {
const size_t dst_offset = (sinfo.idxs[0][i] + j * cells.size()) * v_size_el;
ggml_backend_tensor_set(v, (const char*)src + i * v_size_el, dst_offset, v_size_el);
}
}
}
}
}

View File

@@ -72,6 +72,23 @@ public:
void clear() {
idxs.clear();
}
// check if indices are contiguous starting from head()
bool is_contiguous() const {
if (idxs.empty() || idxs[0].empty()) {
return true;
}
if (idxs.size() > 1) {
return false;
}
const uint32_t h = idxs[0][0];
for (size_t i = 0; i < idxs[0].size(); ++i) {
if (idxs[0][i] != h + i) {
return false;
}
}
return true;
}
};
using slot_info_vec_t = std::vector<slot_info>;
@@ -264,8 +281,8 @@ private:
void state_write_meta(llama_io_write_i & io, const cell_ranges_t & cr, llama_seq_id seq_id = -1) const;
void state_write_data(llama_io_write_i & io, const cell_ranges_t & cr) const;
bool state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, llama_seq_id dest_seq_id = -1);
bool state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count);
bool state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, slot_info & sinfo, llama_seq_id dest_seq_id = -1);
bool state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, const slot_info & sinfo);
};
class llama_kv_cache_context : public llama_memory_context_i {

View File

@@ -222,7 +222,7 @@ llama_memory_hybrid_context::llama_memory_hybrid_context(
ubatches(std::move(ubatches)),
// note: here we copy the ubatches. not sure if this is ideal
ctx_attn(new llama_kv_cache_context(mem->get_mem_attn(), std::move(sinfos_attn), this->ubatches)),
ctx_recr(new llama_memory_recurrent_context(mem->get_mem_recr(), this->ubatches)),
ctx_recr(new llama_memory_recurrent_context(mem->get_mem_recr(), this->ubatches)),
status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
}

View File

@@ -473,6 +473,7 @@ llama_model_loader::llama_model_loader(
std::vector<std::string> & splits,
bool use_mmap,
bool check_tensors,
bool no_alloc,
const llama_model_kv_override * param_overrides_p,
const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) {
int trace = 0;
@@ -716,6 +717,7 @@ llama_model_loader::llama_model_loader(
this->use_mmap = use_mmap;
this->check_tensors = check_tensors;
this->no_alloc = no_alloc;
}
std::string llama_model_loader::get_arch_name() const {

View File

@@ -71,6 +71,7 @@ struct llama_model_loader {
bool use_mmap = false;
bool check_tensors;
bool no_alloc;
llama_files files;
llama_ftype ftype;
@@ -97,6 +98,7 @@ struct llama_model_loader {
std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
bool use_mmap,
bool check_tensors,
bool no_alloc,
const llama_model_kv_override * param_overrides_p,
const llama_model_tensor_buft_override * param_tensor_buft_overrides_p);

View File

@@ -669,6 +669,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.n_swa = 8192;
hparams.n_attn_temp_floor_scale = 8192;
hparams.f_attn_temp_scale = 0.1f;
hparams.f_attn_temp_offset = 1.0f;
hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
}
@@ -1636,12 +1637,19 @@ void llama_model::load_hparams(llama_model_loader & ml) {
// that have no expert_gating_func model parameter set
hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
}
ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);
if (ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f)) {
// [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
// cancel the factor from the convert script
hparams.rope_yarn_log_mul /= 0.1f;
}
// (optional) temperature tuning - used by mistral-large
ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.n_attn_temp_floor_scale, false);
hparams.f_attn_temp_offset = 0.0f;
switch (hparams.n_layer) {
case 27: type = LLM_TYPE_16B; break;
case 60: type = LLM_TYPE_236B; break;
@@ -1681,7 +1689,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
} break;
case LLM_ARCH_GLM4:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
switch (hparams.n_layer) {
case 40: type = LLM_TYPE_9B; break;
case 61: type = LLM_TYPE_32B; break;
@@ -1690,8 +1699,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
} break;
case LLM_ARCH_GLM4_MOE:
{
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
// MoE parameters
ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert);
@@ -2282,7 +2292,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
}
switch (hparams.n_layer) {
case 80: type = LLM_TYPE_80B_A3B; break;
case 48: type = LLM_TYPE_80B_A3B; break;
default: type = LLM_TYPE_UNKNOWN;
}
} break;
@@ -2291,9 +2301,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false);
ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);
ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);
ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false);
ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);
ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f);
hparams.f_attn_temp_offset = 0.0f;
// TODO: maybe add n_attn_temp_floor_scale as a separate KV?
if (hparams.f_attn_temp_scale != 0.0f) {
@@ -2303,18 +2315,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
}
}
// TODO: this seems to be correct with the case of mscale == mscale_all_dims == 1.0f
// but may need further verification with other values
if (hparams.rope_yarn_log_mul != 0.0f) {
float factor = 1.0f / hparams.rope_freq_scale_train;
float mscale = 1.0f;
float mscale_all_dims = hparams.rope_yarn_log_mul;
static auto get_mscale = [](float scale, float mscale) {
return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f);
};
hparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);
}
switch (hparams.n_layer) {
case 26: type = LLM_TYPE_3B; break;
case 34: type = LLM_TYPE_8B; break;
@@ -3414,9 +3414,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
// optional bias tensors
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
@@ -6678,9 +6678,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
std::vector<ggml_backend_buffer_ptr> bufs;
if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
GGML_ASSERT(!ml.no_alloc);
for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
// only the mmap region containing the tensors in the model is mapped to the backend buffer
// this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
// this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer,
// then we could just use metal for all layers
// this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
void * addr = nullptr;
size_t first, last; // NOLINT
@@ -6696,9 +6698,16 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
bufs.emplace_back(buf);
buf_map.emplace(idx, buf);
}
}
else {
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
} else {
ggml_backend_buffer_t buf;
if (ml.no_alloc) {
buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); // dummy buffer
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
t->buffer = buf; // set dummy buffer for weights so that the backend scheduler won't try to allocate them
}
} else {
buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); // real buffer
}
if (buf == nullptr) {
throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
}
@@ -6753,6 +6762,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
}
}
if (ml.no_alloc) {
return true;
}
// load tensor data
for (auto & [ctx, buf_map] : ctx_buf_maps) {
if (!ml.load_all_data(ctx, buf_map, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
@@ -6795,9 +6808,18 @@ size_t llama_model::n_devices() const {
std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
std::map<ggml_backend_buffer_type_t, size_t> ret;
for (const auto & [_, bufs] : pimpl->ctxs_bufs) {
for (const auto & buf : bufs) {
ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
for (const auto & [ctx, bufs] : pimpl->ctxs_bufs) {
if (hparams.no_alloc) {
GGML_ASSERT(bufs.size() == 1);
ggml_backend_buffer_t buf = bufs[0].get();
GGML_ASSERT(ggml_backend_buffer_get_base(buf) == nullptr);
ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(buf);
ret[buft] += ggml_backend_alloc_ctx_tensors_from_buft_size(ctx.get(), buft);
} else {
for (const auto & buf : bufs) {
// GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) != nullptr); // multi_buffer does not have a defined base
ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
}
}
}
return ret;
@@ -6842,6 +6864,7 @@ void llama_model::print_info() const {
// hparams
LLAMA_LOG_INFO("%s: arch = %s\n", __func__, arch_name().c_str());
LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only);
LLAMA_LOG_INFO("%s: no_alloc = %d\n", __func__, hparams.no_alloc);
if (!hparams.vocab_only) {
LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
@@ -6876,6 +6899,7 @@ void llama_model::print_info() const {
LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
LLAMA_LOG_INFO("%s: rope_yarn_log_mul= %.4f\n", __func__, hparams.rope_yarn_log_mul);
LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
// MRoPE (Multi-axis Rotary Position Embedding) sections
if (const auto & s = hparams.rope_sections; s[0] || s[1] || s[2] || s[3]) {
@@ -6940,7 +6964,6 @@ void llama_model::print_info() const {
LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
}
if (arch == LLM_ARCH_QWEN2MOE) {
@@ -7697,6 +7720,7 @@ llama_model_params llama_model_default_params() {
/*.check_tensors =*/ false,
/*.use_extra_bufts =*/ true,
/*.no_host =*/ false,
/*.no_alloc =*/ false,
};
return result;
@@ -7817,7 +7841,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_DEEPSEEK2:
case LLM_ARCH_PLM:
case LLM_ARCH_CHATGLM:
case LLM_ARCH_GLM4:
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_GRANITE_HYBRID:
@@ -7880,7 +7903,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_LFM2:
case LLM_ARCH_LFM2MOE:
case LLM_ARCH_SMALLTHINKER:
case LLM_ARCH_GLM4_MOE:
case LLM_ARCH_SEED_OSS:
case LLM_ARCH_GROVEMOE:
case LLM_ARCH_APERTUS:
@@ -7897,6 +7919,11 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_QWEN3VLMOE:
return LLAMA_ROPE_TYPE_IMROPE;
case LLM_ARCH_GLM4:
return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NORM;
case LLM_ARCH_GLM4_MOE:
return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NEOX;
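// GLM4 / GLM4_MOE no longer sit in the fixed arch groups above: their RoPE type is resolved
// at load time from hparams.use_mrope(), i.e. from how the GGUF was converted (see the GLM4
// and GLM4_MOE graph changes later in this diff)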
// all model arches should be listed explicitly here
case LLM_ARCH_UNKNOWN:
GGML_ABORT("unknown architecture");


@@ -596,7 +596,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
}
std::vector<std::string> splits = {};
llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides, nullptr);
llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
ml.init_mappings(false); // no prefetching
llama_model model(llama_model_default_params());


@@ -1884,7 +1884,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
clean_spaces = false;
} else if (
tokenizer_pre == "qwen2" ||
tokenizer_pre == "deepseek-r1-qwen") {
tokenizer_pre == "deepseek-r1-qwen" ||
tokenizer_pre == "kormo") {
pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
clean_spaces = false;
} else if (


@@ -1,6 +1,9 @@
#include "llama.h"
#include "llama-impl.h"
#include "llama-chat.h"
#include "llama-context.h"
#include "llama-mmap.h"
#include "llama-vocab.h"
#include "llama-model-loader.h"
@@ -11,11 +14,14 @@
#include "ggml-backend.h"
#include <algorithm>
#include <cassert>
#include <cinttypes>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <stdexcept>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
@@ -37,6 +43,646 @@ const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_ty
GGML_ABORT("fatal error");
}
struct llama_device_memory_data {
int64_t total;
int64_t free;
llama_memory_breakdown_data mb;
};
static std::vector<llama_device_memory_data> llama_get_device_memory_data(
const char * path_model, const llama_model_params * mparams, const llama_context_params * cparams,
std::vector<ggml_backend_dev_t> & devs, uint32_t & hp_ngl, uint32_t & hp_n_ctx_train, uint32_t & hp_n_expert,
const ggml_log_level log_level) {
struct user_data_t {
struct {
ggml_log_callback callback;
void * user_data;
} original_logger;
ggml_log_level min_level; // prints below this log level go to debug log
};
user_data_t ud;
llama_log_get(&ud.original_logger.callback, &ud.original_logger.user_data);
ud.min_level = log_level;
llama_log_set([](ggml_log_level level, const char * text, void * user_data) {
const user_data_t * ud = (const user_data_t *) user_data;
const ggml_log_level level_eff = level >= ud->min_level ? level : GGML_LOG_LEVEL_DEBUG;
ud->original_logger.callback(level_eff, text, ud->original_logger.user_data);
}, &ud);
llama_model_params mparams_copy = *mparams;
mparams_copy.no_alloc = true;
mparams_copy.use_mmap = false;
llama_model * model = llama_model_load_from_file(path_model, mparams_copy);
if (model == nullptr) {
llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
throw std::runtime_error("failed to load model");
}
llama_context * ctx = llama_init_from_model(model, *cparams);
if (ctx == nullptr) {
llama_model_free(model);
llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
throw std::runtime_error("failed to create llama_context from model");
}
std::vector<llama_device_memory_data> ret(model->devices.size());
std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown = ctx->memory_breakdown();
for (const auto & [buft, mb] : memory_breakdown) {
if (ggml_backend_buft_is_host(buft)) {
continue;
}
ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
if (!dev) {
continue;
}
for (size_t i = 0; i < ret.size(); i++) {
if (model->devices[i] == dev) {
ret[i].mb.model += mb.model;
ret[i].mb.context += mb.context;
ret[i].mb.compute += mb.compute;
break;
}
}
}
for (size_t i = 0; i < ret.size(); i++) {
size_t free, total;
ggml_backend_dev_memory(model->devices[i], &free, &total);
ret[i].free = free;
ret[i].total = total;
}
devs = model->devices;
hp_ngl = model->hparams.n_layer;
hp_n_ctx_train = model->hparams.n_ctx_train;
hp_n_expert = model->hparams.n_expert;
llama_memory_breakdown_print(ctx); // goes to debug log
llama_free(ctx);
llama_model_free(model);
llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
return ret;
}
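// Hedged sketch (not part of the diff): the same kind of no_alloc dry run as the function
// above, reduced to the public API. The weight load only measures memory instead of
// committing it; logger handling and error reporting are left out for brevity.
static void print_projected_memory(const char * path_model) {
    llama_model_params mparams = llama_model_default_params();
    mparams.no_alloc = true;   // weights get dummy buffers, nothing is allocated on device
    mparams.use_mmap = false;  // no tensor data will be read into buffers anyway
    llama_model * model = llama_model_load_from_file(path_model, mparams);
    if (model == nullptr) {
        return;
    }
    llama_context * ctx = llama_init_from_model(model, llama_context_default_params());
    if (ctx != nullptr) {
        llama_memory_breakdown_print(ctx); // projected per-device model/context/compute sizes
        llama_free(ctx);
    }
    llama_model_free(model);
}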
// enum to identify part of a layer for distributing its tensors:
enum layer_fraction_t {
LAYER_FRACTION_NONE = 0, // nothing
LAYER_FRACTION_ATTN = 1, // attention
LAYER_FRACTION_UP = 2, // attention + up
LAYER_FRACTION_GATE = 3, // attention + up + gate
LAYER_FRACTION_MOE = 4, // everything but sparse MoE weights
};
// this enum is only used in llama_params_fit_impl but needs to be defined outside of it to fix a Windows compilation issue
static void llama_params_fit_impl(
const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
constexpr int64_t MiB = 1024*1024;
const int64_t margin = margin_s; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
typedef std::vector<llama_device_memory_data> dmds_t;
const llama_model_params default_mparams = llama_model_default_params();
std::vector<ggml_backend_dev_t> devs;
uint32_t hp_ngl = 0; // hparams.n_gpu_layers
uint32_t hp_nct = 0; // hparams.n_ctx_train
uint32_t hp_nex = 0; // hparams.n_expert
// step 1: get data for default parameters and check whether any changes are necessary in the first place
LLAMA_LOG_DEBUG("%s: getting device memory data for initial parameters:\n", __func__);
const dmds_t dmds_full = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
const size_t nd = devs.size(); // number of devices
if (nd == 0) {
LLAMA_LOG_INFO("%s: no devices with dedicated memory found\n", __func__);
return;
}
std::vector<std::string> dev_names;
{
dev_names.reserve(nd);
size_t max_length = 0;
for (ggml_backend_dev_t dev : devs) {
std::string name = ggml_backend_dev_name(dev);
name += " (";
name += ggml_backend_dev_description(dev);
name += ")";
dev_names.push_back(name);
max_length = std::max(max_length, name.length());
}
for (std::string & dn : dev_names) {
dn.insert(dn.end(), max_length - dn.length(), ' ');
}
}
int64_t sum_total = 0;
int64_t sum_projected_free = 0;
int64_t min_projected_free = INT64_MAX;
int64_t sum_projected_used = 0;
int64_t sum_projected_ctx = 0;
if (nd > 1) {
LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);
}
for (size_t id = 0; id < nd; id++) {
const llama_device_memory_data & dmd = dmds_full[id];
const int64_t projected_used = dmd.mb.total();
const int64_t projected_free = dmd.free - projected_used;
sum_total += dmd.total;
sum_projected_used += projected_used;
sum_projected_free += projected_free;
min_projected_free = std::min(min_projected_free, projected_free);
sum_projected_ctx += dmd.mb.context;
if (nd > 1) {
LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " %s\n",
__func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, std::abs(projected_free)/MiB,
projected_free >= 0 ? "surplus" : "deficit");
}
}
assert(sum_total >= 0 && sum_projected_used >= 0 && sum_projected_ctx >= 0);
assert(sum_projected_used >= sum_projected_ctx);
LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
__func__, sum_projected_used/MiB, sum_total/MiB);
if (min_projected_free >= margin) {
if (nd == 1) {
LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
__func__, min_projected_free/MiB, margin/MiB);
return;
}
LLAMA_LOG_INFO("%s: will leave at least %" PRId64 " >= %" PRId64 " MiB of free memory on all devices, no changes needed\n",
__func__, min_projected_free/MiB, margin/MiB);
return;
}
// step 2: try reducing memory use by reducing the context size
{
int64_t global_surplus = sum_projected_free - int64_t(nd)*margin;
if (global_surplus < 0) {
LLAMA_LOG_INFO(nd == 1 ?
"%s: cannot fulfill margin of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n" :
"%s: cannot fulfill margin of %" PRId64 " MiB on all devices, need to use %" PRId64 " MiB less in total\n",
__func__, margin/MiB, -global_surplus/MiB);
if (cparams->n_ctx == 0) {
if (hp_nct > n_ctx_min) {
const int64_t bytes_per_ctx = sum_projected_ctx / hp_nct;
const uint32_t ctx_reduction = std::min(
uint32_t((-global_surplus + bytes_per_ctx - 1) / bytes_per_ctx), hp_nct - n_ctx_min);
cparams->n_ctx = hp_nct - ctx_reduction;
const int64_t memory_reduction = ctx_reduction * bytes_per_ctx;
global_surplus += memory_reduction;
LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
__func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
if (global_surplus >= 0) {
if (nd == 1) {
LLAMA_LOG_INFO("%s: entire model can be fit by reducing context\n", __func__);
return;
}
LLAMA_LOG_INFO("%s: entire model should be fit across devices by reducing context\n", __func__);
}
} else {
LLAMA_LOG_INFO("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
__func__, hp_nct, n_ctx_min);
}
} else {
LLAMA_LOG_INFO("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx);
}
}
}
if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {
throw std::runtime_error("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
}
if (nd > 1) {
if (!tensor_split) {
throw std::runtime_error("did not provide a buffer to write the tensor_split to, abort");
}
if (mparams->tensor_split) {
for (size_t id = 0; id < nd; id++) {
if (mparams->tensor_split[id] != 0.0f) {
throw std::runtime_error("model_params::tensor_split already set by user, abort");
}
}
}
if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) {
throw std::runtime_error("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
}
if (hp_ngl < 2*nd) {
throw std::runtime_error("model has only " + std::to_string(hp_ngl) + " layers but need at least "
+ std::to_string(2*nd) + " to fit memory for " + std::to_string(nd) + " devices, abort");
}
}
if (!tensor_buft_overrides) {
throw std::runtime_error("did not provide buffer to set tensor_buft_overrides, abort");
}
if (mparams->tensor_buft_overrides && (mparams->tensor_buft_overrides->pattern || mparams->tensor_buft_overrides->buft)) {
throw std::runtime_error("model_params::tensor_buft_overrides already set by user, abort");
}
// step 3: iteratively fill the devices back to front with "dense" layers
// - for a dense model simply fill full layers, giving each device a contiguous slice of the model
// - for a MoE model, same as dense model but with all MoE tensors in system memory
// utility function that returns a static C string matching the tensors for a specific layer index and layer fraction:
auto get_overflow_pattern = [&](const size_t il, const layer_fraction_t lf) -> const char * {
constexpr size_t n_strings = 1000;
if (il >= n_strings) {
throw std::runtime_error("at most " + std::to_string(n_strings) + " model layers are supported");
}
switch (lf) {
case LAYER_FRACTION_ATTN: {
static std::array<std::string, n_strings> patterns;
if (patterns[il].empty()) {
patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(up|gate|down).*";
}
return patterns[il].c_str();
}
case LAYER_FRACTION_UP: {
static std::array<std::string, n_strings> patterns;
if (patterns[il].empty()) {
patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(gate|down).*";
}
return patterns[il].c_str();
}
case LAYER_FRACTION_GATE: {
static std::array<std::string, n_strings> patterns;
if (patterns[il].empty()) {
patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_down.*";
}
return patterns[il].c_str();
}
case LAYER_FRACTION_MOE: {
static std::array<std::string, n_strings> patterns;
if (patterns[il].empty()) {
patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(up|down|gate)_(ch|)exps";
}
return patterns[il].c_str();
}
default:
GGML_ABORT("fatal error");
}
};
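// for illustration (not part of the diff), the patterns produced for layer 12 are:
//   LAYER_FRACTION_ATTN -> "blk\.12\.ffn_(up|gate|down).*"          only attention stays on the device
//   LAYER_FRACTION_UP   -> "blk\.12\.ffn_(gate|down).*"             attention + ffn_up stay on the device
//   LAYER_FRACTION_GATE -> "blk\.12\.ffn_down.*"                    attention + up + gate stay on the device
//   LAYER_FRACTION_MOE  -> "blk\.12\.ffn_(up|down|gate)_(ch|)exps"  only the sparse expert weights overflow
// each pattern is later installed as a tensor buft override so the matched tensors land in the
// overflow buffer type instead of the device holding the layer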
struct ngl_t {
uint32_t n_layer = 0; // number of total layers
uint32_t n_part = 0; // number of partial layers, <= n_layer
// for the first partial layer, varying parts can overflow; all further layers use LAYER_FRACTION_MOE:
layer_fraction_t overflow_type = LAYER_FRACTION_MOE;
};
const size_t ntbo = llama_max_tensor_buft_overrides();
// utility function to set n_gpu_layers and tensor_split
auto set_ngl_tensor_split_tbo = [&](
const std::vector<ngl_t> & ngl_per_device,
const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
llama_model_params & mparams,
const bool add_nonrepeating) {
mparams.n_gpu_layers = 0;
for (size_t id = 0; id < nd; id++) {
mparams.n_gpu_layers += ngl_per_device[id].n_layer;
if (nd > 1) {
tensor_split[id] = ngl_per_device[id].n_layer;
}
}
assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl);
uint32_t il0 = hp_ngl - mparams.n_gpu_layers; // start index for tensor buft overrides
if (add_nonrepeating) {
mparams.n_gpu_layers += 1;
tensor_split[nd - 1] += 1;
}
mparams.tensor_split = tensor_split;
size_t itbo = 0;
for (size_t id = 0; id < nd; id++) {
il0 += ngl_per_device[id].n_layer - ngl_per_device[id].n_part;
for (uint32_t il = il0; il < il0 + ngl_per_device[id].n_part; il++) {
if (itbo + 1 >= ntbo) {
tensor_buft_overrides[itbo].pattern = nullptr;
tensor_buft_overrides[itbo].buft = nullptr;
itbo++;
mparams.tensor_buft_overrides = tensor_buft_overrides;
throw std::runtime_error("llama_params_fit_n_tensor_buft_overrides() == "
+ std::to_string(ntbo) + " is insufficient for model\n");
}
tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE);
tensor_buft_overrides[itbo].buft = overflow_bufts[id];
itbo++;
}
il0 += ngl_per_device[id].n_part;
}
tensor_buft_overrides[itbo].pattern = nullptr;
tensor_buft_overrides[itbo].buft = nullptr;
itbo++;
mparams.tensor_buft_overrides = tensor_buft_overrides;
};
// utility function that returns the memory use per device for given numbers of layers per device
auto get_memory_for_layers = [&](
const char * func_name,
const std::vector<ngl_t> & ngl_per_device,
const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
const bool add_nonrepeating) -> std::vector<int64_t> {
llama_model_params mparams_copy = *mparams;
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy, add_nonrepeating);
const dmds_t dmd_nl = llama_get_device_memory_data(
path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
LLAMA_LOG_DEBUG("%s: memory for test allocation by device:\n", func_name);
for (size_t id = 0; id < nd; id++) {
const ngl_t & n = ngl_per_device[id];
LLAMA_LOG_DEBUG(
"%s: id=%zu, n_layer=%2" PRIu32 ", n_part=%2" PRIu32 ", overflow_type=%d, mem=%6" PRId64 " MiB\n",
func_name, id, n.n_layer, n.n_part, int(n.overflow_type), dmd_nl[id].mb.total()/MiB);
}
std::vector<int64_t> ret;
ret.reserve(nd);
for (const llama_device_memory_data & dmd : dmd_nl) {
ret.push_back(dmd.mb.total());
}
return ret;
};
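// note: each get_memory_for_layers() call performs a full no_alloc dry run (model + context)
// through llama_get_device_memory_data(), so every probe below costs roughly one metadata-only
// model load; the false-position search is used to keep the number of probes small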
int64_t global_surplus_cpu_moe = 0;
if (hp_nex > 0) {
const static std::string pattern_moe_all = "blk\\.\\d+\\.ffn_(up|down|gate)_(ch|)exps"; // matches all MoE tensors
ggml_backend_buffer_type_t cpu_buft = ggml_backend_cpu_buffer_type();
tensor_buft_overrides[0] = {pattern_moe_all.c_str(), cpu_buft};
tensor_buft_overrides[1] = {nullptr, nullptr};
mparams->tensor_buft_overrides = tensor_buft_overrides;
LLAMA_LOG_DEBUG("%s: getting device memory data with all MoE tensors moved to system memory:\n", __func__);
const dmds_t dmds_cpu_moe = llama_get_device_memory_data(
path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
for (const llama_device_memory_data & dmd : dmds_cpu_moe) {
global_surplus_cpu_moe += dmd.free;
global_surplus_cpu_moe -= int64_t(dmd.mb.total()) + margin;
}
if (global_surplus_cpu_moe > 0) {
LLAMA_LOG_INFO("%s: with only dense weights in device memory there is a total surplus of %" PRId64 " MiB\n",
__func__, global_surplus_cpu_moe/MiB);
} else {
LLAMA_LOG_INFO("%s: with only dense weights in device memory there is still a total deficit of %" PRId64 " MiB\n",
__func__, -global_surplus_cpu_moe/MiB);
}
// reset
tensor_buft_overrides[0] = {nullptr, nullptr};
mparams->tensor_buft_overrides = tensor_buft_overrides;
}
std::vector<int64_t> targets; // maximum acceptable memory use per device
targets.reserve(nd);
for (size_t id = 0; id < nd; id++) {
targets.push_back(dmds_full[id].free - margin);
LLAMA_LOG_DEBUG("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
}
// whether for the optimal memory use we expect to load at least some MoE tensors:
const bool partial_moe = hp_nex > 0 && global_surplus_cpu_moe > 0;
std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the partial layers of a device overflow to:
overflow_bufts.reserve(nd);
for (size_t id = 0; id < nd - 1; ++id) {
overflow_bufts.push_back(ggml_backend_dev_buffer_type(devs[id + 1]));
}
overflow_bufts.push_back(ggml_backend_cpu_buffer_type());
std::vector<ngl_t> ngl_per_device(nd);
std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts, partial_moe);
if (hp_nex > 0) {
for (size_t id = 0; id < nd; id++) {
ngl_per_device[id].overflow_type = LAYER_FRACTION_MOE;
}
}
// optimize the number of layers per device using the method of false position:
// - ngl_per_device has 0 layers for each device, lower bound
// - try a "high" configuration where a device is given all unassigned layers
// - interpolate the memory use / layer between low and high linearly to get a guess where it meets our target
// - check memory use of our guess, replace either the low or high bound
// - once we only have a difference of a single layer, stop and return the lower bound that just barely still fits
if (hp_nex == 0) {
LLAMA_LOG_INFO("%s: filling dense layers back-to-front:\n", __func__);
} else {
LLAMA_LOG_INFO("%s: filling dense-only layers back-to-front:\n", __func__);
}
uint32_t n_unassigned = hp_ngl;
for (int id = nd - 1; id >= 0; id--) {
std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
ngl_per_device_high[id].n_layer = n_unassigned;
if (hp_nex > 0) {
ngl_per_device_high[id].n_part = ngl_per_device_high[id].n_layer;
}
if (ngl_per_device_high[id].n_layer > 0) {
std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts, partial_moe);
if (mem_high[id] > targets[id]) {
uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
while (delta > 1) {
uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
step_size = std::max(step_size, uint32_t(1));
step_size = std::min(step_size, delta - 1);
std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
ngl_per_device_test[id].n_layer += step_size;
if (hp_nex) {
ngl_per_device_test[id].n_part += step_size;
}
const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
if (mem_test[id] <= targets[id]) {
ngl_per_device = ngl_per_device_test;
mem = mem_test;
n_unassigned -= ngl_per_device[id].n_layer;
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
} else {
ngl_per_device_high = ngl_per_device_test;
mem_high = mem_test;
LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
}
delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
}
} else {
ngl_per_device = ngl_per_device_high;
n_unassigned -= ngl_per_device[id].n_layer;
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
}
}
const int64_t projected_margin = dmds_full[id].free - mem[id];
LLAMA_LOG_INFO(
"%s: - %s: %2" PRIu32 " layers, %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, mem[id]/MiB, projected_margin/MiB);
}
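// The loop above is the method of false position applied per device: keep a lower bound that
// is known to fit and an upper bound that does not, pick the next trial by linear interpolation
// of memory vs. layer count, and stop once the bounds are one layer apart. A standalone sketch
// of the same idea for a single device (not part of the diff; assumes <algorithm>, <cstdint>,
// <functional>, and that memory use grows monotonically with the layer count):
static uint32_t fit_layers_false_position(
        uint32_t n_max, int64_t target, const std::function<int64_t(uint32_t)> & mem_for) {
    uint32_t lo = 0;                     // largest count known to fit (0 may itself not fit)
    int64_t  mem_lo = mem_for(lo);
    uint32_t hi = n_max;                 // smallest count known (or assumed) not to fit
    int64_t  mem_hi = mem_for(hi);
    if (mem_hi <= target) {
        return hi;                       // everything fits, no search needed
    }
    while (hi - lo > 1) {
        int64_t step = int64_t(hi - lo) * (target - mem_lo) / (mem_hi - mem_lo);
        step = std::max<int64_t>(step, 1);                    // always make progress
        step = std::min<int64_t>(step, int64_t(hi - lo) - 1); // stay strictly between the bounds
        const uint32_t test     = lo + uint32_t(step);
        const int64_t  mem_test = mem_for(test);
        if (mem_test <= target) {
            lo = test; mem_lo = mem_test;                     // still fits: raise the lower bound
        } else {
            hi = test; mem_hi = mem_test;                     // too big: lower the upper bound
        }
    }
    return lo;
}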
if (hp_nex == 0 || global_surplus_cpu_moe <= 0) {
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams, partial_moe);
return;
}
// step 4: for a MoE model where all dense tensors fit,
// convert the dense-only layers in the back to full layers in the front until all devices are full
// essentially the same procedure as for the dense-only layers except front-to-back
// also, try fitting at least part of one more layer to reduce waste for "small" GPUs with e.g. 24 GiB VRAM
size_t id_dense_start = nd;
for (int id = nd - 1; id >= 0; id--) {
if (ngl_per_device[id].n_layer > 0) {
id_dense_start = id;
continue;
}
break;
}
assert(id_dense_start < nd);
LLAMA_LOG_INFO("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__);
for (size_t id = 0; id <= id_dense_start; id++) {
std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
for (size_t jd = id_dense_start; jd < nd; jd++) {
const uint32_t n_layer_move = ngl_per_device_high[jd].n_layer;
ngl_per_device_high[id].n_layer += n_layer_move;
ngl_per_device_high[jd].n_layer -= n_layer_move;
ngl_per_device_high[jd].n_part = 0;
}
size_t id_dense_start_high = nd - 1;
std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts, partial_moe);
if (mem_high[id] > targets[id]) {
assert(ngl_per_device_high[id].n_layer >= ngl_per_device_high[id].n_part);
assert(ngl_per_device[id].n_layer >= ngl_per_device[id].n_part);
assert((ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
>= ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
uint32_t delta = (ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
- (ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
while (delta > 1) {
uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
step_size = std::max(step_size, uint32_t(1));
step_size = std::min(step_size, delta - 1);
std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
size_t id_dense_start_test = id_dense_start;
uint32_t n_converted_test = 0;
for (;id_dense_start_test < nd; id_dense_start_test++) {
const uint32_t n_convert_jd = std::min(step_size - n_converted_test, ngl_per_device_test[id_dense_start_test].n_part);
ngl_per_device_test[id_dense_start_test].n_layer -= n_convert_jd;
ngl_per_device_test[id_dense_start_test].n_part -= n_convert_jd;
ngl_per_device_test[id].n_layer += n_convert_jd;
n_converted_test += n_convert_jd;
if (ngl_per_device_test[id_dense_start_test].n_layer > 0) {
break;
}
}
const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
if (mem_test[id] <= targets[id]) {
ngl_per_device = ngl_per_device_test;
mem = mem_test;
id_dense_start = id_dense_start_test;
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
} else {
ngl_per_device_high = ngl_per_device_test;
mem_high = mem_test;
id_dense_start_high = id_dense_start_test;
LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n",
__func__, id, ngl_per_device_high[id].n_layer, ngl_per_device_high[id].n_part, id_dense_start_high);
}
delta = (ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
- (ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
}
} else {
ngl_per_device = ngl_per_device_high;
id_dense_start = id_dense_start_high;
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
}
// try to fit at least part of one more layer
if (ngl_per_device[id_dense_start].n_layer > 0) {
std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
size_t id_dense_start_test = id_dense_start;
ngl_per_device_test[id_dense_start_test].n_layer--;
ngl_per_device_test[id_dense_start_test].n_part--;
ngl_per_device_test[id].n_layer++;
ngl_per_device_test[id].n_part++;
if (ngl_per_device_test[id_dense_start_test].n_layer == 0) {
id_dense_start_test++;
}
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP;
LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
if (mem_test[id] < targets[id]) {
ngl_per_device = ngl_per_device_test;
mem = mem_test;
id_dense_start = id_dense_start_test;
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n",
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
if (mem_test[id] < targets[id]) {
ngl_per_device = ngl_per_device_test;
mem = mem_test;
id_dense_start = id_dense_start_test;
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n",
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
}
} else {
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
if (mem_test[id] < targets[id]) {
ngl_per_device = ngl_per_device_test;
mem = mem_test;
id_dense_start = id_dense_start_test;
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n",
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
}
}
}
const int64_t projected_margin = dmds_full[id].free - mem[id];
LLAMA_LOG_INFO(
"%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
}
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams, partial_moe);
}
bool llama_params_fit(
const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
const int64_t t0_us = llama_time_us();
bool ok = true;
try {
llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margin_s, n_ctx_min, log_level);
LLAMA_LOG_INFO("%s: successfully fit params to free device memory\n", __func__);
} catch (const std::runtime_error & e) {
LLAMA_LOG_WARN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
ok = false;
}
const int64_t t1_us = llama_time_us();
LLAMA_LOG_INFO("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6);
return ok;
}
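// Hedged usage sketch (not part of the diff): the caller supplies scratch buffers sized by
// llama_max_devices() and llama_max_tensor_buft_overrides(); llama_params_fit() then adjusts
// mparams/cparams in place before the real load. The margin, minimum context size and helper
// name are illustrative; assumes llama.h and <vector>.
static bool load_with_fitted_params(const char * path_model) {
    llama_model_params   mparams = llama_model_default_params();
    llama_context_params cparams = llama_context_default_params();
    std::vector<float>                            split(llama_max_devices(), 0.0f);
    std::vector<llama_model_tensor_buft_override> tbo(llama_max_tensor_buft_overrides());
    const bool fitted = llama_params_fit(path_model, &mparams, &cparams,
        split.data(), tbo.data(),
        /*margin_s =*/ 1024ull*1024ull*1024ull, // try to leave ~1 GiB free on every device
        /*n_ctx_min =*/ 4096,
        GGML_LOG_LEVEL_INFO);
    // split/tbo must stay alive for the load below: mparams may now point into them
    llama_model * model = llama_model_load_from_file(path_model, mparams);
    const bool ok = fitted && model != nullptr;
    llama_model_free(model); // a real caller would keep the model and build a context
    return ok;
}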
struct llama_sampler_chain_params llama_sampler_chain_default_params() {
struct llama_sampler_chain_params result = {
/*.no_perf =*/ true,
@@ -49,6 +695,10 @@ size_t llama_max_devices(void) {
return 16;
}
size_t llama_max_tensor_buft_overrides() {
return 4096;
}
bool llama_supports_mmap(void) {
return llama_mmap::SUPPORTED;
}
@@ -108,11 +758,12 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
model.t_start_us = tm.t_start_us;
try {
llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides, params.tensor_buft_overrides);
llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
ml.print_info();
model.hparams.vocab_only = params.vocab_only;
model.hparams.no_alloc = params.no_alloc;
try {
model.load_arch(ml);


@@ -1,7 +1,5 @@
#include "models.h"
llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
// lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
@@ -20,9 +18,15 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
// We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
// See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
const float kq_scale = 1.0f * mscale * mscale / sqrtf(float(n_embd_head_k));
const float attn_factor = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
// And also: https://github.com/ggml-org/llama.cpp/pull/17945 [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
// first cancel the adjustment from llama_hparams::yarn_attn_factor_adjust to get the original attn_factor
GGML_ASSERT(ext_factor >= 0.0f);
const float attn_factor_org = attn_factor * (1.0f + 0.1f * logf(1.0f / freq_scale));
// use the original attn_factor to pre-scale the kq_scale
const float mscale = attn_factor_org * (1.0f + 0.1f * hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
const float kq_scale = 1.0f * mscale * mscale / sqrtf(float(n_embd_head_k));
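// restating the pre-scaling above in one place (no new values assumed):
//   attn_factor_org = attn_factor * (1 + 0.1 * ln(1/freq_scale))                        // undo yarn_attn_factor_adjust
//   mscale          = attn_factor_org * (1 + 0.1 * rope_yarn_log_mul * ln(1/freq_scale))
//   kq_scale        = mscale^2 / sqrt(n_embd_head_k)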
ggml_tensor * cur;
ggml_tensor * inpL;


@@ -5,11 +5,20 @@ llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_grap
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
int sections[4];
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
ggml_tensor * cur;
ggml_tensor * inpL;
inpL = build_inp_embd(model.tok_embd);
bool use_mrope = hparams.use_mrope();
if (ubatch.embd && !use_mrope) {
// unfortunately, we need to forcefully stop here, to avoid users complaining about wrong results
GGML_ABORT("This GGUF does not support multimodal. Please reconvert it.");
}
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
@@ -60,17 +69,25 @@ llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_grap
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
cb(Kcur, "Kcur_normed", il);
}
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
if (use_mrope) {
Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr,
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr,
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
} else {
// Normal RoPE
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot,
rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot,
rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
}
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);


@@ -8,11 +8,20 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
int sections[4];
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
ggml_tensor * cur;
ggml_tensor * inpL;
inpL = build_inp_embd(model.tok_embd);
bool use_mrope = hparams.use_mrope();
if (ubatch.embd && !use_mrope) {
// unfortunately, we need to forcefully stop here, to avoid users complaining about wrong results
GGML_ABORT("This GGUF does not support multimodal. Please reconvert it.");
}
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
@@ -63,11 +72,25 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params
Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
}
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
if (use_mrope) {
Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr,
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr,
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
} else {
// Normal RoPE
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot,
rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot,
rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
}
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);


@@ -441,23 +441,13 @@ private:
ggml_tensor * cur,
ggml_tensor * causal_mask,
ggml_tensor * identity,
ggml_tensor * diag_mask,
int il);
ggml_tensor * build_layer_ffn(
ggml_tensor * cur,
int il);
ggml_tensor * build_delta_net_recurrent(
ggml_tensor * q,
ggml_tensor * k,
ggml_tensor * v,
ggml_tensor * g,
ggml_tensor * beta,
ggml_tensor * state,
ggml_tensor * causal_mask,
ggml_tensor * identity,
int il);
ggml_tensor * build_delta_net_chunking(
ggml_tensor * q,
ggml_tensor * k,
@@ -467,8 +457,18 @@ private:
ggml_tensor * state,
ggml_tensor * causal_mask,
ggml_tensor * identity,
ggml_tensor * diag_mask,
int il);
ggml_tensor * build_delta_net_autoregressive(
ggml_tensor * q,
ggml_tensor * k,
ggml_tensor * v,
ggml_tensor * g,
ggml_tensor * beta,
ggml_tensor * state,
int il);
ggml_tensor * build_norm_gated(
ggml_tensor * input,
ggml_tensor * weights,


@@ -31,16 +31,25 @@ llm_build_qwen2::llm_build_qwen2(const llama_model & model, const llm_graph_para
{
// compute Q and K and RoPE them
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
cb(Qcur, "Qcur", il);
if (model.layers[il].bq) {
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
cb(Qcur, "Qcur", il);
}
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
cb(Kcur, "Kcur", il);
if (model.layers[il].bk) {
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
cb(Kcur, "Kcur", il);
}
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
cb(Vcur, "Vcur", il);
if (model.layers[il].bv) {
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
cb(Vcur, "Vcur", il);
}
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);


@@ -17,13 +17,15 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr
ggml_tensor * inp_out_ids = build_inp_out_ids();
ggml_tensor * causal_mask =
ggml_tri(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, ubatch.n_seq_tokens, ubatch.n_seq_tokens), 1.0f),
ggml_tri(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, CHUNK_SIZE, CHUNK_SIZE), 1.0f),
GGML_TRI_TYPE_LOWER);
ggml_tensor * identity = ggml_diag(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, ubatch.n_seq_tokens), 1.0f));
ggml_tensor * identity = ggml_diag(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, CHUNK_SIZE), 1.0f));
ggml_tensor * diag_mask = ggml_add(ctx0, causal_mask, identity);
ggml_build_forward_expand(gf, causal_mask);
ggml_build_forward_expand(gf, identity);
ggml_build_forward_expand(gf, diag_mask);
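// the masks are now fixed CHUNK_SIZE x CHUNK_SIZE constants built once per graph (previously
// they were sized by ubatch.n_seq_tokens), and diag_mask = causal_mask + identity is
// precomputed here so the delta-net code below no longer rebuilds it per layer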
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
@@ -34,7 +36,7 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr
// Determine layer type and build appropriate attention mechanism
if (hparams.is_recurrent(il)) {
// Linear attention layer (gated delta net)
cur = build_layer_attn_linear(inp->get_recr(), cur, causal_mask, identity, il);
cur = build_layer_attn_linear(inp->get_recr(), cur, causal_mask, identity, diag_mask, il);
} else {
// Full attention layer
cur = build_layer_attn(inp->get_attn(), cur, inp_pos, il);
@@ -93,14 +95,8 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
ggml_tensor * state,
ggml_tensor * causal_mask,
ggml_tensor * identity,
ggml_tensor * diag_mask,
int il) {
GGML_ASSERT(ggml_is_contiguous(q));
GGML_ASSERT(ggml_is_contiguous(k));
GGML_ASSERT(ggml_is_contiguous(v));
GGML_ASSERT(ggml_is_contiguous(g));
GGML_ASSERT(ggml_is_contiguous(beta));
GGML_ASSERT(ggml_is_contiguous(state));
const int64_t S_k = q->ne[0];
const int64_t H_k = q->ne[1];
const int64_t n_tokens = q->ne[2];
@@ -120,15 +116,10 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case
// TODO: can this ever be false?
const bool use_qk_l2norm = true;
const float eps_norm = hparams.f_norm_rms_eps;
if (use_qk_l2norm) {
const float eps_norm = hparams.f_norm_rms_eps;
q = ggml_l2_norm(ctx0, q, eps_norm);
k = ggml_l2_norm(ctx0, k, eps_norm);
}
q = ggml_l2_norm(ctx0, q, eps_norm);
k = ggml_l2_norm(ctx0, k, eps_norm);
const float scale = 1.0f / sqrtf(S_v);
@@ -136,8 +127,6 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
beta = ggml_sigmoid(ctx0, beta);
ggml_tensor * causal_diag_mask = ggml_add(ctx0, causal_mask, identity);
cb(q, "q_in", il);
cb(k, "k_in", il);
cb(v, "v_in", il);
@@ -188,36 +177,21 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
cb(v_beta, "v_beta", il);
cb(k_beta, "k_beta", il);
ggml_tensor * chunked_mask =
ggml_view_4d(ctx0, causal_mask, chunk_size,
chunk_size, causal_mask->ne[2], causal_mask->ne[3],
causal_mask->nb[1], causal_mask->nb[2], causal_mask->nb[3], 0);
q = ggml_reshape_4d(ctx0, q, S_k, chunk_size, n_chunks, H_k * n_seqs);
k = ggml_reshape_4d(ctx0, k, S_k, chunk_size, n_chunks, H_k * n_seqs);
k_beta = ggml_reshape_4d(ctx0, k_beta, S_k, chunk_size, n_chunks, H_k * n_seqs);
v = ggml_reshape_4d(ctx0, v, S_v, chunk_size, n_chunks, H_v * n_seqs);
v_beta = ggml_reshape_4d(ctx0, v_beta, S_v, chunk_size, n_chunks, H_v * n_seqs);
ggml_tensor * chunked_diag_mask =
ggml_view_4d(ctx0, causal_diag_mask, chunk_size,
chunk_size, causal_diag_mask->ne[2], causal_diag_mask->ne[3],
causal_diag_mask->nb[1], causal_diag_mask->nb[2], causal_diag_mask->nb[3], 0);
ggml_tensor * chunked_identity =
ggml_view_4d(ctx0, identity, chunk_size,
chunk_size, identity->ne[2], identity->ne[3],
identity->nb[1], identity->nb[2], identity->nb[3], 0);
q = ggml_cont_4d(ctx0, q, S_k, chunk_size, n_chunks, H_k * n_seqs);
k = ggml_cont_4d(ctx0, k, S_k, chunk_size, n_chunks, H_k * n_seqs);
k_beta = ggml_cont_4d(ctx0, k_beta, S_k, chunk_size, n_chunks, H_k * n_seqs);
v = ggml_cont_4d(ctx0, v, S_v, chunk_size, n_chunks, H_v * n_seqs);
v_beta = ggml_cont_4d(ctx0, v_beta, S_v, chunk_size, n_chunks, H_v * n_seqs);
g = ggml_cont_4d(ctx0, g, chunk_size, 1, n_chunks, H_k * n_seqs);
beta = ggml_cont_4d(ctx0, beta, 1, chunk_size, n_chunks, H_k * n_seqs);
g = ggml_reshape_4d(ctx0, g, chunk_size, 1, n_chunks, H_k * n_seqs);
beta = ggml_reshape_4d(ctx0, beta, 1, chunk_size, n_chunks, H_k * n_seqs);
ggml_tensor * g_cumsum = ggml_cumsum(ctx0, g);
cb(g_cumsum, "g_cumsum", il);
ggml_tensor * gcs_i = ggml_cont_4d(ctx0, g_cumsum, chunk_size, 1, n_chunks, H_v * n_seqs);
ggml_tensor * gcs_j = ggml_cont_4d(ctx0, g_cumsum, 1, chunk_size, n_chunks, H_v * n_seqs);
ggml_tensor * gcs_i = ggml_reshape_4d(ctx0, g_cumsum, chunk_size, 1, n_chunks, H_v * n_seqs);
ggml_tensor * gcs_j = ggml_reshape_4d(ctx0, g_cumsum, 1, chunk_size, n_chunks, H_v * n_seqs);
ggml_tensor * gcs_j_broadcast =
ggml_repeat_4d(ctx0, gcs_j, chunk_size, chunk_size, n_chunks, H_v * n_seqs);
@@ -226,23 +200,23 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
cb(decay_mask, "decay_mask", il);
decay_mask = ggml_mul(ctx0, decay_mask, chunked_diag_mask);
decay_mask = ggml_mul(ctx0, decay_mask, diag_mask);
decay_mask = ggml_exp(ctx0, decay_mask);
decay_mask = ggml_mul(ctx0, decay_mask, chunked_diag_mask);
decay_mask = ggml_mul(ctx0, decay_mask, diag_mask);
ggml_tensor * kmulkbeta = ggml_mul_mat(ctx0, k, k_beta);
ggml_tensor * k_decay = ggml_mul(ctx0, kmulkbeta, decay_mask);
ggml_tensor * attn = ggml_neg(ctx0, ggml_mul(ctx0, k_decay, chunked_mask));
ggml_tensor * attn = ggml_neg(ctx0, ggml_mul(ctx0, k_decay, causal_mask));
cb(attn, "attn_pre_solve", il);
ggml_tensor * attn_lower = ggml_mul(ctx0, attn, chunked_mask);
ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, chunked_identity, attn_lower), attn_lower);
ggml_tensor * attn_lower = ggml_mul(ctx0, attn, causal_mask);
ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, identity, attn_lower), attn_lower);
ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, attn, true, true, false);
attn = ggml_mul(ctx0, lin_solve, chunked_mask);
attn = ggml_add(ctx0, attn, chunked_identity);
attn = ggml_mul(ctx0, lin_solve, causal_mask);
attn = ggml_add(ctx0, attn, identity);
cb(attn, "attn_solved", il);
@@ -291,7 +265,7 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
// attn = (q_i @ k_i.transpose(-1, -2) * decay_mask[:, :, i]).masked_fill_(mask, 0)
attn = ggml_mul_mat(ctx0, k_chunk, q_chunk);
attn = ggml_mul(ctx0, attn, decay_mask_chunk);
attn = ggml_mul(ctx0, attn, ggml_add(ctx0, chunked_identity, chunked_mask));
attn = ggml_mul(ctx0, attn, diag_mask);
ggml_tensor * state_t = ggml_cont_4d(ctx0, ggml_permute(ctx0, new_state, 1, 0, 2, 3), S_v, S_v, 1, H_v * n_seqs);
@@ -361,23 +335,14 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
return ggml_concat(ctx0, flat_output, flat_state, 0);
}
ggml_tensor * llm_build_qwen3next::build_delta_net_recurrent(
ggml_tensor * llm_build_qwen3next::build_delta_net_autoregressive(
ggml_tensor * q,
ggml_tensor * k,
ggml_tensor * v,
ggml_tensor * g,
ggml_tensor * beta,
ggml_tensor * state,
ggml_tensor * causal_mask,
ggml_tensor * identity,
int il) {
GGML_ASSERT(ggml_is_contiguous(q));
GGML_ASSERT(ggml_is_contiguous(k));
GGML_ASSERT(ggml_is_contiguous(v));
GGML_ASSERT(ggml_is_contiguous(g));
GGML_ASSERT(ggml_is_contiguous(beta));
GGML_ASSERT(ggml_is_contiguous(state));
const int64_t S_k = q->ne[0];
const int64_t H_k = q->ne[1];
const int64_t n_tokens = q->ne[2];
@@ -386,6 +351,7 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_recurrent(
const int64_t S_v = v->ne[0];
const int64_t H_v = v->ne[1];
GGML_ASSERT(n_tokens == 1); // This function is optimized for single token processing
GGML_ASSERT(v->ne[2] == n_tokens);
GGML_ASSERT(k->ne[2] == n_tokens);
GGML_ASSERT(g->ne[0] == H_v && g->ne[1] == n_tokens && g->ne[2] == n_seqs);
@@ -397,215 +363,65 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_recurrent(
GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case
// TODO: can this ever be false?
const bool use_qk_l2norm = true;
const float eps_norm = hparams.f_norm_rms_eps;
if (use_qk_l2norm) {
const float eps_norm = hparams.f_norm_rms_eps;
q = ggml_l2_norm(ctx0, q, eps_norm);
k = ggml_l2_norm(ctx0, k, eps_norm);
}
q = ggml_l2_norm(ctx0, q, eps_norm);
k = ggml_l2_norm(ctx0, k, eps_norm);
const float scale = 1.0f / sqrtf(S_v);
q = ggml_scale(ctx0, q, scale);
q = ggml_scale(ctx0, q, scale);
beta = ggml_sigmoid(ctx0, beta);
ggml_tensor * causal_diag_mask = ggml_add(ctx0, causal_mask, identity);
cb(q, "q_in", il);
cb(k, "k_in", il);
cb(v, "v_in", il);
cb(beta, "beta_in", il);
cb(g, "g_in", il);
q = ggml_cont_4d(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
k = ggml_cont_4d(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
v = ggml_cont_4d(ctx0, ggml_permute(ctx0, v, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
g = ggml_cont_4d(ctx0, ggml_permute(ctx0, g, 2, 0, 3, 1), n_tokens, 1, H_k, n_seqs);
beta = ggml_cont(ctx0, ggml_permute(ctx0, beta, 2, 0, 1, 3));
state = ggml_reshape_4d(ctx0, state, S_v, S_v, H_v, n_seqs);
cb(q, "q_perm", il);
cb(k, "k_perm", il);
cb(v, "v_perm", il);
cb(beta, "beta_perm", il);
cb(g, "g_perm", il);
cb(state, "state_in", il);
ggml_tensor * g_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, g), 1, 1, H_k, n_seqs);
ggml_tensor * beta_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, beta), 1, 1, H_k, n_seqs);
GGML_ASSERT(q->ne[1] == n_tokens && q->ne[0] == S_k && q->ne[2] == H_k && q->ne[3] == n_seqs);
GGML_ASSERT(k->ne[1] == n_tokens && k->ne[0] == S_k && k->ne[2] == H_k && k->ne[3] == n_seqs);
GGML_ASSERT(v->ne[1] == n_tokens && v->ne[0] == S_v && v->ne[2] == H_k && v->ne[3] == n_seqs);
GGML_ASSERT(beta->ne[1] == n_tokens && beta->ne[2] == H_k && beta->ne[0] == 1 && beta->ne[3] == n_seqs);
// Apply exponential to g_t
g_t = ggml_exp(ctx0, g_t);
ggml_tensor * v_beta = ggml_mul(ctx0, v, beta);
ggml_tensor * k_beta = ggml_mul(ctx0, k, beta);
// Apply the gated delta rule for the single timestep
// last_recurrent_state = last_recurrent_state * g_t
state = ggml_mul(ctx0, state, g_t);
ggml_tensor * g_cumsum = ggml_cumsum(ctx0, g);
// kv_mem = (last_recurrent_state * k_t.unsqueeze(-1)).sum(dim=-2)
ggml_tensor * k_t_unsqueezed = ggml_reshape_4d(ctx0, k, 1, S_v, H_v, n_seqs);
ggml_tensor * kv_mem = ggml_mul(ctx0, state, k_t_unsqueezed);
// we need to sum over dim=-2, so we transpose, sum, then transpose again
kv_mem = ggml_transpose(ctx0, ggml_sum_rows(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, kv_mem))));
cb(k_beta, "k_beta", il);
cb(v_beta, "v_beta", il);
cb(g_cumsum, "g_cumsum", il);
// v_t = v.unsqueeze(2) (we insert the singleton dimension after n_seqs and H_v)
ggml_tensor * v_t = ggml_reshape_4d(ctx0, v, S_v, 1, H_v, n_seqs);
// delta = (v_t - kv_mem) * beta_t
ggml_tensor * v_diff = ggml_sub(ctx0, v_t, kv_mem); // both should be [S_v, 1, H_v, n_seqs]
ggml_tensor * delta = ggml_mul(ctx0, v_diff, beta_t);
ggml_tensor * gcs_i = ggml_cont_4d(ctx0, g_cumsum, n_tokens, 1, H_v, n_seqs); // [chunk_size, 1, n_tokens, n_seqs]
ggml_tensor * gcs_j = ggml_cont_4d(ctx0, g_cumsum, 1, n_tokens, H_v, n_seqs); // [1, chunk_size, n_tokens, n_seqs]
// last_recurrent_state = last_recurrent_state + k_t.unsqueeze(-1) * delta
ggml_tensor * k_t_delta = ggml_mul(ctx0, ggml_repeat_4d(ctx0, k_t_unsqueezed, S_v, S_v, H_v, n_seqs), delta);
state = ggml_add(ctx0, state, k_t_delta);
// Broadcast both tensors to [chunk_size, chunk_size, H_v, n_seqs]
// ggml_tensor * gcs_i_broadcast =
// ggml_repeat_4d(ctx0, gcs_i, GGML_DELTA_NET_CHUNK, GGML_DELTA_NET_CHUNK, num_chunks * H_v,
// n_seqs); // [chunk_size, 1, H_v, n_seqs] -> [chunk_size, chunk_size, H_v, n_seqs]
// Don't need this, this one will get auto-broadcast
ggml_tensor * gcs_j_broadcast =
ggml_repeat_4d(ctx0, gcs_j, n_tokens, n_tokens, H_v, n_seqs); // [1, chunk_size, H_v, n_seqs] -> [chunk_size, chunk_size, H_v, n_seqs]
ggml_tensor * decay_mask = ggml_sub(ctx0, gcs_j_broadcast, gcs_i);
// Apply lower triangular mask to ensure attention is causal (only past tokens influence current)
decay_mask = ggml_mul(ctx0, decay_mask, causal_diag_mask);
// Apply exponential to get the decay mask values
decay_mask = ggml_exp(ctx0, decay_mask);
// Apply lower triangular mask again to ensure only lower triangular values remain
decay_mask = ggml_mul(ctx0, decay_mask, causal_diag_mask);
cb(decay_mask, "decay_mask", il);
// attn = -((k_beta @ key.transpose(-1, -2)) * decay_mask).masked_fill(mask, 0)
ggml_tensor * kmulkbeta = ggml_mul_mat(ctx0, k, k_beta);
cb(kmulkbeta, "kmulkbeta", il);
ggml_tensor * k_decay = ggml_mul(ctx0, kmulkbeta, decay_mask);
ggml_tensor * attn = ggml_neg(ctx0, ggml_mul(ctx0, k_decay, causal_mask));
cb(attn, "attn_pre_rec", il);
// for i in range(1, chunk_size):
// row = attn[..., i, :i].clone()
// sub = attn[..., :i, :i].clone()
// attn[..., i, :i] = row + (row.unsqueeze(-1) * sub).sum(-2)
// attn = attn + torch.eye(chunk_size, dtype=attn.dtype, device=attn.device)
//
// We reduce this to a linear triangular solve: AX = B, where B = attn, A = I - tril(A)
ggml_tensor * attn_lower = ggml_mul(ctx0, attn, causal_mask);
ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, identity, attn_lower), attn_lower);
ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, attn, true, true, false);
attn = ggml_mul(ctx0, lin_solve, causal_mask);
attn = ggml_add(ctx0, attn, identity);
// value = attn @ v_beta
v = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_beta)), attn);
cb(v, "value_beta", il);
// k_cumdecay = attn @ (k_beta * g.exp().unsqueeze(-1))
ggml_tensor * g_cumsum_t = ggml_cont(ctx0, ggml_transpose(ctx0, g_cumsum));
ggml_tensor * gexp = ggml_exp(ctx0, g_cumsum_t);
cb(gexp, "g_cum_exp", il);
ggml_tensor * kbeta_gexp = ggml_mul(ctx0, k_beta, gexp);
cb(kbeta_gexp, "kbeta_gexp", il);
ggml_tensor * k_cumdecay =
ggml_cont(ctx0, ggml_transpose(ctx0, ggml_mul_mat(ctx0, attn, ggml_cont(ctx0, ggml_transpose(ctx0, kbeta_gexp)))));
cb(k_cumdecay, "k_cumdecay", il);
// attn = (q_i @ k_i.transpose(-1, -2) * decay_mask[:, :, i]).masked_fill_(mask, 0)
attn = ggml_mul_mat(ctx0, k, q);
attn = ggml_mul(ctx0, attn, decay_mask);
attn = ggml_mul(ctx0, attn, ggml_add(ctx0, identity, causal_mask));
cb(attn, "attn_decay_key", il);
ggml_tensor * state_t = ggml_cont(ctx0, ggml_transpose(ctx0, state));
// v_prime = (k_cumdecay[:, :, i]) @ last_recurrent_state
ggml_tensor * v_prime = ggml_mul_mat(ctx0, state_t, k_cumdecay);
cb(v_prime, "v_prime", il);
// v_new = v_i - v_prime
ggml_tensor * v_new = ggml_sub(ctx0, ggml_repeat(ctx0, v, v_prime), v_prime);
ggml_tensor * v_new_t = ggml_cont(ctx0, ggml_transpose(ctx0, v_new));
cb(v_new, "v_new", il);
// attn_inter = (q_i * g[:, :, i, :, None].exp()) @ last_recurrent_state
ggml_tensor * q_g_exp = ggml_mul(ctx0, q, gexp);
ggml_tensor * attn_inter = ggml_mul_mat(ctx0, state_t, q_g_exp);
cb(attn_inter, "attn_inter", il);
// core_attn_out[:, :, i] = attn_inter + attn @ v_new
ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_new_t, attn);
cb(v_attn, "v_attn", il);
ggml_tensor * core_attn_out = ggml_add(ctx0, attn_inter, v_attn);
cb(core_attn_out, "core_attn_out", il);
// g_last = torch.clamp(g_cum[:, :, -1], max=50.0).exp().unsqueeze(-1).unsqueeze(-1)
// g_diff = torch.clamp(g_cum[:, :, -1:] - g_cum, max=50.0).exp()
// key_gdiff = key * g_diff.unsqueeze(-1)
// kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new
// last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew
ggml_tensor * g_cum_last =
ggml_cont(ctx0, ggml_view_4d(ctx0, g_cumsum_t, g_cumsum_t->ne[0], 1, g_cumsum_t->ne[2], g_cumsum_t->ne[3],
g_cumsum_t->nb[1], g_cumsum_t->nb[2], g_cumsum_t->nb[3],
g_cumsum_t->nb[0] * (g_cumsum_t->ne[1] - 1)));
cb(g_cum_last, "g_cum_last", il);
ggml_tensor * gexp_last =
ggml_reshape_4d(ctx0, ggml_exp(ctx0, g_cum_last), 1, 1, g_cum_last->ne[0] * g_cum_last->ne[2], g_cum_last->ne[3]);
cb(gexp_last, "gexp_last", il);
ggml_tensor * g_cum_last_3d =
ggml_reshape_3d(ctx0, g_cum_last, g_cum_last->ne[0], g_cum_last->ne[2], g_cum_last->ne[3]);
cb(g_cum_last_3d, "g_cum_last_3d", il);
ggml_tensor * g_cumsum_3d = ggml_reshape_3d(ctx0, g_cumsum, g_cumsum->ne[0], g_cumsum->ne[2], g_cumsum->ne[3]);
cb(g_cumsum_3d, "g_cumsum_3d", il);
ggml_tensor * g_diff = ggml_neg(ctx0, ggml_sub(ctx0, g_cumsum_3d, g_cum_last_3d));
cb(g_diff, "g_diff", il);
ggml_tensor * g_diff_exp = ggml_exp(ctx0, g_diff);
cb(g_diff_exp, "g_diff_exp", il);
ggml_tensor * key_gdiff = ggml_mul(ctx0, k,
ggml_reshape_4d(ctx0, g_diff_exp, 1, g_diff_exp->ne[0], g_diff_exp->ne[1],
g_diff_exp->ne[2] * g_diff_exp->ne[3]));
cb(key_gdiff, "key_gdiff", il);
ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, v_new_t, ggml_cont(ctx0, ggml_transpose(ctx0, key_gdiff)));
cb(kgdmulvnew, "kgdmulvnew", il);
state = ggml_add(ctx0, ggml_mul(ctx0, state, gexp_last), kgdmulvnew);
// Compute the attention output
// core_attn_out = (last_recurrent_state * q_t.unsqueeze(-1)).sum(dim=-2)
ggml_tensor * q_t_unsqueezed = ggml_reshape_4d(ctx0, q, 1, S_v, H_v, n_seqs); // unsqueeze q_t
ggml_tensor * state_q = ggml_mul(ctx0, state, q_t_unsqueezed);
// again, since it's over dim = -2, transpose, sum, transpose back
ggml_tensor * core_attn_out =
ggml_transpose(ctx0, ggml_sum_rows(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, state_q))));
// core_attn_out should be [S_v, 1, H_v, n_seqs] after this
cb(core_attn_out, "output_tokens", il);
cb(state, "new_state", il);
// flatten output
ggml_tensor * flat_output =
ggml_cont_1d(ctx0, ggml_permute(ctx0, core_attn_out, 0, 2, 1, 3), S_v * H_v * n_tokens * n_seqs);
ggml_tensor * flat_state = ggml_cont_1d(ctx0, state, S_v * S_v * H_v * n_seqs);
// flatten output, no need to permute since n_tokens is 1 so [S_v, 1, H_v, n_seqs] and [S_v, H_v, 1, n_seqs] are equivalent memory-layout wise
ggml_tensor * flat_output = ggml_reshape_1d(ctx0, core_attn_out, S_v * H_v * n_tokens * n_seqs);
ggml_tensor * flat_state = ggml_reshape_1d(ctx0, state, S_v * S_v * H_v * n_seqs);
return ggml_concat(ctx0, flat_output, flat_state, 0);
}
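// summary of the single-token gated delta rule implemented by the new autoregressive path,
// restating the code's own comments rather than adding new math:
//   S      = S * exp(g_t)                                  decay the recurrent state
//   kv_mem = (S * k_t.unsqueeze(-1)).sum(dim=-2)           what the state already stores for k_t
//   delta  = (v_t - kv_mem) * beta_t                       gated correction toward the new value
//   S      = S + k_t.unsqueeze(-1) * delta                 write the correction back
//   out    = (S * q_t.unsqueeze(-1)).sum(dim=-2)           read the output for the query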
@@ -712,6 +528,7 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
ggml_tensor * cur,
ggml_tensor * causal_mask,
ggml_tensor * identity,
ggml_tensor * diag_mask,
int il) {
const auto * mctx_cur = inp->mctx;
@@ -737,11 +554,11 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
cb(mixed_ba, "linear_attn_mixed_ba", il);
int64_t qkvz_new_dim = 2 * head_k_dim + 2 * head_v_dim * (num_v_heads / num_k_heads);
ggml_tensor * mixed_qkvz_reshaped = ggml_cont_4d(ctx0, mixed_qkvz, qkvz_new_dim, num_k_heads, n_seq_tokens, n_seqs);
ggml_tensor * mixed_qkvz_reshaped = ggml_reshape_4d(ctx0, mixed_qkvz, qkvz_new_dim, num_k_heads, n_seq_tokens, n_seqs);
// Reshape mixed_ba: [batch, seq_len, hidden_size] -> [batch, seq_len, num_k_heads, 2*num_v_heads/num_k_heads]
int64_t ba_new_dim = 2 * num_v_heads / num_k_heads;
ggml_tensor * mixed_ba_reshaped = ggml_cont_4d(ctx0, mixed_ba, ba_new_dim, num_k_heads, n_seq_tokens, n_seqs);
ggml_tensor * mixed_ba_reshaped = ggml_reshape_4d(ctx0, mixed_ba, ba_new_dim, num_k_heads, n_seq_tokens, n_seqs);
// Split mixed_ba into b and a (beta and alpha parameters)
int64_t split_sizes_ba[2] = {
@@ -762,8 +579,6 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
ggml_tensor * beta = ggml_cont_3d(ctx0, b, num_v_heads, n_seq_tokens, n_seqs);
ggml_tensor * alpha = ggml_cont_3d(ctx0, a, num_v_heads, n_seq_tokens, n_seqs);
GGML_ASSERT(ggml_nelements(beta) + ggml_nelements(alpha) == ggml_nelements(mixed_ba));
ggml_tensor * alpha_biased = ggml_add(ctx0, alpha, model.layers[il].ssm_dt);
ggml_tensor * alpha_softplus = ggml_softplus(ctx0, alpha_biased);
cb(alpha_softplus, "a_softplus", il);
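// ggml_softplus computes the elementwise softplus, softplus(x) = log(1 + exp(x)), whose
// output is strictly positive; ssm_dt acts as a per-layer bias on alpha before that mapping.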
@@ -799,9 +614,6 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
(split_sizes_qkvz[0] + split_sizes_qkvz[1] + split_sizes_qkvz[2]) * sizeof(float));
cb(z, "z", il);
GGML_ASSERT(ggml_nelements(query) + ggml_nelements(key) + ggml_nelements(value) + ggml_nelements(z) ==
ggml_nelements(mixed_qkvz));
// After creating query, key, and value_reshaped, reshape each to flatten the head dimensions
// query: [head_k_dim, num_k_heads, n_tokens, n_seqs] -> [head_k_dim * num_k_heads, n_tokens, n_seqs]
ggml_tensor * query_flat = ggml_cont_3d(ctx0, query, head_k_dim * num_k_heads, n_seq_tokens, n_seqs);
@@ -925,10 +737,13 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
cb(k_conv, "k_conv_predelta", il);
cb(v_conv, "v_conv_predelta", il);
// Choose between build_delta_net_chunking and build_delta_net_recurrent based on n_tokens
ggml_tensor * attn_out = n_seq_tokens > CHUNK_SIZE ?
build_delta_net_chunking (q_conv, k_conv, v_conv, gate, beta, state, causal_mask, identity, il) :
build_delta_net_recurrent(q_conv, k_conv, v_conv, gate, beta, state, causal_mask, identity, il);
// Choose between build_delta_net_autoregressive and build_delta_net_chunking based on n_seq_tokens
ggml_tensor * attn_out;
if (n_seq_tokens == 1) {
attn_out = build_delta_net_autoregressive(q_conv, k_conv, v_conv, gate, beta, state, il);
} else {
attn_out = build_delta_net_chunking(q_conv, k_conv, v_conv, gate, beta, state, causal_mask, identity, diag_mask, il);
}
cb(attn_out, "attn_out", il);
// The tensors were concatenated 1d, so we need to extract them 1d as well
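// A sketch of that extraction, assuming the variable names used above rather than the exact
// statements that follow in the full file: the first head_v_dim * num_v_heads * n_seq_tokens * n_seqs
// elements of attn_out are the attention output and the remaining
// head_v_dim * head_v_dim * num_v_heads * n_seqs elements are the updated recurrent state, e.g.
//   ggml_tensor * out_flat   = ggml_view_1d(ctx0, attn_out,
//           head_v_dim * num_v_heads * n_seq_tokens * n_seqs, 0);
//   ggml_tensor * state_flat = ggml_view_1d(ctx0, attn_out,
//           head_v_dim * head_v_dim * num_v_heads * n_seqs,
//           ggml_element_size(attn_out) * head_v_dim * num_v_heads * n_seq_tokens * n_seqs);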

llama/llama.cpp/tools/mtmd/clip-graph.h (new vendored file)

@@ -0,0 +1,121 @@
#pragma once
#include "ggml.h"
#include "ggml-cpp.h"
#include "clip.h"
#include "clip-impl.h"
#include "clip-model.h"
#include <vector>
#include <functional>
#define DEFAULT_INTERPOLATION_MODE (GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS)
struct clip_graph {
const clip_model & model;
const clip_hparams & hparams;
projector_type proj_type;
// we only support a single image per batch
const clip_image_f32 & img;
const int patch_size;
const int n_patches_x;
const int n_patches_y;
const int n_patches;
const int n_embd;
const int n_head;
const int d_head;
const int n_layer;
const int n_mmproj_embd;
const float eps;
const float kq_scale;
const clip_flash_attn_type flash_attn_type;
// for debugging
const bool debug_graph;
std::vector<ggml_tensor *> & debug_print_tensors;
ggml_context_ptr ctx0_ptr;
ggml_context * ctx0;
ggml_cgraph * gf;
clip_graph(clip_ctx * ctx, const clip_image_f32 & img);
virtual ~clip_graph() = default;
virtual ggml_cgraph * build() = 0;
//
// utility functions
//
void cb(ggml_tensor * cur0, const char * name, int il) const;
// siglip2 naflex
ggml_tensor * resize_position_embeddings(uint32_t interpolation_mode = DEFAULT_INTERPOLATION_MODE);
// build vision transformer (ViT) cgraph
// this function should cover most of the models
// if your model has specific features, you should probably duplicate this function
ggml_tensor * build_vit(
ggml_tensor * inp,
int64_t n_pos,
norm_type norm_t,
ffn_op_type ffn_t,
ggml_tensor * learned_pos_embd,
std::function<ggml_tensor *(ggml_tensor *, const clip_layer &)> add_pos);
// build the input after conv2d (inp_raw --> patches)
// returns tensor with shape [n_embd, n_patches]
ggml_tensor * build_inp();
ggml_tensor * build_inp_raw(int channels = 3);
ggml_tensor * build_norm(
ggml_tensor * cur,
ggml_tensor * mw,
ggml_tensor * mb,
norm_type type,
float norm_eps,
int il) const;
ggml_tensor * build_ffn(
ggml_tensor * cur,
ggml_tensor * up,
ggml_tensor * up_b,
ggml_tensor * gate,
ggml_tensor * gate_b,
ggml_tensor * down,
ggml_tensor * down_b,
ffn_op_type type_op,
int il) const;
ggml_tensor * build_attn(
ggml_tensor * wo,
ggml_tensor * wo_b,
ggml_tensor * q_cur,
ggml_tensor * k_cur,
ggml_tensor * v_cur,
ggml_tensor * kq_mask,
float kq_scale,
int il) const;
// implementation of the 2D RoPE without adding a new op in ggml
// this is not efficient (it uses double the memory), but works on all backends
// TODO: there was a more efficient implementation which relied on ggml_view and ggml_rope_ext_inplace, but the rope inplace does not work well with non-contiguous tensors; we should fix that and revert to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065
ggml_tensor * build_rope_2d(
ggml_context * ctx0,
ggml_tensor * cur,
ggml_tensor * pos_a, // first half
ggml_tensor * pos_b, // second half
const float freq_base,
const bool interleave_freq
);
// aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL)
// supports dynamic resolution
ggml_tensor * build_patch_merge_permute(ggml_tensor * cur, int scale_factor);
// Generic function to stack frames for audio processing
// Abstracts out the StackAudioFrames logic used by ultravox
ggml_tensor * build_stack(ggml_tensor * cur, int32_t stack_factor, int32_t n_embed);
};
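For context, a concrete vision graph is expected to derive from this struct and implement build(). Below is a minimal hypothetical subclass; it only exercises members declared in this header, and the wiring is illustrative rather than any actual projector in the tree.

// Hypothetical example, not part of the vendored header.
struct clip_graph_toy : clip_graph {
    clip_graph_toy(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}

    ggml_cgraph * build() override {
        // patch embeddings via conv2d, shape [n_embd, n_patches]
        ggml_tensor * inp = build_inp();
        cb(inp, "toy_inp", -1);
        // a real model would run build_vit(...) here and then its projector
        ggml_build_forward_expand(gf, inp);
        return gf;
    }
};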

Some files were not shown because too many files have changed in this diff.