x/grammar: add experimental GPU-accelerated constrained decoding package

2026-01-21 05:48:35 -05:00 · 2026-01-11 00:50:11 -08:00
190 changed files with 8016 additions and 25008 deletions
--- a/.github/ISSUE_TEMPLATE/10_bug_report.yml
+++ b/.github/ISSUE_TEMPLATE/10_bug_report.yml
@@ -13,7 +13,7 @@ body:
    id: logs
    attributes:
      label: Relevant log output
-      description: Please copy and paste any relevant log output. See [Troubleshooting Guide](https://github.com/ollama/ollama/blob/main/docs/troubleshooting.mdx#how-to-troubleshoot-issues) for details.
+      description: Please copy and paste any relevant log output. See [Troubleshooting Guide](https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md#how-to-troubleshoot-issues) for details.
      render: shell
    validations:
      required: false
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -372,17 +372,13 @@ jobs:
          outputs: type=local,dest=dist/${{ matrix.os }}-${{ matrix.arch }}
          cache-from: type=registry,ref=${{ vars.DOCKER_REPO }}:latest
          cache-to: type=inline
-      - name: Deduplicate CUDA libraries
-        run: |
-          ./scripts/deduplicate_cuda_libs.sh dist/${{ matrix.os }}-${{ matrix.arch }}
      - run: |
          for COMPONENT in bin/* lib/ollama/*; do
            case "$COMPONENT" in
-              bin/ollama*)               echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
+              bin/ollama)                echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
              lib/ollama/*.so*)          echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
              lib/ollama/cuda_v*)        echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
              lib/ollama/vulkan*)        echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
-              lib/ollama/mlx*)           echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
              lib/ollama/cuda_jetpack5)  echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;;
              lib/ollama/cuda_jetpack6)  echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;;
              lib/ollama/rocm)           echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-rocm.tar.in ;;
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -48,10 +48,9 @@ if((CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
    set(GGML_CPU_ALL_VARIANTS ON)
 endif()

-if(APPLE)
+if (CMAKE_OSX_ARCHITECTURES MATCHES "x86_64")
    set(CMAKE_BUILD_RPATH "@loader_path")
    set(CMAKE_INSTALL_RPATH "@loader_path")
-    set(CMAKE_BUILD_WITH_INSTALL_RPATH ON)
 endif()

 set(OLLAMA_BUILD_DIR ${CMAKE_BINARY_DIR}/lib/ollama)
@@ -190,21 +189,13 @@ if(MLX_ENGINE)
    install(TARGETS mlx mlxc
        RUNTIME_DEPENDENCIES
            DIRECTORIES ${CUDAToolkit_BIN_DIR} ${CUDAToolkit_BIN_DIR}/x64 ${CUDAToolkit_LIBRARY_DIR}
-            PRE_INCLUDE_REGEXES cublas cublasLt cudart nvrtc nvrtc-builtins cudnn nccl openblas gfortran
+            PRE_INCLUDE_REGEXES cublas cublasLt cudart nvrtc cudnn nccl
            PRE_EXCLUDE_REGEXES ".*"
        RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT MLX
        LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT MLX
        FRAMEWORK DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT MLX
    )

-    # Install the Metal library for macOS arm64 (must be colocated with the binary)
-    # Metal backend is only built for arm64, not x86_64
-    if(APPLE AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
-        install(FILES ${CMAKE_BINARY_DIR}/_deps/mlx-build/mlx/backend/metal/kernels/mlx.metallib
-            DESTINATION ${OLLAMA_INSTALL_DIR}
-            COMPONENT MLX)
-    endif()
-
    # Manually install cudart and cublas since they might not be picked up as direct dependencies
    if(CUDAToolkit_FOUND)
        file(GLOB CUDART_LIBS
--- a/14
+++ b/14
@@ -32,7 +32,7 @@ ENV PATH=/${VULKANVERSION}/x86_64/bin:$PATH
 FROM --platform=linux/arm64 almalinux:8 AS base-arm64
 # install epel-release for ccache
 RUN yum install -y yum-utils epel-release \
-    && dnf install -y clang ccache git \
+    && dnf install -y clang ccache \
    && yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo
 ENV CC=clang CXX=clang++

@@ -149,7 +149,6 @@ COPY CMakeLists.txt CMakePresets.json .
 COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
 COPY x/ml/backend/mlx x/ml/backend/mlx
 COPY go.mod go.sum .
-COPY MLX_VERSION .
 RUN curl -fsSL https://golang.org/dl/go$(awk '/^go/ { print $2 }' go.mod).linux-$(case $(uname -m) in x86_64) echo amd64 ;; aarch64) echo arm64 ;; esac).tar.gz | tar xz -C /usr/local
 ENV PATH=/usr/local/go/bin:$PATH
 RUN go mod download
@@ -157,6 +156,11 @@ RUN --mount=type=cache,target=/root/.ccache \
    cmake --preset 'MLX CUDA 13' -DBLAS_INCLUDE_DIRS=/usr/include/openblas -DLAPACK_INCLUDE_DIRS=/usr/include/openblas \
        && cmake --build --parallel ${PARALLEL} --preset 'MLX CUDA 13' \
        && cmake --install build --component MLX --strip --parallel ${PARALLEL}
+COPY . .
+ARG GOFLAGS="'-ldflags=-w -s'"
+ENV CGO_ENABLED=1
+ARG CGO_CFLAGS
+ARG CGO_CXXFLAGS

 FROM base AS build
 WORKDIR /go/src/github.com/ollama/ollama
@@ -165,14 +169,12 @@ RUN curl -fsSL https://golang.org/dl/go$(awk '/^go/ { print $2 }' go.mod).linux-
 ENV PATH=/usr/local/go/bin:$PATH
 RUN go mod download
 COPY . .
-# Clone mlx-c headers for CGO (version from MLX_VERSION file)
-RUN git clone --depth 1 --branch "$(cat MLX_VERSION)" https://github.com/ml-explore/mlx-c.git build/_deps/mlx-c-src
 ARG GOFLAGS="'-ldflags=-w -s'"
 ENV CGO_ENABLED=1
-ENV CGO_CFLAGS="-I/go/src/github.com/ollama/ollama/build/_deps/mlx-c-src"
+ARG CGO_CFLAGS
 ARG CGO_CXXFLAGS
 RUN --mount=type=cache,target=/root/.cache/go-build \
-    go build -tags mlx -trimpath -buildmode=pie -o /bin/ollama .
+    go build -trimpath -buildmode=pie -o /bin/ollama .

 FROM --platform=linux/amd64 scratch AS amd64
 # COPY --from=cuda-11 dist/lib/ollama/ /lib/ollama/
--- a/1
+++ b/1
@@ -1 +0,0 @@
-v0.4.1
--- a/README.md
+++ b/README.md
@@ -48,7 +48,7 @@ ollama run gemma3

 ## Model library

-Ollama supports a list of models available on [ollama.com/library](https://ollama.com/library "ollama model library")
+Ollama supports a list of models available on [ollama.com/library](https://ollama.com/library 'ollama model library')

 Here are some example models that can be downloaded:

@@ -79,7 +79,7 @@ Here are some example models that can be downloaded:
 | Code Llama         | 7B         | 3.8GB | `ollama run codellama`           |
 | Llama 2 Uncensored | 7B         | 3.8GB | `ollama run llama2-uncensored`   |
 | LLaVA              | 7B         | 4.5GB | `ollama run llava`               |
-| Granite-3.3        | 8B         | 4.9GB | `ollama run granite3.3`          |
+| Granite-3.3         | 8B         | 4.9GB | `ollama run granite3.3`          |

 > [!NOTE]
 > You should have at least 8 GB of RAM available to run the 7B models, 16 GB to run the 13B models, and 32 GB to run the 33B models.
@@ -260,38 +260,6 @@ Finally, in a separate shell, run a model:
 ./ollama run llama3.2
 ```

-## Building with MLX (experimental)
-
-First build the MLX libraries:
-
-```shell
-cmake --preset MLX
-cmake --build --preset MLX --parallel
-cmake --install build --component MLX
-```
-
-When building with the `-tags mlx` flag, the main `ollama` binary includes MLX support for experimental features like image generation:
-
-```shell
-go build -tags mlx .
-```
-
-Finally, start the server:
-
-```
-./ollama serve
-```
-
-### Building MLX with CUDA
-
-When building with CUDA, use the preset "MLX CUDA 13" or "MLX CUDA 12" to enable CUDA with default architectures:
-
-```shell
-cmake --preset 'MLX CUDA 13'
-cmake --build --preset 'MLX CUDA 13' --parallel
-cmake --install build --component MLX
-```
-
 ## REST API

 Ollama has a REST API for running and managing models.
@@ -322,7 +290,6 @@ See the [API documentation](./docs/api.md) for all endpoints.

 ### Web & Desktop

- [Onyx](https://github.com/onyx-dot-app/onyx)
 - [Open WebUI](https://github.com/open-webui/open-webui)
 - [SwiftChat (macOS with ReactNative)](https://github.com/aws-samples/swift-chat)
 - [Enchanted (macOS native)](https://github.com/AugustDev/enchanted)
@@ -454,7 +421,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [AppFlowy](https://github.com/AppFlowy-IO/AppFlowy) (AI collaborative workspace with Ollama, cross-platform and self-hostable)
 - [Lumina](https://github.com/cushydigit/lumina.git) (A lightweight, minimal React.js frontend for interacting with Ollama servers)
 - [Tiny Notepad](https://pypi.org/project/tiny-notepad) (A lightweight, notepad-like interface to chat with ollama available on PyPI)
- [macLlama (macOS native)](https://github.com/hellotunamayo/macLlama) (A native macOS GUI application for interacting with Ollama models, featuring a chat interface.)
+- [macLlama (macOS native)](https://github.com/hellotunamayo/macLlama) (A native macOS GUI application for interacting with Ollama models, featuring a chat interface.) 
 - [GPTranslate](https://github.com/philberndt/GPTranslate) (A fast and lightweight, AI powered desktop translation application written with Rust and Tauri. Features real-time translation with OpenAI/Azure/Ollama.)
 - [ollama launcher](https://github.com/NGC13009/ollama-launcher) (A launcher for Ollama, aiming to provide users with convenient functions such as ollama server launching, management, or configuration.)
 - [ai-hub](https://github.com/Aj-Seven/ai-hub) (AI Hub supports multiple models via API keys and Chat support via Ollama API.)
@@ -526,7 +493,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 ### Database

 - [pgai](https://github.com/timescale/pgai) - PostgreSQL as a vector database (Create and search embeddings from Ollama models using pgvector)
-  - [Get started guide](https://github.com/timescale/pgai/blob/main/docs/vectorizer-quick-start.md)
+   - [Get started guide](https://github.com/timescale/pgai/blob/main/docs/vectorizer-quick-start.md)
 - [MindsDB](https://github.com/mindsdb/mindsdb/blob/staging/mindsdb/integrations/handlers/ollama_handler/README.md) (Connects Ollama models with nearly 200 data platforms and apps)
 - [chromem-go](https://github.com/philippgille/chromem-go/blob/v0.5.0/embed_ollama.go) with [example](https://github.com/philippgille/chromem-go/tree/v0.5.0/examples/rag-wikipedia-ollama)
 - [Kangaroo](https://github.com/dbkangaroo/kangaroo) (AI-powered SQL client and admin tool for popular databases)
@@ -669,7 +636,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [llama.cpp](https://github.com/ggml-org/llama.cpp) project founded by Georgi Gerganov.

 ### Observability
-
 - [Opik](https://www.comet.com/docs/opik/cookbook/ollama) is an open-source platform to debug, evaluate, and monitor your LLM applications, RAG systems, and agentic workflows with comprehensive tracing, automated evaluations, and production-ready dashboards. Opik supports native integration to Ollama.
 - [Lunary](https://lunary.ai/docs/integrations/ollama) is the leading open-source LLM observability platform. It provides a variety of enterprise-grade features such as real-time analytics, prompt templates management, PII masking, and comprehensive agent tracing.
 - [OpenLIT](https://github.com/openlit/openlit) is an OpenTelemetry-native tool for monitoring Ollama Applications & GPUs using traces and metrics.
@@ -678,5 +644,4 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [MLflow Tracing](https://mlflow.org/docs/latest/llms/tracing/index.html#automatic-tracing) is an open source LLM observability tool with a convenient API to log and visualize traces, making it easy to debug and evaluate GenAI applications.

 ### Security
-
 - [Ollama Fortress](https://github.com/ParisNeo/ollama_proxy_server)
--- a/api/client.go
+++ b/api/client.go
@@ -165,7 +165,7 @@ func (c *Client) do(ctx context.Context, method, path string, reqData, respData
 	return nil
 }

-const maxBufferSize = 8 * format.MegaByte
+const maxBufferSize = 512 * format.KiloByte

 func (c *Client) stream(ctx context.Context, method, path string, data any, fn func([]byte) error) error {
 	var buf io.Reader
--- a/api/types.go
+++ b/api/types.go
@@ -127,20 +127,6 @@ type GenerateRequest struct {
 	// each with an associated log probability. Only applies when Logprobs is true.
 	// Valid values are 0-20. Default is 0 (only return the selected token's logprob).
 	TopLogprobs int `json:"top_logprobs,omitempty"`
-
-	// Experimental: Image generation fields (may change or be removed)
-
-	// Width is the width of the generated image in pixels.
-	// Only used for image generation models.
-	Width int32 `json:"width,omitempty"`
-
-	// Height is the height of the generated image in pixels.
-	// Only used for image generation models.
-	Height int32 `json:"height,omitempty"`
-
-	// Steps is the number of diffusion steps for image generation.
-	// Only used for image generation models.
-	Steps int32 `json:"steps,omitempty"`
 }

 // ChatRequest describes a request sent by [Client.Chat].
@@ -749,7 +735,7 @@ type ShowResponse struct {
 	Messages      []Message          `json:"messages,omitempty"`
 	RemoteModel   string             `json:"remote_model,omitempty"`
 	RemoteHost    string             `json:"remote_host,omitempty"`
-	ModelInfo     map[string]any     `json:"model_info"`
+	ModelInfo     map[string]any     `json:"model_info,omitempty"`
 	ProjectorInfo map[string]any     `json:"projector_info,omitempty"`
 	Tensors       []Tensor           `json:"tensors,omitempty"`
 	Capabilities  []model.Capability `json:"capabilities,omitempty"`
@@ -874,20 +860,6 @@ type GenerateResponse struct {
 	// Logprobs contains log probability information for the generated tokens,
 	// if requested via the Logprobs parameter.
 	Logprobs []Logprob `json:"logprobs,omitempty"`
-
-	// Experimental: Image generation fields (may change or be removed)
-
-	// Image contains a base64-encoded generated image.
-	// Only present for image generation models.
-	Image string `json:"image,omitempty"`
-
-	// Completed is the number of completed steps in image generation.
-	// Only present for image generation models during streaming.
-	Completed int64 `json:"completed,omitempty"`
-
-	// Total is the total number of steps for image generation.
-	// Only present for image generation models during streaming.
-	Total int64 `json:"total,omitempty"`
 }

 // ModelDetails provides details about a model.
--- a/app/cmd/app/app_darwin.m
+++ b/app/cmd/app/app_darwin.m
@@ -14,7 +14,6 @@ extern NSString *SystemWidePath;
@interface AppDelegate () <NSWindowDelegate, WKNavigationDelegate, WKUIDelegate>
@property(strong, nonatomic) NSStatusItem *statusItem;
@property(assign, nonatomic) BOOL updateAvailable;
-@property(assign, nonatomic) BOOL systemShutdownInProgress;
@end

@implementation AppDelegate
@@ -41,13 +40,6 @@ bool firstTimeRun,startHidden; // Set in run before initialization
 }

 - (void)applicationDidFinishLaunching:(NSNotification *)aNotification {
-    // Register for system shutdown/restart notification so we can allow termination
-    [[[NSWorkspace sharedWorkspace] notificationCenter]
-        addObserver:self
-           selector:@selector(systemWillPowerOff:)
-               name:NSWorkspaceWillPowerOffNotification
-             object:nil];
-
    // if we're in development mode, set the app icon
    NSString *bundlePath = [[NSBundle mainBundle] bundlePath];
    if (![bundlePath hasSuffix:@".app"]) {
@@ -286,18 +278,7 @@ bool firstTimeRun,startHidden; // Set in run before initialization
    [NSApp activateIgnoringOtherApps:YES];
 }

- (void)systemWillPowerOff:(NSNotification *)notification {
-    // Set flag so applicationShouldTerminate: knows to allow termination.
-    // The system will call applicationShouldTerminate: after posting this notification.
-    self.systemShutdownInProgress = YES;
-}
-
 - (NSApplicationTerminateReply)applicationShouldTerminate:(NSApplication *)sender {
-    // Allow termination if the system is shutting down or restarting
-    if (self.systemShutdownInProgress) {
-        return NSTerminateNow;
-    }
-    // Otherwise just hide the app (for Cmd+Q, close button, etc.)
    [NSApp hide:nil];
    [NSApp setActivationPolicy:NSApplicationActivationPolicyAccessory];
    return NSTerminateCancel;
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -46,9 +46,8 @@ import (
 	"github.com/ollama/ollama/types/syncmap"
 	"github.com/ollama/ollama/version"
 	xcmd "github.com/ollama/ollama/x/cmd"
-	"github.com/ollama/ollama/x/create"
-	xcreateclient "github.com/ollama/ollama/x/create/client"
 	"github.com/ollama/ollama/x/imagegen"
+	imagegenclient "github.com/ollama/ollama/x/imagegen/client"
 )

 const ConnectInstructions = "To sign in, navigate to:\n    %s\n\n"
@@ -94,87 +93,14 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
 	p := progress.NewProgress(os.Stderr)
 	defer p.Stop()

-	// Validate model name early to fail fast
-	modelName := args[0]
-	name := model.ParseName(modelName)
-	if !name.IsValid() {
-		return fmt.Errorf("invalid model name: %s", modelName)
-	}
-
-	// Check for --experimental flag for safetensors model creation
-	experimental, _ := cmd.Flags().GetBool("experimental")
-	if experimental {
-		// Get Modelfile content - either from -f flag or default to "FROM ."
-		var reader io.Reader
-		filename, err := getModelfileName(cmd)
-		if os.IsNotExist(err) || filename == "" {
-			// No Modelfile specified or found - use default
-			reader = strings.NewReader("FROM .\n")
-		} else if err != nil {
-			return err
-		} else {
-			f, err := os.Open(filename)
-			if err != nil {
-				return err
-			}
-			defer f.Close()
-			reader = f
-		}
-
-		// Parse the Modelfile
-		modelfile, err := parser.ParseFile(reader)
-		if err != nil {
-			return fmt.Errorf("failed to parse Modelfile: %w", err)
-		}
-
-		// Extract FROM path and configuration
-		var modelDir string
-		mfConfig := &xcreateclient.ModelfileConfig{}
-
-		for _, cmd := range modelfile.Commands {
-			switch cmd.Name {
-			case "model":
-				modelDir = cmd.Args
-			case "template":
-				mfConfig.Template = cmd.Args
-			case "system":
-				mfConfig.System = cmd.Args
-			case "license":
-				mfConfig.License = cmd.Args
-			}
-		}
-
-		if modelDir == "" {
-			modelDir = "."
-		}
-
-		// Resolve relative paths based on Modelfile location
-		if !filepath.IsAbs(modelDir) && filename != "" {
-			modelDir = filepath.Join(filepath.Dir(filename), modelDir)
-		}
-
-		quantize, _ := cmd.Flags().GetString("quantize")
-		return xcreateclient.CreateModel(xcreateclient.CreateOptions{
-			ModelName: modelName,
-			ModelDir:  modelDir,
-			Quantize:  quantize,
-			Modelfile: mfConfig,
-		}, p)
-	}
-
 	var reader io.Reader

 	filename, err := getModelfileName(cmd)
 	if os.IsNotExist(err) {
 		if filename == "" {
 			// No Modelfile found - check if current directory is an image gen model
-			if create.IsTensorModelDir(".") {
-				quantize, _ := cmd.Flags().GetString("quantize")
-				return xcreateclient.CreateModel(xcreateclient.CreateOptions{
-					ModelName: modelName,
-					ModelDir:  ".",
-					Quantize:  quantize,
-				}, p)
+			if imagegen.IsTensorModelDir(".") {
+				return imagegenclient.CreateModel(args[0], ".", p)
 			}
 			reader = strings.NewReader("FROM .\n")
 		} else {
@@ -207,7 +133,7 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
 	}
 	spinner.Stop()

-	req.Model = modelName
+	req.Model = args[0]
 	quantize, _ := cmd.Flags().GetString("quantize")
 	if quantize != "" {
 		req.Quantize = quantize
@@ -538,6 +464,14 @@ func RunHandler(cmd *cobra.Command, args []string) error {

 	name := args[0]

+	// Check if this is a known image generation model (skip Show/Pull)
+	if imagegen.HasTensorLayers(name) {
+		if opts.Prompt == "" && !interactive {
+			return errors.New("image generation models require a prompt. Usage: ollama run " + name + " \"your prompt here\"")
+		}
+		return imagegen.RunCLI(cmd, name, opts.Prompt, interactive, opts.KeepAlive)
+	}
+
 	info, err := func() (*api.ShowResponse, error) {
 		showReq := &api.ShowRequest{Name: name}
 		info, err := client.Show(cmd.Context(), showReq)
@@ -599,18 +533,9 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 		return generateEmbedding(cmd, name, opts.Prompt, opts.KeepAlive, truncate, dimensions)
 	}

-	// Check if this is an image generation model
-	if slices.Contains(info.Capabilities, model.CapabilityImage) {
-		if opts.Prompt == "" && !interactive {
-			return errors.New("image generation models require a prompt. Usage: ollama run " + name + " \"your prompt here\"")
-		}
-		return imagegen.RunCLI(cmd, name, opts.Prompt, interactive, opts.KeepAlive)
-	}
-
 	// Check for experimental flag
 	isExperimental, _ := cmd.Flags().GetBool("experimental")
 	yoloMode, _ := cmd.Flags().GetBool("experimental-yolo")
-	enableWebsearch, _ := cmd.Flags().GetBool("experimental-websearch")

 	if interactive {
 		if err := loadOrUnloadModel(cmd, &opts); err != nil {
@@ -640,7 +565,7 @@ func RunHandler(cmd *cobra.Command, args []string) error {

 		// Use experimental agent loop with tools
 		if isExperimental {
-			return xcmd.GenerateInteractive(cmd, opts.Model, opts.WordWrap, opts.Options, opts.Think, opts.HideThinking, opts.KeepAlive, yoloMode, enableWebsearch)
+			return xcmd.GenerateInteractive(cmd, opts.Model, opts.WordWrap, opts.Options, opts.Think, opts.HideThinking, opts.KeepAlive, yoloMode)
 		}

 		return generateInteractive(cmd, opts)
@@ -746,11 +671,7 @@ func PushHandler(cmd *cobra.Command, args []string) error {

 			bar, ok := bars[resp.Digest]
 			if !ok {
-				msg := resp.Status
-				if msg == "" {
-					msg = fmt.Sprintf("pushing %s...", resp.Digest[7:19])
-				}
-				bar = progress.NewBar(msg, resp.Total, resp.Completed)
+				bar = progress.NewBar(fmt.Sprintf("pushing %s...", resp.Digest[7:19]), resp.Total, resp.Completed)
 				bars[resp.Digest] = bar
 				p.Add(resp.Digest, bar)
 			}
@@ -899,11 +820,11 @@ func DeleteHandler(cmd *cobra.Command, args []string) error {
 	for _, arg := range args {
 		// Unload the model if it's running before deletion
 		if err := loadOrUnloadModel(cmd, &runOptions{
-			Model:     arg,
+			Model:     args[0],
 			KeepAlive: &api.Duration{Duration: 0},
 		}); err != nil {
 			if !strings.Contains(strings.ToLower(err.Error()), "not found") {
-				fmt.Fprintf(os.Stderr, "Warning: unable to stop model '%s'\n", arg)
+				fmt.Fprintf(os.Stderr, "Warning: unable to stop model '%s'\n", args[0])
 			}
 		}

@@ -916,6 +837,11 @@ func DeleteHandler(cmd *cobra.Command, args []string) error {
 }

 func ShowHandler(cmd *cobra.Command, args []string) error {
+	// Check if this is an image generation model
+	if imagegen.HasTensorLayers(args[0]) {
+		return imagegen.Show(args[0], os.Stdout)
+	}
+
 	client, err := api.ClientFromEnvironment()
 	if err != nil {
 		return err
@@ -1815,22 +1741,15 @@ func NewCLI() *cobra.Command {
 	rootCmd.Flags().BoolP("version", "v", false, "Show version information")

 	createCmd := &cobra.Command{
-		Use:   "create MODEL",
-		Short: "Create a model",
-		Args:  cobra.ExactArgs(1),
-		PreRunE: func(cmd *cobra.Command, args []string) error {
-			// Skip server check for experimental mode (writes directly to disk)
-			if experimental, _ := cmd.Flags().GetBool("experimental"); experimental {
-				return nil
-			}
-			return checkServerHeartbeat(cmd, args)
-		},
-		RunE: CreateHandler,
+		Use:     "create MODEL",
+		Short:   "Create a model",
+		Args:    cobra.ExactArgs(1),
+		PreRunE: checkServerHeartbeat,
+		RunE:    CreateHandler,
 	}

 	createCmd.Flags().StringP("file", "f", "", "Name of the Modelfile (default \"Modelfile\")")
 	createCmd.Flags().StringP("quantize", "q", "", "Quantize model to this level (e.g. q4_K_M)")
-	createCmd.Flags().Bool("experimental", false, "Enable experimental safetensors model creation")

 	showCmd := &cobra.Command{
 		Use:     "show MODEL",
@@ -1867,7 +1786,6 @@ func NewCLI() *cobra.Command {
 	runCmd.Flags().Int("dimensions", 0, "Truncate output embeddings to specified dimension (embedding models only)")
 	runCmd.Flags().Bool("experimental", false, "Enable experimental agent loop with tools")
 	runCmd.Flags().Bool("experimental-yolo", false, "Skip all tool approval prompts (use with caution)")
-	runCmd.Flags().Bool("experimental-websearch", false, "Enable web search tool in experimental mode")

 	// Image generation flags (width, height, steps, seed, etc.)
 	imagegen.RegisterFlags(runCmd)
@@ -1985,7 +1903,6 @@ func NewCLI() *cobra.Command {
 	} {
 		switch cmd {
 		case runCmd:
-			imagegen.AppendFlagsDocs(cmd)
 			appendEnvDocs(cmd, []envconfig.EnvVar{envVars["OLLAMA_HOST"], envVars["OLLAMA_NOHISTORY"]})
 		case serveCmd:
 			appendEnvDocs(cmd, []envconfig.EnvVar{
--- a/cmd/cmd_test.go
+++ b/cmd/cmd_test.go
@@ -1547,79 +1547,6 @@ func TestRunOptions_Copy_ThinkValueVariants(t *testing.T) {
 	}
 }

-func TestShowInfoImageGen(t *testing.T) {
-	var b bytes.Buffer
-	err := showInfo(&api.ShowResponse{
-		Details: api.ModelDetails{
-			Family:            "ZImagePipeline",
-			ParameterSize:     "10.3B",
-			QuantizationLevel: "FP8",
-		},
-		Capabilities: []model.Capability{model.CapabilityImage},
-		Requires:     "0.14.0",
-	}, false, &b)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	expect := "  Model\n" +
-		"    architecture    ZImagePipeline    \n" +
-		"    parameters      10.3B             \n" +
-		"    quantization    FP8               \n" +
-		"    requires        0.14.0            \n" +
-		"\n" +
-		"  Capabilities\n" +
-		"    image    \n" +
-		"\n"
-	if diff := cmp.Diff(expect, b.String()); diff != "" {
-		t.Errorf("unexpected output (-want +got):\n%s", diff)
-	}
-}
-
-func TestPushProgressMessage(t *testing.T) {
-	tests := []struct {
-		name    string
-		status  string
-		digest  string
-		wantMsg string
-	}{
-		{
-			name:    "uses status when provided",
-			status:  "uploading model",
-			digest:  "sha256:abc123456789def",
-			wantMsg: "uploading model",
-		},
-		{
-			name:    "falls back to digest when status empty",
-			status:  "",
-			digest:  "sha256:abc123456789def",
-			wantMsg: "pushing abc123456789...",
-		},
-		{
-			name:    "handles short digest gracefully",
-			status:  "",
-			digest:  "sha256:abc",
-			wantMsg: "pushing sha256:abc...",
-		},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			msg := tt.status
-			if msg == "" {
-				if len(tt.digest) >= 19 {
-					msg = fmt.Sprintf("pushing %s...", tt.digest[7:19])
-				} else {
-					msg = fmt.Sprintf("pushing %s...", tt.digest)
-				}
-			}
-			if msg != tt.wantMsg {
-				t.Errorf("got %q, want %q", msg, tt.wantMsg)
-			}
-		})
-	}
-}
-
 func TestRunOptions_Copy_Independence(t *testing.T) {
 	// Test that modifications to original don't affect copy
 	originalThink := &api.ThinkValue{Value: "original"}
--- a/cmd/interactive.go
+++ b/cmd/interactive.go
@@ -116,7 +116,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 		Prompt:         ">>> ",
 		AltPrompt:      "... ",
 		Placeholder:    "Send a message (/? for help)",
-		AltPlaceholder: "Press Enter to send",
+		AltPlaceholder: `Use """ to end multi-line input`,
 	})
 	if err != nil {
 		return err
--- a/convert/convert.go
+++ b/convert/convert.go
@@ -311,10 +311,6 @@ func LoadModelMetadata(fsys fs.FS) (ModelKV, *Tokenizer, error) {
 		conv = &deepseekocr{}
 	case "DeepseekV3ForCausalLM":
 		conv = &deepseek2Model{}
-	case "Glm4MoeLiteForCausalLM":
-		conv = &glm4MoeLiteModel{}
-	case "Lfm2ForCausalLM":
-		conv = &lfm2Model{}
 	default:
 		return nil, nil, fmt.Errorf("unsupported architecture %q", p.Architectures[0])
 	}
--- a/convert/convert_glm4moelite.go
+++ b/convert/convert_glm4moelite.go
@@ -1,264 +0,0 @@
-package convert
-
-import (
-	"cmp"
-	"fmt"
-	"log/slog"
-	"regexp"
-	"strconv"
-	"strings"
-
-	"github.com/pdevine/tensor"
-	"github.com/pdevine/tensor/native"
-
-	"github.com/ollama/ollama/fs/ggml"
-)
-
-type glm4MoeLiteModel struct {
-	ModelParameters
-	MaxPositionEmbeddings uint32  `json:"max_position_embeddings"`
-	HiddenSize            uint32  `json:"hidden_size"`
-	HiddenLayers          uint32  `json:"num_hidden_layers"`
-	IntermediateSize      uint32  `json:"intermediate_size"`
-	NumAttentionHeads     uint32  `json:"num_attention_heads"`
-	NumKeyValueHeads      uint32  `json:"num_key_value_heads"`
-	RMSNormEPS            float32 `json:"rms_norm_eps"`
-
-	RopeTheta     float32 `json:"rope_theta"`
-	QKNopeHeadDim uint32  `json:"qk_nope_head_dim"`
-	QKRopeHeadDim uint32  `json:"qk_rope_head_dim"`
-	KVLoraRank    uint32  `json:"kv_lora_rank"`
-	QLoraRank     uint32  `json:"q_lora_rank"`
-	VHeadDim      uint32  `json:"v_head_dim"`
-
-	ExpertCount            uint32  `json:"n_routed_experts"`
-	ExpertSharedCount      uint32  `json:"n_shared_experts"`
-	ExpertIntermediateSize uint32  `json:"moe_intermediate_size"`
-	ExpertUsedCount        uint32  `json:"num_experts_per_tok"`
-	ExpertWeightsNorm      bool    `json:"norm_topk_prob"`
-	ExpertWeightsScale     float32 `json:"routed_scaling_factor"`
-
-	LeadingDenseBlockCount uint32 `json:"first_k_dense_replace"`
-}
-
-func (p *glm4MoeLiteModel) KV(t *Tokenizer) KV {
-	kv := p.ModelParameters.KV(t)
-	kv["general.architecture"] = "glm4moelite"
-	kv["general.type"] = "model"
-	kv["glm4moelite.block_count"] = p.HiddenLayers
-
-	numHeads := p.NumAttentionHeads
-	numKVHeads := p.NumKeyValueHeads
-
-	kv["glm4moelite.attention.head_count"] = numHeads
-	kv["glm4moelite.attention.head_count_kv"] = numKVHeads
-	kv["glm4moelite.attention.key_length"] = p.QKNopeHeadDim + p.QKRopeHeadDim
-	kv["glm4moelite.attention.kv_lora_rank"] = p.KVLoraRank
-	kv["glm4moelite.attention.layer_norm_rms_epsilon"] = p.RMSNormEPS
-	kv["glm4moelite.attention.q_lora_rank"] = p.QLoraRank
-	kv["glm4moelite.attention.value_length"] = p.VHeadDim
-	kv["glm4moelite.context_length"] = p.MaxPositionEmbeddings
-	kv["glm4moelite.embedding_length"] = p.HiddenSize
-	kv["glm4moelite.expert_count"] = p.ExpertCount
-	kv["glm4moelite.expert_feed_forward_length"] = p.ExpertIntermediateSize
-	kv["glm4moelite.expert_shared_count"] = p.ExpertSharedCount
-
-	kv["glm4moelite.expert_gating_func"] = uint32(2)
-	kv["glm4moelite.expert_used_count"] = p.ExpertUsedCount
-	kv["glm4moelite.expert_weights_norm"] = p.ExpertWeightsNorm
-	kv["glm4moelite.expert_weights_scale"] = p.ExpertWeightsScale
-	kv["glm4moelite.feed_forward_length"] = p.IntermediateSize
-	kv["glm4moelite.leading_dense_block_count"] = p.LeadingDenseBlockCount
-
-	kv["glm4moelite.rope.dimension_count"] = p.QKRopeHeadDim
-	kv["glm4moelite.rope.freq_base"] = cmp.Or(p.RopeTheta, float32(1000000.0))
-
-	kv["glm4moelite.attention.key_length_mla"] = p.KVLoraRank + p.QKRopeHeadDim
-	kv["glm4moelite.attention.value_length_mla"] = p.KVLoraRank
-
-	kv["tokenizer.ggml.pre"] = "glm4"
-
-	return kv
-}
-
-func (p *glm4MoeLiteModel) Replacements() []string {
-	return []string{
-		"lm_head", "output",
-		"model.embed_tokens", "token_embd",
-		"model.norm", "output_norm",
-		"model.layers", "blk",
-		"input_layernorm", "attn_norm",
-		"self_attn.kv_a_proj_with_mqa", "attn_kv_a_mqa",
-		"self_attn.kv_a_layernorm", "attn_kv_a_norm",
-		"self_attn.kv_b_proj", "attn_kv_b",
-		"self_attn.q_a_proj", "attn_q_a",
-		"self_attn.q_a_layernorm", "attn_q_a_norm",
-		"self_attn.q_b_proj", "attn_q_b",
-		"self_attn.o_proj", "attn_output",
-		"post_attention_layernorm", "ffn_norm",
-		"mlp.shared_experts.down_proj", "ffn_down_shexp",
-		"mlp.shared_experts.gate_proj", "ffn_gate_shexp",
-		"mlp.shared_experts.up_proj", "ffn_up_shexp",
-		"mlp.gate_proj", "ffn_gate",
-		"mlp.down_proj", "ffn_down",
-		"mlp.up_proj", "ffn_up",
-		"mlp.gate.e_score_correction_bias", "exp_probs_b.bias",
-		"mlp.gate", "ffn_gate_inp",
-	}
-}
-
-// repackKVB extracts K or V from the combined KV_B tensor for MLA absorption.
-// K output row-major: [n_head, kv_lora_rank, qk_nope] -> GGML ne[]={qk_nope, kv_lora_rank, n_head}
-// V output row-major: [n_head, v_head, kv_lora_rank] -> GGML ne[]={kv_lora_rank, v_head, n_head}
-func (p *glm4MoeLiteModel) repackKVB(extractK bool, kvFirst bool, numHeads int) Repacker {
-	qkNope := int(p.QKNopeHeadDim)
-	vHeadDim := int(p.VHeadDim)
-	kvLoraRank := int(p.KVLoraRank)
-	kvPerHead := qkNope + vHeadDim
-
-	return func(_ string, data []float32, shape []uint64) ([]float32, error) {
-		dims := make([]int, len(shape))
-		for i := range shape {
-			dims[i] = int(shape[i])
-		}
-
-		var tt tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
-		var err error
-
-		// Normalize to [n_head * (qk_nope + v_head), kv_lora_rank] layout
-		if kvFirst {
-			tt, err = tensor.Transpose(tt, 1, 0)
-			if err != nil {
-				return nil, err
-			}
-			tt = tensor.Materialize(tt)
-		}
-
-		// Reshape to [n_head, qk_nope + v_head, kv_lora_rank]
-		if err := tt.Reshape(numHeads, kvPerHead, kvLoraRank); err != nil {
-			return nil, err
-		}
-
-		if extractK {
-			// Slice K: [n_head, qk_nope, kv_lora_rank]
-			tt, err = tt.Slice(nil, tensor.S(0, qkNope), nil)
-			if err != nil {
-				return nil, err
-			}
-			tt = tensor.Materialize(tt)
-			// Transpose to [n_head, kv_lora_rank, qk_nope]
-			tt, err = tensor.Transpose(tt, 0, 2, 1)
-			if err != nil {
-				return nil, err
-			}
-			tt = tensor.Materialize(tt)
-		} else {
-			// Slice V: [n_head, v_head, kv_lora_rank] - already correct layout
-			tt, err = tt.Slice(nil, tensor.S(qkNope, kvPerHead), nil)
-			if err != nil {
-				return nil, err
-			}
-			tt = tensor.Materialize(tt)
-		}
-
-		if err := tt.Reshape(tt.Shape().TotalSize()); err != nil {
-			return nil, err
-		}
-		return native.VectorF32(tt.(*tensor.Dense))
-	}
-}
-
-func (p *glm4MoeLiteModel) Tensors(s []Tensor) (out []*ggml.Tensor) {
-	merges := make([]merge, p.HiddenLayers*3)
-	for i := range p.HiddenLayers {
-		merges[i*3+0] = merge{
-			fmt.Sprintf("blk.%d.mlp.experts.*.gate_proj.weight", i),
-			fmt.Sprintf("blk.%d.ffn_gate_exps.weight", i),
-		}
-		merges[i*3+1] = merge{
-			fmt.Sprintf("blk.%d.mlp.experts.*.up_proj.weight", i),
-			fmt.Sprintf("blk.%d.ffn_up_exps.weight", i),
-		}
-		merges[i*3+2] = merge{
-			fmt.Sprintf("blk.%d.mlp.experts.*.down_proj.weight", i),
-			fmt.Sprintf("blk.%d.ffn_down_exps.weight", i),
-		}
-	}
-
-	skipLayer := func(n string, minValue uint32) bool {
-		re := regexp.MustCompile(`^blk\.(\d+)`)
-		matches := re.FindStringSubmatch(n)
-		if matches == nil {
-			return false
-		}
-
-		blkNum, err := strconv.Atoi(matches[1])
-		if err != nil {
-			return false
-		}
-
-		return uint32(blkNum) >= minValue
-	}
-
-	out, s = mergeTensors(s, merges...)
-	for _, t := range s {
-		// skip any additional layers (such as the Multi-Token Prediction layer)
-		if skipLayer(t.Name(), p.HiddenLayers) {
-			slog.Debug("skipping layer", "name", t.Name())
-			continue
-		}
-
-		// Split attn_kv_b into separate attn_k_b and attn_v_b for MLA absorption
-		if strings.HasSuffix(t.Name(), ".attn_kv_b.weight") {
-			qkNope := int(p.QKNopeHeadDim)
-			vHeadDim := int(p.VHeadDim)
-			kvLoraRank := int(p.KVLoraRank)
-			kvPerHead := qkNope + vHeadDim
-			numHeads := int(p.NumAttentionHeads)
-			kvFirst := true
-			if len(t.Shape()) == 2 {
-				switch {
-				case int(t.Shape()[0]) == kvLoraRank:
-					if kvPerHead > 0 && int(t.Shape()[1])%kvPerHead == 0 {
-						numHeads = int(t.Shape()[1]) / kvPerHead
-					}
-					kvFirst = true
-				case int(t.Shape()[1]) == kvLoraRank:
-					if kvPerHead > 0 && int(t.Shape()[0])%kvPerHead == 0 {
-						numHeads = int(t.Shape()[0]) / kvPerHead
-					}
-					kvFirst = false
-				default:
-					slog.Warn("glm4moelite: unexpected attn_kv_b layout", "name", t.Name(), "shape", t.Shape())
-				}
-			}
-
-			kTensor := t.Clone()
-			kTensor.SetRepacker(p.repackKVB(true, kvFirst, numHeads))
-			out = append(out, &ggml.Tensor{
-				Name:     strings.Replace(t.Name(), "attn_kv_b", "attn_k_b", 1),
-				Kind:     t.Kind(),
-				Shape:    []uint64{uint64(numHeads), uint64(kvLoraRank), uint64(qkNope)},
-				WriterTo: kTensor,
-			})
-
-			vTensor := t.Clone()
-			vTensor.SetRepacker(p.repackKVB(false, kvFirst, numHeads))
-			out = append(out, &ggml.Tensor{
-				Name:     strings.Replace(t.Name(), "attn_kv_b", "attn_v_b", 1),
-				Kind:     t.Kind(),
-				Shape:    []uint64{uint64(numHeads), uint64(vHeadDim), uint64(kvLoraRank)},
-				WriterTo: vTensor,
-			})
-			continue
-		}
-
-		out = append(out, &ggml.Tensor{
-			Name:     t.Name(),
-			Kind:     t.Kind(),
-			Shape:    t.Shape(),
-			WriterTo: t,
-		})
-	}
-	return out
-}
--- a/convert/convert_lfm2.go
+++ b/convert/convert_lfm2.go
@@ -1,100 +0,0 @@
-package convert
-
-import (
-	"slices"
-	"strings"
-
-	"github.com/ollama/ollama/fs/ggml"
-)
-
-type lfm2Model struct {
-	ModelParameters
-	HiddenSize            uint32   `json:"hidden_size"`
-	NumHiddenLayers       uint32   `json:"num_hidden_layers"`
-	MaxPositionEmbeddings uint32   `json:"max_position_embeddings"`
-	IntermediateSize      uint32   `json:"intermediate_size"`
-	NumAttentionHeads     uint32   `json:"num_attention_heads"`
-	NumKeyValueHeads      uint32   `json:"num_key_value_heads"`
-	RopeTheta             float32  `json:"rope_theta"`
-	NormEps               float32  `json:"norm_eps"`
-	ConvLCache            uint32   `json:"conv_L_cache"`
-	LayerTypes            []string `json:"layer_types"`
-	TieEmbedding          bool     `json:"tie_embedding"`
-}
-
-var _ ModelConverter = (*lfm2Model)(nil)
-
-func (p *lfm2Model) KV(t *Tokenizer) KV {
-	kv := p.ModelParameters.KV(t)
-	kv["general.architecture"] = "lfm2"
-	kv["lfm2.vocab_size"] = p.VocabSize
-	kv["lfm2.block_count"] = p.NumHiddenLayers
-	kv["lfm2.embedding_length"] = p.HiddenSize
-	kv["lfm2.feed_forward_length"] = p.IntermediateSize
-	kv["lfm2.context_length"] = p.MaxPositionEmbeddings
-
-	// Build per-layer KV head count array based on layer_types
-	// (0 = shortconv layer, non-zero = attention layer with that many KV heads)
-	kvHeadCounts := make([]uint32, p.NumHiddenLayers)
-	for i := range p.NumHiddenLayers {
-		if int(i) < len(p.LayerTypes) && p.LayerTypes[i] == "full_attention" {
-			kvHeadCounts[i] = p.NumKeyValueHeads
-		}
-	}
-
-	kv["lfm2.attention.head_count"] = p.NumAttentionHeads
-	kv["lfm2.attention.head_count_kv"] = kvHeadCounts
-	kv["lfm2.attention.key_length"] = p.HiddenSize / p.NumAttentionHeads
-	kv["lfm2.attention.value_length"] = p.HiddenSize / p.NumAttentionHeads
-	kv["lfm2.attention.layer_norm_rms_epsilon"] = p.NormEps
-	kv["lfm2.rope.freq_base"] = p.RopeTheta
-	kv["lfm2.shortconv.l_cache"] = p.ConvLCache
-
-	return kv
-}
-
-func (p *lfm2Model) Tensors(ts []Tensor) []*ggml.Tensor {
-	var out []*ggml.Tensor
-
-	for _, t := range ts {
-		shape := t.Shape()
-
-		// Squeeze conv weights: [D, 1, K] -> [D, K]
-		if strings.HasSuffix(t.Name(), "shortconv.conv.weight") {
-			if len(shape) == 3 && shape[1] == 1 {
-				shape = []uint64{shape[0], shape[2]}
-			}
-		}
-
-		out = append(out, &ggml.Tensor{
-			Name:     t.Name(),
-			Kind:     t.Kind(),
-			Shape:    slices.Clone(shape),
-			WriterTo: t,
-		})
-	}
-
-	return out
-}
-
-func (p *lfm2Model) Replacements() []string {
-	return []string{
-		"model.embed_tokens", "token_embd",
-		"model.embedding_norm", "output_norm",
-		"model.layers", "blk",
-		"operator_norm", "attn_norm",
-		"self_attn.q_proj", "attn_q",
-		"self_attn.k_proj", "attn_k",
-		"self_attn.v_proj", "attn_v",
-		"self_attn.out_proj", "attn_output",
-		"self_attn.q_layernorm", "attn_q_norm",
-		"self_attn.k_layernorm", "attn_k_norm",
-		"conv.conv", "shortconv.conv",
-		"conv.in_proj", "shortconv.in_proj",
-		"conv.out_proj", "shortconv.out_proj",
-		"feed_forward.w1", "ffn_gate",
-		"feed_forward.w2", "ffn_down",
-		"feed_forward.w3", "ffn_up",
-		"ffn_norm", "ffn_norm",
-	}
-}
--- a/convert/reader.go
+++ b/convert/reader.go
@@ -40,7 +40,6 @@ const (
 func (t tensorBase) Kind() uint32 {
 	if strings.HasSuffix(t.name, ".ffn_gate_inp.weight") ||
 		strings.HasSuffix(t.name, ".bias") ||
-		strings.HasSuffix(t.name, ".shortconv.conv.weight") ||
 		t.name == "token_types.weight" ||
 		t.name == "v.positional_embedding_vlm" ||
 		t.name == "v.tile_position_embd.weight" ||
--- a/docs/api.md
+++ b/docs/api.md
@@ -16,7 +16,6 @@
 - [Generate Embeddings](#generate-embeddings)
 - [List Running Models](#list-running-models)
 - [Version](#version)
-- [Experimental: Image Generation](#image-generation-experimental)

 ## Conventions

@@ -59,15 +58,6 @@ Advanced parameters (optional):
 - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
 - `context` (deprecated): the context parameter returned from a previous request to `/generate`, this can be used to keep a short conversational memory

-Experimental image generation parameters (for image generation models only):
-
-> [!WARNING]
-> These parameters are experimental and may change in future versions.
-
-- `width`: width of the generated image in pixels
-- `height`: height of the generated image in pixels
-- `steps`: number of diffusion steps
-
 #### Structured outputs

 Structured outputs are supported by providing a JSON schema in the `format` parameter. The model will generate a response that matches the schema. See the [structured outputs](#request-structured-outputs) example below.
@@ -1877,55 +1867,3 @@ curl http://localhost:11434/api/version
  "version": "0.5.1"
 }
 ```
-
-## Experimental Features
-
-### Image Generation (Experimental)
-
-> [!WARNING]
-> Image generation is experimental and may change in future versions.
-
-Image generation is now supported through the standard `/api/generate` endpoint when using image generation models. The API automatically detects when an image generation model is being used.
-
-See the [Generate a completion](#generate-a-completion) section for the full API documentation. The experimental image generation parameters (`width`, `height`, `steps`) are documented there.
-
-#### Example
-
-##### Request
-
-```shell
-curl http://localhost:11434/api/generate -d '{
-  "model": "x/z-image-turbo",
-  "prompt": "a sunset over mountains",
-  "width": 1024,
-  "height": 768
-}'
-```
-
-##### Response (streaming)
-
-Progress updates during generation:
-
-```json
-{
-  "model": "x/z-image-turbo",
-  "created_at": "2024-01-15T10:30:00.000000Z",
-  "completed": 5,
-  "total": 20,
-  "done": false
-}
-```
-
-##### Final Response
-
-```json
-{
-  "model": "x/z-image-turbo",
-  "created_at": "2024-01-15T10:30:15.000000Z",
-  "image": "iVBORw0KGgoAAAANSUhEUg...",
-  "done": true,
-  "done_reason": "stop",
-  "total_duration": 15000000000,
-  "load_duration": 2000000000
-}
-```
--- a/docs/api/anthropic-compatibility.mdx
+++ b/docs/api/anthropic-compatibility.mdx
@@ -21,7 +21,6 @@ ollama pull glm-4.7:cloud
 To use Ollama with tools that expect the Anthropic API (like Claude Code), set these environment variables:

 ```shell
-export ANTHROPIC_AUTH_TOKEN=ollama  # required but ignored
 export ANTHROPIC_BASE_URL=http://localhost:11434
 export ANTHROPIC_API_KEY=ollama  # required but ignored
 ```
@@ -248,13 +247,12 @@ curl -X POST http://localhost:11434/v1/messages \
 [Claude Code](https://code.claude.com/docs/en/overview) can be configured to use Ollama as its backend:

 ```shell
-ANTHROPIC_AUTH_TOKEN=ollama ANTHROPIC_BASE_URL=http://localhost:11434 ANTHROPIC_API_KEY=ollama claude --model qwen3-coder
+ANTHROPIC_BASE_URL=http://localhost:11434 ANTHROPIC_API_KEY=ollama claude --model qwen3-coder
 ```

 Or set the environment variables in your shell profile:

 ```shell
-export ANTHROPIC_AUTH_TOKEN=ollama
 export ANTHROPIC_BASE_URL=http://localhost:11434
 export ANTHROPIC_API_KEY=ollama
 ```
--- a/docs/api/openai-compatibility.mdx
+++ b/docs/api/openai-compatibility.mdx
@@ -275,73 +275,6 @@ curl -X POST http://localhost:11434/v1/chat/completions \
 - [x] `dimensions`
 - [ ] `user`

-### `/v1/images/generations` (experimental)
-
-> Note: This endpoint is experimental and may change or be removed in future versions.
-
-Generate images using image generation models.
-
-<CodeGroup dropdown>
-
-```python images.py
-from openai import OpenAI
-
-client = OpenAI(
-    base_url='http://localhost:11434/v1/',
-    api_key='ollama',  # required but ignored
-)
-
-response = client.images.generate(
-    model='x/z-image-turbo',
-    prompt='A cute robot learning to paint',
-    size='1024x1024',
-    response_format='b64_json',
-)
-print(response.data[0].b64_json[:50] + '...')
-```
-
-```javascript images.js
-import OpenAI from "openai";
-
-const openai = new OpenAI({
-  baseURL: "http://localhost:11434/v1/",
-  apiKey: "ollama", // required but ignored
-});
-
-const response = await openai.images.generate({
-  model: "x/z-image-turbo",
-  prompt: "A cute robot learning to paint",
-  size: "1024x1024",
-  response_format: "b64_json",
-});
-
-console.log(response.data[0].b64_json.slice(0, 50) + "...");
-```
-
-```shell images.sh
-curl -X POST http://localhost:11434/v1/images/generations \
-H "Content-Type: application/json" \
-d '{
-  "model": "x/z-image-turbo",
-  "prompt": "A cute robot learning to paint",
-  "size": "1024x1024",
-  "response_format": "b64_json"
-}'
-```
-
-</CodeGroup>
-
-#### Supported request fields
-
-- [x] `model`
-- [x] `prompt`
-- [x] `size` (e.g. "1024x1024")
-- [x] `response_format` (only `b64_json` supported)
-- [ ] `n`
-- [ ] `quality`
-- [ ] `style`
-- [ ] `user`
-
 ### `/v1/responses`

 > Note: Added in Ollama v0.13.3
--- a/docs/capabilities/web-search.mdx
+++ b/docs/capabilities/web-search.mdx
@@ -110,7 +110,7 @@ More Ollama [Python example](https://github.com/ollama/ollama-python/blob/main/e
 import { Ollama } from "ollama";

 const client = new Ollama();
-const results = await client.webSearch("what is ollama?");
+const results = await client.webSearch({ query: "what is ollama?" });
 console.log(JSON.stringify(results, null, 2));
 ```

@@ -213,7 +213,7 @@ models](https://ollama.com/models)\n\nAvailable for macOS, Windows, and Linux',
 import { Ollama } from "ollama";

 const client = new Ollama();
-const fetchResult = await client.webFetch("https://ollama.com");
+const fetchResult = await client.webFetch({ url: "https://ollama.com" });
 console.log(JSON.stringify(fetchResult, null, 2));
 ```

--- a/docs/docs.json
+++ b/docs/docs.json
@@ -111,9 +111,7 @@
              "/integrations/zed",
              "/integrations/roo-code",
              "/integrations/n8n",
-              "/integrations/xcode",
-              "/integrations/onyx",
-              "/integrations/marimo"
+              "/integrations/xcode"
            ]
          },
          {
--- a/docs/faq.mdx
+++ b/docs/faq.mdx
@@ -22,7 +22,7 @@ Please refer to the [GPU docs](./gpu).

 ## How can I specify the context window size?

-By default, Ollama uses a context window size of 4096 tokens.
+By default, Ollama uses a context window size of 2048 tokens.

 This can be overridden with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context window to 8K, use:

--- a/docs/images/marimo-add-model.png
+++ b/docs/images/marimo-add-model.png
--- a/docs/images/marimo-chat.png
+++ b/docs/images/marimo-chat.png
--- a/docs/images/marimo-code-completion.png
+++ b/docs/images/marimo-code-completion.png
--- a/docs/images/marimo-models.png
+++ b/docs/images/marimo-models.png
--- a/docs/images/marimo-settings.png
+++ b/docs/images/marimo-settings.png
--- a/docs/images/onyx-login.png
+++ b/docs/images/onyx-login.png
--- a/docs/images/onyx-ollama-form.png
+++ b/docs/images/onyx-ollama-form.png
--- a/docs/images/onyx-ollama-llm.png
+++ b/docs/images/onyx-ollama-llm.png
--- a/docs/images/onyx-query.png
+++ b/docs/images/onyx-query.png
--- a/docs/integrations/claude-code.mdx
+++ b/docs/integrations/claude-code.mdx
@@ -2,12 +2,6 @@
 title: Claude Code
 ---

-Claude Code is Anthropic's agentic coding tool that can read, modify, and execute code in your working directory. 
-
-Open models can be used with Claude Code through Ollama's Anthropic-compatible API, enabling you to use models such as `qwen3-coder`, `gpt-oss:20b`, or other models.
-
-![Claude Code with Ollama](https://files.ollama.com/claude-code.png)
-
 ## Install

 Install [Claude Code](https://code.claude.com/docs/en/overview):
@@ -31,24 +25,22 @@ Claude Code connects to Ollama using the Anthropic-compatible API.
 1. Set the environment variables:

 ```shell
-export ANTHROPIC_AUTH_TOKEN=ollama
 export ANTHROPIC_BASE_URL=http://localhost:11434
+export ANTHROPIC_API_KEY=ollama
 ```

 2. Run Claude Code with an Ollama model:

 ```shell
-claude --model gpt-oss:20b
+claude --model qwen3-coder
 ```

 Or run with environment variables inline:

 ```shell
-ANTHROPIC_AUTH_TOKEN=ollama ANTHROPIC_BASE_URL=http://localhost:11434 claude --model gpt-oss:20b
+ANTHROPIC_BASE_URL=http://localhost:11434 ANTHROPIC_API_KEY=ollama claude --model qwen3-coder
 ```

-**Note:** Claude Code requires a large context window. We recommend at least 32K tokens. See the [context length documentation](/context-length) for how to adjust context length in Ollama.
-
 ## Connecting to ollama.com

 1. Create an [API key](https://ollama.com/settings/keys) on ollama.com
@@ -75,4 +67,3 @@ claude --model glm-4.7:cloud
 ### Local models
 - `qwen3-coder` - Excellent for coding tasks
 - `gpt-oss:20b` - Strong general-purpose model
-- `gpt-oss:120b` - Larger general-purpose model for more complex tasks
--- a/docs/integrations/marimo.mdx
+++ b/docs/integrations/marimo.mdx
@@ -1,73 +0,0 @@
----
-title: marimo
----
-
-## Install
-
-Install [marimo](https://marimo.io). You can use `pip` or `uv` for this. You 
-can also use `uv` to create a sandboxed environment for marimo by running:
-
-```
-uvx marimo edit --sandbox notebook.py
-```
-
-## Usage with Ollama
-
-1. In marimo, go to the user settings and go to the AI tab. From here
-you can find and configure Ollama as an AI provider. For local use you
-would typically point the base url to `http://localhost:11434/v1`.
-
-<div style={{ display: 'flex', justifyContent: 'center' }}>
-  <img 
-    src="/images/marimo-settings.png" 
-    alt="Ollama settings in marimo"
-    width="50%"
-  />
-</div>
-
-2. Once the AI provider is set up, you can turn on/off specific AI models you'd like to access. 
-
-<div style={{ display: 'flex', justifyContent: 'center' }}>
-  <img 
-    src="/images/marimo-models.png" 
-    alt="Selecting an Ollama model"
-    width="50%"
-  />
-</div>
-
-3. You can also add a model to the list of available models by scrolling to the bottom and using the UI there. 
-
-<div style={{ display: 'flex', justifyContent: 'center' }}>
-  <img 
-    src="/images/marimo-add-model.png" 
-    alt="Adding a new Ollama model"
-    width="50%"
-  />
-</div>
-
-4. Once configured, you can now use Ollama for AI chats in marimo.
-
-<div style={{ display: 'flex', justifyContent: 'center' }}>
-  <img 
-    src="/images/marimo-chat.png" 
-    alt="Configure code completion"
-    width="50%"
-  />
-</div>
-
-4. Alternatively, you can now use Ollama for **inline code completion** in marimo. This can be configured in the "AI Features" tab. 
-
-<div style={{ display: 'flex', justifyContent: 'center' }}>
-  <img 
-    src="/images/marimo-code-completion.png" 
-    alt="Configure code completion"
-    width="50%"
-  />
-</div>
-
-
-## Connecting to ollama.com
-
-1. Sign in to ollama cloud via `ollama signin` 
-2. In the ollama model settings add a model that ollama hosts, like `gpt-oss:120b`.
-3. You can now refer to this model in marimo!
--- a/docs/integrations/onyx.mdx
+++ b/docs/integrations/onyx.mdx
@@ -1,63 +0,0 @@
----
-title: Onyx
----
-
-## Overview
-[Onyx](http://onyx.app/) is a self-hostable Chat UI that integrates with all Ollama models. Features include:
-- Creating custom Agents
-- Web search
-- Deep Research
-- RAG over uploaded documents and connected apps
-- Connectors to applications like Google Drive, Email, Slack, etc.
-- MCP and OpenAPI Actions support
-- Image generation
-- User/Groups management, RBAC, SSO, etc.
-
-Onyx can be deployed for single users or large organizations.
-
-## Install Onyx
-
-Deploy Onyx with the [quickstart guide](https://docs.onyx.app/deployment/getting_started/quickstart).
-
-<Info>
-Resourcing/scaling docs [here](https://docs.onyx.app/deployment/getting_started/resourcing).
-</Info>
-
-## Usage with Ollama 
-
-1. Login to your Onyx deployment (create an account first).
-<div style={{ display: 'flex', justifyContent: 'center' }}>
-  <img 
-    src="/images/onyx-login.png" 
-    alt="Onyx Login Page"
-    width="75%"
-  />
-</div>
-2. In the set-up process select `Ollama` as the LLM provider.
-<div style={{ display: 'flex', justifyContent: 'center' }}>
-  <img 
-    src="/images/onyx-ollama-llm.png" 
-    alt="Onyx Set Up Form"
-    width="75%"
-  />
-</div>
-3. Provide your **Ollama API URL** and select your models.
-<Note>If you're running Onyx in Docker, to access your computer's local network use `http://host.docker.internal` instead of `http://127.0.0.1`.</Note>
-<div style={{ display: 'flex', justifyContent: 'center' }}>
-  <img 
-    src="/images/onyx-ollama-form.png" 
-    alt="Selecting Ollama Models"
-    width="75%"
-  />
-</div>
-
-You can also easily connect up Onyx Cloud with the `Ollama Cloud` tab of the setup.
-
-## Send your first query
-<div style={{ display: 'flex', justifyContent: 'center' }}>
-  <img 
-    src="/images/onyx-query.png" 
-    alt="Onyx Query Example"
-    width="75%"
-  />
-</div>
--- a/docs/linux.mdx
+++ b/docs/linux.mdx
@@ -1,5 +1,5 @@
 ---
-title: Linux
+title: "Linux"
 ---

 ## Install
@@ -13,15 +13,14 @@ curl -fsSL https://ollama.com/install.sh | sh
 ## Manual install

 <Note>
-  If you are upgrading from a prior version, you should remove the old libraries
-  with `sudo rm -rf /usr/lib/ollama` first.
+  If you are upgrading from a prior version, you should remove the old libraries with `sudo rm -rf /usr/lib/ollama` first.
 </Note>

 Download and extract the package:

 ```shell
-curl -fsSL https://ollama.com/download/ollama-linux-amd64.tar.zst \
-    | sudo tar x -C /usr
+curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz \
+    | sudo tar zx -C /usr
 ```

 Start Ollama:
@@ -41,8 +40,8 @@ ollama -v
 If you have an AMD GPU, also download and extract the additional ROCm package:

 ```shell
-curl -fsSL https://ollama.com/download/ollama-linux-amd64-rocm.tar.zst \
-    | sudo tar x -C /usr
+curl -fsSL https://ollama.com/download/ollama-linux-amd64-rocm.tgz \
+    | sudo tar zx -C /usr
 ```

 ### ARM64 install
@@ -50,8 +49,8 @@ curl -fsSL https://ollama.com/download/ollama-linux-amd64-rocm.tar.zst \
 Download and extract the ARM64-specific package:

 ```shell
-curl -fsSL https://ollama.com/download/ollama-linux-arm64.tar.zst \
-    | sudo tar x -C /usr
+curl -fsSL https://ollama.com/download/ollama-linux-arm64.tgz \
+    | sudo tar zx -C /usr
 ```

 ### Adding Ollama as a startup service (recommended)
@@ -113,11 +112,7 @@ sudo systemctl status ollama
 ```

 <Note>
-  While AMD has contributed the `amdgpu` driver upstream to the official linux
-  kernel source, the version is older and may not support all ROCm features. We
-  recommend you install the latest driver from
-  https://www.amd.com/en/support/linux-drivers for best support of your Radeon
-  GPU.
+  While AMD has contributed the `amdgpu` driver upstream to the official linux kernel source, the version is older and may not support all ROCm features. We recommend you install the latest driver from https://www.amd.com/en/support/linux-drivers for best support of your Radeon GPU.
 </Note>

 ## Customizing
@@ -146,8 +141,8 @@ curl -fsSL https://ollama.com/install.sh | sh
 Or by re-downloading Ollama:

 ```shell
-curl -fsSL https://ollama.com/download/ollama-linux-amd64.tar.zst \
-    | sudo tar x -C /usr
+curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz \
+    | sudo tar zx -C /usr
 ```

 ## Installing specific versions
@@ -196,4 +191,4 @@ Remove the downloaded models and Ollama service user and group:
 sudo userdel ollama
 sudo groupdel ollama
 sudo rm -r /usr/share/ollama
-```
+```
--- a/docs/troubleshooting.md
+++ b/docs/troubleshooting.md
@@ -0,0 +1,3 @@
+# Troubleshooting
+
+For troubleshooting, see [https://docs.ollama.com/troubleshooting](https://docs.ollama.com/troubleshooting)
--- a/fs/ggml/ggml.go
+++ b/fs/ggml/ggml.go
@@ -269,8 +269,6 @@ func (kv KV) OllamaEngineRequired() bool {
 		"qwen25vl",
 		"qwen3", "qwen3moe",
 		"qwen3vl", "qwen3vlmoe",
-		"glm4moelite",
-		"lfm2",
 	}, kv.Architecture())
 }

@@ -858,9 +856,7 @@ func (f GGML) FlashAttention() bool {
 	return slices.Contains([]string{
 		"bert",
 		"gemma3",
-		"glm4moelite",
 		"gptoss", "gpt-oss",
-		"lfm2",
 		"mistral3",
 		"olmo3",
 		"qwen3", "qwen3moe",
--- a/go.mod
+++ b/go.mod
@@ -15,8 +15,8 @@ require (
 	github.com/spf13/cobra v1.7.0
 	github.com/stretchr/testify v1.9.0
 	github.com/x448/float16 v0.8.4
-	golang.org/x/sync v0.17.0
-	golang.org/x/sys v0.37.0
+	golang.org/x/sync v0.19.0
+	golang.org/x/sys v0.39.0
 )

 require (
@@ -30,8 +30,8 @@ require (
 	github.com/tkrajina/typescriptify-golang-structs v0.2.0
 	github.com/wk8/go-ordered-map/v2 v2.1.8
 	golang.org/x/image v0.22.0
-	golang.org/x/mod v0.30.0
-	golang.org/x/tools v0.38.0
+	golang.org/x/mod v0.31.0
+	golang.org/x/tools v0.40.0
 	gonum.org/v1/gonum v0.15.0
 )

@@ -81,11 +81,11 @@ require (
 	github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
 	github.com/ugorji/go/codec v1.2.12 // indirect
 	golang.org/x/arch v0.8.0 // indirect
-	golang.org/x/crypto v0.43.0
-	golang.org/x/exp v0.0.0-20250218142911-aa4b98e5adaa // indirect
-	golang.org/x/net v0.46.0 // indirect
-	golang.org/x/term v0.36.0
-	golang.org/x/text v0.30.0
+	golang.org/x/crypto v0.46.0
+	golang.org/x/exp v0.0.0-20251219203646-944ab1f22d93
+	golang.org/x/net v0.48.0 // indirect
+	golang.org/x/term v0.38.0
+	golang.org/x/text v0.32.0
 	google.golang.org/protobuf v1.34.1
 	gopkg.in/yaml.v3 v3.0.1 // indirect
 )
--- a/go.sum
+++ b/go.sum
@@ -233,16 +233,16 @@ golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACk
 golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
 golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
 golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
-golang.org/x/crypto v0.43.0 h1:dduJYIi3A3KOfdGOHX8AVZ/jGiyPa3IbBozJ5kNuE04=
-golang.org/x/crypto v0.43.0/go.mod h1:BFbav4mRNlXJL4wNeejLpWxB7wMbc79PdRGhWKncxR0=
+golang.org/x/crypto v0.46.0 h1:cKRW/pmt1pKAfetfu+RCEvjvZkA9RimPbh7bhFjGVBU=
+golang.org/x/crypto v0.46.0/go.mod h1:Evb/oLKmMraqjZ2iQTwDwvCtJkczlDuTmdJXoZVzqU0=
 golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
 golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
 golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
 golang.org/x/exp v0.0.0-20190125153040-c74c464bbbf2/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
 golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
 golang.org/x/exp v0.0.0-20191002040644-a1355ae1e2c3/go.mod h1:NOZ3BPKG0ec/BKJQgnvsSFpcKLM5xXVWnvZS97DWHgE=
-golang.org/x/exp v0.0.0-20250218142911-aa4b98e5adaa h1:t2QcU6V556bFjYgu4L6C+6VrCPyJZ+eyRsABUPs1mz4=
-golang.org/x/exp v0.0.0-20250218142911-aa4b98e5adaa/go.mod h1:BHOTPb3L19zxehTsLoJXVaTktb06DFgmdW6Wb9s8jqk=
+golang.org/x/exp v0.0.0-20251219203646-944ab1f22d93 h1:fQsdNF2N+/YewlRZiricy4P1iimyPKZ/xwniHj8Q2a0=
+golang.org/x/exp v0.0.0-20251219203646-944ab1f22d93/go.mod h1:EPRbTFwzwjXj9NpYyyrvenVh9Y+GFeEvMNh7Xuz7xgU=
 golang.org/x/image v0.0.0-20180708004352-c73c2afc3b81/go.mod h1:ux5Hcp/YLpHSI86hEcLt0YII63i6oz57MZXIpbrjZUs=
 golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js=
 golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=
@@ -264,8 +264,8 @@ golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzB
 golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
 golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
 golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
-golang.org/x/mod v0.30.0 h1:fDEXFVZ/fmCKProc/yAXXUijritrDzahmwwefnjoPFk=
-golang.org/x/mod v0.30.0/go.mod h1:lAsf5O2EvJeSFMiBxXDki7sCgAxEUcZHXoXMKT4GJKc=
+golang.org/x/mod v0.31.0 h1:HaW9xtz0+kOcWKwli0ZXy79Ix+UW/vOfmWI5QVd2tgI=
+golang.org/x/mod v0.31.0/go.mod h1:43JraMp9cGx1Rx3AqioxrbrhNsLl2l/iNAvuBkrezpg=
 golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
 golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
 golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
@@ -278,8 +278,8 @@ golang.org/x/net v0.0.0-20200822124328-c89045814202/go.mod h1:/O7V0waA8r7cgGh81R
 golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
 golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96bSt6lcn1PtDYWL6XObtHCRCNQM=
 golang.org/x/net v0.0.0-20210614182718-04defd469f4e/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
-golang.org/x/net v0.46.0 h1:giFlY12I07fugqwPuWJi68oOnpfqFnJIJzaIIm2JVV4=
-golang.org/x/net v0.46.0/go.mod h1:Q9BGdFy1y4nkUwiLvT5qtyhAnEHgnQ/zd8PfU6nc210=
+golang.org/x/net v0.48.0 h1:zyQRTTrjc33Lhh0fBgT/H3oZq9WuvRR5gPC70xpDiQU=
+golang.org/x/net v0.48.0/go.mod h1:+ndRgGjkh8FGtu1w1FGbEC31if4VrNVMuKTgcAAnQRY=
 golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
 golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
 golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
@@ -289,8 +289,8 @@ golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJ
 golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
-golang.org/x/sync v0.17.0 h1:l60nONMj9l5drqw6jlhIELNv9I0A4OFgRsG9k2oT9Ug=
-golang.org/x/sync v0.17.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
+golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4=
+golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
 golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
@@ -306,17 +306,17 @@ golang.org/x/sys v0.0.0-20210510120138-977fb7262007/go.mod h1:oPkhp1MJrh7nUepCBc
 golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.37.0 h1:fdNQudmxPjkdUTPnLn5mdQv7Zwvbvpaxqs831goi9kQ=
-golang.org/x/sys v0.37.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
+golang.org/x/sys v0.39.0 h1:CvCKL8MeisomCi6qNZ+wbb0DN9E5AATixKsvNtMoMFk=
+golang.org/x/sys v0.39.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
 golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
-golang.org/x/term v0.36.0 h1:zMPR+aF8gfksFprF/Nc/rd1wRS1EI6nDBGyWAvDzx2Q=
-golang.org/x/term v0.36.0/go.mod h1:Qu394IJq6V6dCBRgwqshf3mPF85AqzYEzofzRdZkWss=
+golang.org/x/term v0.38.0 h1:PQ5pkm/rLO6HnxFR7N2lJHOZX6Kez5Y1gDSJla6jo7Q=
+golang.org/x/term v0.38.0/go.mod h1:bSEAKrOT1W+VSu9TSCMtoGEOUcKxOKgl3LE5QEF/xVg=
 golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
 golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
 golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
 golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
-golang.org/x/text v0.30.0 h1:yznKA/E9zq54KzlzBEAWn1NXSQ8DIp/NYMy88xJjl4k=
-golang.org/x/text v0.30.0/go.mod h1:yDdHFIX9t+tORqspjENWgzaCVXgk0yYnYuSZ8UzzBVM=
+golang.org/x/text v0.32.0 h1:ZD01bjUt1FQ9WJ0ClOL5vxgxOI/sVCNgX1YtKwcY0mU=
+golang.org/x/text v0.32.0/go.mod h1:o/rUWzghvpD5TXrTIBuJU77MTaN0ljMWE47kxGJQ7jY=
 golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
@@ -330,8 +330,8 @@ golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod h1:TB2adYChydJhpapK
 golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
 golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
 golang.org/x/tools v0.1.4/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk=
-golang.org/x/tools v0.38.0 h1:Hx2Xv8hISq8Lm16jvBZ2VQf+RLmbd7wVUsALibYI/IQ=
-golang.org/x/tools v0.38.0/go.mod h1:yEsQ/d/YK8cjh0L6rZlY8tgtlKiBNTL14pGDJPJpYQs=
+golang.org/x/tools v0.40.0 h1:yLkxfA+Qnul4cs9QA3KnlFu0lVmd8JJfoq+E41uSutA=
+golang.org/x/tools v0.40.0/go.mod h1:Ik/tzLRlbscWpqqMRjyWYDisX8bG13FrdXp3o4Sr9lc=
 golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
 golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
 golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
--- a/integration/imagegen_test.go
+++ b/integration/imagegen_test.go
@@ -1,148 +0,0 @@
-//go:build integration
-
-package integration
-
-import (
-	"context"
-	"encoding/base64"
-	"fmt"
-	"strings"
-	"testing"
-	"time"
-
-	"github.com/ollama/ollama/api"
-)
-
-func TestImageGeneration(t *testing.T) {
-	skipUnderMinVRAM(t, 8)
-
-	type testCase struct {
-		imageGenModel string
-		visionModel   string
-		prompt        string
-		expectedWords []string
-	}
-
-	testCases := []testCase{
-		{
-			imageGenModel: "jmorgan/z-image-turbo",
-			visionModel:   "llama3.2-vision",
-			prompt:        "A cartoon style llama flying like a superhero through the air with clouds in the background",
-			expectedWords: []string{"llama", "flying", "cartoon", "cloud", "sky", "superhero", "air", "animal", "camelid"},
-		},
-	}
-
-	for _, tc := range testCases {
-		t.Run(fmt.Sprintf("%s->%s", tc.imageGenModel, tc.visionModel), func(t *testing.T) {
-			ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
-			defer cancel()
-
-			client, _, cleanup := InitServerConnection(ctx, t)
-			defer cleanup()
-
-			// Pull both models
-			if err := PullIfMissing(ctx, client, tc.imageGenModel); err != nil {
-				t.Fatalf("failed to pull image gen model: %v", err)
-			}
-			if err := PullIfMissing(ctx, client, tc.visionModel); err != nil {
-				t.Fatalf("failed to pull vision model: %v", err)
-			}
-
-			// Generate the image
-			t.Logf("Generating image with prompt: %s", tc.prompt)
-			imageBase64, err := generateImage(ctx, client, tc.imageGenModel, tc.prompt)
-			if err != nil {
-				if strings.Contains(err.Error(), "image generation not available") {
-					t.Skip("Target system does not support image generation")
-				} else if strings.Contains(err.Error(), "executable file not found in") { // Windows pattern, not yet supported
-					t.Skip("Windows does not support image generation yet")
-				} else if strings.Contains(err.Error(), "CUDA driver version is insufficient") {
-					t.Skip("Driver is too old")
-				} else if strings.Contains(err.Error(), "insufficient memory for image generation") {
-					t.Skip("insufficient memory for image generation")
-				} else if strings.Contains(err.Error(), "error while loading shared libraries: libcuda.so.1") { // AMD GPU or CPU
-					t.Skip("CUDA GPU is not available")
-				} else if strings.Contains(err.Error(), "ollama-mlx: no such file or directory") {
-					// most likely linux arm - not supported yet
-					t.Skip("unsupported architecture")
-				}
-				t.Fatalf("failed to generate image: %v", err)
-			}
-
-			imageData, err := base64.StdEncoding.DecodeString(imageBase64)
-			if err != nil {
-				t.Fatalf("failed to decode image: %v", err)
-			}
-			t.Logf("Generated image: %d bytes", len(imageData))
-
-			// Preload vision model and check GPU loading
-			err = client.Generate(ctx, &api.GenerateRequest{Model: tc.visionModel}, func(response api.GenerateResponse) error { return nil })
-			if err != nil {
-				t.Fatalf("failed to load vision model: %v", err)
-			}
-
-			// Use vision model to describe the image
-			chatReq := api.ChatRequest{
-				Model: tc.visionModel,
-				Messages: []api.Message{
-					{
-						Role:    "user",
-						Content: "Describe this image in detail. What is shown? What style is it? What is the main subject doing?",
-						Images:  []api.ImageData{imageData},
-					},
-				},
-				Stream: &stream,
-				Options: map[string]any{
-					"seed":        42,
-					"temperature": 0.0,
-				},
-			}
-
-			// Verify the vision model's response contains expected keywords
-			response := DoChat(ctx, t, client, chatReq, tc.expectedWords, 240*time.Second, 30*time.Second)
-			if response != nil {
-				t.Logf("Vision model response: %s", response.Content)
-
-				// Additional detailed check for keywords
-				content := strings.ToLower(response.Content)
-				foundWords := []string{}
-				missingWords := []string{}
-				for _, word := range tc.expectedWords {
-					if strings.Contains(content, word) {
-						foundWords = append(foundWords, word)
-					} else {
-						missingWords = append(missingWords, word)
-					}
-				}
-				t.Logf("Found keywords: %v", foundWords)
-				if len(missingWords) > 0 {
-					t.Logf("Missing keywords (at least one was found so test passed): %v", missingWords)
-				}
-			}
-		})
-	}
-}
-
-// generateImage calls the Ollama API to generate an image and returns the base64 image data
-func generateImage(ctx context.Context, client *api.Client, model, prompt string) (string, error) {
-	var imageBase64 string
-
-	err := client.Generate(ctx, &api.GenerateRequest{
-		Model:  model,
-		Prompt: prompt,
-	}, func(resp api.GenerateResponse) error {
-		if resp.Image != "" {
-			imageBase64 = resp.Image
-		}
-		return nil
-	})
-	if err != nil {
-		return "", fmt.Errorf("failed to generate image: %w", err)
-	}
-
-	if imageBase64 == "" {
-		return "", fmt.Errorf("no image data in response")
-	}
-
-	return imageBase64, nil
-}
--- a/integration/tools_test.go
+++ b/integration/tools_test.go
@@ -131,7 +131,7 @@ func TestAPIToolCalling(t *testing.T) {
 					t.Errorf("unexpected tool called: got %q want %q", lastToolCall.Function.Name, "get_weather")
 				}

-				if _, ok := lastToolCall.Function.Arguments.Get("location"); !ok {
+				if _, ok := lastToolCall.Function.Arguments["location"]; !ok {
 					t.Errorf("expected tool arguments to include 'location', got: %s", lastToolCall.Function.Arguments.String())
 				}
 			case <-ctx.Done():
--- a/integration/utils_test.go
+++ b/integration/utils_test.go
@@ -38,7 +38,6 @@ var (

 	// Note: add newer models at the top of the list to test them first
 	ollamaEngineChatModels = []string{
-		"lfm2.5-thinking",
 		"ministral-3",
 		"qwen3-coder:30b",
 		"gpt-oss:20b",
@@ -144,7 +143,6 @@ var (
 		"granite3.3",
 		"hermes3",
 		"internlm2",
-		"lfm2.5-thinking",
 		"llama-guard3",
 		"llama-pro",
 		"llama2-chinese",
@@ -265,7 +263,6 @@ var (
 		"snowflake-arctic-embed2",
 	}
 	libraryToolsModels = []string{
-		"lfm2.5-thinking",
 		"qwen3-vl",
 		"gpt-oss:20b",
 		"gpt-oss:120b",
--- a/llm/server.go
+++ b/llm/server.go
@@ -1464,12 +1464,6 @@ type CompletionRequest struct {

 	// TopLogprobs specifies the number of most likely alternative tokens to return (0-20)
 	TopLogprobs int
-
-	// Image generation fields
-	Width  int32 `json:"width,omitempty"`
-	Height int32 `json:"height,omitempty"`
-	Steps  int32 `json:"steps,omitempty"`
-	Seed   int64 `json:"seed,omitempty"`
 }

 // DoneReason represents the reason why a completion response is done
@@ -1518,15 +1512,6 @@ type CompletionResponse struct {

 	// Logprobs contains log probability information if requested
 	Logprobs []Logprob `json:"logprobs,omitempty"`
-
-	// Image contains base64-encoded image data for image generation
-	Image string `json:"image,omitempty"`
-
-	// Step is the current step in image generation
-	Step int `json:"step,omitempty"`
-
-	// TotalSteps is the total number of steps for image generation
-	TotalSteps int `json:"total_steps,omitempty"`
 }

 func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error {
--- a/middleware/anthropic.go
+++ b/middleware/anthropic.go
@@ -118,9 +118,6 @@ func AnthropicMessagesMiddleware() gin.HandlerFunc {
 			return
 		}

-		// Set think to nil when being used with Anthropic API to connect to tools like claude code
-		c.Set("relax_thinking", true)
-
 		var b bytes.Buffer
 		if err := json.NewEncoder(&b).Encode(chatReq); err != nil {
 			c.AbortWithStatusJSON(http.StatusInternalServerError, anthropic.NewError(http.StatusInternalServerError, err.Error()))
--- a/middleware/anthropic_test.go
+++ b/middleware/anthropic_test.go
@@ -582,26 +582,3 @@ func TestAnthropicWriter_ErrorFromRoutes(t *testing.T) {
 		})
 	}
 }
-
-func TestAnthropicMessagesMiddleware_SetsRelaxThinkingFlag(t *testing.T) {
-	gin.SetMode(gin.TestMode)
-
-	var flagSet bool
-	router := gin.New()
-	router.Use(AnthropicMessagesMiddleware())
-	router.POST("/v1/messages", func(c *gin.Context) {
-		_, flagSet = c.Get("relax_thinking")
-		c.Status(http.StatusOK)
-	})
-
-	body := `{"model": "test-model", "max_tokens": 100, "messages": [{"role": "user", "content": "Hi"}]}`
-	req, _ := http.NewRequest(http.MethodPost, "/v1/messages", strings.NewReader(body))
-	req.Header.Set("Content-Type", "application/json")
-
-	resp := httptest.NewRecorder()
-	router.ServeHTTP(resp, req)
-
-	if !flagSet {
-		t.Error("expected relax_thinking flag to be set in context")
-	}
-}
--- a/middleware/openai.go
+++ b/middleware/openai.go
@@ -8,7 +8,6 @@ import (
 	"math/rand"
 	"net/http"
 	"strings"
-	"time"

 	"github.com/gin-gonic/gin"

@@ -442,7 +441,6 @@ type ResponsesWriter struct {
 	stream     bool
 	responseID string
 	itemID     string
-	request    openai.ResponsesRequest
 }

 func (w *ResponsesWriter) writeEvent(eventType string, data any) error {
@@ -480,9 +478,7 @@ func (w *ResponsesWriter) writeResponse(data []byte) (int, error) {

 	// Non-streaming response
 	w.ResponseWriter.Header().Set("Content-Type", "application/json")
-	response := openai.ToResponse(w.model, w.responseID, w.itemID, chatResponse, w.request)
-	completedAt := time.Now().Unix()
-	response.CompletedAt = &completedAt
+	response := openai.ToResponse(w.model, w.responseID, w.itemID, chatResponse)
 	return len(data), json.NewEncoder(w.ResponseWriter).Encode(response)
 }

@@ -527,12 +523,11 @@ func ResponsesMiddleware() gin.HandlerFunc {

 		w := &ResponsesWriter{
 			BaseWriter: BaseWriter{ResponseWriter: c.Writer},
-			converter:  openai.NewResponsesStreamConverter(responseID, itemID, req.Model, req),
+			converter:  openai.NewResponsesStreamConverter(responseID, itemID, req.Model),
 			model:      req.Model,
 			stream:     streamRequested,
 			responseID: responseID,
 			itemID:     itemID,
-			request:    req,
 		}

 		// Set headers based on streaming mode
@@ -546,66 +541,3 @@ func ResponsesMiddleware() gin.HandlerFunc {
 		c.Next()
 	}
 }
-
-type ImageWriter struct {
-	BaseWriter
-}
-
-func (w *ImageWriter) writeResponse(data []byte) (int, error) {
-	var generateResponse api.GenerateResponse
-	if err := json.Unmarshal(data, &generateResponse); err != nil {
-		return 0, err
-	}
-
-	// Only write response when done with image
-	if generateResponse.Done && generateResponse.Image != "" {
-		w.ResponseWriter.Header().Set("Content-Type", "application/json")
-		return len(data), json.NewEncoder(w.ResponseWriter).Encode(openai.ToImageGenerationResponse(generateResponse))
-	}
-
-	return len(data), nil
-}
-
-func (w *ImageWriter) Write(data []byte) (int, error) {
-	code := w.ResponseWriter.Status()
-	if code != http.StatusOK {
-		return w.writeError(data)
-	}
-
-	return w.writeResponse(data)
-}
-
-func ImageGenerationsMiddleware() gin.HandlerFunc {
-	return func(c *gin.Context) {
-		var req openai.ImageGenerationRequest
-		if err := c.ShouldBindJSON(&req); err != nil {
-			c.AbortWithStatusJSON(http.StatusBadRequest, openai.NewError(http.StatusBadRequest, err.Error()))
-			return
-		}
-
-		if req.Prompt == "" {
-			c.AbortWithStatusJSON(http.StatusBadRequest, openai.NewError(http.StatusBadRequest, "prompt is required"))
-			return
-		}
-
-		if req.Model == "" {
-			c.AbortWithStatusJSON(http.StatusBadRequest, openai.NewError(http.StatusBadRequest, "model is required"))
-			return
-		}
-
-		var b bytes.Buffer
-		if err := json.NewEncoder(&b).Encode(openai.FromImageGenerationRequest(req)); err != nil {
-			c.AbortWithStatusJSON(http.StatusInternalServerError, openai.NewError(http.StatusInternalServerError, err.Error()))
-			return
-		}
-
-		c.Request.Body = io.NopCloser(&b)
-
-		w := &ImageWriter{
-			BaseWriter: BaseWriter{ResponseWriter: c.Writer},
-		}
-
-		c.Writer = w
-		c.Next()
-	}
-}
--- a/middleware/openai_test.go
+++ b/middleware/openai_test.go
@@ -961,154 +961,3 @@ func TestRetrieveMiddleware(t *testing.T) {
 		}
 	}
 }
-
-func TestImageGenerationsMiddleware(t *testing.T) {
-	type testCase struct {
-		name string
-		body string
-		req  api.GenerateRequest
-		err  openai.ErrorResponse
-	}
-
-	var capturedRequest *api.GenerateRequest
-
-	testCases := []testCase{
-		{
-			name: "image generation basic",
-			body: `{
-				"model": "test-model",
-				"prompt": "a beautiful sunset"
-			}`,
-			req: api.GenerateRequest{
-				Model:  "test-model",
-				Prompt: "a beautiful sunset",
-			},
-		},
-		{
-			name: "image generation with size",
-			body: `{
-				"model": "test-model",
-				"prompt": "a beautiful sunset",
-				"size": "512x768"
-			}`,
-			req: api.GenerateRequest{
-				Model:  "test-model",
-				Prompt: "a beautiful sunset",
-				Width:  512,
-				Height: 768,
-			},
-		},
-		{
-			name: "image generation missing prompt",
-			body: `{
-				"model": "test-model"
-			}`,
-			err: openai.ErrorResponse{
-				Error: openai.Error{
-					Message: "prompt is required",
-					Type:    "invalid_request_error",
-				},
-			},
-		},
-		{
-			name: "image generation missing model",
-			body: `{
-				"prompt": "a beautiful sunset"
-			}`,
-			err: openai.ErrorResponse{
-				Error: openai.Error{
-					Message: "model is required",
-					Type:    "invalid_request_error",
-				},
-			},
-		},
-	}
-
-	endpoint := func(c *gin.Context) {
-		c.Status(http.StatusOK)
-	}
-
-	gin.SetMode(gin.TestMode)
-	router := gin.New()
-	router.Use(ImageGenerationsMiddleware(), captureRequestMiddleware(&capturedRequest))
-	router.Handle(http.MethodPost, "/api/generate", endpoint)
-
-	for _, tc := range testCases {
-		t.Run(tc.name, func(t *testing.T) {
-			req, _ := http.NewRequest(http.MethodPost, "/api/generate", strings.NewReader(tc.body))
-			req.Header.Set("Content-Type", "application/json")
-
-			defer func() { capturedRequest = nil }()
-
-			resp := httptest.NewRecorder()
-			router.ServeHTTP(resp, req)
-
-			if tc.err.Error.Message != "" {
-				var errResp openai.ErrorResponse
-				if err := json.Unmarshal(resp.Body.Bytes(), &errResp); err != nil {
-					t.Fatal(err)
-				}
-				if diff := cmp.Diff(tc.err, errResp); diff != "" {
-					t.Fatalf("errors did not match:\n%s", diff)
-				}
-				return
-			}
-
-			if resp.Code != http.StatusOK {
-				t.Fatalf("expected status 200, got %d: %s", resp.Code, resp.Body.String())
-			}
-
-			if diff := cmp.Diff(&tc.req, capturedRequest); diff != "" {
-				t.Fatalf("requests did not match:\n%s", diff)
-			}
-		})
-	}
-}
-
-func TestImageWriterResponse(t *testing.T) {
-	gin.SetMode(gin.TestMode)
-
-	// Test that ImageWriter transforms GenerateResponse to OpenAI format
-	endpoint := func(c *gin.Context) {
-		resp := api.GenerateResponse{
-			Model:     "test-model",
-			CreatedAt: time.Unix(1234567890, 0).UTC(),
-			Done:      true,
-			Image:     "dGVzdC1pbWFnZS1kYXRh", // base64 of "test-image-data"
-		}
-		data, _ := json.Marshal(resp)
-		c.Writer.Write(append(data, '\n'))
-	}
-
-	router := gin.New()
-	router.Use(ImageGenerationsMiddleware())
-	router.Handle(http.MethodPost, "/api/generate", endpoint)
-
-	body := `{"model": "test-model", "prompt": "test"}`
-	req, _ := http.NewRequest(http.MethodPost, "/api/generate", strings.NewReader(body))
-	req.Header.Set("Content-Type", "application/json")
-
-	resp := httptest.NewRecorder()
-	router.ServeHTTP(resp, req)
-
-	if resp.Code != http.StatusOK {
-		t.Fatalf("expected status 200, got %d: %s", resp.Code, resp.Body.String())
-	}
-
-	var imageResp openai.ImageGenerationResponse
-	if err := json.Unmarshal(resp.Body.Bytes(), &imageResp); err != nil {
-		t.Fatalf("failed to unmarshal response: %v", err)
-	}
-
-	if imageResp.Created != 1234567890 {
-		t.Errorf("expected created 1234567890, got %d", imageResp.Created)
-	}
-
-	if len(imageResp.Data) != 1 {
-		t.Fatalf("expected 1 image, got %d", len(imageResp.Data))
-	}
-
-	if imageResp.Data[0].B64JSON != "dGVzdC1pbWFnZS1kYXRh" {
-		t.Errorf("expected image data 'dGVzdC1pbWFnZS1kYXRh', got %s", imageResp.Data[0].B64JSON)
-	}
-}
--- a/ml/backend.go
+++ b/ml/backend.go
@@ -162,7 +162,6 @@ type Tensor interface {
 	AvgPool2D(ctx Context, k, s int, p float32) Tensor
 	Conv2D(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
 	Conv3D(ctx Context, weight Tensor, c, s0, s1, s2, p0, p1, p2, d0, d1, d2 int) Tensor
-	SSMConv(ctx Context, kernel Tensor) Tensor

 	IM2Col(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor

--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@@ -1641,13 +1641,6 @@ func (t *Tensor) Conv3D(ctx ml.Context, t2 ml.Tensor, c, s0, s1, s2, p0, p1, p2,
 	return tt
 }

-func (t *Tensor) SSMConv(ctx ml.Context, kernel ml.Tensor) ml.Tensor {
-	return &Tensor{
-		b: t.b,
-		t: C.ggml_ssm_conv(ctx.(*Context).ctx, t.t, kernel.(*Tensor).t),
-	}
-}
-
 func (t *Tensor) AvgPool2D(ctx ml.Context, k, s int, p float32) ml.Tensor {
 	return &Tensor{
 		b: t.b,
--- a/model/models/glm4moelite/model.go
+++ b/model/models/glm4moelite/model.go
@@ -1,316 +0,0 @@
-package glm4moelite
-
-import (
-	"math"
-
-	"github.com/ollama/ollama/fs"
-	"github.com/ollama/ollama/kvcache"
-	"github.com/ollama/ollama/ml"
-	"github.com/ollama/ollama/ml/nn"
-	"github.com/ollama/ollama/model"
-	"github.com/ollama/ollama/model/input"
-)
-
-type Options struct {
-	numExpertsUsed      int
-	numExperts          int
-	normTopKProb        bool
-	routedScalingFactor float32
-
-	kvLoraRank,
-	qkNopeHeadDim,
-	qkRopeHeadDim,
-	kqNopeHeadDim,
-	qkHeadDim int
-	qLoraRank int
-	vHeadDim  int
-
-	hiddenSize,
-	numHeads,
-	numKVHeads int
-
-	eps,
-	ropeBase float32
-	kqScale float64
-}
-
-func (o Options) applyRotaryPositionEmbeddings(ctx ml.Context, t, p ml.Tensor) ml.Tensor {
-	return nn.RoPE(ctx, t, p, o.qkRopeHeadDim, o.ropeBase, 1.0)
-}
-
-type Attention struct {
-	Q *nn.Linear `gguf:"attn_q"`
-
-	QA     *nn.Linear  `gguf:"attn_q_a"`
-	QANorm *nn.RMSNorm `gguf:"attn_q_a_norm"`
-	QB     *nn.Linear  `gguf:"attn_q_b"`
-
-	KVA     *nn.Linear  `gguf:"attn_kv_a_mqa"`
-	KVANorm *nn.RMSNorm `gguf:"attn_kv_a_norm"`
-
-	KB *nn.Linear `gguf:"attn_k_b"`
-	VB *nn.Linear `gguf:"attn_v_b"`
-
-	Output *nn.Linear `gguf:"attn_out,alt:attn_output"`
-}
-
-func (attn *Attention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
-	seqLength := hiddenStates.Dim(1)
-
-	var query ml.Tensor
-	if opts.qLoraRank == 0 {
-		query = attn.Q.Forward(ctx, hiddenStates)
-	} else {
-		query = attn.QA.Forward(ctx, hiddenStates)
-		query = attn.QANorm.Forward(ctx, query, opts.eps)
-		query = attn.QB.Forward(ctx, query)
-	}
-
-	query = query.Reshape(ctx, query.Dim(0)/opts.numHeads, opts.numHeads, seqLength)
-	queryChunks := query.ChunkSections(ctx, 0, opts.qkNopeHeadDim, opts.qkRopeHeadDim)
-
-	compressedKV := attn.KVA.Forward(ctx, hiddenStates)
-	kPass := compressedKV.Slice(ctx, 0, 0, opts.kvLoraRank, 1)
-	kRot := compressedKV.View(ctx,
-		opts.kvLoraRank*compressedKV.Stride(0), opts.qkRopeHeadDim,
-		compressedKV.Stride(1), 1,
-		compressedKV.Stride(1), compressedKV.Dim(1),
-	)
-
-	qRot := opts.applyRotaryPositionEmbeddings(ctx, queryChunks[1], positions)
-	kRot = opts.applyRotaryPositionEmbeddings(ctx, kRot, positions)
-	kPass = attn.KVANorm.Forward(ctx, kPass, opts.eps)
-
-	// MLA absorption: absorb K projection into query
-	qPass := queryChunks[0].Permute(ctx, 0, 2, 1, 3)
-	qPassAbsorb := attn.KB.Forward(ctx, qPass)
-	qPassAbsorb = qPassAbsorb.Permute(ctx, 0, 2, 1, 3)
-
-	// Build absorbed query (rope first for in-place context shifting)
-	query = qRot.Concat(ctx, qPassAbsorb, 0)
-
-	// Compressed KV
-	kPass = kPass.Reshape(ctx, opts.kvLoraRank, 1, seqLength)
-	key := kRot.Concat(ctx, kPass, 0)
-	value := kPass
-
-	attention := nn.AttentionWithVMLA(ctx, query, key, value, nil, attn.VB.Weight, opts.kqScale, cache)
-
-	attention = attention.Reshape(ctx, attention.Dim(0)*attention.Dim(1), seqLength)
-	return attn.Output.Forward(ctx, attention)
-}
-
-type MLP interface {
-	Forward(ml.Context, ml.Tensor, *Options) ml.Tensor
-}
-
-type sparse struct {
-	Router       *nn.Linear `gguf:"ffn_gate_inp"`
-	Gate         *nn.Linear `gguf:"ffn_gate_exps"`
-	Up           *nn.Linear `gguf:"ffn_up_exps"`
-	Down         *nn.Linear `gguf:"ffn_down_exps"`
-	SharedExpert *dense     `gguf:",suf:_shexp"`
-	ExpProbsBias ml.Tensor  `gguf:"exp_probs_b.bias,alt:exp_probs_b"`
-}
-
-func (moe *sparse) Moe(ctx ml.Context, hiddenStates, topKIndices, topKWeights ml.Tensor, opts *Options) ml.Tensor {
-	hiddenStates = hiddenStates.Reshape(ctx, hiddenStates.Dim(0), 1, hiddenStates.Dim(1))
-
-	upStates := moe.Up.Weight.MulmatID(ctx, hiddenStates, topKIndices)
-	hiddenStates = moe.Gate.Weight.MulmatID(ctx, hiddenStates, topKIndices)
-	hiddenStates = hiddenStates.SILU(ctx, upStates)
-
-	experts := moe.Down.Weight.MulmatID(ctx, hiddenStates, topKIndices)
-	experts = experts.Mul(ctx, topKWeights)
-
-	nextStates := experts.View(ctx, 0, experts.Dim(0), experts.Stride(2), experts.Dim(2))
-	for i := 1; i < opts.numExpertsUsed; i++ {
-		nextStates = nextStates.Add(ctx, experts.View(ctx, i*experts.Stride(1), experts.Dim(0), experts.Stride(2), experts.Dim(2)))
-	}
-	return nextStates
-}
-
-func (moe *sparse) topKIndices(ctx ml.Context, scores ml.Tensor, opts *Options) ml.Tensor {
-	if moe.ExpProbsBias != nil {
-		scores = scores.Add(ctx, moe.ExpProbsBias)
-	}
-	topKIndices := scores.TopK(ctx, opts.numExpertsUsed)
-	return topKIndices
-}
-
-func (moe *sparse) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *Options) ml.Tensor {
-	residuals := hiddenStates
-
-	routerLogits := moe.Router.Forward(ctx, hiddenStates)
-	scores := routerLogits.Sigmoid(ctx)
-	topKIndices := moe.topKIndices(ctx, scores, opts)
-	topKWeights := scores.Reshape(ctx, 1, opts.numExperts, hiddenStates.Dim(1)).Rows(ctx, topKIndices)
-
-	if opts.normTopKProb {
-		topKWeights = topKWeights.Reshape(ctx, opts.numExpertsUsed, hiddenStates.Dim(1))
-		topKWeights = topKWeights.Div(ctx, topKWeights.SumRows(ctx))
-		topKWeights = topKWeights.Reshape(ctx, 1, opts.numExpertsUsed, hiddenStates.Dim(1))
-	}
-
-	topKWeights = topKWeights.Scale(ctx, float64(opts.routedScalingFactor))
-	hiddenStates = moe.Moe(ctx, hiddenStates, topKIndices, topKWeights, opts)
-	sharedExpertResult := moe.SharedExpert.Forward(ctx, residuals, opts)
-
-	hiddenStates = hiddenStates.Add(ctx, sharedExpertResult)
-	return hiddenStates
-}
-
-type dense struct {
-	Gate *nn.Linear `gguf:"ffn_gate"`
-	Up   *nn.Linear `gguf:"ffn_up"`
-	Down *nn.Linear `gguf:"ffn_down"`
-}
-
-func (mlp *dense) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *Options) ml.Tensor {
-	hiddenStates = mlp.Gate.Forward(ctx, hiddenStates).SILU(ctx, mlp.Up.Forward(ctx, hiddenStates))
-	return mlp.Down.Forward(ctx, hiddenStates)
-}
-
-type Layer struct {
-	AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
-	Attention     *Attention
-
-	MLPNorm *nn.RMSNorm `gguf:"ffn_norm"`
-	MLP     MLP
-}
-
-func (t *Layer) Forward(ctx ml.Context, hiddenStates, positions, outputs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
-	residual := hiddenStates
-	hiddenStates = t.AttentionNorm.Forward(ctx, hiddenStates, opts.eps)
-	hiddenStates = t.Attention.Forward(ctx, hiddenStates, positions, cache, opts)
-
-	if outputs != nil {
-		hiddenStates = hiddenStates.Rows(ctx, outputs)
-		residual = residual.Rows(ctx, outputs)
-	}
-
-	hiddenStates = hiddenStates.Add(ctx, residual)
-	residual = hiddenStates
-
-	hiddenStates = t.MLPNorm.Forward(ctx, hiddenStates, opts.eps)
-	hiddenStates = t.MLP.Forward(ctx, hiddenStates, opts)
-	hiddenStates = hiddenStates.Add(ctx, residual)
-	return hiddenStates
-}
-
-type Model struct {
-	model.Base
-	model.BytePairEncoding
-
-	TokenEmbedding *nn.Embedding `gguf:"token_embd"`
-	Layers         []Layer       `gguf:"blk"`
-
-	OutputNorm *nn.RMSNorm `gguf:"output_norm"`
-	Output     *nn.Linear  `gguf:"output,alt:token_embd"`
-
-	*Options
-}
-
-func New(c fs.Config) (model.Model, error) {
-	layers := make([]Layer, c.Uint("block_count"))
-
-	firstDenseLayerIndex := int(c.Uint("leading_dense_block_count"))
-	for i := range layers {
-		if i < firstDenseLayerIndex {
-			layers[i].MLP = &dense{}
-		} else {
-			layers[i].MLP = &sparse{}
-		}
-	}
-
-	keyLength := int(c.Uint("attention.key_length"))
-	valueLength := int(c.Uint("attention.value_length"))
-	kvLoraRank := int(c.Uint("attention.kv_lora_rank"))
-	qkRopeHeadDim := int(c.Uint("rope.dimension_count"))
-
-	// For MLA absorption, the effective key dimension is kvLoraRank + qkRopeHeadDim
-	mlaKeyLength := kvLoraRank + qkRopeHeadDim
-	kqScale := 1.0 / math.Sqrt(float64(mlaKeyLength))
-
-	var pre []string
-	switch c.String("tokenizer.ggml.pre") {
-	case "glm4":
-		pre = []string{
-			`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
-		}
-	default:
-		return nil, model.ErrUnsupportedTokenizer
-	}
-
-	m := Model{
-		BytePairEncoding: model.NewBytePairEncoding(
-			&model.Vocabulary{
-				Values: c.Strings("tokenizer.ggml.tokens"),
-				Types:  c.Ints("tokenizer.ggml.token_type"),
-				Merges: c.Strings("tokenizer.ggml.merges"),
-				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
-				BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
-				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
-				EOS: append(
-					[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
-					c.Ints("tokenizer.ggml.eos_token_ids")...,
-				),
-			},
-			pre...,
-		),
-		Layers: layers,
-		Options: &Options{
-			hiddenSize:     int(c.Uint("embedding_length")),
-			numHeads:       int(c.Uint("attention.head_count")),
-			numKVHeads:     int(c.Uint("attention.head_count_kv")),
-			eps:            c.Float("attention.layer_norm_rms_epsilon"),
-			ropeBase:       c.Float("rope.freq_base"),
-			numExperts:     int(c.Uint("expert_count")),
-			numExpertsUsed: int(c.Uint("expert_used_count")),
-			normTopKProb:   c.Bool("expert_weights_norm", true),
-
-			qLoraRank:     int(c.Uint("attention.q_lora_rank")),
-			kvLoraRank:    int(c.Uint("attention.kv_lora_rank")),
-			qkHeadDim:     keyLength,
-			vHeadDim:      valueLength,
-			qkRopeHeadDim: int(c.Uint("rope.dimension_count")),
-			qkNopeHeadDim: keyLength - int(c.Uint("rope.dimension_count")),
-			kqNopeHeadDim: keyLength - int(c.Uint("rope.dimension_count")),
-
-			routedScalingFactor: c.Float("expert_weights_scale"),
-
-			kqScale: kqScale,
-		},
-	}
-
-	m.Cache = kvcache.NewCausalCache(m.Shift)
-	return &m, nil
-}
-
-func (m Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
-	return m.applyRotaryPositionEmbeddings(ctx, key, shift), nil
-}
-
-func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
-	positions := ctx.Input().FromInts(batch.Positions, len(batch.Positions))
-
-	hiddenStates := m.TokenEmbedding.Forward(ctx, batch.Inputs)
-
-	for i, layer := range m.Layers {
-		m.Cache.SetLayer(i)
-
-		var outputs ml.Tensor
-		if i == len(m.Layers)-1 {
-			outputs = batch.Outputs
-		}
-
-		hiddenStates = layer.Forward(ctx, hiddenStates, positions, outputs, m.Cache, m.Options)
-	}
-
-	hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, m.eps)
-	return m.Output.Forward(ctx, hiddenStates), nil
-}
-
-func init() {
-	model.Register("glm4moelite", New)
-}
--- a/model/models/lfm2/cache.go
+++ b/model/models/lfm2/cache.go
@@ -1,410 +0,0 @@
-package lfm2
-
-import (
-	"slices"
-
-	"github.com/ollama/ollama/kvcache"
-	"github.com/ollama/ollama/ml"
-	"github.com/ollama/ollama/model/input"
-)
-
-var _ kvcache.Cache = (*HybridCache)(nil)
-
-// HybridCache stores:
-// - a standard causal KV cache for attention layers
-// - a per-sequence recurrent conv state for shortconv layers
-//
-// Conv state shape (per layer, per sequence): [dConv, hiddenSize] where dConv = L_cache - 1.
-// Stored internally as a tensor of shape [dConv * hiddenSize, maxSlots].
-type HybridCache struct {
-	kv *kvcache.Causal
-
-	backend      ml.Backend
-	dtype        ml.DType
-	maxSequences int
-
-	hiddenSize int
-	dConv      int
-
-	// slot mapping for recurrent state
-	slotForSeq map[int]int
-	refCount   []int
-	freeSlots  []int
-
-	// per-layer conv state buffers (allocated lazily)
-	convCtxs   map[int]ml.Context
-	convStates map[int]ml.Tensor // [dConv*hiddenSize, maxSlots]
-
-	// current forward batch (derived in StartForward)
-	curSeqs       []int
-	curSlots      []int
-	curSlotsInput ml.Tensor
-	curSeqTokens  int
-
-	// track if EnsureWritable has been called for this forward pass
-	writableEnsured bool
-	// track any error from EnsureWritable to propagate later
-	writableError error
-}
-
-func NewHybridCache(shift func(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error), hiddenSize, dConv int) *HybridCache {
-	return &HybridCache{
-		kv:         kvcache.NewCausalCache(shift),
-		hiddenSize: hiddenSize,
-		dConv:      dConv,
-		slotForSeq: make(map[int]int),
-		convCtxs:   make(map[int]ml.Context),
-		convStates: make(map[int]ml.Tensor),
-	}
-}
-
-func (c *HybridCache) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity, maxBatch int) {
-	c.backend = backend
-	c.dtype = dtype
-	c.maxSequences = maxSequences
-
-	// initialize slot allocator
-	c.refCount = make([]int, maxSequences)
-	c.freeSlots = c.freeSlots[:0]
-	for i := maxSequences - 1; i >= 0; i-- {
-		c.freeSlots = append(c.freeSlots, i)
-	}
-
-	c.kv.Init(backend, dtype, maxSequences, capacity, maxBatch)
-}
-
-func (c *HybridCache) Close() {
-	for _, ctx := range c.convCtxs {
-		ctx.Close()
-	}
-	c.kv.Close()
-}
-
-func (c *HybridCache) SetConfig(config ml.CacheConfig) {
-	c.kv.SetConfig(config)
-}
-
-func (c *HybridCache) SetLayer(layer int) {
-	c.kv.SetLayer(layer)
-}
-
-func (c *HybridCache) Get(ctx ml.Context) (ml.Tensor, ml.Tensor, ml.Tensor) {
-	return c.kv.Get(ctx)
-}
-
-func (c *HybridCache) Put(ctx ml.Context, key, value ml.Tensor) {
-	c.kv.Put(ctx, key, value)
-}
-
-func (c *HybridCache) StartForward(ctx ml.Context, batch input.Batch, reserve bool) error {
-	if err := c.kv.StartForward(ctx, batch, reserve); err != nil {
-		return err
-	}
-
-	// Derive equal-length sequence layout for shortconv.
-	// LFM2 shortconv assumes tokens form a [seq_tokens, seqs] grid.
-	seqCounts := make(map[int]int)
-	c.curSeqs = c.curSeqs[:0]
-	for _, s := range batch.Sequences {
-		if _, ok := seqCounts[s]; !ok {
-			c.curSeqs = append(c.curSeqs, s)
-		}
-		seqCounts[s]++
-	}
-
-	if len(c.curSeqs) == 0 {
-		return nil
-	}
-
-	nTokens := len(batch.Sequences)
-	nSeqs := len(c.curSeqs)
-	want := nTokens / nSeqs
-	for _, s := range c.curSeqs {
-		if seqCounts[s] != want {
-			return kvcache.ErrNotSupported
-		}
-	}
-
-	c.curSeqTokens = want
-
-	// When reserving memory for estimation, use fake slot assignments
-	// without modifying permanent state (slotForSeq, refCount)
-	if reserve {
-		c.curSlots = c.curSlots[:0]
-		slots := make([]int32, nSeqs)
-		for i := range nSeqs {
-			c.curSlots = append(c.curSlots, i)
-			slots[i] = int32(i)
-		}
-		c.curSlotsInput = ctx.Input().FromInts(slots, len(slots))
-		return nil
-	}
-
-	// Ensure slots exist for sequences in this batch
-	c.curSlots = c.curSlots[:0]
-	var newSlots []int // track newly allocated slots that need zeroing
-	for _, s := range c.curSeqs {
-		slot, ok := c.slotForSeq[s]
-		if !ok {
-			var err error
-			slot, err = c.allocSlot()
-			if err != nil {
-				return err
-			}
-			c.slotForSeq[s] = slot
-			c.refCount[slot] = 1
-			newSlots = append(newSlots, slot)
-		}
-		c.curSlots = append(c.curSlots, slot)
-	}
-
-	// Zero conv state for newly allocated slots to clear stale data from previous sequences
-	if len(newSlots) > 0 {
-		c.zeroConvSlots(ctx, newSlots)
-	}
-
-	// Create a tensor for the current slots
-	slots := make([]int32, len(c.curSlots))
-	for i, v := range c.curSlots {
-		slots[i] = int32(v)
-	}
-	c.curSlotsInput = ctx.Input().FromInts(slots, len(slots))
-
-	// Reset writable state for new forward pass
-	c.writableEnsured = false
-	c.writableError = nil
-
-	return nil
-}
-
-func (c *HybridCache) allocSlot() (int, error) {
-	if len(c.freeSlots) == 0 {
-		return 0, kvcache.ErrKvCacheFull
-	}
-	slot := c.freeSlots[len(c.freeSlots)-1]
-	c.freeSlots = c.freeSlots[:len(c.freeSlots)-1]
-	return slot, nil
-}
-
-func (c *HybridCache) freeSlot(slot int) {
-	// Bounds check before freeing
-	if slot >= 0 && slot < c.maxSequences {
-		c.freeSlots = append(c.freeSlots, slot)
-	}
-}
-
-// zeroConvSlots zeros the conv state for the given slots across all layers.
-// This must be called when recycling slots to prevent stale state from affecting new sequences.
-func (c *HybridCache) zeroConvSlots(ctx ml.Context, slots []int) {
-	if len(slots) == 0 || len(c.convStates) == 0 {
-		return
-	}
-
-	// Use input context for creating tensors
-	inputCtx := ctx.Input()
-
-	// Create slot indices tensor
-	slotIndices := make([]int32, len(slots))
-	for i, s := range slots {
-		slotIndices[i] = int32(s)
-	}
-	slotsTensor := inputCtx.FromInts(slotIndices, len(slotIndices))
-
-	// Create zero tensor for the slots (SetRows requires F32 source)
-	zeros := inputCtx.Zeros(ml.DTypeF32, c.dConv*c.hiddenSize, len(slots))
-
-	// Zero each layer's conv state for these slots
-	for _, buf := range c.convStates {
-		ctx.Forward(buf.SetRows(ctx, zeros, slotsTensor))
-	}
-}
-
-// EnsureWritable ensures that sequences in the current batch have private (non-shared) conv slots.
-// Returns an error if slot allocation fails.
-func (c *HybridCache) EnsureWritable(ctx ml.Context) error {
-	for i, seq := range c.curSeqs {
-		slot, ok := c.slotForSeq[seq]
-		if !ok {
-			continue
-		}
-
-		// Bounds check
-		if slot < 0 || slot >= len(c.refCount) {
-			continue
-		}
-
-		if c.refCount[slot] <= 1 {
-			continue
-		}
-
-		newSlot, err := c.allocSlot()
-		if err != nil {
-			return err
-		}
-		c.refCount[slot]--
-		c.refCount[newSlot] = 1
-		c.slotForSeq[seq] = newSlot
-		c.curSlots[i] = newSlot
-
-		// Copy existing conv state for all initialized layers
-		for _, buf := range c.convStates {
-			// buf: [dConv*hiddenSize, maxSlots]
-			src := buf.Rows(ctx, ctx.Input().FromInts([]int32{int32(slot)}, 1))
-			// SetRows requires F32 source
-			srcF32 := src.Cast(ctx, ml.DTypeF32)
-			ctx.Forward(buf.SetRows(ctx, srcF32, ctx.Input().FromInts([]int32{int32(newSlot)}, 1)))
-		}
-	}
-
-	// Rebuild current slots tensor
-	slots := make([]int32, len(c.curSlots))
-	for i, v := range c.curSlots {
-		slots[i] = int32(v)
-	}
-	c.curSlotsInput = ctx.Input().FromInts(slots, len(slots))
-
-	return nil
-}
-
-func (c *HybridCache) CopyPrefix(srcSeq, dstSeq int, prefixLen int32) {
-	// KV cache shares prefix metadata (no copy) which is correct for prefix reuse.
-	c.kv.CopyPrefix(srcSeq, dstSeq, prefixLen)
-
-	// For shortconv state we implement copy-on-write: dst shares the same slot as src.
-	// On the first write to dst, EnsureWritable will create a private slot.
-	if dstSlot, ok := c.slotForSeq[dstSeq]; ok {
-		// Bounds check before decrementing
-		if dstSlot >= 0 && dstSlot < len(c.refCount) {
-			c.refCount[dstSlot]--
-			if c.refCount[dstSlot] <= 0 {
-				c.refCount[dstSlot] = 0
-				c.freeSlot(dstSlot)
-			}
-		}
-		delete(c.slotForSeq, dstSeq)
-	}
-
-	srcSlot, ok := c.slotForSeq[srcSeq]
-	if !ok {
-		// src may not have a slot yet; dst will allocate on demand
-		return
-	}
-
-	// Bounds check before incrementing
-	if srcSlot >= 0 && srcSlot < len(c.refCount) {
-		c.slotForSeq[dstSeq] = srcSlot
-		c.refCount[srcSlot]++
-	}
-}
-
-func (c *HybridCache) CanResume(seq int, pos int32) bool {
-	return c.kv.CanResume(seq, pos)
-}
-
-func (c *HybridCache) Remove(seq int, beginIndex, endIndex int32) error {
-	if err := c.kv.Remove(seq, beginIndex, endIndex); err != nil {
-		return err
-	}
-
-	// For recurrent state, any removal invalidates the state because
-	// the state at position N depends on all previous positions.
-	// Drop the slot mapping so it resets on next use.
-	slot, ok := c.slotForSeq[seq]
-	if !ok {
-		return nil
-	}
-
-	// Bounds check
-	if slot < 0 || slot >= len(c.refCount) {
-		delete(c.slotForSeq, seq)
-		return nil
-	}
-
-	c.refCount[slot]--
-	if c.refCount[slot] <= 0 {
-		c.refCount[slot] = 0
-		c.freeSlot(slot)
-	}
-	delete(c.slotForSeq, seq)
-
-	return nil
-}
-
-func (c *HybridCache) slotsTensor() ml.Tensor {
-	return c.curSlotsInput
-}
-
-func (c *HybridCache) seqTokens() int {
-	return c.curSeqTokens
-}
-
-func (c *HybridCache) numSeqs() int {
-	return len(c.curSeqs)
-}
-
-func (c *HybridCache) convBuffer(ctx ml.Context, layer int) ml.Tensor {
-	if buf, ok := c.convStates[layer]; ok {
-		return buf
-	}
-
-	if _, ok := c.convCtxs[layer]; !ok {
-		c.convCtxs[layer] = c.backend.NewContextSize(1).Layer(layer)
-	}
-
-	buf := c.convCtxs[layer].Zeros(c.dtype, c.dConv*c.hiddenSize, c.maxSequences)
-	c.convStates[layer] = buf
-	return buf
-}
-
-// ConvState returns the conv state for current batch sequences as shape [dConv, hiddenSize, nSeqs].
-// Returns an error if copy-on-write allocation fails.
-func (c *HybridCache) ConvState(ctx ml.Context, layer int) (ml.Tensor, error) {
-	if !c.writableEnsured {
-		needsWritable := false
-		for _, seq := range c.curSeqs {
-			slot, ok := c.slotForSeq[seq]
-			if !ok {
-				continue
-			}
-			if slot >= 0 && slot < len(c.refCount) && c.refCount[slot] > 1 {
-				needsWritable = true
-				break
-			}
-		}
-
-		if needsWritable {
-			if err := c.EnsureWritable(ctx); err != nil {
-				c.writableError = err
-			}
-		}
-		c.writableEnsured = true
-	}
-
-	if c.writableError != nil {
-		return nil, c.writableError
-	}
-
-	buf := c.convBuffer(ctx, layer)
-	cur := buf.Rows(ctx, c.slotsTensor())
-	return cur.Reshape(ctx, c.dConv, c.hiddenSize, c.numSeqs()), nil
-}
-
-// UpdateConvState writes a new conv state for current batch sequences.
-// newState must have shape [dConv, hiddenSize, nSeqs].
-func (c *HybridCache) UpdateConvState(ctx ml.Context, layer int, newState ml.Tensor) {
-	buf := c.convBuffer(ctx, layer)
-	src := newState.Reshape(ctx, c.dConv*c.hiddenSize, c.numSeqs())
-	// SetRows requires F32 source
-	srcF32 := src.Cast(ctx, ml.DTypeF32)
-	ctx.Forward(buf.SetRows(ctx, srcF32, c.slotsTensor()))
-}
-
-// IsSupportedForBatch returns true if the current batch layout supports shortconv.
-func (c *HybridCache) IsSupportedForBatch() bool {
-	return c.curSeqTokens > 0 && len(c.curSeqs) > 0
-}
-
-// Seqs returns the ordered unique sequences for the current forward pass.
-func (c *HybridCache) Seqs() []int {
-	return slices.Clone(c.curSeqs)
-}
--- a/model/models/lfm2/cache_test.go
+++ b/model/models/lfm2/cache_test.go
@@ -1,444 +0,0 @@
-package lfm2
-
-import (
-	"testing"
-
-	"github.com/ollama/ollama/kvcache"
-	"github.com/ollama/ollama/ml"
-)
-
-// TestHybridCache tests verify the slot management logic of HybridCache.
-// These tests focus on the recurrent state slot allocation, reference counting,
-// and copy-on-write semantics without requiring a full ML backend.
-
-// createSlotOnlyCache creates a HybridCache with only the slot management
-// fields initialized. Used to test slot logic in isolation.
-func createSlotOnlyCache(maxSequences int) *HybridCache {
-	return &HybridCache{
-		hiddenSize:   256,
-		dConv:        3,
-		maxSequences: maxSequences,
-		refCount:     make([]int, maxSequences),
-		freeSlots:    initFreeSlots(maxSequences),
-		slotForSeq:   make(map[int]int),
-		convCtxs:     make(map[int]ml.Context),
-		convStates:   make(map[int]ml.Tensor),
-	}
-}
-
-func initFreeSlots(n int) []int {
-	slots := make([]int, 0, n)
-	for i := n - 1; i >= 0; i-- {
-		slots = append(slots, i)
-	}
-	return slots
-}
-
-func TestHybridCache_SlotAllocation(t *testing.T) {
-	cache := createSlotOnlyCache(4)
-
-	// Verify initial state
-	if len(cache.freeSlots) != 4 {
-		t.Errorf("expected 4 free slots, got %d", len(cache.freeSlots))
-	}
-
-	// Allocate all slots
-	for range 4 {
-		slot, err := cache.allocSlot()
-		if err != nil {
-			t.Fatalf("allocSlot failed: %v", err)
-		}
-		cache.refCount[slot] = 1
-	}
-
-	// Should be full now
-	if len(cache.freeSlots) != 0 {
-		t.Errorf("expected 0 free slots, got %d", len(cache.freeSlots))
-	}
-
-	// Trying to allocate another should fail
-	_, err := cache.allocSlot()
-	if err != kvcache.ErrKvCacheFull {
-		t.Errorf("expected ErrKvCacheFull, got %v", err)
-	}
-}
-
-func TestHybridCache_SlotReuse(t *testing.T) {
-	cache := createSlotOnlyCache(4)
-
-	// Allocate a slot
-	slot1, _ := cache.allocSlot()
-	cache.refCount[slot1] = 1
-
-	// Free it
-	cache.refCount[slot1] = 0
-	cache.freeSlot(slot1)
-
-	// Allocate again - should get the same slot back (LIFO)
-	slot2, _ := cache.allocSlot()
-	if slot2 != slot1 {
-		t.Errorf("expected slot %d to be reused, got %d", slot1, slot2)
-	}
-}
-
-func TestHybridCache_SlotRefCounting_ShareSlot(t *testing.T) {
-	cache := createSlotOnlyCache(4)
-
-	// Allocate slot for seq 1
-	slot1, _ := cache.allocSlot()
-	cache.slotForSeq[1] = slot1
-	cache.refCount[slot1] = 1
-
-	// Simulate sharing slot with seq 2 (copy-on-write style)
-	cache.slotForSeq[2] = slot1
-	cache.refCount[slot1]++
-
-	// Should share the same slot
-	if cache.slotForSeq[2] != slot1 {
-		t.Errorf("expected seq 2 to share slot %d, got %d", slot1, cache.slotForSeq[2])
-	}
-
-	// Ref count should be 2
-	if cache.refCount[slot1] != 2 {
-		t.Errorf("expected refCount 2, got %d", cache.refCount[slot1])
-	}
-}
-
-func TestHybridCache_SlotRefCounting_DecRef(t *testing.T) {
-	cache := createSlotOnlyCache(4)
-
-	// Allocate slot for seq 1
-	slot1, _ := cache.allocSlot()
-	cache.slotForSeq[1] = slot1
-	cache.refCount[slot1] = 1
-
-	// Share with seq 2
-	cache.slotForSeq[2] = slot1
-	cache.refCount[slot1]++
-
-	// Unshare seq 2
-	cache.refCount[slot1]--
-	delete(cache.slotForSeq, 2)
-
-	// Ref count should be back to 1
-	if cache.refCount[slot1] != 1 {
-		t.Errorf("expected refCount 1 after unshare, got %d", cache.refCount[slot1])
-	}
-
-	// Seq 2 should no longer have a slot
-	if _, ok := cache.slotForSeq[2]; ok {
-		t.Error("seq 2 should not have a slot after unshare")
-	}
-}
-
-func TestHybridCache_SlotFreeWhenUnused(t *testing.T) {
-	cache := createSlotOnlyCache(4)
-
-	initialFreeSlots := len(cache.freeSlots)
-
-	// Allocate slot for seq 1
-	slot1, _ := cache.allocSlot()
-	cache.slotForSeq[1] = slot1
-	cache.refCount[slot1] = 1
-
-	// Free the slot when refCount drops to 0
-	cache.refCount[slot1]--
-	if cache.refCount[slot1] <= 0 {
-		cache.refCount[slot1] = 0
-		cache.freeSlot(slot1)
-	}
-	delete(cache.slotForSeq, 1)
-
-	// Slot should be freed
-	if len(cache.freeSlots) != initialFreeSlots {
-		t.Errorf("expected %d free slots, got %d", initialFreeSlots, len(cache.freeSlots))
-	}
-
-	// Ref count should be 0
-	if cache.refCount[slot1] != 0 {
-		t.Errorf("expected refCount 0, got %d", cache.refCount[slot1])
-	}
-}
-
-func TestHybridCache_SlotOverwrite(t *testing.T) {
-	cache := createSlotOnlyCache(4)
-
-	// Allocate slots for seq 1 and seq 2
-	slot1, _ := cache.allocSlot()
-	cache.slotForSeq[1] = slot1
-	cache.refCount[slot1] = 1
-
-	slot2, _ := cache.allocSlot()
-	cache.slotForSeq[2] = slot2
-	cache.refCount[slot2] = 1
-
-	initialFreeSlots := len(cache.freeSlots)
-
-	// Simulate overwriting seq 2's slot with slot1 (sharing)
-	// First free the old slot
-	cache.refCount[slot2]--
-	if cache.refCount[slot2] <= 0 {
-		cache.refCount[slot2] = 0
-		cache.freeSlot(slot2)
-	}
-	// Then share slot1
-	cache.slotForSeq[2] = slot1
-	cache.refCount[slot1]++
-
-	// Seq 2 should now share slot1
-	if cache.slotForSeq[2] != slot1 {
-		t.Errorf("expected seq 2 to share slot %d, got %d", slot1, cache.slotForSeq[2])
-	}
-
-	// Old slot2 should be freed
-	if len(cache.freeSlots) != initialFreeSlots+1 {
-		t.Errorf("expected %d free slots, got %d", initialFreeSlots+1, len(cache.freeSlots))
-	}
-}
-
-func TestHybridCache_BoundsChecking(t *testing.T) {
-	cache := createSlotOnlyCache(4)
-
-	// Test freeing invalid slot (should not panic)
-	cache.freeSlot(-1)
-	cache.freeSlot(100) // out of bounds
-
-	// freeSlot does bounds checking, so invalid slots should be ignored
-	if len(cache.freeSlots) != 4 {
-		t.Errorf("invalid slots should not affect free list, got %d slots", len(cache.freeSlots))
-	}
-}
-
-func TestHybridCache_MultipleSequences_RefCounting(t *testing.T) {
-	cache := createSlotOnlyCache(8)
-
-	// Allocate slot for seq 1
-	slot1, _ := cache.allocSlot()
-	cache.slotForSeq[1] = slot1
-	cache.refCount[slot1] = 1
-
-	// Fork to seq 2, 3, 4 (all share slot1)
-	for _, seq := range []int{2, 3, 4} {
-		cache.slotForSeq[seq] = slot1
-		cache.refCount[slot1]++
-	}
-
-	// Ref count should be 4
-	if cache.refCount[slot1] != 4 {
-		t.Errorf("expected refCount 4, got %d", cache.refCount[slot1])
-	}
-
-	// Remove seq 2, 3
-	for _, seq := range []int{2, 3} {
-		delete(cache.slotForSeq, seq)
-		cache.refCount[slot1]--
-	}
-
-	if cache.refCount[slot1] != 2 {
-		t.Errorf("expected refCount 2, got %d", cache.refCount[slot1])
-	}
-
-	// Slot should still be allocated (not in free list)
-	found := false
-	for _, s := range cache.freeSlots {
-		if s == slot1 {
-			found = true
-			break
-		}
-	}
-	if found {
-		t.Error("slot1 should not be in free list yet")
-	}
-
-	// Remove remaining sequences
-	for _, seq := range []int{1, 4} {
-		delete(cache.slotForSeq, seq)
-		cache.refCount[slot1]--
-	}
-
-	if cache.refCount[slot1] != 0 {
-		t.Errorf("expected refCount 0, got %d", cache.refCount[slot1])
-	}
-}
-
-func TestHybridCache_ChainedSharing(t *testing.T) {
-	cache := createSlotOnlyCache(8)
-
-	// Create seq 1
-	slot1, _ := cache.allocSlot()
-	cache.slotForSeq[1] = slot1
-	cache.refCount[slot1] = 1
-
-	// Share 1 -> 2
-	cache.slotForSeq[2] = slot1
-	cache.refCount[slot1]++
-
-	// Share 2 -> 3 (should still share slot1)
-	cache.slotForSeq[3] = cache.slotForSeq[2] // which is slot1
-	cache.refCount[slot1]++
-
-	// All should share slot1
-	if cache.slotForSeq[1] != slot1 || cache.slotForSeq[2] != slot1 || cache.slotForSeq[3] != slot1 {
-		t.Error("all sequences should share slot1")
-	}
-
-	if cache.refCount[slot1] != 3 {
-		t.Errorf("expected refCount 3, got %d", cache.refCount[slot1])
-	}
-}
-
-func TestHybridCache_CacheParameters(t *testing.T) {
-	cache := NewHybridCache(nil, 512, 5) // hiddenSize=512, dConv=5
-
-	if cache.hiddenSize != 512 {
-		t.Errorf("expected hiddenSize 512, got %d", cache.hiddenSize)
-	}
-	if cache.dConv != 5 {
-		t.Errorf("expected dConv 5, got %d", cache.dConv)
-	}
-}
-
-func TestHybridCache_NumSeqs(t *testing.T) {
-	cache := createSlotOnlyCache(4)
-
-	// Initially no sequences
-	if cache.numSeqs() != 0 {
-		t.Errorf("expected 0 seqs, got %d", cache.numSeqs())
-	}
-
-	// Manually set up current batch state
-	cache.curSeqs = []int{1, 2, 3}
-
-	if cache.numSeqs() != 3 {
-		t.Errorf("expected 3 seqs, got %d", cache.numSeqs())
-	}
-}
-
-func TestHybridCache_SeqTokens(t *testing.T) {
-	cache := createSlotOnlyCache(4)
-
-	// Initially 0
-	if cache.seqTokens() != 0 {
-		t.Errorf("expected 0 seqTokens, got %d", cache.seqTokens())
-	}
-
-	// Manually set up current batch state
-	cache.curSeqTokens = 16
-
-	if cache.seqTokens() != 16 {
-		t.Errorf("expected 16 seqTokens, got %d", cache.seqTokens())
-	}
-}
-
-// Test that Seqs returns a clone of curSeqs
-func TestHybridCache_Seqs_ReturnsClone(t *testing.T) {
-	cache := createSlotOnlyCache(4)
-
-	cache.curSeqs = []int{1, 2, 3}
-
-	seqs := cache.Seqs()
-
-	// Modify returned slice
-	seqs[0] = 999
-
-	// Original should be unchanged
-	if cache.curSeqs[0] != 1 {
-		t.Error("Seqs should return a clone, not the original slice")
-	}
-}
-
-func TestHybridCache_IsSupportedForBatch(t *testing.T) {
-	cache := createSlotOnlyCache(4)
-
-	// Initially not supported (no batch set up)
-	if cache.IsSupportedForBatch() {
-		t.Error("expected IsSupportedForBatch to be false initially")
-	}
-
-	// Set up a valid batch
-	cache.curSeqTokens = 1
-	cache.curSeqs = []int{1}
-
-	if !cache.IsSupportedForBatch() {
-		t.Error("expected IsSupportedForBatch to be true with valid batch")
-	}
-}
-
-func TestHybridCache_ZeroConvSlots_EmptyInputs(t *testing.T) {
-	cache := createSlotOnlyCache(4)
-
-	// zeroConvSlots should handle empty slots without panicking
-	cache.zeroConvSlots(nil, nil)
-	cache.zeroConvSlots(nil, []int{})
-
-	// zeroConvSlots should handle empty convStates without panicking
-	cache.zeroConvSlots(nil, []int{0, 1, 2})
-}
-
-func TestHybridCache_SlotRecycling_TracksNewSlots(t *testing.T) {
-	cache := createSlotOnlyCache(4)
-
-	// Allocate slot for seq 1
-	slot1, _ := cache.allocSlot()
-	cache.slotForSeq[1] = slot1
-	cache.refCount[slot1] = 1
-
-	// Free the slot (simulating sequence removal)
-	cache.refCount[slot1]--
-	cache.freeSlot(slot1)
-	delete(cache.slotForSeq, 1)
-
-	// Verify slot is in free list
-	if len(cache.freeSlots) != 4 {
-		t.Errorf("expected 4 free slots after freeing, got %d", len(cache.freeSlots))
-	}
-
-	// Allocate for new seq 2 - should get recycled slot
-	slot2, _ := cache.allocSlot()
-	if slot2 != slot1 {
-		t.Errorf("expected recycled slot %d, got %d", slot1, slot2)
-	}
-
-	// This recycled slot would need zeroing in the real implementation
-	// The actual zeroing is tested via integration tests since it requires ML context
-}
-
-func TestHybridCache_NewSequence_GetsTrackedForZeroing(t *testing.T) {
-	cache := createSlotOnlyCache(4)
-
-	// Simulate the slot allocation flow from StartForward
-	// When a sequence doesn't have a slot, it gets allocated and tracked as "new"
-
-	newSlots := []int{}
-
-	// Seq 1 doesn't have a slot - allocate and track
-	seq := 1
-	if _, ok := cache.slotForSeq[seq]; !ok {
-		slot, err := cache.allocSlot()
-		if err != nil {
-			t.Fatalf("allocSlot failed: %v", err)
-		}
-		cache.slotForSeq[seq] = slot
-		cache.refCount[slot] = 1
-		newSlots = append(newSlots, slot)
-	}
-
-	// Verify newSlots contains the allocated slot
-	if len(newSlots) != 1 {
-		t.Errorf("expected 1 new slot, got %d", len(newSlots))
-	}
-
-	// Seq 1 already has a slot - should NOT be tracked as new
-	newSlots2 := []int{}
-	if _, ok := cache.slotForSeq[seq]; !ok {
-		slot, _ := cache.allocSlot()
-		cache.slotForSeq[seq] = slot
-		cache.refCount[slot] = 1
-		newSlots2 = append(newSlots2, slot)
-	}
-
-	// Verify no new slots for existing sequence
-	if len(newSlots2) != 0 {
-		t.Errorf("expected 0 new slots for existing sequence, got %d", len(newSlots2))
-	}
-}
--- a/model/models/lfm2/model.go
+++ b/model/models/lfm2/model.go
@@ -1,253 +0,0 @@
-package lfm2
-
-import (
-	"cmp"
-	"math"
-
-	"github.com/ollama/ollama/fs"
-	"github.com/ollama/ollama/ml"
-	"github.com/ollama/ollama/ml/nn"
-	"github.com/ollama/ollama/ml/nn/rope"
-	"github.com/ollama/ollama/model"
-	"github.com/ollama/ollama/model/input"
-)
-
-type Options struct {
-	hiddenSize       int
-	headDim, ropeDim int
-
-	eps, ropeBase, ropeScale float32
-
-	ropeType              string
-	originalContextLength int
-
-	// per-layer head counts (LFM2 alternates attention and recurrent layers)
-	numHeadsByLayer   []int
-	numKVHeadsByLayer []int
-}
-
-func (o Options) headDimValue() int {
-	// Head dim is shared across layers; fall back to first attention layer head count.
-	for _, h := range o.numHeadsByLayer {
-		if h > 0 {
-			return cmp.Or(o.headDim, o.hiddenSize/h)
-		}
-	}
-	return cmp.Or(o.headDim, o.hiddenSize)
-}
-
-func (o Options) applyRotaryPositionEmbeddings(ctx ml.Context, states, positions ml.Tensor) ml.Tensor {
-	opts := []func(*rope.Options){rope.WithTypeNeoX()}
-	if o.ropeType == "yarn" {
-		attnFactor := float32(1.0 / (1.0 + 0.1*math.Log(float64(o.ropeScale))))
-		opts = append(opts,
-			rope.WithOriginalContextLength(o.originalContextLength),
-			rope.WithExtrapolationFactor(1.),
-			rope.WithAttentionFactor(attnFactor),
-		)
-	}
-
-	headCount := 1
-	for _, h := range o.numHeadsByLayer {
-		if h > 0 {
-			headCount = h
-			break
-		}
-	}
-	return nn.RoPE(ctx, states, positions, cmp.Or(o.ropeDim, o.headDim, o.hiddenSize/headCount), o.ropeBase, 1./o.ropeScale, opts...)
-}
-
-type Model struct {
-	model.Base
-	model.TextProcessor
-
-	TokenEmbedding *nn.Embedding `gguf:"token_embd"`
-	Layers         []Layer       `gguf:"blk"`
-	OutputNorm     *nn.RMSNorm   `gguf:"output_norm,alt:token_embd_norm"`
-	Output         *nn.Linear    `gguf:"output,alt:token_embd"`
-
-	Options
-}
-
-func New(c fs.Config) (model.Model, error) {
-	if c.Uint("expert_count") > 0 {
-		return nil, model.ErrUnsupportedModel
-	}
-
-	if c.String("tokenizer.ggml.model") != "gpt2" {
-		return nil, model.ErrUnsupportedTokenizer
-	}
-
-	vocabulary := model.Vocabulary{
-		Values: c.Strings("tokenizer.ggml.tokens"),
-		Scores: c.Floats("tokenizer.ggml.scores"),
-		Types:  c.Ints("tokenizer.ggml.token_type"),
-		Merges: c.Strings("tokenizer.ggml.merges"),
-		AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
-		BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
-		AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
-		EOS: append(
-			[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
-			c.Ints("tokenizer.ggml.eos_token_ids")...,
-		),
-	}
-
-	var pretokenizers []string
-	switch c.String("tokenizer.ggml.pre") {
-	case "default":
-		// use default BPE pretokenizer
-	default:
-		// llama-bpe style (default for LFM2)
-		pretokenizers = []string{
-			`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
-		}
-	}
-
-	m := Model{
-		TextProcessor: model.NewBytePairEncoding(&vocabulary, pretokenizers...),
-		Layers:        make([]Layer, c.Uint("block_count")),
-		Options: Options{
-			hiddenSize:            int(c.Uint("embedding_length")),
-			headDim:               int(c.Uint("attention.key_length")),
-			ropeDim:               int(c.Uint("rope.dimension_count")),
-			eps:                   c.Float("attention.layer_norm_rms_epsilon"),
-			ropeType:              c.String("rope.scaling.type"),
-			ropeBase:              c.Float("rope.freq_base"),
-			ropeScale:             c.Float("rope.scaling.factor", 1),
-			originalContextLength: int(c.Uint("rope.scaling.original_context_length")),
-		},
-	}
-
-	type headCounts interface {
-		HeadCount() []uint64
-		HeadCountKV() []uint64
-	}
-	hc, ok := c.(headCounts)
-	if !ok {
-		return nil, model.ErrUnsupportedModel
-	}
-
-	headCount := hc.HeadCount()
-	headCountKV := hc.HeadCountKV()
-
-	m.numHeadsByLayer = make([]int, len(m.Layers))
-	m.numKVHeadsByLayer = make([]int, len(m.Layers))
-	for i := range m.Layers {
-		m.numHeadsByLayer[i] = int(headCount[i])
-		m.numKVHeadsByLayer[i] = int(headCountKV[i])
-
-		if m.numKVHeadsByLayer[i] == 0 {
-			m.Layers[i].Operator = &ShortConv{}
-		} else {
-			m.Layers[i].Operator = &Attention{}
-		}
-	}
-
-	lCache := int(c.Uint("shortconv.l_cache"))
-	dConv := max(0, lCache-1)
-	m.Cache = NewHybridCache(m.Shift, m.hiddenSize, dConv)
-	return &m, nil
-}
-
-type Operator interface {
-	Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, cache *HybridCache, layer int, opts *Options) ml.Tensor
-}
-
-type Attention struct {
-	Query     *nn.Linear  `gguf:"attn_q"`
-	QueryNorm *nn.RMSNorm `gguf:"attn_q_norm"`
-	Key       *nn.Linear  `gguf:"attn_k"`
-	KeyNorm   *nn.RMSNorm `gguf:"attn_k_norm"`
-	Value     *nn.Linear  `gguf:"attn_v"`
-	Output    *nn.Linear  `gguf:"attn_output,alt:attn_out"`
-}
-
-func (sa *Attention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, cache *HybridCache, layer int, opts *Options) ml.Tensor {
-	batchSize := hiddenStates.Dim(1)
-	headDim := opts.headDimValue()
-	numHeads := opts.numHeadsByLayer[layer]
-	numKVHeads := opts.numKVHeadsByLayer[layer]
-
-	query := sa.Query.Forward(ctx, hiddenStates)
-	key := sa.Key.Forward(ctx, hiddenStates)
-	value := sa.Value.Forward(ctx, hiddenStates)
-
-	query = query.Reshape(ctx, headDim, numHeads, batchSize)
-	key = key.Reshape(ctx, headDim, numKVHeads, batchSize)
-	value = value.Reshape(ctx, headDim, numKVHeads, batchSize)
-
-	query = sa.QueryNorm.Forward(ctx, query, opts.eps)
-	key = sa.KeyNorm.Forward(ctx, key, opts.eps)
-
-	query = opts.applyRotaryPositionEmbeddings(ctx, query, positions)
-	key = opts.applyRotaryPositionEmbeddings(ctx, key, positions)
-
-	attention := nn.Attention(ctx, query, key, value, 1./math.Sqrt(float64(headDim)), cache)
-	attention = attention.Reshape(ctx, attention.Dim(0)*attention.Dim(1), batchSize)
-	return sa.Output.Forward(ctx, attention)
-}
-
-type MLP struct {
-	Up   *nn.Linear `gguf:"ffn_up"`
-	Down *nn.Linear `gguf:"ffn_down"`
-	Gate *nn.Linear `gguf:"ffn_gate"`
-}
-
-func (mlp *MLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *Options) ml.Tensor {
-	hiddenState = mlp.Gate.Forward(ctx, hiddenState).SILU(ctx, mlp.Up.Forward(ctx, hiddenState))
-	return mlp.Down.Forward(ctx, hiddenState)
-}
-
-type Layer struct {
-	AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
-	Operator      Operator
-	MLPNorm       *nn.RMSNorm `gguf:"ffn_norm"`
-	MLP           *MLP
-}
-
-func (l *Layer) Forward(ctx ml.Context, layer int, hiddenState, positions, outputs ml.Tensor, cache *HybridCache, opts *Options) ml.Tensor {
-	residual := hiddenState
-
-	hiddenState = l.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
-	hiddenState = l.Operator.Forward(ctx, hiddenState, positions, cache, layer, opts)
-
-	if outputs != nil {
-		hiddenState = hiddenState.Rows(ctx, outputs)
-		residual = residual.Rows(ctx, outputs)
-	}
-
-	hiddenState = hiddenState.Add(ctx, residual)
-	residual = hiddenState
-
-	hiddenState = l.MLPNorm.Forward(ctx, hiddenState, opts.eps)
-	hiddenState = l.MLP.Forward(ctx, hiddenState, opts)
-	return hiddenState.Add(ctx, residual)
-}
-
-func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
-	return m.applyRotaryPositionEmbeddings(ctx, key, shift), nil
-}
-
-func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
-	positions := ctx.Input().FromInts(batch.Positions, len(batch.Positions))
-
-	hiddenState := m.TokenEmbedding.Forward(ctx, batch.Inputs)
-
-	for i, layer := range m.Layers {
-		m.Cache.SetLayer(i)
-
-		var outputs ml.Tensor
-		if i == len(m.Layers)-1 {
-			outputs = batch.Outputs
-		}
-
-		hiddenState = layer.Forward(ctx, i, hiddenState, positions, outputs, m.Cache.(*HybridCache), &m.Options)
-	}
-
-	hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
-	return m.Output.Forward(ctx, hiddenState), nil
-}
-
-func init() {
-	model.Register("lfm2", New)
-}
--- a/model/models/lfm2/shortconv.go
+++ b/model/models/lfm2/shortconv.go
@@ -1,50 +0,0 @@
-package lfm2
-
-import (
-	"github.com/ollama/ollama/ml"
-	"github.com/ollama/ollama/ml/nn"
-)
-
-type shortConvKernel struct {
-	Weight ml.Tensor `gguf:"weight"`
-}
-
-// ShortConv implements the LFM2 short-convolution block (GGML_OP_SSM_CONV) with a recurrent
-// state stored in the HybridCache.
-type ShortConv struct {
-	Conv    *shortConvKernel `gguf:"shortconv.conv"`
-	InProj  *nn.Linear       `gguf:"shortconv.in_proj"`
-	OutProj *nn.Linear       `gguf:"shortconv.out_proj"`
-}
-
-func (sc *ShortConv) Forward(ctx ml.Context, hiddenStates ml.Tensor, _ ml.Tensor, cache *HybridCache, layer int, opts *Options) ml.Tensor {
-	nSeqs := cache.numSeqs()
-	seqTokens := cache.seqTokens()
-	hiddenSize := hiddenStates.Dim(0)
-	if nSeqs <= 0 || seqTokens <= 0 || hiddenStates.Dim(1) != nSeqs*seqTokens {
-		panic("lfm2: unsupported batch layout for shortconv")
-	}
-
-	bcx := sc.InProj.Forward(ctx, hiddenStates).Reshape(ctx, 3*hiddenSize, seqTokens, nSeqs)
-
-	elementSize := bcx.Stride(0)
-	b := bcx.View(ctx, 0*hiddenSize*elementSize, hiddenSize, bcx.Stride(1), seqTokens, bcx.Stride(2), nSeqs)
-	c := bcx.View(ctx, 1*hiddenSize*elementSize, hiddenSize, bcx.Stride(1), seqTokens, bcx.Stride(2), nSeqs)
-	x := bcx.View(ctx, 2*hiddenSize*elementSize, hiddenSize, bcx.Stride(1), seqTokens, bcx.Stride(2), nSeqs)
-
-	bx := b.Mul(ctx, x).Permute(ctx, 1, 0, 2, 3)
-
-	state, err := cache.ConvState(ctx, layer)
-	if err != nil {
-		panic("lfm2: failed to get conv state: " + err.Error())
-	}
-	sx := state.Concat(ctx, bx, 0)
-
-	convOut := sx.SSMConv(ctx, sc.Conv.Weight)
-	y := c.Mul(ctx, convOut)
-
-	dConv := sx.Dim(0) - seqTokens
-	cache.UpdateConvState(ctx, layer, sx.Slice(ctx, 0, sx.Dim(0)-dConv, sx.Dim(0), 1))
-
-	return sc.OutProj.Forward(ctx, y.Reshape(ctx, hiddenSize, seqTokens*nSeqs))
-}
--- a/model/models/models.go
+++ b/model/models/models.go
@@ -7,9 +7,7 @@ import (
 	_ "github.com/ollama/ollama/model/models/gemma2"
 	_ "github.com/ollama/ollama/model/models/gemma3"
 	_ "github.com/ollama/ollama/model/models/gemma3n"
-	_ "github.com/ollama/ollama/model/models/glm4moelite"
 	_ "github.com/ollama/ollama/model/models/gptoss"
-	_ "github.com/ollama/ollama/model/models/lfm2"
 	_ "github.com/ollama/ollama/model/models/llama"
 	_ "github.com/ollama/ollama/model/models/llama4"
 	_ "github.com/ollama/ollama/model/models/mistral3"
--- a/model/parsers/glm46.go
+++ b/model/parsers/glm46.go
@@ -1,410 +0,0 @@
-package parsers
-
-import (
-	"context"
-	"encoding/xml"
-	"fmt"
-	"log/slog"
-	"strings"
-	"unicode"
-
-	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/logutil"
-)
-
-type glm46ParserState int
-
-const (
-	glm46ParserState_LookingForThinkingOpen glm46ParserState = iota
-	glm46ParserState_ThinkingStartedEatingWhitespace
-	glm46ParserState_CollectingThinking
-	glm46ParserState_ThinkingDoneEatingWhitespace
-	glm46ParserState_CollectingContent
-	glm46ParserState_ToolStartedEatingWhitespace
-	glm46ParserState_CollectingToolContent
-)
-
-const (
-	glm46ThinkingOpenTag  = "<think>"
-	glm46ThinkingCloseTag = "</think>"
-	glm46ToolOpenTag      = "<tool_call>"
-	glm46ToolCloseTag     = "</tool_call>"
-)
-
-type GLM46Parser struct {
-	state  glm46ParserState
-	buffer strings.Builder
-	tools  []api.Tool
-}
-
-func (p *GLM46Parser) HasToolSupport() bool {
-	return true
-}
-
-func (p *GLM46Parser) HasThinkingSupport() bool {
-	return true
-}
-
-// func (p *GLM46Parser) Init(tools []api.Tool, lastMessage *api.Message) []api.Tool {
-func (p *GLM46Parser) Init(tools []api.Tool, lastMessage *api.Message, thinkValue *api.ThinkValue) []api.Tool {
-	p.tools = tools
-	return tools
-}
-
-type glm46Event interface {
-	isGLM46Event()
-}
-
-type glm46EventContent struct {
-	content string
-}
-
-func (glm46EventContent) isGLM46Event() {}
-
-type glm46EventRawToolCall struct {
-	raw string
-}
-
-func (glm46EventRawToolCall) isGLM46Event() {}
-
-type glm46EventThinkingContent struct {
-	content string
-}
-
-func (glm46EventThinkingContent) isGLM46Event() {}
-
-func (p *GLM46Parser) Add(s string, done bool) (content string, thinking string, calls []api.ToolCall, err error) {
-	p.buffer.WriteString(s)
-	events := p.parseEvents()
-
-	var toolCalls []api.ToolCall
-	var contentSb strings.Builder
-	var thinkingSb strings.Builder
-
-	for _, event := range events {
-		switch event := event.(type) {
-		case glm46EventRawToolCall:
-			toolCall, err := parseGLM46ToolCall(event, p.tools)
-			if err != nil {
-				slog.Warn("glm-4.6 tool call parsing failed", "error", err)
-				return "", "", nil, err
-			}
-			toolCalls = append(toolCalls, toolCall)
-		case glm46EventThinkingContent:
-			thinkingSb.WriteString(event.content)
-		case glm46EventContent:
-			// TODO(drifkin): if the same turn contains multiple interleaved content
-			// events, we naively append them together here.
-			contentSb.WriteString(event.content)
-		}
-	}
-
-	return contentSb.String(), thinkingSb.String(), toolCalls, nil
-}
-
-func (p *GLM46Parser) parseEvents() []glm46Event {
-	var all []glm46Event
-
-	keepLooping := true
-	for keepLooping {
-		var events []glm46Event
-		events, keepLooping = p.eat()
-		if len(events) > 0 {
-			all = append(all, events...)
-		}
-	}
-
-	if len(all) > 0 {
-		slog.Log(context.TODO(), logutil.LevelTrace, "glm-4.6 events parsed", "events", all, "state", p.state, "buffer", p.buffer.String())
-	}
-
-	return all
-}
-
-// eatLeadingWhitespaceAndTransitionTo consumes leading whitespace from the buffer
-// and transitions to the next state. Returns (nil, false) if only whitespace remains
-// in the buffer (needs more input), or (nil, true) if we successfully transitioned.
-func (p *GLM46Parser) eatLeadingWhitespaceAndTransitionTo(nextState glm46ParserState) ([]glm46Event, bool) {
-	trimmed := strings.TrimLeftFunc(p.buffer.String(), unicode.IsSpace)
-	p.buffer.Reset()
-	if trimmed == "" {
-		return nil, false // Still only whitespace, keep waiting for more input
-	}
-	p.state = nextState
-	p.buffer.WriteString(trimmed)
-	return nil, true // Successfully transitioned
-}
-
-// glm46SplitAtTag splits the buffer at the given tag, returns the content before (trimmed of trailing whitespace),
-// the content after (optionally trimmed of leading whitespace), and updates the buffer
-func glm46SplitAtTag(p *GLM46Parser, tag string, trimAfter bool) (string, string) {
-	split := strings.SplitN(p.buffer.String(), tag, 2)
-	before := split[0]
-	before = strings.TrimRightFunc(before, unicode.IsSpace)
-	after := split[1]
-	if trimAfter {
-		after = strings.TrimLeftFunc(after, unicode.IsSpace)
-	}
-	p.buffer.Reset()
-	p.buffer.WriteString(after)
-	return before, after
-}
-
-func (p *GLM46Parser) eat() ([]glm46Event, bool) {
-	var events []glm46Event
-
-	switch p.state {
-	case glm46ParserState_LookingForThinkingOpen:
-		trimmed := strings.TrimLeftFunc(p.buffer.String(), unicode.IsSpace)
-		if strings.HasPrefix(trimmed, glm46ThinkingOpenTag) {
-			// Found <think> opening tag
-			after := strings.TrimPrefix(trimmed, glm46ThinkingOpenTag)
-			after = strings.TrimLeftFunc(after, unicode.IsSpace)
-			p.buffer.Reset()
-			p.buffer.WriteString(after)
-			if after == "" {
-				p.state = glm46ParserState_ThinkingStartedEatingWhitespace
-			} else {
-				p.state = glm46ParserState_CollectingThinking
-			}
-			return events, true
-		} else if strings.HasPrefix(glm46ThinkingOpenTag, trimmed) {
-			// Partial opening tag seen, keep accumulating
-			return events, false
-		} else if trimmed == "" {
-			// Only whitespace, keep accumulating
-			return events, false
-		} else {
-			// No thinking tag found, skip to content collection
-			p.state = glm46ParserState_CollectingContent
-			// Don't trim - we want to keep the original content
-			return events, true
-		}
-
-	case glm46ParserState_ThinkingStartedEatingWhitespace:
-		return p.eatLeadingWhitespaceAndTransitionTo(glm46ParserState_CollectingThinking)
-
-	case glm46ParserState_CollectingThinking:
-		acc := p.buffer.String()
-		if strings.Contains(acc, glm46ThinkingCloseTag) {
-			thinking, remaining := glm46SplitAtTag(p, glm46ThinkingCloseTag, true)
-			if len(thinking) > 0 {
-				events = append(events, glm46EventThinkingContent{content: thinking})
-			}
-			if remaining == "" {
-				p.state = glm46ParserState_ThinkingDoneEatingWhitespace
-			} else {
-				p.state = glm46ParserState_CollectingContent
-			}
-			return events, true
-		} else if overlapLen := overlap(acc, glm46ThinkingCloseTag); overlapLen > 0 {
-			// Partial closing tag - withhold it along with any trailing whitespace before it
-			beforePartialTag := acc[:len(acc)-overlapLen]
-			trailingWhitespaceLen := trailingWhitespaceLen(beforePartialTag)
-			ambiguousStart := len(beforePartialTag) - trailingWhitespaceLen
-
-			unambiguous := acc[:ambiguousStart]
-			ambiguous := acc[ambiguousStart:]
-			p.buffer.Reset()
-			p.buffer.WriteString(ambiguous)
-			if len(unambiguous) > 0 {
-				events = append(events, glm46EventThinkingContent{content: unambiguous})
-			}
-			return events, false
-		} else {
-			// Pure thinking content - withhold trailing whitespace (might precede closing tag)
-			whitespaceLen := trailingWhitespaceLen(acc)
-			ambiguousStart := len(acc) - whitespaceLen
-
-			unambiguous := acc[:ambiguousStart]
-			ambiguous := acc[ambiguousStart:]
-			p.buffer.Reset()
-			p.buffer.WriteString(ambiguous)
-			if len(unambiguous) > 0 {
-				events = append(events, glm46EventThinkingContent{content: unambiguous})
-			}
-			return events, false
-		}
-
-	case glm46ParserState_ThinkingDoneEatingWhitespace:
-		return p.eatLeadingWhitespaceAndTransitionTo(glm46ParserState_CollectingContent)
-
-	case glm46ParserState_CollectingContent:
-		if strings.Contains(p.buffer.String(), glm46ToolOpenTag) {
-			before, after := glm46SplitAtTag(p, glm46ToolOpenTag, true)
-			if len(before) > 0 {
-				events = append(events, glm46EventContent{content: before})
-			}
-			if after == "" {
-				p.state = glm46ParserState_ToolStartedEatingWhitespace
-			} else {
-				p.state = glm46ParserState_CollectingToolContent
-			}
-			return events, true
-		} else if overlapLen := overlap(p.buffer.String(), glm46ToolOpenTag); overlapLen > 0 {
-			beforePartialTag := p.buffer.String()[:len(p.buffer.String())-overlapLen]
-			trailingWhitespaceLen := trailingWhitespaceLen(beforePartialTag)
-			ambiguousStart := len(beforePartialTag) - trailingWhitespaceLen
-
-			unambiguous := p.buffer.String()[:ambiguousStart]
-			ambiguous := p.buffer.String()[ambiguousStart:]
-			p.buffer.Reset()
-			p.buffer.WriteString(ambiguous)
-			if len(unambiguous) > 0 {
-				events = append(events, glm46EventContent{content: unambiguous})
-			}
-			return events, false
-		} else {
-			whitespaceLen := trailingWhitespaceLen(p.buffer.String())
-			ambiguousStart := len(p.buffer.String()) - whitespaceLen
-
-			unambiguous := p.buffer.String()[:ambiguousStart]
-			ambiguous := p.buffer.String()[ambiguousStart:]
-			p.buffer.Reset()
-			p.buffer.WriteString(ambiguous)
-			if len(unambiguous) > 0 {
-				events = append(events, glm46EventContent{content: unambiguous})
-			}
-			return events, false
-		}
-
-	case glm46ParserState_ToolStartedEatingWhitespace:
-		return p.eatLeadingWhitespaceAndTransitionTo(glm46ParserState_CollectingToolContent)
-
-	case glm46ParserState_CollectingToolContent:
-		acc := p.buffer.String()
-		if strings.Contains(acc, glm46ToolCloseTag) {
-			toolContent, _ := glm46SplitAtTag(p, glm46ToolCloseTag, true)
-			if len(toolContent) == 0 {
-				slog.Warn("glm46 tool call closing tag found but no content before it")
-			}
-			events = append(events, glm46EventRawToolCall{raw: toolContent})
-			p.state = glm46ParserState_CollectingContent
-			return events, true
-		} else {
-			// Keep accumulating - tool calls are not streamed
-			// We just wait for the closing tag
-			return events, false
-		}
-
-	default:
-		panic("unreachable")
-	}
-}
-
-// GLMToolCallXML represents the structure of a GLM-4.6 tool call for XML parsing
-type GLMToolCallXML struct {
-	XMLName xml.Name `xml:"tool_call"`
-	Content string   `xml:",chardata"` // Function name (text nodes between tags)
-	Keys    []string `xml:"arg_key"`   // All arg_key elements in document order
-	Values  []string `xml:"arg_value"` // All arg_value elements in document order
-}
-
-// escapeGLM46Content escapes XML entities in text content while preserving arg_key/arg_value tags
-func escapeGLM46Content(s string) string {
-	var result strings.Builder
-	inTag := false
-
-	for i := range len(s) {
-		ch := s[i]
-
-		if ch == '<' {
-			// Check if this is a known tag
-			if strings.HasPrefix(s[i:], "<arg_key>") ||
-				strings.HasPrefix(s[i:], "</arg_key>") ||
-				strings.HasPrefix(s[i:], "<arg_value>") ||
-				strings.HasPrefix(s[i:], "</arg_value>") {
-				inTag = true
-			}
-		}
-
-		if inTag {
-			result.WriteByte(ch)
-			if ch == '>' {
-				inTag = false
-			}
-		} else {
-			// Escape special characters in text content
-			switch ch {
-			case '&':
-				result.WriteString("&amp;")
-			case '<':
-				result.WriteString("&lt;")
-			case '>':
-				result.WriteString("&gt;")
-			default:
-				result.WriteByte(ch)
-			}
-		}
-	}
-
-	return result.String()
-}
-
-func parseGLM46ToolCall(raw glm46EventRawToolCall, tools []api.Tool) (api.ToolCall, error) {
-	// Escape any unescaped entities in text content
-	// We need to escape text between tags, but not the tags themselves
-	escaped := escapeGLM46Content(raw.raw)
-
-	// Wrap the content in a root element to make it valid XML
-	xmlString := "<tool_call>" + escaped + "</tool_call>"
-
-	// Parse XML into struct
-	var parsed GLMToolCallXML
-	if err := xml.Unmarshal([]byte(xmlString), &parsed); err != nil {
-		return api.ToolCall{}, fmt.Errorf("failed to parse XML: %w", err)
-	}
-
-	// Extract and trim function name
-	functionName := strings.TrimSpace(parsed.Content)
-	if functionName == "" {
-		return api.ToolCall{}, fmt.Errorf("empty function name")
-	}
-
-	// Verify keys and values are paired correctly
-	if len(parsed.Keys) != len(parsed.Values) {
-		return api.ToolCall{}, fmt.Errorf("mismatched arg_key and arg_value counts: %d keys, %d values", len(parsed.Keys), len(parsed.Values))
-	}
-
-	// Find the matching tool to get parameter types
-	var matchedTool *api.Tool
-	for i := range tools {
-		if tools[i].Function.Name == functionName {
-			matchedTool = &tools[i]
-			break
-		}
-	}
-
-	// Build arguments map by pairing keys and values
-	toolCall := api.ToolCall{
-		Function: api.ToolCallFunction{
-			Name:      functionName,
-			Arguments: api.NewToolCallFunctionArguments(),
-		},
-	}
-
-	for i := range parsed.Keys {
-		key := strings.TrimSpace(parsed.Keys[i])
-		value := parsed.Values[i] // Don't trim here - parseValue handles it
-
-		// Look up parameter type
-		var paramType api.PropertyType
-		if matchedTool != nil && matchedTool.Function.Parameters.Properties != nil {
-			if prop, ok := matchedTool.Function.Parameters.Properties.Get(key); ok {
-				// Handle anyOf by collecting all types from the union
-				if len(prop.AnyOf) > 0 {
-					for _, anyOfProp := range prop.AnyOf {
-						paramType = append(paramType, anyOfProp.Type...)
-					}
-				} else {
-					paramType = prop.Type
-				}
-			}
-		}
-
-		// Parse value with type coercion
-		toolCall.Function.Arguments.Set(key, parseValue(value, paramType))
-	}
-
-	return toolCall, nil
-}
--- a/model/parsers/glm46_test.go
+++ b/model/parsers/glm46_test.go
@@ -1,862 +0,0 @@
-package parsers
-
-import (
-	"encoding/xml"
-	"reflect"
-	"testing"
-
-	"github.com/ollama/ollama/api"
-)
-
-func TestGLM46ParserStreaming(t *testing.T) {
-	type step struct {
-		input      string
-		wantEvents []glm46Event
-	}
-
-	cases := []struct {
-		desc  string
-		steps []step
-		only  bool
-	}{
-		{
-			desc: "leading whitespace before think tag",
-			steps: []step{
-				{
-					input:      "   \n\t  ",
-					wantEvents: []glm46Event{},
-				},
-				{
-					input:      "<think>thinking</think>",
-					wantEvents: []glm46Event{glm46EventThinkingContent{content: "thinking"}},
-				},
-			},
-		},
-		{
-			desc: "think tag with whitespace inside",
-			steps: []step{
-				{
-					input: "<think>  \n  thinking content  \n  </think>regular content",
-					wantEvents: []glm46Event{
-						glm46EventThinkingContent{content: "thinking content"},
-						glm46EventContent{content: "regular content"},
-					},
-				},
-			},
-		},
-		{
-			desc: "tool call with leading whitespace after opening tag",
-			steps: []step{
-				{
-					input: "<think></think><tool_call>  \n  test  \n  </tool_call>",
-					wantEvents: []glm46Event{
-						glm46EventRawToolCall{raw: "test"},
-					},
-				},
-			},
-		},
-		{
-			desc: "simple thinking then content",
-			steps: []step{
-				{
-					input: "<think>I am thinking</think>Now I respond",
-					wantEvents: []glm46Event{
-						glm46EventThinkingContent{content: "I am thinking"},
-						glm46EventContent{content: "Now I respond"},
-					},
-				},
-			},
-		},
-		{
-			desc: "streamed thinking content",
-			steps: []step{
-				{
-					input:      "<think>hello",
-					wantEvents: []glm46Event{glm46EventThinkingContent{content: "hello"}},
-				},
-				{
-					input:      " world",
-					wantEvents: []glm46Event{glm46EventThinkingContent{content: " world"}},
-				},
-				{
-					input: "</think>content",
-					wantEvents: []glm46Event{
-						glm46EventContent{content: "content"},
-					},
-				},
-			},
-		},
-		{
-			desc: "content before tool call",
-			steps: []step{
-				{
-					input: "<think>Let me call a tool</think>here is text<tool_call>",
-					wantEvents: []glm46Event{
-						glm46EventThinkingContent{content: "Let me call a tool"},
-						glm46EventContent{content: "here is text"},
-					},
-				},
-				{
-					input: "function_name\n<arg_key>param</arg_key>\n<arg_value>value</arg_value>\n</tool_call>",
-					wantEvents: []glm46Event{
-						glm46EventRawToolCall{raw: "function_name\n<arg_key>param</arg_key>\n<arg_value>value</arg_value>"},
-					},
-				},
-			},
-		},
-		{
-			desc: "tool call with content after",
-			steps: []step{
-				{
-					input: "<think>thinking</think><tool_call>test</tool_call>after tool",
-					wantEvents: []glm46Event{
-						glm46EventThinkingContent{content: "thinking"},
-						glm46EventRawToolCall{raw: "test"},
-						glm46EventContent{content: "after tool"},
-					},
-				},
-			},
-		},
-		{
-			desc: "trailing whitespace between content and tool call is trimmed",
-			steps: []step{
-				{
-					input: "<think>thinking</think>content\n  \t  <tool_call>test</tool_call>",
-					wantEvents: []glm46Event{
-						glm46EventThinkingContent{content: "thinking"},
-						glm46EventContent{content: "content"},
-						glm46EventRawToolCall{raw: "test"},
-					},
-				},
-			},
-		},
-		{
-			desc: "trailing whitespace between tool call and content is trimmed",
-			steps: []step{
-				{
-					input: "<think>think</think><tool_call>test</tool_call>\n\t  after",
-					wantEvents: []glm46Event{
-						glm46EventThinkingContent{content: "think"},
-						glm46EventRawToolCall{raw: "test"},
-						glm46EventContent{content: "after"},
-					},
-				},
-			},
-		},
-		{
-			desc: "split thinking close tag",
-			steps: []step{
-				{
-					input:      "<think>thinking content</th",
-					wantEvents: []glm46Event{glm46EventThinkingContent{content: "thinking content"}},
-				},
-				{
-					input: "ink>after",
-					wantEvents: []glm46Event{
-						glm46EventContent{content: "after"},
-					},
-				},
-			},
-		},
-		{
-			desc: "split thinking open tag",
-			steps: []step{
-				{
-					input:      "  <thi",
-					wantEvents: []glm46Event{},
-				},
-				{
-					input:      "nk>content</think>",
-					wantEvents: []glm46Event{glm46EventThinkingContent{content: "content"}},
-				},
-			},
-		},
-		{
-			desc: "split tool open tag",
-			steps: []step{
-				{
-					input:      "<think>think</think>content<tool",
-					wantEvents: []glm46Event{glm46EventThinkingContent{content: "think"}, glm46EventContent{content: "content"}},
-				},
-				{
-					input:      "_call>inside",
-					wantEvents: []glm46Event{},
-				},
-				{
-					input: "</tool_call>",
-					wantEvents: []glm46Event{
-						glm46EventRawToolCall{raw: "inside"},
-					},
-				},
-			},
-		},
-		{
-			desc: "partial thinking close tag fakeout",
-			steps: []step{
-				{
-					input:      "<think>content</th",
-					wantEvents: []glm46Event{glm46EventThinkingContent{content: "content"}},
-				},
-				{
-					input:      "ought more",
-					wantEvents: []glm46Event{glm46EventThinkingContent{content: "</thought more"}},
-				},
-			},
-		},
-		{
-			desc: "partial thinking open tag fakeout",
-			steps: []step{
-				{
-					input:      "  <thi",
-					wantEvents: []glm46Event{},
-				},
-				{
-					input: "nking is fun",
-					wantEvents: []glm46Event{
-						glm46EventContent{content: "  <thinking is fun"},
-					},
-				},
-			},
-		},
-		{
-			desc: "partial tool open tag fakeout",
-			steps: []step{
-				{
-					input: "<think></think>content\n<tool",
-					wantEvents: []glm46Event{
-						glm46EventContent{content: "content"},
-					},
-				},
-				{
-					input: " fakeout",
-					wantEvents: []glm46Event{
-						glm46EventContent{content: "\n<tool fakeout"},
-					},
-				},
-			},
-		},
-		{
-			desc: "partial tool close tag fakeout",
-			steps: []step{
-				{
-					input:      "<think></think><tool_call>content</tool",
-					wantEvents: []glm46Event{},
-				},
-				{
-					input:      " fakeout",
-					wantEvents: []glm46Event{},
-				},
-				{
-					input: "</tool_call>",
-					wantEvents: []glm46Event{
-						glm46EventRawToolCall{raw: "content</tool fakeout"},
-					},
-				},
-			},
-		},
-		{
-			desc: "empty thinking tag",
-			steps: []step{
-				{
-					input: "<think></think>content here",
-					wantEvents: []glm46Event{
-						glm46EventContent{content: "content here"},
-					},
-				},
-			},
-		},
-		{
-			desc: "multiple tool calls in sequence",
-			steps: []step{
-				{
-					input: "<think>think</think><tool_call>first</tool_call>between<tool_call>second</tool_call>end",
-					wantEvents: []glm46Event{
-						glm46EventThinkingContent{content: "think"},
-						glm46EventRawToolCall{raw: "first"},
-						glm46EventContent{content: "between"},
-						glm46EventRawToolCall{raw: "second"},
-						glm46EventContent{content: "end"},
-					},
-				},
-			},
-		},
-		{
-			desc: "no thinking tag - direct to content",
-			steps: []step{
-				{
-					input: "just content here",
-					wantEvents: []glm46Event{
-						glm46EventContent{content: "just content here"},
-					},
-				},
-			},
-		},
-		{
-			desc: "no thinking tag - skip to content then tool call",
-			steps: []step{
-				{
-					input: "Here's the answer:<tool_call>test</tool_call>done",
-					wantEvents: []glm46Event{
-						glm46EventContent{content: "Here's the answer:"},
-						glm46EventRawToolCall{raw: "test"},
-						glm46EventContent{content: "done"},
-					},
-				},
-			},
-		},
-		{
-			desc: "no thinking tag - whitespace preserved when no tags",
-			steps: []step{
-				{
-					input: "  \n  content with leading whitespace",
-					wantEvents: []glm46Event{
-						glm46EventContent{content: "  \n  content with leading whitespace"},
-					},
-				},
-			},
-		},
-		{
-			desc: "whitespace after think close tag gets eaten",
-			steps: []step{
-				{
-					input: "<think>thinking</think>  \n\t  content",
-					wantEvents: []glm46Event{
-						glm46EventThinkingContent{content: "thinking"},
-						glm46EventContent{content: "content"},
-					},
-				},
-			},
-		},
-		{
-			desc: "whitespace after tool_call close tag gets eaten",
-			steps: []step{
-				{
-					input: "<think></think><tool_call>test</tool_call>  \n\t  content",
-					wantEvents: []glm46Event{
-						glm46EventRawToolCall{raw: "test"},
-						glm46EventContent{content: "content"},
-					},
-				},
-			},
-		},
-		{
-			desc: "thinking content withholds trailing whitespace (single chunk)",
-			steps: []step{
-				{
-					input: "<think>thinking content   ",
-					wantEvents: []glm46Event{
-						glm46EventThinkingContent{content: "thinking content"},
-					},
-				},
-				{
-					input: "</think>after",
-					wantEvents: []glm46Event{
-						glm46EventContent{content: "after"},
-					},
-				},
-			},
-		},
-		{
-			desc: "thinking content withholds trailing whitespace with newlines",
-			steps: []step{
-				{
-					input: "<think>thinking\n\n  ",
-					wantEvents: []glm46Event{
-						glm46EventThinkingContent{content: "thinking"},
-					},
-				},
-				{
-					input: "</think>content",
-					wantEvents: []glm46Event{
-						glm46EventContent{content: "content"},
-					},
-				},
-			},
-		},
-		{
-			desc: "thinking content trailing whitespace emitted when more content arrives",
-			steps: []step{
-				{
-					input: "<think>thinking   ",
-					wantEvents: []glm46Event{
-						glm46EventThinkingContent{content: "thinking"},
-					},
-				},
-				{
-					input: "more thinking",
-					wantEvents: []glm46Event{
-						glm46EventThinkingContent{content: "   more thinking"},
-					},
-				},
-				{
-					input:      "</think>",
-					wantEvents: []glm46Event{},
-				},
-			},
-		},
-		{
-			desc: "thinking content withholds trailing whitespace before partial close tag",
-			steps: []step{
-				{
-					input: "<think>thinking   </th",
-					wantEvents: []glm46Event{
-						glm46EventThinkingContent{content: "thinking"},
-					},
-				},
-				{
-					input: "ink>content",
-					wantEvents: []glm46Event{
-						glm46EventContent{content: "content"},
-					},
-				},
-			},
-		},
-	}
-
-	anyOnlies := false
-	for _, tc := range cases {
-		if tc.only {
-			anyOnlies = true
-		}
-	}
-
-	for _, tc := range cases {
-		if anyOnlies && !tc.only {
-			continue
-		}
-
-		t.Run(tc.desc, func(t *testing.T) {
-			parser := GLM46Parser{}
-
-			for i, step := range tc.steps {
-				parser.buffer.WriteString(step.input)
-				gotEvents := parser.parseEvents()
-
-				if len(gotEvents) == 0 && len(step.wantEvents) == 0 {
-					// avoid deep equal on empty vs. nil slices
-					continue
-				}
-
-				if !reflect.DeepEqual(gotEvents, step.wantEvents) {
-					t.Errorf("step %d: input %q: got events %#v, want %#v", i, step.input, gotEvents, step.wantEvents)
-				}
-			}
-		})
-	}
-}
-
-// TestGLMToolCallXMLOrderPreservation verifies that xml.Unmarshal preserves
-// document order when collecting multiple elements with the same tag name into slices.
-// This is a critical assumption for the GLM-4.6 parser's struct-based approach.
-func TestGLMToolCallXMLOrderPreservation(t *testing.T) {
-	testCases := []struct {
-		name       string
-		xml        string
-		wantKeys   []string
-		wantValues []string
-	}{
-		{
-			name: "alternating keys and values",
-			xml: `<tool_call>
-function_name
-<arg_key>first</arg_key>
-<arg_value>A</arg_value>
-<arg_key>second</arg_key>
-<arg_value>B</arg_value>
-<arg_key>third</arg_key>
-<arg_value>C</arg_value>
-</tool_call>`,
-			wantKeys:   []string{"first", "second", "third"},
-			wantValues: []string{"A", "B", "C"},
-		},
-		{
-			name: "all keys then all values",
-			xml: `<tool_call>
-function_name
-<arg_key>key1</arg_key>
-<arg_key>key2</arg_key>
-<arg_key>key3</arg_key>
-<arg_value>val1</arg_value>
-<arg_value>val2</arg_value>
-<arg_value>val3</arg_value>
-</tool_call>`,
-			wantKeys:   []string{"key1", "key2", "key3"},
-			wantValues: []string{"val1", "val2", "val3"},
-		},
-		{
-			name: "mixed grouping",
-			xml: `<tool_call>
-function_name
-<arg_key>a</arg_key>
-<arg_value>1</arg_value>
-<arg_key>b</arg_key>
-<arg_key>c</arg_key>
-<arg_value>2</arg_value>
-<arg_value>3</arg_value>
-</tool_call>`,
-			wantKeys:   []string{"a", "b", "c"},
-			wantValues: []string{"1", "2", "3"},
-		},
-		{
-			name: "reverse order - all values then all keys",
-			xml: `<tool_call>
-function_name
-<arg_value>X</arg_value>
-<arg_value>Y</arg_value>
-<arg_value>Z</arg_value>
-<arg_key>x</arg_key>
-<arg_key>y</arg_key>
-<arg_key>z</arg_key>
-</tool_call>`,
-			wantKeys:   []string{"x", "y", "z"},
-			wantValues: []string{"X", "Y", "Z"},
-		},
-	}
-
-	for _, tc := range testCases {
-		t.Run(tc.name, func(t *testing.T) {
-			var parsed GLMToolCallXML
-			err := xml.Unmarshal([]byte(tc.xml), &parsed)
-			if err != nil {
-				t.Fatalf("failed to unmarshal XML: %v", err)
-			}
-
-			if !reflect.DeepEqual(parsed.Keys, tc.wantKeys) {
-				t.Errorf("Keys order mismatch:\ngot:  %v\nwant: %v", parsed.Keys, tc.wantKeys)
-			}
-
-			if !reflect.DeepEqual(parsed.Values, tc.wantValues) {
-				t.Errorf("Values order mismatch:\ngot:  %v\nwant: %v", parsed.Values, tc.wantValues)
-			}
-		})
-	}
-}
-
-func TestGLM46ToolCallParsing(t *testing.T) {
-	type testCase struct {
-		name         string
-		rawToolCall  string
-		tools        []api.Tool
-		wantToolCall api.ToolCall
-	}
-
-	cases := []testCase{
-		{
-			name:  "simple tool call",
-			tools: []api.Tool{},
-			rawToolCall: `get-current-weather
-<arg_key>location</arg_key>
-<arg_value>New York, NY</arg_value>
-<arg_key>unit</arg_key>
-<arg_value>celsius</arg_value>`,
-			wantToolCall: api.ToolCall{
-				Function: api.ToolCallFunction{
-					Name:      "get-current-weather",
-					Arguments: args(`{"location": "New York, NY", "unit": "celsius"}`),
-				},
-			},
-		},
-		{
-			name: "tool call with typed parameters",
-			tools: []api.Tool{
-				tool("calculate", map[string]api.ToolProperty{
-					"x":       {Type: api.PropertyType{"number"}},
-					"y":       {Type: api.PropertyType{"integer"}},
-					"enabled": {Type: api.PropertyType{"boolean"}},
-					"items":   {Type: api.PropertyType{"array"}},
-				}),
-			},
-			rawToolCall: `calculate
-<arg_key>x</arg_key>
-<arg_value>3.14</arg_value>
-<arg_key>y</arg_key>
-<arg_value>42</arg_value>
-<arg_key>enabled</arg_key>
-<arg_value>true</arg_value>
-<arg_key>items</arg_key>
-<arg_value>["a", "b", "c"]</arg_value>`,
-			wantToolCall: api.ToolCall{
-				Function: api.ToolCallFunction{
-					Name:      "calculate",
-					Arguments: args(`{"enabled": true, "items": ["a", "b", "c"], "x": 3.14, "y": 42}`),
-				},
-			},
-		},
-		{
-			name:  "function name with whitespace",
-			tools: []api.Tool{},
-			rawToolCall: `  get-weather
-<arg_key>city</arg_key>
-<arg_value>Paris</arg_value>`,
-			wantToolCall: api.ToolCall{
-				Function: api.ToolCallFunction{
-					Name:      "get-weather",
-					Arguments: args(`{"city": "Paris"}`),
-				},
-			},
-		},
-		{
-			name:  "values with special characters",
-			tools: []api.Tool{},
-			rawToolCall: `execute-command
-<arg_key>command</arg_key>
-<arg_value>ls && echo "done"</arg_value>
-<arg_key>message</arg_key>
-<arg_value>a < b and c > d</arg_value>`,
-			wantToolCall: api.ToolCall{
-				Function: api.ToolCallFunction{
-					Name:      "execute-command",
-					Arguments: args(`{"command": "ls && echo \"done\"", "message": "a < b and c > d"}`),
-				},
-			},
-		},
-		{
-			name:  "unicode in function names and values",
-			tools: []api.Tool{},
-			rawToolCall: `获取天气
-<arg_key>城市</arg_key>
-<arg_value>北京</arg_value>
-<arg_key>message</arg_key>
-<arg_value>Hello! 你好! 🌟</arg_value>`,
-			wantToolCall: api.ToolCall{
-				Function: api.ToolCallFunction{
-					Name:      "获取天气",
-					Arguments: args(`{"message": "Hello! 你好! 🌟", "城市": "北京"}`),
-				},
-			},
-		},
-		{
-			name:  "empty value",
-			tools: []api.Tool{},
-			rawToolCall: `test-function
-<arg_key>param1</arg_key>
-<arg_value></arg_value>`,
-			wantToolCall: api.ToolCall{
-				Function: api.ToolCallFunction{
-					Name:      "test-function",
-					Arguments: args(`{"param1": ""}`),
-				},
-			},
-		},
-		{
-			name:  "special chars in arg_key names",
-			tools: []api.Tool{},
-			rawToolCall: `test-function
-<arg_key>param<1></arg_key>
-<arg_value>value1</arg_value>
-<arg_key>a&b</arg_key>
-<arg_value>value2</arg_value>
-<arg_key>x>y</arg_key>
-<arg_value>value3</arg_value>`,
-			wantToolCall: api.ToolCall{
-				Function: api.ToolCallFunction{
-					Name:      "test-function",
-					Arguments: args(`{"a&b": "value2", "param<1>": "value1", "x>y": "value3"}`),
-				},
-			},
-		},
-		{
-			name:  "multiple consecutive ampersands",
-			tools: []api.Tool{},
-			rawToolCall: `test-function
-<arg_key>param</arg_key>
-<arg_value>test &&&& more</arg_value>`,
-			wantToolCall: api.ToolCall{
-				Function: api.ToolCallFunction{
-					Name:      "test-function",
-					Arguments: args(`{"param": "test &&&& more"}`),
-				},
-			},
-		},
-		{
-			name:  "mixed special chars together",
-			tools: []api.Tool{},
-			rawToolCall: `test-function
-<arg_key>param</arg_key>
-<arg_value><>&<>&</arg_value>`,
-			wantToolCall: api.ToolCall{
-				Function: api.ToolCallFunction{
-					Name:      "test-function",
-					Arguments: args(`{"param": "<>&<>&"}`),
-				},
-			},
-		},
-		{
-			name:  "newlines and tabs in parameter values",
-			tools: []api.Tool{},
-			rawToolCall: `test-function
-<arg_key>multiline</arg_key>
-<arg_value>line1
-	indented line2
-line3</arg_value>`,
-			wantToolCall: api.ToolCall{
-				Function: api.ToolCallFunction{
-					Name:      "test-function",
-					Arguments: args(`{"multiline": "line1\n\tindented line2\nline3"}`),
-				},
-			},
-		},
-		{
-			name:  "single and double quotes in values",
-			tools: []api.Tool{},
-			rawToolCall: `test-function
-<arg_key>quotes</arg_key>
-<arg_value>She said "Hello's there!"</arg_value>`,
-			wantToolCall: api.ToolCall{
-				Function: api.ToolCallFunction{
-					Name:      "test-function",
-					Arguments: args(`{"quotes": "She said \"Hello's there!\""}`),
-				},
-			},
-		},
-		{
-			name:  "CDATA-like content that should be treated as text",
-			tools: []api.Tool{},
-			rawToolCall: `test-function
-<arg_key>cdata</arg_key>
-<arg_value><![CDATA[not actual cdata]]></arg_value>`,
-			wantToolCall: api.ToolCall{
-				Function: api.ToolCallFunction{
-					Name:      "test-function",
-					Arguments: args(`{"cdata": "<![CDATA[not actual cdata]]>"}`),
-				},
-			},
-		},
-		{
-			name:  "all special XML entities",
-			tools: []api.Tool{},
-			rawToolCall: `test-function
-<arg_key>entities</arg_key>
-<arg_value>&lt;&gt;&amp;&apos;&quot;</arg_value>`,
-			wantToolCall: api.ToolCall{
-				Function: api.ToolCallFunction{
-					Name:      "test-function",
-					Arguments: args(`{"entities": "&lt;&gt;&amp;&apos;&quot;"}`),
-				},
-			},
-		},
-		{
-			name:  "order preservation with multiple parameters",
-			tools: []api.Tool{},
-			rawToolCall: `test-function
-<arg_key>first</arg_key>
-<arg_value>value1</arg_value>
-<arg_key>second</arg_key>
-<arg_value>value2</arg_value>
-<arg_key>third</arg_key>
-<arg_value>value3</arg_value>
-<arg_key>fourth</arg_key>
-<arg_value>value4</arg_value>
-<arg_key>fifth</arg_key>
-<arg_value>value5</arg_value>`,
-			wantToolCall: api.ToolCall{
-				Function: api.ToolCallFunction{
-					Name:      "test-function",
-					Arguments: args(`{"fifth": "value5", "first": "value1", "fourth": "value4", "second": "value2", "third": "value3"}`),
-				},
-			},
-		},
-		{
-			name:  "order preservation with identical key names but different positions",
-			tools: []api.Tool{},
-			rawToolCall: `test-function
-<arg_key>param</arg_key>
-<arg_value>first occurrence</arg_value>
-<arg_key>other</arg_key>
-<arg_value>middle</arg_value>
-<arg_key>param</arg_key>
-<arg_value>second occurrence</arg_value>`,
-			wantToolCall: api.ToolCall{
-				Function: api.ToolCallFunction{
-					Name: "test-function",
-					// Later occurrence should overwrite earlier one
-					Arguments: args(`{"other": "middle", "param": "second occurrence"}`),
-				},
-			},
-		},
-		{
-			name: "array with mixed types",
-			tools: []api.Tool{
-				tool("process", map[string]api.ToolProperty{
-					"items": {Type: api.PropertyType{"array"}},
-				}),
-			},
-			rawToolCall: `process
-<arg_key>items</arg_key>
-<arg_value>[1, "hello", true, null]</arg_value>`,
-			wantToolCall: api.ToolCall{
-				Function: api.ToolCallFunction{
-					Name:      "process",
-					Arguments: args(`{"items": [1, "hello", true, null]}`),
-				},
-			},
-		},
-		{
-			name: "empty array",
-			tools: []api.Tool{
-				tool("test", map[string]api.ToolProperty{
-					"tags": {Type: api.PropertyType{"array"}},
-				}),
-			},
-			rawToolCall: `test
-<arg_key>tags</arg_key>
-<arg_value>[]</arg_value>`,
-			wantToolCall: api.ToolCall{
-				Function: api.ToolCallFunction{
-					Name:      "test",
-					Arguments: args(`{"tags": []}`),
-				},
-			},
-		},
-		{
-			name: "anyOf array or string - with array of objects",
-			tools: []api.Tool{
-				tool("TodoWrite", map[string]api.ToolProperty{
-					"todos": {AnyOf: []api.ToolProperty{{Type: api.PropertyType{"array"}}, {Type: api.PropertyType{"string"}}}},
-				}),
-			},
-			// <tool_call>TodoWrite
-			// <arg_key>todos</arg_key>
-			// <arg_value>[{"content": "Set up HTML file and basic structure", "id": "1", "priority": "high", "status": "pending"}, {"content": "Create 3D scene with Three.js", "id": "2", "priority": "high", "status": "pending"}, {"content": "Implement terrain generation with blocks", "id": "3", "priority": "high", "status": "pending"}, {"content": "Add player controls (movement, camera)", "id": "4", "priority": "high", "status": "pending"}, {"content": "Implement block placement/destruction", "id": "5", "priority": "medium", "status": "pending"}, {"content": "Add lighting and textures", "id": "6", "priority": "medium", "status": "pending"}, {"content": "Test and optimize performance", "id": "7", "priority": "low", "status": "pending"}]</arg_value>
-			// </tool_call>
-			rawToolCall: `TodoWrite
-<arg_key>todos</arg_key>
-<arg_value>[{"content": "task 1", "status": "pending", "priority": "high", "id": "1"}, {"content": "task 2", "status": "completed", "priority": "low", "id": "2"}]</arg_value>`,
-			wantToolCall: api.ToolCall{
-				Function: api.ToolCallFunction{
-					Name:      "TodoWrite",
-					Arguments: args(`{"todos": [{"content": "task 1", "id": "1", "priority": "high", "status": "pending"}, {"content": "task 2", "id": "2", "priority": "low", "status": "completed"}]}`),
-				},
-			},
-		},
-		{
-			name: "anyOf array or string - with plain string",
-			tools: []api.Tool{
-				tool("TodoWrite", map[string]api.ToolProperty{
-					"todos": {Type: api.PropertyType{"array", "string"}},
-				}),
-			},
-			rawToolCall: `TodoWrite
-<arg_key>todos</arg_key>
-<arg_value>Error: could not load todos</arg_value>`,
-			wantToolCall: api.ToolCall{
-				Function: api.ToolCallFunction{
-					Name:      "TodoWrite",
-					Arguments: args(`{"todos": "Error: could not load todos"}`),
-				},
-			},
-		},
-	}
-
-	for i, tc := range cases {
-		t.Run(tc.name, func(t *testing.T) {
-			gotToolCall, err := parseGLM46ToolCall(glm46EventRawToolCall{raw: tc.rawToolCall}, tc.tools)
-			if err != nil {
-				t.Errorf("case %d (%s): %v", i, tc.name, err)
-			}
-			if !toolCallEqual(gotToolCall, tc.wantToolCall) {
-				t.Errorf("case %d (%s): got tool call %#v, want %#v", i, tc.name, gotToolCall, tc.wantToolCall)
-			}
-		})
-	}
-}
--- a/model/parsers/glm47.go
+++ b/model/parsers/glm47.go
@@ -1,20 +0,0 @@
-package parsers
-
-import "github.com/ollama/ollama/api"
-
-// GLM47Parser extends GLM46Parser with thinking-aware initialization.
-// GLM-4.7's prompt ends with <think> when thinking is enabled, so the parser
-// must start in CollectingThinking state (the model outputs thinking content directly).
-type GLM47Parser struct {
-	GLM46Parser
-}
-
-func (p *GLM47Parser) Init(tools []api.Tool, lastMessage *api.Message, thinkValue *api.ThinkValue) []api.Tool {
-	p.tools = tools
-	// When thinking is enabled (nil or true), the prompt ends with <think>,
-	// so model output starts directly with thinking content (no opening tag).
-	if thinkValue == nil || thinkValue.Bool() {
-		p.state = glm46ParserState_CollectingThinking
-	}
-	return tools
-}
--- a/model/parsers/glm47_test.go
+++ b/model/parsers/glm47_test.go
@@ -1,99 +0,0 @@
-package parsers
-
-import (
-	"reflect"
-	"testing"
-
-	"github.com/ollama/ollama/api"
-)
-
-func TestGLM47ParserAdd(t *testing.T) {
-	parser := GLM47Parser{}
-	parser.Init([]api.Tool{
-		tool("calculate", map[string]api.ToolProperty{
-			"count":   {Type: api.PropertyType{"integer"}},
-			"enabled": {Type: api.PropertyType{"boolean"}},
-		}),
-	}, nil, nil)
-
-	// When thinking is enabled (thinkValue nil), the prompt ends with <think>,
-	// so the model output does NOT include the opening <think> tag.
-	content, thinking, calls, err := parser.Add("plan</think>Answer<tool_call>calculate<arg_key>count</arg_key><arg_value>3</arg_value><arg_key>enabled</arg_key><arg_value>true</arg_value></tool_call>", true)
-	if err != nil {
-		t.Fatalf("parse failed: %v", err)
-	}
-	if thinking != "plan" {
-		t.Fatalf("expected thinking 'plan', got %q", thinking)
-	}
-	if content != "Answer" {
-		t.Fatalf("expected content 'Answer', got %q", content)
-	}
-	if len(calls) != 1 {
-		t.Fatalf("expected 1 tool call, got %d", len(calls))
-	}
-	expectedArgs := args(`{"count": 3, "enabled": true}`)
-	if !toolCallEqual(api.ToolCall{Function: api.ToolCallFunction{Arguments: calls[0].Function.Arguments}}, api.ToolCall{Function: api.ToolCallFunction{Arguments: expectedArgs}}) {
-		t.Fatalf("expected args %#v, got %#v", expectedArgs.ToMap(), calls[0].Function.Arguments.ToMap())
-	}
-}
-
-func TestGLM47ParserNoThinkingContent(t *testing.T) {
-	parser := GLM47Parser{}
-	parser.Init(nil, nil, nil)
-
-	// When thinking is enabled but model has no thinking to output,
-	// it should output </think> immediately followed by content.
-	content, thinking, calls, err := parser.Add("</think>Plain answer", true)
-	if err != nil {
-		t.Fatalf("parse failed: %v", err)
-	}
-	if thinking != "" {
-		t.Fatalf("expected empty thinking, got %q", thinking)
-	}
-	if content != "Plain answer" {
-		t.Fatalf("expected content 'Plain answer', got %q", content)
-	}
-	if len(calls) != 0 {
-		t.Fatalf("expected no tool calls, got %d", len(calls))
-	}
-}
-
-func TestGLM47ParserThinkingDisabled(t *testing.T) {
-	parser := GLM47Parser{}
-	// When thinking is disabled, parser stays in LookingForThinkingOpen state
-	parser.Init(nil, nil, &api.ThinkValue{Value: false})
-
-	// Model outputs plain content (prompt ended with </think>)
-	content, thinking, calls, err := parser.Add("Plain answer", true)
-	if err != nil {
-		t.Fatalf("parse failed: %v", err)
-	}
-	if thinking != "" {
-		t.Fatalf("expected empty thinking, got %q", thinking)
-	}
-	if content != "Plain answer" {
-		t.Fatalf("expected content 'Plain answer', got %q", content)
-	}
-	if len(calls) != 0 {
-		t.Fatalf("expected no tool calls, got %d", len(calls))
-	}
-}
-
-func TestGLM47ParserToolCallEscaping(t *testing.T) {
-	toolCall, err := parseGLM46ToolCall(glm46EventRawToolCall{raw: `exec
-<arg_key>expr</arg_key>
-<arg_value>a < b && c > d</arg_value>`}, nil)
-	if err != nil {
-		t.Fatalf("parse failed: %v", err)
-	}
-
-	expected := api.ToolCall{
-		Function: api.ToolCallFunction{
-			Name:      "exec",
-			Arguments: args(`{"expr": "a < b && c > d"}`),
-		},
-	}
-	if !reflect.DeepEqual(toolCall, expected) {
-		t.Fatalf("expected %#v, got %#v", expected, toolCall)
-	}
-}
--- a/model/parsers/lfm2.go
+++ b/model/parsers/lfm2.go
@@ -1,498 +0,0 @@
-package parsers
-
-import (
-	"encoding/json"
-	"errors"
-	"log/slog"
-	"strconv"
-	"strings"
-	"unicode"
-
-	"github.com/ollama/ollama/api"
-)
-
-type LFM2ParserState int
-
-const (
-	LFM2CollectingThinking LFM2ParserState = iota
-	LFM2CollectingContent
-	LFM2CollectingToolCalls
-)
-
-const (
-	lfm2ThinkingOpenTag  = "<think>"
-	lfm2ThinkingCloseTag = "</think>"
-	lfm2ToolCallStartTag = "<|tool_call_start|>"
-	lfm2ToolCallEndTag   = "<|tool_call_end|>"
-)
-
-type LFM2Parser struct {
-	state                    LFM2ParserState
-	buffer                   strings.Builder
-	hasThinkingSupport       bool
-	needsThinkingLeadingTrim bool // trim leading whitespace after <think> tag
-	needsContentLeadingTrim  bool // trim leading whitespace after </think> tag
-}
-
-func (p *LFM2Parser) HasToolSupport() bool {
-	return true
-}
-
-func (p *LFM2Parser) HasThinkingSupport() bool {
-	return p.hasThinkingSupport
-}
-
-func (p *LFM2Parser) setInitialState(lastMessage *api.Message, thinkValue *api.ThinkValue) {
-	prefill := lastMessage != nil && lastMessage.Role == "assistant"
-
-	// Check both model capability AND request preference
-	thinkingEnabled := p.HasThinkingSupport() && (thinkValue != nil && thinkValue.Bool())
-
-	if !thinkingEnabled {
-		p.state = LFM2CollectingContent
-		return
-	}
-
-	if prefill && lastMessage.Content != "" {
-		p.state = LFM2CollectingContent
-		return
-	}
-
-	p.state = LFM2CollectingThinking
-	p.needsThinkingLeadingTrim = true
-}
-
-func (p *LFM2Parser) Init(tools []api.Tool, lastMessage *api.Message, thinkValue *api.ThinkValue) []api.Tool {
-	p.setInitialState(lastMessage, thinkValue)
-	return tools
-}
-
-type lfm2Event interface {
-	isLFM2Event()
-}
-
-type lfm2EventThinkingContent struct {
-	content string
-}
-
-type lfm2EventContent struct {
-	content string
-}
-
-type lfm2EventToolCall struct {
-	toolCall api.ToolCall
-}
-
-func (lfm2EventThinkingContent) isLFM2Event() {}
-func (lfm2EventContent) isLFM2Event()         {}
-func (lfm2EventToolCall) isLFM2Event()        {}
-
-func (p *LFM2Parser) Add(s string, done bool) (content string, thinking string, calls []api.ToolCall, err error) {
-	p.buffer.WriteString(s)
-	events := p.parseEvents()
-
-	var toolCalls []api.ToolCall
-	var contentSb strings.Builder
-	var thinkingSb strings.Builder
-	for _, event := range events {
-		switch event := event.(type) {
-		case lfm2EventToolCall:
-			toolCalls = append(toolCalls, event.toolCall)
-		case lfm2EventThinkingContent:
-			thinkingSb.WriteString(event.content)
-		case lfm2EventContent:
-			contentSb.WriteString(event.content)
-		}
-	}
-
-	return contentSb.String(), thinkingSb.String(), toolCalls, nil
-}
-
-func (p *LFM2Parser) parseEvents() []lfm2Event {
-	var all []lfm2Event
-
-	keepLooping := true
-	for keepLooping {
-		var events []lfm2Event
-		events, keepLooping = p.eat()
-		if len(events) > 0 {
-			all = append(all, events...)
-		}
-	}
-
-	return all
-}
-
-func (p *LFM2Parser) eat() ([]lfm2Event, bool) {
-	var events []lfm2Event
-	bufStr := p.buffer.String()
-	if bufStr == "" {
-		return events, false
-	}
-
-	switch p.state {
-	case LFM2CollectingThinking:
-		// Strip opening <think> tag if present
-		if strings.HasPrefix(bufStr, lfm2ThinkingOpenTag) {
-			bufStr = bufStr[len(lfm2ThinkingOpenTag):]
-			p.needsThinkingLeadingTrim = true
-			p.buffer.Reset()
-			p.buffer.WriteString(bufStr)
-		}
-
-		// Trim leading whitespace after <think> tag (may span multiple chunks)
-		if p.needsThinkingLeadingTrim {
-			if trimmed := strings.TrimLeftFunc(bufStr, unicode.IsSpace); trimmed != bufStr {
-				bufStr = trimmed
-				p.buffer.Reset()
-				p.buffer.WriteString(bufStr)
-			}
-			// Clear flag once we have non-whitespace content or buffer is empty
-			if len(bufStr) > 0 {
-				p.needsThinkingLeadingTrim = false
-			}
-		}
-
-		if strings.Contains(bufStr, lfm2ThinkingCloseTag) { // thinking[</think>] -> content
-			split := strings.SplitN(bufStr, lfm2ThinkingCloseTag, 2)
-			thinking := split[0]
-			thinking = strings.TrimRightFunc(thinking, unicode.IsSpace)
-
-			remaining := split[1]
-			remaining = strings.TrimLeftFunc(remaining, unicode.IsSpace)
-
-			p.buffer.Reset()
-			p.buffer.WriteString(remaining)
-			p.state = LFM2CollectingContent
-			p.needsThinkingLeadingTrim = false
-			// Set flag to trim any additional whitespace that may arrive in later chunks
-			p.needsContentLeadingTrim = len(remaining) == 0
-
-			if len(thinking) > 0 {
-				events = append(events, lfm2EventThinkingContent{content: thinking})
-			}
-			return events, true
-		} else if overlapLen := overlap(bufStr, lfm2ThinkingCloseTag); overlapLen > 0 { // partial </think>
-			beforePartialTag := bufStr[:len(bufStr)-overlapLen]
-			trailingLen := trailingWhitespaceLen(beforePartialTag)
-			ambiguousStart := len(beforePartialTag) - trailingLen
-
-			unambiguous := bufStr[:ambiguousStart]
-			ambiguous := bufStr[ambiguousStart:]
-			p.buffer.Reset()
-			p.buffer.WriteString(ambiguous)
-			if len(unambiguous) > 0 {
-				events = append(events, lfm2EventThinkingContent{content: unambiguous})
-			}
-			return events, false
-		} else { // otherwise its thinking content
-			whitespaceLen := trailingWhitespaceLen(bufStr)
-			ambiguousStart := len(bufStr) - whitespaceLen
-
-			unambiguous := bufStr[:ambiguousStart]
-			ambiguous := bufStr[ambiguousStart:]
-			p.buffer.Reset()
-			p.buffer.WriteString(ambiguous)
-			if len(unambiguous) > 0 {
-				events = append(events, lfm2EventThinkingContent{content: unambiguous})
-			}
-			return events, false
-		}
-
-	case LFM2CollectingContent:
-		// Trim leading whitespace after </think> tag (may span multiple chunks)
-		if p.needsContentLeadingTrim {
-			if trimmed := strings.TrimLeftFunc(bufStr, unicode.IsSpace); trimmed != bufStr {
-				bufStr = trimmed
-				p.buffer.Reset()
-				p.buffer.WriteString(bufStr)
-			}
-			// Clear flag once we have non-whitespace content
-			if len(bufStr) > 0 {
-				p.needsContentLeadingTrim = false
-			}
-		}
-
-		if strings.Contains(bufStr, lfm2ToolCallStartTag) { // content[<|tool_call_start|>] -> tool calls
-			split := strings.SplitN(bufStr, lfm2ToolCallStartTag, 2)
-			contentBefore := strings.TrimRightFunc(split[0], unicode.IsSpace)
-			remaining := split[1]
-
-			p.buffer.Reset()
-			p.buffer.WriteString(remaining)
-			p.state = LFM2CollectingToolCalls
-
-			if len(contentBefore) > 0 {
-				events = append(events, lfm2EventContent{content: contentBefore})
-			}
-			return events, true
-		} else { // otherwise its content
-			p.buffer.Reset()
-			if len(bufStr) > 0 {
-				events = append(events, lfm2EventContent{content: bufStr})
-			}
-			return events, false
-		}
-
-	case LFM2CollectingToolCalls:
-		// Look for complete tool call JSON between tags
-		if idx := strings.Index(bufStr, lfm2ToolCallEndTag); idx != -1 {
-			toolCallContent := bufStr[:idx]
-
-			if toolCalls, err := p.parseToolCallsContent(toolCallContent); err == nil && len(toolCalls) > 0 {
-				remaining := bufStr[idx+len(lfm2ToolCallEndTag):]
-
-				// Check if there's another tool call
-				if strings.HasPrefix(remaining, lfm2ToolCallStartTag) {
-					remaining = remaining[len(lfm2ToolCallStartTag):]
-				} else {
-					// No more tool calls, go back to content
-					remaining = strings.TrimLeftFunc(remaining, unicode.IsSpace)
-					p.state = LFM2CollectingContent
-				}
-
-				p.buffer.Reset()
-				p.buffer.WriteString(remaining)
-
-				for _, tc := range toolCalls {
-					events = append(events, lfm2EventToolCall{toolCall: tc})
-				}
-				return events, true
-			} else if err != nil {
-				slog.Warn("lfm2 tool call parsing failed", "error", err, "content", toolCallContent)
-			}
-		}
-
-		return events, false
-	}
-
-	return events, false
-}
-
-// parseToolCallsContent parses one or more tool calls from content
-// Supports JSON format and Python-style format including multiple calls: [func1(...),func2(...)]
-func (p *LFM2Parser) parseToolCallsContent(content string) ([]api.ToolCall, error) {
-	content = strings.TrimSpace(content)
-
-	// Try JSON format first: {"name": "func", "arguments": {...}}
-	var parsed struct {
-		Name      string          `json:"name"`
-		Arguments json.RawMessage `json:"arguments"`
-	}
-
-	if err := json.Unmarshal([]byte(content), &parsed); err == nil && parsed.Name != "" {
-		var args api.ToolCallFunctionArguments
-		if len(parsed.Arguments) > 0 {
-			if err := json.Unmarshal(parsed.Arguments, &args); err != nil {
-				return nil, err
-			}
-		} else {
-			args = api.NewToolCallFunctionArguments()
-		}
-
-		return []api.ToolCall{{
-			Function: api.ToolCallFunction{
-				Name:      parsed.Name,
-				Arguments: args,
-			},
-		}}, nil
-	}
-
-	// Try Python-style format: [func(arg1='val1'),func2(arg2='val2')] or func(arg1='val1')
-	return p.parsePythonStyleToolCalls(content)
-}
-
-// parsePythonStyleToolCalls parses one or more Python-style tool calls
-// Examples: [bash(command='ls'),bash(command='pwd')] or bash(command='ls')
-func (p *LFM2Parser) parsePythonStyleToolCalls(content string) ([]api.ToolCall, error) {
-	content = strings.TrimSpace(content)
-
-	// Strip outer brackets if present: [func(...)] -> func(...)
-	if strings.HasPrefix(content, "[") && strings.HasSuffix(content, "]") {
-		content = content[1 : len(content)-1]
-	}
-
-	var toolCalls []api.ToolCall
-
-	// Parse multiple function calls separated by commas at the top level
-	for len(content) > 0 {
-		content = strings.TrimSpace(content)
-		if content == "" {
-			break
-		}
-
-		// Skip leading comma from previous iteration
-		if strings.HasPrefix(content, ",") {
-			content = strings.TrimSpace(content[1:])
-			if content == "" {
-				break
-			}
-		}
-
-		// Find function name
-		parenIdx := strings.Index(content, "(")
-		if parenIdx == -1 {
-			return nil, errors.New("invalid tool call: no opening parenthesis")
-		}
-
-		funcName := strings.TrimSpace(content[:parenIdx])
-		if funcName == "" {
-			return nil, errors.New("invalid tool call: empty function name")
-		}
-
-		// Find matching closing parenthesis
-		closeIdx := findMatchingParen(content, parenIdx)
-		if closeIdx == -1 {
-			return nil, errors.New("invalid tool call: no matching closing parenthesis")
-		}
-
-		argsStr := content[parenIdx+1 : closeIdx]
-		args := api.NewToolCallFunctionArguments()
-
-		if argsStr != "" {
-			if err := parsePythonArgs(argsStr, &args); err != nil {
-				return nil, err
-			}
-		}
-
-		toolCalls = append(toolCalls, api.ToolCall{
-			Function: api.ToolCallFunction{
-				Name:      funcName,
-				Arguments: args,
-			},
-		})
-
-		// Move past this function call
-		content = content[closeIdx+1:]
-	}
-
-	if len(toolCalls) == 0 {
-		return nil, errors.New("no tool calls found")
-	}
-
-	return toolCalls, nil
-}
-
-// findMatchingParen finds the index of the closing parenthesis matching the one at openIdx
-// Returns -1 if not found. Handles nested parentheses and quoted strings.
-func findMatchingParen(s string, openIdx int) int {
-	depth := 1
-	i := openIdx + 1
-	for i < len(s) && depth > 0 {
-		switch s[i] {
-		case '(':
-			depth++
-		case ')':
-			depth--
-			if depth == 0 {
-				return i
-			}
-		case '\'', '"':
-			// Skip quoted string
-			quote := s[i]
-			i++
-			for i < len(s) && s[i] != quote {
-				if s[i] == '\\' && i+1 < len(s) {
-					i++ // skip escaped char
-				}
-				i++
-			}
-		}
-		i++
-	}
-	return -1
-}
-
-// parseToolCallContent parses a single tool call (for backward compatibility with tests)
-func (p *LFM2Parser) parseToolCallContent(content string) (api.ToolCall, error) {
-	calls, err := p.parseToolCallsContent(content)
-	if err != nil {
-		return api.ToolCall{}, err
-	}
-	if len(calls) == 0 {
-		return api.ToolCall{}, errors.New("no tool call found")
-	}
-	return calls[0], nil
-}
-
-// parsePythonArgs parses Python-style keyword arguments: key='value', key2="value2"
-func parsePythonArgs(argsStr string, args *api.ToolCallFunctionArguments) error {
-	// Simple state machine to parse key='value' pairs
-	// Handles: command='ls', flag="-la", count=42, enabled=true
-	var key string
-	i := 0
-
-	for i < len(argsStr) {
-		// Skip whitespace
-		for i < len(argsStr) && (argsStr[i] == ' ' || argsStr[i] == '\t' || argsStr[i] == '\n') {
-			i++
-		}
-		if i >= len(argsStr) {
-			break
-		}
-
-		// Parse key
-		keyStart := i
-		for i < len(argsStr) && argsStr[i] != '=' && argsStr[i] != ',' {
-			i++
-		}
-		if i >= len(argsStr) || argsStr[i] != '=' {
-			return errors.New("invalid argument: expected '='")
-		}
-		key = strings.TrimSpace(argsStr[keyStart:i])
-		i++ // skip '='
-
-		// Skip whitespace after =
-		for i < len(argsStr) && (argsStr[i] == ' ' || argsStr[i] == '\t') {
-			i++
-		}
-
-		// Parse value
-		var value string
-		if i < len(argsStr) && (argsStr[i] == '\'' || argsStr[i] == '"') {
-			// Quoted string
-			quote := argsStr[i]
-			i++
-			valueStart := i
-			for i < len(argsStr) && argsStr[i] != quote {
-				if argsStr[i] == '\\' && i+1 < len(argsStr) {
-					i += 2 // skip escaped char
-				} else {
-					i++
-				}
-			}
-			value = argsStr[valueStart:i]
-			if i < len(argsStr) {
-				i++ // skip closing quote
-			}
-			args.Set(key, value)
-		} else {
-			// Unquoted value (number, bool, etc)
-			valueStart := i
-			for i < len(argsStr) && argsStr[i] != ',' {
-				i++
-			}
-			value = strings.TrimSpace(argsStr[valueStart:i])
-
-			// Try to parse as number or bool
-			if v, err := strconv.ParseInt(value, 10, 64); err == nil {
-				args.Set(key, v)
-			} else if v, err := strconv.ParseFloat(value, 64); err == nil {
-				args.Set(key, v)
-			} else if value == "true" {
-				args.Set(key, true)
-			} else if value == "false" {
-				args.Set(key, false)
-			} else {
-				args.Set(key, value)
-			}
-		}
-
-		// Skip comma and whitespace
-		for i < len(argsStr) && (argsStr[i] == ',' || argsStr[i] == ' ' || argsStr[i] == '\t' || argsStr[i] == '\n') {
-			i++
-		}
-	}
-
-	return nil
-}
--- a/model/parsers/lfm2_test.go
+++ b/model/parsers/lfm2_test.go
--- a/model/parsers/nemotron3nano.go
+++ b/model/parsers/nemotron3nano.go
@@ -1,6 +1,7 @@
 package parsers

 import (
+	"regexp"
 	"strings"
 	"unicode"

@@ -13,114 +14,243 @@ const (
 	Nemotron3NanoCollectingThinking Nemotron3NanoParserState = iota
 	Nemotron3NanoSkipWhitespaceAfterThinking
 	Nemotron3NanoCollectingContent
+	Nemotron3NanoCollectingToolCalls
 )

 const (
-	nemotronThinkClose   = "</think>"
-	nemotronToolCallOpen = "<tool_call>"
+	nemotronThinkClose    = "</think>"
+	nemotronToolCallOpen  = "<tool_call>"
+	nemotronToolCallClose = "</tool_call>"
 )

+// Nemotron3NanoParser incrementally splits streamed model output into
+// thinking text, content text, and tool calls.
 type Nemotron3NanoParser struct {
-	state      Nemotron3NanoParserState
-	buffer     strings.Builder
-	toolParser *Qwen3CoderParser
+	// state is the section of the output currently being collected.
+	state  Nemotron3NanoParserState
+	// buffer holds input that has been received but not yet emitted.
+	buffer strings.Builder
+	// tools are the definitions passed to Init; they are consulted for
+	// parameter types when tool-call arguments are parsed.
+	tools  []api.Tool
 }

 func (p *Nemotron3NanoParser) HasToolSupport() bool     { return true }
 func (p *Nemotron3NanoParser) HasThinkingSupport() bool { return true }

+// Init records the available tools and selects the initial parser state.
+// The parser starts in content collection when thinking was not requested,
+// or when the last message is an assistant message with non-empty content;
+// otherwise it starts in thinking collection. tools is returned unchanged.
 func (p *Nemotron3NanoParser) Init(tools []api.Tool, lastMessage *api.Message, thinkValue *api.ThinkValue) []api.Tool {
-	p.toolParser = &Qwen3CoderParser{}
-	p.toolParser.Init(tools, nil, nil)
+	p.tools = tools

+	// Thinking is enabled only when the request explicitly asks for it;
+	// a nil thinkValue counts as disabled.
 	thinkingEnabled := thinkValue != nil && thinkValue.Bool()
+
 	prefill := lastMessage != nil && lastMessage.Role == "assistant"

-	if !thinkingEnabled || (prefill && lastMessage.Content != "") {
+	if !thinkingEnabled {
 		p.state = Nemotron3NanoCollectingContent
-	} else {
-		p.state = Nemotron3NanoCollectingThinking
+		return tools
 	}

+	// A prefilled assistant message that already has content is treated as
+	// being past the thinking section.
+	if prefill && lastMessage.Content != "" {
+		p.state = Nemotron3NanoCollectingContent
+		return tools
+	}
+
+	p.state = Nemotron3NanoCollectingThinking
 	return tools
 }

+// nemotronEvent is implemented by every event the parser can produce.
+// isNemotronEvent is a marker method with no behavior.
+type nemotronEvent interface {
+	isNemotronEvent()
+}
+
+// nemotronEventThinkingContent carries a piece of thinking text.
+type nemotronEventThinkingContent struct {
+	content string
+}
+
+// nemotronEventContent carries a piece of regular content text.
+type nemotronEventContent struct {
+	content string
+}
+
+// nemotronEventToolCall carries one parsed tool call.
+type nemotronEventToolCall struct {
+	toolCall api.ToolCall
+}
+
+// Marker-method implementations for the three event types.
+func (nemotronEventThinkingContent) isNemotronEvent() {}
+func (nemotronEventContent) isNemotronEvent()         {}
+func (nemotronEventToolCall) isNemotronEvent()        {}
+
+// Add appends s to the internal buffer, parses as many events as the buffer
+// allows, and returns the content text, thinking text, and tool calls that
+// were produced, each concatenated or collected in event order. The returned
+// error is always nil.
+//
+// NOTE(review): done is not consulted here, so text still held in the buffer
+// (trailing whitespace or a possible partial tag) is not flushed on the final
+// call — confirm this is intended.
 func (p *Nemotron3NanoParser) Add(s string, done bool) (content string, thinking string, calls []api.ToolCall, err error) {
-	if p.state == Nemotron3NanoCollectingContent {
-		return p.toolParser.Add(s, done)
+	p.buffer.WriteString(s)
+	events := p.parseEvents()
+
+	// Fold the event stream into the three result channels.
+	var toolCalls []api.ToolCall
+	var contentSb strings.Builder
+	var thinkingSb strings.Builder
+	for _, event := range events {
+		switch event := event.(type) {
+		case nemotronEventToolCall:
+			toolCalls = append(toolCalls, event.toolCall)
+		case nemotronEventThinkingContent:
+			thinkingSb.WriteString(event.content)
+		case nemotronEventContent:
+			contentSb.WriteString(event.content)
+		}
 	}

-	if p.state == Nemotron3NanoSkipWhitespaceAfterThinking {
-		s = strings.TrimLeftFunc(s, unicode.IsSpace)
-		if s == "" {
-			return "", "", nil, nil
+	return contentSb.String(), thinkingSb.String(), toolCalls, nil
+}
+
+// parseEvents drains the buffer by calling eat repeatedly until eat reports
+// that no further progress can be made, and returns every event produced
+// along the way in order.
+func (p *Nemotron3NanoParser) parseEvents() []nemotronEvent {
+	var collected []nemotronEvent
+	for {
+		batch, more := p.eat()
+		// Appending an empty batch leaves collected unchanged.
+		collected = append(collected, batch...)
+		if !more {
+			return collected
+		}
+	}
+}
+
+// emitWithPartialCheck splits bufStr into a prefix that is safe to emit now
+// and a suffix that must stay buffered. The suffix is any trailing partial
+// match of tag (as reported by overlap) together with the whitespace that
+// directly precedes it; when there is no partial match, only trailing
+// whitespace is held back.
+func (p *Nemotron3NanoParser) emitWithPartialCheck(bufStr, tag string) (unambiguous, ambiguous string) {
+	stable := bufStr
+	if partial := overlap(bufStr, tag); partial > 0 {
+		stable = bufStr[:len(bufStr)-partial]
+	}
+	cut := len(stable) - trailingWhitespaceLen(stable)
+	return bufStr[:cut], bufStr[cut:]
+}
+
+// eat performs one parsing step on the buffered input according to the
+// current state. It returns the events produced by that step and a flag
+// telling the caller whether another step may make further progress.
+// Input that cannot be classified yet (trailing whitespace, a possible
+// partial tag, or an unterminated tool call) is left in the buffer.
+func (p *Nemotron3NanoParser) eat() ([]nemotronEvent, bool) {
+	bufStr := p.buffer.String()
+	if bufStr == "" {
+		return nil, false
+	}
+
+	switch p.state {
+	case Nemotron3NanoCollectingThinking:
+		if strings.Contains(bufStr, nemotronThinkClose) {
+			split := strings.SplitN(bufStr, nemotronThinkClose, 2)
+			thinking := strings.TrimRightFunc(split[0], unicode.IsSpace)
+			p.buffer.Reset()
+			remainder := strings.TrimLeftFunc(split[1], unicode.IsSpace)
+			p.buffer.WriteString(remainder)
+			// Transition to whitespace-skipping state if buffer is empty,
+			// otherwise go directly to content collection
+			if remainder == "" {
+				p.state = Nemotron3NanoSkipWhitespaceAfterThinking
+			} else {
+				p.state = Nemotron3NanoCollectingContent
+			}
+			if thinking != "" {
+				return []nemotronEvent{nemotronEventThinkingContent{content: thinking}}, true
+			}
+			return nil, true
+		}
+		unambig, ambig := p.emitWithPartialCheck(bufStr, nemotronThinkClose)
+		p.buffer.Reset()
+		p.buffer.WriteString(ambig)
+		if unambig != "" {
+			return []nemotronEvent{nemotronEventThinkingContent{content: unambig}}, false
+		}
+		return nil, false
+
+	// We only want to skip whitespace between thinking and content
+	case Nemotron3NanoSkipWhitespaceAfterThinking:
+		bufStr = strings.TrimLeftFunc(bufStr, unicode.IsSpace)
+		p.buffer.Reset()
+		p.buffer.WriteString(bufStr)
+		if bufStr == "" {
+			return nil, false
 		}
 		p.state = Nemotron3NanoCollectingContent
-		return p.toolParser.Add(s, done)
-	}
+		return nil, true

-	// Nemotron3NanoCollectingThinking - buffer and look for end markers
-	p.buffer.WriteString(s)
-	bufStr := p.buffer.String()
-
-	// Look for end of thinking: </think> or <tool_call> (model may skip </think>)
-	thinkIdx := strings.Index(bufStr, nemotronThinkClose)
-	toolIdx := strings.Index(bufStr, nemotronToolCallOpen)
-
-	var endIdx int = -1
-	var remainder string
-
-	if thinkIdx != -1 && (toolIdx == -1 || thinkIdx < toolIdx) {
-		endIdx = thinkIdx
-		remainder = strings.TrimLeftFunc(bufStr[thinkIdx+len(nemotronThinkClose):], unicode.IsSpace)
-	} else if toolIdx != -1 {
-		endIdx = toolIdx
-		remainder = bufStr[toolIdx:] // Include <tool_call> tag
-	}
-
-	if endIdx != -1 {
-		thinking = strings.TrimRightFunc(bufStr[:endIdx], unicode.IsSpace)
-		p.buffer.Reset()
-
-		if remainder == "" {
-			p.state = Nemotron3NanoSkipWhitespaceAfterThinking
-		} else {
-			p.state = Nemotron3NanoCollectingContent
-			content, _, calls, err = p.toolParser.Add(remainder, done)
+	case Nemotron3NanoCollectingContent:
+		if strings.Contains(bufStr, nemotronToolCallOpen) {
+			split := strings.SplitN(bufStr, nemotronToolCallOpen, 2)
+			content := strings.TrimRightFunc(split[0], unicode.IsSpace)
+			p.buffer.Reset()
+			p.buffer.WriteString(split[1])
+			p.state = Nemotron3NanoCollectingToolCalls
+			if content != "" {
+				return []nemotronEvent{nemotronEventContent{content: content}}, true
+			}
+			return nil, true
 		}
-		return content, thinking, calls, err
+		unambig, ambig := p.emitWithPartialCheck(bufStr, nemotronToolCallOpen)
+		p.buffer.Reset()
+		p.buffer.WriteString(ambig)
+		if unambig != "" {
+			return []nemotronEvent{nemotronEventContent{content: unambig}}, false
+		}
+		return nil, false
+
+	case Nemotron3NanoCollectingToolCalls:
+		// Wait until the closing tag has arrived before parsing the body.
+		if strings.Contains(bufStr, nemotronToolCallClose) {
+			split := strings.SplitN(bufStr, nemotronToolCallClose, 2)
+			remaining := strings.TrimLeftFunc(split[1], unicode.IsSpace)
+			p.buffer.Reset()
+			p.buffer.WriteString(remaining)
+
+			var events []nemotronEvent
+			// parseToolCall reports a missing <function=...> tag as an empty
+			// function name with a nil error; drop such bodies instead of
+			// emitting a nameless tool call.
+			if tc, err := p.parseToolCall(split[0]); err == nil && tc.Function.Name != "" {
+				events = append(events, nemotronEventToolCall{toolCall: tc})
+			}
+
+			// Always return to content collection. That state re-enters
+			// tool-call collection as soon as it sees the next <tool_call>,
+			// and it emits any text that precedes the tag rather than letting
+			// it be swallowed into the next tool call body.
+			p.state = Nemotron3NanoCollectingContent
+			return events, true
+		}
+		return nil, false
 	}

-	// No end marker - emit unambiguous thinking
-	thinking = p.emitThinking(bufStr)
-	return "", thinking, nil, nil
+	return nil, false
 }

-// emitThinking returns unambiguous thinking content, keeping potential partial tags in buffer
-func (p *Nemotron3NanoParser) emitThinking(bufStr string) string {
-	// Check for partial </think> or <tool_call> at end
-	thinkOverlap := overlap(bufStr, nemotronThinkClose)
-	toolOverlap := overlap(bufStr, nemotronToolCallOpen)
-	maxOverlap := max(thinkOverlap, toolOverlap)
+// Patterns used to pull the function name and parameters out of a tool-call body.
+var (
+	// Captures NAME from "<function=NAME>"; NAME is one or more characters other than '>'.
+	nemotronFunctionRegex  = regexp.MustCompile(`<function=([^>]+)>`)
+	// Captures the parameter name and its value from "<parameter=NAME>...</parameter>".
+	// The value is matched lazily (including newlines) up to the first closing
+	// tag; one newline directly after the opening tag and one directly before
+	// the closing tag are excluded from the captured value.
+	nemotronParameterRegex = regexp.MustCompile(`<parameter=([^>]+)>\n?([\s\S]*?)\n?</parameter>`)
+)

-	if maxOverlap > 0 {
-		unambiguous := bufStr[:len(bufStr)-maxOverlap]
-		unambiguous = strings.TrimRightFunc(unambiguous, unicode.IsSpace)
-		p.buffer.Reset()
-		p.buffer.WriteString(bufStr[len(bufStr)-maxOverlap:])
-		return unambiguous
+// parseToolCall builds a tool call from the text found between the tool-call
+// tags. The function name comes from the first <function=...> tag and each
+// <parameter=...> element becomes one argument, with its whitespace-trimmed
+// value converted by parseParamValue.
+//
+// When content has no <function=...> tag, the zero-value ToolCall is returned
+// together with a nil error; the error result is nil on every path, so
+// callers must check the function name to detect that case.
+func (p *Nemotron3NanoParser) parseToolCall(content string) (api.ToolCall, error) {
+	toolCall := api.ToolCall{}
+
+	// Extract function name
+	fnMatch := nemotronFunctionRegex.FindStringSubmatch(content)
+	if len(fnMatch) < 2 {
+		return toolCall, nil
+	}
+	toolCall.Function.Name = fnMatch[1]
+
+	// Extract parameters
+	toolCall.Function.Arguments = api.NewToolCallFunctionArguments()
+	paramMatches := nemotronParameterRegex.FindAllStringSubmatch(content, -1)
+	for _, match := range paramMatches {
+		if len(match) >= 3 {
+			paramName := match[1]
+			paramValue := strings.TrimSpace(match[2])
+
+			// Try to parse as typed value based on tool definition
+			toolCall.Function.Arguments.Set(paramName, p.parseParamValue(paramName, paramValue))
+		}
 	}

-	// No partial tags - emit all but trailing whitespace
-	wsLen := trailingWhitespaceLen(bufStr)
-	if wsLen > 0 {
-		unambiguous := bufStr[:len(bufStr)-wsLen]
-		p.buffer.Reset()
-		p.buffer.WriteString(bufStr[len(bufStr)-wsLen:])
-		return unambiguous
-	}
-
-	// Nothing to hold back
-	p.buffer.Reset()
-	return bufStr
+	return toolCall, nil
+}
+
+// parseParamValue converts the raw text of a parameter into a value via
+// parseValue, using the declared type of the first property named paramName
+// found in any of the parser's tools. The lookup is by parameter name only
+// and is not restricted to the tool being called. When no tool declares the
+// parameter, parseValue receives the zero-value property type.
+func (p *Nemotron3NanoParser) parseParamValue(paramName string, raw string) any {
+	for _, candidate := range p.tools {
+		props := candidate.Function.Parameters.Properties
+		if props == nil {
+			continue
+		}
+		if prop, ok := props.Get(paramName); ok {
+			return parseValue(raw, prop.Type)
+		}
+	}
+
+	var untyped api.PropertyType
+	return parseValue(raw, untyped)
 }
--- a/model/parsers/nemotron3nano_test.go
+++ b/model/parsers/nemotron3nano_test.go
@@ -8,8 +8,6 @@ import (
 	"github.com/ollama/ollama/api"
 )

-// TestNemotron3NanoParser tests Nemotron-specific behavior (thinking support).
-// Tool call parsing is tested in qwen3coder_test.go since Nemotron delegates to Qwen3CoderParser.
 func TestNemotron3NanoParser(t *testing.T) {
 	tests := []struct {
 		name             string
@@ -19,6 +17,18 @@ func TestNemotron3NanoParser(t *testing.T) {
 		expectedThinking string
 		expectedCalls    []api.ToolCall
 	}{
+		{
+			name:            "simple content - no thinking",
+			input:           "Hello, how can I help you?",
+			thinkValue:      nil,
+			expectedContent: "Hello, how can I help you?",
+		},
+		{
+			name:            "simple content - thinking disabled",
+			input:           "Hello, how can I help you?",
+			thinkValue:      &api.ThinkValue{Value: false},
+			expectedContent: "Hello, how can I help you?",
+		},
 		{
 			name:             "thinking then content",
 			input:            "Let me think about this...</think>\nHere is my answer.",
@@ -33,6 +43,69 @@ func TestNemotron3NanoParser(t *testing.T) {
 			expectedThinking: "Step 1: Analyze\nStep 2: Process\nStep 3: Conclude",
 			expectedContent:  "The answer is 42.",
 		},
+		{
+			name:       "simple tool call",
+			input:      "<tool_call>\n<function=get_weather>\n<parameter=city>\nParis\n</parameter>\n</function>\n</tool_call>",
+			thinkValue: nil,
+			expectedCalls: []api.ToolCall{
+				{
+					Function: api.ToolCallFunction{
+						Name:      "get_weather",
+						Arguments: testArgs(map[string]any{"city": "Paris"}),
+					},
+				},
+			},
+		},
+		{
+			name:            "content then tool call",
+			input:           "Let me check the weather.\n<tool_call>\n<function=get_weather>\n<parameter=city>\nNYC\n</parameter>\n</function>\n</tool_call>",
+			thinkValue:      nil,
+			expectedContent: "Let me check the weather.",
+			expectedCalls: []api.ToolCall{
+				{
+					Function: api.ToolCallFunction{
+						Name:      "get_weather",
+						Arguments: testArgs(map[string]any{"city": "NYC"}),
+					},
+				},
+			},
+		},
+		{
+			name:       "tool call with multiple parameters",
+			input:      "<tool_call>\n<function=book_flight>\n<parameter=from>\nSFO\n</parameter>\n<parameter=to>\nNYC\n</parameter>\n</function>\n</tool_call>",
+			thinkValue: nil,
+			expectedCalls: []api.ToolCall{
+				{
+					Function: api.ToolCallFunction{
+						Name: "book_flight",
+						Arguments: testArgs(map[string]any{
+							"from": "SFO",
+							"to":   "NYC",
+						}),
+					},
+				},
+			},
+		},
+		{
+			name: "multiple tool calls",
+			input: "<tool_call>\n<function=get_weather>\n<parameter=city>\nSan Francisco\n</parameter>\n</function>\n</tool_call>\n" +
+				"<tool_call>\n<function=get_weather>\n<parameter=city>\nNew York\n</parameter>\n</function>\n</tool_call>",
+			thinkValue: nil,
+			expectedCalls: []api.ToolCall{
+				{
+					Function: api.ToolCallFunction{
+						Name:      "get_weather",
+						Arguments: testArgs(map[string]any{"city": "San Francisco"}),
+					},
+				},
+				{
+					Function: api.ToolCallFunction{
+						Name:      "get_weather",
+						Arguments: testArgs(map[string]any{"city": "New York"}),
+					},
+				},
+			},
+		},
 		{
 			name:             "thinking then tool call",
 			input:            "I should check the weather...</think>\n<tool_call>\n<function=get_weather>\n<parameter=city>\nParis\n</parameter>\n</function>\n</tool_call>",
@@ -62,6 +135,19 @@ func TestNemotron3NanoParser(t *testing.T) {
 				},
 			},
 		},
+		{
+			name:       "tool call with multiline parameter value",
+			input:      "<tool_call>\n<function=create_note>\n<parameter=content>\nLine 1\nLine 2\nLine 3\n</parameter>\n</function>\n</tool_call>",
+			thinkValue: nil,
+			expectedCalls: []api.ToolCall{
+				{
+					Function: api.ToolCallFunction{
+						Name:      "create_note",
+						Arguments: testArgs(map[string]any{"content": "Line 1\nLine 2\nLine 3"}),
+					},
+				},
+			},
+		},
 		{
 			name:             "empty thinking block - immediate close",
 			input:            "</think>\nHere is my answer.",
@@ -75,6 +161,18 @@ func TestNemotron3NanoParser(t *testing.T) {
 			thinkValue:      &api.ThinkValue{Value: false},
 			expectedContent: "</think>\nSome content after spurious tag.",
 		},
+		{
+			name:          "tool call with no function name - returns empty tool call",
+			input:         "<tool_call>\n<function=>\n</function>\n</tool_call>",
+			thinkValue:    nil,
+			expectedCalls: []api.ToolCall{{Function: api.ToolCallFunction{Name: "", Arguments: api.NewToolCallFunctionArguments()}}},
+		},
+		{
+			name:            "content with newlines preserved",
+			input:           "Line 1\n\nLine 2\n\n\nLine 3",
+			thinkValue:      nil,
+			expectedContent: "Line 1\n\nLine 2\n\n\nLine 3",
+		},
 		{
 			name:             "thinking with only whitespace after close tag",
 			input:            "My thoughts...</think>   \n\t\n   Content here.",
@@ -82,6 +180,25 @@ func TestNemotron3NanoParser(t *testing.T) {
 			expectedThinking: "My thoughts...",
 			expectedContent:  "Content here.",
 		},
+		{
+			name:            "unicode content",
+			input:           "Hello 世界! 🌍 Ñoño",
+			thinkValue:      nil,
+			expectedContent: "Hello 世界! 🌍 Ñoño",
+		},
+		{
+			name:       "tool call with numeric parameter",
+			input:      "<tool_call>\n<function=set_temp>\n<parameter=value>\n42\n</parameter>\n</function>\n</tool_call>",
+			thinkValue: nil,
+			expectedCalls: []api.ToolCall{
+				{
+					Function: api.ToolCallFunction{
+						Name:      "set_temp",
+						Arguments: testArgs(map[string]any{"value": "42"}),
+					},
+				},
+			},
+		},
 	}

 	for _, tt := range tests {
@@ -116,8 +233,6 @@ func TestNemotron3NanoParser(t *testing.T) {
 	}
 }

-// TestNemotron3NanoParser_Streaming tests streaming behavior for thinking support.
-// Tool call streaming is tested in qwen3coder_test.go.
 func TestNemotron3NanoParser_Streaming(t *testing.T) {
 	tests := []struct {
 		name             string
@@ -127,6 +242,18 @@ func TestNemotron3NanoParser_Streaming(t *testing.T) {
 		expectedThinking string
 		expectedCalls    []api.ToolCall
 	}{
+		{
+			name:            "streaming content character by character",
+			chunks:          []string{"H", "e", "l", "l", "o", ",", " ", "w", "o", "r", "l", "d", "!"},
+			thinkValue:      nil,
+			expectedContent: "Hello, world!",
+		},
+		{
+			name:            "streaming content small tokens",
+			chunks:          []string{"Hel", "lo", ", ", "how ", "can", " I", " help", " you", " today", "?"},
+			thinkValue:      nil,
+			expectedContent: "Hello, how can I help you today?",
+		},
 		{
 			name:             "streaming thinking then content - granular",
 			chunks:           []string{"Let", " me", " th", "ink", " about", " this", "...", "<", "/", "think", ">", "\n", "Here", " is", " my", " answer", "."},
@@ -141,6 +268,45 @@ func TestNemotron3NanoParser_Streaming(t *testing.T) {
 			expectedThinking: "Step 1: Analyze\nStep 2: Process",
 			expectedContent:  "The answer.",
 		},
+		{
+			name:       "streaming tool call - highly granular",
+			chunks:     []string{"<", "tool", "_", "call", ">", "\n", "<", "func", "tion", "=", "get", "_", "weather", ">", "\n", "<", "param", "eter", "=", "city", ">", "\n", "Par", "is", "\n", "</", "param", "eter", ">", "\n", "</", "func", "tion", ">", "\n", "</", "tool", "_", "call", ">"},
+			thinkValue: nil,
+			expectedCalls: []api.ToolCall{
+				{
+					Function: api.ToolCallFunction{
+						Name:      "get_weather",
+						Arguments: testArgs(map[string]any{"city": "Paris"}),
+					},
+				},
+			},
+		},
+		{
+			name:            "streaming content then tool call - granular",
+			chunks:          []string{"Let", " me", " check", " the", " weather", ".", "\n<", "tool_call", ">", "\n", "<function=", "get_weather", ">", "\n", "<parameter=", "city", ">", "\n", "NYC", "\n", "</parameter>", "\n", "</function>", "\n", "</tool_call>"},
+			thinkValue:      nil,
+			expectedContent: "Let me check the weather.",
+			expectedCalls: []api.ToolCall{
+				{
+					Function: api.ToolCallFunction{
+						Name:      "get_weather",
+						Arguments: testArgs(map[string]any{"city": "NYC"}),
+					},
+				},
+			},
+		},
+		{
+			name:   "tool call tag split character by character",
+			chunks: []string{"<", "t", "o", "o", "l", "_", "c", "a", "l", "l", ">", "\n", "<", "f", "u", "n", "c", "t", "i", "o", "n", "=", "t", "e", "s", "t", ">", "\n", "<", "/", "f", "u", "n", "c", "t", "i", "o", "n", ">", "\n", "<", "/", "t", "o", "o", "l", "_", "c", "a", "l", "l", ">"},
+			expectedCalls: []api.ToolCall{
+				{
+					Function: api.ToolCallFunction{
+						Name:      "test",
+						Arguments: api.NewToolCallFunctionArguments(),
+					},
+				},
+			},
+		},
 		{
 			name:             "thinking close tag split character by character",
 			chunks:           []string{"I", "'", "m", " ", "t", "h", "i", "n", "k", "i", "n", "g", ".", ".", ".", "<", "/", "t", "h", "i", "n", "k", ">", "\n", "D", "o", "n", "e", "!"},
@@ -155,6 +321,22 @@ func TestNemotron3NanoParser_Streaming(t *testing.T) {
 			expectedThinking: "Thinking...",
 			expectedContent:  "Content here.",
 		},
+		{
+			name:       "tool call with multiple parameters - streaming",
+			chunks:     []string{"<tool_", "call>\n", "<function", "=book_", "flight>", "\n<para", "meter=", "from>\n", "SFO\n", "</param", "eter>", "\n<param", "eter=to", ">\nNYC", "\n</para", "meter>", "\n</func", "tion>\n", "</tool_", "call>"},
+			thinkValue: nil,
+			expectedCalls: []api.ToolCall{
+				{
+					Function: api.ToolCallFunction{
+						Name: "book_flight",
+						Arguments: testArgs(map[string]any{
+							"from": "SFO",
+							"to":   "NYC",
+						}),
+					},
+				},
+			},
+		},
 		{
 			name:             "thinking then content then tool call - streaming",
 			chunks:           []string{"Ana", "lyzing", " your", " request", "...", "</", "think", ">\n", "I'll", " check", " that", " for", " you", ".", "\n", "<tool", "_call", ">\n", "<function", "=search", ">\n", "<parameter", "=query", ">\n", "test", " query", "\n</", "parameter", ">\n", "</function", ">\n", "</tool", "_call", ">"},
@@ -170,6 +352,45 @@ func TestNemotron3NanoParser_Streaming(t *testing.T) {
 				},
 			},
 		},
+		{
+			name: "multiple tool calls - streaming",
+			chunks: []string{
+				"<tool_call>", "\n", "<function=", "get_weather>", "\n",
+				"<parameter=", "city>\n", "San Fran", "cisco\n", "</parameter>", "\n",
+				"</function>", "\n", "</tool_call>", "\n",
+				"<tool_", "call>\n", "<function", "=get_weather", ">\n",
+				"<param", "eter=city", ">\nNew", " York\n", "</parameter>\n",
+				"</function>\n", "</tool_call>",
+			},
+			thinkValue: nil,
+			expectedCalls: []api.ToolCall{
+				{
+					Function: api.ToolCallFunction{
+						Name:      "get_weather",
+						Arguments: testArgs(map[string]any{"city": "San Francisco"}),
+					},
+				},
+				{
+					Function: api.ToolCallFunction{
+						Name:      "get_weather",
+						Arguments: testArgs(map[string]any{"city": "New York"}),
+					},
+				},
+			},
+		},
+		{
+			name:       "tool call with multiline parameter - streaming",
+			chunks:     []string{"<tool_call>\n", "<function=", "create_note>\n", "<parameter=", "content>\n", "Line 1", "\nLine", " 2\n", "Line 3", "\n</parameter>\n", "</function>\n", "</tool_call>"},
+			thinkValue: nil,
+			expectedCalls: []api.ToolCall{
+				{
+					Function: api.ToolCallFunction{
+						Name:      "create_note",
+						Arguments: testArgs(map[string]any{"content": "Line 1\nLine 2\nLine 3"}),
+					},
+				},
+			},
+		},
 		{
 			name:             "empty thinking block",
 			chunks:           []string{"</think>", "\n", "Just content."},
@@ -177,6 +398,12 @@ func TestNemotron3NanoParser_Streaming(t *testing.T) {
 			expectedThinking: "",
 			expectedContent:  "Just content.",
 		},
+		{
+			name:            "empty input chunks interspersed",
+			chunks:          []string{"Hello", "", " ", "", "world", "", "!"},
+			thinkValue:      nil,
+			expectedContent: "Hello world!",
+		},
 		{
 			name:             "tool call immediately after think close - no content",
 			chunks:           []string{"Analyzing...", "</think>", "\n", "<tool_call>", "\n<function=test>\n</function>\n", "</tool_call>"},
@@ -191,6 +418,25 @@ func TestNemotron3NanoParser_Streaming(t *testing.T) {
 				},
 			},
 		},
+		{
+			name:       "tool call with empty parameter value",
+			chunks:     []string{"<tool_call>\n<function=test>\n<parameter=name>\n", "\n</parameter>\n</function>\n</tool_call>"},
+			thinkValue: nil,
+			expectedCalls: []api.ToolCall{
+				{
+					Function: api.ToolCallFunction{
+						Name:      "test",
+						Arguments: testArgs(map[string]any{"name": ""}),
+					},
+				},
+			},
+		},
+		{
+			name:            "partial tool call tag at end - buffered",
+			chunks:          []string{"Here's some content", "<tool"},
+			thinkValue:      nil,
+			expectedContent: "Here's some content",
+		},
 	}

 	for _, tt := range tests {
@@ -326,65 +572,3 @@ func TestNemotron3NanoParser_WithTools(t *testing.T) {
 		t.Errorf("calls mismatch (-got +want):\n%s", diff)
 	}
 }
-
-// TestNemotron3NanoParser_ToolCallWithoutThinkClose tests the case where thinking is enabled
-// but the model outputs content + tool call WITHOUT the </think> tag.
-// The parser should still parse the tool call (content before is treated as thinking).
-func TestNemotron3NanoParser_ToolCallWithoutThinkClose(t *testing.T) {
-	chunks := []string{
-		"Let", " me", " analyze", " this", ".", "\n",
-		"<tool_call>", "\n",
-		"<function=get_weather>", "\n",
-		"<parameter=city>", "Paris", "</parameter>", "\n",
-		"</function>", "\n",
-		"</tool_call>",
-	}
-
-	p := &Nemotron3NanoParser{}
-	p.Init(nil, nil, &api.ThinkValue{Value: true}) // thinking ENABLED but model doesn't output </think>
-
-	var allContent string
-	var allThinking string
-	var allCalls []api.ToolCall
-
-	for _, chunk := range chunks {
-		content, thinking, calls, err := p.Add(chunk, false)
-		if err != nil {
-			t.Fatalf("unexpected error: %v", err)
-		}
-		allContent += content
-		allThinking += thinking
-		allCalls = append(allCalls, calls...)
-	}
-
-	// Drain
-	content, thinking, calls, err := p.Add("", true)
-	if err != nil {
-		t.Fatalf("unexpected error on done: %v", err)
-	}
-	allContent += content
-	allThinking += thinking
-	allCalls = append(allCalls, calls...)
-
-	// The parser was in thinking mode, so text before <tool_call> is emitted as thinking.
-	expectedThinking := "Let me analyze this."
-
-	expectedCalls := []api.ToolCall{
-		{
-			Function: api.ToolCallFunction{
-				Name:      "get_weather",
-				Arguments: testArgs(map[string]any{"city": "Paris"}),
-			},
-		},
-	}
-
-	if allContent != "" {
-		t.Errorf("expected no content (text was streamed as thinking), got: %q", allContent)
-	}
-	if diff := cmp.Diff(allThinking, expectedThinking); diff != "" {
-		t.Errorf("thinking mismatch (-got +want):\n%s", diff)
-	}
-	if diff := cmp.Diff(allCalls, expectedCalls, argsComparer); diff != "" {
-		t.Errorf("calls mismatch (-got +want):\n%s", diff)
-	}
-}
--- a/model/parsers/parsers.go
+++ b/model/parsers/parsers.go
@@ -68,12 +68,6 @@ func ParserForName(name string) Parser {
 		return &Nemotron3NanoParser{}
 	case "functiongemma":
 		return &FunctionGemmaParser{}
-	case "glm-4.7":
-		return &GLM47Parser{}
-	case "lfm2":
-		return &LFM2Parser{hasThinkingSupport: false}
-	case "lfm2-thinking":
-		return &LFM2Parser{hasThinkingSupport: true}
 	default:
 		return nil
 	}
--- a/model/parsers/qwen3coder_test.go
+++ b/model/parsers/qwen3coder_test.go
@@ -91,37 +91,6 @@ func TestQwenParserStreaming(t *testing.T) {
 				},
 			},
 		},
-		{
-			desc: "tool call tags split character by character",
-			steps: []step{
-				{input: "<", wantEvents: []qwenEvent{}},
-				{input: "t", wantEvents: []qwenEvent{}},
-				{input: "o", wantEvents: []qwenEvent{}},
-				{input: "o", wantEvents: []qwenEvent{}},
-				{input: "l", wantEvents: []qwenEvent{}},
-				{input: "_", wantEvents: []qwenEvent{}},
-				{input: "c", wantEvents: []qwenEvent{}},
-				{input: "a", wantEvents: []qwenEvent{}},
-				{input: "l", wantEvents: []qwenEvent{}},
-				{input: "l", wantEvents: []qwenEvent{}},
-				{input: ">", wantEvents: []qwenEvent{}},
-				{input: "a", wantEvents: []qwenEvent{}},
-				{input: "b", wantEvents: []qwenEvent{}},
-				{input: "c", wantEvents: []qwenEvent{}},
-				{input: "<", wantEvents: []qwenEvent{}},
-				{input: "/", wantEvents: []qwenEvent{}},
-				{input: "t", wantEvents: []qwenEvent{}},
-				{input: "o", wantEvents: []qwenEvent{}},
-				{input: "o", wantEvents: []qwenEvent{}},
-				{input: "l", wantEvents: []qwenEvent{}},
-				{input: "_", wantEvents: []qwenEvent{}},
-				{input: "c", wantEvents: []qwenEvent{}},
-				{input: "a", wantEvents: []qwenEvent{}},
-				{input: "l", wantEvents: []qwenEvent{}},
-				{input: "l", wantEvents: []qwenEvent{}},
-				{input: ">", wantEvents: []qwenEvent{qwenEventRawToolCall{raw: "abc"}}},
-			},
-		},
 		{
 			desc: "trailing whitespace between content and tool call",
 			steps: []step{
--- a/model/parsers/testhelpers_test.go
+++ b/model/parsers/testhelpers_test.go
@@ -96,11 +96,3 @@ func testArgs(m map[string]any) api.ToolCallFunctionArguments {
 	}
 	return args
 }
-
-func args(s string) api.ToolCallFunctionArguments {
-	var result api.ToolCallFunctionArguments
-	if err := json.Unmarshal([]byte(s), &result); err != nil {
-		panic("invalid JSON in args(): " + err.Error())
-	}
-	return result
-}
--- a/model/renderers/glm46.go
+++ b/model/renderers/glm46.go
@@ -1,110 +0,0 @@
-package renderers
-
-import (
-	"encoding/json"
-	"fmt"
-	"strings"
-
-	"github.com/ollama/ollama/api"
-)
-
-type GLM46Renderer struct{}
-
-func (r *GLM46Renderer) Render(messages []api.Message, tools []api.Tool, thinkValue *api.ThinkValue) (string, error) {
-	var sb strings.Builder
-
-	sb.WriteString("[gMASK]<sop>")
-
-	var lastUserIndex int
-	for i, message := range messages {
-		if message.Role == "user" {
-			lastUserIndex = i
-		}
-	}
-
-	if len(tools) > 0 {
-		sb.WriteString("<|system|>\n")
-		sb.WriteString("# Tools\n\n")
-		sb.WriteString("You may call one or more functions to assist with the user query.\n\n")
-		sb.WriteString("You are provided with function signatures within <tools></tools> XML tags:\n")
-		sb.WriteString("<tools>\n")
-		for _, tool := range tools {
-			d, _ := json.Marshal(tool)
-			sb.WriteString(string(d) + "\n")
-		}
-		sb.WriteString("</tools>\n\n")
-		sb.WriteString("For each function call, output the function name and arguments within the following XML format:\n")
-		sb.WriteString("<tool_call>{function-name}\n")
-		sb.WriteString("<arg_key>{arg-key-1}</arg_key>\n")
-		sb.WriteString("<arg_value>{arg-value-1}</arg_value>\n")
-		sb.WriteString("<arg_key>{arg-key-2}</arg_key>\n")
-		sb.WriteString("<arg_value>{arg-value-2}</arg_value>\n")
-		sb.WriteString("...\n")
-		sb.WriteString("</tool_call>")
-	}
-
-	for i, message := range messages {
-		switch message.Role {
-		case "user":
-			sb.WriteString("<|user|>\n")
-			sb.WriteString(message.Content)
-			if thinkValue != nil && !thinkValue.Bool() && !strings.HasSuffix(message.Content, "/nothink") {
-				sb.WriteString("/nothink")
-			}
-		case "assistant":
-			sb.WriteString("<|assistant|>")
-			if i > lastUserIndex {
-				if message.Thinking != "" {
-					sb.WriteString("\n<think>" + message.Thinking + "</think>")
-				} else {
-					sb.WriteString("\n<think></think>")
-				}
-			}
-			if message.Content != "" {
-				sb.WriteString("\n" + message.Content)
-			}
-			if len(message.ToolCalls) > 0 {
-				for _, toolCall := range message.ToolCalls {
-					sb.WriteString("\n<tool_call>" + toolCall.Function.Name + "\n")
-					for key, value := range toolCall.Function.Arguments.All() {
-						sb.WriteString("<arg_key>" + key + "</arg_key>\n")
-
-						var valueStr string
-						if str, ok := value.(string); ok {
-							valueStr = str
-						} else {
-							jsonBytes, err := json.Marshal(value)
-							if err != nil {
-								valueStr = fmt.Sprintf("%v", value)
-							} else {
-								valueStr = string(jsonBytes)
-							}
-						}
-
-						sb.WriteString("<arg_value>" + valueStr + "</arg_value>\n")
-					}
-
-					sb.WriteString("</tool_call>")
-				}
-			}
-		case "tool":
-			if i == 0 || messages[i-1].Role != "tool" {
-				sb.WriteString("<|observation|>")
-			}
-			sb.WriteString("\n<tool_response>\n")
-			sb.WriteString(message.Content)
-			sb.WriteString("\n</tool_response>")
-		case "system":
-			sb.WriteString("<|system|>\n")
-			sb.WriteString(message.Content)
-		}
-	}
-
-	// Add generation prompt
-	sb.WriteString("<|assistant|>")
-	if thinkValue != nil && !thinkValue.Bool() {
-		sb.WriteString("\n<think></think>\n")
-	}
-
-	return sb.String(), nil
-}
--- a/model/renderers/glm46_test.go
+++ b/model/renderers/glm46_test.go
@@ -1,223 +0,0 @@
-package renderers
-
-import (
-	"testing"
-
-	"github.com/google/go-cmp/cmp"
-	"github.com/ollama/ollama/api"
-)
-
-func TestGLM46Renderer(t *testing.T) {
-	tests := []struct {
-		name       string
-		messages   []api.Message
-		tools      []api.Tool
-		thinkValue *api.ThinkValue
-		expected   string
-		skip       string
-	}{
-		{
-			name: "basic",
-			messages: []api.Message{
-				{Role: "user", Content: "Hello, how are you?"},
-			},
-			expected: `[gMASK]<sop><|user|>
-Hello, how are you?<|assistant|>`,
-		},
-		{
-			name: "basic with system message",
-			messages: []api.Message{
-				{Role: "system", Content: "You are a helpful assistant."},
-				{Role: "user", Content: "Hello, how are you?"},
-			},
-			expected: `[gMASK]<sop><|system|>
-You are a helpful assistant.<|user|>
-Hello, how are you?<|assistant|>`,
-		},
-		{
-			name: "basic with user assistant user",
-			messages: []api.Message{
-				{Role: "user", Content: "What is the capital of France?"},
-				{Role: "assistant", Thinking: "Let me analyze the request...", Content: "The capital of France is Paris."},
-				{Role: "user", Content: "Fantastic!"},
-			},
-			expected: `[gMASK]<sop><|user|>
-What is the capital of France?<|assistant|>
-The capital of France is Paris.<|user|>
-Fantastic!<|assistant|>`,
-		},
-		{
-			skip: "tool call ordering not guaranteed yet",
-			name: "tools",
-			messages: []api.Message{
-				{Role: "system", Content: "You are a helpful assistant with access to tools."},
-				{Role: "user", Content: "What is the weather like in Tokyo?"},
-			},
-			tools: []api.Tool{
-				{
-					Type: "function",
-					Function: api.ToolFunction{
-						Name:        "get_weather",
-						Description: "Get the current weather in a given location",
-						Parameters: api.ToolFunctionParameters{
-							Type:       "object",
-							Required:   []string{"location"},
-							Properties: propsMap(`{"location": {"type": "string", "description": "The city and state, e.g. San Francisco, CA"}, "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}}`),
-						},
-					},
-				},
-			},
-			expected: `[gMASK]<sop><|system|>
-# Tools
-
-You may call one or more functions to assist with the user query.
-
-You are provided with function signatures within <tools></tools> XML tags:
-<tools>
-{"type":"function","function":{"name":"get_weather","description":"Get the current weather in a given location","parameters":{"type":"object","required":["location"],"properties":{"location":{"type":"string","description":"The city and state, e.g. San Francisco, CA"},"unit":{"type":"string","description":"","enum":["celsius","fahrenheit"]}}}}}
-</tools>
-
-For each function call, output the function name and arguments within the following XML format:
-<tool_call>{function-name}
-<arg_key>{arg-key-1}</arg_key>
-<arg_value>{arg-value-1}</arg_value>
-<arg_key>{arg-key-2}</arg_key>
-<arg_value>{arg-value-2}</arg_value>
-...
-</tool_call><|system|>
-You are a helpful assistant with access to tools.<|user|>
-What is the weather like in Tokyo?<|assistant|>`,
-		},
-		{
-			skip: "tool call ordering not guaranteed yet",
-			name: "tool calls",
-			messages: []api.Message{
-				{Role: "system", Content: "You are a helpful assistant with access to tools."},
-				{Role: "user", Content: "What is the weather like in Tokyo?"},
-				{
-					Role: "assistant",
-					ToolCalls: []api.ToolCall{
-						{
-							Function: api.ToolCallFunction{
-								Name:      "get_weather",
-								Arguments: args(`{"location": "Tokyo, Japan", "unit": "celsius"}`),
-							},
-						},
-						{
-							Function: api.ToolCallFunction{
-								Name:      "get_weather",
-								Arguments: args(`{"location": "Japan", "unit": "fahrenheit"}`),
-							},
-						},
-					},
-				},
-				{
-					Role:     "tool",
-					Content:  "{\"temperature\": 22, \"weather\": \"partly cloudy\", \"humidity\": 65}",
-					ToolName: "get_weather",
-				},
-				{
-					Role:     "tool",
-					Content:  "{\"temperature\": 68, \"weather\": \"sunny\", \"humidity\": 75}",
-					ToolName: "get_weather",
-				},
-				{
-					Role:    "assistant",
-					Content: "The weather in Tokyo is currently partly cloudy with a temperature of 22°C and 65% humidity. It's a pleasant day with moderate temperatures.",
-				},
-			},
-			tools: []api.Tool{
-				{
-					Type: "function",
-					Function: api.ToolFunction{
-						Name:        "get_weather",
-						Description: "Get the current weather in a given location",
-						Parameters: api.ToolFunctionParameters{
-							Type:       "object",
-							Required:   []string{"location"},
-							Properties: propsMap(`{"location": {"type": "string", "description": "The city and state, e.g. San Francisco, CA"}, "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}}`),
-						},
-					},
-				},
-			},
-			expected: `[gMASK]<sop><|system|>
-# Tools
-
-You may call one or more functions to assist with the user query.
-
-You are provided with function signatures within <tools></tools> XML tags:
-<tools>
-{"type":"function","function":{"name":"get_weather","description":"Get the current weather in a given location","parameters":{"type":"object","required":["location"],"properties":{"location":{"type":"string","description":"The city and state, e.g. San Francisco, CA"},"unit":{"type":"string","description":"","enum":["celsius","fahrenheit"]}}}}}
-</tools>
-
-For each function call, output the function name and arguments within the following XML format:
-<tool_call>{function-name}
-<arg_key>{arg-key-1}</arg_key>
-<arg_value>{arg-value-1}</arg_value>
-<arg_key>{arg-key-2}</arg_key>
-<arg_value>{arg-value-2}</arg_value>
-...
-</tool_call><|system|>
-You are a helpful assistant with access to tools.<|user|>
-What is the weather like in Tokyo?<|assistant|>
-<think></think>
-<tool_call>get_weather
-<arg_key>location</arg_key>
-<arg_value>Tokyo, Japan</arg_value>
-<arg_key>unit</arg_key>
-<arg_value>celsius</arg_value>
-</tool_call>
-<tool_call>get_weather
-<arg_key>location</arg_key>
-<arg_value>Japan</arg_value>
-<arg_key>unit</arg_key>
-<arg_value>fahrenheit</arg_value>
-</tool_call><|observation|>
-<tool_response>
-{"temperature": 22, "weather": "partly cloudy", "humidity": 65}
-</tool_response>
-<tool_response>
-{"temperature": 68, "weather": "sunny", "humidity": 75}
-</tool_response><|assistant|>
-<think></think>
-The weather in Tokyo is currently partly cloudy with a temperature of 22°C and 65% humidity. It's a pleasant day with moderate temperatures.<|assistant|>`,
-		},
-		{
-			name: "think true",
-			messages: []api.Message{
-				{Role: "user", Content: "Hello, how are you?"},
-			},
-			thinkValue: &api.ThinkValue{Value: true},
-			expected: `[gMASK]<sop><|user|>
-Hello, how are you?<|assistant|>`,
-		},
-		{
-			name: "think false",
-			messages: []api.Message{
-				{Role: "user", Content: "Hello, how are you?"},
-			},
-			thinkValue: &api.ThinkValue{Value: false},
-			expected: `[gMASK]<sop><|user|>
-Hello, how are you?/nothink<|assistant|>
-<think></think>
-`,
-		},
-	}
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			if tt.skip != "" {
-				t.Skip(tt.skip)
-			}
-			renderer := &GLM46Renderer{}
-			rendered, err := renderer.Render(tt.messages, tt.tools, tt.thinkValue)
-			if err != nil {
-				t.Fatal(err)
-			}
-			if diff := cmp.Diff(rendered, tt.expected); diff != "" {
-				t.Errorf("mismatch (-got +want):\n%s", diff)
-				t.Logf("Got:\n%s", rendered)
-				t.Logf("Expected:\n%s", tt.expected)
-			}
-		})
-	}
-}
--- a/model/renderers/glm47.go
+++ b/model/renderers/glm47.go
@@ -1,170 +0,0 @@
-package renderers
-
-import (
-	"encoding/json"
-	"fmt"
-	"strings"
-
-	"github.com/ollama/ollama/api"
-)
-
-// GLM47Renderer renders messages for GLM-4.7 models.
-//
-// GLM-4.7 Thinking Modes (ref: https://docs.z.ai/guides/capabilities/thinking-mode):
-//
-//  1. INTERLEAVED THINKING
-//     The model thinks between tool calls and after receiving tool results.
-//     This enables complex step-by-step reasoning: interpreting each tool output
-//     before deciding what to do next. Thinking blocks are preserved and returned
-//     with tool results to maintain reasoning continuity.
-//
-//  2. PRESERVED THINKING
-//     The model retains reasoning content from previous assistant turns in context.
-//     This preserves reasoning continuity across multi-turn conversations. The
-//     upstream API has a "clear_thinking" parameter to control this:
-//     - clear_thinking=true:  clears reasoning from previous turns (outputs </think>)
-//     - clear_thinking=false: preserves <think>...</think> blocks from previous turns
-//
-//  3. TURN-LEVEL THINKING
-//     Controls whether the model should reason on each turn. The upstream API
-//     uses "enable_thinking" parameter:
-//     - enable_thinking=true:  outputs <think> to start reasoning
-//     - enable_thinking=false: outputs </think> to skip reasoning
-//
-// OLLAMA DEFAULTS:
-//   - Thinking is ENABLED by default (thinkValue=nil or true outputs <think>)
-//   - Thinking is PRESERVED by default (reasoning content from previous turns is always
-//     included in <think>...</think> blocks, equivalent to clear_thinking=false)
-//   - Users can disable thinking per-turn via thinkValue=false
-type GLM47Renderer struct{}
-
-func (r *GLM47Renderer) Render(messages []api.Message, tools []api.Tool, thinkValue *api.ThinkValue) (string, error) {
-	var sb strings.Builder
-
-	sb.WriteString("[gMASK]<sop>")
-
-	if len(tools) > 0 {
-		sb.WriteString("<|system|>\n")
-		sb.WriteString("# Tools\n\n")
-		sb.WriteString("You may call one or more functions to assist with the user query.\n\n")
-		sb.WriteString("You are provided with function signatures within <tools></tools> XML tags:\n")
-		sb.WriteString("<tools>\n")
-		for _, tool := range tools {
-			d, _ := json.Marshal(tool)
-			sb.WriteString(formatGLM47ToolJSON(d))
-			sb.WriteString("\n")
-		}
-		sb.WriteString("</tools>\n\n")
-		sb.WriteString("For each function call, output the function name and arguments within the following XML format:\n")
-		sb.WriteString("<tool_call>{function-name}<arg_key>{arg-key-1}</arg_key><arg_value>{arg-value-1}</arg_value><arg_key>{arg-key-2}</arg_key><arg_value>{arg-value-2}</arg_value>...</tool_call>")
-	}
-
-	think := true
-	if thinkValue != nil && !thinkValue.Bool() {
-		think = false
-	}
-
-	for i, message := range messages {
-		switch message.Role {
-		case "user":
-			sb.WriteString("<|user|>")
-			sb.WriteString(message.Content)
-		case "assistant":
-			sb.WriteString("<|assistant|>")
-			if message.Thinking != "" {
-				sb.WriteString("<think>" + message.Thinking + "</think>")
-			} else {
-				sb.WriteString("</think>")
-			}
-			if message.Content != "" {
-				sb.WriteString(message.Content)
-			}
-			if len(message.ToolCalls) > 0 {
-				for _, toolCall := range message.ToolCalls {
-					sb.WriteString("<tool_call>" + toolCall.Function.Name)
-					sb.WriteString(renderGLM47ToolArguments(toolCall.Function.Arguments))
-					sb.WriteString("</tool_call>")
-				}
-			}
-		case "tool":
-			if i == 0 || messages[i-1].Role != "tool" {
-				sb.WriteString("<|observation|>")
-			}
-			sb.WriteString("<tool_response>")
-			sb.WriteString(message.Content)
-			sb.WriteString("</tool_response>")
-		case "system":
-			sb.WriteString("<|system|>")
-			sb.WriteString(message.Content)
-		}
-	}
-
-	sb.WriteString("<|assistant|>")
-	if think {
-		sb.WriteString("<think>")
-	} else {
-		sb.WriteString("</think>")
-	}
-
-	return sb.String(), nil
-}
-
-func renderGLM47ToolArguments(args api.ToolCallFunctionArguments) string {
-	var sb strings.Builder
-	for key, value := range args.All() {
-		sb.WriteString("<arg_key>" + key + "</arg_key>")
-		var valueStr string
-		if str, ok := value.(string); ok {
-			valueStr = str
-		} else {
-			jsonBytes, err := json.Marshal(value)
-			if err != nil {
-				valueStr = fmt.Sprintf("%v", value)
-			} else {
-				valueStr = string(jsonBytes)
-			}
-		}
-
-		sb.WriteString("<arg_value>" + valueStr + "</arg_value>")
-	}
-
-	return sb.String()
-}
-
-func formatGLM47ToolJSON(raw []byte) string {
-	var sb strings.Builder
-	sb.Grow(len(raw) + len(raw)/10)
-
-	inString := false
-	escaped := false
-	for i := range raw {
-		ch := raw[i]
-		sb.WriteByte(ch)
-
-		if inString {
-			if escaped {
-				escaped = false
-				continue
-			}
-			if ch == '\\' {
-				escaped = true
-				continue
-			}
-			if ch == '"' {
-				inString = false
-			}
-			continue
-		}
-
-		if ch == '"' {
-			inString = true
-			continue
-		}
-
-		if ch == ':' || ch == ',' {
-			sb.WriteByte(' ')
-		}
-	}
-
-	return sb.String()
-}
--- a/model/renderers/glm47_test.go
+++ b/model/renderers/glm47_test.go
@@ -1,191 +0,0 @@
-package renderers
-
-import (
-	"testing"
-
-	"github.com/google/go-cmp/cmp"
-	"github.com/ollama/ollama/api"
-)
-
-func TestGLM47Renderer(t *testing.T) {
-	tests := []struct {
-		name       string
-		messages   []api.Message
-		tools      []api.Tool
-		thinkValue *api.ThinkValue
-		expected   string
-	}{
-		{
-			name: "basic user message",
-			messages: []api.Message{
-				{Role: "user", Content: "Hello"},
-			},
-			expected: "[gMASK]<sop><|user|>Hello<|assistant|><think>",
-		},
-		{
-			name: "thinking disabled",
-			messages: []api.Message{
-				{Role: "user", Content: "Hello"},
-			},
-			thinkValue: &api.ThinkValue{Value: false},
-			expected:   "[gMASK]<sop><|user|>Hello<|assistant|></think>",
-		},
-		{
-			name: "system and user",
-			messages: []api.Message{
-				{Role: "system", Content: "You are helpful."},
-				{Role: "user", Content: "Hello"},
-			},
-			expected: "[gMASK]<sop><|system|>You are helpful.<|user|>Hello<|assistant|><think>",
-		},
-		{
-			name: "multi-turn conversation",
-			messages: []api.Message{
-				{Role: "user", Content: "Hi"},
-				{Role: "assistant", Content: "Hello there"},
-				{Role: "user", Content: "How are you?"},
-			},
-			expected: "[gMASK]<sop><|user|>Hi<|assistant|></think>Hello there<|user|>How are you?<|assistant|><think>",
-		},
-		{
-			name: "assistant with reasoning_content",
-			messages: []api.Message{
-				{Role: "user", Content: "Answer with reasoning."},
-				{Role: "assistant", Thinking: "Plan.", Content: "Done."},
-			},
-			expected: "[gMASK]<sop><|user|>Answer with reasoning.<|assistant|><think>Plan.</think>Done.<|assistant|><think>",
-		},
-		{
-			name: "tool call with empty content",
-			messages: []api.Message{
-				{Role: "user", Content: "Weather?"},
-				{
-					Role: "assistant",
-					ToolCalls: []api.ToolCall{
-						{
-							Function: api.ToolCallFunction{
-								Name:      "get_weather",
-								Arguments: args(`{"location": "Tokyo", "unit": "celsius"}`),
-							},
-						},
-					},
-				},
-				{Role: "tool", Content: `{"temperature":22}`},
-			},
-			tools: []api.Tool{
-				{
-					Type: "function",
-					Function: api.ToolFunction{
-						Name:        "get_weather",
-						Description: "Get weather",
-						Parameters: api.ToolFunctionParameters{
-							Type:       "object",
-							Required:   []string{"location"},
-							Properties: propsMap(`{"location": {"type": "string"}}`),
-						},
-					},
-				},
-			},
-			expected: "[gMASK]<sop><|system|>\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>\n{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"description\": \"Get weather\", \"parameters\": {\"type\": \"object\", \"required\": [\"location\"], \"properties\": {\"location\": {\"type\": \"string\"}}}}}\n</tools>\n\nFor each function call, output the function name and arguments within the following XML format:\n<tool_call>{function-name}<arg_key>{arg-key-1}</arg_key><arg_value>{arg-value-1}</arg_value><arg_key>{arg-key-2}</arg_key><arg_value>{arg-value-2}</arg_value>...</tool_call><|user|>Weather?<|assistant|></think><tool_call>get_weather<arg_key>location</arg_key><arg_value>Tokyo</arg_value><arg_key>unit</arg_key><arg_value>celsius</arg_value></tool_call><|observation|><tool_response>{\"temperature\":22}</tool_response><|assistant|><think>",
-		},
-		{
-			name: "tool call with content",
-			messages: []api.Message{
-				{Role: "user", Content: "Weather?"},
-				{
-					Role:    "assistant",
-					Content: "Let me check",
-					ToolCalls: []api.ToolCall{
-						{
-							Function: api.ToolCallFunction{
-								Name:      "get_weather",
-								Arguments: args(`{"location": "Tokyo"}`),
-							},
-						},
-					},
-				},
-				{Role: "tool", Content: `{"temperature":22}`},
-				{Role: "assistant", Content: "It is 22C."},
-			},
-			tools: []api.Tool{
-				{
-					Type: "function",
-					Function: api.ToolFunction{
-						Name:        "get_weather",
-						Description: "Get weather",
-						Parameters: api.ToolFunctionParameters{
-							Type:       "object",
-							Required:   []string{"location"},
-							Properties: propsMap(`{"location": {"type": "string"}}`),
-						},
-					},
-				},
-			},
-			expected: "[gMASK]<sop><|system|>\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>\n{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"description\": \"Get weather\", \"parameters\": {\"type\": \"object\", \"required\": [\"location\"], \"properties\": {\"location\": {\"type\": \"string\"}}}}}\n</tools>\n\nFor each function call, output the function name and arguments within the following XML format:\n<tool_call>{function-name}<arg_key>{arg-key-1}</arg_key><arg_value>{arg-value-1}</arg_value><arg_key>{arg-key-2}</arg_key><arg_value>{arg-value-2}</arg_value>...</tool_call><|user|>Weather?<|assistant|></think>Let me check<tool_call>get_weather<arg_key>location</arg_key><arg_value>Tokyo</arg_value></tool_call><|observation|><tool_response>{\"temperature\":22}</tool_response><|assistant|></think>It is 22C.<|assistant|><think>",
-		},
-		{
-			name: "multiple tool calls and responses",
-			messages: []api.Message{
-				{Role: "user", Content: "Compare weather"},
-				{
-					Role: "assistant",
-					ToolCalls: []api.ToolCall{
-						{
-							Function: api.ToolCallFunction{
-								Name:      "get_weather",
-								Arguments: args(`{"location": "Tokyo"}`),
-							},
-						},
-						{
-							Function: api.ToolCallFunction{
-								Name:      "get_weather",
-								Arguments: args(`{"location": "Paris"}`),
-							},
-						},
-					},
-				},
-				{Role: "tool", Content: `{"temperature":22}`},
-				{Role: "tool", Content: `{"temperature":18}`},
-			},
-			tools: []api.Tool{
-				{
-					Type: "function",
-					Function: api.ToolFunction{
-						Name:        "get_weather",
-						Description: "Get weather",
-						Parameters: api.ToolFunctionParameters{
-							Type:       "object",
-							Required:   []string{"location"},
-							Properties: propsMap(`{"location": {"type": "string"}}`),
-						},
-					},
-				},
-			},
-			expected: "[gMASK]<sop><|system|>\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>\n{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"description\": \"Get weather\", \"parameters\": {\"type\": \"object\", \"required\": [\"location\"], \"properties\": {\"location\": {\"type\": \"string\"}}}}}\n</tools>\n\nFor each function call, output the function name and arguments within the following XML format:\n<tool_call>{function-name}<arg_key>{arg-key-1}</arg_key><arg_value>{arg-value-1}</arg_value><arg_key>{arg-key-2}</arg_key><arg_value>{arg-value-2}</arg_value>...</tool_call><|user|>Compare weather<|assistant|></think><tool_call>get_weather<arg_key>location</arg_key><arg_value>Tokyo</arg_value></tool_call><tool_call>get_weather<arg_key>location</arg_key><arg_value>Paris</arg_value></tool_call><|observation|><tool_response>{\"temperature\":22}</tool_response><tool_response>{\"temperature\":18}</tool_response><|assistant|><think>",
-		},
-		{
-			name: "preserved thinking in multi-turn",
-			messages: []api.Message{
-				{Role: "user", Content: "Think step by step"},
-				{Role: "assistant", Thinking: "Let me think...", Content: "Here's my answer."},
-				{Role: "user", Content: "Continue"},
-			},
-			expected: "[gMASK]<sop><|user|>Think step by step<|assistant|><think>Let me think...</think>Here's my answer.<|user|>Continue<|assistant|><think>",
-		},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			renderer := &GLM47Renderer{}
-			rendered, err := renderer.Render(tt.messages, tt.tools, tt.thinkValue)
-			if err != nil {
-				t.Fatal(err)
-			}
-			if diff := cmp.Diff(rendered, tt.expected); diff != "" {
-				t.Errorf("mismatch (-got +want):\n%s", diff)
-				t.Logf("Got:\n%s", rendered)
-				t.Logf("Expected:\n%s", tt.expected)
-			}
-		})
-	}
-}
--- a/model/renderers/lfm2.go
+++ b/model/renderers/lfm2.go
@@ -1,144 +0,0 @@
-package renderers
-
-import (
-	"encoding/json"
-	"strings"
-
-	"github.com/ollama/ollama/api"
-)
-
-type LFM2Renderer struct {
-	IsThinking bool
-}
-
-func (r *LFM2Renderer) Render(messages []api.Message, tools []api.Tool, thinkValue *api.ThinkValue) (string, error) {
-	var sb strings.Builder
-
-	// Note: BOS token is added by the tokenizer (add_bos_token: true), not the renderer
-
-	// Extract first system message if present (to combine with tools)
-	var firstSystemContent string
-	startIdx := 0
-	if len(messages) > 0 && messages[0].Role == "system" {
-		firstSystemContent = messages[0].Content
-		startIdx = 1
-	}
-
-	// Append tools to first system content
-	if len(tools) > 0 {
-		if firstSystemContent != "" {
-			firstSystemContent += "\n"
-		}
-		firstSystemContent += "List of tools: ["
-		for i, tool := range tools {
-			toolJSON, err := json.Marshal(tool)
-			if err != nil {
-				return "", err
-			}
-			firstSystemContent += string(toolJSON)
-			if i < len(tools)-1 {
-				firstSystemContent += ", "
-			}
-		}
-		firstSystemContent += "]"
-	}
-
-	// Output first system block if it has content
-	if firstSystemContent != "" {
-		sb.WriteString("<|im_start|>system\n")
-		sb.WriteString(firstSystemContent)
-		sb.WriteString("<|im_end|>\n")
-	}
-
-	// Find the index of the last assistant message for thinking stripping
-	lastAssistantIndex := -1
-	for i := len(messages) - 1; i >= startIdx; i-- {
-		if messages[i].Role == "assistant" {
-			lastAssistantIndex = i
-			break
-		}
-	}
-
-	// Track whether we need to add generation prompt
-	needsGenerationPrompt := len(messages) > 0
-
-	for i := startIdx; i < len(messages); i++ {
-		message := messages[i]
-		switch message.Role {
-		case "system":
-			// Additional system messages (after the first) are rendered normally
-			sb.WriteString("<|im_start|>system\n")
-			sb.WriteString(message.Content)
-			sb.WriteString("<|im_end|>\n")
-
-		case "user":
-			sb.WriteString("<|im_start|>user\n")
-			sb.WriteString(message.Content)
-			sb.WriteString("<|im_end|>\n")
-			needsGenerationPrompt = true
-
-		case "assistant":
-			sb.WriteString("<|im_start|>assistant\n")
-
-			// Check if this is the last assistant message
-			isLastAssistant := i == lastAssistantIndex
-
-			// Process content (may need thinking stripped)
-			content := message.Content
-
-			// Handle thinking tags in assistant content
-			keepPastThinking := r.IsThinking && (thinkValue != nil && thinkValue.Bool())
-			if strings.Contains(content, "</think>") {
-				parts := strings.SplitN(content, "</think>", 2)
-				if len(parts) > 1 {
-					if !isLastAssistant && !keepPastThinking {
-						// Strip thinking entirely for past assistant messages
-						content = strings.TrimSpace(parts[1])
-					} else {
-						// Preserve thinking but trim whitespace after </think>
-						content = parts[0] + "</think>" + strings.TrimLeft(parts[1], " \t\n\r")
-					}
-				}
-			}
-
-			if len(message.ToolCalls) > 0 {
-				// Assistant with tool calls - write content first (if any after stripping)
-				if content != "" {
-					sb.WriteString(content)
-				}
-
-				for _, toolCall := range message.ToolCalls {
-					sb.WriteString("<|tool_call_start|>")
-					toolCallJSON := map[string]any{
-						"name":      toolCall.Function.Name,
-						"arguments": toolCall.Function.Arguments,
-					}
-					callJSON, _ := json.Marshal(toolCallJSON)
-					sb.WriteString(string(callJSON))
-					sb.WriteString("<|tool_call_end|>")
-				}
-			} else {
-				sb.WriteString(content)
-			}
-
-			sb.WriteString("<|im_end|>\n")
-			needsGenerationPrompt = true // Always add gen prompt after assistant when add_generation_prompt=true
-
-		case "tool":
-			// Tool responses are rendered as plain messages per the chat template
-			sb.WriteString("<|im_start|>tool\n")
-			sb.WriteString(message.Content)
-			sb.WriteString("<|im_end|>\n")
-			needsGenerationPrompt = true
-		}
-	}
-
-	// Add generation prompt
-	if needsGenerationPrompt {
-		sb.WriteString("<|im_start|>assistant\n")
-		// Note: Model is a "thinking-only" model - it will output <think> itself
-		// We don't add <think> tag to the prompt
-	}
-
-	return sb.String(), nil
-}
--- a/model/renderers/lfm2_test.go
+++ b/model/renderers/lfm2_test.go
@@ -1,427 +0,0 @@
-package renderers
-
-import (
-	"testing"
-
-	"github.com/google/go-cmp/cmp"
-
-	"github.com/ollama/ollama/api"
-)
-
-func TestLFM2Renderer(t *testing.T) {
-	tests := []struct {
-		name       string
-		messages   []api.Message
-		tools      []api.Tool
-		thinkValue *api.ThinkValue
-		expected   string
-	}{
-		{
-			name: "basic user message",
-			messages: []api.Message{
-				{Role: "user", Content: "Hello!"},
-			},
-			thinkValue: &api.ThinkValue{Value: false},
-			expected:   "<|im_start|>user\nHello!<|im_end|>\n<|im_start|>assistant\n",
-		},
-		{
-			name: "basic with system message",
-			messages: []api.Message{
-				{Role: "system", Content: "You are a helpful assistant."},
-				{Role: "user", Content: "Hello!"},
-			},
-			thinkValue: &api.ThinkValue{Value: false},
-			expected:   "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nHello!<|im_end|>\n<|im_start|>assistant\n",
-		},
-		{
-			name: "multiple system messages rendered separately",
-			messages: []api.Message{
-				{Role: "system", Content: "First instruction."},
-				{Role: "system", Content: "Second instruction."},
-				{Role: "user", Content: "Hello!"},
-			},
-			thinkValue: &api.ThinkValue{Value: false},
-			expected:   "<|im_start|>system\nFirst instruction.<|im_end|>\n<|im_start|>system\nSecond instruction.<|im_end|>\n<|im_start|>user\nHello!<|im_end|>\n<|im_start|>assistant\n",
-		},
-		{
-			name: "multi-turn conversation",
-			messages: []api.Message{
-				{Role: "user", Content: "What is 2+2?"},
-				{Role: "assistant", Content: "The answer is 4."},
-				{Role: "user", Content: "Thanks!"},
-			},
-			thinkValue: &api.ThinkValue{Value: false},
-			expected:   "<|im_start|>user\nWhat is 2+2?<|im_end|>\n<|im_start|>assistant\nThe answer is 4.<|im_end|>\n<|im_start|>user\nThanks!<|im_end|>\n<|im_start|>assistant\n",
-		},
-		{
-			name: "only system message",
-			messages: []api.Message{
-				{Role: "system", Content: "You are helpful."},
-			},
-			thinkValue: &api.ThinkValue{Value: false},
-			expected:   "<|im_start|>system\nYou are helpful.<|im_end|>\n<|im_start|>assistant\n",
-		},
-		{
-			// When assistant is the LAST assistant, thinking is preserved (even with keep_past_thinking=false)
-			name: "user-assistant-user: last assistant preserves thinking",
-			messages: []api.Message{
-				{Role: "user", Content: "Q1"},
-				{Role: "assistant", Content: "<think>reasoning</think>A1"},
-				{Role: "user", Content: "Q2"},
-			},
-			thinkValue: &api.ThinkValue{Value: false},
-			expected:   "<|im_start|>user\nQ1<|im_end|>\n<|im_start|>assistant\n<think>reasoning</think>A1<|im_end|>\n<|im_start|>user\nQ2<|im_end|>\n<|im_start|>assistant\n",
-		},
-		{
-			// With two assistants, first is stripped (not last), second preserved (is last)
-			name: "multi-turn thinking: first stripped, second preserved",
-			messages: []api.Message{
-				{Role: "user", Content: "Q1"},
-				{Role: "assistant", Content: "<think>reason1</think>A1"},
-				{Role: "user", Content: "Q2"},
-				{Role: "assistant", Content: "<think>reason2</think>A2"},
-			},
-			thinkValue: &api.ThinkValue{Value: false},
-			expected:   "<|im_start|>user\nQ1<|im_end|>\n<|im_start|>assistant\nA1<|im_end|>\n<|im_start|>user\nQ2<|im_end|>\n<|im_start|>assistant\n<think>reason2</think>A2<|im_end|>\n<|im_start|>assistant\n",
-		},
-		{
-			// With thinking enabled (keep_past_thinking=true), both preserved
-			name: "multi-turn thinking: both preserved when thinking enabled",
-			messages: []api.Message{
-				{Role: "user", Content: "Q1"},
-				{Role: "assistant", Content: "<think>reason1</think>A1"},
-				{Role: "user", Content: "Q2"},
-				{Role: "assistant", Content: "<think>reason2</think>A2"},
-			},
-			thinkValue: &api.ThinkValue{Value: true},
-			expected:   "<|im_start|>user\nQ1<|im_end|>\n<|im_start|>assistant\n<think>reason1</think>A1<|im_end|>\n<|im_start|>user\nQ2<|im_end|>\n<|im_start|>assistant\n<think>reason2</think>A2<|im_end|>\n<|im_start|>assistant\n",
-		},
-		{
-			name: "assistant with tool calls",
-			messages: []api.Message{
-				{Role: "user", Content: "What's the weather?"},
-				{
-					Role: "assistant",
-					ToolCalls: []api.ToolCall{
-						{
-							Function: api.ToolCallFunction{
-								Name: "get_weather",
-								Arguments: testArgs(map[string]any{
-									"location": "Paris",
-								}),
-							},
-						},
-					},
-				},
-			},
-			thinkValue: &api.ThinkValue{Value: false},
-			expected:   `<|im_start|>user` + "\n" + `What's the weather?<|im_end|>` + "\n" + `<|im_start|>assistant` + "\n" + `<|tool_call_start|>{"arguments":{"location":"Paris"},"name":"get_weather"}<|tool_call_end|><|im_end|>` + "\n" + `<|im_start|>assistant` + "\n",
-		},
-		{
-			name: "assistant with content and tool calls",
-			messages: []api.Message{
-				{Role: "user", Content: "What's the weather in Paris?"},
-				{
-					Role:    "assistant",
-					Content: "Let me check.",
-					ToolCalls: []api.ToolCall{
-						{
-							Function: api.ToolCallFunction{
-								Name: "get_weather",
-								Arguments: testArgs(map[string]any{
-									"location": "Paris",
-								}),
-							},
-						},
-					},
-				},
-			},
-			thinkValue: &api.ThinkValue{Value: false},
-			expected:   `<|im_start|>user` + "\n" + `What's the weather in Paris?<|im_end|>` + "\n" + `<|im_start|>assistant` + "\n" + `Let me check.<|tool_call_start|>{"arguments":{"location":"Paris"},"name":"get_weather"}<|tool_call_end|><|im_end|>` + "\n" + `<|im_start|>assistant` + "\n",
-		},
-		{
-			name: "tool response",
-			messages: []api.Message{
-				{Role: "user", Content: "What's the weather?"},
-				{Role: "assistant", Content: "Let me check."},
-				{Role: "tool", Content: "22C, Sunny"},
-			},
-			thinkValue: &api.ThinkValue{Value: false},
-			expected:   "<|im_start|>user\nWhat's the weather?<|im_end|>\n<|im_start|>assistant\nLet me check.<|im_end|>\n<|im_start|>tool\n22C, Sunny<|im_end|>\n<|im_start|>assistant\n",
-		},
-		{
-			name: "multiple tool calls",
-			messages: []api.Message{
-				{Role: "user", Content: "Get weather for Paris and London"},
-				{
-					Role: "assistant",
-					ToolCalls: []api.ToolCall{
-						{
-							Function: api.ToolCallFunction{
-								Name: "get_weather",
-								Arguments: testArgs(map[string]any{
-									"location": "Paris",
-								}),
-							},
-						},
-						{
-							Function: api.ToolCallFunction{
-								Name: "get_weather",
-								Arguments: testArgs(map[string]any{
-									"location": "London",
-								}),
-							},
-						},
-					},
-				},
-			},
-			thinkValue: &api.ThinkValue{Value: false},
-			expected:   `<|im_start|>user` + "\n" + `Get weather for Paris and London<|im_end|>` + "\n" + `<|im_start|>assistant` + "\n" + `<|tool_call_start|>{"arguments":{"location":"Paris"},"name":"get_weather"}<|tool_call_end|><|tool_call_start|>{"arguments":{"location":"London"},"name":"get_weather"}<|tool_call_end|><|im_end|>` + "\n" + `<|im_start|>assistant` + "\n",
-		},
-		{
-			name: "tools definitions with system message",
-			messages: []api.Message{
-				{Role: "system", Content: "You are helpful."},
-				{Role: "user", Content: "What's the weather?"},
-			},
-			tools: []api.Tool{
-				{
-					Type: "function",
-					Function: api.ToolFunction{
-						Name:        "get_weather",
-						Description: "Get current weather",
-						Parameters: api.ToolFunctionParameters{
-							Type: "object",
-							Properties: testPropsMap(map[string]api.ToolProperty{
-								"location": {
-									Type:        api.PropertyType{"string"},
-									Description: "City name",
-								},
-							}),
-							Required: []string{"location"},
-						},
-					},
-				},
-			},
-			thinkValue: &api.ThinkValue{Value: false},
-			expected:   `<|im_start|>system` + "\n" + `You are helpful.` + "\n" + `List of tools: [{"type":"function","function":{"name":"get_weather","description":"Get current weather","parameters":{"type":"object","required":["location"],"properties":{"location":{"type":"string","description":"City name"}}}}}]<|im_end|>` + "\n" + `<|im_start|>user` + "\n" + `What's the weather?<|im_end|>` + "\n" + `<|im_start|>assistant` + "\n",
-		},
-		{
-			name: "tools definitions without system message",
-			messages: []api.Message{
-				{Role: "user", Content: "What's the weather?"},
-			},
-			tools: []api.Tool{
-				{
-					Type: "function",
-					Function: api.ToolFunction{
-						Name:        "get_weather",
-						Description: "Get current weather",
-						Parameters: api.ToolFunctionParameters{
-							Type: "object",
-							Properties: testPropsMap(map[string]api.ToolProperty{
-								"location": {
-									Type:        api.PropertyType{"string"},
-									Description: "City name",
-								},
-							}),
-							Required: []string{"location"},
-						},
-					},
-				},
-			},
-			thinkValue: &api.ThinkValue{Value: false},
-			expected:   `<|im_start|>system` + "\n" + `List of tools: [{"type":"function","function":{"name":"get_weather","description":"Get current weather","parameters":{"type":"object","required":["location"],"properties":{"location":{"type":"string","description":"City name"}}}}}]<|im_end|>` + "\n" + `<|im_start|>user` + "\n" + `What's the weather?<|im_end|>` + "\n" + `<|im_start|>assistant` + "\n",
-		},
-		{
-			name: "multiple tools without system message",
-			messages: []api.Message{
-				{Role: "user", Content: "Hello"},
-			},
-			tools: []api.Tool{
-				{
-					Type: "function",
-					Function: api.ToolFunction{
-						Name:        "get_weather",
-						Description: "Get weather",
-					},
-				},
-				{
-					Type: "function",
-					Function: api.ToolFunction{
-						Name:        "get_time",
-						Description: "Get time",
-					},
-				},
-			},
-			thinkValue: &api.ThinkValue{Value: false},
-			expected:   "<|im_start|>system\nList of tools: [{\"type\":\"function\",\"function\":{\"name\":\"get_weather\",\"description\":\"Get weather\",\"parameters\":{\"type\":\"\",\"properties\":null}}}, {\"type\":\"function\",\"function\":{\"name\":\"get_time\",\"description\":\"Get time\",\"parameters\":{\"type\":\"\",\"properties\":null}}}]<|im_end|>\n<|im_start|>user\nHello<|im_end|>\n<|im_start|>assistant\n",
-		},
-		{
-			name: "user-tool sequence",
-			messages: []api.Message{
-				{Role: "user", Content: "Check weather"},
-				{Role: "tool", Content: "22C"},
-			},
-			thinkValue: &api.ThinkValue{Value: false},
-			expected:   "<|im_start|>user\nCheck weather<|im_end|>\n<|im_start|>tool\n22C<|im_end|>\n<|im_start|>assistant\n",
-		},
-		{
-			name: "full tool call cycle",
-			messages: []api.Message{
-				{Role: "user", Content: "Check weather"},
-				{Role: "assistant", Content: "Let me check"},
-				{Role: "tool", Content: "22C"},
-				{Role: "assistant", Content: "It's 22C"},
-			},
-			thinkValue: &api.ThinkValue{Value: false},
-			expected:   "<|im_start|>user\nCheck weather<|im_end|>\n<|im_start|>assistant\nLet me check<|im_end|>\n<|im_start|>tool\n22C<|im_end|>\n<|im_start|>assistant\nIt's 22C<|im_end|>\n<|im_start|>assistant\n",
-		},
-		{
-			name: "unicode content",
-			messages: []api.Message{
-				{Role: "user", Content: "你好世界! مرحبا 🌍"},
-				{Role: "assistant", Content: "Hello! 👋"},
-			},
-			thinkValue: &api.ThinkValue{Value: false},
-			expected:   "<|im_start|>user\n你好世界! مرحبا 🌍<|im_end|>\n<|im_start|>assistant\nHello! 👋<|im_end|>\n<|im_start|>assistant\n",
-		},
-		{
-			name: "newlines in content",
-			messages: []api.Message{
-				{Role: "user", Content: "Line 1\nLine 2\n\nLine 4"},
-				{Role: "assistant", Content: "Response with\nmultiple\nlines"},
-			},
-			thinkValue: &api.ThinkValue{Value: false},
-			expected:   "<|im_start|>user\nLine 1\nLine 2\n\nLine 4<|im_end|>\n<|im_start|>assistant\nResponse with\nmultiple\nlines<|im_end|>\n<|im_start|>assistant\n",
-		},
-		{
-			name: "empty assistant content",
-			messages: []api.Message{
-				{Role: "user", Content: "Hello"},
-				{Role: "assistant", Content: ""},
-				{Role: "user", Content: "OK"},
-			},
-			thinkValue: &api.ThinkValue{Value: false},
-			expected:   "<|im_start|>user\nHello<|im_end|>\n<|im_start|>assistant\n<|im_end|>\n<|im_start|>user\nOK<|im_end|>\n<|im_start|>assistant\n",
-		},
-		{
-			// Generation prompt does NOT include <think> - model outputs it
-			name: "generation prompt has no think tag",
-			messages: []api.Message{
-				{Role: "user", Content: "Think hard"},
-			},
-			thinkValue: &api.ThinkValue{Value: true},
-			expected:   "<|im_start|>user\nThink hard<|im_end|>\n<|im_start|>assistant\n",
-		},
-		{
-			// Interleaved: thinking before tool call - last assistant preserves thinking
-			name: "thinking before tool call (last assistant)",
-			messages: []api.Message{
-				{Role: "user", Content: "What's the weather?"},
-				{
-					Role:    "assistant",
-					Content: "<think>I need to check the weather</think>",
-					ToolCalls: []api.ToolCall{
-						{
-							Function: api.ToolCallFunction{
-								Name: "get_weather",
-								Arguments: testArgs(map[string]any{
-									"location": "Paris",
-								}),
-							},
-						},
-					},
-				},
-			},
-			thinkValue: &api.ThinkValue{Value: false},
-			expected:   "<|im_start|>user\nWhat's the weather?<|im_end|>\n<|im_start|>assistant\n<think>I need to check the weather</think><|tool_call_start|>{\"arguments\":{\"location\":\"Paris\"},\"name\":\"get_weather\"}<|tool_call_end|><|im_end|>\n<|im_start|>assistant\n",
-		},
-		{
-			// Two assistants with tool calls - first has thinking stripped
-			name: "two assistants with tools: first thinking stripped",
-			messages: []api.Message{
-				{Role: "user", Content: "What's the weather?"},
-				{
-					Role:    "assistant",
-					Content: "<think>checking</think>",
-					ToolCalls: []api.ToolCall{
-						{
-							Function: api.ToolCallFunction{
-								Name: "get_weather",
-								Arguments: testArgs(map[string]any{
-									"location": "Paris",
-								}),
-							},
-						},
-					},
-				},
-				{Role: "tool", Content: "22C"},
-				{Role: "assistant", Content: "<think>got result</think>It's 22C!"},
-			},
-			thinkValue: &api.ThinkValue{Value: false},
-			expected:   "<|im_start|>user\nWhat's the weather?<|im_end|>\n<|im_start|>assistant\n<|tool_call_start|>{\"arguments\":{\"location\":\"Paris\"},\"name\":\"get_weather\"}<|tool_call_end|><|im_end|>\n<|im_start|>tool\n22C<|im_end|>\n<|im_start|>assistant\n<think>got result</think>It's 22C!<|im_end|>\n<|im_start|>assistant\n",
-		},
-		{
-			// Two assistants with tools - both preserved when thinking enabled
-			name: "two assistants with tools: both preserved when thinking enabled",
-			messages: []api.Message{
-				{Role: "user", Content: "What's the weather?"},
-				{
-					Role:    "assistant",
-					Content: "<think>checking</think>",
-					ToolCalls: []api.ToolCall{
-						{
-							Function: api.ToolCallFunction{
-								Name: "get_weather",
-								Arguments: testArgs(map[string]any{
-									"location": "Paris",
-								}),
-							},
-						},
-					},
-				},
-				{Role: "tool", Content: "22C"},
-				{Role: "assistant", Content: "<think>got result</think>It's 22C!"},
-			},
-			thinkValue: &api.ThinkValue{Value: true},
-			expected:   "<|im_start|>user\nWhat's the weather?<|im_end|>\n<|im_start|>assistant\n<think>checking</think><|tool_call_start|>{\"arguments\":{\"location\":\"Paris\"},\"name\":\"get_weather\"}<|tool_call_end|><|im_end|>\n<|im_start|>tool\n22C<|im_end|>\n<|im_start|>assistant\n<think>got result</think>It's 22C!<|im_end|>\n<|im_start|>assistant\n",
-		},
-		{
-			// Content before thinking before tool call
-			name: "content then thinking then tool call",
-			messages: []api.Message{
-				{Role: "user", Content: "What's the weather?"},
-				{
-					Role:    "assistant",
-					Content: "Let me check.<think>Using weather API</think>",
-					ToolCalls: []api.ToolCall{
-						{
-							Function: api.ToolCallFunction{
-								Name: "get_weather",
-								Arguments: testArgs(map[string]any{
-									"location": "Paris",
-								}),
-							},
-						},
-					},
-				},
-			},
-			thinkValue: &api.ThinkValue{Value: false},
-			expected:   "<|im_start|>user\nWhat's the weather?<|im_end|>\n<|im_start|>assistant\nLet me check.<think>Using weather API</think><|tool_call_start|>{\"arguments\":{\"location\":\"Paris\"},\"name\":\"get_weather\"}<|tool_call_end|><|im_end|>\n<|im_start|>assistant\n",
-		},
-	}
-
-	renderer := &LFM2Renderer{IsThinking: true}
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			rendered, err := renderer.Render(tt.messages, tt.tools, tt.thinkValue)
-			if err != nil {
-				t.Fatalf("Render() error = %v", err)
-			}
-			if diff := cmp.Diff(tt.expected, rendered); diff != "" {
-				t.Errorf("Render() mismatch (-want +got):\n%s", diff)
-			}
-		})
-	}
-}
--- a/model/renderers/renderer.go
+++ b/model/renderers/renderer.go
@@ -80,12 +80,6 @@ func rendererForName(name string) Renderer {
 		return &Nemotron3NanoRenderer{}
 	case "functiongemma":
 		return &FunctionGemmaRenderer{}
-	case "glm-4.7":
-		return &GLM47Renderer{}
-	case "lfm2":
-		return &LFM2Renderer{IsThinking: false}
-	case "lfm2-thinking":
-		return &LFM2Renderer{IsThinking: true}
 	default:
 		return nil
 	}
--- a/model/renderers/testhelpers_test.go
+++ b/model/renderers/testhelpers_test.go
@@ -1,26 +1,6 @@
 package renderers

-import (
-	"encoding/json"
-
-	"github.com/ollama/ollama/api"
-)
-
-func args(s string) api.ToolCallFunctionArguments {
-	var result api.ToolCallFunctionArguments
-	if err := json.Unmarshal([]byte(s), &result); err != nil {
-		panic("invalid JSON in args(): " + err.Error())
-	}
-	return result
-}
-
-func propsMap(s string) *api.ToolPropertiesMap {
-	var result api.ToolPropertiesMap
-	if err := json.Unmarshal([]byte(s), &result); err != nil {
-		panic("invalid JSON in propsMap(): " + err.Error())
-	}
-	return &result
-}
+import "github.com/ollama/ollama/api"

 // testPropsMap creates a ToolPropertiesMap from a map (convenience function for tests, order not preserved)
 func testPropsMap(m map[string]api.ToolProperty) *api.ToolPropertiesMap {
--- a/openai/openai.go
+++ b/openai/openai.go
@@ -630,10 +630,6 @@ func nameFromToolCallID(messages []Message, toolCallID string) string {

 // decodeImageURL decodes a base64 data URI into raw image bytes.
 func decodeImageURL(url string) (api.ImageData, error) {
-	if strings.HasPrefix(url, "http://") || strings.HasPrefix(url, "https://") {
-		return nil, errors.New("image URLs are not currently supported, please use base64 encoded data instead")
-	}
-
 	types := []string{"jpeg", "jpg", "png", "webp"}

 	// Support blank mime type to match /api/chat's behavior of taking just unadorned base64
@@ -737,60 +733,3 @@ func FromCompleteRequest(r CompletionRequest) (api.GenerateRequest, error) {
 		DebugRenderOnly: r.DebugRenderOnly,
 	}, nil
 }
-
-// ImageGenerationRequest is an OpenAI-compatible image generation request.
-type ImageGenerationRequest struct {
-	Model          string `json:"model"`
-	Prompt         string `json:"prompt"`
-	N              int    `json:"n,omitempty"`
-	Size           string `json:"size,omitempty"`
-	ResponseFormat string `json:"response_format,omitempty"`
-	Seed           *int64 `json:"seed,omitempty"`
-}
-
-// ImageGenerationResponse is an OpenAI-compatible image generation response.
-type ImageGenerationResponse struct {
-	Created int64            `json:"created"`
-	Data    []ImageURLOrData `json:"data"`
-}
-
-// ImageURLOrData contains either a URL or base64-encoded image data.
-type ImageURLOrData struct {
-	URL     string `json:"url,omitempty"`
-	B64JSON string `json:"b64_json,omitempty"`
-}
-
-// FromImageGenerationRequest converts an OpenAI image generation request to an Ollama GenerateRequest.
-func FromImageGenerationRequest(r ImageGenerationRequest) api.GenerateRequest {
-	req := api.GenerateRequest{
-		Model:  r.Model,
-		Prompt: r.Prompt,
-	}
-	// Parse size if provided (e.g., "1024x768")
-	if r.Size != "" {
-		var w, h int32
-		if _, err := fmt.Sscanf(r.Size, "%dx%d", &w, &h); err == nil {
-			req.Width = w
-			req.Height = h
-		}
-	}
-	if r.Seed != nil {
-		if req.Options == nil {
-			req.Options = map[string]any{}
-		}
-		req.Options["seed"] = *r.Seed
-	}
-	return req
-}
-
-// ToImageGenerationResponse converts an Ollama GenerateResponse to an OpenAI ImageGenerationResponse.
-func ToImageGenerationResponse(resp api.GenerateResponse) ImageGenerationResponse {
-	var data []ImageURLOrData
-	if resp.Image != "" {
-		data = []ImageURLOrData{{B64JSON: resp.Image}}
-	}
-	return ImageGenerationResponse{
-		Created: resp.CreatedAt.Unix(),
-		Data:    data,
-	}
-}
--- a/openai/responses.go
+++ b/openai/responses.go
@@ -4,7 +4,6 @@ import (
 	"encoding/json"
 	"fmt"
 	"math/rand"
-	"time"

 	"github.com/ollama/ollama/api"
 )
@@ -266,9 +265,9 @@ type ResponsesText struct {
 type ResponsesTool struct {
 	Type        string         `json:"type"` // "function"
 	Name        string         `json:"name"`
-	Description *string        `json:"description"` // nullable but required
-	Strict      *bool          `json:"strict"`      // nullable but required
-	Parameters  map[string]any `json:"parameters"`  // nullable but required
+	Description string         `json:"description,omitempty"`
+	Strict      bool           `json:"strict,omitempty"`
+	Parameters  map[string]any `json:"parameters,omitempty"`
 }

 type ResponsesRequest struct {
@@ -476,16 +475,11 @@ func convertTool(t ResponsesTool) (api.Tool, error) {
 		}
 	}

-	var description string
-	if t.Description != nil {
-		description = *t.Description
-	}
-
 	return api.Tool{
 		Type: t.Type,
 		Function: api.ToolFunction{
 			Name:        t.Name,
-			Description: description,
+			Description: t.Description,
 			Parameters:  params,
 		},
 	}, nil
@@ -522,60 +516,17 @@ func convertInputMessage(m ResponsesInputMessage) (api.Message, error) {

 // Response types for the Responses API

-// ResponsesTextField represents the text output configuration in the response.
-type ResponsesTextField struct {
-	Format ResponsesTextFormat `json:"format"`
-}
-
-// ResponsesReasoningOutput represents reasoning configuration in the response.
-type ResponsesReasoningOutput struct {
-	Effort  *string `json:"effort,omitempty"`
-	Summary *string `json:"summary,omitempty"`
-}
-
-// ResponsesError represents an error in the response.
-type ResponsesError struct {
-	Code    string `json:"code"`
-	Message string `json:"message"`
-}
-
-// ResponsesIncompleteDetails represents details about why a response was incomplete.
-type ResponsesIncompleteDetails struct {
-	Reason string `json:"reason"`
-}
-
 type ResponsesResponse struct {
-	ID                 string                      `json:"id"`
-	Object             string                      `json:"object"`
-	CreatedAt          int64                       `json:"created_at"`
-	CompletedAt        *int64                      `json:"completed_at"`
-	Status             string                      `json:"status"`
-	IncompleteDetails  *ResponsesIncompleteDetails `json:"incomplete_details"`
-	Model              string                      `json:"model"`
-	PreviousResponseID *string                     `json:"previous_response_id"`
-	Instructions       *string                     `json:"instructions"`
-	Output             []ResponsesOutputItem       `json:"output"`
-	Error              *ResponsesError             `json:"error"`
-	Tools              []ResponsesTool             `json:"tools"`
-	ToolChoice         any                         `json:"tool_choice"`
-	Truncation         string                      `json:"truncation"`
-	ParallelToolCalls  bool                        `json:"parallel_tool_calls"`
-	Text               ResponsesTextField          `json:"text"`
-	TopP               float64                     `json:"top_p"`
-	PresencePenalty    float64                     `json:"presence_penalty"`
-	FrequencyPenalty   float64                     `json:"frequency_penalty"`
-	TopLogprobs        int                         `json:"top_logprobs"`
-	Temperature        float64                     `json:"temperature"`
-	Reasoning          *ResponsesReasoningOutput   `json:"reasoning"`
-	Usage              *ResponsesUsage             `json:"usage"`
-	MaxOutputTokens    *int                        `json:"max_output_tokens"`
-	MaxToolCalls       *int                        `json:"max_tool_calls"`
-	Store              bool                        `json:"store"`
-	Background         bool                        `json:"background"`
-	ServiceTier        string                      `json:"service_tier"`
-	Metadata           map[string]any              `json:"metadata"`
-	SafetyIdentifier   *string                     `json:"safety_identifier"`
-	PromptCacheKey     *string                     `json:"prompt_cache_key"`
+	ID        string                `json:"id"`
+	Object    string                `json:"object"`
+	CreatedAt int64                 `json:"created_at"`
+	Status    string                `json:"status"`
+	Model     string                `json:"model"`
+	Output    []ResponsesOutputItem `json:"output"`
+	Usage     *ResponsesUsage       `json:"usage,omitempty"`
+	// TODO(drifkin): add `temperature` and `top_p` to the response, but this
+	// requires additional plumbing to find the effective values since the
+	// defaults can come from the model or the request
 }

 type ResponsesOutputItem struct {
@@ -599,39 +550,18 @@ type ResponsesReasoningSummary struct {
 }

 type ResponsesOutputContent struct {
-	Type        string `json:"type"` // "output_text"
-	Text        string `json:"text"`
-	Annotations []any  `json:"annotations"`
-	Logprobs    []any  `json:"logprobs"`
-}
-
-type ResponsesInputTokensDetails struct {
-	CachedTokens int `json:"cached_tokens"`
-}
-
-type ResponsesOutputTokensDetails struct {
-	ReasoningTokens int `json:"reasoning_tokens"`
+	Type string `json:"type"` // "output_text"
+	Text string `json:"text"`
 }

 type ResponsesUsage struct {
-	InputTokens         int                          `json:"input_tokens"`
-	OutputTokens        int                          `json:"output_tokens"`
-	TotalTokens         int                          `json:"total_tokens"`
-	InputTokensDetails  ResponsesInputTokensDetails  `json:"input_tokens_details"`
-	OutputTokensDetails ResponsesOutputTokensDetails `json:"output_tokens_details"`
+	InputTokens  int `json:"input_tokens"`
+	OutputTokens int `json:"output_tokens"`
+	TotalTokens  int `json:"total_tokens"`
 }

-// derefFloat64 returns the value of a float64 pointer, or a default if nil.
-func derefFloat64(p *float64, def float64) float64 {
-	if p != nil {
-		return *p
-	}
-	return def
-}
-
-// ToResponse converts an api.ChatResponse to a Responses API response.
-// The request is used to echo back request parameters in the response.
-func ToResponse(model, responseID, itemID string, chatResponse api.ChatResponse, request ResponsesRequest) ResponsesResponse {
+// ToResponse converts an api.ChatResponse to a Responses API response
+func ToResponse(model, responseID, itemID string, chatResponse api.ChatResponse) ResponsesResponse {
 	var output []ResponsesOutputItem

 	// Add reasoning item if thinking is present
@@ -655,7 +585,6 @@ func ToResponse(model, responseID, itemID string, chatResponse api.ChatResponse,
 			output = append(output, ResponsesOutputItem{
 				ID:        fmt.Sprintf("fc_%s_%d", responseID, i),
 				Type:      "function_call",
-				Status:    "completed",
 				CallID:    tc.ID,
 				Name:      tc.Function.Name,
 				Arguments: tc.Function.Arguments,
@@ -669,90 +598,25 @@ func ToResponse(model, responseID, itemID string, chatResponse api.ChatResponse,
 			Role:   "assistant",
 			Content: []ResponsesOutputContent{
 				{
-					Type:        "output_text",
-					Text:        chatResponse.Message.Content,
-					Annotations: []any{},
-					Logprobs:    []any{},
+					Type: "output_text",
+					Text: chatResponse.Message.Content,
 				},
 			},
 		})
 	}

-	var instructions *string
-	if request.Instructions != "" {
-		instructions = &request.Instructions
-	}
-
-	// Build truncation with default
-	truncation := "disabled"
-	if request.Truncation != nil {
-		truncation = *request.Truncation
-	}
-
-	tools := request.Tools
-	if tools == nil {
-		tools = []ResponsesTool{}
-	}
-
-	text := ResponsesTextField{
-		Format: ResponsesTextFormat{Type: "text"},
-	}
-	if request.Text != nil && request.Text.Format != nil {
-		text.Format = *request.Text.Format
-	}
-
-	// Build reasoning output from request
-	var reasoning *ResponsesReasoningOutput
-	if request.Reasoning.Effort != "" || request.Reasoning.Summary != "" {
-		reasoning = &ResponsesReasoningOutput{}
-		if request.Reasoning.Effort != "" {
-			reasoning.Effort = &request.Reasoning.Effort
-		}
-		if request.Reasoning.Summary != "" {
-			reasoning.Summary = &request.Reasoning.Summary
-		}
-	}
-
 	return ResponsesResponse{
-		ID:                 responseID,
-		Object:             "response",
-		CreatedAt:          chatResponse.CreatedAt.Unix(),
-		CompletedAt:        nil, // Set by middleware when writing final response
-		Status:             "completed",
-		IncompleteDetails:  nil, // Only populated if response incomplete
-		Model:              model,
-		PreviousResponseID: nil, // Not supported
-		Instructions:       instructions,
-		Output:             output,
-		Error:              nil, // Only populated on failure
-		Tools:              tools,
-		ToolChoice:         "auto", // Default value
-		Truncation:         truncation,
-		ParallelToolCalls:  true, // Default value
-		Text:               text,
-		TopP:               derefFloat64(request.TopP, 1.0),
-		PresencePenalty:    0, // Default value
-		FrequencyPenalty:   0, // Default value
-		TopLogprobs:        0, // Default value
-		Temperature:        derefFloat64(request.Temperature, 1.0),
-		Reasoning:          reasoning,
+		ID:        responseID,
+		Object:    "response",
+		CreatedAt: chatResponse.CreatedAt.Unix(),
+		Status:    "completed",
+		Model:     model,
+		Output:    output,
 		Usage: &ResponsesUsage{
 			InputTokens:  chatResponse.PromptEvalCount,
 			OutputTokens: chatResponse.EvalCount,
 			TotalTokens:  chatResponse.PromptEvalCount + chatResponse.EvalCount,
-			// TODO(drifkin): wire through the actual values
-			InputTokensDetails: ResponsesInputTokensDetails{CachedTokens: 0},
-			// TODO(drifkin): wire through the actual values
-			OutputTokensDetails: ResponsesOutputTokensDetails{ReasoningTokens: 0},
 		},
-		MaxOutputTokens:  request.MaxOutputTokens,
-		MaxToolCalls:     nil,   // Not supported
-		Store:            false, // We don't store responses
-		Background:       request.Background,
-		ServiceTier:      "default", // Default value
-		Metadata:         map[string]any{},
-		SafetyIdentifier: nil, // Not supported
-		PromptCacheKey:   nil, // Not supported
 	}
 }

@@ -772,7 +636,6 @@ type ResponsesStreamConverter struct {
 	responseID string
 	itemID     string
 	model      string
-	request    ResponsesRequest

 	// State tracking (mutated across Process calls)
 	firstWrite      bool
@@ -805,12 +668,11 @@ func (c *ResponsesStreamConverter) newEvent(eventType string, data map[string]an
 }

 // NewResponsesStreamConverter creates a new converter with the given configuration.
-func NewResponsesStreamConverter(responseID, itemID, model string, request ResponsesRequest) *ResponsesStreamConverter {
+func NewResponsesStreamConverter(responseID, itemID, model string) *ResponsesStreamConverter {
 	return &ResponsesStreamConverter{
 		responseID: responseID,
 		itemID:     itemID,
 		model:      model,
-		request:    request,
 		firstWrite: true,
 	}
 }
@@ -855,120 +717,25 @@ func (c *ResponsesStreamConverter) Process(r api.ChatResponse) []ResponsesStream
 	return events
 }

-// buildResponseObject creates a full response object with all required fields for streaming events.
-func (c *ResponsesStreamConverter) buildResponseObject(status string, output []any, usage map[string]any) map[string]any {
-	var instructions any = nil
-	if c.request.Instructions != "" {
-		instructions = c.request.Instructions
-	}
-
-	truncation := "disabled"
-	if c.request.Truncation != nil {
-		truncation = *c.request.Truncation
-	}
-
-	var tools []any
-	if c.request.Tools != nil {
-		for _, t := range c.request.Tools {
-			tools = append(tools, map[string]any{
-				"type":        t.Type,
-				"name":        t.Name,
-				"description": t.Description,
-				"strict":      t.Strict,
-				"parameters":  t.Parameters,
-			})
-		}
-	}
-	if tools == nil {
-		tools = []any{}
-	}
-
-	textFormat := map[string]any{"type": "text"}
-	if c.request.Text != nil && c.request.Text.Format != nil {
-		textFormat = map[string]any{
-			"type": c.request.Text.Format.Type,
-		}
-		if c.request.Text.Format.Name != "" {
-			textFormat["name"] = c.request.Text.Format.Name
-		}
-		if c.request.Text.Format.Schema != nil {
-			textFormat["schema"] = c.request.Text.Format.Schema
-		}
-		if c.request.Text.Format.Strict != nil {
-			textFormat["strict"] = *c.request.Text.Format.Strict
-		}
-	}
-
-	var reasoning any = nil
-	if c.request.Reasoning.Effort != "" || c.request.Reasoning.Summary != "" {
-		r := map[string]any{}
-		if c.request.Reasoning.Effort != "" {
-			r["effort"] = c.request.Reasoning.Effort
-		} else {
-			r["effort"] = nil
-		}
-		if c.request.Reasoning.Summary != "" {
-			r["summary"] = c.request.Reasoning.Summary
-		} else {
-			r["summary"] = nil
-		}
-		reasoning = r
-	}
-
-	// Build top_p and temperature with defaults
-	topP := 1.0
-	if c.request.TopP != nil {
-		topP = *c.request.TopP
-	}
-	temperature := 1.0
-	if c.request.Temperature != nil {
-		temperature = *c.request.Temperature
-	}
-
-	return map[string]any{
-		"id":                   c.responseID,
-		"object":               "response",
-		"created_at":           time.Now().Unix(),
-		"completed_at":         nil,
-		"status":               status,
-		"incomplete_details":   nil,
-		"model":                c.model,
-		"previous_response_id": nil,
-		"instructions":         instructions,
-		"output":               output,
-		"error":                nil,
-		"tools":                tools,
-		"tool_choice":          "auto",
-		"truncation":           truncation,
-		"parallel_tool_calls":  true,
-		"text":                 map[string]any{"format": textFormat},
-		"top_p":                topP,
-		"presence_penalty":     0,
-		"frequency_penalty":    0,
-		"top_logprobs":         0,
-		"temperature":          temperature,
-		"reasoning":            reasoning,
-		"usage":                usage,
-		"max_output_tokens":    c.request.MaxOutputTokens,
-		"max_tool_calls":       nil,
-		"store":                false,
-		"background":           c.request.Background,
-		"service_tier":         "default",
-		"metadata":             map[string]any{},
-		"safety_identifier":    nil,
-		"prompt_cache_key":     nil,
-	}
-}
-
 func (c *ResponsesStreamConverter) createResponseCreatedEvent() ResponsesStreamEvent {
 	return c.newEvent("response.created", map[string]any{
-		"response": c.buildResponseObject("in_progress", []any{}, nil),
+		"response": map[string]any{
+			"id":     c.responseID,
+			"object": "response",
+			"status": "in_progress",
+			"output": []any{},
+		},
 	})
 }

 func (c *ResponsesStreamConverter) createResponseInProgressEvent() ResponsesStreamEvent {
 	return c.newEvent("response.in_progress", map[string]any{
-		"response": c.buildResponseObject("in_progress", []any{}, nil),
+		"response": map[string]any{
+			"id":     c.responseID,
+			"object": "response",
+			"status": "in_progress",
+			"output": []any{},
+		},
 	})
 }

@@ -995,10 +762,9 @@ func (c *ResponsesStreamConverter) processThinking(thinking string) []ResponsesS

 	// Emit delta
 	events = append(events, c.newEvent("response.reasoning_summary_text.delta", map[string]any{
-		"item_id":       c.reasoningItemID,
-		"output_index":  c.outputIndex,
-		"summary_index": 0,
-		"delta":         thinking,
+		"item_id":      c.reasoningItemID,
+		"output_index": c.outputIndex,
+		"delta":        thinking,
 	}))

 	// TODO(drifkin): consider adding
@@ -1017,10 +783,9 @@ func (c *ResponsesStreamConverter) finishReasoning() []ResponsesStreamEvent {

 	events := []ResponsesStreamEvent{
 		c.newEvent("response.reasoning_summary_text.done", map[string]any{
-			"item_id":       c.reasoningItemID,
-			"output_index":  c.outputIndex,
-			"summary_index": 0,
-			"text":          c.accumulatedThinking,
+			"item_id":      c.reasoningItemID,
+			"output_index": c.outputIndex,
+			"text":         c.accumulatedThinking,
 		}),
 		c.newEvent("response.output_item.done", map[string]any{
 			"output_index": c.outputIndex,
@@ -1133,10 +898,8 @@ func (c *ResponsesStreamConverter) processTextContent(content string) []Response
 			"output_index":  c.outputIndex,
 			"content_index": c.contentIndex,
 			"part": map[string]any{
-				"type":        "output_text",
-				"text":        "",
-				"annotations": []any{},
-				"logprobs":    []any{},
+				"type": "output_text",
+				"text": "",
 			},
 		}))
 	}
@@ -1150,7 +913,6 @@ func (c *ResponsesStreamConverter) processTextContent(content string) []Response
 		"output_index":  c.outputIndex,
 		"content_index": 0,
 		"delta":         content,
-		"logprobs":      []any{},
 	}))

 	return events
@@ -1182,10 +944,8 @@ func (c *ResponsesStreamConverter) buildFinalOutput() []any {
 			"status": "completed",
 			"role":   "assistant",
 			"content": []map[string]any{{
-				"type":        "output_text",
-				"text":        c.accumulatedText,
-				"annotations": []any{},
-				"logprobs":    []any{},
+				"type": "output_text",
+				"text": c.accumulatedText,
 			}},
 		})
 	}
@@ -1207,7 +967,6 @@ func (c *ResponsesStreamConverter) processCompletion(r api.ChatResponse) []Respo
 			"output_index":  c.outputIndex,
 			"content_index": 0,
 			"text":          c.accumulatedText,
-			"logprobs":      []any{},
 		}))

 		// response.content_part.done
@@ -1216,10 +975,8 @@ func (c *ResponsesStreamConverter) processCompletion(r api.ChatResponse) []Respo
 			"output_index":  c.outputIndex,
 			"content_index": 0,
 			"part": map[string]any{
-				"type":        "output_text",
-				"text":        c.accumulatedText,
-				"annotations": []any{},
-				"logprobs":    []any{},
+				"type": "output_text",
+				"text": c.accumulatedText,
 			},
 		}))

@@ -1232,31 +989,26 @@ func (c *ResponsesStreamConverter) processCompletion(r api.ChatResponse) []Respo
 				"status": "completed",
 				"role":   "assistant",
 				"content": []map[string]any{{
-					"type":        "output_text",
-					"text":        c.accumulatedText,
-					"annotations": []any{},
-					"logprobs":    []any{},
+					"type": "output_text",
+					"text": c.accumulatedText,
 				}},
 			},
 		}))
 	}

 	// response.completed
-	usage := map[string]any{
-		"input_tokens":  r.PromptEvalCount,
-		"output_tokens": r.EvalCount,
-		"total_tokens":  r.PromptEvalCount + r.EvalCount,
-		"input_tokens_details": map[string]any{
-			"cached_tokens": 0,
-		},
-		"output_tokens_details": map[string]any{
-			"reasoning_tokens": 0,
-		},
-	}
-	response := c.buildResponseObject("completed", c.buildFinalOutput(), usage)
-	response["completed_at"] = time.Now().Unix()
 	events = append(events, c.newEvent("response.completed", map[string]any{
-		"response": response,
+		"response": map[string]any{
+			"id":     c.responseID,
+			"object": "response",
+			"status": "completed",
+			"output": c.buildFinalOutput(),
+			"usage": map[string]any{
+				"input_tokens":  r.PromptEvalCount,
+				"output_tokens": r.EvalCount,
+				"total_tokens":  r.PromptEvalCount + r.EvalCount,
+			},
+		},
 	}))

 	return events
--- a/openai/responses_test.go
+++ b/openai/responses_test.go
@@ -850,7 +850,7 @@ func TestFromResponsesRequest_Images(t *testing.T) {
 }

 func TestResponsesStreamConverter_TextOnly(t *testing.T) {
-	converter := NewResponsesStreamConverter("resp_123", "msg_456", "gpt-oss:20b", ResponsesRequest{})
+	converter := NewResponsesStreamConverter("resp_123", "msg_456", "gpt-oss:20b")

 	// First chunk with content
 	events := converter.Process(api.ChatResponse{
@@ -916,7 +916,7 @@ func TestResponsesStreamConverter_TextOnly(t *testing.T) {
 }

 func TestResponsesStreamConverter_ToolCalls(t *testing.T) {
-	converter := NewResponsesStreamConverter("resp_123", "msg_456", "gpt-oss:20b", ResponsesRequest{})
+	converter := NewResponsesStreamConverter("resp_123", "msg_456", "gpt-oss:20b")

 	events := converter.Process(api.ChatResponse{
 		Message: api.Message{
@@ -952,7 +952,7 @@ func TestResponsesStreamConverter_ToolCalls(t *testing.T) {
 }

 func TestResponsesStreamConverter_Reasoning(t *testing.T) {
-	converter := NewResponsesStreamConverter("resp_123", "msg_456", "gpt-oss:20b", ResponsesRequest{})
+	converter := NewResponsesStreamConverter("resp_123", "msg_456", "gpt-oss:20b")

 	// First chunk with thinking
 	events := converter.Process(api.ChatResponse{
@@ -1267,7 +1267,7 @@ func TestToResponse_WithReasoning(t *testing.T) {
 			Content:  "The answer is 42",
 		},
 		Done: true,
-	}, ResponsesRequest{})
+	})

 	// Should have 2 output items: reasoning + message
 	if len(response.Output) != 2 {
@@ -1638,7 +1638,7 @@ func TestFromResponsesRequest_ShorthandFormats(t *testing.T) {

 func TestResponsesStreamConverter_OutputIncludesContent(t *testing.T) {
 	// Verify that response.output_item.done includes content field for messages
-	converter := NewResponsesStreamConverter("resp_123", "msg_456", "gpt-oss:20b", ResponsesRequest{})
+	converter := NewResponsesStreamConverter("resp_123", "msg_456", "gpt-oss:20b")

 	// First chunk
 	converter.Process(api.ChatResponse{
@@ -1686,7 +1686,7 @@ func TestResponsesStreamConverter_OutputIncludesContent(t *testing.T) {

 func TestResponsesStreamConverter_ResponseCompletedIncludesOutput(t *testing.T) {
 	// Verify that response.completed includes the output array
-	converter := NewResponsesStreamConverter("resp_123", "msg_456", "gpt-oss:20b", ResponsesRequest{})
+	converter := NewResponsesStreamConverter("resp_123", "msg_456", "gpt-oss:20b")

 	// Process some content
 	converter.Process(api.ChatResponse{
@@ -1730,7 +1730,7 @@ func TestResponsesStreamConverter_ResponseCompletedIncludesOutput(t *testing.T)

 func TestResponsesStreamConverter_ResponseCreatedIncludesOutput(t *testing.T) {
 	// Verify that response.created includes an empty output array
-	converter := NewResponsesStreamConverter("resp_123", "msg_456", "gpt-oss:20b", ResponsesRequest{})
+	converter := NewResponsesStreamConverter("resp_123", "msg_456", "gpt-oss:20b")

 	events := converter.Process(api.ChatResponse{
 		Message: api.Message{Content: "Hi"},
@@ -1757,7 +1757,7 @@ func TestResponsesStreamConverter_ResponseCreatedIncludesOutput(t *testing.T) {

 func TestResponsesStreamConverter_SequenceNumbers(t *testing.T) {
 	// Verify that events include incrementing sequence numbers
-	converter := NewResponsesStreamConverter("resp_123", "msg_456", "gpt-oss:20b", ResponsesRequest{})
+	converter := NewResponsesStreamConverter("resp_123", "msg_456", "gpt-oss:20b")

 	events := converter.Process(api.ChatResponse{
 		Message: api.Message{Content: "Hello"},
@@ -1791,7 +1791,7 @@ func TestResponsesStreamConverter_SequenceNumbers(t *testing.T) {

 func TestResponsesStreamConverter_FunctionCallStatus(t *testing.T) {
 	// Verify that function call items include status field
-	converter := NewResponsesStreamConverter("resp_123", "msg_456", "gpt-oss:20b", ResponsesRequest{})
+	converter := NewResponsesStreamConverter("resp_123", "msg_456", "gpt-oss:20b")

 	events := converter.Process(api.ChatResponse{
 		Message: api.Message{
--- a/readline/readline.go
+++ b/readline/readline.go
@@ -5,7 +5,6 @@ import (
 	"fmt"
 	"io"
 	"os"
-	"strings"
 )

 type Prompt struct {
@@ -37,11 +36,10 @@ type Terminal struct {
 }

 type Instance struct {
-	Prompt      *Prompt
-	Terminal    *Terminal
-	History     *History
-	Pasting     bool
-	pastedLines []string
+	Prompt   *Prompt
+	Terminal *Terminal
+	History  *History
+	Pasting  bool
 }

 func New(prompt Prompt) (*Instance, error) {
@@ -176,8 +174,6 @@ func (i *Instance) Readline() (string, error) {
 		case CharEsc:
 			esc = true
 		case CharInterrupt:
-			i.pastedLines = nil
-			i.Prompt.UseAlt = false
 			return "", ErrInterrupt
 		case CharPrev:
 			i.historyPrev(buf, &currentLineBuf)
@@ -192,23 +188,7 @@ func (i *Instance) Readline() (string, error) {
 		case CharForward:
 			buf.MoveRight()
 		case CharBackspace, CharCtrlH:
-			if buf.IsEmpty() && len(i.pastedLines) > 0 {
-				lastIdx := len(i.pastedLines) - 1
-				prevLine := i.pastedLines[lastIdx]
-				i.pastedLines = i.pastedLines[:lastIdx]
-				fmt.Print(CursorBOL + ClearToEOL + CursorUp + CursorBOL + ClearToEOL)
-				if len(i.pastedLines) == 0 {
-					fmt.Print(i.Prompt.Prompt)
-					i.Prompt.UseAlt = false
-				} else {
-					fmt.Print(i.Prompt.AltPrompt)
-				}
-				for _, r := range prevLine {
-					buf.Add(r)
-				}
-			} else {
-				buf.Remove()
-			}
+			buf.Remove()
 		case CharTab:
 			// todo: convert back to real tabs
 			for range 8 {
@@ -231,28 +211,13 @@ func (i *Instance) Readline() (string, error) {
 		case CharCtrlZ:
 			fd := os.Stdin.Fd()
 			return handleCharCtrlZ(fd, i.Terminal.termios)
-		case CharCtrlJ:
-			i.pastedLines = append(i.pastedLines, buf.String())
-			buf.Buf.Clear()
-			buf.Pos = 0
-			buf.DisplayPos = 0
-			buf.LineHasSpace.Clear()
-			fmt.Println()
-			fmt.Print(i.Prompt.AltPrompt)
-			i.Prompt.UseAlt = true
-			continue
-		case CharEnter:
+		case CharEnter, CharCtrlJ:
 			output := buf.String()
-			if len(i.pastedLines) > 0 {
-				output = strings.Join(i.pastedLines, "\n") + "\n" + output
-				i.pastedLines = nil
-			}
 			if output != "" {
 				i.History.Add(output)
 			}
 			buf.MoveToEnd()
 			fmt.Println()
-			i.Prompt.UseAlt = false

 			return output, nil
 		default:
--- a/scripts/build_darwin.sh
+++ b/scripts/build_darwin.sh
@@ -60,7 +60,7 @@ _build_darwin() {
            cmake --install $BUILD_DIR --component MLX
            # Override CGO flags to point to the amd64 build directory
            MLX_CGO_CFLAGS="-O3 -I$(pwd)/$BUILD_DIR/_deps/mlx-c-src -mmacosx-version-min=14.0"
-            MLX_CGO_LDFLAGS="-ldl -lc++ -framework Accelerate -mmacosx-version-min=14.0"
+            MLX_CGO_LDFLAGS="-L$(pwd)/$BUILD_DIR/lib/ollama -lmlxc -lmlx -Wl,-rpath,@executable_path -lc++ -framework Accelerate -mmacosx-version-min=14.0"
        else
            BUILD_DIR=build
            cmake --preset MLX \
@@ -71,12 +71,10 @@ _build_darwin() {
            cmake --install $BUILD_DIR --component MLX
            # Use default CGO flags from mlx.go for arm64
            MLX_CGO_CFLAGS="-O3 -I$(pwd)/$BUILD_DIR/_deps/mlx-c-src -mmacosx-version-min=14.0"
-            MLX_CGO_LDFLAGS="-lc++ -framework Metal -framework Foundation -framework Accelerate -mmacosx-version-min=14.0"
+            MLX_CGO_LDFLAGS="-L$(pwd)/$BUILD_DIR/lib/ollama -lmlxc -lmlx -Wl,-rpath,@executable_path -lc++ -framework Metal -framework Foundation -framework Accelerate -mmacosx-version-min=14.0"
        fi
-        GOOS=darwin GOARCH=$ARCH CGO_ENABLED=1 CGO_CFLAGS="$MLX_CGO_CFLAGS" CGO_LDFLAGS="$MLX_CGO_LDFLAGS" go build -tags mlx -o $INSTALL_PREFIX .
-        # Copy MLX libraries to same directory as executable for dlopen
-        cp $INSTALL_PREFIX/lib/ollama/libmlxc.dylib $INSTALL_PREFIX/
-        cp $INSTALL_PREFIX/lib/ollama/libmlx.dylib $INSTALL_PREFIX/
+        GOOS=darwin GOARCH=$ARCH CGO_ENABLED=1 CGO_CFLAGS="$MLX_CGO_CFLAGS" CGO_LDFLAGS="$MLX_CGO_LDFLAGS" go build -tags mlx -o $INSTALL_PREFIX/imagegen ./x/imagegen/cmd/engine
+        GOOS=darwin GOARCH=$ARCH CGO_ENABLED=1 go build -o $INSTALL_PREFIX .
    done
 }

@@ -84,17 +82,19 @@ _sign_darwin() {
    status "Creating universal binary..."
    mkdir -p dist/darwin
    lipo -create -output dist/darwin/ollama dist/darwin-*/ollama
+    lipo -create -output dist/darwin/imagegen dist/darwin-*/imagegen
    chmod +x dist/darwin/ollama
+    chmod +x dist/darwin/imagegen

    if [ -n "$APPLE_IDENTITY" ]; then
-        for F in dist/darwin/ollama dist/darwin-*/lib/ollama/*; do
+        for F in dist/darwin/ollama dist/darwin-*/lib/ollama/* dist/darwin/imagegen; do
            codesign -f --timestamp -s "$APPLE_IDENTITY" --identifier ai.ollama.ollama --options=runtime $F
        done

        # create a temporary zip for notarization
        TEMP=$(mktemp -u).zip
        ditto -c -k --keepParent dist/darwin/ollama "$TEMP"
-        xcrun notarytool submit "$TEMP" --wait --timeout 20m --apple-id $APPLE_ID --password $APPLE_PASSWORD --team-id $APPLE_TEAM_ID
+        xcrun notarytool submit "$TEMP" --wait --timeout 10m --apple-id $APPLE_ID --password $APPLE_PASSWORD --team-id $APPLE_TEAM_ID
        rm -f "$TEMP"
    fi

@@ -154,38 +154,38 @@ _build_macapp() {
    mkdir -p dist/Ollama.app/Contents/Resources
    if [ -d dist/darwin-amd64 ]; then
        lipo -create -output dist/Ollama.app/Contents/Resources/ollama dist/darwin-amd64/ollama dist/darwin-arm64/ollama
+        lipo -create -output dist/Ollama.app/Contents/Resources/imagegen dist/darwin-amd64/imagegen dist/darwin-arm64/imagegen
        for F in dist/darwin-amd64/lib/ollama/*mlx*.dylib ; do
            lipo -create -output dist/darwin/$(basename $F) $F dist/darwin-arm64/lib/ollama/$(basename $F)
        done
        cp dist/darwin-*/lib/ollama/*.so dist/darwin-*/lib/ollama/*.dylib dist/Ollama.app/Contents/Resources/
        cp dist/darwin/*.dylib dist/Ollama.app/Contents/Resources/
-        # Copy MLX metallib (architecture-independent, just use arm64 version)
-        cp dist/darwin-arm64/lib/ollama/*.metallib dist/Ollama.app/Contents/Resources/ 2>/dev/null || true
    else
        cp -a dist/darwin/ollama dist/Ollama.app/Contents/Resources/ollama
        cp dist/darwin/*.so dist/darwin/*.dylib dist/Ollama.app/Contents/Resources/
    fi
+    cp -a dist/darwin/imagegen dist/Ollama.app/Contents/Resources/imagegen
    chmod a+x dist/Ollama.app/Contents/Resources/ollama

    # Sign
    if [ -n "$APPLE_IDENTITY" ]; then
        codesign -f --timestamp -s "$APPLE_IDENTITY" --identifier ai.ollama.ollama --options=runtime dist/Ollama.app/Contents/Resources/ollama
-        for lib in dist/Ollama.app/Contents/Resources/*.so dist/Ollama.app/Contents/Resources/*.dylib dist/Ollama.app/Contents/Resources/*.metallib ; do
+        for lib in dist/Ollama.app/Contents/Resources/*.so dist/Ollama.app/Contents/Resources/*.dylib dist/Ollama.app/Contents/Resources/imagegen ; do
            codesign -f --timestamp -s "$APPLE_IDENTITY" --identifier ai.ollama.ollama --options=runtime ${lib}
        done
        codesign -f --timestamp -s "$APPLE_IDENTITY" --identifier com.electron.ollama --deep --options=runtime dist/Ollama.app
    fi

    rm -f dist/Ollama-darwin.zip
-    ditto -c -k --norsrc --keepParent dist/Ollama.app dist/Ollama-darwin.zip
-    (cd dist/Ollama.app/Contents/Resources/; tar -cf - ollama *.so *.dylib *.metallib 2>/dev/null) | gzip -9vc > dist/ollama-darwin.tgz
+    ditto -c -k --keepParent dist/Ollama.app dist/Ollama-darwin.zip
+    (cd dist/Ollama.app/Contents/Resources/; tar -cf - ollama imagegen *.so *.dylib) | gzip -9vc > dist/ollama-darwin.tgz

    # Notarize and Staple
    if [ -n "$APPLE_IDENTITY" ]; then
-        $(xcrun -f notarytool) submit dist/Ollama-darwin.zip --wait --timeout 20m --apple-id "$APPLE_ID" --password "$APPLE_PASSWORD" --team-id "$APPLE_TEAM_ID"
+        $(xcrun -f notarytool) submit dist/Ollama-darwin.zip --wait --timeout 10m --apple-id "$APPLE_ID" --password "$APPLE_PASSWORD" --team-id "$APPLE_TEAM_ID"
        rm -f dist/Ollama-darwin.zip
        $(xcrun -f stapler) staple dist/Ollama.app
-        ditto -c -k --norsrc --keepParent dist/Ollama.app dist/Ollama-darwin.zip
+        ditto -c -k --keepParent dist/Ollama.app dist/Ollama-darwin.zip

        rm -f dist/Ollama.dmg

@@ -206,7 +206,7 @@ _build_macapp() {
        rm -f dist/rw*.dmg

        codesign -f --timestamp -s "$APPLE_IDENTITY" --identifier ai.ollama.ollama --options=runtime dist/Ollama.dmg
-        $(xcrun -f notarytool) submit dist/Ollama.dmg --wait --timeout 20m --apple-id "$APPLE_ID" --password "$APPLE_PASSWORD" --team-id "$APPLE_TEAM_ID"
+        $(xcrun -f notarytool) submit dist/Ollama.dmg --wait --timeout 10m --apple-id "$APPLE_ID" --password "$APPLE_PASSWORD" --team-id "$APPLE_TEAM_ID"
        $(xcrun -f stapler) staple dist/Ollama.dmg
    else
        echo "WARNING: Code signing disabled, this bundle will not work for upgrade testing"
--- a/scripts/build_linux.sh
+++ b/scripts/build_linux.sh
@@ -48,12 +48,53 @@ if echo $PLATFORM | grep "amd64" > /dev/null; then
        .
 fi

+# Deduplicate CUDA libraries across mlx_* and cuda_* directories.
+# Usage: deduplicate_cuda_libs <base_dir>. A .so* file in lib/ollama/mlx_cuda_<ver>
+# with the same name and checksum as one in cuda_<ver> becomes a symlink to it.
+deduplicate_cuda_libs() {
+    local base_dir="$1"
+    echo "Deduplicating CUDA libraries in ${base_dir}..."
+
+    # Find all mlx_cuda_* directories
+    for mlx_dir in "${base_dir}"/lib/ollama/mlx_cuda_*; do
+        [ -d "${mlx_dir}" ] || continue
+
+        # Extract CUDA version (e.g., v12, v13)
+        cuda_version=$(basename "${mlx_dir}" | sed 's/mlx_cuda_//')
+        cuda_dir="${base_dir}/lib/ollama/cuda_${cuda_version}"
+
+        # Skip if corresponding cuda_* directory doesn't exist
+        [ -d "${cuda_dir}" ] || continue
+        echo "  Checking ${mlx_dir} against ${cuda_dir}..."
+
+        # Only top-level .so* files: the symlink target below is relative to mlx_dir.
+        # IFS= and -r keep whitespace and backslashes in file names intact.
+        find "${mlx_dir}" -maxdepth 1 -type f -name "*.so*" | while IFS= read -r mlx_file; do
+            filename=$(basename "${mlx_file}")
+            cuda_file="${cuda_dir}/${filename}"
+            # Skip if file doesn't exist in cuda directory
+            [ -f "${cuda_file}" ] || continue
+
+            # Compare checksums
+            mlx_sum=$(sha256sum "${mlx_file}" | awk '{print $1}')
+            cuda_sum=$(sha256sum "${cuda_file}" | awk '{print $1}')
+            if [ "${mlx_sum}" = "${cuda_sum}" ]; then
+                echo "    Deduplicating ${filename}"
+                # Calculate relative path from mlx_dir to cuda_dir
+                rel_path="../cuda_${cuda_version}/${filename}"
+                rm -f "${mlx_file}"
+                ln -s "${rel_path}" "${mlx_file}"
+            fi
+        done
+    done
+}
+
 # Run deduplication for each platform output directory
 if echo $PLATFORM | grep "," > /dev/null ; then
-    $(dirname $0)/deduplicate_cuda_libs.sh "./dist/linux_amd64"
-    $(dirname $0)/deduplicate_cuda_libs.sh "./dist/linux_arm64"
+    deduplicate_cuda_libs "./dist/linux_amd64"
+    deduplicate_cuda_libs "./dist/linux_arm64"
 elif echo $PLATFORM | grep "amd64\|arm64" > /dev/null ; then
-    $(dirname $0)/deduplicate_cuda_libs.sh "./dist"
+    deduplicate_cuda_libs "./dist"
 fi

 # buildx behavior changes for single vs. multiplatform
--- a/scripts/deduplicate_cuda_libs.sh
+++ b/scripts/deduplicate_cuda_libs.sh
@@ -1,60 +0,0 @@
-#!/bin/sh
-#
-# Deduplicate CUDA libraries across mlx_* and cuda_* directories
-# This script finds identical .so* files in mlx_cuda_* directories that exist
-# in corresponding cuda_* directories and replaces them with symlinks.
-#
-
-set -eu
-
-if [ $# -eq 0 ]; then
-    echo "ERROR: No directory specified" >&2
-    echo "Usage: $0 <base_directory>" >&2
-    exit 1
-fi
-
-base_dir="$1"
-
-if [ ! -d "${base_dir}" ]; then
-    echo "ERROR: Directory ${base_dir} does not exist" >&2
-    exit 1
-fi
-
-echo "Deduplicating CUDA libraries in ${base_dir}..."
-
-# Find all mlx_cuda_* directories
-for mlx_dir in "${base_dir}"/lib/ollama/mlx_cuda_*; do
-    [ -d "${mlx_dir}" ] || continue
-
-    # Extract CUDA version (e.g., v12, v13)
-    cuda_version=$(basename "${mlx_dir}" | sed 's/mlx_cuda_//')
-    cuda_dir="${base_dir}/lib/ollama/cuda_${cuda_version}"
-
-    # Skip if corresponding cuda_* directory doesn't exist
-    [ -d "${cuda_dir}" ] || continue
-
-    echo "  Checking ${mlx_dir} against ${cuda_dir}..."
-
-    # Find all .so* files in mlx directory
-    find "${mlx_dir}" -type f -name "*.so*" | while read mlx_file; do
-        filename=$(basename "${mlx_file}")
-        cuda_file="${cuda_dir}/${filename}"
-
-        # Skip if file doesn't exist in cuda directory
-        [ -f "${cuda_file}" ] || continue
-
-        # Compare checksums
-        mlx_sum=$(sha256sum "${mlx_file}" | awk '{print $1}')
-        cuda_sum=$(sha256sum "${cuda_file}" | awk '{print $1}')
-
-        if [ "${mlx_sum}" = "${cuda_sum}" ]; then
-            echo "    Deduplicating ${filename}"
-            # Calculate relative path from mlx_dir to cuda_dir
-            rel_path="../cuda_${cuda_version}/${filename}"
-            rm -f "${mlx_file}"
-            ln -s "${rel_path}" "${mlx_file}"
-        fi
-    done
-done
-
-echo "Deduplication complete"
--- a/server/auth.go
+++ b/server/auth.go
@@ -50,17 +50,12 @@ func (r registryChallenge) URL() (*url.URL, error) {
 	return redirectURL, nil
 }

-func getAuthorizationToken(ctx context.Context, challenge registryChallenge, originalHost string) (string, error) {
+func getAuthorizationToken(ctx context.Context, challenge registryChallenge) (string, error) {
 	redirectURL, err := challenge.URL()
 	if err != nil {
 		return "", err
 	}

-	// Validate that the realm host matches the original request host to prevent sending tokens cross-origin.
-	if redirectURL.Host != originalHost {
-		return "", fmt.Errorf("realm host %q does not match original host %q", redirectURL.Host, originalHost)
-	}
-
 	sha256sum := sha256.Sum256(nil)
 	data := []byte(fmt.Sprintf("%s,%s,%s", http.MethodGet, redirectURL.String(), base64.StdEncoding.EncodeToString([]byte(hex.EncodeToString(sha256sum[:])))))

--- a/server/auth_test.go
+++ b/server/auth_test.go
@@ -1,113 +0,0 @@
-package server
-
-import (
-	"context"
-	"strings"
-	"testing"
-	"time"
-)
-
-func TestGetAuthorizationTokenRejectsCrossDomain(t *testing.T) {
-	tests := []struct {
-		realm        string
-		originalHost string
-		wantMismatch bool
-	}{
-		{"https://example.com/token", "example.com", false},
-		{"https://example.com/token", "other.com", true},
-		{"https://example.com/token", "localhost:8000", true},
-		{"https://localhost:5000/token", "localhost:5000", false},
-		{"https://localhost:5000/token", "localhost:6000", true},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.originalHost, func(t *testing.T) {
-			ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
-			defer cancel()
-
-			challenge := registryChallenge{Realm: tt.realm, Service: "test", Scope: "repo:x:pull"}
-			_, err := getAuthorizationToken(ctx, challenge, tt.originalHost)
-
-			isMismatch := err != nil && strings.Contains(err.Error(), "does not match")
-			if tt.wantMismatch && !isMismatch {
-				t.Errorf("expected domain mismatch error, got: %v", err)
-			}
-			if !tt.wantMismatch && isMismatch {
-				t.Errorf("unexpected domain mismatch error: %v", err)
-			}
-		})
-	}
-}
-
-func TestParseRegistryChallenge(t *testing.T) {
-	tests := []struct {
-		input                             string
-		wantRealm, wantService, wantScope string
-	}{
-		{
-			`Bearer realm="https://auth.example.com/token",service="registry",scope="repo:foo:pull"`,
-			"https://auth.example.com/token", "registry", "repo:foo:pull",
-		},
-		{
-			`Bearer realm="https://r.ollama.ai/v2/token",service="ollama",scope="-"`,
-			"https://r.ollama.ai/v2/token", "ollama", "-",
-		},
-		{"", "", "", ""},
-	}
-
-	for _, tt := range tests {
-		result := parseRegistryChallenge(tt.input)
-		if result.Realm != tt.wantRealm || result.Service != tt.wantService || result.Scope != tt.wantScope {
-			t.Errorf("parseRegistryChallenge(%q) = {%q, %q, %q}, want {%q, %q, %q}",
-				tt.input, result.Realm, result.Service, result.Scope,
-				tt.wantRealm, tt.wantService, tt.wantScope)
-		}
-	}
-}
-
-func TestRegistryChallengeURL(t *testing.T) {
-	challenge := registryChallenge{
-		Realm:   "https://auth.example.com/token",
-		Service: "registry",
-		Scope:   "repo:foo:pull repo:bar:push",
-	}
-
-	u, err := challenge.URL()
-	if err != nil {
-		t.Fatalf("URL() error: %v", err)
-	}
-
-	if u.Host != "auth.example.com" {
-		t.Errorf("host = %q, want %q", u.Host, "auth.example.com")
-	}
-	if u.Path != "/token" {
-		t.Errorf("path = %q, want %q", u.Path, "/token")
-	}
-
-	q := u.Query()
-	if q.Get("service") != "registry" {
-		t.Errorf("service = %q, want %q", q.Get("service"), "registry")
-	}
-	if scopes := q["scope"]; len(scopes) != 2 {
-		t.Errorf("scope count = %d, want 2", len(scopes))
-	}
-	if q.Get("ts") == "" {
-		t.Error("missing ts")
-	}
-	if q.Get("nonce") == "" {
-		t.Error("missing nonce")
-	}
-
-	// Nonces should differ between calls
-	u2, _ := challenge.URL()
-	if q.Get("nonce") == u2.Query().Get("nonce") {
-		t.Error("nonce should be unique per call")
-	}
-}
-
-func TestRegistryChallengeURLInvalid(t *testing.T) {
-	challenge := registryChallenge{Realm: "://invalid"}
-	if _, err := challenge.URL(); err == nil {
-		t.Error("expected error for invalid URL")
-	}
-}
--- a/server/images.go
+++ b/server/images.go
@@ -41,7 +41,6 @@ var (
 	errCapabilityVision     = errors.New("vision")
 	errCapabilityEmbedding  = errors.New("embedding")
 	errCapabilityThinking   = errors.New("thinking")
-	errCapabilityImage      = errors.New("image generation")
 	errInsecureProtocol     = errors.New("insecure protocol http")
 )

@@ -77,7 +76,7 @@ func (m *Model) Capabilities() []model.Capability {

 	// Check for image generation model via config capabilities
 	if slices.Contains(m.Config.Capabilities, "image") {
-		return []model.Capability{model.CapabilityImage}
+		return []model.Capability{model.CapabilityImageGeneration}
 	}

 	// Check for completion capability
@@ -160,7 +159,6 @@ func (m *Model) CheckCapabilities(want ...model.Capability) error {
 		model.CapabilityVision:     errCapabilityVision,
 		model.CapabilityEmbedding:  errCapabilityEmbedding,
 		model.CapabilityThinking:   errCapabilityThinking,
-		model.CapabilityImage:      errCapabilityImage,
 	}

 	for _, cap := range want {
@@ -777,7 +775,7 @@ func pullWithTransfer(ctx context.Context, mp ModelPath, layers []Layer, manifes
 			Realm:   challenge.Realm,
 			Service: challenge.Service,
 			Scope:   challenge.Scope,
-		}, base.Host)
+		})
 	}

 	if err := transfer.Download(ctx, transfer.DownloadOptions{
@@ -852,7 +850,7 @@ func pushWithTransfer(ctx context.Context, mp ModelPath, layers []Layer, manifes
 			Realm:   challenge.Realm,
 			Service: challenge.Service,
 			Scope:   challenge.Scope,
-		}, base.Host)
+		})
 	}

 	return transfer.Upload(ctx, transfer.UploadOptions{
@@ -918,7 +916,7 @@ func makeRequestWithRetry(ctx context.Context, method string, requestURL *url.UR

 			// Handle authentication error with one retry
 			challenge := parseRegistryChallenge(resp.Header.Get("www-authenticate"))
-			token, err := getAuthorizationToken(ctx, challenge, requestURL.Host)
+			token, err := getAuthorizationToken(ctx, challenge)
 			if err != nil {
 				return nil, err
 			}
--- a/server/images_test.go
+++ b/server/images_test.go
@@ -54,7 +54,7 @@ func TestModelCapabilities(t *testing.T) {
 					Capabilities: []string{"image"},
 				},
 			},
-			expectedCaps: []model.Capability{model.CapabilityImage},
+			expectedCaps: []model.Capability{model.CapabilityImageGeneration},
 		},
 		{
 			name: "model with completion capability",
@@ -242,24 +242,6 @@ func TestModelCheckCapabilities(t *testing.T) {
 			checkCaps:      []model.Capability{"unknown"},
 			expectedErrMsg: "unknown capability",
 		},
-		{
-			name: "model missing image generation capability",
-			model: Model{
-				ModelPath: completionModelPath,
-				Template:  chatTemplate,
-			},
-			checkCaps:      []model.Capability{model.CapabilityImage},
-			expectedErrMsg: "does not support image generation",
-		},
-		{
-			name: "model with image generation capability",
-			model: Model{
-				Config: model.ConfigV2{
-					Capabilities: []string{"image"},
-				},
-			},
-			checkCaps: []model.Capability{model.CapabilityImage},
-		},
 	}

 	for _, tt := range tests {
--- a/server/manifest.go
+++ b/server/manifest.go
@@ -47,37 +47,13 @@ func (m *Manifest) Remove() error {
 }

 func (m *Manifest) RemoveLayers() error {
-	ms, err := Manifests(true)
-	if err != nil {
-		return err
-	}
-
-	// Build set of digests still in use by other manifests
-	inUse := make(map[string]struct{})
-	for _, other := range ms {
-		for _, layer := range append(other.Layers, other.Config) {
-			if layer.Digest != "" {
-				inUse[layer.Digest] = struct{}{}
-			}
-		}
-	}
-
-	// Remove layers not used by any other manifest
 	for _, layer := range append(m.Layers, m.Config) {
-		if layer.Digest == "" {
-			continue
-		}
-		if _, used := inUse[layer.Digest]; used {
-			continue
-		}
-		blob, err := GetBlobsPath(layer.Digest)
-		if err != nil {
-			return err
-		}
-		if err := os.Remove(blob); errors.Is(err, os.ErrNotExist) {
-			slog.Debug("layer does not exist", "digest", layer.Digest)
-		} else if err != nil {
-			return err
+		if layer.Digest != "" {
+			if err := layer.Remove(); errors.Is(err, os.ErrNotExist) {
+				slog.Debug("layer does not exist", "digest", layer.Digest)
+			} else if err != nil {
+				return err
+			}
 		}
 	}

--- a/server/quantization.go
+++ b/server/quantization.go
@@ -198,8 +198,8 @@ func newType(t *fsggml.Tensor, kv fsggml.KV, qs *quantizeState, ftype fsggml.Fil
 	name := t.Name
 	quantize := strings.HasSuffix(name, "weight")

-	// don't quantize vision encoder tensors (named with "v." prefix)
-	quantize = quantize && !strings.HasPrefix(name, "v.")
+	// don't quantize vision tensors (name contains "v."); names containing "_v." (e.g. attn_v.weight) are not excluded by this rule
+	quantize = quantize && (!strings.Contains(name, "v.") || strings.Contains(name, "_v."))
 	quantize = quantize && !strings.Contains(name, "mm.")

 	// quantize only 2D and 3D tensors (experts)
@@ -219,9 +219,6 @@ func newType(t *fsggml.Tensor, kv fsggml.KV, qs *quantizeState, ftype fsggml.Fil
 	// NOTE: can't use LLM_TN here because the layer number is not known
 	quantize = quantize && !strings.Contains(name, "ssm_conv1d.weight")

-	// do not quantize LFM2's shortconv kernel weights
-	quantize = quantize && !strings.Contains(name, "shortconv.conv.weight")
-
 	// do not quantize RWKV's time_mix_first tensors
 	quantize = quantize && !strings.Contains(name, "time_mix_first.weight")
 	quantize = quantize && !strings.Contains(name, "time_mix_w1.weight")
--- a/server/routes.go
+++ b/server/routes.go
@@ -51,7 +51,7 @@ import (
 	"github.com/ollama/ollama/types/model"
 	"github.com/ollama/ollama/version"
 	"github.com/ollama/ollama/x/imagegen"
-	xserver "github.com/ollama/ollama/x/server"
+	imagegenapi "github.com/ollama/ollama/x/imagegen/api"
 )

 const signinURLStr = "https://ollama.com/connect?name=%s&key=%s"
@@ -164,6 +164,29 @@ func (s *Server) scheduleRunner(ctx context.Context, name string, caps []model.C
 	return runner.llama, model, &opts, nil
 }

+// ScheduleImageGenRunner asks the scheduler for a runner serving the image
+// generation model modelName and returns the runner it delivers, or its error.
+// This implements the imagegenapi.RunnerScheduler interface.
+func (s *Server) ScheduleImageGenRunner(c *gin.Context, modelName string, opts api.Options, keepAlive *api.Duration) (llm.LlamaServer, error) {
+	// For image generation the ModelPath is just the model name, so the same
+	// string fills every identifying field.
+	imageModel := &Model{
+		Name:      modelName,
+		ShortName: modelName,
+		ModelPath: modelName,
+		Config:    model.ConfigV2{Capabilities: []string{"image"}},
+	}
+
+	ctx := c.Request.Context()
+	loadedCh, failedCh := s.sched.GetRunner(ctx, imageModel, opts, keepAlive)
+	select {
+	case ref := <-loadedCh:
+		return ref.llama, nil
+	case err := <-failedCh:
+		return nil, err
+	}
+}
+
 func signinURL() (string, error) {
 	pubKey, err := auth.GetPublicKey()
 	if err != nil {
@@ -191,6 +214,12 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 		return
 	}

+	// Check if this is a known image generation model
+	if imagegen.ResolveModelName(req.Model) != "" {
+		imagegenapi.HandleGenerateRequest(c, s, req.Model, req.Prompt, req.KeepAlive, streamResponse)
+		return
+	}
+
 	name := model.ParseName(req.Model)
 	if !name.IsValid() {
 		// Ideally this is "invalid model name" but we're keeping with
@@ -315,7 +344,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 		return
 	}

-	// expire the runner if unload is requested (empty prompt, keep alive is 0)
+	// expire the runner when an unload is requested (empty prompt and a keep-alive of 0)
 	if req.Prompt == "" && req.KeepAlive != nil && req.KeepAlive.Duration == 0 {
 		s.sched.expireRunner(m)

@@ -329,12 +358,6 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 		return
 	}

-	// Handle image generation models
-	if slices.Contains(m.Capabilities(), model.CapabilityImage) {
-		s.handleImageGenerate(c, req, name.String(), checkpointStart)
-		return
-	}
-
 	if req.Raw && (req.Template != "" || req.System != "" || len(req.Context) > 0) {
 		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "raw mode does not support template, system, or context"})
 		return
@@ -1101,31 +1124,6 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
 		QuantizationLevel: m.Config.FileType,
 	}

-	// For image generation models, populate details from imagegen package
-	if slices.Contains(m.Capabilities(), model.CapabilityImage) {
-		if info, err := imagegen.GetModelInfo(name.String()); err == nil {
-			modelDetails.Family = info.Architecture
-			modelDetails.ParameterSize = format.HumanNumber(uint64(info.ParameterCount))
-			modelDetails.QuantizationLevel = info.Quantization
-		}
-	}
-
-	// For safetensors LLM models (experimental), populate details from config.json
-	if m.Config.ModelFormat == "safetensors" && slices.Contains(m.Config.Capabilities, "completion") {
-		if info, err := xserver.GetSafetensorsLLMInfo(name.String()); err == nil {
-			if arch, ok := info["general.architecture"].(string); ok && arch != "" {
-				modelDetails.Family = arch
-			}
-			if paramCount, ok := info["general.parameter_count"].(int64); ok && paramCount > 0 {
-				modelDetails.ParameterSize = format.HumanNumber(uint64(paramCount))
-			}
-		}
-		// Get torch_dtype directly from config.json for quantization level
-		if dtype, err := xserver.GetSafetensorsDtype(name.String()); err == nil && dtype != "" {
-			modelDetails.QuantizationLevel = dtype
-		}
-	}
-
 	if req.System != "" {
 		m.System = req.System
 	}
@@ -1149,9 +1147,6 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
 		Capabilities: m.Capabilities(),
 		ModifiedAt:   manifest.fi.ModTime(),
 		Requires:     m.Config.Requires,
-		// Several integrations crash on a nil/omitempty+empty ModelInfo, so by
-		// default we return an empty map.
-		ModelInfo: make(map[string]any),
 	}

 	if m.Config.RemoteHost != "" {
@@ -1211,30 +1206,6 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
 		return resp, nil
 	}

-	if slices.Contains(m.Capabilities(), model.CapabilityImage) {
-		// Populate tensor info if verbose
-		if req.Verbose {
-			if tensors, err := xserver.GetSafetensorsTensorInfo(name.String()); err == nil {
-				resp.Tensors = tensors
-			}
-		}
-		return resp, nil
-	}
-
-	// For safetensors LLM models (experimental), populate ModelInfo from config.json
-	if m.Config.ModelFormat == "safetensors" && slices.Contains(m.Config.Capabilities, "completion") {
-		if info, err := xserver.GetSafetensorsLLMInfo(name.String()); err == nil {
-			resp.ModelInfo = info
-		}
-		// Populate tensor info if verbose
-		if req.Verbose {
-			if tensors, err := xserver.GetSafetensorsTensorInfo(name.String()); err == nil {
-				resp.Tensors = tensors
-			}
-		}
-		return resp, nil
-	}
-
 	kvData, tensors, err := getModelData(m.ModelPath, req.Verbose)
 	if err != nil {
 		return nil, err
@@ -1603,12 +1574,13 @@ func (s *Server) GenerateRoutes(rc *ollama.Registry) (http.Handler, error) {
 	r.GET("/v1/models", middleware.ListMiddleware(), s.ListHandler)
 	r.GET("/v1/models/:model", middleware.RetrieveMiddleware(), s.ShowHandler)
 	r.POST("/v1/responses", middleware.ResponsesMiddleware(), s.ChatHandler)
-	// OpenAI-compatible image generation endpoint
-	r.POST("/v1/images/generations", middleware.ImageGenerationsMiddleware(), s.GenerateHandler)

 	// Inference (Anthropic compatibility)
 	r.POST("/v1/messages", middleware.AnthropicMessagesMiddleware(), s.ChatHandler)

+	// Experimental image generation support
+	imagegenapi.RegisterRoutes(r, s)
+
 	if rc != nil {
 		// wrap old with new
 		rs := &registry.Local{
@@ -2087,14 +2059,8 @@ func (s *Server) ChatHandler(c *gin.Context) {
 		}
 	} else {
 		if req.Think != nil && req.Think.Bool() {
-			// Set think to nil when being used with Anthropic API to connect to tools like claude code
-			if _, ok := c.Get("relax_thinking"); ok {
-				slog.Warn("model does not support thinking, relaxing thinking to nil", "model", req.Model)
-				req.Think = nil
-			} else {
-				c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("%q does not support thinking", req.Model)})
-				return
-			}
+			c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("%q does not support thinking", req.Model)})
+			return
 		}
 	}

@@ -2475,91 +2441,3 @@ func filterThinkTags(msgs []api.Message, m *Model) []api.Message {
 	}
 	return msgs
 }
-
-// handleImageGenerate handles image generation requests within GenerateHandler.
-// This is called when the model has the Image capability.
-func (s *Server) handleImageGenerate(c *gin.Context, req api.GenerateRequest, modelName string, checkpointStart time.Time) {
-	// Validate image dimensions
-	const maxDimension int32 = 4096
-	if req.Width > maxDimension || req.Height > maxDimension {
-		c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("width and height must be <= %d", maxDimension)})
-		return
-	}
-
-	// Schedule the runner for image generation
-	runner, _, _, err := s.scheduleRunner(c.Request.Context(), modelName, []model.Capability{model.CapabilityImage}, nil, req.KeepAlive)
-	if err != nil {
-		handleScheduleError(c, req.Model, err)
-		return
-	}
-
-	checkpointLoaded := time.Now()
-
-	// Handle load-only request (empty prompt)
-	if req.Prompt == "" {
-		c.JSON(http.StatusOK, api.GenerateResponse{
-			Model:      req.Model,
-			CreatedAt:  time.Now().UTC(),
-			Done:       true,
-			DoneReason: "load",
-		})
-		return
-	}
-
-	// Set headers for streaming response
-	c.Header("Content-Type", "application/x-ndjson")
-
-	// Get seed from options if provided
-	var seed int64
-	if s, ok := req.Options["seed"]; ok {
-		switch v := s.(type) {
-		case int:
-			seed = int64(v)
-		case int64:
-			seed = v
-		case float64:
-			seed = int64(v)
-		}
-	}
-
-	var streamStarted bool
-	if err := runner.Completion(c.Request.Context(), llm.CompletionRequest{
-		Prompt: req.Prompt,
-		Width:  req.Width,
-		Height: req.Height,
-		Steps:  req.Steps,
-		Seed:   seed,
-	}, func(cr llm.CompletionResponse) {
-		streamStarted = true
-		res := api.GenerateResponse{
-			Model:     req.Model,
-			CreatedAt: time.Now().UTC(),
-			Done:      cr.Done,
-		}
-
-		if cr.TotalSteps > 0 {
-			res.Completed = int64(cr.Step)
-			res.Total = int64(cr.TotalSteps)
-		}
-
-		if cr.Image != "" {
-			res.Image = cr.Image
-		}
-
-		if cr.Done {
-			res.DoneReason = cr.DoneReason.String()
-			res.Metrics.TotalDuration = time.Since(checkpointStart)
-			res.Metrics.LoadDuration = checkpointLoaded.Sub(checkpointStart)
-		}
-
-		data, _ := json.Marshal(res)
-		c.Writer.Write(append(data, '\n'))
-		c.Writer.Flush()
-	}); err != nil {
-		// Only send JSON error if streaming hasn't started yet
-		// (once streaming starts, headers are committed and we can't change status code)
-		if !streamStarted {
-			c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
-		}
-	}
-}
--- a/server/routes_generate_test.go
+++ b/server/routes_generate_test.go
@@ -2101,95 +2101,3 @@ func TestChatWithPromptEndingInThinkTag(t *testing.T) {
 		}
 	})
 }
-
-func TestGenerateUnload(t *testing.T) {
-	gin.SetMode(gin.TestMode)
-
-	var loadFnCalled bool
-
-	s := Server{
-		sched: &Scheduler{
-			pendingReqCh:    make(chan *LlmRequest, 1),
-			finishedReqCh:   make(chan *LlmRequest, 1),
-			expiredCh:       make(chan *runnerRef, 1),
-			unloadedCh:      make(chan any, 1),
-			loaded:          make(map[string]*runnerRef),
-			newServerFn:     newMockServer(&mockRunner{}),
-			getGpuFn:        getGpuFn,
-			getSystemInfoFn: getSystemInfoFn,
-			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
-				loadFnCalled = true
-				req.successCh <- &runnerRef{llama: &mockRunner{}}
-				return false
-			},
-		},
-	}
-
-	go s.sched.Run(t.Context())
-
-	_, digest := createBinFile(t, ggml.KV{
-		"general.architecture":          "llama",
-		"llama.block_count":             uint32(1),
-		"llama.context_length":          uint32(8192),
-		"llama.embedding_length":        uint32(4096),
-		"llama.attention.head_count":    uint32(32),
-		"llama.attention.head_count_kv": uint32(8),
-		"tokenizer.ggml.tokens":         []string{""},
-		"tokenizer.ggml.scores":         []float32{0},
-		"tokenizer.ggml.token_type":     []int32{0},
-	}, []*ggml.Tensor{
-		{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.ffn_gate.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.ffn_up.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.ffn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.attn_k.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.attn_output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.attn_q.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.attn_v.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-	})
-
-	w := createRequest(t, s.CreateHandler, api.CreateRequest{
-		Model:  "test",
-		Files:  map[string]string{"file.gguf": digest},
-		Stream: &stream,
-	})
-
-	if w.Code != http.StatusOK {
-		t.Fatalf("expected status 200, got %d", w.Code)
-	}
-
-	t.Run("unload with empty prompt and keepalive 0", func(t *testing.T) {
-		loadFnCalled = false
-
-		w := createRequest(t, s.GenerateHandler, api.GenerateRequest{
-			Model:     "test",
-			Prompt:    "",
-			KeepAlive: &api.Duration{Duration: 0},
-			Stream:    &stream,
-		})
-
-		if w.Code != http.StatusOK {
-			t.Errorf("expected status 200, got %d", w.Code)
-		}
-
-		var resp api.GenerateResponse
-		if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
-			t.Fatalf("failed to unmarshal response: %v", err)
-		}
-
-		if resp.DoneReason != "unload" {
-			t.Errorf("expected done_reason 'unload', got %q", resp.DoneReason)
-		}
-
-		if !resp.Done {
-			t.Error("expected done to be true")
-		}
-
-		if loadFnCalled {
-			t.Error("expected model NOT to be loaded for unload request, but loadFn was called")
-		}
-	})
-}
--- a/server/sched.go
+++ b/server/sched.go
@@ -574,8 +574,7 @@ func (s *Scheduler) loadImageGen(req *LlmRequest) bool {
 		Options:         &req.opts,
 		loading:         false,
 		sessionDuration: sessionDuration,
-		totalSize:       server.TotalSize(),
-		vramSize:        server.VRAMSize(),
+		refCount:        1,
 	}

 	s.loadedMu.Lock()
--- a/server/sched_test.go
+++ b/server/sched_test.go
@@ -6,6 +6,7 @@ import (
 	"errors"
 	"log/slog"
 	"os"
+	"slices"
 	"testing"
 	"time"

@@ -16,6 +17,7 @@ import (
 	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/types/model"
 )

 func TestMain(m *testing.M) {
@@ -805,8 +807,32 @@ func (s *mockLlm) GetDeviceInfos(ctx context.Context) []ml.DeviceInfo { return n
 func (s *mockLlm) HasExited() bool                                    { return false }
 func (s *mockLlm) GetActiveDeviceIDs() []ml.DeviceID                  { return nil }

+// TestImageGenCapabilityDetection checks for the "image" entry in a model's
+// configured capabilities, the marker that separates image generation models
+// from language models.
+func TestImageGenCapabilityDetection(t *testing.T) {
+	cases := []struct {
+		name     string
+		caps     []string
+		expected bool
+	}{
+		// Model with image capability should be detected
+		{name: "image", caps: []string{"image"}, expected: true},
+		// Model without image capability should not be detected
+		{name: "completion", caps: []string{"completion"}, expected: false},
+		// Empty capabilities should not match
+		{name: "empty", caps: nil, expected: false},
+	}
+
+	for _, tc := range cases {
+		candidate := &Model{Config: model.ConfigV2{Capabilities: tc.caps}}
+		found := slices.Contains(candidate.Config.Capabilities, "image")
+		require.Equal(t, tc.expected, found, tc.name)
+	}
+}
+
 // TestImageGenRunnerCanBeEvicted verifies that an image generation model
-// loaded in the scheduler can be evicted when idle.
+// loaded in the scheduler can be evicted by a language model request.
 func TestImageGenRunnerCanBeEvicted(t *testing.T) {
 	ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond)
 	defer done()
@@ -838,59 +864,3 @@ func TestImageGenRunnerCanBeEvicted(t *testing.T) {
 	require.NotNil(t, runner)
 	require.Equal(t, "/fake/image/model", runner.modelPath)
 }
-
-// TestImageGenSchedulerCoexistence verifies that image generation models
-// can coexist with language models in the scheduler and VRAM is tracked correctly.
-func TestImageGenSchedulerCoexistence(t *testing.T) {
-	ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond)
-	defer done()
-
-	s := InitScheduler(ctx)
-	s.getGpuFn = getGpuFn
-	s.getSystemInfoFn = getSystemInfoFn
-
-	// Load both an imagegen runner and a language model runner
-	imageGenRunner := &runnerRef{
-		model:           &Model{Name: "flux", ModelPath: "/fake/flux/model"},
-		modelPath:       "/fake/flux/model",
-		llama:           &mockLlm{vramSize: 8 * format.GigaByte, vramByGPU: map[ml.DeviceID]uint64{{Library: "Metal"}: 8 * format.GigaByte}},
-		sessionDuration: 10 * time.Millisecond,
-		numParallel:     1,
-		refCount:        0,
-	}
-
-	langModelRunner := &runnerRef{
-		model:           &Model{Name: "llama3", ModelPath: "/fake/llama3/model"},
-		modelPath:       "/fake/llama3/model",
-		llama:           &mockLlm{vramSize: 4 * format.GigaByte, vramByGPU: map[ml.DeviceID]uint64{{Library: "Metal"}: 4 * format.GigaByte}},
-		sessionDuration: 10 * time.Millisecond,
-		numParallel:     1,
-		refCount:        0,
-	}
-
-	s.loadedMu.Lock()
-	s.loaded["/fake/flux/model"] = imageGenRunner
-	s.loaded["/fake/llama3/model"] = langModelRunner
-	s.loadedMu.Unlock()
-
-	// Verify both are loaded
-	s.loadedMu.Lock()
-	require.Len(t, s.loaded, 2)
-	require.NotNil(t, s.loaded["/fake/flux/model"])
-	require.NotNil(t, s.loaded["/fake/llama3/model"])
-	s.loadedMu.Unlock()
-
-	// Verify updateFreeSpace accounts for both
-	gpus := []ml.DeviceInfo{
-		{
-			DeviceID:    ml.DeviceID{Library: "Metal"},
-			TotalMemory: 24 * format.GigaByte,
-			FreeMemory:  24 * format.GigaByte,
-		},
-	}
-	s.updateFreeSpace(gpus)
-
-	// Free memory should be reduced by both models
-	expectedFree := uint64(24*format.GigaByte) - uint64(8*format.GigaByte) - uint64(4*format.GigaByte)
-	require.Equal(t, expectedFree, gpus[0].FreeMemory)
-}
--- a/server/upload.go
+++ b/server/upload.go
@@ -279,7 +279,7 @@ func (b *blobUpload) uploadPart(ctx context.Context, method string, requestURL *
 	case resp.StatusCode == http.StatusUnauthorized:
 		w.Rollback()
 		challenge := parseRegistryChallenge(resp.Header.Get("www-authenticate"))
-		token, err := getAuthorizationToken(ctx, challenge, requestURL.Host)
+		token, err := getAuthorizationToken(ctx, challenge)
 		if err != nil {
 			return err
 		}
--- a/types/model/capability.go
+++ b/types/model/capability.go
@@ -9,7 +9,7 @@ const (
 	CapabilityVision          = Capability("vision")
 	CapabilityEmbedding       = Capability("embedding")
 	CapabilityThinking        = Capability("thinking")
-	CapabilityImage = Capability("image")
+	CapabilityImageGeneration = Capability("image")
 )

 func (c Capability) String() string {
--- a/x/README.md
+++ b/x/README.md
@@ -0,0 +1,24 @@
+# Experimental Features 
+
+## MLX Backend
+
+We're working on a new experimental backend based on the [MLX project](https://github.com/ml-explore/mlx).
+
+Support is currently limited to macOS and to Linux with CUDA GPUs.  We're looking to add support for Windows with CUDA, and for other GPU vendors, soon.  To build:
+
+```
+cmake --preset MLX
+cmake --build --preset MLX --parallel
+cmake --install build --component MLX
+go build -tags mlx .
+```
+
+On Linux, use the preset "MLX CUDA 13" or "MLX CUDA 12" to enable CUDA with the default Ollama NVIDIA GPU architectures.
+
+## Image Generation
+
+Based on the experimental MLX backend, we're working on adding image generation (imagegen) support.  After running the cmake commands above, build the image generation engine:
+
+```
+go build -o imagegen ./x/imagegen/cmd/engine
+```
--- a/x/agent/approval.go
+++ b/x/agent/approval.go
@@ -41,7 +41,6 @@ var optionLabels = []string{
 var toolDisplayNames = map[string]string{
 	"bash":       "Bash",
 	"web_search": "Web Search",
-	"web_fetch":  "Web Fetch",
 }

 // ToolDisplayName returns the human-readable display name for a tool.
@@ -566,16 +565,6 @@ func formatToolDisplay(toolName string, args map[string]any) string {
 		}
 	}

-	// For web fetch, show URL and internet notice
-	if toolName == "web_fetch" {
-		if url, ok := args["url"].(string); ok {
-			sb.WriteString(fmt.Sprintf("Tool: %s\n", displayName))
-			sb.WriteString(fmt.Sprintf("URL: %s\n", url))
-			sb.WriteString("Uses internet via ollama.com")
-			return sb.String()
-		}
-	}
-
 	// Generic display
 	sb.WriteString(fmt.Sprintf("Tool: %s", displayName))
 	if len(args) > 0 {
@@ -1028,16 +1017,6 @@ func FormatApprovalResult(toolName string, args map[string]any, result ApprovalR
 		}
 	}

-	if toolName == "web_fetch" {
-		if url, ok := args["url"].(string); ok {
-			// Truncate long URLs
-			if len(url) > 50 {
-				url = url[:47] + "..."
-			}
-			return fmt.Sprintf("\033[1m%s:\033[0m %s: %s", label, displayName, url)
-		}
-	}
-
 	return fmt.Sprintf("\033[1m%s:\033[0m %s", label, displayName)
 }

--- a/x/cmd/run.go
+++ b/x/cmd/run.go
@@ -9,7 +9,6 @@ import (
 	"net/url"
 	"os"
 	"os/signal"
-	"slices"
 	"strings"
 	"syscall"
 	"time"
@@ -131,7 +130,6 @@ type RunOptions struct {
 	KeepAlive    *api.Duration
 	Think        *api.ThinkValue
 	HideThinking bool
-	Verbose      bool

 	// Agent fields (managed externally for session persistence)
 	Tools    *tools.Registry
@@ -180,7 +178,6 @@ func Chat(ctx context.Context, opts RunOptions) (*api.Message, error) {
 	var thinkTagClosed bool = false
 	var pendingToolCalls []api.ToolCall
 	var consecutiveErrors int // Track consecutive 500 errors for retry limit
-	var latest api.ChatResponse

 	role := "assistant"
 	messages := opts.Messages
@@ -190,7 +187,6 @@ func Chat(ctx context.Context, opts RunOptions) (*api.Message, error) {
 			p.StopAndClear()
 		}

-		latest = response
 		role = response.Message.Role
 		if response.Message.Thinking != "" && !opts.HideThinking {
 			if !thinkTagOpened {
@@ -487,10 +483,6 @@ func Chat(ctx context.Context, opts RunOptions) (*api.Message, error) {
 		fmt.Println()
 	}

-	if opts.Verbose {
-		latest.Summary()
-	}
-
 	return &api.Message{Role: role, Thinking: thinkingContent.String(), Content: fullResponse.String()}, nil
 }

@@ -642,13 +634,12 @@ func checkModelCapabilities(ctx context.Context, modelName string) (supportsTool
 // GenerateInteractive runs an interactive agent session.
 // This is called from cmd.go when --experimental flag is set.
 // If yoloMode is true, all tool approvals are skipped.
-// If enableWebsearch is true, the web search tool is registered.
-func GenerateInteractive(cmd *cobra.Command, modelName string, wordWrap bool, options map[string]any, think *api.ThinkValue, hideThinking bool, keepAlive *api.Duration, yoloMode bool, enableWebsearch bool) error {
+func GenerateInteractive(cmd *cobra.Command, modelName string, wordWrap bool, options map[string]any, think *api.ThinkValue, hideThinking bool, keepAlive *api.Duration, yoloMode bool) error {
 	scanner, err := readline.New(readline.Prompt{
 		Prompt:         ">>> ",
 		AltPrompt:      "... ",
 		Placeholder:    "Send a message (/? for help)",
-		AltPlaceholder: "Press Enter to send",
+		AltPlaceholder: `Use """ to end multi-line input`,
 	})
 	if err != nil {
 		return err
@@ -669,12 +660,6 @@ func GenerateInteractive(cmd *cobra.Command, modelName string, wordWrap bool, op
 	if supportsTools {
 		toolRegistry = tools.DefaultRegistry()

-		// Register web search and web fetch tools if enabled via flag
-		if enableWebsearch {
-			toolRegistry.RegisterWebSearch()
-			toolRegistry.RegisterWebFetch()
-		}
-
 		if toolRegistry.Has("bash") {
 			fmt.Fprintln(os.Stderr)
 			fmt.Fprintln(os.Stderr, "This experimental version of Ollama has the \033[1mbash\033[0m tool enabled.")
@@ -682,11 +667,6 @@ func GenerateInteractive(cmd *cobra.Command, modelName string, wordWrap bool, op
 			fmt.Fprintln(os.Stderr)
 		}

-		if toolRegistry.Has("web_search") || toolRegistry.Has("web_fetch") {
-			fmt.Fprintln(os.Stderr, "The \033[1mWeb Search\033[0m and \033[1mWeb Fetch\033[0m tools are enabled. Models can search and fetch web content via ollama.com.")
-			fmt.Fprintln(os.Stderr)
-		}
-
 		if yoloMode {
 			fmt.Fprintf(os.Stderr, "\033[1mwarning:\033[0m yolo mode - all tool approvals will be skipped\n")
 		}
@@ -697,8 +677,6 @@ func GenerateInteractive(cmd *cobra.Command, modelName string, wordWrap bool, op

 	var messages []api.Message
 	var sb strings.Builder
-	var format string
-	var system string

 	for {
 		line, err := scanner.Readline()
@@ -710,7 +688,6 @@ func GenerateInteractive(cmd *cobra.Command, modelName string, wordWrap bool, op
 			if line == "" {
 				fmt.Println("\nUse Ctrl + d or /bye to exit.")
 			}
-			scanner.Prompt.UseAlt = false
 			sb.Reset()
 			continue
 		case err != nil:
@@ -730,10 +707,6 @@ func GenerateInteractive(cmd *cobra.Command, modelName string, wordWrap bool, op
 			continue
 		case strings.HasPrefix(line, "/help"), strings.HasPrefix(line, "/?"):
 			fmt.Fprintln(os.Stderr, "Available Commands:")
-			fmt.Fprintln(os.Stderr, "  /set            Set session variables")
-			fmt.Fprintln(os.Stderr, "  /show           Show model information")
-			fmt.Fprintln(os.Stderr, "  /load           Load a different model")
-			fmt.Fprintln(os.Stderr, "  /save           Save session as a model")
 			fmt.Fprintln(os.Stderr, "  /tools          Show available tools and approvals")
 			fmt.Fprintln(os.Stderr, "  /clear          Clear session context and approvals")
 			fmt.Fprintln(os.Stderr, "  /bye            Exit")
@@ -743,280 +716,6 @@ func GenerateInteractive(cmd *cobra.Command, modelName string, wordWrap bool, op
 			fmt.Fprintln(os.Stderr, "  Ctrl+O          Expand last tool output")
 			fmt.Fprintln(os.Stderr, "")
 			continue
-		case strings.HasPrefix(line, "/set"):
-			args := strings.Fields(line)
-			if len(args) > 1 {
-				switch args[1] {
-				case "history":
-					scanner.HistoryEnable()
-				case "nohistory":
-					scanner.HistoryDisable()
-				case "wordwrap":
-					wordWrap = true
-					fmt.Println("Set 'wordwrap' mode.")
-				case "nowordwrap":
-					wordWrap = false
-					fmt.Println("Set 'nowordwrap' mode.")
-				case "verbose":
-					if err := cmd.Flags().Set("verbose", "true"); err != nil {
-						return err
-					}
-					fmt.Println("Set 'verbose' mode.")
-				case "quiet":
-					if err := cmd.Flags().Set("verbose", "false"); err != nil {
-						return err
-					}
-					fmt.Println("Set 'quiet' mode.")
-				case "think":
-					thinkValue := api.ThinkValue{Value: true}
-					var maybeLevel string
-					if len(args) > 2 {
-						maybeLevel = args[2]
-					}
-					if maybeLevel != "" {
-						thinkValue.Value = maybeLevel
-					}
-					think = &thinkValue
-					// Check if model supports thinking
-					if client, err := api.ClientFromEnvironment(); err == nil {
-						if resp, err := client.Show(cmd.Context(), &api.ShowRequest{Model: modelName}); err == nil {
-							if !slices.Contains(resp.Capabilities, model.CapabilityThinking) {
-								fmt.Fprintf(os.Stderr, "warning: model %q does not support thinking output\n", modelName)
-							}
-						}
-					}
-					if maybeLevel != "" {
-						fmt.Printf("Set 'think' mode to '%s'.\n", maybeLevel)
-					} else {
-						fmt.Println("Set 'think' mode.")
-					}
-				case "nothink":
-					think = &api.ThinkValue{Value: false}
-					// Check if model supports thinking
-					if client, err := api.ClientFromEnvironment(); err == nil {
-						if resp, err := client.Show(cmd.Context(), &api.ShowRequest{Model: modelName}); err == nil {
-							if !slices.Contains(resp.Capabilities, model.CapabilityThinking) {
-								fmt.Fprintf(os.Stderr, "warning: model %q does not support thinking output\n", modelName)
-							}
-						}
-					}
-					fmt.Println("Set 'nothink' mode.")
-				case "format":
-					if len(args) < 3 || args[2] != "json" {
-						fmt.Println("Invalid or missing format. For 'json' mode use '/set format json'")
-					} else {
-						format = args[2]
-						fmt.Printf("Set format to '%s' mode.\n", args[2])
-					}
-				case "noformat":
-					format = ""
-					fmt.Println("Disabled format.")
-				case "parameter":
-					if len(args) < 4 {
-						fmt.Println("Usage: /set parameter <name> <value>")
-						continue
-					}
-					params := args[3:]
-					fp, err := api.FormatParams(map[string][]string{args[2]: params})
-					if err != nil {
-						fmt.Printf("Couldn't set parameter: %q\n", err)
-						continue
-					}
-					fmt.Printf("Set parameter '%s' to '%s'\n", args[2], strings.Join(params, ", "))
-					options[args[2]] = fp[args[2]]
-				case "system":
-					if len(args) < 3 {
-						fmt.Println("Usage: /set system <message>")
-						continue
-					}
-
-					system = strings.Join(args[2:], " ")
-					newMessage := api.Message{Role: "system", Content: system}
-					if len(messages) > 0 && messages[len(messages)-1].Role == "system" {
-						messages[len(messages)-1] = newMessage
-					} else {
-						messages = append(messages, newMessage)
-					}
-					fmt.Println("Set system message.")
-					continue
-				default:
-					fmt.Printf("Unknown command '/set %s'. Type /? for help\n", args[1])
-				}
-			} else {
-				fmt.Println("Usage: /set <parameter|system|history|format|wordwrap|think|verbose> [value]")
-			}
-			continue
-		case strings.HasPrefix(line, "/show"):
-			args := strings.Fields(line)
-			if len(args) > 1 {
-				client, err := api.ClientFromEnvironment()
-				if err != nil {
-					fmt.Println("error: couldn't connect to ollama server")
-					continue
-				}
-				req := &api.ShowRequest{
-					Name:    modelName,
-					Options: options,
-				}
-				resp, err := client.Show(cmd.Context(), req)
-				if err != nil {
-					fmt.Println("error: couldn't get model")
-					continue
-				}
-
-				switch args[1] {
-				case "info":
-					fmt.Fprintf(os.Stderr, "  Model\n")
-					fmt.Fprintf(os.Stderr, "    %-16s %s\n", "Name", modelName)
-					if resp.Details.Family != "" {
-						fmt.Fprintf(os.Stderr, "    %-16s %s\n", "Family", resp.Details.Family)
-					}
-					if resp.Details.ParameterSize != "" {
-						fmt.Fprintf(os.Stderr, "    %-16s %s\n", "Parameter Size", resp.Details.ParameterSize)
-					}
-					if resp.Details.QuantizationLevel != "" {
-						fmt.Fprintf(os.Stderr, "    %-16s %s\n", "Quantization", resp.Details.QuantizationLevel)
-					}
-					if len(resp.Capabilities) > 0 {
-						caps := make([]string, len(resp.Capabilities))
-						for i, c := range resp.Capabilities {
-							caps[i] = string(c)
-						}
-						fmt.Fprintf(os.Stderr, "    %-16s %s\n", "Capabilities", strings.Join(caps, ", "))
-					}
-					fmt.Fprintln(os.Stderr)
-				case "license":
-					if resp.License == "" {
-						fmt.Println("No license was specified for this model.")
-					} else {
-						fmt.Println(resp.License)
-					}
-				case "modelfile":
-					fmt.Println(resp.Modelfile)
-				case "parameters":
-					fmt.Println("Model defined parameters:")
-					if resp.Parameters == "" {
-						fmt.Println("  No additional parameters were specified.")
-					} else {
-						for _, l := range strings.Split(resp.Parameters, "\n") {
-							fmt.Printf("  %s\n", l)
-						}
-					}
-					if len(options) > 0 {
-						fmt.Println("\nUser defined parameters:")
-						for k, v := range options {
-							fmt.Printf("  %-30s %v\n", k, v)
-						}
-					}
-				case "system":
-					switch {
-					case system != "":
-						fmt.Println(system + "\n")
-					case resp.System != "":
-						fmt.Println(resp.System + "\n")
-					default:
-						fmt.Println("No system message was specified for this model.")
-					}
-				case "template":
-					if resp.Template != "" {
-						fmt.Println(resp.Template)
-					} else {
-						fmt.Println("No prompt template was specified for this model.")
-					}
-				default:
-					fmt.Printf("Unknown command '/show %s'. Type /? for help\n", args[1])
-				}
-			} else {
-				fmt.Println("Usage: /show <info|license|modelfile|parameters|system|template>")
-			}
-			continue
-		case strings.HasPrefix(line, "/load"):
-			args := strings.Fields(line)
-			if len(args) != 2 {
-				fmt.Println("Usage: /load <modelname>")
-				continue
-			}
-			newModelName := args[1]
-			fmt.Printf("Loading model '%s'\n", newModelName)
-
-			// Create progress spinner
-			p := progress.NewProgress(os.Stderr)
-			spinner := progress.NewSpinner("")
-			p.Add("", spinner)
-
-			// Get client
-			client, err := api.ClientFromEnvironment()
-			if err != nil {
-				p.StopAndClear()
-				fmt.Println("error: couldn't connect to ollama server")
-				continue
-			}
-
-			// Check if model exists and get its info
-			info, err := client.Show(cmd.Context(), &api.ShowRequest{Model: newModelName})
-			if err != nil {
-				p.StopAndClear()
-				if strings.Contains(err.Error(), "not found") {
-					fmt.Printf("Couldn't find model '%s'\n", newModelName)
-				} else {
-					fmt.Printf("error: %v\n", err)
-				}
-				continue
-			}
-
-			// For cloud models, no need to preload
-			if info.RemoteHost == "" {
-				// Preload the model by sending an empty generate request
-				req := &api.GenerateRequest{
-					Model: newModelName,
-					Think: think,
-				}
-				err = client.Generate(cmd.Context(), req, func(r api.GenerateResponse) error {
-					return nil
-				})
-				if err != nil {
-					p.StopAndClear()
-					if strings.Contains(err.Error(), "not found") {
-						fmt.Printf("Couldn't find model '%s'\n", newModelName)
-					} else if strings.Contains(err.Error(), "does not support thinking") {
-						fmt.Printf("error: %v\n", err)
-					} else {
-						fmt.Printf("error loading model: %v\n", err)
-					}
-					continue
-				}
-			}
-
-			p.StopAndClear()
-			modelName = newModelName
-			messages = []api.Message{}
-			approval.Reset()
-			continue
-		case strings.HasPrefix(line, "/save"):
-			args := strings.Fields(line)
-			if len(args) != 2 {
-				fmt.Println("Usage: /save <modelname>")
-				continue
-			}
-			client, err := api.ClientFromEnvironment()
-			if err != nil {
-				fmt.Println("error: couldn't connect to ollama server")
-				continue
-			}
-			req := &api.CreateRequest{
-				Model:      args[1],
-				From:       modelName,
-				Parameters: options,
-				Messages:   messages,
-			}
-			fn := func(resp api.ProgressResponse) error { return nil }
-			err = client.Create(cmd.Context(), req, fn)
-			if err != nil {
-				fmt.Printf("error: %v\n", err)
-				continue
-			}
-			fmt.Printf("Created new model '%s'\n", args[1])
-			continue
 		case strings.HasPrefix(line, "/"):
 			fmt.Printf("Unknown command '%s'. Type /? for help\n", strings.Fields(line)[0])
 			continue
@@ -1028,12 +727,10 @@ func GenerateInteractive(cmd *cobra.Command, modelName string, wordWrap bool, op
 			newMessage := api.Message{Role: "user", Content: sb.String()}
 			messages = append(messages, newMessage)

-			verbose, _ := cmd.Flags().GetBool("verbose")
 			opts := RunOptions{
 				Model:        modelName,
 				Messages:     messages,
 				WordWrap:     wordWrap,
-				Format:       format,
 				Options:      options,
 				Think:        think,
 				HideThinking: hideThinking,
@@ -1041,7 +738,6 @@ func GenerateInteractive(cmd *cobra.Command, modelName string, wordWrap bool, op
 				Tools:        toolRegistry,
 				Approval:     approval,
 				YoloMode:     yoloMode,
-				Verbose:      verbose,
 			}

 			assistant, err := Chat(cmd.Context(), opts)
--- a/x/create/client/create.go
+++ b/x/create/client/create.go
@@ -1,282 +0,0 @@
-// Package client provides client-side model creation for safetensors-based models.
-//
-// This package is in x/ because the safetensors model storage format is under development.
-// It also exists to break an import cycle: server imports x/create, so x/create
-// cannot import server. This sub-package can import server because server doesn't
-// import it.
-package client
-
-import (
-	"bytes"
-	"encoding/json"
-	"fmt"
-	"io"
-
-	"github.com/ollama/ollama/progress"
-	"github.com/ollama/ollama/server"
-	"github.com/ollama/ollama/types/model"
-	"github.com/ollama/ollama/x/create"
-)
-
-// MinOllamaVersion is the minimum Ollama version required for safetensors models.
-const MinOllamaVersion = "0.14.0"
-
-// ModelfileConfig holds configuration extracted from a Modelfile.
-type ModelfileConfig struct {
-	Template string
-	System   string
-	License  string
-}
-
-// CreateOptions holds all options for model creation.
-type CreateOptions struct {
-	ModelName string
-	ModelDir  string
-	Quantize  string           // "fp8" for quantization
-	Modelfile *ModelfileConfig // template/system/license from Modelfile
-}
-
-// CreateModel imports a model from a local directory.
-// This creates blobs and manifest directly on disk, bypassing the HTTP API.
-// Automatically detects model type (safetensors LLM vs image gen) and routes accordingly.
-func CreateModel(opts CreateOptions, p *progress.Progress) error {
-	// Detect model type
-	isSafetensors := create.IsSafetensorsModelDir(opts.ModelDir)
-	isImageGen := create.IsTensorModelDir(opts.ModelDir)
-
-	if !isSafetensors && !isImageGen {
-		return fmt.Errorf("%s is not a supported model directory (needs config.json + *.safetensors or model_index.json)", opts.ModelDir)
-	}
-
-	// Determine model type settings
-	var modelType, spinnerKey string
-	var capabilities []string
-	if isSafetensors {
-		modelType = "safetensors model"
-		spinnerKey = "create"
-		capabilities = []string{"completion"}
-	} else {
-		modelType = "image generation model"
-		spinnerKey = "imagegen"
-		capabilities = []string{"image"}
-	}
-
-	// Set up progress spinner
-	statusMsg := "importing " + modelType
-	spinner := progress.NewSpinner(statusMsg)
-	p.Add(spinnerKey, spinner)
-
-	progressFn := func(msg string) {
-		spinner.Stop()
-		statusMsg = msg
-		spinner = progress.NewSpinner(statusMsg)
-		p.Add(spinnerKey, spinner)
-	}
-
-	// Create the model using shared callbacks
-	var err error
-	if isSafetensors {
-		err = create.CreateSafetensorsModel(
-			opts.ModelName, opts.ModelDir, opts.Quantize,
-			newLayerCreator(), newTensorLayerCreator(),
-			newManifestWriter(opts, capabilities),
-			progressFn,
-		)
-	} else {
-		err = create.CreateImageGenModel(
-			opts.ModelName, opts.ModelDir, opts.Quantize,
-			newLayerCreator(), newTensorLayerCreator(),
-			newManifestWriter(opts, capabilities),
-			progressFn,
-		)
-	}
-
-	spinner.Stop()
-	if err != nil {
-		return err
-	}
-
-	fmt.Printf("Created %s '%s'\n", modelType, opts.ModelName)
-	return nil
-}
-
-// newLayerCreator returns a LayerCreator callback for creating config/JSON layers.
-func newLayerCreator() create.LayerCreator {
-	return func(r io.Reader, mediaType, name string) (create.LayerInfo, error) {
-		layer, err := server.NewLayer(r, mediaType)
-		if err != nil {
-			return create.LayerInfo{}, err
-		}
-
-		return create.LayerInfo{
-			Digest:    layer.Digest,
-			Size:      layer.Size,
-			MediaType: layer.MediaType,
-			Name:      name,
-		}, nil
-	}
-}
-
-// newTensorLayerCreator returns a QuantizingTensorLayerCreator callback for creating tensor layers.
-// When quantize is non-empty, returns multiple layers (weight + scales + optional qbias).
-func newTensorLayerCreator() create.QuantizingTensorLayerCreator {
-	return func(r io.Reader, name, dtype string, shape []int32, quantize string) ([]create.LayerInfo, error) {
-		if quantize != "" {
-			return createQuantizedLayers(r, name, dtype, shape, quantize)
-		}
-		return createUnquantizedLayer(r, name)
-	}
-}
-
-// createQuantizedLayers quantizes a tensor and returns the resulting layers.
-func createQuantizedLayers(r io.Reader, name, dtype string, shape []int32, quantize string) ([]create.LayerInfo, error) {
-	if !QuantizeSupported() {
-		return nil, fmt.Errorf("quantization requires MLX support")
-	}
-
-	// Quantize the tensor
-	qweightData, scalesData, qbiasData, _, _, _, err := quantizeTensor(r, name, dtype, shape, quantize)
-	if err != nil {
-		return nil, fmt.Errorf("failed to quantize %s: %w", name, err)
-	}
-
-	// Create layer for quantized weight
-	weightLayer, err := server.NewLayer(bytes.NewReader(qweightData), server.MediaTypeImageTensor)
-	if err != nil {
-		return nil, err
-	}
-
-	// Create layer for scales
-	scalesLayer, err := server.NewLayer(bytes.NewReader(scalesData), server.MediaTypeImageTensor)
-	if err != nil {
-		return nil, err
-	}
-
-	layers := []create.LayerInfo{
-		{
-			Digest:    weightLayer.Digest,
-			Size:      weightLayer.Size,
-			MediaType: weightLayer.MediaType,
-			Name:      name,
-		},
-		{
-			Digest:    scalesLayer.Digest,
-			Size:      scalesLayer.Size,
-			MediaType: scalesLayer.MediaType,
-			Name:      name + "_scale",
-		},
-	}
-
-	// Add qbiases layer if present (affine mode)
-	if qbiasData != nil {
-		qbiasLayer, err := server.NewLayer(bytes.NewReader(qbiasData), server.MediaTypeImageTensor)
-		if err != nil {
-			return nil, err
-		}
-		layers = append(layers, create.LayerInfo{
-			Digest:    qbiasLayer.Digest,
-			Size:      qbiasLayer.Size,
-			MediaType: qbiasLayer.MediaType,
-			Name:      name + "_qbias",
-		})
-	}
-
-	return layers, nil
-}
-
-// createUnquantizedLayer creates a single tensor layer without quantization.
-func createUnquantizedLayer(r io.Reader, name string) ([]create.LayerInfo, error) {
-	layer, err := server.NewLayer(r, server.MediaTypeImageTensor)
-	if err != nil {
-		return nil, err
-	}
-
-	return []create.LayerInfo{
-		{
-			Digest:    layer.Digest,
-			Size:      layer.Size,
-			MediaType: layer.MediaType,
-			Name:      name,
-		},
-	}, nil
-}
-
-// newManifestWriter returns a ManifestWriter callback for writing the model manifest.
-func newManifestWriter(opts CreateOptions, capabilities []string) create.ManifestWriter {
-	return func(modelName string, config create.LayerInfo, layers []create.LayerInfo) error {
-		name := model.ParseName(modelName)
-		if !name.IsValid() {
-			return fmt.Errorf("invalid model name: %s", modelName)
-		}
-
-		// Create config blob with version requirement
-		configData := model.ConfigV2{
-			ModelFormat:  "safetensors",
-			Capabilities: capabilities,
-			Requires:     MinOllamaVersion,
-		}
-		configJSON, err := json.Marshal(configData)
-		if err != nil {
-			return fmt.Errorf("failed to marshal config: %w", err)
-		}
-
-		// Create config layer blob
-		configLayer, err := server.NewLayer(bytes.NewReader(configJSON), "application/vnd.docker.container.image.v1+json")
-		if err != nil {
-			return fmt.Errorf("failed to create config layer: %w", err)
-		}
-
-		// Convert LayerInfo to server.Layer
-		serverLayers := make([]server.Layer, 0, len(layers))
-		for _, l := range layers {
-			serverLayers = append(serverLayers, server.Layer{
-				MediaType: l.MediaType,
-				Digest:    l.Digest,
-				Size:      l.Size,
-				Name:      l.Name,
-			})
-		}
-
-		// Add Modelfile layers if present
-		if opts.Modelfile != nil {
-			modelfileLayers, err := createModelfileLayers(opts.Modelfile)
-			if err != nil {
-				return err
-			}
-			serverLayers = append(serverLayers, modelfileLayers...)
-		}
-
-		return server.WriteManifest(name, configLayer, serverLayers)
-	}
-}
-
-// createModelfileLayers creates layers for template, system, and license from Modelfile config.
-func createModelfileLayers(mf *ModelfileConfig) ([]server.Layer, error) {
-	var layers []server.Layer
-
-	if mf.Template != "" {
-		layer, err := server.NewLayer(bytes.NewReader([]byte(mf.Template)), "application/vnd.ollama.image.template")
-		if err != nil {
-			return nil, fmt.Errorf("failed to create template layer: %w", err)
-		}
-		layers = append(layers, layer)
-	}
-
-	if mf.System != "" {
-		layer, err := server.NewLayer(bytes.NewReader([]byte(mf.System)), "application/vnd.ollama.image.system")
-		if err != nil {
-			return nil, fmt.Errorf("failed to create system layer: %w", err)
-		}
-		layers = append(layers, layer)
-	}
-
-	if mf.License != "" {
-		layer, err := server.NewLayer(bytes.NewReader([]byte(mf.License)), "application/vnd.ollama.image.license")
-		if err != nil {
-			return nil, fmt.Errorf("failed to create license layer: %w", err)
-		}
-		layers = append(layers, layer)
-	}
-
-	return layers, nil
-}
--- a/x/create/client/create_test.go
+++ b/x/create/client/create_test.go
@@ -1,146 +0,0 @@
-package client
-
-import (
-	"testing"
-)
-
-func TestModelfileConfig(t *testing.T) {
-	// Test that ModelfileConfig struct works as expected
-	config := &ModelfileConfig{
-		Template: "{{ .Prompt }}",
-		System:   "You are a helpful assistant.",
-		License:  "MIT",
-	}
-
-	if config.Template != "{{ .Prompt }}" {
-		t.Errorf("Template = %q, want %q", config.Template, "{{ .Prompt }}")
-	}
-	if config.System != "You are a helpful assistant." {
-		t.Errorf("System = %q, want %q", config.System, "You are a helpful assistant.")
-	}
-	if config.License != "MIT" {
-		t.Errorf("License = %q, want %q", config.License, "MIT")
-	}
-}
-
-func TestModelfileConfig_Empty(t *testing.T) {
-	config := &ModelfileConfig{}
-
-	if config.Template != "" {
-		t.Errorf("Template should be empty, got %q", config.Template)
-	}
-	if config.System != "" {
-		t.Errorf("System should be empty, got %q", config.System)
-	}
-	if config.License != "" {
-		t.Errorf("License should be empty, got %q", config.License)
-	}
-}
-
-func TestModelfileConfig_PartialFields(t *testing.T) {
-	// Test config with only some fields set
-	config := &ModelfileConfig{
-		Template: "{{ .Prompt }}",
-		// System and License intentionally empty
-	}
-
-	if config.Template == "" {
-		t.Error("Template should not be empty")
-	}
-	if config.System != "" {
-		t.Error("System should be empty")
-	}
-	if config.License != "" {
-		t.Error("License should be empty")
-	}
-}
-
-func TestMinOllamaVersion(t *testing.T) {
-	// Verify the minimum version constant is set
-	if MinOllamaVersion == "" {
-		t.Error("MinOllamaVersion should not be empty")
-	}
-	if MinOllamaVersion != "0.14.0" {
-		t.Errorf("MinOllamaVersion = %q, want %q", MinOllamaVersion, "0.14.0")
-	}
-}
-
-func TestCreateModel_InvalidDir(t *testing.T) {
-	// Test that CreateModel returns error for invalid directory
-	err := CreateModel(CreateOptions{
-		ModelName: "test-model",
-		ModelDir:  "/nonexistent/path",
-	}, nil)
-	if err == nil {
-		t.Error("expected error for nonexistent directory, got nil")
-	}
-}
-
-func TestCreateModel_NotSafetensorsDir(t *testing.T) {
-	// Test that CreateModel returns error for directory without safetensors
-	dir := t.TempDir()
-
-	err := CreateModel(CreateOptions{
-		ModelName: "test-model",
-		ModelDir:  dir,
-	}, nil)
-	if err == nil {
-		t.Error("expected error for empty directory, got nil")
-	}
-}
-
-func TestCreateOptions(t *testing.T) {
-	opts := CreateOptions{
-		ModelName: "my-model",
-		ModelDir:  "/path/to/model",
-		Quantize:  "fp8",
-		Modelfile: &ModelfileConfig{
-			Template: "test",
-			System:   "system",
-			License:  "MIT",
-		},
-	}
-
-	if opts.ModelName != "my-model" {
-		t.Errorf("ModelName = %q, want %q", opts.ModelName, "my-model")
-	}
-	if opts.ModelDir != "/path/to/model" {
-		t.Errorf("ModelDir = %q, want %q", opts.ModelDir, "/path/to/model")
-	}
-	if opts.Quantize != "fp8" {
-		t.Errorf("Quantize = %q, want %q", opts.Quantize, "fp8")
-	}
-	if opts.Modelfile == nil {
-		t.Error("Modelfile should not be nil")
-	}
-	if opts.Modelfile.Template != "test" {
-		t.Errorf("Modelfile.Template = %q, want %q", opts.Modelfile.Template, "test")
-	}
-}
-
-func TestCreateOptions_Defaults(t *testing.T) {
-	opts := CreateOptions{
-		ModelName: "test",
-		ModelDir:  "/tmp",
-	}
-
-	// Quantize should default to empty
-	if opts.Quantize != "" {
-		t.Errorf("Quantize should be empty by default, got %q", opts.Quantize)
-	}
-
-	// Modelfile should default to nil
-	if opts.Modelfile != nil {
-		t.Error("Modelfile should be nil by default")
-	}
-}
-
-func TestQuantizeSupported(t *testing.T) {
-	// This just verifies the function exists and returns a boolean
-	// The actual value depends on build tags (mlx vs non-mlx)
-	supported := QuantizeSupported()
-
-	// In non-mlx builds, this should be false
-	// We can't easily test both cases, so just verify it returns something
-	_ = supported
-}
--- a/x/create/client/quantize.go
+++ b/x/create/client/quantize.go
@@ -1,130 +0,0 @@
-//go:build mlx
-
-package client
-
-import (
-	"fmt"
-	"io"
-	"os"
-	"path/filepath"
-
-	"github.com/ollama/ollama/x/imagegen/mlx"
-)
-
-// quantizeTensor loads a tensor from safetensors format, quantizes it,
-// and returns safetensors data for the quantized weights, scales, and biases.
-// Supported quantization types: "fp8" (affine 8-bit)
-// Uses MLX's native SaveSafetensors to ensure correct dtype handling (especially uint32 for quantized weights).
-func quantizeTensor(r io.Reader, name, dtype string, shape []int32, quantize string) (qweightData, scalesData, qbiasData []byte, qweightShape, scalesShape, qbiasShape []int32, err error) {
-	tmpDir := ensureTempDir()
-
-	// Read safetensors data to a temp file (LoadSafetensorsNative needs a path)
-	tmpFile, err := os.CreateTemp(tmpDir, "quant-input-*.safetensors")
-	if err != nil {
-		return nil, nil, nil, nil, nil, nil, fmt.Errorf("failed to create temp file: %w", err)
-	}
-	tmpPath := tmpFile.Name()
-	defer os.Remove(tmpPath)
-
-	if _, err := io.Copy(tmpFile, r); err != nil {
-		tmpFile.Close()
-		return nil, nil, nil, nil, nil, nil, fmt.Errorf("failed to write temp file: %w", err)
-	}
-	tmpFile.Close()
-
-	// Load the tensor using MLX's native loader
-	st, err := mlx.LoadSafetensorsNative(tmpPath)
-	if err != nil {
-		return nil, nil, nil, nil, nil, nil, fmt.Errorf("failed to load safetensors: %w", err)
-	}
-	defer st.Free()
-
-	// Get the tensor (it's stored as "data" in our minimal safetensors format)
-	arr := st.Get("data")
-	if arr == nil {
-		return nil, nil, nil, nil, nil, nil, fmt.Errorf("tensor 'data' not found in safetensors")
-	}
-
-	// Convert to BFloat16 if needed (quantize expects float type)
-	if arr.Dtype() != mlx.DtypeBFloat16 && arr.Dtype() != mlx.DtypeFloat32 && arr.Dtype() != mlx.DtypeFloat16 {
-		arr = mlx.AsType(arr, mlx.DtypeBFloat16)
-		mlx.Eval(arr)
-	}
-
-	// Quantize based on quantization type
-	var qweight, scales, qbiases *mlx.Array
-	switch quantize {
-	case "fp4":
-		// affine mode: group_size=32, bits=4
-		qweight, scales, qbiases = mlx.Quantize(arr, 32, 4, "affine")
-	case "fp8":
-		// affine mode: group_size=32, bits=8
-		qweight, scales, qbiases = mlx.Quantize(arr, 32, 8, "affine")
-	default:
-		return nil, nil, nil, nil, nil, nil, fmt.Errorf("unsupported quantization type: %s", quantize)
-	}
-
-	// Eval and make contiguous for data access
-	qweight = mlx.Contiguous(qweight)
-	scales = mlx.Contiguous(scales)
-	if qbiases != nil {
-		qbiases = mlx.Contiguous(qbiases)
-		mlx.Eval(qweight, scales, qbiases)
-	} else {
-		mlx.Eval(qweight, scales)
-	}
-
-	// Get shapes
-	qweightShape = qweight.Shape()
-	scalesShape = scales.Shape()
-
-	// Save quantized weight using MLX's native safetensors (correctly handles uint32 dtype)
-	qweightPath := filepath.Join(tmpDir, "qweight.safetensors")
-	defer os.Remove(qweightPath)
-	if err := mlx.SaveSafetensors(qweightPath, map[string]*mlx.Array{"data": qweight}); err != nil {
-		return nil, nil, nil, nil, nil, nil, fmt.Errorf("failed to save quantized weight: %w", err)
-	}
-	qweightData, err = os.ReadFile(qweightPath)
-	if err != nil {
-		return nil, nil, nil, nil, nil, nil, fmt.Errorf("failed to read quantized weight: %w", err)
-	}
-
-	// Save scales using MLX's native safetensors
-	scalesPath := filepath.Join(tmpDir, "scales.safetensors")
-	defer os.Remove(scalesPath)
-	if err := mlx.SaveSafetensors(scalesPath, map[string]*mlx.Array{"data": scales}); err != nil {
-		return nil, nil, nil, nil, nil, nil, fmt.Errorf("failed to save scales: %w", err)
-	}
-	scalesData, err = os.ReadFile(scalesPath)
-	if err != nil {
-		return nil, nil, nil, nil, nil, nil, fmt.Errorf("failed to read scales: %w", err)
-	}
-
-	// Affine mode returns qbiases for zero-point offset
-	if qbiases != nil {
-		qbiasShape = qbiases.Shape()
-		qbiasPath := filepath.Join(tmpDir, "qbias.safetensors")
-		defer os.Remove(qbiasPath)
-		if err := mlx.SaveSafetensors(qbiasPath, map[string]*mlx.Array{"data": qbiases}); err != nil {
-			return nil, nil, nil, nil, nil, nil, fmt.Errorf("failed to save qbiases: %w", err)
-		}
-		qbiasData, err = os.ReadFile(qbiasPath)
-		if err != nil {
-			return nil, nil, nil, nil, nil, nil, fmt.Errorf("failed to read qbiases: %w", err)
-		}
-	}
-
-	return qweightData, scalesData, qbiasData, qweightShape, scalesShape, qbiasShape, nil
-}
-
-// QuantizeSupported returns true if quantization is supported (MLX build)
-func QuantizeSupported() bool {
-	return true
-}
-
-// ensureTempDir creates the temp directory for quantization if it doesn't exist
-func ensureTempDir() string {
-	tmpDir := filepath.Join(os.TempDir(), "ollama-quantize")
-	os.MkdirAll(tmpDir, 0755)
-	return tmpDir
-}
--- a/x/create/client/quantize_stub.go
+++ b/x/create/client/quantize_stub.go
@@ -1,18 +0,0 @@
-//go:build !mlx
-
-package client
-
-import (
-	"fmt"
-	"io"
-)
-
-// quantizeTensor is not available without MLX
-func quantizeTensor(r io.Reader, name, dtype string, shape []int32, quantize string) (qweightData, scalesData, qbiasData []byte, qweightShape, scalesShape, qbiasShape []int32, err error) {
-	return nil, nil, nil, nil, nil, nil, fmt.Errorf("quantization requires MLX support (build with mlx tag)")
-}
-
-// QuantizeSupported returns false when MLX is not available
-func QuantizeSupported() bool {
-	return false
-}
--- a/Show More
+++ b/Show More