Compare commits


11 Commits

Author SHA1 Message Date
ParthSareen
0c2c2b8de9 fixes from rebase 2026-01-07 15:44:19 -08:00
ParthSareen
5e23c4f2f7 fix: agent loop message handling and cloud model inheritance
- Fix tool result messages losing ToolName and ToolCallID fields
- Include Thinking in intermediate assistant messages during tool loops
- Inherit capabilities, remote config, and model family from base model
  when creating agents (fixes "does not support generate" for cloud models)
- Add tests for tool message construction and message stitching

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-06 12:15:18 -08:00
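
For reference, the tool-result shape this commit restores is the one verified by cmd/agent_loop_test.go further down. A minimal sketch of that construction, consistent with the tests (the actual helper in cmd may differ):

  // toolMessage builds a tool-result message from a tool call,
  // preserving ToolName and ToolCallID so results can be matched
  // back to the calls that produced them.
  func toolMessage(call api.ToolCall, content string) api.Message {
      return api.Message{
          Role:       "tool",
          Content:    content,
          ToolName:   call.Function.Name,
          ToolCallID: call.ID, // stays empty when the call had no ID
      }
  }
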
ParthSareen
5c0caaff86 agents: add MCP server support and ENTRYPOINT command
MCP (Model Context Protocol) support:
- Add MCPRef type for agent MCP server references
- Parse MCP command in Agentfiles (MCP name command [args...])
- Load and manage MCP servers with mcpManager
- Implement agentic loop for multi-turn tool execution
- Add /mcp REPL commands (add, remove, disable, enable)
- Add 'ollama mcp' CLI commands for global config management
- Support both model-bundled and global (~/.ollama/mcp.json) MCPs
- Display MCPs in 'ollama show' output

ENTRYPOINT support:
- Add ENTRYPOINT command to Agentfiles for custom runtimes
- Allow agents without FROM when ENTRYPOINT is specified
- Execute entrypoint as subprocess with stdin/stdout connected
- Support $PROMPT placeholder for prompt insertion control
- Hide Model section in 'ollama show' for entrypoint-only agents
- Pass user prompt as argument to entrypoint command
2026-01-06 12:15:18 -08:00
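
For illustration, an Agentfile combining these directives might look like this (hypothetical agent; the base model and MCP server command are placeholders, SKILL syntax per the parser commit below):

  FROM llama3.2
  SKILL ./skills/calculator
  MCP websearch npx mcp-server-websearch

An entrypoint-only agent needs no FROM at all:

  ENTRYPOINT my-runtime --prompt "$PROMPT"
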
ParthSareen
e28ee8524d skills: add registry reference check and working directory env var
- Add check for registry references without digest in loadSkillsFromRefs
- Fix IsLocalSkillPath to not treat registry refs as local paths
- Inject OLLAMA_WORKING_DIR env var so skill scripts can access the
  directory where 'ollama run' was called from
2026-01-06 12:12:27 -08:00
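
A skill script can use the new variable to resolve paths relative to where 'ollama run' was invoked rather than where the script itself lives. A minimal Go sketch (only OLLAMA_WORKING_DIR itself is defined by this commit; the rest is illustrative):

  package main

  import (
      "fmt"
      "os"
      "path/filepath"
  )

  func main() {
      // Directory where 'ollama run' was called; empty when the
      // script is executed outside of ollama.
      dir := os.Getenv("OLLAMA_WORKING_DIR")
      if dir == "" {
          dir = "."
      }
      fmt.Println(filepath.Join(dir, "input.txt"))
  }
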
ParthSareen
623e539a09 docs: add skills documentation
Add comprehensive documentation for the skills feature:

- Quick start guide for creating skills
- SKILL.md structure and frontmatter
- Skill reference formats (local, library, user)
- CLI commands (push, pull, list, show, rm)
- Dynamic skills in interactive chat
- Storage layout
- Security considerations
- Future roadmap
2026-01-06 12:12:27 -08:00
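
As a sketch of the structure the docs describe, a SKILL.md might look like this (the exact frontmatter fields are an assumption; the loader demonstrably reads at least a skill name from it):

  ---
  name: calculator
  description: Evaluate arithmetic expressions with Python
  ---

  # Calculator

  Use scripts/calc.py to evaluate an expression, e.g.
  python scripts/calc.py "2+2".
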
ParthSareen
51911a5f6f cmd: add skill CLI and REPL commands
Add skill management commands and interactive REPL support:

CLI commands (cmd/skill_cmd.go):
  ollama skill push NAME PATH  - Push skill to registry
  ollama skill pull NAME       - Pull skill from registry
  ollama skill list            - List installed skills
  ollama skill show NAME       - Show skill details
  ollama skill rm NAME         - Remove a skill

Skill loading (cmd/skills.go):
  - Load skills from model manifests
  - Parse SKILL.md frontmatter for metadata
  - Inject skill instructions into system prompt
  - Provide run_skill_script tool for script execution

Interactive mode (cmd/interactive.go):
  /skills              - Show available skills
  /skill add PATH      - Add skill from local path
  /skill remove NAME   - Remove skill from session
  /skill list          - List session skills
2026-01-06 12:12:27 -08:00
ParthSareen
2c2354e980 parser: add SKILL command for Agentfiles
Add SKILL command to the Modelfile/Agentfile parser.

Supports both local paths and registry references:
  SKILL ./path/to/skill       # Local skill bundled with agent
  SKILL skill/calc:1.0.0      # Registry skill reference
  SKILL alice/skill/calc:1.0  # User skill from registry
2026-01-06 12:12:27 -08:00
ParthSareen
ce6b19d8be api,types: add skill types and configuration
Add skill-related types to the API and configuration:

- api/types.go: Skill reference types for API requests/responses
- types/model/config.go: Skill configuration in model config
- envconfig/config.go: Environment configuration for skills
2026-01-06 12:12:27 -08:00
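
Putting the new types together, creating an agent through the API might look like the following sketch (SkillRef/MCPRef field names are taken from the api/types.go diff below; Model/From and all values are illustrative):

  package main

  import "github.com/ollama/ollama/api"

  func newAgentRequest() *api.CreateRequest {
      return &api.CreateRequest{
          Model: "my-agent",
          From:  "llama3.2",
          Skills: []api.SkillRef{
              {Name: "./skills/calculator"}, // local path, no digest
          },
          MCPs: []api.MCPRef{
              {Name: "websearch", Command: "npx", Args: []string{"mcp-server-websearch"}},
          },
          AgentType: "conversational",
      }
  }
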
ParthSareen
1de00fada0 server: add skill layer support
Add support for skill layers in model manifests:

- server/skill.go: New file with skill extraction and packaging
  - GetSkillsPath: Returns path to extracted skills cache
  - ExtractSkillBlob: Extracts skill tar.gz to cache
  - CreateSkillLayer: Creates skill blob from directory
  - ParseSkillName/GetSkillManifestPath: Skill name handling

- server/images.go: Extract skill layers on pull
- server/create.go: Create skill layers from SKILL directives
- server/routes.go: Skill-related route handling

Skills are stored as gzipped tar archives with MediaType
"application/vnd.ollama.image.skill".
2026-01-06 12:12:27 -08:00
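
The packaging format is plain enough to sketch. This is not the actual CreateSkillLayer implementation, just a gzipped tar writer over a skill directory as described above:

  package main

  import (
      "archive/tar"
      "compress/gzip"
      "io"
      "os"
      "path/filepath"
  )

  // packSkill writes dir as a gzipped tar stream, the on-disk
  // format used for "application/vnd.ollama.image.skill" layers.
  func packSkill(dir string, w io.Writer) error {
      gz := gzip.NewWriter(w)
      defer gz.Close()
      tw := tar.NewWriter(gz)
      defer tw.Close()
      return filepath.Walk(dir, func(path string, fi os.FileInfo, err error) error {
          if err != nil || path == dir {
              return err
          }
          rel, err := filepath.Rel(dir, path)
          if err != nil {
              return err
          }
          hdr, err := tar.FileInfoHeader(fi, "")
          if err != nil {
              return err
          }
          hdr.Name = filepath.ToSlash(rel)
          if fi.IsDir() {
              hdr.Name += "/"
          }
          if err := tw.WriteHeader(hdr); err != nil {
              return err
          }
          if fi.IsDir() {
              return nil
          }
          f, err := os.Open(path)
          if err != nil {
              return err
          }
          defer f.Close()
          _, err = io.Copy(tw, f)
          return err
      })
  }
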
ParthSareen
7ecae75c4c server: add Kind field to ModelPath for 5-part naming
Updates ModelPath struct and parsing to support the Kind field,
enabling skills and agents to use the 5-part naming structure.

- ParseModelPath detects valid kinds (skill, agent)
- GetNamespaceRepository includes kind in path
- GetManifestPath returns correct 5-part filepath
- GetFullTagname/GetShortTagname include kind when present
2026-01-06 12:12:27 -08:00
ParthSareen
ad5c276cf6 types: add Kind field to model.Name for 5-part naming
Extends the model name structure from 4-part to 5-part:
  host/namespace/kind/model:tag

The Kind field is optional and supports:
- "skill" for skill packages
- "agent" for agent packages (future)
- empty for regular models

Parser detects valid kinds to distinguish between old format
(host/namespace/model) and new format (host/namespace/kind/model).
2026-01-06 12:12:27 -08:00
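
Concretely, with the default registry host the two formats look like this (names are illustrative):

  registry.ollama.ai/library/mistral:latest      host/namespace/model:tag        (4-part)
  registry.ollama.ai/library/skill/calc:1.0.0    host/namespace/kind/model:tag   (5-part)
  registry.ollama.ai/alice/skill/calc:1.0        user-owned skill
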
153 changed files with 4258 additions and 29002 deletions

View File

@@ -12,7 +12,7 @@ set(BUILD_SHARED_LIBS ON)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS ON) # Recent versions of MLX Requires gnu++17 extensions to compile properly
set(CMAKE_CXX_EXTENSIONS OFF)
set(GGML_BUILD ON)
set(GGML_SHARED ON)
@@ -147,48 +147,14 @@ if(CMAKE_HIP_COMPILER)
endif()
endif()
if(NOT APPLE)
find_package(Vulkan)
if(Vulkan_FOUND)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-vulkan)
install(TARGETS ggml-vulkan
RUNTIME_DEPENDENCIES
PRE_INCLUDE_REGEXES vulkan
PRE_EXCLUDE_REGEXES ".*"
RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT Vulkan
LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT Vulkan
)
endif()
endif()
option(MLX_ENGINE "Enable MLX backend" OFF)
if(MLX_ENGINE)
message(STATUS "Setting up MLX (this takes a while...)")
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/x/ml/backend/mlx)
# Find CUDA toolkit if MLX is built with CUDA support
find_package(CUDAToolkit)
install(TARGETS mlx mlxc
find_package(Vulkan)
if(Vulkan_FOUND)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-vulkan)
install(TARGETS ggml-vulkan
RUNTIME_DEPENDENCIES
DIRECTORIES ${CUDAToolkit_BIN_DIR} ${CUDAToolkit_BIN_DIR}/x64 ${CUDAToolkit_LIBRARY_DIR}
PRE_INCLUDE_REGEXES cublas cublasLt cudart nvrtc cudnn nccl
PRE_INCLUDE_REGEXES vulkan
PRE_EXCLUDE_REGEXES ".*"
RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT MLX
LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT MLX
FRAMEWORK DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT MLX
RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT Vulkan
LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT Vulkan
)
# Manually install cudart and cublas since they might not be picked up as direct dependencies
if(CUDAToolkit_FOUND)
file(GLOB CUDART_LIBS
"${CUDAToolkit_LIBRARY_DIR}/libcudart.so*"
"${CUDAToolkit_LIBRARY_DIR}/libcublas.so*")
if(CUDART_LIBS)
install(FILES ${CUDART_LIBS}
DESTINATION ${OLLAMA_INSTALL_DIR}
COMPONENT MLX)
endif()
endif()
endif()
endif()

View File

@@ -41,7 +41,7 @@
"inherits": [ "CUDA" ],
"cacheVariables": {
"CMAKE_CUDA_ARCHITECTURES": "75-virtual;80-virtual;86-virtual;87-virtual;89-virtual;90-virtual;90a-virtual;100-virtual;103-virtual;110-virtual;120-virtual;121-virtual",
"CMAKE_CUDA_FLAGS": "-t 4",
"CMAKE_CUDA_FLAGS": "-t 2",
"OLLAMA_RUNNER_DIR": "cuda_v13"
}
},
@@ -83,28 +83,6 @@
"cacheVariables": {
"OLLAMA_RUNNER_DIR": "vulkan"
}
},
{
"name": "MLX",
"inherits": [ "Default" ],
"cacheVariables": {
"MLX_ENGINE": "ON",
"OLLAMA_RUNNER_DIR": "mlx"
}
},
{
"name": "MLX CUDA 12",
"inherits": [ "MLX", "CUDA 12" ],
"cacheVariables": {
"OLLAMA_RUNNER_DIR": "mlx_cuda_v12"
}
},
{
"name": "MLX CUDA 13",
"inherits": [ "MLX", "CUDA 13" ],
"cacheVariables": {
"OLLAMA_RUNNER_DIR": "mlx_cuda_v13"
}
}
],
"buildPresets": [
@@ -162,21 +140,6 @@
"name": "Vulkan",
"targets": [ "ggml-vulkan" ],
"configurePreset": "Vulkan"
},
{
"name": "MLX",
"targets": [ "mlx", "mlxc" ],
"configurePreset": "MLX"
},
{
"name": "MLX CUDA 12",
"targets": [ "mlx", "mlxc" ],
"configurePreset": "MLX CUDA 12"
},
{
"name": "MLX CUDA 13",
"targets": [ "mlx", "mlxc" ],
"configurePreset": "MLX CUDA 13"
}
]
}

View File

@@ -131,40 +131,7 @@ COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
RUN --mount=type=cache,target=/root/.ccache \
cmake --preset 'Vulkan' \
&& cmake --build --parallel --preset 'Vulkan' \
&& cmake --install build --component Vulkan --strip --parallel 8
FROM base AS mlx
ARG CUDA13VERSION=13.0
RUN dnf install -y cuda-toolkit-${CUDA13VERSION//./-} \
&& dnf install -y openblas-devel lapack-devel \
&& dnf install -y libcudnn9-cuda-13 libcudnn9-devel-cuda-13 \
&& dnf install -y libnccl libnccl-devel
ENV PATH=/usr/local/cuda-13/bin:$PATH
ENV BLAS_INCLUDE_DIRS=/usr/include/openblas
ENV LAPACK_INCLUDE_DIRS=/usr/include/openblas
ENV CGO_LDFLAGS="-L/usr/local/cuda-13/lib64 -L/usr/local/cuda-13/targets/x86_64-linux/lib/stubs"
ARG PARALLEL
WORKDIR /go/src/github.com/ollama/ollama
COPY CMakeLists.txt CMakePresets.json .
COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
COPY x/ml/backend/mlx x/ml/backend/mlx
COPY go.mod go.sum .
RUN curl -fsSL https://golang.org/dl/go$(awk '/^go/ { print $2 }' go.mod).linux-$(case $(uname -m) in x86_64) echo amd64 ;; aarch64) echo arm64 ;; esac).tar.gz | tar xz -C /usr/local
ENV PATH=/usr/local/go/bin:$PATH
RUN go mod download
RUN --mount=type=cache,target=/root/.ccache \
cmake --preset 'MLX CUDA 13' -DBLAS_INCLUDE_DIRS=/usr/include/openblas -DLAPACK_INCLUDE_DIRS=/usr/include/openblas \
&& cmake --build --parallel ${PARALLEL} --preset 'MLX CUDA 13' \
&& cmake --install build --component MLX --strip --parallel ${PARALLEL}
COPY . .
ARG GOFLAGS="'-ldflags=-w -s'"
ENV CGO_ENABLED=1
ARG CGO_CFLAGS
ARG CGO_CXXFLAGS
# TODO wire up the actual MLX engine here instead of building the main binary...
RUN mkdir -p dist/bin
RUN go build -tags mlx -trimpath -buildmode=pie -o dist/bin/ollama-mlx-engine .
RUN go build -trimpath -buildmode=pie -o dist/bin/imagegen ./x/imagegen/cmd/engine
&& cmake --install build --component Vulkan --strip --parallel 8
FROM base AS build
@@ -186,8 +153,6 @@ FROM --platform=linux/amd64 scratch AS amd64
COPY --from=cuda-12 dist/lib/ollama /lib/ollama/
COPY --from=cuda-13 dist/lib/ollama /lib/ollama/
COPY --from=vulkan dist/lib/ollama /lib/ollama/
COPY --from=mlx /go/src/github.com/ollama/ollama/dist/lib/ollama /lib/ollama/
COPY --from=mlx /go/src/github.com/ollama/ollama/dist/bin/ /bin/
FROM --platform=linux/arm64 scratch AS arm64
# COPY --from=cuda-11 dist/lib/ollama/ /lib/ollama/

View File

@@ -19,6 +19,12 @@ import (
"github.com/ollama/ollama/types/model"
)
// SkillRef is an alias for model.SkillRef representing a skill reference.
type SkillRef = model.SkillRef
// MCPRef is an alias for model.MCPRef representing an MCP server reference.
type MCPRef = model.MCPRef
// StatusError is an error with an HTTP status code and message.
type StatusError struct {
StatusCode int
@@ -690,6 +696,18 @@ type CreateRequest struct {
// Requires is the minimum version of Ollama required by the model.
Requires string `json:"requires,omitempty"`
// Skills is a list of skill references for the agent (local paths or registry refs)
Skills []SkillRef `json:"skills,omitempty"`
// MCPs is a list of MCP server references for the agent
MCPs []MCPRef `json:"mcps,omitempty"`
// AgentType defines the type of agent (e.g., "conversational", "task-based")
AgentType string `json:"agent_type,omitempty"`
// Entrypoint specifies an external command to run instead of the built-in chat loop
Entrypoint string `json:"entrypoint,omitempty"`
// Info is a map of additional information for the model
Info map[string]any `json:"info,omitempty"`
@@ -741,6 +759,10 @@ type ShowResponse struct {
Capabilities []model.Capability `json:"capabilities,omitempty"`
ModifiedAt time.Time `json:"modified_at,omitempty"`
Requires string `json:"requires,omitempty"`
Skills []SkillRef `json:"skills,omitempty"`
MCPs []MCPRef `json:"mcps,omitempty"`
AgentType string `json:"agent_type,omitempty"`
Entrypoint string `json:"entrypoint,omitempty"`
}
// CopyRequest is the request passed to [Client.Copy].

cmd/agent_loop_test.go (new file, 402 lines)

@@ -0,0 +1,402 @@
package cmd
import (
"testing"
"github.com/google/go-cmp/cmp"
"github.com/ollama/ollama/api"
)
// TestToolMessage verifies that tool messages are constructed correctly
// with ToolName and ToolCallID preserved from the tool call.
func TestToolMessage(t *testing.T) {
tests := []struct {
name string
call api.ToolCall
content string
expected api.Message
}{
{
name: "basic tool message with ID",
call: api.ToolCall{
ID: "call_abc123",
Function: api.ToolCallFunction{
Name: "get_weather",
Arguments: api.ToolCallFunctionArguments{
"location": "Paris",
},
},
},
content: "Sunny, 22°C",
expected: api.Message{
Role: "tool",
Content: "Sunny, 22°C",
ToolName: "get_weather",
ToolCallID: "call_abc123",
},
},
{
name: "tool message without ID",
call: api.ToolCall{
Function: api.ToolCallFunction{
Name: "calculate",
Arguments: api.ToolCallFunctionArguments{
"expression": "2+2",
},
},
},
content: "4",
expected: api.Message{
Role: "tool",
Content: "4",
ToolName: "calculate",
// ToolCallID should be empty when call.ID is empty
},
},
{
name: "MCP tool message",
call: api.ToolCall{
ID: "call_mcp123",
Function: api.ToolCallFunction{
Name: "mcp_websearch_search",
Arguments: api.ToolCallFunctionArguments{
"query": "ollama agents",
},
},
},
content: "Found 10 results",
expected: api.Message{
Role: "tool",
Content: "Found 10 results",
ToolName: "mcp_websearch_search",
ToolCallID: "call_mcp123",
},
},
{
name: "skill tool message",
call: api.ToolCall{
ID: "call_skill456",
Function: api.ToolCallFunction{
Name: "run_skill_script",
Arguments: api.ToolCallFunctionArguments{
"skill": "calculator",
"command": "python scripts/calc.py 2+2",
},
},
},
content: "Result: 4",
expected: api.Message{
Role: "tool",
Content: "Result: 4",
ToolName: "run_skill_script",
ToolCallID: "call_skill456",
},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := toolMessage(tt.call, tt.content)
if diff := cmp.Diff(tt.expected, result); diff != "" {
t.Errorf("toolMessage() mismatch (-want +got):\n%s", diff)
}
})
}
}
// TestAssistantMessageWithThinking verifies that assistant messages
// in the tool loop should include thinking content.
func TestAssistantMessageConstruction(t *testing.T) {
tests := []struct {
name string
content string
thinking string
toolCalls []api.ToolCall
expectedMsg api.Message
}{
{
name: "assistant with thinking and tool calls",
content: "",
thinking: "I need to check the weather for Paris.",
toolCalls: []api.ToolCall{
{
ID: "call_1",
Function: api.ToolCallFunction{
Name: "get_weather",
Arguments: api.ToolCallFunctionArguments{"city": "Paris"},
},
},
},
expectedMsg: api.Message{
Role: "assistant",
Content: "",
Thinking: "I need to check the weather for Paris.",
ToolCalls: []api.ToolCall{
{
ID: "call_1",
Function: api.ToolCallFunction{
Name: "get_weather",
Arguments: api.ToolCallFunctionArguments{"city": "Paris"},
},
},
},
},
},
{
name: "assistant with content, thinking, and tool calls",
content: "Let me check that for you.",
thinking: "User wants weather info.",
toolCalls: []api.ToolCall{
{
ID: "call_2",
Function: api.ToolCallFunction{
Name: "search",
Arguments: api.ToolCallFunctionArguments{"query": "weather"},
},
},
},
expectedMsg: api.Message{
Role: "assistant",
Content: "Let me check that for you.",
Thinking: "User wants weather info.",
ToolCalls: []api.ToolCall{
{
ID: "call_2",
Function: api.ToolCallFunction{
Name: "search",
Arguments: api.ToolCallFunctionArguments{"query": "weather"},
},
},
},
},
},
{
name: "assistant with multiple tool calls",
content: "",
thinking: "I'll check both cities.",
toolCalls: []api.ToolCall{
{
ID: "call_a",
Function: api.ToolCallFunction{
Name: "get_weather",
Arguments: api.ToolCallFunctionArguments{"city": "Paris"},
},
},
{
ID: "call_b",
Function: api.ToolCallFunction{
Name: "get_weather",
Arguments: api.ToolCallFunctionArguments{"city": "London"},
},
},
},
expectedMsg: api.Message{
Role: "assistant",
Content: "",
Thinking: "I'll check both cities.",
ToolCalls: []api.ToolCall{
{
ID: "call_a",
Function: api.ToolCallFunction{
Name: "get_weather",
Arguments: api.ToolCallFunctionArguments{"city": "Paris"},
},
},
{
ID: "call_b",
Function: api.ToolCallFunction{
Name: "get_weather",
Arguments: api.ToolCallFunctionArguments{"city": "London"},
},
},
},
},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
// Simulate the assistant message construction as done in chat()
assistantMsg := api.Message{
Role: "assistant",
Content: tt.content,
Thinking: tt.thinking,
ToolCalls: tt.toolCalls,
}
if diff := cmp.Diff(tt.expectedMsg, assistantMsg); diff != "" {
t.Errorf("assistant message mismatch (-want +got):\n%s", diff)
}
})
}
}
// TestMessageStitchingOrder verifies that messages in a tool loop
// are stitched in the correct order:
// 1. User message
// 2. Assistant message with tool calls (and thinking)
// 3. Tool result messages (one per tool call, in order)
// 4. Next assistant response
func TestMessageStitchingOrder(t *testing.T) {
// Simulate a complete tool loop conversation
messages := []api.Message{
// Initial user message
{Role: "user", Content: "What's the weather in Paris and London?"},
// Assistant's first response with tool calls
{
Role: "assistant",
Content: "",
Thinking: "I need to check the weather for both cities.",
ToolCalls: []api.ToolCall{
{ID: "call_1", Function: api.ToolCallFunction{Name: "get_weather", Arguments: api.ToolCallFunctionArguments{"city": "Paris"}}},
{ID: "call_2", Function: api.ToolCallFunction{Name: "get_weather", Arguments: api.ToolCallFunctionArguments{"city": "London"}}},
},
},
// Tool results (in order matching tool calls)
{Role: "tool", Content: "Sunny, 22°C", ToolName: "get_weather", ToolCallID: "call_1"},
{Role: "tool", Content: "Rainy, 15°C", ToolName: "get_weather", ToolCallID: "call_2"},
// Final assistant response
{Role: "assistant", Content: "Paris is sunny at 22°C, and London is rainy at 15°C.", Thinking: "Got the data, now summarizing."},
}
// Verify structure
expectedRoles := []string{"user", "assistant", "tool", "tool", "assistant"}
for i, msg := range messages {
if msg.Role != expectedRoles[i] {
t.Errorf("message %d: expected role %q, got %q", i, expectedRoles[i], msg.Role)
}
}
// Verify tool results match tool calls in order
assistantWithTools := messages[1]
toolResults := []api.Message{messages[2], messages[3]}
if len(toolResults) != len(assistantWithTools.ToolCalls) {
t.Errorf("expected %d tool results for %d tool calls", len(assistantWithTools.ToolCalls), len(toolResults))
}
for i, result := range toolResults {
expectedToolCallID := assistantWithTools.ToolCalls[i].ID
if result.ToolCallID != expectedToolCallID {
t.Errorf("tool result %d: expected ToolCallID %q, got %q", i, expectedToolCallID, result.ToolCallID)
}
expectedToolName := assistantWithTools.ToolCalls[i].Function.Name
if result.ToolName != expectedToolName {
t.Errorf("tool result %d: expected ToolName %q, got %q", i, expectedToolName, result.ToolName)
}
}
// Verify thinking is present in assistant messages
if messages[1].Thinking == "" {
t.Error("first assistant message should have thinking content")
}
if messages[4].Thinking == "" {
t.Error("final assistant message should have thinking content")
}
}
// TestMultiTurnToolLoop verifies message stitching across multiple
// tool call iterations.
func TestMultiTurnToolLoop(t *testing.T) {
messages := []api.Message{
{Role: "user", Content: "What's 2+2 and also what's the weather in Paris?"},
// First tool call: calculate
{
Role: "assistant",
Thinking: "I'll start with the calculation.",
ToolCalls: []api.ToolCall{
{ID: "calc_1", Function: api.ToolCallFunction{Name: "calculate", Arguments: api.ToolCallFunctionArguments{"expr": "2+2"}}},
},
},
{Role: "tool", Content: "4", ToolName: "calculate", ToolCallID: "calc_1"},
// Second tool call: weather
{
Role: "assistant",
Thinking: "Got the calculation. Now checking weather.",
ToolCalls: []api.ToolCall{
{ID: "weather_1", Function: api.ToolCallFunction{Name: "get_weather", Arguments: api.ToolCallFunctionArguments{"city": "Paris"}}},
},
},
{Role: "tool", Content: "Sunny, 20°C", ToolName: "get_weather", ToolCallID: "weather_1"},
// Final response
{Role: "assistant", Content: "2+2 equals 4, and Paris is sunny at 20°C."},
}
// Count message types
roleCounts := map[string]int{}
for _, msg := range messages {
roleCounts[msg.Role]++
}
if roleCounts["user"] != 1 {
t.Errorf("expected 1 user message, got %d", roleCounts["user"])
}
if roleCounts["assistant"] != 3 {
t.Errorf("expected 3 assistant messages, got %d", roleCounts["assistant"])
}
if roleCounts["tool"] != 2 {
t.Errorf("expected 2 tool messages, got %d", roleCounts["tool"])
}
// Verify each tool message follows an assistant with matching tool call
for i, msg := range messages {
if msg.Role == "tool" {
// Find preceding assistant message with tool calls
var precedingAssistant *api.Message
for j := i - 1; j >= 0; j-- {
if messages[j].Role == "assistant" && len(messages[j].ToolCalls) > 0 {
precedingAssistant = &messages[j]
break
}
}
if precedingAssistant == nil {
t.Errorf("tool message at index %d has no preceding assistant with tool calls", i)
continue
}
// Verify tool result matches one of the tool calls
found := false
for _, tc := range precedingAssistant.ToolCalls {
if tc.ID == msg.ToolCallID {
found = true
break
}
}
if !found {
t.Errorf("tool message at index %d has ToolCallID %q not found in preceding tool calls", i, msg.ToolCallID)
}
}
}
}
// TestSkillCatalogRunToolCallPreservesFields tests that skill catalog
// returns tool messages with correct fields.
func TestSkillCatalogToolMessageFields(t *testing.T) {
// Create a minimal test for toolMessage function
call := api.ToolCall{
ID: "test_id_123",
Function: api.ToolCallFunction{
Name: "run_skill_script",
Arguments: api.ToolCallFunctionArguments{
"skill": "test-skill",
"command": "echo hello",
},
},
}
msg := toolMessage(call, "hello")
if msg.Role != "tool" {
t.Errorf("expected role 'tool', got %q", msg.Role)
}
if msg.Content != "hello" {
t.Errorf("expected content 'hello', got %q", msg.Content)
}
if msg.ToolName != "run_skill_script" {
t.Errorf("expected ToolName 'run_skill_script', got %q", msg.ToolName)
}
if msg.ToolCallID != "test_id_123" {
t.Errorf("expected ToolCallID 'test_id_123', got %q", msg.ToolCallID)
}
}

View File

@@ -15,6 +15,7 @@ import (
"net"
"net/http"
"os"
"os/exec"
"os/signal"
"path/filepath"
"runtime"
@@ -495,6 +496,16 @@ func RunHandler(cmd *cobra.Command, args []string) error {
opts.ParentModel = info.Details.ParentModel
// Check if this is an agent
isAgent := info.AgentType != "" || len(info.Skills) > 0 || len(info.MCPs) > 0 || info.Entrypoint != ""
if isAgent {
opts.IsAgent = true
opts.AgentType = info.AgentType
opts.Skills = info.Skills
opts.MCPs = info.MCPs
opts.Entrypoint = info.Entrypoint
}
// Check if this is an embedding model
isEmbeddingModel := slices.Contains(info.Capabilities, model.CapabilityEmbedding)
@@ -520,7 +531,10 @@ func RunHandler(cmd *cobra.Command, args []string) error {
// Check for experimental flag
isExperimental, _ := cmd.Flags().GetBool("experimental")
yoloMode, _ := cmd.Flags().GetBool("yolo")
// If agent has entrypoint, run it instead of chat loop
if opts.Entrypoint != "" {
return runEntrypoint(cmd, opts)
}
if interactive {
if err := loadOrUnloadModel(cmd, &opts); err != nil {
@@ -548,16 +562,69 @@ func RunHandler(cmd *cobra.Command, args []string) error {
}
}
// Use experimental agent loop with tools
// Use experimental agent loop
if isExperimental {
return xcmd.GenerateInteractive(cmd, opts.Model, opts.WordWrap, opts.Options, opts.Think, opts.HideThinking, opts.KeepAlive, yoloMode)
return xcmd.GenerateInteractive(cmd, opts.Model, opts.WordWrap, opts.Options, opts.Think, opts.HideThinking, opts.KeepAlive)
}
return generateInteractive(cmd, opts)
}
// For agents, use chat API even in non-interactive mode to support tools
if opts.IsAgent {
opts.Messages = append(opts.Messages, api.Message{Role: "user", Content: opts.Prompt})
_, err := chat(cmd, opts)
return err
}
return generate(cmd, opts)
}
// runEntrypoint executes the agent's entrypoint command instead of the built-in chat loop.
func runEntrypoint(cmd *cobra.Command, opts runOptions) error {
entrypoint := opts.Entrypoint
// Check if entrypoint contains $PROMPT placeholder
hasPlaceholder := strings.Contains(entrypoint, "$PROMPT")
if hasPlaceholder && opts.Prompt != "" {
// Replace $PROMPT with the actual prompt
entrypoint = strings.ReplaceAll(entrypoint, "$PROMPT", opts.Prompt)
} else if hasPlaceholder {
// No prompt provided but placeholder exists - remove placeholder
entrypoint = strings.ReplaceAll(entrypoint, "$PROMPT", "")
}
// Parse entrypoint into command and args
parts := strings.Fields(entrypoint)
if len(parts) == 0 {
return fmt.Errorf("empty entrypoint")
}
command := parts[0]
args := parts[1:]
// If user provided a prompt and no placeholder was used, append it as argument
if opts.Prompt != "" && !hasPlaceholder {
args = append(args, opts.Prompt)
}
// Look up command in PATH
execPath, err := exec.LookPath(command)
if err != nil {
return fmt.Errorf("entrypoint command not found: %s", command)
}
// Create subprocess
proc := exec.Command(execPath, args...)
proc.Stdin = os.Stdin
proc.Stdout = os.Stdout
proc.Stderr = os.Stderr
// Run and wait
return proc.Run()
}
func SigninHandler(cmd *cobra.Command, args []string) error {
client, err := api.ClientFromEnvironment()
if err != nil {
@@ -917,47 +984,96 @@ func showInfo(resp *api.ShowResponse, verbose bool, w io.Writer) error {
fmt.Fprintln(w)
}
tableRender("Model", func() (rows [][]string) {
if resp.RemoteHost != "" {
rows = append(rows, []string{"", "Remote model", resp.RemoteModel})
rows = append(rows, []string{"", "Remote URL", resp.RemoteHost})
}
if resp.ModelInfo != nil {
arch := resp.ModelInfo["general.architecture"].(string)
rows = append(rows, []string{"", "architecture", arch})
var paramStr string
if resp.Details.ParameterSize != "" {
paramStr = resp.Details.ParameterSize
} else if v, ok := resp.ModelInfo["general.parameter_count"]; ok {
if f, ok := v.(float64); ok {
paramStr = format.HumanNumber(uint64(f))
}
}
rows = append(rows, []string{"", "parameters", paramStr})
if v, ok := resp.ModelInfo[fmt.Sprintf("%s.context_length", arch)]; ok {
if f, ok := v.(float64); ok {
rows = append(rows, []string{"", "context length", strconv.FormatFloat(f, 'f', -1, 64)})
}
// Only show Model section if there's actual model info (not for entrypoint-only agents)
hasModelInfo := resp.RemoteHost != "" || resp.ModelInfo != nil || resp.Details.Family != "" || resp.Details.ParameterSize != "" || resp.Details.QuantizationLevel != ""
if hasModelInfo {
tableRender("Model", func() (rows [][]string) {
if resp.RemoteHost != "" {
rows = append(rows, []string{"", "Remote model", resp.RemoteModel})
rows = append(rows, []string{"", "Remote URL", resp.RemoteHost})
}
if v, ok := resp.ModelInfo[fmt.Sprintf("%s.embedding_length", arch)]; ok {
if f, ok := v.(float64); ok {
rows = append(rows, []string{"", "embedding length", strconv.FormatFloat(f, 'f', -1, 64)})
if resp.ModelInfo != nil {
arch := resp.ModelInfo["general.architecture"].(string)
rows = append(rows, []string{"", "architecture", arch})
var paramStr string
if resp.Details.ParameterSize != "" {
paramStr = resp.Details.ParameterSize
} else if v, ok := resp.ModelInfo["general.parameter_count"]; ok {
if f, ok := v.(float64); ok {
paramStr = format.HumanNumber(uint64(f))
}
}
rows = append(rows, []string{"", "parameters", paramStr})
if v, ok := resp.ModelInfo[fmt.Sprintf("%s.context_length", arch)]; ok {
if f, ok := v.(float64); ok {
rows = append(rows, []string{"", "context length", strconv.FormatFloat(f, 'f', -1, 64)})
}
}
if v, ok := resp.ModelInfo[fmt.Sprintf("%s.embedding_length", arch)]; ok {
if f, ok := v.(float64); ok {
rows = append(rows, []string{"", "embedding length", strconv.FormatFloat(f, 'f', -1, 64)})
}
}
} else {
rows = append(rows, []string{"", "architecture", resp.Details.Family})
rows = append(rows, []string{"", "parameters", resp.Details.ParameterSize})
}
rows = append(rows, []string{"", "quantization", resp.Details.QuantizationLevel})
if resp.Requires != "" {
rows = append(rows, []string{"", "requires", resp.Requires})
}
return
})
}
// Display agent information if this is an agent
if resp.AgentType != "" || len(resp.Skills) > 0 || len(resp.MCPs) > 0 || resp.Entrypoint != "" {
tableRender("Agent", func() (rows [][]string) {
if resp.AgentType != "" {
rows = append(rows, []string{"", "type", resp.AgentType})
}
if resp.Entrypoint != "" {
rows = append(rows, []string{"", "entrypoint", resp.Entrypoint})
}
if len(resp.Skills) > 0 {
for i, skill := range resp.Skills {
label := "skill"
if i > 0 {
label = ""
}
// Show skill name or digest
skillDisplay := skill.Name
if skillDisplay == "" && skill.Digest != "" {
skillDisplay = skill.Digest[:12] + "..."
}
rows = append(rows, []string{"", label, skillDisplay})
}
}
} else {
rows = append(rows, []string{"", "architecture", resp.Details.Family})
rows = append(rows, []string{"", "parameters", resp.Details.ParameterSize})
}
rows = append(rows, []string{"", "quantization", resp.Details.QuantizationLevel})
if resp.Requires != "" {
rows = append(rows, []string{"", "requires", resp.Requires})
}
return
})
if len(resp.MCPs) > 0 {
for i, mcp := range resp.MCPs {
label := "mcp"
if i > 0 {
label = ""
}
// Show MCP name and command
mcpDisplay := mcp.Name
if mcp.Command != "" {
cmdLine := mcp.Command
if len(mcp.Args) > 0 {
cmdLine += " " + strings.Join(mcp.Args, " ")
}
mcpDisplay += " (" + cmdLine + ")"
}
rows = append(rows, []string{"", label, mcpDisplay})
}
}
return
})
}
if len(resp.Capabilities) > 0 {
tableRender("Capabilities", func() (rows [][]string) {
@@ -1199,6 +1315,11 @@ type runOptions struct {
Think *api.ThinkValue
HideThinking bool
ShowConnect bool
IsAgent bool
AgentType string
Skills []api.SkillRef
MCPs []api.MCPRef
Entrypoint string
}
func (r runOptions) Copy() runOptions {
@@ -1228,6 +1349,12 @@ func (r runOptions) Copy() runOptions {
think = &cThink
}
var skills []api.SkillRef
if r.Skills != nil {
skills = make([]api.SkillRef, len(r.Skills))
copy(skills, r.Skills)
}
return runOptions{
Model: r.Model,
ParentModel: r.ParentModel,
@@ -1243,6 +1370,9 @@ func (r runOptions) Copy() runOptions {
Think: think,
HideThinking: r.HideThinking,
ShowConnect: r.ShowConnect,
IsAgent: r.IsAgent,
AgentType: r.AgentType,
Skills: skills,
}
}
@@ -1326,6 +1456,65 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
return nil, err
}
// Load skills for agents
var skillsCatalog *skillCatalog
if opts.IsAgent && len(opts.Skills) > 0 {
skillsCatalog, err = loadSkillsFromRefs(opts.Skills)
if err != nil {
return nil, fmt.Errorf("failed to load skills: %w", err)
}
if skillsCatalog != nil && len(skillsCatalog.Skills) > 0 {
var skillNames []string
for _, s := range skillsCatalog.Skills {
skillNames = append(skillNames, s.Name)
}
fmt.Fprintf(os.Stderr, "Loaded skills: %s\n", strings.Join(skillNames, ", "))
}
}
// Load MCP servers for agents (from opts and global config)
var mcpMgr *mcpManager
allMCPs := opts.MCPs
// Load global MCPs from ~/.ollama/mcp.json
if globalConfig, err := loadMCPConfig(); err == nil && len(globalConfig.MCPServers) > 0 {
for name, srv := range globalConfig.MCPServers {
// Skip disabled MCPs
if srv.Disabled {
continue
}
// Check if already in opts.MCPs (model takes precedence)
found := false
for _, m := range opts.MCPs {
if m.Name == name {
found = true
break
}
}
if !found {
allMCPs = append(allMCPs, api.MCPRef{
Name: name,
Command: srv.Command,
Args: srv.Args,
Env: srv.Env,
Type: srv.Type,
})
}
}
}
if len(allMCPs) > 0 {
mcpMgr = newMCPManager()
if err := mcpMgr.loadMCPsFromRefs(allMCPs); err != nil {
return nil, fmt.Errorf("failed to load MCP servers: %w", err)
}
if mcpMgr.ToolCount() > 0 {
fmt.Fprintf(os.Stderr, "Loaded MCP servers: %s (%d tools)\n",
strings.Join(mcpMgr.ServerNames(), ", "), mcpMgr.ToolCount())
}
defer mcpMgr.Shutdown()
}
p := progress.NewProgress(os.Stderr)
defer p.StopAndClear()
@@ -1349,6 +1538,7 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
var fullResponse strings.Builder
var thinkTagOpened bool = false
var thinkTagClosed bool = false
var pendingToolCalls []api.ToolCall
role := "assistant"
@@ -1389,7 +1579,13 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
if response.Message.ToolCalls != nil {
toolCalls := response.Message.ToolCalls
if len(toolCalls) > 0 {
fmt.Print(renderToolCalls(toolCalls, false))
if skillsCatalog != nil || mcpMgr != nil {
// Store tool calls for execution after response is complete
pendingToolCalls = append(pendingToolCalls, toolCalls...)
} else {
// No skills catalog or MCP, just display tool calls
fmt.Print(renderToolCalls(toolCalls, false))
}
}
}
@@ -1402,31 +1598,161 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
opts.Format = `"` + opts.Format + `"`
}
req := &api.ChatRequest{
Model: opts.Model,
Messages: opts.Messages,
Format: json.RawMessage(opts.Format),
Options: opts.Options,
Think: opts.Think,
// Prepare messages with agent-specific system prompt
messages := opts.Messages
if skillsCatalog != nil {
// Add skills system prompt as the first system message
skillsPrompt := skillsCatalog.SystemPrompt()
if skillsPrompt != "" {
// Insert skills prompt at the beginning, or append to existing system message
if len(messages) > 0 && messages[0].Role == "system" {
// Append to existing system message
messages[0].Content = messages[0].Content + "\n\n" + skillsPrompt
} else {
// Insert new system message at the beginning
systemMsg := api.Message{Role: "system", Content: skillsPrompt}
messages = append([]api.Message{systemMsg}, messages...)
}
}
}
if opts.KeepAlive != nil {
req.KeepAlive = opts.KeepAlive
}
if err := client.Chat(cancelCtx, req, fn); err != nil {
if errors.Is(err, context.Canceled) {
return nil, nil
// Agentic loop: continue until no more tool calls
for {
req := &api.ChatRequest{
Model: opts.Model,
Messages: messages,
Format: json.RawMessage(opts.Format),
Options: opts.Options,
Think: opts.Think,
}
// this error should ideally be wrapped properly by the client
if strings.Contains(err.Error(), "upstream error") {
p.StopAndClear()
fmt.Println("An error occurred while processing your message. Please try again.")
fmt.Println()
return nil, nil
// Add tools for agents (combine skills and MCP tools)
var allTools api.Tools
if skillsCatalog != nil {
allTools = append(allTools, skillsCatalog.Tools()...)
}
return nil, err
if mcpMgr != nil {
allTools = append(allTools, mcpMgr.Tools()...)
}
if len(allTools) > 0 {
req.Tools = allTools
}
if opts.KeepAlive != nil {
req.KeepAlive = opts.KeepAlive
}
if err := client.Chat(cancelCtx, req, fn); err != nil {
if errors.Is(err, context.Canceled) {
return nil, nil
}
// this error should ideally be wrapped properly by the client
if strings.Contains(err.Error(), "upstream error") {
p.StopAndClear()
fmt.Println("An error occurred while processing your message. Please try again.")
fmt.Println()
return nil, nil
}
return nil, err
}
// If no tool calls, we're done
if len(pendingToolCalls) == 0 || (skillsCatalog == nil && mcpMgr == nil) {
break
}
// Execute tool calls and continue the conversation
fmt.Fprintf(os.Stderr, "\n")
// Add assistant's tool call message to history (include thinking for proper rendering)
assistantMsg := api.Message{
Role: "assistant",
Content: fullResponse.String(),
Thinking: thinkingContent.String(),
ToolCalls: pendingToolCalls,
}
messages = append(messages, assistantMsg)
// Execute each tool call and collect results
var toolResults []api.Message
for _, call := range pendingToolCalls {
// Show what's being executed
switch call.Function.Name {
case "run_skill_script":
skillVal, _ := call.Function.Arguments.Get("skill")
skill, _ := skillVal.(string)
commandVal, _ := call.Function.Arguments.Get("command")
command, _ := commandVal.(string)
fmt.Fprintf(os.Stderr, "Running script in %s: %s\n", skill, command)
case "read_skill_file":
skillVal, _ := call.Function.Arguments.Get("skill")
skill, _ := skillVal.(string)
pathVal, _ := call.Function.Arguments.Get("path")
path, _ := pathVal.(string)
fmt.Fprintf(os.Stderr, "Reading file from %s: %s\n", skill, path)
default:
fmt.Fprintf(os.Stderr, "Executing: %s\n", call.Function.Name)
}
var result api.Message
var handled bool
var err error
// Try skill catalog first
if skillsCatalog != nil {
result, handled, err = skillsCatalog.RunToolCall(call)
}
// If not handled by skills, try MCP
if !handled && mcpMgr != nil {
result, handled, err = mcpMgr.RunToolCall(call)
}
if err != nil {
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
// Add error result
toolResults = append(toolResults, api.Message{
Role: "tool",
Content: fmt.Sprintf("Error: %v", err),
})
continue
}
if !handled {
fmt.Fprintf(os.Stderr, "Warning: Unknown tool %s\n", call.Function.Name)
toolResults = append(toolResults, api.Message{
Role: "tool",
Content: fmt.Sprintf("Unknown tool: %s", call.Function.Name),
})
continue
}
// Display tool output
if result.Content != "" {
fmt.Fprintf(os.Stderr, "Output:\n%s\n", result.Content)
}
// Add tool result to messages (preserves ToolName, ToolCallID from result)
toolResults = append(toolResults, result)
}
// Add tool results to message history
messages = append(messages, toolResults...)
fmt.Fprintf(os.Stderr, "\n")
// Reset state for next iteration
fullResponse.Reset()
thinkingContent.Reset()
thinkTagOpened = false
thinkTagClosed = false
pendingToolCalls = nil
state = &displayResponseState{}
// Start new progress spinner for next API call
p = progress.NewProgress(os.Stderr)
spinner = progress.NewSpinner("")
p.Add("", spinner)
}
if len(opts.Messages) > 0 {
@@ -1765,7 +2091,6 @@ func NewCLI() *cobra.Command {
runCmd.Flags().Bool("truncate", false, "For embedding models: truncate inputs exceeding context length (default: true). Set --truncate=false to error instead")
runCmd.Flags().Int("dimensions", 0, "Truncate output embeddings to specified dimension (embedding models only)")
runCmd.Flags().Bool("experimental", false, "Enable experimental agent loop with tools")
runCmd.Flags().BoolP("yolo", "y", false, "Skip all tool approval prompts (use with caution)")
stopCmd := &cobra.Command{
Use: "stop MODEL",
@@ -1920,6 +2245,8 @@ func NewCLI() *cobra.Command {
copyCmd,
deleteCmd,
runnerCmd,
NewSkillCommand(),
NewMCPCommand(),
)
return rootCmd

View File

@@ -34,6 +34,9 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
fmt.Fprintln(os.Stderr, "Available Commands:")
fmt.Fprintln(os.Stderr, " /set Set session variables")
fmt.Fprintln(os.Stderr, " /show Show model information")
fmt.Fprintln(os.Stderr, " /skills Show available skills")
fmt.Fprintln(os.Stderr, " /skill Add or remove skills dynamically")
fmt.Fprintln(os.Stderr, " /mcp Show/add/remove MCP servers")
fmt.Fprintln(os.Stderr, " /load <model> Load a session or model")
fmt.Fprintln(os.Stderr, " /save <model> Save your current session")
fmt.Fprintln(os.Stderr, " /clear Clear session context")
@@ -444,6 +447,411 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
} else {
usageShow()
}
case strings.HasPrefix(line, "/skill "):
args := strings.Fields(line)
if len(args) < 2 {
fmt.Fprintln(os.Stderr, "Usage:")
fmt.Fprintln(os.Stderr, " /skill add <path> Add a skill from local path")
fmt.Fprintln(os.Stderr, " /skill remove <name> Remove a skill by name")
fmt.Fprintln(os.Stderr, " /skill list List current skills")
continue
}
switch args[1] {
case "add":
if len(args) < 3 {
fmt.Println("Usage: /skill add <path>")
continue
}
skillPath := args[2]
// Expand ~ to home directory
if strings.HasPrefix(skillPath, "~") {
home, err := os.UserHomeDir()
if err != nil {
fmt.Printf("Error expanding path: %v\n", err)
continue
}
skillPath = filepath.Join(home, skillPath[1:])
}
// Make absolute
absPath, err := filepath.Abs(skillPath)
if err != nil {
fmt.Printf("Error resolving path: %v\n", err)
continue
}
// Verify SKILL.md exists
skillMdPath := filepath.Join(absPath, "SKILL.md")
if _, err := os.Stat(skillMdPath); err != nil {
fmt.Printf("Error: %s does not contain SKILL.md\n", skillPath)
continue
}
// Extract skill name from SKILL.md
content, err := os.ReadFile(skillMdPath)
if err != nil {
fmt.Printf("Error reading SKILL.md: %v\n", err)
continue
}
skillName, _ := extractSkillMetadata(string(content))
if skillName == "" {
skillName = filepath.Base(absPath)
}
// Check if already added
for _, s := range opts.Skills {
if s.Name == skillName {
fmt.Printf("Skill '%s' is already loaded\n", skillName)
continue
}
}
// Add to skills (using path as Name, no digest for local skills)
opts.Skills = append(opts.Skills, api.SkillRef{Name: absPath})
opts.IsAgent = true // Enable agent mode if not already
fmt.Printf("Added skill '%s' from %s\n", skillName, skillPath)
case "remove", "rm":
if len(args) < 3 {
fmt.Println("Usage: /skill remove <name>")
continue
}
skillName := args[2]
found := false
newSkills := make([]api.SkillRef, 0, len(opts.Skills))
for _, s := range opts.Skills {
// Match by name or by path basename
name := s.Name
if strings.Contains(name, string(os.PathSeparator)) {
name = filepath.Base(name)
}
if name == skillName || s.Name == skillName {
found = true
fmt.Printf("Removed skill '%s'\n", skillName)
} else {
newSkills = append(newSkills, s)
}
}
if !found {
fmt.Printf("Skill '%s' not found\n", skillName)
} else {
opts.Skills = newSkills
}
case "list", "ls":
if len(opts.Skills) == 0 {
fmt.Println("No skills loaded in this session.")
} else {
fmt.Println("Skills loaded in this session:")
for _, skill := range opts.Skills {
if skill.Digest != "" {
fmt.Printf(" %s (%s)\n", skill.Name, skill.Digest[:19])
} else {
// For local paths, show basename
name := skill.Name
if strings.Contains(name, string(os.PathSeparator)) {
name = filepath.Base(name) + " (local: " + skill.Name + ")"
}
fmt.Printf(" %s\n", name)
}
}
}
fmt.Println()
default:
fmt.Printf("Unknown skill command '%s'. Use /skill add, /skill remove, or /skill list\n", args[1])
}
continue
case strings.HasPrefix(line, "/skills"):
// Show skills from model (bundled) + session skills
client, err := api.ClientFromEnvironment()
if err != nil {
fmt.Println("error: couldn't connect to ollama server")
return err
}
req := &api.ShowRequest{
Name: opts.Model,
}
resp, err := client.Show(cmd.Context(), req)
if err != nil {
fmt.Println("error: couldn't get model info")
return err
}
// Combine model skills with session skills
allSkills := make([]api.SkillRef, 0)
allSkills = append(allSkills, resp.Skills...)
// Add session skills that aren't already in model skills
for _, sessionSkill := range opts.Skills {
found := false
for _, modelSkill := range resp.Skills {
if modelSkill.Name == sessionSkill.Name || modelSkill.Digest == sessionSkill.Digest {
found = true
break
}
}
if !found {
allSkills = append(allSkills, sessionSkill)
}
}
if len(allSkills) == 0 {
fmt.Println("No skills available.")
} else {
fmt.Println("Available Skills:")
for _, skill := range allSkills {
if skill.Digest != "" {
fmt.Printf(" %s (%s)\n", skill.Name, skill.Digest[:19])
} else {
name := skill.Name
if strings.Contains(name, string(os.PathSeparator)) {
name = filepath.Base(name) + " (session)"
}
fmt.Printf(" %s\n", name)
}
}
}
fmt.Println()
continue
case strings.HasPrefix(line, "/mcp"):
args := strings.Fields(line)
// If just "/mcp" with no args, show all MCP servers
if len(args) == 1 {
// Show MCPs from model (bundled) + global config
client, err := api.ClientFromEnvironment()
if err != nil {
fmt.Println("error: couldn't connect to ollama server")
return err
}
req := &api.ShowRequest{
Name: opts.Model,
}
resp, err := client.Show(cmd.Context(), req)
if err != nil {
fmt.Println("error: couldn't get model info")
return err
}
// Combine model MCPs with global config MCPs
allMCPs := make([]api.MCPRef, 0)
allMCPs = append(allMCPs, resp.MCPs...)
// Load global config
globalConfig, _ := loadMCPConfig()
globalMCPNames := make(map[string]bool)
if globalConfig != nil {
for name, srv := range globalConfig.MCPServers {
// Check if already in model MCPs
found := false
for _, modelMCP := range resp.MCPs {
if modelMCP.Name == name {
found = true
break
}
}
if !found {
allMCPs = append(allMCPs, api.MCPRef{
Name: name,
Command: srv.Command,
Args: srv.Args,
Env: srv.Env,
Type: srv.Type,
})
}
globalMCPNames[name] = true
}
}
if len(allMCPs) == 0 {
fmt.Println("No MCP servers available.")
fmt.Println("Use '/mcp add <name> <command> [args...]' to add one.")
} else {
fmt.Println("Available MCP Servers:")
for _, mcp := range allMCPs {
cmdLine := mcp.Command
if len(mcp.Args) > 0 {
cmdLine += " " + strings.Join(mcp.Args, " ")
}
source := ""
disabled := ""
// Check if it's from model or global config
isFromModel := false
for _, modelMCP := range resp.MCPs {
if modelMCP.Name == mcp.Name {
isFromModel = true
break
}
}
if isFromModel {
source = " (model)"
} else if globalMCPNames[mcp.Name] {
source = " (global)"
// Check if disabled
if srv, ok := globalConfig.MCPServers[mcp.Name]; ok && srv.Disabled {
disabled = " [disabled]"
}
}
fmt.Printf(" %s: %s%s%s\n", mcp.Name, cmdLine, source, disabled)
}
}
fmt.Println()
continue
}
switch args[1] {
case "add":
if len(args) < 4 {
fmt.Println("Usage: /mcp add <name> <command> [args...]")
continue
}
mcpName := args[2]
mcpCommand := args[3]
mcpArgs := args[4:]
// Load global config
config, err := loadMCPConfig()
if err != nil {
fmt.Printf("Error loading MCP config: %v\n", err)
continue
}
// Check if already exists
if _, exists := config.MCPServers[mcpName]; exists {
fmt.Printf("Warning: overwriting existing MCP server '%s'\n", mcpName)
}
// Add to global config
config.MCPServers[mcpName] = MCPServerConfig{
Type: "stdio",
Command: mcpCommand,
Args: mcpArgs,
}
// Save config
if err := saveMCPConfig(config); err != nil {
fmt.Printf("Error saving MCP config: %v\n", err)
continue
}
cmdLine := mcpCommand
if len(mcpArgs) > 0 {
cmdLine += " " + strings.Join(mcpArgs, " ")
}
fmt.Printf("Added MCP server '%s' (%s) to %s\n", mcpName, cmdLine, getMCPConfigPath())
fmt.Println("Note: MCP server will be started on next message.")
case "remove", "rm":
if len(args) < 3 {
fmt.Println("Usage: /mcp remove <name>")
continue
}
mcpName := args[2]
// Load global config
config, err := loadMCPConfig()
if err != nil {
fmt.Printf("Error loading MCP config: %v\n", err)
continue
}
if _, exists := config.MCPServers[mcpName]; !exists {
fmt.Printf("MCP server '%s' not found in global config\n", mcpName)
continue
}
delete(config.MCPServers, mcpName)
if err := saveMCPConfig(config); err != nil {
fmt.Printf("Error saving MCP config: %v\n", err)
continue
}
fmt.Printf("Removed MCP server '%s' from %s\n", mcpName, getMCPConfigPath())
fmt.Println("Note: Changes will take effect on next message.")
case "disable":
if len(args) < 3 {
fmt.Println("Usage: /mcp disable <name>")
continue
}
mcpName := args[2]
config, err := loadMCPConfig()
if err != nil {
fmt.Printf("Error loading MCP config: %v\n", err)
continue
}
srv, exists := config.MCPServers[mcpName]
if !exists {
fmt.Printf("MCP server '%s' not found in global config\n", mcpName)
continue
}
if srv.Disabled {
fmt.Printf("MCP server '%s' is already disabled\n", mcpName)
continue
}
srv.Disabled = true
config.MCPServers[mcpName] = srv
if err := saveMCPConfig(config); err != nil {
fmt.Printf("Error saving MCP config: %v\n", err)
continue
}
fmt.Printf("Disabled MCP server '%s'\n", mcpName)
fmt.Println("Note: Changes will take effect on next message.")
case "enable":
if len(args) < 3 {
fmt.Println("Usage: /mcp enable <name>")
continue
}
mcpName := args[2]
config, err := loadMCPConfig()
if err != nil {
fmt.Printf("Error loading MCP config: %v\n", err)
continue
}
srv, exists := config.MCPServers[mcpName]
if !exists {
fmt.Printf("MCP server '%s' not found in global config\n", mcpName)
continue
}
if !srv.Disabled {
fmt.Printf("MCP server '%s' is already enabled\n", mcpName)
continue
}
srv.Disabled = false
config.MCPServers[mcpName] = srv
if err := saveMCPConfig(config); err != nil {
fmt.Printf("Error saving MCP config: %v\n", err)
continue
}
fmt.Printf("Enabled MCP server '%s'\n", mcpName)
fmt.Println("Note: Changes will take effect on next message.")
default:
fmt.Printf("Unknown mcp command '%s'. Use /mcp, /mcp add, /mcp remove, /mcp disable, or /mcp enable\n", args[1])
}
continue
case strings.HasPrefix(line, "/help"), strings.HasPrefix(line, "/?"):
args := strings.Fields(line)
if len(args) > 1 {
@@ -452,6 +860,20 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
usageSet()
case "show", "/show":
usageShow()
case "skill", "/skill":
fmt.Fprintln(os.Stderr, "Available Commands:")
fmt.Fprintln(os.Stderr, " /skill add <path> Add a skill from local path")
fmt.Fprintln(os.Stderr, " /skill remove <name> Remove a skill by name")
fmt.Fprintln(os.Stderr, " /skill list List current session skills")
fmt.Fprintln(os.Stderr, "")
case "mcp", "/mcp":
fmt.Fprintln(os.Stderr, "Available Commands:")
fmt.Fprintln(os.Stderr, " /mcp Show all MCP servers")
fmt.Fprintln(os.Stderr, " /mcp add <name> <command> [args...] Add an MCP server to global config")
fmt.Fprintln(os.Stderr, " /mcp remove <name> Remove an MCP server from global config")
fmt.Fprintln(os.Stderr, " /mcp disable <name> Disable an MCP server (keep in config)")
fmt.Fprintln(os.Stderr, " /mcp enable <name> Re-enable a disabled MCP server")
fmt.Fprintln(os.Stderr, "")
case "shortcut", "shortcuts":
usageShortcuts()
}

cmd/skill_cmd.go (new file, 570 lines)

@@ -0,0 +1,570 @@
package cmd
import (
"context"
"encoding/json"
"errors"
"fmt"
"os"
"path/filepath"
"strings"
"text/tabwriter"
"time"
"github.com/spf13/cobra"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/progress"
"github.com/ollama/ollama/server"
"github.com/ollama/ollama/types/model"
)
// SkillPushHandler handles the skill push command.
func SkillPushHandler(cmd *cobra.Command, args []string) error {
if len(args) != 2 {
return fmt.Errorf("usage: ollama skill push NAME[:TAG] PATH")
}
name := args[0]
path := args[1]
// Expand path
if strings.HasPrefix(path, "~") {
home, err := os.UserHomeDir()
if err != nil {
return fmt.Errorf("expanding home directory: %w", err)
}
path = filepath.Join(home, path[1:])
}
absPath, err := filepath.Abs(path)
if err != nil {
return fmt.Errorf("resolving path: %w", err)
}
// Validate skill directory
skillMdPath := filepath.Join(absPath, "SKILL.md")
if _, err := os.Stat(skillMdPath); err != nil {
return fmt.Errorf("skill directory must contain SKILL.md: %w", err)
}
// Parse skill name (will set Kind="skill")
n := server.ParseSkillName(name)
if n.Model == "" {
return fmt.Errorf("invalid skill name: %s", name)
}
p := progress.NewProgress(os.Stderr)
defer p.Stop()
// Create skill layer
displayName := n.DisplayShortest()
status := fmt.Sprintf("Creating skill layer for %s", displayName)
spinner := progress.NewSpinner(status)
p.Add(status, spinner)
layer, err := server.CreateSkillLayer(absPath)
if err != nil {
return fmt.Errorf("creating skill layer: %w", err)
}
spinner.Stop()
// Create skill manifest
manifest, configLayer, err := createSkillManifest(absPath, layer)
if err != nil {
return fmt.Errorf("creating skill manifest: %w", err)
}
// Write manifest locally
manifestPath, err := server.GetSkillManifestPath(n)
if err != nil {
return fmt.Errorf("getting manifest path: %w", err)
}
if err := os.MkdirAll(filepath.Dir(manifestPath), 0o755); err != nil {
return fmt.Errorf("creating manifest directory: %w", err)
}
manifestJSON, err := json.Marshal(manifest)
if err != nil {
return fmt.Errorf("marshaling manifest: %w", err)
}
if err := os.WriteFile(manifestPath, manifestJSON, 0o644); err != nil {
return fmt.Errorf("writing manifest: %w", err)
}
fmt.Fprintf(os.Stderr, "Skill %s created locally\n", displayName)
fmt.Fprintf(os.Stderr, " Config: %s (%s)\n", configLayer.Digest, format.HumanBytes(configLayer.Size))
fmt.Fprintf(os.Stderr, " Layer: %s (%s)\n", layer.Digest, format.HumanBytes(layer.Size))
// Push to registry
client, err := api.ClientFromEnvironment()
if err != nil {
return fmt.Errorf("creating client: %w", err)
}
insecure, _ := cmd.Flags().GetBool("insecure")
// For now, we'll use the existing push mechanism
fmt.Fprintf(os.Stderr, "\nPushing to registry...\n")
fn := func(resp api.ProgressResponse) error {
if resp.Digest != "" {
bar := progress.NewBar(resp.Status, resp.Total, resp.Completed)
p.Add(resp.Digest, bar)
} else if resp.Status != "" {
spinner := progress.NewSpinner(resp.Status)
p.Add(resp.Status, spinner)
}
return nil
}
req := &api.PushRequest{
Model: displayName,
Insecure: insecure,
}
if err := client.Push(context.Background(), req, fn); err != nil {
// If push fails, still show success for local creation
fmt.Fprintf(os.Stderr, "\nNote: Local skill created but push failed: %v\n", err)
fmt.Fprintf(os.Stderr, "You can try pushing later with: ollama skill push %s\n", name)
return nil
}
fmt.Fprintf(os.Stderr, "Successfully pushed %s\n", displayName)
return nil
}
// SkillPullHandler handles the skill pull command.
func SkillPullHandler(cmd *cobra.Command, args []string) error {
if len(args) != 1 {
return fmt.Errorf("usage: ollama skill pull NAME[:TAG]")
}
name := args[0]
n := server.ParseSkillName(name)
if n.Model == "" {
return fmt.Errorf("invalid skill name: %s", name)
}
client, err := api.ClientFromEnvironment()
if err != nil {
return fmt.Errorf("creating client: %w", err)
}
insecure, _ := cmd.Flags().GetBool("insecure")
p := progress.NewProgress(os.Stderr)
defer p.Stop()
fn := func(resp api.ProgressResponse) error {
if resp.Digest != "" {
bar := progress.NewBar(resp.Status, resp.Total, resp.Completed)
p.Add(resp.Digest, bar)
} else if resp.Status != "" {
spinner := progress.NewSpinner(resp.Status)
p.Add(resp.Status, spinner)
}
return nil
}
displayName := n.DisplayShortest()
req := &api.PullRequest{
Model: displayName,
Insecure: insecure,
}
if err := client.Pull(context.Background(), req, fn); err != nil {
return fmt.Errorf("pulling skill: %w", err)
}
fmt.Fprintf(os.Stderr, "Successfully pulled %s\n", displayName)
return nil
}
// SkillListHandler handles the skill list command.
func SkillListHandler(cmd *cobra.Command, args []string) error {
skills, err := listLocalSkills()
if err != nil {
return fmt.Errorf("listing skills: %w", err)
}
if len(skills) == 0 {
fmt.Println("No skills installed")
return nil
}
w := tabwriter.NewWriter(os.Stdout, 0, 0, 3, ' ', 0)
fmt.Fprintln(w, "NAME\tTAG\tSIZE\tMODIFIED")
for _, skill := range skills {
fmt.Fprintf(w, "%s/%s\t%s\t%s\t%s\n",
skill.Namespace,
skill.Name,
skill.Tag,
format.HumanBytes(skill.Size),
format.HumanTime(skill.ModifiedAt, "Never"),
)
}
return w.Flush()
}
// SkillRemoveHandler handles the skill rm command.
func SkillRemoveHandler(cmd *cobra.Command, args []string) error {
if len(args) == 0 {
return fmt.Errorf("usage: ollama skill rm NAME[:TAG] [NAME[:TAG]...]")
}
for _, name := range args {
n := server.ParseSkillName(name)
if n.Model == "" {
fmt.Fprintf(os.Stderr, "Invalid skill name: %s\n", name)
continue
}
displayName := n.DisplayShortest()
manifestPath, err := server.GetSkillManifestPath(n)
if err != nil {
fmt.Fprintf(os.Stderr, "Error getting manifest path for %s: %v\n", name, err)
continue
}
if _, err := os.Stat(manifestPath); os.IsNotExist(err) {
fmt.Fprintf(os.Stderr, "Skill not found: %s\n", displayName)
continue
}
if err := os.Remove(manifestPath); err != nil {
fmt.Fprintf(os.Stderr, "Error removing %s: %v\n", displayName, err)
continue
}
// Clean up empty parent directories
dir := filepath.Dir(manifestPath)
for dir != filepath.Join(os.Getenv("HOME"), ".ollama", "models", "manifests") {
entries, _ := os.ReadDir(dir)
if len(entries) == 0 {
os.Remove(dir)
dir = filepath.Dir(dir)
} else {
break
}
}
fmt.Fprintf(os.Stderr, "Deleted '%s'\n", displayName)
}
return nil
}
// SkillShowHandler handles the skill show command.
func SkillShowHandler(cmd *cobra.Command, args []string) error {
if len(args) != 1 {
return fmt.Errorf("usage: ollama skill show NAME[:TAG]")
}
name := args[0]
n := server.ParseSkillName(name)
if n.Model == "" {
return fmt.Errorf("invalid skill name: %s", name)
}
displayName := n.DisplayShortest()
manifestPath, err := server.GetSkillManifestPath(n)
if err != nil {
return fmt.Errorf("getting manifest path: %w", err)
}
data, err := os.ReadFile(manifestPath)
if err != nil {
if os.IsNotExist(err) {
return fmt.Errorf("skill not found: %s", displayName)
}
return fmt.Errorf("reading manifest: %w", err)
}
var manifest server.Manifest
if err := json.Unmarshal(data, &manifest); err != nil {
return fmt.Errorf("parsing manifest: %w", err)
}
fmt.Printf("Skill: %s\n\n", displayName)
fmt.Println("Layers:")
for _, layer := range manifest.Layers {
fmt.Printf(" %s %s %s\n", layer.MediaType, layer.Digest[:19], format.HumanBytes(layer.Size))
}
// Try to read and display SKILL.md content
if len(manifest.Layers) > 0 {
for _, layer := range manifest.Layers {
if layer.MediaType == server.MediaTypeSkill {
skillPath, err := server.GetSkillsPath(layer.Digest)
if err == nil {
skillMdPath := filepath.Join(skillPath, "SKILL.md")
if content, err := os.ReadFile(skillMdPath); err == nil {
fmt.Println("\nContent:")
fmt.Println(string(content))
}
}
}
}
}
return nil
}
// SkillInfo represents information about an installed skill.
type SkillInfo struct {
Namespace string
Name string
Tag string
Size int64
ModifiedAt time.Time
}
// listLocalSkills returns a list of locally installed skills.
// Skills are stored with 5-part paths: host/namespace/kind/model/tag
// where kind is "skill".
func listLocalSkills() ([]SkillInfo, error) {
manifestsPath := filepath.Join(os.Getenv("HOME"), ".ollama", "models", "manifests")
var skills []SkillInfo
// Walk through all registries
registries, err := os.ReadDir(manifestsPath)
if err != nil {
if os.IsNotExist(err) {
return skills, nil
}
return nil, err
}
for _, registry := range registries {
if !registry.IsDir() {
continue
}
// Walk namespaces
namespaces, err := os.ReadDir(filepath.Join(manifestsPath, registry.Name()))
if err != nil {
continue
}
for _, namespace := range namespaces {
if !namespace.IsDir() {
continue
}
// Walk kinds looking for "skill"
kinds, err := os.ReadDir(filepath.Join(manifestsPath, registry.Name(), namespace.Name()))
if err != nil {
continue
}
for _, kind := range kinds {
if !kind.IsDir() {
continue
}
// Only process skill kind
if kind.Name() != server.SkillNamespace {
continue
}
// Walk skill names (model names)
skillNames, err := os.ReadDir(filepath.Join(manifestsPath, registry.Name(), namespace.Name(), kind.Name()))
if err != nil {
continue
}
for _, skillName := range skillNames {
if !skillName.IsDir() {
continue
}
// Walk tags
tags, err := os.ReadDir(filepath.Join(manifestsPath, registry.Name(), namespace.Name(), kind.Name(), skillName.Name()))
if err != nil {
continue
}
for _, tag := range tags {
manifestPath := filepath.Join(manifestsPath, registry.Name(), namespace.Name(), kind.Name(), skillName.Name(), tag.Name())
fi, err := os.Stat(manifestPath)
if err != nil || fi.IsDir() {
continue
}
// Read manifest to get size
data, err := os.ReadFile(manifestPath)
if err != nil {
continue
}
var manifest server.Manifest
if err := json.Unmarshal(data, &manifest); err != nil {
continue
}
var totalSize int64
for _, layer := range manifest.Layers {
totalSize += layer.Size
}
// Build display name using model.Name
n := model.Name{
Host: registry.Name(),
Namespace: namespace.Name(),
Kind: kind.Name(),
Model: skillName.Name(),
Tag: tag.Name(),
}
skills = append(skills, SkillInfo{
Namespace: n.Namespace + "/" + n.Kind,
Name: n.Model,
Tag: n.Tag,
Size: totalSize,
ModifiedAt: fi.ModTime(),
})
}
}
}
}
}
return skills, nil
}
// createSkillManifest creates a manifest for a standalone skill.
func createSkillManifest(skillDir string, layer server.Layer) (*server.Manifest, *server.Layer, error) {
// Read SKILL.md to extract metadata
skillMdPath := filepath.Join(skillDir, "SKILL.md")
content, err := os.ReadFile(skillMdPath)
if err != nil {
return nil, nil, fmt.Errorf("reading SKILL.md: %w", err)
}
// Extract name and description from frontmatter
name, description := extractSkillMetadata(string(content))
if name == "" {
return nil, nil, errors.New("skill name not found in SKILL.md frontmatter")
}
// Create config
config := map[string]any{
"name": name,
"description": description,
"architecture": "amd64",
"os": "linux",
}
configJSON, err := json.Marshal(config)
if err != nil {
return nil, nil, fmt.Errorf("marshaling config: %w", err)
}
// Create config layer
configLayer, err := server.NewLayer(strings.NewReader(string(configJSON)), "application/vnd.docker.container.image.v1+json")
if err != nil {
return nil, nil, fmt.Errorf("creating config layer: %w", err)
}
manifest := &server.Manifest{
SchemaVersion: 2,
MediaType: "application/vnd.docker.distribution.manifest.v2+json",
Config: configLayer,
Layers: []server.Layer{layer},
}
return manifest, &configLayer, nil
}
// extractSkillMetadata extracts name and description from SKILL.md frontmatter.
func extractSkillMetadata(content string) (name, description string) {
lines := strings.Split(content, "\n")
inFrontmatter := false
for _, line := range lines {
trimmed := strings.TrimSpace(line)
if trimmed == "---" {
if !inFrontmatter {
inFrontmatter = true
continue
} else {
break // End of frontmatter
}
}
if inFrontmatter {
if strings.HasPrefix(trimmed, "name:") {
name = strings.TrimSpace(strings.TrimPrefix(trimmed, "name:"))
} else if strings.HasPrefix(trimmed, "description:") {
description = strings.TrimSpace(strings.TrimPrefix(trimmed, "description:"))
}
}
}
return name, description
}
// NewSkillCommand creates the skill parent command with subcommands.
func NewSkillCommand() *cobra.Command {
skillCmd := &cobra.Command{
Use: "skill",
Short: "Manage skills",
Long: "Commands for managing agent skills (push, pull, list, rm, show)",
}
pushCmd := &cobra.Command{
Use: "push NAME[:TAG] PATH",
Short: "Push a skill to a registry",
Long: "Package a local skill directory and push it to a registry",
Args: cobra.ExactArgs(2),
PreRunE: checkServerHeartbeat,
RunE: SkillPushHandler,
}
pushCmd.Flags().Bool("insecure", false, "Use an insecure registry")
pullCmd := &cobra.Command{
Use: "pull NAME[:TAG]",
Short: "Pull a skill from a registry",
Args: cobra.ExactArgs(1),
PreRunE: checkServerHeartbeat,
RunE: SkillPullHandler,
}
pullCmd.Flags().Bool("insecure", false, "Use an insecure registry")
listCmd := &cobra.Command{
Use: "list",
Aliases: []string{"ls"},
Short: "List installed skills",
Args: cobra.NoArgs,
RunE: SkillListHandler,
}
rmCmd := &cobra.Command{
Use: "rm NAME[:TAG] [NAME[:TAG]...]",
Aliases: []string{"remove", "delete"},
Short: "Remove a skill",
Args: cobra.MinimumNArgs(1),
RunE: SkillRemoveHandler,
}
showCmd := &cobra.Command{
Use: "show NAME[:TAG]",
Short: "Show skill details",
Args: cobra.ExactArgs(1),
RunE: SkillShowHandler,
}
skillCmd.AddCommand(pushCmd, pullCmd, listCmd, rmCmd, showCmd)
return skillCmd
}

591
cmd/skills.go Normal file
View File

@@ -0,0 +1,591 @@
package cmd
import (
"bufio"
"bytes"
"context"
"errors"
"fmt"
"io/fs"
"os"
"os/exec"
"path/filepath"
"regexp"
"sort"
"strings"
"time"
"gopkg.in/yaml.v3"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/server"
)
const (
skillFileName = "SKILL.md"
maxSkillDescription = 1024
maxSkillNameLength = 64
)
var skillNamePattern = regexp.MustCompile(`^[a-z0-9]+(?:-[a-z0-9]+)*$`)
type skillMetadata struct {
Name string `yaml:"name"`
Description string `yaml:"description"`
}
type skillDefinition struct {
Name string
Description string
Content string // Full SKILL.md content (without frontmatter)
Dir string
SkillPath string
}
type skillCatalog struct {
Skills []skillDefinition
byName map[string]skillDefinition
}
func loadSkills(paths []string) (*skillCatalog, error) {
if len(paths) == 0 {
return nil, nil
}
var skills []skillDefinition
byName := make(map[string]skillDefinition)
for _, root := range paths {
info, err := os.Stat(root)
if err != nil {
return nil, fmt.Errorf("skills directory %q: %w", root, err)
}
if !info.IsDir() {
return nil, fmt.Errorf("skills path %q is not a directory", root)
}
err = filepath.WalkDir(root, func(path string, entry fs.DirEntry, walkErr error) error {
if walkErr != nil {
return walkErr
}
if entry.IsDir() {
return nil
}
if entry.Name() != skillFileName {
return nil
}
skillDir := filepath.Dir(path)
skill, err := parseSkillFile(path, skillDir)
if err != nil {
fmt.Fprintf(os.Stderr, "Warning: skipping skill at %s: %v\n", path, err)
return nil
}
if _, exists := byName[skill.Name]; exists {
fmt.Fprintf(os.Stderr, "Warning: duplicate skill name %q at %s\n", skill.Name, path)
return nil
}
byName[skill.Name] = skill
skills = append(skills, skill)
return nil
})
if err != nil {
return nil, err
}
}
if len(skills) == 0 {
return nil, nil
}
sort.Slice(skills, func(i, j int) bool {
return skills[i].Name < skills[j].Name
})
return &skillCatalog{Skills: skills, byName: byName}, nil
}
// loadSkillsFromRefs loads skills from a list of SkillRef objects.
// Skills can be referenced by:
// - Digest: loaded from the extracted skill cache (for bundled/pulled skills)
// - Name (local path): loaded from the filesystem (for development)
func loadSkillsFromRefs(refs []api.SkillRef) (*skillCatalog, error) {
if len(refs) == 0 {
return nil, nil
}
var skills []skillDefinition
byName := make(map[string]skillDefinition)
for _, ref := range refs {
var skillDir string
if ref.Digest != "" {
// Load from extracted skill cache
path, err := server.GetSkillsPath(ref.Digest)
if err != nil {
return nil, fmt.Errorf("getting skill path for %s: %w", ref.Digest, err)
}
// Check if skill is already extracted
skillMdPath := filepath.Join(path, skillFileName)
if _, err := os.Stat(skillMdPath); os.IsNotExist(err) {
// Try to extract the skill blob
path, err = server.ExtractSkillBlob(ref.Digest)
if err != nil {
return nil, fmt.Errorf("extracting skill %s: %w", ref.Digest, err)
}
}
skillDir = path
} else if ref.Name != "" {
// Check if this is a local path or a registry reference
if !server.IsLocalSkillPath(ref.Name) {
// Registry reference without a digest - skill needs to be pulled first
// This happens when an agent references a skill that hasn't been bundled
return nil, fmt.Errorf("skill %q is a registry reference but has no digest - the agent may need to be recreated or the skill pulled separately", ref.Name)
}
// Local path - resolve it
skillPath := ref.Name
if strings.HasPrefix(skillPath, "~") {
home, err := os.UserHomeDir()
if err != nil {
return nil, fmt.Errorf("expanding home directory: %w", err)
}
skillPath = filepath.Join(home, skillPath[1:])
}
absPath, err := filepath.Abs(skillPath)
if err != nil {
return nil, fmt.Errorf("resolving skill path %q: %w", ref.Name, err)
}
// Check if this is a directory containing skills or a single skill
info, err := os.Stat(absPath)
if err != nil {
return nil, fmt.Errorf("skill path %q: %w", ref.Name, err)
}
if info.IsDir() {
// Check if it's a skill directory (has SKILL.md) or a parent of skill directories
skillMdPath := filepath.Join(absPath, skillFileName)
if _, err := os.Stat(skillMdPath); err == nil {
// Direct skill directory
skillDir = absPath
} else {
// Parent directory - walk to find skill subdirectories
err := filepath.WalkDir(absPath, func(path string, entry fs.DirEntry, walkErr error) error {
if walkErr != nil {
return walkErr
}
if entry.IsDir() {
return nil
}
if entry.Name() != skillFileName {
return nil
}
skillSubDir := filepath.Dir(path)
skill, err := parseSkillFile(path, skillSubDir)
if err != nil {
fmt.Fprintf(os.Stderr, "Warning: skipping skill at %s: %v\n", path, err)
return nil
}
if _, exists := byName[skill.Name]; exists {
fmt.Fprintf(os.Stderr, "Warning: duplicate skill name %q at %s\n", skill.Name, path)
return nil
}
byName[skill.Name] = skill
skills = append(skills, skill)
return nil
})
if err != nil {
return nil, err
}
continue
}
} else {
return nil, fmt.Errorf("skill path %q is not a directory", ref.Name)
}
} else {
// Both empty - skip
continue
}
// Parse the skill from skillDir if set
if skillDir != "" {
skillMdPath := filepath.Join(skillDir, skillFileName)
skill, err := parseSkillFile(skillMdPath, skillDir)
if err != nil {
return nil, fmt.Errorf("parsing skill at %s: %w", skillDir, err)
}
if _, exists := byName[skill.Name]; exists {
fmt.Fprintf(os.Stderr, "Warning: duplicate skill name %q\n", skill.Name)
continue
}
byName[skill.Name] = skill
skills = append(skills, skill)
}
}
if len(skills) == 0 {
return nil, nil
}
sort.Slice(skills, func(i, j int) bool {
return skills[i].Name < skills[j].Name
})
return &skillCatalog{Skills: skills, byName: byName}, nil
}
func parseSkillFile(path, skillDir string) (skillDefinition, error) {
rawContent, err := os.ReadFile(path)
if err != nil {
return skillDefinition{}, err
}
frontmatter, bodyContent, err := extractFrontmatterAndContent(string(rawContent))
if err != nil {
return skillDefinition{}, err
}
var meta skillMetadata
if err := yaml.Unmarshal([]byte(frontmatter), &meta); err != nil {
return skillDefinition{}, fmt.Errorf("invalid frontmatter: %w", err)
}
if err := validateSkillMetadata(meta, skillDir); err != nil {
return skillDefinition{}, err
}
absPath, err := filepath.Abs(path)
if err != nil {
return skillDefinition{}, err
}
absDir, err := filepath.Abs(skillDir)
if err != nil {
return skillDefinition{}, err
}
return skillDefinition{
Name: meta.Name,
Description: meta.Description,
Content: bodyContent,
Dir: absDir,
SkillPath: absPath,
}, nil
}
func extractFrontmatterAndContent(content string) (frontmatter string, body string, err error) {
scanner := bufio.NewScanner(strings.NewReader(content))
if !scanner.Scan() {
return "", "", errors.New("empty SKILL.md")
}
if strings.TrimSpace(scanner.Text()) != "---" {
return "", "", errors.New("missing YAML frontmatter")
}
var fmLines []string
foundEnd := false
for scanner.Scan() {
line := scanner.Text()
if strings.TrimSpace(line) == "---" {
foundEnd = true
break
}
fmLines = append(fmLines, line)
}
if !foundEnd {
return "", "", errors.New("frontmatter not terminated")
}
// Collect remaining content as body
var bodyLines []string
for scanner.Scan() {
bodyLines = append(bodyLines, scanner.Text())
}
return strings.Join(fmLines, "\n"), strings.TrimSpace(strings.Join(bodyLines, "\n")), nil
}
func validateSkillMetadata(meta skillMetadata, skillDir string) error {
name := strings.TrimSpace(meta.Name)
description := strings.TrimSpace(meta.Description)
switch {
case name == "":
return errors.New("missing skill name")
case len(name) > maxSkillNameLength:
return fmt.Errorf("skill name exceeds %d characters", maxSkillNameLength)
case !skillNamePattern.MatchString(name):
return fmt.Errorf("invalid skill name %q", name)
}
if description == "" {
return errors.New("missing skill description")
}
if len(description) > maxSkillDescription {
return fmt.Errorf("skill description exceeds %d characters", maxSkillDescription)
}
// Skip directory name check for digest-based paths (extracted from blobs)
dirName := filepath.Base(skillDir)
if !strings.HasPrefix(dirName, "sha256-") && dirName != name {
return fmt.Errorf("skill directory %q does not match name %q", dirName, name)
}
return nil
}
func (c *skillCatalog) SystemPrompt() string {
if c == nil || len(c.Skills) == 0 {
return ""
}
var b strings.Builder
b.WriteString("# Skills\n\n")
b.WriteString("You have the following skills loaded. Each skill provides instructions and may include executable scripts.\n\n")
b.WriteString("## Available Tools\n\n")
b.WriteString("- `run_skill_script`: Execute a script bundled with a skill. Use this when the skill instructions tell you to run a script.\n")
b.WriteString("- `read_skill_file`: Read additional files from a skill directory.\n\n")
for _, skill := range c.Skills {
fmt.Fprintf(&b, "## Skill: %s\n\n", skill.Name)
fmt.Fprintf(&b, "%s\n\n", skill.Content)
b.WriteString("---\n\n")
}
return b.String()
}
func (c *skillCatalog) Tools() api.Tools {
if c == nil || len(c.Skills) == 0 {
return nil
}
runScriptProps := api.NewToolPropertiesMap()
runScriptProps.Set("skill", api.ToolProperty{
Type: api.PropertyType{"string"},
Description: "The name of the skill containing the script",
})
runScriptProps.Set("command", api.ToolProperty{
Type: api.PropertyType{"string"},
Description: "The command to execute (e.g., 'python scripts/calculate.py 25 4' or './scripts/run.sh')",
})
readFileProps := api.NewToolPropertiesMap()
readFileProps.Set("skill", api.ToolProperty{
Type: api.PropertyType{"string"},
Description: "The name of the skill containing the file",
})
readFileProps.Set("path", api.ToolProperty{
Type: api.PropertyType{"string"},
Description: "The relative path to the file within the skill directory",
})
return api.Tools{
{
Type: "function",
Function: api.ToolFunction{
Name: "run_skill_script",
Description: "Execute a script or command within a skill's directory. Use this to run Python scripts, shell scripts, or other executables bundled with a skill.",
Parameters: api.ToolFunctionParameters{
Type: "object",
Required: []string{"skill", "command"},
Properties: runScriptProps,
},
},
},
{
Type: "function",
Function: api.ToolFunction{
Name: "read_skill_file",
Description: "Read a file from a skill's directory. Use this to read additional documentation, reference files, or data files bundled with a skill.",
Parameters: api.ToolFunctionParameters{
Type: "object",
Required: []string{"skill", "path"},
Properties: readFileProps,
},
},
},
}
}
func (c *skillCatalog) RunToolCall(call api.ToolCall) (api.Message, bool, error) {
switch call.Function.Name {
case "read_skill_file":
skillName, err := requireStringArg(call.Function.Arguments, "skill")
if err != nil {
return toolMessage(call, err.Error()), true, nil
}
relPath, err := requireStringArg(call.Function.Arguments, "path")
if err != nil {
return toolMessage(call, err.Error()), true, nil
}
skill, ok := c.byName[skillName]
if !ok {
return toolMessage(call, fmt.Sprintf("unknown skill %q", skillName)), true, nil
}
content, err := readSkillFile(skill.Dir, relPath)
if err != nil {
return toolMessage(call, err.Error()), true, nil
}
return toolMessage(call, content), true, nil
case "run_skill_script":
skillName, err := requireStringArg(call.Function.Arguments, "skill")
if err != nil {
return toolMessage(call, err.Error()), true, nil
}
command, err := requireStringArg(call.Function.Arguments, "command")
if err != nil {
return toolMessage(call, err.Error()), true, nil
}
skill, ok := c.byName[skillName]
if !ok {
return toolMessage(call, fmt.Sprintf("unknown skill %q", skillName)), true, nil
}
output, err := runSkillScript(skill.Dir, command)
if err != nil {
return toolMessage(call, fmt.Sprintf("error: %v\noutput: %s", err, output)), true, nil
}
return toolMessage(call, output), true, nil
default:
return api.Message{}, false, nil
}
}
// runSkillScript executes a shell command within a skill's directory.
//
// SECURITY LIMITATIONS (TODO):
// - No sandboxing: commands run with full user permissions
// - No path validation: model can run any command, not just scripts in skill dir
// - Shell injection risk: commands run through sh -c, so crafted input can chain arbitrary commands
// - No executable allowlist: any program can be called (curl, rm, etc.)
// - No environment isolation: scripts inherit full environment variables
//
// POTENTIAL IMPROVEMENTS:
// - Restrict commands to only reference files within skill directory
// - Allowlist specific executables (python3, node, bash)
// - Use sandboxing (Docker, nsjail, seccomp)
// - Require explicit script registration in SKILL.md frontmatter
// - Add per-skill configurable timeouts
func runSkillScript(skillDir, command string) (string, error) {
// Validate the skill directory exists
absSkillDir, err := filepath.Abs(skillDir)
if err != nil {
return "", err
}
if _, err := os.Stat(absSkillDir); err != nil {
return "", fmt.Errorf("skill directory not found: %w", err)
}
// Create command with timeout
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
cmd := exec.CommandContext(ctx, "sh", "-c", command)
cmd.Dir = absSkillDir
// Inject the current working directory (where ollama run was called from)
// as an environment variable so scripts can reference files in that directory
workingDir, err := os.Getwd()
if err != nil {
return "", fmt.Errorf("failed to get working directory: %w", err)
}
cmd.Env = append(os.Environ(), "OLLAMA_WORKING_DIR="+workingDir)
// Capture both stdout and stderr
var stdout, stderr bytes.Buffer
cmd.Stdout = &stdout
cmd.Stderr = &stderr
err = cmd.Run()
// Combine output
output := stdout.String()
if stderr.Len() > 0 {
if output != "" {
output += "\n"
}
output += stderr.String()
}
if err != nil {
if ctx.Err() == context.DeadlineExceeded {
return output, fmt.Errorf("command timed out after 30 seconds")
}
return output, err
}
return output, nil
}
func readSkillFile(skillDir, relPath string) (string, error) {
relPath = filepath.Clean(strings.TrimSpace(relPath))
if relPath == "" {
return "", errors.New("path is required")
}
if filepath.IsAbs(relPath) {
return "", errors.New("path must be relative to the skill directory")
}
target := filepath.Join(skillDir, relPath)
absTarget, err := filepath.Abs(target)
if err != nil {
return "", err
}
absSkillDir, err := filepath.Abs(skillDir)
if err != nil {
return "", err
}
rel, err := filepath.Rel(absSkillDir, absTarget)
if err != nil {
return "", err
}
if strings.HasPrefix(rel, "..") {
return "", errors.New("path escapes the skill directory")
}
content, err := os.ReadFile(absTarget)
if err != nil {
return "", fmt.Errorf("failed to read %q: %w", relPath, err)
}
return string(content), nil
}
func requireStringArg(args api.ToolCallFunctionArguments, name string) (string, error) {
value, ok := args.Get(name)
if !ok {
return "", fmt.Errorf("missing required argument %q", name)
}
str, ok := value.(string)
if !ok {
return "", fmt.Errorf("argument %q must be a string", name)
}
if strings.TrimSpace(str) == "" {
return "", fmt.Errorf("argument %q cannot be empty", name)
}
return str, nil
}
func toolMessage(call api.ToolCall, content string) api.Message {
msg := api.Message{
Role: "tool",
Content: content,
ToolName: call.Function.Name,
}
if call.ID != "" {
msg.ToolCallID = call.ID
}
return msg
}

View File

@@ -6,14 +6,11 @@ import (
"errors"
"fmt"
"io/fs"
"iter"
"log/slog"
"maps"
"os"
"slices"
"strings"
ofs "github.com/ollama/ollama/fs"
"github.com/ollama/ollama/fs/ggml"
)
@@ -21,28 +18,9 @@ type ModelParameters struct {
Architectures []string `json:"architectures"`
VocabSize uint32 `json:"vocab_size"`
// TODO is this needed?
ModelType string `json:"model_type"`
TextModel struct {
VocabSize uint32 `json:"vocab_size"`
HiddenSize uint32 `json:"hidden_size"`
ModelType string `json:"model_type"`
VocabSize uint32 `json:"vocab_size"`
} `json:"text_config"`
// TODO vision config
/*
"vision_config": {
"hidden_size": 1152,
"image_size": 896,
"intermediate_size": 4304,
"model_type": "siglip_vision_model",
"num_attention_heads": 16,
"num_hidden_layers": 27,
"patch_size": 14,
"vision_use_head": false
}
*/
}
type AdapterParameters struct {
@@ -55,91 +33,8 @@ type AdapterParameters struct {
} `json:"lora_parameters"`
}
type KV map[string]any
func (kv KV) Architecture() string {
return kv.String("general.architecture", "unknown")
}
type valueTypes interface {
uint8 | int8 | uint16 | int16 |
uint32 | int32 | uint64 | int64 |
string | float32 | float64 | bool
}
type arrayValueTypes interface {
[]uint8 | []int8 | []uint16 | []int16 |
[]uint32 | []int32 | []uint64 | []int64 |
[]string | []float32 | []float64 | []bool
}
func keyValue[T valueTypes | arrayValueTypes](kv KV, key string, defaultValue ...T) (T, bool) {
if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
key = kv.Architecture() + "." + key
}
if val, ok := kv[key].(T); ok {
return val, true
}
return defaultValue[0], false
}
func (kv KV) String(key string, defaultValue ...string) string {
val, _ := keyValue(kv, key, append(defaultValue, "")...)
return val
}
func (kv KV) Uint(key string, defaultValue ...uint32) uint32 {
val, _ := keyValue(kv, key, append(defaultValue, 0)...)
return val
}
func (kv KV) Float(key string, defaultValue ...float32) float32 {
val, _ := keyValue(kv, key, append(defaultValue, 0)...)
return val
}
func (kv KV) Bool(key string, defaultValue ...bool) bool {
val, _ := keyValue(kv, key, append(defaultValue, false)...)
return val
}
func (kv KV) Strings(key string, defaultValue ...[]string) []string {
val, _ := keyValue(kv, key, append(defaultValue, []string{""})...)
return val
}
func (kv KV) Ints(key string, defaultValue ...[]int32) []int32 {
val, _ := keyValue(kv, key, append(defaultValue, []int32{0})...)
return val
}
func (kv KV) Uints(key string, defaultValue ...[]uint32) []uint32 {
val, _ := keyValue(kv, key, append(defaultValue, []uint32{0})...)
return val
}
func (kv KV) Floats(key string, defaultValue ...[]float32) []float32 {
val, _ := keyValue(kv, key, append(defaultValue, []float32{0})...)
return val
}
func (kv KV) Bools(key string, defaultValue ...[]bool) []bool {
val, _ := keyValue(kv, key, append(defaultValue, []bool{false})...)
return val
}
func (kv KV) Len() int {
return len(kv)
}
func (kv KV) Keys() iter.Seq[string] {
return maps.Keys(kv)
}
func (kv KV) Value(key string) any {
return kv[key]
}
func (ModelParameters) KV(t *Tokenizer) KV {
kv := KV{
func (ModelParameters) KV(t *Tokenizer) ggml.KV {
kv := ggml.KV{
"general.file_type": uint32(1),
"general.quantization_version": uint32(2),
"tokenizer.ggml.pre": t.Pre,
@@ -168,7 +63,7 @@ func (ModelParameters) KV(t *Tokenizer) KV {
return kv
}
func (p AdapterParameters) KV() KV {
func (p AdapterParameters) KV() ggml.KV {
var alpha float32
if p.LoraParameters.Alpha == 0 {
alpha = float32(p.Alpha)
@@ -176,7 +71,7 @@ func (p AdapterParameters) KV() KV {
alpha = p.LoraParameters.Alpha
}
kv := KV{
kv := ggml.KV{
"adapter.lora.alpha": alpha,
"adapter.type": "lora",
"general.file_type": uint32(1),
@@ -193,14 +88,9 @@ func (ModelParameters) specialTokenTypes() []string {
}
}
type ModelKV interface {
// KV maps parameters to LLM key-values
KV(*Tokenizer) KV
}
type ModelConverter interface {
ModelKV
// KV maps parameters to LLM key-values
KV(*Tokenizer) ggml.KV
// Tensors maps input tensors to LLM tensors. Model specific modifications can be done here.
Tensors([]Tensor) []*ggml.Tensor
// Replacements returns a list of string pairs to replace in tensor names.
@@ -217,7 +107,7 @@ type moreParser interface {
type AdapterConverter interface {
// KV maps parameters to LLM key-values
KV(ofs.Config) KV
KV(ggml.KV) ggml.KV
// Tensors maps input tensors to LLM tensors. Adapter specific modifications can be done here.
Tensors([]Tensor) []*ggml.Tensor
// Replacements returns a list of string pairs to replace in tensor names.
@@ -225,7 +115,7 @@ type AdapterConverter interface {
Replacements() []string
}
func ConvertAdapter(fsys fs.FS, f *os.File, baseKV ofs.Config) error {
func ConvertAdapter(fsys fs.FS, f *os.File, baseKV ggml.KV) error {
bts, err := fs.ReadFile(fsys, "adapter_config.json")
if err != nil {
return err
@@ -236,8 +126,8 @@ func ConvertAdapter(fsys fs.FS, f *os.File, baseKV ofs.Config) error {
return err
}
arch := baseKV.Architecture()
if arch == "" {
arch, ok := baseKV["general.architecture"]
if !ok {
return errors.New("architecture not set for the base model")
}
@@ -263,19 +153,23 @@ func ConvertAdapter(fsys fs.FS, f *os.File, baseKV ofs.Config) error {
return writeFile(f, conv.KV(baseKV), conv.Tensors(ts))
}
func LoadModelMetadata(fsys fs.FS) (ModelKV, *Tokenizer, error) {
// Convert writes an Ollama compatible model to the provided io.WriteSeeker based on configurations
// and files it finds in the input path.
// Supported input model formats include safetensors.
// Supported input tokenizers files include tokenizer.json (preferred) and tokenizer.model.
func ConvertModel(fsys fs.FS, f *os.File) error {
bts, err := fs.ReadFile(fsys, "config.json")
if err != nil {
return nil, nil, err
return err
}
var p ModelParameters
if err := json.Unmarshal(bts, &p); err != nil {
return nil, nil, err
return err
}
if len(p.Architectures) < 1 {
return nil, nil, errors.New("unknown architecture")
return errors.New("unknown architecture")
}
var conv ModelConverter
@@ -323,22 +217,22 @@ func LoadModelMetadata(fsys fs.FS) (ModelKV, *Tokenizer, error) {
case "DeepseekV3ForCausalLM":
conv = &deepseek2Model{}
default:
return nil, nil, fmt.Errorf("unsupported architecture %q", p.Architectures[0])
return fmt.Errorf("unsupported architecture %q", p.Architectures[0])
}
if err := json.Unmarshal(bts, conv); err != nil {
return nil, nil, err
return err
}
if t, ok := conv.(moreParser); ok {
if err := t.parseMore(fsys); err != nil {
return nil, nil, err
return err
}
}
t, err := parseTokenizer(fsys, conv.specialTokenTypes())
if err != nil {
return nil, nil, err
return err
}
vocabSize := int(cmp.Or(p.VocabSize, p.TextModel.VocabSize))
@@ -360,19 +254,6 @@ func LoadModelMetadata(fsys fs.FS) (ModelKV, *Tokenizer, error) {
default:
slog.Debug("vocabulary", "size", len(t.Vocabulary.Tokens))
}
return conv, t, nil
}
// Convert writes an Ollama compatible model to the provided io.WriteSeeker based on configurations
// and files it finds in the input path.
// Supported input model formats include safetensors.
// Supported input tokenizers files include tokenizer.json (preferred) and tokenizer.model.
func ConvertModel(fsys fs.FS, f *os.File) error {
kv, t, err := LoadModelMetadata(fsys)
if err != nil {
return err
}
conv := kv.(ModelConverter)
ts, err := parseTensors(fsys, strings.NewReplacer(conv.Replacements()...))
if err != nil {
@@ -382,7 +263,7 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
return writeFile(f, conv.KV(t), conv.Tensors(ts))
}
func writeFile(f *os.File, kv KV, ts []*ggml.Tensor) error {
func writeFile(f *os.File, kv ggml.KV, ts []*ggml.Tensor) error {
for i := range ts {
ts[i].Shape = slices.Clone(ts[i].Shape)
slices.Reverse(ts[i].Shape)

View File

@@ -88,7 +88,7 @@ func (p *bertModel) parseMore(fsys fs.FS) error {
return nil
}
func (p *bertModel) KV(t *Tokenizer) KV {
func (p *bertModel) KV(t *Tokenizer) ggml.KV {
kv := p.ModelParameters.KV(t)
kv["general.architecture"] = "bert"
kv["bert.attention.causal"] = false

View File

@@ -24,7 +24,7 @@ type commandrModel struct {
var _ ModelConverter = (*commandrModel)(nil)
func (p *commandrModel) KV(t *Tokenizer) KV {
func (p *commandrModel) KV(t *Tokenizer) ggml.KV {
kv := p.ModelParameters.KV(t)
kv["general.architecture"] = "command-r"
kv["general.name"] = "command-r"

View File

@@ -47,7 +47,7 @@ type deepseek2Model struct {
Architecture string
}
func (p *deepseek2Model) KV(t *Tokenizer) KV {
func (p *deepseek2Model) KV(t *Tokenizer) ggml.KV {
kv := p.ModelParameters.KV(t)
kv["general.architecture"] = "deepseek2"
kv["general.type"] = "model"

View File

@@ -41,7 +41,7 @@ type deepseekocr struct {
} `json:"vision_config"`
}
func (m *deepseekocr) KV(t *Tokenizer) KV {
func (m *deepseekocr) KV(t *Tokenizer) ggml.KV {
kv := m.ModelParameters.KV(t)
kv["general.architecture"] = "deepseekocr"
kv["block_count"] = m.LanguageConfig.HiddenLayers

View File

@@ -23,7 +23,7 @@ type gemmaModel struct {
var _ ModelConverter = (*gemmaModel)(nil)
func (p *gemmaModel) KV(t *Tokenizer) KV {
func (p *gemmaModel) KV(t *Tokenizer) ggml.KV {
kv := p.ModelParameters.KV(t)
kv["general.architecture"] = "gemma"
kv["gemma.context_length"] = p.MaxPositionEmbeddings

View File

@@ -1,5 +1,7 @@
package convert
import "github.com/ollama/ollama/fs/ggml"
type gemma2Model struct {
gemmaModel
SlidingWindow uint32 `json:"sliding_window"`
@@ -7,7 +9,7 @@ type gemma2Model struct {
FinalLogitSoftcap float32 `json:"final_logit_softcapping"`
}
func (p *gemma2Model) KV(t *Tokenizer) KV {
func (p *gemma2Model) KV(t *Tokenizer) ggml.KV {
kv := p.ModelParameters.KV(t)
kv["general.architecture"] = "gemma2"
kv["gemma2.context_length"] = p.MaxPositionEmbeddings

View File

@@ -6,7 +6,6 @@ import (
"github.com/pdevine/tensor"
"github.com/pdevine/tensor/native"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/fs/ggml"
)
@@ -16,7 +15,7 @@ type gemma2Adapter struct {
var _ AdapterConverter = (*gemma2Adapter)(nil)
func (p *gemma2Adapter) KV(baseKV fs.Config) KV {
func (p *gemma2Adapter) KV(baseKV ggml.KV) ggml.KV {
kv := p.AdapterParameters.KV()
kv["general.architecture"] = "gemma2"
return kv

View File

@@ -3,6 +3,8 @@ package convert
import (
"cmp"
"slices"
"github.com/ollama/ollama/fs/ggml"
)
type gemma3Model struct {
@@ -53,7 +55,7 @@ const (
gemma27BLayerCount = 62
)
func (p *gemma3Model) KV(t *Tokenizer) KV {
func (p *gemma3Model) KV(t *Tokenizer) ggml.KV {
kv := p.ModelParameters.KV(t)
kv["general.architecture"] = "gemma3"

View File

@@ -38,7 +38,7 @@ type gemma3nModel struct {
VisionModel struct{} `json:"vision_config"`
}
func (m *gemma3nModel) KV(t *Tokenizer) KV {
func (m *gemma3nModel) KV(t *Tokenizer) ggml.KV {
kv := m.ModelParameters.KV(t)
kv["general.architecture"] = "gemma3n"
kv["gemma3n.activation_sparsity_scale"] = slices.Collect(func(yield func(float32) bool) {

View File

@@ -37,7 +37,7 @@ type gptossModel struct {
var _ ModelConverter = (*gptossModel)(nil)
func (m *gptossModel) KV(t *Tokenizer) KV {
func (m *gptossModel) KV(t *Tokenizer) ggml.KV {
kv := m.ModelParameters.KV(t)
kv["general.architecture"] = "gptoss"
kv["general.file_type"] = uint32(4)

View File

@@ -48,7 +48,7 @@ type llamaModel struct {
var _ ModelConverter = (*llamaModel)(nil)
func (p *llamaModel) KV(t *Tokenizer) KV {
func (p *llamaModel) KV(t *Tokenizer) ggml.KV {
kv := p.ModelParameters.KV(t)
kv["general.architecture"] = "llama"
kv["llama.vocab_size"] = p.VocabSize

View File

@@ -35,7 +35,7 @@ type llama4Model struct {
}
// KV implements ModelConverter.
func (p *llama4Model) KV(t *Tokenizer) KV {
func (p *llama4Model) KV(t *Tokenizer) ggml.KV {
kv := p.ModelParameters.KV(t)
kv["general.architecture"] = "llama4"

View File

@@ -7,7 +7,6 @@ import (
"github.com/pdevine/tensor"
"github.com/pdevine/tensor/native"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/fs/ggml"
)
@@ -19,13 +18,13 @@ type llamaAdapter struct {
var _ AdapterConverter = (*llamaAdapter)(nil)
func (p *llamaAdapter) KV(baseKV fs.Config) KV {
func (p *llamaAdapter) KV(baseKV ggml.KV) ggml.KV {
kv := p.AdapterParameters.KV()
kv["general.architecture"] = "llama"
kv["llama.attention.head_count"] = baseKV.Value("llama.attention.head_count")
kv["llama.attention.head_count_kv"] = baseKV.Value("llama.attention.head_count_kv")
kv["llama.attention.head_count"] = baseKV["llama.attention.head_count"]
kv["llama.attention.head_count_kv"] = baseKV["llama.attention.head_count_kv"]
p.NumAttentionHeads = baseKV.Value("llama.attention.head_count").(uint32)
p.NumAttentionHeads = baseKV["llama.attention.head_count"].(uint32)
return kv
}

View File

@@ -60,7 +60,7 @@ type mistral3Model struct {
ProjectorHiddenAct string `json:"projector_hidden_act"`
}
func (p *mistral3Model) KV(t *Tokenizer) KV {
func (p *mistral3Model) KV(t *Tokenizer) ggml.KV {
kv := p.ModelParameters.KV(t)
kv["general.architecture"] = "mistral3"
kv["mistral3.vocab_size"] = p.TextModel.VocabSize

View File

@@ -39,7 +39,7 @@ type mistral3CausalModel struct {
} `json:"rope_parameters"`
}
func (p *mistral3CausalModel) KV(t *Tokenizer) KV {
func (p *mistral3CausalModel) KV(t *Tokenizer) ggml.KV {
kv := p.ModelParameters.KV(t)
kv["general.architecture"] = "mistral3"
kv["mistral3.vocab_size"] = p.VocabSize

View File

@@ -12,7 +12,7 @@ type mixtralModel struct {
NumExpertsPerToken uint32 `json:"num_experts_per_tok"`
}
func (p *mixtralModel) KV(t *Tokenizer) KV {
func (p *mixtralModel) KV(t *Tokenizer) ggml.KV {
kv := p.llamaModel.KV(t)
if p.NumLocalExperts > 0 {

View File

@@ -34,7 +34,7 @@ type mllamaModel struct {
} `json:"vision_config"`
}
func (m *mllamaModel) KV(t *Tokenizer) KV {
func (m *mllamaModel) KV(t *Tokenizer) ggml.KV {
kv := m.ModelParameters.KV(t)
kv["general.architecture"] = "mllama"

View File

@@ -87,7 +87,7 @@ func (p *nomicbertModel) parseMore(fsys fs.FS) error {
return nil
}
func (p *nomicbertModel) KV(t *Tokenizer) KV {
func (p *nomicbertModel) KV(t *Tokenizer) ggml.KV {
kv := p.ModelParameters.KV(t)
// Determine architecture based on MoE parameters (following qwen3 pattern)

View File

@@ -34,7 +34,7 @@ type olmoModel struct {
var _ ModelConverter = (*olmoModel)(nil)
func (p *olmoModel) KV(t *Tokenizer) KV {
func (p *olmoModel) KV(t *Tokenizer) ggml.KV {
kv := p.ModelParameters.KV(t)
kv["general.architecture"] = "olmo3"
kv["olmo3.block_count"] = p.NumHiddenLayers

View File

@@ -37,7 +37,7 @@ type phi3Model struct {
var _ ModelConverter = (*phi3Model)(nil)
func (p *phi3Model) KV(t *Tokenizer) KV {
func (p *phi3Model) KV(t *Tokenizer) ggml.KV {
kv := p.ModelParameters.KV(t)
kv["general.architecture"] = "phi3"
kv["phi3.context_length"] = p.MaxPositionEmbeddings

View File

@@ -22,7 +22,7 @@ type qwen2Model struct {
var _ ModelConverter = (*qwen2Model)(nil)
func (q *qwen2Model) KV(t *Tokenizer) KV {
func (q *qwen2Model) KV(t *Tokenizer) ggml.KV {
kv := q.ModelParameters.KV(t)
kv["general.architecture"] = "qwen2"
kv["qwen2.block_count"] = q.HiddenLayers

View File

@@ -29,7 +29,7 @@ type qwen25VLModel struct {
var _ ModelConverter = (*qwen25VLModel)(nil)
func (q *qwen25VLModel) KV(t *Tokenizer) KV {
func (q *qwen25VLModel) KV(t *Tokenizer) ggml.KV {
kv := q.ModelParameters.KV(t)
kv["general.architecture"] = "qwen25vl"

View File

@@ -32,7 +32,7 @@ type qwen3Model struct {
}
// KV implements ModelConverter.
func (q *qwen3Model) KV(t *Tokenizer) KV {
func (q *qwen3Model) KV(t *Tokenizer) ggml.KV {
arch := "qwen3"
if q.NumExperts > 0 {
arch += "moe"

View File

@@ -45,7 +45,7 @@ func (m *qwen3VLModel) parseMore(fsys fs.FS) error {
return json.Unmarshal(bts, &m.VisionModel)
}
func (m *qwen3VLModel) KV(t *Tokenizer) KV {
func (m *qwen3VLModel) KV(t *Tokenizer) ggml.KV {
kv := m.qwen3Model.KV(t)
arch := "qwen3vl"

View File

@@ -19,7 +19,6 @@ import (
"testing"
"github.com/google/go-cmp/cmp"
fsc "github.com/ollama/ollama/fs"
"github.com/ollama/ollama/fs/ggml"
)
@@ -29,7 +28,7 @@ type tensorData struct {
Shape []int `json:"shape"`
}
func convertFull(t *testing.T, fsys fs.FS) (*os.File, fsc.Config, ggml.Tensors) {
func convertFull(t *testing.T, fsys fs.FS) (*os.File, ggml.KV, ggml.Tensors) {
t.Helper()
f, err := os.CreateTemp(t.TempDir(), "f16")
@@ -60,10 +59,9 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, fsc.Config, ggml.Tensors)
return r, m.KV(), m.Tensors()
}
func generateResultsJSON(t *testing.T, f *os.File, kv fsc.Config, tensors ggml.Tensors) map[string]string {
func generateResultsJSON(t *testing.T, f *os.File, kv ggml.KV, tensors ggml.Tensors) map[string]string {
actual := make(map[string]string)
for k := range kv.Keys() {
v := kv.Value(k)
for k, v := range kv {
if s, ok := v.(json.Marshaler); !ok {
actual[k] = fmt.Sprintf("%v", v)
} else {
@@ -279,7 +277,7 @@ func generateSafetensorTestData(t *testing.T, tempDir string, tensorData map[str
func TestConvertAdapter(t *testing.T) {
type AdapterCase struct {
Name string
BaseKV KV
BaseKV map[string]any
Expected map[string]string
}

548
docs/skills.md Normal file
View File

@@ -0,0 +1,548 @@
# Ollama Skills
Skills are reusable capability packages that extend what agents can do. They bundle instructions, scripts, and data that teach an agent how to perform specific tasks.
## Quick Start
### Creating a Skill
Create a directory with a `SKILL.md` file:
```
my-skill/
├── SKILL.md # Required: Instructions for the agent
└── scripts/ # Optional: Executable scripts
    └── run.py
```
The `SKILL.md` file must have YAML frontmatter:
```markdown
---
name: my-skill
description: A brief description of what this skill does
---
# My Skill
## Purpose
Explain what this skill does and when to use it.
## Instructions
Step-by-step instructions for the agent on how to use this skill.
## Examples
Show example inputs and expected outputs.
```
### Using Skills in an Agent
Reference skills in your Agentfile:
```dockerfile
FROM llama3.2:3b
AGENT_TYPE conversational
# Local skill (bundled with agent)
SKILL ./path/to/my-skill
# Registry skill (pulled from ollama.com)
SKILL library/skill/calculator:1.0.0
# User skill from registry
SKILL myname/skill/calculator:1.0.0
SYSTEM You are a helpful assistant.
```
### Managing Skills
```bash
# Push a skill to the registry (uses your namespace)
ollama skill push myname/skill/calculator:1.0.0 ./my-skill
# Pull a skill from the official library
ollama skill pull skill/calculator:1.0.0
# Pull a skill from a user's namespace
ollama skill pull myname/skill/calculator:1.0.0
# List installed skills
ollama skill list
# Show skill details
ollama skill show skill/calculator:1.0.0
# Remove a skill
ollama skill rm skill/calculator:1.0.0
```
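Skills reuse the same push/pull plumbing as models, so they can also be managed programmatically with the Go API client. A minimal sketch, mirroring what `SkillPullHandler` does internally (the skill reference simply travels in the `Model` field):
```go
package main
import (
	"context"
	"fmt"
	"log"
	"github.com/ollama/ollama/api"
)
func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}
	// Skills ride the same pull path as models, so the skill
	// reference travels in the Model field of the pull request.
	req := &api.PullRequest{Model: "library/skill/calculator:1.0.0"}
	fn := func(resp api.ProgressResponse) error {
		fmt.Println(resp.Status) // layer download progress
		return nil
	}
	if err := client.Pull(context.Background(), req, fn); err != nil {
		log.Fatal(err)
	}
}
```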
### Dynamic Skills in Chat
You can add and remove skills dynamically during an interactive chat session:
```
>>> /skills
Available Skills:
calculator (sha256:abc123def456...)
>>> /skill add ./my-local-skill
Added skill 'my-skill' from ./my-local-skill
>>> /skill list
Skills loaded in this session:
my-skill (local: /path/to/my-local-skill)
>>> /skill remove my-skill
Removed skill 'my-skill'
```
| Command | Description |
|---------|-------------|
| `/skills` | Show all available skills (model + session) |
| `/skill add <path>` | Add a skill from a local path |
| `/skill remove <name>` | Remove a skill by name |
| `/skill list` | List skills loaded in this session |
Dynamic skills take effect on the next message. This is useful for:
- Testing skills during development
- Temporarily adding capabilities to a model
- Experimenting with skill combinations
## Skill Reference Formats
Skills use a 5-part name structure: `host/namespace/kind/model:tag`
| Format | Example | Description |
|--------|---------|-------------|
| Local path | `./skills/calc` | Bundled with agent at create time |
| Library skill | `skill/calculator:1.0.0` | From the official skill library (library/skill/calculator) |
| User skill | `alice/skill/calc:1.0.0` | From a user's namespace |
| Full path | `registry.ollama.ai/alice/skill/calc:1.0.0` | Fully qualified with host |
The `kind` field distinguishes skills from models:
- `skill` - Skill packages
- `agent` - Agent packages (future)
- (empty) - Regular models
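For illustration, the sketch below splits a reference into its five parts. This is a simplified, assumption-level model of the layout above, not the real parser (`server.ParseSkillName` also handles local paths and display shortening):
```go
package main
import (
	"fmt"
	"strings"
)
// splitSkillRef fills in the defaults implied by the table above:
// omitted host -> registry.ollama.ai, omitted namespace -> library,
// omitted tag -> latest. The defaults are assumptions for illustration.
func splitSkillRef(ref string) (host, namespace, kind, model, tag string) {
	host, namespace, tag = "registry.ollama.ai", "library", "latest"
	if base, t, ok := strings.Cut(ref, ":"); ok {
		ref, tag = base, t
	}
	parts := strings.Split(ref, "/")
	switch len(parts) {
	case 4: // host/namespace/kind/model
		host, namespace, kind, model = parts[0], parts[1], parts[2], parts[3]
	case 3: // namespace/kind/model
		namespace, kind, model = parts[0], parts[1], parts[2]
	case 2: // kind/model (library shorthand)
		kind, model = parts[0], parts[1]
	default: // bare name: kind stays empty, i.e. a regular model
		model = parts[0]
	}
	return
}
func main() {
	fmt.Println(splitSkillRef("alice/skill/calc:1.0.0"))
	// registry.ollama.ai alice skill calc 1.0.0
}
```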
## SKILL.md Structure
### Required Frontmatter
```yaml
---
name: skill-name # Must match directory name
description: Brief description of the skill
---
```
These fields are validated when the skill loads: `name` must be lowercase alphanumeric with hyphens, at most 64 characters, and must match the skill's directory name; `description` is required and capped at 1024 characters.
### Recommended Sections
1. **Purpose**: What the skill does and when to use it
2. **When to use**: Trigger conditions for the agent
3. **Instructions**: Step-by-step usage guide
4. **Examples**: Input/output examples
5. **Scripts**: Documentation for any bundled scripts
### Example: Calculator Skill
```markdown
---
name: calculator
description: Performs mathematical calculations using Python
---
# Calculator Skill
## Purpose
This skill performs mathematical calculations using a bundled Python script.
## When to use
- User asks to calculate something
- User wants to do math operations
- Any arithmetic is needed
## Instructions
1. When calculation is needed, use the `run_skill_script` tool
2. Call: `python3 scripts/calculate.py "<expression>"`
3. Return the result to the user
## Examples
**Input**: "What is 25 * 4?"
**Action**: `run_skill_script` with command `python3 scripts/calculate.py '25 * 4'`
**Output**: "25 * 4 = 100"
```
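Under the hood, `run_skill_script` executes the command through `sh -c` in the skill's directory, with a 30-second timeout and the caller's working directory exposed as `OLLAMA_WORKING_DIR`. A condensed sketch of that execution path (the full version in `cmd/skills.go` captures stdout and stderr separately):
```go
package main
import (
	"context"
	"fmt"
	"os"
	"os/exec"
	"time"
)
// runInSkillDir condenses runSkillScript from cmd/skills.go: the command
// runs via sh -c inside the skill directory, bounded by a 30-second
// timeout, with OLLAMA_WORKING_DIR pointing at the caller's directory.
func runInSkillDir(skillDir, command string) (string, error) {
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()
	cmd := exec.CommandContext(ctx, "sh", "-c", command)
	cmd.Dir = skillDir
	wd, err := os.Getwd()
	if err != nil {
		return "", err
	}
	cmd.Env = append(os.Environ(), "OLLAMA_WORKING_DIR="+wd)
	out, err := cmd.CombinedOutput() // simplified: the real code splits the streams
	return string(out), err
}
func main() {
	out, err := runInSkillDir("./calculator", "python3 scripts/calculate.py '25 * 4'")
	fmt.Println(out, err)
}
```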
## Storage Layout
```
~/.ollama/models/
├── blobs/
│   └── sha256-<digest>              # Skill tar.gz blob
├── manifests/
│   └── registry.ollama.ai/
│       ├── library/
│       │   └── skill/               # Library skills
│       │       └── calculator/
│       │           └── 1.0.0
│       └── username/
│           └── skill/               # User skills
│               └── my-skill/
│                   └── latest
└── skills/
    └── sha256-<digest>/             # Extracted skill cache
        ├── SKILL.md
        └── scripts/
```
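A sketch of how the extracted-skill cache path above can be derived from a layer digest. The real lookup is `server.GetSkillsPath`, so treat the exact join here as an assumption that simply mirrors the tree:
```go
package main
import (
	"fmt"
	"os"
	"path/filepath"
	"strings"
)
// skillCachePath mirrors the layout above: a layer digest like
// "sha256:abc..." is stored on disk as "sha256-abc..." under
// ~/.ollama/models/skills/. Assumed helper, not the real implementation.
func skillCachePath(digest string) (string, error) {
	home, err := os.UserHomeDir()
	if err != nil {
		return "", err
	}
	dir := strings.Replace(digest, ":", "-", 1)
	return filepath.Join(home, ".ollama", "models", "skills", dir), nil
}
func main() {
	p, _ := skillCachePath("sha256:abc123")
	fmt.Println(p) // e.g. /home/user/.ollama/models/skills/sha256-abc123
}
```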
---
# Security Considerations
## Current State (Development)
The current implementation has several security considerations that need to be addressed before production use.
### 1. Script Execution
**Risk**: Skills can bundle arbitrary scripts that execute on the host system.
**Current behavior**:
- Scripts run with the same permissions as the Ollama process
- No sandboxing or isolation
- Full filesystem access
**Mitigations needed**:
- [ ] Sandbox script execution (containers, seccomp, etc.)
- [ ] Resource limits (CPU, memory, time)
- [ ] Filesystem isolation (read-only mounts, restricted paths)
- [ ] Network policy controls
- [ ] Capability dropping
### 2. Skill Provenance
**Risk**: Malicious skills could be pushed to the registry.
**Current behavior**:
- No code signing or verification
- No malware scanning
- Trust based on namespace ownership
**Mitigations needed**:
- [ ] Skill signing with author keys
- [ ] Registry-side malware scanning
- [ ] Content policy enforcement
- [ ] Reputation system for skill authors
### 3. Namespace Squatting
**Risk**: Malicious actors could register skill names that impersonate official tools.
**Current behavior**:
- First-come-first-served namespace registration
- No verification of skill names
**Mitigations needed**:
- [ ] Reserved namespace list (official tools, common names)
- [ ] Trademark/name verification for popular skills
- [ ] Clear namespacing conventions
### 4. Supply Chain Attacks
**Risk**: Compromised skills could inject malicious code into agents.
**Current behavior**:
- Skills pulled without integrity verification beyond digest
- No dependency tracking
**Mitigations needed**:
- [ ] SBOM (Software Bill of Materials) for skills
- [ ] Dependency vulnerability scanning
- [ ] Pinned versions in Agentfiles
- [ ] Audit logging of skill usage
### 5. Data Exfiltration
**Risk**: Skills could exfiltrate sensitive data from conversations or the host.
**Current behavior**:
- Skills have access to conversation context
- Scripts can make network requests
**Mitigations needed**:
- [ ] Network egress controls
- [ ] Sensitive data detection/masking
- [ ] Audit logging of script network activity
- [ ] User consent for data access
### 6. Privilege Escalation
**Risk**: Skills could escalate privileges through script execution.
**Current behavior**:
- Scripts inherit Ollama process privileges
- No capability restrictions
**Mitigations needed**:
- [ ] Run scripts as unprivileged user
- [ ] Drop all capabilities
- [ ] Mandatory access controls (SELinux/AppArmor)
## Recommended Security Model
### Skill Trust Levels
```
┌─────────────────────────────────────────────────────────────┐
│ Level 0: Untrusted (default) │
│ - No script execution │
│ - Instructions only │
│ - Safe for any skill │
├─────────────────────────────────────────────────────────────┤
│ Level 1: Sandboxed │
│ - Scripts run in isolated container │
│ - No network access │
│ - Read-only filesystem │
│ - Resource limits enforced │
├─────────────────────────────────────────────────────────────┤
│ Level 2: Trusted │
│ - Scripts run with network access │
│ - Can write to designated directories │
│ - Requires explicit user approval │
├─────────────────────────────────────────────────────────────┤
│ Level 3: Privileged (admin only) │
│ - Full host access │
│ - System administration skills │
│ - Requires admin approval │
└─────────────────────────────────────────────────────────────┘
```
### Skill Manifest Security Fields (Future)
```yaml
---
name: my-skill
description: A skill description
security:
trust_level: sandboxed
permissions:
- network:read # Can make HTTP GET requests
- filesystem:read:/data # Can read from /data
resource_limits:
max_memory: 256MB
max_cpu_time: 30s
max_disk: 100MB
signature: sha256:abc... # Author signature
---
```
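If fields like these land, the `skillMetadata` struct in `cmd/skills.go` would grow a matching section. A speculative sketch; every field name below is an assumption taken from the YAML above, and nothing here exists today:
```go
package main
import "fmt"
// Speculative extension of the skillMetadata struct in cmd/skills.go;
// none of these fields are implemented yet.
type skillSecurity struct {
	TrustLevel     string   `yaml:"trust_level"`
	Permissions    []string `yaml:"permissions"`
	ResourceLimits struct {
		MaxMemory  string `yaml:"max_memory"`
		MaxCPUTime string `yaml:"max_cpu_time"`
		MaxDisk    string `yaml:"max_disk"`
	} `yaml:"resource_limits"`
	Signature string `yaml:"signature"` // author signature, e.g. sha256:...
}
func main() {
	fmt.Printf("%+v\n", skillSecurity{TrustLevel: "sandboxed"})
}
```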
---
# Future Considerations
## Feature Roadmap
### Phase 1: Foundation (Current)
- [x] Skill bundling with agents
- [x] Local skill development
- [x] Basic CLI commands (push, pull, list, rm, show)
- [x] Registry blob storage
- [ ] Registry namespace configuration
### Phase 2: Security
- [ ] Script sandboxing
- [ ] Permission model
- [ ] Skill signing
- [ ] Audit logging
### Phase 3: Discovery
- [ ] Skill search on ollama.com
- [ ] Skill ratings and reviews
- [ ] Usage analytics
- [ ] Featured/trending skills
### Phase 4: Advanced Features
- [ ] Skill dependencies
- [ ] Skill versioning constraints
- [ ] Skill composition (skills using skills)
- [ ] Skill testing framework
## Open Questions
### 1. Skill Execution Model
**Question**: How should skills execute scripts?
Options:
- **A) In-process**: Fast but unsafe
- **B) Subprocess**: Current approach, moderate isolation
- **C) Container**: Good isolation, requires container runtime
- **D) WASM**: Portable and safe, limited capabilities
- **E) Remote execution**: Offload to secure service
### 2. Skill Versioning
**Question**: How strict should version pinning be?
Options:
- **A) Always latest**: Simple but risky
- **B) Semantic versioning**: `^1.0.0` allows minor updates
- **C) Exact pinning**: `=1.0.0` requires explicit updates
- **D) Digest pinning**: `@sha256:abc` immutable reference
### 3. Skill Permissions
**Question**: How should users grant permissions to skills?
Options:
- **A) All or nothing**: Accept all permissions or don't use
- **B) Granular consent**: Approve each permission individually
- **C) Trust levels**: Pre-defined permission bundles
- **D) Runtime prompts**: Ask when permission is first used
### 4. Skill Discovery
**Question**: How should users find skills?
Options:
- **A) Central registry only**: ollama.com/skills
- **B) Federated registries**: Multiple skill sources
- **C) Git repositories**: Pull from GitHub, etc.
- **D) All of the above**: Multiple discovery mechanisms
### 5. Skill Monetization
**Question**: Should skill authors be able to monetize?
Options:
- **A) Free only**: All skills are free and open
- **B) Paid skills**: Authors can charge for skills
- **C) Freemium**: Free tier with paid features
- **D) Donations**: Voluntary support for authors
### 6. Skill Updates
**Question**: How should skill updates be handled?
Options:
- **A) Manual**: User explicitly updates
- **B) Auto-update**: Always use latest
- **C) Notify**: Alert user to available updates
- **D) Policy-based**: Organization controls update policy
## API Considerations
### Skill Metadata API
```
GET /api/skills
GET /api/skills/:namespace/:name
GET /api/skills/:namespace/:name/versions
GET /api/skills/:namespace/:name/readme
```
### Skill Execution API
```
POST /api/skills/:namespace/:name/execute
{
"command": "python3 scripts/run.py",
"args": ["--input", "data"],
"timeout": 30
}
```
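Because this endpoint is only a proposal, any client code is necessarily speculative. A sketch of what a call might look like, with the route, fields, and default port all carried over as assumptions from the JSON above:
```go
package main
import (
	"bytes"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
)
// executeSkill posts to the proposed execute endpoint. The route and
// request body are assumptions; this API does not exist yet.
func executeSkill(namespace, name, command string, timeoutSecs int) (string, error) {
	body, err := json.Marshal(map[string]any{
		"command": command,
		"timeout": timeoutSecs,
	})
	if err != nil {
		return "", err
	}
	url := fmt.Sprintf("http://localhost:11434/api/skills/%s/%s/execute", namespace, name)
	resp, err := http.Post(url, "application/json", bytes.NewReader(body))
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()
	out, err := io.ReadAll(resp.Body)
	return string(out), err
}
func main() {
	out, err := executeSkill("library", "calculator", "python3 scripts/run.py", 30)
	fmt.Println(out, err)
}
```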
### Skill Permissions API
```
GET /api/skills/:namespace/:name/permissions
POST /api/skills/:namespace/:name/permissions/grant
DELETE /api/skills/:namespace/:name/permissions/revoke
```
## Testing Considerations
### Skill Testing Framework
```bash
# Run skill tests
ollama skill test ./my-skill
# Test with specific model
ollama skill test ./my-skill --model llama3.2:3b
# Generate test report
ollama skill test ./my-skill --report
```
### Test File Format
```yaml
# my-skill/tests/test.yaml
tests:
- name: "basic calculation"
input: "What is 2 + 2?"
expect:
contains: "4"
tool_called: "run_skill_script"
- name: "complex expression"
input: "Calculate 15% of 200"
expect:
contains: "30"
```
## Compatibility Considerations
### Minimum Ollama Version
Skills should declare the minimum Ollama version they require:
```yaml
---
name: my-skill
requires:
ollama: ">=0.4.0"
---
```
### Model Compatibility
Skills may require specific model capabilities:
```yaml
---
name: vision-skill
requires:
capabilities:
- vision
- tools
---
```
## Migration Path
### From Local to Registry
```bash
# Develop locally
SKILL ./my-skill
# Push when ready
ollama skill push myname/skill/my-skill:1.0.0 ./my-skill
# Update Agentfile
SKILL myname/skill/my-skill:1.0.0
```
### Version Upgrades
```bash
# Check for updates
ollama skill outdated
# Update specific skill
ollama skill update calculator:1.0.0
# Update all skills
ollama skill update --all
```

View File

@@ -148,6 +148,16 @@ func Remotes() []string {
return r
}
// Skills returns the list of skill directories. Skill directories are
// configured via the OLLAMA_SKILLS environment variable as a
// comma-separated list. Returns an empty slice if not configured.
func Skills() []string {
raw := strings.TrimSpace(Var("OLLAMA_SKILLS"))
if raw == "" {
return []string{}
}
dirs := strings.Split(raw, ",")
// Trim whitespace around each entry so "a, b" parses cleanly.
for i, d := range dirs {
dirs[i] = strings.TrimSpace(d)
}
return dirs
}
func BoolWithDefault(k string) func(defaultValue bool) bool {
return func(defaultValue bool) bool {
if s := Var(k); s != "" {
@@ -317,6 +327,9 @@ func AsMap() map[string]EnvVar {
ret["OLLAMA_VULKAN"] = EnvVar{"OLLAMA_VULKAN", EnableVulkan(), "Enable experimental Vulkan support"}
}
ret["OLLAMA_SKILLS"] = EnvVar{"OLLAMA_SKILLS", Skills(), "Comma-separated list of skill directories"}
return ret
}

View File

@@ -1,7 +1,5 @@
package fs
import "iter"
type Config interface {
Architecture() string
String(string, ...string) string
@@ -13,8 +11,4 @@ type Config interface {
Ints(string, ...[]int32) []int32
Floats(string, ...[]float32) []float32
Bools(string, ...[]bool) []bool
Len() int
Keys() iter.Seq[string]
Value(key string) any
}

View File

@@ -6,9 +6,7 @@ import (
"errors"
"fmt"
"io"
"iter"
"log/slog"
"maps"
"math"
"slices"
"strings"
@@ -241,18 +239,6 @@ func (kv KV) Bools(key string, defaultValue ...[]bool) []bool {
return val.values
}
func (kv KV) Len() int {
return len(kv)
}
func (kv KV) Keys() iter.Seq[string] {
return maps.Keys(kv)
}
func (kv KV) Value(key string) any {
return kv[key]
}
func (kv KV) OllamaEngineRequired() bool {
return slices.Contains([]string{
"bert",

View File

@@ -8,12 +8,12 @@ import (
"fmt"
"io"
"log/slog"
"maps"
"os"
"runtime"
"slices"
"strings"
"github.com/ollama/ollama/fs"
"golang.org/x/sync/errgroup"
)
@@ -508,7 +508,7 @@ func writeGGUFArray[S ~[]E, E any](w io.Writer, t uint32, s S) error {
return binary.Write(w, binary.LittleEndian, s)
}
func WriteGGUF(f *os.File, kv fs.Config, ts []*Tensor) error {
func WriteGGUF(f *os.File, kv KV, ts []*Tensor) error {
arch := kv.String("general.architecture")
if arch == "" {
return fmt.Errorf("architecture not set")
@@ -526,12 +526,12 @@ func WriteGGUF(f *os.File, kv fs.Config, ts []*Tensor) error {
return err
}
if err := binary.Write(f, binary.LittleEndian, uint64(kv.Len())); err != nil {
if err := binary.Write(f, binary.LittleEndian, uint64(len(kv))); err != nil {
return err
}
for _, key := range slices.Sorted(kv.Keys()) {
if err := ggufWriteKV(f, arch, key, kv.Value(key)); err != nil {
for _, key := range slices.Sorted(maps.Keys(kv)) {
if err := ggufWriteKV(f, arch, key, kv[key]); err != nil {
return err
}
}

2
go.mod
View File

@@ -87,5 +87,5 @@ require (
golang.org/x/term v0.36.0
golang.org/x/text v0.30.0
google.golang.org/protobuf v1.34.1
gopkg.in/yaml.v3 v3.0.1 // indirect
gopkg.in/yaml.v3 v3.0.1
)

View File

@@ -4,6 +4,7 @@ import (
"bufio"
"bytes"
"crypto/sha256"
"encoding/json"
"errors"
"fmt"
"io"
@@ -58,6 +59,8 @@ func (f Modelfile) CreateRequest(relativeDir string) (*api.CreateRequest, error)
var messages []api.Message
var licenses []string
var skills []api.SkillRef
var mcps []api.MCPRef
params := make(map[string]any)
for _, c := range f.Commands {
@@ -118,6 +121,32 @@ func (f Modelfile) CreateRequest(relativeDir string) (*api.CreateRequest, error)
case "message":
role, msg, _ := strings.Cut(c.Args, ": ")
messages = append(messages, api.Message{Role: role, Content: msg})
case "skill":
skillName := c.Args
// Expand local paths relative to the Agentfile directory
if isLocalPath(skillName) {
expanded, err := expandPath(skillName, relativeDir)
if err != nil {
return nil, fmt.Errorf("expanding skill path %q: %w", skillName, err)
}
skillName = expanded
}
skills = append(skills, api.SkillRef{Name: skillName})
case "mcp":
mcpRef, err := parseMCPArg(c.Args, relativeDir)
if err != nil {
return nil, fmt.Errorf("invalid MCP: %w", err)
}
mcps = append(mcps, mcpRef)
case "agent_type":
// Handle "AGENT TYPE conversational" -> strip "TYPE " prefix
args := c.Args
if strings.HasPrefix(strings.ToLower(args), "type ") {
args = strings.TrimSpace(args[5:])
}
req.AgentType = args
case "entrypoint":
req.Entrypoint = c.Args
default:
if slices.Contains(deprecatedParameters, c.Name) {
fmt.Printf("warning: parameter %s is deprecated\n", c.Name)
@@ -150,6 +179,12 @@ func (f Modelfile) CreateRequest(relativeDir string) (*api.CreateRequest, error)
if len(licenses) > 0 {
req.License = licenses
}
if len(skills) > 0 {
req.Skills = skills
}
if len(mcps) > 0 {
req.MCPs = mcps
}
return req, nil
}
@@ -333,7 +368,7 @@ func (c Command) String() string {
switch c.Name {
case "model":
fmt.Fprintf(&sb, "FROM %s", c.Args)
case "license", "template", "system", "adapter", "renderer", "parser", "requires":
case "license", "template", "system", "adapter", "renderer", "parser", "requires", "skill", "agent_type", "entrypoint":
fmt.Fprintf(&sb, "%s %s", strings.ToUpper(c.Name), quote(c.Args))
case "message":
role, message, _ := strings.Cut(c.Args, ": ")
@@ -359,7 +394,7 @@ const (
var (
errMissingFrom = errors.New("no FROM line")
errInvalidMessageRole = errors.New("message role must be one of \"system\", \"user\", or \"assistant\"")
errInvalidCommand = errors.New("command must be one of \"from\", \"license\", \"template\", \"system\", \"adapter\", \"renderer\", \"parser\", \"parameter\", \"message\", or \"requires\"")
errInvalidCommand = errors.New("command must be one of \"from\", \"license\", \"template\", \"system\", \"adapter\", \"renderer\", \"parser\", \"parameter\", \"message\", \"requires\", \"skill\", \"agent_type\", \"mcp\", or \"entrypoint\"")
)
type ParserError struct {
@@ -423,6 +458,9 @@ func ParseFile(r io.Reader) (*Modelfile, error) {
switch s := strings.ToLower(b.String()); s {
case "from":
cmd.Name = "model"
case "agent":
// "AGENT TYPE" -> "agent_type", consume next word
cmd.Name = "agent_type"
case "parameter":
// transition to stateParameter which sets command name
next = stateParameter
@@ -500,6 +538,10 @@ func ParseFile(r io.Reader) (*Modelfile, error) {
if cmd.Name == "model" {
return &f, nil
}
// Allow entrypoint-only agents without FROM
if cmd.Name == "entrypoint" {
return &f, nil
}
}
return nil, errMissingFrom
@@ -518,7 +560,7 @@ func parseRuneForState(r rune, cs state) (state, rune, error) {
}
case stateName:
switch {
case isAlpha(r):
case isAlpha(r), r == '_':
return stateName, r, nil
case isSpace(r):
return stateValue, 0, nil
@@ -619,7 +661,7 @@ func isValidMessageRole(role string) bool {
func isValidCommand(cmd string) bool {
switch strings.ToLower(cmd) {
case "from", "license", "template", "system", "adapter", "renderer", "parser", "parameter", "message", "requires":
case "from", "license", "template", "system", "adapter", "renderer", "parser", "parameter", "message", "requires", "skill", "agent_type", "agent", "mcp", "entrypoint":
return true
default:
return false
@@ -666,3 +708,79 @@ func expandPathImpl(path, relativeDir string, currentUserFunc func() (*user.User
func expandPath(path, relativeDir string) (string, error) {
return expandPathImpl(path, relativeDir, user.Current, user.Lookup)
}
// parseMCPArg parses MCP command arguments.
// Supports two formats:
//
// JSON: {"name": "web-search", "command": "uv", "args": ["run", "./script.py"]}
// Simple: web-search uv run ./script.py (name, command, args...)
func parseMCPArg(args string, relativeDir string) (api.MCPRef, error) {
args = strings.TrimSpace(args)
if args == "" {
return api.MCPRef{}, errors.New("MCP requires arguments")
}
// Try JSON format first
if strings.HasPrefix(args, "{") {
var ref api.MCPRef
if err := json.Unmarshal([]byte(args), &ref); err != nil {
return api.MCPRef{}, fmt.Errorf("invalid JSON: %w", err)
}
if ref.Name == "" {
return api.MCPRef{}, errors.New("MCP name is required")
}
if ref.Command == "" {
return api.MCPRef{}, errors.New("MCP command is required")
}
if ref.Type == "" {
ref.Type = "stdio"
}
// Expand relative paths in args
for i, arg := range ref.Args {
if isLocalPath(arg) {
expanded, err := expandPath(arg, relativeDir)
if err != nil {
return api.MCPRef{}, fmt.Errorf("expanding path %q: %w", arg, err)
}
ref.Args[i] = expanded
}
}
return ref, nil
}
// Simple format: name command args...
parts := strings.Fields(args)
if len(parts) < 2 {
return api.MCPRef{}, errors.New("MCP requires at least name and command")
}
ref := api.MCPRef{
Name: parts[0],
Command: parts[1],
Type: "stdio",
}
if len(parts) > 2 {
ref.Args = parts[2:]
}
// Expand relative paths in args
for i, arg := range ref.Args {
if isLocalPath(arg) {
expanded, err := expandPath(arg, relativeDir)
if err != nil {
return api.MCPRef{}, fmt.Errorf("expanding path %q: %w", arg, err)
}
ref.Args[i] = expanded
}
}
return ref, nil
}
// isLocalPath checks if a string looks like a local filesystem path.
func isLocalPath(s string) bool {
return strings.HasPrefix(s, "/") ||
strings.HasPrefix(s, "./") ||
strings.HasPrefix(s, "../") ||
strings.HasPrefix(s, "~")
}
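
As a sanity check on the two MCP spellings documented above, a hypothetical test in package parser (parseMCPArg is unexported, so it has to live there) could assert that both formats produce equivalent stdio references:

package parser

import "testing"

// Sketch only: the server name and relative directory are made up.
func TestParseMCPArgFormats(t *testing.T) {
    simple, err := parseMCPArg("web-search uv run ./script.py", "/tmp/agent")
    if err != nil {
        t.Fatal(err)
    }

    jsonArg := `{"name": "web-search", "command": "uv", "args": ["run", "./script.py"]}`
    fromJSON, err := parseMCPArg(jsonArg, "/tmp/agent")
    if err != nil {
        t.Fatal(err)
    }

    if simple.Name != fromJSON.Name || simple.Command != fromJSON.Command {
        t.Fatalf("formats disagree: %+v vs %+v", simple, fromJSON)
    }
    if simple.Type != "stdio" || fromJSON.Type != "stdio" {
        t.Fatalf("want default type stdio, got %q and %q", simple.Type, fromJSON.Type)
    }
    // ./script.py is a local path, so both variants run it through
    // expandPath against the Agentfile directory.
    if simple.Args[1] != fromJSON.Args[1] {
        t.Fatalf("expansion differs: %q vs %q", simple.Args[1], fromJSON.Args[1])
    }
}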


@@ -21,7 +21,6 @@ import (
"golang.org/x/text/encoding/unicode"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/convert"
"github.com/ollama/ollama/fs/ggml"
)
@@ -802,8 +801,7 @@ func createBinFile(t *testing.T, kv map[string]any, ti []*ggml.Tensor) (string,
}
defer f.Close()
var base convert.KV
base = map[string]any{"general.architecture": "test"}
base := map[string]any{"general.architecture": "test"}
maps.Copy(base, kv)
if err := ggml.WriteGGUF(f, base, ti); err != nil {


@@ -6,9 +6,6 @@ import (
var ErrInterrupt = errors.New("Interrupt")
// ErrExpandOutput is returned when user presses Ctrl+O to expand tool output
var ErrExpandOutput = errors.New("ExpandOutput")
type InterruptError struct {
Line []rune
}


@@ -206,9 +206,6 @@ func (i *Instance) Readline() (string, error) {
buf.DeleteBefore()
case CharCtrlL:
buf.ClearScreen()
case CharCtrlO:
// Ctrl+O - expand tool output
return "", ErrExpandOutput
case CharCtrlW:
buf.DeleteWord()
case CharCtrlZ:


@@ -18,7 +18,6 @@ const (
CharCtrlL = 12
CharEnter = 13
CharNext = 14
CharCtrlO = 15 // Ctrl+O - used for expanding tool output
CharPrev = 16
CharBckSearch = 18
CharFwdSearch = 19


@@ -37,55 +37,6 @@ if echo $PLATFORM | grep "amd64" > /dev/null; then
.
fi
# Deduplicate CUDA libraries across mlx_* and cuda_* directories
deduplicate_cuda_libs() {
local base_dir="$1"
echo "Deduplicating CUDA libraries in ${base_dir}..."
# Find all mlx_cuda_* directories
for mlx_dir in "${base_dir}"/lib/ollama/mlx_cuda_*; do
[ -d "${mlx_dir}" ] || continue
# Extract CUDA version (e.g., v12, v13)
cuda_version=$(basename "${mlx_dir}" | sed 's/mlx_cuda_//')
cuda_dir="${base_dir}/lib/ollama/cuda_${cuda_version}"
# Skip if corresponding cuda_* directory doesn't exist
[ -d "${cuda_dir}" ] || continue
echo " Checking ${mlx_dir} against ${cuda_dir}..."
# Find all .so* files in mlx directory
find "${mlx_dir}" -type f -name "*.so*" | while read mlx_file; do
filename=$(basename "${mlx_file}")
cuda_file="${cuda_dir}/${filename}"
# Skip if file doesn't exist in cuda directory
[ -f "${cuda_file}" ] || continue
# Compare checksums
mlx_sum=$(sha256sum "${mlx_file}" | awk '{print $1}')
cuda_sum=$(sha256sum "${cuda_file}" | awk '{print $1}')
if [ "${mlx_sum}" = "${cuda_sum}" ]; then
echo " Deduplicating ${filename}"
# Calculate relative path from mlx_dir to cuda_dir
rel_path="../cuda_${cuda_version}/${filename}"
rm -f "${mlx_file}"
ln -s "${rel_path}" "${mlx_file}"
fi
done
done
}
# Run deduplication for each platform output directory
if echo $PLATFORM | grep "," > /dev/null ; then
deduplicate_cuda_libs "./dist/linux_amd64"
deduplicate_cuda_libs "./dist/linux_arm64"
elif echo $PLATFORM | grep "amd64\|arm64" > /dev/null ; then
deduplicate_cuda_libs "./dist"
fi
# buildx behavior changes for single vs. multiplatform
echo "Compressing linux tar bundles..."
if echo $PLATFORM | grep "," > /dev/null ; then


@@ -26,7 +26,6 @@ import (
"github.com/ollama/ollama/convert"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
ofs "github.com/ollama/ollama/fs"
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/template"
"github.com/ollama/ollama/types/errtypes"
@@ -63,6 +62,10 @@ func (s *Server) CreateHandler(c *gin.Context) {
config.Renderer = r.Renderer
config.Parser = r.Parser
config.Requires = r.Requires
config.Skills = r.Skills
config.MCPs = r.MCPs
config.AgentType = r.AgentType
config.Entrypoint = r.Entrypoint
for v := range r.Files {
if !fs.ValidPath(v) {
@@ -122,7 +125,10 @@ func (s *Server) CreateHandler(c *gin.Context) {
ch <- gin.H{"error": err.Error()}
}
if err == nil && !remote && (config.Renderer == "" || config.Parser == "" || config.Requires == "") {
// Inherit config from base model (Renderer, Parser, Requires, Capabilities, etc.)
// This is especially important for cloud models which don't have GGUF files
// to detect capabilities from.
if err == nil && !remote {
manifest, mErr := ParseNamedManifest(fromName)
if mErr == nil && manifest.Config.Digest != "" {
configPath, pErr := GetBlobsPath(manifest.Config.Digest)
@@ -139,6 +145,29 @@ func (s *Server) CreateHandler(c *gin.Context) {
if config.Requires == "" {
config.Requires = baseConfig.Requires
}
// Inherit capabilities for cloud/remote models
// (local models detect capabilities from GGUF file)
if len(config.Capabilities) == 0 && len(baseConfig.Capabilities) > 0 {
config.Capabilities = baseConfig.Capabilities
}
// Inherit remote host/model if base is a cloud model
if config.RemoteHost == "" && baseConfig.RemoteHost != "" {
config.RemoteHost = baseConfig.RemoteHost
}
if config.RemoteModel == "" && baseConfig.RemoteModel != "" {
config.RemoteModel = baseConfig.RemoteModel
}
// Inherit model family for proper rendering
if config.ModelFamily == "" && baseConfig.ModelFamily != "" {
config.ModelFamily = baseConfig.ModelFamily
}
if len(config.ModelFamilies) == 0 && len(baseConfig.ModelFamilies) > 0 {
config.ModelFamilies = baseConfig.ModelFamilies
}
// Inherit context length for cloud models
if config.ContextLen == 0 && baseConfig.ContextLen > 0 {
config.ContextLen = baseConfig.ContextLen
}
}
cfgFile.Close()
}
@@ -158,6 +187,9 @@ func (s *Server) CreateHandler(c *gin.Context) {
ch <- gin.H{"error": err.Error()}
return
}
} else if r.Entrypoint != "" {
// Entrypoint-only agent: no base model needed
slog.Debug("create entrypoint-only agent", "entrypoint", r.Entrypoint)
} else {
ch <- gin.H{"error": errNeitherFromOrFiles.Error(), "status": http.StatusBadRequest}
return
@@ -455,7 +487,7 @@ func convertFromSafetensors(files map[string]string, baseLayers []*layerGGML, is
return layers, nil
}
func kvFromLayers(baseLayers []*layerGGML) (ofs.Config, error) {
func kvFromLayers(baseLayers []*layerGGML) (ggml.KV, error) {
for _, l := range baseLayers {
if l.GGML != nil {
return l.KV(), nil
@@ -544,6 +576,18 @@ func createModel(r api.CreateRequest, name model.Name, baseLayers []*layerGGML,
return err
}
// Handle skill layers for agents
layers, config.Skills, err = setSkillLayers(layers, config.Skills, fn)
if err != nil {
return err
}
// Handle MCP layers for agents
layers, config.MCPs, err = setMCPLayers(layers, config.MCPs, fn)
if err != nil {
return err
}
configLayer, err := createConfigLayer(layers, *config)
if err != nil {
return err
@@ -794,6 +838,135 @@ func setMessages(layers []Layer, m []api.Message) ([]Layer, error) {
return layers, nil
}
// setSkillLayers creates skill layers for local skill paths and updates the skill refs.
// Local paths are converted to bundled skill layers with digests.
// Registry references are kept as-is for later resolution during pull.
func setSkillLayers(layers []Layer, skills []model.SkillRef, fn func(resp api.ProgressResponse)) ([]Layer, []model.SkillRef, error) {
if len(skills) == 0 {
return layers, skills, nil
}
// Remove any existing skill layers
layers = removeLayer(layers, MediaTypeSkill)
var updatedSkills []model.SkillRef
for _, skill := range skills {
// Check if this is a local path
if IsLocalSkillPath(skill.Name) {
// Expand home directory if needed
skillPath := skill.Name
if strings.HasPrefix(skillPath, "~") {
home, err := os.UserHomeDir()
if err != nil {
return nil, nil, fmt.Errorf("expanding home directory: %w", err)
}
skillPath = filepath.Join(home, skillPath[1:])
}
// Make absolute
absPath, err := filepath.Abs(skillPath)
if err != nil {
return nil, nil, fmt.Errorf("resolving skill path %q: %w", skill.Name, err)
}
// Check if this is a direct skill directory or a parent containing skills
skillMdPath := filepath.Join(absPath, "SKILL.md")
if _, err := os.Stat(skillMdPath); err == nil {
// Direct skill directory
fn(api.ProgressResponse{Status: fmt.Sprintf("packaging skill: %s", filepath.Base(absPath))})
layer, err := CreateSkillLayer(absPath)
if err != nil {
return nil, nil, fmt.Errorf("creating skill layer for %q: %w", skill.Name, err)
}
layers = append(layers, layer)
updatedSkills = append(updatedSkills, model.SkillRef{
Name: filepath.Base(absPath),
Digest: layer.Digest,
})
} else {
// Parent directory - walk to find skill subdirectories
err := filepath.WalkDir(absPath, func(path string, entry fs.DirEntry, walkErr error) error {
if walkErr != nil {
return walkErr
}
if entry.IsDir() {
return nil
}
if entry.Name() != "SKILL.md" {
return nil
}
skillDir := filepath.Dir(path)
skillName := filepath.Base(skillDir)
fn(api.ProgressResponse{Status: fmt.Sprintf("packaging skill: %s", skillName)})
layer, err := CreateSkillLayer(skillDir)
if err != nil {
return fmt.Errorf("creating skill layer for %q: %w", skillDir, err)
}
layers = append(layers, layer)
updatedSkills = append(updatedSkills, model.SkillRef{
Name: skillName,
Digest: layer.Digest,
})
return nil
})
if err != nil {
return nil, nil, fmt.Errorf("walking skill directory %q: %w", skill.Name, err)
}
}
} else if skill.Digest != "" {
// Already has a digest (from a pulled agent), keep as-is
updatedSkills = append(updatedSkills, skill)
} else {
// Registry reference - keep as-is for later resolution
updatedSkills = append(updatedSkills, skill)
}
}
return layers, updatedSkills, nil
}
// setMCPLayers handles MCP server references.
// Currently, MCPs are stored as config data (command/args).
// Future: support bundling MCP server directories as layers.
func setMCPLayers(layers []Layer, mcps []model.MCPRef, fn func(resp api.ProgressResponse)) ([]Layer, []model.MCPRef, error) {
if len(mcps) == 0 {
return layers, mcps, nil
}
// Remove any existing MCP layers
layers = removeLayer(layers, MediaTypeMCP)
var updatedMCPs []model.MCPRef
for _, mcp := range mcps {
// Validate MCP has required fields
if mcp.Name == "" {
return nil, nil, fmt.Errorf("MCP server requires a name")
}
if mcp.Command == "" {
return nil, nil, fmt.Errorf("MCP server %q requires a command", mcp.Name)
}
// Set default type if not specified
if mcp.Type == "" {
mcp.Type = "stdio"
}
// For now, just keep MCPs as config data
// Future: detect local paths in args and bundle them
updatedMCPs = append(updatedMCPs, mcp)
fn(api.ProgressResponse{Status: fmt.Sprintf("configuring MCP: %s", mcp.Name)})
}
return layers, updatedMCPs, nil
}
func createConfigLayer(layers []Layer, config model.ConfigV2) (*Layer, error) {
digests := make([]string, len(layers))
for i, layer := range layers {
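
The inheritance hunk earlier in this file repeats the same "fill only if unset" step per field; a hypothetical condensed form using cmp.Or (the helper name and factoring are illustrative, not part of the change):

package server

import (
    "cmp"

    "github.com/ollama/ollama/types/model"
)

// inheritBaseConfig mirrors the inline logic above: every field keeps an
// explicit override and otherwise falls back to the base model's config,
// which is what lets agents built FROM cloud models keep capabilities,
// remote host/model, and context length.
func inheritBaseConfig(config *model.ConfigV2, base model.ConfigV2) {
    config.Renderer = cmp.Or(config.Renderer, base.Renderer)
    config.Parser = cmp.Or(config.Parser, base.Parser)
    config.Requires = cmp.Or(config.Requires, base.Requires)
    config.RemoteHost = cmp.Or(config.RemoteHost, base.RemoteHost)
    config.RemoteModel = cmp.Or(config.RemoteModel, base.RemoteModel)
    config.ModelFamily = cmp.Or(config.ModelFamily, base.ModelFamily)
    config.ContextLen = cmp.Or(config.ContextLen, base.ContextLen)
    if len(config.Capabilities) == 0 {
        config.Capabilities = base.Capabilities
    }
    if len(config.ModelFamilies) == 0 {
        config.ModelFamilies = base.ModelFamilies
    }
}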


@@ -232,6 +232,13 @@ func (m *Model) String() string {
})
}
if m.Config.Entrypoint != "" {
modelfile.Commands = append(modelfile.Commands, parser.Command{
Name: "entrypoint",
Args: m.Config.Entrypoint,
})
}
for k, v := range m.Options {
switch v := v.(type) {
case []any:
@@ -657,6 +664,16 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
}
}
// Extract skill layers to the skills cache
for _, layer := range manifest.Layers {
if layer.MediaType == MediaTypeSkill {
fn(api.ProgressResponse{Status: fmt.Sprintf("extracting skill %s", layer.Digest)})
if _, err := ExtractSkillBlob(layer.Digest); err != nil {
return fmt.Errorf("extracting skill layer %s: %w", layer.Digest, err)
}
}
}
fn(api.ProgressResponse{Status: "writing manifest"})
manifestJSON, err := json.Marshal(manifest)


@@ -129,11 +129,30 @@ func Manifests(continueOnError bool) (map[model.Name]*Manifest, error) {
return nil, err
}
// TODO(mxyng): use something less brittle
matches, err := filepath.Glob(filepath.Join(manifests, "*", "*", "*", "*"))
// Find both 4-part (models) and 5-part (skills/agents) manifest paths
matches4, err := filepath.Glob(filepath.Join(manifests, "*", "*", "*", "*"))
if err != nil {
return nil, err
}
matches5, err := filepath.Glob(filepath.Join(manifests, "*", "*", "*", "*", "*"))
if err != nil {
return nil, err
}
// Combine matches, filtering to only include files
var matches []string
for _, match := range matches4 {
fi, err := os.Stat(match)
if err == nil && !fi.IsDir() {
matches = append(matches, match)
}
}
for _, match := range matches5 {
fi, err := os.Stat(match)
if err == nil && !fi.IsDir() {
matches = append(matches, match)
}
}
ms := make(map[model.Name]*Manifest)
for _, match := range matches {

server/mcp.go (new file)

@@ -0,0 +1,315 @@
package server
import (
"archive/tar"
"compress/gzip"
"fmt"
"io"
"os"
"path/filepath"
"regexp"
"strings"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/types/model"
)
// MediaTypeMCP is the media type for MCP server layers in manifests.
const MediaTypeMCP = "application/vnd.ollama.image.mcp"
// GetMCPsPath returns the path to the extracted MCPs cache directory.
// If digest is empty, returns the mcps directory itself.
// If digest is provided, returns the path to the extracted MCP for that digest.
func GetMCPsPath(digest string) (string, error) {
// only accept actual sha256 digests
pattern := "^sha256[:-][0-9a-fA-F]{64}$"
re := regexp.MustCompile(pattern)
if digest != "" && !re.MatchString(digest) {
return "", ErrInvalidDigestFormat
}
digest = strings.ReplaceAll(digest, ":", "-")
path := filepath.Join(envconfig.Models(), "mcps", digest)
dirPath := filepath.Dir(path)
if digest == "" {
dirPath = path
}
if err := os.MkdirAll(dirPath, 0o755); err != nil {
return "", fmt.Errorf("%w: ensure path elements are traversable", err)
}
return path, nil
}
// ExtractMCPBlob extracts an MCP tar.gz blob to the mcps cache.
// The blob is expected to be at the blobs path for the given digest.
// Returns the path to the extracted MCP directory.
func ExtractMCPBlob(digest string) (string, error) {
// Get the blob path
blobPath, err := GetBlobsPath(digest)
if err != nil {
return "", fmt.Errorf("getting blob path: %w", err)
}
// Get the extraction path
mcpPath, err := GetMCPsPath(digest)
if err != nil {
return "", fmt.Errorf("getting mcp path: %w", err)
}
// Check if already extracted (look for any file)
entries, err := os.ReadDir(mcpPath)
if err == nil && len(entries) > 0 {
return mcpPath, nil
}
// Open the blob
f, err := os.Open(blobPath)
if err != nil {
return "", fmt.Errorf("opening blob: %w", err)
}
defer f.Close()
// Create gzip reader
gzr, err := gzip.NewReader(f)
if err != nil {
return "", fmt.Errorf("creating gzip reader: %w", err)
}
defer gzr.Close()
// Create tar reader
tr := tar.NewReader(gzr)
// Create the mcp directory
if err := os.MkdirAll(mcpPath, 0o755); err != nil {
return "", fmt.Errorf("creating mcp directory: %w", err)
}
// Extract files
for {
header, err := tr.Next()
if err == io.EOF {
break
}
if err != nil {
return "", fmt.Errorf("reading tar: %w", err)
}
// Clean the name and ensure it doesn't escape the target directory
name := filepath.Clean(header.Name)
if strings.HasPrefix(name, "..") {
return "", fmt.Errorf("invalid path in archive: %s", header.Name)
}
target := filepath.Join(mcpPath, name)
// Verify the target is within mcpPath
if !strings.HasPrefix(target, filepath.Clean(mcpPath)+string(os.PathSeparator)) && target != filepath.Clean(mcpPath) {
return "", fmt.Errorf("path escapes mcp directory: %s", header.Name)
}
switch header.Typeflag {
case tar.TypeDir:
if err := os.MkdirAll(target, 0o755); err != nil {
return "", fmt.Errorf("creating directory: %w", err)
}
case tar.TypeReg:
// Ensure parent directory exists
if err := os.MkdirAll(filepath.Dir(target), 0o755); err != nil {
return "", fmt.Errorf("creating parent directory: %w", err)
}
outFile, err := os.OpenFile(target, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, os.FileMode(header.Mode))
if err != nil {
return "", fmt.Errorf("creating file: %w", err)
}
if _, err := io.Copy(outFile, tr); err != nil {
outFile.Close()
return "", fmt.Errorf("writing file: %w", err)
}
outFile.Close()
}
}
return mcpPath, nil
}
// CreateMCPLayer creates an MCP layer from a local directory.
// The directory can optionally contain an mcp.json or package.json file.
// Returns the created layer.
func CreateMCPLayer(mcpDir string) (Layer, error) {
// Verify directory exists
info, err := os.Stat(mcpDir)
if err != nil {
return Layer{}, fmt.Errorf("mcp directory not found: %w", err)
}
if !info.IsDir() {
return Layer{}, fmt.Errorf("mcp path is not a directory: %s", mcpDir)
}
// Create a temporary file for the tar.gz
blobsPath, err := GetBlobsPath("")
if err != nil {
return Layer{}, fmt.Errorf("getting blobs path: %w", err)
}
tmpFile, err := os.CreateTemp(blobsPath, "mcp-*.tar.gz")
if err != nil {
return Layer{}, fmt.Errorf("creating temp file: %w", err)
}
tmpPath := tmpFile.Name()
defer func() {
tmpFile.Close()
os.Remove(tmpPath)
}()
// Create gzip writer
gzw := gzip.NewWriter(tmpFile)
defer gzw.Close()
// Create tar writer
tw := tar.NewWriter(gzw)
defer tw.Close()
// Walk the mcp directory and add files to tar
err = filepath.Walk(mcpDir, func(path string, info os.FileInfo, err error) error {
if err != nil {
return err
}
// Get relative path
relPath, err := filepath.Rel(mcpDir, path)
if err != nil {
return err
}
// Skip the root directory itself
if relPath == "." {
return nil
}
// Create tar header
header, err := tar.FileInfoHeader(info, "")
if err != nil {
return err
}
header.Name = relPath
if err := tw.WriteHeader(header); err != nil {
return err
}
// Write file contents if it's a regular file
if !info.IsDir() {
f, err := os.Open(path)
if err != nil {
return err
}
defer f.Close()
if _, err := io.Copy(tw, f); err != nil {
return err
}
}
return nil
})
if err != nil {
return Layer{}, fmt.Errorf("creating tar archive: %w", err)
}
// Close writers to flush
if err := tw.Close(); err != nil {
return Layer{}, fmt.Errorf("closing tar writer: %w", err)
}
if err := gzw.Close(); err != nil {
return Layer{}, fmt.Errorf("closing gzip writer: %w", err)
}
if err := tmpFile.Close(); err != nil {
return Layer{}, fmt.Errorf("closing temp file: %w", err)
}
// Open the temp file for reading
tmpFile, err = os.Open(tmpPath)
if err != nil {
return Layer{}, fmt.Errorf("reopening temp file: %w", err)
}
defer tmpFile.Close()
// Create the layer (this will compute the digest and move to blobs)
layer, err := NewLayer(tmpFile, MediaTypeMCP)
if err != nil {
return Layer{}, fmt.Errorf("creating layer: %w", err)
}
// Extract the mcp to the cache so it's ready to use
if _, err := ExtractMCPBlob(layer.Digest); err != nil {
return Layer{}, fmt.Errorf("extracting mcp: %w", err)
}
return layer, nil
}
// IsLocalMCPPath checks if an MCP reference looks like a local path.
// Local paths are explicitly prefixed with /, ./, ../, or ~.
func IsLocalMCPPath(name string) bool {
return strings.HasPrefix(name, "/") ||
strings.HasPrefix(name, "./") ||
strings.HasPrefix(name, "../") ||
strings.HasPrefix(name, "~")
}
// MCPNamespace is the namespace used for standalone MCPs in the registry.
const MCPNamespace = "mcp"
// IsMCPReference checks if a name refers to an MCP (has mcp/ prefix).
func IsMCPReference(name string) bool {
name = strings.ReplaceAll(name, string(os.PathSeparator), "/")
parts := strings.Split(name, "/")
// mcp/name or mcp/name:tag
if len(parts) >= 1 && parts[0] == MCPNamespace {
return true
}
// namespace/mcp/name (e.g., myuser/mcp/websearch)
if len(parts) >= 2 && parts[1] == MCPNamespace {
return true
}
return false
}
// ParseMCPName parses an MCP reference string into a model.Name.
// The Kind field is set to "mcp".
func ParseMCPName(name string) model.Name {
n := model.ParseName(name)
// If Kind wasn't set (old format without mcp/), set it
if n.Kind == "" {
n.Kind = MCPNamespace
}
return n
}
// GetMCPManifestPath returns the path to the MCP manifest file.
func GetMCPManifestPath(n model.Name) (string, error) {
if n.Model == "" {
return "", fmt.Errorf("mcp name is required")
}
// Ensure Kind is set
if n.Kind == "" {
n.Kind = MCPNamespace
}
path := filepath.Join(
envconfig.Models(),
"manifests",
n.Filepath(),
)
return path, nil
}
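
A quick usage sketch for the reference helpers above (import path assumed from the repository layout):

package main

import (
    "fmt"

    "github.com/ollama/ollama/server"
)

func main() {
    for _, name := range []string{
        "mcp/web-search",            // kind-first reference
        "myuser/mcp/web-search:1.0", // namespace/kind/name reference
        "./servers/web-search",      // explicit local path, never a registry ref
    } {
        fmt.Printf("%s mcp=%v local=%v\n",
            name, server.IsMCPReference(name), server.IsLocalMCPPath(name))
    }
    // prints mcp=true local=false, mcp=true local=false, mcp=false local=true
}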


@@ -18,6 +18,7 @@ type ModelPath struct {
ProtocolScheme string
Registry string
Namespace string
Kind string // Optional: "skill", "agent", "mcp", or empty for models
Repository string
Tag string
}
@@ -42,6 +43,7 @@ func ParseModelPath(name string) ModelPath {
ProtocolScheme: DefaultProtocolScheme,
Registry: DefaultRegistry,
Namespace: DefaultNamespace,
Kind: "",
Repository: "",
Tag: DefaultTag,
}
@@ -55,13 +57,41 @@ func ParseModelPath(name string) ModelPath {
name = strings.ReplaceAll(name, string(os.PathSeparator), "/")
parts := strings.Split(name, "/")
switch len(parts) {
case 3:
case 4:
// host/namespace/kind/model; falls back to treating parts[3] as the model when parts[2] is not a valid kind
mp.Registry = parts[0]
mp.Namespace = parts[1]
mp.Repository = parts[2]
if model.ValidKinds[parts[2]] {
mp.Kind = parts[2]
mp.Repository = parts[3]
} else {
// Not a valid kind, treat as old format with extra part
mp.Repository = parts[3]
}
case 3:
// Could be: host/namespace/model OR namespace/kind/model
if model.ValidKinds[parts[1]] {
// namespace/kind/model
mp.Namespace = parts[0]
mp.Kind = parts[1]
mp.Repository = parts[2]
} else {
// host/namespace/model
mp.Registry = parts[0]
mp.Namespace = parts[1]
mp.Repository = parts[2]
}
case 2:
mp.Namespace = parts[0]
mp.Repository = parts[1]
// Could be: namespace/model OR kind/model
if model.ValidKinds[parts[0]] {
// kind/model (library skill)
mp.Kind = parts[0]
mp.Repository = parts[1]
} else {
// namespace/model
mp.Namespace = parts[0]
mp.Repository = parts[1]
}
case 1:
mp.Repository = parts[0]
}
@@ -75,20 +105,35 @@ func ParseModelPath(name string) ModelPath {
}
func (mp ModelPath) GetNamespaceRepository() string {
if mp.Kind != "" {
return fmt.Sprintf("%s/%s/%s", mp.Namespace, mp.Kind, mp.Repository)
}
return fmt.Sprintf("%s/%s", mp.Namespace, mp.Repository)
}
func (mp ModelPath) GetFullTagname() string {
if mp.Kind != "" {
return fmt.Sprintf("%s/%s/%s/%s:%s", mp.Registry, mp.Namespace, mp.Kind, mp.Repository, mp.Tag)
}
return fmt.Sprintf("%s/%s/%s:%s", mp.Registry, mp.Namespace, mp.Repository, mp.Tag)
}
func (mp ModelPath) GetShortTagname() string {
if mp.Registry == DefaultRegistry {
if mp.Namespace == DefaultNamespace {
if mp.Kind != "" {
return fmt.Sprintf("%s/%s:%s", mp.Kind, mp.Repository, mp.Tag)
}
return fmt.Sprintf("%s:%s", mp.Repository, mp.Tag)
}
if mp.Kind != "" {
return fmt.Sprintf("%s/%s/%s:%s", mp.Namespace, mp.Kind, mp.Repository, mp.Tag)
}
return fmt.Sprintf("%s/%s:%s", mp.Namespace, mp.Repository, mp.Tag)
}
if mp.Kind != "" {
return fmt.Sprintf("%s/%s/%s/%s:%s", mp.Registry, mp.Namespace, mp.Kind, mp.Repository, mp.Tag)
}
return fmt.Sprintf("%s/%s/%s:%s", mp.Registry, mp.Namespace, mp.Repository, mp.Tag)
}
@@ -97,6 +142,7 @@ func (mp ModelPath) GetManifestPath() (string, error) {
name := model.Name{
Host: mp.Registry,
Namespace: mp.Namespace,
Kind: mp.Kind,
Model: mp.Repository,
Tag: mp.Tag,
}
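
Worked examples of what the kind-aware formatting above yields, assuming the stock defaults (registry.ollama.ai, library namespace, latest tag):

package main

import (
    "fmt"

    "github.com/ollama/ollama/server"
)

func main() {
    mp := server.ParseModelPath("skill/calculator")
    fmt.Println(mp.GetShortTagname()) // skill/calculator:latest

    mp = server.ParseModelPath("myuser/skill/calculator")
    fmt.Println(mp.GetShortTagname()) // myuser/skill/calculator:latest

    // Plain model names are untouched by the new Kind field.
    mp = server.ParseModelPath("llama3:8b")
    fmt.Println(mp.GetShortTagname()) // llama3:8b
}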


@@ -978,6 +978,9 @@ func getExistingName(n model.Name) (model.Name, error) {
if set.Namespace == "" && strings.EqualFold(e.Namespace, n.Namespace) {
n.Namespace = e.Namespace
}
if set.Kind == "" && strings.EqualFold(e.Kind, n.Kind) {
n.Kind = e.Kind
}
if set.Model == "" && strings.EqualFold(e.Model, n.Model) {
n.Model = e.Model
}
@@ -1116,6 +1119,10 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
Capabilities: m.Capabilities(),
ModifiedAt: manifest.fi.ModTime(),
Requires: m.Config.Requires,
Skills: m.Config.Skills,
MCPs: m.Config.MCPs,
AgentType: m.Config.AgentType,
Entrypoint: m.Config.Entrypoint,
}
if m.Config.RemoteHost != "" {
@@ -1170,11 +1177,16 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
fmt.Fprint(&sb, m.String())
resp.Modelfile = sb.String()
// skip loading tensor information if this is a remote model
// skip loading tensor information if this is a remote model or a skill
if m.Config.RemoteHost != "" && m.Config.RemoteModel != "" {
return resp, nil
}
// Skills don't have model weights, skip tensor loading
if m.ModelPath == "" {
return resp, nil
}
kvData, tensors, err := getModelData(m.ModelPath, req.Verbose)
if err != nil {
return nil, err


@@ -22,7 +22,6 @@ import (
gocmpopts "github.com/google/go-cmp/cmp/cmpopts"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/convert"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/types/model"
@@ -42,8 +41,7 @@ func createBinFile(t *testing.T, kv map[string]any, ti []*ggml.Tensor) (string,
}
defer f.Close()
var base convert.KV
base = map[string]any{"general.architecture": "test"}
base := map[string]any{"general.architecture": "test"}
maps.Copy(base, kv)
if err := ggml.WriteGGUF(f, base, ti); err != nil {

server/skill.go (new file)

@@ -0,0 +1,326 @@
package server
import (
"archive/tar"
"compress/gzip"
"fmt"
"io"
"os"
"path/filepath"
"regexp"
"strings"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/types/model"
)
// MediaTypeSkill is the media type for skill layers in manifests.
const MediaTypeSkill = "application/vnd.ollama.image.skill"
// GetSkillsPath returns the path to the extracted skills cache directory.
// If digest is empty, returns the skills directory itself.
// If digest is provided, returns the path to the extracted skill for that digest.
func GetSkillsPath(digest string) (string, error) {
// only accept actual sha256 digests
pattern := "^sha256[:-][0-9a-fA-F]{64}$"
re := regexp.MustCompile(pattern)
if digest != "" && !re.MatchString(digest) {
return "", ErrInvalidDigestFormat
}
digest = strings.ReplaceAll(digest, ":", "-")
path := filepath.Join(envconfig.Models(), "skills", digest)
dirPath := filepath.Dir(path)
if digest == "" {
dirPath = path
}
if err := os.MkdirAll(dirPath, 0o755); err != nil {
return "", fmt.Errorf("%w: ensure path elements are traversable", err)
}
return path, nil
}
// ExtractSkillBlob extracts a skill tar.gz blob to the skills cache.
// The blob is expected to be at the blobs path for the given digest.
// Returns the path to the extracted skill directory.
func ExtractSkillBlob(digest string) (string, error) {
// Get the blob path
blobPath, err := GetBlobsPath(digest)
if err != nil {
return "", fmt.Errorf("getting blob path: %w", err)
}
// Get the extraction path
skillPath, err := GetSkillsPath(digest)
if err != nil {
return "", fmt.Errorf("getting skill path: %w", err)
}
// Check if already extracted
if _, err := os.Stat(filepath.Join(skillPath, "SKILL.md")); err == nil {
return skillPath, nil
}
// Open the blob
f, err := os.Open(blobPath)
if err != nil {
return "", fmt.Errorf("opening blob: %w", err)
}
defer f.Close()
// Create gzip reader
gzr, err := gzip.NewReader(f)
if err != nil {
return "", fmt.Errorf("creating gzip reader: %w", err)
}
defer gzr.Close()
// Create tar reader
tr := tar.NewReader(gzr)
// Create the skill directory
if err := os.MkdirAll(skillPath, 0o755); err != nil {
return "", fmt.Errorf("creating skill directory: %w", err)
}
// Extract files
for {
header, err := tr.Next()
if err == io.EOF {
break
}
if err != nil {
return "", fmt.Errorf("reading tar: %w", err)
}
// Clean the name and ensure it doesn't escape the target directory
name := filepath.Clean(header.Name)
if strings.HasPrefix(name, "..") {
return "", fmt.Errorf("invalid path in archive: %s", header.Name)
}
target := filepath.Join(skillPath, name)
// Verify the target is within skillPath
if !strings.HasPrefix(target, filepath.Clean(skillPath)+string(os.PathSeparator)) && target != filepath.Clean(skillPath) {
return "", fmt.Errorf("path escapes skill directory: %s", header.Name)
}
switch header.Typeflag {
case tar.TypeDir:
if err := os.MkdirAll(target, 0o755); err != nil {
return "", fmt.Errorf("creating directory: %w", err)
}
case tar.TypeReg:
// Ensure parent directory exists
if err := os.MkdirAll(filepath.Dir(target), 0o755); err != nil {
return "", fmt.Errorf("creating parent directory: %w", err)
}
outFile, err := os.OpenFile(target, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, os.FileMode(header.Mode))
if err != nil {
return "", fmt.Errorf("creating file: %w", err)
}
if _, err := io.Copy(outFile, tr); err != nil {
outFile.Close()
return "", fmt.Errorf("writing file: %w", err)
}
outFile.Close()
}
}
return skillPath, nil
}
// CreateSkillLayer creates a skill layer from a local directory.
// The directory must contain a SKILL.md file.
// Returns the created layer.
func CreateSkillLayer(skillDir string) (Layer, error) {
// Verify SKILL.md exists
skillMdPath := filepath.Join(skillDir, "SKILL.md")
if _, err := os.Stat(skillMdPath); err != nil {
return Layer{}, fmt.Errorf("skill directory must contain SKILL.md: %w", err)
}
// Create a temporary file for the tar.gz
blobsPath, err := GetBlobsPath("")
if err != nil {
return Layer{}, fmt.Errorf("getting blobs path: %w", err)
}
tmpFile, err := os.CreateTemp(blobsPath, "skill-*.tar.gz")
if err != nil {
return Layer{}, fmt.Errorf("creating temp file: %w", err)
}
tmpPath := tmpFile.Name()
defer func() {
tmpFile.Close()
os.Remove(tmpPath)
}()
// Create gzip writer
gzw := gzip.NewWriter(tmpFile)
defer gzw.Close()
// Create tar writer
tw := tar.NewWriter(gzw)
defer tw.Close()
// Walk the skill directory and add files to tar
err = filepath.Walk(skillDir, func(path string, info os.FileInfo, err error) error {
if err != nil {
return err
}
// Get relative path
relPath, err := filepath.Rel(skillDir, path)
if err != nil {
return err
}
// Skip the root directory itself
if relPath == "." {
return nil
}
// Create tar header
header, err := tar.FileInfoHeader(info, "")
if err != nil {
return err
}
header.Name = relPath
if err := tw.WriteHeader(header); err != nil {
return err
}
// Write file contents if it's a regular file
if !info.IsDir() {
f, err := os.Open(path)
if err != nil {
return err
}
defer f.Close()
if _, err := io.Copy(tw, f); err != nil {
return err
}
}
return nil
})
if err != nil {
return Layer{}, fmt.Errorf("creating tar archive: %w", err)
}
// Close writers to flush
if err := tw.Close(); err != nil {
return Layer{}, fmt.Errorf("closing tar writer: %w", err)
}
if err := gzw.Close(); err != nil {
return Layer{}, fmt.Errorf("closing gzip writer: %w", err)
}
if err := tmpFile.Close(); err != nil {
return Layer{}, fmt.Errorf("closing temp file: %w", err)
}
// Open the temp file for reading
tmpFile, err = os.Open(tmpPath)
if err != nil {
return Layer{}, fmt.Errorf("reopening temp file: %w", err)
}
defer tmpFile.Close()
// Create the layer (this will compute the digest and move to blobs)
layer, err := NewLayer(tmpFile, MediaTypeSkill)
if err != nil {
return Layer{}, fmt.Errorf("creating layer: %w", err)
}
// Extract the skill to the cache so it's ready to use
if _, err := ExtractSkillBlob(layer.Digest); err != nil {
return Layer{}, fmt.Errorf("extracting skill: %w", err)
}
return layer, nil
}
// IsLocalSkillPath checks if a skill reference looks like a local path.
// Local paths are explicitly prefixed with /, ./, ../, or ~.
// Registry references like "skill/calculator:1.0.0" should NOT be treated as local paths.
func IsLocalSkillPath(name string) bool {
// Local paths are explicitly indicated by path prefixes
return strings.HasPrefix(name, "/") ||
strings.HasPrefix(name, "./") ||
strings.HasPrefix(name, "../") ||
strings.HasPrefix(name, "~")
}
// SkillNamespace is the namespace used for standalone skills in the registry.
const SkillNamespace = "skill"
// IsSkillReference checks if a name refers to a skill (has skill/ prefix).
func IsSkillReference(name string) bool {
// Check for skill/ prefix (handles both "skill/foo" and "registry/skill/foo")
name = strings.ReplaceAll(name, string(os.PathSeparator), "/")
parts := strings.Split(name, "/")
// skill/name or skill/name:tag
if len(parts) >= 1 && parts[0] == SkillNamespace {
return true
}
// namespace/skill/name (e.g., myuser/skill/calc)
// registry/skill/name (e.g., registry.ollama.ai/skill/calc)
if len(parts) >= 2 && parts[1] == SkillNamespace {
return true
}
return false
}
// ParseSkillName parses a skill reference string into a model.Name.
// The Kind field is set to "skill".
// Examples:
// - "calculator" -> library/skill/calculator:latest
// - "myname/calculator" -> myname/skill/calculator:latest
// - "myname/skill/calculator:1.0.0" -> myname/skill/calculator:1.0.0
func ParseSkillName(name string) model.Name {
// Use the standard parser which now handles Kind
n := model.ParseName(name)
// If Kind wasn't set (old format without skill/), set it
if n.Kind == "" {
n.Kind = SkillNamespace
}
return n
}
// SkillDisplayName returns a user-friendly display name for a skill.
func SkillDisplayName(n model.Name) string {
return n.DisplayShortest()
}
// GetSkillManifestPath returns the path to the skill manifest file.
// Uses the 5-part structure: host/namespace/kind/model/tag
func GetSkillManifestPath(n model.Name) (string, error) {
if n.Model == "" {
return "", fmt.Errorf("skill name is required")
}
// Ensure Kind is set
if n.Kind == "" {
n.Kind = SkillNamespace
}
path := filepath.Join(
envconfig.Models(),
"manifests",
n.Filepath(),
)
return path, nil
}
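
To make the examples in the ParseSkillName comment concrete, a small sketch (stock defaults assumed):

package main

import (
    "fmt"

    "github.com/ollama/ollama/server"
)

func main() {
    // Bare names pick up the default host/namespace plus the skill kind.
    n := server.ParseSkillName("calculator")
    fmt.Println(n.String()) // registry.ollama.ai/library/skill/calculator:latest

    // Fully spelled references pass through unchanged.
    n = server.ParseSkillName("myname/skill/calculator:1.0.0")
    fmt.Println(n.DisplayShortest()) // myname/skill/calculator:1.0.0
}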


@@ -381,28 +381,6 @@ func (t templateTools) String() string {
return string(bts)
}
// templateArgs is a map type with JSON string output for templates.
type templateArgs map[string]any
func (t templateArgs) String() string {
if t == nil {
return "{}"
}
bts, _ := json.Marshal(t)
return string(bts)
}
// templateProperties is a map type with JSON string output for templates.
type templateProperties map[string]api.ToolProperty
func (t templateProperties) String() string {
if t == nil {
return "{}"
}
bts, _ := json.Marshal(t)
return string(bts)
}
// templateTool is a template-compatible representation of api.Tool
// with Properties as a regular map for template ranging.
type templateTool struct {
@@ -418,11 +396,11 @@ type templateToolFunction struct {
}
type templateToolFunctionParameters struct {
Type string `json:"type"`
Defs any `json:"$defs,omitempty"`
Items any `json:"items,omitempty"`
Required []string `json:"required,omitempty"`
Properties templateProperties `json:"properties"`
Type string `json:"type"`
Defs any `json:"$defs,omitempty"`
Items any `json:"items,omitempty"`
Required []string `json:"required,omitempty"`
Properties map[string]api.ToolProperty `json:"properties"`
}
// templateToolCall is a template-compatible representation of api.ToolCall
@@ -435,7 +413,7 @@ type templateToolCall struct {
type templateToolCallFunction struct {
Index int
Name string
Arguments templateArgs
Arguments map[string]any
}
// templateMessage is a template-compatible representation of api.Message
@@ -468,7 +446,7 @@ func convertToolsForTemplate(tools api.Tools) templateTools {
Defs: tool.Function.Parameters.Defs,
Items: tool.Function.Parameters.Items,
Required: tool.Function.Parameters.Required,
Properties: templateProperties(tool.Function.Parameters.Properties.ToMap()),
Properties: tool.Function.Parameters.Properties.ToMap(),
},
},
}
@@ -490,7 +468,7 @@ func convertMessagesForTemplate(messages []*api.Message) []*templateMessage {
Function: templateToolCallFunction{
Index: tc.Function.Index,
Name: tc.Function.Name,
Arguments: templateArgs(tc.Function.Arguments.ToMap()),
Arguments: tc.Function.Arguments.ToMap(),
},
})
}
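
Dropping the wrapper types leans on text/template's native map support: ranging over a plain map (which the removed tests exercised through the wrappers) works directly, as in this stdlib-only sketch:

package main

import (
    "os"
    "text/template"
)

func main() {
    // text/template sorts map keys while ranging, so plain
    // map[string]any arguments behave predictably.
    tmpl := template.Must(template.New("args").Parse(
        `{{ range $k, $v := .Arguments }}{{ $k }}={{ $v }};{{ end }}`))
    _ = tmpl.Execute(os.Stdout, map[string]any{
        "Arguments": map[string]any{"city": "Tokyo", "unit": "celsius"},
    })
    // prints: city=Tokyo;unit=celsius;
}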


@@ -613,159 +613,3 @@ func TestCollate(t *testing.T) {
})
}
}
func TestTemplateArgumentsJSON(t *testing.T) {
// Test that {{ .Function.Arguments }} outputs valid JSON, not map[key:value]
tmpl := `{{- range .Messages }}{{- range .ToolCalls }}{{ .Function.Arguments }}{{- end }}{{- end }}`
template, err := Parse(tmpl)
if err != nil {
t.Fatal(err)
}
args := api.NewToolCallFunctionArguments()
args.Set("location", "Tokyo")
args.Set("unit", "celsius")
var buf bytes.Buffer
err = template.Execute(&buf, Values{
Messages: []api.Message{{
Role: "assistant",
ToolCalls: []api.ToolCall{{
Function: api.ToolCallFunction{
Name: "get_weather",
Arguments: args,
},
}},
}},
})
if err != nil {
t.Fatal(err)
}
got := buf.String()
// Should be valid JSON, not "map[location:Tokyo unit:celsius]"
if strings.HasPrefix(got, "map[") {
t.Errorf("Arguments output as Go map format: %s", got)
}
var parsed map[string]any
if err := json.Unmarshal([]byte(got), &parsed); err != nil {
t.Errorf("Arguments not valid JSON: %s, error: %v", got, err)
}
}
func TestTemplatePropertiesJSON(t *testing.T) {
// Test that {{ .Function.Parameters.Properties }} outputs valid JSON
// Note: template must reference .Messages to trigger the modern code path that converts Tools
tmpl := `{{- range .Messages }}{{- end }}{{- range .Tools }}{{ .Function.Parameters.Properties }}{{- end }}`
template, err := Parse(tmpl)
if err != nil {
t.Fatal(err)
}
props := api.NewToolPropertiesMap()
props.Set("location", api.ToolProperty{Type: api.PropertyType{"string"}, Description: "City name"})
var buf bytes.Buffer
err = template.Execute(&buf, Values{
Messages: []api.Message{{Role: "user", Content: "test"}},
Tools: api.Tools{{
Type: "function",
Function: api.ToolFunction{
Name: "get_weather",
Description: "Get weather",
Parameters: api.ToolFunctionParameters{
Type: "object",
Properties: props,
},
},
}},
})
if err != nil {
t.Fatal(err)
}
got := buf.String()
// Should be valid JSON, not "map[location:{...}]"
if strings.HasPrefix(got, "map[") {
t.Errorf("Properties output as Go map format: %s", got)
}
var parsed map[string]any
if err := json.Unmarshal([]byte(got), &parsed); err != nil {
t.Errorf("Properties not valid JSON: %s, error: %v", got, err)
}
}
func TestTemplateArgumentsRange(t *testing.T) {
// Test that we can range over Arguments in templates
tmpl := `{{- range .Messages }}{{- range .ToolCalls }}{{- range $k, $v := .Function.Arguments }}{{ $k }}={{ $v }};{{- end }}{{- end }}{{- end }}`
template, err := Parse(tmpl)
if err != nil {
t.Fatal(err)
}
args := api.NewToolCallFunctionArguments()
args.Set("city", "Tokyo")
var buf bytes.Buffer
err = template.Execute(&buf, Values{
Messages: []api.Message{{
Role: "assistant",
ToolCalls: []api.ToolCall{{
Function: api.ToolCallFunction{
Name: "get_weather",
Arguments: args,
},
}},
}},
})
if err != nil {
t.Fatal(err)
}
got := buf.String()
if got != "city=Tokyo;" {
t.Errorf("Range over Arguments failed, got: %s, want: city=Tokyo;", got)
}
}
func TestTemplatePropertiesRange(t *testing.T) {
// Test that we can range over Properties in templates
// Note: template must reference .Messages to trigger the modern code path that converts Tools
tmpl := `{{- range .Messages }}{{- end }}{{- range .Tools }}{{- range $name, $prop := .Function.Parameters.Properties }}{{ $name }}:{{ $prop.Type }};{{- end }}{{- end }}`
template, err := Parse(tmpl)
if err != nil {
t.Fatal(err)
}
props := api.NewToolPropertiesMap()
props.Set("location", api.ToolProperty{Type: api.PropertyType{"string"}})
var buf bytes.Buffer
err = template.Execute(&buf, Values{
Messages: []api.Message{{Role: "user", Content: "test"}},
Tools: api.Tools{{
Type: "function",
Function: api.ToolFunction{
Name: "get_weather",
Parameters: api.ToolFunctionParameters{
Type: "object",
Properties: props,
},
},
}},
})
if err != nil {
t.Fatal(err)
}
got := buf.String()
if got != "location:string;" {
t.Errorf("Range over Properties failed, got: %s, want: location:string;", got)
}
}


@@ -1,5 +1,29 @@
package model
// SkillRef represents a reference to a skill, either by local path or by registry digest.
type SkillRef struct {
// Name is the local path (for development) or registry name (e.g., "skill/calculator:1.0.0")
Name string `json:"name,omitempty"`
// Digest is the content-addressable digest of the skill blob (e.g., "sha256:abc123...")
Digest string `json:"digest,omitempty"`
}
// MCPRef represents a reference to an MCP (Model Context Protocol) server.
type MCPRef struct {
// Name is the identifier for the MCP server (used for tool namespacing)
Name string `json:"name,omitempty"`
// Digest is the content-addressable digest of the bundled MCP server blob
Digest string `json:"digest,omitempty"`
// Command is the executable to run (e.g., "uv", "node", "python3")
Command string `json:"command,omitempty"`
// Args are the arguments to pass to the command
Args []string `json:"args,omitempty"`
// Env is optional environment variables for the MCP server
Env map[string]string `json:"env,omitempty"`
// Type is the transport type (currently only "stdio" is supported)
Type string `json:"type,omitempty"`
}
// ConfigV2 represents the configuration metadata for a model.
type ConfigV2 struct {
ModelFormat string `json:"model_format"`
@@ -20,6 +44,12 @@ type ConfigV2 struct {
EmbedLen int `json:"embedding_length,omitempty"`
BaseName string `json:"base_name,omitempty"`
// agent-specific fields
Skills []SkillRef `json:"skills,omitempty"`
MCPs []MCPRef `json:"mcps,omitempty"`
AgentType string `json:"agent_type,omitempty"`
Entrypoint string `json:"entrypoint,omitempty"`
// required by spec
Architecture string `json:"architecture"`
OS string `json:"os"`
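
For a feel of what the agent fields add to a config blob, a hypothetical example with one skill and one MCP server (digest elided; the omitempty tags keep all of this out of plain model configs):

package main

import (
    "encoding/json"
    "fmt"

    "github.com/ollama/ollama/types/model"
)

func main() {
    cfg := model.ConfigV2{
        Skills: []model.SkillRef{
            {Name: "calculator", Digest: "sha256:..."}, // placeholder digest
        },
        MCPs: []model.MCPRef{{
            Name:    "web-search",
            Command: "uv",
            Args:    []string{"run", "./script.py"},
            Type:    "stdio",
        }},
        AgentType: "conversational",
    }
    out, _ := json.MarshalIndent(cfg, "", "  ")
    fmt.Println(string(out))
}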


@@ -59,6 +59,7 @@ type partKind int
const (
kindHost partKind = iota
kindNamespace
kindKind
kindModel
kindTag
kindDigest
@@ -70,6 +71,8 @@ func (k partKind) String() string {
return "host"
case kindNamespace:
return "namespace"
case kindKind:
return "kind"
case kindModel:
return "model"
case kindTag:
@@ -89,6 +92,7 @@ func (k partKind) String() string {
type Name struct {
Host string
Namespace string
Kind string // Optional: "skill", "agent", "mcp", or empty for models
Model string
Tag string
}
@@ -97,34 +101,27 @@ type Name struct {
// format of a valid name string is:
//
// s:
// { host } "/" { namespace } "/" { model } ":" { tag } "@" { digest }
// { host } "/" { namespace } "/" { kind } "/" { model } ":" { tag }
// { host } "/" { namespace } "/" { model } ":" { tag }
// { host } "/" { namespace } "/" { model } "@" { digest }
// { host } "/" { namespace } "/" { model }
// { namespace } "/" { model } ":" { tag } "@" { digest }
// { namespace } "/" { kind } "/" { model } ":" { tag }
// { namespace } "/" { model } ":" { tag }
// { namespace } "/" { model } "@" { digest }
// { namespace } "/" { model }
// { model } ":" { tag } "@" { digest }
// { model } ":" { tag }
// { model } "@" { digest }
// { model }
// "@" { digest }
// host:
// pattern: { alphanum | "_" } { alphanum | "-" | "_" | "." | ":" }*
// length: [1, 350]
// namespace:
// pattern: { alphanum | "_" } { alphanum | "-" | "_" }*
// length: [1, 80]
// kind:
// pattern: "skill" | "agent" | "mcp" | "" (empty for models)
// length: [0, 80]
// model:
// pattern: { alphanum | "_" } { alphanum | "-" | "_" | "." }*
// length: [1, 80]
// tag:
// pattern: { alphanum | "_" } { alphanum | "-" | "_" | "." }*
// length: [1, 80]
// digest:
// pattern: { alphanum | "_" } { alphanum | "-" | ":" }*
// length: [1, 80]
//
// Most users should use [ParseName] instead, unless they need to support
// different defaults than DefaultName.
@@ -136,6 +133,13 @@ func ParseName(s string) Name {
return Merge(ParseNameBare(s), DefaultName())
}
// ValidKinds are the allowed values for the Kind field
var ValidKinds = map[string]bool{
"skill": true,
"agent": true,
"mcp": true,
}
// ParseNameBare parses s as a name string and returns a Name. No merge with
// [DefaultName] is performed.
func ParseNameBare(s string) Name {
@@ -153,6 +157,30 @@ func ParseNameBare(s string) Name {
return n
}
s, n.Kind, promised = cutPromised(s, "/")
if !promised {
// Only 2 parts: namespace/model - what we parsed as Kind is actually Namespace
n.Namespace = n.Kind
n.Kind = ""
return n
}
// Check if what we parsed as Kind is actually a valid kind value
if !ValidKinds[n.Kind] {
// Not a valid kind - this is the old 3-part format: host/namespace/model
// Shift: Kind -> Namespace, s -> Host
n.Namespace = n.Kind
n.Kind = ""
scheme, host, ok := strings.Cut(s, "://")
if !ok {
host = scheme
}
n.Host = host
return n
}
// Valid kind found - continue parsing for namespace and optional host
s, n.Namespace, promised = cutPromised(s, "/")
if !promised {
n.Namespace = s
@@ -168,20 +196,32 @@ func ParseNameBare(s string) Name {
return n
}
// ParseNameFromFilepath parses a 4-part filepath as a Name. The parts are
// ParseNameFromFilepath parses a 4 or 5-part filepath as a Name. The parts are
// expected to be in the form:
//
// { host } "/" { namespace } "/" { model } "/" { tag }
// { host } "/" { namespace } "/" { kind } "/" { model } "/" { tag }
func ParseNameFromFilepath(s string) (n Name) {
parts := strings.Split(s, string(filepath.Separator))
if len(parts) != 4 {
switch len(parts) {
case 4:
// Old format: host/namespace/model/tag
n.Host = parts[0]
n.Namespace = parts[1]
n.Model = parts[2]
n.Tag = parts[3]
case 5:
// New format: host/namespace/kind/model/tag
n.Host = parts[0]
n.Namespace = parts[1]
n.Kind = parts[2]
n.Model = parts[3]
n.Tag = parts[4]
default:
return Name{}
}
n.Host = parts[0]
n.Namespace = parts[1]
n.Model = parts[2]
n.Tag = parts[3]
if !n.IsFullyQualified() {
return Name{}
}
@@ -189,11 +229,12 @@ func ParseNameFromFilepath(s string) (n Name) {
return n
}
// Merge merges the host, namespace, and tag parts of the two names,
// Merge merges the host, namespace, kind, and tag parts of the two names,
// preferring the non-empty parts of a.
func Merge(a, b Name) Name {
a.Host = cmp.Or(a.Host, b.Host)
a.Namespace = cmp.Or(a.Namespace, b.Namespace)
a.Kind = cmp.Or(a.Kind, b.Kind)
a.Tag = cmp.Or(a.Tag, b.Tag)
return a
}
@@ -211,6 +252,10 @@ func (n Name) String() string {
b.WriteString(n.Namespace)
b.WriteByte('/')
}
if n.Kind != "" {
b.WriteString(n.Kind)
b.WriteByte('/')
}
b.WriteString(n.Model)
if n.Tag != "" {
b.WriteByte(':')
@@ -233,6 +278,12 @@ func (n Name) DisplayShortest() string {
sb.WriteByte('/')
}
// include kind if present
if n.Kind != "" {
sb.WriteString(n.Kind)
sb.WriteByte('/')
}
// always include model and tag
sb.WriteString(n.Model)
sb.WriteString(":")
@@ -256,18 +307,23 @@ func (n Name) IsValid() bool {
}
// IsFullyQualified returns true if all parts of the name are present and
// valid without the digest.
// valid without the digest. Kind is optional and only validated if non-empty.
func (n Name) IsFullyQualified() bool {
parts := []string{
n.Host,
n.Namespace,
n.Model,
n.Tag,
if !isValidPart(kindHost, n.Host) {
return false
}
for i, part := range parts {
if !isValidPart(partKind(i), part) {
return false
}
if !isValidPart(kindNamespace, n.Namespace) {
return false
}
// Kind is optional - only validate if present
if n.Kind != "" && !isValidPart(kindKind, n.Kind) {
return false
}
if !isValidPart(kindModel, n.Model) {
return false
}
if !isValidPart(kindTag, n.Tag) {
return false
}
return true
}
@@ -276,6 +332,7 @@ func (n Name) IsFullyQualified() bool {
// host to tag as a directory in the form:
//
// {host}/{namespace}/{model}/{tag}
// {host}/{namespace}/{kind}/{model}/{tag}
//
// It uses the system's filepath separator and ensures the path is clean.
//
@@ -285,6 +342,15 @@ func (n Name) Filepath() string {
if !n.IsFullyQualified() {
panic("illegal attempt to get filepath of invalid name")
}
if n.Kind != "" {
return filepath.Join(
n.Host,
n.Namespace,
n.Kind,
n.Model,
n.Tag,
)
}
return filepath.Join(
n.Host,
n.Namespace,
@@ -301,6 +367,7 @@ func (n Name) LogValue() slog.Value {
func (n Name) EqualFold(o Name) bool {
return strings.EqualFold(n.Host, o.Host) &&
strings.EqualFold(n.Namespace, o.Namespace) &&
strings.EqualFold(n.Kind, o.Kind) &&
strings.EqualFold(n.Model, o.Model) &&
strings.EqualFold(n.Tag, o.Tag)
}
@@ -317,6 +384,11 @@ func isValidLen(kind partKind, s string) bool {
}
func isValidPart(kind partKind, s string) bool {
// Kind must be one of the valid values
if kind == kindKind {
return ValidKinds[s]
}
if !isValidLen(kind, s) {
return false
}
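
A hypothetical round trip through the kind-aware parsing and filepath logic above (Unix separators shown):

package main

import (
    "fmt"

    "github.com/ollama/ollama/types/model"
)

func main() {
    n := model.ParseName("myuser/skill/calculator:1.0.0")
    fmt.Println(n.Kind)       // skill
    fmt.Println(n.Filepath()) // registry.ollama.ai/myuser/skill/calculator/1.0.0

    // The 5-part filepath parses back to an equivalent name.
    back := model.ParseNameFromFilepath(n.Filepath())
    fmt.Println(back.EqualFold(n)) // true

    // Without a recognized kind, the middle segment stays a namespace,
    // so existing 3-part names keep their meaning.
    n = model.ParseName("example.com/myuser/llama3")
    fmt.Println(n.Kind == "") // true
}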


@@ -4,7 +4,6 @@ package agent
import (
"fmt"
"os"
"path"
"path/filepath"
"strings"
"sync"
@@ -180,7 +179,6 @@ func FormatDeniedResult(command string, pattern string) string {
// extractBashPrefix extracts a prefix pattern from a bash command.
// For commands like "cat tools/tools_test.go | head -200", returns "cat:tools/"
// For commands without path args, returns empty string.
// Paths with ".." traversal that escape the base directory return empty string for security.
func extractBashPrefix(command string) string {
// Split command by pipes and get the first part
parts := strings.Split(command, "|")
@@ -206,8 +204,8 @@ func extractBashPrefix(command string) string {
return ""
}
// Find the first path-like argument (must contain / or \ or start with .)
// First pass: look for clear paths (containing path separators or starting with .)
// Find the first path-like argument (must contain / or start with .)
// First pass: look for clear paths (containing / or starting with .)
for _, arg := range fields[1:] {
// Skip flags
if strings.HasPrefix(arg, "-") {
@@ -217,49 +215,19 @@ func extractBashPrefix(command string) string {
if isNumeric(arg) {
continue
}
// Only process if it looks like a path (contains / or \ or starts with .)
if !strings.Contains(arg, "/") && !strings.Contains(arg, "\\") && !strings.HasPrefix(arg, ".") {
// Only process if it looks like a path (contains / or starts with .)
if !strings.Contains(arg, "/") && !strings.HasPrefix(arg, ".") {
continue
}
// Normalize to forward slashes for consistent cross-platform matching
arg = strings.ReplaceAll(arg, "\\", "/")
// Security: reject absolute paths
if path.IsAbs(arg) {
return "" // Absolute path - don't create prefix
// If arg ends with /, it's a directory - use it directly
if strings.HasSuffix(arg, "/") {
return fmt.Sprintf("%s:%s", baseCmd, arg)
}
// Normalize the path using stdlib path.Clean (resolves . and ..)
cleaned := path.Clean(arg)
// Security: reject if cleaned path escapes to parent directory
if strings.HasPrefix(cleaned, "..") {
return "" // Path escapes - don't create prefix
}
// Security: if original had "..", verify cleaned path didn't escape to sibling
// e.g., "tools/a/b/../../../etc" -> "etc" (escaped tools/ to sibling)
if strings.Contains(arg, "..") {
origBase := strings.SplitN(arg, "/", 2)[0]
cleanedBase := strings.SplitN(cleaned, "/", 2)[0]
if origBase != cleanedBase {
return "" // Path escaped to sibling directory
}
}
// Check if arg ends with / (explicit directory)
isDir := strings.HasSuffix(arg, "/")
// Get the directory part
var dir string
if isDir {
dir = cleaned
} else {
dir = path.Dir(cleaned)
}
// Get the directory part of a file path
dir := filepath.Dir(arg)
if dir == "." {
return fmt.Sprintf("%s:./", baseCmd)
// Path is just a directory like "tools" or "src" (no trailing /)
return fmt.Sprintf("%s:%s/", baseCmd, arg)
}
return fmt.Sprintf("%s:%s/", baseCmd, dir)
}
@@ -364,8 +332,6 @@ func AllowlistKey(toolName string, args map[string]any) string {
}
// IsAllowed checks if a tool/command is allowed (exact match or prefix match).
// For bash commands, hierarchical path matching is used - if "cat:tools/" is allowed,
// then "cat:tools/subdir/" is also allowed (subdirectories inherit parent permissions).
func (a *ApprovalManager) IsAllowed(toolName string, args map[string]any) bool {
a.mu.RLock()
defer a.mu.RUnlock()
@@ -376,20 +342,12 @@ func (a *ApprovalManager) IsAllowed(toolName string, args map[string]any) bool {
return true
}
// For bash commands, check prefix matches with hierarchical path support
// For bash commands, check prefix matches
if toolName == "bash" {
if cmd, ok := args["command"].(string); ok {
prefix := extractBashPrefix(cmd)
if prefix != "" {
// Check exact prefix match first
if a.prefixes[prefix] {
return true
}
// Check hierarchical match: if any stored prefix is a parent of current prefix
// e.g., stored "cat:tools/" should match current "cat:tools/subdir/"
if a.matchesHierarchicalPrefix(prefix) {
return true
}
if prefix != "" && a.prefixes[prefix] {
return true
}
}
}
@@ -402,40 +360,6 @@ func (a *ApprovalManager) IsAllowed(toolName string, args map[string]any) bool {
return false
}
// matchesHierarchicalPrefix checks if the given prefix matches any stored prefix hierarchically.
// For example, if "cat:tools/" is stored, it will match "cat:tools/subdir/" or "cat:tools/a/b/c/".
func (a *ApprovalManager) matchesHierarchicalPrefix(currentPrefix string) bool {
// Split prefix into command and path parts (format: "cmd:path/")
colonIdx := strings.Index(currentPrefix, ":")
if colonIdx == -1 {
return false
}
currentCmd := currentPrefix[:colonIdx]
currentPath := currentPrefix[colonIdx+1:]
for storedPrefix := range a.prefixes {
storedColonIdx := strings.Index(storedPrefix, ":")
if storedColonIdx == -1 {
continue
}
storedCmd := storedPrefix[:storedColonIdx]
storedPath := storedPrefix[storedColonIdx+1:]
// Commands must match exactly
if currentCmd != storedCmd {
continue
}
// Check if current path starts with stored path (hierarchical match)
// e.g., "tools/subdir/" starts with "tools/"
if strings.HasPrefix(currentPath, storedPath) {
return true
}
}
return false
}
// AddToAllowlist adds a tool/command to the session allowlist.
// For bash commands, it adds the prefix pattern instead of exact command.
func (a *ApprovalManager) AddToAllowlist(toolName string, args map[string]any) {
@@ -519,12 +443,11 @@ func formatToolDisplay(toolName string, args map[string]any) string {
}
}
// For web search, show query and internet notice
// For web search, show query
if toolName == "web_search" {
if query, ok := args["query"].(string); ok {
sb.WriteString(fmt.Sprintf("Tool: %s\n", toolName))
sb.WriteString(fmt.Sprintf("Query: %s\n", query))
sb.WriteString("Uses internet via ollama.com")
sb.WriteString(fmt.Sprintf("Query: %s", query))
return sb.String()
}
}
@@ -1028,79 +951,3 @@ func FormatDenyResult(toolName string, reason string) string {
}
return fmt.Sprintf("User denied execution of %s.", toolName)
}
// PromptYesNo displays a simple Yes/No prompt and returns the user's choice.
// Returns true for Yes, false for No.
func PromptYesNo(question string) (bool, error) {
fd := int(os.Stdin.Fd())
oldState, err := term.MakeRaw(fd)
if err != nil {
return false, err
}
defer term.Restore(fd, oldState)
selected := 0 // 0 = Yes, 1 = No
options := []string{"Yes", "No"}
// Hide cursor
fmt.Fprint(os.Stderr, "\033[?25l")
defer fmt.Fprint(os.Stderr, "\033[?25h")
renderYesNo := func() {
// Move to start of line and clear
fmt.Fprintf(os.Stderr, "\r\033[K")
fmt.Fprintf(os.Stderr, "\033[36m%s\033[0m ", question)
for i, opt := range options {
if i == selected {
fmt.Fprintf(os.Stderr, "\033[1;32m[%s]\033[0m ", opt)
} else {
fmt.Fprintf(os.Stderr, "\033[90m %s \033[0m ", opt)
}
}
fmt.Fprintf(os.Stderr, "\033[90m(←/→ or y/n, Enter to confirm)\033[0m")
}
renderYesNo()
buf := make([]byte, 3)
for {
n, err := os.Stdin.Read(buf)
if err != nil {
return false, err
}
if n == 1 {
switch buf[0] {
case 'y', 'Y':
selected = 0
renderYesNo()
case 'n', 'N':
selected = 1
renderYesNo()
case '\r', '\n': // Enter
fmt.Fprintf(os.Stderr, "\r\033[K") // Clear line
return selected == 0, nil
case 3: // Ctrl+C
fmt.Fprintf(os.Stderr, "\r\033[K")
return false, nil
case 27: // Escape - could be arrow key
// Read more bytes for arrow keys
continue
}
} else if n == 3 && buf[0] == 27 && buf[1] == 91 {
// Arrow keys
switch buf[2] {
case 'D': // Left
if selected > 0 {
selected--
}
renderYesNo()
case 'C': // Right
if selected < len(options)-1 {
selected++
}
renderYesNo()
}
}
}
}

View File

@@ -151,27 +151,6 @@ func TestExtractBashPrefix(t *testing.T) {
command: "head -n 100",
expected: "",
},
// Path traversal security tests
{
name: "path traversal - parent escape",
command: "cat tools/../../etc/passwd",
expected: "", // Should NOT create a prefix - path escapes
},
{
name: "path traversal - deep escape",
command: "cat tools/a/b/../../../etc/passwd",
expected: "", // Normalizes to "../etc/passwd" - escapes
},
{
name: "path traversal - absolute path",
command: "cat /etc/passwd",
expected: "", // Absolute paths should not create prefix
},
{
name: "path with safe dotdot - normalized",
command: "cat tools/subdir/../file.go",
expected: "cat:tools/", // Normalizes to tools/file.go - safe, creates prefix
},
}
for _, tt := range tests {
@@ -185,34 +164,6 @@ func TestExtractBashPrefix(t *testing.T) {
}
}
func TestApprovalManager_PathTraversalBlocked(t *testing.T) {
am := NewApprovalManager()
// Allow "cat tools/file.go" - creates prefix "cat:tools/"
am.AddToAllowlist("bash", map[string]any{"command": "cat tools/file.go"})
// Path traversal attack: should NOT be allowed
if am.IsAllowed("bash", map[string]any{"command": "cat tools/../../etc/passwd"}) {
t.Error("SECURITY: path traversal attack should NOT be allowed")
}
// Another traversal variant
if am.IsAllowed("bash", map[string]any{"command": "cat tools/../../../etc/shadow"}) {
t.Error("SECURITY: deep path traversal should NOT be allowed")
}
// Valid subdirectory access should still work
if !am.IsAllowed("bash", map[string]any{"command": "cat tools/subdir/file.go"}) {
t.Error("expected cat tools/subdir/file.go to be allowed")
}
// Safe ".." that normalizes to within allowed directory should work
// tools/subdir/../other.go normalizes to tools/other.go which is under tools/
if !am.IsAllowed("bash", map[string]any{"command": "cat tools/subdir/../other.go"}) {
t.Error("expected cat tools/subdir/../other.go to be allowed (normalizes to tools/other.go)")
}
}
func TestApprovalManager_PrefixAllowlist(t *testing.T) {
am := NewApprovalManager()
@@ -235,119 +186,6 @@ func TestApprovalManager_PrefixAllowlist(t *testing.T) {
}
}
func TestApprovalManager_HierarchicalPrefixAllowlist(t *testing.T) {
am := NewApprovalManager()
// Allow "cat tools/file.go" - this creates prefix "cat:tools/"
am.AddToAllowlist("bash", map[string]any{"command": "cat tools/file.go"})
// Should allow subdirectories (hierarchical matching)
if !am.IsAllowed("bash", map[string]any{"command": "cat tools/subdir/file.go"}) {
t.Error("expected cat tools/subdir/file.go to be allowed via hierarchical prefix")
}
// Should allow deeply nested subdirectories
if !am.IsAllowed("bash", map[string]any{"command": "cat tools/a/b/c/deep.go"}) {
t.Error("expected cat tools/a/b/c/deep.go to be allowed via hierarchical prefix")
}
// Should still allow same directory
if !am.IsAllowed("bash", map[string]any{"command": "cat tools/another.go"}) {
t.Error("expected cat tools/another.go to be allowed")
}
// Should NOT allow different base directory
if am.IsAllowed("bash", map[string]any{"command": "cat src/main.go"}) {
t.Error("expected cat src/main.go to NOT be allowed")
}
// Should NOT allow different command even in subdirectory
if am.IsAllowed("bash", map[string]any{"command": "ls tools/subdir/"}) {
t.Error("expected ls tools/subdir/ to NOT be allowed (different command)")
}
// Should NOT allow similar but different directory name
if am.IsAllowed("bash", map[string]any{"command": "cat toolsbin/file.go"}) {
t.Error("expected cat toolsbin/file.go to NOT be allowed (different directory)")
}
}
func TestApprovalManager_HierarchicalPrefixAllowlist_CrossPlatform(t *testing.T) {
am := NewApprovalManager()
// Allow with forward slashes (Unix-style)
am.AddToAllowlist("bash", map[string]any{"command": "cat tools/file.go"})
// Should work with backslashes too (Windows-style) - normalized internally
if !am.IsAllowed("bash", map[string]any{"command": "cat tools\\subdir\\file.go"}) {
t.Error("expected cat tools\\subdir\\file.go to be allowed via hierarchical prefix (Windows path)")
}
// Mixed slashes should also work
if !am.IsAllowed("bash", map[string]any{"command": "cat tools\\a/b\\c/deep.go"}) {
t.Error("expected mixed slash path to be allowed via hierarchical prefix")
}
}
func TestMatchesHierarchicalPrefix(t *testing.T) {
am := NewApprovalManager()
// Add prefix for "cat:tools/"
am.prefixes["cat:tools/"] = true
tests := []struct {
name string
prefix string
expected bool
}{
{
name: "exact match",
prefix: "cat:tools/",
expected: true, // exact match also passes HasPrefix - caller handles exact match first
},
{
name: "subdirectory",
prefix: "cat:tools/subdir/",
expected: true,
},
{
name: "deeply nested",
prefix: "cat:tools/a/b/c/",
expected: true,
},
{
name: "different base directory",
prefix: "cat:src/",
expected: false,
},
{
name: "different command same path",
prefix: "ls:tools/",
expected: false,
},
{
name: "similar directory name",
prefix: "cat:toolsbin/",
expected: false,
},
{
name: "invalid prefix format",
prefix: "cattools",
expected: false,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := am.matchesHierarchicalPrefix(tt.prefix)
if result != tt.expected {
t.Errorf("matchesHierarchicalPrefix(%q) = %v, expected %v",
tt.prefix, result, tt.expected)
}
})
}
}
func TestFormatApprovalResult(t *testing.T) {
tests := []struct {
name string

View File

@@ -6,12 +6,10 @@ import (
"errors"
"fmt"
"io"
"net/url"
"os"
"os/signal"
"strings"
"syscall"
"time"
"github.com/spf13/cobra"
"golang.org/x/term"
@@ -24,101 +22,6 @@ import (
"github.com/ollama/ollama/x/tools"
)
// Tool output capping constants
const (
// localModelTokenLimit is the token limit for local models (smaller context).
localModelTokenLimit = 4000
// defaultTokenLimit is the token limit for cloud/remote models.
defaultTokenLimit = 10000
// charsPerToken is a rough estimate of characters per token.
// TODO: Estimate tokens more accurately using tokenizer if available
charsPerToken = 4
)
// isLocalModel checks if the model is running locally (not a cloud model).
// TODO: Improve local/cloud model identification - could check model metadata
func isLocalModel(modelName string) bool {
return !strings.HasSuffix(modelName, "-cloud")
}
// isLocalServer checks if connecting to a local Ollama server.
// TODO: Could also check other indicators of local vs cloud server
func isLocalServer() bool {
host := os.Getenv("OLLAMA_HOST")
if host == "" {
return true // Default is localhost:11434
}
// Parse the URL to check host
parsed, err := url.Parse(host)
if err != nil {
return true // If can't parse, assume local
}
hostname := parsed.Hostname()
return hostname == "localhost" || hostname == "127.0.0.1" || strings.Contains(parsed.Host, ":11434")
}
// truncateToolOutput truncates tool output to prevent context overflow.
// Uses a smaller limit (4k tokens) for local models, larger (10k) for cloud/remote.
func truncateToolOutput(output, modelName string) string {
var tokenLimit int
if isLocalModel(modelName) && isLocalServer() {
tokenLimit = localModelTokenLimit
} else {
tokenLimit = defaultTokenLimit
}
maxChars := tokenLimit * charsPerToken
if len(output) > maxChars {
return output[:maxChars] + "\n... (output truncated)"
}
return output
}
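For a quick sanity check of the budgets these constants imply: 4,000 tokens at roughly 4 chars/token caps local output at 16,000 characters, while 10,000 tokens allows 40,000 characters for cloud or remote setups. A trivial worked example:

```go
package main

import "fmt"

func main() {
	const charsPerToken = 4
	fmt.Println(4000 * charsPerToken)  // 16000-char cap for local models
	fmt.Println(10000 * charsPerToken) // 40000-char cap for cloud/remote
}
```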
// waitForOllamaSignin shows the signin URL and polls until authentication completes.
func waitForOllamaSignin(ctx context.Context) error {
client, err := api.ClientFromEnvironment()
if err != nil {
return err
}
// Get signin URL from initial Whoami call
_, err = client.Whoami(ctx)
if err != nil {
var aErr api.AuthorizationError
if errors.As(err, &aErr) && aErr.SigninURL != "" {
fmt.Fprintf(os.Stderr, "\n To sign in, navigate to:\n")
fmt.Fprintf(os.Stderr, " \033[36m%s\033[0m\n\n", aErr.SigninURL)
fmt.Fprintf(os.Stderr, " \033[90mWaiting for sign in to complete...\033[0m")
// Poll until auth succeeds
ticker := time.NewTicker(2 * time.Second)
defer ticker.Stop()
for {
select {
case <-ctx.Done():
fmt.Fprintf(os.Stderr, "\n")
return ctx.Err()
case <-ticker.C:
user, whoamiErr := client.Whoami(ctx)
if whoamiErr == nil && user != nil && user.Name != "" {
fmt.Fprintf(os.Stderr, "\r\033[K \033[32mSigned in as %s\033[0m\n", user.Name)
return nil
}
// Still waiting, show dot
fmt.Fprintf(os.Stderr, ".")
}
}
}
return err
}
return nil
}
// RunOptions contains options for running an interactive agent session.
type RunOptions struct {
Model string
@@ -134,16 +37,6 @@ type RunOptions struct {
// Agent fields (managed externally for session persistence)
Tools *tools.Registry
Approval *agent.ApprovalManager
// YoloMode skips all tool approval prompts
YoloMode bool
// LastToolOutput stores the full output of the last tool execution
// for Ctrl+O expansion. Updated by Chat(), read by caller.
LastToolOutput *string
// LastToolOutputTruncated stores the truncated version shown inline
LastToolOutputTruncated *string
}
// Chat runs an agent chat loop with tool support.
@@ -184,7 +77,6 @@ func Chat(ctx context.Context, opts RunOptions) (*api.Message, error) {
var thinkTagOpened bool = false
var thinkTagClosed bool = false
var pendingToolCalls []api.ToolCall
var consecutiveErrors int // Track consecutive 500 errors for retry limit
role := "assistant"
messages := opts.Messages
@@ -267,58 +159,6 @@ func Chat(ctx context.Context, opts RunOptions) (*api.Message, error) {
return nil, nil
}
// Check for 401 Unauthorized - prompt user to sign in
var authErr api.AuthorizationError
if errors.As(err, &authErr) {
p.StopAndClear()
fmt.Fprintf(os.Stderr, "\033[33mAuthentication required to use this cloud model.\033[0m\n")
result, promptErr := agent.PromptYesNo("Sign in to Ollama?")
if promptErr == nil && result {
if signinErr := waitForOllamaSignin(ctx); signinErr == nil {
// Retry the chat request
fmt.Fprintf(os.Stderr, "\033[90mRetrying...\033[0m\n")
continue // Retry the loop
}
}
return nil, fmt.Errorf("authentication required - run 'ollama signin' to authenticate")
}
// Check for 500 errors (often tool parsing failures) - inform the model
var statusErr api.StatusError
if errors.As(err, &statusErr) && statusErr.StatusCode >= 500 {
consecutiveErrors++
p.StopAndClear()
if consecutiveErrors >= 3 {
fmt.Fprintf(os.Stderr, "\033[31m✗ Too many consecutive errors, giving up\033[0m\n")
return nil, fmt.Errorf("too many consecutive server errors: %s", statusErr.ErrorMessage)
}
fmt.Fprintf(os.Stderr, "\033[33m⚠ Server error (attempt %d/3): %s\033[0m\n", consecutiveErrors, statusErr.ErrorMessage)
// Include both the model's response and the error so it can learn
assistantContent := fullResponse.String()
if assistantContent == "" {
assistantContent = "(empty response)"
}
errorMsg := fmt.Sprintf("Your previous response caused an error: %s\n\nYour response was:\n%s\n\nPlease try again with a valid response.", statusErr.ErrorMessage, assistantContent)
messages = append(messages,
api.Message{Role: "user", Content: errorMsg},
)
// Reset state and retry
fullResponse.Reset()
thinkingContent.Reset()
thinkTagOpened = false
thinkTagClosed = false
pendingToolCalls = nil
state = &displayResponseState{}
p = progress.NewProgress(os.Stderr)
spinner = progress.NewSpinner("")
p.Add("", spinner)
continue
}
if strings.Contains(err.Error(), "upstream error") {
p.StopAndClear()
fmt.Println("An error occurred while processing your message. Please try again.")
@@ -328,9 +168,6 @@ func Chat(ctx context.Context, opts RunOptions) (*api.Message, error) {
return nil, err
}
// Reset consecutive error counter on success
consecutiveErrors = 0
// If no tool calls, we're done
if len(pendingToolCalls) == 0 || toolRegistry == nil {
break
@@ -379,12 +216,7 @@ func Chat(ctx context.Context, opts RunOptions) (*api.Message, error) {
}
// Check approval (uses prefix matching for bash commands)
// In yolo mode, skip all approval prompts
if opts.YoloMode {
if !skipApproval {
fmt.Fprintf(os.Stderr, "\033[90m▶ Running: %s\033[0m\n", formatToolShort(toolName, args))
}
} else if !skipApproval && !approval.IsAllowed(toolName, args) {
if !skipApproval && !approval.IsAllowed(toolName, args) {
result, err := approval.RequestApproval(toolName, args)
if err != nil {
fmt.Fprintf(os.Stderr, "Error requesting approval: %v\n", err)
@@ -418,23 +250,6 @@ func Chat(ctx context.Context, opts RunOptions) (*api.Message, error) {
// Execute the tool
toolResult, err := toolRegistry.Execute(call)
if err != nil {
// Check if web search needs authentication
if errors.Is(err, tools.ErrWebSearchAuthRequired) {
// Prompt user to sign in
fmt.Fprintf(os.Stderr, "\033[33m Web search requires authentication.\033[0m\n")
result, promptErr := agent.PromptYesNo("Sign in to Ollama?")
if promptErr == nil && result {
// Get signin URL and wait for auth completion
if signinErr := waitForOllamaSignin(ctx); signinErr == nil {
// Retry the web search
fmt.Fprintf(os.Stderr, "\033[90m Retrying web search...\033[0m\n")
toolResult, err = toolRegistry.Execute(call)
if err == nil {
goto toolSuccess
}
}
}
}
fmt.Fprintf(os.Stderr, "\033[31m Error: %v\033[0m\n", err)
toolResults = append(toolResults, api.Message{
Role: "tool",
@@ -443,34 +258,20 @@ func Chat(ctx context.Context, opts RunOptions) (*api.Message, error) {
})
continue
}
toolSuccess:
// Display tool output (truncated for display)
truncatedOutput := ""
if toolResult != "" {
output := toolResult
if len(output) > 300 {
output = output[:300] + "... (truncated, press Ctrl+O to expand)"
output = output[:300] + "... (truncated)"
}
truncatedOutput = output
// Show result in grey, indented
fmt.Fprintf(os.Stderr, "\033[90m %s\033[0m\n", strings.ReplaceAll(output, "\n", "\n "))
}
// Store full and truncated output for Ctrl+O toggle
if opts.LastToolOutput != nil {
*opts.LastToolOutput = toolResult
}
if opts.LastToolOutputTruncated != nil {
*opts.LastToolOutputTruncated = truncatedOutput
}
// Truncate output to prevent context overflow
toolResultForLLM := truncateToolOutput(toolResult, opts.Model)
toolResults = append(toolResults, api.Message{
Role: "tool",
Content: toolResultForLLM,
Content: toolResult,
ToolCallID: call.ID,
})
}
@@ -648,8 +449,7 @@ func checkModelCapabilities(ctx context.Context, modelName string) (supportsTool
// GenerateInteractive runs an interactive agent session.
// This is called from cmd.go when --experimental flag is set.
// If yoloMode is true, all tool approvals are skipped.
func GenerateInteractive(cmd *cobra.Command, modelName string, wordWrap bool, options map[string]any, think *api.ThinkValue, hideThinking bool, keepAlive *api.Duration, yoloMode bool) error {
func GenerateInteractive(cmd *cobra.Command, modelName string, wordWrap bool, options map[string]any, think *api.ThinkValue, hideThinking bool, keepAlive *api.Duration) error {
scanner, err := readline.New(readline.Prompt{
Prompt: ">>> ",
AltPrompt: "... ",
@@ -674,11 +474,11 @@ func GenerateInteractive(cmd *cobra.Command, modelName string, wordWrap bool, op
var toolRegistry *tools.Registry
if supportsTools {
toolRegistry = tools.DefaultRegistry()
if toolRegistry.Count() > 0 {
fmt.Fprintf(os.Stderr, "\033[90mTools available: %s\033[0m\n", strings.Join(toolRegistry.Names(), ", "))
}
if yoloMode {
fmt.Fprintf(os.Stderr, "\033[33m⚠ YOLO mode: All tool approvals will be skipped\033[0m\n")
fmt.Fprintf(os.Stderr, "Tools available: %s\n", strings.Join(toolRegistry.Names(), ", "))
// Check for OLLAMA_API_KEY for web search
if os.Getenv("OLLAMA_API_KEY") == "" {
fmt.Fprintf(os.Stderr, "\033[33mWarning: OLLAMA_API_KEY not set - web search will not work\033[0m\n")
}
} else {
fmt.Fprintf(os.Stderr, "\033[33mNote: Model does not support tools - running in chat-only mode\033[0m\n")
@@ -690,11 +490,6 @@ func GenerateInteractive(cmd *cobra.Command, modelName string, wordWrap bool, op
var messages []api.Message
var sb strings.Builder
// Track last tool output for Ctrl+O toggle
var lastToolOutput string
var lastToolOutputTruncated string
var toolOutputExpanded bool
for {
line, err := scanner.Readline()
switch {
@@ -707,20 +502,6 @@ func GenerateInteractive(cmd *cobra.Command, modelName string, wordWrap bool, op
}
sb.Reset()
continue
case errors.Is(err, readline.ErrExpandOutput):
// Ctrl+O pressed - toggle between expanded and collapsed tool output
if lastToolOutput == "" {
fmt.Fprintf(os.Stderr, "\033[90mNo tool output to expand\033[0m\n")
} else if toolOutputExpanded {
// Currently expanded, show truncated
fmt.Fprintf(os.Stderr, "\033[90m %s\033[0m\n", strings.ReplaceAll(lastToolOutputTruncated, "\n", "\n "))
toolOutputExpanded = false
} else {
// Currently collapsed, show full
fmt.Fprintf(os.Stderr, "\033[90m %s\033[0m\n", strings.ReplaceAll(lastToolOutput, "\n", "\n "))
toolOutputExpanded = true
}
continue
case err != nil:
return err
}
@@ -743,9 +524,6 @@ func GenerateInteractive(cmd *cobra.Command, modelName string, wordWrap bool, op
fmt.Fprintln(os.Stderr, " /bye Exit")
fmt.Fprintln(os.Stderr, " /?, /help Help for a command")
fmt.Fprintln(os.Stderr, "")
fmt.Fprintln(os.Stderr, "Keyboard Shortcuts:")
fmt.Fprintln(os.Stderr, " Ctrl+O Expand last tool output")
fmt.Fprintln(os.Stderr, "")
continue
case strings.HasPrefix(line, "/"):
fmt.Printf("Unknown command '%s'. Type /? for help\n", strings.Fields(line)[0])
@@ -759,21 +537,16 @@ func GenerateInteractive(cmd *cobra.Command, modelName string, wordWrap bool, op
messages = append(messages, newMessage)
opts := RunOptions{
Model: modelName,
Messages: messages,
WordWrap: wordWrap,
Options: options,
Think: think,
HideThinking: hideThinking,
KeepAlive: keepAlive,
Tools: toolRegistry,
Approval: approval,
YoloMode: yoloMode,
LastToolOutput: &lastToolOutput,
LastToolOutputTruncated: &lastToolOutputTruncated,
Model: modelName,
Messages: messages,
WordWrap: wordWrap,
Options: options,
Think: think,
HideThinking: hideThinking,
KeepAlive: keepAlive,
Tools: toolRegistry,
Approval: approval,
}
// Reset expanded state for new tool execution
toolOutputExpanded = false
assistant, err := Chat(cmd.Context(), opts)
if err != nil {

View File

@@ -1,180 +0,0 @@
package cmd
import (
"testing"
)
func TestIsLocalModel(t *testing.T) {
tests := []struct {
name string
modelName string
expected bool
}{
{
name: "local model without suffix",
modelName: "llama3.2",
expected: true,
},
{
name: "local model with version",
modelName: "qwen2.5:7b",
expected: true,
},
{
name: "cloud model",
modelName: "gpt-4-cloud",
expected: false,
},
{
name: "cloud model with version",
modelName: "claude-3-cloud",
expected: false,
},
{
name: "empty model name",
modelName: "",
expected: true,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := isLocalModel(tt.modelName)
if result != tt.expected {
t.Errorf("isLocalModel(%q) = %v, expected %v", tt.modelName, result, tt.expected)
}
})
}
}
func TestIsLocalServer(t *testing.T) {
tests := []struct {
name string
host string
expected bool
}{
{
name: "empty host (default)",
host: "",
expected: true,
},
{
name: "localhost",
host: "http://localhost:11434",
expected: true,
},
{
name: "127.0.0.1",
host: "http://127.0.0.1:11434",
expected: true,
},
{
name: "custom port on localhost",
host: "http://localhost:8080",
expected: true, // localhost is always considered local
},
{
name: "remote host",
host: "http://ollama.example.com:11434",
expected: true, // has :11434
},
{
name: "remote host different port",
host: "http://ollama.example.com:8080",
expected: false,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
t.Setenv("OLLAMA_HOST", tt.host)
result := isLocalServer()
if result != tt.expected {
t.Errorf("isLocalServer() with OLLAMA_HOST=%q = %v, expected %v", tt.host, result, tt.expected)
}
})
}
}
func TestTruncateToolOutput(t *testing.T) {
// Create outputs of different sizes
localLimitOutput := make([]byte, 20000) // > 4k tokens (16k chars)
defaultLimitOutput := make([]byte, 50000) // > 10k tokens (40k chars)
for i := range localLimitOutput {
localLimitOutput[i] = 'a'
}
for i := range defaultLimitOutput {
defaultLimitOutput[i] = 'b'
}
tests := []struct {
name string
output string
modelName string
host string
shouldTrim bool
expectedLimit int
}{
{
name: "short output local model",
output: "hello world",
modelName: "llama3.2",
host: "",
shouldTrim: false,
expectedLimit: localModelTokenLimit,
},
{
name: "long output local model - trimmed at 4k",
output: string(localLimitOutput),
modelName: "llama3.2",
host: "",
shouldTrim: true,
expectedLimit: localModelTokenLimit,
},
{
name: "long output cloud model - uses 10k limit",
output: string(localLimitOutput), // 20k chars, under 10k token limit
modelName: "gpt-4-cloud",
host: "",
shouldTrim: false,
expectedLimit: defaultTokenLimit,
},
{
name: "very long output cloud model - trimmed at 10k",
output: string(defaultLimitOutput),
modelName: "gpt-4-cloud",
host: "",
shouldTrim: true,
expectedLimit: defaultTokenLimit,
},
{
name: "long output remote server - uses 10k limit",
output: string(localLimitOutput),
modelName: "llama3.2",
host: "http://remote.example.com:8080",
shouldTrim: false,
expectedLimit: defaultTokenLimit,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
t.Setenv("OLLAMA_HOST", tt.host)
result := truncateToolOutput(tt.output, tt.modelName)
if tt.shouldTrim {
maxLen := tt.expectedLimit * charsPerToken
if len(result) > maxLen+50 { // +50 for the truncation message
t.Errorf("expected output to be truncated to ~%d chars, got %d", maxLen, len(result))
}
if result == tt.output {
t.Error("expected output to be truncated but it wasn't")
}
} else {
if result != tt.output {
t.Error("expected output to not be truncated")
}
}
})
}
}

x/imagegen/.gitignore
View File

@@ -1,38 +0,0 @@
# Build directories
build/
dist/
# CMake
CMakeCache.txt
CMakeFiles/
cmake_install.cmake
Makefile
*.cmake
# IDE
.idea/
.vscode/
*.swp
*.swo
*~
# macOS
.DS_Store
*.dSYM/
# Go
*.exe
*.exe~
*.dll
*.so
*.dylib
# Python
*.npy
/engine
weights
outputs
prompt.txt
negative.txt

View File

@@ -1,151 +0,0 @@
# MLX engine
This is a small inference engine written in Go using [MLX](https://github.com/ml-explore/mlx), Apple's array framework for machine learning.
## Goals
1. Implement multimodal runners: initially in a dedicated runner, eventually to be integrated into Ollama's primary runner.
2. Optimize image-generation memory usage and output speed.
3. (Secondary) Implement fast text-model inference for gpt-oss and Llama.
## Prerequisites
**macOS:**
- macOS 14.0+ (Sonoma or later)
- Apple Silicon (M1/M2/M3)
- Xcode Command Line Tools
**Linux (building from source):**
- NVIDIA GPU (compute capability 7.0+)
- CUDA 12.0+ toolkit
- cuDNN
**Linux (prebuilt binaries):**
- NVIDIA GPU (compute capability 7.0+)
- NVIDIA driver 525+ (CUDA runtime libs are bundled)
**Both:**
- CMake 3.25+
- Go 1.21+
## Quick Start
### 1. Build MLX
```bash
cmake -B build
cmake --build build --parallel
cmake --install build
```
This fetches MLX and mlx-c, builds them, and installs to `dist/`:
- `dist/lib/libmlxc.so` (or `.dylib`) - MLX C bindings
- `dist/lib/libmlx.a` - MLX static library
- `dist/include/` - Headers (mlx-c, CCCL for CUDA JIT)
To update the MLX version, change `MLX_GIT_TAG` in `CMakeLists.txt` and rebuild.
### 2. Download a Model
Download Llama 3.1 8B (or any compatible model) in safetensors format:
```bash
mkdir -p ./weights
# Example using huggingface-cli
hf download meta-llama/Llama-3.1-8B --local-dir ./weights/Llama-3.1-8B
hf download openai/gpt-oss-20b --local-dir ./weights/gpt-oss-20b
```
### 3. Run Inference
```bash
# Build
go build ./cmd/engine
# Text generation
./engine -model ./weights/Llama-3.1-8B -prompt "Hello, world!" -max-tokens 250
# Qwen-Image-2512 (text-to-image)
./engine -qwen-image -model ./weights/Qwen-Image-2512 -prompt "A mountain landscape at sunset" \
-width 1024 -height 1024 -steps 20 -seed 42 -output landscape.png
# Qwen-Image Edit (experimental) - 8 steps for speed, but the model recommends 50
./engine -qwen-image-edit -model ./weights/Qwen-Image-Edit-2511 \
-input-image input.png -prompt "Make it winter" -negative-prompt " " -cfg-scale 4.0 \
-steps 8 -seed 42 -output edited.png
```
## Adding a Model
Use Claude Code with this repo. See `models/CLAUDE.md` for the full guide covering:
- Porting Python models to Go (forward pass, weight loading)
- Component testing with Python reference data
- Performance optimization
Reference implementations: `llama` (LLM), `qwen_image` (image generation), `qwen_image_edit` (image editing)
## Memory Management
MLX Python/C++ uses scope-based memory management - arrays are freed when they go out of scope. Go's garbage collector is non-deterministic, so we can't rely on finalizers to free GPU memory promptly.
Instead, arrays are automatically tracked and freed on `Eval()`:
```go
// All arrays are automatically tracked when created
x := mlx.Add(a, b)
y := mlx.Matmul(x, w)
// Eval frees non-kept arrays, evaluates outputs (auto-kept)
mlx.Eval(y)
// After copying to CPU, free the array
data := y.Data()
y.Free()
```
Key points:
- All created arrays are automatically tracked
- `mlx.Eval(outputs...)` frees non-kept arrays, evaluates outputs (outputs auto-kept)
- `mlx.Keep(arrays...)` marks arrays to survive multiple Eval cycles (for weights, caches)
- Call `.Free()` when done with an array
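A short sketch of the `Keep` pattern for long-lived arrays (the `loadWeights` loader and `promptEmbedding` input are hypothetical placeholders; the mlx calls are the ones listed above):

```go
weights := loadWeights() // hypothetical loader returning []*mlx.Array
mlx.Keep(weights...)     // weights now survive every Eval cleanup

x := promptEmbedding // placeholder: a previously evaluated *mlx.Array
for step := 0; step < 4; step++ {
	y := mlx.Matmul(x, weights[0])
	mlx.Eval(y) // frees non-kept temporaries; kept weights persist
	x.Free()    // mark old activation; reclaimed on the next Eval
	x = y
}
```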
## Testing
### Running Tests
```bash
# Run all tests (tests skip if dependencies missing)
go test ./...
# Run specific model tests
go test ./models/qwen_image/...
```
### Model Weights
Tests require model weights in `./weights/<model-name>/`:
```
weights/
├── Qwen-Image-2512/ # Qwen image generation
│ ├── text_encoder/
│ ├── transformer/
│ ├── vae/
│ └── tokenizer/
├── Llama-3.1-8B/ # LLM
└── ...
```
Download models with the `hf` CLI, giving the repo ID first and the local target second:
```bash
hf download Qwen/Qwen-Image-2512 --local-dir ./weights/Qwen-Image-2512
```

View File

@@ -1,154 +0,0 @@
package cache
import "github.com/ollama/ollama/x/imagegen/mlx"
type Cache interface {
Update(k, v *mlx.Array, seqLen int) (*mlx.Array, *mlx.Array)
Offset() int
Len() int
State() []*mlx.Array
}
type KVCache struct {
keys, values *mlx.Array
offset int
step int
}
func NewKVCache() *KVCache {
return &KVCache{step: 256}
}
func (c *KVCache) Update(k, v *mlx.Array, seqLen int) (*mlx.Array, *mlx.Array) {
prev := c.offset
shape := k.Shape()
B, H, Dk := shape[0], shape[1], shape[3]
Dv := v.Shape()[3]
// Grow buffer if needed
if c.keys == nil || (prev+seqLen) > int(c.keys.Shape()[2]) {
nSteps := (c.step + seqLen - 1) / c.step
newK := mlx.Zeros([]int32{B, H, int32(nSteps * c.step), Dk}, k.Dtype())
newV := mlx.Zeros([]int32{B, H, int32(nSteps * c.step), Dv}, v.Dtype())
if c.keys != nil {
if prev%c.step != 0 {
c.keys = mlx.Slice(c.keys, []int32{0, 0, 0, 0}, []int32{B, H, int32(prev), Dk})
c.values = mlx.Slice(c.values, []int32{0, 0, 0, 0}, []int32{B, H, int32(prev), Dv})
}
c.keys = mlx.Concatenate([]*mlx.Array{c.keys, newK}, 2)
c.values = mlx.Concatenate([]*mlx.Array{c.values, newV}, 2)
} else {
c.keys, c.values = newK, newV
}
}
c.offset += seqLen
c.keys = mlx.SliceUpdateInplace(c.keys, k, []int32{0, 0, int32(prev), 0}, []int32{B, H, int32(c.offset), Dk})
c.values = mlx.SliceUpdateInplace(c.values, v, []int32{0, 0, int32(prev), 0}, []int32{B, H, int32(c.offset), Dv})
return mlx.Slice(c.keys, []int32{0, 0, 0, 0}, []int32{B, H, int32(c.offset), Dk}),
mlx.Slice(c.values, []int32{0, 0, 0, 0}, []int32{B, H, int32(c.offset), Dv})
}
func (c *KVCache) State() []*mlx.Array {
if c.keys == nil {
return nil
}
return []*mlx.Array{c.keys, c.values}
}
func (c *KVCache) Offset() int { return c.offset }
func (c *KVCache) Len() int { return c.offset }
// RotatingKVCache implements sliding window attention with bounded memory
type RotatingKVCache struct {
keys, values *mlx.Array
offset int
maxSize int
step int
idx int
}
func NewRotatingKVCache(maxSize int) *RotatingKVCache {
return &RotatingKVCache{maxSize: maxSize, step: 256}
}
func (c *RotatingKVCache) Update(k, v *mlx.Array, seqLen int) (*mlx.Array, *mlx.Array) {
if seqLen > 1 {
return c.updateConcat(k, v, seqLen)
}
return c.updateInPlace(k, v)
}
func (c *RotatingKVCache) updateInPlace(k, v *mlx.Array) (*mlx.Array, *mlx.Array) {
shape := k.Shape()
B, H, Dk := shape[0], shape[1], shape[3]
Dv := v.Shape()[3]
// Grow buffer if not yet at max
if c.keys == nil || (c.idx >= int(c.keys.Shape()[2]) && int(c.keys.Shape()[2]) < c.maxSize) {
var cap int
if c.keys != nil {
cap = int(c.keys.Shape()[2])
}
newSize := min(c.step, c.maxSize-cap)
newK := mlx.Zeros([]int32{B, H, int32(newSize), Dk}, k.Dtype())
newV := mlx.Zeros([]int32{B, H, int32(newSize), Dv}, v.Dtype())
if c.keys != nil {
c.keys = mlx.Concatenate([]*mlx.Array{c.keys, newK}, 2)
c.values = mlx.Concatenate([]*mlx.Array{c.values, newV}, 2)
} else {
c.keys, c.values = newK, newV
}
}
// Rotate when hitting max
if c.idx >= c.maxSize {
c.idx = 0
}
c.keys = mlx.SliceUpdateInplace(c.keys, k, []int32{0, 0, int32(c.idx), 0}, []int32{B, H, int32(c.idx + 1), Dk})
c.values = mlx.SliceUpdateInplace(c.values, v, []int32{0, 0, int32(c.idx), 0}, []int32{B, H, int32(c.idx + 1), Dv})
c.offset++
c.idx++
validLen := int32(min(c.offset, c.maxSize))
return mlx.Slice(c.keys, []int32{0, 0, 0, 0}, []int32{B, H, validLen, Dk}),
mlx.Slice(c.values, []int32{0, 0, 0, 0}, []int32{B, H, validLen, Dv})
}
func (c *RotatingKVCache) updateConcat(k, v *mlx.Array, seqLen int) (*mlx.Array, *mlx.Array) {
shape := k.Shape()
B, H, Dk := shape[0], shape[1], shape[3]
Dv := v.Shape()[3]
if c.keys == nil {
c.keys, c.values = k, v
} else {
c.keys = mlx.Concatenate([]*mlx.Array{c.keys, k}, 2)
c.values = mlx.Concatenate([]*mlx.Array{c.values, v}, 2)
}
c.offset += seqLen
// Trim to max_size to maintain sliding window
cap := int(c.keys.Shape()[2])
if trim := cap - c.maxSize; trim > 0 {
c.keys = mlx.Slice(c.keys, []int32{0, 0, int32(trim), 0}, []int32{B, H, int32(cap), Dk})
c.values = mlx.Slice(c.values, []int32{0, 0, int32(trim), 0}, []int32{B, H, int32(cap), Dv})
}
c.idx = int(c.keys.Shape()[2])
return c.keys, c.values
}
func (c *RotatingKVCache) State() []*mlx.Array {
if c.keys == nil {
return nil
}
return []*mlx.Array{c.keys, c.values}
}
func (c *RotatingKVCache) Offset() int { return c.offset }
func (c *RotatingKVCache) Len() int { return min(c.offset, c.maxSize) }

View File

@@ -1,162 +0,0 @@
package cache
import "github.com/ollama/ollama/x/imagegen/mlx"
// StepCache caches layer outputs across diffusion denoising steps.
// Based on DeepCache (CVPR 2024) and Learning-to-Cache (NeurIPS 2024):
// shallow layers change little between consecutive steps, so we can
// cache their outputs and skip recomputation on non-refresh steps.
//
// Supports both single-stream (Z-Image) and dual-stream (Qwen-Image) architectures:
// - Single-stream: use Get/Set for the single output per layer
// - Dual-stream: use Get/Set for stream 1 (imgH), Get2/Set2 for stream 2 (txtH)
//
// Usage (single-stream):
//
// cache := NewStepCache(15) // cache first 15 layers
// for step := 0; step < numSteps; step++ {
// refresh := cache.ShouldRefresh(step, 3) // refresh every 3 steps
// for i, layer := range layers {
// if i < 15 && !refresh && cache.Get(i) != nil {
// output = cache.Get(i) // reuse cached
// } else {
// output = layer.Forward(input)
// if i < 15 && refresh {
// cache.Set(i, output)
// }
// }
// }
// }
// cache.Free() // cleanup when done
//
// Usage (dual-stream):
//
// cache := NewStepCache(15)
// for step := 0; step < numSteps; step++ {
// refresh := cache.ShouldRefresh(step, 3)
// for i, layer := range layers {
// if i < 15 && !refresh && cache.Get(i) != nil {
// imgH, txtH = cache.Get(i), cache.Get2(i)
// } else {
// imgH, txtH = layer.Forward(imgH, txtH, ...)
// if i < 15 && refresh {
// cache.Set(i, imgH)
// cache.Set2(i, txtH)
// }
// }
// }
// }
type StepCache struct {
layers []*mlx.Array // cached layer outputs (stream 1)
layers2 []*mlx.Array // cached layer outputs (stream 2, for dual-stream models)
constant *mlx.Array // optional constant (e.g., text embeddings)
}
// NewStepCache creates a cache for the given number of layers.
func NewStepCache(numLayers int) *StepCache {
return &StepCache{
layers: make([]*mlx.Array, numLayers),
layers2: make([]*mlx.Array, numLayers),
}
}
// ShouldRefresh returns true if the cache should be refreshed at this step.
// Refresh happens on step 0, interval, 2*interval, etc.
func (c *StepCache) ShouldRefresh(step, interval int) bool {
return step%interval == 0
}
// Get returns the cached output for a layer, or nil if not cached.
func (c *StepCache) Get(layer int) *mlx.Array {
if layer < len(c.layers) {
return c.layers[layer]
}
return nil
}
// Set stores a layer output (stream 1), freeing any previous value.
func (c *StepCache) Set(layer int, arr *mlx.Array) {
if layer < len(c.layers) {
if c.layers[layer] != nil {
c.layers[layer].Free()
}
c.layers[layer] = arr
}
}
// Get2 returns the cached output for a layer (stream 2), or nil if not cached.
// Used for dual-stream architectures like Qwen-Image.
func (c *StepCache) Get2(layer int) *mlx.Array {
if layer < len(c.layers2) {
return c.layers2[layer]
}
return nil
}
// Set2 stores a layer output (stream 2), freeing any previous value.
// Used for dual-stream architectures like Qwen-Image.
func (c *StepCache) Set2(layer int, arr *mlx.Array) {
if layer < len(c.layers2) {
if c.layers2[layer] != nil {
c.layers2[layer].Free()
}
c.layers2[layer] = arr
}
}
// GetConstant returns the cached constant value.
func (c *StepCache) GetConstant() *mlx.Array {
return c.constant
}
// SetConstant stores a constant value, freeing any previous value.
func (c *StepCache) SetConstant(arr *mlx.Array) {
if c.constant != nil {
c.constant.Free()
}
c.constant = arr
}
// Arrays returns all non-nil cached arrays (for pool.Keep).
func (c *StepCache) Arrays() []*mlx.Array {
var result []*mlx.Array
if c.constant != nil {
result = append(result, c.constant)
}
for _, arr := range c.layers {
if arr != nil {
result = append(result, arr)
}
}
for _, arr := range c.layers2 {
if arr != nil {
result = append(result, arr)
}
}
return result
}
// Free releases all cached arrays. Call when generation completes.
func (c *StepCache) Free() {
if c.constant != nil {
c.constant.Free()
c.constant = nil
}
for i, arr := range c.layers {
if arr != nil {
arr.Free()
c.layers[i] = nil
}
}
for i, arr := range c.layers2 {
if arr != nil {
arr.Free()
c.layers2[i] = nil
}
}
}
// NumLayers returns the number of layers this cache can store.
func (c *StepCache) NumLayers() int {
return len(c.layers)
}

View File

@@ -1,357 +0,0 @@
package main
import (
"context"
"fmt"
"time"
"unicode/utf8"
"github.com/ollama/ollama/x/imagegen/cache"
"github.com/ollama/ollama/x/imagegen/mlx"
"github.com/ollama/ollama/x/imagegen/tokenizer"
)
// Dedicated stream for generation (like mlx-lm's generation_stream)
var generationStream *mlx.Stream
// utf8Streamer buffers decoded text and emits only complete UTF-8 characters.
// This handles cases where tokenizers output partial multi-byte sequences.
type utf8Streamer struct {
buffer []byte
}
// Write adds decoded text to the buffer and returns complete UTF-8 characters.
func (s *utf8Streamer) Write(text string) string {
s.buffer = append(s.buffer, text...)
// Find the last position that ends with a complete UTF-8 character
validLen := 0
for i := 0; i < len(s.buffer); {
r, size := utf8.DecodeRune(s.buffer[i:])
if r == utf8.RuneError && size == 1 {
// Invalid or incomplete UTF-8 sequence at this position
// Check if it could be a valid start of a multi-byte sequence
if len(s.buffer)-i < 4 {
// Might be incomplete, keep it in buffer
break
}
// Definitely invalid, skip this byte
i++
validLen = i
} else {
i += size
validLen = i
}
}
if validLen == 0 {
return ""
}
result := string(s.buffer[:validLen])
s.buffer = s.buffer[validLen:]
return result
}
// Flush returns any remaining buffered bytes (may be incomplete UTF-8).
func (s *utf8Streamer) Flush() string {
if len(s.buffer) == 0 {
return ""
}
result := string(s.buffer)
s.buffer = nil
return result
}
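To see the problem the streamer solves: a two-byte rune such as "é" (0xC3 0xA9) can arrive split across decode calls, and the standard library flags the lone first byte as incomplete. An illustrative check using only `unicode/utf8`:

```go
package main

import (
	"fmt"
	"unicode/utf8"
)

func main() {
	b := []byte("é") // 0xC3 0xA9
	r, size := utf8.DecodeRune(b[:1])
	fmt.Println(r == utf8.RuneError, size) // true 1: first byte alone is not a rune
	r, size = utf8.DecodeRune(b)
	fmt.Printf("%c %d\n", r, size) // é 2: complete once the second byte arrives
}
```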
func init() {
generationStream = mlx.NewStream()
}
// withStream runs fn with the generation stream as default
func withStream(fn func()) {
orig := mlx.GetDefaultStream()
mlx.SetDefaultStream(generationStream)
fn()
mlx.SetDefaultStream(orig)
}
type Model interface {
Tokenizer() *tokenizer.Tokenizer
VocabSize() int32
NewCache(maxSeqLen int32) []cache.Cache
Forward(input *mlx.Array, caches []cache.Cache) *mlx.Array
}
// ChatModel is an optional interface for models that support chat formatting
type ChatModel interface {
FormatPrompt(prompt string) string
}
// MultimodalModel is for models that support image input
type MultimodalModel interface {
Model
FormatPromptWithImage(prompt string) string
ExpandImageTokens(tokens []int32) []int32
ForwardWithImage(tokens *mlx.Array, image *mlx.Array, caches []cache.Cache) *mlx.Array
ImageSize() int32 // Returns expected image size for preprocessing
}
// ImageLoader loads and preprocesses an image for multimodal models
// Returns nil if path is empty
type ImageLoader func(path string, imageSize int32) (*mlx.Array, error)
type input struct {
Prompt string
Image *mlx.Array // Optional preprocessed image for multimodal models
MaxTokens int
Temperature float32
TopP float32
TopK int
WiredLimitGB int // Metal wired memory limit in GB (default 32)
}
type output struct {
Text string
Done bool
PrefillTokSec float64
GenTokSec float64
}
// Decoder wraps model + cache for autoregressive generation.
type Decoder struct {
model Model
caches []cache.Cache
vocabSize int32
temp float32
topK int
topP float32
token *mlx.Array // Current token (kept across pools)
oldCacheState []*mlx.Array // Preallocated slice for old cache state
image *mlx.Array // Optional image for multimodal prefill
}
func NewDecoder(m Model, temp float32, topK int, topP float32) *Decoder {
caches := m.NewCache(0)
return &Decoder{
model: m,
caches: caches,
vocabSize: m.VocabSize(),
temp: temp,
topK: topK,
topP: topP,
oldCacheState: make([]*mlx.Array, 0, len(caches)*2),
}
}
// SetImage sets the image for multimodal prefill (call before prefill)
func (d *Decoder) SetImage(img *mlx.Array) {
d.image = img
}
func (d *Decoder) prefill(inputIDs []int32) int {
processed := 0
// Track old cache state to free after each chunk
var oldCacheState []*mlx.Array
// For multimodal models with an image, we need to process all tokens together
// in the first forward pass so the image embeddings can be inserted properly.
// Skip chunking for multimodal prefill.
isMultimodal := d.image != nil
// Process all-but-1 tokens in chunks, eval cache state for memory management
// Skip chunking for multimodal - process everything in the final step
if !isMultimodal {
for len(inputIDs) > 1 {
chunkSize := min(2048, len(inputIDs)-1)
if chunkSize <= 0 {
break
}
chunk := inputIDs[:chunkSize]
// Save old cache state before forward
oldCacheState = oldCacheState[:0]
for _, c := range d.caches {
oldCacheState = append(oldCacheState, c.State()...)
}
var cacheState []*mlx.Array
withStream(func() {
x := mlx.NewArrayInt32(chunk, []int32{1, int32(len(chunk))})
d.model.Forward(x, d.caches)
for _, c := range d.caches {
cacheState = append(cacheState, c.State()...)
}
})
mlx.Eval(cacheState...)
// Free old cache state
for _, arr := range oldCacheState {
if arr != nil {
arr.Free()
}
}
inputIDs = inputIDs[chunkSize:]
processed += chunkSize
}
}
// Save old cache state before final step
oldCacheState = oldCacheState[:0]
for _, c := range d.caches {
oldCacheState = append(oldCacheState, c.State()...)
}
// Final token + sampling (or all tokens for multimodal)
withStream(func() {
x := mlx.NewArrayInt32(inputIDs, []int32{1, int32(len(inputIDs))})
mlx.Eval(x) // Materialize before any other evals
var logits *mlx.Array
// Use ForwardWithImage if we have an image and model supports it
if d.image != nil {
if mm, ok := d.model.(MultimodalModel); ok {
logits = mm.ForwardWithImage(x, d.image, d.caches)
d.image = nil // Only use image for first forward
} else {
logits = d.model.Forward(x, d.caches)
}
} else {
logits = d.model.Forward(x, d.caches)
}
d.token = sample(logits, d.temp, d.topK, d.topP, d.vocabSize)
})
// Keep cache state (token auto-kept by AsyncEval)
for _, c := range d.caches {
mlx.Keep(c.State()...)
}
mlx.AsyncEval(d.token)
// Free old cache state from before final step
for _, arr := range oldCacheState {
if arr != nil {
arr.Free()
}
}
mlx.ClearCache()
return processed + len(inputIDs)
}
func (d *Decoder) step() int32 {
prevToken := d.token
// Save old cache state (reuse preallocated slice)
d.oldCacheState = d.oldCacheState[:0]
for _, c := range d.caches {
d.oldCacheState = append(d.oldCacheState, c.State()...)
}
withStream(func() {
logits := d.model.Forward(mlx.Reshape(prevToken, 1, 1), d.caches)
d.token = sample(logits, d.temp, d.topK, d.topP, d.vocabSize)
})
// Keep token and new cache state so they survive cleanup
mlx.Keep(d.token)
for _, c := range d.caches {
mlx.Keep(c.State()...)
}
mlx.AsyncEval(d.token)
// Sync on previous token (GPU already working on next step)
val := prevToken.ItemInt32()
// Free old token and old cache state
prevToken.Free()
for _, arr := range d.oldCacheState {
arr.Free()
}
return val
}
func generate(ctx context.Context, m Model, in input, cb func(output)) error {
mlx.EnableCompile()
wiredLimit := in.WiredLimitGB
if wiredLimit <= 0 {
wiredLimit = 32 // default 32GB
}
mlx.MetalSetWiredLimit(uint64(wiredLimit) << 30)
temp := in.Temperature
if temp < 0 {
temp = 0.7
}
tok := m.Tokenizer()
dec := NewDecoder(m, temp, in.TopK, in.TopP)
// Apply chat template - use image template if we have an image
prompt := in.Prompt
var tokens []int32
if mm, ok := m.(MultimodalModel); ok && in.Image != nil {
prompt = mm.FormatPromptWithImage(prompt)
tokens = tok.Encode(prompt, true)
tokens = mm.ExpandImageTokens(tokens) // Expand <start_of_image> to 256 image tokens
dec.SetImage(in.Image)
} else if cm, ok := m.(ChatModel); ok {
prompt = cm.FormatPrompt(prompt)
tokens = tok.Encode(prompt, true)
} else {
tokens = tok.Encode(prompt, true)
}
prefillStart := time.Now()
prefillTokens := dec.prefill(tokens)
// Prefill measurement should include time to first token (like mlx-lm)
// Step() waits for prefill to complete and returns first token
firstToken := dec.step()
prefillTokSec := float64(prefillTokens) / time.Since(prefillStart).Seconds()
genStart := time.Now()
maxTokens := max(in.MaxTokens, 100)
var genTokens int
// UTF-8 streamer to handle partial multi-byte characters
streamer := &utf8Streamer{}
// Handle first token
genTokens++
if tok.IsEOS(firstToken) {
cb(output{Done: true, PrefillTokSec: prefillTokSec, GenTokSec: 0})
return nil
}
if text := streamer.Write(tok.Decode([]int32{firstToken})); text != "" {
cb(output{Text: text})
}
for n := 1; n < maxTokens; n++ {
if ctx.Err() != nil {
return ctx.Err()
}
token := dec.step()
genTokens++
if tok.IsEOS(token) {
break
}
if text := streamer.Write(tok.Decode([]int32{token})); text != "" {
cb(output{Text: text})
}
if n%256 == 0 {
mlx.ClearCache()
}
}
// Flush any remaining buffered bytes
if text := streamer.Flush(); text != "" {
cb(output{Text: text})
}
fmt.Printf("\nPeak memory: %.2fGB\n", float64(mlx.MetalGetPeakMemory())/(1<<30))
cb(output{Done: true, PrefillTokSec: prefillTokSec,
GenTokSec: float64(genTokens) / time.Since(genStart).Seconds()})
return nil
}

View File

@@ -1,87 +0,0 @@
package main
import (
"fmt"
"image"
"image/png"
"os"
"path/filepath"
"github.com/ollama/ollama/x/imagegen/mlx"
)
// saveImageArray saves an MLX array as a PNG image.
// Expected format: [B, C, H, W] with values in [0, 1] range and C=3 (RGB).
func saveImageArray(arr *mlx.Array, path string) error {
img, err := arrayToImage(arr)
if err != nil {
return err
}
return savePNG(img, path)
}
func savePNG(img *image.RGBA, path string) error {
if filepath.Ext(path) != ".png" {
path = path + ".png"
}
f, err := os.Create(path)
if err != nil {
return err
}
defer f.Close()
return png.Encode(f, img)
}
func arrayToImage(arr *mlx.Array) (*image.RGBA, error) {
shape := arr.Shape()
if len(shape) != 4 {
return nil, fmt.Errorf("expected 4D array [B, C, H, W], got %v", shape)
}
// Transform to [H, W, C] for image conversion
img := mlx.Squeeze(arr, 0)
arr.Free()
img = mlx.Transpose(img, 1, 2, 0)
img = mlx.Contiguous(img)
mlx.Eval(img)
imgShape := img.Shape()
H := int(imgShape[0])
W := int(imgShape[1])
C := int(imgShape[2])
if C != 3 {
img.Free()
return nil, fmt.Errorf("expected 3 channels (RGB), got %d", C)
}
// Copy to CPU and free GPU memory
data := img.Data()
img.Free()
// Write directly to Pix slice (faster than SetRGBA)
goImg := image.NewRGBA(image.Rect(0, 0, W, H))
pix := goImg.Pix
for y := 0; y < H; y++ {
for x := 0; x < W; x++ {
srcIdx := (y*W + x) * C
dstIdx := (y*W + x) * 4
pix[dstIdx+0] = uint8(clampF(data[srcIdx+0]*255+0.5, 0, 255))
pix[dstIdx+1] = uint8(clampF(data[srcIdx+1]*255+0.5, 0, 255))
pix[dstIdx+2] = uint8(clampF(data[srcIdx+2]*255+0.5, 0, 255))
pix[dstIdx+3] = 255
}
}
return goImg, nil
}
func clampF(v, min, max float32) float32 {
if v < min {
return min
}
if v > max {
return max
}
return v
}

View File

@@ -1,284 +0,0 @@
package main
import (
"context"
"encoding/json"
"flag"
"fmt"
"log"
"os"
"path/filepath"
"runtime/pprof"
"github.com/ollama/ollama/x/imagegen/mlx"
"github.com/ollama/ollama/x/imagegen/models/gemma3"
"github.com/ollama/ollama/x/imagegen/models/gpt_oss"
"github.com/ollama/ollama/x/imagegen/models/llama"
"github.com/ollama/ollama/x/imagegen/models/qwen_image"
"github.com/ollama/ollama/x/imagegen/models/qwen_image_edit"
"github.com/ollama/ollama/x/imagegen/models/zimage"
"github.com/ollama/ollama/x/imagegen/safetensors"
)
// stringSlice is a flag type that accumulates multiple values
type stringSlice []string
func (s *stringSlice) String() string {
return fmt.Sprintf("%v", *s)
}
func (s *stringSlice) Set(value string) error {
*s = append(*s, value)
return nil
}
func main() {
modelPath := flag.String("model", "", "Model directory")
prompt := flag.String("prompt", "Hello", "Prompt")
// Text generation params
maxTokens := flag.Int("max-tokens", 100, "Max tokens")
temperature := flag.Float64("temperature", 0.7, "Temperature")
topP := flag.Float64("top-p", 0.9, "Top-p sampling")
topK := flag.Int("top-k", 40, "Top-k sampling")
imagePath := flag.String("image", "", "Image path for multimodal models")
// Image generation params
width := flag.Int("width", 1024, "Image width")
height := flag.Int("height", 1024, "Image height")
steps := flag.Int("steps", 9, "Denoising steps")
seed := flag.Int64("seed", 42, "Random seed")
out := flag.String("output", "output.png", "Output path")
// Utility flags
listTensors := flag.Bool("list", false, "List tensors only")
cpuProfile := flag.String("cpuprofile", "", "Write CPU profile to file")
gpuCapture := flag.String("gpu-capture", "", "Capture GPU trace to .gputrace file (run with MTL_CAPTURE_ENABLED=1)")
layerCache := flag.Bool("layer-cache", false, "Enable layer caching for faster diffusion (Z-Image, Qwen-Image). Not compatible with CFG/negative prompts.")
wiredLimitGB := flag.Int("wired-limit", 32, "Metal wired memory limit in GB")
// Legacy mode flags
zimageFlag := flag.Bool("zimage", false, "Z-Image generation")
qwenImage := flag.Bool("qwen-image", false, "Qwen-Image text-to-image generation")
qwenImageEdit := flag.Bool("qwen-image-edit", false, "Qwen-Image-Edit image editing")
var inputImages stringSlice
flag.Var(&inputImages, "input-image", "Input image for image editing (can be specified multiple times)")
negativePrompt := flag.String("negative-prompt", "", "Negative prompt for CFG (empty = no CFG, matching Python)")
cfgScale := flag.Float64("cfg-scale", 4.0, "CFG scale for image editing")
flag.Parse()
if *modelPath == "" {
flag.Usage()
return
}
// CPU profiling
if *cpuProfile != "" {
f, err := os.Create(*cpuProfile)
if err != nil {
log.Fatal(err)
}
defer f.Close()
if err := pprof.StartCPUProfile(f); err != nil {
log.Fatal(err)
}
defer pprof.StopCPUProfile()
}
var err error
// Handle legacy mode flags that aren't unified yet
switch {
case *zimageFlag:
m := &zimage.Model{}
if loadErr := m.Load(*modelPath); loadErr != nil {
log.Fatal(loadErr)
}
var img *mlx.Array
img, err = m.GenerateFromConfig(&zimage.GenerateConfig{
Prompt: *prompt,
Width: int32(*width),
Height: int32(*height),
Steps: *steps,
Seed: *seed,
CapturePath: *gpuCapture,
LayerCache: *layerCache,
})
if err == nil {
err = saveImageArray(img, *out)
}
case *qwenImage:
m, loadErr := qwen_image.LoadPersistent(*modelPath)
if loadErr != nil {
log.Fatal(loadErr)
}
var img *mlx.Array
img, err = m.GenerateFromConfig(&qwen_image.GenerateConfig{
Prompt: *prompt,
NegativePrompt: *negativePrompt,
CFGScale: float32(*cfgScale),
Width: int32(*width),
Height: int32(*height),
Steps: *steps,
Seed: *seed,
LayerCache: *layerCache,
})
if err == nil {
err = saveImageArray(img, *out)
}
case *qwenImageEdit:
if len(inputImages) == 0 {
log.Fatal("qwen-image-edit requires at least one -input-image")
}
m, loadErr := qwen_image_edit.LoadPersistent(*modelPath)
if loadErr != nil {
log.Fatal(loadErr)
}
// For image editing, use 0 for dimensions to auto-detect from input image
// unless explicitly overridden from defaults
editWidth := int32(0)
editHeight := int32(0)
if *width != 1024 {
editWidth = int32(*width)
}
if *height != 1024 {
editHeight = int32(*height)
}
cfg := &qwen_image_edit.GenerateConfig{
Prompt: *prompt,
NegativePrompt: *negativePrompt,
CFGScale: float32(*cfgScale),
Width: editWidth,
Height: editHeight,
Steps: *steps,
Seed: *seed,
}
var img *mlx.Array
img, err = m.EditFromConfig(inputImages, cfg)
if err == nil {
err = saveImageArray(img, *out)
}
case *listTensors:
err = listModelTensors(*modelPath)
default:
// llm path
m, err := load(*modelPath)
if err != nil {
log.Fatal(err)
}
// Load image if provided and model supports it
var image *mlx.Array
if *imagePath != "" {
if mm, ok := m.(interface{ ImageSize() int32 }); ok {
image, err = gemma3.ProcessImage(*imagePath, mm.ImageSize())
if err != nil {
log.Fatal("load image:", err)
}
} else {
log.Fatal("model does not support image input")
}
}
err = generate(context.Background(), m, input{
Prompt: *prompt,
Image: image,
MaxTokens: *maxTokens,
Temperature: float32(*temperature),
TopP: float32(*topP),
TopK: *topK,
WiredLimitGB: *wiredLimitGB,
}, func(out output) {
if out.Text != "" {
fmt.Print(out.Text)
}
if out.Done {
fmt.Printf("\n\n[prefill: %.1f tok/s, gen: %.1f tok/s]\n", out.PrefillTokSec, out.GenTokSec)
}
})
}
if err != nil {
log.Fatal(err)
}
}
func listModelTensors(modelPath string) error {
weights, err := safetensors.LoadModelWeights(modelPath)
if err != nil {
return err
}
for _, name := range weights.ListTensors() {
info, _ := weights.GetTensorInfo(name)
fmt.Printf("%s: %v (%s)\n", name, info.Shape, info.Dtype)
}
return nil
}
// loadModel builds and evaluates a model using the common load pattern.
// Release safetensors BEFORE eval - lazy arrays have captured their data,
// and this reduces peak memory by ~6GB (matches mlx-lm behavior).
func loadModel[T Model](build func() T, cleanup func()) T {
m := build()
weights := mlx.Collect(m)
cleanup()
mlx.Eval(weights...)
return m
}
func load(modelPath string) (Model, error) {
kind, err := detectModelKind(modelPath)
if err != nil {
return nil, fmt.Errorf("detect model kind: %w", err)
}
switch kind {
case "gpt_oss":
return gpt_oss.Load(modelPath)
case "gemma3":
return gemma3.Load(modelPath)
case "gemma3_text":
return gemma3.LoadText(modelPath)
default:
return llama.Load(modelPath)
}
}
func detectModelKind(modelPath string) (string, error) {
indexPath := filepath.Join(modelPath, "model_index.json")
if _, err := os.Stat(indexPath); err == nil {
data, err := os.ReadFile(indexPath)
if err != nil {
return "zimage", nil
}
var index struct {
ClassName string `json:"_class_name"`
}
if err := json.Unmarshal(data, &index); err == nil {
switch index.ClassName {
case "FluxPipeline", "ZImagePipeline":
return "zimage", nil
}
}
return "zimage", nil
}
configPath := filepath.Join(modelPath, "config.json")
data, err := os.ReadFile(configPath)
if err != nil {
return "", fmt.Errorf("no config.json or model_index.json found: %w", err)
}
var cfg struct {
ModelType string `json:"model_type"`
}
if err := json.Unmarshal(data, &cfg); err != nil {
return "", fmt.Errorf("parse config.json: %w", err)
}
return cfg.ModelType, nil
}

View File

@@ -1,47 +0,0 @@
package main
import "github.com/ollama/ollama/x/imagegen/mlx"
// sampleTopK samples from top-k logits using global random state
func sampleTopK(scaledLogits *mlx.Array, k int) *mlx.Array {
neg := mlx.Neg(scaledLogits)
indices := mlx.Argpartition(neg, k-1, -1)
topKIdx := mlx.Slice(indices, []int32{0}, []int32{int32(k)})
values := mlx.TakeAlongAxis(scaledLogits, topKIdx, -1)
sampled := mlx.RandomCategorical(values, -1, 1)
return mlx.Take(topKIdx, sampled, -1)
}
// sampleTopP samples using nucleus sampling with global random state
func sampleTopP(scaledLogits *mlx.Array, p float32, vocabSize int32) *mlx.Array {
sorted := mlx.Argsort(mlx.Neg(scaledLogits), -1)
sortedLogits := mlx.TakeAlongAxis(scaledLogits, sorted, -1)
probs := mlx.Softmax(sortedLogits, -1)
cumProbs := mlx.Cumsum(probs, -1)
mask := mlx.LessScalar(cumProbs, p)
negInf := mlx.FullDtype(float32(-1e9), scaledLogits.Dtype(), vocabSize)
masked := mlx.Where(mask, sortedLogits, negInf)
sampled := mlx.RandomCategorical(masked, -1, 1)
return mlx.Take(sorted, sampled, -1)
}
// sample samples from logits at the last position
func sample(logits *mlx.Array, temp float32, topK int, topP float32, vocab int32) *mlx.Array {
// Get last position logits: [1, L, vocab] -> [vocab]
shape := logits.Shape()
seqLen := shape[1]
lastLogits := mlx.Slice(logits, []int32{0, seqLen - 1, 0}, []int32{1, seqLen, vocab})
lastLogits = mlx.Reshape(lastLogits, vocab)
if temp == 0 {
return mlx.Argmax(lastLogits, -1, false)
}
scaled := mlx.DivScalar(lastLogits, temp)
if topK > 0 && topK < int(vocab) {
return sampleTopK(scaled, topK)
}
if topP > 0 && topP < 1.0 {
return sampleTopP(scaled, topP, vocab)
}
return mlx.RandomCategorical(scaled, -1, 1)
}

View File

@@ -1,46 +0,0 @@
# MLX Memory Management
> This package will get consolidated with `x/ml/backend/mlx` in the future.
## Automatic Tracking
All arrays are automatically tracked when created. On `Eval()`, non-kept arrays are freed.
### API
```go
result := mlx.Matmul(x, w) // arrays automatically tracked
mlx.Eval(result) // free non-kept, eval result (auto-kept)
```
### Key Functions
- `mlx.Eval(outputs...)` - free non-kept arrays, then evaluate (outputs auto-kept)
- `mlx.AsyncEval(outputs...)` - async version of Eval (outputs auto-kept)
- `mlx.Keep(arrays...)` - mark arrays to survive cleanup (for weights, caches)
- `array.Free()` - mark array for cleanup on next Eval
### Loop Pattern
```go
for step := 0; step < maxTokens; step++ {
logits := model.Forward(token, caches)
oldToken := token
token = sample(logits)
// Keep cache state across iterations
for _, c := range caches {
mlx.Keep(c.State()...)
}
oldToken.Free() // mark for cleanup
mlx.AsyncEval(token) // frees old, evals new
}
```
### Notes
- `Eval()` and `AsyncEval()` auto-keep their outputs
- `Free()` marks for cleanup - actual free happens during next Eval
- Use `Keep()` for weights and cache state that must survive multiple Eval cycles
- Arrays created inside compiled closures are managed by MLX, not tracked
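One more illustration of the deferred free described above (`a`, `b`, and `out` stand for previously created arrays, as in the earlier snippets):

```go
tmp := mlx.Add(a, b)
out := mlx.Mul(tmp, tmp)
tmp.Free()    // only marked; the memory is still alive here
mlx.Eval(out) // tmp is actually reclaimed during this Eval
```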

View File

@@ -1,171 +0,0 @@
package mlx
/*
#include "mlx/c/mlx.h"
#include <stdlib.h>
// Forward declaration for Go callback
extern int goClosureCallback(mlx_vector_array* res, mlx_vector_array input, void* payload);
// Destructor for payload (Go handle)
extern void goClosureDestructor(void* payload);
*/
import "C"
import (
"runtime/cgo"
"sync"
"unsafe"
)
// inClosureCallback is set to true during closure callback execution.
var inClosureCallback bool
var closureCallbackMu sync.Mutex
// InClosureCallback returns true if we're currently executing inside a closure callback.
func InClosureCallback() bool {
closureCallbackMu.Lock()
defer closureCallbackMu.Unlock()
return inClosureCallback
}
// CompiledFunc is a compiled MLX function that can be called efficiently.
// All intermediate arrays during execution stay inside MLX - only inputs
// and outputs cross the Go boundary.
type CompiledFunc struct {
closure C.mlx_closure
compiled C.mlx_closure
}
// ClosureFunc is the signature for functions that can be compiled.
// It takes a slice of input arrays and returns a slice of output arrays.
type ClosureFunc func(inputs []*Array) []*Array
// Compile compiles a Go function into an optimized MLX closure.
// The function is traced once during compilation, then subsequent calls
// run the optimized graph without creating Go intermediate arrays.
//
// Example:
//
// compiled := mlx.Compile(func(inputs []*mlx.Array) []*mlx.Array {
// a, b := inputs[0], inputs[1]
// c := mlx.Add(a, b)
// d := mlx.Mul(c, c)
// return []*mlx.Array{d}
// })
// defer compiled.Free()
//
// result := compiled.Call(x, y)[0]
func Compile(fn ClosureFunc) *CompiledFunc {
return CompileShapeless(fn, false)
}
// CompileShapeless compiles with optional shapeless mode.
// If shapeless=true, the function works for any input shape after tracing.
func CompileShapeless(fn ClosureFunc, shapeless bool) *CompiledFunc {
// Create a cgo.Handle to prevent the Go function from being GC'd
handle := cgo.NewHandle(fn)
// Create the closure from the Go callback
closure := C.mlx_closure_new_func_payload(
(*[0]byte)(C.goClosureCallback),
unsafe.Pointer(handle),
(*[0]byte)(C.goClosureDestructor),
)
// Compile the closure
compiled := C.mlx_closure_new()
C.mlx_compile(&compiled, closure, C.bool(shapeless))
return &CompiledFunc{
closure: closure,
compiled: compiled,
}
}
// Call invokes the compiled function with the given inputs.
func (cf *CompiledFunc) Call(inputs ...*Array) []*Array {
// Pack inputs into vector
inputVec := C.mlx_vector_array_new()
for _, arr := range inputs {
C.mlx_vector_array_append_value(inputVec, arr.c)
}
// Apply compiled closure
outputVec := C.mlx_vector_array_new()
C.mlx_closure_apply(&outputVec, cf.compiled, inputVec)
C.mlx_vector_array_free(inputVec)
// Unpack outputs
numOutputs := int(C.mlx_vector_array_size(outputVec))
outputs := make([]*Array, numOutputs)
for i := 0; i < numOutputs; i++ {
var arr C.mlx_array
C.mlx_vector_array_get(&arr, outputVec, C.size_t(i))
outputs[i] = newArray(arr)
}
C.mlx_vector_array_free(outputVec)
return outputs
}
// CallEval invokes the compiled function and evaluates the results.
func (cf *CompiledFunc) CallEval(inputs ...*Array) []*Array {
outputs := cf.Call(inputs...)
Eval(outputs...)
return outputs
}
// Free releases the compiled function resources.
func (cf *CompiledFunc) Free() {
C.mlx_closure_free(cf.compiled)
C.mlx_closure_free(cf.closure)
}
// borrowArray wraps a C array WITHOUT setting up GC cleanup.
// Use this for arrays we don't own (e.g., borrowed references in callbacks).
func borrowArray(array C.mlx_array) *Array {
return &Array{c: array}
}
//export goClosureCallback
func goClosureCallback(res *C.mlx_vector_array, input C.mlx_vector_array, payload unsafe.Pointer) C.int {
// Set flag to disable AddCleanup during callback
closureCallbackMu.Lock()
inClosureCallback = true
closureCallbackMu.Unlock()
defer func() {
closureCallbackMu.Lock()
inClosureCallback = false
closureCallbackMu.Unlock()
}()
// Recover the Go function from the handle
handle := cgo.Handle(payload)
fn := handle.Value().(ClosureFunc)
// Convert input vector to Go slice - use borrowArray since MLX owns these
numInputs := int(C.mlx_vector_array_size(input))
inputs := make([]*Array, numInputs)
for i := 0; i < numInputs; i++ {
var arr C.mlx_array
C.mlx_vector_array_get(&arr, input, C.size_t(i))
inputs[i] = borrowArray(arr) // Don't set up cleanup - MLX owns these
}
// Call the Go function
outputs := fn(inputs)
// Build output vector
*res = C.mlx_vector_array_new()
for _, arr := range outputs {
C.mlx_vector_array_append_value(*res, arr.c)
}
return 0
}
//export goClosureDestructor
func goClosureDestructor(payload unsafe.Pointer) {
handle := cgo.Handle(payload)
handle.Delete()
}

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -1,277 +0,0 @@
# Model Implementation Guide
See `README.md` for memory management (critical for Go + MLX).
## Phase 1: Import & Forward Pass
- Read Python reference implementation (PyTorch/Transformers)
- Create Go struct mirroring layer hierarchy
- Implement weight loading from safetensors (see `safetensors.go`)
- Port forward pass layer-by-layer, bottom-up
- For tokenizers: check if BPE (`bpe.go`) or custom needed
**Key files to reference:** `llama` (dense LLM), `gpt_oss` (MoE LLM), `zimage` (image generation), `qwen_image_edit` (image editing)
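A minimal sketch of the struct-mirroring step, using the `weight:` tag convention that this package's `safetensors.LoadModule` reads (field names here are illustrative, not a real model):
```go
package mymodel // hypothetical example package

import "github.com/ollama/ollama/x/imagegen/nn"

// Each struct mirrors one level of the Python module tree; the weight tag
// is the safetensors key prefix the loader uses to populate the field.
type Attention struct {
	QProj *nn.Linear `weight:"self_attn.q_proj"`
	KProj *nn.Linear `weight:"self_attn.k_proj"`
	VProj *nn.Linear `weight:"self_attn.v_proj"`
	OProj *nn.Linear `weight:"self_attn.o_proj"`
}

type Layer struct {
	InputNorm *nn.RMSNorm `weight:"input_layernorm"`
	Attention *Attention // untagged: the nested prefix comes from the parent
}

type Model struct {
	EmbedTokens *nn.Embedding `weight:"model.embed_tokens"`
	Layers      []*Layer      `weight:"model.layers"`
	Norm        *nn.RMSNorm   `weight:"model.norm"`
}
```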
### Vision Models: Image Preprocessing
When implementing vision models (image-to-text, image editing, etc.), image preprocessing must match Python exactly. Common pitfalls:
1. **Resolution constraints**: Many vision models use `min_pixels` and `max_pixels` to constrain image size, not a fixed target area. Check the Python processor's `smart_resize` logic.
2. **Patch alignment**: Images must be resized to multiples of `factor = patch_size * spatial_merge_size` (e.g., 14 \* 2 = 28 for Qwen2.5-VL).
3. **Normalization**: Vision encoders use ImageNet stats (mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711]), not simple [-1, 1] scaling.
4. **Temporal dimension**: Video/image models may expect a temporal dimension (e.g., `[B, T, C, H, W]`). For single images, duplicate frames if `temporal_patch_size > 1`.
**Verification**: Always compare Go preprocessed image shape and statistics against Python to catch sizing mismatches early.
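As a concrete check on point 1, here is a sketch of the `smart_resize` constraint under Qwen2.5-VL-style rules; treat the exact rounding as an assumption and diff the result against the Python processor:
```go
package main

import (
	"fmt"
	"math"
)

// smartResize snaps both sides to multiples of factor, then scales the
// total area into [minPixels, maxPixels].
func smartResize(h, w, factor, minPixels, maxPixels int) (int, int) {
	snap := func(x float64) int { return int(math.Round(x/float64(factor))) * factor }
	hBar, wBar := snap(float64(h)), snap(float64(w))
	if hBar*wBar > maxPixels {
		beta := math.Sqrt(float64(h*w) / float64(maxPixels))
		hBar = int(math.Floor(float64(h)/beta/float64(factor))) * factor
		wBar = int(math.Floor(float64(w)/beta/float64(factor))) * factor
	} else if hBar*wBar < minPixels {
		beta := math.Sqrt(float64(minPixels) / float64(h*w))
		hBar = int(math.Ceil(float64(h)*beta/float64(factor))) * factor
		wBar = int(math.Ceil(float64(w)*beta/float64(factor))) * factor
	}
	return hBar, wBar
}

func main() {
	// factor = patch_size * spatial_merge_size = 14 * 2 for Qwen2.5-VL
	fmt.Println(smartResize(1080, 1920, 28, 56*56, 14*14*4*1280))
}
```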
### Tokenizer & Chat Templates
Most instruction-tuned models require:
1. **BOS token**: Added at the start of input (the ID is model-specific; e.g., 2 for Gemma, 1 for Llama 2)
2. **Chat template**: Wraps user prompt in model-specific format
**Common chat templates:**
| Model | Format |
| ------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
| Llama 3 | `<\|begin_of_text\|><\|start_header_id\|>user<\|end_header_id\|>\n{prompt}<\|eot_id\|><\|start_header_id\|>assistant<\|end_header_id\|>\n` |
| Gemma 3 | `<bos><start_of_turn>user\n{prompt}<end_of_turn>\n<start_of_turn>model\n` |
| Qwen | `<\|im_start\|>user\n{prompt}<\|im_end\|>\n<\|im_start\|>assistant\n` |
**Checking tokenization:**
```bash
source .venv/bin/activate && python3 -c "
from transformers import AutoTokenizer
tok = AutoTokenizer.from_pretrained('./weights/model-name')
tokens = tok.encode('Hello', add_special_tokens=True)
print('Tokens:', tokens)
print('Decoded:', [tok.decode([t]) for t in tokens])
"
```
### Text Model Checklist
Before moving to vision components, ensure the text model is fully working:
1. **Sliding window cache**: Some models (Gemma 3, GPT-OSS) use sliding window attention on certain layers. Use `cache.NewRotatingKVCache(windowSize)` for those layers, not `cache.NewKVCache()`. Check config for `sliding_window` and `sliding_window_pattern`.
2. **Unicode/UTF-8 decoding**: If output shows garbled characters like `Â` before spaces, the tokenizer's byte-level encoding isn't being decoded properly. Check `Decode()` handles UTF-8 byte sequences correctly.
3. **EOS tokens from vocabulary**: Don't hardcode EOS token IDs. The tokenizer should extract them from `added_tokens` in `tokenizer.json`. Multiple EOS tokens are common (e.g., Gemma has both `<eos>` and `<end_of_turn>`). See the sketch after this list.
4. **Chat template**: Instruction-tuned models need chat formatting. Test with and without to ensure the model responds coherently.
5. **Compare with reference**: Always test against `mlx_lm.generate` with same prompt and `--temp 0` to verify outputs match.
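For item 3, a small sketch of driving the decode loop from a stop-token set rather than a hardcoded ID (the IDs are placeholders for whatever the tokenizer extracts from `added_tokens`):
```go
package main

import "fmt"

// stopSet turns the tokenizer's EOS token IDs into a set; multiple stop
// tokens per model are the normal case, not the exception.
func stopSet(ids []int32) map[int32]bool {
	s := make(map[int32]bool, len(ids))
	for _, id := range ids {
		s[id] = true
	}
	return s
}

func main() {
	stop := stopSet([]int32{1, 106}) // placeholder IDs from tokenizer.json
	for _, tok := range []int32{2012, 318, 106, 42} {
		if stop[tok] {
			fmt.Println("stopping at token", tok)
			break
		}
	}
}
```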
## Phase 2: Correctness Testing
Run the model and inspect the output; make sure it is coherent.
To verify correctness layer by layer, add hooks to the Python reference model and compare its intermediate outputs against debug prints in the Go implementation.
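A tiny helper sketch for the Go side (assuming `Data()` returns a `[]float32`, as its use elsewhere in this package suggests):
```go
// debugStats evals x and prints its shape and leading values; diff these
// against the Python hook output at the same layer.
func debugStats(name string, x *mlx.Array) {
	mlx.Eval(x)
	d := x.Data()
	fmt.Println(name, x.Shape(), d[:min(4, len(d))])
}
```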
## Phase 3: Memory Verification
After loading, verify peak memory is close to final model size:
```bash
# Run and check peak vs active memory
/tmp/engine -model ./weights/MyModel -steps 1 2>&1 | grep -E "(peak|GB)"
```
**Expected:** Peak should be ~1.1x final size (small overhead is OK). If peak is 2-3x final size, you have a memory problem.
### Checking Weight Dtypes
```bash
# Check dtype of weights in safetensors files
python3 -c "
from safetensors import safe_open
f = safe_open('model.safetensors', 'pt')
for k in list(f.keys())[:5]:
print(k, f.get_tensor(k).dtype)
"
```
### f32 Weights Need Special Handling
If weights are f32 but model runs in bf16, use `GetTensorBF16()` instead of `GetTensor()`:
- `GetTensor()` uses MLX's native loader (loads all tensors from file at once)
- `GetTensorBF16()` loads one tensor at a time, converts to bf16, frees f32 immediately
This prevents peak memory from being 2x model size during loading.
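A sketch of choosing between the two loaders; the `(*mlx.Array, error)` return shape is an assumption about this package's `ModelWeights` API:
```go
import (
	"github.com/ollama/ollama/x/imagegen/mlx"
	"github.com/ollama/ollama/x/imagegen/safetensors"
)

// loadWeight picks the dtype-aware path for f32 checkpoints so that peak
// memory tracks the bf16 model size rather than the f32 file size.
func loadWeight(w *safetensors.ModelWeights, name string, f32Checkpoint bool) (*mlx.Array, error) {
	if f32Checkpoint {
		// per-tensor: read f32, convert to bf16, free the f32 copy immediately
		return w.GetTensorBF16(name)
	}
	// native bulk loader: fine when the checkpoint dtype already matches
	return w.GetTensor(name)
}
```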
## Phase 4: Performance
### Evaluation Strategy
- Call `mlx.Eval()` once per token/step, not inside loops
- Use `mlx.AsyncEval()` to pipeline: build next step's graph while current executes
- Never call `mlx.Eval()` inside attention or MLP - batch it at the end
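Restated from the memory README, the per-token shape this strategy implies, with one eval point per step (`model`, `sample`, and `caches` as in the loop pattern there):
```go
for step := 0; step < maxTokens; step++ {
	logits := model.Forward(token, caches) // lazy: builds the graph only
	old := token
	token = sample(logits) // still lazy
	for _, c := range caches {
		mlx.Keep(c.State()...) // cache state must survive cleanup
	}
	old.Free()           // actual free happens at the next eval
	mlx.AsyncEval(token) // the single eval point for this step
}
```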
### Fast Operations (Already Built-in)
These Go functions use MLX's fast fused kernels internally:
- `mlx.RMSNorm(x, weight, eps)` → uses `mlx_fast_rms_norm`
- `mlx.RoPE(x, dims, traditional, base, scale, offset)` → uses `mlx_fast_rope`
- `mlx.ScaledDotProductAttention(q, k, v, scale, causalMask)` → uses `mlx_fast_scaled_dot_product_attention`
### Type Promotion Gotchas
- `mlx.Mul(bf16Array, mlx.Full(shape, 2.0, mlx.Float32))` → upcasts everything to f32
- Use `mlx.MulScalar(bf16Array, 2.0)` to preserve dtype (if available), or ensure scalar arrays match input dtype
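A short sketch of the promotion rule; `mlx.DtypeBFloat16` is an assumed constant name, so check this package's dtype definitions:
```go
// x is some bf16 activation; the scalar-array multiply silently upcasts
// it to f32, while MulScalar preserves bf16.
x := mlx.AsType(mlx.Full([]int32{4}, 1.0, mlx.Float32), mlx.DtypeBFloat16)
promoted := mlx.Mul(x, mlx.Full([]int32{4}, 2.0, mlx.Float32)) // f32 result
kept := mlx.MulScalar(x, 2.0)                                  // bf16 result
_, _ = promoted, kept
```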
### Profiling
- Use `mactop` to check GPU utilization - should be ~100%
- If low, bottleneck is likely Go code (tokenization, data prep), not MLX
- Use `pprof` for CPU profiling to find Go-side overhead (CGO calls, tokenization, etc.)
- Use Metal debugger for kernel-level profiling (see docs/performance.md)
- Profile with `time.Since()` around major blocks (see the sketch after this list)
- Compare tok/s against reference (llama.cpp, MLX-LM)
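A minimal `time.Since()` sketch; because MLX is lazy, time after an `Eval()` or you only measure graph construction:
```go
start := time.Now()
logits := model.Forward(tokens, caches)
mlx.Eval(logits) // force execution so the measurement covers the GPU work
log.Printf("forward+eval: %s", time.Since(start))
```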
## Phase 5: Polish
- Remove debug prints
- Add proper error handling
- Document config.json fields used
## Tips
- MLX is lazy; call `Eval()` only when you need values
- Check `model.safetensors.index.json` for weight→file mapping
## Common Gotchas
### MLX Transpose requires Contiguous
`mlx.Transpose()` returns a view with modified strides - calling `Data()` returns the original memory layout. Always follow with `mlx.Contiguous()` if you need correct data ordering:
```go
// Wrong - Data() returns original layout
x = mlx.Transpose(x, 0, 2, 3, 4, 1)
data := x.Data() // Bug: data is in wrong order
// Correct
x = mlx.Contiguous(mlx.Transpose(x, 0, 2, 3, 4, 1))
data := x.Data() // Data is in transposed order
```
### Missing Biases in Weight Loading
Python layers often have optional biases. Check the safetensors files for bias tensors:
```bash
python3 -c "from safetensors import safe_open; f=safe_open('model.safetensors','pt'); print([k for k in f.keys() if 'bias' in k])"
```
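When a bias (or any tensor) may be absent from a checkpoint, the models in this package mark the field with an `,optional` tag suffix so the loader tolerates the missing key, e.g.:
```go
type Attention struct {
	QProj *nn.Linear `weight:"self_attn.q_proj"`
	// present only in some checkpoints; left nil when the key is absent
	Sinks *mlx.Array `weight:"self_attn.sinks,optional"`
}
```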
### Don't Spam ClearCache() or Eval()
- `mlx.ClearCache()` clears the GPU cache but doesn't free arrays - it has minimal effect on memory. Don't call it repeatedly.
- `mlx.Eval()` forces synchronous evaluation and frees non-kept arrays. Call it once per step/token, not inside loops.
### Lazy Eval and Free() - The Critical Pattern
MLX arrays are lazy - operations build a graph, actual computation happens at `Eval()`. This has a critical implication for `Free()`:
```go
// WRONG: Lazy array references freed input
func BadForward(x *mlx.Array) *mlx.Array {
return mlx.Add(compute(x), x) // Returns lazy array referencing x
}
func Caller() {
result := BadForward(input)
input.Free() // Frees input, but result still references it!
mlx.Eval(result) // CRASH: "expected a non-empty mlx_array"
}
// CORRECT: Eval before caller can free inputs
func GoodForward(x *mlx.Array) *mlx.Array {
out := mlx.Add(compute(x), x)
mlx.Eval(out) // Materialize before returning
return out
}
```
**Rule**: If your function returns an array that references its input (residual connections, skip connections), you MUST `Eval()` before returning - otherwise the caller may free the input while the result still needs it.
**Debugging**: Errors like "expected a non-empty mlx_array" at Eval time often mean a tensor was freed while still referenced by a lazy graph. Add logging BEFORE the Free() calls to find which one, not inside the lazy operations.
### Data() and DataInt32() Trigger Eval
Calling `.Data()` or `.DataInt32()` on an array does an implicit `Eval()`, which frees any un-eval'd arrays:
```go
// WRONG: tokenArray gets freed when we eval image
tokenArray := mlx.NewArrayInt32(tokens, shape)
image := processImage(path) // This evals image internally
mlx.Eval(image) // This frees tokenArray!
tokenData := tokenArray.DataInt32() // CRASH: tokenArray was freed
// CORRECT: Eval arrays you need to keep before other evals
tokenArray := mlx.NewArrayInt32(tokens, shape)
mlx.Eval(tokenArray) // Materialize it first
image := processImage(path)
tokenData := tokenArray.DataInt32() // Works fine
```
**Rule**: Before calling any function that might do an `Eval()` internally, make sure to `Eval()` any arrays you'll need later. When passing arrays to model forward functions, eval them first if they were just created.
### Diffusers Pipeline vs Scheduler Defaults
Diffusers pipelines often pass custom parameters that override scheduler defaults. When writing tests, match what the **pipeline** does, not the raw scheduler:
```python
# Scheduler default (when no sigmas passed):
# sigmas from 1.0 to 1/1000 = 0.001
# But pipeline passes custom sigmas:
sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
scheduler.set_timesteps(sigmas=sigmas, ...) # 1.0 to 1/30 for 30 steps
```
Always check the pipeline source to see what parameters it passes to components.
### Diffusion Models: Timestep Scaling
Diffusion transformers use sinusoidal timestep embeddings with internal scaling. **Critical**: Check what the pipeline actually passes to the transformer, not just what the scheduler stores.
**Common pattern in diffusers (tricky!):**
- `scheduler.sigmas` = values in [0, 1] range (e.g., 1.0, 0.608, 0.02)
- `scheduler.timesteps` = sigmas × 1000 (e.g., 1000, 608, 20)
- **BUT** the pipeline often divides by 1000 before passing to transformer: `timestep=t / 1000`
- Transformer's `Timesteps` class has `scale=1000`, multiplying input by 1000
- Net effect: transformer receives sigma (0.608), scales to 608
**Verification - check the actual pipeline source:**
```bash
grep -A2 "timestep=" .venv/.../pipeline_*.py
# Look for: timestep=timestep / 1000 ← pipeline normalizes!
```
**Go approach (skip the multiply/divide dance):**
```go
// Store sigmas directly as timesteps - equivalent to Python's
// scheduler.timesteps / 1000 that the pipeline passes to transformer
s.Timesteps[i] = sigmas[i] // 0.608
// Transformer does: 0.608 * 1000 = 608 ✓
```
**Symptoms of wrong timestep scaling:**
- Noise predictions have wrong magnitude (off by orders of magnitude)
- Output images are completely noisy/corrupted or have extreme contrast
- Latents diverge from Python after first denoising step
**Key lesson:** Don't assume scheduler.timesteps is what the transformer receives - always check the pipeline's forward pass for any normalization.


@@ -1,612 +0,0 @@
package gemma3
import (
"encoding/json"
"fmt"
"math"
"os"
"path/filepath"
"github.com/ollama/ollama/x/imagegen/cache"
"github.com/ollama/ollama/x/imagegen/mlx"
"github.com/ollama/ollama/x/imagegen/nn"
"github.com/ollama/ollama/x/imagegen/safetensors"
"github.com/ollama/ollama/x/imagegen/tokenizer"
)
// TextConfig holds configuration for the text model
type TextConfig struct {
HiddenSize int32 `json:"hidden_size"`
NumHiddenLayers int32 `json:"num_hidden_layers"`
IntermediateSize int32 `json:"intermediate_size"`
NumAttentionHeads int32 `json:"num_attention_heads"`
NumKeyValueHeads int32 `json:"num_key_value_heads"`
HeadDim int32 `json:"head_dim"`
VocabSize int32 `json:"vocab_size"`
RMSNormEps float32 `json:"rms_norm_eps"`
RopeTheta float32 `json:"rope_theta"`
RopeLocalBaseFreq float32 `json:"rope_local_base_freq"`
MaxPositionEmbeddings int32 `json:"max_position_embeddings"`
SlidingWindow int32 `json:"sliding_window"`
SlidingWindowPattern int32 `json:"sliding_window_pattern"`
// Computed fields
Scale float32 `json:"-"`
}
// TextModel is the Gemma 3 text-only model
type TextModel struct {
EmbedTokens *nn.Embedding `weight:"model.embed_tokens"`
Layers []*DecoderLayer `weight:"model.layers"`
Norm *nn.RMSNorm `weight:"model.norm"`
Output *nn.Linear `weight:"-"` // Tied to EmbedTokens, set manually
// Precomputed (1 + weight) for Gemma-style RMSNorm to avoid allocation per forward
NormScaled *mlx.Array `weight:"-"`
tok *tokenizer.Tokenizer
*TextConfig
}
// DecoderLayer is a single transformer block
type DecoderLayer struct {
InputNorm *nn.RMSNorm `weight:"input_layernorm"`
Attention *Attention
PostAttnNorm *nn.RMSNorm `weight:"post_attention_layernorm"`
PreFFNorm *nn.RMSNorm `weight:"pre_feedforward_layernorm"`
MLP *MLP
PostFFNorm *nn.RMSNorm `weight:"post_feedforward_layernorm"`
// Precomputed (1 + weight) for Gemma-style RMSNorm
InputNormScaled *mlx.Array `weight:"-"`
PostAttnNormScaled *mlx.Array `weight:"-"`
PreFFNormScaled *mlx.Array `weight:"-"`
PostFFNormScaled *mlx.Array `weight:"-"`
// Whether this layer uses sliding window attention
IsSliding bool
LayerIdx int32
}
// Attention implements Gemma 3 attention with Q/K normalization
type Attention struct {
QProj *nn.Linear `weight:"self_attn.q_proj"`
KProj *nn.Linear `weight:"self_attn.k_proj"`
VProj *nn.Linear `weight:"self_attn.v_proj"`
OProj *nn.Linear `weight:"self_attn.o_proj"`
QNorm *nn.RMSNorm `weight:"self_attn.q_norm"`
KNorm *nn.RMSNorm `weight:"self_attn.k_norm"`
// Precomputed (1 + weight) for Gemma-style RMSNorm
QNormScaled *mlx.Array `weight:"-"`
KNormScaled *mlx.Array `weight:"-"`
}
// MLP is the feed-forward network with GELU activation
type MLP struct {
GateProj *nn.Linear `weight:"mlp.gate_proj"`
UpProj *nn.Linear `weight:"mlp.up_proj"`
DownProj *nn.Linear `weight:"mlp.down_proj"`
}
// LoadText loads the text-only Gemma 3 model
func LoadText(modelPath string) (*TextModel, error) {
data, err := os.ReadFile(filepath.Join(modelPath, "config.json"))
if err != nil {
return nil, fmt.Errorf("load config: %w", err)
}
var cfg TextConfig
if err := json.Unmarshal(data, &cfg); err != nil {
return nil, fmt.Errorf("parse config: %w", err)
}
// Compute scale
cfg.Scale = float32(1.0 / math.Sqrt(float64(cfg.HeadDim)))
// Set defaults if not specified
if cfg.RopeTheta == 0 {
cfg.RopeTheta = 1000000
}
if cfg.RopeLocalBaseFreq == 0 {
cfg.RopeLocalBaseFreq = 10000
}
if cfg.RMSNormEps == 0 {
cfg.RMSNormEps = 1e-6
}
weights, err := safetensors.LoadModelWeights(modelPath)
if err != nil {
return nil, fmt.Errorf("load weights: %w", err)
}
tok, err := tokenizer.Load(filepath.Join(modelPath, "tokenizer.json"))
if err != nil {
return nil, fmt.Errorf("load tokenizer: %w", err)
}
m := &TextModel{
Layers: make([]*DecoderLayer, cfg.NumHiddenLayers),
TextConfig: &cfg,
tok: tok,
}
// Initialize layer metadata
for i := range m.Layers {
m.Layers[i] = &DecoderLayer{
LayerIdx: int32(i),
IsSliding: isLayerSliding(int32(i), cfg.SlidingWindowPattern),
}
}
if err := safetensors.LoadModule(m, weights, ""); err != nil {
return nil, err
}
// Tied embeddings for output
m.Output = nn.NewLinear(m.EmbedTokens.Weight, nil)
mlx.Eval(mlx.Collect(m)...)
weights.ReleaseAll()
// Precompute (1 + weight) for Gemma-style RMSNorm to avoid per-forward allocation
precomputeGemmaScaledWeights(m)
return m, nil
}
// precomputeGemmaScaledWeights computes (1 + weight) for all RMSNorm layers
// This avoids creating temporary arrays on every forward pass
func precomputeGemmaScaledWeights(m *TextModel) {
m.NormScaled = mlx.AddScalar(m.Norm.Weight, 1.0)
for _, layer := range m.Layers {
layer.InputNormScaled = mlx.AddScalar(layer.InputNorm.Weight, 1.0)
layer.PostAttnNormScaled = mlx.AddScalar(layer.PostAttnNorm.Weight, 1.0)
layer.PreFFNormScaled = mlx.AddScalar(layer.PreFFNorm.Weight, 1.0)
layer.PostFFNormScaled = mlx.AddScalar(layer.PostFFNorm.Weight, 1.0)
layer.Attention.QNormScaled = mlx.AddScalar(layer.Attention.QNorm.Weight, 1.0)
layer.Attention.KNormScaled = mlx.AddScalar(layer.Attention.KNorm.Weight, 1.0)
}
// Eval all the precomputed weights
var scaled []*mlx.Array
scaled = append(scaled, m.NormScaled)
for _, layer := range m.Layers {
scaled = append(scaled, layer.InputNormScaled, layer.PostAttnNormScaled,
layer.PreFFNormScaled, layer.PostFFNormScaled,
layer.Attention.QNormScaled, layer.Attention.KNormScaled)
}
mlx.Eval(scaled...)
}
// isLayerSliding determines if a layer uses sliding window attention
// Pattern N means every Nth layer (indices N-1, 2N-1, ...) uses full attention; all other layers use sliding window
func isLayerSliding(layerIdx, pattern int32) bool {
if pattern <= 0 {
return false // No sliding window
}
// Layer is full attention if (layerIdx + 1) % pattern == 0
return (layerIdx+1)%pattern != 0
}
// Forward runs the text model forward pass
func (m *TextModel) Forward(tokens *mlx.Array, caches []cache.Cache) *mlx.Array {
B, L := tokens.Shape()[0], tokens.Shape()[1]
// Get embeddings and scale by sqrt(hidden_size)
h := m.EmbedTokens.Forward(tokens)
h = mlx.MulScalar(h, float32(math.Sqrt(float64(m.HiddenSize))))
for i, layer := range m.Layers {
h = layer.Forward(h, caches[i], B, L, m.TextConfig)
}
// Final norm and output projection
return m.Output.Forward(mlx.RMSNorm(h, m.NormScaled, m.RMSNormEps))
}
// Forward runs a decoder layer
func (l *DecoderLayer) Forward(x *mlx.Array, c cache.Cache, B, L int32, cfg *TextConfig) *mlx.Array {
// Pre-attention norm (use precomputed scaled weight)
normed := mlx.RMSNorm(x, l.InputNormScaled, cfg.RMSNormEps)
// Attention
attnOut := l.Attention.Forward(normed, c, B, L, l.IsSliding, cfg)
// Post-attention norm and residual
attnOut = mlx.RMSNorm(attnOut, l.PostAttnNormScaled, cfg.RMSNormEps)
h := mlx.Add(x, attnOut)
// Pre-FFN norm
normed = mlx.RMSNorm(h, l.PreFFNormScaled, cfg.RMSNormEps)
// MLP
mlpOut := l.MLP.Forward(normed)
// Post-FFN norm and residual
mlpOut = mlx.RMSNorm(mlpOut, l.PostFFNormScaled, cfg.RMSNormEps)
return mlx.Add(h, mlpOut)
}
// Forward runs attention with Q/K normalization
func (a *Attention) Forward(x *mlx.Array, c cache.Cache, B, L int32, isSliding bool, cfg *TextConfig) *mlx.Array {
q := a.QProj.Forward(x)
k := a.KProj.Forward(x)
v := a.VProj.Forward(x)
// Reshape to [B, num_heads, L, head_dim]
q = mlx.AsStrided(q, []int32{B, cfg.NumAttentionHeads, L, cfg.HeadDim},
[]int64{int64(L * cfg.NumAttentionHeads * cfg.HeadDim), int64(cfg.HeadDim), int64(cfg.NumAttentionHeads * cfg.HeadDim), 1}, 0)
k = mlx.AsStrided(k, []int32{B, cfg.NumKeyValueHeads, L, cfg.HeadDim},
[]int64{int64(L * cfg.NumKeyValueHeads * cfg.HeadDim), int64(cfg.HeadDim), int64(cfg.NumKeyValueHeads * cfg.HeadDim), 1}, 0)
v = mlx.AsStrided(v, []int32{B, cfg.NumKeyValueHeads, L, cfg.HeadDim},
[]int64{int64(L * cfg.NumKeyValueHeads * cfg.HeadDim), int64(cfg.HeadDim), int64(cfg.NumKeyValueHeads * cfg.HeadDim), 1}, 0)
// Q/K normalization after reshaping (use precomputed scaled weight)
q = mlx.RMSNorm(q, a.QNormScaled, cfg.RMSNormEps)
k = mlx.RMSNorm(k, a.KNormScaled, cfg.RMSNormEps)
// Apply RoPE with appropriate theta
ropeTheta := cfg.RopeTheta
if isSliding {
ropeTheta = cfg.RopeLocalBaseFreq
}
q = mlx.RoPE(q, int(cfg.HeadDim), false, ropeTheta, 1.0, c.Offset())
k = mlx.RoPE(k, int(cfg.HeadDim), false, ropeTheta, 1.0, c.Offset())
// Update cache
k, v = c.Update(k, v, int(L))
// Repeat K/V for GQA if needed
repeatFactor := cfg.NumAttentionHeads / cfg.NumKeyValueHeads
if repeatFactor > 1 {
k = nn.RepeatKV(k, repeatFactor)
v = nn.RepeatKV(v, repeatFactor)
}
// Attention
out := mlx.ScaledDotProductAttention(q, k, v, cfg.Scale, L > 1)
out = mlx.Reshape(mlx.Transpose(out, 0, 2, 1, 3), B, L, cfg.NumAttentionHeads*cfg.HeadDim)
return a.OProj.Forward(out)
}
// compiledGeluApprox is a singleton compiled GELU function shared across all layers
var compiledGeluApprox *mlx.CompiledFunc
// getCompiledGeluApprox returns the compiled GELU function, creating it once if needed
func getCompiledGeluApprox() *mlx.CompiledFunc {
if compiledGeluApprox == nil {
compiledGeluApprox = mlx.CompileShapeless(func(inputs []*mlx.Array) []*mlx.Array {
return []*mlx.Array{geluApproxImpl(inputs[0])}
}, true)
}
return compiledGeluApprox
}
// Forward runs the MLP with GELU approximation (tanh variant)
func (m *MLP) Forward(x *mlx.Array) *mlx.Array {
gate := getCompiledGeluApprox().Call(m.GateProj.Forward(x))[0]
return m.DownProj.Forward(mlx.Mul(gate, m.UpProj.Forward(x)))
}
// geluApproxImpl computes GELU using the tanh approximation (gelu_pytorch_tanh):
// 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
func geluApproxImpl(x *mlx.Array) *mlx.Array {
// Constants
const sqrt2OverPi = 0.7978845608028654 // sqrt(2/pi)
const coeff = 0.044715
// x^3
x3 := mlx.Mul(mlx.Mul(x, x), x)
// x + 0.044715 * x^3
inner := mlx.Add(x, mlx.MulScalar(x3, coeff))
// sqrt(2/pi) * (x + 0.044715 * x^3)
scaled := mlx.MulScalar(inner, sqrt2OverPi)
// tanh(...)
tanh := mlx.Tanh(scaled)
// 1 + tanh(...)
onePlusTanh := mlx.AddScalar(tanh, 1.0)
// 0.5 * x * (1 + tanh(...))
return mlx.Mul(mlx.MulScalar(x, 0.5), onePlusTanh)
}
// gemmaRMSNorm applies Gemma-style RMS normalization: x * rsqrt(mean(x^2) + eps) * (1 + weight)
// Uses mlx.RMSNorm fast kernel with pre-computed (1 + weight)
func gemmaRMSNorm(x, weight *mlx.Array, eps float32) *mlx.Array {
// Gemma uses (1 + weight) instead of weight
scaledWeight := mlx.AddScalar(weight, 1.0)
return mlx.RMSNorm(x, scaledWeight, eps)
}
// Interface methods
func (m *TextModel) NumLayers() int { return len(m.Layers) }
func (m *TextModel) MaxContextLength() int32 { return m.MaxPositionEmbeddings }
func (m *TextModel) VocabSize() int32 { return m.TextConfig.VocabSize }
// Tokenizer returns the tokenizer wrapped to add BOS and apply chat template
func (m *TextModel) Tokenizer() *tokenizer.Tokenizer {
return m.tok
}
// FormatPrompt applies the Gemma 3 chat template to a prompt
func (m *TextModel) FormatPrompt(prompt string) string {
// Gemma 3 chat format: <start_of_turn>user\n{prompt}<end_of_turn>\n<start_of_turn>model\n
return fmt.Sprintf("<start_of_turn>user\n%s<end_of_turn>\n<start_of_turn>model\n", prompt)
}
func (m *TextModel) NewCache(maxSeqLen int32) []cache.Cache {
caches := make([]cache.Cache, len(m.Layers))
for i := range caches {
if m.Layers[i].IsSliding {
// Use rotating cache for sliding window layers
caches[i] = cache.NewRotatingKVCache(int(m.SlidingWindow))
} else {
// Use regular cache for global attention layers
caches[i] = cache.NewKVCache()
}
}
return caches
}
// Config holds config for the full multimodal model
type Config struct {
TextConfig TextConfig `json:"text_config"`
VisionConfig VisionConfig `json:"vision_config"`
// Image token config (from config.json)
BOITokenIndex int32 `json:"boi_token_index"` // <start_of_image> = 255999
EOITokenIndex int32 `json:"eoi_token_index"` // <end_of_image> = 256000
ImageTokenIndex int32 `json:"image_token_index"` // <image_soft_token> = 262144
MMTokensPerImage int32 `json:"mm_tokens_per_image"` // 256
}
// Model is the full Gemma 3 multimodal model
type Model struct {
VisionTower *VisionTower `weight:"vision_tower"`
Projector *MultiModalProjector `weight:"multi_modal_projector"`
TextModel *TextModel `weight:"language_model"`
Config *Config
tok *tokenizer.Tokenizer
}
// Load loads the full multimodal Gemma 3 model
func Load(modelPath string) (*Model, error) {
data, err := os.ReadFile(filepath.Join(modelPath, "config.json"))
if err != nil {
return nil, fmt.Errorf("load config: %w", err)
}
var cfg Config
if err := json.Unmarshal(data, &cfg); err != nil {
return nil, fmt.Errorf("parse config: %w", err)
}
// Set defaults for text config (multimodal config often has incomplete text_config)
// These defaults match transformers.Gemma3TextConfig defaults
tc := &cfg.TextConfig
if tc.HeadDim == 0 {
tc.HeadDim = 256 // Gemma 3 uses head_dim=256
}
if tc.NumAttentionHeads == 0 {
// Gemma 3 4B uses 8 attention heads (cannot infer from hidden_size/head_dim)
tc.NumAttentionHeads = 8
}
if tc.NumKeyValueHeads == 0 {
// Gemma 3 4B uses 4 KV heads (GQA with 2:1 ratio)
tc.NumKeyValueHeads = 4
}
if tc.VocabSize == 0 {
tc.VocabSize = 262208 // Gemma 3 vocab size (not 262144!)
}
if tc.RopeTheta == 0 {
tc.RopeTheta = 1000000
}
if tc.RopeLocalBaseFreq == 0 {
tc.RopeLocalBaseFreq = 10000
}
if tc.RMSNormEps == 0 {
tc.RMSNormEps = 1e-6
}
if tc.SlidingWindowPattern == 0 {
tc.SlidingWindowPattern = 6
}
if tc.MaxPositionEmbeddings == 0 {
tc.MaxPositionEmbeddings = 131072 // Gemma 3 4B default
}
// Compute text model scale
tc.Scale = float32(1.0 / math.Sqrt(float64(tc.HeadDim)))
// Set defaults for image token config
if cfg.BOITokenIndex == 0 {
cfg.BOITokenIndex = 255999 // <start_of_image>
}
if cfg.EOITokenIndex == 0 {
cfg.EOITokenIndex = 256000 // <end_of_image>
}
if cfg.ImageTokenIndex == 0 {
cfg.ImageTokenIndex = 262144 // <image_soft_token>
}
if cfg.MMTokensPerImage == 0 {
cfg.MMTokensPerImage = 256
}
weights, err := safetensors.LoadModelWeights(modelPath)
if err != nil {
return nil, fmt.Errorf("load weights: %w", err)
}
tok, err := tokenizer.Load(filepath.Join(modelPath, "tokenizer.json"))
if err != nil {
return nil, fmt.Errorf("load tokenizer: %w", err)
}
m := &Model{
VisionTower: &VisionTower{
Embeddings: &VisionEmbeddings{},
Encoder: make([]*VisionEncoderLayer, cfg.VisionConfig.NumHiddenLayers),
Config: &cfg.VisionConfig,
},
Projector: &MultiModalProjector{},
TextModel: &TextModel{
Layers: make([]*DecoderLayer, cfg.TextConfig.NumHiddenLayers),
TextConfig: &cfg.TextConfig,
},
Config: &cfg,
tok: tok,
}
// Initialize text layer metadata
for i := range m.TextModel.Layers {
m.TextModel.Layers[i] = &DecoderLayer{
LayerIdx: int32(i),
IsSliding: isLayerSliding(int32(i), cfg.TextConfig.SlidingWindowPattern),
}
}
// Initialize vision encoder layers
for i := range m.VisionTower.Encoder {
m.VisionTower.Encoder[i] = &VisionEncoderLayer{}
}
if err := safetensors.LoadModule(m, weights, ""); err != nil {
return nil, err
}
// Tied embeddings for text output
m.TextModel.Output = nn.NewLinear(m.TextModel.EmbedTokens.Weight, nil)
m.TextModel.tok = tok
mlx.Eval(mlx.Collect(m)...)
weights.ReleaseAll()
// Precompute (1 + weight) for Gemma-style RMSNorm
precomputeGemmaScaledWeights(m.TextModel)
// Precompute projector's scaled weight
m.Projector.SoftEmbNormScaled = mlx.AddScalar(m.Projector.SoftEmbNorm.Weight, 1.0)
mlx.Eval(m.Projector.SoftEmbNormScaled)
return m, nil
}
// Forward runs the text-only forward pass
func (m *Model) Forward(tokens *mlx.Array, caches []cache.Cache) *mlx.Array {
return m.TextModel.Forward(tokens, caches)
}
// ForwardWithImage runs the multimodal forward pass
// tokens: [B, L] input token IDs (with image placeholder tokens)
// image: [B, H, W, C] preprocessed image tensor
func (m *Model) ForwardWithImage(tokens *mlx.Array, image *mlx.Array, caches []cache.Cache) *mlx.Array {
B, L := tokens.Shape()[0], tokens.Shape()[1]
cfg := m.Config.TextConfig
// Find image token position FIRST before any eval that might free tokens
imageStartPos := int32(-1)
if image != nil && B == 1 {
tokenData := tokens.DataInt32() // This evals tokens
for i, t := range tokenData {
if t == m.Config.ImageTokenIndex {
imageStartPos = int32(i)
break
}
}
}
// Get text embeddings and scale
h := m.TextModel.EmbedTokens.Forward(tokens)
h = mlx.MulScalar(h, float32(math.Sqrt(float64(cfg.HiddenSize))))
// Process image if provided
if image != nil && imageStartPos >= 0 {
// Vision tower: [B, H, W, C] -> [B, num_patches, vision_hidden]
visionFeatures := m.VisionTower.Forward(image)
// Project to text space: [B, num_patches, vision_hidden] -> [B, 256, text_hidden]
imageEmbeds := m.Projector.Forward(visionFeatures, cfg.RMSNormEps)
// Eval h and imageEmbeds together so neither gets freed
mlx.Eval(h, imageEmbeds)
// Cast imageEmbeds to match text embeddings dtype (bf16)
if imageEmbeds.Dtype() != h.Dtype() {
imageEmbeds = mlx.AsType(imageEmbeds, h.Dtype())
mlx.Eval(imageEmbeds)
}
// Insert image embeddings at the known position
h = m.insertImageEmbeddingsAt(h, imageEmbeds, imageStartPos)
}
// Run through text model layers
for i, layer := range m.TextModel.Layers {
h = layer.Forward(h, caches[i], B, L, m.TextModel.TextConfig)
}
// Final norm and output projection
return m.TextModel.Output.Forward(mlx.RMSNorm(h, m.TextModel.NormScaled, cfg.RMSNormEps))
}
// insertImageEmbeddingsAt replaces image placeholder tokens with actual image embeddings
// at a known position (to avoid re-scanning tokens after eval)
// textEmbeds: [B, L, hidden_size] text embeddings
// imageEmbeds: [B, 256, hidden_size] image embeddings from projector
// startPos: starting position of image tokens in the sequence
func (m *Model) insertImageEmbeddingsAt(textEmbeds, imageEmbeds *mlx.Array, startPos int32) *mlx.Array {
numImageTokens := imageEmbeds.Shape()[1]
L := textEmbeds.Shape()[1]
// Split text embeddings: [0:startPos] + imageEmbeds + [startPos+256:L]
afterStart := startPos + numImageTokens
// Slice before image tokens: textEmbeds[:, 0:startPos, :]
before := mlx.SliceAxis(textEmbeds, 1, 0, startPos)
// Slice after image tokens: textEmbeds[:, startPos+256:L, :]
after := mlx.SliceAxis(textEmbeds, 1, afterStart, L)
// Concatenate: before + imageEmbeds + after along axis 1
return mlx.Concatenate([]*mlx.Array{before, imageEmbeds, after}, 1)
}
// Interface methods for Model
func (m *Model) NumLayers() int { return len(m.TextModel.Layers) }
func (m *Model) MaxContextLength() int32 { return m.Config.TextConfig.MaxPositionEmbeddings }
func (m *Model) VocabSize() int32 { return m.Config.TextConfig.VocabSize }
func (m *Model) Tokenizer() *tokenizer.Tokenizer { return m.tok }
func (m *Model) NewCache(maxSeqLen int32) []cache.Cache { return m.TextModel.NewCache(maxSeqLen) }
func (m *Model) ImageSize() int32 { return m.Config.VisionConfig.ImageSize }
// FormatPrompt applies the Gemma 3 multimodal chat template
func (m *Model) FormatPrompt(prompt string) string {
return fmt.Sprintf("<start_of_turn>user\n%s<end_of_turn>\n<start_of_turn>model\n", prompt)
}
// FormatPromptWithImage applies the Gemma 3 multimodal chat template with image
func (m *Model) FormatPromptWithImage(prompt string) string {
return fmt.Sprintf("<start_of_turn>user\n<start_of_image>%s<end_of_turn>\n<start_of_turn>model\n", prompt)
}
// ExpandImageTokens expands <start_of_image> into 256 image placeholder tokens
// Input tokens containing boi_token (255999) are expanded to:
// boi_token + 256 * image_token + eoi_token
func (m *Model) ExpandImageTokens(tokens []int32) []int32 {
result := make([]int32, 0, len(tokens)+int(m.Config.MMTokensPerImage)+1)
for _, t := range tokens {
if t == m.Config.BOITokenIndex {
// Expand: boi + 256 * image_token + eoi
result = append(result, m.Config.BOITokenIndex)
for i := int32(0); i < m.Config.MMTokensPerImage; i++ {
result = append(result, m.Config.ImageTokenIndex)
}
result = append(result, m.Config.EOITokenIndex)
} else {
result = append(result, t)
}
}
return result
}


@@ -1,56 +0,0 @@
package gemma3
import (
"fmt"
"image"
_ "image/jpeg"
_ "image/png"
"os"
"github.com/ollama/ollama/x/imagegen/mlx"
"golang.org/x/image/draw"
)
// ProcessImage loads and preprocesses an image for the vision tower
// Returns [1, H, W, C] tensor in NHWC format normalized for SigLIP
func ProcessImage(path string, imageSize int32) (*mlx.Array, error) {
f, err := os.Open(path)
if err != nil {
return nil, fmt.Errorf("open image: %w", err)
}
defer f.Close()
img, _, err := image.Decode(f)
if err != nil {
return nil, fmt.Errorf("decode image: %w", err)
}
return ProcessImageData(img, imageSize)
}
// ProcessImageData preprocesses an image.Image for the vision tower
func ProcessImageData(img image.Image, imageSize int32) (*mlx.Array, error) {
// Resize to target size using bilinear interpolation
resized := image.NewRGBA(image.Rect(0, 0, int(imageSize), int(imageSize)))
draw.BiLinear.Scale(resized, resized.Bounds(), img, img.Bounds(), draw.Over, nil)
// Convert to float32 array [H, W, C] and normalize
// SigLIP normalization: (pixel / 255.0 - 0.5) / 0.5 = pixel / 127.5 - 1.0
data := make([]float32, imageSize*imageSize*3)
idx := 0
for y := int32(0); y < imageSize; y++ {
for x := int32(0); x < imageSize; x++ {
r, g, b, _ := resized.At(int(x), int(y)).RGBA()
// RGBA returns 16-bit values, convert to 8-bit
data[idx] = float32(r>>8)/127.5 - 1.0
data[idx+1] = float32(g>>8)/127.5 - 1.0
data[idx+2] = float32(b>>8)/127.5 - 1.0
idx += 3
}
}
// Create MLX array [1, H, W, C] for NHWC layout
arr := mlx.NewArrayFloat32(data, []int32{1, imageSize, imageSize, 3})
mlx.Eval(arr) // Materialize to prevent use-after-free
return arr, nil
}


@@ -1,48 +0,0 @@
package gemma3
import (
"github.com/ollama/ollama/x/imagegen/mlx"
"github.com/ollama/ollama/x/imagegen/nn"
)
// MultiModalProjector projects vision features to text embedding space
type MultiModalProjector struct {
// mm_input_projection_weight: [vision_hidden, text_hidden]
InputProjection *mlx.Array `weight:"mm_input_projection_weight"`
SoftEmbNorm *nn.RMSNorm `weight:"mm_soft_emb_norm"`
// Precomputed (1 + weight) for Gemma-style RMSNorm
SoftEmbNormScaled *mlx.Array `weight:"-"`
}
// Forward projects vision features to text space
// Input: [B, num_patches, vision_hidden] (e.g., [1, 4096, 1152])
// Output: [B, num_image_tokens, text_hidden] (e.g., [1, 256, 2560])
func (p *MultiModalProjector) Forward(visionFeatures *mlx.Array, eps float32) *mlx.Array {
// Average pool 4x4: [B, 4096, 1152] -> [B, 256, 1152]
// 4096 patches = 64x64 grid, pool to 16x16 = 256 tokens
B := visionFeatures.Shape()[0]
visionHidden := visionFeatures.Shape()[2]
// Reshape to [B, 64, 64, hidden]
gridSize := int32(64) // sqrt(4096)
pooledSize := int32(16) // 64/4
h := mlx.Reshape(visionFeatures, B, gridSize, gridSize, visionHidden)
// Reshape to [B, 16, 4, 16, 4, hidden] for 4x4 pooling
h = mlx.Reshape(h, B, pooledSize, 4, pooledSize, 4, visionHidden)
// Average over pooling dimensions (axes 2 and 4)
h = mlx.Mean(h, 4, false)
h = mlx.Mean(h, 2, false)
// h is now [B, 16, 16, hidden], reshape to [B, 256, hidden]
numTokens := pooledSize * pooledSize
h = mlx.Reshape(h, B, numTokens, visionHidden)
// Apply Gemma-style RMS norm (use precomputed 1 + weight)
h = mlx.RMSNorm(h, p.SoftEmbNormScaled, eps)
// Project to text space: [B, 256, vision_hidden] @ [vision_hidden, text_hidden]
return mlx.Linear(h, p.InputProjection)
}


@@ -1,136 +0,0 @@
package gemma3
import (
"math"
"github.com/ollama/ollama/x/imagegen/mlx"
"github.com/ollama/ollama/x/imagegen/nn"
)
// VisionConfig holds configuration for the SigLIP vision tower
type VisionConfig struct {
HiddenSize int32 `json:"hidden_size"`
ImageSize int32 `json:"image_size"`
IntermediateSize int32 `json:"intermediate_size"`
NumAttentionHeads int32 `json:"num_attention_heads"`
NumHiddenLayers int32 `json:"num_hidden_layers"`
PatchSize int32 `json:"patch_size"`
}
// VisionTower is the SigLIP vision encoder
type VisionTower struct {
Embeddings *VisionEmbeddings `weight:"vision_model.embeddings"`
Encoder []*VisionEncoderLayer `weight:"vision_model.encoder.layers"`
PostLayerNorm *nn.LayerNorm `weight:"vision_model.post_layernorm"`
Config *VisionConfig
}
// VisionEmbeddings handles patch and position embeddings
type VisionEmbeddings struct {
// PatchWeight: [O, C, kH, kW] from PyTorch, transposed to [O, kH, kW, C] for MLX
PatchWeight *mlx.Array `weight:"patch_embedding.weight"`
PatchBias *mlx.Array `weight:"patch_embedding.bias"`
PosEmbed *nn.Embedding `weight:"position_embedding"`
}
// VisionEncoderLayer is a single transformer encoder layer
type VisionEncoderLayer struct {
LayerNorm1 *nn.LayerNorm `weight:"layer_norm1"`
Attention *VisionAttention `weight:"self_attn"`
LayerNorm2 *nn.LayerNorm `weight:"layer_norm2"`
MLP *VisionMLP `weight:"mlp"`
}
// VisionAttention implements multi-head self-attention
type VisionAttention struct {
QProj *nn.Linear `weight:"q_proj"`
KProj *nn.Linear `weight:"k_proj"`
VProj *nn.Linear `weight:"v_proj"`
OutProj *nn.Linear `weight:"out_proj"`
}
// VisionMLP is the feed-forward network
type VisionMLP struct {
FC1 *nn.Linear `weight:"fc1"`
FC2 *nn.Linear `weight:"fc2"`
}
// Forward runs the vision tower on preprocessed images
// Input: [B, H, W, C] normalized image tensor (NHWC layout for MLX)
// Output: [B, num_patches, hidden_size]
func (v *VisionTower) Forward(x *mlx.Array) *mlx.Array {
// Patch embedding conv: input [B, H, W, C], weight [O, kH, kW, C] -> [B, grid, grid, O]
// Weight comes as [O, C, kH, kW] from PyTorch, transpose to [O, kH, kW, C]
weight := mlx.Transpose(v.Embeddings.PatchWeight, 0, 2, 3, 1)
h := mlx.Conv2d(x, weight, v.Config.PatchSize, 0) // stride=patch_size, no padding
// Add bias: [O] -> [1, 1, 1, O] for broadcasting
bias := mlx.Reshape(v.Embeddings.PatchBias, 1, 1, 1, v.Embeddings.PatchBias.Shape()[0])
h = mlx.Add(h, bias)
// h is [B, grid, grid, hidden], flatten to [B, num_patches, hidden]
B := h.Shape()[0]
gridH, gridW := h.Shape()[1], h.Shape()[2]
hidden := h.Shape()[3]
numPatches := gridH * gridW
h = mlx.Reshape(h, B, numPatches, hidden)
// Add position embeddings
posIds := mlx.ArangeInt(0, numPatches, 1, mlx.DtypeInt32)
posEmbed := v.Embeddings.PosEmbed.Forward(posIds)
h = mlx.Add(h, posEmbed)
// Encoder layers
headDim := float32(v.Config.HiddenSize / v.Config.NumAttentionHeads)
scale := float32(1.0 / math.Sqrt(float64(headDim)))
for _, layer := range v.Encoder {
h = layer.Forward(h, v.Config, scale)
}
// Final layer norm
h = v.PostLayerNorm.Forward(h)
return h
}
// Forward runs a vision encoder layer
func (l *VisionEncoderLayer) Forward(x *mlx.Array, cfg *VisionConfig, scale float32) *mlx.Array {
// Pre-norm attention
h := l.LayerNorm1.Forward(x)
h = l.Attention.Forward(h, cfg, scale)
x = mlx.Add(x, h)
// Pre-norm MLP
h = l.LayerNorm2.Forward(x)
h = l.MLP.Forward(h)
return mlx.Add(x, h)
}
// Forward runs multi-head self-attention
func (a *VisionAttention) Forward(x *mlx.Array, cfg *VisionConfig, scale float32) *mlx.Array {
B, L := x.Shape()[0], x.Shape()[1]
headDim := cfg.HiddenSize / cfg.NumAttentionHeads
q := a.QProj.Forward(x)
k := a.KProj.Forward(x)
v := a.VProj.Forward(x)
// Reshape to [B, num_heads, L, head_dim]
q = mlx.Transpose(mlx.Reshape(q, B, L, cfg.NumAttentionHeads, headDim), 0, 2, 1, 3)
k = mlx.Transpose(mlx.Reshape(k, B, L, cfg.NumAttentionHeads, headDim), 0, 2, 1, 3)
v = mlx.Transpose(mlx.Reshape(v, B, L, cfg.NumAttentionHeads, headDim), 0, 2, 1, 3)
// Scaled dot-product attention (no causal mask for vision)
out := mlx.ScaledDotProductAttention(q, k, v, scale, false)
// Reshape back: [B, num_heads, L, head_dim] -> [B, L, hidden]
out = mlx.Reshape(mlx.Transpose(out, 0, 2, 1, 3), B, L, cfg.HiddenSize)
return a.OutProj.Forward(out)
}
// Forward runs the MLP with GELU activation
func (m *VisionMLP) Forward(x *mlx.Array) *mlx.Array {
h := mlx.GELU(m.FC1.Forward(x))
return m.FC2.Forward(h)
}


@@ -1,485 +0,0 @@
package gpt_oss
import (
"encoding/json"
"fmt"
"math"
"os"
"path/filepath"
"github.com/ollama/ollama/x/imagegen/cache"
"github.com/ollama/ollama/x/imagegen/mlx"
"github.com/ollama/ollama/x/imagegen/nn"
"github.com/ollama/ollama/x/imagegen/safetensors"
"github.com/ollama/ollama/x/imagegen/tokenizer"
)
// RopeScaling holds YaRN or other RoPE scaling configuration
type RopeScaling struct {
RopeType string `json:"rope_type"`
Factor float32 `json:"factor"`
OriginalMaxPositionEmbeddings int32 `json:"original_max_position_embeddings"`
BetaFast float32 `json:"beta_fast"`
BetaSlow float32 `json:"beta_slow"`
}
type Config struct {
HiddenSize int32 `json:"hidden_size"`
NumHiddenLayers int32 `json:"num_hidden_layers"`
IntermediateSize int32 `json:"intermediate_size"`
NumAttentionHeads int32 `json:"num_attention_heads"`
NumKeyValueHeads int32 `json:"num_key_value_heads"`
VocabSize int32 `json:"vocab_size"`
RMSNormEps float32 `json:"rms_norm_eps"`
RopeTheta float32 `json:"rope_theta"`
HeadDim int32 `json:"head_dim"`
SlidingWindow int32 `json:"sliding_window"`
NumLocalExperts int32 `json:"num_local_experts"`
NumExpertsPerTok int32 `json:"num_experts_per_tok"`
LayerTypes []string `json:"layer_types"`
SwiGLULimit float32 `json:"swiglu_limit"`
RopeScaling *RopeScaling `json:"rope_scaling"`
Scale float32 `json:"-"` // computed: 1/sqrt(HeadDim)
}
type Attention struct {
QProj *nn.Linear `weight:"self_attn.q_proj"`
KProj *nn.Linear `weight:"self_attn.k_proj"`
VProj *nn.Linear `weight:"self_attn.v_proj"`
OProj *nn.Linear `weight:"self_attn.o_proj"`
Sinks *mlx.Array `weight:"self_attn.sinks,optional"`
YarnFreqs *mlx.Array // computed
YarnMscale float32
}
// swiGLU applies the GPT-OSS custom SwiGLU activation.
// Formula: (gate * sigmoid(alpha * gate)) * (up + 1)
// with clipping: gate to [None, limit], up to [-limit, limit]
func swiGLU(gate, up *mlx.Array, alpha, limit float32) *mlx.Array {
// Clip gate to [None, limit]
gateClipped := mlx.ClipScalar(gate, 0, limit, false, true)
// Clip up to [-limit, limit]
upClipped := mlx.ClipScalar(up, -limit, limit, true, true)
// glu_scaled = alpha * gate_clipped
gluScaled := mlx.MulScalar(gateClipped, alpha)
// sig = sigmoid(glu_scaled)
sig := mlx.Sigmoid(gluScaled)
// out_glu = gate_clipped * sig
outGlu := mlx.Mul(gateClipped, sig)
// result = out_glu * (up_clipped + 1)
return mlx.Mul(outGlu, mlx.AddScalar(upClipped, 1.0))
}
// compiledSwiGLU is a singleton compiled SwiGLU function shared across all layers
var compiledSwiGLU *mlx.CompiledFunc
// getCompiledSwiGLU returns the compiled SwiGLU function, creating it once if needed
func getCompiledSwiGLU() *mlx.CompiledFunc {
if compiledSwiGLU == nil {
const alpha float32 = 1.702
const limit float32 = 7.0
compiledSwiGLU = mlx.CompileShapeless(func(inputs []*mlx.Array) []*mlx.Array {
return []*mlx.Array{swiGLU(inputs[0], inputs[1], alpha, limit)}
}, true) // shapeless=true so it works for any input size
}
return compiledSwiGLU
}
// ComputeYarnFreqs computes YaRN-modified RoPE frequencies
// Based on mlx-lm's YarnRoPE implementation
func ComputeYarnFreqs(dims int32, base, scalingFactor float32, origMaxPos int32, betaFast, betaSlow float32) (*mlx.Array, float32) {
// yarn_find_correction_dim
yarnFindCorrectionDim := func(numRotations float64) float64 {
return float64(dims) * math.Log(float64(origMaxPos)/(numRotations*2*math.Pi)) / (2 * math.Log(float64(base)))
}
// yarn_find_correction_range
low := int(math.Floor(yarnFindCorrectionDim(float64(betaFast))))
high := int(math.Ceil(yarnFindCorrectionDim(float64(betaSlow))))
if low < 0 {
low = 0
}
if high > int(dims)-1 {
high = int(dims) - 1
}
// yarn_get_mscale
yarnGetMscale := func(scale, mscale float64) float64 {
if scale <= 1 {
return 1.0
}
return 0.1*mscale*math.Log(scale) + 1.0
}
mscale := float32(yarnGetMscale(float64(scalingFactor), 1.0) / yarnGetMscale(float64(scalingFactor), 0.0))
// Compute frequencies
// freq_extra = base ** (arange(0, dims, 2) / dims)
// freq_inter = scaling_factor * freq_extra
halfDims := dims / 2
freqData := make([]float32, halfDims)
for i := int32(0); i < halfDims; i++ {
exp := float64(2*i) / float64(dims)
freqExtra := math.Pow(float64(base), exp)
freqInter := float64(scalingFactor) * freqExtra
// linear ramp mask
var freqMask float64
if low == high {
freqMask = 0.0
} else {
t := (float64(i) - float64(low)) / float64(high-low)
if t < 0 {
t = 0
}
if t > 1 {
t = 1
}
freqMask = 1.0 - t
}
// Combined frequency: (inter * extra) / (inter * mask + extra * (1 - mask))
freqData[i] = float32((freqInter * freqExtra) / (freqInter*freqMask + freqExtra*(1-freqMask)))
}
return mlx.NewArray(freqData, []int32{halfDims}), mscale
}
// initYarn initializes YaRN RoPE if configured
func (a *Attention) initYarn(cfg *Config) {
a.YarnMscale = 1.0
if cfg.RopeScaling != nil && cfg.RopeScaling.RopeType == "yarn" {
a.YarnFreqs, a.YarnMscale = ComputeYarnFreqs(
cfg.HeadDim,
cfg.RopeTheta,
cfg.RopeScaling.Factor,
cfg.RopeScaling.OriginalMaxPositionEmbeddings,
cfg.RopeScaling.BetaFast,
cfg.RopeScaling.BetaSlow,
)
}
}
func (a *Attention) Forward(x *mlx.Array, c cache.Cache, B, L int32, mask *mlx.Array, maskMode string, cfg *Config) *mlx.Array {
q := a.QProj.Forward(x)
k := a.KProj.Forward(x)
v := a.VProj.Forward(x)
// Reshape via AsStrided: [B, L, n_heads * head_dim] -> [B, n_heads, L, head_dim]
q = mlx.AsStrided(q, []int32{B, cfg.NumAttentionHeads, L, cfg.HeadDim},
[]int64{int64(L * cfg.NumAttentionHeads * cfg.HeadDim), int64(cfg.HeadDim), int64(cfg.NumAttentionHeads * cfg.HeadDim), 1}, 0)
k = mlx.AsStrided(k, []int32{B, cfg.NumKeyValueHeads, L, cfg.HeadDim},
[]int64{int64(L * cfg.NumKeyValueHeads * cfg.HeadDim), int64(cfg.HeadDim), int64(cfg.NumKeyValueHeads * cfg.HeadDim), 1}, 0)
v = mlx.AsStrided(v, []int32{B, cfg.NumKeyValueHeads, L, cfg.HeadDim},
[]int64{int64(L * cfg.NumKeyValueHeads * cfg.HeadDim), int64(cfg.HeadDim), int64(cfg.NumKeyValueHeads * cfg.HeadDim), 1}, 0)
offset := 0
if c != nil {
offset = c.Offset()
}
if a.YarnFreqs != nil {
if a.YarnMscale != 1.0 {
q = mlx.MulScalar(q, a.YarnMscale)
}
q = mlx.RoPEWithFreqs(q, a.YarnFreqs, int(cfg.HeadDim), false, 1.0, offset)
k = mlx.RoPEWithFreqs(k, a.YarnFreqs, int(cfg.HeadDim), false, 1.0, offset)
} else {
q = mlx.RoPE(q, int(cfg.HeadDim), false, cfg.RopeTheta, 1.0, offset)
k = mlx.RoPE(k, int(cfg.HeadDim), false, cfg.RopeTheta, 1.0, offset)
}
if c != nil {
k, v = c.Update(k, v, int(L))
}
out := mlx.ScaledDotProductAttentionWithSinks(q, k, v, cfg.Scale, maskMode, mask, a.Sinks)
out = mlx.Reshape(mlx.Transpose(out, 0, 2, 1, 3), B, L, cfg.NumAttentionHeads*cfg.HeadDim)
return a.OProj.Forward(out)
}
// CreateSlidingWindowMask creates a causal mask with sliding window
// Mirrors mlx-lm's create_causal_mask with window_size
func CreateSlidingWindowMask(seqLen, queryStart, keyStart, keyLen, windowSize int) *mlx.Array {
// Build mask aligned to actual cache length (may be rotated)
// rinds covers existing keys: [keyStart, keyStart+keyLen)
// linds covers new queries: [queryStart, queryStart+seqLen)
rinds := mlx.Arange(float32(keyStart), float32(keyStart+keyLen), 1) // [keyLen]
linds := mlx.Arange(float32(queryStart), float32(queryStart+seqLen), 1) // [seqLen]
linds = mlx.ExpandDims(linds, 1) // [seqLen, 1]
rinds = mlx.ExpandDims(rinds, 0) // [1, keyLen]
causalMask := mlx.GreaterEqual(linds, rinds) // [seqLen, keyLen]
windowLimit := mlx.AddScalar(rinds, float32(windowSize))
windowMask := mlx.LessArray(linds, windowLimit) // [seqLen, keyLen]
return mlx.LogicalAnd(causalMask, windowMask)
}
// MoE represents the Mixture of Experts SwiGLU layer with quantized experts.
type MoE struct {
Router *nn.Linear `weight:"mlp.router"`
TopK int32
HiddenSize int32
GroupSize int
Bits int
// Expert weights (loaded manually via sanitizeExpertWeights)
GateBlocks, GateScales, GateBias *mlx.Array
UpBlocks, UpScales, UpBias *mlx.Array
DownBlocks, DownScales, DownBias *mlx.Array
}
func (moe *MoE) Forward(x *mlx.Array, B, L int32) *mlx.Array {
logits := moe.Router.Forward(x)
neg := mlx.Neg(logits)
part := mlx.Argpartition(neg, int(moe.TopK)-1, -1)
topKIdx := mlx.Slice(part, []int32{0, 0, 0}, []int32{B, L, moe.TopK})
topKVal := mlx.TakeAlongAxis(logits, topKIdx, -1)
weights := mlx.Softmax(topKVal, -1)
xFlat := mlx.Reshape(x, B*L, 1, 1, moe.HiddenSize)
idxFlat := mlx.Reshape(topKIdx, B*L, moe.TopK)
doSort := B*L >= 64
var invOrder *mlx.Array
sorted := false
n := B * L * moe.TopK
if doSort {
idxAll := mlx.Flatten(idxFlat)
order := mlx.Argsort(idxAll, 0)
invOrder = mlx.Argsort(order, 0)
xFlat = mlx.ExpandDims(mlx.Take(mlx.Squeeze(xFlat, 1), mlx.FloorDivideScalar(order, moe.TopK), 0), 1)
idxFlat = mlx.Reshape(mlx.Take(idxAll, order, 0), n, 1)
sorted = true
}
gate := mlx.GatherQMM(xFlat, moe.GateBlocks, moe.GateScales, nil, nil, idxFlat, true, moe.GroupSize, moe.Bits, "mxfp4", sorted)
up := mlx.GatherQMM(xFlat, moe.UpBlocks, moe.UpScales, nil, nil, idxFlat, true, moe.GroupSize, moe.Bits, "mxfp4", sorted)
if moe.GateBias != nil {
gate = mlx.Add(gate, mlx.ExpandDims(mlx.Take(moe.GateBias, idxFlat, 0), 2))
}
if moe.UpBias != nil {
up = mlx.Add(up, mlx.ExpandDims(mlx.Take(moe.UpBias, idxFlat, 0), 2))
}
hidden := getCompiledSwiGLU().Call(gate, up)[0]
down := mlx.GatherQMM(hidden, moe.DownBlocks, moe.DownScales, nil, nil, idxFlat, true, moe.GroupSize, moe.Bits, "mxfp4", sorted)
if moe.DownBias != nil {
down = mlx.Add(down, mlx.ExpandDims(mlx.Take(moe.DownBias, idxFlat, 0), 2))
}
if doSort {
down = mlx.Reshape(mlx.Take(mlx.Squeeze(mlx.Squeeze(down, 2), 1), invOrder, 0), B*L, moe.TopK, moe.HiddenSize)
} else {
down = mlx.Squeeze(down, 2)
}
ewFlat := mlx.Reshape(weights, B*L, moe.TopK, 1)
return mlx.Reshape(mlx.Sum(mlx.Mul(down, ewFlat), 1, false), B, L, moe.HiddenSize)
}
type Block struct {
Attention *Attention
MLP *MoE
InputNorm *nn.RMSNorm `weight:"input_layernorm"`
PostAttnNorm *nn.RMSNorm `weight:"post_attention_layernorm"`
LayerType string // "sliding_attention" or "full_attention"
}
func (b *Block) Forward(x *mlx.Array, c cache.Cache, B, L int32, mask *mlx.Array, maskMode string, cfg *Config) *mlx.Array {
h := mlx.Add(x, b.Attention.Forward(b.InputNorm.Forward(x, cfg.RMSNormEps), c, B, L, mask, maskMode, cfg))
return mlx.Add(h, b.MLP.Forward(b.PostAttnNorm.Forward(h, cfg.RMSNormEps), B, L))
}
type Model struct {
EmbedTokens *nn.Embedding `weight:"model.embed_tokens"`
Layers []*Block `weight:"-"` // loaded manually due to MoE sanitization
Norm *nn.RMSNorm `weight:"model.norm"`
LMHead *nn.Linear `weight:"lm_head"`
tok *tokenizer.Tokenizer
*Config
}
func (m *Model) Tokenizer() *tokenizer.Tokenizer { return m.tok }
func (m *Model) NumLayers() int { return len(m.Layers) }
func (m *Model) VocabSize() int32 { return m.Config.VocabSize }
func (m *Model) NewCache(int32) []cache.Cache {
caches := make([]cache.Cache, len(m.Layers))
for i, layer := range m.Layers {
if layer.LayerType == "sliding_attention" && m.SlidingWindow > 0 {
caches[i] = cache.NewRotatingKVCache(int(m.SlidingWindow))
} else {
caches[i] = cache.NewKVCache()
}
}
return caches
}
func (m *Model) Forward(tokens *mlx.Array, caches []cache.Cache) *mlx.Array {
B, L := tokens.Shape()[0], tokens.Shape()[1]
x := m.EmbedTokens.Forward(tokens)
// Find representative cache indices for sliding window attention
swaIdx := -1
for i, layer := range m.Layers {
if layer.LayerType == "sliding_attention" {
swaIdx = i
break
}
}
// Create masks once at model level
var fullMask, swaMask *mlx.Array
var fullMaskMode, swaMaskMode string
if L > 1 {
fullMaskMode = "causal"
if swaIdx >= 0 && m.SlidingWindow > 0 && caches != nil {
c := caches[swaIdx]
offset := c.Offset()
windowSize := int(m.SlidingWindow)
cacheLen := min(int(L), windowSize)
if offset > 0 {
cacheLen = min(c.Len()+int(L), windowSize)
}
if int(L) > windowSize {
swaMask = CreateSlidingWindowMask(int(L), offset, offset+int(L)-cacheLen, cacheLen, windowSize)
} else {
swaMaskMode = "causal"
}
} else {
swaMaskMode = "causal"
}
}
for i, layer := range m.Layers {
var c cache.Cache
if caches != nil {
c = caches[i]
}
mask, maskMode := fullMask, fullMaskMode
if layer.LayerType == "sliding_attention" {
mask, maskMode = swaMask, swaMaskMode
}
x = layer.Forward(x, c, B, L, mask, maskMode, m.Config)
}
return m.LMHead.Forward(m.Norm.Forward(x, m.RMSNormEps))
}
// sanitizeExpertWeights splits merged gate_up weights into separate gate/up arrays.
// MXFP4 quantized weights require contiguous memory - strided views give wrong results.
func sanitizeExpertWeights(weights *safetensors.ModelWeights, prefix string) (moe *MoE) {
gateUpBlocks, _ := weights.GetTensor(prefix + ".mlp.experts.gate_up_proj_blocks")
gateUpScales, _ := weights.GetTensor(prefix + ".mlp.experts.gate_up_proj_scales")
gateUpBias, _ := weights.GetTensor(prefix + ".mlp.experts.gate_up_proj_bias")
downBlocks, _ := weights.GetTensor(prefix + ".mlp.experts.down_proj_blocks")
downScales, _ := weights.GetTensor(prefix + ".mlp.experts.down_proj_scales")
downBias, _ := weights.GetTensor(prefix + ".mlp.experts.down_proj_bias")
moe = &MoE{GroupSize: 32, Bits: 4, DownScales: downScales, DownBias: downBias}
if gateUpBlocks != nil {
gub := mlx.FlattenRange(mlx.View(gateUpBlocks, int(mlx.DtypeUint32)), -2, -1)
s := gub.Shape()
moe.GateBlocks = mlx.Contiguous(mlx.SliceStride(gub, []int32{0, 0, 0}, []int32{s[0], s[1], s[2]}, []int32{1, 2, 1}))
moe.UpBlocks = mlx.Contiguous(mlx.SliceStride(gub, []int32{0, 1, 0}, []int32{s[0], s[1], s[2]}, []int32{1, 2, 1}))
}
if gateUpScales != nil {
s := gateUpScales.Shape()
moe.GateScales = mlx.Contiguous(mlx.SliceStride(gateUpScales, []int32{0, 0, 0}, []int32{s[0], s[1], s[2]}, []int32{1, 2, 1}))
moe.UpScales = mlx.Contiguous(mlx.SliceStride(gateUpScales, []int32{0, 1, 0}, []int32{s[0], s[1], s[2]}, []int32{1, 2, 1}))
}
if gateUpBias != nil {
s := gateUpBias.Shape()
moe.GateBias = mlx.Contiguous(mlx.SliceStride(gateUpBias, []int32{0, 0}, []int32{s[0], s[1]}, []int32{1, 2}))
moe.UpBias = mlx.Contiguous(mlx.SliceStride(gateUpBias, []int32{0, 1}, []int32{s[0], s[1]}, []int32{1, 2}))
}
if downBlocks != nil {
moe.DownBlocks = mlx.FlattenRange(mlx.View(downBlocks, int(mlx.DtypeUint32)), -2, -1)
}
return moe
}
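// A minimal sketch of the de-interleave above, on a tiny [1, 4, 2] tensor
// whose rows alternate gate0, up0, gate1, up1 (assuming mlx.NewArray fills
// row-major): SliceStride with stride 2 along axis 1 selects each half, and
// Contiguous copies the strided view into the dense memory the MXFP4
// kernels require.
func deinterleaveSketch() (gate, up *mlx.Array) {
	merged := mlx.NewArray([]float32{
		1, 1, // gate0
		2, 2, // up0
		3, 3, // gate1
		4, 4, // up1
	}, []int32{1, 4, 2})
	s := merged.Shape()
	gate = mlx.Contiguous(mlx.SliceStride(merged, []int32{0, 0, 0}, []int32{s[0], s[1], s[2]}, []int32{1, 2, 1})) // rows 0, 2
	up = mlx.Contiguous(mlx.SliceStride(merged, []int32{0, 1, 0}, []int32{s[0], s[1], s[2]}, []int32{1, 2, 1}))   // rows 1, 3
	mlx.Eval(gate, up)
	return gate, up
}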
func Load(modelPath string) (*Model, error) {
data, err := os.ReadFile(filepath.Join(modelPath, "config.json"))
if err != nil {
return nil, fmt.Errorf("load config: %w", err)
}
var cfg Config
if err := json.Unmarshal(data, &cfg); err != nil {
return nil, fmt.Errorf("parse config: %w", err)
}
cfg.Scale = float32(1.0 / math.Sqrt(float64(cfg.HeadDim)))
weights, err := safetensors.LoadModelWeights(modelPath)
if err != nil {
return nil, fmt.Errorf("load weights: %w", err)
}
tok, err := tokenizer.Load(filepath.Join(modelPath, "tokenizer.json"))
if err != nil {
return nil, fmt.Errorf("load tokenizer: %w", err)
}
m := &Model{
Layers: make([]*Block, cfg.NumHiddenLayers),
Config: &cfg,
tok: tok,
}
// Load simple weights via struct tags
if err := safetensors.LoadModule(m, weights, ""); err != nil {
return nil, err
}
// Load layers with custom MoE handling
for i := int32(0); i < cfg.NumHiddenLayers; i++ {
prefix := fmt.Sprintf("model.layers.%d", i)
layer := &Block{}
if err := safetensors.LoadModule(layer, weights, prefix); err != nil {
return nil, fmt.Errorf("layer %d: %w", i, err)
}
// Initialize attention YaRN
layer.Attention.initYarn(&cfg)
// Load MoE with weight sanitization
moe := sanitizeExpertWeights(weights, prefix)
moe.Router = layer.MLP.Router // Router was loaded by LoadModule
moe.TopK = cfg.NumExpertsPerTok
moe.HiddenSize = cfg.HiddenSize
layer.MLP = moe
// Set layer type
layer.LayerType = "full_attention"
if int(i) < len(cfg.LayerTypes) {
layer.LayerType = cfg.LayerTypes[i]
}
m.Layers[i] = layer
}
// Release safetensors BEFORE eval: the lazy arrays have already captured
// their data, so freeing the mmap here reduces peak memory during
// materialization.
weights.ReleaseAll()
mlx.Eval(mlx.Collect(m)...)
return m, nil
}
func (m *Model) MaxContextLength() int32 {
if m.RopeScaling != nil && m.RopeScaling.OriginalMaxPositionEmbeddings > 0 {
return m.RopeScaling.OriginalMaxPositionEmbeddings
}
return 131072
}


@@ -1,150 +0,0 @@
package llama
import (
"encoding/json"
"fmt"
"math"
"os"
"path/filepath"
"github.com/ollama/ollama/x/imagegen/cache"
"github.com/ollama/ollama/x/imagegen/mlx"
"github.com/ollama/ollama/x/imagegen/nn"
"github.com/ollama/ollama/x/imagegen/safetensors"
"github.com/ollama/ollama/x/imagegen/tokenizer"
)
type Config struct {
HiddenSize int32 `json:"hidden_size"`
NumHiddenLayers int32 `json:"num_hidden_layers"`
IntermediateSize int32 `json:"intermediate_size"`
NumAttentionHeads int32 `json:"num_attention_heads"`
NumKeyValueHeads int32 `json:"num_key_value_heads"`
VocabSize int32 `json:"vocab_size"`
RMSNormEps float32 `json:"rms_norm_eps"`
RopeTheta float32 `json:"rope_theta"`
MaxPositionEmbeddings int32 `json:"max_position_embeddings"`
HeadDim int32 `json:"-"`
Scale float32 `json:"-"`
}
type Model struct {
EmbedTokens *nn.Embedding `weight:"model.embed_tokens"`
Layers []*Layer `weight:"model.layers"`
Norm *nn.RMSNorm `weight:"model.norm"`
Output *nn.Linear `weight:"lm_head,optional"`
tok *tokenizer.Tokenizer
*Config
}
type Layer struct {
Attention *Attention
MLP *MLP
AttentionNorm *nn.RMSNorm `weight:"input_layernorm"`
MLPNorm *nn.RMSNorm `weight:"post_attention_layernorm"`
}
type Attention struct {
QProj *nn.Linear `weight:"self_attn.q_proj"`
KProj *nn.Linear `weight:"self_attn.k_proj"`
VProj *nn.Linear `weight:"self_attn.v_proj"`
OProj *nn.Linear `weight:"self_attn.o_proj"`
}
type MLP struct {
GateProj *nn.Linear `weight:"mlp.gate_proj"`
UpProj *nn.Linear `weight:"mlp.up_proj"`
DownProj *nn.Linear `weight:"mlp.down_proj"`
}
func Load(modelPath string) (*Model, error) {
data, err := os.ReadFile(filepath.Join(modelPath, "config.json"))
if err != nil {
return nil, fmt.Errorf("load config: %w", err)
}
var cfg Config
if err := json.Unmarshal(data, &cfg); err != nil {
return nil, fmt.Errorf("parse config: %w", err)
}
cfg.HeadDim = cfg.HiddenSize / cfg.NumAttentionHeads
cfg.Scale = float32(1.0 / math.Sqrt(float64(cfg.HeadDim)))
weights, err := safetensors.LoadModelWeights(modelPath)
if err != nil {
return nil, fmt.Errorf("load weights: %w", err)
}
tok, err := tokenizer.Load(filepath.Join(modelPath, "tokenizer.json"))
if err != nil {
return nil, fmt.Errorf("load tokenizer: %w", err)
}
m := &Model{
Layers: make([]*Layer, cfg.NumHiddenLayers),
Config: &cfg,
tok: tok,
}
if err := safetensors.LoadModule(m, weights, ""); err != nil {
return nil, err
}
m.Output = nn.NewLinear(m.EmbedTokens.Weight, nil) // tied embeddings: reuse the input embedding matrix as the output head
mlx.Eval(mlx.Collect(m)...)
weights.ReleaseAll()
return m, nil
}
func (m *Model) Forward(tokens *mlx.Array, caches []cache.Cache) *mlx.Array {
B, L := tokens.Shape()[0], tokens.Shape()[1]
h := m.EmbedTokens.Forward(tokens)
for i, layer := range m.Layers {
h = layer.Forward(h, caches[i], B, L, m.Config)
}
return m.Output.Forward(m.Norm.Forward(h, m.RMSNormEps))
}
func (l *Layer) Forward(x *mlx.Array, c cache.Cache, B, L int32, cfg *Config) *mlx.Array {
h := mlx.Add(x, l.Attention.Forward(l.AttentionNorm.Forward(x, cfg.RMSNormEps), c, B, L, cfg))
return mlx.Add(h, l.MLP.Forward(l.MLPNorm.Forward(h, cfg.RMSNormEps)))
}
func (a *Attention) Forward(x *mlx.Array, c cache.Cache, B, L int32, cfg *Config) *mlx.Array {
q := a.QProj.Forward(x)
k := a.KProj.Forward(x)
v := a.VProj.Forward(x)
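// AsStrided reinterprets the packed [B, L, heads*head_dim] projections as
// [B, heads, L, head_dim] without copying: the L axis strides over all heads
// (heads*head_dim elements) while the head axis strides by head_dim.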
q = mlx.AsStrided(q, []int32{B, cfg.NumAttentionHeads, L, cfg.HeadDim},
[]int64{int64(L * cfg.NumAttentionHeads * cfg.HeadDim), int64(cfg.HeadDim), int64(cfg.NumAttentionHeads * cfg.HeadDim), 1}, 0)
k = mlx.AsStrided(k, []int32{B, cfg.NumKeyValueHeads, L, cfg.HeadDim},
[]int64{int64(L * cfg.NumKeyValueHeads * cfg.HeadDim), int64(cfg.HeadDim), int64(cfg.NumKeyValueHeads * cfg.HeadDim), 1}, 0)
v = mlx.AsStrided(v, []int32{B, cfg.NumKeyValueHeads, L, cfg.HeadDim},
[]int64{int64(L * cfg.NumKeyValueHeads * cfg.HeadDim), int64(cfg.HeadDim), int64(cfg.NumKeyValueHeads * cfg.HeadDim), 1}, 0)
q = mlx.RoPE(q, int(cfg.HeadDim), false, cfg.RopeTheta, 1.0, c.Offset())
k = mlx.RoPE(k, int(cfg.HeadDim), false, cfg.RopeTheta, 1.0, c.Offset())
k, v = c.Update(k, v, int(L))
out := mlx.ScaledDotProductAttention(q, k, v, cfg.Scale, L > 1)
out = mlx.Reshape(mlx.Transpose(out, 0, 2, 1, 3), B, L, cfg.NumAttentionHeads*cfg.HeadDim)
return a.OProj.Forward(out)
}
func (m *MLP) Forward(x *mlx.Array) *mlx.Array {
return m.DownProj.Forward(mlx.Mul(mlx.SiLU(m.GateProj.Forward(x)), m.UpProj.Forward(x)))
}
// Interface methods
func (m *Model) NumLayers() int { return len(m.Layers) }
func (m *Model) MaxContextLength() int32 { return m.MaxPositionEmbeddings }
func (m *Model) VocabSize() int32 { return m.Config.VocabSize }
func (m *Model) Tokenizer() *tokenizer.Tokenizer { return m.tok }
func (m *Model) NewCache(maxSeqLen int32) []cache.Cache {
caches := make([]cache.Cache, len(m.Layers))
for i := range caches {
caches[i] = cache.NewKVCache()
}
return caches
}


@@ -1,64 +0,0 @@
package qwen_image
import (
"os"
"testing"
"github.com/ollama/ollama/x/imagegen/mlx"
)
// TestPipelineOutput runs the full pipeline (integration test).
// Skips if model weights not found. Requires ~50GB VRAM.
func TestPipelineOutput(t *testing.T) {
modelPath := "../../../weights/Qwen-Image-2512"
if _, err := os.Stat(modelPath); os.IsNotExist(err) {
t.Skip("Skipping: model weights not found at " + modelPath)
}
// Load model
pm, err := LoadPersistent(modelPath)
if err != nil {
t.Skipf("Skipping: failed to load model: %v", err)
}
// Run 2-step pipeline (minimum for stable scheduler)
cfg := &GenerateConfig{
Prompt: "a cat",
Width: 256,
Height: 256,
Steps: 2,
Seed: 42,
}
output, err := pm.GenerateFromConfig(cfg)
if err != nil {
t.Fatalf("Pipeline failed: %v", err)
}
mlx.Eval(output)
// Verify output shape [1, C, H, W]
shape := output.Shape()
if len(shape) != 4 {
t.Errorf("Expected 4D output, got %v", shape)
}
if shape[0] != 1 || shape[1] != 3 || shape[2] != cfg.Height || shape[3] != cfg.Width {
t.Errorf("Shape mismatch: got %v, expected [1, 3, %d, %d]", shape, cfg.Height, cfg.Width)
}
// Verify values in expected range [0, 1]
data := output.Data()
minVal, maxVal := float32(1.0), float32(0.0)
for _, v := range data {
if v < minVal {
minVal = v
}
if v > maxVal {
maxVal = v
}
}
t.Logf("Output range: [%.4f, %.4f]", minVal, maxVal)
if minVal < -0.1 || maxVal > 1.1 {
t.Errorf("Output values out of range: [%.4f, %.4f]", minVal, maxVal)
}
}


File diff suppressed because it is too large


@@ -1,348 +0,0 @@
// Package qwen_image implements the Qwen-Image diffusion transformer model.
package qwen_image
import (
"context"
"fmt"
"path/filepath"
"time"
"github.com/ollama/ollama/x/imagegen/cache"
"github.com/ollama/ollama/x/imagegen/mlx"
"github.com/ollama/ollama/x/imagegen/tokenizer"
)
// GenerateConfig holds all options for image generation.
type GenerateConfig struct {
Prompt string
NegativePrompt string // Empty = no CFG
CFGScale float32 // Only used if NegativePrompt is set (default: 4.0)
Width int32 // Image width (default: 1024)
Height int32 // Image height (default: 1024)
Steps int // Denoising steps (default: 30)
Seed int64 // Random seed
Progress ProgressFunc // Optional progress callback
// Layer caching (DeepCache/Learning-to-Cache speedup)
LayerCache bool // Enable layer caching (default: false)
CacheInterval int // Refresh cache every N steps (default: 3)
CacheLayers int // Number of shallow layers to cache (default: 25)
}
// ProgressFunc is called during generation with step progress.
type ProgressFunc func(step, totalSteps int)
// Model represents a Qwen-Image diffusion model.
type Model struct {
ModelPath string
Tokenizer *tokenizer.Tokenizer
TextEncoder *Qwen25VL
Transformer *Transformer
VAEDecoder *VAEDecoder
}
// Load loads the Qwen-Image model from a directory.
func (m *Model) Load(modelPath string) error {
fmt.Println("Loading Qwen-Image model...")
start := time.Now()
if mlx.GPUIsAvailable() {
mlx.SetDefaultDeviceGPU()
mlx.EnableCompile()
}
m.ModelPath = modelPath
// Load tokenizer
fmt.Print(" Loading tokenizer... ")
tokenizerPath := filepath.Join(modelPath, "tokenizer")
tok, err := tokenizer.Load(tokenizerPath)
if err != nil {
return fmt.Errorf("tokenizer: %w", err)
}
m.Tokenizer = tok
fmt.Println("✓")
// Load text encoder (Qwen2.5-VL in text-only mode - skip vision tower for efficiency)
m.TextEncoder = &Qwen25VL{}
if err := m.TextEncoder.LoadTextOnly(filepath.Join(modelPath, "text_encoder")); err != nil {
return fmt.Errorf("text encoder: %w", err)
}
mlx.Eval(mlx.Collect(m.TextEncoder)...)
fmt.Printf(" (%.1f GB, peak %.1f GB)\n",
float64(mlx.MetalGetActiveMemory())/(1024*1024*1024),
float64(mlx.MetalGetPeakMemory())/(1024*1024*1024))
// Load transformer
m.Transformer = &Transformer{}
if err := m.Transformer.Load(filepath.Join(modelPath, "transformer")); err != nil {
return fmt.Errorf("transformer: %w", err)
}
mlx.Eval(mlx.Collect(m.Transformer)...)
fmt.Printf(" (%.1f GB, peak %.1f GB)\n",
float64(mlx.MetalGetActiveMemory())/(1024*1024*1024),
float64(mlx.MetalGetPeakMemory())/(1024*1024*1024))
// Load VAE decoder
m.VAEDecoder = &VAEDecoder{}
if err := m.VAEDecoder.Load(filepath.Join(modelPath, "vae")); err != nil {
return fmt.Errorf("VAE decoder: %w", err)
}
mlx.Eval(mlx.Collect(m.VAEDecoder)...)
fmt.Printf(" (%.1f GB, peak %.1f GB)\n",
float64(mlx.MetalGetActiveMemory())/(1024*1024*1024),
float64(mlx.MetalGetPeakMemory())/(1024*1024*1024))
mem := mlx.MetalGetActiveMemory()
peak := mlx.MetalGetPeakMemory()
fmt.Printf(" Loaded in %.2fs (%.1f GB active, %.1f GB peak)\n",
time.Since(start).Seconds(),
float64(mem)/(1024*1024*1024),
float64(peak)/(1024*1024*1024))
return nil
}
// Generate creates an image from a prompt.
func (m *Model) Generate(prompt string, width, height int32, steps int, seed int64) (*mlx.Array, error) {
return m.GenerateFromConfig(&GenerateConfig{
Prompt: prompt,
Width: width,
Height: height,
Steps: steps,
Seed: seed,
})
}
// GenerateWithProgress creates an image with progress callback.
func (m *Model) GenerateWithProgress(prompt string, width, height int32, steps int, seed int64, progress ProgressFunc) (*mlx.Array, error) {
return m.GenerateFromConfig(&GenerateConfig{
Prompt: prompt,
Width: width,
Height: height,
Steps: steps,
Seed: seed,
Progress: progress,
})
}
// GenerateWithCFG creates an image with classifier-free guidance.
func (m *Model) GenerateWithCFG(prompt, negativePrompt string, width, height int32, steps int, seed int64, cfgScale float32, progress ProgressFunc) (*mlx.Array, error) {
return m.GenerateFromConfig(&GenerateConfig{
Prompt: prompt,
NegativePrompt: negativePrompt,
CFGScale: cfgScale,
Width: width,
Height: height,
Steps: steps,
Seed: seed,
Progress: progress,
})
}
// GenerateFromConfig generates an image using the unified config struct.
func (m *Model) GenerateFromConfig(cfg *GenerateConfig) (*mlx.Array, error) {
start := time.Now()
result, err := m.generate(cfg)
if err != nil {
return nil, err
}
if cfg.NegativePrompt != "" {
fmt.Printf("Generated with CFG (scale=%.1f) in %.2fs (%d steps)\n", cfg.CFGScale, time.Since(start).Seconds(), cfg.Steps)
} else {
fmt.Printf("Generated in %.2fs (%d steps)\n", time.Since(start).Seconds(), cfg.Steps)
}
return result, nil
}
// GenerateImage implements model.ImageModel interface.
func (m *Model) GenerateImage(ctx context.Context, prompt string, width, height int32, steps int, seed int64) (*mlx.Array, error) {
return m.Generate(prompt, width, height, steps, seed)
}
// generate is the internal denoising pipeline.
func (m *Model) generate(cfg *GenerateConfig) (*mlx.Array, error) {
// Apply defaults
if cfg.Width <= 0 {
cfg.Width = 1024
}
if cfg.Height <= 0 {
cfg.Height = 1024
}
if cfg.Steps <= 0 {
cfg.Steps = 30
}
if cfg.CFGScale <= 0 {
cfg.CFGScale = 4.0
}
if cfg.CacheInterval <= 0 {
cfg.CacheInterval = 3
}
if cfg.CacheLayers <= 0 {
cfg.CacheLayers = 25 // ~42% of 60 layers (similar ratio to Z-Image's 15/38)
}
useCFG := cfg.NegativePrompt != ""
tcfg := m.Transformer.Config
latentH := cfg.Height / 8
latentW := cfg.Width / 8
pH := latentH / tcfg.PatchSize
pW := latentW / tcfg.PatchSize
imgSeqLen := pH * pW
// Text encoding
var posEmb, negEmb *mlx.Array
{
posEmb = m.TextEncoder.EncodePrompt(m.Tokenizer, cfg.Prompt)
if useCFG {
negEmb = m.TextEncoder.EncodePrompt(m.Tokenizer, cfg.NegativePrompt)
mlx.Keep(posEmb, negEmb)
mlx.Eval(posEmb, negEmb)
} else {
mlx.Keep(posEmb)
mlx.Eval(posEmb)
}
}
// Pad sequences to same length for CFG
txtLen := posEmb.Shape()[1]
if useCFG {
negLen := negEmb.Shape()[1]
if negLen > txtLen {
txtLen = negLen
}
if posEmb.Shape()[1] < txtLen {
posEmb = padSequence(posEmb, txtLen)
}
if negEmb.Shape()[1] < txtLen {
negEmb = padSequence(negEmb, txtLen)
}
mlx.Keep(posEmb, negEmb)
}
// Scheduler
scheduler := NewFlowMatchScheduler(DefaultSchedulerConfig())
scheduler.SetTimesteps(cfg.Steps, imgSeqLen)
// Init latents [B, C, T, H, W]
var latents *mlx.Array
{
latents = scheduler.InitNoise([]int32{1, tcfg.OutChannels, 1, latentH, latentW}, cfg.Seed)
mlx.Eval(latents)
}
// RoPE cache
var ropeCache *RoPECache
{
ropeCache = PrepareRoPE(pH, pW, txtLen, tcfg.AxesDimsRope)
mlx.Keep(ropeCache.ImgFreqs, ropeCache.TxtFreqs)
mlx.Eval(ropeCache.ImgFreqs)
}
// Layer cache for DeepCache/Learning-to-Cache speedup
var stepCache *cache.StepCache
if cfg.LayerCache {
stepCache = cache.NewStepCache(cfg.CacheLayers)
fmt.Printf(" Layer caching: %d layers, refresh every %d steps\n", cfg.CacheLayers, cfg.CacheInterval)
}
// Denoising loop
for i := 0; i < cfg.Steps; i++ {
stepStart := time.Now()
if cfg.Progress != nil {
cfg.Progress(i+1, cfg.Steps)
}
t := scheduler.Timesteps[i]
timestep := mlx.ToBFloat16(mlx.NewArray([]float32{t}, []int32{1}))
// Squeeze temporal dim: [B, C, T, H, W] -> [B, C, H, W]
latents2D := mlx.Squeeze(latents, 2)
patches := PackLatents(latents2D, tcfg.PatchSize)
var output *mlx.Array
if useCFG {
// True CFG: run twice and combine with norm rescaling
// Note: layer caching with CFG is not supported yet (would need 2 caches)
posOutput := m.Transformer.Forward(patches, posEmb, timestep, ropeCache.ImgFreqs, ropeCache.TxtFreqs)
negOutput := m.Transformer.Forward(patches, negEmb, timestep, ropeCache.ImgFreqs, ropeCache.TxtFreqs)
diff := mlx.Sub(posOutput, negOutput)
scaledDiff := mlx.MulScalar(diff, cfg.CFGScale)
combPred := mlx.Add(negOutput, scaledDiff)
// Norm rescaling: rescale combined prediction to match conditional prediction's norm
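// (i.e. combined = neg + cfgScale*(pos - neg); rescaling its channel-axis
// norm back to the conditional prediction's keeps the CFG extrapolation
// from inflating activation magnitudes)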
condNorm := mlx.Sqrt(mlx.Sum(mlx.Square(posOutput), -1, true))
combNorm := mlx.Sqrt(mlx.Sum(mlx.Square(combPred), -1, true))
output = mlx.Mul(combPred, mlx.Div(condNorm, combNorm))
} else if stepCache != nil {
output = m.Transformer.ForwardWithCache(patches, posEmb, timestep, ropeCache.ImgFreqs, ropeCache.TxtFreqs,
stepCache, i, cfg.CacheInterval, cfg.CacheLayers)
} else {
output = m.Transformer.Forward(patches, posEmb, timestep, ropeCache.ImgFreqs, ropeCache.TxtFreqs)
}
noisePred := UnpackLatents(output, latentH, latentW, tcfg.PatchSize)
oldLatents := latents
latents = scheduler.Step(noisePred, latents, i)
// Keep cached arrays alive across cleanup
if stepCache != nil {
mlx.Keep(stepCache.Arrays()...)
}
mlx.Eval(latents)
oldLatents.Free()
activeMem := float64(mlx.MetalGetActiveMemory()) / (1024 * 1024 * 1024)
peakMem := float64(mlx.MetalGetPeakMemory()) / (1024 * 1024 * 1024)
fmt.Printf(" Step %d/%d: t=%.4f (%.2fs) [%.1f GB active, %.1f GB peak]\n", i+1, cfg.Steps, t, time.Since(stepStart).Seconds(), activeMem, peakMem)
}
// Free denoising temporaries before VAE decode
posEmb.Free()
if negEmb != nil {
negEmb.Free()
}
ropeCache.ImgFreqs.Free()
ropeCache.TxtFreqs.Free()
if stepCache != nil {
stepCache.Free()
}
// VAE decode (Decode manages its own pools for staged memory)
decoded := m.VAEDecoder.Decode(latents)
latents.Free()
// Post-process: squeeze temporal dim and rescale to [0, 1]
{
decoded = mlx.Squeeze(decoded, 2)
decoded = mlx.AddScalar(decoded, 1.0)
decoded = mlx.DivScalar(decoded, 2.0)
mlx.Eval(decoded)
}
fmt.Printf(" Peak memory: %.2f GB\n", float64(mlx.MetalGetPeakMemory())/(1024*1024*1024))
return decoded, nil
}
// padSequence pads a sequence tensor to the target length with zeros
func padSequence(x *mlx.Array, targetLen int32) *mlx.Array {
shape := x.Shape()
currentLen := shape[1]
if currentLen >= targetLen {
return x
}
padLen := targetLen - currentLen
// Pad on sequence dimension (axis 1)
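// (widths come in (low, high) pairs per axis: none on batch, padLen after
// the sequence, none on features)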
return mlx.Pad(x, []int32{0, 0, 0, padLen, 0, 0})
}
// LoadPersistent is an alias for backward compatibility.
// Use m := &Model{}; m.Load(path) instead.
func LoadPersistent(modelPath string) (*Model, error) {
m := &Model{}
if err := m.Load(modelPath); err != nil {
return nil, err
}
return m, nil
}
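// A minimal end-to-end usage sketch under the defaults above (the weights
// path here is hypothetical):
func exampleGenerate() error {
	m, err := LoadPersistent("weights/Qwen-Image")
	if err != nil {
		return err
	}
	img, err := m.Generate("a cat wearing a tiny hat", 1024, 1024, 30, 42)
	if err != nil {
		return err
	}
	mlx.Eval(img) // [1, 3, 1024, 1024], values in [0, 1]
	return nil
}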


@@ -1,216 +0,0 @@
package qwen_image
import (
"math"
"github.com/ollama/ollama/x/imagegen/mlx"
)
// SchedulerConfig holds FlowMatchEulerDiscreteScheduler configuration
type SchedulerConfig struct {
NumTrainTimesteps int32 `json:"num_train_timesteps"` // 1000
BaseShift float32 `json:"base_shift"` // 0.5
MaxShift float32 `json:"max_shift"` // 0.9
BaseImageSeqLen int32 `json:"base_image_seq_len"` // 256
MaxImageSeqLen int32 `json:"max_image_seq_len"` // 8192
ShiftTerminal float32 `json:"shift_terminal"` // 0.02
UseDynamicShift bool `json:"use_dynamic_shifting"` // true
}
// DefaultSchedulerConfig returns config for FlowMatchEulerDiscreteScheduler
func DefaultSchedulerConfig() *SchedulerConfig {
return &SchedulerConfig{
NumTrainTimesteps: 1000,
BaseShift: 0.5,
MaxShift: 0.9, // Matches scheduler_config.json
BaseImageSeqLen: 256,
MaxImageSeqLen: 8192,
ShiftTerminal: 0.02,
UseDynamicShift: true,
}
}
// FlowMatchScheduler implements the Flow Match Euler discrete scheduler
type FlowMatchScheduler struct {
Config *SchedulerConfig
Timesteps []float32
Sigmas []float32
NumSteps int
}
// NewFlowMatchScheduler creates a new scheduler
func NewFlowMatchScheduler(cfg *SchedulerConfig) *FlowMatchScheduler {
return &FlowMatchScheduler{
Config: cfg,
}
}
// CalculateShift computes the dynamic shift based on image sequence length
// This matches Python's calculate_shift function
func CalculateShift(imageSeqLen int32, baseSeqLen int32, maxSeqLen int32, baseShift float32, maxShift float32) float32 {
m := (maxShift - baseShift) / float32(maxSeqLen-baseSeqLen)
b := baseShift - m*float32(baseSeqLen)
mu := float32(imageSeqLen)*m + b
return mu
}
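// With the defaults (256 -> 0.5, 8192 -> 0.9), imageSeqLen = 4096 gives
// mu ≈ 0.6935, the golden value checked in the scheduler tests.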
// SetTimesteps sets up the scheduler for the given number of inference steps
// Matches Python diffusers FlowMatchEulerDiscreteScheduler behavior:
// 1. Create sigmas from sigma_max to sigma_min (linspace)
// 2. Apply time_shift with mu (if dynamic shifting)
// 3. Apply stretch_shift_to_terminal to make final value = shift_terminal
func (s *FlowMatchScheduler) SetTimesteps(numSteps int, imageSeqLen int32) {
s.NumSteps = numSteps
// Calculate mu for dynamic shifting
var mu float32
if s.Config.UseDynamicShift {
mu = CalculateShift(
imageSeqLen,
s.Config.BaseImageSeqLen,
s.Config.MaxImageSeqLen,
s.Config.BaseShift,
s.Config.MaxShift,
)
}
// Step 1: Create sigmas from 1.0 to 1/num_steps
// Python (pipeline_qwenimage.py:639):
// sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
// This gives sigmas from 1.0 to 1/30 = 0.033 for 30 steps
sigmas := make([]float32, numSteps)
sigmaMax := float32(1.0)
sigmaMin := 1.0 / float32(numSteps) // 1/30 = 0.033 for 30 steps
if numSteps == 1 {
sigmas[0] = sigmaMax
} else {
for i := 0; i < numSteps; i++ {
sigmas[i] = sigmaMax + float32(i)*(sigmaMin-sigmaMax)/float32(numSteps-1)
}
}
// Step 2: Apply time shift if using dynamic shifting
if s.Config.UseDynamicShift && mu != 0 {
for i := range sigmas {
sigmas[i] = s.timeShift(mu, sigmas[i])
}
}
// Step 3: Apply stretch_shift_to_terminal
if s.Config.ShiftTerminal > 0 {
sigmas = s.stretchShiftToTerminal(sigmas)
}
// Step 4: Append terminal sigma (0) and store
// Note: Python's scheduler.timesteps are sigmas*1000, but the pipeline divides by 1000
// before passing to transformer. We skip both steps and just use sigmas directly.
s.Sigmas = make([]float32, numSteps+1)
s.Timesteps = make([]float32, numSteps+1)
for i := 0; i < numSteps; i++ {
s.Sigmas[i] = sigmas[i]
s.Timesteps[i] = sigmas[i]
}
s.Sigmas[numSteps] = 0.0
s.Timesteps[numSteps] = 0.0
}
// stretchShiftToTerminal stretches and shifts the timestep schedule
// so the final value equals shift_terminal (matches Python behavior)
func (s *FlowMatchScheduler) stretchShiftToTerminal(sigmas []float32) []float32 {
if len(sigmas) == 0 {
return sigmas
}
// one_minus_z = 1 - t
// scale_factor = one_minus_z[-1] / (1 - shift_terminal)
// stretched_t = 1 - (one_minus_z / scale_factor)
lastSigma := sigmas[len(sigmas)-1]
scaleFactor := (1.0 - lastSigma) / (1.0 - s.Config.ShiftTerminal)
// Handle edge case: if scaleFactor is 0 or near 0, skip stretch
// This happens when lastSigma ≈ 1.0 (e.g., single step with timeshift)
if scaleFactor < 1e-6 {
return sigmas
}
result := make([]float32, len(sigmas))
for i, t := range sigmas {
oneMinusZ := 1.0 - t
result[i] = 1.0 - (oneMinusZ / scaleFactor)
}
return result
}
// timeShift applies the dynamic time shift (exponential)
// exp(mu) / (exp(mu) + (1/t - 1))
func (s *FlowMatchScheduler) timeShift(mu float32, t float32) float32 {
if t <= 0 {
return 0
}
expMu := float32(math.Exp(float64(mu)))
return expMu / (expMu + (1.0/t - 1.0))
}
// Step performs one denoising step
// modelOutput: predicted velocity from the transformer
// sample: current noisy sample
// timestepIdx: current timestep index
func (s *FlowMatchScheduler) Step(modelOutput, sample *mlx.Array, timestepIdx int) *mlx.Array {
// Get current and next sigma
sigma := s.Sigmas[timestepIdx]
sigmaNext := s.Sigmas[timestepIdx+1]
// Euler step: x_{t-dt} = x_t + (sigma_next - sigma) * v_t
dt := sigmaNext - sigma
// Upcast to float32 to avoid precision issues (matches Python diffusers)
sampleF32 := mlx.AsType(sample, mlx.DtypeFloat32)
modelOutputF32 := mlx.AsType(modelOutput, mlx.DtypeFloat32)
scaledOutput := mlx.MulScalar(modelOutputF32, dt)
result := mlx.Add(sampleF32, scaledOutput)
// Cast back to original dtype
return mlx.ToBFloat16(result)
}
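// Worked example from the 30-step schedule above: sigma[0] = 1.000000 and
// sigma[1] = 0.982251, so dt ≈ -0.0177 and x1 = x0 - 0.0177*v0; each step
// nudges the sample along the predicted velocity toward sigma = 0.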
// GetTimestep returns the timestep value at the given index
func (s *FlowMatchScheduler) GetTimestep(idx int) float32 {
if idx < len(s.Timesteps) {
return s.Timesteps[idx]
}
return 0.0
}
// InitNoise creates initial noise for sampling in unpacked format [B, C, T, H, W]
func (s *FlowMatchScheduler) InitNoise(shape []int32, seed int64) *mlx.Array {
return mlx.RandomNormal(shape, uint64(seed))
}
// InitNoisePacked creates initial noise directly in packed format [B, L, C*4]
// This matches how Python diffusers generates noise - directly in packed space.
// Generating in unpacked format and then packing produces different spatial
// correlation structure, which affects model output quality.
func (s *FlowMatchScheduler) InitNoisePacked(batchSize, seqLen, channels int32, seed int64) *mlx.Array {
shape := []int32{batchSize, seqLen, channels}
return mlx.RandomNormal(shape, uint64(seed))
}
// GetLatentShape returns the latent shape for a given image size
// For qwen_image: VAE downscale is 8x (spatial), latent has 16 channels
func GetLatentShape(batchSize, height, width int32) []int32 {
latentH := height / 8
latentW := width / 8
return []int32{batchSize, 16, 1, latentH, latentW} // [B, C, T, H, W]
}
// GetPatchedLatentShape returns the patchified latent shape
// After patchification: [B, L, C*patch_size^2] where L = H/2 * W/2
func GetPatchedLatentShape(batchSize, height, width, patchSize int32) []int32 {
latentH := height / 8
latentW := width / 8
pH := latentH / patchSize
pW := latentW / patchSize
inChannels := int32(64) // 16 * patch_size^2
return []int32{batchSize, pH * pW, inChannels}
}
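// A minimal usage sketch, printing the shifted sigma schedule for a
// 1024x1024 image (128x128 latent, patch size 2, so imgSeqLen = 4096);
// assumes fmt is imported:
func schedulerSketch() {
	s := NewFlowMatchScheduler(DefaultSchedulerConfig())
	s.SetTimesteps(30, 4096)
	for i, sigma := range s.Sigmas {
		fmt.Printf("step %2d: sigma=%.4f\n", i, sigma)
	}
}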


@@ -1,133 +0,0 @@
package qwen_image
import (
"math"
"testing"
)
// TestSchedulerSetTimesteps verifies scheduler sigmas match Python diffusers reference.
// Golden values generated via:
//
// python3 -c "
// from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
// import numpy as np
// s = FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000, base_shift=0.5, max_shift=0.9,
// base_image_seq_len=256, max_image_seq_len=8192, shift_terminal=0.02, use_dynamic_shifting=True)
// mu = 4096 * (0.9-0.5)/(8192-256) + 0.5 - (0.9-0.5)/(8192-256)*256
// sigmas = np.linspace(1.0, 1.0/30, 30)
// s.set_timesteps(sigmas=sigmas, mu=mu)
// print(s.sigmas.numpy())"
func TestSchedulerSetTimesteps(t *testing.T) {
cfg := DefaultSchedulerConfig()
scheduler := NewFlowMatchScheduler(cfg)
scheduler.SetTimesteps(30, 4096)
// Golden values from Python diffusers (first 3, last 3 before terminal)
wantFirst := []float32{1.000000, 0.982251, 0.963889}
wantLast := []float32{0.142924, 0.083384, 0.020000}
// Check first 3
for i, want := range wantFirst {
got := scheduler.Sigmas[i]
if abs32(got-want) > 1e-4 {
t.Errorf("sigma[%d]: got %v, want %v", i, got, want)
}
}
// Check last 3 (indices 27, 28, 29)
for i, want := range wantLast {
idx := 27 + i
got := scheduler.Sigmas[idx]
if abs32(got-want) > 1e-4 {
t.Errorf("sigma[%d]: got %v, want %v", idx, got, want)
}
}
// Check terminal is 0
if scheduler.Sigmas[30] != 0.0 {
t.Errorf("terminal sigma: got %v, want 0", scheduler.Sigmas[30])
}
// Check length
if len(scheduler.Sigmas) != 31 {
t.Errorf("sigmas length: got %d, want 31", len(scheduler.Sigmas))
}
}
// TestSchedulerProperties tests mathematical invariants of the scheduler.
func TestSchedulerProperties(t *testing.T) {
cfg := DefaultSchedulerConfig()
scheduler := NewFlowMatchScheduler(cfg)
scheduler.SetTimesteps(30, 4096)
// Property: sigmas monotonically decreasing
for i := 1; i < len(scheduler.Sigmas); i++ {
if scheduler.Sigmas[i] > scheduler.Sigmas[i-1] {
t.Errorf("sigmas not monotonically decreasing at %d: %v > %v",
i, scheduler.Sigmas[i], scheduler.Sigmas[i-1])
}
}
// Property: first sigma should be ~1.0 (with time shift)
if scheduler.Sigmas[0] < 0.9 || scheduler.Sigmas[0] > 1.01 {
t.Errorf("first sigma out of expected range [0.9, 1.01]: %v", scheduler.Sigmas[0])
}
// Property: terminal sigma should be exactly 0
if scheduler.Sigmas[len(scheduler.Sigmas)-1] != 0.0 {
t.Errorf("terminal sigma should be 0, got %v", scheduler.Sigmas[len(scheduler.Sigmas)-1])
}
// Property: last non-terminal sigma should be shift_terminal (0.02)
lastNonTerminal := scheduler.Sigmas[len(scheduler.Sigmas)-2]
if abs32(lastNonTerminal-0.02) > 1e-5 {
t.Errorf("last non-terminal sigma should be 0.02, got %v", lastNonTerminal)
}
// Property: length = steps + 1
if len(scheduler.Sigmas) != scheduler.NumSteps+1 {
t.Errorf("sigmas length should be steps+1: got %d, want %d",
len(scheduler.Sigmas), scheduler.NumSteps+1)
}
}
// TestCalculateShift verifies the mu calculation against Python reference.
// Golden values from: mu = img_seq_len * m + b where m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
func TestCalculateShift(t *testing.T) {
cases := []struct {
imgSeqLen int32
want float32
}{
{256, 0.5}, // base case
{8192, 0.9}, // max case
{4096, 0.6935}, // middle case (rounded)
}
for _, c := range cases {
got := CalculateShift(c.imgSeqLen, 256, 8192, 0.5, 0.9)
if abs32(got-c.want) > 0.001 {
t.Errorf("CalculateShift(%d): got %v, want %v", c.imgSeqLen, got, c.want)
}
}
}
// TestSchedulerStep verifies the Euler step formula.
func TestSchedulerStep(t *testing.T) {
cfg := DefaultSchedulerConfig()
scheduler := NewFlowMatchScheduler(cfg)
scheduler.SetTimesteps(30, 4096)
// Verify dt calculation for first step
sigma0 := scheduler.Sigmas[0]
sigma1 := scheduler.Sigmas[1]
expectedDt := sigma1 - sigma0
// dt should be negative (sigmas decrease)
if expectedDt >= 0 {
t.Errorf("expected negative dt, got %v (sigma0=%v, sigma1=%v)", expectedDt, sigma0, sigma1)
}
}
func abs32(x float32) float32 {
return float32(math.Abs(float64(x)))
}


@@ -1,172 +0,0 @@
package qwen_image
import (
"encoding/json"
"math"
"os"
"path/filepath"
"slices"
"testing"
"github.com/ollama/ollama/x/imagegen/mlx"
"github.com/ollama/ollama/x/imagegen/safetensors"
)
// TinyTextEncoderConfig holds config for the tiny test text encoder
type TinyTextEncoderConfig struct {
HiddenSize int32 `json:"hidden_size"`
NumHiddenLayers int32 `json:"num_hidden_layers"`
IntermediateSize int32 `json:"intermediate_size"`
NumAttentionHeads int32 `json:"num_attention_heads"`
NumKeyValueHeads int32 `json:"num_key_value_heads"`
VocabSize int32 `json:"vocab_size"`
RMSNormEps float32 `json:"rms_norm_eps"`
RopeTheta float32 `json:"rope_theta"`
HeadDim int32 `json:"head_dim"`
MRoPESection []int32 `json:"mrope_section"`
}
// loadTinyTextEncoder loads the tiny text encoder from testdata
func loadTinyTextEncoder(t *testing.T) (*Qwen25VL, *TinyTextEncoderConfig) {
t.Helper()
testdataDir := filepath.Join("testdata", "tiny_text_encoder")
// Load config
configData, err := os.ReadFile(filepath.Join(testdataDir, "config.json"))
if err != nil {
t.Skipf("Skipping: tiny weights not found. Regenerate with Python (see models/CLAUDE.md)")
}
var tinyCfg TinyTextEncoderConfig
if err := json.Unmarshal(configData, &tinyCfg); err != nil {
t.Fatalf("Failed to parse config: %v", err)
}
// Create encoder config (using Qwen25VLConfig)
cfg := &Qwen25VLConfig{
HiddenSize: tinyCfg.HiddenSize,
NumHiddenLayers: tinyCfg.NumHiddenLayers,
IntermediateSize: tinyCfg.IntermediateSize,
NumAttentionHeads: tinyCfg.NumAttentionHeads,
NumKeyValueHeads: tinyCfg.NumKeyValueHeads,
VocabSize: tinyCfg.VocabSize,
RMSNormEps: tinyCfg.RMSNormEps,
RopeTheta: tinyCfg.RopeTheta,
HeadDim: tinyCfg.HeadDim,
MRoPESection: tinyCfg.MRoPESection,
}
// Load weights
weights, err := safetensors.LoadModelWeights(testdataDir)
if err != nil {
t.Fatalf("Failed to load weights: %v", err)
}
if err := weights.Load(mlx.DtypeBFloat16); err != nil {
t.Fatalf("Failed to bulk load weights: %v", err)
}
// Build encoder
embedding, err := weights.Get("model.embed_tokens.weight")
if err != nil {
t.Fatalf("Failed to get embedding: %v", err)
}
blocks := make([]*VLTextBlock, cfg.NumHiddenLayers)
for i := int32(0); i < cfg.NumHiddenLayers; i++ {
block, err := newVLTextBlock(weights, int(i), cfg)
if err != nil {
t.Fatalf("Failed to load block %d: %v", i, err)
}
blocks[i] = block
}
finalNorm, err := weights.Get("model.norm.weight")
if err != nil {
t.Fatalf("Failed to get final norm: %v", err)
}
encoder := &Qwen25VL{
Config: cfg,
Embedding: embedding,
Blocks: blocks,
FinalNorm: finalNorm,
HasVision: false, // Text-only mode
}
return encoder, &tinyCfg
}
// TestTextEncoderForward verifies the text encoder forward pass with tiny weights.
func TestTextEncoderForward(t *testing.T) {
encoder, cfg := loadTinyTextEncoder(t)
// Create test tokens (within vocab range)
tokens := []int32{1, 2, 3, 4, 5}
// Forward pass using EncodeTextOnly
out := encoder.EncodeTextOnly(tokens)
mlx.Eval(out)
// Verify output shape: [batch, seq_len, hidden_size]
wantShape := []int32{1, 5, cfg.HiddenSize}
if !slices.Equal(out.Shape(), wantShape) {
t.Errorf("output shape: got %v, want %v", out.Shape(), wantShape)
}
// Verify output is finite (not NaN or Inf)
data := out.Data()
for i, v := range data {
if math.IsNaN(float64(v)) || math.IsInf(float64(v), 0) {
t.Errorf("output[%d] is not finite: %v", i, v)
break
}
}
}
// TestTextEncoderBatch tests batch processing.
func TestTextEncoderBatch(t *testing.T) {
encoder, cfg := loadTinyTextEncoder(t)
// EncodeTextOnly does not support batched input, so this test exercises the
// single-sequence path and checks the output shape.
tokens := []int32{1, 2, 3}
out := encoder.EncodeTextOnly(tokens)
mlx.Eval(out)
wantShape := []int32{1, 3, cfg.HiddenSize}
if !slices.Equal(out.Shape(), wantShape) {
t.Errorf("shape: got %v, want %v", out.Shape(), wantShape)
}
}
// TestMRoPEComputation verifies M-RoPE frequency computation produces valid values.
func TestMRoPEComputation(t *testing.T) {
encoder, cfg := loadTinyTextEncoder(t)
cossin := encoder.computeTextRoPE(10, 1)
mlx.Eval(cossin[0], cossin[1])
// Verify shapes: [3, B, L, head_dim]
wantShape := []int32{3, 1, 10, cfg.HeadDim}
if !slices.Equal(cossin[0].Shape(), wantShape) {
t.Errorf("cos shape: got %v, want %v", cossin[0].Shape(), wantShape)
}
if !slices.Equal(cossin[1].Shape(), wantShape) {
t.Errorf("sin shape: got %v, want %v", cossin[1].Shape(), wantShape)
}
// Verify cos/sin values are in valid range [-1, 1]
cosData := cossin[0].Data()
sinData := cossin[1].Data()
for i := 0; i < min(100, len(cosData)); i++ {
if cosData[i] < -1.01 || cosData[i] > 1.01 {
t.Errorf("cos[%d] out of range: %v", i, cosData[i])
}
if sinData[i] < -1.01 || sinData[i] > 1.01 {
t.Errorf("sin[%d] out of range: %v", i, sinData[i])
}
}
}


@@ -1,866 +0,0 @@
package qwen_image
import (
"fmt"
"math"
"path/filepath"
"github.com/ollama/ollama/x/imagegen/cache"
"github.com/ollama/ollama/x/imagegen/mlx"
"github.com/ollama/ollama/x/imagegen/safetensors"
)
// TransformerConfig holds Qwen-Image transformer configuration
type TransformerConfig struct {
HiddenDim int32 `json:"hidden_dim"` // 3072 (24 * 128)
NHeads int32 `json:"num_attention_heads"` // 24
HeadDim int32 `json:"attention_head_dim"` // 128
NLayers int32 `json:"num_layers"` // 60
InChannels int32 `json:"in_channels"` // 64
OutChannels int32 `json:"out_channels"` // 16
PatchSize int32 `json:"patch_size"` // 2
JointAttentionDim int32 `json:"joint_attention_dim"` // 3584 (text encoder dim)
NormEps float32 `json:"norm_eps"` // 1e-6
AxesDimsRope []int32 `json:"axes_dims_rope"` // [16, 56, 56]
GuidanceEmbeds bool `json:"guidance_embeds"` // false
}
// defaultTransformerConfig returns config for Qwen-Image transformer
func defaultTransformerConfig() *TransformerConfig {
return &TransformerConfig{
HiddenDim: 3072, // 24 * 128
NHeads: 24,
HeadDim: 128,
NLayers: 60,
InChannels: 64,
OutChannels: 16,
PatchSize: 2,
JointAttentionDim: 3584,
NormEps: 1e-6,
AxesDimsRope: []int32{16, 56, 56},
GuidanceEmbeds: false,
}
}
// TimestepEmbedder creates timestep embeddings
type TimestepEmbedder struct {
Linear1Weight *mlx.Array // [256, hidden_dim]
Linear1Bias *mlx.Array
Linear2Weight *mlx.Array // [hidden_dim, hidden_dim]
Linear2Bias *mlx.Array
}
// newTimestepEmbedder creates a timestep embedder from weights
func newTimestepEmbedder(weights *safetensors.ModelWeights) (*TimestepEmbedder, error) {
linear1Weight, err := weights.Get("time_text_embed.timestep_embedder.linear_1.weight")
if err != nil {
return nil, err
}
linear1Bias, err := weights.Get("time_text_embed.timestep_embedder.linear_1.bias")
if err != nil {
return nil, err
}
linear2Weight, err := weights.Get("time_text_embed.timestep_embedder.linear_2.weight")
if err != nil {
return nil, err
}
linear2Bias, err := weights.Get("time_text_embed.timestep_embedder.linear_2.bias")
if err != nil {
return nil, err
}
return &TimestepEmbedder{
Linear1Weight: mlx.Transpose(linear1Weight, 1, 0),
Linear1Bias: linear1Bias,
Linear2Weight: mlx.Transpose(linear2Weight, 1, 0),
Linear2Bias: linear2Bias,
}, nil
}
// Forward computes timestep embeddings
// t: [B] timesteps (normalized 0-1, will be scaled by 1000 internally)
func (te *TimestepEmbedder) Forward(t *mlx.Array) *mlx.Array {
half := int32(128) // embedding_dim / 2
// Sinusoidal embedding with flip_sin_to_cos=True, scale=1000
freqs := make([]float32, half)
for i := int32(0); i < half; i++ {
freqs[i] = float32(math.Exp(-math.Log(10000.0) * float64(i) / float64(half)))
}
freqsArr := mlx.NewArray(freqs, []int32{1, half})
tExpanded := mlx.ExpandDims(t, 1)
args := mlx.Mul(tExpanded, freqsArr)
args = mlx.MulScalar(args, 1000.0) // scale
// [cos, sin] (flip_sin_to_cos=True)
sinArgs := mlx.Sin(args)
cosArgs := mlx.Cos(args)
embedding := mlx.Concatenate([]*mlx.Array{cosArgs, sinArgs}, 1) // [B, 256]
// MLP: linear1 -> silu -> linear2
h := mlx.Linear(embedding, te.Linear1Weight)
h = mlx.Add(h, te.Linear1Bias)
h = mlx.SiLU(h)
h = mlx.Linear(h, te.Linear2Weight)
h = mlx.Add(h, te.Linear2Bias)
return h
}
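// Shape flow: t [B] -> args [B, 128] -> embedding [B, 256] (cos then sin) ->
// linear1 -> SiLU -> linear2 -> [B, hidden_dim].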
// JointAttention implements dual-stream joint attention
type JointAttention struct {
// Image projections
ToQ *mlx.Array
ToQB *mlx.Array
ToK *mlx.Array
ToKB *mlx.Array
ToV *mlx.Array
ToVB *mlx.Array
ToOut *mlx.Array
ToOutB *mlx.Array
NormQ *mlx.Array
NormK *mlx.Array
// Text (added) projections
AddQProj *mlx.Array
AddQProjB *mlx.Array
AddKProj *mlx.Array
AddKProjB *mlx.Array
AddVProj *mlx.Array
AddVProjB *mlx.Array
ToAddOut *mlx.Array
ToAddOutB *mlx.Array
NormAddQ *mlx.Array
NormAddK *mlx.Array
NHeads int32
HeadDim int32
Scale float32
}
// newJointAttention creates a joint attention layer
func newJointAttention(weights *safetensors.ModelWeights, prefix string, cfg *TransformerConfig) (*JointAttention, error) {
toQ, _ := weights.Get(prefix + ".attn.to_q.weight")
toQB, _ := weights.Get(prefix + ".attn.to_q.bias")
toK, _ := weights.Get(prefix + ".attn.to_k.weight")
toKB, _ := weights.Get(prefix + ".attn.to_k.bias")
toV, _ := weights.Get(prefix + ".attn.to_v.weight")
toVB, _ := weights.Get(prefix + ".attn.to_v.bias")
toOut, _ := weights.Get(prefix + ".attn.to_out.0.weight")
toOutB, _ := weights.Get(prefix + ".attn.to_out.0.bias")
normQ, _ := weights.Get(prefix + ".attn.norm_q.weight")
normK, _ := weights.Get(prefix + ".attn.norm_k.weight")
addQProj, _ := weights.Get(prefix + ".attn.add_q_proj.weight")
addQProjB, _ := weights.Get(prefix + ".attn.add_q_proj.bias")
addKProj, _ := weights.Get(prefix + ".attn.add_k_proj.weight")
addKProjB, _ := weights.Get(prefix + ".attn.add_k_proj.bias")
addVProj, _ := weights.Get(prefix + ".attn.add_v_proj.weight")
addVProjB, _ := weights.Get(prefix + ".attn.add_v_proj.bias")
toAddOut, _ := weights.Get(prefix + ".attn.to_add_out.weight")
toAddOutB, _ := weights.Get(prefix + ".attn.to_add_out.bias")
normAddQ, _ := weights.Get(prefix + ".attn.norm_added_q.weight")
normAddK, _ := weights.Get(prefix + ".attn.norm_added_k.weight")
return &JointAttention{
ToQ: mlx.Transpose(toQ, 1, 0),
ToQB: toQB,
ToK: mlx.Transpose(toK, 1, 0),
ToKB: toKB,
ToV: mlx.Transpose(toV, 1, 0),
ToVB: toVB,
ToOut: mlx.Transpose(toOut, 1, 0),
ToOutB: toOutB,
NormQ: normQ,
NormK: normK,
AddQProj: mlx.Transpose(addQProj, 1, 0),
AddQProjB: addQProjB,
AddKProj: mlx.Transpose(addKProj, 1, 0),
AddKProjB: addKProjB,
AddVProj: mlx.Transpose(addVProj, 1, 0),
AddVProjB: addVProjB,
ToAddOut: mlx.Transpose(toAddOut, 1, 0),
ToAddOutB: toAddOutB,
NormAddQ: normAddQ,
NormAddK: normAddK,
NHeads: cfg.NHeads,
HeadDim: cfg.HeadDim,
Scale: float32(1.0 / math.Sqrt(float64(cfg.HeadDim))),
}, nil
}
// Forward computes joint attention
// img: [B, L_img, D], txt: [B, L_txt, D]
// imgFreqs, txtFreqs: complex RoPE frequencies [L, head_dim/2] as interleaved real/imag
func (attn *JointAttention) Forward(img, txt *mlx.Array, imgFreqs, txtFreqs *mlx.Array) (*mlx.Array, *mlx.Array) {
imgShape := img.Shape()
B := imgShape[0]
Limg := imgShape[1]
D := imgShape[2]
txtShape := txt.Shape()
Ltxt := txtShape[1]
// === Image Q/K/V ===
imgFlat := mlx.Reshape(img, B*Limg, D)
qImg := mlx.Add(mlx.Linear(imgFlat, attn.ToQ), attn.ToQB)
kImg := mlx.Add(mlx.Linear(imgFlat, attn.ToK), attn.ToKB)
vImg := mlx.Add(mlx.Linear(imgFlat, attn.ToV), attn.ToVB)
qImg = mlx.Reshape(qImg, B, Limg, attn.NHeads, attn.HeadDim)
kImg = mlx.Reshape(kImg, B, Limg, attn.NHeads, attn.HeadDim)
vImg = mlx.Reshape(vImg, B, Limg, attn.NHeads, attn.HeadDim)
// QK norm (RMSNorm per head)
qImg = mlx.RMSNorm(qImg, attn.NormQ, 1e-6)
kImg = mlx.RMSNorm(kImg, attn.NormK, 1e-6)
// Apply RoPE
if imgFreqs != nil {
qImg = applyRoPE(qImg, imgFreqs)
kImg = applyRoPE(kImg, imgFreqs)
}
// === Text Q/K/V ===
txtFlat := mlx.Reshape(txt, B*Ltxt, D)
qTxt := mlx.Add(mlx.Linear(txtFlat, attn.AddQProj), attn.AddQProjB)
kTxt := mlx.Add(mlx.Linear(txtFlat, attn.AddKProj), attn.AddKProjB)
vTxt := mlx.Add(mlx.Linear(txtFlat, attn.AddVProj), attn.AddVProjB)
qTxt = mlx.Reshape(qTxt, B, Ltxt, attn.NHeads, attn.HeadDim)
kTxt = mlx.Reshape(kTxt, B, Ltxt, attn.NHeads, attn.HeadDim)
vTxt = mlx.Reshape(vTxt, B, Ltxt, attn.NHeads, attn.HeadDim)
qTxt = mlx.RMSNorm(qTxt, attn.NormAddQ, 1e-6)
kTxt = mlx.RMSNorm(kTxt, attn.NormAddK, 1e-6)
if txtFreqs != nil {
qTxt = applyRoPE(qTxt, txtFreqs)
kTxt = applyRoPE(kTxt, txtFreqs)
}
// Concatenate for joint attention: [txt, img] order
qJoint := mlx.Concatenate([]*mlx.Array{qTxt, qImg}, 1)
kJoint := mlx.Concatenate([]*mlx.Array{kTxt, kImg}, 1)
vJoint := mlx.Concatenate([]*mlx.Array{vTxt, vImg}, 1)
// Transpose to [B, nheads, L, head_dim]
qJoint = mlx.Transpose(qJoint, 0, 2, 1, 3)
kJoint = mlx.Transpose(kJoint, 0, 2, 1, 3)
vJoint = mlx.Transpose(vJoint, 0, 2, 1, 3)
// SDPA
outJoint := mlx.ScaledDotProductAttention(qJoint, kJoint, vJoint, attn.Scale, false)
// Transpose back and split
outJoint = mlx.Transpose(outJoint, 0, 2, 1, 3) // [B, L, nheads, head_dim]
outJoint = mlx.Reshape(outJoint, B, Ltxt+Limg, D)
outTxt := mlx.Slice(outJoint, []int32{0, 0, 0}, []int32{B, Ltxt, D})
outImg := mlx.Slice(outJoint, []int32{0, Ltxt, 0}, []int32{B, Ltxt + Limg, D})
// Output projections
outImg = mlx.Reshape(outImg, B*Limg, D)
outImg = mlx.Add(mlx.Linear(outImg, attn.ToOut), attn.ToOutB)
outImg = mlx.Reshape(outImg, B, Limg, D)
outTxt = mlx.Reshape(outTxt, B*Ltxt, D)
outTxt = mlx.Add(mlx.Linear(outTxt, attn.ToAddOut), attn.ToAddOutB)
outTxt = mlx.Reshape(outTxt, B, Ltxt, D)
return outImg, outTxt
}
// applyRoPE applies rotary embeddings using complex multiplication
// x: [B, L, nheads, head_dim]
// freqs: [L, head_dim] as complex (interleaved real/imag pairs)
func applyRoPE(x *mlx.Array, freqs *mlx.Array) *mlx.Array {
shape := x.Shape()
B := shape[0]
L := shape[1]
nheads := shape[2]
headDim := shape[3]
halfDim := headDim / 2
// Reshape x to pairs: [B, L, nheads, half, 2]
xPairs := mlx.Reshape(x, B, L, nheads, halfDim, 2)
// freqs: [L, head_dim] -> [1, L, 1, half, 2]
freqsExp := mlx.Reshape(freqs, 1, L, 1, halfDim, 2)
// Extract real/imag parts
xReal := mlx.SliceStride(xPairs, []int32{0, 0, 0, 0, 0}, []int32{B, L, nheads, halfDim, 1}, []int32{1, 1, 1, 1, 1})
xImag := mlx.SliceStride(xPairs, []int32{0, 0, 0, 0, 1}, []int32{B, L, nheads, halfDim, 2}, []int32{1, 1, 1, 1, 1})
xReal = mlx.Squeeze(xReal, 4)
xImag = mlx.Squeeze(xImag, 4)
freqReal := mlx.SliceStride(freqsExp, []int32{0, 0, 0, 0, 0}, []int32{1, L, 1, halfDim, 1}, []int32{1, 1, 1, 1, 1})
freqImag := mlx.SliceStride(freqsExp, []int32{0, 0, 0, 0, 1}, []int32{1, L, 1, halfDim, 2}, []int32{1, 1, 1, 1, 1})
freqReal = mlx.Squeeze(freqReal, 4)
freqImag = mlx.Squeeze(freqImag, 4)
// Complex multiplication: (a + bi) * (c + di) = (ac - bd) + (ad + bc)i
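// For RoPE, (c, d) = (cos t, sin t), so each channel pair (a, b) is rotated by angle t.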
outReal := mlx.Sub(mlx.Mul(xReal, freqReal), mlx.Mul(xImag, freqImag))
outImag := mlx.Add(mlx.Mul(xReal, freqImag), mlx.Mul(xImag, freqReal))
// Interleave back
outReal = mlx.ExpandDims(outReal, 4)
outImag = mlx.ExpandDims(outImag, 4)
out := mlx.Concatenate([]*mlx.Array{outReal, outImag}, 4)
return mlx.Reshape(out, B, L, nheads, headDim)
}
// MLP implements GELU MLP (not GEGLU)
type MLP struct {
ProjWeight *mlx.Array
ProjBias *mlx.Array
OutWeight *mlx.Array
OutBias *mlx.Array
}
// newMLP creates a GELU MLP
func newMLP(weights *safetensors.ModelWeights, prefix string) (*MLP, error) {
projWeight, _ := weights.Get(prefix + ".net.0.proj.weight")
projBias, _ := weights.Get(prefix + ".net.0.proj.bias")
outWeight, _ := weights.Get(prefix + ".net.2.weight")
outBias, _ := weights.Get(prefix + ".net.2.bias")
return &MLP{
ProjWeight: mlx.Transpose(projWeight, 1, 0),
ProjBias: projBias,
OutWeight: mlx.Transpose(outWeight, 1, 0),
OutBias: outBias,
}, nil
}
// Forward applies GELU MLP
func (m *MLP) Forward(x *mlx.Array) *mlx.Array {
shape := x.Shape()
B := shape[0]
L := shape[1]
D := shape[2]
xFlat := mlx.Reshape(x, B*L, D)
h := mlx.Add(mlx.Linear(xFlat, m.ProjWeight), m.ProjBias)
h = geluApprox(h)
h = mlx.Add(mlx.Linear(h, m.OutWeight), m.OutBias)
return mlx.Reshape(h, B, L, m.OutBias.Dim(0))
}
// geluApprox implements approximate GELU
func geluApprox(x *mlx.Array) *mlx.Array {
sqrt2OverPi := float32(math.Sqrt(2.0 / math.Pi))
x3 := mlx.Mul(mlx.Mul(x, x), x)
inner := mlx.Add(x, mlx.MulScalar(x3, 0.044715))
inner = mlx.MulScalar(inner, sqrt2OverPi)
return mlx.Mul(mlx.MulScalar(x, 0.5), mlx.AddScalar(mlx.Tanh(inner), 1.0))
}
// TransformerBlock is a single dual-stream transformer block
type TransformerBlock struct {
Attention *JointAttention
ImgMLP *MLP
TxtMLP *MLP
ImgModWeight *mlx.Array
ImgModBias *mlx.Array
TxtModWeight *mlx.Array
TxtModBias *mlx.Array
HiddenDim int32
NormEps float32
}
// newTransformerBlock creates a transformer block
func newTransformerBlock(weights *safetensors.ModelWeights, prefix string, cfg *TransformerConfig) (*TransformerBlock, error) {
attn, err := newJointAttention(weights, prefix, cfg)
if err != nil {
return nil, err
}
imgMLP, _ := newMLP(weights, prefix+".img_mlp")
txtMLP, _ := newMLP(weights, prefix+".txt_mlp")
imgModWeight, _ := weights.Get(prefix + ".img_mod.1.weight")
imgModBias, _ := weights.Get(prefix + ".img_mod.1.bias")
txtModWeight, _ := weights.Get(prefix + ".txt_mod.1.weight")
txtModBias, _ := weights.Get(prefix + ".txt_mod.1.bias")
return &TransformerBlock{
Attention: attn,
ImgMLP: imgMLP,
TxtMLP: txtMLP,
ImgModWeight: mlx.Transpose(imgModWeight, 1, 0),
ImgModBias: imgModBias,
TxtModWeight: mlx.Transpose(txtModWeight, 1, 0),
TxtModBias: txtModBias,
HiddenDim: cfg.HiddenDim,
NormEps: cfg.NormEps,
}, nil
}
// Forward applies the transformer block
func (tb *TransformerBlock) Forward(img, txt, temb *mlx.Array, imgFreqs, txtFreqs *mlx.Array) (*mlx.Array, *mlx.Array) {
// Compute modulation: silu(temb) -> linear -> [B, 6*D]
siluT := mlx.SiLU(temb)
imgMod := mlx.Add(mlx.Linear(siluT, tb.ImgModWeight), tb.ImgModBias)
txtMod := mlx.Add(mlx.Linear(siluT, tb.TxtModWeight), tb.TxtModBias)
// Split into 6 parts: shift1, scale1, gate1, shift2, scale2, gate2
imgModParts := splitMod6(imgMod, tb.HiddenDim)
txtModParts := splitMod6(txtMod, tb.HiddenDim)
// Pre-attention: norm + modulate
imgNorm := layerNormNoAffine(img, tb.NormEps)
imgNorm = mlx.Add(mlx.Mul(imgNorm, mlx.AddScalar(imgModParts[1], 1.0)), imgModParts[0])
txtNorm := layerNormNoAffine(txt, tb.NormEps)
txtNorm = mlx.Add(mlx.Mul(txtNorm, mlx.AddScalar(txtModParts[1], 1.0)), txtModParts[0])
// Joint attention
attnImg, attnTxt := tb.Attention.Forward(imgNorm, txtNorm, imgFreqs, txtFreqs)
// Residual with gate
img = mlx.Add(img, mlx.Mul(imgModParts[2], attnImg))
txt = mlx.Add(txt, mlx.Mul(txtModParts[2], attnTxt))
// Pre-MLP: norm + modulate
imgNorm2 := layerNormNoAffine(img, tb.NormEps)
imgNorm2 = mlx.Add(mlx.Mul(imgNorm2, mlx.AddScalar(imgModParts[4], 1.0)), imgModParts[3])
txtNorm2 := layerNormNoAffine(txt, tb.NormEps)
txtNorm2 = mlx.Add(mlx.Mul(txtNorm2, mlx.AddScalar(txtModParts[4], 1.0)), txtModParts[3])
// MLP
mlpImg := tb.ImgMLP.Forward(imgNorm2)
mlpTxt := tb.TxtMLP.Forward(txtNorm2)
// Residual with gate
img = mlx.Add(img, mlx.Mul(imgModParts[5], mlpImg))
txt = mlx.Add(txt, mlx.Mul(txtModParts[5], mlpTxt))
return img, txt
}
// splitMod6 splits modulation into 6 parts each [B, 1, D]
func splitMod6(mod *mlx.Array, hiddenDim int32) []*mlx.Array {
shape := mod.Shape()
B := shape[0]
parts := make([]*mlx.Array, 6)
for i := int32(0); i < 6; i++ {
part := mlx.Slice(mod, []int32{0, i * hiddenDim}, []int32{B, (i + 1) * hiddenDim})
parts[i] = mlx.ExpandDims(part, 1)
}
return parts
}
// layerNormNoAffine applies layer norm without learnable parameters
func layerNormNoAffine(x *mlx.Array, eps float32) *mlx.Array {
ndim := x.Ndim()
lastAxis := ndim - 1
mean := mlx.Mean(x, lastAxis, true)
xCentered := mlx.Sub(x, mean)
variance := mlx.Mean(mlx.Square(xCentered), lastAxis, true)
return mlx.Div(xCentered, mlx.Sqrt(mlx.AddScalar(variance, eps)))
}
// Transformer is the full Qwen-Image transformer model
type Transformer struct {
Config *TransformerConfig
ImgIn *mlx.Array
ImgInBias *mlx.Array
TxtIn *mlx.Array
TxtInBias *mlx.Array
TxtNorm *mlx.Array
TEmbed *TimestepEmbedder
Layers []*TransformerBlock
NormOutWeight *mlx.Array
NormOutBias *mlx.Array
ProjOut *mlx.Array
ProjOutBias *mlx.Array
}
// Load loads the transformer from a directory
func (m *Transformer) Load(path string) error {
fmt.Println("Loading Qwen-Image transformer...")
cfg := defaultTransformerConfig()
m.Config = cfg
weights, err := safetensors.LoadModelWeights(path)
if err != nil {
return fmt.Errorf("weights: %w", err)
}
// Bulk load all weights as bf16
fmt.Print(" Loading weights as bf16... ")
if err := weights.Load(mlx.DtypeBFloat16); err != nil {
return fmt.Errorf("load weights: %w", err)
}
fmt.Printf("✓ (%.1f GB)\n", float64(mlx.MetalGetActiveMemory())/(1024*1024*1024))
fmt.Print(" Loading input projections... ")
imgIn, _ := weights.Get("img_in.weight")
imgInBias, _ := weights.Get("img_in.bias")
txtIn, _ := weights.Get("txt_in.weight")
txtInBias, _ := weights.Get("txt_in.bias")
txtNorm, _ := weights.Get("txt_norm.weight")
m.ImgIn = mlx.Transpose(imgIn, 1, 0)
m.ImgInBias = imgInBias
m.TxtIn = mlx.Transpose(txtIn, 1, 0)
m.TxtInBias = txtInBias
m.TxtNorm = txtNorm
fmt.Println("✓")
fmt.Print(" Loading timestep embedder... ")
m.TEmbed, err = newTimestepEmbedder(weights)
if err != nil {
return fmt.Errorf("timestep embedder: %w", err)
}
fmt.Println("✓")
m.Layers = make([]*TransformerBlock, cfg.NLayers)
for i := int32(0); i < cfg.NLayers; i++ {
fmt.Printf("\r Loading transformer layers... %d/%d", i+1, cfg.NLayers)
prefix := fmt.Sprintf("transformer_blocks.%d", i)
m.Layers[i], err = newTransformerBlock(weights, prefix, cfg)
if err != nil {
return fmt.Errorf("layer %d: %w", i, err)
}
}
fmt.Printf("\r Loading transformer layers... ✓ [%d blocks] \n", cfg.NLayers)
fmt.Print(" Loading output layers... ")
normOutWeight, _ := weights.Get("norm_out.linear.weight")
normOutBias, _ := weights.Get("norm_out.linear.bias")
projOut, _ := weights.Get("proj_out.weight")
projOutBias, _ := weights.Get("proj_out.bias")
m.NormOutWeight = mlx.Transpose(normOutWeight, 1, 0)
m.NormOutBias = normOutBias
m.ProjOut = mlx.Transpose(projOut, 1, 0)
m.ProjOutBias = projOutBias
fmt.Println("✓")
weights.ReleaseAll()
return nil
}
// LoadFromPath is a convenience function to load transformer from path
func LoadTransformerFromPath(path string) (*Transformer, error) {
m := &Transformer{}
if err := m.Load(filepath.Join(path, "transformer")); err != nil {
return nil, err
}
return m, nil
}
// Forward runs the transformer
// img: [B, L_img, in_channels] patchified latents
// txt: [B, L_txt, joint_attention_dim] text embeddings
// t: [B] timesteps (0-1)
// imgFreqs, txtFreqs: RoPE frequencies
func (tr *Transformer) Forward(img, txt, t *mlx.Array, imgFreqs, txtFreqs *mlx.Array) *mlx.Array {
imgShape := img.Shape()
B := imgShape[0]
Limg := imgShape[1]
txtShape := txt.Shape()
Ltxt := txtShape[1]
// Timestep embedding
temb := tr.TEmbed.Forward(t)
// Project image: [B, L, in_channels] -> [B, L, hidden_dim]
imgFlat := mlx.Reshape(img, B*Limg, tr.Config.InChannels)
imgH := mlx.Add(mlx.Linear(imgFlat, tr.ImgIn), tr.ImgInBias)
imgH = mlx.Reshape(imgH, B, Limg, tr.Config.HiddenDim)
// Project text: RMSNorm then linear
txtFlat := mlx.Reshape(txt, B*Ltxt, tr.Config.JointAttentionDim)
txtNormed := mlx.RMSNorm(txtFlat, tr.TxtNorm, 1e-6)
txtH := mlx.Add(mlx.Linear(txtNormed, tr.TxtIn), tr.TxtInBias)
txtH = mlx.Reshape(txtH, B, Ltxt, tr.Config.HiddenDim)
for _, layer := range tr.Layers {
imgH, txtH = layer.Forward(imgH, txtH, temb, imgFreqs, txtFreqs)
}
// Final norm with modulation (AdaLayerNormContinuous)
// Python: scale, shift = torch.chunk(emb, 2, dim=1)
finalMod := mlx.Add(mlx.Linear(mlx.SiLU(temb), tr.NormOutWeight), tr.NormOutBias)
modShape := finalMod.Shape()
halfDim := modShape[1] / 2
scale := mlx.ExpandDims(mlx.Slice(finalMod, []int32{0, 0}, []int32{B, halfDim}), 1)
shift := mlx.ExpandDims(mlx.Slice(finalMod, []int32{0, halfDim}, []int32{B, modShape[1]}), 1)
imgH = layerNormNoAffine(imgH, tr.Config.NormEps)
imgH = mlx.Add(mlx.Mul(imgH, mlx.AddScalar(scale, 1.0)), shift)
// Final projection: [B, L, hidden_dim] -> [B, L, patch_size^2 * out_channels]
imgFlat = mlx.Reshape(imgH, B*Limg, tr.Config.HiddenDim)
out := mlx.Add(mlx.Linear(imgFlat, tr.ProjOut), tr.ProjOutBias)
outChannels := tr.Config.PatchSize * tr.Config.PatchSize * tr.Config.OutChannels
return mlx.Reshape(out, B, Limg, outChannels)
}
// ForwardWithCache runs the transformer with layer caching for speedup.
// Based on DeepCache (CVPR 2024) / Learning-to-Cache (NeurIPS 2024):
// shallow layers change little between denoising steps, so we cache their
// outputs and reuse them on non-refresh steps.
//
// stepCache: cache for layer outputs (use cache.NewStepCache(cacheLayers))
// step: current denoising step (0-indexed)
// cacheInterval: refresh cache every N steps (e.g., 3)
// cacheLayers: number of shallow layers to cache (e.g., 15)
func (tr *Transformer) ForwardWithCache(
img, txt, t *mlx.Array,
imgFreqs, txtFreqs *mlx.Array,
stepCache *cache.StepCache,
step, cacheInterval, cacheLayers int,
) *mlx.Array {
imgShape := img.Shape()
B := imgShape[0]
Limg := imgShape[1]
txtShape := txt.Shape()
Ltxt := txtShape[1]
// Timestep embedding
temb := tr.TEmbed.Forward(t)
// Project image: [B, L, in_channels] -> [B, L, hidden_dim]
imgFlat := mlx.Reshape(img, B*Limg, tr.Config.InChannels)
imgH := mlx.Add(mlx.Linear(imgFlat, tr.ImgIn), tr.ImgInBias)
imgH = mlx.Reshape(imgH, B, Limg, tr.Config.HiddenDim)
// Project text: RMSNorm then linear
txtFlat := mlx.Reshape(txt, B*Ltxt, tr.Config.JointAttentionDim)
txtNormed := mlx.RMSNorm(txtFlat, tr.TxtNorm, 1e-6)
txtH := mlx.Add(mlx.Linear(txtNormed, tr.TxtIn), tr.TxtInBias)
txtH = mlx.Reshape(txtH, B, Ltxt, tr.Config.HiddenDim)
// Check if we should refresh the cache
refreshCache := stepCache.ShouldRefresh(step, cacheInterval)
for i, layer := range tr.Layers {
if i < cacheLayers && !refreshCache && stepCache.Get(i) != nil {
// Use cached outputs for shallow layers
imgH = stepCache.Get(i)
txtH = stepCache.Get2(i)
} else {
// Compute layer
imgH, txtH = layer.Forward(imgH, txtH, temb, imgFreqs, txtFreqs)
// Cache shallow layers on refresh steps
if i < cacheLayers && refreshCache {
stepCache.Set(i, imgH)
stepCache.Set2(i, txtH)
}
}
}
// Final norm with modulation (AdaLayerNormContinuous)
finalMod := mlx.Add(mlx.Linear(mlx.SiLU(temb), tr.NormOutWeight), tr.NormOutBias)
modShape := finalMod.Shape()
halfDim := modShape[1] / 2
scale := mlx.ExpandDims(mlx.Slice(finalMod, []int32{0, 0}, []int32{B, halfDim}), 1)
shift := mlx.ExpandDims(mlx.Slice(finalMod, []int32{0, halfDim}, []int32{B, modShape[1]}), 1)
imgH = layerNormNoAffine(imgH, tr.Config.NormEps)
imgH = mlx.Add(mlx.Mul(imgH, mlx.AddScalar(scale, 1.0)), shift)
// Final projection: [B, L, hidden_dim] -> [B, L, patch_size^2 * out_channels]
imgFlat = mlx.Reshape(imgH, B*Limg, tr.Config.HiddenDim)
out := mlx.Add(mlx.Linear(imgFlat, tr.ProjOut), tr.ProjOutBias)
outChannels := tr.Config.PatchSize * tr.Config.PatchSize * tr.Config.OutChannels
return mlx.Reshape(out, B, Limg, outChannels)
}
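// Illustrative refresh schedule, assuming StepCache.ShouldRefresh(step,
// interval) refreshes when step%interval == 0: with cacheInterval = 3 the
// shallow layers are recomputed and re-cached on steps 0, 3, 6, ... and
// reused on steps 1, 2, 4, 5, ...; the deeper layers always run.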
// RoPECache holds precomputed RoPE frequencies
type RoPECache struct {
ImgFreqs *mlx.Array // [L_img, head_dim]
TxtFreqs *mlx.Array // [L_txt, head_dim]
}
// PrepareRoPE computes RoPE for image and text sequences
// This matches Python's QwenEmbedRope with scale_rope=True
func PrepareRoPE(imgH, imgW int32, txtLen int32, axesDims []int32) *RoPECache {
theta := float64(10000)
maxIdx := int32(4096)
// Compute base frequencies for each axis dimension
freqsT := ComputeAxisFreqs(axesDims[0], theta)
freqsH := ComputeAxisFreqs(axesDims[1], theta)
freqsW := ComputeAxisFreqs(axesDims[2], theta)
// Build frequency lookup tables
posFreqsT := MakeFreqTable(maxIdx, freqsT, false)
posFreqsH := MakeFreqTable(maxIdx, freqsH, false)
posFreqsW := MakeFreqTable(maxIdx, freqsW, false)
negFreqsH := MakeFreqTable(maxIdx, freqsH, true)
negFreqsW := MakeFreqTable(maxIdx, freqsW, true)
// Image frequencies with scale_rope=True
imgLen := imgH * imgW
headDim := int32(len(freqsT)+len(freqsH)+len(freqsW)) * 2
imgFreqsData := make([]float32, imgLen*headDim)
hHalf := imgH / 2
wHalf := imgW / 2
idx := int32(0)
for y := int32(0); y < imgH; y++ {
for x := int32(0); x < imgW; x++ {
// Frame = 0
for i := 0; i < len(freqsT)*2; i++ {
imgFreqsData[idx+int32(i)] = posFreqsT[0][i]
}
idx += int32(len(freqsT) * 2)
// Height: scale_rope pattern
hNegCount := imgH - hHalf
if y < hNegCount {
negTableIdx := maxIdx - hNegCount + y
for i := 0; i < len(freqsH)*2; i++ {
imgFreqsData[idx+int32(i)] = negFreqsH[negTableIdx][i]
}
} else {
posIdx := y - hNegCount
for i := 0; i < len(freqsH)*2; i++ {
imgFreqsData[idx+int32(i)] = posFreqsH[posIdx][i]
}
}
idx += int32(len(freqsH) * 2)
// Width: scale_rope pattern
wNegCount := imgW - wHalf
if x < wNegCount {
negTableIdx := maxIdx - wNegCount + x
for i := 0; i < len(freqsW)*2; i++ {
imgFreqsData[idx+int32(i)] = negFreqsW[negTableIdx][i]
}
} else {
posIdx := x - wNegCount
for i := 0; i < len(freqsW)*2; i++ {
imgFreqsData[idx+int32(i)] = posFreqsW[posIdx][i]
}
}
idx += int32(len(freqsW) * 2)
}
}
imgFreqs := mlx.NewArray(imgFreqsData, []int32{imgLen, headDim})
imgFreqs = mlx.ToBFloat16(imgFreqs)
// Text frequencies
maxVidIdx := max(hHalf, wHalf)
txtFreqsData := make([]float32, txtLen*headDim)
idx = 0
for t := int32(0); t < txtLen; t++ {
pos := maxVidIdx + t
for i := 0; i < len(freqsT)*2; i++ {
txtFreqsData[idx+int32(i)] = posFreqsT[pos][i]
}
idx += int32(len(freqsT) * 2)
for i := 0; i < len(freqsH)*2; i++ {
txtFreqsData[idx+int32(i)] = posFreqsH[pos][i]
}
idx += int32(len(freqsH) * 2)
for i := 0; i < len(freqsW)*2; i++ {
txtFreqsData[idx+int32(i)] = posFreqsW[pos][i]
}
idx += int32(len(freqsW) * 2)
}
txtFreqs := mlx.NewArray(txtFreqsData, []int32{txtLen, headDim})
txtFreqs = mlx.ToBFloat16(txtFreqs)
return &RoPECache{
ImgFreqs: imgFreqs,
TxtFreqs: txtFreqs,
}
}
// ComputeAxisFreqs computes RoPE base frequencies for a given dimension.
func ComputeAxisFreqs(dim int32, theta float64) []float64 {
halfDim := dim / 2
freqs := make([]float64, halfDim)
for i := int32(0); i < halfDim; i++ {
freqs[i] = 1.0 / math.Pow(theta, float64(i)/float64(halfDim))
}
return freqs
}
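// Worked example (illustrative): with dim = 16 and theta = 10000, halfDim = 8
// and freqs[i] = 10000^(-i/8), so freqs = [1.0, 0.3162..., 0.1, 0.0316..., ...];
// a position p then rotates pair i by angle p*freqs[i] via the (cos, sin)
// tables built in MakeFreqTable below.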
// MakeFreqTable builds a table of cos/sin values for RoPE positions.
func MakeFreqTable(maxIdx int32, baseFreqs []float64, negative bool) [][]float32 {
table := make([][]float32, maxIdx)
for idx := int32(0); idx < maxIdx; idx++ {
var pos float64
if negative {
pos = float64(idx - maxIdx)
} else {
pos = float64(idx)
}
row := make([]float32, len(baseFreqs)*2)
for i, f := range baseFreqs {
angle := pos * f
row[i*2] = float32(math.Cos(angle))
row[i*2+1] = float32(math.Sin(angle))
}
table[idx] = row
}
return table
}
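// Worked example (illustrative): with negative = true, row idx holds position
// idx - maxIdx, i.e. positions -maxIdx..-1. PrepareRoPE indexes this table with
// maxIdx - count + y, giving position y - count; for imgH = 4 (hHalf = 2,
// hNegCount = 2) the rows receive positions [-2, -1, 0, 1], the zero-centered
// scale_rope layout.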
func max(a, b int32) int32 {
if a > b {
return a
}
return b
}
// PackLatents converts [B, C, H, W] latents to [B, L, C*p*p] patches, where p = patchSize (C*4 for the p = 2 case used here)
func PackLatents(latents *mlx.Array, patchSize int32) *mlx.Array {
shape := latents.Shape()
B := shape[0]
C := shape[1]
H := shape[2]
W := shape[3]
pH := H / patchSize
pW := W / patchSize
// [B, C, H, W] -> [B, C, pH, p, pW, p]
x := mlx.Reshape(latents, B, C, pH, patchSize, pW, patchSize)
// -> [B, pH, pW, C, p, p]
x = mlx.Transpose(x, 0, 2, 4, 1, 3, 5)
// -> [B, pH*pW, C*p*p]
return mlx.Reshape(x, B, pH*pW, C*patchSize*patchSize)
}
// UnpackLatents converts [B, L, C*p*p] patches back to [B, C, 1, H, W] (5D for the VAE), where p = patchSize
func UnpackLatents(patches *mlx.Array, H, W, patchSize int32) *mlx.Array {
shape := patches.Shape()
B := shape[0]
channels := shape[2] / (patchSize * patchSize)
pH := H / patchSize
pW := W / patchSize
// [B, L, C*p*p] -> [B, pH, pW, C, p, p]
x := mlx.Reshape(patches, B, pH, pW, channels, patchSize, patchSize)
// -> [B, C, pH, p, pW, p]
x = mlx.Transpose(x, 0, 3, 1, 4, 2, 5)
// -> [B, C, H, W]
x = mlx.Reshape(x, B, channels, pH*patchSize, pW*patchSize)
// Add temporal dimension for VAE: [B, C, 1, H, W]
return mlx.ExpandDims(x, 2)
}
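// Round-trip sketch (illustrative, not part of the original file): PackLatents
// and UnpackLatents invert each other up to the temporal axis added for the VAE.
//
//  latents := mlx.RandomNormal([]int32{1, 16, 8, 8}, 0) // [B, C, H, W]
//  packed := PackLatents(latents, 2)                    // [1, 16, 64]
//  restored := UnpackLatents(packed, 8, 8, 2)           // [1, 16, 1, 8, 8]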


@@ -1,117 +0,0 @@
package qwen_image
import (
"math"
"os"
"testing"
"github.com/ollama/ollama/x/imagegen/mlx"
)
// TestTransformerConfig tests configuration invariants.
func TestTransformerConfig(t *testing.T) {
cfg := defaultTransformerConfig()
// Property: hidden_dim = n_heads * head_dim
if cfg.HiddenDim != cfg.NHeads*cfg.HeadDim {
t.Errorf("hidden_dim != n_heads * head_dim: %d != %d * %d",
cfg.HiddenDim, cfg.NHeads, cfg.HeadDim)
}
// Property: axes_dims_rope sums to head_dim
var ropeSum int32
for _, d := range cfg.AxesDimsRope {
ropeSum += d
}
if ropeSum != cfg.HeadDim {
t.Errorf("axes_dims_rope sum != head_dim: %d != %d", ropeSum, cfg.HeadDim)
}
// Property: in_channels = out_channels * patch_size^2
expectedIn := cfg.OutChannels * cfg.PatchSize * cfg.PatchSize
if cfg.InChannels != expectedIn {
t.Errorf("in_channels != out_channels * patch_size^2: %d != %d", cfg.InChannels, expectedIn)
}
}
// TestTransformerRoPE tests RoPE frequency computation produces valid values.
func TestTransformerRoPE(t *testing.T) {
cfg := defaultTransformerConfig()
// Test with small image dimensions
imgH, imgW := int32(4), int32(4) // 4x4 latent = 16 patches
txtLen := int32(5)
ropeCache := PrepareRoPE(imgH, imgW, txtLen, cfg.AxesDimsRope)
mlx.Eval(ropeCache.ImgFreqs, ropeCache.TxtFreqs)
// Verify shapes: [seq_len, head_dim]
imgSeqLen := imgH * imgW
if ropeCache.ImgFreqs.Shape()[0] != imgSeqLen {
t.Errorf("ImgFreqs seq_len: got %d, want %d", ropeCache.ImgFreqs.Shape()[0], imgSeqLen)
}
if ropeCache.ImgFreqs.Shape()[1] != cfg.HeadDim {
t.Errorf("ImgFreqs head_dim: got %d, want %d", ropeCache.ImgFreqs.Shape()[1], cfg.HeadDim)
}
if ropeCache.TxtFreqs.Shape()[0] != txtLen {
t.Errorf("TxtFreqs seq_len: got %d, want %d", ropeCache.TxtFreqs.Shape()[0], txtLen)
}
// Verify values are finite
imgData := ropeCache.ImgFreqs.Data()
for i := 0; i < min(100, len(imgData)); i++ {
if math.IsNaN(float64(imgData[i])) || math.IsInf(float64(imgData[i]), 0) {
t.Errorf("ImgFreqs[%d] not finite: %v", i, imgData[i])
break
}
}
}
// TestTransformerForward tests full forward pass (integration test).
// Skips if model weights are not available.
func TestTransformerForward(t *testing.T) {
weightsPath := "../../../weights/Qwen-Image-2512/transformer"
if _, err := os.Stat(weightsPath); os.IsNotExist(err) {
t.Skip("Skipping: model weights not found at " + weightsPath)
}
transformer := &Transformer{}
if err := transformer.Load(weightsPath); err != nil {
t.Fatalf("Failed to load transformer: %v", err)
}
mlx.Keep(mlx.Collect(transformer)...)
cfg := transformer.Config
// Small test inputs
batchSize := int32(1)
imgH, imgW := int32(4), int32(4)
imgSeqLen := imgH * imgW
txtSeqLen := int32(5)
hiddenStates := mlx.RandomNormal([]int32{batchSize, imgSeqLen, cfg.InChannels}, 0)
encoderHiddenStates := mlx.RandomNormal([]int32{batchSize, txtSeqLen, cfg.JointAttentionDim}, 0)
timestep := mlx.NewArray([]float32{0.5}, []int32{batchSize})
ropeCache := PrepareRoPE(imgH, imgW, txtSeqLen, cfg.AxesDimsRope)
// Forward pass
out := transformer.Forward(hiddenStates, encoderHiddenStates, timestep, ropeCache.ImgFreqs, ropeCache.TxtFreqs)
mlx.Eval(out)
// Verify output shape: [batch, img_seq_len, patch_size^2 * out_channels], which equals in_channels by the config invariant
wantShape := []int32{batchSize, imgSeqLen, cfg.InChannels}
gotShape := out.Shape()
if gotShape[0] != wantShape[0] || gotShape[1] != wantShape[1] || gotShape[2] != wantShape[2] {
t.Errorf("output shape: got %v, want %v", gotShape, wantShape)
}
// Verify output is finite
outData := out.Data()
for i := 0; i < min(100, len(outData)); i++ {
if math.IsNaN(float64(outData[i])) || math.IsInf(float64(outData[i]), 0) {
t.Errorf("output[%d] not finite: %v", i, outData[i])
break
}
}
}


@@ -1,852 +0,0 @@
package qwen_image
import (
"fmt"
"math"
"path/filepath"
"github.com/ollama/ollama/x/imagegen/mlx"
"github.com/ollama/ollama/x/imagegen/safetensors"
)
// VAEConfig holds Qwen-Image VAE configuration
type VAEConfig struct {
ZDim int32 `json:"z_dim"` // 16
BaseDim int32 `json:"base_dim"` // 96
DimMult []int32 `json:"dim_mult"` // [1, 2, 4, 4]
NumResBlocks int32 `json:"num_res_blocks"` // 2
LatentsMean []float32 `json:"latents_mean"` // 16 values
LatentsStd []float32 `json:"latents_std"` // 16 values
TemperalDownsample []bool `json:"temperal_downsample"` // [false, true, true]; "temperal" matches the upstream config key spelling
}
// defaultVAEConfig returns config for Qwen-Image VAE
func defaultVAEConfig() *VAEConfig {
return &VAEConfig{
ZDim: 16,
BaseDim: 96,
DimMult: []int32{1, 2, 4, 4},
NumResBlocks: 2,
LatentsMean: []float32{
-0.7571, -0.7089, -0.9113, 0.1075,
-0.1745, 0.9653, -0.1517, 1.5508,
0.4134, -0.0715, 0.5517, -0.3632,
-0.1922, -0.9497, 0.2503, -0.2921,
},
LatentsStd: []float32{
2.8184, 1.4541, 2.3275, 2.6558,
1.2196, 1.7708, 2.6052, 2.0743,
3.2687, 2.1526, 2.8652, 1.5579,
1.6382, 1.1253, 2.8251, 1.916,
},
TemperalDownsample: []bool{false, true, true},
}
}
// CausalConv3d is a causal 3D convolution (for temporal causality)
type CausalConv3d struct {
Weight *mlx.Array
Bias *mlx.Array
BiasReshaped *mlx.Array // [1, C, 1, 1, 1]
KernelT int32
}
// newCausalConv3d creates a 3D causal conv
func newCausalConv3d(weights *safetensors.ModelWeights, prefix string) (*CausalConv3d, error) {
weight, err := weights.Get(prefix + ".weight")
if err != nil {
return nil, fmt.Errorf("weight not found: %s", prefix)
}
bias, _ := weights.Get(prefix + ".bias")
kernelT := weight.Shape()[2]
outC := weight.Shape()[0]
var biasReshaped *mlx.Array
if bias != nil {
biasReshaped = mlx.Reshape(bias, 1, outC, 1, 1, 1)
}
return &CausalConv3d{
Weight: weight,
Bias: bias,
BiasReshaped: biasReshaped,
KernelT: kernelT,
}, nil
}
// Forward applies causal 3D convolution
// x: [B, T, H, W, C] (channels-last, MLX format)
func (c *CausalConv3d) Forward(x *mlx.Array) *mlx.Array {
shape := c.Weight.Shape() // PyTorch format: [O, I, kT, kH, kW]
kernelT := shape[2]
kernelH := shape[3]
kernelW := shape[4]
// Causal temporal padding, same spatial padding
// Input is channels-last: [B, T, H, W, C]
padT := kernelT - 1
padH := kernelH / 2
padW := kernelW / 2
// Stage 1: Pad
{
x = pad3DChannelsLast(x, padT, 0, padH, padH, padW, padW)
mlx.Eval(x)
}
// Stage 2: Conv + bias
var out *mlx.Array
{
prev := x
weight := mlx.Transpose(c.Weight, 0, 2, 3, 4, 1)
out = mlx.Conv3d(x, weight, 1, 1, 1, 0, 0, 0)
if c.Bias != nil {
bias := mlx.Reshape(c.Bias, 1, 1, 1, 1, c.Bias.Dim(0))
out = mlx.Add(out, bias)
}
prev.Free()
mlx.Eval(out)
}
return out
}
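// Worked example (illustrative): with kernelT = 3, padT = 2 zero frames are
// prepended and none appended, so output frame t only sees input frames
// t-2..t, which is exactly the temporal causality the name promises.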
// RMSNorm3D applies RMS normalization over channels
// Works with channels-last [B, T, H, W, C] format
type RMSNorm3D struct {
Gamma *mlx.Array // [1, 1, 1, 1, C] for broadcasting
}
// newRMSNorm3D creates an RMS norm
func newRMSNorm3D(weights *safetensors.ModelWeights, prefix string, dim int32) (*RMSNorm3D, error) {
gamma, err := weights.Get(prefix + ".gamma")
if err != nil {
return nil, err
}
// Reshape for channels-last broadcasting: [1, 1, 1, 1, C]
gamma = mlx.Reshape(gamma, 1, 1, 1, 1, gamma.Dim(0))
return &RMSNorm3D{Gamma: gamma}, nil
}
// Forward applies RMS norm to channels-last input [B, T, H, W, C]
func (n *RMSNorm3D) Forward(x *mlx.Array) *mlx.Array {
// RMSNorm: x * rsqrt(mean(x^2) + eps) * gamma
normalized := mlx.RMSNormNoWeight(x, 1e-6)
return mlx.Mul(normalized, n.Gamma)
}
// ResBlock is a residual block with RMS norm and causal convs
type ResBlock struct {
Norm1 *RMSNorm3D
Conv1 *CausalConv3d
Norm2 *RMSNorm3D
Conv2 *CausalConv3d
Shortcut *CausalConv3d
}
// newResBlock creates a residual block
func newResBlock(weights *safetensors.ModelWeights, prefix string, inDim, outDim int32) (*ResBlock, error) {
norm1, err := newRMSNorm3D(weights, prefix+".norm1", inDim)
if err != nil {
return nil, err
}
conv1, err := newCausalConv3d(weights, prefix+".conv1")
if err != nil {
return nil, err
}
norm2, err := newRMSNorm3D(weights, prefix+".norm2", outDim)
if err != nil {
return nil, err
}
conv2, err := newCausalConv3d(weights, prefix+".conv2")
if err != nil {
return nil, err
}
var shortcut *CausalConv3d
if inDim != outDim {
shortcut, err = newCausalConv3d(weights, prefix+".conv_shortcut")
if err != nil {
return nil, err
}
}
return &ResBlock{
Norm1: norm1,
Conv1: conv1,
Norm2: norm2,
Conv2: conv2,
Shortcut: shortcut,
}, nil
}
// Forward applies the residual block
func (r *ResBlock) Forward(x *mlx.Array) *mlx.Array {
// Use h as working variable, keep x intact for residual (caller will free x)
// Conv handles its own pools, so we just need pools for non-conv operations
var h *mlx.Array
// Keep x so it survives Eval() cleanup - needed for residual connection
mlx.Keep(x)
// Stage 1: norm1 + silu
{
h = r.Norm1.Forward(x)
h = silu3D(h)
mlx.Eval(h)
}
// Stage 2: conv1 (handles its own pools)
{
prev := h
h = r.Conv1.Forward(h)
prev.Free()
}
// Stage 3: norm2 + silu
{
prev := h
h = r.Norm2.Forward(h)
h = silu3D(h)
prev.Free()
mlx.Eval(h)
}
// Stage 4: conv2 (handles its own pools)
{
prev := h
h = r.Conv2.Forward(h)
prev.Free()
}
// Residual connection (shortcut handles its own pools if present)
if r.Shortcut != nil {
shortcut := r.Shortcut.Forward(x)
h = mlx.Add(h, shortcut)
mlx.Eval(h)
} else {
h = mlx.Add(h, x)
mlx.Eval(h)
}
return h
}
// AttentionBlock is a 2D attention block
type AttentionBlock struct {
Norm *RMSNorm3D
ToQKV *mlx.Array
ToQKVBias *mlx.Array
Proj *mlx.Array
ProjBias *mlx.Array
Dim int32
}
// newAttentionBlock creates an attention block
func newAttentionBlock(weights *safetensors.ModelWeights, prefix string, dim int32) (*AttentionBlock, error) {
norm, err := newRMSNorm3D(weights, prefix+".norm", dim)
if err != nil {
return nil, err
}
toQKV, _ := weights.Get(prefix + ".to_qkv.weight")
toQKVBias, _ := weights.Get(prefix + ".to_qkv.bias")
proj, _ := weights.Get(prefix + ".proj.weight")
projBias, _ := weights.Get(prefix + ".proj.bias")
return &AttentionBlock{
Norm: norm,
ToQKV: toQKV,
ToQKVBias: toQKVBias,
Proj: proj,
ProjBias: projBias,
Dim: dim,
}, nil
}
// Forward applies 2D attention
// Input: [B, T, H, W, C] (channels-last)
func (a *AttentionBlock) Forward(x *mlx.Array) *mlx.Array {
shape := x.Shape()
B := shape[0]
T := shape[1]
H := shape[2]
W := shape[3]
C := shape[4]
identity := x
// Flatten to [B*T, 1, H, W, C] for norm
x = mlx.Reshape(x, B*T, 1, H, W, C)
x = a.Norm.Forward(x)
x = mlx.Reshape(x, B*T, H, W, C)
// Flatten spatial to [B*T, H*W, C]
x = mlx.Reshape(x, B*T, H*W, C)
// Linear to get Q, K, V: [B*T, H*W, 3*C]
// Weight is [outC, inC] or [outC, inC, 1, 1]
wShape := a.ToQKV.Shape()
var w *mlx.Array
if len(wShape) == 4 {
w = mlx.Reshape(a.ToQKV, wShape[0], wShape[1])
} else {
w = a.ToQKV
}
w = mlx.Transpose(w, 1, 0) // [inC, outC]
qkv := mlx.Linear(x, w) // [B*T, H*W, 3*C]
if a.ToQKVBias != nil {
qkv = mlx.Add(qkv, a.ToQKVBias)
}
qkv = mlx.Reshape(qkv, B*T, 1, H*W, 3*C)
q := mlx.Slice(qkv, []int32{0, 0, 0, 0}, []int32{B * T, 1, H * W, C})
k := mlx.Slice(qkv, []int32{0, 0, 0, C}, []int32{B * T, 1, H * W, 2 * C})
v := mlx.Slice(qkv, []int32{0, 0, 0, 2 * C}, []int32{B * T, 1, H * W, 3 * C})
scale := float32(1.0 / math.Sqrt(float64(C)))
out := mlx.ScaledDotProductAttention(q, k, v, scale, false)
// out: [B*T, 1, H*W, C]
out = mlx.Reshape(out, B*T, H*W, C)
// Project back
pShape := a.Proj.Shape()
var p *mlx.Array
if len(pShape) == 4 {
p = mlx.Reshape(a.Proj, pShape[0], pShape[1])
} else {
p = a.Proj
}
p = mlx.Transpose(p, 1, 0) // [inC, outC]
out = mlx.Linear(out, p) // [B*T, H*W, C]
if a.ProjBias != nil {
out = mlx.Add(out, a.ProjBias)
}
out = mlx.Reshape(out, B, T, H, W, C)
return mlx.Add(out, identity)
}
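// Note (added for clarity): this is single-head attention. q, k and v are
// shaped [B*T, 1, H*W, C] with one head of width C, which is why the scale
// above is 1/sqrt(C).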
// UpBlock handles upsampling in decoder
type UpBlock struct {
ResBlocks []*ResBlock
Upsampler *Upsample
}
// newUpBlock creates an up block
func newUpBlock(weights *safetensors.ModelWeights, prefix string, inDim, outDim int32, numBlocks int32, upsampleMode string) (*UpBlock, error) {
resBlocks := make([]*ResBlock, numBlocks+1)
currentDim := inDim
for i := int32(0); i <= numBlocks; i++ {
resPrefix := fmt.Sprintf("%s.resnets.%d", prefix, i)
block, err := newResBlock(weights, resPrefix, currentDim, outDim)
if err != nil {
return nil, err
}
resBlocks[i] = block
currentDim = outDim
}
var upsampler *Upsample
if upsampleMode != "" {
upsampler = newUpsample(weights, prefix+".upsamplers.0", outDim, upsampleMode)
}
return &UpBlock{
ResBlocks: resBlocks,
Upsampler: upsampler,
}, nil
}
// Forward applies up block with staged memory management
func (u *UpBlock) Forward(x *mlx.Array) *mlx.Array {
// ResBlocks handle their own pools
for _, block := range u.ResBlocks {
prev := x
x = block.Forward(x)
prev.Free()
}
// Upsampler handles its own pools
if u.Upsampler != nil {
prev := x
x = u.Upsampler.Forward(x)
prev.Free()
}
return x
}
// Upsample handles spatial upsampling
type Upsample struct {
Conv *mlx.Array
Bias *mlx.Array
Mode string
}
// newUpsample creates an upsampler
func newUpsample(weights *safetensors.ModelWeights, prefix string, dim int32, mode string) *Upsample {
conv, _ := weights.Get(prefix + ".resample.1.weight")
bias, _ := weights.Get(prefix + ".resample.1.bias")
return &Upsample{
Conv: conv,
Bias: bias,
Mode: mode,
}
}
// Forward applies upsampling to channels-last input [B, T, H, W, C]
// Uses staged pools to reduce peak memory during 2x upsampling
func (u *Upsample) Forward(x *mlx.Array) *mlx.Array {
shape := x.Shape()
B := shape[0]
T := shape[1]
H := shape[2]
W := shape[3]
C := shape[4]
outC := u.Conv.Shape()[0]
// Stage 1: 2x nearest neighbor upsample
{
x = mlx.Reshape(x, B*T, H, W, C)
x = upsample2xChannelsLast(x)
mlx.Eval(x)
}
// Stage 2: Conv + bias
{
prev := x
weight := mlx.Transpose(u.Conv, 0, 2, 3, 1)
x = conv2D3x3PaddedChannelsLast(x, weight)
if u.Bias != nil {
bias := mlx.Reshape(u.Bias, 1, 1, 1, outC)
x = mlx.Add(x, bias)
}
x = mlx.Reshape(x, B, T, H*2, W*2, outC)
prev.Free()
mlx.Eval(x)
}
return x
}
// MidBlock is the middle block of decoder
type MidBlock struct {
ResBlock1 *ResBlock
Attention *AttentionBlock
ResBlock2 *ResBlock
}
// newMidBlock creates a mid block
func newMidBlock(weights *safetensors.ModelWeights, prefix string, dim int32) (*MidBlock, error) {
res1, err := newResBlock(weights, prefix+".resnets.0", dim, dim)
if err != nil {
return nil, err
}
attn, err := newAttentionBlock(weights, prefix+".attentions.0", dim)
if err != nil {
return nil, err
}
res2, err := newResBlock(weights, prefix+".resnets.1", dim, dim)
if err != nil {
return nil, err
}
return &MidBlock{
ResBlock1: res1,
Attention: attn,
ResBlock2: res2,
}, nil
}
// Forward applies mid block
func (m *MidBlock) Forward(x *mlx.Array) *mlx.Array {
// Each component handles its own pools; we just free inputs
prev := x
x = m.ResBlock1.Forward(x)
prev.Free()
prev = x
x = m.Attention.Forward(x)
prev.Free()
prev = x
x = m.ResBlock2.Forward(x)
prev.Free()
return x
}
// VAEDecoder is the full VAE decoder
type VAEDecoder struct {
Config *VAEConfig
PostQuantConv *CausalConv3d
ConvIn *CausalConv3d
MidBlock *MidBlock
UpBlocks []*UpBlock
NormOut *RMSNorm3D
ConvOut *CausalConv3d
}
// Load loads the VAE decoder from a directory
func (m *VAEDecoder) Load(path string) error {
fmt.Println("Loading Qwen-Image VAE decoder...")
cfg := defaultVAEConfig()
m.Config = cfg
weights, err := safetensors.LoadModelWeights(path)
if err != nil {
return fmt.Errorf("weights: %w", err)
}
// Bulk load all weights as bf16
fmt.Print(" Loading weights as bf16... ")
if err := weights.Load(mlx.DtypeBFloat16); err != nil {
return fmt.Errorf("failed to load weights: %w", err)
}
fmt.Printf("✓ (%.1f GB)\n", float64(mlx.MetalGetActiveMemory())/(1024*1024*1024))
fmt.Print(" Loading post_quant_conv... ")
postQuantConv, err := newCausalConv3d(weights, "post_quant_conv")
if err != nil {
return err
}
m.PostQuantConv = postQuantConv
fmt.Println("✓")
fmt.Print(" Loading conv_in... ")
convIn, err := newCausalConv3d(weights, "decoder.conv_in")
if err != nil {
return err
}
m.ConvIn = convIn
fmt.Println("✓")
// Mid block (dim = base_dim * dim_mult[-1] = 96 * 4 = 384)
fmt.Print(" Loading mid_block... ")
midDim := cfg.BaseDim * cfg.DimMult[len(cfg.DimMult)-1]
midBlock, err := newMidBlock(weights, "decoder.mid_block", midDim)
if err != nil {
return err
}
m.MidBlock = midBlock
fmt.Println("✓")
// Up blocks (reversed dim_mult)
fmt.Print(" Loading up_blocks... ")
numUpBlocks := len(cfg.DimMult)
m.UpBlocks = make([]*UpBlock, numUpBlocks)
dimsMult := make([]int32, numUpBlocks+1)
dimsMult[0] = cfg.DimMult[numUpBlocks-1]
for i := 0; i < numUpBlocks; i++ {
dimsMult[i+1] = cfg.DimMult[numUpBlocks-1-i]
}
temporalUpsample := make([]bool, len(cfg.TemperalDownsample))
for i := range cfg.TemperalDownsample {
temporalUpsample[i] = cfg.TemperalDownsample[len(cfg.TemperalDownsample)-1-i]
}
for i := 0; i < numUpBlocks; i++ {
inDim := cfg.BaseDim * dimsMult[i]
outDim := cfg.BaseDim * dimsMult[i+1]
if i > 0 {
inDim = inDim / 2
}
upsampleMode := ""
if i < numUpBlocks-1 {
if temporalUpsample[i] {
upsampleMode = "upsample3d"
} else {
upsampleMode = "upsample2d"
}
}
prefix := fmt.Sprintf("decoder.up_blocks.%d", i)
upBlock, err := newUpBlock(weights, prefix, inDim, outDim, cfg.NumResBlocks, upsampleMode)
if err != nil {
return err
}
m.UpBlocks[i] = upBlock
}
fmt.Printf("✓ [%d blocks]\n", numUpBlocks)
fmt.Print(" Loading output layers... ")
normOut, err := newRMSNorm3D(weights, "decoder.norm_out", cfg.BaseDim)
if err != nil {
return err
}
m.NormOut = normOut
convOut, err := newCausalConv3d(weights, "decoder.conv_out")
if err != nil {
return err
}
m.ConvOut = convOut
fmt.Println("✓")
weights.ReleaseAll()
return nil
}
// LoadVAEDecoderFromPath is a convenience function to load VAE from path
func LoadVAEDecoderFromPath(path string) (*VAEDecoder, error) {
m := &VAEDecoder{}
if err := m.Load(filepath.Join(path, "vae")); err != nil {
return nil, err
}
return m, nil
}
// Decode converts latents to image
// z: [B, C, T, H, W] normalized latents
// Uses staged pools to free intermediate arrays and reduce peak memory.
func (vae *VAEDecoder) Decode(z *mlx.Array) *mlx.Array {
var x *mlx.Array
// Stage 1a: Denormalize and transpose
{
z = vae.Denormalize(z)
// Convert from channels-first [N, C, T, H, W] to channels-last [N, T, H, W, C]
z = mlx.Contiguous(mlx.Transpose(z, 0, 2, 3, 4, 1))
mlx.Eval(z)
}
// Stage 1b: PostQuantConv (handles its own pools)
x = vae.PostQuantConv.Forward(z)
z.Free()
// Stage 1c: ConvIn (handles its own pools)
{
prev := x
x = vae.ConvIn.Forward(x)
prev.Free()
}
// Stage 2: Mid block (handles its own pools)
x = vae.MidBlock.Forward(x)
// Stage 3: Up blocks (each handles its own pools)
for _, upBlock := range vae.UpBlocks {
x = upBlock.Forward(x)
}
// Stage 4a: NormOut + silu
{
prev := x
x = vae.NormOut.Forward(x)
x = silu3D(x)
prev.Free()
mlx.Eval(x)
}
// Stage 4b: ConvOut (handles its own pools)
{
prev := x
x = vae.ConvOut.Forward(x)
prev.Free()
}
// Stage 4c: Post-processing
{
prev := x
// Clamp to [-1, 1]
x = mlx.ClipScalar(x, -1.0, 1.0, true, true)
// Convert back from channels-last to channels-first
x = mlx.Contiguous(mlx.Transpose(x, 0, 4, 1, 2, 3))
prev.Free()
mlx.Eval(x)
}
return x
}
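// Usage sketch (illustrative, not part of the original file): decoding packed
// transformer output to pixels; latentH/latentW name the caller's latent grid.
// Spatial size grows 8x here (three 2x up blocks).
//
//  z := UnpackLatents(patches, latentH, latentW, 2) // [B, C, 1, latentH, latentW]
//  img := vae.Decode(z)                             // [B, 3, 1, latentH*8, latentW*8] in [-1, 1]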
// Denormalize reverses the normalization applied during encoding
func (vae *VAEDecoder) Denormalize(z *mlx.Array) *mlx.Array {
shape := z.Shape()
C := shape[1]
mean := mlx.NewArray(vae.Config.LatentsMean[:C], []int32{1, C, 1, 1, 1})
std := mlx.NewArray(vae.Config.LatentsStd[:C], []int32{1, C, 1, 1, 1})
mean = mlx.ToBFloat16(mean)
std = mlx.ToBFloat16(std)
return mlx.Add(mlx.Mul(z, std), mean)
}
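// Worked example (illustrative): encoding normalizes each latent channel as
// (x - mean[c]) / std[c], so Decode reverses it per channel; channel 0 maps
// z to z*2.8184 + (-0.7571) with the config values above.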
// Helper functions
func silu3D(x *mlx.Array) *mlx.Array {
return mlx.Mul(x, mlx.Sigmoid(x))
}
// pad3DChannelsLast pads a channels-last [B, T, H, W, C] tensor
func pad3DChannelsLast(x *mlx.Array, tBefore, tAfter, hBefore, hAfter, wBefore, wAfter int32) *mlx.Array {
if tBefore == 0 && tAfter == 0 && hBefore == 0 && hAfter == 0 && wBefore == 0 && wAfter == 0 {
return x
}
// Pad dims: [B before, B after, T before, T after, H before, H after, W before, W after, C before, C after]
return mlx.Pad(x, []int32{0, 0, tBefore, tAfter, hBefore, hAfter, wBefore, wAfter, 0, 0})
}
func pad2D(x *mlx.Array, hBefore, hAfter, wBefore, wAfter int32) *mlx.Array {
if hBefore == 0 && hAfter == 0 && wBefore == 0 && wAfter == 0 {
return x
}
return mlx.Pad(x, []int32{0, 0, 0, 0, hBefore, hAfter, wBefore, wAfter})
}
func conv2D1x1(x, weight *mlx.Array) *mlx.Array {
shape := x.Shape()
B := shape[0]
H := shape[2]
W := shape[3]
x = mlx.Transpose(x, 0, 2, 3, 1)
x = mlx.Reshape(x, B*H*W, shape[1])
wShape := weight.Shape()
var w *mlx.Array
if len(wShape) == 4 {
w = mlx.Reshape(weight, wShape[0], wShape[1])
} else {
w = weight
}
w = mlx.Transpose(w, 1, 0)
out := mlx.Linear(x, w)
outC := w.Dim(1)
out = mlx.Reshape(out, B, H, W, outC)
return mlx.Transpose(out, 0, 3, 1, 2)
}
func conv2D3x3Padded(x, weight *mlx.Array) *mlx.Array {
x = pad2D(x, 1, 1, 1, 1)
return conv2D(x, weight, 1, 1)
}
func conv2D(x, w *mlx.Array, strideH, strideW int32) *mlx.Array {
x = mlx.Transpose(x, 0, 2, 3, 1)
w = mlx.Transpose(w, 0, 2, 3, 1)
shape := x.Shape()
B := shape[0]
H := shape[1]
W := shape[2]
wShape := w.Shape()
Cout := wShape[0]
kH := wShape[1]
kW := wShape[2]
outH := (H-kH)/strideH + 1
outW := (W-kW)/strideW + 1
patches := extractPatches2D(x, kH, kW, strideH, strideW)
wFlat := mlx.Reshape(w, Cout, -1)
patches = mlx.Reshape(patches, B*outH*outW, -1)
out := mlx.Linear(patches, mlx.Transpose(wFlat, 1, 0))
out = mlx.Reshape(out, B, outH, outW, Cout)
return mlx.Transpose(out, 0, 3, 1, 2)
}
func extractPatches2D(x *mlx.Array, kH, kW, strideH, strideW int32) *mlx.Array {
shape := x.Shape()
B := shape[0]
H := shape[1]
W := shape[2]
C := shape[3]
outH := (H-kH)/strideH + 1
outW := (W-kW)/strideW + 1
patches := make([]*mlx.Array, outH*outW)
idx := 0
for i := int32(0); i < outH; i++ {
for j := int32(0); j < outW; j++ {
startH := i * strideH
startW := j * strideW
patch := mlx.Slice(x, []int32{0, startH, startW, 0}, []int32{B, startH + kH, startW + kW, C})
patch = mlx.Reshape(patch, B, kH*kW*C)
patches[idx] = patch
idx++
}
}
for i := range patches {
patches[i] = mlx.ExpandDims(patches[i], 1)
}
stacked := mlx.Concatenate(patches, 1)
return mlx.Reshape(stacked, B, outH, outW, kH*kW*C)
}
func upsample2x(x *mlx.Array) *mlx.Array {
shape := x.Shape()
H := shape[2]
W := shape[3]
rowIdxData := make([]int32, H*2)
for i := int32(0); i < H; i++ {
rowIdxData[i*2] = i
rowIdxData[i*2+1] = i
}
rowIdx := mlx.NewArrayInt32(rowIdxData, []int32{H * 2})
colIdxData := make([]int32, W*2)
for i := int32(0); i < W; i++ {
colIdxData[i*2] = i
colIdxData[i*2+1] = i
}
colIdx := mlx.NewArrayInt32(colIdxData, []int32{W * 2})
x = mlx.Take(x, rowIdx, 2)
x = mlx.Take(x, colIdx, 3)
return x
}
// upsample2xChannelsLast upsamples channels-last input [B, H, W, C] by 2x
func upsample2xChannelsLast(x *mlx.Array) *mlx.Array {
shape := x.Shape()
H := shape[1]
W := shape[2]
// Create repeat indices for rows
rowIdxData := make([]int32, H*2)
for i := int32(0); i < H; i++ {
rowIdxData[i*2] = i
rowIdxData[i*2+1] = i
}
rowIdx := mlx.NewArrayInt32(rowIdxData, []int32{H * 2})
// Create repeat indices for columns
colIdxData := make([]int32, W*2)
for i := int32(0); i < W; i++ {
colIdxData[i*2] = i
colIdxData[i*2+1] = i
}
colIdx := mlx.NewArrayInt32(colIdxData, []int32{W * 2})
// Take along H (axis 1) then W (axis 2)
x = mlx.Take(x, rowIdx, 1)
x = mlx.Take(x, colIdx, 2)
return x
}
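// Worked example (illustrative): a row [a, b] gathered with indices [0, 0, 1, 1]
// becomes [a, a, b, b]; applying this along H and then W replicates each pixel
// into a 2x2 block, i.e. nearest-neighbor 2x upsampling.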
// conv2D3x3PaddedChannelsLast applies 3x3 conv with padding to channels-last input [B, H, W, C]
// weight: [outC, kH, kW, inC] (MLX channels-last format)
func conv2D3x3PaddedChannelsLast(x, weight *mlx.Array) *mlx.Array {
// Pad spatial dims: [B, H, W, C] -> pad H and W by 1 each side
x = mlx.Pad(x, []int32{0, 0, 1, 1, 1, 1, 0, 0})
// Conv2d expects: input [B, H, W, inC], weight [outC, kH, kW, inC]
// stride=1, padding=0 (we already padded manually)
return mlx.Conv2d(x, weight, 1, 0)
}


@@ -1,112 +0,0 @@
package qwen_image
import (
"math"
"os"
"testing"
"github.com/ollama/ollama/x/imagegen/mlx"
)
// TestVAEConfig tests configuration invariants.
func TestVAEConfig(t *testing.T) {
cfg := defaultVAEConfig()
// Property: latents_mean and latents_std have z_dim elements
if int32(len(cfg.LatentsMean)) != cfg.ZDim {
t.Errorf("latents_mean length != z_dim: %d != %d", len(cfg.LatentsMean), cfg.ZDim)
}
if int32(len(cfg.LatentsStd)) != cfg.ZDim {
t.Errorf("latents_std length != z_dim: %d != %d", len(cfg.LatentsStd), cfg.ZDim)
}
// Property: dim_mult defines 4 stages
if len(cfg.DimMult) != 4 {
t.Errorf("dim_mult should have 4 stages: got %d", len(cfg.DimMult))
}
// Property: temperal_downsample has 3 elements (for 3 transitions)
if len(cfg.TemperalDownsample) != 3 {
t.Errorf("temperal_downsample should have 3 elements: got %d", len(cfg.TemperalDownsample))
}
}
// TestVAELatentsNormalization tests the latent denormalization values.
func TestVAELatentsNormalization(t *testing.T) {
cfg := defaultVAEConfig()
// Verify latents_std values are all positive
for i, std := range cfg.LatentsStd {
if std <= 0 {
t.Errorf("latents_std[%d] should be positive: %v", i, std)
}
}
// Verify values are in reasonable range (from actual model)
for i, mean := range cfg.LatentsMean {
if math.Abs(float64(mean)) > 5 {
t.Errorf("latents_mean[%d] seems too large: %v", i, mean)
}
}
for i, std := range cfg.LatentsStd {
if std > 10 {
t.Errorf("latents_std[%d] seems too large: %v", i, std)
}
}
}
// TestVAEDecoderForward tests full forward pass (integration test).
// Skips if model weights are not available.
func TestVAEDecoderForward(t *testing.T) {
weightsPath := "../../../weights/Qwen-Image-2512/vae"
if _, err := os.Stat(weightsPath); os.IsNotExist(err) {
t.Skip("Skipping: model weights not found at " + weightsPath)
}
vae := &VAEDecoder{}
if err := vae.Load(weightsPath); err != nil {
t.Fatalf("Failed to load VAE decoder: %v", err)
}
mlx.Keep(mlx.Collect(vae)...)
// Small test input: [B, C, T, H, W]
// After 3 spatial 2x upsampling stages (vae_scale_factor = 8), H/W multiply by 8
batchSize := int32(1)
channels := int32(16)
frames := int32(1)
latentH := int32(4)
latentW := int32(4)
latents := mlx.RandomNormal([]int32{batchSize, channels, frames, latentH, latentW}, 0)
// Decode
out := vae.Decode(latents)
mlx.Eval(out)
// Verify output shape: [B, 3, T, H*16, W*16]
outShape := out.Shape()
if outShape[0] != batchSize {
t.Errorf("batch size: got %d, want %d", outShape[0], batchSize)
}
if outShape[1] != 3 {
t.Errorf("channels: got %d, want 3", outShape[1])
}
if outShape[2] != frames {
t.Errorf("frames: got %d, want %d", outShape[2], frames)
}
expectedH := latentH * 8 // three 2x upsampling stages (only up_blocks 0-2 have upsamplers)
expectedW := latentW * 8
if outShape[3] != expectedH || outShape[4] != expectedW {
t.Errorf("spatial dims: got [%d, %d], want [%d, %d]",
outShape[3], outShape[4], expectedH, expectedW)
}
// Verify output is finite (Decode clamps values to [-1, 1])
outData := out.Data()
for i := 0; i < min(100, len(outData)); i++ {
if math.IsNaN(float64(outData[i])) || math.IsInf(float64(outData[i]), 0) {
t.Errorf("output[%d] not finite: %v", i, outData[i])
break
}
}
}


@@ -1,680 +0,0 @@
package qwen_image_edit
import (
"fmt"
"math"
"github.com/ollama/ollama/x/imagegen/mlx"
"github.com/ollama/ollama/x/imagegen/safetensors"
)
// CausalConv3d is a causal 3D convolution (for temporal causality)
type CausalConv3d struct {
Weight *mlx.Array
Bias *mlx.Array
BiasReshaped *mlx.Array // [1, C, 1, 1, 1]
KernelT int32
}
// newCausalConv3d creates a 3D causal conv
func newCausalConv3d(weights *safetensors.ModelWeights, prefix string) (*CausalConv3d, error) {
weight, err := weights.Get(prefix + ".weight")
if err != nil {
return nil, fmt.Errorf("weight not found: %s", prefix)
}
bias, _ := weights.Get(prefix + ".bias")
kernelT := weight.Shape()[2]
outC := weight.Shape()[0]
var biasReshaped *mlx.Array
if bias != nil {
biasReshaped = mlx.Reshape(bias, 1, outC, 1, 1, 1)
}
return &CausalConv3d{
Weight: weight,
Bias: bias,
BiasReshaped: biasReshaped,
KernelT: kernelT,
}, nil
}
// Forward applies causal 3D convolution (or 2D if weight is 4D)
// x: [B, T, H, W, C] (channels-last, MLX format)
func (c *CausalConv3d) Forward(x *mlx.Array) *mlx.Array {
shape := c.Weight.Shape()
// Handle both 5D (3D conv) and 4D (2D conv) weights
if len(shape) == 4 {
// 2D conv: [O, I, kH, kW] - need to apply per-frame
return c.forward2D(x)
}
// 3D conv: [O, I, kT, kH, kW]
kernelT := shape[2]
kernelH := shape[3]
kernelW := shape[4]
// Causal temporal padding, same spatial padding
padT := kernelT - 1
padH := kernelH / 2
padW := kernelW / 2
// Stage 1: Pad
{
x = pad3DChannelsLast(x, padT, 0, padH, padH, padW, padW)
mlx.Eval(x)
}
// Stage 2: Conv + bias
var out *mlx.Array
{
prev := x
weight := mlx.Transpose(c.Weight, 0, 2, 3, 4, 1)
out = mlx.Conv3d(x, weight, 1, 1, 1, 0, 0, 0)
if c.Bias != nil {
bias := mlx.Reshape(c.Bias, 1, 1, 1, 1, c.Bias.Dim(0))
out = mlx.Add(out, bias)
}
prev.Free()
mlx.Eval(out)
}
return out
}
// forward2D applies 2D conv per-frame for [B, T, H, W, C] input
func (c *CausalConv3d) forward2D(x *mlx.Array) *mlx.Array {
xShape := x.Shape()
B := xShape[0]
T := xShape[1]
H := xShape[2]
W := xShape[3]
C := xShape[4]
wShape := c.Weight.Shape() // [O, I, kH, kW]
kernelH := wShape[2]
kernelW := wShape[3]
outC := wShape[0]
padH := kernelH / 2
padW := kernelW / 2
// Reshape to [B*T, H, W, C] for 2D conv
x = mlx.Reshape(x, B*T, H, W, C)
// Pad spatially
x = mlx.Pad(x, []int32{0, 0, padH, padH, padW, padW, 0, 0})
// Apply 2D conv
weight := mlx.Transpose(c.Weight, 0, 2, 3, 1) // [O, I, kH, kW] -> [O, kH, kW, I]
x = mlx.Conv2d(x, weight, 1, 0)
if c.Bias != nil {
bias := mlx.Reshape(c.Bias, 1, 1, 1, outC)
x = mlx.Add(x, bias)
}
// Get output spatial dims
outH := H
outW := W
// Reshape back to [B, T, H, W, C]
x = mlx.Reshape(x, B, T, outH, outW, outC)
mlx.Eval(x)
return x
}
// RMSNorm3D applies RMS normalization over channels
type RMSNorm3D struct {
Gamma *mlx.Array // [1, 1, 1, 1, C] for broadcasting
}
// newRMSNorm3D creates an RMS norm
func newRMSNorm3D(weights *safetensors.ModelWeights, prefix string, dim int32) (*RMSNorm3D, error) {
gamma, err := weights.Get(prefix + ".gamma")
if err != nil {
return nil, err
}
gamma = mlx.Reshape(gamma, 1, 1, 1, 1, gamma.Dim(0))
return &RMSNorm3D{Gamma: gamma}, nil
}
// Forward applies RMS norm to channels-last input [B, T, H, W, C]
func (n *RMSNorm3D) Forward(x *mlx.Array) *mlx.Array {
normalized := mlx.RMSNormNoWeight(x, 1e-6)
return mlx.Mul(normalized, n.Gamma)
}
// ResBlock is a residual block with RMS norm and causal convs
type ResBlock struct {
Norm1 *RMSNorm3D
Conv1 *CausalConv3d
Norm2 *RMSNorm3D
Conv2 *CausalConv3d
Shortcut *CausalConv3d
}
// newResBlock creates a residual block
func newResBlock(weights *safetensors.ModelWeights, prefix string, inDim, outDim int32) (*ResBlock, error) {
norm1, err := newRMSNorm3D(weights, prefix+".norm1", inDim)
if err != nil {
return nil, err
}
conv1, err := newCausalConv3d(weights, prefix+".conv1")
if err != nil {
return nil, err
}
norm2, err := newRMSNorm3D(weights, prefix+".norm2", outDim)
if err != nil {
return nil, err
}
conv2, err := newCausalConv3d(weights, prefix+".conv2")
if err != nil {
return nil, err
}
var shortcut *CausalConv3d
if inDim != outDim {
shortcut, err = newCausalConv3d(weights, prefix+".conv_shortcut")
if err != nil {
return nil, err
}
}
return &ResBlock{
Norm1: norm1,
Conv1: conv1,
Norm2: norm2,
Conv2: conv2,
Shortcut: shortcut,
}, nil
}
// Forward applies the residual block
func (r *ResBlock) Forward(x *mlx.Array) *mlx.Array {
var h *mlx.Array
mlx.Keep(x)
// Stage 1: norm1 + silu
{
h = r.Norm1.Forward(x)
h = silu3D(h)
mlx.Eval(h)
}
// Stage 2: conv1
{
prev := h
h = r.Conv1.Forward(h)
prev.Free()
}
// Stage 3: norm2 + silu
{
prev := h
h = r.Norm2.Forward(h)
h = silu3D(h)
prev.Free()
mlx.Eval(h)
}
// Stage 4: conv2
{
prev := h
h = r.Conv2.Forward(h)
prev.Free()
}
// Residual connection
if r.Shortcut != nil {
shortcut := r.Shortcut.Forward(x)
h = mlx.Add(h, shortcut)
mlx.Eval(h)
} else {
h = mlx.Add(h, x)
mlx.Eval(h)
}
return h
}
// AttentionBlock is a 2D attention block
type AttentionBlock struct {
Norm *RMSNorm3D
ToQKV *mlx.Array
ToQKVBias *mlx.Array
Proj *mlx.Array
ProjBias *mlx.Array
Dim int32
}
// newAttentionBlock creates an attention block
func newAttentionBlock(weights *safetensors.ModelWeights, prefix string, dim int32) (*AttentionBlock, error) {
norm, err := newRMSNorm3D(weights, prefix+".norm", dim)
if err != nil {
return nil, err
}
toQKV, _ := weights.Get(prefix + ".to_qkv.weight")
toQKVBias, _ := weights.Get(prefix + ".to_qkv.bias")
proj, _ := weights.Get(prefix + ".proj.weight")
projBias, _ := weights.Get(prefix + ".proj.bias")
return &AttentionBlock{
Norm: norm,
ToQKV: toQKV,
ToQKVBias: toQKVBias,
Proj: proj,
ProjBias: projBias,
Dim: dim,
}, nil
}
// Forward applies 2D attention
// Input: [B, T, H, W, C] (channels-last)
func (a *AttentionBlock) Forward(x *mlx.Array) *mlx.Array {
shape := x.Shape()
B := shape[0]
T := shape[1]
H := shape[2]
W := shape[3]
C := shape[4]
identity := x
// Flatten to [B*T, 1, H, W, C] for norm
x = mlx.Reshape(x, B*T, 1, H, W, C)
x = a.Norm.Forward(x)
x = mlx.Reshape(x, B*T, H, W, C)
// Flatten spatial to [B*T, H*W, C]
x = mlx.Reshape(x, B*T, H*W, C)
// Linear to get Q, K, V
wShape := a.ToQKV.Shape()
var w *mlx.Array
if len(wShape) == 4 {
w = mlx.Reshape(a.ToQKV, wShape[0], wShape[1])
} else {
w = a.ToQKV
}
w = mlx.Transpose(w, 1, 0)
qkv := mlx.Linear(x, w)
if a.ToQKVBias != nil {
qkv = mlx.Add(qkv, a.ToQKVBias)
}
qkv = mlx.Reshape(qkv, B*T, 1, H*W, 3*C)
q := mlx.Slice(qkv, []int32{0, 0, 0, 0}, []int32{B * T, 1, H * W, C})
k := mlx.Slice(qkv, []int32{0, 0, 0, C}, []int32{B * T, 1, H * W, 2 * C})
v := mlx.Slice(qkv, []int32{0, 0, 0, 2 * C}, []int32{B * T, 1, H * W, 3 * C})
scale := float32(1.0 / math.Sqrt(float64(C)))
out := mlx.ScaledDotProductAttention(q, k, v, scale, false)
out = mlx.Reshape(out, B*T, H*W, C)
// Project back
pShape := a.Proj.Shape()
var p *mlx.Array
if len(pShape) == 4 {
p = mlx.Reshape(a.Proj, pShape[0], pShape[1])
} else {
p = a.Proj
}
p = mlx.Transpose(p, 1, 0)
out = mlx.Linear(out, p)
if a.ProjBias != nil {
out = mlx.Add(out, a.ProjBias)
}
out = mlx.Reshape(out, B, T, H, W, C)
return mlx.Add(out, identity)
}
// UpBlock handles upsampling in decoder
type UpBlock struct {
ResBlocks []*ResBlock
Upsampler *Upsample
}
// newUpBlock creates an up block
func newUpBlock(weights *safetensors.ModelWeights, prefix string, inDim, outDim int32, numBlocks int32, upsampleMode string) (*UpBlock, error) {
resBlocks := make([]*ResBlock, numBlocks+1)
currentDim := inDim
for i := int32(0); i <= numBlocks; i++ {
resPrefix := fmt.Sprintf("%s.resnets.%d", prefix, i)
block, err := newResBlock(weights, resPrefix, currentDim, outDim)
if err != nil {
return nil, err
}
resBlocks[i] = block
currentDim = outDim
}
var upsampler *Upsample
if upsampleMode != "" {
upsampler = newUpsample(weights, prefix+".upsamplers.0", outDim, upsampleMode)
}
return &UpBlock{
ResBlocks: resBlocks,
Upsampler: upsampler,
}, nil
}
// Forward applies up block
func (u *UpBlock) Forward(x *mlx.Array) *mlx.Array {
for _, block := range u.ResBlocks {
prev := x
x = block.Forward(x)
prev.Free()
}
if u.Upsampler != nil {
prev := x
x = u.Upsampler.Forward(x)
prev.Free()
}
return x
}
// Upsample handles spatial upsampling
type Upsample struct {
Conv *mlx.Array
Bias *mlx.Array
Mode string
}
// newUpsample creates an upsampler
func newUpsample(weights *safetensors.ModelWeights, prefix string, dim int32, mode string) *Upsample {
conv, _ := weights.Get(prefix + ".resample.1.weight")
bias, _ := weights.Get(prefix + ".resample.1.bias")
return &Upsample{
Conv: conv,
Bias: bias,
Mode: mode,
}
}
// Forward applies upsampling to channels-last input [B, T, H, W, C]
func (u *Upsample) Forward(x *mlx.Array) *mlx.Array {
shape := x.Shape()
B := shape[0]
T := shape[1]
H := shape[2]
W := shape[3]
C := shape[4]
outC := u.Conv.Shape()[0]
// Stage 1: 2x nearest neighbor upsample
{
x = mlx.Reshape(x, B*T, H, W, C)
x = upsample2xChannelsLast(x)
mlx.Eval(x)
}
// Stage 2: Conv + bias
{
prev := x
weight := mlx.Transpose(u.Conv, 0, 2, 3, 1)
x = conv2D3x3PaddedChannelsLast(x, weight)
if u.Bias != nil {
bias := mlx.Reshape(u.Bias, 1, 1, 1, outC)
x = mlx.Add(x, bias)
}
x = mlx.Reshape(x, B, T, H*2, W*2, outC)
prev.Free()
mlx.Eval(x)
}
return x
}
// MidBlock is the middle block
type MidBlock struct {
ResBlock1 *ResBlock
Attention *AttentionBlock
ResBlock2 *ResBlock
}
// newMidBlock creates a mid block
func newMidBlock(weights *safetensors.ModelWeights, prefix string, dim int32) (*MidBlock, error) {
res1, err := newResBlock(weights, prefix+".resnets.0", dim, dim)
if err != nil {
return nil, err
}
attn, err := newAttentionBlock(weights, prefix+".attentions.0", dim)
if err != nil {
return nil, err
}
res2, err := newResBlock(weights, prefix+".resnets.1", dim, dim)
if err != nil {
return nil, err
}
return &MidBlock{
ResBlock1: res1,
Attention: attn,
ResBlock2: res2,
}, nil
}
// Forward applies mid block
func (m *MidBlock) Forward(x *mlx.Array) *mlx.Array {
prev := x
x = m.ResBlock1.Forward(x)
prev.Free()
prev = x
x = m.Attention.Forward(x)
prev.Free()
prev = x
x = m.ResBlock2.Forward(x)
prev.Free()
return x
}
// Helper functions
func silu3D(x *mlx.Array) *mlx.Array {
return mlx.Mul(x, mlx.Sigmoid(x))
}
// pad3DChannelsLast pads a channels-last [B, T, H, W, C] tensor
func pad3DChannelsLast(x *mlx.Array, tBefore, tAfter, hBefore, hAfter, wBefore, wAfter int32) *mlx.Array {
if tBefore == 0 && tAfter == 0 && hBefore == 0 && hAfter == 0 && wBefore == 0 && wAfter == 0 {
return x
}
return mlx.Pad(x, []int32{0, 0, tBefore, tAfter, hBefore, hAfter, wBefore, wAfter, 0, 0})
}
// upsample2xChannelsLast upsamples channels-last input [B, H, W, C] by 2x
func upsample2xChannelsLast(x *mlx.Array) *mlx.Array {
shape := x.Shape()
H := shape[1]
W := shape[2]
rowIdxData := make([]int32, H*2)
for i := int32(0); i < H; i++ {
rowIdxData[i*2] = i
rowIdxData[i*2+1] = i
}
rowIdx := mlx.NewArrayInt32(rowIdxData, []int32{H * 2})
colIdxData := make([]int32, W*2)
for i := int32(0); i < W; i++ {
colIdxData[i*2] = i
colIdxData[i*2+1] = i
}
colIdx := mlx.NewArrayInt32(colIdxData, []int32{W * 2})
x = mlx.Take(x, rowIdx, 1)
x = mlx.Take(x, colIdx, 2)
return x
}
// conv2D3x3PaddedChannelsLast applies 3x3 conv with padding to channels-last input [B, H, W, C]
func conv2D3x3PaddedChannelsLast(x, weight *mlx.Array) *mlx.Array {
x = mlx.Pad(x, []int32{0, 0, 1, 1, 1, 1, 0, 0})
return mlx.Conv2d(x, weight, 1, 0)
}
// conv2DStrided applies conv with stride > 1 using manual patch extraction
// x: [B, H, W, C] (channels-last), weight: [O, kH, kW, I]
func conv2DStrided(x, weight *mlx.Array, stride int32) *mlx.Array {
shape := x.Shape()
B := shape[0]
H := shape[1]
W := shape[2]
wShape := weight.Shape()
Cout := wShape[0]
kH := wShape[1]
kW := wShape[2]
outH := (H - kH) / stride + 1
outW := (W - kW) / stride + 1
patches := extractPatches2DStrided(x, kH, kW, stride)
wFlat := mlx.Reshape(weight, Cout, -1)
patches = mlx.Reshape(patches, B*outH*outW, -1)
out := mlx.Linear(patches, mlx.Transpose(wFlat, 1, 0))
return mlx.Reshape(out, B, outH, outW, Cout)
}
// conv3DStrided applies 3D conv with strides using manual patch extraction
// x: [B, T, H, W, C] (channels-last), weight: [O, I, kT, kH, kW] (PyTorch format)
// strideT, strideH, strideW are the strides for each dimension
// Patches are extracted in [C, T, H, W] order to match Python's preprocessing
func conv3DStrided(x, weight *mlx.Array, strideT, strideH, strideW int32) *mlx.Array {
shape := x.Shape()
B := shape[0]
T := shape[1]
H := shape[2]
W := shape[3]
C := shape[4]
wShape := weight.Shape()
Cout := wShape[0]
// I := wShape[1]
kT := wShape[2]
kH := wShape[3]
kW := wShape[4]
// For temporal: if T < kT, we need to repeat frames temporally
// For single image with T=1 and kT=2, we duplicate the frame to T=kT
// Python Qwen2.5-VL duplicates the frame, not zero-pads
if T < kT {
// Tile along T dimension: [B, T, H, W, C] -> [B, kT, H, W, C]
x = mlx.Tile(x, []int32{1, kT, 1, 1, 1})
T = kT
}
outT := (T - kT) / strideT + 1
outH := (H - kH) / strideH + 1
outW := (W - kW) / strideW + 1
// Extract 3D patches in [C, T, H, W] order to match Python
patches := extractPatches3DStrided(x, kT, kH, kW, strideT, strideH, strideW)
// patches shape: [B, outT, outH, outW, C*kT*kH*kW]
// Weight is [O, I, kT, kH, kW] - flatten to [O, I*kT*kH*kW] to match patch order [C, T, H, W]
wFlat := mlx.Reshape(weight, Cout, -1) // [Cout, I*kT*kH*kW]
patches = mlx.Reshape(patches, B*outT*outH*outW, C*kT*kH*kW)
out := mlx.Linear(patches, mlx.Transpose(wFlat, 1, 0))
return mlx.Reshape(out, B, outT, outH, outW, Cout)
}
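// Worked shape example (illustrative): a single 28x28 RGB frame
// x = [1, 1, 28, 28, 3] with weight [O, 3, 2, 14, 14] and strides (2, 14, 14)
// is first tiled to T = 2, then yields outT = 1, outH = outW = 2, each patch
// flattening to C*kT*kH*kW = 3*2*14*14 = 1176 values before the projection.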
// extractPatches3DStrided extracts 3D patches with given strides
// Returns patches with values in [C, T, H, W] order to match Python's preprocessing
func extractPatches3DStrided(x *mlx.Array, kT, kH, kW, strideT, strideH, strideW int32) *mlx.Array {
shape := x.Shape()
B := shape[0]
T := shape[1]
H := shape[2]
W := shape[3]
C := shape[4]
outT := (T - kT) / strideT + 1
outH := (H - kH) / strideH + 1
outW := (W - kW) / strideW + 1
numPatches := outT * outH * outW
patches := make([]*mlx.Array, numPatches)
idx := 0
for t := int32(0); t < outT; t++ {
for i := int32(0); i < outH; i++ {
for j := int32(0); j < outW; j++ {
startT := t * strideT
startH := i * strideH
startW := j * strideW
// Extract patch: [B, kT, kH, kW, C]
patch := mlx.Slice(x,
[]int32{0, startT, startH, startW, 0},
[]int32{B, startT + kT, startH + kH, startW + kW, C})
// Transpose from [B, T, H, W, C] to [B, C, T, H, W] to match Python's order
patch = mlx.Transpose(patch, 0, 4, 1, 2, 3)
// Flatten to [B, C*T*H*W]
patch = mlx.Reshape(patch, B, C*kT*kH*kW)
patches[idx] = patch
idx++
}
}
}
for i := range patches {
patches[i] = mlx.ExpandDims(patches[i], 1)
}
stacked := mlx.Concatenate(patches, 1)
return mlx.Reshape(stacked, B, outT, outH, outW, C*kT*kH*kW)
}
// extractPatches2DStrided extracts patches with given stride
func extractPatches2DStrided(x *mlx.Array, kH, kW, stride int32) *mlx.Array {
shape := x.Shape()
B := shape[0]
H := shape[1]
W := shape[2]
C := shape[3]
outH := (H - kH) / stride + 1
outW := (W - kW) / stride + 1
patches := make([]*mlx.Array, outH*outW)
idx := 0
for i := int32(0); i < outH; i++ {
for j := int32(0); j < outW; j++ {
startH := i * stride
startW := j * stride
patch := mlx.Slice(x, []int32{0, startH, startW, 0}, []int32{B, startH + kH, startW + kW, C})
patch = mlx.Reshape(patch, B, kH*kW*C)
patches[idx] = patch
idx++
}
}
for i := range patches {
patches[i] = mlx.ExpandDims(patches[i], 1)
}
stacked := mlx.Concatenate(patches, 1)
return mlx.Reshape(stacked, B, outH, outW, kH*kW*C)
}
// layerNormNoAffine applies layer norm without learnable parameters
func layerNormNoAffine(x *mlx.Array, eps float32) *mlx.Array {
ndim := x.Ndim()
lastAxis := ndim - 1
mean := mlx.Mean(x, lastAxis, true)
xCentered := mlx.Sub(x, mean)
variance := mlx.Mean(mlx.Square(xCentered), lastAxis, true)
return mlx.Div(xCentered, mlx.Sqrt(mlx.AddScalar(variance, eps)))
}


@@ -1,473 +0,0 @@
package qwen_image_edit
import (
"fmt"
"image"
"image/color"
_ "image/jpeg"
_ "image/png"
"math"
"os"
"github.com/ollama/ollama/x/imagegen/mlx"
"golang.org/x/image/draw"
_ "golang.org/x/image/webp"
)
// loadImageFile loads an image from disk
func loadImageFile(path string) (image.Image, error) {
f, err := os.Open(path)
if err != nil {
return nil, fmt.Errorf("open image: %w", err)
}
defer f.Close()
img, _, err := image.Decode(f)
if err != nil {
return nil, fmt.Errorf("decode image: %w", err)
}
return img, nil
}
// imageToFloat32Pixels converts an image to a float32 pixel array [H, W, C] in [0, 1] range
func imageToFloat32Pixels(img image.Image, width, height int) []float32 {
pixels := make([]float32, width*height*3)
idx := 0
for y := 0; y < height; y++ {
for x := 0; x < width; x++ {
r, g, b, _ := img.At(x, y).RGBA()
pixels[idx] = float32(r) / 65535.0
pixels[idx+1] = float32(g) / 65535.0
pixels[idx+2] = float32(b) / 65535.0
idx += 3
}
}
return pixels
}
// normalizeImageNet applies ImageNet normalization to an image tensor
func (p *Processor) normalizeImageNet(arr *mlx.Array) *mlx.Array {
mean := mlx.NewArray(p.Config.ImageMean, []int32{1, 1, 3})
std := mlx.NewArray(p.Config.ImageStd, []int32{1, 1, 3})
return mlx.Div(mlx.Sub(arr, mean), std)
}
// prepareImageTensor transforms [H, W, C] to [B, C, H, W] and converts to bf16
func prepareImageTensor(arr *mlx.Array) *mlx.Array {
// Transpose to [C, H, W] and make contiguous
arr = mlx.Contiguous(mlx.Transpose(arr, 2, 0, 1))
// Add batch dimension [1, C, H, W]
arr = mlx.ExpandDims(arr, 0)
// Convert to bf16
arr = mlx.ToBFloat16(arr)
mlx.Eval(arr)
return arr
}
// clampFloat divides v by weightSum, clamps the result to [0, 255], and returns uint8
func clampFloat(v, weightSum float64) uint8 {
v /= weightSum
if v < 0 {
v = 0
}
if v > 255 {
v = 255
}
return uint8(math.Round(v))
}
// ImageDims holds dimensions for a preprocessed image
type ImageDims struct {
// Original image dimensions
OrigW, OrigH int32
// Condition image dimensions (for vision encoder)
CondW, CondH int32
// VAE image dimensions
VaeW, VaeH int32
// Latent dimensions (VAE dims / vae_scale_factor)
LatentW, LatentH int32
// Patch dimensions (latent dims / patch_size)
PatchW, PatchH int32
}
// ProcessorConfig holds image processor configuration
type ProcessorConfig struct {
// Condition image size (target pixel area for vision encoder input)
// Python: CONDITION_IMAGE_SIZE = 384 * 384 = 147456
// Pipeline resizes image to this area before passing to encode_prompt
ConditionImageSize int32
// VAE image size (target pixel area)
// Python: VAE_IMAGE_SIZE = 1024 * 1024 = 1048576
VAEImageSize int32
// Image normalization (ImageNet stats for vision encoder)
ImageMean []float32
ImageStd []float32
}
// defaultProcessorConfig returns default processor config
func defaultProcessorConfig() *ProcessorConfig {
return &ProcessorConfig{
ConditionImageSize: 384 * 384, // 147456 - matches Python CONDITION_IMAGE_SIZE
VAEImageSize: 1024 * 1024, // 1048576 - matches Python VAE_IMAGE_SIZE
ImageMean: []float32{0.48145466, 0.4578275, 0.40821073},
ImageStd: []float32{0.26862954, 0.26130258, 0.27577711},
}
}
// Processor handles image preprocessing for Qwen-Image-Edit
type Processor struct {
Config *ProcessorConfig
}
// Load loads the processor config
func (p *Processor) Load(path string) error {
p.Config = defaultProcessorConfig()
return nil
}
// LoadAndPreprocess loads an image and preprocesses it for both paths
// Returns: condImage (for vision encoder), vaeImage (for VAE encoding)
func (p *Processor) LoadAndPreprocess(imagePath string) (*mlx.Array, *mlx.Array, error) {
img, err := loadImageFile(imagePath)
if err != nil {
return nil, nil, err
}
bounds := img.Bounds()
origW := bounds.Dx()
origH := bounds.Dy()
ratio := float64(origW) / float64(origH)
// Calculate dimensions for condition image (vision encoder)
// Python pipeline does TWO resizes:
// 1. VaeImageProcessor.resize with Lanczos to CONDITION_IMAGE_SIZE (384x384 area)
// 2. Qwen2VLProcessor's smart_resize with Bicubic to multiple of 28
intermediateW, intermediateH := calculateDimensions(p.Config.ConditionImageSize, ratio, 32)
finalH, finalW := smartResize(intermediateH, intermediateW, 28, 56*56, 28*28*1280)
// Calculate dimensions for VAE image (1024x1024 area)
// Use multiple of 32 (vae_scale_factor * patch_size * 2 = 8 * 2 * 2 = 32)
vaeW, vaeH := calculateDimensions(p.Config.VAEImageSize, ratio, 32)
// Preprocess for condition (vision encoder) - two-step resize
condImage := p.preprocessImageTwoStep(img, intermediateW, intermediateH, finalW, finalH)
// Preprocess for VAE ([-1, 1] range, 5D tensor)
vaeImage := p.preprocessImageForVAE(img, vaeW, vaeH)
return condImage, vaeImage, nil
}
// preprocessImageLanczos does single-step Lanczos resize for vision encoder
// Matches Python VaeImageProcessor.resize with resample='lanczos' (the default)
// Used by edit_plus pipeline for multi-image input
// Returns: [B, C, H, W] normalized tensor
func (p *Processor) preprocessImageLanczos(img image.Image, width, height int32) *mlx.Array {
resized := resizeImageLanczos(img, int(width), int(height))
pixels := imageToFloat32Pixels(resized, int(width), int(height))
arr := mlx.NewArray(pixels, []int32{height, width, 3})
arr = p.normalizeImageNet(arr)
return prepareImageTensor(arr)
}
// preprocessImageTwoStep does two-step resize for vision encoder to match Python pipeline
// Step 1: Lanczos resize from original to intermediate size (VaeImageProcessor.resize)
// Step 2: Bicubic resize from intermediate to final size (Qwen2VLProcessor smart_resize)
// Returns: [B, C, H, W] normalized tensor
func (p *Processor) preprocessImageTwoStep(img image.Image, intermediateW, intermediateH, finalW, finalH int32) *mlx.Array {
intermediate := resizeImageLanczos(img, int(intermediateW), int(intermediateH))
resized := resizeImageBicubic(intermediate, int(finalW), int(finalH))
pixels := imageToFloat32Pixels(resized, int(finalW), int(finalH))
arr := mlx.NewArray(pixels, []int32{finalH, finalW, 3})
arr = p.normalizeImageNet(arr)
return prepareImageTensor(arr)
}
// preprocessImage converts image to tensor for vision encoder
// Returns: [B, C, H, W] normalized tensor
func (p *Processor) preprocessImage(img image.Image, width, height int32, normalize bool) *mlx.Array {
resized := resizeImageBicubic(img, int(width), int(height))
pixels := imageToFloat32Pixels(resized, int(width), int(height))
arr := mlx.NewArray(pixels, []int32{height, width, 3})
if normalize {
arr = p.normalizeImageNet(arr)
}
return prepareImageTensor(arr)
}
// preprocessImageForVAE converts image to tensor for VAE encoding
// Returns: [B, C, T, H, W] tensor in [-1, 1] range
func (p *Processor) preprocessImageForVAE(img image.Image, width, height int32) *mlx.Array {
resized := resizeImageLanczos(img, int(width), int(height))
pixels := imageToFloat32Pixels(resized, int(width), int(height))
arr := mlx.NewArray(pixels, []int32{height, width, 3})
// Scale to [-1, 1]: arr * 2 - 1
arr = mlx.MulScalar(arr, 2.0)
arr = mlx.AddScalar(arr, -1.0)
// Transpose to [C, H, W] and make contiguous
arr = mlx.Contiguous(mlx.Transpose(arr, 2, 0, 1))
// Add batch and temporal dimensions [1, C, 1, H, W]
arr = mlx.ExpandDims(arr, 0) // [1, C, H, W]
arr = mlx.ExpandDims(arr, 2) // [1, C, 1, H, W]
arr = mlx.ToBFloat16(arr)
mlx.Eval(arr)
return arr
}
// smartResize implements Python Qwen2VL processor's smart_resize logic
// Returns (resizedHeight, resizedWidth) that fit within min/max pixel constraints
func smartResize(height, width, factor, minPixels, maxPixels int32) (int32, int32) {
// Round to factor
hBar := int32(math.Round(float64(height)/float64(factor))) * factor
wBar := int32(math.Round(float64(width)/float64(factor))) * factor
// Ensure minimum factor size
if hBar < factor {
hBar = factor
}
if wBar < factor {
wBar = factor
}
// Check pixel constraints
total := hBar * wBar
if total > maxPixels {
// Scale down
beta := math.Sqrt(float64(maxPixels) / float64(total))
hBar = int32(math.Floor(float64(height)*beta/float64(factor))) * factor
wBar = int32(math.Floor(float64(width)*beta/float64(factor))) * factor
} else if total < minPixels {
// Scale up
beta := math.Sqrt(float64(minPixels) / float64(total))
hBar = int32(math.Ceil(float64(height)*beta/float64(factor))) * factor
wBar = int32(math.Ceil(float64(width)*beta/float64(factor))) * factor
}
return hBar, wBar
}
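// Worked example (illustrative): smartResize(383, 510, 28, 56*56, 28*28*1280)
// rounds to hBar = 14*28 = 392 and wBar = 18*28 = 504; 392*504 = 197568 lies
// within [3136, 1003520], so (392, 504) is returned unchanged.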
// calculateDimensions calculates width and height for a target area while maintaining ratio
// multiple: the value to round dimensions to (e.g., 28 for vision encoder with patch 14 and 2x2 merge)
func calculateDimensions(targetArea int32, ratio float64, multiple int32) (int32, int32) {
width := math.Sqrt(float64(targetArea) * ratio)
height := width / ratio
m := float64(multiple)
width = math.Round(width/m) * m
height = math.Round(height/m) * m
// Ensure minimum dimensions
if width < m {
width = m
}
if height < m {
height = m
}
return int32(width), int32(height)
}
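// Worked example (illustrative): calculateDimensions(147456, 1.5, 32) gives
// width = sqrt(147456*1.5) ~ 470.3 -> 15*32 = 480 and height = 470.3/1.5 ~ 313.5
// -> 10*32 = 320, preserving the 3:2 aspect ratio on 32-pixel multiples.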
// resizeImageLanczos resizes an image using Lanczos3 interpolation (matches PIL.LANCZOS)
func resizeImageLanczos(img image.Image, width, height int) image.Image {
bounds := img.Bounds()
dst := image.NewRGBA(image.Rect(0, 0, width, height))
// Lanczos3 kernel (a=3) to match PIL.LANCZOS
lanczos3 := &draw.Kernel{
Support: 3.0,
At: func(t float64) float64 {
if t == 0 {
return 1.0
}
if t < 0 {
t = -t
}
if t >= 3.0 {
return 0.0
}
// sinc(t) * sinc(t/3)
piT := math.Pi * t
return (math.Sin(piT) / piT) * (math.Sin(piT/3) / (piT / 3))
},
}
lanczos3.Scale(dst, dst.Bounds(), img, bounds, draw.Over, nil)
return dst
}
// resizeImageBicubic resizes an image using bicubic interpolation (matches PIL.BICUBIC)
// Uses separable interpolation with PIL's coordinate mapping for exact match
func resizeImageBicubic(img image.Image, width, height int) image.Image {
bounds := img.Bounds()
srcW := bounds.Dx()
srcH := bounds.Dy()
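// Note: the two passes below index pixels from 0, so this assumes img has
// zero-origin bounds (true for images decoded via image.Decode).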
// Convert to RGBA if needed
var src *image.RGBA
if rgba, ok := img.(*image.RGBA); ok {
src = rgba
} else {
src = image.NewRGBA(bounds)
for y := bounds.Min.Y; y < bounds.Max.Y; y++ {
for x := bounds.Min.X; x < bounds.Max.X; x++ {
src.Set(x, y, img.At(x, y))
}
}
}
// Keys cubic with a=-0.5 (PIL BICUBIC)
cubic := func(x float64) float64 {
if x < 0 {
x = -x
}
if x < 1 {
return 1.5*x*x*x - 2.5*x*x + 1
}
if x < 2 {
return -0.5*x*x*x + 2.5*x*x - 4*x + 2
}
return 0
}
// Horizontal pass: srcW -> width, keep srcH rows
temp := image.NewRGBA(image.Rect(0, 0, width, srcH))
for y := 0; y < srcH; y++ {
for dstX := 0; dstX < width; dstX++ {
// PIL coordinate mapping: center-to-center
srcXf := (float64(dstX)+0.5)*(float64(srcW)/float64(width)) - 0.5
baseX := int(math.Floor(srcXf))
var sumR, sumG, sumB, sumA, weightSum float64
for i := -1; i <= 2; i++ {
sx := baseX + i
if sx < 0 {
sx = 0
}
if sx >= srcW {
sx = srcW - 1
}
w := cubic(math.Abs(srcXf - float64(baseX+i)))
c := src.RGBAAt(sx, y)
sumR += float64(c.R) * w
sumG += float64(c.G) * w
sumB += float64(c.B) * w
sumA += float64(c.A) * w
weightSum += w
}
temp.SetRGBA(dstX, y, color.RGBA{
clampFloat(sumR, weightSum),
clampFloat(sumG, weightSum),
clampFloat(sumB, weightSum),
clampFloat(sumA, weightSum),
})
}
}
// Vertical pass: srcH -> height
dst := image.NewRGBA(image.Rect(0, 0, width, height))
for x := 0; x < width; x++ {
for dstY := 0; dstY < height; dstY++ {
srcYf := (float64(dstY)+0.5)*(float64(srcH)/float64(height)) - 0.5
baseY := int(math.Floor(srcYf))
var sumR, sumG, sumB, sumA, weightSum float64
for j := -1; j <= 2; j++ {
sy := baseY + j
if sy < 0 {
sy = 0
}
if sy >= srcH {
sy = srcH - 1
}
w := cubic(math.Abs(srcYf - float64(baseY+j)))
c := temp.RGBAAt(x, sy)
sumR += float64(c.R) * w
sumG += float64(c.G) * w
sumB += float64(c.B) * w
sumA += float64(c.A) * w
weightSum += w
}
dst.SetRGBA(x, dstY, color.RGBA{
clampFloat(sumR, weightSum),
clampFloat(sumG, weightSum),
clampFloat(sumB, weightSum),
clampFloat(sumA, weightSum),
})
}
}
return dst
}
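// Example of the center-to-center mapping used above: when downscaling 2x,
// destination pixel 0 samples srcXf = (0+0.5)*2 - 0.5 = 0.5, i.e. midway
// between source pixels 0 and 1, rather than srcXf = 0 as naive scaling would give.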
// LoadAndPreprocessMultiple loads multiple images and preprocesses them
// Returns: condImages (for vision encoder), vaeImages (for VAE encoding), dims (per-image dimensions)
func (p *Processor) LoadAndPreprocessMultiple(imagePaths []string) ([]*mlx.Array, []*mlx.Array, []ImageDims, error) {
const vaeScaleFactor int32 = 8
const patchSize int32 = 2
condImages := make([]*mlx.Array, len(imagePaths))
vaeImages := make([]*mlx.Array, len(imagePaths))
dims := make([]ImageDims, len(imagePaths))
for i, imagePath := range imagePaths {
img, err := loadImageFile(imagePath)
if err != nil {
return nil, nil, nil, fmt.Errorf("image %d: %w", i, err)
}
bounds := img.Bounds()
origW := int32(bounds.Dx())
origH := int32(bounds.Dy())
ratio := float64(origW) / float64(origH)
// Calculate dimensions for condition image (vision encoder)
// Python pipeline does TWO resizes:
// 1. VaeImageProcessor.resize with Lanczos to CONDITION_IMAGE_SIZE (384x384 area)
// 2. Qwen2VLProcessor's smart_resize with Bicubic to multiple of 28
intermediateW, intermediateH := calculateDimensions(p.Config.ConditionImageSize, ratio, 32)
condH, condW := smartResize(intermediateH, intermediateW, 28, 56*56, 28*28*1280)
// Calculate dimensions for VAE image (1024x1024 area)
vaeW, vaeH := calculateDimensions(p.Config.VAEImageSize, ratio, 32)
// Calculate derived dimensions
latentW := vaeW / vaeScaleFactor
latentH := vaeH / vaeScaleFactor
patchW := latentW / patchSize
patchH := latentH / patchSize
dims[i] = ImageDims{
OrigW: origW,
OrigH: origH,
CondW: condW,
CondH: condH,
VaeW: vaeW,
VaeH: vaeH,
LatentW: latentW,
LatentH: latentH,
PatchW: patchW,
PatchH: patchH,
}
fmt.Printf(" Image %d: orig=%dx%d, cond=%dx%d, vae=%dx%d, latent=%dx%d, patch=%dx%d\n",
i+1, origW, origH, condW, condH, vaeW, vaeH, latentW, latentH, patchW, patchH)
// Preprocess for condition (vision encoder) - two-step resize to match Python pipeline
condImages[i] = p.preprocessImageTwoStep(img, intermediateW, intermediateH, condW, condH)
// Preprocess for VAE ([-1, 1] range, 5D tensor)
vaeImages[i] = p.preprocessImageForVAE(img, vaeW, vaeH)
}
return condImages, vaeImages, dims, nil
}
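// Example dimension chain (a sketch, assuming VAEImageSize = 1024*1024 and a
// square input): calculateDimensions yields vae = 1024x1024 (already a multiple
// of 32), so latent = 1024/8 = 128 per side and patch = 128/2 = 64 per side,
// i.e. 64*64 = 4096 packed latent positions for that image.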

@@ -1,608 +0,0 @@
// Package qwen_image_edit implements the Qwen-Image-Edit diffusion model for image editing.
// It reuses components from qwen_image where possible.
package qwen_image_edit
import (
"context"
"fmt"
"path/filepath"
"time"
"github.com/ollama/ollama/x/imagegen/mlx"
"github.com/ollama/ollama/x/imagegen/models/qwen_image"
"github.com/ollama/ollama/x/imagegen/tokenizer"
)
// GenerateConfig holds all options for image editing.
type GenerateConfig struct {
Prompt string
NegativePrompt string // Unconditional prompt for CFG (empty string "" is valid)
CFGScale float32 // CFG enabled when > 1.0 (default: 4.0)
Width int32 // Output width (default: from input image)
Height int32 // Output height (default: from input image)
Steps int // Denoising steps (default: 50)
Seed int64 // Random seed
Progress ProgressFunc // Optional progress callback
}
// ProgressFunc is called during generation with step progress.
type ProgressFunc func(step, totalSteps int)
// Model represents a Qwen-Image-Edit diffusion model.
type Model struct {
ModelPath string
Tokenizer *tokenizer.Tokenizer
Processor *Processor // Image processor for vision encoder
TextEncoder *qwen_image.Qwen25VL // Qwen2.5-VL vision-language encoder (from qwen_image)
Transformer *qwen_image.Transformer // Reuse qwen_image transformer
VAE *VAE // Combined encoder + decoder
}
// Load loads the Qwen-Image-Edit model from a directory.
func (m *Model) Load(modelPath string) error {
fmt.Println("Loading Qwen-Image-Edit model...")
start := time.Now()
if mlx.GPUIsAvailable() {
mlx.SetDefaultDeviceGPU()
mlx.EnableCompile()
}
m.ModelPath = modelPath
// Load tokenizer from processor directory
fmt.Print(" Loading tokenizer... ")
processorPath := filepath.Join(modelPath, "processor")
tok, err := tokenizer.Load(processorPath)
if err != nil {
// Fallback to tokenizer directory
tokenizerPath := filepath.Join(modelPath, "tokenizer")
tok, err = tokenizer.Load(tokenizerPath)
if err != nil {
return fmt.Errorf("tokenizer: %w", err)
}
}
m.Tokenizer = tok
fmt.Println("✓")
// Load processor (image preprocessing config)
fmt.Print(" Loading processor... ")
m.Processor = &Processor{}
if err := m.Processor.Load(processorPath); err != nil {
return fmt.Errorf("processor: %w", err)
}
fmt.Println("✓")
// Load vision-language text encoder (Qwen2.5-VL from qwen_image package)
m.TextEncoder = &qwen_image.Qwen25VL{}
if err := m.TextEncoder.Load(filepath.Join(modelPath, "text_encoder")); err != nil {
return fmt.Errorf("text encoder: %w", err)
}
mlx.Eval(mlx.Collect(m.TextEncoder)...)
fmt.Printf(" (%.1f GB, peak %.1f GB)\n",
float64(mlx.MetalGetActiveMemory())/(1024*1024*1024),
float64(mlx.MetalGetPeakMemory())/(1024*1024*1024))
// Load transformer (reuse qwen_image)
m.Transformer = &qwen_image.Transformer{}
if err := m.Transformer.Load(filepath.Join(modelPath, "transformer")); err != nil {
return fmt.Errorf("transformer: %w", err)
}
mlx.Eval(mlx.Collect(m.Transformer)...)
fmt.Printf(" (%.1f GB, peak %.1f GB)\n",
float64(mlx.MetalGetActiveMemory())/(1024*1024*1024),
float64(mlx.MetalGetPeakMemory())/(1024*1024*1024))
// Load VAE (encoder + decoder)
m.VAE = &VAE{}
if err := m.VAE.Load(filepath.Join(modelPath, "vae")); err != nil {
return fmt.Errorf("VAE: %w", err)
}
mlx.Eval(mlx.Collect(m.VAE)...)
fmt.Printf(" (%.1f GB, peak %.1f GB)\n",
float64(mlx.MetalGetActiveMemory())/(1024*1024*1024),
float64(mlx.MetalGetPeakMemory())/(1024*1024*1024))
mem := mlx.MetalGetActiveMemory()
peak := mlx.MetalGetPeakMemory()
fmt.Printf(" Loaded in %.2fs (%.1f GB active, %.1f GB peak)\n",
time.Since(start).Seconds(),
float64(mem)/(1024*1024*1024),
float64(peak)/(1024*1024*1024))
return nil
}
// Edit edits an image based on a text prompt.
// inputImagePath: path to input image
// prompt: text description of desired edit
func (m *Model) Edit(inputImagePath string, prompt string, width, height int32, steps int, seed int64) (*mlx.Array, error) {
return m.EditFromConfig([]string{inputImagePath}, &GenerateConfig{
Prompt: prompt,
Width: width,
Height: height,
Steps: steps,
Seed: seed,
})
}
// EditFromConfig edits images using the unified config struct.
// Accepts one or more input images.
func (m *Model) EditFromConfig(inputImagePaths []string, cfg *GenerateConfig) (*mlx.Array, error) {
if len(inputImagePaths) == 0 {
return nil, fmt.Errorf("no input images provided")
}
start := time.Now()
result, err := m.edit(inputImagePaths, cfg)
if err != nil {
return nil, err
}
if cfg.NegativePrompt != "" {
fmt.Printf("Edited %d image(s) with CFG (scale=%.1f) in %.2fs (%d steps)\n",
len(inputImagePaths), cfg.CFGScale, time.Since(start).Seconds(), cfg.Steps)
} else {
fmt.Printf("Edited %d image(s) in %.2fs (%d steps)\n",
len(inputImagePaths), time.Since(start).Seconds(), cfg.Steps)
}
return result, nil
}
// EditImage implements model.ImageEditModel interface.
func (m *Model) EditImage(ctx context.Context, inputImagePath, prompt string, width, height int32, steps int, seed int64) (*mlx.Array, error) {
return m.Edit(inputImagePath, prompt, width, height, steps, seed)
}
// EditMultiImage edits using multiple source images.
// This matches diffusers' QwenImageEditPlusPipeline behavior.
func (m *Model) EditMultiImage(inputImagePaths []string, cfg *GenerateConfig) (*mlx.Array, error) {
return m.EditFromConfig(inputImagePaths, cfg)
}
// edit is the internal editing pipeline that handles one or more images.
func (m *Model) edit(inputImagePaths []string, cfg *GenerateConfig) (*mlx.Array, error) {
// Apply defaults
if cfg.Steps <= 0 {
cfg.Steps = 50
}
if cfg.CFGScale <= 0 {
cfg.CFGScale = 4.0
}
// Load and preprocess all input images
fmt.Printf("Loading %d image(s)...\n", len(inputImagePaths))
condImages, vaeImages, inputDims, err := m.Processor.LoadAndPreprocessMultiple(inputImagePaths)
if err != nil {
return nil, fmt.Errorf("preprocess images: %w", err)
}
for _, img := range condImages {
mlx.Keep(img)
}
for _, img := range vaeImages {
mlx.Keep(img)
}
mlx.Eval(append(condImages, vaeImages...)...)
useCFG := cfg.NegativePrompt != ""
tcfg := m.Transformer.Config
vaeScaleFactor := int32(8)
// Output dimensions - if not specified, use first input image dimensions
if cfg.Width <= 0 {
cfg.Width = inputDims[0].VaeW
}
if cfg.Height <= 0 {
cfg.Height = inputDims[0].VaeH
}
// Output (noise) latent dimensions
outLatentH := cfg.Height / vaeScaleFactor
outLatentW := cfg.Width / vaeScaleFactor
outPH := outLatentH / tcfg.PatchSize
outPW := outLatentW / tcfg.PatchSize
noiseSeqLen := outPH * outPW
imgSeqLen := noiseSeqLen
// Encode prompt with all images for conditioning
posEmb, _, _, err := m.TextEncoder.EncodePromptWithImages(m.Tokenizer, cfg.Prompt, condImages)
if err != nil {
return nil, fmt.Errorf("encoding prompt: %w", err)
}
mlx.Keep(posEmb)
mlx.Eval(posEmb)
var negEmb *mlx.Array
if useCFG {
negEmb, _, _, err = m.TextEncoder.EncodePromptWithImages(m.Tokenizer, cfg.NegativePrompt, condImages)
if err != nil {
return nil, fmt.Errorf("encoding negative prompt: %w", err)
}
mlx.Keep(negEmb)
mlx.Eval(negEmb)
}
// Pad sequences to same length for CFG
txtLen := posEmb.Shape()[1]
if useCFG {
negLen := negEmb.Shape()[1]
if negLen > txtLen {
txtLen = negLen
}
if posEmb.Shape()[1] < txtLen {
posEmb = padSequence(posEmb, txtLen)
}
if negEmb.Shape()[1] < txtLen {
negEmb = padSequence(negEmb, txtLen)
}
mlx.Keep(posEmb, negEmb)
mlx.Eval(posEmb, negEmb)
}
// Encode all input images to latents and concatenate
fmt.Println("Encoding images to latents...")
allImageLatentsPacked := make([]*mlx.Array, len(vaeImages))
for i, vaeImage := range vaeImages {
imageLatents := m.VAE.Encode(vaeImage)
imageLatents = m.VAE.Normalize(imageLatents)
imageLatents2D := mlx.Squeeze(imageLatents, 2)
packed := qwen_image.PackLatents(imageLatents2D, tcfg.PatchSize)
mlx.Keep(packed)
mlx.Eval(packed)
allImageLatentsPacked[i] = packed
}
imageLatentsPacked := mlx.Concatenate(allImageLatentsPacked, 1)
mlx.Keep(imageLatentsPacked)
mlx.Eval(imageLatentsPacked)
// Scheduler
scheduler := qwen_image.NewFlowMatchScheduler(qwen_image.DefaultSchedulerConfig())
scheduler.SetTimesteps(cfg.Steps, noiseSeqLen)
// Init noise latents in packed format
packedChannels := tcfg.OutChannels * tcfg.PatchSize * tcfg.PatchSize
packedNoise := scheduler.InitNoisePacked(1, noiseSeqLen, packedChannels, cfg.Seed)
latents := qwen_image.UnpackLatents(packedNoise, outLatentH, outLatentW, tcfg.PatchSize)
mlx.Eval(latents)
// RoPE cache
ropeCache := PrepareRoPEMultiImage(outPH, outPW, inputDims, txtLen, tcfg.AxesDimsRope)
mlx.Keep(ropeCache.ImgFreqs, ropeCache.TxtFreqs)
mlx.Eval(ropeCache.ImgFreqs, ropeCache.TxtFreqs)
// Denoising loop
fmt.Printf("Running denoising (%d steps)...\n", cfg.Steps)
for i := 0; i < cfg.Steps; i++ {
stepStart := time.Now()
if cfg.Progress != nil {
cfg.Progress(i+1, cfg.Steps)
}
t := scheduler.Timesteps[i]
timestep := mlx.ToBFloat16(mlx.NewArray([]float32{t}, []int32{1}))
mlx.Eval(timestep)
latents2D := mlx.Squeeze(latents, 2)
patches := qwen_image.PackLatents(latents2D, tcfg.PatchSize)
latentInput := mlx.Concatenate([]*mlx.Array{patches, imageLatentsPacked}, 1)
var output *mlx.Array
if useCFG {
posOutput := m.Transformer.Forward(latentInput, posEmb, timestep, ropeCache.ImgFreqs, ropeCache.TxtFreqs)
negOutput := m.Transformer.Forward(latentInput, negEmb, timestep, ropeCache.ImgFreqs, ropeCache.TxtFreqs)
posOutput = mlx.Slice(posOutput, []int32{0, 0, 0}, []int32{1, imgSeqLen, posOutput.Shape()[2]})
negOutput = mlx.Slice(negOutput, []int32{0, 0, 0}, []int32{1, imgSeqLen, negOutput.Shape()[2]})
output = applyCFGWithNormRescale(posOutput, negOutput, cfg.CFGScale)
} else {
output = m.Transformer.Forward(latentInput, posEmb, timestep, ropeCache.ImgFreqs, ropeCache.TxtFreqs)
output = mlx.Slice(output, []int32{0, 0, 0}, []int32{1, imgSeqLen, output.Shape()[2]})
}
noisePred := qwen_image.UnpackLatents(output, outLatentH, outLatentW, tcfg.PatchSize)
oldLatents := latents
latents = scheduler.Step(noisePred, latents, i)
mlx.Eval(latents)
oldLatents.Free()
fmt.Printf(" Step %d/%d: t=%.4f (%.2fs)\n", i+1, cfg.Steps, t, time.Since(stepStart).Seconds())
}
// Free denoising temporaries
posEmb.Free()
if negEmb != nil {
negEmb.Free()
}
ropeCache.ImgFreqs.Free()
ropeCache.TxtFreqs.Free()
imageLatentsPacked.Free()
// Decode latents
decoded := m.decodeAndPostprocess(latents)
latents.Free()
fmt.Printf(" Peak memory: %.2f GB\n", float64(mlx.MetalGetPeakMemory())/(1024*1024*1024))
return decoded, nil
}
// applyCFGWithNormRescale applies classifier-free guidance with norm rescaling.
// This prevents CFG from inflating magnitude too much.
func applyCFGWithNormRescale(posOutput, negOutput *mlx.Array, scale float32) *mlx.Array {
// Upcast to float32 for precision
posF32 := mlx.AsType(posOutput, mlx.DtypeFloat32)
negF32 := mlx.AsType(negOutput, mlx.DtypeFloat32)
// CFG: pred = neg + scale * (pos - neg)
diff := mlx.Sub(posF32, negF32)
scaledDiff := mlx.MulScalar(diff, scale)
combPred := mlx.Add(negF32, scaledDiff)
// Norm rescaling: rescale combined prediction to match conditional norm
condNorm := mlx.Sqrt(mlx.Sum(mlx.Square(posF32), -1, true))
combNorm := mlx.Sqrt(mlx.Sum(mlx.Square(combPred), -1, true))
output := mlx.Mul(combPred, mlx.Div(condNorm, combNorm))
mlx.Eval(output)
return mlx.ToBFloat16(output)
}
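// In symbols: pred = neg + s*(pos - neg), then output = pred * (||pos|| / ||pred||)
// with norms over the last axis. A 1-D sketch with s=2: pos=[3,4] (norm 5) and
// neg=[1,0] give pred=[5,8] (norm ≈ 9.43), rescaled to ≈ [2.65, 4.24] (norm 5 again).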
// decodeAndPostprocess denormalizes latents, decodes through VAE, and scales to [0,1].
func (m *Model) decodeAndPostprocess(latents *mlx.Array) *mlx.Array {
latents = m.VAE.Denormalize(latents)
decoded := m.VAE.Decode(latents)
// Post-process: squeeze temporal dim and rescale to [0, 1]
decoded = mlx.Squeeze(decoded, 2)
decoded = mlx.AddScalar(decoded, 1.0)
decoded = mlx.DivScalar(decoded, 2.0)
decoded = mlx.ClipScalar(decoded, 0.0, 1.0, true, true)
mlx.Eval(decoded)
return decoded
}
// padSequence pads a sequence tensor to the target length with zeros
func padSequence(x *mlx.Array, targetLen int32) *mlx.Array {
shape := x.Shape()
currentLen := shape[1]
if currentLen >= targetLen {
return x
}
padLen := targetLen - currentLen
// Pad on sequence dimension (axis 1)
return mlx.Pad(x, []int32{0, 0, 0, padLen, 0, 0})
}
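// Example: a prompt embedding of shape [1, 77, D] padded to targetLen=120
// becomes [1, 120, D], with rows 77..119 zero (D being whatever embedding
// width the text encoder produces).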
// LoadPersistent is a convenience constructor kept for backward compatibility.
func LoadPersistent(modelPath string) (*Model, error) {
m := &Model{}
if err := m.Load(modelPath); err != nil {
return nil, err
}
return m, nil
}
// PrepareRoPEMultiImage computes RoPE with interpolation for image editing.
// Handles single or multiple input images with different resolutions.
//
// Parameters:
// - outPH, outPW: output patch dimensions (noise latent resolution)
// - inputDims: patch dimensions for each input image [(pH1, pW1), (pH2, pW2), ...]
// - txtLen: text sequence length
// - axesDims: RoPE axis dimensions [16, 56, 56]
//
// Returns RoPE cache where:
// - ImgFreqs has (outPH*outPW + sum(inPH*inPW for each image)) positions
// - First outPH*outPW positions are for noise latents (standard RoPE at output res)
// - Following positions are for each input image (interpolated from output res)
func PrepareRoPEMultiImage(outPH, outPW int32, inputDims []ImageDims, txtLen int32, axesDims []int32) *qwen_image.RoPECache {
theta := float64(10000)
maxIdx := int32(4096)
// Compute base frequencies for each axis dimension
freqsT := qwen_image.ComputeAxisFreqs(axesDims[0], theta)
freqsH := qwen_image.ComputeAxisFreqs(axesDims[1], theta)
freqsW := qwen_image.ComputeAxisFreqs(axesDims[2], theta)
// Build frequency lookup tables
posFreqsT := qwen_image.MakeFreqTable(maxIdx, freqsT, false)
posFreqsH := qwen_image.MakeFreqTable(maxIdx, freqsH, false)
posFreqsW := qwen_image.MakeFreqTable(maxIdx, freqsW, false)
negFreqsT := qwen_image.MakeFreqTable(maxIdx, freqsT, true) // For frame -1 on last condition image
negFreqsH := qwen_image.MakeFreqTable(maxIdx, freqsH, true)
negFreqsW := qwen_image.MakeFreqTable(maxIdx, freqsW, true)
headDim := int32(len(freqsT)+len(freqsH)+len(freqsW)) * 2
// Helper to compute RoPE for a single position at output resolution with scale_rope
computePosFreqs := func(framePos, y, x int32) []float32 {
row := make([]float32, headDim)
idx := 0
// Frame position
for i := 0; i < len(freqsT)*2; i++ {
row[idx+i] = posFreqsT[framePos][i]
}
idx += len(freqsT) * 2
// Height with scale_rope centering (using OUTPUT dimensions)
outHHalf := outPH / 2
hNegCount := outPH - outHHalf
if y < hNegCount {
negTableIdx := maxIdx - hNegCount + y
for i := 0; i < len(freqsH)*2; i++ {
row[idx+i] = negFreqsH[negTableIdx][i]
}
} else {
posIdx := y - hNegCount
for i := 0; i < len(freqsH)*2; i++ {
row[idx+i] = posFreqsH[posIdx][i]
}
}
idx += len(freqsH) * 2
// Width with scale_rope centering (using OUTPUT dimensions)
outWHalf := outPW / 2
wNegCount := outPW - outWHalf
if x < wNegCount {
negTableIdx := maxIdx - wNegCount + x
for i := 0; i < len(freqsW)*2; i++ {
row[idx+i] = negFreqsW[negTableIdx][i]
}
} else {
posIdx := x - wNegCount
for i := 0; i < len(freqsW)*2; i++ {
row[idx+i] = posFreqsW[posIdx][i]
}
}
return row
}
// Helper to compute RoPE for frame -1 (used for last condition image)
// This matches Python's _compute_condition_freqs which uses freqs_neg[0][-1:]
computeNegFrameFreqs := func(y, x int32) []float32 {
row := make([]float32, headDim)
idx := 0
// Frame -1: use last row of negative frame frequencies
negFrameIdx := maxIdx - 1
for i := 0; i < len(freqsT)*2; i++ {
row[idx+i] = negFreqsT[negFrameIdx][i]
}
idx += len(freqsT) * 2
// Height with scale_rope centering (using OUTPUT dimensions)
outHHalf := outPH / 2
hNegCount := outPH - outHHalf
if y < hNegCount {
negTableIdx := maxIdx - hNegCount + y
for i := 0; i < len(freqsH)*2; i++ {
row[idx+i] = negFreqsH[negTableIdx][i]
}
} else {
posIdx := y - hNegCount
for i := 0; i < len(freqsH)*2; i++ {
row[idx+i] = posFreqsH[posIdx][i]
}
}
idx += len(freqsH) * 2
// Width with scale_rope centering (using OUTPUT dimensions)
outWHalf := outPW / 2
wNegCount := outPW - outWHalf
if x < wNegCount {
negTableIdx := maxIdx - wNegCount + x
for i := 0; i < len(freqsW)*2; i++ {
row[idx+i] = negFreqsW[negTableIdx][i]
}
} else {
posIdx := x - wNegCount
for i := 0; i < len(freqsW)*2; i++ {
row[idx+i] = posFreqsW[posIdx][i]
}
}
return row
}
// Total image sequence length: noise + all input images
noiseSeqLen := outPH * outPW
totalImgLen := noiseSeqLen
for _, dims := range inputDims {
totalImgLen += dims.PatchH * dims.PatchW
}
imgFreqsData := make([]float32, totalImgLen*headDim)
idx := int32(0)
// Segment 0: Noise latents - standard RoPE at output resolution (frame 0)
for y := int32(0); y < outPH; y++ {
for x := int32(0); x < outPW; x++ {
row := computePosFreqs(0, y, x)
copy(imgFreqsData[idx:], row)
idx += headDim
}
}
// Segments 1..N: Edit image latents - INTERPOLATED RoPE
// For single image: use frame 1 (matches original PrepareRoPEInterpolated)
// For multiple images: Python uses frame -1 for the LAST condition image
// (_compute_condition_freqs), positive indices for others.
numImages := len(inputDims)
lastImgIdx := numImages - 1
for imgIdx, dims := range inputDims {
inPH := dims.PatchH
inPW := dims.PatchW
// Determine frame index for this image
// Single image case: use frame 1 (like original PrepareRoPEInterpolated)
// Multi-image case: last image uses frame -1, others use frame 1, 2, etc.
useNegFrame := numImages > 1 && imgIdx == lastImgIdx
// Map each input position to an output position using linear interpolation
for y := int32(0); y < inPH; y++ {
for x := int32(0); x < inPW; x++ {
// Interpolate: map input (y, x) to output grid position
// This is the key fix from DiffSynth's forward_sampling
var yOut, xOut int32
if inPH == 1 {
yOut = 0
} else {
// Linear interpolation: y_out = y * (outPH - 1) / (inPH - 1)
yOut = y * (outPH - 1) / (inPH - 1)
}
if inPW == 1 {
xOut = 0
} else {
xOut = x * (outPW - 1) / (inPW - 1)
}
var row []float32
if useNegFrame {
// Last image in multi-image uses frame -1
row = computeNegFrameFreqs(yOut, xOut)
} else {
// Single image uses frame 1, multi-image uses frame 1, 2, etc.
frameIdx := int32(imgIdx + 1)
row = computePosFreqs(frameIdx, yOut, xOut)
}
copy(imgFreqsData[idx:], row)
idx += headDim
}
}
}
imgFreqs := mlx.NewArray(imgFreqsData, []int32{totalImgLen, headDim})
imgFreqs = mlx.ToBFloat16(imgFreqs)
// Text frequencies - start after max video index
maxVidIdx := max(outPH/2, outPW/2)
txtFreqsData := make([]float32, txtLen*headDim)
idx = 0
for t := int32(0); t < txtLen; t++ {
pos := maxVidIdx + t
for i := 0; i < len(freqsT)*2; i++ {
txtFreqsData[idx+int32(i)] = posFreqsT[pos][i]
}
idx += int32(len(freqsT) * 2)
for i := 0; i < len(freqsH)*2; i++ {
txtFreqsData[idx+int32(i)] = posFreqsH[pos][i]
}
idx += int32(len(freqsH) * 2)
for i := 0; i < len(freqsW)*2; i++ {
txtFreqsData[idx+int32(i)] = posFreqsW[pos][i]
}
idx += int32(len(freqsW) * 2)
}
txtFreqs := mlx.NewArray(txtFreqsData, []int32{txtLen, headDim})
txtFreqs = mlx.ToBFloat16(txtFreqs)
return &qwen_image.RoPECache{
ImgFreqs: imgFreqs,
TxtFreqs: txtFreqs,
}
}
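// Example layout (illustrative sizes): with outPH=outPW=4 and one input image
// whose PatchH=PatchW=2, ImgFreqs has 4*4 + 2*2 = 20 positions. The input grid
// is interpolated onto the output grid, e.g. y=1 of inPH=2 maps to
// yOut = 1*(4-1)/(2-1) = 3, so the 2x2 condition patches land on the corners
// of the output grid.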

@@ -1,225 +0,0 @@
package qwen_image_edit
import (
"math"
"testing"
"github.com/ollama/ollama/x/imagegen/mlx"
"github.com/ollama/ollama/x/imagegen/models/qwen_image"
)
// TestComputeAxisFreqs verifies frequency computation matches Python reference
func TestComputeAxisFreqs(t *testing.T) {
theta := float64(10000)
// Expected values from Python:
// freqs = 1.0 / (theta ** (np.arange(0, half_dim) / half_dim))
expectedFreqsT := []float64{
1.000000000000000, 0.316227766016838, 0.100000000000000, 0.031622776601684,
0.010000000000000, 0.003162277660168, 0.001000000000000, 0.000316227766017,
}
expectedFreqsH_first4 := []float64{
1.000000000000000, 0.719685673001152, 0.517947467923121, 0.372759372031494,
}
expectedFreqsH_last4 := []float64{
0.000372759372031, 0.000268269579528, 0.000193069772888, 0.000138949549437,
}
// Test temporal frequencies (dim=16)
freqsT := qwen_image.ComputeAxisFreqs(16, theta)
if len(freqsT) != 8 {
t.Fatalf("expected 8 temporal frequencies, got %d", len(freqsT))
}
for i, expected := range expectedFreqsT {
if diff := math.Abs(freqsT[i] - expected); diff > 1e-10 {
t.Errorf("freqsT[%d]: expected %.15f, got %.15f, diff %.2e", i, expected, freqsT[i], diff)
}
}
// Test height/width frequencies (dim=56)
freqsH := qwen_image.ComputeAxisFreqs(56, theta)
if len(freqsH) != 28 {
t.Fatalf("expected 28 height frequencies, got %d", len(freqsH))
}
for i, expected := range expectedFreqsH_first4 {
if diff := math.Abs(freqsH[i] - expected); diff > 1e-10 {
t.Errorf("freqsH[%d]: expected %.15f, got %.15f, diff %.2e", i, expected, freqsH[i], diff)
}
}
for i, expected := range expectedFreqsH_last4 {
idx := 24 + i // last 4 of 28
if diff := math.Abs(freqsH[idx] - expected); diff > 1e-10 {
t.Errorf("freqsH[%d]: expected %.15f, got %.15f, diff %.2e", idx, expected, freqsH[idx], diff)
}
}
}
// TestMakeFreqTable verifies the frequency lookup table for both positive and negative positions
func TestMakeFreqTable(t *testing.T) {
theta := float64(10000)
freqsT := qwen_image.ComputeAxisFreqs(16, theta)
maxIdx := int32(4096)
// Test positive table
posTable := qwen_image.MakeFreqTable(maxIdx, freqsT, false)
// Position 0 should give cos=1, sin=0 for all frequencies
for i := 0; i < len(freqsT)*2; i += 2 {
if posTable[0][i] != 1.0 {
t.Errorf("posTable[0][%d] (cos): expected 1.0, got %f", i, posTable[0][i])
}
if posTable[0][i+1] != 0.0 {
t.Errorf("posTable[0][%d] (sin): expected 0.0, got %f", i+1, posTable[0][i+1])
}
}
// Position 1, first frequency (1.0): angle = 1*1 = 1
// cos(1) = 0.5403, sin(1) = 0.8415
if diff := math.Abs(float64(posTable[1][0]) - 0.5403023058681398); diff > 1e-6 {
t.Errorf("posTable[1][0] (cos): expected 0.5403, got %f", posTable[1][0])
}
if diff := math.Abs(float64(posTable[1][1]) - 0.8414709848078965); diff > 1e-6 {
t.Errorf("posTable[1][1] (sin): expected 0.8415, got %f", posTable[1][1])
}
// Test negative table
negTable := qwen_image.MakeFreqTable(maxIdx, freqsT, true)
// negTable[4095] corresponds to position -1
// cos(-1) = cos(1), sin(-1) = -sin(1)
if diff := math.Abs(float64(negTable[4095][0]) - 0.5403023058681398); diff > 1e-6 {
t.Errorf("negTable[4095][0] (cos(-1)): expected 0.5403, got %f", negTable[4095][0])
}
if diff := math.Abs(float64(negTable[4095][1]) - (-0.8414709848078965)); diff > 1e-6 {
t.Errorf("negTable[4095][1] (sin(-1)): expected -0.8415, got %f", negTable[4095][1])
}
// negTable[4094] corresponds to position -2
// cos(-2) = cos(2), sin(-2) = -sin(2)
cos2 := math.Cos(2.0)
sin2 := math.Sin(2.0)
if diff := math.Abs(float64(negTable[4094][0]) - cos2); diff > 1e-6 {
t.Errorf("negTable[4094][0] (cos(-2)): expected %f, got %f", cos2, negTable[4094][0])
}
if diff := math.Abs(float64(negTable[4094][1]) - (-sin2)); diff > 1e-6 {
t.Errorf("negTable[4094][1] (sin(-2)): expected %f, got %f", -sin2, negTable[4094][1])
}
}
// TestPrepareRoPE_QwenImage verifies qwen_image.PrepareRoPE for single-segment case
func TestPrepareRoPE_QwenImage(t *testing.T) {
if !mlx.GPUIsAvailable() {
t.Skip("GPU not available")
}
mlx.SetDefaultDeviceCPU()
// 4x4 patch grid, single image
imgH, imgW := int32(4), int32(4)
txtLen := int32(5)
axesDims := []int32{16, 56, 56}
cache := qwen_image.PrepareRoPE(imgH, imgW, txtLen, axesDims)
mlx.Eval(cache.ImgFreqs, cache.TxtFreqs)
// Check shapes
imgShape := cache.ImgFreqs.Shape()
if imgShape[0] != 16 { // 4*4 patches
t.Errorf("ImgFreqs seq len: expected 16, got %d", imgShape[0])
}
// For single image (frame=0), all temporal values should be cos=1, sin=0
imgFreqsCPU := mlx.AsType(cache.ImgFreqs, mlx.DtypeFloat32)
mlx.Eval(imgFreqsCPU)
imgData := imgFreqsCPU.Data()
// Check first 16 values of patch 0 (temporal cos/sin pairs)
for i := 0; i < 16; i += 2 {
cosVal := imgData[i]
sinVal := imgData[i+1]
if diff := math.Abs(float64(cosVal - 1.0)); diff > 1e-5 {
t.Errorf("ImgFreqs[0][%d] (cos): expected 1.0, got %f", i, cosVal)
}
if diff := math.Abs(float64(sinVal - 0.0)); diff > 1e-5 {
t.Errorf("ImgFreqs[0][%d] (sin): expected 0.0, got %f", i+1, sinVal)
}
}
cache.ImgFreqs.Free()
cache.TxtFreqs.Free()
}
// TestScaleRopePositions verifies the centered position calculation for scale_rope=True
func TestScaleRopePositions(t *testing.T) {
// For a 4x4 grid with scale_rope=True:
// hHalf = 2, wHalf = 2
// hNegCount = 4 - 2 = 2 (positions 0,1 are negative)
// wNegCount = 4 - 2 = 2 (positions 0,1 are negative)
//
// Height positions:
// y=0: -(4-2) + 0 = -2
// y=1: -(4-2) + 1 = -1
// y=2: 2 - 2 = 0
// y=3: 3 - 2 = 1
//
// Same for width
pH, pW := int32(4), int32(4)
hHalf := pH / 2
wHalf := pW / 2
hNegCount := pH - hHalf
wNegCount := pW - wHalf
expectedH := []int32{-2, -1, 0, 1}
expectedW := []int32{-2, -1, 0, 1}
for y := int32(0); y < pH; y++ {
var hPos int32
if y < hNegCount {
hPos = -(pH - hHalf) + y
} else {
hPos = y - hNegCount
}
if hPos != expectedH[y] {
t.Errorf("y=%d: expected h_pos=%d, got %d", y, expectedH[y], hPos)
}
}
for x := int32(0); x < pW; x++ {
var wPos int32
if x < wNegCount {
wPos = -(pW - wHalf) + x
} else {
wPos = x - wNegCount
}
if wPos != expectedW[x] {
t.Errorf("x=%d: expected w_pos=%d, got %d", x, expectedW[x], wPos)
}
}
}
// TestRoPEHeadDimensions verifies the head dimension breakdown
func TestRoPEHeadDimensions(t *testing.T) {
// axes_dims_rope = [16, 56, 56]
// Each dimension uses half the values for frequencies
// So we get: 8 + 28 + 28 = 64 frequency values
// Each frequency produces cos + sin, so: 64 * 2 = 128 total values per position
axesDims := []int32{16, 56, 56}
expectedFreqs := (axesDims[0]/2 + axesDims[1]/2 + axesDims[2]/2)
expectedHeadDim := expectedFreqs * 2
if expectedFreqs != 64 {
t.Errorf("expected 64 frequency values, got %d", expectedFreqs)
}
if expectedHeadDim != 128 {
t.Errorf("expected head_dim=128, got %d", expectedHeadDim)
}
// This should match the transformer's attention head dimension
// hidden_size = 3072, num_heads = 24
// head_dim = 3072 / 24 = 128
}

@@ -1,640 +0,0 @@
package qwen_image_edit
import (
"fmt"
"github.com/ollama/ollama/x/imagegen/mlx"
"github.com/ollama/ollama/x/imagegen/safetensors"
)
// VAEConfig holds Qwen-Image VAE configuration
type VAEConfig struct {
ZDim int32 `json:"z_dim"` // 16
BaseDim int32 `json:"base_dim"` // 96
DimMult []int32 `json:"dim_mult"` // [1, 2, 4, 4]
NumResBlocks int32 `json:"num_res_blocks"` // 2
LatentsMean []float32 `json:"latents_mean"` // 16 values
LatentsStd []float32 `json:"latents_std"` // 16 values
TemperalDownsample []bool `json:"temperal_downsample"` // [false, true, true]; "temperal" (sic) matches the upstream config key spelling
}
// defaultVAEConfig returns config for Qwen-Image VAE
func defaultVAEConfig() *VAEConfig {
return &VAEConfig{
ZDim: 16,
BaseDim: 96,
DimMult: []int32{1, 2, 4, 4},
NumResBlocks: 2,
LatentsMean: []float32{
-0.7571, -0.7089, -0.9113, 0.1075,
-0.1745, 0.9653, -0.1517, 1.5508,
0.4134, -0.0715, 0.5517, -0.3632,
-0.1922, -0.9497, 0.2503, -0.2921,
},
LatentsStd: []float32{
2.8184, 1.4541, 2.3275, 2.6558,
1.2196, 1.7708, 2.6052, 2.0743,
3.2687, 2.1526, 2.8652, 1.5579,
1.6382, 1.1253, 2.8251, 1.916,
},
TemperalDownsample: []bool{false, true, true},
}
}
// VAE is the full VAE with encoder and decoder
type VAE struct {
Config *VAEConfig
Encoder *VAEEncoder
Decoder *VAEDecoder
}
// Load loads the VAE from a directory
func (m *VAE) Load(path string) error {
fmt.Println("Loading Qwen-Image-Edit VAE (encoder + decoder)...")
cfg := defaultVAEConfig()
m.Config = cfg
weights, err := safetensors.LoadModelWeights(path)
if err != nil {
return fmt.Errorf("weights: %w", err)
}
// Load weights as f32 for quality (matches Python default behavior)
// VAE decoder precision is critical for final image quality
fmt.Print(" Loading weights as f32... ")
if err := weights.Load(mlx.DtypeFloat32); err != nil {
return fmt.Errorf("failed to load weights: %w", err)
}
fmt.Printf("✓ (%.1f GB)\n", float64(mlx.MetalGetActiveMemory())/(1024*1024*1024))
// Load encoder
fmt.Print(" Loading encoder... ")
m.Encoder = &VAEEncoder{}
if err := m.Encoder.loadFromWeights(weights, cfg); err != nil {
return fmt.Errorf("encoder: %w", err)
}
fmt.Println("✓")
// Load decoder
fmt.Print(" Loading decoder... ")
m.Decoder = &VAEDecoder{}
if err := m.Decoder.loadFromWeights(weights, cfg); err != nil {
return fmt.Errorf("decoder: %w", err)
}
fmt.Println("✓")
weights.ReleaseAll()
return nil
}
// Encode encodes an image to latents
// x: [B, C, T, H, W] image tensor in [-1, 1] range
// Returns: [B, C, T, H/8, W/8] latents (unnormalized)
func (m *VAE) Encode(x *mlx.Array) *mlx.Array {
return m.Encoder.Encode(x)
}
// Decode decodes latents to image
// z: [B, C, T, H, W] latents (denormalized)
// Returns: [B, C, T, H*8, W*8] image in [-1, 1]
func (m *VAE) Decode(z *mlx.Array) *mlx.Array {
return m.Decoder.Decode(z)
}
// Normalize applies latent normalization
// Input z should be f32 (from VAE encoder), output is f32 for transformer
func (m *VAE) Normalize(z *mlx.Array) *mlx.Array {
shape := z.Shape()
C := shape[1]
mean := mlx.NewArray(m.Config.LatentsMean[:C], []int32{1, C, 1, 1, 1})
std := mlx.NewArray(m.Config.LatentsStd[:C], []int32{1, C, 1, 1, 1})
// Mean/std are f32, will match z dtype through broadcasting
return mlx.Div(mlx.Sub(z, mean), std)
}
// Denormalize reverses latent normalization
// Input z is bf16 (from transformer), output converted to f32 for VAE decoder
func (m *VAE) Denormalize(z *mlx.Array) *mlx.Array {
shape := z.Shape()
C := shape[1]
// Convert latents to f32 for VAE decoder quality
z = mlx.AsType(z, mlx.DtypeFloat32)
mean := mlx.NewArray(m.Config.LatentsMean[:C], []int32{1, C, 1, 1, 1})
std := mlx.NewArray(m.Config.LatentsStd[:C], []int32{1, C, 1, 1, 1})
return mlx.Add(mlx.Mul(z, std), mean)
}
// VAEEncoder is the encoder part of the VAE
// The encoder uses a flat structure where down_blocks contains a mix of ResBlocks and Downsamplers:
// - Blocks 0,1: ResBlocks (base_dim)
// - Block 2: Downsample
// - Blocks 3,4: ResBlocks (base_dim*2)
// - Block 5: Downsample + temporal
// - Blocks 6,7: ResBlocks (base_dim*4)
// - Block 8: Downsample + temporal
// - Blocks 9,10: ResBlocks (base_dim*4)
type VAEEncoder struct {
Config *VAEConfig
ConvIn *CausalConv3d
Blocks []EncoderBlock // Flat list of ResBlocks and Downsamplers
MidBlock *MidBlock
NormOut *RMSNorm3D
ConvOut *CausalConv3d
QuantConv *CausalConv3d
}
// EncoderBlock is either a ResBlock or a Downsample
type EncoderBlock interface {
Forward(x *mlx.Array) *mlx.Array
IsDownsample() bool
}
// EncoderResBlock wraps ResBlock
type EncoderResBlock struct {
*ResBlock
}
func (b *EncoderResBlock) IsDownsample() bool { return false }
// EncoderDownsample is a downsample layer
type EncoderDownsample struct {
Resample *CausalConv3d
TimeConv *CausalConv3d // Optional temporal downsample
}
func (d *EncoderDownsample) IsDownsample() bool { return true }
func (d *EncoderDownsample) Forward(x *mlx.Array) *mlx.Array {
// Spatial downsample with stride 2
// WAN VAE uses: ZeroPad2d(0,1,0,1) + Conv2d(3x3, stride=2)
x = d.forwardSpatialDownsample(x)
// NOTE: In WAN VAE, time_conv is ONLY used in streaming/chunked mode
// with feat_cache. For single-frame encoding (T=1), time_conv is skipped.
// The Python forward checks: if feat_cache is not None ... then use time_conv
// Since we don't support streaming, we skip time_conv entirely.
return x
}
// forwardSpatialDownsample applies 2D conv with stride 2 for spatial downsampling
func (d *EncoderDownsample) forwardSpatialDownsample(x *mlx.Array) *mlx.Array {
xShape := x.Shape()
B := xShape[0]
T := xShape[1]
H := xShape[2]
W := xShape[3]
C := xShape[4]
wShape := d.Resample.Weight.Shape()
outC := wShape[0]
// Reshape to [B*T, H, W, C] for 2D conv
x = mlx.Reshape(x, B*T, H, W, C)
// Asymmetric padding: pad right and bottom by 1 (WAN VAE style)
// ZeroPad2d(0, 1, 0, 1) means (left=0, right=1, top=0, bottom=1)
x = mlx.Pad(x, []int32{0, 0, 0, 1, 0, 1, 0, 0}) // [B, H, W, C] -> pad H and W
// Apply 2D conv with stride 2
weight := mlx.Transpose(d.Resample.Weight, 0, 2, 3, 1) // [O, I, kH, kW] -> [O, kH, kW, I]
x = conv2DStrided(x, weight, 2)
if d.Resample.Bias != nil {
bias := mlx.Reshape(d.Resample.Bias, 1, 1, 1, outC)
x = mlx.Add(x, bias)
}
// Output dims after stride 2: (H+1)/2, (W+1)/2
outH := (H + 1) / 2
outW := (W + 1) / 2
// Reshape back to [B, T, H', W', C]
x = mlx.Reshape(x, B, T, outH, outW, outC)
mlx.Eval(x)
return x
}
// loadFromWeights loads the encoder from pre-loaded weights
func (e *VAEEncoder) loadFromWeights(weights *safetensors.ModelWeights, cfg *VAEConfig) error {
e.Config = cfg
// Conv in
convIn, err := newCausalConv3d(weights, "encoder.conv_in")
if err != nil {
return err
}
e.ConvIn = convIn
// Encoder uses flat block structure:
// dim_mult = [1, 2, 4, 4], num_res_blocks = 2, temporal_downsample = [false, true, true]
// Block layout: res,res,down, res,res,down+t, res,res,down+t, res,res
// That's 11 blocks: 0,1=res, 2=down, 3,4=res, 5=down+t, 6,7=res, 8=down+t, 9,10=res
e.Blocks = make([]EncoderBlock, 0, 11)
// Track dimensions
dims := []int32{cfg.BaseDim, cfg.BaseDim * 2, cfg.BaseDim * 4, cfg.BaseDim * 4}
blockIdx := 0
for stage := 0; stage < len(cfg.DimMult); stage++ {
inDim := cfg.BaseDim
if stage > 0 {
inDim = dims[stage-1]
}
outDim := dims[stage]
// ResBlocks for this stage (num_res_blocks per stage)
for r := int32(0); r < cfg.NumResBlocks; r++ {
prefix := fmt.Sprintf("encoder.down_blocks.%d", blockIdx)
currentInDim := inDim
if r > 0 {
currentInDim = outDim
}
block, err := newEncoderResBlock(weights, prefix, currentInDim, outDim)
if err != nil {
return fmt.Errorf("encoder res block %d: %w", blockIdx, err)
}
e.Blocks = append(e.Blocks, block)
blockIdx++
}
// Downsample after each stage except the last
if stage < len(cfg.DimMult)-1 {
prefix := fmt.Sprintf("encoder.down_blocks.%d", blockIdx)
down, err := newEncoderDownsample(weights, prefix, cfg.TemperalDownsample[stage])
if err != nil {
return fmt.Errorf("encoder downsample %d: %w", blockIdx, err)
}
e.Blocks = append(e.Blocks, down)
blockIdx++
}
}
// Mid block
midDim := cfg.BaseDim * cfg.DimMult[len(cfg.DimMult)-1]
midBlock, err := newMidBlock(weights, "encoder.mid_block", midDim)
if err != nil {
return err
}
e.MidBlock = midBlock
// Norm out
normOut, err := newRMSNorm3D(weights, "encoder.norm_out", midDim)
if err != nil {
return err
}
e.NormOut = normOut
// Conv out
convOut, err := newCausalConv3d(weights, "encoder.conv_out")
if err != nil {
return err
}
e.ConvOut = convOut
// Quant conv
quantConv, err := newCausalConv3d(weights, "quant_conv")
if err != nil {
return err
}
e.QuantConv = quantConv
return nil
}
// newEncoderResBlock creates a ResBlock for the encoder (flat structure)
func newEncoderResBlock(weights *safetensors.ModelWeights, prefix string, inDim, outDim int32) (*EncoderResBlock, error) {
block, err := newResBlock(weights, prefix, inDim, outDim)
if err != nil {
return nil, err
}
return &EncoderResBlock{block}, nil
}
// newEncoderDownsample creates a downsample layer for the encoder
func newEncoderDownsample(weights *safetensors.ModelWeights, prefix string, temporal bool) (*EncoderDownsample, error) {
resample, err := newCausalConv3d(weights, prefix+".resample.1")
if err != nil {
return nil, err
}
var timeConv *CausalConv3d
if temporal {
timeConv, _ = newCausalConv3d(weights, prefix+".time_conv")
}
return &EncoderDownsample{
Resample: resample,
TimeConv: timeConv,
}, nil
}
// Encode encodes an image to latents
// x: [B, C, T, H, W] image tensor (channels-first)
// Returns: [B, latent_C, T, H/8, W/8] latent distribution mode
func (e *VAEEncoder) Encode(x *mlx.Array) *mlx.Array {
// Convert from channels-first [N, C, T, H, W] to channels-last [N, T, H, W, C]
x = mlx.Contiguous(mlx.Transpose(x, 0, 2, 3, 4, 1))
mlx.Eval(x)
// Conv in
x = e.ConvIn.Forward(x)
// Encoder blocks (mix of ResBlocks and Downsamplers)
for _, block := range e.Blocks {
prev := x
x = block.Forward(x)
prev.Free()
}
// Mid block
x = e.MidBlock.Forward(x)
// Norm + silu
{
prev := x
x = e.NormOut.Forward(x)
x = silu3D(x)
prev.Free()
mlx.Eval(x)
}
// Conv out
{
prev := x
x = e.ConvOut.Forward(x)
prev.Free()
}
// Quant conv
{
prev := x
x = e.QuantConv.Forward(x)
prev.Free()
}
// Get mode from distribution (first half of channels = mean)
// Output is [B, T, H, W, 2*latent_C], we take first latent_C channels
shape := x.Shape()
latentC := shape[4] / 2
x = mlx.Slice(x, []int32{0, 0, 0, 0, 0}, []int32{shape[0], shape[1], shape[2], shape[3], latentC})
// Convert back to channels-first [N, C, T, H, W]
x = mlx.Contiguous(mlx.Transpose(x, 0, 4, 1, 2, 3))
mlx.Eval(x)
return x
}
// VAEDecoder is the decoder part of the VAE
type VAEDecoder struct {
Config *VAEConfig
PostQuantConv *CausalConv3d
ConvIn *CausalConv3d
MidBlock *MidBlock
UpBlocks []*UpBlock
NormOut *RMSNorm3D
ConvOut *CausalConv3d
}
// loadFromWeights loads the decoder from pre-loaded weights
func (d *VAEDecoder) loadFromWeights(weights *safetensors.ModelWeights, cfg *VAEConfig) error {
d.Config = cfg
postQuantConv, err := newCausalConv3d(weights, "post_quant_conv")
if err != nil {
return err
}
d.PostQuantConv = postQuantConv
convIn, err := newCausalConv3d(weights, "decoder.conv_in")
if err != nil {
return err
}
d.ConvIn = convIn
// Mid block
midDim := cfg.BaseDim * cfg.DimMult[len(cfg.DimMult)-1]
midBlock, err := newMidBlock(weights, "decoder.mid_block", midDim)
if err != nil {
return err
}
d.MidBlock = midBlock
// Up blocks (reversed dim_mult)
numUpBlocks := len(cfg.DimMult)
d.UpBlocks = make([]*UpBlock, numUpBlocks)
dimsMult := make([]int32, numUpBlocks+1)
dimsMult[0] = cfg.DimMult[numUpBlocks-1]
for i := 0; i < numUpBlocks; i++ {
dimsMult[i+1] = cfg.DimMult[numUpBlocks-1-i]
}
temporalUpsample := make([]bool, len(cfg.TemperalDownsample))
for i := range cfg.TemperalDownsample {
temporalUpsample[i] = cfg.TemperalDownsample[len(cfg.TemperalDownsample)-1-i]
}
for i := 0; i < numUpBlocks; i++ {
inDim := cfg.BaseDim * dimsMult[i]
outDim := cfg.BaseDim * dimsMult[i+1]
if i > 0 {
inDim = inDim / 2
}
upsampleMode := ""
if i < numUpBlocks-1 {
if temporalUpsample[i] {
upsampleMode = "upsample3d"
} else {
upsampleMode = "upsample2d"
}
}
prefix := fmt.Sprintf("decoder.up_blocks.%d", i)
upBlock, err := newUpBlock(weights, prefix, inDim, outDim, cfg.NumResBlocks, upsampleMode)
if err != nil {
return err
}
d.UpBlocks[i] = upBlock
}
normOut, err := newRMSNorm3D(weights, "decoder.norm_out", cfg.BaseDim)
if err != nil {
return err
}
d.NormOut = normOut
convOut, err := newCausalConv3d(weights, "decoder.conv_out")
if err != nil {
return err
}
d.ConvOut = convOut
return nil
}
// Decode converts latents to image
// z: [B, C, T, H, W] denormalized latents
func (d *VAEDecoder) Decode(z *mlx.Array) *mlx.Array {
var x *mlx.Array
// Convert from channels-first to channels-last
{
z = mlx.Contiguous(mlx.Transpose(z, 0, 2, 3, 4, 1))
mlx.Eval(z)
}
// PostQuantConv
x = d.PostQuantConv.Forward(z)
z.Free()
// ConvIn
{
prev := x
x = d.ConvIn.Forward(x)
prev.Free()
}
// Mid block
x = d.MidBlock.Forward(x)
// Up blocks
for _, upBlock := range d.UpBlocks {
x = upBlock.Forward(x)
}
// NormOut + silu
{
prev := x
x = d.NormOut.Forward(x)
x = silu3D(x)
prev.Free()
mlx.Eval(x)
}
// ConvOut
{
prev := x
x = d.ConvOut.Forward(x)
prev.Free()
}
// Post-processing: clamp and convert back to channels-first
{
prev := x
x = mlx.ClipScalar(x, -1.0, 1.0, true, true)
x = mlx.Contiguous(mlx.Transpose(x, 0, 4, 1, 2, 3))
prev.Free()
mlx.Eval(x)
}
return x
}
// DownBlock handles downsampling in encoder
type DownBlock struct {
ResBlocks []*ResBlock
Downsampler *Downsample
}
// newDownBlock creates a down block
func newDownBlock(weights *safetensors.ModelWeights, prefix string, inDim, outDim int32, numBlocks int32, downsampleMode string) (*DownBlock, error) {
resBlocks := make([]*ResBlock, numBlocks+1)
currentDim := inDim
for i := int32(0); i <= numBlocks; i++ {
resPrefix := fmt.Sprintf("%s.resnets.%d", prefix, i)
block, err := newResBlock(weights, resPrefix, currentDim, outDim)
if err != nil {
return nil, err
}
resBlocks[i] = block
currentDim = outDim
}
var downsampler *Downsample
if downsampleMode != "" {
downsampler = newDownsample(weights, prefix+".downsamplers.0", outDim, downsampleMode)
}
return &DownBlock{
ResBlocks: resBlocks,
Downsampler: downsampler,
}, nil
}
// Forward applies down block
func (d *DownBlock) Forward(x *mlx.Array) *mlx.Array {
for _, block := range d.ResBlocks {
prev := x
x = block.Forward(x)
prev.Free()
}
if d.Downsampler != nil {
prev := x
x = d.Downsampler.Forward(x)
prev.Free()
}
return x
}
// Downsample handles spatial downsampling
type Downsample struct {
Conv *mlx.Array
Bias *mlx.Array
Mode string
}
// newDownsample creates a downsampler
func newDownsample(weights *safetensors.ModelWeights, prefix string, dim int32, mode string) *Downsample {
conv, _ := weights.Get(prefix + ".resample.1.weight")
bias, _ := weights.Get(prefix + ".resample.1.bias")
return &Downsample{
Conv: conv,
Bias: bias,
Mode: mode,
}
}
// Forward applies downsampling to channels-last input [B, T, H, W, C]
func (d *Downsample) Forward(x *mlx.Array) *mlx.Array {
shape := x.Shape()
B := shape[0]
T := shape[1]
H := shape[2]
W := shape[3]
C := shape[4]
outC := d.Conv.Shape()[0]
// Reshape to [B*T, H, W, C] for 2D conv
x = mlx.Reshape(x, B*T, H, W, C)
// Pad for the 3x3 stride-2 conv: 1 on all sides (symmetric padding, unlike
// the encoder's asymmetric WAN-style ZeroPad2d(0,1,0,1))
x = mlx.Pad(x, []int32{0, 0, 1, 1, 1, 1, 0, 0})
// Conv with stride 2 using manual strided patching
weight := mlx.Transpose(d.Conv, 0, 2, 3, 1)
x = conv2DStrided(x, weight, 2)
if d.Bias != nil {
bias := mlx.Reshape(d.Bias, 1, 1, 1, outC)
x = mlx.Add(x, bias)
}
x = mlx.Reshape(x, B, T, H/2, W/2, outC)
mlx.Eval(x)
return x
}

@@ -1,146 +0,0 @@
package zimage
import (
"math"
"github.com/ollama/ollama/x/imagegen/mlx"
)
// FlowMatchSchedulerConfig holds scheduler configuration
type FlowMatchSchedulerConfig struct {
NumTrainTimesteps int32 `json:"num_train_timesteps"` // 1000
Shift float32 `json:"shift"` // 3.0
UseDynamicShifting bool `json:"use_dynamic_shifting"` // true for Z-Image-Turbo (see DefaultFlowMatchSchedulerConfig)
}
// DefaultFlowMatchSchedulerConfig returns default config
func DefaultFlowMatchSchedulerConfig() *FlowMatchSchedulerConfig {
return &FlowMatchSchedulerConfig{
NumTrainTimesteps: 1000,
Shift: 3.0,
UseDynamicShifting: true, // Z-Image-Turbo uses dynamic shifting
}
}
// FlowMatchEulerScheduler implements the Flow Match Euler discrete scheduler
// This is used in Z-Image-Turbo for fast sampling
type FlowMatchEulerScheduler struct {
Config *FlowMatchSchedulerConfig
Timesteps []float32 // Discretized timesteps
Sigmas []float32 // Noise levels at each timestep
NumSteps int // Number of inference steps
}
// NewFlowMatchEulerScheduler creates a new scheduler
func NewFlowMatchEulerScheduler(cfg *FlowMatchSchedulerConfig) *FlowMatchEulerScheduler {
return &FlowMatchEulerScheduler{
Config: cfg,
}
}
// SetTimesteps sets up the scheduler for the given number of inference steps
func (s *FlowMatchEulerScheduler) SetTimesteps(numSteps int) {
s.SetTimestepsWithMu(numSteps, 0)
}
// SetTimestepsWithMu sets up the scheduler with dynamic mu shift
func (s *FlowMatchEulerScheduler) SetTimestepsWithMu(numSteps int, mu float32) {
s.NumSteps = numSteps
// Create evenly spaced timesteps from 1.0 to 0.0 (flow matching goes t=1 to t=0)
// Match Python: np.linspace(1.0, 0.0, num_inference_steps + 1)
s.Timesteps = make([]float32, numSteps+1)
s.Sigmas = make([]float32, numSteps+1)
for i := 0; i <= numSteps; i++ {
t := 1.0 - float32(i)/float32(numSteps)
// Apply time shift if using dynamic shifting
if s.Config.UseDynamicShifting && mu != 0 {
t = s.timeShift(mu, t)
}
s.Timesteps[i] = t
s.Sigmas[i] = t
}
}
// timeShift applies the dynamic time shift (match Python)
func (s *FlowMatchEulerScheduler) timeShift(mu float32, t float32) float32 {
if t <= 0 {
return 0
}
// exp(mu) / (exp(mu) + (1/t - 1))
expMu := float32(math.Exp(float64(mu)))
return expMu / (expMu + (1.0/t - 1.0))
}
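// Worked example (mu chosen for illustration): with mu = ln(3) ≈ 1.0986, so
// exp(mu) = 3, t=0.5 shifts to 3/(3 + (1/0.5 - 1)) = 3/4 = 0.75 and t=0.25
// shifts to 3/(3+3) = 0.5; the shift spends more of the schedule at high
// noise levels.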
// Step performs one denoising step
// modelOutput: predicted velocity/noise from the model
// timestepIdx: current timestep index
// sample: current noisy sample
// Returns: denoised sample for next step
func (s *FlowMatchEulerScheduler) Step(modelOutput, sample *mlx.Array, timestepIdx int) *mlx.Array {
// Get current and next sigma
sigma := s.Sigmas[timestepIdx]
sigmaNext := s.Sigmas[timestepIdx+1]
// Euler step: x_{t-dt} = x_t + (sigma_next - sigma) * v_t
// where v_t is the velocity predicted by the model
dt := sigmaNext - sigma // This is negative (going from noise to clean)
// x_next = x + dt * velocity
scaledOutput := mlx.MulScalar(modelOutput, dt)
return mlx.Add(sample, scaledOutput)
}
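// Example: with Sigmas[i]=1.0 and Sigmas[i+1]=0.75, dt = -0.25 and the update
// is x_next = x - 0.25*v, one Euler step along the flow toward t=0.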
// ScaleSample scales the sample for model input (identity for flow matching)
func (s *FlowMatchEulerScheduler) ScaleSample(sample *mlx.Array, timestepIdx int) *mlx.Array {
// Flow matching doesn't need scaling
return sample
}
// GetTimestep returns the timestep value at the given index
func (s *FlowMatchEulerScheduler) GetTimestep(idx int) float32 {
if idx < len(s.Timesteps) {
return s.Timesteps[idx]
}
return 0.0
}
// GetTimesteps returns all timesteps (implements Scheduler interface)
func (s *FlowMatchEulerScheduler) GetTimesteps() []float32 {
return s.Timesteps
}
// AddNoise adds noise to clean samples for a given timestep
// Used for img2img or inpainting
func (s *FlowMatchEulerScheduler) AddNoise(cleanSample, noise *mlx.Array, timestepIdx int) *mlx.Array {
// In flow matching: x_t = (1-t) * x_0 + t * noise
t := s.Timesteps[timestepIdx]
oneMinusT := 1.0 - t
scaledClean := mlx.MulScalar(cleanSample, oneMinusT)
scaledNoise := mlx.MulScalar(noise, t)
return mlx.Add(scaledClean, scaledNoise)
}
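// Endpoint check: at t=1 the result is pure noise, at t=0 it is the clean
// sample, and at t=0.3 it is 0.7*clean + 0.3*noise.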
// InitNoise creates initial noise for sampling
func (s *FlowMatchEulerScheduler) InitNoise(shape []int32, seed int64) *mlx.Array {
return RandomNormal(shape, seed)
}
// RandomNormal creates a random normal tensor using MLX
func RandomNormal(shape []int32, seed int64) *mlx.Array {
return mlx.RandomNormal(shape, uint64(seed))
}
// GetLatentShape returns the latent shape for a given image size; patchSize is currently unused
func GetLatentShape(batchSize, height, width, latentChannels int32, patchSize int32) []int32 {
// Latent is 8x smaller than image (VAE downscale)
latentH := height / 8
latentW := width / 8
return []int32{batchSize, latentChannels, latentH, latentW}
}

@@ -1,294 +0,0 @@
package zimage
import (
"encoding/json"
"fmt"
"math"
"os"
"path/filepath"
"github.com/ollama/ollama/x/imagegen/mlx"
"github.com/ollama/ollama/x/imagegen/nn"
"github.com/ollama/ollama/x/imagegen/safetensors"
"github.com/ollama/ollama/x/imagegen/tokenizer"
)
// Qwen3Config holds Qwen3 text encoder configuration
type Qwen3Config struct {
HiddenSize int32 `json:"hidden_size"`
NumHiddenLayers int32 `json:"num_hidden_layers"`
IntermediateSize int32 `json:"intermediate_size"`
NumAttentionHeads int32 `json:"num_attention_heads"`
NumKeyValueHeads int32 `json:"num_key_value_heads"`
VocabSize int32 `json:"vocab_size"`
RMSNormEps float32 `json:"rms_norm_eps"`
RopeTheta float32 `json:"rope_theta"`
HeadDim int32 `json:"head_dim"`
}
// loadQwen3Config loads text encoder config from a JSON file
func loadQwen3Config(path string) (*Qwen3Config, error) {
data, err := os.ReadFile(path)
if err != nil {
return nil, fmt.Errorf("read config: %w", err)
}
var cfg Qwen3Config
if err := json.Unmarshal(data, &cfg); err != nil {
return nil, fmt.Errorf("parse config: %w", err)
}
return &cfg, nil
}
// Qwen3Attention implements Qwen3 attention with QK norms
type Qwen3Attention struct {
QProj *nn.Linear `weight:"q_proj"`
KProj *nn.Linear `weight:"k_proj"`
VProj *nn.Linear `weight:"v_proj"`
OProj *nn.Linear `weight:"o_proj"`
QNorm *nn.RMSNorm `weight:"q_norm"`
KNorm *nn.RMSNorm `weight:"k_norm"`
// Computed fields
NHeads int32
NKVHeads int32
HeadDim int32
Scale float32
RopeTheta float32
}
// applyRoPEQwen3 applies the custom RoPE for Qwen3 text encoder
func applyRoPEQwen3(x *mlx.Array, seqLen int32, theta float32) *mlx.Array {
shape := x.Shape()
B := shape[0]
L := shape[1]
H := shape[2]
D := shape[3]
half := D / 2
freqsArr := make([]float32, half)
logTheta := float32(math.Log(float64(theta)))
for i := int32(0); i < half; i++ {
freqsArr[i] = float32(math.Exp(float64(-logTheta * float32(i) / float32(half))))
}
freqs := mlx.NewArray(freqsArr, []int32{half})
posArr := make([]float32, seqLen)
for i := int32(0); i < seqLen; i++ {
posArr[i] = float32(i)
}
pos := mlx.NewArray(posArr, []int32{seqLen})
posExpanded := mlx.Reshape(pos, seqLen, 1)
freqsExpanded := mlx.Reshape(freqs, 1, half)
args := mlx.Mul(posExpanded, freqsExpanded)
cosVals := mlx.Cos(args)
sinVals := mlx.Sin(args)
cosVals = mlx.Reshape(cosVals, seqLen, 1, half)
sinVals = mlx.Reshape(sinVals, seqLen, 1, half)
x1 := mlx.Slice(x, []int32{0, 0, 0, 0}, []int32{B, L, H, half})
x2 := mlx.Slice(x, []int32{0, 0, 0, half}, []int32{B, L, H, D})
part1 := mlx.Sub(mlx.Mul(x1, cosVals), mlx.Mul(x2, sinVals))
part2 := mlx.Add(mlx.Mul(x1, sinVals), mlx.Mul(x2, cosVals))
return mlx.Concatenate([]*mlx.Array{part1, part2}, 3)
}
// Forward computes attention with causal masking
func (attn *Qwen3Attention) Forward(x *mlx.Array) *mlx.Array {
shape := x.Shape()
B := shape[0]
L := shape[1]
q := attn.QProj.Forward(x)
k := attn.KProj.Forward(x)
v := attn.VProj.Forward(x)
q = mlx.Reshape(q, B, L, attn.NHeads, attn.HeadDim)
k = mlx.Reshape(k, B, L, attn.NKVHeads, attn.HeadDim)
v = mlx.Reshape(v, B, L, attn.NKVHeads, attn.HeadDim)
// QK norm uses 1e-6 hardcoded (Qwen3 specific)
q = attn.QNorm.Forward(q, 1e-6)
k = attn.KNorm.Forward(k, 1e-6)
q = applyRoPEQwen3(q, L, attn.RopeTheta)
k = applyRoPEQwen3(k, L, attn.RopeTheta)
q = mlx.Transpose(q, 0, 2, 1, 3)
k = mlx.Transpose(k, 0, 2, 1, 3)
v = mlx.Transpose(v, 0, 2, 1, 3)
if attn.NKVHeads < attn.NHeads {
repeats := attn.NHeads / attn.NKVHeads
k = repeatKV(k, repeats)
v = repeatKV(v, repeats)
}
out := mlx.ScaledDotProductAttention(q, k, v, attn.Scale, true)
out = mlx.Transpose(out, 0, 2, 1, 3)
out = mlx.Reshape(out, B, L, attn.NHeads*attn.HeadDim)
out = attn.OProj.Forward(out)
return out
}
// repeatKV repeats key/value heads for GQA
func repeatKV(x *mlx.Array, repeats int32) *mlx.Array {
if repeats == 1 {
return x
}
shape := x.Shape()
x = mlx.ExpandDims(x, 2)
x = mlx.Tile(x, []int32{1, 1, repeats, 1, 1})
return mlx.Reshape(x, shape[0], shape[1]*repeats, shape[2], shape[3])
}
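// Shape example (hypothetical head counts): with k of shape [B, 8, L, 128] and
// repeats=4, the sequence is [B, 8, L, 128] -> [B, 8, 1, L, 128] ->
// [B, 8, 4, L, 128] -> [B, 32, L, 128], matching 32 query heads.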
// Qwen3MLP implements Qwen3 SwiGLU MLP
type Qwen3MLP struct {
GateProj *nn.Linear `weight:"gate_proj"`
UpProj *nn.Linear `weight:"up_proj"`
DownProj *nn.Linear `weight:"down_proj"`
}
// Forward applies the MLP
func (m *Qwen3MLP) Forward(x *mlx.Array) *mlx.Array {
gate := m.GateProj.Forward(x)
gate = mlx.SiLU(gate)
up := m.UpProj.Forward(x)
h := mlx.Mul(gate, up)
return m.DownProj.Forward(h)
}
// Qwen3Block represents a single Qwen3 transformer block
type Qwen3Block struct {
Attention *Qwen3Attention `weight:"self_attn"`
MLP *Qwen3MLP `weight:"mlp"`
InputLayerNorm *nn.RMSNorm `weight:"input_layernorm"`
PostAttnLayerNorm *nn.RMSNorm `weight:"post_attention_layernorm"`
}
// Forward applies the Qwen3 block
func (qb *Qwen3Block) Forward(x *mlx.Array, eps float32) *mlx.Array {
h := qb.InputLayerNorm.Forward(x, eps)
attnOut := qb.Attention.Forward(h)
x = mlx.Add(x, attnOut)
h = qb.PostAttnLayerNorm.Forward(x, eps)
mlpOut := qb.MLP.Forward(h)
x = mlx.Add(x, mlpOut)
return x
}
// Qwen3TextEncoder is the full Qwen3 encoder for Z-Image
type Qwen3TextEncoder struct {
EmbedTokens *nn.Embedding `weight:"model.embed_tokens"`
Layers []*Qwen3Block `weight:"model.layers"`
FinalNorm *nn.RMSNorm `weight:"model.norm"`
*Qwen3Config
}
// Load loads the Qwen3 text encoder from a directory
func (m *Qwen3TextEncoder) Load(path string) error {
fmt.Println("Loading Qwen3 text encoder...")
// Load config
cfg, err := loadQwen3Config(filepath.Join(path, "config.json"))
if err != nil {
return fmt.Errorf("config: %w", err)
}
m.Qwen3Config = cfg
// Pre-allocate layers slice
m.Layers = make([]*Qwen3Block, cfg.NumHiddenLayers)
// Load weights
weights, err := safetensors.LoadModelWeights(path)
if err != nil {
return fmt.Errorf("weights: %w", err)
}
fmt.Print(" Loading weights via struct tags... ")
if err := safetensors.LoadModule(m, weights, ""); err != nil {
return fmt.Errorf("load module: %w", err)
}
fmt.Println("✓")
// Initialize computed fields
m.FinalNorm.Eps = cfg.RMSNormEps
for _, block := range m.Layers {
// Attention
block.Attention.NHeads = cfg.NumAttentionHeads
block.Attention.NKVHeads = cfg.NumKeyValueHeads
block.Attention.HeadDim = cfg.HeadDim
block.Attention.Scale = float32(1.0 / math.Sqrt(float64(cfg.HeadDim)))
block.Attention.RopeTheta = cfg.RopeTheta
block.Attention.QNorm.Eps = cfg.RMSNormEps
block.Attention.KNorm.Eps = cfg.RMSNormEps
// Block norms
block.InputLayerNorm.Eps = cfg.RMSNormEps
block.PostAttnLayerNorm.Eps = cfg.RMSNormEps
}
weights.ReleaseAll()
return nil
}
// Forward encodes text tokens
func (te *Qwen3TextEncoder) Forward(tokens *mlx.Array) *mlx.Array {
h := te.EmbedTokens.Forward(tokens)
eps := te.RMSNormEps
for _, layer := range te.Layers {
h = layer.Forward(h, eps)
}
// Apply final RMS norm
h = te.FinalNorm.Forward(h, eps)
return h
}
// ApplyChatTemplate wraps prompt in Qwen3 chat format
func ApplyChatTemplate(prompt string) string {
return "<|im_start|>user\n" + prompt + "<|im_end|>\n<|im_start|>assistant\n"
}
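// Illustrative usage: ApplyChatTemplate frames the prompt as a single user
// turn and opens an empty assistant turn, the format the encoder expects.
func exampleChatTemplate() {
	fmt.Println(ApplyChatTemplate("a red fox"))
	// <|im_start|>user
	// a red fox<|im_end|>
	// <|im_start|>assistant
}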
// EncodePrompt encodes a text prompt using the tokenizer and encoder
func (te *Qwen3TextEncoder) EncodePrompt(tok *tokenizer.Tokenizer, prompt string, maxLen int) (*mlx.Array, *mlx.Array) {
formattedPrompt := ApplyChatTemplate(prompt)
tokens := tok.Encode(formattedPrompt, false)
if len(tokens) > maxLen {
tokens = tokens[:maxLen]
}
maskData := make([]float32, maxLen)
for i := 0; i < len(tokens); i++ {
maskData[i] = 1.0
}
// Get PAD token (different from EOS for Qwen3)
padToken := tok.PAD()
if padToken < 0 {
padToken = tok.EOS() // fallback
}
paddedTokens := make([]int32, maxLen)
copy(paddedTokens, tokens)
for i := len(tokens); i < maxLen; i++ {
paddedTokens[i] = padToken
}
tokensArr := mlx.NewArrayInt32(paddedTokens, []int32{1, int32(maxLen)})
maskArr := mlx.NewArray(maskData, []int32{1, int32(maxLen)})
embeddings := te.Forward(tokensArr)
return embeddings, maskArr
}
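// Usage sketch for the encoder (assumes a *tokenizer.Tokenizer has been
// constructed elsewhere; its constructor is not shown in this file): encode a
// prompt to a fixed length, receiving [1, maxLen, hidden] embeddings plus a
// [1, maxLen] mask that is 1 over real tokens and 0 over PAD positions.
func exampleEncodePrompt(te *Qwen3TextEncoder, tok *tokenizer.Tokenizer) (*mlx.Array, *mlx.Array) {
	return te.EncodePrompt(tok, "a watercolor lighthouse at dusk", 512)
}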


@@ -1,690 +0,0 @@
// Package zimage implements the Z-Image diffusion transformer model.
package zimage
import (
"encoding/json"
"fmt"
"math"
"os"
"path/filepath"
"github.com/ollama/ollama/x/imagegen/cache"
"github.com/ollama/ollama/x/imagegen/mlx"
"github.com/ollama/ollama/x/imagegen/nn"
"github.com/ollama/ollama/x/imagegen/safetensors"
)
// TransformerConfig holds Z-Image transformer configuration
type TransformerConfig struct {
Dim int32 `json:"dim"`
NHeads int32 `json:"n_heads"`
NKVHeads int32 `json:"n_kv_heads"`
NLayers int32 `json:"n_layers"`
NRefinerLayers int32 `json:"n_refiner_layers"`
InChannels int32 `json:"in_channels"`
PatchSize int32 `json:"-"` // Computed from AllPatchSize
CapFeatDim int32 `json:"cap_feat_dim"`
NormEps float32 `json:"norm_eps"`
RopeTheta float32 `json:"rope_theta"`
TScale float32 `json:"t_scale"`
QKNorm bool `json:"qk_norm"`
AxesDims []int32 `json:"axes_dims"`
AxesLens []int32 `json:"axes_lens"`
AllPatchSize []int32 `json:"all_patch_size"` // JSON array, PatchSize = first element
}
// TimestepEmbedder creates sinusoidal timestep embeddings
// Output dimension is 256 (fixed), used for AdaLN modulation
type TimestepEmbedder struct {
Linear1 *nn.Linear `weight:"mlp.0"`
Linear2 *nn.Linear `weight:"mlp.2"`
FreqEmbedSize int32 // 256, set in Load (not a stored weight)
}
// Forward computes timestep embeddings -> [B, 256]
func (te *TimestepEmbedder) Forward(t *mlx.Array) *mlx.Array {
// t: [B] timesteps
// Create sinusoidal embedding
half := te.FreqEmbedSize / 2
// freqs = exp(-log(10000) * arange(half) / half)
freqs := make([]float32, half)
for i := int32(0); i < half; i++ {
freqs[i] = float32(math.Exp(-math.Log(10000.0) * float64(i) / float64(half)))
}
freqsArr := mlx.NewArray(freqs, []int32{1, half})
// t[:, None] * freqs[None, :] -> [B, half]
tExpanded := mlx.ExpandDims(t, 1) // [B, 1]
args := mlx.Mul(tExpanded, freqsArr)
// embedding = [cos(args), sin(args)] -> [B, 256]
cosArgs := mlx.Cos(args)
sinArgs := mlx.Sin(args)
embedding := mlx.Concatenate([]*mlx.Array{cosArgs, sinArgs}, 1)
// MLP: linear1 -> silu -> linear2
h := te.Linear1.Forward(embedding)
h = mlx.SiLU(h)
h = te.Linear2.Forward(h)
return h
}
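// Plain-float reference for the sinusoidal half above (sketch only): with
// FreqEmbedSize=256, half=128 and freqs[i] = 10000^(-i/half), each timestep t
// maps to [cos(t*f_0)..cos(t*f_{half-1}), sin(t*f_0)..sin(t*f_{half-1})]
// before the two-layer MLP.
func sinusoidalRef(t float64, freqEmbedSize int) []float64 {
	half := freqEmbedSize / 2
	emb := make([]float64, freqEmbedSize)
	for i := 0; i < half; i++ {
		f := math.Exp(-math.Log(10000.0) * float64(i) / float64(half))
		emb[i] = math.Cos(t * f)
		emb[half+i] = math.Sin(t * f)
	}
	return emb
}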
// XEmbedder embeds image patches into the model dimension
type XEmbedder struct {
Linear *nn.Linear `weight:"2-1"`
}
// Forward embeds patchified image latents
func (xe *XEmbedder) Forward(x *mlx.Array) *mlx.Array {
// x: [B, L, in_channels * 4] -> [B, L, dim]
return xe.Linear.Forward(x)
}
// CapEmbedder projects caption features into the model dimension
type CapEmbedder struct {
Norm *nn.RMSNorm `weight:"0"`
Linear *nn.Linear `weight:"1"`
PadToken *mlx.Array // loaded separately at root level
}
// Forward projects caption embeddings: [B, L, cap_feat_dim] -> [B, L, dim]
func (ce *CapEmbedder) Forward(capFeats *mlx.Array) *mlx.Array {
// RMSNorm on last axis (uses 1e-6)
h := ce.Norm.Forward(capFeats, 1e-6)
// Linear projection
return ce.Linear.Forward(h)
}
// FeedForward implements SwiGLU FFN
type FeedForward struct {
W1 *nn.Linear `weight:"w1"` // gate projection
W2 *nn.Linear `weight:"w2"` // down projection
W3 *nn.Linear `weight:"w3"` // up projection
OutDim int32 // computed from W2
}
// Forward applies SwiGLU: silu(W1(x)) * W3(x), then W2
func (ff *FeedForward) Forward(x *mlx.Array) *mlx.Array {
shape := x.Shape()
B := shape[0]
L := shape[1]
D := shape[2]
// Reshape for matmul
x = mlx.Reshape(x, B*L, D)
gate := ff.W1.Forward(x)
gate = mlx.SiLU(gate)
up := ff.W3.Forward(x)
h := mlx.Mul(gate, up)
out := ff.W2.Forward(h)
return mlx.Reshape(out, B, L, ff.OutDim)
}
// Attention implements multi-head attention with QK norm
type Attention struct {
ToQ *nn.Linear `weight:"to_q"`
ToK *nn.Linear `weight:"to_k"`
ToV *nn.Linear `weight:"to_v"`
ToOut *nn.Linear `weight:"to_out.0"`
NormQ *mlx.Array `weight:"norm_q.weight"` // [head_dim] for per-head RMSNorm
NormK *mlx.Array `weight:"norm_k.weight"`
// Computed fields
NHeads int32
HeadDim int32
Dim int32
Scale float32
}
// Forward computes attention
func (attn *Attention) Forward(x *mlx.Array, cos, sin *mlx.Array) *mlx.Array {
shape := x.Shape()
B := shape[0]
L := shape[1]
D := shape[2]
// Project Q, K, V
xFlat := mlx.Reshape(x, B*L, D)
q := attn.ToQ.Forward(xFlat)
k := attn.ToK.Forward(xFlat)
v := attn.ToV.Forward(xFlat)
// Reshape to [B, L, nheads, head_dim]
q = mlx.Reshape(q, B, L, attn.NHeads, attn.HeadDim)
k = mlx.Reshape(k, B, L, attn.NHeads, attn.HeadDim)
v = mlx.Reshape(v, B, L, attn.NHeads, attn.HeadDim)
// QK norm
q = mlx.RMSNorm(q, attn.NormQ, 1e-5)
k = mlx.RMSNorm(k, attn.NormK, 1e-5)
// Apply RoPE if provided
if cos != nil && sin != nil {
q = applyRoPE3D(q, cos, sin)
k = applyRoPE3D(k, cos, sin)
}
// Transpose to [B, nheads, L, head_dim]
q = mlx.Transpose(q, 0, 2, 1, 3)
k = mlx.Transpose(k, 0, 2, 1, 3)
v = mlx.Transpose(v, 0, 2, 1, 3)
// SDPA
out := mlx.ScaledDotProductAttention(q, k, v, attn.Scale, false)
// Transpose back and reshape
out = mlx.Transpose(out, 0, 2, 1, 3)
out = mlx.Reshape(out, B*L, attn.Dim)
out = attn.ToOut.Forward(out)
return mlx.Reshape(out, B, L, attn.Dim)
}
// applyRoPE3D applies 3-axis rotary position embeddings
// x: [B, L, nheads, head_dim]
// cos, sin: [B, L, 1, head_dim/2]
func applyRoPE3D(x *mlx.Array, cos, sin *mlx.Array) *mlx.Array {
shape := x.Shape()
B := shape[0]
L := shape[1]
nheads := shape[2]
headDim := shape[3]
half := headDim / 2
// Create even/odd index arrays
evenIdx := make([]int32, half)
oddIdx := make([]int32, half)
for i := int32(0); i < half; i++ {
evenIdx[i] = i * 2
oddIdx[i] = i*2 + 1
}
evenIndices := mlx.NewArrayInt32(evenIdx, []int32{half})
oddIndices := mlx.NewArrayInt32(oddIdx, []int32{half})
// Extract x1 (even indices) and x2 (odd indices) along last axis
x1 := mlx.Take(x, evenIndices, 3) // [B, L, nheads, half]
x2 := mlx.Take(x, oddIndices, 3) // [B, L, nheads, half]
// Apply rotation: [x1*cos - x2*sin, x1*sin + x2*cos]
r1 := mlx.Sub(mlx.Mul(x1, cos), mlx.Mul(x2, sin))
r2 := mlx.Add(mlx.Mul(x1, sin), mlx.Mul(x2, cos))
// Stack and reshape to interleave: [r1_0, r2_0, r1_1, r2_1, ...]
r1 = mlx.ExpandDims(r1, 4) // [B, L, nheads, half, 1]
r2 = mlx.ExpandDims(r2, 4) // [B, L, nheads, half, 1]
stacked := mlx.Concatenate([]*mlx.Array{r1, r2}, 4) // [B, L, nheads, half, 2]
return mlx.Reshape(stacked, B, L, nheads, headDim)
}
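// Scalar reference for the rotation above (sketch only): each even/odd pair
// (x1, x2) at frequency index i is rotated by the angle a = position*freq[i].
func ropePairRef(x1, x2, a float64) (r1, r2 float64) {
	r1 = x1*math.Cos(a) - x2*math.Sin(a)
	r2 = x1*math.Sin(a) + x2*math.Cos(a)
	return r1, r2 // written back interleaved as [r1_0, r2_0, r1_1, r2_1, ...]
}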
// TransformerBlock is a single transformer block with optional AdaLN modulation
type TransformerBlock struct {
Attention *Attention `weight:"attention"`
FeedForward *FeedForward `weight:"feed_forward"`
AttentionNorm1 *nn.RMSNorm `weight:"attention_norm1"`
AttentionNorm2 *nn.RMSNorm `weight:"attention_norm2"`
FFNNorm1 *nn.RMSNorm `weight:"ffn_norm1"`
FFNNorm2 *nn.RMSNorm `weight:"ffn_norm2"`
AdaLN *nn.Linear `weight:"adaLN_modulation.0,optional"` // only if modulation
// Computed fields
HasModulation bool
Dim int32
}
// Forward applies the transformer block
func (tb *TransformerBlock) Forward(x *mlx.Array, adaln *mlx.Array, cos, sin *mlx.Array, eps float32) *mlx.Array {
if tb.AdaLN != nil && adaln != nil {
// Compute modulation: [B, 256] -> [B, 4*dim]
chunks := tb.AdaLN.Forward(adaln)
// Split into 4 parts: scale_msa, gate_msa, scale_mlp, gate_mlp
chunkShape := chunks.Shape()
chunkDim := chunkShape[1] / 4
scaleMSA := mlx.Slice(chunks, []int32{0, 0}, []int32{chunkShape[0], chunkDim})
gateMSA := mlx.Slice(chunks, []int32{0, chunkDim}, []int32{chunkShape[0], chunkDim * 2})
scaleMLP := mlx.Slice(chunks, []int32{0, chunkDim * 2}, []int32{chunkShape[0], chunkDim * 3})
gateMLP := mlx.Slice(chunks, []int32{0, chunkDim * 3}, []int32{chunkShape[0], chunkDim * 4})
// Expand for broadcasting: [B, 1, dim]
scaleMSA = mlx.ExpandDims(scaleMSA, 1)
gateMSA = mlx.ExpandDims(gateMSA, 1)
scaleMLP = mlx.ExpandDims(scaleMLP, 1)
gateMLP = mlx.ExpandDims(gateMLP, 1)
// Attention with modulation
normX := tb.AttentionNorm1.Forward(x, eps)
normX = mlx.Mul(normX, mlx.AddScalar(scaleMSA, 1.0))
attnOut := tb.Attention.Forward(normX, cos, sin)
attnOut = tb.AttentionNorm2.Forward(attnOut, eps)
x = mlx.Add(x, mlx.Mul(mlx.Tanh(gateMSA), attnOut))
// FFN with modulation
normFFN := tb.FFNNorm1.Forward(x, eps)
normFFN = mlx.Mul(normFFN, mlx.AddScalar(scaleMLP, 1.0))
ffnOut := tb.FeedForward.Forward(normFFN)
ffnOut = tb.FFNNorm2.Forward(ffnOut, eps)
x = mlx.Add(x, mlx.Mul(mlx.Tanh(gateMLP), ffnOut))
} else {
// No modulation (context refiner)
attnOut := tb.Attention.Forward(tb.AttentionNorm1.Forward(x, eps), cos, sin)
x = mlx.Add(x, tb.AttentionNorm2.Forward(attnOut, eps))
ffnOut := tb.FeedForward.Forward(tb.FFNNorm1.Forward(x, eps))
x = mlx.Add(x, tb.FFNNorm2.Forward(ffnOut, eps))
}
return x
}
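// Gated-residual reference for the modulated branch above (sketch only):
//   x' = x + tanh(gate_msa) * Norm2(Attn((1 + scale_msa) * Norm1(x)))
// and likewise for the FFN branch with scale_mlp/gate_mlp. With gate = 0 the
// update vanishes (tanh(0) = 0), so the gate smoothly blends the branch in.
func adaLNResidualRef(x, branch, gate float64) float64 {
	return x + math.Tanh(gate)*branch
}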
// FinalLayer outputs the denoised patches
type FinalLayer struct {
AdaLN *nn.Linear `weight:"adaLN_modulation.1"` // [256] -> [dim]
Output *nn.Linear `weight:"linear"` // [dim] -> [out_channels]
OutDim int32 // computed from Output
}
// Forward computes final output
func (fl *FinalLayer) Forward(x *mlx.Array, c *mlx.Array) *mlx.Array {
// c: [B, 256] -> scale: [B, dim]
scale := mlx.SiLU(c)
scale = fl.AdaLN.Forward(scale)
scale = mlx.ExpandDims(scale, 1) // [B, 1, dim]
// LayerNorm (affine=False) then scale
x = layerNormNoAffine(x, 1e-6)
x = mlx.Mul(x, mlx.AddScalar(scale, 1.0))
// Output projection
shape := x.Shape()
B := shape[0]
L := shape[1]
D := shape[2]
x = mlx.Reshape(x, B*L, D)
x = fl.Output.Forward(x)
return mlx.Reshape(x, B, L, fl.OutDim)
}
// layerNormNoAffine applies layer norm without learnable parameters
func layerNormNoAffine(x *mlx.Array, eps float32) *mlx.Array {
ndim := x.Ndim()
lastAxis := ndim - 1
mean := mlx.Mean(x, lastAxis, true)
xCentered := mlx.Sub(x, mean)
variance := mlx.Mean(mlx.Square(xCentered), lastAxis, true)
return mlx.Div(xCentered, mlx.Sqrt(mlx.AddScalar(variance, eps)))
}
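// Scalar reference for the affine-free layer norm above (sketch only):
// normalize over the last axis using mean and variance, with no learned
// gamma/beta.
func layerNormNoAffineRef(x []float64, eps float64) []float64 {
	mean := 0.0
	for _, v := range x {
		mean += v
	}
	mean /= float64(len(x))
	variance := 0.0
	for _, v := range x {
		variance += (v - mean) * (v - mean)
	}
	variance /= float64(len(x))
	out := make([]float64, len(x))
	for i, v := range x {
		out[i] = (v - mean) / math.Sqrt(variance+eps)
	}
	return out
}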
// Transformer is the full Z-Image DiT model
type Transformer struct {
TEmbed *TimestepEmbedder `weight:"t_embedder"`
XEmbed *XEmbedder `weight:"all_x_embedder"`
CapEmbed *CapEmbedder `weight:"cap_embedder"`
NoiseRefiners []*TransformerBlock `weight:"noise_refiner"`
ContextRefiners []*TransformerBlock `weight:"context_refiner"`
Layers []*TransformerBlock `weight:"layers"`
FinalLayer *FinalLayer `weight:"all_final_layer.2-1"`
XPadToken *mlx.Array `weight:"x_pad_token"`
CapPadToken *mlx.Array `weight:"cap_pad_token"`
*TransformerConfig
}
// Load loads the Z-Image transformer from a directory
func (m *Transformer) Load(path string) error {
fmt.Println("Loading Z-Image transformer...")
// Load config
cfg, err := loadTransformerConfig(filepath.Join(path, "config.json"))
if err != nil {
return fmt.Errorf("config: %w", err)
}
m.TransformerConfig = cfg
// Pre-allocate slices for loader
m.NoiseRefiners = make([]*TransformerBlock, cfg.NRefinerLayers)
m.ContextRefiners = make([]*TransformerBlock, cfg.NRefinerLayers)
m.Layers = make([]*TransformerBlock, cfg.NLayers)
// Load weights
weights, err := safetensors.LoadModelWeights(path)
if err != nil {
return fmt.Errorf("weights: %w", err)
}
fmt.Print(" Loading weights as bf16... ")
if err := weights.Load(mlx.DtypeBFloat16); err != nil {
return fmt.Errorf("load weights: %w", err)
}
fmt.Printf("✓ (%.1f GB)\n", float64(mlx.MetalGetActiveMemory())/(1024*1024*1024))
fmt.Print(" Loading weights via struct tags... ")
if err := safetensors.LoadModule(m, weights, ""); err != nil {
return fmt.Errorf("load module: %w", err)
}
fmt.Println("✓")
// Initialize computed fields
m.TEmbed.FreqEmbedSize = 256
m.FinalLayer.OutDim = m.FinalLayer.Output.Weight.Shape()[0]
m.CapEmbed.Norm.Eps = 1e-6
for _, block := range m.NoiseRefiners {
initTransformerBlock(block, cfg)
}
for _, block := range m.ContextRefiners {
initTransformerBlock(block, cfg)
}
for _, block := range m.Layers {
initTransformerBlock(block, cfg)
}
weights.ReleaseAll()
return nil
}
// loadTransformerConfig loads transformer config from a JSON file
func loadTransformerConfig(path string) (*TransformerConfig, error) {
data, err := os.ReadFile(path)
if err != nil {
return nil, fmt.Errorf("read config: %w", err)
}
var cfg TransformerConfig
if err := json.Unmarshal(data, &cfg); err != nil {
return nil, fmt.Errorf("parse config: %w", err)
}
// Extract PatchSize from array
if len(cfg.AllPatchSize) > 0 {
cfg.PatchSize = cfg.AllPatchSize[0]
}
return &cfg, nil
}
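// Illustrative config.json shape (field values below are made up for
// illustration, not taken from a released checkpoint; axes_dims matches the
// [32, 48, 48] layout documented in prepareRoPE3D):
//
//	{
//	  "dim": 2304, "n_heads": 24, "n_kv_heads": 24, "n_layers": 30,
//	  "n_refiner_layers": 2, "in_channels": 16, "cap_feat_dim": 2560,
//	  "norm_eps": 1e-5, "rope_theta": 256.0, "t_scale": 1000.0,
//	  "qk_norm": true, "axes_dims": [32, 48, 48],
//	  "axes_lens": [1024, 512, 512], "all_patch_size": [2]
//	}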
// initTransformerBlock sets computed fields on a transformer block
func initTransformerBlock(block *TransformerBlock, cfg *TransformerConfig) {
block.Dim = cfg.Dim
block.HasModulation = block.AdaLN != nil
// Init attention computed fields
attn := block.Attention
attn.NHeads = cfg.NHeads
attn.HeadDim = cfg.Dim / cfg.NHeads
attn.Dim = cfg.Dim
attn.Scale = float32(1.0 / math.Sqrt(float64(attn.HeadDim)))
// Init feedforward OutDim
block.FeedForward.OutDim = block.FeedForward.W2.Weight.Shape()[0]
// Set eps on all RMSNorm layers
block.AttentionNorm1.Eps = cfg.NormEps
block.AttentionNorm2.Eps = cfg.NormEps
block.FFNNorm1.Eps = cfg.NormEps
block.FFNNorm2.Eps = cfg.NormEps
}
// RoPECache holds precomputed RoPE values
type RoPECache struct {
ImgCos *mlx.Array
ImgSin *mlx.Array
CapCos *mlx.Array
CapSin *mlx.Array
UnifiedCos *mlx.Array
UnifiedSin *mlx.Array
ImgLen int32
CapLen int32
}
// PrepareRoPECache precomputes RoPE values for the given image and caption lengths.
// hTok and wTok are the number of tokens in each dimension (latentH/patchSize, latentW/patchSize).
func (m *Transformer) PrepareRoPECache(hTok, wTok, capLen int32) *RoPECache {
imgLen := hTok * wTok
// Image positions: grid over (1, H, W) starting at (capLen+1, 0, 0)
imgPos := createCoordinateGrid(1, hTok, wTok, capLen+1, 0, 0)
imgPos = mlx.ToBFloat16(imgPos)
// Caption positions: grid over (capLen, 1, 1) starting at (1, 0, 0)
capPos := createCoordinateGrid(capLen, 1, 1, 1, 0, 0)
capPos = mlx.ToBFloat16(capPos)
// Compute RoPE from UNIFIED positions
unifiedPos := mlx.Concatenate([]*mlx.Array{imgPos, capPos}, 1)
unifiedCos, unifiedSin := prepareRoPE3D(unifiedPos, m.TransformerConfig.AxesDims)
// Slice RoPE for image and caption parts
imgCos := mlx.Slice(unifiedCos, []int32{0, 0, 0, 0}, []int32{1, imgLen, 1, 64})
imgSin := mlx.Slice(unifiedSin, []int32{0, 0, 0, 0}, []int32{1, imgLen, 1, 64})
capCos := mlx.Slice(unifiedCos, []int32{0, imgLen, 0, 0}, []int32{1, imgLen + capLen, 1, 64})
capSin := mlx.Slice(unifiedSin, []int32{0, imgLen, 0, 0}, []int32{1, imgLen + capLen, 1, 64})
return &RoPECache{
ImgCos: imgCos,
ImgSin: imgSin,
CapCos: capCos,
CapSin: capSin,
UnifiedCos: unifiedCos,
UnifiedSin: unifiedSin,
ImgLen: imgLen,
CapLen: capLen,
}
}
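// Sizing sketch (illustrative numbers): a 128x128 latent with patchSize=2
// gives hTok = wTok = 64, so imgLen = 4096; with capLen = 512 the unified
// tables cover 4096+512 positions, and the trailing dimension of 64 is the
// rotary half-width (16+24+24 from the per-axis halves of axes_dims).
func exampleRoPECache(m *Transformer) *RoPECache {
	return m.PrepareRoPECache(64, 64, 512)
}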
// Forward runs the Z-Image transformer with precomputed RoPE
func (m *Transformer) Forward(x *mlx.Array, t *mlx.Array, capFeats *mlx.Array, rope *RoPECache) *mlx.Array {
imgLen := rope.ImgLen
// Timestep embedding -> [B, 256]
temb := m.TEmbed.Forward(mlx.MulScalar(t, m.TransformerConfig.TScale))
// Embed image patches -> [B, L_img, dim]
x = m.XEmbed.Forward(x)
// Embed caption features -> [B, L_cap, dim]
capEmb := m.CapEmbed.Forward(capFeats)
eps := m.NormEps
// Noise refiner: refine image patches with modulation
for _, refiner := range m.NoiseRefiners {
x = refiner.Forward(x, temb, rope.ImgCos, rope.ImgSin, eps)
}
// Context refiner: refine caption (no modulation)
for _, refiner := range m.ContextRefiners {
capEmb = refiner.Forward(capEmb, nil, rope.CapCos, rope.CapSin, eps)
}
// Concatenate image and caption for joint attention
unified := mlx.Concatenate([]*mlx.Array{x, capEmb}, 1)
// Main transformer layers use full unified RoPE
for _, layer := range m.Layers {
unified = layer.Forward(unified, temb, rope.UnifiedCos, rope.UnifiedSin, eps)
}
// Extract image tokens only
unifiedShape := unified.Shape()
B := unifiedShape[0]
imgOut := mlx.Slice(unified, []int32{0, 0, 0}, []int32{B, imgLen, unifiedShape[2]})
// Final layer
return m.FinalLayer.Forward(imgOut, temb)
}
// ForwardWithCache runs the transformer with layer caching for faster inference.
// On refresh steps (step % cacheInterval == 0), all layers are computed and cached.
// On other steps, shallow layers (0 to cacheLayers-1) reuse cached outputs.
func (m *Transformer) ForwardWithCache(
x *mlx.Array,
t *mlx.Array,
capFeats *mlx.Array,
rope *RoPECache,
stepCache *cache.StepCache,
step int,
cacheInterval int,
) *mlx.Array {
imgLen := rope.ImgLen
cacheLayers := stepCache.NumLayers()
eps := m.NormEps
// Timestep embedding -> [B, 256]
temb := m.TEmbed.Forward(mlx.MulScalar(t, m.TransformerConfig.TScale))
// Embed image patches -> [B, L_img, dim]
x = m.XEmbed.Forward(x)
// Context refiners: compute once on step 0, reuse forever
// (caption embedding doesn't depend on timestep or latents)
var capEmb *mlx.Array
if stepCache.GetConstant() != nil {
capEmb = stepCache.GetConstant()
} else {
capEmb = m.CapEmbed.Forward(capFeats)
for _, refiner := range m.ContextRefiners {
capEmb = refiner.Forward(capEmb, nil, rope.CapCos, rope.CapSin, eps)
}
stepCache.SetConstant(capEmb)
}
// Noise refiners: always compute (depend on x which changes each step)
for _, refiner := range m.NoiseRefiners {
x = refiner.Forward(x, temb, rope.ImgCos, rope.ImgSin, eps)
}
// Concatenate image and caption for joint attention
unified := mlx.Concatenate([]*mlx.Array{x, capEmb}, 1)
// Determine if this is a cache refresh step
refreshCache := stepCache.ShouldRefresh(step, cacheInterval)
// Main transformer layers with caching
for i, layer := range m.Layers {
if i < cacheLayers && !refreshCache && stepCache.Get(i) != nil {
// Use cached output for shallow layers
unified = stepCache.Get(i)
} else {
// Compute layer
unified = layer.Forward(unified, temb, rope.UnifiedCos, rope.UnifiedSin, eps)
// Cache shallow layer outputs on refresh steps
if i < cacheLayers && refreshCache {
stepCache.Set(i, unified)
}
}
}
// Extract image tokens only
unifiedShape := unified.Shape()
B := unifiedShape[0]
imgOut := mlx.Slice(unified, []int32{0, 0, 0}, []int32{B, imgLen, unifiedShape[2]})
// Final layer
return m.FinalLayer.Forward(imgOut, temb)
}
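// Refresh-schedule sketch, following the step%cacheInterval == 0 rule
// described above: with cacheInterval=3, steps 0, 3, 6, ... compute and cache
// every layer, while the steps in between replay the cached shallow layers.
func refreshSteps(totalSteps, cacheInterval int) []int {
	var steps []int
	for s := 0; s < totalSteps; s++ {
		if s%cacheInterval == 0 {
			steps = append(steps, s)
		}
	}
	return steps // e.g. totalSteps=9, cacheInterval=3 -> [0 3 6]
}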
// createCoordinateGrid builds a 3D position grid of shape [1, d0*d1*d2, 3], offset by (s0, s1, s2)
func createCoordinateGrid(d0, d1, d2, s0, s1, s2 int32) *mlx.Array {
// Create meshgrid and stack
total := d0 * d1 * d2
coords := make([]float32, total*3)
idx := 0
for i := int32(0); i < d0; i++ {
for j := int32(0); j < d1; j++ {
for k := int32(0); k < d2; k++ {
coords[idx*3+0] = float32(s0 + i)
coords[idx*3+1] = float32(s1 + j)
coords[idx*3+2] = float32(s2 + k)
idx++
}
}
}
return mlx.NewArray(coords, []int32{1, total, 3})
}
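// Ordering check for the grid above (sketch): createCoordinateGrid(1, 2, 2, 5, 0, 0)
// walks the last axis fastest, yielding rows (5,0,0), (5,0,1), (5,1,0), (5,1,1)
// in a [1, 4, 3] array, the same row-major order the image patches use.
func exampleGrid() {
	g := createCoordinateGrid(1, 2, 2, 5, 0, 0)
	fmt.Println(g.Shape()) // [1 4 3]
}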
// prepareRoPE3D computes cos/sin for 3-axis RoPE
// positions: [B, L, 3] with (h, w, t) coordinates
// axesDims: [32, 48, 48] - dimensions for each axis
// Returns: cos, sin each [B, L, 1, head_dim/2]
func prepareRoPE3D(positions *mlx.Array, axesDims []int32) (*mlx.Array, *mlx.Array) {
// Compute frequencies for each axis
// dims = [32, 48, 48], so halves = [16, 24, 24]
ropeTheta := float32(256.0)
freqs := make([]*mlx.Array, 3)
for axis := 0; axis < 3; axis++ {
half := axesDims[axis] / 2
f := make([]float32, half)
for i := int32(0); i < half; i++ {
f[i] = float32(math.Exp(-math.Log(float64(ropeTheta)) * float64(i) / float64(half)))
}
freqs[axis] = mlx.NewArray(f, []int32{1, 1, 1, half})
}
// Extract position coordinates
shape := positions.Shape()
B := shape[0]
L := shape[1]
// positions[:, :, 0] -> h positions
posH := mlx.Slice(positions, []int32{0, 0, 0}, []int32{B, L, 1})
posW := mlx.Slice(positions, []int32{0, 0, 1}, []int32{B, L, 2})
posT := mlx.Slice(positions, []int32{0, 0, 2}, []int32{B, L, 3})
// Compute args: pos * freqs for each axis
posH = mlx.ExpandDims(posH, 3) // [B, L, 1, 1]
posW = mlx.ExpandDims(posW, 3)
posT = mlx.ExpandDims(posT, 3)
argsH := mlx.Mul(posH, freqs[0]) // [B, L, 1, 16]
argsW := mlx.Mul(posW, freqs[1]) // [B, L, 1, 24]
argsT := mlx.Mul(posT, freqs[2]) // [B, L, 1, 24]
// Concatenate: [B, L, 1, 16+24+24=64]
args := mlx.Concatenate([]*mlx.Array{argsH, argsW, argsT}, 3)
// Compute cos and sin
return mlx.Cos(args), mlx.Sin(args)
}
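// Width check (sketch): with axesDims = [32, 48, 48] the per-axis halves are
// [16, 24, 24], so the concatenated cos/sin tables have a trailing width of
// 16+24+24 = 64, matching the head_dim/2 slices taken in PrepareRoPECache.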
// PatchifyLatents converts latents [1, C, H, W] to patches [1, L, C*patch^2]; assumes batch size 1
// Matches Python: x.reshape(C, 1, 1, H_tok, 2, W_tok, 2).transpose(1,2,3,5,4,6,0).reshape(1,-1,C*4)
func PatchifyLatents(latents *mlx.Array, patchSize int32) *mlx.Array {
shape := latents.Shape()
C := shape[1]
H := shape[2]
W := shape[3]
pH := H / patchSize // H_tok
pW := W / patchSize // W_tok
// Match Python exactly: reshape treating B=1 as part of contiguous data
// [1, C, H, W] -> [C, 1, 1, pH, 2, pW, 2]
x := mlx.Reshape(latents, C, 1, 1, pH, patchSize, pW, patchSize)
// Python: transpose(1, 2, 3, 5, 4, 6, 0)
// [C, 1, 1, pH, 2, pW, 2] -> [1, 1, pH, pW, 2, 2, C]
x = mlx.Transpose(x, 1, 2, 3, 5, 4, 6, 0)
// [1, 1, pH, pW, 2, 2, C] -> [1, pH*pW, C*4]
return mlx.Reshape(x, 1, pH*pW, C*patchSize*patchSize)
}
// UnpatchifyLatents converts patches [1, L, C*patch^2] back to [1, C, H, W]; assumes batch size 1
// Matches Python: out.reshape(1,1,H_tok,W_tok,2,2,C).transpose(6,0,1,2,4,3,5).reshape(1,C,H,W)
func UnpatchifyLatents(patches *mlx.Array, patchSize, H, W, C int32) *mlx.Array {
pH := H / patchSize
pW := W / patchSize
// [1, L, C*4] -> [1, 1, pH, pW, 2, 2, C]
x := mlx.Reshape(patches, 1, 1, pH, pW, patchSize, patchSize, C)
// Python: transpose(6, 0, 1, 2, 4, 3, 5)
// [1, 1, pH, pW, 2, 2, C] -> [C, 1, 1, pH, 2, pW, 2]
x = mlx.Transpose(x, 6, 0, 1, 2, 4, 3, 5)
// [C, 1, 1, pH, 2, pW, 2] -> [1, C, H, W]
return mlx.Reshape(x, 1, C, H, W)
}
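// Round-trip sketch: for matching arguments PatchifyLatents and
// UnpatchifyLatents are inverses, e.g. a [1, 16, 128, 128] latent with
// patchSize=2 becomes [1, 4096, 64] patches and reshapes back losslessly.
func examplePatchRoundTrip(latents *mlx.Array) *mlx.Array {
	patches := PatchifyLatents(latents, 2) // [1, 64*64, 16*2*2] = [1, 4096, 64]
	return UnpatchifyLatents(patches, 2, 128, 128, 16)
}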
