mirror of https://github.com/ollama/ollama.git
synced 2026-01-22 22:40:07 -05:00

Compare commits: ollama-ima ... main (2 commits)

| Author | SHA1 | Date |
|---|---|---|
| | 3b3bf6c217 | |
| | f52c21f457 | |
@@ -159,6 +159,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
            sb.WriteString(before)
            if !ok {
                fmt.Fprintln(&sb)
                scanner.Prompt.UseAlt = true
                continue
            }
@@ -1,205 +0,0 @@
---
title: Image Generation
---

<Warning>
Image generation is experimental and currently only available on macOS. This feature may change in future versions.
</Warning>

Image generation models create images from text prompts. Ollama supports diffusion-based image generation models through both Ollama's API and OpenAI-compatible endpoints.

## Usage

<Tabs>
<Tab title="CLI">

```shell
ollama run x/z-image-turbo "a sunset over mountains"
```

The generated image will be saved to the current directory.
</Tab>
<Tab title="cURL">

```shell
curl http://localhost:11434/api/generate -d '{
  "model": "x/z-image-turbo",
  "prompt": "a sunset over mountains",
  "stream": false
}'
```

</Tab>
<Tab title="Python">

```python
import ollama
import base64

response = ollama.generate(
    model='x/z-image-turbo',
    prompt='a sunset over mountains',
)

# Save the generated image
with open('output.png', 'wb') as f:
    f.write(base64.b64decode(response['image']))

print('Image saved to output.png')
```

</Tab>
<Tab title="JavaScript">

```javascript
import ollama from 'ollama'
import { writeFileSync } from 'fs'

const response = await ollama.generate({
  model: 'x/z-image-turbo',
  prompt: 'a sunset over mountains',
})

// Save the generated image
const imageBuffer = Buffer.from(response.image, 'base64')
writeFileSync('output.png', imageBuffer)

console.log('Image saved to output.png')
```

</Tab>
</Tabs>

### Response

The response includes an `image` field containing the base64-encoded image data:

```json
{
  "model": "x/z-image-turbo",
  "created_at": "2024-01-15T10:30:15.000000Z",
  "image": "iVBORw0KGgoAAAANSUhEUg...",
  "done": true,
  "done_reason": "stop",
  "total_duration": 15000000000,
  "load_duration": 2000000000
}
```

## Image dimensions

Customize the output image size using the `width` and `height` parameters:

<Tabs>
<Tab title="cURL">

```shell
curl http://localhost:11434/api/generate -d '{
  "model": "x/z-image-turbo",
  "prompt": "a portrait of a robot artist",
  "width": 768,
  "height": 1024,
  "stream": false
}'
```

</Tab>
<Tab title="Python">

```python
import ollama

response = ollama.generate(
    model='x/z-image-turbo',
    prompt='a portrait of a robot artist',
    width=768,
    height=1024,
)
```

</Tab>
<Tab title="JavaScript">

```javascript
import ollama from 'ollama'

const response = await ollama.generate({
  model: 'x/z-image-turbo',
  prompt: 'a portrait of a robot artist',
  width: 768,
  height: 1024,
})
```

</Tab>
</Tabs>

## Streaming progress

When streaming is enabled (the default), progress updates are sent during image generation:

```json
{
  "model": "x/z-image-turbo",
  "created_at": "2024-01-15T10:30:00.000000Z",
  "completed": 5,
  "total": 20,
  "done": false
}
```

The `completed` and `total` fields indicate the current progress through the diffusion steps.
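As an illustration, here is a minimal sketch of consuming these progress events against the native `/api/generate` endpoint and saving the final image. It uses Python's `requests` package (an assumed dependency; any streaming HTTP client works) and only the field names shown in the examples above:

```python
import base64
import json

import requests  # assumed dependency; any streaming HTTP client works

# Stream /api/generate, report diffusion progress, then save the final image.
with requests.post(
    "http://localhost:11434/api/generate",
    json={"model": "x/z-image-turbo", "prompt": "a sunset over mountains"},
    stream=True,
) as resp:
    for line in resp.iter_lines():
        if not line:
            continue
        event = json.loads(line)
        if not event.get("done"):
            # Progress events carry the completed/total step counters.
            print(f"step {event.get('completed', 0)}/{event.get('total', '?')}")
        elif "image" in event:
            # The final event includes the base64-encoded image.
            with open("output.png", "wb") as f:
                f.write(base64.b64decode(event["image"]))
```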
## Parameters

| Parameter | Description | Default |
|-----------|-------------|---------|
| `prompt` | Text description of the image to generate | Required |
| `width` | Width of the generated image in pixels | Model default |
| `height` | Height of the generated image in pixels | Model default |
| `steps` | Number of diffusion steps | Model default |
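For example, lowering the step count trades quality for speed. The sketch below mirrors the Python tab above with `steps` added; whether the client library forwards `steps` as a keyword is an assumption, the underlying `/api/generate` field is the one listed in this table:

```python
import ollama

# Override the model's default dimensions and step count.
# NOTE: forwarding `steps` through the client library is an assumption.
response = ollama.generate(
    model='x/z-image-turbo',
    prompt='a sunset over mountains',
    width=1024,
    height=1024,
    steps=10,
)
```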
## OpenAI compatibility

Image generation is also available through the OpenAI-compatible `/v1/images/generations` endpoint:

<Tabs>
<Tab title="cURL">

```shell
curl http://localhost:11434/v1/images/generations \
  -H "Content-Type: application/json" \
  -d '{
    "model": "x/z-image-turbo",
    "prompt": "a sunset over mountains",
    "size": "1024x1024",
    "response_format": "b64_json"
  }'
```

</Tab>
<Tab title="Python">

```python
from openai import OpenAI

client = OpenAI(
    base_url='http://localhost:11434/v1/',
    api_key='ollama',  # required but ignored
)

response = client.images.generate(
    model='x/z-image-turbo',
    prompt='a sunset over mountains',
    size='1024x1024',
    response_format='b64_json',
)

print(response.data[0].b64_json[:50] + '...')
```

</Tab>
<Tab title="JavaScript">

```javascript
import OpenAI from 'openai'

const openai = new OpenAI({
  baseURL: 'http://localhost:11434/v1/',
  apiKey: 'ollama', // required but ignored
})

const response = await openai.images.generate({
  model: 'x/z-image-turbo',
  prompt: 'a sunset over mountains',
  size: '1024x1024',
  response_format: 'b64_json',
})

console.log(response.data[0].b64_json.slice(0, 50) + '...')
```

</Tab>
</Tabs>

See [OpenAI compatibility](/api/openai-compatibility#v1imagesgenerations-experimental) for more details.
@@ -93,7 +93,6 @@
        "/capabilities/thinking",
        "/capabilities/structured-outputs",
        "/capabilities/vision",
        "/capabilities/image-generation",
        "/capabilities/embeddings",
        "/capabilities/tool-calling",
        "/capabilities/web-search"
@@ -117,15 +117,6 @@ components:
        top_logprobs:
          type: integer
          description: Number of most likely tokens to return at each token position when logprobs are enabled
        width:
          type: integer
          description: (Experimental) Width of the generated image in pixels. For image generation models only.
        height:
          type: integer
          description: (Experimental) Height of the generated image in pixels. For image generation models only.
        steps:
          type: integer
          description: (Experimental) Number of diffusion steps. For image generation models only.
    GenerateResponse:
      type: object
      properties:
@@ -170,15 +161,6 @@ components:
          items:
            $ref: "#/components/schemas/Logprob"
          description: Log probability information for the generated tokens when logprobs are enabled
        image:
          type: string
          description: (Experimental) Base64-encoded generated image data. For image generation models only.
        completed:
          type: integer
          description: (Experimental) Number of completed diffusion steps. For image generation streaming progress.
        total:
          type: integer
          description: (Experimental) Total number of diffusion steps. For image generation streaming progress.
    GenerateStreamEvent:
      type: object
      properties:
@@ -218,15 +200,6 @@ components:
        eval_duration:
          type: integer
          description: Time spent generating tokens in nanoseconds
        image:
          type: string
          description: (Experimental) Base64-encoded generated image data. For image generation models only.
        completed:
          type: integer
          description: (Experimental) Number of completed diffusion steps. For image generation streaming progress.
        total:
          type: integer
          description: (Experimental) Total number of diffusion steps. For image generation streaming progress.
    ChatMessage:
      type: object
      required: [role, content]
@@ -95,7 +95,21 @@ func (i *Instance) Readline() (string, error) {

    var currentLineBuf []rune

    // draining tracks if we're processing buffered input from cooked mode.
    // In cooked mode Enter sends \n, but in raw mode Ctrl+J sends \n.
    // We treat \n from cooked mode as submit, not multiline.
    // We check Buffered() after the first read since the bufio buffer is
    // empty until then. This is compatible with """ multiline mode in
    // interactive.go since each Readline() call is independent.
    var draining, stopDraining bool

    for {
        // Apply deferred state change from previous iteration
        if stopDraining {
            draining = false
            stopDraining = false
        }

        // don't show placeholder when pasting unless we're in multiline mode
        showPlaceholder := !i.Pasting || i.Prompt.UseAlt
        if buf.IsEmpty() && showPlaceholder {
@@ -105,6 +119,15 @@ func (i *Instance) Readline() (string, error) {

        r, err := i.Terminal.Read()

        // After reading, check if there's more buffered data. If so, we're
        // processing cooked-mode input. Once buffer empties, the current
        // char is the last buffered one (still drain it), then stop next iteration.
        if i.Terminal.reader.Buffered() > 0 {
            draining = true
        } else if draining {
            stopDraining = true
        }

        if buf.IsEmpty() {
            fmt.Print(ClearToEOL)
        }
@@ -232,15 +255,20 @@ func (i *Instance) Readline() (string, error) {
            fd := os.Stdin.Fd()
            return handleCharCtrlZ(fd, i.Terminal.termios)
        case CharCtrlJ:
            i.pastedLines = append(i.pastedLines, buf.String())
            buf.Buf.Clear()
            buf.Pos = 0
            buf.DisplayPos = 0
            buf.LineHasSpace.Clear()
            fmt.Println()
            fmt.Print(i.Prompt.AltPrompt)
            i.Prompt.UseAlt = true
            continue
            // If not draining cooked-mode input, treat as multiline
            if !draining {
                i.pastedLines = append(i.pastedLines, buf.String())
                buf.Buf.Clear()
                buf.Pos = 0
                buf.DisplayPos = 0
                buf.LineHasSpace.Clear()
                fmt.Println()
                fmt.Print(i.Prompt.AltPrompt)
                i.Prompt.UseAlt = true
                continue
            }
            // Draining cooked-mode input: treat \n as submit
            fallthrough
        case CharEnter:
            output := buf.String()
            if len(i.pastedLines) > 0 {
@@ -161,6 +161,17 @@ func (m *ModelManifest) HasTensorLayers() bool {
    return false
}

// TotalTensorSize returns the total size in bytes of all tensor layers.
func (m *ModelManifest) TotalTensorSize() int64 {
    var total int64
    for _, layer := range m.Manifest.Layers {
        if layer.MediaType == "application/vnd.ollama.image.tensor" {
            total += layer.Size
        }
    }
    return total
}

// ModelInfo contains metadata about an image generation model.
type ModelInfo struct {
    Architecture string
@@ -5,6 +5,37 @@ import (
    "testing"
)

func TestTotalTensorSize(t *testing.T) {
    m := &ModelManifest{
        Manifest: &Manifest{
            Layers: []ManifestLayer{
                {MediaType: "application/vnd.ollama.image.tensor", Size: 1000},
                {MediaType: "application/vnd.ollama.image.tensor", Size: 2000},
                {MediaType: "application/vnd.ollama.image.json", Size: 500}, // not a tensor
                {MediaType: "application/vnd.ollama.image.tensor", Size: 3000},
            },
        },
    }

    got := m.TotalTensorSize()
    want := int64(6000)
    if got != want {
        t.Errorf("TotalTensorSize() = %d, want %d", got, want)
    }
}

func TestTotalTensorSizeEmpty(t *testing.T) {
    m := &ModelManifest{
        Manifest: &Manifest{
            Layers: []ManifestLayer{},
        },
    }

    if got := m.TotalTensorSize(); got != 0 {
        t.Errorf("TotalTensorSize() = %d, want 0", got)
    }
}

func TestManifestAndBlobDirsRespectOLLAMAModels(t *testing.T) {
    modelsDir := filepath.Join(t.TempDir(), "models")
@@ -16,18 +16,9 @@ import (
    "runtime"
)

// GB is a convenience constant for gigabytes.
const GB = 1024 * 1024 * 1024

// SupportedBackends lists the backends that support image generation.
var SupportedBackends = []string{"metal", "cuda", "cpu"}

// modelVRAMEstimates maps pipeline class names to their estimated VRAM requirements.
var modelVRAMEstimates = map[string]uint64{
    "ZImagePipeline": 21 * GB, // ~21GB for Z-Image (text encoder + transformer + VAE)
    "FluxPipeline":   20 * GB, // ~20GB for Flux
}

// CheckPlatformSupport validates that image generation is supported on the current platform.
// Returns nil if supported, or an error describing why it's not supported.
func CheckPlatformSupport() error {
@@ -47,17 +38,6 @@ func CheckPlatformSupport() error {
    }
}

// CheckMemoryRequirements validates that there's enough memory for image generation.
// Returns nil if memory is sufficient, or an error if not.
func CheckMemoryRequirements(modelName string, availableMemory uint64) error {
    required := EstimateVRAM(modelName)
    if availableMemory < required {
        return fmt.Errorf("insufficient memory for image generation: need %d GB, have %d GB",
            required/GB, availableMemory/GB)
    }
    return nil
}

// ResolveModelName checks if a model name is a known image generation model.
// Returns the normalized model name if found, empty string otherwise.
func ResolveModelName(modelName string) string {
@@ -68,16 +48,6 @@ func ResolveModelName(modelName string) string {
    return ""
}

// EstimateVRAM returns the estimated VRAM needed for an image generation model.
// Returns a conservative default of 21GB if the model type cannot be determined.
func EstimateVRAM(modelName string) uint64 {
    className := DetectModelType(modelName)
    if estimate, ok := modelVRAMEstimates[className]; ok {
        return estimate
    }
    return 21 * GB
}

// DetectModelType reads model_index.json and returns the model type.
// Checks both "architecture" (Ollama format) and "_class_name" (diffusers format).
// Returns empty string if detection fails.
@@ -30,69 +30,6 @@ func TestCheckPlatformSupport(t *testing.T) {
    }
}

func TestCheckMemoryRequirements(t *testing.T) {
    tests := []struct {
        name            string
        availableMemory uint64
        wantErr         bool
    }{
        {
            name:            "sufficient memory",
            availableMemory: 32 * GB,
            wantErr:         false,
        },
        {
            name:            "exactly enough memory",
            availableMemory: 21 * GB,
            wantErr:         false,
        },
        {
            name:            "insufficient memory",
            availableMemory: 16 * GB,
            wantErr:         true,
        },
        {
            name:            "zero memory",
            availableMemory: 0,
            wantErr:         true,
        },
    }

    for _, tt := range tests {
        t.Run(tt.name, func(t *testing.T) {
            // Use a non-existent model name which will default to 21GB estimate
            err := CheckMemoryRequirements("nonexistent-model", tt.availableMemory)
            if (err != nil) != tt.wantErr {
                t.Errorf("CheckMemoryRequirements() error = %v, wantErr %v", err, tt.wantErr)
            }
        })
    }
}

func TestModelVRAMEstimates(t *testing.T) {
    // Verify the VRAM estimates map has expected entries
    expected := map[string]uint64{
        "ZImagePipeline": 21 * GB,
        "FluxPipeline":   20 * GB,
    }

    for name, expectedVRAM := range expected {
        if actual, ok := modelVRAMEstimates[name]; !ok {
            t.Errorf("Missing VRAM estimate for %s", name)
        } else if actual != expectedVRAM {
            t.Errorf("VRAM estimate for %s = %d GB, want %d GB", name, actual/GB, expectedVRAM/GB)
        }
    }
}

func TestEstimateVRAMDefault(t *testing.T) {
    // Non-existent model should return default 21GB
    vram := EstimateVRAM("nonexistent-model-that-does-not-exist")
    if vram != 21*GB {
        t.Errorf("EstimateVRAM() = %d GB, want 21 GB", vram/GB)
    }
}

func TestResolveModelName(t *testing.T) {
    // Non-existent model should return empty string
    result := ResolveModelName("nonexistent-model")
@@ -78,14 +78,6 @@ func Execute(args []string) error {
    slog.Info("MLX library initialized")
    slog.Info("starting image runner", "model", *modelName, "port", *port)

    // Check memory requirements before loading
    requiredMemory := imagegen.EstimateVRAM(*modelName)
    availableMemory := mlx.GetMemoryLimit()
    if availableMemory > 0 && availableMemory < requiredMemory {
        return fmt.Errorf("insufficient memory for image generation: need %d GB, have %d GB",
            requiredMemory/(1024*1024*1024), availableMemory/(1024*1024*1024))
    }

    // Detect model type and load appropriate model
    modelType := imagegen.DetectModelType(*modelName)
    slog.Info("detected model type", "type", modelType)
@@ -104,11 +104,17 @@ func NewServer(modelName string) (*Server, error) {
        slog.Debug("mlx subprocess library path", "LD_LIBRARY_PATH", pathEnvVal)
    }

    // Get total weight size from manifest
    var weightSize uint64
    if manifest, err := LoadManifest(modelName); err == nil {
        weightSize = uint64(manifest.TotalTensorSize())
    }

    s := &Server{
        cmd:       cmd,
        port:      port,
        modelName: modelName,
        vramSize:  EstimateVRAM(modelName),
        vramSize:  weightSize,
        done:      make(chan error, 1),
        client:    &http.Client{Timeout: 10 * time.Minute},
    }
@@ -38,40 +38,6 @@ func TestPlatformSupport(t *testing.T) {
    }
}

// TestMemoryRequirementsError verifies memory check returns clear error.
func TestMemoryRequirementsError(t *testing.T) {
    // Test with insufficient memory
    err := CheckMemoryRequirements("test-model", 8*GB)
    if err == nil {
        t.Error("Expected error for insufficient memory (8GB < 21GB default)")
    }

    // Test with sufficient memory
    err = CheckMemoryRequirements("test-model", 32*GB)
    if err != nil {
        t.Errorf("Expected no error for sufficient memory (32GB), got: %v", err)
    }
}

// TestEstimateVRAMReturnsReasonableDefaults verifies VRAM estimates are sensible.
func TestEstimateVRAMReturnsReasonableDefaults(t *testing.T) {
    // Unknown model should return default (21GB)
    vram := EstimateVRAM("unknown-model")
    if vram < 10*GB || vram > 100*GB {
        t.Errorf("VRAM estimate %d GB is outside reasonable range (10-100 GB)", vram/GB)
    }

    // Verify known pipeline estimates exist and are reasonable
    for name, estimate := range modelVRAMEstimates {
        if estimate < 10*GB {
            t.Errorf("VRAM estimate for %s (%d GB) is suspiciously low", name, estimate/GB)
        }
        if estimate > 200*GB {
            t.Errorf("VRAM estimate for %s (%d GB) is suspiciously high", name, estimate/GB)
        }
    }
}

// TestServerInterfaceCompliance verifies Server implements llm.LlamaServer.
// This is a compile-time check but we document it as a test.
func TestServerInterfaceCompliance(t *testing.T) {