Compare commits


1 Commit

Author: jmorganca
SHA1: 8b4410633d
Date: 2026-01-22 14:09:58 -08:00
Message: Add image generation documentation

- Add image generation capability page with API usage examples
- Add image-generation to docs.json navigation
- Update openapi.yaml with image generation request/response fields
  - Request: width, height, steps
  - Response: image, completed, total
12 changed files with 378 additions and 87 deletions

View File

@@ -159,7 +159,6 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
sb.WriteString(before)
if !ok {
fmt.Fprintln(&sb)
scanner.Prompt.UseAlt = true
continue
}

View File

@@ -0,0 +1,205 @@
---
title: Image Generation
---
<Warning>
Image generation is experimental and currently only available on macOS. This feature may change in future versions.
</Warning>
Image generation models create images from text prompts. Ollama supports diffusion-based image generation models, available through both Ollama's native API and its OpenAI-compatible endpoints.
## Usage
<Tabs>
<Tab title="CLI">
```shell
ollama run x/z-image-turbo "a sunset over mountains"
```
The generated image will be saved to the current directory.
</Tab>
<Tab title="cURL">
```shell
curl http://localhost:11434/api/generate -d '{
  "model": "x/z-image-turbo",
  "prompt": "a sunset over mountains",
  "stream": false
}'
```
</Tab>
<Tab title="Python">
```python
import ollama
import base64
response = ollama.generate(
    model='x/z-image-turbo',
    prompt='a sunset over mountains',
)

# Save the generated image
with open('output.png', 'wb') as f:
    f.write(base64.b64decode(response['image']))

print('Image saved to output.png')
```
</Tab>
<Tab title="JavaScript">
```javascript
import ollama from 'ollama'
import { writeFileSync } from 'fs'
const response = await ollama.generate({
  model: 'x/z-image-turbo',
  prompt: 'a sunset over mountains',
})
// Save the generated image
const imageBuffer = Buffer.from(response.image, 'base64')
writeFileSync('output.png', imageBuffer)
console.log('Image saved to output.png')
```
</Tab>
</Tabs>
### Response
The response includes an `image` field containing the base64-encoded image data:
```json
{
  "model": "x/z-image-turbo",
  "created_at": "2024-01-15T10:30:15.000000Z",
  "image": "iVBORw0KGgoAAAANSUhEUg...",
  "done": true,
  "done_reason": "stop",
  "total_duration": 15000000000,
  "load_duration": 2000000000
}
```
## Image dimensions
Customize the output image size using the `width` and `height` parameters:
<Tabs>
<Tab title="cURL">
```shell
curl http://localhost:11434/api/generate -d '{
  "model": "x/z-image-turbo",
  "prompt": "a portrait of a robot artist",
  "width": 768,
  "height": 1024,
  "stream": false
}'
```
</Tab>
<Tab title="Python">
```python
import ollama
response = ollama.generate(
    model='x/z-image-turbo',
    prompt='a portrait of a robot artist',
    width=768,
    height=1024,
)
```
</Tab>
<Tab title="JavaScript">
```javascript
import ollama from 'ollama'
const response = await ollama.generate({
  model: 'x/z-image-turbo',
  prompt: 'a portrait of a robot artist',
  width: 768,
  height: 1024,
})
```
</Tab>
</Tabs>
## Streaming progress
When streaming is enabled (the default), progress updates are sent during image generation:
```json
{
  "model": "x/z-image-turbo",
  "created_at": "2024-01-15T10:30:00.000000Z",
  "completed": 5,
  "total": 20,
  "done": false
}
```
The `completed` and `total` fields indicate the current progress through the diffusion steps.
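To consume these progress updates programmatically, read the streaming response one JSON object per line. The following is a minimal Python sketch against the raw HTTP API using the `requests` library; it assumes the default port and the field names shown in the responses above (`completed`, `total`, `image`), printing progress as steps complete and saving the final image.
```python
import base64
import json

import requests

# Minimal sketch: stream /api/generate and report diffusion progress.
# Assumes the default Ollama port and the response fields documented above.
with requests.post(
    'http://localhost:11434/api/generate',
    json={'model': 'x/z-image-turbo', 'prompt': 'a sunset over mountains'},
    stream=True,
) as resp:
    for line in resp.iter_lines():
        if not line:
            continue
        chunk = json.loads(line)
        if chunk.get('total'):
            print(f"step {chunk.get('completed', 0)}/{chunk['total']}")
        if chunk.get('done') and chunk.get('image'):
            with open('output.png', 'wb') as f:
                f.write(base64.b64decode(chunk['image']))
```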
## Parameters
| Parameter | Description | Default |
|-----------|-------------|---------|
| `prompt` | Text description of the image to generate | Required |
| `width` | Width of the generated image in pixels | Model default |
| `height` | Height of the generated image in pixels | Model default |
| `steps` | Number of diffusion steps (see the sketch below) | Model default |
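As a rough sketch, `steps` can be combined with the size parameters to trade fidelity for speed. This assumes the Python client forwards `steps` to the API the same way it forwards `width` and `height` in the examples above.
```python
import ollama

# Hedged example: steps is assumed to be passed through like width/height.
response = ollama.generate(
    model='x/z-image-turbo',
    prompt='a portrait of a robot artist',
    width=768,
    height=1024,
    steps=12,  # fewer diffusion steps: faster, lower fidelity
)
```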
## OpenAI compatibility
Image generation is also available through the OpenAI-compatible `/v1/images/generations` endpoint:
<Tabs>
<Tab title="cURL">
```shell
curl http://localhost:11434/v1/images/generations \
  -H "Content-Type: application/json" \
  -d '{
    "model": "x/z-image-turbo",
    "prompt": "a sunset over mountains",
    "size": "1024x1024",
    "response_format": "b64_json"
  }'
```
</Tab>
<Tab title="Python">
```python
from openai import OpenAI
client = OpenAI(
    base_url='http://localhost:11434/v1/',
    api_key='ollama',  # required but ignored
)

response = client.images.generate(
    model='x/z-image-turbo',
    prompt='a sunset over mountains',
    size='1024x1024',
    response_format='b64_json',
)
print(response.data[0].b64_json[:50] + '...')
```
</Tab>
<Tab title="JavaScript">
```javascript
import OpenAI from 'openai'
const openai = new OpenAI({
  baseURL: 'http://localhost:11434/v1/',
  apiKey: 'ollama', // required but ignored
})

const response = await openai.images.generate({
  model: 'x/z-image-turbo',
  prompt: 'a sunset over mountains',
  size: '1024x1024',
  response_format: 'b64_json',
})
console.log(response.data[0].b64_json.slice(0, 50) + '...')
```
</Tab>
</Tabs>
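As with the native API, the `b64_json` payload can be decoded and written to disk. A minimal sketch using the official OpenAI Python client:
```python
import base64
from openai import OpenAI

client = OpenAI(base_url='http://localhost:11434/v1/', api_key='ollama')

response = client.images.generate(
    model='x/z-image-turbo',
    prompt='a sunset over mountains',
    response_format='b64_json',
)

# Decode the base64 payload and save it as a PNG.
with open('output.png', 'wb') as f:
    f.write(base64.b64decode(response.data[0].b64_json))
```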
See [OpenAI compatibility](/api/openai-compatibility#v1imagesgenerations-experimental) for more details.

View File

@@ -93,6 +93,7 @@
"/capabilities/thinking",
"/capabilities/structured-outputs",
"/capabilities/vision",
"/capabilities/image-generation",
"/capabilities/embeddings",
"/capabilities/tool-calling",
"/capabilities/web-search"

View File

@@ -117,6 +117,15 @@ components:
top_logprobs:
type: integer
description: Number of most likely tokens to return at each token position when logprobs are enabled
width:
type: integer
description: (Experimental) Width of the generated image in pixels. For image generation models only.
height:
type: integer
description: (Experimental) Height of the generated image in pixels. For image generation models only.
steps:
type: integer
description: (Experimental) Number of diffusion steps. For image generation models only.
GenerateResponse:
type: object
properties:
@@ -161,6 +170,15 @@ components:
items:
$ref: "#/components/schemas/Logprob"
description: Log probability information for the generated tokens when logprobs are enabled
image:
type: string
description: (Experimental) Base64-encoded generated image data. For image generation models only.
completed:
type: integer
description: (Experimental) Number of completed diffusion steps. For image generation streaming progress.
total:
type: integer
description: (Experimental) Total number of diffusion steps. For image generation streaming progress.
GenerateStreamEvent:
type: object
properties:
@@ -200,6 +218,15 @@ components:
eval_duration:
type: integer
description: Time spent generating tokens in nanoseconds
image:
type: string
description: (Experimental) Base64-encoded generated image data. For image generation models only.
completed:
type: integer
description: (Experimental) Number of completed diffusion steps. For image generation streaming progress.
total:
type: integer
description: (Experimental) Total number of diffusion steps. For image generation streaming progress.
ChatMessage:
type: object
required: [role, content]

View File

@@ -95,21 +95,7 @@ func (i *Instance) Readline() (string, error) {
var currentLineBuf []rune
// draining tracks if we're processing buffered input from cooked mode.
// In cooked mode Enter sends \n, but in raw mode Ctrl+J sends \n.
// We treat \n from cooked mode as submit, not multiline.
// We check Buffered() after the first read since the bufio buffer is
// empty until then. This is compatible with """ multiline mode in
// interactive.go since each Readline() call is independent.
var draining, stopDraining bool
for {
// Apply deferred state change from previous iteration
if stopDraining {
draining = false
stopDraining = false
}
// don't show placeholder when pasting unless we're in multiline mode
showPlaceholder := !i.Pasting || i.Prompt.UseAlt
if buf.IsEmpty() && showPlaceholder {
@@ -119,15 +105,6 @@ func (i *Instance) Readline() (string, error) {
r, err := i.Terminal.Read()
// After reading, check if there's more buffered data. If so, we're
// processing cooked-mode input. Once buffer empties, the current
// char is the last buffered one (still drain it), then stop next iteration.
if i.Terminal.reader.Buffered() > 0 {
draining = true
} else if draining {
stopDraining = true
}
if buf.IsEmpty() {
fmt.Print(ClearToEOL)
}
@@ -255,20 +232,15 @@ func (i *Instance) Readline() (string, error) {
fd := os.Stdin.Fd()
return handleCharCtrlZ(fd, i.Terminal.termios)
case CharCtrlJ:
// If not draining cooked-mode input, treat as multiline
if !draining {
i.pastedLines = append(i.pastedLines, buf.String())
buf.Buf.Clear()
buf.Pos = 0
buf.DisplayPos = 0
buf.LineHasSpace.Clear()
fmt.Println()
fmt.Print(i.Prompt.AltPrompt)
i.Prompt.UseAlt = true
continue
}
// Draining cooked-mode input: treat \n as submit
fallthrough
i.pastedLines = append(i.pastedLines, buf.String())
buf.Buf.Clear()
buf.Pos = 0
buf.DisplayPos = 0
buf.LineHasSpace.Clear()
fmt.Println()
fmt.Print(i.Prompt.AltPrompt)
i.Prompt.UseAlt = true
continue
case CharEnter:
output := buf.String()
if len(i.pastedLines) > 0 {

View File

@@ -161,17 +161,6 @@ func (m *ModelManifest) HasTensorLayers() bool {
return false
}
// TotalTensorSize returns the total size in bytes of all tensor layers.
func (m *ModelManifest) TotalTensorSize() int64 {
var total int64
for _, layer := range m.Manifest.Layers {
if layer.MediaType == "application/vnd.ollama.image.tensor" {
total += layer.Size
}
}
return total
}
// ModelInfo contains metadata about an image generation model.
type ModelInfo struct {
Architecture string

View File

@@ -5,37 +5,6 @@ import (
"testing"
)
func TestTotalTensorSize(t *testing.T) {
m := &ModelManifest{
Manifest: &Manifest{
Layers: []ManifestLayer{
{MediaType: "application/vnd.ollama.image.tensor", Size: 1000},
{MediaType: "application/vnd.ollama.image.tensor", Size: 2000},
{MediaType: "application/vnd.ollama.image.json", Size: 500}, // not a tensor
{MediaType: "application/vnd.ollama.image.tensor", Size: 3000},
},
},
}
got := m.TotalTensorSize()
want := int64(6000)
if got != want {
t.Errorf("TotalTensorSize() = %d, want %d", got, want)
}
}
func TestTotalTensorSizeEmpty(t *testing.T) {
m := &ModelManifest{
Manifest: &Manifest{
Layers: []ManifestLayer{},
},
}
if got := m.TotalTensorSize(); got != 0 {
t.Errorf("TotalTensorSize() = %d, want 0", got)
}
}
func TestManifestAndBlobDirsRespectOLLAMAModels(t *testing.T) {
modelsDir := filepath.Join(t.TempDir(), "models")

View File

@@ -16,9 +16,18 @@ import (
"runtime"
)
// GB is a convenience constant for gigabytes.
const GB = 1024 * 1024 * 1024
// SupportedBackends lists the backends that support image generation.
var SupportedBackends = []string{"metal", "cuda", "cpu"}
// modelVRAMEstimates maps pipeline class names to their estimated VRAM requirements.
var modelVRAMEstimates = map[string]uint64{
"ZImagePipeline": 21 * GB, // ~21GB for Z-Image (text encoder + transformer + VAE)
"FluxPipeline": 20 * GB, // ~20GB for Flux
}
// CheckPlatformSupport validates that image generation is supported on the current platform.
// Returns nil if supported, or an error describing why it's not supported.
func CheckPlatformSupport() error {
@@ -38,6 +47,17 @@ func CheckPlatformSupport() error {
}
}
// CheckMemoryRequirements validates that there's enough memory for image generation.
// Returns nil if memory is sufficient, or an error if not.
func CheckMemoryRequirements(modelName string, availableMemory uint64) error {
required := EstimateVRAM(modelName)
if availableMemory < required {
return fmt.Errorf("insufficient memory for image generation: need %d GB, have %d GB",
required/GB, availableMemory/GB)
}
return nil
}
// ResolveModelName checks if a model name is a known image generation model.
// Returns the normalized model name if found, empty string otherwise.
func ResolveModelName(modelName string) string {
@@ -48,6 +68,16 @@ func ResolveModelName(modelName string) string {
return ""
}
// EstimateVRAM returns the estimated VRAM needed for an image generation model.
// Returns a conservative default of 21GB if the model type cannot be determined.
func EstimateVRAM(modelName string) uint64 {
className := DetectModelType(modelName)
if estimate, ok := modelVRAMEstimates[className]; ok {
return estimate
}
return 21 * GB
}
// DetectModelType reads model_index.json and returns the model type.
// Checks both "architecture" (Ollama format) and "_class_name" (diffusers format).
// Returns empty string if detection fails.

View File

@@ -30,6 +30,69 @@ func TestCheckPlatformSupport(t *testing.T) {
}
}
func TestCheckMemoryRequirements(t *testing.T) {
tests := []struct {
name string
availableMemory uint64
wantErr bool
}{
{
name: "sufficient memory",
availableMemory: 32 * GB,
wantErr: false,
},
{
name: "exactly enough memory",
availableMemory: 21 * GB,
wantErr: false,
},
{
name: "insufficient memory",
availableMemory: 16 * GB,
wantErr: true,
},
{
name: "zero memory",
availableMemory: 0,
wantErr: true,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
// Use a non-existent model name which will default to 21GB estimate
err := CheckMemoryRequirements("nonexistent-model", tt.availableMemory)
if (err != nil) != tt.wantErr {
t.Errorf("CheckMemoryRequirements() error = %v, wantErr %v", err, tt.wantErr)
}
})
}
}
func TestModelVRAMEstimates(t *testing.T) {
// Verify the VRAM estimates map has expected entries
expected := map[string]uint64{
"ZImagePipeline": 21 * GB,
"FluxPipeline": 20 * GB,
}
for name, expectedVRAM := range expected {
if actual, ok := modelVRAMEstimates[name]; !ok {
t.Errorf("Missing VRAM estimate for %s", name)
} else if actual != expectedVRAM {
t.Errorf("VRAM estimate for %s = %d GB, want %d GB", name, actual/GB, expectedVRAM/GB)
}
}
}
func TestEstimateVRAMDefault(t *testing.T) {
// Non-existent model should return default 21GB
vram := EstimateVRAM("nonexistent-model-that-does-not-exist")
if vram != 21*GB {
t.Errorf("EstimateVRAM() = %d GB, want 21 GB", vram/GB)
}
}
func TestResolveModelName(t *testing.T) {
// Non-existent model should return empty string
result := ResolveModelName("nonexistent-model")

View File

@@ -78,6 +78,14 @@ func Execute(args []string) error {
slog.Info("MLX library initialized")
slog.Info("starting image runner", "model", *modelName, "port", *port)
// Check memory requirements before loading
requiredMemory := imagegen.EstimateVRAM(*modelName)
availableMemory := mlx.GetMemoryLimit()
if availableMemory > 0 && availableMemory < requiredMemory {
return fmt.Errorf("insufficient memory for image generation: need %d GB, have %d GB",
requiredMemory/(1024*1024*1024), availableMemory/(1024*1024*1024))
}
// Detect model type and load appropriate model
modelType := imagegen.DetectModelType(*modelName)
slog.Info("detected model type", "type", modelType)

View File

@@ -104,17 +104,11 @@ func NewServer(modelName string) (*Server, error) {
slog.Debug("mlx subprocess library path", "LD_LIBRARY_PATH", pathEnvVal)
}
// Get total weight size from manifest
var weightSize uint64
if manifest, err := LoadManifest(modelName); err == nil {
weightSize = uint64(manifest.TotalTensorSize())
}
s := &Server{
cmd: cmd,
port: port,
modelName: modelName,
vramSize: weightSize,
vramSize: EstimateVRAM(modelName),
done: make(chan error, 1),
client: &http.Client{Timeout: 10 * time.Minute},
}

View File

@@ -38,6 +38,40 @@ func TestPlatformSupport(t *testing.T) {
}
}
// TestMemoryRequirementsError verifies memory check returns clear error.
func TestMemoryRequirementsError(t *testing.T) {
// Test with insufficient memory
err := CheckMemoryRequirements("test-model", 8*GB)
if err == nil {
t.Error("Expected error for insufficient memory (8GB < 21GB default)")
}
// Test with sufficient memory
err = CheckMemoryRequirements("test-model", 32*GB)
if err != nil {
t.Errorf("Expected no error for sufficient memory (32GB), got: %v", err)
}
}
// TestEstimateVRAMReturnsReasonableDefaults verifies VRAM estimates are sensible.
func TestEstimateVRAMReturnsReasonableDefaults(t *testing.T) {
// Unknown model should return default (21GB)
vram := EstimateVRAM("unknown-model")
if vram < 10*GB || vram > 100*GB {
t.Errorf("VRAM estimate %d GB is outside reasonable range (10-100 GB)", vram/GB)
}
// Verify known pipeline estimates exist and are reasonable
for name, estimate := range modelVRAMEstimates {
if estimate < 10*GB {
t.Errorf("VRAM estimate for %s (%d GB) is suspiciously low", name, estimate/GB)
}
if estimate > 200*GB {
t.Errorf("VRAM estimate for %s (%d GB) is suspiciously high", name, estimate/GB)
}
}
}
// TestServerInterfaceCompliance verifies Server implements llm.LlamaServer.
// This is a compile-time check but we document it as a test.
func TestServerInterfaceCompliance(t *testing.T) {