Compare commits


1 Commit

Author: jmorganca
SHA1: 8b4410633d
Date: 2026-01-22 14:09:58 -08:00
Message: Add image generation documentation

- Add image generation capability page with API usage examples
- Add image-generation to docs.json navigation
- Update openapi.yaml with image generation request/response fields
  - Request: width, height, steps
  - Response: image, completed, total
12 changed files with 378 additions and 87 deletions

View File

@@ -159,7 +159,6 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
sb.WriteString(before)
if !ok {
fmt.Fprintln(&sb)
scanner.Prompt.UseAlt = true
continue
}

View File

@@ -0,0 +1,205 @@
---
title: Image Generation
---
<Warning>
Image generation is experimental and currently only available on macOS. This feature may change in future versions.
</Warning>
Image generation models create images from text prompts. Ollama supports diffusion-based image generation models, available through both Ollama's native API and its OpenAI-compatible endpoints.
## Usage
<Tabs>
<Tab title="CLI">
```shell
ollama run x/z-image-turbo "a sunset over mountains"
```
The generated image will be saved to the current directory.
</Tab>
<Tab title="cURL">
```shell
curl http://localhost:11434/api/generate -d '{
  "model": "x/z-image-turbo",
  "prompt": "a sunset over mountains",
  "stream": false
}'
```
</Tab>
<Tab title="Python">
```python
import ollama
import base64
response = ollama.generate(
    model='x/z-image-turbo',
    prompt='a sunset over mountains',
)

# Save the generated image
with open('output.png', 'wb') as f:
    f.write(base64.b64decode(response['image']))

print('Image saved to output.png')
```
</Tab>
<Tab title="JavaScript">
```javascript
import ollama from 'ollama'
import { writeFileSync } from 'fs'
const response = await ollama.generate({
  model: 'x/z-image-turbo',
  prompt: 'a sunset over mountains',
})
// Save the generated image
const imageBuffer = Buffer.from(response.image, 'base64')
writeFileSync('output.png', imageBuffer)
console.log('Image saved to output.png')
```
</Tab>
</Tabs>
### Response
The response includes an `image` field containing the base64-encoded image data:
```json
{
  "model": "x/z-image-turbo",
  "created_at": "2024-01-15T10:30:15.000000Z",
  "image": "iVBORw0KGgoAAAANSUhEUg...",
  "done": true,
  "done_reason": "stop",
  "total_duration": 15000000000,
  "load_duration": 2000000000
}
```
## Image dimensions
Customize the output image size using the `width` and `height` parameters:
<Tabs>
<Tab title="cURL">
```shell
curl http://localhost:11434/api/generate -d '{
  "model": "x/z-image-turbo",
  "prompt": "a portrait of a robot artist",
  "width": 768,
  "height": 1024,
  "stream": false
}'
```
</Tab>
<Tab title="Python">
```python
import ollama
response = ollama.generate(
    model='x/z-image-turbo',
    prompt='a portrait of a robot artist',
    width=768,
    height=1024,
)
```
</Tab>
<Tab title="JavaScript">
```javascript
import ollama from 'ollama'
const response = await ollama.generate({
  model: 'x/z-image-turbo',
  prompt: 'a portrait of a robot artist',
  width: 768,
  height: 1024,
})
```
</Tab>
</Tabs>
## Streaming progress
When streaming is enabled (the default), progress updates are sent during image generation:
```json
{
  "model": "x/z-image-turbo",
  "created_at": "2024-01-15T10:30:00.000000Z",
  "completed": 5,
  "total": 20,
  "done": false
}
```
The `completed` and `total` fields indicate the current progress through the diffusion steps.
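To consume these progress updates programmatically, read the streaming response one JSON object per line. The following is a minimal Python sketch against the raw HTTP API using the `requests` library; it assumes the default port and the field names shown in the responses above (`completed`, `total`, `image`), printing progress as steps complete and saving the final image.
```python
import base64
import json

import requests

# Minimal sketch: stream /api/generate and report diffusion progress.
# Assumes the default Ollama port and the response fields documented above.
with requests.post(
    'http://localhost:11434/api/generate',
    json={'model': 'x/z-image-turbo', 'prompt': 'a sunset over mountains'},
    stream=True,
) as resp:
    for line in resp.iter_lines():
        if not line:
            continue
        chunk = json.loads(line)
        if chunk.get('total'):
            print(f"step {chunk.get('completed', 0)}/{chunk['total']}")
        if chunk.get('done') and chunk.get('image'):
            with open('output.png', 'wb') as f:
                f.write(base64.b64decode(chunk['image']))
```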
## Parameters
| Parameter | Description | Default |
|-----------|-------------|---------|
| `prompt` | Text description of the image to generate | Required |
| `width` | Width of the generated image in pixels | Model default |
| `height` | Height of the generated image in pixels | Model default |
| `steps` | Number of diffusion steps (see the sketch below) | Model default |
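As a rough sketch, `steps` can be combined with the size parameters to trade fidelity for speed. This assumes the Python client forwards `steps` to the API the same way it forwards `width` and `height` in the examples above.
```python
import ollama

# Hedged example: steps is assumed to be passed through like width/height.
response = ollama.generate(
    model='x/z-image-turbo',
    prompt='a portrait of a robot artist',
    width=768,
    height=1024,
    steps=12,  # fewer diffusion steps: faster, lower fidelity
)
```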
## OpenAI compatibility
Image generation is also available through the OpenAI-compatible `/v1/images/generations` endpoint:
<Tabs>
<Tab title="cURL">
```shell
curl http://localhost:11434/v1/images/generations \
  -H "Content-Type: application/json" \
  -d '{
    "model": "x/z-image-turbo",
    "prompt": "a sunset over mountains",
    "size": "1024x1024",
    "response_format": "b64_json"
  }'
```
</Tab>
<Tab title="Python">
```python
from openai import OpenAI
client = OpenAI(
    base_url='http://localhost:11434/v1/',
    api_key='ollama',  # required but ignored
)

response = client.images.generate(
    model='x/z-image-turbo',
    prompt='a sunset over mountains',
    size='1024x1024',
    response_format='b64_json',
)
print(response.data[0].b64_json[:50] + '...')
```
</Tab>
<Tab title="JavaScript">
```javascript
import OpenAI from 'openai'
const openai = new OpenAI({
  baseURL: 'http://localhost:11434/v1/',
  apiKey: 'ollama', // required but ignored
})

const response = await openai.images.generate({
  model: 'x/z-image-turbo',
  prompt: 'a sunset over mountains',
  size: '1024x1024',
  response_format: 'b64_json',
})
console.log(response.data[0].b64_json.slice(0, 50) + '...')
```
</Tab>
</Tabs>
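As with the native API, the `b64_json` payload can be decoded and written to disk. A minimal sketch using the official OpenAI Python client:
```python
import base64
from openai import OpenAI

client = OpenAI(base_url='http://localhost:11434/v1/', api_key='ollama')

response = client.images.generate(
    model='x/z-image-turbo',
    prompt='a sunset over mountains',
    response_format='b64_json',
)

# Decode the base64 payload and save it as a PNG.
with open('output.png', 'wb') as f:
    f.write(base64.b64decode(response.data[0].b64_json))
```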
See [OpenAI compatibility](/api/openai-compatibility#v1imagesgenerations-experimental) for more details.

View File

@@ -93,6 +93,7 @@
"/capabilities/thinking",
"/capabilities/structured-outputs",
"/capabilities/vision",
"/capabilities/image-generation",
"/capabilities/embeddings",
"/capabilities/tool-calling",
"/capabilities/web-search"

View File

@@ -117,6 +117,15 @@ components:
top_logprobs:
type: integer
description: Number of most likely tokens to return at each token position when logprobs are enabled
width:
type: integer
description: (Experimental) Width of the generated image in pixels. For image generation models only.
height:
type: integer
description: (Experimental) Height of the generated image in pixels. For image generation models only.
steps:
type: integer
description: (Experimental) Number of diffusion steps. For image generation models only.
GenerateResponse:
type: object
properties:
@@ -161,6 +170,15 @@ components:
items:
$ref: "#/components/schemas/Logprob"
description: Log probability information for the generated tokens when logprobs are enabled
image:
type: string
description: (Experimental) Base64-encoded generated image data. For image generation models only.
completed:
type: integer
description: (Experimental) Number of completed diffusion steps. For image generation streaming progress.
total:
type: integer
description: (Experimental) Total number of diffusion steps. For image generation streaming progress.
GenerateStreamEvent:
type: object
properties:
@@ -200,6 +218,15 @@ components:
eval_duration:
type: integer
description: Time spent generating tokens in nanoseconds
image:
type: string
description: (Experimental) Base64-encoded generated image data. For image generation models only.
completed:
type: integer
description: (Experimental) Number of completed diffusion steps. For image generation streaming progress.
total:
type: integer
description: (Experimental) Total number of diffusion steps. For image generation streaming progress.
ChatMessage:
type: object
required: [role, content]

View File

@@ -95,21 +95,7 @@ func (i *Instance) Readline() (string, error) {
var currentLineBuf []rune
// draining tracks if we're processing buffered input from cooked mode.
// In cooked mode Enter sends \n, but in raw mode Ctrl+J sends \n.
// We treat \n from cooked mode as submit, not multiline.
// We check Buffered() after the first read since the bufio buffer is
// empty until then. This is compatible with """ multiline mode in
// interactive.go since each Readline() call is independent.
var draining, stopDraining bool
for {
// Apply deferred state change from previous iteration
if stopDraining {
draining = false
stopDraining = false
}
// don't show placeholder when pasting unless we're in multiline mode
showPlaceholder := !i.Pasting || i.Prompt.UseAlt
if buf.IsEmpty() && showPlaceholder {
@@ -119,15 +105,6 @@ func (i *Instance) Readline() (string, error) {
r, err := i.Terminal.Read()
// After reading, check if there's more buffered data. If so, we're
// processing cooked-mode input. Once buffer empties, the current
// char is the last buffered one (still drain it), then stop next iteration.
if i.Terminal.reader.Buffered() > 0 {
draining = true
} else if draining {
stopDraining = true
}
if buf.IsEmpty() {
fmt.Print(ClearToEOL)
}
@@ -255,20 +232,15 @@ func (i *Instance) Readline() (string, error) {
fd := os.Stdin.Fd()
return handleCharCtrlZ(fd, i.Terminal.termios)
case CharCtrlJ:
// If not draining cooked-mode input, treat as multiline
if !draining {
i.pastedLines = append(i.pastedLines, buf.String())
buf.Buf.Clear()
buf.Pos = 0
buf.DisplayPos = 0
buf.LineHasSpace.Clear()
fmt.Println()
fmt.Print(i.Prompt.AltPrompt)
i.Prompt.UseAlt = true
continue
}
// Draining cooked-mode input: treat \n as submit
fallthrough
i.pastedLines = append(i.pastedLines, buf.String())
buf.Buf.Clear()
buf.Pos = 0
buf.DisplayPos = 0
buf.LineHasSpace.Clear()
fmt.Println()
fmt.Print(i.Prompt.AltPrompt)
i.Prompt.UseAlt = true
continue
case CharEnter:
output := buf.String()
if len(i.pastedLines) > 0 {

View File

@@ -161,17 +161,6 @@ func (m *ModelManifest) HasTensorLayers() bool {
return false
}
// TotalTensorSize returns the total size in bytes of all tensor layers.
func (m *ModelManifest) TotalTensorSize() int64 {
var total int64
for _, layer := range m.Manifest.Layers {
if layer.MediaType == "application/vnd.ollama.image.tensor" {
total += layer.Size
}
}
return total
}
// ModelInfo contains metadata about an image generation model.
type ModelInfo struct {
Architecture string

View File

@@ -5,37 +5,6 @@ import (
"testing"
)
func TestTotalTensorSize(t *testing.T) {
m := &ModelManifest{
Manifest: &Manifest{
Layers: []ManifestLayer{
{MediaType: "application/vnd.ollama.image.tensor", Size: 1000},
{MediaType: "application/vnd.ollama.image.tensor", Size: 2000},
{MediaType: "application/vnd.ollama.image.json", Size: 500}, // not a tensor
{MediaType: "application/vnd.ollama.image.tensor", Size: 3000},
},
},
}
got := m.TotalTensorSize()
want := int64(6000)
if got != want {
t.Errorf("TotalTensorSize() = %d, want %d", got, want)
}
}
func TestTotalTensorSizeEmpty(t *testing.T) {
m := &ModelManifest{
Manifest: &Manifest{
Layers: []ManifestLayer{},
},
}
if got := m.TotalTensorSize(); got != 0 {
t.Errorf("TotalTensorSize() = %d, want 0", got)
}
}
func TestManifestAndBlobDirsRespectOLLAMAModels(t *testing.T) {
modelsDir := filepath.Join(t.TempDir(), "models")

View File

@@ -16,9 +16,18 @@ import (
"runtime"
)
// GB is a convenience constant for gigabytes.
const GB = 1024 * 1024 * 1024
// SupportedBackends lists the backends that support image generation.
var SupportedBackends = []string{"metal", "cuda", "cpu"}
// modelVRAMEstimates maps pipeline class names to their estimated VRAM requirements.
var modelVRAMEstimates = map[string]uint64{
"ZImagePipeline": 21 * GB, // ~21GB for Z-Image (text encoder + transformer + VAE)
"FluxPipeline": 20 * GB, // ~20GB for Flux
}
// CheckPlatformSupport validates that image generation is supported on the current platform.
// Returns nil if supported, or an error describing why it's not supported.
func CheckPlatformSupport() error {
@@ -38,6 +47,17 @@ func CheckPlatformSupport() error {
}
}
// CheckMemoryRequirements validates that there's enough memory for image generation.
// Returns nil if memory is sufficient, or an error if not.
func CheckMemoryRequirements(modelName string, availableMemory uint64) error {
required := EstimateVRAM(modelName)
if availableMemory < required {
return fmt.Errorf("insufficient memory for image generation: need %d GB, have %d GB",
required/GB, availableMemory/GB)
}
return nil
}
// ResolveModelName checks if a model name is a known image generation model.
// Returns the normalized model name if found, empty string otherwise.
func ResolveModelName(modelName string) string {
@@ -48,6 +68,16 @@ func ResolveModelName(modelName string) string {
return ""
}
// EstimateVRAM returns the estimated VRAM needed for an image generation model.
// Returns a conservative default of 21GB if the model type cannot be determined.
func EstimateVRAM(modelName string) uint64 {
className := DetectModelType(modelName)
if estimate, ok := modelVRAMEstimates[className]; ok {
return estimate
}
return 21 * GB
}
// DetectModelType reads model_index.json and returns the model type.
// Checks both "architecture" (Ollama format) and "_class_name" (diffusers format).
// Returns empty string if detection fails.

View File

@@ -30,6 +30,69 @@ func TestCheckPlatformSupport(t *testing.T) {
}
}
func TestCheckMemoryRequirements(t *testing.T) {
tests := []struct {
name string
availableMemory uint64
wantErr bool
}{
{
name: "sufficient memory",
availableMemory: 32 * GB,
wantErr: false,
},
{
name: "exactly enough memory",
availableMemory: 21 * GB,
wantErr: false,
},
{
name: "insufficient memory",
availableMemory: 16 * GB,
wantErr: true,
},
{
name: "zero memory",
availableMemory: 0,
wantErr: true,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
// Use a non-existent model name which will default to 21GB estimate
err := CheckMemoryRequirements("nonexistent-model", tt.availableMemory)
if (err != nil) != tt.wantErr {
t.Errorf("CheckMemoryRequirements() error = %v, wantErr %v", err, tt.wantErr)
}
})
}
}
func TestModelVRAMEstimates(t *testing.T) {
// Verify the VRAM estimates map has expected entries
expected := map[string]uint64{
"ZImagePipeline": 21 * GB,
"FluxPipeline": 20 * GB,
}
for name, expectedVRAM := range expected {
if actual, ok := modelVRAMEstimates[name]; !ok {
t.Errorf("Missing VRAM estimate for %s", name)
} else if actual != expectedVRAM {
t.Errorf("VRAM estimate for %s = %d GB, want %d GB", name, actual/GB, expectedVRAM/GB)
}
}
}
func TestEstimateVRAMDefault(t *testing.T) {
// Non-existent model should return default 21GB
vram := EstimateVRAM("nonexistent-model-that-does-not-exist")
if vram != 21*GB {
t.Errorf("EstimateVRAM() = %d GB, want 21 GB", vram/GB)
}
}
func TestResolveModelName(t *testing.T) {
// Non-existent model should return empty string
result := ResolveModelName("nonexistent-model")

View File

@@ -78,6 +78,14 @@ func Execute(args []string) error {
slog.Info("MLX library initialized")
slog.Info("starting image runner", "model", *modelName, "port", *port)
// Check memory requirements before loading
requiredMemory := imagegen.EstimateVRAM(*modelName)
availableMemory := mlx.GetMemoryLimit()
if availableMemory > 0 && availableMemory < requiredMemory {
return fmt.Errorf("insufficient memory for image generation: need %d GB, have %d GB",
requiredMemory/(1024*1024*1024), availableMemory/(1024*1024*1024))
}
// Detect model type and load appropriate model
modelType := imagegen.DetectModelType(*modelName)
slog.Info("detected model type", "type", modelType)

View File

@@ -104,17 +104,11 @@ func NewServer(modelName string) (*Server, error) {
slog.Debug("mlx subprocess library path", "LD_LIBRARY_PATH", pathEnvVal)
}
// Get total weight size from manifest
var weightSize uint64
if manifest, err := LoadManifest(modelName); err == nil {
weightSize = uint64(manifest.TotalTensorSize())
}
s := &Server{
cmd: cmd,
port: port,
modelName: modelName,
vramSize: weightSize,
vramSize: EstimateVRAM(modelName),
done: make(chan error, 1),
client: &http.Client{Timeout: 10 * time.Minute},
}

View File

@@ -38,6 +38,40 @@ func TestPlatformSupport(t *testing.T) {
}
}
// TestMemoryRequirementsError verifies memory check returns clear error.
func TestMemoryRequirementsError(t *testing.T) {
// Test with insufficient memory
err := CheckMemoryRequirements("test-model", 8*GB)
if err == nil {
t.Error("Expected error for insufficient memory (8GB < 21GB default)")
}
// Test with sufficient memory
err = CheckMemoryRequirements("test-model", 32*GB)
if err != nil {
t.Errorf("Expected no error for sufficient memory (32GB), got: %v", err)
}
}
// TestEstimateVRAMReturnsReasonableDefaults verifies VRAM estimates are sensible.
func TestEstimateVRAMReturnsReasonableDefaults(t *testing.T) {
// Unknown model should return default (21GB)
vram := EstimateVRAM("unknown-model")
if vram < 10*GB || vram > 100*GB {
t.Errorf("VRAM estimate %d GB is outside reasonable range (10-100 GB)", vram/GB)
}
// Verify known pipeline estimates exist and are reasonable
for name, estimate := range modelVRAMEstimates {
if estimate < 10*GB {
t.Errorf("VRAM estimate for %s (%d GB) is suspiciously low", name, estimate/GB)
}
if estimate > 200*GB {
t.Errorf("VRAM estimate for %s (%d GB) is suspiciously high", name, estimate/GB)
}
}
}
// TestServerInterfaceCompliance verifies Server implements llm.LlamaServer.
// This is a compile-time check but we document it as a test.
func TestServerInterfaceCompliance(t *testing.T) {