mirror of https://github.com/ollama/ollama.git
synced 2026-01-22 22:40:07 -05:00

Compare commits: ollama-ima ... main (2 commits)

| Author | SHA1 | Date |
|---|---|---|
| | 3b3bf6c217 | |
| | f52c21f457 | |
@@ -159,6 +159,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
            sb.WriteString(before)
            if !ok {
                fmt.Fprintln(&sb)
                scanner.Prompt.UseAlt = true
                continue
            }
@@ -1,205 +0,0 @@
---
title: Image Generation
---

<Warning>
Image generation is experimental and currently only available on macOS. This feature may change in future versions.
</Warning>

Image generation models create images from text prompts. Ollama supports diffusion-based image generation models through both Ollama's API and OpenAI-compatible endpoints.

## Usage

<Tabs>
<Tab title="CLI">

```shell
ollama run x/z-image-turbo "a sunset over mountains"
```

The generated image will be saved to the current directory.
</Tab>
<Tab title="cURL">

```shell
curl http://localhost:11434/api/generate -d '{
  "model": "x/z-image-turbo",
  "prompt": "a sunset over mountains",
  "stream": false
}'
```

</Tab>
<Tab title="Python">

```python
import ollama
import base64

response = ollama.generate(
    model='x/z-image-turbo',
    prompt='a sunset over mountains',
)

# Save the generated image
with open('output.png', 'wb') as f:
    f.write(base64.b64decode(response['image']))

print('Image saved to output.png')
```

</Tab>
<Tab title="JavaScript">

```javascript
import ollama from 'ollama'
import { writeFileSync } from 'fs'

const response = await ollama.generate({
  model: 'x/z-image-turbo',
  prompt: 'a sunset over mountains',
})

// Save the generated image
const imageBuffer = Buffer.from(response.image, 'base64')
writeFileSync('output.png', imageBuffer)

console.log('Image saved to output.png')
```

</Tab>
</Tabs>

### Response

The response includes an `image` field containing the base64-encoded image data:

```json
{
  "model": "x/z-image-turbo",
  "created_at": "2024-01-15T10:30:15.000000Z",
  "image": "iVBORw0KGgoAAAANSUhEUg...",
  "done": true,
  "done_reason": "stop",
  "total_duration": 15000000000,
  "load_duration": 2000000000
}
```

## Image dimensions

Customize the output image size using the `width` and `height` parameters:

<Tabs>
<Tab title="cURL">

```shell
curl http://localhost:11434/api/generate -d '{
  "model": "x/z-image-turbo",
  "prompt": "a portrait of a robot artist",
  "width": 768,
  "height": 1024,
  "stream": false
}'
```

</Tab>
<Tab title="Python">

```python
import ollama

response = ollama.generate(
    model='x/z-image-turbo',
    prompt='a portrait of a robot artist',
    width=768,
    height=1024,
)
```

</Tab>
<Tab title="JavaScript">

```javascript
import ollama from 'ollama'

const response = await ollama.generate({
  model: 'x/z-image-turbo',
  prompt: 'a portrait of a robot artist',
  width: 768,
  height: 1024,
})
```

</Tab>
</Tabs>

## Streaming progress

When streaming is enabled (the default), progress updates are sent during image generation:

```json
{
  "model": "x/z-image-turbo",
  "created_at": "2024-01-15T10:30:00.000000Z",
  "completed": 5,
  "total": 20,
  "done": false
}
```

The `completed` and `total` fields indicate the current progress through the diffusion steps.
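As an illustration, here is a minimal sketch of consuming these progress events against the native `/api/generate` endpoint and saving the final image. It uses Python's `requests` package (an assumed dependency; any streaming HTTP client works) and only the field names shown in the examples above:

```python
import base64
import json

import requests  # assumed dependency; any streaming HTTP client works

# Stream /api/generate, report diffusion progress, then save the final image.
with requests.post(
    "http://localhost:11434/api/generate",
    json={"model": "x/z-image-turbo", "prompt": "a sunset over mountains"},
    stream=True,
) as resp:
    for line in resp.iter_lines():
        if not line:
            continue
        event = json.loads(line)
        if not event.get("done"):
            # Progress events carry the completed/total step counters.
            print(f"step {event.get('completed', 0)}/{event.get('total', '?')}")
        elif "image" in event:
            # The final event includes the base64-encoded image.
            with open("output.png", "wb") as f:
                f.write(base64.b64decode(event["image"]))
```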
## Parameters

| Parameter | Description | Default |
|-----------|-------------|---------|
| `prompt` | Text description of the image to generate | Required |
| `width` | Width of the generated image in pixels | Model default |
| `height` | Height of the generated image in pixels | Model default |
| `steps` | Number of diffusion steps | Model default |
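For example, lowering the step count trades quality for speed. The sketch below mirrors the Python tab above with `steps` added; whether the client library forwards `steps` as a keyword is an assumption, the underlying `/api/generate` field is the one listed in this table:

```python
import ollama

# Override the model's default dimensions and step count.
# NOTE: forwarding `steps` through the client library is an assumption.
response = ollama.generate(
    model='x/z-image-turbo',
    prompt='a sunset over mountains',
    width=1024,
    height=1024,
    steps=10,
)
```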
## OpenAI compatibility

Image generation is also available through the OpenAI-compatible `/v1/images/generations` endpoint:

<Tabs>
<Tab title="cURL">

```shell
curl http://localhost:11434/v1/images/generations \
  -H "Content-Type: application/json" \
  -d '{
    "model": "x/z-image-turbo",
    "prompt": "a sunset over mountains",
    "size": "1024x1024",
    "response_format": "b64_json"
  }'
```

</Tab>
<Tab title="Python">

```python
from openai import OpenAI

client = OpenAI(
    base_url='http://localhost:11434/v1/',
    api_key='ollama',  # required but ignored
)

response = client.images.generate(
    model='x/z-image-turbo',
    prompt='a sunset over mountains',
    size='1024x1024',
    response_format='b64_json',
)

print(response.data[0].b64_json[:50] + '...')
```

</Tab>
<Tab title="JavaScript">

```javascript
import OpenAI from 'openai'

const openai = new OpenAI({
  baseURL: 'http://localhost:11434/v1/',
  apiKey: 'ollama', // required but ignored
})

const response = await openai.images.generate({
  model: 'x/z-image-turbo',
  prompt: 'a sunset over mountains',
  size: '1024x1024',
  response_format: 'b64_json',
})

console.log(response.data[0].b64_json.slice(0, 50) + '...')
```

</Tab>
</Tabs>

See [OpenAI compatibility](/api/openai-compatibility#v1imagesgenerations-experimental) for more details.
@@ -93,7 +93,6 @@
        "/capabilities/thinking",
        "/capabilities/structured-outputs",
        "/capabilities/vision",
        "/capabilities/image-generation",
        "/capabilities/embeddings",
        "/capabilities/tool-calling",
        "/capabilities/web-search"
@@ -117,15 +117,6 @@ components:
        top_logprobs:
          type: integer
          description: Number of most likely tokens to return at each token position when logprobs are enabled
        width:
          type: integer
          description: (Experimental) Width of the generated image in pixels. For image generation models only.
        height:
          type: integer
          description: (Experimental) Height of the generated image in pixels. For image generation models only.
        steps:
          type: integer
          description: (Experimental) Number of diffusion steps. For image generation models only.
    GenerateResponse:
      type: object
      properties:
@@ -170,15 +161,6 @@ components:
          items:
            $ref: "#/components/schemas/Logprob"
          description: Log probability information for the generated tokens when logprobs are enabled
        image:
          type: string
          description: (Experimental) Base64-encoded generated image data. For image generation models only.
        completed:
          type: integer
          description: (Experimental) Number of completed diffusion steps. For image generation streaming progress.
        total:
          type: integer
          description: (Experimental) Total number of diffusion steps. For image generation streaming progress.
    GenerateStreamEvent:
      type: object
      properties:
@@ -218,15 +200,6 @@ components:
        eval_duration:
          type: integer
          description: Time spent generating tokens in nanoseconds
        image:
          type: string
          description: (Experimental) Base64-encoded generated image data. For image generation models only.
        completed:
          type: integer
          description: (Experimental) Number of completed diffusion steps. For image generation streaming progress.
        total:
          type: integer
          description: (Experimental) Total number of diffusion steps. For image generation streaming progress.
    ChatMessage:
      type: object
      required: [role, content]
@@ -95,7 +95,21 @@ func (i *Instance) Readline() (string, error) {

    var currentLineBuf []rune

    // draining tracks if we're processing buffered input from cooked mode.
    // In cooked mode Enter sends \n, but in raw mode Ctrl+J sends \n.
    // We treat \n from cooked mode as submit, not multiline.
    // We check Buffered() after the first read since the bufio buffer is
    // empty until then. This is compatible with """ multiline mode in
    // interactive.go since each Readline() call is independent.
    var draining, stopDraining bool

    for {
        // Apply deferred state change from previous iteration
        if stopDraining {
            draining = false
            stopDraining = false
        }

        // don't show placeholder when pasting unless we're in multiline mode
        showPlaceholder := !i.Pasting || i.Prompt.UseAlt
        if buf.IsEmpty() && showPlaceholder {
@@ -105,6 +119,15 @@ func (i *Instance) Readline() (string, error) {

        r, err := i.Terminal.Read()

        // After reading, check if there's more buffered data. If so, we're
        // processing cooked-mode input. Once buffer empties, the current
        // char is the last buffered one (still drain it), then stop next iteration.
        if i.Terminal.reader.Buffered() > 0 {
            draining = true
        } else if draining {
            stopDraining = true
        }

        if buf.IsEmpty() {
            fmt.Print(ClearToEOL)
        }
@@ -232,15 +255,20 @@ func (i *Instance) Readline() (string, error) {
            fd := os.Stdin.Fd()
            return handleCharCtrlZ(fd, i.Terminal.termios)
        case CharCtrlJ:
            i.pastedLines = append(i.pastedLines, buf.String())
            buf.Buf.Clear()
            buf.Pos = 0
            buf.DisplayPos = 0
            buf.LineHasSpace.Clear()
            fmt.Println()
            fmt.Print(i.Prompt.AltPrompt)
            i.Prompt.UseAlt = true
            continue
            // If not draining cooked-mode input, treat as multiline
            if !draining {
                i.pastedLines = append(i.pastedLines, buf.String())
                buf.Buf.Clear()
                buf.Pos = 0
                buf.DisplayPos = 0
                buf.LineHasSpace.Clear()
                fmt.Println()
                fmt.Print(i.Prompt.AltPrompt)
                i.Prompt.UseAlt = true
                continue
            }
            // Draining cooked-mode input: treat \n as submit
            fallthrough
        case CharEnter:
            output := buf.String()
            if len(i.pastedLines) > 0 {
@@ -161,6 +161,17 @@ func (m *ModelManifest) HasTensorLayers() bool {
    return false
}

// TotalTensorSize returns the total size in bytes of all tensor layers.
func (m *ModelManifest) TotalTensorSize() int64 {
    var total int64
    for _, layer := range m.Manifest.Layers {
        if layer.MediaType == "application/vnd.ollama.image.tensor" {
            total += layer.Size
        }
    }
    return total
}

// ModelInfo contains metadata about an image generation model.
type ModelInfo struct {
    Architecture string
@@ -5,6 +5,37 @@ import (
    "testing"
)

func TestTotalTensorSize(t *testing.T) {
    m := &ModelManifest{
        Manifest: &Manifest{
            Layers: []ManifestLayer{
                {MediaType: "application/vnd.ollama.image.tensor", Size: 1000},
                {MediaType: "application/vnd.ollama.image.tensor", Size: 2000},
                {MediaType: "application/vnd.ollama.image.json", Size: 500}, // not a tensor
                {MediaType: "application/vnd.ollama.image.tensor", Size: 3000},
            },
        },
    }

    got := m.TotalTensorSize()
    want := int64(6000)
    if got != want {
        t.Errorf("TotalTensorSize() = %d, want %d", got, want)
    }
}

func TestTotalTensorSizeEmpty(t *testing.T) {
    m := &ModelManifest{
        Manifest: &Manifest{
            Layers: []ManifestLayer{},
        },
    }

    if got := m.TotalTensorSize(); got != 0 {
        t.Errorf("TotalTensorSize() = %d, want 0", got)
    }
}

func TestManifestAndBlobDirsRespectOLLAMAModels(t *testing.T) {
    modelsDir := filepath.Join(t.TempDir(), "models")
@@ -16,18 +16,9 @@ import (
    "runtime"
)

// GB is a convenience constant for gigabytes.
const GB = 1024 * 1024 * 1024

// SupportedBackends lists the backends that support image generation.
var SupportedBackends = []string{"metal", "cuda", "cpu"}

// modelVRAMEstimates maps pipeline class names to their estimated VRAM requirements.
var modelVRAMEstimates = map[string]uint64{
    "ZImagePipeline": 21 * GB, // ~21GB for Z-Image (text encoder + transformer + VAE)
    "FluxPipeline":   20 * GB, // ~20GB for Flux
}

// CheckPlatformSupport validates that image generation is supported on the current platform.
// Returns nil if supported, or an error describing why it's not supported.
func CheckPlatformSupport() error {
@@ -47,17 +38,6 @@ func CheckPlatformSupport() error {
    }
}

// CheckMemoryRequirements validates that there's enough memory for image generation.
// Returns nil if memory is sufficient, or an error if not.
func CheckMemoryRequirements(modelName string, availableMemory uint64) error {
    required := EstimateVRAM(modelName)
    if availableMemory < required {
        return fmt.Errorf("insufficient memory for image generation: need %d GB, have %d GB",
            required/GB, availableMemory/GB)
    }
    return nil
}

// ResolveModelName checks if a model name is a known image generation model.
// Returns the normalized model name if found, empty string otherwise.
func ResolveModelName(modelName string) string {
@@ -68,16 +48,6 @@ func ResolveModelName(modelName string) string {
    return ""
}

// EstimateVRAM returns the estimated VRAM needed for an image generation model.
// Returns a conservative default of 21GB if the model type cannot be determined.
func EstimateVRAM(modelName string) uint64 {
    className := DetectModelType(modelName)
    if estimate, ok := modelVRAMEstimates[className]; ok {
        return estimate
    }
    return 21 * GB
}

// DetectModelType reads model_index.json and returns the model type.
// Checks both "architecture" (Ollama format) and "_class_name" (diffusers format).
// Returns empty string if detection fails.
@@ -30,69 +30,6 @@ func TestCheckPlatformSupport(t *testing.T) {
    }
}

func TestCheckMemoryRequirements(t *testing.T) {
    tests := []struct {
        name            string
        availableMemory uint64
        wantErr         bool
    }{
        {
            name:            "sufficient memory",
            availableMemory: 32 * GB,
            wantErr:         false,
        },
        {
            name:            "exactly enough memory",
            availableMemory: 21 * GB,
            wantErr:         false,
        },
        {
            name:            "insufficient memory",
            availableMemory: 16 * GB,
            wantErr:         true,
        },
        {
            name:            "zero memory",
            availableMemory: 0,
            wantErr:         true,
        },
    }

    for _, tt := range tests {
        t.Run(tt.name, func(t *testing.T) {
            // Use a non-existent model name which will default to 21GB estimate
            err := CheckMemoryRequirements("nonexistent-model", tt.availableMemory)
            if (err != nil) != tt.wantErr {
                t.Errorf("CheckMemoryRequirements() error = %v, wantErr %v", err, tt.wantErr)
            }
        })
    }
}

func TestModelVRAMEstimates(t *testing.T) {
    // Verify the VRAM estimates map has expected entries
    expected := map[string]uint64{
        "ZImagePipeline": 21 * GB,
        "FluxPipeline":   20 * GB,
    }

    for name, expectedVRAM := range expected {
        if actual, ok := modelVRAMEstimates[name]; !ok {
            t.Errorf("Missing VRAM estimate for %s", name)
        } else if actual != expectedVRAM {
            t.Errorf("VRAM estimate for %s = %d GB, want %d GB", name, actual/GB, expectedVRAM/GB)
        }
    }
}

func TestEstimateVRAMDefault(t *testing.T) {
    // Non-existent model should return default 21GB
    vram := EstimateVRAM("nonexistent-model-that-does-not-exist")
    if vram != 21*GB {
        t.Errorf("EstimateVRAM() = %d GB, want 21 GB", vram/GB)
    }
}

func TestResolveModelName(t *testing.T) {
    // Non-existent model should return empty string
    result := ResolveModelName("nonexistent-model")
@@ -78,14 +78,6 @@ func Execute(args []string) error {
    slog.Info("MLX library initialized")
    slog.Info("starting image runner", "model", *modelName, "port", *port)

    // Check memory requirements before loading
    requiredMemory := imagegen.EstimateVRAM(*modelName)
    availableMemory := mlx.GetMemoryLimit()
    if availableMemory > 0 && availableMemory < requiredMemory {
        return fmt.Errorf("insufficient memory for image generation: need %d GB, have %d GB",
            requiredMemory/(1024*1024*1024), availableMemory/(1024*1024*1024))
    }

    // Detect model type and load appropriate model
    modelType := imagegen.DetectModelType(*modelName)
    slog.Info("detected model type", "type", modelType)
@@ -104,11 +104,17 @@ func NewServer(modelName string) (*Server, error) {
        slog.Debug("mlx subprocess library path", "LD_LIBRARY_PATH", pathEnvVal)
    }

    // Get total weight size from manifest
    var weightSize uint64
    if manifest, err := LoadManifest(modelName); err == nil {
        weightSize = uint64(manifest.TotalTensorSize())
    }

    s := &Server{
        cmd:       cmd,
        port:      port,
        modelName: modelName,
        vramSize:  EstimateVRAM(modelName),
        vramSize:  weightSize,
        done:      make(chan error, 1),
        client:    &http.Client{Timeout: 10 * time.Minute},
    }
@@ -38,40 +38,6 @@ func TestPlatformSupport(t *testing.T) {
    }
}

// TestMemoryRequirementsError verifies memory check returns clear error.
func TestMemoryRequirementsError(t *testing.T) {
    // Test with insufficient memory
    err := CheckMemoryRequirements("test-model", 8*GB)
    if err == nil {
        t.Error("Expected error for insufficient memory (8GB < 21GB default)")
    }

    // Test with sufficient memory
    err = CheckMemoryRequirements("test-model", 32*GB)
    if err != nil {
        t.Errorf("Expected no error for sufficient memory (32GB), got: %v", err)
    }
}

// TestEstimateVRAMReturnsReasonableDefaults verifies VRAM estimates are sensible.
func TestEstimateVRAMReturnsReasonableDefaults(t *testing.T) {
    // Unknown model should return default (21GB)
    vram := EstimateVRAM("unknown-model")
    if vram < 10*GB || vram > 100*GB {
        t.Errorf("VRAM estimate %d GB is outside reasonable range (10-100 GB)", vram/GB)
    }

    // Verify known pipeline estimates exist and are reasonable
    for name, estimate := range modelVRAMEstimates {
        if estimate < 10*GB {
            t.Errorf("VRAM estimate for %s (%d GB) is suspiciously low", name, estimate/GB)
        }
        if estimate > 200*GB {
            t.Errorf("VRAM estimate for %s (%d GB) is suspiciously high", name, estimate/GB)
        }
    }
}

// TestServerInterfaceCompliance verifies Server implements llm.LlamaServer.
// This is a compile-time check but we document it as a test.
func TestServerInterfaceCompliance(t *testing.T) {