Compare commits


1 Commit

Author:  Bruce MacDonald
SHA1:    f5c9eb5aa2
Message: models: qwen3vl
Date:    2025-09-10 12:11:46 -07:00
5 changed files with 194 additions and 40 deletions

View File

@@ -12,4 +12,5 @@ import (
 	_ "github.com/ollama/ollama/model/models/qwen2"
 	_ "github.com/ollama/ollama/model/models/qwen25vl"
 	_ "github.com/ollama/ollama/model/models/qwen3"
+	_ "github.com/ollama/ollama/model/models/qwen3vl"
 )
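This blank import exists only for its side effect: it runs the qwen3vl package's init(), which registers the architecture under the name "qwen3vl" (see the new file at the end of this diff). A minimal, self-contained sketch of that pattern, using illustrative names rather than the actual ollama packages:

    package main

    import "fmt"

    // registry stands in for ollama's internal model registry; the real one
    // maps architecture names to constructors that take an fs.Config.
    var registry = map[string]func() string{}

    func register(name string, ctor func() string) { registry[name] = ctor }

    // In the real tree this init lives in model/models/qwen3vl and calls
    // model.Register("qwen3vl", New); the blank import above pulls it into the build.
    func init() { register("qwen3vl", func() string { return "qwen3vl" }) }

    func main() {
    	fmt.Println(registry["qwen3vl"]()) // prints: qwen3vl
    }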

View File

@@ -44,8 +44,8 @@ func New(c fs.Config) (model.Model, error) {
 			},
 		),
 		TextModel:      NewTextModel(c),
-		VisionModel:    newVisionModel(c),
-		ImageProcessor: newImageProcessor(c),
+		VisionModel:    NewVisionModel(c),
+		ImageProcessor: NewImageProcessor(c),
 	}

 	m.Cache = kvcache.NewCausalCache(m.TextModel.Shift)
@@ -65,8 +65,8 @@ func (m *Model) PixelValues(ctx ml.Context, multimodalData []byte) (ml.Tensor, *
 	}

 	// Calculate tensor dimensions
-	patchDim := m.ImageProcessor.numChannels * m.ImageProcessor.temporalPatchSize *
-		m.ImageProcessor.patchSize * m.ImageProcessor.patchSize
+	patchDim := m.ImageProcessor.NumChannels * m.ImageProcessor.TemporalPatchSize *
+		m.ImageProcessor.PatchSize * m.ImageProcessor.PatchSize
 	numPatches := grid.Temporal * grid.Height * grid.Width

 	pixelValues := ctx.Input().FromFloatSlice(f32s, patchDim, numPatches)
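With the defaults visible in this diff (3 channels, a temporal patch size of 2, and 14x14 patches), patchDim works out to 3 * 2 * 14 * 14 = 1176 floats per flattened patch. A quick sketch of the same arithmetic, assuming those defaults are in effect:

    package main

    import "fmt"

    func main() {
    	// Defaults taken from NewImageProcessor further down in this diff;
    	// the actual values depend on the model's config.
    	numChannels, temporalPatchSize, patchSize := 3, 2, 14

    	patchDim := numChannels * temporalPatchSize * patchSize * patchSize
    	fmt.Println(patchDim) // 1176 float32 values per patch column of the tensor
    }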

View File

@@ -345,8 +345,8 @@ func (m *VisionModel) PositionalEmbedding(ctx ml.Context, grid *Grid) ml.Tensor
 	return positionalEmbedding
 }

-// newVisionModel creates a new instance of the Qwen vision model
-func newVisionModel(c fs.Config) *VisionModel {
+// NewVisionModel creates a new instance of the Qwen vision model
+func NewVisionModel(c fs.Config) *VisionModel {
 	patchSize := int(c.Uint("vision.patch_size", 14))
 	hiddenSize := int(c.Uint("vision.embedding_length", 1280))
 	numHeads := int(c.Uint("vision.attention.head_count", 16))

View File

@@ -11,40 +11,40 @@ import (
 // ImageProcessor contains configuration for the Qwen 2.5 VL image processing
 type ImageProcessor struct {
-	numChannels       int
-	patchSize         int
-	temporalPatchSize int
-	mergeSize         int
-	minPixels         int
-	maxPixels         int
-	factor            int
-	rescaleFactor     float32
-	imageMean         []float32
-	imageStd          []float32
+	NumChannels       int
+	PatchSize         int
+	TemporalPatchSize int
+	MergeSize         int
+	MinPixels         int
+	MaxPixels         int
+	Factor            int
+	RescaleFactor     float32
+	ImageMean         []float32
+	ImageStd          []float32
 }

 // newImageProcessor creates a new image processor with default values
-func newImageProcessor(c fs.Config) ImageProcessor {
+func NewImageProcessor(c fs.Config) ImageProcessor {
 	patchSize := int(c.Uint("vision.patch_size", 14))
 	mergeSize := int(c.Uint("vision.spatial_merge_size", 2))

 	return ImageProcessor{
-		numChannels:       int(c.Uint("vision.num_channels", 3)), // not set
-		patchSize:         patchSize,
-		temporalPatchSize: 2,
-		mergeSize:         mergeSize,
-		minPixels:         56 * 56,
-		maxPixels:         int(c.Uint("vision.max_pixels", 28*28*1280)), // 1MP limit
-		factor:            patchSize * mergeSize,
-		rescaleFactor:     1.0 / 255.0,
-		imageMean:         imageproc.ClipDefaultMean[:],
-		imageStd:          imageproc.ClipDefaultSTD[:],
+		NumChannels:       int(c.Uint("vision.num_channels", 3)), // not set
+		PatchSize:         patchSize,
+		TemporalPatchSize: 2,
+		MergeSize:         mergeSize,
+		MinPixels:         56 * 56,
+		MaxPixels:         int(c.Uint("vision.max_pixels", 28*28*1280)), // 1MP limit
+		Factor:            patchSize * mergeSize,
+		RescaleFactor:     1.0 / 255.0,
+		ImageMean:         imageproc.ClipDefaultMean[:],
+		ImageStd:          imageproc.ClipDefaultSTD[:],
 	}
 }

 // SmartResize implements the smart resize algorithm
 func (p *ImageProcessor) SmartResize(height, width int) (int, int) {
-	factor := p.factor
+	factor := p.Factor

 	if height < factor || width < factor {
 		panic(fmt.Sprintf("height:%d or width:%d must be larger than factor:%d", height, width, factor))
@@ -57,13 +57,13 @@ func (p *ImageProcessor) SmartResize(height, width int) (int, int) {
 	hBar := round(float64(height)/float64(factor)) * factor
 	wBar := round(float64(width)/float64(factor)) * factor

-	if hBar*wBar > p.maxPixels {
-		beta := math.Sqrt(float64(height*width) / float64(p.maxPixels))
+	if hBar*wBar > p.MaxPixels {
+		beta := math.Sqrt(float64(height*width) / float64(p.MaxPixels))
 		hBar = int(math.Floor(float64(height)/beta/float64(factor))) * factor
 		wBar = int(math.Floor(float64(width)/beta/float64(factor))) * factor
-	} else if hBar*wBar < p.minPixels {
-		beta := math.Sqrt(float64(p.minPixels) / float64(height*width))
+	} else if hBar*wBar < p.MinPixels {
+		beta := math.Sqrt(float64(p.MinPixels) / float64(height*width))
 		hBar = int(math.Ceil(float64(height)*beta/float64(factor))) * factor
 		wBar = int(math.Ceil(float64(width)*beta/float64(factor))) * factor
@@ -90,16 +90,16 @@ func (p *ImageProcessor) ProcessImage(img image.Image) ([]float32, *Grid, error)
 	normalizedPixels := imageproc.Normalize(
 		resizedImg,
-		[3]float32{p.imageMean[0], p.imageMean[1], p.imageMean[2]},
-		[3]float32{p.imageStd[0], p.imageStd[1], p.imageStd[2]},
+		[3]float32{p.ImageMean[0], p.ImageMean[1], p.ImageMean[2]},
+		[3]float32{p.ImageStd[0], p.ImageStd[1], p.ImageStd[2]},
 		true, // rescale
 		true, // channelFirst
 	)

 	// Calculate grid dimensions
 	grid := &Grid{
-		Height:   resizedHeight / p.patchSize,
-		Width:    resizedWidth / p.patchSize,
+		Height:   resizedHeight / p.PatchSize,
+		Width:    resizedWidth / p.PatchSize,
 		Temporal: 1, // For single images, temporal dimension is 1
 	}
@@ -113,10 +113,10 @@ func (p *ImageProcessor) ProcessImage(img image.Image) ([]float32, *Grid, error)
 }

 func (p *ImageProcessor) createPatches(pixels []float32, height, width int, grid *Grid) ([]float32, error) {
-	channels := p.numChannels
-	patchSize := p.patchSize
-	mergeSize := p.mergeSize
-	temporalPatchSize := p.temporalPatchSize
+	channels := p.NumChannels
+	patchSize := p.PatchSize
+	mergeSize := p.MergeSize
+	temporalPatchSize := p.TemporalPatchSize

 	// Calculate output dimensions
 	numPatches := grid.Temporal * grid.Height * grid.Width
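As a rough illustration of what SmartResize does, below is a standalone sketch of the same rounding-and-clamping logic with a worked example. The round helper is assumed to be plain math.Round-based rounding (the diff does not show its definition), and the pixel limits mirror the defaults above (56*56 minimum, 28*28*1280 maximum):

    package main

    import (
    	"fmt"
    	"math"
    )

    // smartResize mirrors the logic in the diff: snap both sides to multiples
    // of factor (patchSize * mergeSize), then scale down or up so the area
    // stays within [minPixels, maxPixels]. This is an illustrative
    // re-implementation, not the library function itself.
    func smartResize(height, width, factor, minPixels, maxPixels int) (int, int) {
    	round := func(v float64) int { return int(math.Round(v)) }

    	hBar := round(float64(height)/float64(factor)) * factor
    	wBar := round(float64(width)/float64(factor)) * factor

    	if hBar*wBar > maxPixels {
    		beta := math.Sqrt(float64(height*width) / float64(maxPixels))
    		hBar = int(math.Floor(float64(height)/beta/float64(factor))) * factor
    		wBar = int(math.Floor(float64(width)/beta/float64(factor))) * factor
    	} else if hBar*wBar < minPixels {
    		beta := math.Sqrt(float64(minPixels) / float64(height*width))
    		hBar = int(math.Ceil(float64(height)*beta/float64(factor))) * factor
    		wBar = int(math.Ceil(float64(width)*beta/float64(factor))) * factor
    	}
    	return hBar, wBar
    }

    func main() {
    	// factor = patchSize(14) * mergeSize(2) = 28; limits as in NewImageProcessor.
    	h, w := smartResize(1000, 750, 28, 56*56, 28*28*1280)
    	fmt.Println(h, w)       // 1008 756: both sides snapped to multiples of 28
    	fmt.Println(h/14, w/14) // 72 54: the resulting patch grid (Height, Width)
    }

Rounding to multiples of factor guarantees the resized image divides evenly into patches and into the spatial merge groups, which is why Grid.Height and Grid.Width can be computed by simple integer division.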

View File

@@ -0,0 +1,153 @@
package qwen3vl
import (
"bytes"
"fmt"
"image"
"slices"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/model"
"github.com/ollama/ollama/model/input"
"github.com/ollama/ollama/model/models/qwen25vl"
"github.com/ollama/ollama/model/models/qwen3"
)
type Model struct {
model.Base
model.BytePairEncoding
TextModel *qwen3.Model
*qwen25vl.VisionModel
qwen25vl.ImageProcessor
}
var _ model.MultimodalProcessor = (*Model)(nil)
func New(c fs.Config) (model.Model, error) {
textModel, err := qwen3.New(c)
if err != nil {
return nil, err
}
m := &Model{
BytePairEncoding: model.NewBytePairEncoding(
c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
&model.Vocabulary{
Values: c.Strings("tokenizer.ggml.tokens"),
Types: c.Ints("tokenizer.ggml.token_type"),
Merges: c.Strings("tokenizer.ggml.merges"),
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
EOS: append(
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
c.Ints("tokenizer.ggml.eos_token_ids")...,
),
},
),
TextModel: textModel.(*qwen3.Model),
VisionModel: qwen25vl.NewVisionModel(c),
ImageProcessor: qwen25vl.NewImageProcessor(c),
}
m.Cache = kvcache.NewCausalCache(m.TextModel.Shift)
return m, nil
}
func (m *Model) PixelValues(ctx ml.Context, multimodalData []byte) (ml.Tensor, *qwen25vl.Grid, error) {
image, _, err := image.Decode(bytes.NewReader(multimodalData))
if err != nil {
return nil, nil, err
}
f32s, grid, err := m.ImageProcessor.ProcessImage(image)
if err != nil {
return nil, nil, err
}
// Calculate tensor dimensions
patchDim := m.ImageProcessor.NumChannels * m.ImageProcessor.TemporalPatchSize *
m.ImageProcessor.PatchSize * m.ImageProcessor.PatchSize
numPatches := grid.Temporal * grid.Height * grid.Width
pixelValues := ctx.Input().FromFloatSlice(f32s, patchDim, numPatches)
return pixelValues, grid, nil
}
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
if len(m.VisionModel.Layers) == 0 {
return nil, model.ErrNoVisionModel
}
pixels, grid, err := m.PixelValues(ctx, multimodalData)
if err != nil {
return nil, err
}
visionOutputs := m.VisionModel.Forward(ctx, pixels, grid)
return []input.Multimodal{{Tensor: visionOutputs}}, nil
}
// PostTokenize arranges Qwen-3-VL's inputs for the forward pass
func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
var result []*input.Input
var (
imageToken int32 = 151655
visionStartToken int32 = 151652
visionEndToken int32 = 151653
)
nImg := 0
for _, inp := range inputs {
if inp.Multimodal == nil {
// If not a multimodal input, add it to the result unchanged
result = append(result, inp)
} else {
// Adding the 'Picture' prefix is a hack, at the time of writing there is no way to prefix
// the image tokens with a prompt, so we add a prefix here
nImg++
pre, err := m.Encode(fmt.Sprintf(" Picture %d: ", nImg), true)
if err != nil {
return nil, fmt.Errorf("failed to encode image prompt: %w", err)
}
for i := range pre {
result = append(result, &input.Input{Token: pre[i]})
}
patchesPerChunk := inp.Multimodal[0].Tensor.Dim(1)
// First add the vision start token
result = append(result, &input.Input{Token: visionStartToken})
// Add the image token with the multimodal tensor data at the first position
result = append(result, &input.Input{
Token: imageToken,
Multimodal: inp.Multimodal,
MultimodalHash: inp.MultimodalHash,
SameBatch: patchesPerChunk,
})
// Add the placeholder tokens for the remaining positions (tokensPerGrid-1)
result = append(result, slices.Repeat([]*input.Input{{Token: imageToken}}, patchesPerChunk-1)...)
result = append(result, &input.Input{Token: visionEndToken})
}
}
return result, nil
}
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
return m.TextModel.Forward(ctx, batch)
}
func init() {
model.Register("qwen3vl", New)
}
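To make the PostTokenize arrangement concrete, here is an illustrative sketch of the sequence it emits for a single image whose vision output covers 4 patches (patchesPerChunk = 4). The IDs 151652, 151655, and 151653 are the constants used above; the tokens produced for the " Picture 1: " prefix depend on the tokenizer and are omitted:

    package main

    import "fmt"

    func main() {
    	const (
    		visionStartToken int32 = 151652
    		imageToken       int32 = 151655
    		visionEndToken   int32 = 151653
    	)
    	patchesPerChunk := 4 // illustrative; in practice this is Tensor.Dim(1)

    	// " Picture 1: " is tokenized first (IDs omitted here), then:
    	layout := []int32{visionStartToken}
    	for i := 0; i < patchesPerChunk; i++ {
    		// the first imageToken carries the multimodal tensor;
    		// the remaining ones are placeholders for the other patch positions
    		layout = append(layout, imageToken)
    	}
    	layout = append(layout, visionEndToken)

    	fmt.Println(layout) // [151652 151655 151655 151655 151655 151653]
    }

Reserving one position per patch keeps the text and vision streams aligned, and SameBatch = patchesPerChunk ensures the whole image span is scheduled into a single batch alongside its tensor.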