Compare commits


1 Commit

Author           SHA1        Message          Date
Bruce MacDonald  f5c9eb5aa2  models: qwen3vl  2025-09-10 12:11:46 -07:00
5 changed files with 194 additions and 40 deletions

View File

@@ -12,4 +12,5 @@ import (
     _ "github.com/ollama/ollama/model/models/qwen2"
     _ "github.com/ollama/ollama/model/models/qwen25vl"
     _ "github.com/ollama/ollama/model/models/qwen3"
+    _ "github.com/ollama/ollama/model/models/qwen3vl"
 )
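
The added blank import is the standard Go registration-by-side-effect idiom: the package is imported only for its init function, which (as the new file at the bottom of this diff shows) calls model.Register("qwen3vl", New). A minimal self-contained sketch of why a `_` import is enough, with a toy registry standing in for Ollama's:

package main

import "fmt"

// Toy registry standing in for Ollama's model registry; the real one
// lives in the model package and is populated by model.Register.
var registry = map[string]func() string{}

func register(name string, ctor func() string) { registry[name] = ctor }

// init runs as a side effect of the import, which is all a blank
// import (`_ "path/to/pkg"`) asks for.
func init() {
    register("qwen3vl", func() string { return "qwen3vl constructed" })
}

func main() {
    fmt.Println(registry["qwen3vl"]())
}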

View File

@@ -44,8 +44,8 @@ func New(c fs.Config) (model.Model, error) {
         },
     ),
     TextModel:      NewTextModel(c),
-    VisionModel:    newVisionModel(c),
-    ImageProcessor: newImageProcessor(c),
+    VisionModel:    NewVisionModel(c),
+    ImageProcessor: NewImageProcessor(c),
 }

 m.Cache = kvcache.NewCausalCache(m.TextModel.Shift)
@@ -65,8 +65,8 @@ func (m *Model) PixelValues(ctx ml.Context, multimodalData []byte) (ml.Tensor, *
 }

 // Calculate tensor dimensions
-patchDim := m.ImageProcessor.numChannels * m.ImageProcessor.temporalPatchSize *
-    m.ImageProcessor.patchSize * m.ImageProcessor.patchSize
+patchDim := m.ImageProcessor.NumChannels * m.ImageProcessor.TemporalPatchSize *
+    m.ImageProcessor.PatchSize * m.ImageProcessor.PatchSize
 numPatches := grid.Temporal * grid.Height * grid.Width

 pixelValues := ctx.Input().FromFloatSlice(f32s, patchDim, numPatches)
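
With the image-processor defaults shown later in this diff (3 channels, temporal patch size 2, 14 x 14 spatial patches), this calculation gives patchDim = 3 * 2 * 14 * 14 = 1176 floats per flattened patch. A quick standalone check (the literals are those defaults, not values read from a real config):

package main

import "fmt"

func main() {
    // Defaults from NewImageProcessor: 3 channels, temporal patch
    // size 2, 14x14 spatial patches.
    numChannels, temporalPatchSize, patchSize := 3, 2, 14
    patchDim := numChannels * temporalPatchSize * patchSize * patchSize
    fmt.Println(patchDim) // 1176 floats per patch
}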

View File

@@ -345,8 +345,8 @@ func (m *VisionModel) PositionalEmbedding(ctx ml.Context, grid *Grid) ml.Tensor
     return positionalEmbedding
 }

-// newVisionModel creates a new instance of the Qwen vision model
-func newVisionModel(c fs.Config) *VisionModel {
+// NewVisionModel creates a new instance of the Qwen vision model
+func NewVisionModel(c fs.Config) *VisionModel {
     patchSize := int(c.Uint("vision.patch_size", 14))
     hiddenSize := int(c.Uint("vision.embedding_length", 1280))
     numHeads := int(c.Uint("vision.attention.head_count", 16))
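
The renames in this file and the next (newVisionModel to NewVisionModel, newImageProcessor to NewImageProcessor, plus the ImageProcessor fields) are the substance of this commit outside the new package: Go exports an identifier only if it starts with an upper-case letter, and qwen3vl needs to call these constructors and read these fields from outside the qwen25vl package. Schematically (a snippet, not compilable on its own):

// From another package, only the capitalized names resolve:
vm := qwen25vl.NewVisionModel(c)    // ok after this commit
// vm := qwen25vl.newVisionModel(c) // compile error: unexported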

View File

@@ -11,40 +11,40 @@ import (

 // ImageProcessor contains configuration for the Qwen 2.5 VL image processing
 type ImageProcessor struct {
-    numChannels       int
-    patchSize         int
-    temporalPatchSize int
-    mergeSize         int
-    minPixels         int
-    maxPixels         int
-    factor            int
-    rescaleFactor     float32
-    imageMean         []float32
-    imageStd          []float32
+    NumChannels       int
+    PatchSize         int
+    TemporalPatchSize int
+    MergeSize         int
+    MinPixels         int
+    MaxPixels         int
+    Factor            int
+    RescaleFactor     float32
+    ImageMean         []float32
+    ImageStd          []float32
 }

 // newImageProcessor creates a new image processor with default values
-func newImageProcessor(c fs.Config) ImageProcessor {
+func NewImageProcessor(c fs.Config) ImageProcessor {
     patchSize := int(c.Uint("vision.patch_size", 14))
     mergeSize := int(c.Uint("vision.spatial_merge_size", 2))

     return ImageProcessor{
-        numChannels:       int(c.Uint("vision.num_channels", 3)), // not set
-        patchSize:         patchSize,
-        temporalPatchSize: 2,
-        mergeSize:         mergeSize,
-        minPixels:         56 * 56,
-        maxPixels:         int(c.Uint("vision.max_pixels", 28*28*1280)), // 1MP limit
-        factor:            patchSize * mergeSize,
-        rescaleFactor:     1.0 / 255.0,
-        imageMean:         imageproc.ClipDefaultMean[:],
-        imageStd:          imageproc.ClipDefaultSTD[:],
+        NumChannels:       int(c.Uint("vision.num_channels", 3)), // not set
+        PatchSize:         patchSize,
+        TemporalPatchSize: 2,
+        MergeSize:         mergeSize,
+        MinPixels:         56 * 56,
+        MaxPixels:         int(c.Uint("vision.max_pixels", 28*28*1280)), // 1MP limit
+        Factor:            patchSize * mergeSize,
+        RescaleFactor:     1.0 / 255.0,
+        ImageMean:         imageproc.ClipDefaultMean[:],
+        ImageStd:          imageproc.ClipDefaultSTD[:],
     }
 }

 // SmartResize implements the smart resize algorithm
 func (p *ImageProcessor) SmartResize(height, width int) (int, int) {
-    factor := p.factor
+    factor := p.Factor

     if height < factor || width < factor {
         panic(fmt.Sprintf("height:%d or width:%d must be larger than factor:%d", height, width, factor))
@@ -57,13 +57,13 @@ func (p *ImageProcessor) SmartResize(height, width int) (int, int) {
     hBar := round(float64(height)/float64(factor)) * factor
     wBar := round(float64(width)/float64(factor)) * factor

-    if hBar*wBar > p.maxPixels {
-        beta := math.Sqrt(float64(height*width) / float64(p.maxPixels))
+    if hBar*wBar > p.MaxPixels {
+        beta := math.Sqrt(float64(height*width) / float64(p.MaxPixels))
         hBar = int(math.Floor(float64(height)/beta/float64(factor))) * factor
         wBar = int(math.Floor(float64(width)/beta/float64(factor))) * factor
-    } else if hBar*wBar < p.minPixels {
-        beta := math.Sqrt(float64(p.minPixels) / float64(height*width))
+    } else if hBar*wBar < p.MinPixels {
+        beta := math.Sqrt(float64(p.MinPixels) / float64(height*width))
         hBar = int(math.Ceil(float64(height)*beta/float64(factor))) * factor
         wBar = int(math.Ceil(float64(width)*beta/float64(factor))) * factor
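
To make the downscale branch concrete, a worked example under the defaults from NewImageProcessor (factor = 14 * 2 = 28, MaxPixels = 28 * 28 * 1280 = 1,003,520): a 2000 x 1500 image first rounds to 1988 x 1512, about 3.0 MP, so beta = sqrt(3,000,000 / 1,003,520) ≈ 1.73 and both sides are scaled down and re-snapped to multiples of 28. A standalone replica of the math (math.Round stands in for the package's round helper):

package main

import (
    "fmt"
    "math"
)

func main() {
    height, width := 2000, 1500
    factor, maxPixels := 28, 28*28*1280 // defaults: patchSize*mergeSize, ~1MP cap

    hBar := int(math.Round(float64(height)/float64(factor))) * factor // 1988
    wBar := int(math.Round(float64(width)/float64(factor))) * factor  // 1512
    if hBar*wBar > maxPixels {
        beta := math.Sqrt(float64(height*width) / float64(maxPixels))
        hBar = int(math.Floor(float64(height)/beta/float64(factor))) * factor
        wBar = int(math.Floor(float64(width)/beta/float64(factor))) * factor
    }
    fmt.Println(hBar, wBar) // 1148 840: 964,320 pixels, under the cap
}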
@@ -90,16 +90,16 @@ func (p *ImageProcessor) ProcessImage(img image.Image) ([]float32, *Grid, error)
     normalizedPixels := imageproc.Normalize(
         resizedImg,
-        [3]float32{p.imageMean[0], p.imageMean[1], p.imageMean[2]},
-        [3]float32{p.imageStd[0], p.imageStd[1], p.imageStd[2]},
+        [3]float32{p.ImageMean[0], p.ImageMean[1], p.ImageMean[2]},
+        [3]float32{p.ImageStd[0], p.ImageStd[1], p.ImageStd[2]},
         true, // rescale
         true, // channelFirst
     )

     // Calculate grid dimensions
     grid := &Grid{
-        Height:   resizedHeight / p.patchSize,
-        Width:    resizedWidth / p.patchSize,
+        Height:   resizedHeight / p.PatchSize,
+        Width:    resizedWidth / p.PatchSize,
         Temporal: 1, // For single images, temporal dimension is 1
     }
@@ -113,10 +113,10 @@ func (p *ImageProcessor) ProcessImage(img image.Image) ([]float32, *Grid, error)
 }

 func (p *ImageProcessor) createPatches(pixels []float32, height, width int, grid *Grid) ([]float32, error) {
-    channels := p.numChannels
-    patchSize := p.patchSize
-    mergeSize := p.mergeSize
-    temporalPatchSize := p.temporalPatchSize
+    channels := p.NumChannels
+    patchSize := p.PatchSize
+    mergeSize := p.MergeSize
+    temporalPatchSize := p.TemporalPatchSize

     // Calculate output dimensions
     numPatches := grid.Temporal * grid.Height * grid.Width
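
Continuing the worked example through ProcessImage: the 1148 x 840 result divides into 14 x 14 patches, giving a grid of 82 x 60 and numPatches = 1 * 82 * 60 = 4920; combined with patchDim = 1176 from earlier, the pixel-values tensor for this image would hold 1176 * 4920 floats. As a quick check:

package main

import "fmt"

func main() {
    // 2000x1500 input, smart-resized to 1148x840 (see above);
    // defaults: patchSize 14, 3 channels, temporal patch size 2.
    resizedHeight, resizedWidth, patchSize := 1148, 840, 14
    gridH, gridW, gridT := resizedHeight/patchSize, resizedWidth/patchSize, 1
    numPatches := gridT * gridH * gridW
    patchDim := 3 * 2 * patchSize * patchSize
    fmt.Println(gridH, gridW, numPatches, patchDim) // 82 60 4920 1176
}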

View File

@@ -0,0 +1,153 @@
package qwen3vl

import (
    "bytes"
    "fmt"
    "image"
    "slices"

    "github.com/ollama/ollama/fs"
    "github.com/ollama/ollama/kvcache"
    "github.com/ollama/ollama/ml"
    "github.com/ollama/ollama/model"
    "github.com/ollama/ollama/model/input"
    "github.com/ollama/ollama/model/models/qwen25vl"
    "github.com/ollama/ollama/model/models/qwen3"
)

type Model struct {
    model.Base
    model.BytePairEncoding

    TextModel *qwen3.Model
    *qwen25vl.VisionModel
    qwen25vl.ImageProcessor
}

var _ model.MultimodalProcessor = (*Model)(nil)
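
The struct above leans on two standard Go idioms. Embedding *qwen25vl.VisionModel and qwen25vl.ImageProcessor promotes their methods and fields onto Model, which is why ProcessImage, Forward, and the Layers field are usable on m below without wrappers; and `var _ model.MultimodalProcessor = (*Model)(nil)` is a compile-time assertion that Model implements the interface. A self-contained illustration of both:

package main

import "fmt"

type Greeter struct{}

func (Greeter) Greet() string { return "hello" }

type Speaker interface{ Greet() string }

// Composite embeds Greeter, so Greet is promoted onto it, the way
// Model picks up the qwen25vl vision and image-processor methods.
type Composite struct{ Greeter }

// Compile-time proof that Composite satisfies Speaker, mirroring
// var _ model.MultimodalProcessor = (*Model)(nil) above.
var _ Speaker = (*Composite)(nil)

func main() {
    var c Composite
    fmt.Println(c.Greet()) // "hello", via the embedded Greeter
}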
func New(c fs.Config) (model.Model, error) {
    textModel, err := qwen3.New(c)
    if err != nil {
        return nil, err
    }

    m := &Model{
        BytePairEncoding: model.NewBytePairEncoding(
            c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
            &model.Vocabulary{
                Values: c.Strings("tokenizer.ggml.tokens"),
                Types:  c.Ints("tokenizer.ggml.token_type"),
                Merges: c.Strings("tokenizer.ggml.merges"),
                AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
                BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
                AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
                EOS: append(
                    []int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
                    c.Ints("tokenizer.ggml.eos_token_ids")...,
                ),
            },
        ),
        TextModel:      textModel.(*qwen3.Model),
        VisionModel:    qwen25vl.NewVisionModel(c),
        ImageProcessor: qwen25vl.NewImageProcessor(c),
    }

    m.Cache = kvcache.NewCausalCache(m.TextModel.Shift)

    return m, nil
}
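
One small caveat in New: textModel.(*qwen3.Model) is an unchecked type assertion, so it panics if qwen3.New ever returns a different concrete type. That invariant holds today; a defensive variant would look like this (a sketch, not what the commit does):

tm, ok := textModel.(*qwen3.Model)
if !ok {
    // Hypothetical guard; the commit uses the bare assertion.
    return nil, fmt.Errorf("qwen3.New returned unexpected type %T", textModel)
}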
func (m *Model) PixelValues(ctx ml.Context, multimodalData []byte) (ml.Tensor, *qwen25vl.Grid, error) {
    image, _, err := image.Decode(bytes.NewReader(multimodalData))
    if err != nil {
        return nil, nil, err
    }

    f32s, grid, err := m.ImageProcessor.ProcessImage(image)
    if err != nil {
        return nil, nil, err
    }

    // Calculate tensor dimensions
    patchDim := m.ImageProcessor.NumChannels * m.ImageProcessor.TemporalPatchSize *
        m.ImageProcessor.PatchSize * m.ImageProcessor.PatchSize
    numPatches := grid.Temporal * grid.Height * grid.Width

    pixelValues := ctx.Input().FromFloatSlice(f32s, patchDim, numPatches)

    return pixelValues, grid, nil
}
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
    if len(m.VisionModel.Layers) == 0 {
        return nil, model.ErrNoVisionModel
    }

    pixels, grid, err := m.PixelValues(ctx, multimodalData)
    if err != nil {
        return nil, err
    }

    visionOutputs := m.VisionModel.Forward(ctx, pixels, grid)
    return []input.Multimodal{{Tensor: visionOutputs}}, nil
}
// PostTokenize arranges Qwen-3-VL's inputs for the forward pass
func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
    var result []*input.Input

    var (
        imageToken       int32 = 151655
        visionStartToken int32 = 151652
        visionEndToken   int32 = 151653
    )

    nImg := 0
    for _, inp := range inputs {
        if inp.Multimodal == nil {
            // If not a multimodal input, add it to the result unchanged
            result = append(result, inp)
        } else {
            // Adding the 'Picture' prefix is a hack: at the time of writing
            // there is no way to prefix the image tokens with a prompt, so
            // we add one here
            nImg++
            pre, err := m.Encode(fmt.Sprintf(" Picture %d: ", nImg), true)
            if err != nil {
                return nil, fmt.Errorf("failed to encode image prompt: %w", err)
            }
            for i := range pre {
                result = append(result, &input.Input{Token: pre[i]})
            }

            patchesPerChunk := inp.Multimodal[0].Tensor.Dim(1)

            // First add the vision start token
            result = append(result, &input.Input{Token: visionStartToken})

            // Add the image token with the multimodal tensor data at the first position
            result = append(result, &input.Input{
                Token:          imageToken,
                Multimodal:     inp.Multimodal,
                MultimodalHash: inp.MultimodalHash,
                SameBatch:      patchesPerChunk,
            })

            // Add placeholder tokens for the remaining positions (patchesPerChunk-1)
            result = append(result, slices.Repeat([]*input.Input{{Token: imageToken}}, patchesPerChunk-1)...)

            result = append(result, &input.Input{Token: visionEndToken})
        }
    }

    return result, nil
}
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
    return m.TextModel.Forward(ctx, batch)
}

func init() {
    model.Register("qwen3vl", New)
}