Compare commits


1 Commit

Author:  Bruce MacDonald
SHA1:    f5c9eb5aa2
Message: models: qwen3vl
Date:    2025-09-10 12:11:46 -07:00
5 changed files with 194 additions and 40 deletions

View File

@@ -12,4 +12,5 @@ import (
 	_ "github.com/ollama/ollama/model/models/qwen2"
 	_ "github.com/ollama/ollama/model/models/qwen25vl"
 	_ "github.com/ollama/ollama/model/models/qwen3"
+	_ "github.com/ollama/ollama/model/models/qwen3vl"
 )
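This blank import exists only for its side effect: it runs the qwen3vl package's init(), which registers the architecture under the name "qwen3vl" (see the new file at the end of this diff). A minimal, self-contained sketch of that pattern, using illustrative names rather than the actual ollama packages:

    package main

    import "fmt"

    // registry stands in for ollama's internal model registry; the real one
    // maps architecture names to constructors that take an fs.Config.
    var registry = map[string]func() string{}

    func register(name string, ctor func() string) { registry[name] = ctor }

    // In the real tree this init lives in model/models/qwen3vl and calls
    // model.Register("qwen3vl", New); the blank import above pulls it into the build.
    func init() { register("qwen3vl", func() string { return "qwen3vl" }) }

    func main() {
    	fmt.Println(registry["qwen3vl"]()) // prints: qwen3vl
    }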

View File

@@ -44,8 +44,8 @@ func New(c fs.Config) (model.Model, error) {
 			},
 		),
 		TextModel:      NewTextModel(c),
-		VisionModel:    newVisionModel(c),
-		ImageProcessor: newImageProcessor(c),
+		VisionModel:    NewVisionModel(c),
+		ImageProcessor: NewImageProcessor(c),
 	}

 	m.Cache = kvcache.NewCausalCache(m.TextModel.Shift)
@@ -65,8 +65,8 @@ func (m *Model) PixelValues(ctx ml.Context, multimodalData []byte) (ml.Tensor, *
 	}

 	// Calculate tensor dimensions
-	patchDim := m.ImageProcessor.numChannels * m.ImageProcessor.temporalPatchSize *
-		m.ImageProcessor.patchSize * m.ImageProcessor.patchSize
+	patchDim := m.ImageProcessor.NumChannels * m.ImageProcessor.TemporalPatchSize *
+		m.ImageProcessor.PatchSize * m.ImageProcessor.PatchSize
 	numPatches := grid.Temporal * grid.Height * grid.Width

 	pixelValues := ctx.Input().FromFloatSlice(f32s, patchDim, numPatches)
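With the defaults visible in this diff (3 channels, a temporal patch size of 2, and 14x14 patches), patchDim works out to 3 * 2 * 14 * 14 = 1176 floats per flattened patch. A quick sketch of the same arithmetic, assuming those defaults are in effect:

    package main

    import "fmt"

    func main() {
    	// Defaults taken from NewImageProcessor further down in this diff;
    	// the actual values depend on the model's config.
    	numChannels, temporalPatchSize, patchSize := 3, 2, 14

    	patchDim := numChannels * temporalPatchSize * patchSize * patchSize
    	fmt.Println(patchDim) // 1176 float32 values per patch column of the tensor
    }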

View File

@@ -345,8 +345,8 @@ func (m *VisionModel) PositionalEmbedding(ctx ml.Context, grid *Grid) ml.Tensor
 	return positionalEmbedding
 }

-// newVisionModel creates a new instance of the Qwen vision model
-func newVisionModel(c fs.Config) *VisionModel {
+// NewVisionModel creates a new instance of the Qwen vision model
+func NewVisionModel(c fs.Config) *VisionModel {
 	patchSize := int(c.Uint("vision.patch_size", 14))
 	hiddenSize := int(c.Uint("vision.embedding_length", 1280))
 	numHeads := int(c.Uint("vision.attention.head_count", 16))

View File

@@ -11,40 +11,40 @@ import (
 // ImageProcessor contains configuration for the Qwen 2.5 VL image processing
 type ImageProcessor struct {
-	numChannels       int
-	patchSize         int
-	temporalPatchSize int
-	mergeSize         int
-	minPixels         int
-	maxPixels         int
-	factor            int
-	rescaleFactor     float32
-	imageMean         []float32
-	imageStd          []float32
+	NumChannels       int
+	PatchSize         int
+	TemporalPatchSize int
+	MergeSize         int
+	MinPixels         int
+	MaxPixels         int
+	Factor            int
+	RescaleFactor     float32
+	ImageMean         []float32
+	ImageStd          []float32
 }

 // newImageProcessor creates a new image processor with default values
-func newImageProcessor(c fs.Config) ImageProcessor {
+func NewImageProcessor(c fs.Config) ImageProcessor {
 	patchSize := int(c.Uint("vision.patch_size", 14))
 	mergeSize := int(c.Uint("vision.spatial_merge_size", 2))

 	return ImageProcessor{
-		numChannels:       int(c.Uint("vision.num_channels", 3)), // not set
-		patchSize:         patchSize,
-		temporalPatchSize: 2,
-		mergeSize:         mergeSize,
-		minPixels:         56 * 56,
-		maxPixels:         int(c.Uint("vision.max_pixels", 28*28*1280)), // 1MP limit
-		factor:            patchSize * mergeSize,
-		rescaleFactor:     1.0 / 255.0,
-		imageMean:         imageproc.ClipDefaultMean[:],
-		imageStd:          imageproc.ClipDefaultSTD[:],
+		NumChannels:       int(c.Uint("vision.num_channels", 3)), // not set
+		PatchSize:         patchSize,
+		TemporalPatchSize: 2,
+		MergeSize:         mergeSize,
+		MinPixels:         56 * 56,
+		MaxPixels:         int(c.Uint("vision.max_pixels", 28*28*1280)), // 1MP limit
+		Factor:            patchSize * mergeSize,
+		RescaleFactor:     1.0 / 255.0,
+		ImageMean:         imageproc.ClipDefaultMean[:],
+		ImageStd:          imageproc.ClipDefaultSTD[:],
 	}
 }

 // SmartResize implements the smart resize algorithm
 func (p *ImageProcessor) SmartResize(height, width int) (int, int) {
-	factor := p.factor
+	factor := p.Factor

 	if height < factor || width < factor {
 		panic(fmt.Sprintf("height:%d or width:%d must be larger than factor:%d", height, width, factor))
@@ -57,13 +57,13 @@ func (p *ImageProcessor) SmartResize(height, width int) (int, int) {
 	hBar := round(float64(height)/float64(factor)) * factor
 	wBar := round(float64(width)/float64(factor)) * factor

-	if hBar*wBar > p.maxPixels {
-		beta := math.Sqrt(float64(height*width) / float64(p.maxPixels))
+	if hBar*wBar > p.MaxPixels {
+		beta := math.Sqrt(float64(height*width) / float64(p.MaxPixels))
 		hBar = int(math.Floor(float64(height)/beta/float64(factor))) * factor
 		wBar = int(math.Floor(float64(width)/beta/float64(factor))) * factor
-	} else if hBar*wBar < p.minPixels {
-		beta := math.Sqrt(float64(p.minPixels) / float64(height*width))
+	} else if hBar*wBar < p.MinPixels {
+		beta := math.Sqrt(float64(p.MinPixels) / float64(height*width))
 		hBar = int(math.Ceil(float64(height)*beta/float64(factor))) * factor
 		wBar = int(math.Ceil(float64(width)*beta/float64(factor))) * factor
@@ -90,16 +90,16 @@ func (p *ImageProcessor) ProcessImage(img image.Image) ([]float32, *Grid, error)
 	normalizedPixels := imageproc.Normalize(
 		resizedImg,
-		[3]float32{p.imageMean[0], p.imageMean[1], p.imageMean[2]},
-		[3]float32{p.imageStd[0], p.imageStd[1], p.imageStd[2]},
+		[3]float32{p.ImageMean[0], p.ImageMean[1], p.ImageMean[2]},
+		[3]float32{p.ImageStd[0], p.ImageStd[1], p.ImageStd[2]},
 		true, // rescale
 		true, // channelFirst
 	)

 	// Calculate grid dimensions
 	grid := &Grid{
-		Height:   resizedHeight / p.patchSize,
-		Width:    resizedWidth / p.patchSize,
+		Height:   resizedHeight / p.PatchSize,
+		Width:    resizedWidth / p.PatchSize,
 		Temporal: 1, // For single images, temporal dimension is 1
 	}
@@ -113,10 +113,10 @@ func (p *ImageProcessor) ProcessImage(img image.Image) ([]float32, *Grid, error)
 }

 func (p *ImageProcessor) createPatches(pixels []float32, height, width int, grid *Grid) ([]float32, error) {
-	channels := p.numChannels
-	patchSize := p.patchSize
-	mergeSize := p.mergeSize
-	temporalPatchSize := p.temporalPatchSize
+	channels := p.NumChannels
+	patchSize := p.PatchSize
+	mergeSize := p.MergeSize
+	temporalPatchSize := p.TemporalPatchSize

 	// Calculate output dimensions
 	numPatches := grid.Temporal * grid.Height * grid.Width
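As a rough illustration of what SmartResize does, below is a standalone sketch of the same rounding-and-clamping logic with a worked example. The round helper is assumed to be plain math.Round-based rounding (the diff does not show its definition), and the pixel limits mirror the defaults above (56*56 minimum, 28*28*1280 maximum):

    package main

    import (
    	"fmt"
    	"math"
    )

    // smartResize mirrors the logic in the diff: snap both sides to multiples
    // of factor (patchSize * mergeSize), then scale down or up so the area
    // stays within [minPixels, maxPixels]. This is an illustrative
    // re-implementation, not the library function itself.
    func smartResize(height, width, factor, minPixels, maxPixels int) (int, int) {
    	round := func(v float64) int { return int(math.Round(v)) }

    	hBar := round(float64(height)/float64(factor)) * factor
    	wBar := round(float64(width)/float64(factor)) * factor

    	if hBar*wBar > maxPixels {
    		beta := math.Sqrt(float64(height*width) / float64(maxPixels))
    		hBar = int(math.Floor(float64(height)/beta/float64(factor))) * factor
    		wBar = int(math.Floor(float64(width)/beta/float64(factor))) * factor
    	} else if hBar*wBar < minPixels {
    		beta := math.Sqrt(float64(minPixels) / float64(height*width))
    		hBar = int(math.Ceil(float64(height)*beta/float64(factor))) * factor
    		wBar = int(math.Ceil(float64(width)*beta/float64(factor))) * factor
    	}
    	return hBar, wBar
    }

    func main() {
    	// factor = patchSize(14) * mergeSize(2) = 28; limits as in NewImageProcessor.
    	h, w := smartResize(1000, 750, 28, 56*56, 28*28*1280)
    	fmt.Println(h, w)       // 1008 756: both sides snapped to multiples of 28
    	fmt.Println(h/14, w/14) // 72 54: the resulting patch grid (Height, Width)
    }

Rounding to multiples of factor guarantees the resized image divides evenly into patches and into the spatial merge groups, which is why Grid.Height and Grid.Width can be computed by simple integer division.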

View File

@@ -0,0 +1,153 @@
package qwen3vl
import (
"bytes"
"fmt"
"image"
"slices"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/model"
"github.com/ollama/ollama/model/input"
"github.com/ollama/ollama/model/models/qwen25vl"
"github.com/ollama/ollama/model/models/qwen3"
)
type Model struct {
model.Base
model.BytePairEncoding
TextModel *qwen3.Model
*qwen25vl.VisionModel
qwen25vl.ImageProcessor
}
var _ model.MultimodalProcessor = (*Model)(nil)
func New(c fs.Config) (model.Model, error) {
textModel, err := qwen3.New(c)
if err != nil {
return nil, err
}
m := &Model{
BytePairEncoding: model.NewBytePairEncoding(
c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
&model.Vocabulary{
Values: c.Strings("tokenizer.ggml.tokens"),
Types: c.Ints("tokenizer.ggml.token_type"),
Merges: c.Strings("tokenizer.ggml.merges"),
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
EOS: append(
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
c.Ints("tokenizer.ggml.eos_token_ids")...,
),
},
),
TextModel: textModel.(*qwen3.Model),
VisionModel: qwen25vl.NewVisionModel(c),
ImageProcessor: qwen25vl.NewImageProcessor(c),
}
m.Cache = kvcache.NewCausalCache(m.TextModel.Shift)
return m, nil
}
func (m *Model) PixelValues(ctx ml.Context, multimodalData []byte) (ml.Tensor, *qwen25vl.Grid, error) {
image, _, err := image.Decode(bytes.NewReader(multimodalData))
if err != nil {
return nil, nil, err
}
f32s, grid, err := m.ImageProcessor.ProcessImage(image)
if err != nil {
return nil, nil, err
}
// Calculate tensor dimensions
patchDim := m.ImageProcessor.NumChannels * m.ImageProcessor.TemporalPatchSize *
m.ImageProcessor.PatchSize * m.ImageProcessor.PatchSize
numPatches := grid.Temporal * grid.Height * grid.Width
pixelValues := ctx.Input().FromFloatSlice(f32s, patchDim, numPatches)
return pixelValues, grid, nil
}
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
if len(m.VisionModel.Layers) == 0 {
return nil, model.ErrNoVisionModel
}
pixels, grid, err := m.PixelValues(ctx, multimodalData)
if err != nil {
return nil, err
}
visionOutputs := m.VisionModel.Forward(ctx, pixels, grid)
return []input.Multimodal{{Tensor: visionOutputs}}, nil
}
// PostTokenize arranges Qwen-3-VL's inputs for the forward pass
func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
var result []*input.Input
var (
imageToken int32 = 151655
visionStartToken int32 = 151652
visionEndToken int32 = 151653
)
nImg := 0
for _, inp := range inputs {
if inp.Multimodal == nil {
// If not a multimodal input, add it to the result unchanged
result = append(result, inp)
} else {
// Adding the 'Picture' prefix is a hack, at the time of writing there is no way to prefix
// the image tokens with a prompt, so we add a prefix here
nImg++
pre, err := m.Encode(fmt.Sprintf(" Picture %d: ", nImg), true)
if err != nil {
return nil, fmt.Errorf("failed to encode image prompt: %w", err)
}
for i := range pre {
result = append(result, &input.Input{Token: pre[i]})
}
patchesPerChunk := inp.Multimodal[0].Tensor.Dim(1)
// First add the vision start token
result = append(result, &input.Input{Token: visionStartToken})
// Add the image token with the multimodal tensor data at the first position
result = append(result, &input.Input{
Token: imageToken,
Multimodal: inp.Multimodal,
MultimodalHash: inp.MultimodalHash,
SameBatch: patchesPerChunk,
})
// Add the placeholder tokens for the remaining positions (tokensPerGrid-1)
result = append(result, slices.Repeat([]*input.Input{{Token: imageToken}}, patchesPerChunk-1)...)
result = append(result, &input.Input{Token: visionEndToken})
}
}
return result, nil
}
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
return m.TextModel.Forward(ctx, batch)
}
func init() {
model.Register("qwen3vl", New)
}
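To make the PostTokenize arrangement concrete, here is an illustrative sketch of the sequence it emits for a single image whose vision output covers 4 patches (patchesPerChunk = 4). The IDs 151652, 151655, and 151653 are the constants used above; the tokens produced for the " Picture 1: " prefix depend on the tokenizer and are omitted:

    package main

    import "fmt"

    func main() {
    	const (
    		visionStartToken int32 = 151652
    		imageToken       int32 = 151655
    		visionEndToken   int32 = 151653
    	)
    	patchesPerChunk := 4 // illustrative; in practice this is Tensor.Dim(1)

    	// " Picture 1: " is tokenized first (IDs omitted here), then:
    	layout := []int32{visionStartToken}
    	for i := 0; i < patchesPerChunk; i++ {
    		// the first imageToken carries the multimodal tensor;
    		// the remaining ones are placeholders for the other patch positions
    		layout = append(layout, imageToken)
    	}
    	layout = append(layout, visionEndToken)

    	fmt.Println(layout) // [151652 151655 151655 151655 151655 151653]
    }

Reserving one position per patch keeps the text and vision streams aligned, and SameBatch = patchesPerChunk ensures the whole image span is scheduled into a single batch alongside its tensor.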