mirror of https://github.com/ollama/ollama.git
synced 2026-01-19 12:57:56 -05:00

Compare commits: parth/decr...brucemacd/

1 commit: f5c9eb5aa2
```diff
@@ -12,4 +12,5 @@ import (
 	_ "github.com/ollama/ollama/model/models/qwen2"
 	_ "github.com/ollama/ollama/model/models/qwen25vl"
 	_ "github.com/ollama/ollama/model/models/qwen3"
+	_ "github.com/ollama/ollama/model/models/qwen3vl"
 )
```
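The blank import is load-bearing: the new package registers its architecture from an `init` function (visible at the bottom of the new file below), so importing it for side effects is what makes `model.Register("qwen3vl", New)` run. A minimal standalone sketch of this registry pattern, with illustrative names rather than ollama's actual internals:

```go
package main

import "fmt"

// builders maps an architecture name to its constructor. In ollama,
// model.Register plays this role; this is a sketch, not the real signature.
var builders = map[string]func() (string, error){}

func register(name string, fn func() (string, error)) { builders[name] = fn }

// A model package registers itself in init, which runs when the package is
// imported — even with a blank ("_") import.
func init() {
	register("qwen3vl", func() (string, error) { return "qwen3vl stub", nil })
}

func main() {
	fn, ok := builders["qwen3vl"]
	fmt.Println("registered:", ok) // true: the side-effect import did the wiring
	if ok {
		m, _ := fn()
		fmt.Println(m)
	}
}
```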
```diff
@@ -44,8 +44,8 @@ func New(c fs.Config) (model.Model, error) {
 			},
 		),
 		TextModel:      NewTextModel(c),
-		VisionModel:    newVisionModel(c),
-		ImageProcessor: newImageProcessor(c),
+		VisionModel:    NewVisionModel(c),
+		ImageProcessor: NewImageProcessor(c),
 	}

 	m.Cache = kvcache.NewCausalCache(m.TextModel.Shift)
@@ -65,8 +65,8 @@ func (m *Model) PixelValues(ctx ml.Context, multimodalData []byte) (ml.Tensor, *
 	}

 	// Calculate tensor dimensions
-	patchDim := m.ImageProcessor.numChannels * m.ImageProcessor.temporalPatchSize *
-		m.ImageProcessor.patchSize * m.ImageProcessor.patchSize
+	patchDim := m.ImageProcessor.NumChannels * m.ImageProcessor.TemporalPatchSize *
+		m.ImageProcessor.PatchSize * m.ImageProcessor.PatchSize
 	numPatches := grid.Temporal * grid.Height * grid.Width

 	pixelValues := ctx.Input().FromFloatSlice(f32s, patchDim, numPatches)
```
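The exported fields keep the same defaults, so the flattened patch length is unchanged: patchDim = NumChannels × TemporalPatchSize × PatchSize × PatchSize = 3 × 2 × 14 × 14 = 1176, and numPatches is the product of the grid dimensions. A standalone check of that arithmetic, using a hypothetical image already resized to 1008×812:

```go
package main

import "fmt"

func main() {
	// Defaults carried over from qwen25vl's ImageProcessor (see the diffs below).
	numChannels, temporalPatchSize, patchSize := 3, 2, 14

	patchDim := numChannels * temporalPatchSize * patchSize * patchSize
	fmt.Println(patchDim) // 3*2*14*14 = 1176

	// Hypothetical input already resized to 1008x812 (multiples of the 28px factor).
	height, width := 1008, 812
	gridTemporal, gridHeight, gridWidth := 1, height/patchSize, width/patchSize
	numPatches := gridTemporal * gridHeight * gridWidth
	fmt.Println(numPatches) // 1 * 72 * 58 = 4176
}
```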
```diff
@@ -345,8 +345,8 @@ func (m *VisionModel) PositionalEmbedding(ctx ml.Context, grid *Grid) ml.Tensor
 	return positionalEmbedding
 }

-// newVisionModel creates a new instance of the Qwen vision model
-func newVisionModel(c fs.Config) *VisionModel {
+// NewVisionModel creates a new instance of the Qwen vision model
+func NewVisionModel(c fs.Config) *VisionModel {
 	patchSize := int(c.Uint("vision.patch_size", 14))
 	hiddenSize := int(c.Uint("vision.embedding_length", 1280))
 	numHeads := int(c.Uint("vision.attention.head_count", 16))
```
```diff
@@ -11,40 +11,40 @@ import (

 // ImageProcessor contains configuration for the Qwen 2.5 VL image processing
 type ImageProcessor struct {
-	numChannels       int
-	patchSize         int
-	temporalPatchSize int
-	mergeSize         int
-	minPixels         int
-	maxPixels         int
-	factor            int
-	rescaleFactor     float32
-	imageMean         []float32
-	imageStd          []float32
+	NumChannels       int
+	PatchSize         int
+	TemporalPatchSize int
+	MergeSize         int
+	MinPixels         int
+	MaxPixels         int
+	Factor            int
+	RescaleFactor     float32
+	ImageMean         []float32
+	ImageStd          []float32
 }

-// newImageProcessor creates a new image processor with default values
-func newImageProcessor(c fs.Config) ImageProcessor {
+// NewImageProcessor creates a new image processor with default values
+func NewImageProcessor(c fs.Config) ImageProcessor {
 	patchSize := int(c.Uint("vision.patch_size", 14))
 	mergeSize := int(c.Uint("vision.spatial_merge_size", 2))

 	return ImageProcessor{
-		numChannels:       int(c.Uint("vision.num_channels", 3)), // not set
-		patchSize:         patchSize,
-		temporalPatchSize: 2,
-		mergeSize:         mergeSize,
-		minPixels:         56 * 56,
-		maxPixels:         int(c.Uint("vision.max_pixels", 28*28*1280)), // 1MP limit
-		factor:            patchSize * mergeSize,
-		rescaleFactor:     1.0 / 255.0,
-		imageMean:         imageproc.ClipDefaultMean[:],
-		imageStd:          imageproc.ClipDefaultSTD[:],
+		NumChannels:       int(c.Uint("vision.num_channels", 3)), // not set
+		PatchSize:         patchSize,
+		TemporalPatchSize: 2,
+		MergeSize:         mergeSize,
+		MinPixels:         56 * 56,
+		MaxPixels:         int(c.Uint("vision.max_pixels", 28*28*1280)), // 1MP limit
+		Factor:            patchSize * mergeSize,
+		RescaleFactor:     1.0 / 255.0,
+		ImageMean:         imageproc.ClipDefaultMean[:],
+		ImageStd:          imageproc.ClipDefaultSTD[:],
 	}
 }

 // SmartResize implements the smart resize algorithm
 func (p *ImageProcessor) SmartResize(height, width int) (int, int) {
-	factor := p.factor
+	factor := p.Factor

 	if height < factor || width < factor {
 		panic(fmt.Sprintf("height:%d or width:%d must be larger than factor:%d", height, width, factor))
@@ -57,13 +57,13 @@ func (p *ImageProcessor) SmartResize(height, width int) (int, int) {
 	hBar := round(float64(height)/float64(factor)) * factor
 	wBar := round(float64(width)/float64(factor)) * factor

-	if hBar*wBar > p.maxPixels {
-		beta := math.Sqrt(float64(height*width) / float64(p.maxPixels))
+	if hBar*wBar > p.MaxPixels {
+		beta := math.Sqrt(float64(height*width) / float64(p.MaxPixels))
 		hBar = int(math.Floor(float64(height)/beta/float64(factor))) * factor
 		wBar = int(math.Floor(float64(width)/beta/float64(factor))) * factor
-	} else if hBar*wBar < p.minPixels {
-		beta := math.Sqrt(float64(p.minPixels) / float64(height*width))
+	} else if hBar*wBar < p.MinPixels {
+		beta := math.Sqrt(float64(p.MinPixels) / float64(height*width))
 		hBar = int(math.Ceil(float64(height)*beta/float64(factor))) * factor
 		wBar = int(math.Ceil(float64(width)*beta/float64(factor))) * factor
```
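Plugging in the defaults makes the bounds concrete: factor = PatchSize × MergeSize = 28, MinPixels = 56×56 = 3136, and MaxPixels = 28×28×1280 = 1,003,520. A standalone sketch of the rounding step for a hypothetical 1000×800 input, assuming the package's unexported round helper is plain nearest-integer rounding:

```go
package main

import (
	"fmt"
	"math"
)

// Assumed to match the package's unexported round helper: nearest integer.
func round(x float64) int { return int(math.Round(x)) }

func main() {
	const (
		factor    = 14 * 2         // PatchSize * MergeSize
		minPixels = 56 * 56        // 3136
		maxPixels = 28 * 28 * 1280 // 1003520
	)
	height, width := 1000, 800 // hypothetical input

	hBar := round(float64(height)/float64(factor)) * factor // 36*28 = 1008
	wBar := round(float64(width)/float64(factor)) * factor  // 29*28 = 812

	// 1008*812 = 818496, inside [3136, 1003520]: neither beta branch fires.
	fmt.Println(hBar, wBar, hBar*wBar > maxPixels, hBar*wBar < minPixels)
	// Output: 1008 812 false false
}
```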
```diff
@@ -90,16 +90,16 @@ func (p *ImageProcessor) ProcessImage(img image.Image) ([]float32, *Grid, error)

 	normalizedPixels := imageproc.Normalize(
 		resizedImg,
-		[3]float32{p.imageMean[0], p.imageMean[1], p.imageMean[2]},
-		[3]float32{p.imageStd[0], p.imageStd[1], p.imageStd[2]},
+		[3]float32{p.ImageMean[0], p.ImageMean[1], p.ImageMean[2]},
+		[3]float32{p.ImageStd[0], p.ImageStd[1], p.ImageStd[2]},
 		true, // rescale
 		true, // channelFirst
 	)

 	// Calculate grid dimensions
 	grid := &Grid{
-		Height:   resizedHeight / p.patchSize,
-		Width:    resizedWidth / p.patchSize,
+		Height:   resizedHeight / p.PatchSize,
+		Width:    resizedWidth / p.PatchSize,
 		Temporal: 1, // For single images, temporal dimension is 1
 	}
@@ -113,10 +113,10 @@ func (p *ImageProcessor) ProcessImage(img image.Image) ([]float32, *Grid, error)
 	}

 func (p *ImageProcessor) createPatches(pixels []float32, height, width int, grid *Grid) ([]float32, error) {
-	channels := p.numChannels
-	patchSize := p.patchSize
-	mergeSize := p.mergeSize
-	temporalPatchSize := p.temporalPatchSize
+	channels := p.NumChannels
+	patchSize := p.PatchSize
+	mergeSize := p.MergeSize
+	temporalPatchSize := p.TemporalPatchSize

 	// Calculate output dimensions
 	numPatches := grid.Temporal * grid.Height * grid.Width
```
model/models/qwen3vl/model.go (new file, 153 lines)

```diff
@@ -0,0 +1,153 @@
+package qwen3vl
+
+import (
+	"bytes"
+	"fmt"
+	"image"
+	"slices"
+
+	"github.com/ollama/ollama/fs"
+	"github.com/ollama/ollama/kvcache"
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/model"
+	"github.com/ollama/ollama/model/input"
+	"github.com/ollama/ollama/model/models/qwen25vl"
+	"github.com/ollama/ollama/model/models/qwen3"
+)
+
+type Model struct {
+	model.Base
+	model.BytePairEncoding
+
+	TextModel *qwen3.Model
+	*qwen25vl.VisionModel
+
+	qwen25vl.ImageProcessor
+}
+
+var _ model.MultimodalProcessor = (*Model)(nil)
+
+func New(c fs.Config) (model.Model, error) {
+	textModel, err := qwen3.New(c)
+	if err != nil {
+		return nil, err
+	}
+
+	m := &Model{
+		BytePairEncoding: model.NewBytePairEncoding(
+			c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
+			&model.Vocabulary{
+				Values: c.Strings("tokenizer.ggml.tokens"),
+				Types:  c.Ints("tokenizer.ggml.token_type"),
+				Merges: c.Strings("tokenizer.ggml.merges"),
+				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
+				BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
+				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
+				EOS: append(
+					[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
+					c.Ints("tokenizer.ggml.eos_token_ids")...,
+				),
+			},
+		),
+		TextModel:      textModel.(*qwen3.Model),
+		VisionModel:    qwen25vl.NewVisionModel(c),
+		ImageProcessor: qwen25vl.NewImageProcessor(c),
+	}
+
+	m.Cache = kvcache.NewCausalCache(m.TextModel.Shift)
+
+	return m, nil
+}
+
+func (m *Model) PixelValues(ctx ml.Context, multimodalData []byte) (ml.Tensor, *qwen25vl.Grid, error) {
+	image, _, err := image.Decode(bytes.NewReader(multimodalData))
+	if err != nil {
+		return nil, nil, err
+	}
+
+	f32s, grid, err := m.ImageProcessor.ProcessImage(image)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	// Calculate tensor dimensions
+	patchDim := m.ImageProcessor.NumChannels * m.ImageProcessor.TemporalPatchSize *
+		m.ImageProcessor.PatchSize * m.ImageProcessor.PatchSize
+	numPatches := grid.Temporal * grid.Height * grid.Width
+
+	pixelValues := ctx.Input().FromFloatSlice(f32s, patchDim, numPatches)
+
+	return pixelValues, grid, nil
+}
+
+func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
+	if len(m.VisionModel.Layers) == 0 {
+		return nil, model.ErrNoVisionModel
+	}
+
+	pixels, grid, err := m.PixelValues(ctx, multimodalData)
+	if err != nil {
+		return nil, err
+	}
+
+	visionOutputs := m.VisionModel.Forward(ctx, pixels, grid)
+	return []input.Multimodal{{Tensor: visionOutputs}}, nil
+}
+
+// PostTokenize arranges Qwen-3-VL's inputs for the forward pass
+func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
+	var result []*input.Input
+
+	var (
+		imageToken       int32 = 151655
+		visionStartToken int32 = 151652
+		visionEndToken   int32 = 151653
+	)
+
+	nImg := 0
+	for _, inp := range inputs {
+		if inp.Multimodal == nil {
+			// If not a multimodal input, add it to the result unchanged
+			result = append(result, inp)
+		} else {
+			// Adding the 'Picture' prefix is a hack; at the time of writing there is
+			// no way to prefix the image tokens with a prompt, so we add one here
+			nImg++
+			pre, err := m.Encode(fmt.Sprintf(" Picture %d: ", nImg), true)
+			if err != nil {
+				return nil, fmt.Errorf("failed to encode image prompt: %w", err)
+			}
+			for i := range pre {
+				result = append(result, &input.Input{Token: pre[i]})
+			}
+
+			patchesPerChunk := inp.Multimodal[0].Tensor.Dim(1)
+
+			// First add the vision start token
+			result = append(result, &input.Input{Token: visionStartToken})
+
+			// Add the image token with the multimodal tensor data at the first position
+			result = append(result, &input.Input{
+				Token:          imageToken,
+				Multimodal:     inp.Multimodal,
+				MultimodalHash: inp.MultimodalHash,
+				SameBatch:      patchesPerChunk,
+			})
+
+			// Add the placeholder tokens for the remaining positions (patchesPerChunk-1)
+			result = append(result, slices.Repeat([]*input.Input{{Token: imageToken}}, patchesPerChunk-1)...)
+
+			result = append(result, &input.Input{Token: visionEndToken})
+		}
+	}
+
+	return result, nil
+}
+
+func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
+	return m.TextModel.Forward(ctx, batch)
+}
+
+func init() {
+	model.Register("qwen3vl", New)
+}
```
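To make PostTokenize's output concrete: for each image it emits the encoded " Picture N: " prefix, a vision start token, one image token that carries the vision tensor (with SameBatch set to the patch count), patchesPerChunk-1 placeholder image tokens, and a vision end token. A minimal sketch of that per-image layout, assuming a hypothetical four-patch image:

```go
package main

import "fmt"

func main() {
	const (
		imageToken       int32 = 151655
		visionStartToken int32 = 151652
		visionEndToken   int32 = 151653
	)
	patchesPerChunk := 4 // hypothetical per-image patch count

	// Mirror of PostTokenize's per-image layout: start token, one image token
	// carrying the tensor, placeholders for the remaining patches, end token.
	seq := []int32{visionStartToken, imageToken}
	for i := 0; i < patchesPerChunk-1; i++ {
		seq = append(seq, imageToken)
	}
	seq = append(seq, visionEndToken)

	fmt.Println(seq) // [151652 151655 151655 151655 151655 151653]
}
```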