ollama/model/models/glmocr/model.go

package glmocr

import (
	"bytes"
	"errors"
	"image"
	"slices"

	"github.com/ollama/ollama/fs"
	"github.com/ollama/ollama/kvcache"
	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/model"
	"github.com/ollama/ollama/model/input"
	"github.com/ollama/ollama/tokenizer"
)

// Model is the glmocr multimodal model. It bundles the tokenizer, the text
// model, the vision model, the two projection stages that EncodeMultimodal
// applies to the vision model's output, and the image preprocessor.
type Model struct {
	model.Base
	tokenizer.Tokenizer

	// Text and vision components. EncodeMultimodal runs VisionModel first,
	// then VisionDownsample, then PatchMerger. The gguf tags presumably name
	// the tensor prefix each component is loaded from (TODO confirm against
	// the loader).
	*TextModel
	*VisionModel     `gguf:"v"`
	VisionDownsample *VisionDownsample `gguf:"mm.patch_merger"`
	PatchMerger      *PatchMerger      `gguf:"mm"`

	ImageProcessor

	// Special token IDs, read from the config in New. PostTokenize wraps each
	// image in the start/end tokens and emits imageTokenID once per entry
	// along dimension 1 of the image's vision tensor.
	imageTokenID      int32
	imageStartTokenID int32
	imageEndTokenID   int32
}

// Compile-time check that *Model satisfies model.MultimodalProcessor.
var _ model.MultimodalProcessor = (*Model)(nil)

// New builds a glmocr Model from the configuration c.
//
// It creates the byte-pair-encoding tokenizer from the tokenizer.ggml.* keys,
// constructs the text model, vision model and image processor, reads the image
// special-token IDs (falling back to built-in defaults), and installs a causal
// KV cache constructed with the text model's Shift method. The returned error
// is always nil.
func New(c fs.Config) (model.Model, error) {
	// The EOS set is the primary EOS token followed by every additional EOS
	// ID listed in the config.
	primaryEOS := int32(c.Uint("tokenizer.ggml.eos_token_id"))
	extraEOS := c.Ints("tokenizer.ggml.eos_token_ids")
	eos := make([]int32, 0, 1+len(extraEOS))
	eos = append(eos, primaryEOS)
	eos = append(eos, extraEOS...)

	vocab := &tokenizer.Vocabulary{
		Values: c.Strings("tokenizer.ggml.tokens"),
		Types:  c.Ints("tokenizer.ggml.token_type"),
		Merges: c.Strings("tokenizer.ggml.merges"),
		AddBOS: c.Bool("tokenizer.ggml.add_bos_token", false),
		BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
		AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
		EOS:    eos,
	}

	// Pre-tokenization split pattern handed to the BPE tokenizer.
	const splitPattern = `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`

	m := new(Model)
	m.Tokenizer = tokenizer.NewBytePairEncoding(vocab, splitPattern)
	m.TextModel = newTextModel(c)
	m.VisionModel = newVisionModel(c)
	m.ImageProcessor = newImageProcessor(c)
	m.imageTokenID = int32(c.Uint("image_token_id", 59280))
	m.imageStartTokenID = int32(c.Uint("image_start_token_id", 59256))
	m.imageEndTokenID = int32(c.Uint("image_end_token_id", 59257))
	m.Cache = kvcache.NewCausalCache(m.TextModel.Shift)

	return m, nil
}

// EncodeMultimodal decodes multimodalData as an image and pushes it through
// the vision pipeline: image processor, vision encoder, downsample, and patch
// merger. On success it returns a single input.Multimodal holding the final
// tensor together with the image's patch grid.
//
// It returns model.ErrNoVisionModel when the vision model has no blocks, the
// underlying error when decoding or preprocessing fails, and a glmocr-prefixed
// error when the downsample or patch merger weights are absent.
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
	vision := m.VisionModel
	if len(vision.Blocks) == 0 {
		return nil, model.ErrNoVisionModel
	}

	decoded, _, err := image.Decode(bytes.NewReader(multimodalData))
	if err != nil {
		return nil, err
	}

	patches, grid, err := m.ImageProcessor.ProcessImage(decoded)
	if err != nil {
		return nil, err
	}

	// The flattened patches become a [patchDim, numPatches] input tensor that
	// is fed straight into the vision encoder.
	patchDim := vision.numChannels * m.temporalPatchSize * m.patchSize * m.patchSize
	numPatches := grid.Temporal * grid.Height * grid.Width
	embeddings := vision.Forward(ctx, ctx.Input().FromFloats(patches, patchDim, numPatches), grid)

	// Downsample stage; its weights are checked only after the encoder has
	// been applied.
	downsample := m.VisionDownsample
	if downsample == nil || downsample.Weight == nil {
		return nil, errors.New("glmocr: missing vision downsample weights")
	}
	embeddings = downsample.Forward(ctx, embeddings, grid, vision.VisionModelOptions)

	// Patch merger stage (FC + LayerNorm + GELU + SwiGLU FFN).
	merger := m.PatchMerger
	if merger == nil {
		return nil, errors.New("glmocr: missing patch merger weights")
	}
	embeddings = merger.Forward(ctx, embeddings, vision.VisionModelOptions)

	return []input.Multimodal{{Tensor: embeddings, Data: grid}}, nil
}

// PostTokenize expands each image input into the token run the text model
// consumes and records a position for every token it emits.
//
// Inputs without multimodal data pass through unchanged and take the next
// position. An input with multimodal data is replaced by:
//
//   - one image start token at the next position;
//   - one image token carrying the multimodal payload, its hash, and SameBatch
//     set to the number of image tokens that follow it;
//   - tokensPerGrid-1 further image tokens, where tokensPerGrid is dimension 1
//     of the image tensor;
//   - one image end token.
//
// All image tokens of one image share the same base position. The end token is
// placed max(mergedH, mergedW) positions after that base, where mergedH and
// mergedW are the grid height and width divided by the spatial merge size.
//
// As a side effect the method rebuilds m.TextModel.positionCache (one entry
// per returned input) and sets m.TextModel.ropeDelta to
// lastPosition + 1 - len(positionCache).
//
// It returns an error, rather than panicking, when an image input has no
// tensor or its Data is not a non-nil *Grid.
func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
	// Every input produces at least one output; image inputs grow the slice
	// further through append.
	result := make([]*input.Input, 0, len(inputs))

	// Reset position cache
	m.TextModel.positionCache = m.TextModel.positionCache[:0]
	m.TextModel.ropeDelta = 0

	pos := int32(0)
	for _, inp := range inputs {
		// len() also covers a non-nil but empty slice, which must not reach
		// the [0] index below.
		if len(inp.Multimodal) == 0 {
			result = append(result, inp)
			m.TextModel.positionCache = append(m.TextModel.positionCache, pos)
			pos++
			continue
		}

		// Validate the payload up front so malformed input surfaces as an
		// error instead of a nil dereference or failed type assertion.
		mm := inp.Multimodal[0]
		if mm.Tensor == nil {
			return nil, errors.New("glmocr: image input is missing vision tensor")
		}
		grid, ok := mm.Data.(*Grid)
		if !ok || grid == nil {
			return nil, errors.New("glmocr: image input is missing grid metadata")
		}

		// Get grid info for position calculation
		mergedH := grid.Height / m.VisionModel.spatialMergeSize
		mergedW := grid.Width / m.VisionModel.spatialMergeSize

		// Add image start token
		result = append(result, &input.Input{Token: m.imageStartTokenID})
		m.TextModel.positionCache = append(m.TextModel.positionCache, pos)
		pos++

		// Add image tokens with multimodal data
		// All image tokens share the same base position for temporal dimension
		tokensPerGrid := mm.Tensor.Dim(1)
		basePos := pos
		result = append(result, &input.Input{
			Token:          m.imageTokenID,
			Multimodal:     inp.Multimodal,
			MultimodalHash: inp.MultimodalHash,
			SameBatch:      max(tokensPerGrid-1, 0),
		})
		m.TextModel.positionCache = append(m.TextModel.positionCache, basePos)

		// Add placeholder tokens for remaining positions
		// All image tokens use the same base position (temporal stays constant)
		for range tokensPerGrid - 1 {
			result = append(result, &input.Input{Token: m.imageTokenID})
			m.TextModel.positionCache = append(m.TextModel.positionCache, basePos)
		}

		// Advance position by max(mergedH, mergedW) after image tokens
		pos = basePos + int32(max(mergedH, mergedW))

		// Add image end token
		result = append(result, &input.Input{Token: m.imageEndTokenID})
		m.TextModel.positionCache = append(m.TextModel.positionCache, pos)
		pos++
	}

	// Compute rope delta for continuation after the prefill segment:
	// delta = (max_position_id + 1) - sequence_length
	if n := len(m.TextModel.positionCache); n > 0 {
		last := m.TextModel.positionCache[n-1]
		m.TextModel.ropeDelta = last + 1 - int32(n)
	}

	return result, nil
}

// Forward embeds the batch tokens, copies any vision embeddings over the
// matching span of the embedded sequence, builds the position tensor, and runs
// the text model's layers followed by the output norm and output projection.
//
// The position tensor holds four consecutive planes of len(batch.Positions)
// values each: temporal, height, width, and an unused plane left at zero.
// Batch positions that fall inside the position cache are translated through
// it; later positions are offset by ropeDelta when the cache is non-empty.
// For image spans, the height and width planes additionally receive the row
// and column of each token within the merged grid. The returned error is
// always nil.
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
	text := m.TextModel

	// Initial token embedding
	hiddenStates := m.TokenEmbedding.Forward(ctx, batch.Inputs).Duplicate(ctx)
	ctx.Forward(hiddenStates)

	// One plane per M-RoPE section; the fourth is never written.
	n := len(batch.Positions)
	temporal := make([]int32, n)
	height := make([]int32, n)
	width := make([]int32, n)
	unused := make([]int32, n)

	cached := int32(len(text.positionCache))
	for i, p := range batch.Positions {
		switch {
		case p < cached:
			// Inside the prefill segment: use the recorded position.
			p = text.positionCache[p]
		case cached > 0:
			// Past the prefill segment: continue the sequence via ropeDelta.
			p += text.ropeDelta
		}
		temporal[i], height[i], width[i] = p, p, p
	}

	// Inject vision embeddings and adjust positions for image tokens
	for _, mi := range batch.Multimodal {
		img := mi.Multimodal[0].Tensor
		span := hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), img.Dim(0)*img.Dim(1))
		ctx.Forward(img.Copy(ctx, span))

		grid, ok := mi.Multimodal[0].Data.(*Grid)
		if !ok {
			continue
		}

		// Token i of the image sits at row i/cols, column i%cols.
		cols := grid.Width / m.VisionModel.spatialMergeSize
		for i := range img.Dim(1) {
			height[mi.Index+i] += int32(i / cols)
			width[mi.Index+i] += int32(i % cols)
		}
	}

	positions := ctx.Input().FromInts(slices.Concat(temporal, height, width, unused), 4*n)

	// Process through transformer layers
	lastLayer := len(text.Layers) - 1
	for i, layer := range text.Layers {
		m.Cache.SetLayer(i)

		// Only the final layer is given batch.Outputs; earlier layers get nil.
		var outputs ml.Tensor
		if i == lastLayer {
			outputs = batch.Outputs
		}

		hiddenStates = layer.Forward(ctx, hiddenStates, positions, outputs, m.Cache, text.TextModelOptions)
	}

	normed := m.OutputNorm.Forward(ctx, hiddenStates, text.eps)
	return m.Output.Forward(ctx, normed), nil
}

// init registers New with the model package under the name "glmocr".
func init() {
	model.Register("glmocr", New)
}