ollama/model/models/glmocr/model_text.go

package glmocr

import (
	"math"

	"github.com/ollama/ollama/fs"
	"github.com/ollama/ollama/kvcache"
	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/ml/nn"
	"github.com/ollama/ollama/ml/nn/rope"
)

// TextModelOptions carries the scalar hyperparameters shared by every text
// decoder layer. newTextModel fills it in from the model configuration.
type TextModelOptions struct {
	// Read from "embedding_length".
	hiddenSize int
	// Query head count and key/value head count; they size the reshapes in
	// TextSelfAttention.Forward.
	numHeads, numKVHeads int
	// Per-head width, and the dimension count handed to nn.RoPE.
	headDim, rotaryDim int
	// Read from "feed_forward_length".
	intermediateSize int
	// eps is passed to every RMSNorm; ropeBase is the base passed to nn.RoPE.
	eps, ropeBase float32
	// Section sizes forwarded to rope.WithMRoPE.
	mropeSections []int
}

// applyMRoPE returns states rotated by positions. It calls nn.RoPE with the
// configured rotary dimension and frequency base, a fixed scale of 1.0, and
// the M-RoPE section sizes held in o.mropeSections.
//
// NOTE(review): the original comment describes four sections
// ([temporal, height, width, unused]), while the default built by
// newTextModel has three entries — confirm how the backend pads the list.
func (o *TextModelOptions) applyMRoPE(ctx ml.Context, states, positions ml.Tensor) ml.Tensor {
	sections := rope.WithMRoPE(o.mropeSections)
	return nn.RoPE(ctx, states, positions, o.rotaryDim, o.ropeBase, 1.0, sections)
}

// TextSelfAttention holds the four linear projections used by the text
// decoder's attention: separate query, key and value projections applied to
// the layer input, and the output projection applied to the attention result
// (see Forward).
//
// NOTE(review): the gguf tags presumably name the weight tensors bound to
// each field by the model loader — confirm against the loader.
type TextSelfAttention struct {
	Query  *nn.Linear `gguf:"attn_q"`
	Key    *nn.Linear `gguf:"attn_k"`
	Value  *nn.Linear `gguf:"attn_v"`
	Output *nn.Linear `gguf:"attn_out"`
}

// Forward runs self-attention over hiddenStates and returns the result of the
// output projection.
//
// positions is the position tensor handed to M-RoPE for both queries and
// keys. cache is passed straight to nn.Attention. opts supplies the head
// counts and head width used for the reshapes and the attention scale.
func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, cache kvcache.Cache, opts *TextModelOptions) ml.Tensor {
	// Dim(1) of the input is reused as the trailing dimension of every reshape.
	batch := hiddenStates.Dim(1)

	// Project, then split each projection into heads. Queries use numHeads
	// while keys and values use numKVHeads (grouped-query layout).
	query := sa.Query.Forward(ctx, hiddenStates).Reshape(ctx, opts.headDim, opts.numHeads, batch)
	key := sa.Key.Forward(ctx, hiddenStates).Reshape(ctx, opts.headDim, opts.numKVHeads, batch)
	value := sa.Value.Forward(ctx, hiddenStates).Reshape(ctx, opts.headDim, opts.numKVHeads, batch)

	// Only queries and keys are rotated; values are left untouched.
	query = opts.applyMRoPE(ctx, query, positions)
	key = opts.applyMRoPE(ctx, key, positions)

	// Attention scaled by 1/sqrt(headDim).
	scale := 1.0 / math.Sqrt(float64(opts.headDim))
	attn := nn.Attention(ctx, query, key, value, scale, cache)

	// Merge the heads back into one axis of width numHeads*headDim. headDim is
	// configured independently of hiddenSize, so this width need not equal it.
	merged := attn.Reshape(ctx, opts.numHeads*opts.headDim, batch)

	return sa.Output.Forward(ctx, merged)
}

// TextMLP holds the three linear projections of the decoder feed-forward
// block: Gate and Up are both applied to the block input, and Down is applied
// to their combined result (see Forward).
//
// NOTE(review): the gguf tags presumably name the weight tensors bound to
// each field by the model loader — confirm against the loader.
type TextMLP struct {
	Gate *nn.Linear `gguf:"ffn_gate"`
	Up   *nn.Linear `gguf:"ffn_up"`
	Down *nn.Linear `gguf:"ffn_down"`
}

// Forward applies the gated feed-forward block to hiddenStates and returns
// the down-projected result. opts is accepted for signature symmetry with the
// other Forward methods and is not read here.
func (mlp *TextMLP) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *TextModelOptions) ml.Tensor {
	gated := mlp.Gate.Forward(ctx, hiddenStates)
	up := mlp.Up.Forward(ctx, hiddenStates)

	// SILU is given the up projection as its extra operand; per the original
	// note this computes SwiGLU, i.e. silu(gate(x)) * up(x).
	activated := gated.SILU(ctx, up)

	return mlp.Down.Forward(ctx, activated)
}

// TextDecoderLayer is one decoder block: an attention sub-block followed by
// an MLP sub-block. Each sub-block is wrapped by an RMSNorm on its input and
// a second RMSNorm on its output before the residual add (see Forward).
type TextDecoderLayer struct {
	// Input RMSNorm, applied to the hidden states before attention.
	AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
	SelfAttention *TextSelfAttention
	// RMSNorm applied to the attention output, before the residual add.
	PostAttnNorm *nn.RMSNorm `gguf:"post_attn_norm"`

	// RMSNorm applied after the first residual add, before the MLP.
	FFNNorm *nn.RMSNorm `gguf:"ffn_norm"`
	MLP     *TextMLP
	// RMSNorm applied to the MLP output, before the second residual add.
	PostFFNNorm *nn.RMSNorm `gguf:"post_ffn_norm"`
}

// Forward runs one decoder block and returns the updated hidden states.
//
// Both sub-blocks follow the same pattern: norm, transform, norm again, then
// add the value that entered the sub-block. When outputs is non-nil, the
// attention result and its residual are both narrowed with Rows(outputs)
// before the first add, so the MLP sub-block only sees the selected rows.
// The original comment indicates callers pass outputs only for the final layer.
func (l *TextDecoderLayer) Forward(ctx ml.Context, hiddenStates, positions, outputs ml.Tensor, cache kvcache.Cache, opts *TextModelOptions) ml.Tensor {
	skip := hiddenStates

	// Attention sub-block.
	attn := l.AttentionNorm.Forward(ctx, hiddenStates, opts.eps)
	attn = l.SelfAttention.Forward(ctx, attn, positions, cache, opts)
	attn = l.PostAttnNorm.Forward(ctx, attn, opts.eps)

	// Narrow both operands together so their shapes still match for the add.
	if outputs != nil {
		attn = attn.Rows(ctx, outputs)
		skip = skip.Rows(ctx, outputs)
	}

	// The sum is also the residual for the MLP sub-block.
	skip = attn.Add(ctx, skip)

	// MLP sub-block.
	ffn := l.FFNNorm.Forward(ctx, skip, opts.eps)
	ffn = l.MLP.Forward(ctx, ffn, opts)
	ffn = l.PostFFNNorm.Forward(ctx, ffn, opts.eps)

	return ffn.Add(ctx, skip)
}

// TextModel is the text decoder: a token embedding, a stack of decoder
// layers, a final RMSNorm and an output projection. It embeds
// *TextModelOptions, so the option fields and applyMRoPE are available
// directly on the model.
//
// NOTE(review): the "alt:token_embd" part of the Output tag presumably lets
// the output projection fall back to the embedding weights when no separate
// output tensor exists — confirm against the loader.
type TextModel struct {
	TokenEmbedding *nn.Embedding      `gguf:"token_embd"`
	Layers         []TextDecoderLayer `gguf:"blk"`
	OutputNorm     *nn.RMSNorm        `gguf:"output_norm"`
	Output         *nn.Linear         `gguf:"output,alt:token_embd"`

	*TextModelOptions

	// positionCache stores the M-RoPE position for each token in the sequence.
	// This is needed because image tokens share the same base position but have
	// different height/width offsets, and the end token position depends on the
	// image grid dimensions.
	//
	// Both positionCache and ropeDelta are reset by Shift (to nil and 0
	// respectively). The code that populates them is outside this file section.
	positionCache []int32
	ropeDelta     int32
}

// Shift re-applies M-RoPE to key, using shift as the position tensor, and
// returns the rotated tensor with a nil error. layer is not used.
//
// As a side effect it discards the cached per-token positions and resets
// ropeDelta, since the original note says they go stale when the KV cache
// shifts.
func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
	m.positionCache, m.ropeDelta = nil, 0

	shifted := m.applyMRoPE(ctx, key, shift)
	return shifted, nil
}

// newTextModel builds a TextModel from the configuration c. Every lookup
// passes the literal shown below as its fallback argument. The returned model
// has its Layers slice allocated ("block_count" entries, fallback 16) and its
// options populated; the weight fields are left nil here.
func newTextModel(c fs.Config) *TextModel {
	embedDim := int(c.Uint("embedding_length", 1536))
	heads := int(c.Uint("attention.head_count", 16))
	kvHeads := int(c.Uint("attention.head_count_kv", 8))
	ffnDim := int(c.Uint("feed_forward_length", 4608))
	normEps := c.Float("attention.layer_norm_rms_epsilon", 1e-5)
	freqBase := c.Float("rope.freq_base", 10000)

	// The head width falls back to an even split of the embedding across heads.
	perHead := int(c.Uint("attention.key_length", uint32(embedDim/heads)))

	// The rotary width falls back to the full head width; a non-positive
	// result is also replaced by the head width.
	rotary := int(c.Uint("rope.dimension_count", uint32(perHead)))
	if rotary <= 0 {
		rotary = perHead
	}

	var sections []int
	if stored := c.Ints("rope.mrope_section"); len(stored) == 0 {
		// No sections in the config: split rotary/2 in a 2:3:3 ratio, which
		// the original note attributes to GLM-OCR's HF configuration. The last
		// section takes the remainder so the three always sum to half.
		// For rotary=64 this yields [8, 12, 12].
		half := rotary / 2
		if half <= 0 {
			half = 32
		}
		first := half * 2 / 8
		second := half * 3 / 8
		sections = []int{first, second, half - first - second}
	} else {
		// Widen the stored values to int one by one.
		sections = make([]int, 0, len(stored))
		for _, s := range stored {
			sections = append(sections, int(s))
		}
	}

	// Original note: GGML rope_multi picks the section for each dimension pair
	// as (dim_pair) % sum(sections). The full rotary width is passed through
	// unchanged as rotaryDim.
	return &TextModel{
		Layers: make([]TextDecoderLayer, c.Uint("block_count", 16)),
		TextModelOptions: &TextModelOptions{
			hiddenSize:       embedDim,
			numHeads:         heads,
			numKVHeads:       kvHeads,
			headDim:          perHead,
			rotaryDim:        rotary,
			intermediateSize: ffnDim,
			eps:              normEps,
			ropeBase:         freqBase,
			mropeSections:    sections,
		},
	}
}