package glmocr

import (
	"math"

	"github.com/ollama/ollama/fs"
	"github.com/ollama/ollama/kvcache"
	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/ml/nn"
	"github.com/ollama/ollama/ml/nn/rope"
)
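
// TextModelOptions holds the text decoder hyperparameters that newTextModel
// reads from the GGUF config.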
type TextModelOptions struct {
	hiddenSize       int
	numHeads         int
	numKVHeads       int
	headDim          int
	rotaryDim        int
	intermediateSize int
	eps              float32
	ropeBase         float32
	mropeSections    []int
}
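
// applyMRoPE applies multimodal rotary position embeddings to query or key
// states, splitting the rotary dimensions across the configured position
// sections.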
func (o *TextModelOptions) applyMRoPE(ctx ml.Context, states, positions ml.Tensor) ml.Tensor {
	// With 4 sections for [temporal, height, width, unused]
	return nn.RoPE(ctx, states, positions, o.rotaryDim, o.ropeBase, 1.0, rope.WithMRoPE(o.mropeSections))
}
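
// TextSelfAttention holds the per-layer attention projections. Query, Key, and
// Value are separate linear layers so the key/value heads can be fewer than the
// query heads (grouped-query attention).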
type TextSelfAttention struct {
	Query  *nn.Linear `gguf:"attn_q"`
	Key    *nn.Linear `gguf:"attn_k"`
	Value  *nn.Linear `gguf:"attn_v"`
	Output *nn.Linear `gguf:"attn_out"`
}

func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, cache kvcache.Cache, opts *TextModelOptions) ml.Tensor {
	batchSize := hiddenStates.Dim(1)

	// Separate Q, K, V projections
	q := sa.Query.Forward(ctx, hiddenStates)
	k := sa.Key.Forward(ctx, hiddenStates)
	v := sa.Value.Forward(ctx, hiddenStates)

	// Reshape for GQA
	q = q.Reshape(ctx, opts.headDim, opts.numHeads, batchSize)
	k = k.Reshape(ctx, opts.headDim, opts.numKVHeads, batchSize)
	v = v.Reshape(ctx, opts.headDim, opts.numKVHeads, batchSize)

	// Apply M-RoPE (multimodal rotary position embeddings)
	q = opts.applyMRoPE(ctx, q, positions)
	k = opts.applyMRoPE(ctx, k, positions)

	// Scaled dot-product attention with KV cache
	scaleFactor := 1.0 / math.Sqrt(float64(opts.headDim))
	kqv := nn.Attention(ctx, q, k, v, scaleFactor, cache)
	// Reshape attention output: [headDim, numHeads, batchSize] -> [numHeads*headDim, batchSize]
	// Note: numHeads * headDim = 16 * 128 = 2048, which is the attention hidden size
	kqv = kqv.Reshape(ctx, opts.numHeads*opts.headDim, batchSize)

	return sa.Output.Forward(ctx, kqv)
}
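
// TextMLP is the feed-forward block, gated as SwiGLU: down(silu(gate(x)) * up(x)).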
type TextMLP struct {
	Gate *nn.Linear `gguf:"ffn_gate"`
	Up   *nn.Linear `gguf:"ffn_up"`
	Down *nn.Linear `gguf:"ffn_down"`
}

func (mlp *TextMLP) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *TextModelOptions) ml.Tensor {
	// SwiGLU: down(silu(gate(x)) * up(x))
	gate := mlp.Gate.Forward(ctx, hiddenStates).SILU(ctx, mlp.Up.Forward(ctx, hiddenStates))
	return mlp.Down.Forward(ctx, gate)
}
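
// TextDecoderLayer is a single transformer block. Each sublayer is wrapped in a
// pre-norm and a post-norm, and the post-normed output is added back to the
// residual stream.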
type TextDecoderLayer struct {
	// Input layernorm (before attention)
	AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
	SelfAttention *TextSelfAttention
	// Post self-attention layernorm (after attention, before residual add)
	PostAttnNorm  *nn.RMSNorm `gguf:"post_attn_norm"`

	// FFN input layernorm (after first residual, before MLP)
	FFNNorm     *nn.RMSNorm `gguf:"ffn_norm"`
	MLP         *TextMLP
	// Post MLP layernorm (after MLP, before residual add)
	PostFFNNorm *nn.RMSNorm `gguf:"post_ffn_norm"`
}

func (l *TextDecoderLayer) Forward(ctx ml.Context, hiddenStates, positions, outputs ml.Tensor, cache kvcache.Cache, opts *TextModelOptions) ml.Tensor {
	// Attention block
	residual := hiddenStates
	hiddenStates = l.AttentionNorm.Forward(ctx, hiddenStates, opts.eps)
	hiddenStates = l.SelfAttention.Forward(ctx, hiddenStates, positions, cache, opts)
	hiddenStates = l.PostAttnNorm.Forward(ctx, hiddenStates, opts.eps)

	// Prune to output positions in final layer
	if outputs != nil {
		hiddenStates = hiddenStates.Rows(ctx, outputs)
		residual = residual.Rows(ctx, outputs)
	}

	hiddenStates = hiddenStates.Add(ctx, residual)

	// MLP block
	residual = hiddenStates
	hiddenStates = l.FFNNorm.Forward(ctx, hiddenStates, opts.eps)
	hiddenStates = l.MLP.Forward(ctx, hiddenStates, opts)
	hiddenStates = l.PostFFNNorm.Forward(ctx, hiddenStates, opts.eps)
	hiddenStates = hiddenStates.Add(ctx, residual)

	return hiddenStates
}
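
// TextModel is the GLM-OCR text decoder: token embedding, a stack of decoder
// layers, a final RMSNorm, and the output projection (the gguf tag falls back
// to the token embedding weights when no separate output tensor is present).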
type TextModel struct {
	TokenEmbedding *nn.Embedding      `gguf:"token_embd"`
	Layers         []TextDecoderLayer `gguf:"blk"`
	OutputNorm     *nn.RMSNorm        `gguf:"output_norm"`
	Output         *nn.Linear         `gguf:"output,alt:token_embd"`

	*TextModelOptions

	// positionCache stores the M-RoPE position for each token in the sequence.
	// This is needed because image tokens share the same base position but have
	// different height/width offsets, and the end token position depends on the
	// image grid dimensions.
	positionCache []int32
	ropeDelta     int32
}
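
// Shift re-applies M-RoPE to cached keys with the shifted positions and
// invalidates the cached per-token positions and rope delta.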
func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
	// Clear position cache when KV cache shifts
	m.positionCache = nil
	m.ropeDelta = 0
	return m.applyMRoPE(ctx, key, shift), nil
}
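
// newTextModel reads the text decoder hyperparameters from the GGUF config,
// falling back to built-in defaults when keys are missing.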
func newTextModel(c fs.Config) *TextModel {
	hiddenSize := int(c.Uint("embedding_length", 1536))
	numHeads := int(c.Uint("attention.head_count", 16))
	numKVHeads := int(c.Uint("attention.head_count_kv", 8))
	intermediateSize := int(c.Uint("feed_forward_length", 4608))
	eps := c.Float("attention.layer_norm_rms_epsilon", 1e-5)
	ropeBase := c.Float("rope.freq_base", 10000)

	headDim := int(c.Uint("attention.key_length", uint32(hiddenSize/numHeads)))
	ropeDim := int(c.Uint("rope.dimension_count", uint32(headDim)))
	if ropeDim <= 0 {
		ropeDim = headDim
	}

	mropeSections := c.Ints("rope.mrope_section")
	var sectionInts []int

	if len(mropeSections) > 0 {
		sectionInts = make([]int, len(mropeSections))
		for i, section := range mropeSections {
			sectionInts[i] = int(section)
		}
	} else {
		// Default to GLM-OCR's HF ratio (2:3:3) scaled to rotaryDim/2.
		// For rotaryDim=64 this yields [8, 12, 12].
		total := ropeDim / 2
		if total <= 0 {
			total = 32
		}
		s0 := total * 2 / 8
		s1 := total * 3 / 8
		s2 := total - s0 - s1
		sectionInts = []int{s0, s1, s2}
	}

	// GGML rope_multi: sector = (dim_pair) % sum(sections), mapping each pair to its position dim
	rotaryDim := ropeDim

	return &TextModel{
		Layers: make([]TextDecoderLayer, c.Uint("block_count", 16)),
		TextModelOptions: &TextModelOptions{
			hiddenSize:       hiddenSize,
			numHeads:         numHeads,
			numKVHeads:       numKVHeads,
			headDim:          headDim,
			rotaryDim:        rotaryDim,
			intermediateSize: intermediateSize,
			eps:              eps,
			ropeBase:         ropeBase,
			mropeSections:    sectionInts,
		},
	}
}