Compare commits

...

1 Commit

Author: jmorganca
SHA1: 53441e3ed2
Date: 2026-01-20 21:21:15 -08:00

model: add MLA absorption for glm4moelite

Split the combined KV_B tensor into separate K_B and V_B tensors
during conversion, enabling MLA (Multi-head Latent Attention)
absorption, which compresses the KV cache for improved efficiency.
2 changed files with 135 additions and 9 deletions
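
The compression the commit message points to is easy to quantify. Below is a minimal back-of-the-envelope sketch in Go; every dimension in it is a hypothetical example value chosen only to make the arithmetic concrete, not the model's actual configuration.

package main

import "fmt"

// Rough per-token, per-layer KV-cache entry count with and without MLA
// compression. All dimensions below are made-up example values, not
// glm4moelite's real config.
func main() {
    const (
        nHead         = 16
        qkNopeHeadDim = 128
        qkRopeHeadDim = 64
        vHeadDim      = 128
        kvLoraRank    = 512
    )
    // Plain per-head cache: full keys and values for every head.
    full := nHead * ((qkNopeHeadDim + qkRopeHeadDim) + vHeadDim)
    // MLA-absorbed cache: one shared compressed latent plus the rope key
    // (key_length_mla = kvLoraRank + qkRopeHeadDim, value_length_mla = kvLoraRank).
    mla := (kvLoraRank + qkRopeHeadDim) + kvLoraRank
    fmt.Printf("full: %d entries, mla: %d entries (%.1fx smaller)\n",
        full, mla, float64(full)/float64(mla))
}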

View File

@@ -6,6 +6,10 @@ import (
"log/slog"
"regexp"
"strconv"
"strings"
"github.com/pdevine/tensor"
"github.com/pdevine/tensor/native"
"github.com/ollama/ollama/fs/ggml"
)
@@ -69,6 +73,9 @@ func (p *glm4MoeLiteModel) KV(t *Tokenizer) KV {
kv["glm4moelite.rope.dimension_count"] = p.QKRopeHeadDim
kv["glm4moelite.rope.freq_base"] = cmp.Or(p.RopeTheta, float32(1000000.0))
kv["glm4moelite.attention.key_length_mla"] = p.KVLoraRank + p.QKRopeHeadDim
kv["glm4moelite.attention.value_length_mla"] = p.KVLoraRank
kv["tokenizer.ggml.pre"] = "glm4"
return kv
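
To make the two new metadata keys concrete: assuming, purely for illustration, kv_lora_rank = 512 and rope.dimension_count = 64 (not the model's actual values), the hunk above would emit key_length_mla = 512 + 64 = 576 and value_length_mla = 512. These are the widths the runtime needs to size the compressed MLA cache, as opposed to the full per-head key/value lengths.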
@@ -100,6 +107,67 @@ func (p *glm4MoeLiteModel) Replacements() []string {
}
}
// repackKVB extracts K or V from the combined KV_B tensor for MLA absorption.
// K output row-major: [n_head, kv_lora_rank, qk_nope] -> GGML ne[]={qk_nope, kv_lora_rank, n_head}
// V output row-major: [n_head, v_head, kv_lora_rank] -> GGML ne[]={kv_lora_rank, v_head, n_head}
func (p *glm4MoeLiteModel) repackKVB(extractK bool, kvFirst bool, numHeads int) Repacker {
qkNope := int(p.QKNopeHeadDim)
vHeadDim := int(p.VHeadDim)
kvLoraRank := int(p.KVLoraRank)
kvPerHead := qkNope + vHeadDim
return func(_ string, data []float32, shape []uint64) ([]float32, error) {
dims := make([]int, len(shape))
for i := range shape {
dims[i] = int(shape[i])
}
var tt tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
var err error
// Normalize to [n_head * (qk_nope + v_head), kv_lora_rank] layout
if kvFirst {
tt, err = tensor.Transpose(tt, 1, 0)
if err != nil {
return nil, err
}
tt = tensor.Materialize(tt)
}
// Reshape to [n_head, qk_nope + v_head, kv_lora_rank]
if err := tt.Reshape(numHeads, kvPerHead, kvLoraRank); err != nil {
return nil, err
}
if extractK {
// Slice K: [n_head, qk_nope, kv_lora_rank]
tt, err = tt.Slice(nil, tensor.S(0, qkNope), nil)
if err != nil {
return nil, err
}
tt = tensor.Materialize(tt)
// Transpose to [n_head, kv_lora_rank, qk_nope]
tt, err = tensor.Transpose(tt, 0, 2, 1)
if err != nil {
return nil, err
}
tt = tensor.Materialize(tt)
} else {
// Slice V: [n_head, v_head, kv_lora_rank] - already correct layout
tt, err = tt.Slice(nil, tensor.S(qkNope, kvPerHead), nil)
if err != nil {
return nil, err
}
tt = tensor.Materialize(tt)
}
if err := tt.Reshape(tt.Shape().TotalSize()); err != nil {
return nil, err
}
return native.VectorF32(tt.(*tensor.Dense))
}
}
func (p *glm4MoeLiteModel) Tensors(s []Tensor) (out []*ggml.Tensor) {
merges := make([]merge, p.HiddenLayers*3)
for i := range p.HiddenLayers {
@@ -139,6 +207,52 @@ func (p *glm4MoeLiteModel) Tensors(s []Tensor) (out []*ggml.Tensor) {
slog.Debug("skipping layer", "name", t.Name())
continue
}
// Split attn_kv_b into separate attn_k_b and attn_v_b for MLA absorption
if strings.HasSuffix(t.Name(), ".attn_kv_b.weight") {
qkNope := int(p.QKNopeHeadDim)
vHeadDim := int(p.VHeadDim)
kvLoraRank := int(p.KVLoraRank)
kvPerHead := qkNope + vHeadDim
numHeads := int(p.NumAttentionHeads)
kvFirst := true
if len(t.Shape()) == 2 {
switch {
case int(t.Shape()[0]) == kvLoraRank:
if kvPerHead > 0 && int(t.Shape()[1])%kvPerHead == 0 {
numHeads = int(t.Shape()[1]) / kvPerHead
}
kvFirst = true
case int(t.Shape()[1]) == kvLoraRank:
if kvPerHead > 0 && int(t.Shape()[0])%kvPerHead == 0 {
numHeads = int(t.Shape()[0]) / kvPerHead
}
kvFirst = false
default:
slog.Warn("glm4moelite: unexpected attn_kv_b layout", "name", t.Name(), "shape", t.Shape())
}
}
kTensor := t.Clone()
kTensor.SetRepacker(p.repackKVB(true, kvFirst, numHeads))
out = append(out, &ggml.Tensor{
Name: strings.Replace(t.Name(), "attn_kv_b", "attn_k_b", 1),
Kind: t.Kind(),
Shape: []uint64{uint64(numHeads), uint64(kvLoraRank), uint64(qkNope)},
WriterTo: kTensor,
})
vTensor := t.Clone()
vTensor.SetRepacker(p.repackKVB(false, kvFirst, numHeads))
out = append(out, &ggml.Tensor{
Name: strings.Replace(t.Name(), "attn_kv_b", "attn_v_b", 1),
Kind: t.Kind(),
Shape: []uint64{uint64(numHeads), uint64(vHeadDim), uint64(kvLoraRank)},
WriterTo: vTensor,
})
continue
}
out = append(out, &ggml.Tensor{
Name: t.Name(),
Kind: t.Kind(),

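For readers who do not want to trace the gorgonia tensor calls, the K path of repackKVB boils down to transposing a strided slice of the combined weight. The standalone sketch below reproduces that index arithmetic with plain Go slices; every dimension in it is a made-up toy value, and extractK is a hypothetical helper, not part of the converter.

package main

import "fmt"

// extractK pulls the K block out of a combined kv_b weight that has already
// been normalized to row-major [nHead*(qkNope+vHead), kvLoraRank] (the layout
// repackKVB establishes before reshaping) and writes it out row-major as
// [nHead, kvLoraRank, qkNope], matching the attn_k_b shape emitted above.
func extractK(kvB []float32, nHead, qkNope, vHead, kvLoraRank int) []float32 {
    perHead := qkNope + vHead
    out := make([]float32, nHead*kvLoraRank*qkNope)
    for h := 0; h < nHead; h++ {
        for r := 0; r < kvLoraRank; r++ {
            for n := 0; n < qkNope; n++ {
                src := (h*perHead+n)*kvLoraRank + r // [nHead*perHead, kvLoraRank]
                dst := (h*kvLoraRank+r)*qkNope + n  // [nHead, kvLoraRank, qkNope]
                out[dst] = kvB[src]
            }
        }
    }
    return out
}

func main() {
    nHead, qkNope, vHead, kvLoraRank := 2, 3, 2, 4 // toy sizes only
    kvB := make([]float32, nHead*(qkNope+vHead)*kvLoraRank)
    for i := range kvB {
        kvB[i] = float32(i)
    }
    fmt.Println(extractK(kvB, nHead, qkNope, vHead, kvLoraRank))
}

The V path is the same per-head slice of rows (qkNope through qkNope+vHead-1) with no final transpose, which is why the converter's else branch skips the extra Transpose/Materialize step.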
View File

@@ -47,7 +47,9 @@ type Attention struct {
KVA *nn.Linear `gguf:"attn_kv_a_mqa"`
KVANorm *nn.RMSNorm `gguf:"attn_kv_a_norm"`
KVB *nn.Linear `gguf:"attn_kv_b"`
KB *nn.Linear `gguf:"attn_k_b"`
VB *nn.Linear `gguf:"attn_v_b"`
Output *nn.Linear `gguf:"attn_out,alt:attn_output"`
}
@@ -78,15 +80,21 @@ func (attn *Attention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor
qRot := opts.applyRotaryPositionEmbeddings(ctx, queryChunks[1], positions)
kRot = opts.applyRotaryPositionEmbeddings(ctx, kRot, positions)
kPass = attn.KVANorm.Forward(ctx, kPass, opts.eps)
-kPass = attn.KVB.Forward(ctx, kPass)
-kv := kPass.Reshape(ctx, kPass.Dim(0)/opts.numKVHeads, opts.numKVHeads, seqLength)
-kvChunks := kv.ChunkSections(ctx, 0, opts.kqNopeHeadDim, opts.vHeadDim)
+// MLA absorption: absorb K projection into query
+qPass := queryChunks[0].Permute(ctx, 0, 2, 1, 3)
+qPassAbsorb := attn.KB.Forward(ctx, qPass)
+qPassAbsorb = qPassAbsorb.Permute(ctx, 0, 2, 1, 3)
-kRot = kRot.Repeat(ctx, 1, queryChunks[0].Dim(1))
-query = qRot.Concat(ctx, queryChunks[0], 0)
-key := kRot.Concat(ctx, kvChunks[0], 0)
-attention := nn.Attention(ctx, query, key, kvChunks[1], opts.kqScale, cache)
+// Build absorbed query (rope first for in-place context shifting)
+query = qRot.Concat(ctx, qPassAbsorb, 0)
+// Compressed KV
+kPass = kPass.Reshape(ctx, opts.kvLoraRank, 1, seqLength)
+key := kRot.Concat(ctx, kPass, 0)
+value := kPass
+attention := nn.AttentionWithVMLA(ctx, query, key, value, nil, attn.VB.Weight, opts.kqScale, cache)
attention = attention.Reshape(ctx, attention.Dim(0)*attention.Dim(1), seqLength)
return attn.Output.Forward(ctx, attention)
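
The rewritten Forward can attend against the compressed latent because of the MLA absorption identity: with k_nope = W_kb * c, each per-head "nope" score satisfies q_nope . (W_kb * c) = (W_kb^T * q_nope) . c, so multiplying the query once by attn_k_b (qPassAbsorb above) stands in for expanding every cached position. Below is a toy numeric check of that identity, with small made-up sizes and a stand-in matrix for one head of the K projection:

package main

import (
    "fmt"
    "math/rand"
)

// Toy check of the absorption identity: with kNope = W*c, the score q·kNope
// equals (Wᵀ·q)·c. W stands in for one head of the K half of kv_b; qkNope and
// kvLoraRank are small made-up sizes, not the model's real dimensions.
func main() {
    const qkNope, kvLoraRank = 4, 6
    w := make([]float64, qkNope*kvLoraRank) // row-major qkNope x kvLoraRank
    q := make([]float64, qkNope)            // q_nope for one head
    c := make([]float64, kvLoraRank)        // cached latent (kPass after the norm)
    for i := range w {
        w[i] = rand.Float64()
    }
    for i := range q {
        q[i] = rand.Float64()
    }
    for i := range c {
        c[i] = rand.Float64()
    }

    // Unabsorbed: expand the latent to kNope, then dot with the query.
    var unabsorbed float64
    for i := 0; i < qkNope; i++ {
        var kNope float64
        for j := 0; j < kvLoraRank; j++ {
            kNope += w[i*kvLoraRank+j] * c[j]
        }
        unabsorbed += q[i] * kNope
    }

    // Absorbed: fold W into the query once, then dot against the latent itself.
    var absorbed float64
    for j := 0; j < kvLoraRank; j++ {
        var qAbs float64
        for i := 0; i < qkNope; i++ {
            qAbs += w[i*kvLoraRank+j] * q[i]
        }
        absorbed += qAbs * c[j]
    }

    fmt.Println(unabsorbed, absorbed) // equal up to floating-point rounding
}

The value side is deferred rather than absorbed: only the latent is cached, and the attn_v_b projection is folded in by nn.AttentionWithVMLA via the VB weight passed above, rather than being materialized per cached token.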
@@ -217,8 +225,12 @@ func New(c fs.Config) (model.Model, error) {
keyLength := int(c.Uint("attention.key_length"))
valueLength := int(c.Uint("attention.value_length"))
kvLoraRank := int(c.Uint("attention.kv_lora_rank"))
qkRopeHeadDim := int(c.Uint("rope.dimension_count"))
-kqScale := 1.0 / math.Sqrt(float64(keyLength))
+// For MLA absorption, the effective key dimension is kvLoraRank + qkRopeHeadDim
+mlaKeyLength := kvLoraRank + qkRopeHeadDim
+kqScale := 1.0 / math.Sqrt(float64(mlaKeyLength))
var pre []string
switch c.String("tokenizer.ggml.pre") {
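
As a worked instance of the new scale, using the same illustrative numbers as above (kv_lora_rank = 512, rope.dimension_count = 64, not the model's actual values): mlaKeyLength = 512 + 64 = 576, so kqScale = 1 / sqrt(576) ≈ 0.0417, replacing the previous 1 / sqrt(attention.key_length) factor.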