mirror of
https://github.com/ollama/ollama.git
synced 2026-02-02 03:33:38 -05:00
Compare commits
6 Commits
brucemacd/
...
ollama-glm
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f201b7d258 | ||
|
|
6f5b814b86 | ||
|
|
79c00a1b16 | ||
|
|
9eded5fddb | ||
|
|
2626ec7772 | ||
|
|
f408e0ff5e |
@@ -313,6 +313,8 @@ func LoadModelMetadata(fsys fs.FS) (ModelKV, *Tokenizer, error) {
|
||||
conv = &deepseek2Model{}
|
||||
case "Glm4MoeLiteForCausalLM":
|
||||
conv = &glm4MoeLiteModel{}
|
||||
case "GlmOcrForConditionalGeneration":
|
||||
conv = &glmOcrModel{}
|
||||
case "Lfm2ForCausalLM":
|
||||
conv = &lfm2Model{}
|
||||
default:
|
||||
|
||||
469
convert/convert_glmocr.go
Normal file
469
convert/convert_glmocr.go
Normal file
@@ -0,0 +1,469 @@
|
||||
package convert
|
||||
|
||||
import (
|
||||
"cmp"
|
||||
"encoding/json"
|
||||
"io/fs"
|
||||
"log/slog"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
"github.com/pdevine/tensor"
|
||||
"github.com/pdevine/tensor/native"
|
||||
)
|
||||
|
||||
// normalToNeoXRepacker creates a repacker that permutes Q/K weights from interleaved (LLaMA)
|
||||
// to NeoX ordering for compatibility with GGML's M-RoPE kernel.
|
||||
//
|
||||
// For weights: reshape [out, in] -> [n_heads, head_dim, in], permute rotary dims, reshape back
|
||||
// For biases: reshape [out] -> [n_heads, head_dim], permute rotary dims, reshape back
|
||||
func normalToNeoXRepacker(nHeads, headDim int, partialRotaryFactor float32) func(string, []float32, []uint64) ([]float32, error) {
|
||||
return func(_ string, data []float32, shape []uint64) ([]float32, error) {
|
||||
rotaryDim := int(float32(headDim) * partialRotaryFactor)
|
||||
if rotaryDim%2 != 0 {
|
||||
rotaryDim = (rotaryDim / 2) * 2 // Round down to even
|
||||
}
|
||||
|
||||
// Handle 1D (bias) or 2D (weight) tensors
|
||||
is1D := len(shape) == 1
|
||||
var inFeatures int
|
||||
if is1D {
|
||||
inFeatures = 1
|
||||
} else {
|
||||
inFeatures = int(shape[1])
|
||||
}
|
||||
outFeatures := int(shape[0])
|
||||
nEffectiveHeads := outFeatures / headDim
|
||||
|
||||
if nEffectiveHeads != nHeads {
|
||||
slog.Warn("normalToNeoX: unexpected head count", "effective", nEffectiveHeads, "expected", nHeads)
|
||||
}
|
||||
|
||||
// Reshape to [n_heads, head_dim, in_features]
|
||||
reshaped := make([]float32, len(data))
|
||||
copy(reshaped, data)
|
||||
|
||||
// Permute the rotary dimensions: even indices first, then odd
|
||||
// For each head, reorder [0,1,2,3,4,5...] to [0,2,4...,1,3,5...]
|
||||
result := make([]float32, len(data))
|
||||
halfRotary := rotaryDim / 2
|
||||
|
||||
for h := range nEffectiveHeads {
|
||||
for f := range inFeatures {
|
||||
for i := range halfRotary {
|
||||
// Even dim (0, 2, 4, ...) -> position i
|
||||
srcIdx := h*headDim*inFeatures + (2*i)*inFeatures + f
|
||||
dstIdx := h*headDim*inFeatures + i*inFeatures + f
|
||||
result[dstIdx] = reshaped[srcIdx]
|
||||
|
||||
// Odd dim (1, 3, 5, ...) -> position halfRotary + i
|
||||
srcIdx = h*headDim*inFeatures + (2*i+1)*inFeatures + f
|
||||
dstIdx = h*headDim*inFeatures + (halfRotary+i)*inFeatures + f
|
||||
result[dstIdx] = reshaped[srcIdx]
|
||||
}
|
||||
|
||||
// Non-rotary part: copy as-is
|
||||
for i := rotaryDim; i < headDim; i++ {
|
||||
srcIdx := h*headDim*inFeatures + i*inFeatures + f
|
||||
result[srcIdx] = reshaped[srcIdx]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
}
|
||||
|
||||
type glmOcrModel struct {
|
||||
ModelParameters
|
||||
|
||||
TextConfig struct {
|
||||
HiddenSize uint32 `json:"hidden_size"`
|
||||
IntermediateSize uint32 `json:"intermediate_size"`
|
||||
NumHiddenLayers uint32 `json:"num_hidden_layers"`
|
||||
NumAttentionHeads uint32 `json:"num_attention_heads"`
|
||||
NumKeyValueHeads uint32 `json:"num_key_value_heads"`
|
||||
HeadDim uint32 `json:"head_dim"`
|
||||
MaxPositionEmbed uint32 `json:"max_position_embeddings"`
|
||||
RMSNormEps float32 `json:"rms_norm_eps"`
|
||||
PartialRotaryFactor float32 `json:"partial_rotary_factor"`
|
||||
RopeParameters struct {
|
||||
RopeType string `json:"rope_type"`
|
||||
MRopeSection []int32 `json:"mrope_section"`
|
||||
RopeTheta float32 `json:"rope_theta"`
|
||||
PartialRotaryFactor float32 `json:"partial_rotary_factor"`
|
||||
} `json:"rope_parameters"`
|
||||
} `json:"text_config"`
|
||||
|
||||
VisionConfig struct {
|
||||
HiddenSize uint32 `json:"hidden_size"`
|
||||
IntermediateSize uint32 `json:"intermediate_size"`
|
||||
Depth uint32 `json:"depth"`
|
||||
NumHeads uint32 `json:"num_heads"`
|
||||
ImageSize uint32 `json:"image_size"`
|
||||
PatchSize uint32 `json:"patch_size"`
|
||||
OutHiddenSize uint32 `json:"out_hidden_size"`
|
||||
RMSNormEps float32 `json:"rms_norm_eps"`
|
||||
SpatialMergeSize uint32 `json:"spatial_merge_size"`
|
||||
TemporalPatchSize uint32 `json:"temporal_patch_size"`
|
||||
} `json:"vision_config"`
|
||||
|
||||
ImageStartTokenID uint32 `json:"image_start_token_id"`
|
||||
ImageEndTokenID uint32 `json:"image_end_token_id"`
|
||||
VideoStartTokenID uint32 `json:"video_start_token_id"`
|
||||
VideoEndTokenID uint32 `json:"video_end_token_id"`
|
||||
ImageTokenID uint32 `json:"image_token_id"`
|
||||
VideoTokenID uint32 `json:"video_token_id"`
|
||||
|
||||
// Preprocessor config (preprocessor_config.json)
|
||||
Preprocessor struct {
|
||||
Size struct {
|
||||
ShortestEdge uint32 `json:"shortest_edge"`
|
||||
LongestEdge uint32 `json:"longest_edge"`
|
||||
} `json:"size"`
|
||||
PatchSize uint32 `json:"patch_size"`
|
||||
TemporalPatchSize uint32 `json:"temporal_patch_size"`
|
||||
MergeSize uint32 `json:"merge_size"`
|
||||
ImageMean []float32 `json:"image_mean"`
|
||||
ImageStd []float32 `json:"image_std"`
|
||||
} `json:"-"`
|
||||
}
|
||||
|
||||
var _ ModelConverter = (*glmOcrModel)(nil)
|
||||
|
||||
func (m *glmOcrModel) parseMore(fsys fs.FS) error {
|
||||
bts, err := fs.ReadFile(fsys, "preprocessor_config.json")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return json.Unmarshal(bts, &m.Preprocessor)
|
||||
}
|
||||
|
||||
func (m *glmOcrModel) KV(t *Tokenizer) KV {
|
||||
kv := m.ModelParameters.KV(t)
|
||||
kv["general.architecture"] = "glmocr"
|
||||
|
||||
// Text model parameters
|
||||
kv["glmocr.block_count"] = cmp.Or(m.TextConfig.NumHiddenLayers, 16)
|
||||
kv["glmocr.embedding_length"] = cmp.Or(m.TextConfig.HiddenSize, 1536)
|
||||
kv["glmocr.attention.head_count"] = cmp.Or(m.TextConfig.NumAttentionHeads, 16)
|
||||
kv["glmocr.attention.head_count_kv"] = cmp.Or(m.TextConfig.NumKeyValueHeads, 8)
|
||||
headDim := cmp.Or(m.TextConfig.HeadDim, m.TextConfig.HiddenSize/m.TextConfig.NumAttentionHeads)
|
||||
kv["glmocr.attention.key_length"] = headDim
|
||||
kv["glmocr.attention.value_length"] = headDim
|
||||
kv["glmocr.feed_forward_length"] = cmp.Or(m.TextConfig.IntermediateSize, 4608)
|
||||
kv["glmocr.attention.layer_norm_rms_epsilon"] = cmp.Or(m.TextConfig.RMSNormEps, 1e-5)
|
||||
kv["glmocr.context_length"] = cmp.Or(m.TextConfig.MaxPositionEmbed, 131072)
|
||||
kv["glmocr.rope.freq_base"] = cmp.Or(m.TextConfig.RopeParameters.RopeTheta, float32(10000))
|
||||
kv["glmocr.rope.partial_rotary_factor"] = cmp.Or(m.TextConfig.RopeParameters.PartialRotaryFactor, m.TextConfig.PartialRotaryFactor, float32(1.0))
|
||||
if len(m.TextConfig.RopeParameters.MRopeSection) > 0 {
|
||||
kv["glmocr.rope.mrope_section"] = m.TextConfig.RopeParameters.MRopeSection
|
||||
}
|
||||
|
||||
// Vision model parameters
|
||||
kv["glmocr.vision.block_count"] = cmp.Or(m.VisionConfig.Depth, 24)
|
||||
kv["glmocr.vision.embedding_length"] = cmp.Or(m.VisionConfig.HiddenSize, 1024)
|
||||
kv["glmocr.vision.attention.head_count"] = cmp.Or(m.VisionConfig.NumHeads, 16)
|
||||
kv["glmocr.vision.image_size"] = cmp.Or(m.VisionConfig.ImageSize, 336)
|
||||
kv["glmocr.vision.patch_size"] = cmp.Or(m.VisionConfig.PatchSize, m.Preprocessor.PatchSize, 14)
|
||||
kv["glmocr.vision.spatial_merge_size"] = cmp.Or(m.VisionConfig.SpatialMergeSize, m.Preprocessor.MergeSize, 2)
|
||||
kv["glmocr.vision.temporal_patch_size"] = cmp.Or(m.VisionConfig.TemporalPatchSize, m.Preprocessor.TemporalPatchSize, 2)
|
||||
kv["glmocr.vision.out_hidden_size"] = cmp.Or(m.VisionConfig.OutHiddenSize, 1536)
|
||||
kv["glmocr.vision.intermediate_size"] = cmp.Or(m.VisionConfig.IntermediateSize, 4096)
|
||||
kv["glmocr.vision.attention.layer_norm_rms_epsilon"] = cmp.Or(m.VisionConfig.RMSNormEps, 1e-5)
|
||||
|
||||
// Preprocessor-derived image settings (min/max pixels and normalization)
|
||||
// Note: fs.Config.keyValue() auto-prepends architecture prefix, so use full key
|
||||
if m.Preprocessor.Size.ShortestEdge > 0 {
|
||||
kv["glmocr.vision.min_pixels"] = m.Preprocessor.Size.ShortestEdge
|
||||
}
|
||||
if m.Preprocessor.Size.LongestEdge > 0 {
|
||||
kv["glmocr.vision.max_pixels"] = m.Preprocessor.Size.LongestEdge
|
||||
}
|
||||
if len(m.Preprocessor.ImageMean) == 3 {
|
||||
kv["glmocr.vision.image_mean"] = m.Preprocessor.ImageMean
|
||||
}
|
||||
if len(m.Preprocessor.ImageStd) == 3 {
|
||||
kv["glmocr.vision.image_std"] = m.Preprocessor.ImageStd
|
||||
}
|
||||
|
||||
// Special tokens
|
||||
kv["glmocr.image_token_id"] = m.ImageTokenID
|
||||
kv["glmocr.image_start_token_id"] = m.ImageStartTokenID
|
||||
kv["glmocr.image_end_token_id"] = m.ImageEndTokenID
|
||||
kv["glmocr.video_token_id"] = m.VideoTokenID
|
||||
kv["glmocr.video_start_token_id"] = m.VideoStartTokenID
|
||||
kv["glmocr.video_end_token_id"] = m.VideoEndTokenID
|
||||
|
||||
return kv
|
||||
}
|
||||
|
||||
func (m *glmOcrModel) Tensors(ts []Tensor) []*ggml.Tensor {
|
||||
var out []*ggml.Tensor
|
||||
|
||||
// Skip layers >= num_hidden_layers (Multi-Token Prediction layers not needed for basic inference)
|
||||
numLayers := int(cmp.Or(m.TextConfig.NumHiddenLayers, 16))
|
||||
skipLayer := func(name string) bool {
|
||||
// Tensor names are already replaced to "blk.N.xxx" format
|
||||
re := regexp.MustCompile(`^blk\.(\d+)`)
|
||||
matches := re.FindStringSubmatch(name)
|
||||
if matches == nil {
|
||||
return false
|
||||
}
|
||||
blkNum, err := strconv.Atoi(matches[1])
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
return blkNum >= numLayers
|
||||
}
|
||||
|
||||
for _, t := range ts {
|
||||
name := t.Name()
|
||||
|
||||
// Skip next-n prediction layers (layers >= num_hidden_layers)
|
||||
if skipLayer(name) {
|
||||
continue
|
||||
}
|
||||
|
||||
// Split ffn_gate_up into separate gate and up projections
|
||||
if strings.Contains(name, "ffn_gate_up") {
|
||||
for t := range splitDim(t, 0,
|
||||
split{Replacer: strings.NewReplacer("ffn_gate_up", "ffn_gate")},
|
||||
split{Replacer: strings.NewReplacer("ffn_gate_up", "ffn_up")},
|
||||
) {
|
||||
out = append(out, t)
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
// Split 5D Conv3D patch_embed weight into two Conv2D weights along temporal dimension
|
||||
// Shape: [out_channels, in_channels, temporal=2, height, width] -> 2x [out_channels, in_channels, height, width]
|
||||
// NOTE: Tensor names are already renamed via Replacements() before Tensors() is called,
|
||||
// so we check for "patch_embd" (renamed) not "patch_embed" (original safetensors name)
|
||||
// NOTE: Ollama Conv2D expects PyTorch format [OC, IC, KH, KW] - no transpose needed
|
||||
if strings.HasSuffix(name, "patch_embd.weight") {
|
||||
shape := t.Shape()
|
||||
if len(shape) == 5 && shape[2] == 2 {
|
||||
// Original shape: [OC, IC, 2, KH, KW] -> [OC, IC, KH, KW] (PyTorch format, no transpose)
|
||||
newShape := []uint64{shape[0], shape[1], shape[3], shape[4]}
|
||||
|
||||
// Create repacker for first temporal slice (t=0)
|
||||
t0 := t.Clone()
|
||||
t0.SetRepacker(func(_ string, data []float32, shape []uint64) ([]float32, error) {
|
||||
dims := make([]int, len(shape))
|
||||
for i := range shape {
|
||||
dims[i] = int(shape[i])
|
||||
}
|
||||
var tt tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
|
||||
// Slice first temporal frame: [:, :, 0, :, :]
|
||||
tt, err := tt.Slice(nil, nil, tensor.S(0, 1), nil, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
tt = tensor.Materialize(tt)
|
||||
// Reshape to 4D by squeezing temporal dim [OC, IC, 1, KH, KW] -> [OC, IC, KH, KW]
|
||||
newDims := []int{int(shape[0]), int(shape[1]), int(shape[3]), int(shape[4])}
|
||||
if err := tt.Reshape(newDims...); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
// No transpose - keep PyTorch format
|
||||
if err := tt.Reshape(tt.Shape().TotalSize()); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return native.VectorF32(tt.(*tensor.Dense))
|
||||
})
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: strings.Replace(name, "patch_embd.weight", "patch_embd_0.weight", 1),
|
||||
Kind: t.Kind(),
|
||||
Shape: newShape,
|
||||
WriterTo: t0,
|
||||
})
|
||||
|
||||
// Create repacker for second temporal slice (t=1)
|
||||
t1 := t.Clone()
|
||||
t1.SetRepacker(func(_ string, data []float32, shape []uint64) ([]float32, error) {
|
||||
dims := make([]int, len(shape))
|
||||
for i := range shape {
|
||||
dims[i] = int(shape[i])
|
||||
}
|
||||
var tt tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
|
||||
// Slice second temporal frame: [:, :, 1, :, :]
|
||||
tt, err := tt.Slice(nil, nil, tensor.S(1, 2), nil, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
tt = tensor.Materialize(tt)
|
||||
// Reshape to 4D by squeezing temporal dim [OC, IC, 1, KH, KW] -> [OC, IC, KH, KW]
|
||||
newDims := []int{int(shape[0]), int(shape[1]), int(shape[3]), int(shape[4])}
|
||||
if err := tt.Reshape(newDims...); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
// No transpose - keep PyTorch format
|
||||
if err := tt.Reshape(tt.Shape().TotalSize()); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return native.VectorF32(tt.(*tensor.Dense))
|
||||
})
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: strings.Replace(name, "patch_embd.weight", "patch_embd_1.weight", 1),
|
||||
Kind: t.Kind(),
|
||||
Shape: newShape,
|
||||
WriterTo: t1,
|
||||
})
|
||||
|
||||
continue
|
||||
}
|
||||
|
||||
if len(shape) == 4 {
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: strings.Replace(name, "patch_embd.weight", "patch_embd_0.weight", 1),
|
||||
Kind: t.Kind(),
|
||||
Shape: t.Shape(),
|
||||
WriterTo: t,
|
||||
})
|
||||
continue
|
||||
}
|
||||
|
||||
slog.Warn("glmocr: patch_embed weight has unexpected shape - not splitting", "shape", shape)
|
||||
// Fall through to default handling
|
||||
}
|
||||
|
||||
// Handle pre-split patch embedding weights
|
||||
// Pattern 1: v.patch_embd.0.weight, v.patch_embd.1.weight -> patch_embd_0.weight, patch_embd_1.weight
|
||||
// Pattern 2: v.patch_embd.weight.0, v.patch_embd.weight.1 -> patch_embd_0.weight, patch_embd_1.weight
|
||||
if strings.Contains(name, "patch_embd.0.") {
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: strings.Replace(name, "patch_embd.0.", "patch_embd_0.", 1),
|
||||
Kind: t.Kind(),
|
||||
Shape: t.Shape(),
|
||||
WriterTo: t,
|
||||
})
|
||||
continue
|
||||
}
|
||||
if strings.Contains(name, "patch_embd.1.") {
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: strings.Replace(name, "patch_embd.1.", "patch_embd_1.", 1),
|
||||
Kind: t.Kind(),
|
||||
Shape: t.Shape(),
|
||||
WriterTo: t,
|
||||
})
|
||||
continue
|
||||
}
|
||||
// Handle .weight.0 and .weight.1 suffix patterns
|
||||
if strings.HasSuffix(name, "patch_embd.weight.0") {
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: strings.Replace(name, "patch_embd.weight.0", "patch_embd_0.weight", 1),
|
||||
Kind: t.Kind(),
|
||||
Shape: t.Shape(),
|
||||
WriterTo: t,
|
||||
})
|
||||
continue
|
||||
}
|
||||
if strings.HasSuffix(name, "patch_embd.weight.1") {
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: strings.Replace(name, "patch_embd.weight.1", "patch_embd_1.weight", 1),
|
||||
Kind: t.Kind(),
|
||||
Shape: t.Shape(),
|
||||
WriterTo: t,
|
||||
})
|
||||
continue
|
||||
}
|
||||
|
||||
// Permute Q/K weights for M-RoPE compatibility (interleaved -> NeoX ordering)
|
||||
// GGML's M-RoPE kernel uses NeoX-style rotation, but GLM-OCR uses interleaved (LLaMA-style)
|
||||
// We permute at conversion time so the weights work correctly with GGML's kernel
|
||||
// This aligns Q/K rotary dimensions with GGML's NeoX-style rotation
|
||||
if len(m.TextConfig.RopeParameters.MRopeSection) > 0 &&
|
||||
strings.Contains(name, "blk.") && (strings.Contains(name, "attn_q.") || strings.Contains(name, "attn_k.")) {
|
||||
// Get config values for permutation
|
||||
nHeads := int(cmp.Or(m.TextConfig.NumAttentionHeads, 16))
|
||||
nKVHeads := int(cmp.Or(m.TextConfig.NumKeyValueHeads, 8))
|
||||
hiddenSize := int(cmp.Or(m.TextConfig.HiddenSize, 1536))
|
||||
headDim := int(cmp.Or(m.TextConfig.HeadDim, uint32(hiddenSize/nHeads)))
|
||||
partialRotaryFactor := cmp.Or(m.TextConfig.PartialRotaryFactor, m.TextConfig.RopeParameters.PartialRotaryFactor, float32(1.0))
|
||||
|
||||
// Use appropriate head count: nHeads for Q, nKVHeads for K
|
||||
effectiveHeads := nHeads
|
||||
if strings.Contains(name, "attn_k.") {
|
||||
effectiveHeads = nKVHeads
|
||||
}
|
||||
|
||||
permutedT := t.Clone()
|
||||
permutedT.SetRepacker(normalToNeoXRepacker(effectiveHeads, headDim, partialRotaryFactor))
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: name,
|
||||
Kind: t.Kind(),
|
||||
Shape: t.Shape(),
|
||||
WriterTo: permutedT,
|
||||
})
|
||||
continue
|
||||
}
|
||||
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: name,
|
||||
Kind: t.Kind(),
|
||||
Shape: t.Shape(),
|
||||
WriterTo: t,
|
||||
})
|
||||
}
|
||||
|
||||
return out
|
||||
}
|
||||
|
||||
func (m *glmOcrModel) Replacements() []string {
|
||||
return []string{
|
||||
// Vision encoder
|
||||
"model.visual.patch_embed.proj_1", "v.patch_embd_1", // Second temporal split
|
||||
"model.visual.patch_embed.proj", "v.patch_embd",
|
||||
"model.visual.blocks", "v.blk",
|
||||
"model.visual.post_layernorm", "v.post_ln",
|
||||
"model.visual.downsample", "mm.patch_merger",
|
||||
|
||||
// Vision attention
|
||||
"attn.qkv", "attn_qkv",
|
||||
"attn.proj", "attn_out",
|
||||
"attn.q_norm", "attn_q_norm",
|
||||
"attn.k_norm", "attn_k_norm",
|
||||
|
||||
// Vision norms
|
||||
"norm1", "ln1",
|
||||
"norm2", "ln2",
|
||||
|
||||
// Vision MLP
|
||||
"mlp.gate_proj", "ffn_gate",
|
||||
"mlp.up_proj", "ffn_up",
|
||||
"mlp.down_proj", "ffn_down",
|
||||
|
||||
// Merger (multimodal projector)
|
||||
"model.visual.merger.proj", "mm.model.fc",
|
||||
"model.visual.merger.post_projection_norm", "mm.post_norm",
|
||||
"model.visual.merger.gate_proj", "mm.gate",
|
||||
"model.visual.merger.up_proj", "mm.up",
|
||||
"model.visual.merger.down_proj", "mm.down",
|
||||
|
||||
// Language model
|
||||
"model.language_model.embed_tokens", "token_embd",
|
||||
"model.language_model.layers", "blk",
|
||||
"model.language_model.norm", "output_norm",
|
||||
"lm_head", "output",
|
||||
|
||||
// Language model attention
|
||||
"self_attn.q_proj", "attn_q",
|
||||
"self_attn.k_proj", "attn_k",
|
||||
"self_attn.v_proj", "attn_v",
|
||||
"self_attn.o_proj", "attn_out",
|
||||
|
||||
// Language model norms
|
||||
"input_layernorm", "attn_norm",
|
||||
"post_attention_layernorm", "ffn_norm",
|
||||
"post_self_attn_layernorm", "post_attn_norm",
|
||||
"post_mlp_layernorm", "post_ffn_norm",
|
||||
|
||||
// Language model MLP (remove mlp. prefix so ffn_* names work)
|
||||
"mlp.gate_up_proj", "ffn_gate_up",
|
||||
"mlp.down_proj", "ffn_down",
|
||||
}
|
||||
}
|
||||
@@ -99,6 +99,7 @@ func (st safetensor) Kind() uint32 {
|
||||
if st.dtype == "BF16" &&
|
||||
!strings.HasPrefix(st.name, "v.") &&
|
||||
!strings.HasPrefix(st.name, "s.") &&
|
||||
!strings.HasPrefix(st.name, "mm.") &&
|
||||
kind != tensorKindFP32 {
|
||||
kind = tensorKindBF16
|
||||
}
|
||||
|
||||
@@ -270,6 +270,7 @@ func (kv KV) OllamaEngineRequired() bool {
|
||||
"qwen3", "qwen3moe",
|
||||
"qwen3vl", "qwen3vlmoe",
|
||||
"glm4moelite",
|
||||
"glmocr",
|
||||
"lfm2",
|
||||
}, kv.Architecture())
|
||||
}
|
||||
@@ -859,6 +860,7 @@ func (f GGML) FlashAttention() bool {
|
||||
"bert",
|
||||
"gemma3",
|
||||
"glm4moelite",
|
||||
"glmocr",
|
||||
"gptoss", "gpt-oss",
|
||||
"lfm2",
|
||||
"mistral3",
|
||||
|
||||
@@ -242,7 +242,6 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st
|
||||
} else {
|
||||
// For Ollama engine, use our SupportsFlashAttention logic
|
||||
if fa {
|
||||
slog.Info("enabling flash attention")
|
||||
loadRequest.FlashAttention = ml.FlashAttentionEnabled
|
||||
|
||||
// Flash Attention also supports kv cache quantization
|
||||
|
||||
@@ -170,6 +170,7 @@ type Tensor interface {
|
||||
Cos(ctx Context) Tensor
|
||||
Tanh(ctx Context) Tensor
|
||||
GELU(ctx Context, up ...Tensor) Tensor
|
||||
GELU_ERF(ctx Context) Tensor
|
||||
QuickGELU(ctx Context, up ...Tensor) Tensor
|
||||
SILU(ctx Context, up ...Tensor) Tensor
|
||||
RELU(ctx Context, up ...Tensor) Tensor
|
||||
|
||||
@@ -1581,6 +1581,13 @@ func (t *Tensor) GELU(ctx ml.Context, t2 ...ml.Tensor) ml.Tensor {
|
||||
}
|
||||
}
|
||||
|
||||
func (t *Tensor) GELU_ERF(ctx ml.Context) ml.Tensor {
|
||||
return &Tensor{
|
||||
b: t.b,
|
||||
t: C.ggml_gelu_erf_inplace(ctx.(*Context).ctx, t.t),
|
||||
}
|
||||
}
|
||||
|
||||
func (t *Tensor) QuickGELU(ctx ml.Context, t2 ...ml.Tensor) ml.Tensor {
|
||||
var tt *C.struct_ggml_tensor
|
||||
if len(t2) > 0 {
|
||||
|
||||
@@ -20,6 +20,7 @@ const (
|
||||
ResizeBilinear = iota
|
||||
ResizeNearestNeighbor
|
||||
ResizeApproxBilinear
|
||||
ResizeBicubic
|
||||
ResizeCatmullrom
|
||||
)
|
||||
|
||||
@@ -45,6 +46,7 @@ func Resize(img image.Image, newSize image.Point, method int) image.Image {
|
||||
ResizeBilinear: draw.BiLinear,
|
||||
ResizeNearestNeighbor: draw.NearestNeighbor,
|
||||
ResizeApproxBilinear: draw.ApproxBiLinear,
|
||||
ResizeBicubic: draw.CatmullRom,
|
||||
ResizeCatmullrom: draw.CatmullRom,
|
||||
}
|
||||
|
||||
|
||||
171
model/models/glmocr/imageprocessor.go
Normal file
171
model/models/glmocr/imageprocessor.go
Normal file
@@ -0,0 +1,171 @@
|
||||
package glmocr
|
||||
|
||||
import (
|
||||
"image"
|
||||
"math"
|
||||
|
||||
"github.com/ollama/ollama/fs"
|
||||
"github.com/ollama/ollama/model/imageproc"
|
||||
)
|
||||
|
||||
type ImageProcessor struct {
|
||||
imageSize int
|
||||
patchSize int
|
||||
temporalPatchSize int
|
||||
spatialMergeSize int
|
||||
minPixels int
|
||||
maxPixels int
|
||||
factor int
|
||||
imageMean [3]float32
|
||||
imageStd [3]float32
|
||||
}
|
||||
|
||||
func newImageProcessor(c fs.Config) ImageProcessor {
|
||||
patchSize := int(c.Uint("vision.patch_size", 14))
|
||||
spatialMergeSize := int(c.Uint("vision.spatial_merge_size", 2))
|
||||
temporalPatchSize := int(c.Uint("vision.temporal_patch_size", 2))
|
||||
|
||||
// Read normalization values from config if available, otherwise use CLIP defaults
|
||||
imageMean := c.Floats("vision.image_mean", imageproc.ClipDefaultMean[:])
|
||||
imageStd := c.Floats("vision.image_std", imageproc.ClipDefaultSTD[:])
|
||||
|
||||
// Default max_pixels: 2048 * patchSize² * mergeSize² * temporal = ~3.2M pixels
|
||||
// This limits to ~16k patches (4k output tokens) to keep memory stable without flash attention
|
||||
defaultMaxPixels := 2048 * patchSize * patchSize * spatialMergeSize * spatialMergeSize * temporalPatchSize
|
||||
|
||||
return ImageProcessor{
|
||||
imageSize: int(c.Uint("vision.image_size", 336)),
|
||||
patchSize: patchSize,
|
||||
temporalPatchSize: temporalPatchSize,
|
||||
spatialMergeSize: spatialMergeSize,
|
||||
minPixels: int(c.Uint("vision.min_pixels", uint32(8*patchSize*patchSize*spatialMergeSize*spatialMergeSize*temporalPatchSize))),
|
||||
maxPixels: int(c.Uint("vision.max_pixels", uint32(defaultMaxPixels))),
|
||||
factor: patchSize * spatialMergeSize,
|
||||
imageMean: [3]float32{imageMean[0], imageMean[1], imageMean[2]},
|
||||
imageStd: [3]float32{imageStd[0], imageStd[1], imageStd[2]},
|
||||
}
|
||||
}
|
||||
|
||||
func (p *ImageProcessor) SmartResize(height, width int) (int, int) {
|
||||
factor := p.factor
|
||||
temporalFactor := p.temporalPatchSize
|
||||
numFrames := temporalFactor // single image
|
||||
|
||||
if height < factor || width < factor {
|
||||
// Scale up small images
|
||||
scale := float64(factor) / float64(min(height, width))
|
||||
height = int(math.Ceil(float64(height) * scale))
|
||||
width = int(math.Ceil(float64(width) * scale))
|
||||
}
|
||||
|
||||
if temporalFactor <= 0 {
|
||||
panic("temporal_patch_size must be > 0")
|
||||
}
|
||||
if numFrames < temporalFactor {
|
||||
panic("num_frames must be >= temporal_patch_size")
|
||||
}
|
||||
if aspectRatio := float64(max(height, width)) / float64(min(height, width)); aspectRatio > 200 {
|
||||
panic("absolute aspect ratio must be smaller than 200")
|
||||
}
|
||||
|
||||
round := func(x float64) int { return int(math.RoundToEven(x)) }
|
||||
|
||||
hBar := round(float64(height)/float64(factor)) * factor
|
||||
wBar := round(float64(width)/float64(factor)) * factor
|
||||
tBar := round(float64(numFrames)/float64(temporalFactor)) * temporalFactor
|
||||
|
||||
if tBar*hBar*wBar > p.maxPixels {
|
||||
beta := math.Sqrt(float64(numFrames*height*width) / float64(p.maxPixels))
|
||||
hBar = int(math.Floor(float64(height)/beta/float64(factor))) * factor
|
||||
wBar = int(math.Floor(float64(width)/beta/float64(factor))) * factor
|
||||
} else if tBar*hBar*wBar < p.minPixels {
|
||||
beta := math.Sqrt(float64(p.minPixels) / float64(numFrames*height*width))
|
||||
hBar = int(math.Ceil(float64(height)*beta/float64(factor))) * factor
|
||||
wBar = int(math.Ceil(float64(width)*beta/float64(factor))) * factor
|
||||
}
|
||||
|
||||
return hBar, wBar
|
||||
}
|
||||
|
||||
func (p *ImageProcessor) ProcessImage(img image.Image) ([]float32, *Grid, error) {
|
||||
img = imageproc.Composite(img)
|
||||
|
||||
origWidth := img.Bounds().Dx()
|
||||
origHeight := img.Bounds().Dy()
|
||||
|
||||
// Calculate smart resize dimensions
|
||||
resizedHeight, resizedWidth := p.SmartResize(origHeight, origWidth)
|
||||
|
||||
// Resize image
|
||||
resizedImg := imageproc.Resize(img, image.Point{X: resizedWidth, Y: resizedHeight}, imageproc.ResizeBicubic)
|
||||
|
||||
// Normalize pixels - output format is [C, H, W] with rescale and channelFirst
|
||||
// We keep [C, H, W] for patch extraction
|
||||
normalizedPixels := imageproc.Normalize(resizedImg, p.imageMean, p.imageStd, true, true)
|
||||
|
||||
// Calculate grid dimensions (after Conv2D patching)
|
||||
grid := &Grid{
|
||||
Height: resizedHeight / p.patchSize,
|
||||
Width: resizedWidth / p.patchSize,
|
||||
Temporal: 1, // Single image
|
||||
ImageHeight: resizedHeight,
|
||||
ImageWidth: resizedWidth,
|
||||
}
|
||||
|
||||
patches, err := p.createPatches(normalizedPixels, resizedHeight, resizedWidth, grid)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
return patches, grid, nil
|
||||
}
|
||||
|
||||
func (p *ImageProcessor) createPatches(pixels []float32, height, width int, grid *Grid) ([]float32, error) {
|
||||
channels := 3
|
||||
patchSize := p.patchSize
|
||||
mergeSize := p.spatialMergeSize
|
||||
temporalPatchSize := p.temporalPatchSize
|
||||
|
||||
numPatches := grid.Temporal * grid.Height * grid.Width
|
||||
patchDim := channels * temporalPatchSize * patchSize * patchSize
|
||||
result := make([]float32, numPatches*patchDim)
|
||||
patchIndex := 0
|
||||
|
||||
// Single temporal frame handling (copies to all frames)
|
||||
for range grid.Temporal {
|
||||
for h := 0; h < grid.Height; h += mergeSize {
|
||||
for w := 0; w < grid.Width; w += mergeSize {
|
||||
for mh := range mergeSize {
|
||||
for mw := range mergeSize {
|
||||
baseOffset := patchIndex * patchDim
|
||||
for c := range channels {
|
||||
channelOffset := baseOffset + (c * temporalPatchSize * patchSize * patchSize)
|
||||
for py := range patchSize {
|
||||
for px := range patchSize {
|
||||
y := (h+mh)*patchSize + py
|
||||
x := (w+mw)*patchSize + px
|
||||
srcIdx := c*height*width + y*width + x
|
||||
dstIdx := channelOffset + (py * patchSize) + px
|
||||
result[dstIdx] = pixels[srcIdx]
|
||||
}
|
||||
}
|
||||
|
||||
if temporalPatchSize > 1 {
|
||||
frameSize := patchSize * patchSize
|
||||
for tp := 1; tp < temporalPatchSize; tp++ {
|
||||
currentFrameOffset := channelOffset + (tp * frameSize)
|
||||
copy(result[currentFrameOffset:currentFrameOffset+frameSize],
|
||||
result[channelOffset:channelOffset+frameSize])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
patchIndex++
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
235
model/models/glmocr/model.go
Normal file
235
model/models/glmocr/model.go
Normal file
@@ -0,0 +1,235 @@
|
||||
package glmocr
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"errors"
|
||||
"image"
|
||||
"slices"
|
||||
|
||||
"github.com/ollama/ollama/fs"
|
||||
"github.com/ollama/ollama/kvcache"
|
||||
"github.com/ollama/ollama/ml"
|
||||
"github.com/ollama/ollama/model"
|
||||
"github.com/ollama/ollama/model/input"
|
||||
)
|
||||
|
||||
type Model struct {
|
||||
model.Base
|
||||
model.BytePairEncoding
|
||||
|
||||
*TextModel
|
||||
*VisionModel `gguf:"v"`
|
||||
VisionDownsample *VisionDownsample `gguf:"mm.patch_merger"`
|
||||
PatchMerger *PatchMerger `gguf:"mm"`
|
||||
|
||||
ImageProcessor
|
||||
|
||||
imageTokenID int32
|
||||
imageStartTokenID int32
|
||||
imageEndTokenID int32
|
||||
}
|
||||
|
||||
var _ model.MultimodalProcessor = (*Model)(nil)
|
||||
|
||||
func New(c fs.Config) (model.Model, error) {
|
||||
eosTokenID := int32(c.Uint("tokenizer.ggml.eos_token_id"))
|
||||
eosTokenIDs := c.Ints("tokenizer.ggml.eos_token_ids")
|
||||
allEOS := append([]int32{eosTokenID}, eosTokenIDs...)
|
||||
|
||||
m := &Model{
|
||||
BytePairEncoding: model.NewBytePairEncoding(
|
||||
&model.Vocabulary{
|
||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||
Merges: c.Strings("tokenizer.ggml.merges"),
|
||||
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", false),
|
||||
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
|
||||
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
|
||||
EOS: allEOS,
|
||||
},
|
||||
`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
|
||||
),
|
||||
TextModel: newTextModel(c),
|
||||
VisionModel: newVisionModel(c),
|
||||
ImageProcessor: newImageProcessor(c),
|
||||
imageTokenID: int32(c.Uint("image_token_id", 59280)),
|
||||
imageStartTokenID: int32(c.Uint("image_start_token_id", 59256)),
|
||||
imageEndTokenID: int32(c.Uint("image_end_token_id", 59257)),
|
||||
}
|
||||
|
||||
m.Cache = kvcache.NewCausalCache(m.TextModel.Shift)
|
||||
|
||||
return m, nil
|
||||
}
|
||||
|
||||
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
|
||||
if len(m.VisionModel.Blocks) == 0 {
|
||||
return nil, model.ErrNoVisionModel
|
||||
}
|
||||
|
||||
img, _, err := image.Decode(bytes.NewReader(multimodalData))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
f32s, grid, err := m.ImageProcessor.ProcessImage(img)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Create pixel values tensor from flattened patches
|
||||
// Shape: [patchDim, numPatches]
|
||||
patchDim := m.VisionModel.numChannels * m.temporalPatchSize * m.patchSize * m.patchSize
|
||||
numPatches := grid.Temporal * grid.Height * grid.Width
|
||||
pixelValues := ctx.Input().FromFloats(f32s, patchDim, numPatches)
|
||||
|
||||
// Forward through vision encoder
|
||||
visionOutputs := m.VisionModel.Forward(ctx, pixelValues, grid)
|
||||
|
||||
// Forward through downsample (patch merger)
|
||||
if m.VisionDownsample == nil || m.VisionDownsample.Weight == nil {
|
||||
return nil, errors.New("glmocr: missing vision downsample weights")
|
||||
}
|
||||
visionOutputs = m.VisionDownsample.Forward(ctx, visionOutputs, grid, m.VisionModel.VisionModelOptions)
|
||||
|
||||
// Forward through patch merger (FC + LayerNorm + GELU + SwiGLU FFN)
|
||||
if m.PatchMerger == nil {
|
||||
return nil, errors.New("glmocr: missing patch merger weights")
|
||||
}
|
||||
visionOutputs = m.PatchMerger.Forward(ctx, visionOutputs, m.VisionModel.VisionModelOptions)
|
||||
|
||||
return []input.Multimodal{{Tensor: visionOutputs, Data: grid}}, nil
|
||||
}
|
||||
|
||||
func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
|
||||
var result []*input.Input
|
||||
|
||||
// Reset position cache
|
||||
m.TextModel.positionCache = m.TextModel.positionCache[:0]
|
||||
m.TextModel.ropeDelta = 0
|
||||
|
||||
pos := int32(0)
|
||||
for _, inp := range inputs {
|
||||
if inp.Multimodal == nil {
|
||||
result = append(result, inp)
|
||||
m.TextModel.positionCache = append(m.TextModel.positionCache, pos)
|
||||
pos++
|
||||
continue
|
||||
}
|
||||
|
||||
// Get grid info for position calculation
|
||||
grid := inp.Multimodal[0].Data.(*Grid)
|
||||
mergedH := grid.Height / m.VisionModel.spatialMergeSize
|
||||
mergedW := grid.Width / m.VisionModel.spatialMergeSize
|
||||
|
||||
// Add image start token
|
||||
result = append(result, &input.Input{Token: m.imageStartTokenID})
|
||||
m.TextModel.positionCache = append(m.TextModel.positionCache, pos)
|
||||
pos++
|
||||
|
||||
// Add image tokens with multimodal data
|
||||
// All image tokens share the same base position for temporal dimension
|
||||
tokensPerGrid := inp.Multimodal[0].Tensor.Dim(1)
|
||||
basePos := pos
|
||||
sameBatch := tokensPerGrid - 1
|
||||
if sameBatch < 0 {
|
||||
sameBatch = 0
|
||||
}
|
||||
result = append(result, &input.Input{
|
||||
Token: m.imageTokenID,
|
||||
Multimodal: inp.Multimodal,
|
||||
MultimodalHash: inp.MultimodalHash,
|
||||
SameBatch: sameBatch,
|
||||
})
|
||||
m.TextModel.positionCache = append(m.TextModel.positionCache, basePos)
|
||||
|
||||
// Add placeholder tokens for remaining positions
|
||||
// All image tokens use the same base position (temporal stays constant)
|
||||
for range tokensPerGrid - 1 {
|
||||
result = append(result, &input.Input{Token: m.imageTokenID})
|
||||
m.TextModel.positionCache = append(m.TextModel.positionCache, basePos)
|
||||
}
|
||||
|
||||
// Advance position by max(mergedH, mergedW) after image tokens
|
||||
pos = basePos + int32(max(mergedH, mergedW))
|
||||
|
||||
// Add image end token
|
||||
result = append(result, &input.Input{Token: m.imageEndTokenID})
|
||||
m.TextModel.positionCache = append(m.TextModel.positionCache, pos)
|
||||
pos++
|
||||
}
|
||||
|
||||
// Compute rope delta for continuation after the prefill segment:
|
||||
// delta = (max_position_id + 1) - sequence_length
|
||||
if len(m.TextModel.positionCache) > 0 {
|
||||
last := m.TextModel.positionCache[len(m.TextModel.positionCache)-1]
|
||||
m.TextModel.ropeDelta = last + 1 - int32(len(m.TextModel.positionCache))
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
|
||||
// Initial token embedding
|
||||
hiddenStates := m.TokenEmbedding.Forward(ctx, batch.Inputs).Duplicate(ctx)
|
||||
ctx.Forward(hiddenStates)
|
||||
|
||||
// Build position slices for M-RoPE
|
||||
positionSlice := func() [][]int32 {
|
||||
s := [][]int32{
|
||||
make([]int32, len(batch.Positions)), // temporal
|
||||
make([]int32, len(batch.Positions)), // height
|
||||
make([]int32, len(batch.Positions)), // width
|
||||
make([]int32, len(batch.Positions)), // unused (zeros)
|
||||
}
|
||||
for i, position := range batch.Positions {
|
||||
// Translate through position cache or continue sequence
|
||||
if position < int32(len(m.TextModel.positionCache)) {
|
||||
position = m.TextModel.positionCache[position]
|
||||
} else if len(m.TextModel.positionCache) > 0 {
|
||||
// Continue sequence after cached positions using ropeDelta
|
||||
position = position + m.TextModel.ropeDelta
|
||||
}
|
||||
|
||||
s[0][i] = position
|
||||
s[1][i] = position
|
||||
s[2][i] = position
|
||||
}
|
||||
return s
|
||||
}()
|
||||
|
||||
// Inject vision embeddings and adjust positions for image tokens
|
||||
for _, mi := range batch.Multimodal {
|
||||
img := mi.Multimodal[0].Tensor
|
||||
ctx.Forward(img.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), img.Dim(0)*img.Dim(1))))
|
||||
|
||||
if grid, ok := mi.Multimodal[0].Data.(*Grid); ok {
|
||||
w := grid.Width / m.VisionModel.spatialMergeSize
|
||||
for i := range img.Dim(1) {
|
||||
positionSlice[1][mi.Index+i] += int32(i / w)
|
||||
positionSlice[2][mi.Index+i] += int32(i % w)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
positions := ctx.Input().FromInts(slices.Concat(positionSlice...), len(positionSlice[0])*len(positionSlice))
|
||||
|
||||
// Process through transformer layers
|
||||
for i, layer := range m.TextModel.Layers {
|
||||
m.Cache.SetLayer(i)
|
||||
|
||||
var lastLayerOutputs ml.Tensor
|
||||
if i == len(m.TextModel.Layers)-1 {
|
||||
lastLayerOutputs = batch.Outputs
|
||||
}
|
||||
|
||||
hiddenStates = layer.Forward(ctx, hiddenStates, positions, lastLayerOutputs, m.Cache, m.TextModel.TextModelOptions)
|
||||
}
|
||||
|
||||
hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, m.TextModel.eps)
|
||||
return m.Output.Forward(ctx, hiddenStates), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
model.Register("glmocr", New)
|
||||
}
|
||||
180
model/models/glmocr/model_text.go
Normal file
180
model/models/glmocr/model_text.go
Normal file
@@ -0,0 +1,180 @@
|
||||
package glmocr
|
||||
|
||||
import (
|
||||
"math"
|
||||
|
||||
"github.com/ollama/ollama/fs"
|
||||
"github.com/ollama/ollama/kvcache"
|
||||
"github.com/ollama/ollama/ml"
|
||||
"github.com/ollama/ollama/ml/nn"
|
||||
"github.com/ollama/ollama/ml/nn/rope"
|
||||
)
|
||||
|
||||
type TextModelOptions struct {
|
||||
hiddenSize int
|
||||
numHeads int
|
||||
numKVHeads int
|
||||
headDim int
|
||||
rotaryDim int
|
||||
intermediateSize int
|
||||
eps float32
|
||||
ropeBase float32
|
||||
mropeSections []int
|
||||
}
|
||||
|
||||
func (o *TextModelOptions) applyMRoPE(ctx ml.Context, states, positions ml.Tensor) ml.Tensor {
|
||||
// GLM4 uses standard M-RoPE (not interleaved like Qwen3VL)
|
||||
// With 4 sections for [temporal, height, width, unused]
|
||||
return nn.RoPE(ctx, states, positions, o.rotaryDim, o.ropeBase, 1.0, rope.WithMRoPE(o.mropeSections))
|
||||
}
|
||||
|
||||
type TextSelfAttention struct {
|
||||
Query *nn.Linear `gguf:"attn_q"`
|
||||
Key *nn.Linear `gguf:"attn_k"`
|
||||
Value *nn.Linear `gguf:"attn_v"`
|
||||
Output *nn.Linear `gguf:"attn_out"`
|
||||
}
|
||||
|
||||
func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, cache kvcache.Cache, opts *TextModelOptions) ml.Tensor {
|
||||
batchSize := hiddenStates.Dim(1)
|
||||
|
||||
// Separate Q, K, V projections
|
||||
q := sa.Query.Forward(ctx, hiddenStates)
|
||||
k := sa.Key.Forward(ctx, hiddenStates)
|
||||
v := sa.Value.Forward(ctx, hiddenStates)
|
||||
|
||||
// Reshape for GQA
|
||||
q = q.Reshape(ctx, opts.headDim, opts.numHeads, batchSize)
|
||||
k = k.Reshape(ctx, opts.headDim, opts.numKVHeads, batchSize)
|
||||
v = v.Reshape(ctx, opts.headDim, opts.numKVHeads, batchSize)
|
||||
|
||||
// Apply M-RoPE (multi-resolution rotary position embeddings)
|
||||
q = opts.applyMRoPE(ctx, q, positions)
|
||||
k = opts.applyMRoPE(ctx, k, positions)
|
||||
|
||||
// Scaled dot-product attention with KV cache
|
||||
scaleFactor := 1.0 / math.Sqrt(float64(opts.headDim))
|
||||
kqv := nn.Attention(ctx, q, k, v, scaleFactor, cache)
|
||||
// Reshape attention output: [headDim, numHeads, batchSize] -> [numHeads*headDim, batchSize]
|
||||
// Note: numHeads * headDim = 16 * 128 = 2048, which is the attention hidden size
|
||||
kqv = kqv.Reshape(ctx, opts.numHeads*opts.headDim, batchSize)
|
||||
|
||||
return sa.Output.Forward(ctx, kqv)
|
||||
}
|
||||
|
||||
type TextMLP struct {
|
||||
Gate *nn.Linear `gguf:"ffn_gate"`
|
||||
Up *nn.Linear `gguf:"ffn_up"`
|
||||
Down *nn.Linear `gguf:"ffn_down"`
|
||||
}
|
||||
|
||||
func (mlp *TextMLP) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *TextModelOptions) ml.Tensor {
|
||||
// SwiGLU: down(silu(gate(x)) * up(x))
|
||||
gate := mlp.Gate.Forward(ctx, hiddenStates).SILU(ctx, mlp.Up.Forward(ctx, hiddenStates))
|
||||
return mlp.Down.Forward(ctx, gate)
|
||||
}
|
||||
|
||||
type TextDecoderLayer struct {
|
||||
// Input layernorm (before attention)
|
||||
AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
|
||||
SelfAttention *TextSelfAttention
|
||||
// Post self-attention layernorm (after attention, before residual add)
|
||||
PostAttnNorm *nn.RMSNorm `gguf:"post_attn_norm"`
|
||||
|
||||
// FFN input layernorm (after first residual, before MLP)
|
||||
FFNNorm *nn.RMSNorm `gguf:"ffn_norm"`
|
||||
MLP *TextMLP
|
||||
// Post MLP layernorm (after MLP, before residual add)
|
||||
PostFFNNorm *nn.RMSNorm `gguf:"post_ffn_norm"`
|
||||
}
|
||||
|
||||
func (l *TextDecoderLayer) Forward(ctx ml.Context, hiddenStates, positions, outputs ml.Tensor, cache kvcache.Cache, opts *TextModelOptions) ml.Tensor {
|
||||
// Attention block
|
||||
residual := hiddenStates
|
||||
hiddenStates = l.AttentionNorm.Forward(ctx, hiddenStates, opts.eps)
|
||||
hiddenStates = l.SelfAttention.Forward(ctx, hiddenStates, positions, cache, opts)
|
||||
hiddenStates = l.PostAttnNorm.Forward(ctx, hiddenStates, opts.eps)
|
||||
|
||||
// Prune to output positions in final layer
|
||||
if outputs != nil {
|
||||
hiddenStates = hiddenStates.Rows(ctx, outputs)
|
||||
residual = residual.Rows(ctx, outputs)
|
||||
}
|
||||
|
||||
hiddenStates = hiddenStates.Add(ctx, residual)
|
||||
|
||||
// MLP block
|
||||
residual = hiddenStates
|
||||
hiddenStates = l.FFNNorm.Forward(ctx, hiddenStates, opts.eps)
|
||||
hiddenStates = l.MLP.Forward(ctx, hiddenStates, opts)
|
||||
hiddenStates = l.PostFFNNorm.Forward(ctx, hiddenStates, opts.eps)
|
||||
hiddenStates = hiddenStates.Add(ctx, residual)
|
||||
|
||||
return hiddenStates
|
||||
}
|
||||
|
||||
type TextModel struct {
|
||||
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
|
||||
Layers []TextDecoderLayer `gguf:"blk"`
|
||||
OutputNorm *nn.RMSNorm `gguf:"output_norm"`
|
||||
Output *nn.Linear `gguf:"output,alt:token_embd"`
|
||||
|
||||
*TextModelOptions
|
||||
|
||||
// positionCache stores the M-RoPE position for each token in the sequence.
|
||||
// This is needed because image tokens share the same base position but have
|
||||
// different height/width offsets, and the end token position depends on the
|
||||
// image grid dimensions.
|
||||
positionCache []int32
|
||||
ropeDelta int32
|
||||
}
|
||||
|
||||
func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
|
||||
// Clear position cache when KV cache shifts
|
||||
m.positionCache = nil
|
||||
m.ropeDelta = 0
|
||||
return m.applyMRoPE(ctx, key, shift), nil
|
||||
}
|
||||
|
||||
func newTextModel(c fs.Config) *TextModel {
|
||||
hiddenSize := int(c.Uint("embedding_length", 1536))
|
||||
numHeads := int(c.Uint("attention.head_count", 16))
|
||||
numKVHeads := int(c.Uint("attention.head_count_kv", 8))
|
||||
intermediateSize := int(c.Uint("feed_forward_length", 4608))
|
||||
eps := c.Float("attention.layer_norm_rms_epsilon", 1e-5)
|
||||
ropeBase := c.Float("rope.freq_base", 10000)
|
||||
|
||||
headDim := int(c.Uint("attention.key_length", uint32(hiddenSize/numHeads)))
|
||||
|
||||
mropeSections := c.Ints("rope.mrope_section")
|
||||
var sectionInts []int
|
||||
|
||||
if len(mropeSections) > 0 {
|
||||
sectionInts = make([]int, len(mropeSections))
|
||||
for i, section := range mropeSections {
|
||||
sectionInts[i] = int(section)
|
||||
}
|
||||
} else {
|
||||
// Default: 3 sections like GLM-OCR
|
||||
sectionInts = []int{16, 24, 24}
|
||||
}
|
||||
|
||||
// rotaryDim = headDim (128) to rotate all dimensions
|
||||
// GGML rope_multi: sector = (dim_pair) % sum(sections), mapping each pair to its position dim
|
||||
rotaryDim := headDim
|
||||
|
||||
return &TextModel{
|
||||
Layers: make([]TextDecoderLayer, c.Uint("block_count", 16)),
|
||||
TextModelOptions: &TextModelOptions{
|
||||
hiddenSize: hiddenSize,
|
||||
numHeads: numHeads,
|
||||
numKVHeads: numKVHeads,
|
||||
headDim: headDim,
|
||||
rotaryDim: rotaryDim,
|
||||
intermediateSize: intermediateSize,
|
||||
eps: eps,
|
||||
ropeBase: ropeBase,
|
||||
mropeSections: sectionInts,
|
||||
},
|
||||
}
|
||||
}
|
||||
348
model/models/glmocr/model_vision.go
Normal file
348
model/models/glmocr/model_vision.go
Normal file
@@ -0,0 +1,348 @@
|
||||
package glmocr
|
||||
|
||||
import (
|
||||
"log/slog"
|
||||
"math"
|
||||
"slices"
|
||||
|
||||
"github.com/ollama/ollama/fs"
|
||||
"github.com/ollama/ollama/ml"
|
||||
"github.com/ollama/ollama/ml/nn"
|
||||
"github.com/ollama/ollama/ml/nn/rope"
|
||||
)
|
||||
|
||||
type Grid struct {
|
||||
Height int // Number of patches in height direction
|
||||
Width int // Number of patches in width direction
|
||||
Temporal int
|
||||
ImageHeight int // Full image height in pixels
|
||||
ImageWidth int // Full image width in pixels
|
||||
}
|
||||
|
||||
type VisionModelOptions struct {
|
||||
hiddenSize int
|
||||
numHeads int
|
||||
headDim int
|
||||
numChannels int
|
||||
patchSize int
|
||||
temporalPatchSize int
|
||||
imageSize int
|
||||
spatialMergeSize int
|
||||
outHiddenSize int
|
||||
intermediateSize int
|
||||
eps float32
|
||||
}
|
||||
|
||||
type VisionPatchEmbed struct {
|
||||
Proj *nn.Conv2D `gguf:"patch_embd_0"`
|
||||
Proj1 *nn.Conv2D `gguf:"patch_embd_1"`
|
||||
Bias ml.Tensor `gguf:"patch_embd.bias"`
|
||||
}
|
||||
|
||||
func (pe *VisionPatchEmbed) Forward(ctx ml.Context, pixelValues ml.Tensor, grid *Grid, opts *VisionModelOptions) ml.Tensor {
|
||||
_ = grid // patches are already in merge-block order
|
||||
|
||||
// pixelValues shape: [patchDim, numPatches]
|
||||
numPatches := pixelValues.Shape()[1]
|
||||
|
||||
// Reshape to [patchSize*patchSize, temporalPatchSize, numChannels, numPatches]
|
||||
pixelValues = pixelValues.Reshape(ctx, opts.patchSize*opts.patchSize, opts.temporalPatchSize, opts.numChannels, numPatches)
|
||||
// Permute to [temporalPatchSize, patchSize*patchSize, numChannels, numPatches]
|
||||
pixelValues = pixelValues.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
|
||||
|
||||
// Slice temporal frames for Conv2D (simulate Conv3D)
|
||||
in0 := pixelValues.View(ctx, 0, 1, pixelValues.Stride(1), pixelValues.Dim(1), pixelValues.Stride(2), pixelValues.Dim(2), pixelValues.Stride(3), pixelValues.Dim(3)).Contiguous(ctx)
|
||||
in0 = in0.Reshape(ctx, opts.patchSize, opts.patchSize, opts.numChannels, numPatches)
|
||||
|
||||
s0, s1 := opts.patchSize, opts.patchSize
|
||||
p0, p1 := 0, 0
|
||||
d0, d1 := 1, 1
|
||||
hiddenStates := pe.Proj.Forward(ctx, in0, s0, s1, p0, p1, d0, d1)
|
||||
|
||||
if pe.Proj1 != nil && opts.temporalPatchSize > 1 {
|
||||
in1 := pixelValues.View(ctx, pixelValues.Stride(0), 1, pixelValues.Stride(1), pixelValues.Dim(1), pixelValues.Stride(2), pixelValues.Dim(2), pixelValues.Stride(3), pixelValues.Dim(3)).Contiguous(ctx)
|
||||
in1 = in1.Reshape(ctx, opts.patchSize, opts.patchSize, opts.numChannels, numPatches)
|
||||
out1 := pe.Proj1.Forward(ctx, in1, s0, s1, p0, p1, d0, d1)
|
||||
hiddenStates = hiddenStates.Add(ctx, out1)
|
||||
}
|
||||
|
||||
// Flatten to [hidden_size, num_patches]
|
||||
hiddenStates = hiddenStates.Reshape(ctx, opts.hiddenSize, numPatches)
|
||||
|
||||
// Add patch bias - reshape from [hidden_size] to [hidden_size, 1] for broadcasting
|
||||
if pe.Bias != nil {
|
||||
hiddenStates = hiddenStates.Add(ctx, pe.Bias.Reshape(ctx, opts.hiddenSize, 1))
|
||||
}
|
||||
|
||||
return hiddenStates
|
||||
}
|
||||
|
||||
type VisionSelfAttention struct {
|
||||
QKV *nn.Linear `gguf:"attn_qkv"`
|
||||
QNorm *nn.RMSNorm `gguf:"attn_q_norm"`
|
||||
KNorm *nn.RMSNorm `gguf:"attn_k_norm"`
|
||||
Output *nn.Linear `gguf:"attn_out"`
|
||||
}
|
||||
|
||||
func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, opts *VisionModelOptions) ml.Tensor {
|
||||
batchSize := hiddenStates.Dim(1)
|
||||
|
||||
// Combined QKV projection: [3*hidden_size, batch_size]
|
||||
qkv := sa.QKV.Forward(ctx, hiddenStates)
|
||||
|
||||
// Split using ChunkSections along dim 0 (handles byte offsets correctly)
|
||||
// ChunkSections returns views - must make contiguous before further operations
|
||||
chunks := qkv.ChunkSections(ctx, 0, opts.hiddenSize, opts.hiddenSize, opts.hiddenSize)
|
||||
q := chunks[0].Contiguous(ctx)
|
||||
k := chunks[1].Contiguous(ctx)
|
||||
v := chunks[2].Contiguous(ctx)
|
||||
|
||||
// Reshape for multi-head attention: [hiddenSize, N] -> [headDim, numHeads, N]
|
||||
q = q.Reshape(ctx, opts.headDim, opts.numHeads, batchSize)
|
||||
k = k.Reshape(ctx, opts.headDim, opts.numHeads, batchSize)
|
||||
v = v.Reshape(ctx, opts.headDim, opts.numHeads, batchSize)
|
||||
|
||||
// Apply Q-norm and K-norm after head reshape
|
||||
// Weights are [headDim]=64, tensor is [headDim, numHeads, N]
|
||||
q = sa.QNorm.Forward(ctx, q, opts.eps)
|
||||
k = sa.KNorm.Forward(ctx, k, opts.eps)
|
||||
|
||||
// Apply rotary position embeddings with vision-style 2D positions
|
||||
// Each section of headDim/4 pairs is assigned to one position dimension
|
||||
// Positions are [height, width, height, width] repeated for rotation
|
||||
ropeFreqBase := float32(10000.0)
|
||||
sections := []int{opts.headDim / 4, opts.headDim / 4, opts.headDim / 4, opts.headDim / 4}
|
||||
q = nn.RoPE(ctx, q, positions, opts.headDim/2, ropeFreqBase, 1.0, rope.WithVision(sections))
|
||||
k = nn.RoPE(ctx, k, positions, opts.headDim/2, ropeFreqBase, 1.0, rope.WithVision(sections))
|
||||
|
||||
// Scale factor for scaled dot-product attention
|
||||
scale := 1.0 / math.Sqrt(float64(opts.headDim))
|
||||
|
||||
// Try flash attention first (ScaledDotProductAttention), fall back to manual
|
||||
if sdpa, ok := q.(ml.ScaledDotProductAttention); ok {
|
||||
attention := sdpa.ScaledDotProductAttention(ctx, k, v, nil, nil, nil, scale, false)
|
||||
attention = attention.Reshape(ctx, opts.hiddenSize, batchSize)
|
||||
return sa.Output.Forward(ctx, attention)
|
||||
}
|
||||
|
||||
slog.Warn("glmocr: vision attention falling back to manual attention",
|
||||
"batchSize", batchSize, "numHeads", opts.numHeads,
|
||||
"hint", "set OLLAMA_FLASH_ATTENTION=1 to enable flash attention")
|
||||
|
||||
// Manual attention fallback
|
||||
// q, k, v are [headDim, numHeads, batchSize] - GGML treats as 4D with implicit dim 3 = 1
|
||||
q = q.Permute(ctx, 0, 2, 1, 3)
|
||||
k = k.Permute(ctx, 0, 2, 1, 3)
|
||||
v = v.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
|
||||
|
||||
// Attention scores
|
||||
kq := k.MulmatFullPrec(ctx, q)
|
||||
kq = kq.Scale(ctx, scale)
|
||||
kq = kq.Softmax(ctx)
|
||||
|
||||
// Attention output: v @ kq (note: v first)
|
||||
kqv := v.Mulmat(ctx, kq)
|
||||
attention := kqv.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
|
||||
attention = attention.Reshape(ctx, opts.hiddenSize, batchSize)
|
||||
|
||||
return sa.Output.Forward(ctx, attention)
|
||||
}
|
||||
|
||||
type VisionMLP struct {
|
||||
Gate *nn.Linear `gguf:"ffn_gate"`
|
||||
Up *nn.Linear `gguf:"ffn_up"`
|
||||
Down *nn.Linear `gguf:"ffn_down"`
|
||||
}
|
||||
|
||||
func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenStates ml.Tensor) ml.Tensor {
|
||||
// SwiGLU: down(silu(gate(x)) * up(x))
|
||||
gate := mlp.Gate.Forward(ctx, hiddenStates).SILU(ctx, mlp.Up.Forward(ctx, hiddenStates))
|
||||
return mlp.Down.Forward(ctx, gate)
|
||||
}
|
||||
|
||||
type VisionBlock struct {
|
||||
Norm1 *nn.RMSNorm `gguf:"ln1"`
|
||||
SelfAttention *VisionSelfAttention
|
||||
Norm2 *nn.RMSNorm `gguf:"ln2"`
|
||||
MLP *VisionMLP
|
||||
}
|
||||
|
||||
func (b *VisionBlock) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, opts *VisionModelOptions) ml.Tensor {
|
||||
// Pre-norm architecture
|
||||
residual := hiddenStates
|
||||
hiddenStates = b.Norm1.Forward(ctx, hiddenStates, opts.eps)
|
||||
hiddenStates = b.SelfAttention.Forward(ctx, hiddenStates, positions, opts)
|
||||
hiddenStates = hiddenStates.Add(ctx, residual)
|
||||
|
||||
residual = hiddenStates
|
||||
hiddenStates = b.Norm2.Forward(ctx, hiddenStates, opts.eps)
|
||||
hiddenStates = b.MLP.Forward(ctx, hiddenStates)
|
||||
hiddenStates = hiddenStates.Add(ctx, residual)
|
||||
|
||||
return hiddenStates
|
||||
}
|
||||
|
||||
type VisionDownsample struct {
|
||||
*nn.Conv2D // Embedded to get mm.patch_merger.weight/bias directly
|
||||
}
|
||||
|
||||
func (d *VisionDownsample) Forward(ctx ml.Context, hiddenStates ml.Tensor, grid *Grid, opts *VisionModelOptions) ml.Tensor {
|
||||
// Apply spatial downsampling via Conv2D
|
||||
// Input: [hidden_size, num_patches] where patches are in merge-block order
|
||||
|
||||
if d.Conv2D == nil || d.Weight == nil {
|
||||
panic("VisionDownsample weights not loaded")
|
||||
}
|
||||
|
||||
merge := opts.spatialMergeSize
|
||||
numOutputTokens := (grid.Height / merge) * (grid.Width / merge)
|
||||
|
||||
// Step 1: Reshape to [hidden_size, merge, merge, num_output_tokens]
|
||||
hiddenStates = hiddenStates.Reshape(ctx, opts.hiddenSize, merge, merge, numOutputTokens)
|
||||
|
||||
// Step 2: Permute to [merge, merge, hidden_size, num_output_tokens]
|
||||
// ggml semantics: result.ne[perm[i]] = input.ne[i]
|
||||
// So permute(2,0,1,3) on [1024,2,2,N] gives: ne[2]=1024, ne[0]=2, ne[1]=2, ne[3]=N -> [2,2,1024,N]
|
||||
hiddenStates = hiddenStates.Permute(ctx, 2, 0, 1, 3).Contiguous(ctx)
|
||||
|
||||
// Step 3: Apply Conv2D without bias (bias added after reshape)
|
||||
// Note: ggml_conv_2d takes (kernel, input) - kernel must be receiver in ollama
|
||||
s0, s1 := merge, merge
|
||||
p0, p1 := 0, 0
|
||||
d0, d1 := 1, 1
|
||||
hiddenStates = d.Weight.Conv2D(ctx, hiddenStates, s0, s1, p0, p1, d0, d1)
|
||||
|
||||
// Step 4: Reshape to [out_hidden_size, num_output_tokens]
|
||||
hiddenStates = hiddenStates.Reshape(ctx, opts.outHiddenSize, numOutputTokens)
|
||||
|
||||
// Step 5: Add bias after reshape
|
||||
// Reshape bias from [out_hidden_size] to [out_hidden_size, 1] for proper broadcasting
|
||||
if d.Bias != nil {
|
||||
hiddenStates = hiddenStates.Add(ctx, d.Bias.Reshape(ctx, opts.outHiddenSize, 1))
|
||||
}
|
||||
|
||||
return hiddenStates
|
||||
}
|
||||
|
||||
type PatchMerger struct {
|
||||
// GGUF tags align with mm.* keys used by the model
|
||||
Proj *nn.Linear `gguf:"model.fc"` // mm.model.fc.weight
|
||||
PostLN *nn.LayerNorm `gguf:"post_norm"` // mm.post_norm.weight/bias
|
||||
GateProj *nn.Linear `gguf:"gate"` // mm.gate.weight
|
||||
UpProj *nn.Linear `gguf:"up"` // mm.up.weight
|
||||
DownProj *nn.Linear `gguf:"down"` // mm.down.weight
|
||||
}
|
||||
|
||||
func (m *PatchMerger) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *VisionModelOptions) ml.Tensor {
|
||||
// Linear projection
|
||||
hiddenStates = m.Proj.Forward(ctx, hiddenStates)
|
||||
|
||||
// Post-projection layer norm + GELU ERF
|
||||
hiddenStates = m.PostLN.Forward(ctx, hiddenStates, opts.eps)
|
||||
hiddenStates = hiddenStates.GELU_ERF(ctx)
|
||||
// Force a copy to avoid in-place mutation issues with GELU_ERF
|
||||
hiddenStates = hiddenStates.Contiguous(ctx)
|
||||
|
||||
// SwiGLU MLP: down(silu(gate(x)) * up(x))
|
||||
gateOut := m.GateProj.Forward(ctx, hiddenStates)
|
||||
upOut := m.UpProj.Forward(ctx, hiddenStates)
|
||||
gate := gateOut.SILU(ctx, upOut)
|
||||
return m.DownProj.Forward(ctx, gate)
|
||||
}
|
||||
|
||||
type VisionModel struct {
|
||||
PatchEmbed *VisionPatchEmbed
|
||||
Blocks []VisionBlock `gguf:"blk"`
|
||||
PostLN *nn.RMSNorm `gguf:"post_ln"`
|
||||
// Note: Downsample is applied at the model level so mm.patch_merger stays separate
|
||||
|
||||
*VisionModelOptions
|
||||
}
|
||||
|
||||
func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor, grid *Grid) ml.Tensor {
|
||||
// Extract patch embeddings from flattened patches
|
||||
hiddenStates := m.PatchEmbed.Forward(ctx, pixelValues, grid, m.VisionModelOptions)
|
||||
|
||||
// Create position IDs for RoPE (spatial grid)
|
||||
// Patches are already in merge-block order from preprocessing
|
||||
positions := m.createPositions(ctx, grid)
|
||||
|
||||
// Process through vision blocks
|
||||
for _, block := range m.Blocks {
|
||||
hiddenStates = block.Forward(ctx, hiddenStates, positions, m.VisionModelOptions)
|
||||
}
|
||||
|
||||
// Post-layernorm
|
||||
hiddenStates = m.PostLN.Forward(ctx, hiddenStates, m.eps)
|
||||
|
||||
// Note: Downsample is now applied separately in Model.EncodeMultimodal
|
||||
// so mm.patch_merger remains a distinct module
|
||||
|
||||
return hiddenStates
|
||||
}
|
||||
|
||||
func (m *VisionModel) createPositions(ctx ml.Context, grid *Grid) ml.Tensor {
|
||||
// Create spatial position IDs for vision RoPE
|
||||
// Position layout: [height, width, height, width] - 4 sections for mrope
|
||||
// Patches are in MERGE-BLOCK order after VisionPatchEmbed interleaving
|
||||
// This follows the GLM-OCR rot_pos_emb layout
|
||||
numPatches := grid.Height * grid.Width
|
||||
mergeRatio := m.spatialMergeSize
|
||||
|
||||
// Build position arrays in merge-block order
|
||||
// Each merge_ratio x merge_ratio block of patches is grouped together
|
||||
hpos := make([]int32, numPatches)
|
||||
wpos := make([]int32, numPatches)
|
||||
ptr := 0
|
||||
for y := 0; y < grid.Height; y += mergeRatio {
|
||||
for x := 0; x < grid.Width; x += mergeRatio {
|
||||
for dy := range mergeRatio {
|
||||
for dx := range mergeRatio {
|
||||
hpos[ptr] = int32(y + dy)
|
||||
wpos[ptr] = int32(x + dx)
|
||||
ptr++
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Build position arrays for 4 sections (mrope)
|
||||
s := [][]int32{
|
||||
hpos, // Section 0: height
|
||||
wpos, // Section 1: width
|
||||
slices.Clone(hpos), // Section 2: height (repeated)
|
||||
slices.Clone(wpos), // Section 3: width (repeated)
|
||||
}
|
||||
|
||||
return ctx.Input().FromInts(slices.Concat(s...), numPatches*4)
|
||||
}
|
||||
|
||||
func newVisionModel(c fs.Config) *VisionModel {
|
||||
hiddenSize := int(c.Uint("vision.embedding_length", 1024))
|
||||
numHeads := int(c.Uint("vision.attention.head_count", 16))
|
||||
numChannels := int(c.Uint("vision.num_channels", 3))
|
||||
patchSize := int(c.Uint("vision.patch_size", 14))
|
||||
temporalPatchSize := int(c.Uint("vision.temporal_patch_size", 2))
|
||||
imageSize := int(c.Uint("vision.image_size", 336))
|
||||
spatialMergeSize := int(c.Uint("vision.spatial_merge_size", 2))
|
||||
outHiddenSize := int(c.Uint("vision.out_hidden_size", 1536))
|
||||
intermediateSize := int(c.Uint("vision.intermediate_size", 4096))
|
||||
eps := c.Float("vision.attention.layer_norm_rms_epsilon", 1e-5)
|
||||
|
||||
return &VisionModel{
|
||||
Blocks: make([]VisionBlock, c.Uint("vision.block_count", 24)),
|
||||
VisionModelOptions: &VisionModelOptions{
|
||||
hiddenSize: hiddenSize,
|
||||
numHeads: numHeads,
|
||||
headDim: hiddenSize / numHeads,
|
||||
numChannels: numChannels,
|
||||
patchSize: patchSize,
|
||||
temporalPatchSize: temporalPatchSize,
|
||||
imageSize: imageSize,
|
||||
spatialMergeSize: spatialMergeSize,
|
||||
outHiddenSize: outHiddenSize,
|
||||
intermediateSize: intermediateSize,
|
||||
eps: eps,
|
||||
},
|
||||
}
|
||||
}
|
||||
@@ -8,6 +8,7 @@ import (
|
||||
_ "github.com/ollama/ollama/model/models/gemma3"
|
||||
_ "github.com/ollama/ollama/model/models/gemma3n"
|
||||
_ "github.com/ollama/ollama/model/models/glm4moelite"
|
||||
_ "github.com/ollama/ollama/model/models/glmocr"
|
||||
_ "github.com/ollama/ollama/model/models/gptoss"
|
||||
_ "github.com/ollama/ollama/model/models/lfm2"
|
||||
_ "github.com/ollama/ollama/model/models/llama"
|
||||
|
||||
19
model/parsers/glmocr.go
Normal file
19
model/parsers/glmocr.go
Normal file
@@ -0,0 +1,19 @@
|
||||
package parsers
|
||||
|
||||
import "github.com/ollama/ollama/api"
|
||||
|
||||
type GlmOcrParser struct {
|
||||
GLM47Parser
|
||||
}
|
||||
|
||||
func (p *GlmOcrParser) HasThinkingSupport() bool {
|
||||
return false
|
||||
}
|
||||
|
||||
func (p *GlmOcrParser) Init(tools []api.Tool, lastMessage *api.Message, thinkValue *api.ThinkValue) []api.Tool {
|
||||
p.tools = tools
|
||||
if thinkValue != nil && thinkValue.Bool() {
|
||||
p.state = glm46ParserState_CollectingThinking
|
||||
}
|
||||
return tools
|
||||
}
|
||||
@@ -70,6 +70,8 @@ func ParserForName(name string) Parser {
|
||||
return &FunctionGemmaParser{}
|
||||
case "glm-4.7":
|
||||
return &GLM47Parser{}
|
||||
case "glm-ocr":
|
||||
return &GlmOcrParser{}
|
||||
case "lfm2":
|
||||
return &LFM2Parser{hasThinkingSupport: false}
|
||||
case "lfm2-thinking":
|
||||
|
||||
109
model/renderers/glmocr.go
Normal file
109
model/renderers/glmocr.go
Normal file
@@ -0,0 +1,109 @@
|
||||
package renderers
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
)
|
||||
|
||||
type GlmOcrRenderer struct{}
|
||||
|
||||
func (r *GlmOcrRenderer) Render(messages []api.Message, tools []api.Tool, thinkValue *api.ThinkValue) (string, error) {
|
||||
var sb strings.Builder
|
||||
|
||||
sb.WriteString("[gMASK]<sop>")
|
||||
|
||||
if len(tools) > 0 {
|
||||
sb.WriteString("<|system|>\n")
|
||||
sb.WriteString("# Tools\n\n")
|
||||
sb.WriteString("You may call one or more functions to assist with the user query.\n\n")
|
||||
sb.WriteString("You are provided with function signatures within <tools></tools> XML tags:\n")
|
||||
sb.WriteString("<tools>\n")
|
||||
for _, tool := range tools {
|
||||
d, _ := json.Marshal(tool)
|
||||
sb.WriteString(formatGLM47ToolJSON(d))
|
||||
sb.WriteString("\n")
|
||||
}
|
||||
sb.WriteString("</tools>\n\n")
|
||||
sb.WriteString("For each function call, output the function name and arguments within the following XML format:\n")
|
||||
sb.WriteString("<tool_call>{function-name}<arg_key>{arg-key-1}</arg_key><arg_value>{arg-value-1}</arg_value><arg_key>{arg-key-2}</arg_key><arg_value>{arg-value-2}</arg_value>...</tool_call>")
|
||||
}
|
||||
|
||||
enableThinking := false
|
||||
thinkingExplicitlySet := false
|
||||
if thinkValue != nil {
|
||||
enableThinking = thinkValue.Bool()
|
||||
thinkingExplicitlySet = true
|
||||
}
|
||||
|
||||
for i, message := range messages {
|
||||
switch message.Role {
|
||||
case "user":
|
||||
sb.WriteString("<|user|>\n")
|
||||
sb.WriteString(message.Content)
|
||||
if thinkingExplicitlySet && !enableThinking && !strings.HasSuffix(message.Content, "/nothink") {
|
||||
sb.WriteString("/nothink")
|
||||
}
|
||||
case "assistant":
|
||||
sb.WriteString("<|assistant|>\n")
|
||||
if message.Thinking != "" {
|
||||
sb.WriteString("<think>" + strings.TrimSpace(message.Thinking) + "</think>")
|
||||
} else {
|
||||
sb.WriteString("<think></think>")
|
||||
}
|
||||
if message.Content != "" {
|
||||
sb.WriteString("\n" + strings.TrimSpace(message.Content))
|
||||
}
|
||||
if len(message.ToolCalls) > 0 {
|
||||
for _, toolCall := range message.ToolCalls {
|
||||
sb.WriteString("\n<tool_call>" + toolCall.Function.Name)
|
||||
sb.WriteString(renderGlmOcrToolArguments(toolCall.Function.Arguments))
|
||||
sb.WriteString("</tool_call>")
|
||||
}
|
||||
}
|
||||
sb.WriteString("\n")
|
||||
case "tool":
|
||||
if i == 0 || messages[i-1].Role != "tool" {
|
||||
sb.WriteString("<|observation|>")
|
||||
}
|
||||
sb.WriteString("\n<tool_response>\n")
|
||||
sb.WriteString(message.Content)
|
||||
sb.WriteString("\n</tool_response>\n")
|
||||
case "system":
|
||||
sb.WriteString("<|system|>\n")
|
||||
sb.WriteString(message.Content)
|
||||
sb.WriteString("\n")
|
||||
}
|
||||
}
|
||||
|
||||
sb.WriteString("<|assistant|>\n")
|
||||
if thinkingExplicitlySet && !enableThinking {
|
||||
sb.WriteString("<think></think>\n")
|
||||
}
|
||||
|
||||
return sb.String(), nil
|
||||
}
|
||||
|
||||
func renderGlmOcrToolArguments(args api.ToolCallFunctionArguments) string {
|
||||
var sb strings.Builder
|
||||
for key, value := range args.All() {
|
||||
sb.WriteString("<arg_key>" + key + "</arg_key>")
|
||||
var valueStr string
|
||||
if str, ok := value.(string); ok {
|
||||
valueStr = str
|
||||
} else {
|
||||
jsonBytes, err := json.Marshal(value)
|
||||
if err != nil {
|
||||
valueStr = fmt.Sprintf("%v", value)
|
||||
} else {
|
||||
valueStr = string(jsonBytes)
|
||||
}
|
||||
}
|
||||
|
||||
sb.WriteString("<arg_value>" + valueStr + "</arg_value>")
|
||||
}
|
||||
|
||||
return sb.String()
|
||||
}
|
||||
@@ -82,6 +82,8 @@ func rendererForName(name string) Renderer {
|
||||
return &FunctionGemmaRenderer{}
|
||||
case "glm-4.7":
|
||||
return &GLM47Renderer{}
|
||||
case "glm-ocr":
|
||||
return &GlmOcrRenderer{}
|
||||
case "lfm2":
|
||||
return &LFM2Renderer{IsThinking: false}
|
||||
case "lfm2-thinking":
|
||||
|
||||
Reference in New Issue
Block a user