Compare commits

...

6 Commits

Author SHA1 Message Date
jmorganca
f201b7d258 lint 2026-02-02 00:26:43 -08:00
jmorganca
6f5b814b86 lint 2026-02-02 00:07:52 -08:00
jmorganca
79c00a1b16 lint 2026-02-01 22:54:12 -08:00
jmorganca
9eded5fddb glm working 2026-02-01 22:24:32 -08:00
jmorganca
2626ec7772 glm wip 2026-01-26 14:07:21 -08:00
jmorganca
f408e0ff5e glm wip 2026-01-26 13:25:56 -08:00
17 changed files with 1551 additions and 1 deletions

View File

@@ -313,6 +313,8 @@ func LoadModelMetadata(fsys fs.FS) (ModelKV, *Tokenizer, error) {
conv = &deepseek2Model{}
case "Glm4MoeLiteForCausalLM":
conv = &glm4MoeLiteModel{}
case "GlmOcrForConditionalGeneration":
conv = &glmOcrModel{}
case "Lfm2ForCausalLM":
conv = &lfm2Model{}
default:

469
convert/convert_glmocr.go Normal file
View File

@@ -0,0 +1,469 @@
package convert
import (
"cmp"
"encoding/json"
"io/fs"
"log/slog"
"regexp"
"strconv"
"strings"
"github.com/ollama/ollama/fs/ggml"
"github.com/pdevine/tensor"
"github.com/pdevine/tensor/native"
)
// normalToNeoXRepacker creates a repacker that permutes Q/K weights from interleaved (LLaMA)
// to NeoX ordering for compatibility with GGML's M-RoPE kernel.
//
// For weights: reshape [out, in] -> [n_heads, head_dim, in], permute rotary dims, reshape back
// For biases: reshape [out] -> [n_heads, head_dim], permute rotary dims, reshape back
func normalToNeoXRepacker(nHeads, headDim int, partialRotaryFactor float32) func(string, []float32, []uint64) ([]float32, error) {
return func(_ string, data []float32, shape []uint64) ([]float32, error) {
rotaryDim := int(float32(headDim) * partialRotaryFactor)
if rotaryDim%2 != 0 {
rotaryDim = (rotaryDim / 2) * 2 // Round down to even
}
// Handle 1D (bias) or 2D (weight) tensors
is1D := len(shape) == 1
var inFeatures int
if is1D {
inFeatures = 1
} else {
inFeatures = int(shape[1])
}
outFeatures := int(shape[0])
nEffectiveHeads := outFeatures / headDim
if nEffectiveHeads != nHeads {
slog.Warn("normalToNeoX: unexpected head count", "effective", nEffectiveHeads, "expected", nHeads)
}
// Reshape to [n_heads, head_dim, in_features]
reshaped := make([]float32, len(data))
copy(reshaped, data)
// Permute the rotary dimensions: even indices first, then odd
// For each head, reorder [0,1,2,3,4,5...] to [0,2,4...,1,3,5...]
result := make([]float32, len(data))
halfRotary := rotaryDim / 2
for h := range nEffectiveHeads {
for f := range inFeatures {
for i := range halfRotary {
// Even dim (0, 2, 4, ...) -> position i
srcIdx := h*headDim*inFeatures + (2*i)*inFeatures + f
dstIdx := h*headDim*inFeatures + i*inFeatures + f
result[dstIdx] = reshaped[srcIdx]
// Odd dim (1, 3, 5, ...) -> position halfRotary + i
srcIdx = h*headDim*inFeatures + (2*i+1)*inFeatures + f
dstIdx = h*headDim*inFeatures + (halfRotary+i)*inFeatures + f
result[dstIdx] = reshaped[srcIdx]
}
// Non-rotary part: copy as-is
for i := rotaryDim; i < headDim; i++ {
srcIdx := h*headDim*inFeatures + i*inFeatures + f
result[srcIdx] = reshaped[srcIdx]
}
}
}
return result, nil
}
}
type glmOcrModel struct {
ModelParameters
TextConfig struct {
HiddenSize uint32 `json:"hidden_size"`
IntermediateSize uint32 `json:"intermediate_size"`
NumHiddenLayers uint32 `json:"num_hidden_layers"`
NumAttentionHeads uint32 `json:"num_attention_heads"`
NumKeyValueHeads uint32 `json:"num_key_value_heads"`
HeadDim uint32 `json:"head_dim"`
MaxPositionEmbed uint32 `json:"max_position_embeddings"`
RMSNormEps float32 `json:"rms_norm_eps"`
PartialRotaryFactor float32 `json:"partial_rotary_factor"`
RopeParameters struct {
RopeType string `json:"rope_type"`
MRopeSection []int32 `json:"mrope_section"`
RopeTheta float32 `json:"rope_theta"`
PartialRotaryFactor float32 `json:"partial_rotary_factor"`
} `json:"rope_parameters"`
} `json:"text_config"`
VisionConfig struct {
HiddenSize uint32 `json:"hidden_size"`
IntermediateSize uint32 `json:"intermediate_size"`
Depth uint32 `json:"depth"`
NumHeads uint32 `json:"num_heads"`
ImageSize uint32 `json:"image_size"`
PatchSize uint32 `json:"patch_size"`
OutHiddenSize uint32 `json:"out_hidden_size"`
RMSNormEps float32 `json:"rms_norm_eps"`
SpatialMergeSize uint32 `json:"spatial_merge_size"`
TemporalPatchSize uint32 `json:"temporal_patch_size"`
} `json:"vision_config"`
ImageStartTokenID uint32 `json:"image_start_token_id"`
ImageEndTokenID uint32 `json:"image_end_token_id"`
VideoStartTokenID uint32 `json:"video_start_token_id"`
VideoEndTokenID uint32 `json:"video_end_token_id"`
ImageTokenID uint32 `json:"image_token_id"`
VideoTokenID uint32 `json:"video_token_id"`
// Preprocessor config (preprocessor_config.json)
Preprocessor struct {
Size struct {
ShortestEdge uint32 `json:"shortest_edge"`
LongestEdge uint32 `json:"longest_edge"`
} `json:"size"`
PatchSize uint32 `json:"patch_size"`
TemporalPatchSize uint32 `json:"temporal_patch_size"`
MergeSize uint32 `json:"merge_size"`
ImageMean []float32 `json:"image_mean"`
ImageStd []float32 `json:"image_std"`
} `json:"-"`
}
var _ ModelConverter = (*glmOcrModel)(nil)
func (m *glmOcrModel) parseMore(fsys fs.FS) error {
bts, err := fs.ReadFile(fsys, "preprocessor_config.json")
if err != nil {
return err
}
return json.Unmarshal(bts, &m.Preprocessor)
}
func (m *glmOcrModel) KV(t *Tokenizer) KV {
kv := m.ModelParameters.KV(t)
kv["general.architecture"] = "glmocr"
// Text model parameters
kv["glmocr.block_count"] = cmp.Or(m.TextConfig.NumHiddenLayers, 16)
kv["glmocr.embedding_length"] = cmp.Or(m.TextConfig.HiddenSize, 1536)
kv["glmocr.attention.head_count"] = cmp.Or(m.TextConfig.NumAttentionHeads, 16)
kv["glmocr.attention.head_count_kv"] = cmp.Or(m.TextConfig.NumKeyValueHeads, 8)
headDim := cmp.Or(m.TextConfig.HeadDim, m.TextConfig.HiddenSize/m.TextConfig.NumAttentionHeads)
kv["glmocr.attention.key_length"] = headDim
kv["glmocr.attention.value_length"] = headDim
kv["glmocr.feed_forward_length"] = cmp.Or(m.TextConfig.IntermediateSize, 4608)
kv["glmocr.attention.layer_norm_rms_epsilon"] = cmp.Or(m.TextConfig.RMSNormEps, 1e-5)
kv["glmocr.context_length"] = cmp.Or(m.TextConfig.MaxPositionEmbed, 131072)
kv["glmocr.rope.freq_base"] = cmp.Or(m.TextConfig.RopeParameters.RopeTheta, float32(10000))
kv["glmocr.rope.partial_rotary_factor"] = cmp.Or(m.TextConfig.RopeParameters.PartialRotaryFactor, m.TextConfig.PartialRotaryFactor, float32(1.0))
if len(m.TextConfig.RopeParameters.MRopeSection) > 0 {
kv["glmocr.rope.mrope_section"] = m.TextConfig.RopeParameters.MRopeSection
}
// Vision model parameters
kv["glmocr.vision.block_count"] = cmp.Or(m.VisionConfig.Depth, 24)
kv["glmocr.vision.embedding_length"] = cmp.Or(m.VisionConfig.HiddenSize, 1024)
kv["glmocr.vision.attention.head_count"] = cmp.Or(m.VisionConfig.NumHeads, 16)
kv["glmocr.vision.image_size"] = cmp.Or(m.VisionConfig.ImageSize, 336)
kv["glmocr.vision.patch_size"] = cmp.Or(m.VisionConfig.PatchSize, m.Preprocessor.PatchSize, 14)
kv["glmocr.vision.spatial_merge_size"] = cmp.Or(m.VisionConfig.SpatialMergeSize, m.Preprocessor.MergeSize, 2)
kv["glmocr.vision.temporal_patch_size"] = cmp.Or(m.VisionConfig.TemporalPatchSize, m.Preprocessor.TemporalPatchSize, 2)
kv["glmocr.vision.out_hidden_size"] = cmp.Or(m.VisionConfig.OutHiddenSize, 1536)
kv["glmocr.vision.intermediate_size"] = cmp.Or(m.VisionConfig.IntermediateSize, 4096)
kv["glmocr.vision.attention.layer_norm_rms_epsilon"] = cmp.Or(m.VisionConfig.RMSNormEps, 1e-5)
// Preprocessor-derived image settings (min/max pixels and normalization)
// Note: fs.Config.keyValue() auto-prepends architecture prefix, so use full key
if m.Preprocessor.Size.ShortestEdge > 0 {
kv["glmocr.vision.min_pixels"] = m.Preprocessor.Size.ShortestEdge
}
if m.Preprocessor.Size.LongestEdge > 0 {
kv["glmocr.vision.max_pixels"] = m.Preprocessor.Size.LongestEdge
}
if len(m.Preprocessor.ImageMean) == 3 {
kv["glmocr.vision.image_mean"] = m.Preprocessor.ImageMean
}
if len(m.Preprocessor.ImageStd) == 3 {
kv["glmocr.vision.image_std"] = m.Preprocessor.ImageStd
}
// Special tokens
kv["glmocr.image_token_id"] = m.ImageTokenID
kv["glmocr.image_start_token_id"] = m.ImageStartTokenID
kv["glmocr.image_end_token_id"] = m.ImageEndTokenID
kv["glmocr.video_token_id"] = m.VideoTokenID
kv["glmocr.video_start_token_id"] = m.VideoStartTokenID
kv["glmocr.video_end_token_id"] = m.VideoEndTokenID
return kv
}
func (m *glmOcrModel) Tensors(ts []Tensor) []*ggml.Tensor {
var out []*ggml.Tensor
// Skip layers >= num_hidden_layers (Multi-Token Prediction layers not needed for basic inference)
numLayers := int(cmp.Or(m.TextConfig.NumHiddenLayers, 16))
skipLayer := func(name string) bool {
// Tensor names are already replaced to "blk.N.xxx" format
re := regexp.MustCompile(`^blk\.(\d+)`)
matches := re.FindStringSubmatch(name)
if matches == nil {
return false
}
blkNum, err := strconv.Atoi(matches[1])
if err != nil {
return false
}
return blkNum >= numLayers
}
for _, t := range ts {
name := t.Name()
// Skip next-n prediction layers (layers >= num_hidden_layers)
if skipLayer(name) {
continue
}
// Split ffn_gate_up into separate gate and up projections
if strings.Contains(name, "ffn_gate_up") {
for t := range splitDim(t, 0,
split{Replacer: strings.NewReplacer("ffn_gate_up", "ffn_gate")},
split{Replacer: strings.NewReplacer("ffn_gate_up", "ffn_up")},
) {
out = append(out, t)
}
continue
}
// Split 5D Conv3D patch_embed weight into two Conv2D weights along temporal dimension
// Shape: [out_channels, in_channels, temporal=2, height, width] -> 2x [out_channels, in_channels, height, width]
// NOTE: Tensor names are already renamed via Replacements() before Tensors() is called,
// so we check for "patch_embd" (renamed) not "patch_embed" (original safetensors name)
// NOTE: Ollama Conv2D expects PyTorch format [OC, IC, KH, KW] - no transpose needed
if strings.HasSuffix(name, "patch_embd.weight") {
shape := t.Shape()
if len(shape) == 5 && shape[2] == 2 {
// Original shape: [OC, IC, 2, KH, KW] -> [OC, IC, KH, KW] (PyTorch format, no transpose)
newShape := []uint64{shape[0], shape[1], shape[3], shape[4]}
// Create repacker for first temporal slice (t=0)
t0 := t.Clone()
t0.SetRepacker(func(_ string, data []float32, shape []uint64) ([]float32, error) {
dims := make([]int, len(shape))
for i := range shape {
dims[i] = int(shape[i])
}
var tt tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
// Slice first temporal frame: [:, :, 0, :, :]
tt, err := tt.Slice(nil, nil, tensor.S(0, 1), nil, nil)
if err != nil {
return nil, err
}
tt = tensor.Materialize(tt)
// Reshape to 4D by squeezing temporal dim [OC, IC, 1, KH, KW] -> [OC, IC, KH, KW]
newDims := []int{int(shape[0]), int(shape[1]), int(shape[3]), int(shape[4])}
if err := tt.Reshape(newDims...); err != nil {
return nil, err
}
// No transpose - keep PyTorch format
if err := tt.Reshape(tt.Shape().TotalSize()); err != nil {
return nil, err
}
return native.VectorF32(tt.(*tensor.Dense))
})
out = append(out, &ggml.Tensor{
Name: strings.Replace(name, "patch_embd.weight", "patch_embd_0.weight", 1),
Kind: t.Kind(),
Shape: newShape,
WriterTo: t0,
})
// Create repacker for second temporal slice (t=1)
t1 := t.Clone()
t1.SetRepacker(func(_ string, data []float32, shape []uint64) ([]float32, error) {
dims := make([]int, len(shape))
for i := range shape {
dims[i] = int(shape[i])
}
var tt tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
// Slice second temporal frame: [:, :, 1, :, :]
tt, err := tt.Slice(nil, nil, tensor.S(1, 2), nil, nil)
if err != nil {
return nil, err
}
tt = tensor.Materialize(tt)
// Reshape to 4D by squeezing temporal dim [OC, IC, 1, KH, KW] -> [OC, IC, KH, KW]
newDims := []int{int(shape[0]), int(shape[1]), int(shape[3]), int(shape[4])}
if err := tt.Reshape(newDims...); err != nil {
return nil, err
}
// No transpose - keep PyTorch format
if err := tt.Reshape(tt.Shape().TotalSize()); err != nil {
return nil, err
}
return native.VectorF32(tt.(*tensor.Dense))
})
out = append(out, &ggml.Tensor{
Name: strings.Replace(name, "patch_embd.weight", "patch_embd_1.weight", 1),
Kind: t.Kind(),
Shape: newShape,
WriterTo: t1,
})
continue
}
if len(shape) == 4 {
out = append(out, &ggml.Tensor{
Name: strings.Replace(name, "patch_embd.weight", "patch_embd_0.weight", 1),
Kind: t.Kind(),
Shape: t.Shape(),
WriterTo: t,
})
continue
}
slog.Warn("glmocr: patch_embed weight has unexpected shape - not splitting", "shape", shape)
// Fall through to default handling
}
// Handle pre-split patch embedding weights
// Pattern 1: v.patch_embd.0.weight, v.patch_embd.1.weight -> patch_embd_0.weight, patch_embd_1.weight
// Pattern 2: v.patch_embd.weight.0, v.patch_embd.weight.1 -> patch_embd_0.weight, patch_embd_1.weight
if strings.Contains(name, "patch_embd.0.") {
out = append(out, &ggml.Tensor{
Name: strings.Replace(name, "patch_embd.0.", "patch_embd_0.", 1),
Kind: t.Kind(),
Shape: t.Shape(),
WriterTo: t,
})
continue
}
if strings.Contains(name, "patch_embd.1.") {
out = append(out, &ggml.Tensor{
Name: strings.Replace(name, "patch_embd.1.", "patch_embd_1.", 1),
Kind: t.Kind(),
Shape: t.Shape(),
WriterTo: t,
})
continue
}
// Handle .weight.0 and .weight.1 suffix patterns
if strings.HasSuffix(name, "patch_embd.weight.0") {
out = append(out, &ggml.Tensor{
Name: strings.Replace(name, "patch_embd.weight.0", "patch_embd_0.weight", 1),
Kind: t.Kind(),
Shape: t.Shape(),
WriterTo: t,
})
continue
}
if strings.HasSuffix(name, "patch_embd.weight.1") {
out = append(out, &ggml.Tensor{
Name: strings.Replace(name, "patch_embd.weight.1", "patch_embd_1.weight", 1),
Kind: t.Kind(),
Shape: t.Shape(),
WriterTo: t,
})
continue
}
// Permute Q/K weights for M-RoPE compatibility (interleaved -> NeoX ordering)
// GGML's M-RoPE kernel uses NeoX-style rotation, but GLM-OCR uses interleaved (LLaMA-style)
// We permute at conversion time so the weights work correctly with GGML's kernel
// This aligns Q/K rotary dimensions with GGML's NeoX-style rotation
if len(m.TextConfig.RopeParameters.MRopeSection) > 0 &&
strings.Contains(name, "blk.") && (strings.Contains(name, "attn_q.") || strings.Contains(name, "attn_k.")) {
// Get config values for permutation
nHeads := int(cmp.Or(m.TextConfig.NumAttentionHeads, 16))
nKVHeads := int(cmp.Or(m.TextConfig.NumKeyValueHeads, 8))
hiddenSize := int(cmp.Or(m.TextConfig.HiddenSize, 1536))
headDim := int(cmp.Or(m.TextConfig.HeadDim, uint32(hiddenSize/nHeads)))
partialRotaryFactor := cmp.Or(m.TextConfig.PartialRotaryFactor, m.TextConfig.RopeParameters.PartialRotaryFactor, float32(1.0))
// Use appropriate head count: nHeads for Q, nKVHeads for K
effectiveHeads := nHeads
if strings.Contains(name, "attn_k.") {
effectiveHeads = nKVHeads
}
permutedT := t.Clone()
permutedT.SetRepacker(normalToNeoXRepacker(effectiveHeads, headDim, partialRotaryFactor))
out = append(out, &ggml.Tensor{
Name: name,
Kind: t.Kind(),
Shape: t.Shape(),
WriterTo: permutedT,
})
continue
}
out = append(out, &ggml.Tensor{
Name: name,
Kind: t.Kind(),
Shape: t.Shape(),
WriterTo: t,
})
}
return out
}
func (m *glmOcrModel) Replacements() []string {
return []string{
// Vision encoder
"model.visual.patch_embed.proj_1", "v.patch_embd_1", // Second temporal split
"model.visual.patch_embed.proj", "v.patch_embd",
"model.visual.blocks", "v.blk",
"model.visual.post_layernorm", "v.post_ln",
"model.visual.downsample", "mm.patch_merger",
// Vision attention
"attn.qkv", "attn_qkv",
"attn.proj", "attn_out",
"attn.q_norm", "attn_q_norm",
"attn.k_norm", "attn_k_norm",
// Vision norms
"norm1", "ln1",
"norm2", "ln2",
// Vision MLP
"mlp.gate_proj", "ffn_gate",
"mlp.up_proj", "ffn_up",
"mlp.down_proj", "ffn_down",
// Merger (multimodal projector)
"model.visual.merger.proj", "mm.model.fc",
"model.visual.merger.post_projection_norm", "mm.post_norm",
"model.visual.merger.gate_proj", "mm.gate",
"model.visual.merger.up_proj", "mm.up",
"model.visual.merger.down_proj", "mm.down",
// Language model
"model.language_model.embed_tokens", "token_embd",
"model.language_model.layers", "blk",
"model.language_model.norm", "output_norm",
"lm_head", "output",
// Language model attention
"self_attn.q_proj", "attn_q",
"self_attn.k_proj", "attn_k",
"self_attn.v_proj", "attn_v",
"self_attn.o_proj", "attn_out",
// Language model norms
"input_layernorm", "attn_norm",
"post_attention_layernorm", "ffn_norm",
"post_self_attn_layernorm", "post_attn_norm",
"post_mlp_layernorm", "post_ffn_norm",
// Language model MLP (remove mlp. prefix so ffn_* names work)
"mlp.gate_up_proj", "ffn_gate_up",
"mlp.down_proj", "ffn_down",
}
}

View File

@@ -99,6 +99,7 @@ func (st safetensor) Kind() uint32 {
if st.dtype == "BF16" &&
!strings.HasPrefix(st.name, "v.") &&
!strings.HasPrefix(st.name, "s.") &&
!strings.HasPrefix(st.name, "mm.") &&
kind != tensorKindFP32 {
kind = tensorKindBF16
}

View File

@@ -270,6 +270,7 @@ func (kv KV) OllamaEngineRequired() bool {
"qwen3", "qwen3moe",
"qwen3vl", "qwen3vlmoe",
"glm4moelite",
"glmocr",
"lfm2",
}, kv.Architecture())
}
@@ -859,6 +860,7 @@ func (f GGML) FlashAttention() bool {
"bert",
"gemma3",
"glm4moelite",
"glmocr",
"gptoss", "gpt-oss",
"lfm2",
"mistral3",

View File

@@ -242,7 +242,6 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st
} else {
// For Ollama engine, use our SupportsFlashAttention logic
if fa {
slog.Info("enabling flash attention")
loadRequest.FlashAttention = ml.FlashAttentionEnabled
// Flash Attention also supports kv cache quantization

View File

@@ -170,6 +170,7 @@ type Tensor interface {
Cos(ctx Context) Tensor
Tanh(ctx Context) Tensor
GELU(ctx Context, up ...Tensor) Tensor
GELU_ERF(ctx Context) Tensor
QuickGELU(ctx Context, up ...Tensor) Tensor
SILU(ctx Context, up ...Tensor) Tensor
RELU(ctx Context, up ...Tensor) Tensor

View File

@@ -1581,6 +1581,13 @@ func (t *Tensor) GELU(ctx ml.Context, t2 ...ml.Tensor) ml.Tensor {
}
}
func (t *Tensor) GELU_ERF(ctx ml.Context) ml.Tensor {
return &Tensor{
b: t.b,
t: C.ggml_gelu_erf_inplace(ctx.(*Context).ctx, t.t),
}
}
func (t *Tensor) QuickGELU(ctx ml.Context, t2 ...ml.Tensor) ml.Tensor {
var tt *C.struct_ggml_tensor
if len(t2) > 0 {

View File

@@ -20,6 +20,7 @@ const (
ResizeBilinear = iota
ResizeNearestNeighbor
ResizeApproxBilinear
ResizeBicubic
ResizeCatmullrom
)
@@ -45,6 +46,7 @@ func Resize(img image.Image, newSize image.Point, method int) image.Image {
ResizeBilinear: draw.BiLinear,
ResizeNearestNeighbor: draw.NearestNeighbor,
ResizeApproxBilinear: draw.ApproxBiLinear,
ResizeBicubic: draw.CatmullRom,
ResizeCatmullrom: draw.CatmullRom,
}

View File

@@ -0,0 +1,171 @@
package glmocr
import (
"image"
"math"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/model/imageproc"
)
type ImageProcessor struct {
imageSize int
patchSize int
temporalPatchSize int
spatialMergeSize int
minPixels int
maxPixels int
factor int
imageMean [3]float32
imageStd [3]float32
}
func newImageProcessor(c fs.Config) ImageProcessor {
patchSize := int(c.Uint("vision.patch_size", 14))
spatialMergeSize := int(c.Uint("vision.spatial_merge_size", 2))
temporalPatchSize := int(c.Uint("vision.temporal_patch_size", 2))
// Read normalization values from config if available, otherwise use CLIP defaults
imageMean := c.Floats("vision.image_mean", imageproc.ClipDefaultMean[:])
imageStd := c.Floats("vision.image_std", imageproc.ClipDefaultSTD[:])
// Default max_pixels: 2048 * patchSize² * mergeSize² * temporal = ~3.2M pixels
// This limits to ~16k patches (4k output tokens) to keep memory stable without flash attention
defaultMaxPixels := 2048 * patchSize * patchSize * spatialMergeSize * spatialMergeSize * temporalPatchSize
return ImageProcessor{
imageSize: int(c.Uint("vision.image_size", 336)),
patchSize: patchSize,
temporalPatchSize: temporalPatchSize,
spatialMergeSize: spatialMergeSize,
minPixels: int(c.Uint("vision.min_pixels", uint32(8*patchSize*patchSize*spatialMergeSize*spatialMergeSize*temporalPatchSize))),
maxPixels: int(c.Uint("vision.max_pixels", uint32(defaultMaxPixels))),
factor: patchSize * spatialMergeSize,
imageMean: [3]float32{imageMean[0], imageMean[1], imageMean[2]},
imageStd: [3]float32{imageStd[0], imageStd[1], imageStd[2]},
}
}
func (p *ImageProcessor) SmartResize(height, width int) (int, int) {
factor := p.factor
temporalFactor := p.temporalPatchSize
numFrames := temporalFactor // single image
if height < factor || width < factor {
// Scale up small images
scale := float64(factor) / float64(min(height, width))
height = int(math.Ceil(float64(height) * scale))
width = int(math.Ceil(float64(width) * scale))
}
if temporalFactor <= 0 {
panic("temporal_patch_size must be > 0")
}
if numFrames < temporalFactor {
panic("num_frames must be >= temporal_patch_size")
}
if aspectRatio := float64(max(height, width)) / float64(min(height, width)); aspectRatio > 200 {
panic("absolute aspect ratio must be smaller than 200")
}
round := func(x float64) int { return int(math.RoundToEven(x)) }
hBar := round(float64(height)/float64(factor)) * factor
wBar := round(float64(width)/float64(factor)) * factor
tBar := round(float64(numFrames)/float64(temporalFactor)) * temporalFactor
if tBar*hBar*wBar > p.maxPixels {
beta := math.Sqrt(float64(numFrames*height*width) / float64(p.maxPixels))
hBar = int(math.Floor(float64(height)/beta/float64(factor))) * factor
wBar = int(math.Floor(float64(width)/beta/float64(factor))) * factor
} else if tBar*hBar*wBar < p.minPixels {
beta := math.Sqrt(float64(p.minPixels) / float64(numFrames*height*width))
hBar = int(math.Ceil(float64(height)*beta/float64(factor))) * factor
wBar = int(math.Ceil(float64(width)*beta/float64(factor))) * factor
}
return hBar, wBar
}
func (p *ImageProcessor) ProcessImage(img image.Image) ([]float32, *Grid, error) {
img = imageproc.Composite(img)
origWidth := img.Bounds().Dx()
origHeight := img.Bounds().Dy()
// Calculate smart resize dimensions
resizedHeight, resizedWidth := p.SmartResize(origHeight, origWidth)
// Resize image
resizedImg := imageproc.Resize(img, image.Point{X: resizedWidth, Y: resizedHeight}, imageproc.ResizeBicubic)
// Normalize pixels - output format is [C, H, W] with rescale and channelFirst
// We keep [C, H, W] for patch extraction
normalizedPixels := imageproc.Normalize(resizedImg, p.imageMean, p.imageStd, true, true)
// Calculate grid dimensions (after Conv2D patching)
grid := &Grid{
Height: resizedHeight / p.patchSize,
Width: resizedWidth / p.patchSize,
Temporal: 1, // Single image
ImageHeight: resizedHeight,
ImageWidth: resizedWidth,
}
patches, err := p.createPatches(normalizedPixels, resizedHeight, resizedWidth, grid)
if err != nil {
return nil, nil, err
}
return patches, grid, nil
}
func (p *ImageProcessor) createPatches(pixels []float32, height, width int, grid *Grid) ([]float32, error) {
channels := 3
patchSize := p.patchSize
mergeSize := p.spatialMergeSize
temporalPatchSize := p.temporalPatchSize
numPatches := grid.Temporal * grid.Height * grid.Width
patchDim := channels * temporalPatchSize * patchSize * patchSize
result := make([]float32, numPatches*patchDim)
patchIndex := 0
// Single temporal frame handling (copies to all frames)
for range grid.Temporal {
for h := 0; h < grid.Height; h += mergeSize {
for w := 0; w < grid.Width; w += mergeSize {
for mh := range mergeSize {
for mw := range mergeSize {
baseOffset := patchIndex * patchDim
for c := range channels {
channelOffset := baseOffset + (c * temporalPatchSize * patchSize * patchSize)
for py := range patchSize {
for px := range patchSize {
y := (h+mh)*patchSize + py
x := (w+mw)*patchSize + px
srcIdx := c*height*width + y*width + x
dstIdx := channelOffset + (py * patchSize) + px
result[dstIdx] = pixels[srcIdx]
}
}
if temporalPatchSize > 1 {
frameSize := patchSize * patchSize
for tp := 1; tp < temporalPatchSize; tp++ {
currentFrameOffset := channelOffset + (tp * frameSize)
copy(result[currentFrameOffset:currentFrameOffset+frameSize],
result[channelOffset:channelOffset+frameSize])
}
}
}
patchIndex++
}
}
}
}
}
return result, nil
}

View File

@@ -0,0 +1,235 @@
package glmocr
import (
"bytes"
"errors"
"image"
"slices"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/model"
"github.com/ollama/ollama/model/input"
)
type Model struct {
model.Base
model.BytePairEncoding
*TextModel
*VisionModel `gguf:"v"`
VisionDownsample *VisionDownsample `gguf:"mm.patch_merger"`
PatchMerger *PatchMerger `gguf:"mm"`
ImageProcessor
imageTokenID int32
imageStartTokenID int32
imageEndTokenID int32
}
var _ model.MultimodalProcessor = (*Model)(nil)
func New(c fs.Config) (model.Model, error) {
eosTokenID := int32(c.Uint("tokenizer.ggml.eos_token_id"))
eosTokenIDs := c.Ints("tokenizer.ggml.eos_token_ids")
allEOS := append([]int32{eosTokenID}, eosTokenIDs...)
m := &Model{
BytePairEncoding: model.NewBytePairEncoding(
&model.Vocabulary{
Values: c.Strings("tokenizer.ggml.tokens"),
Types: c.Ints("tokenizer.ggml.token_type"),
Merges: c.Strings("tokenizer.ggml.merges"),
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", false),
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
EOS: allEOS,
},
`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
),
TextModel: newTextModel(c),
VisionModel: newVisionModel(c),
ImageProcessor: newImageProcessor(c),
imageTokenID: int32(c.Uint("image_token_id", 59280)),
imageStartTokenID: int32(c.Uint("image_start_token_id", 59256)),
imageEndTokenID: int32(c.Uint("image_end_token_id", 59257)),
}
m.Cache = kvcache.NewCausalCache(m.TextModel.Shift)
return m, nil
}
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
if len(m.VisionModel.Blocks) == 0 {
return nil, model.ErrNoVisionModel
}
img, _, err := image.Decode(bytes.NewReader(multimodalData))
if err != nil {
return nil, err
}
f32s, grid, err := m.ImageProcessor.ProcessImage(img)
if err != nil {
return nil, err
}
// Create pixel values tensor from flattened patches
// Shape: [patchDim, numPatches]
patchDim := m.VisionModel.numChannels * m.temporalPatchSize * m.patchSize * m.patchSize
numPatches := grid.Temporal * grid.Height * grid.Width
pixelValues := ctx.Input().FromFloats(f32s, patchDim, numPatches)
// Forward through vision encoder
visionOutputs := m.VisionModel.Forward(ctx, pixelValues, grid)
// Forward through downsample (patch merger)
if m.VisionDownsample == nil || m.VisionDownsample.Weight == nil {
return nil, errors.New("glmocr: missing vision downsample weights")
}
visionOutputs = m.VisionDownsample.Forward(ctx, visionOutputs, grid, m.VisionModel.VisionModelOptions)
// Forward through patch merger (FC + LayerNorm + GELU + SwiGLU FFN)
if m.PatchMerger == nil {
return nil, errors.New("glmocr: missing patch merger weights")
}
visionOutputs = m.PatchMerger.Forward(ctx, visionOutputs, m.VisionModel.VisionModelOptions)
return []input.Multimodal{{Tensor: visionOutputs, Data: grid}}, nil
}
func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
var result []*input.Input
// Reset position cache
m.TextModel.positionCache = m.TextModel.positionCache[:0]
m.TextModel.ropeDelta = 0
pos := int32(0)
for _, inp := range inputs {
if inp.Multimodal == nil {
result = append(result, inp)
m.TextModel.positionCache = append(m.TextModel.positionCache, pos)
pos++
continue
}
// Get grid info for position calculation
grid := inp.Multimodal[0].Data.(*Grid)
mergedH := grid.Height / m.VisionModel.spatialMergeSize
mergedW := grid.Width / m.VisionModel.spatialMergeSize
// Add image start token
result = append(result, &input.Input{Token: m.imageStartTokenID})
m.TextModel.positionCache = append(m.TextModel.positionCache, pos)
pos++
// Add image tokens with multimodal data
// All image tokens share the same base position for temporal dimension
tokensPerGrid := inp.Multimodal[0].Tensor.Dim(1)
basePos := pos
sameBatch := tokensPerGrid - 1
if sameBatch < 0 {
sameBatch = 0
}
result = append(result, &input.Input{
Token: m.imageTokenID,
Multimodal: inp.Multimodal,
MultimodalHash: inp.MultimodalHash,
SameBatch: sameBatch,
})
m.TextModel.positionCache = append(m.TextModel.positionCache, basePos)
// Add placeholder tokens for remaining positions
// All image tokens use the same base position (temporal stays constant)
for range tokensPerGrid - 1 {
result = append(result, &input.Input{Token: m.imageTokenID})
m.TextModel.positionCache = append(m.TextModel.positionCache, basePos)
}
// Advance position by max(mergedH, mergedW) after image tokens
pos = basePos + int32(max(mergedH, mergedW))
// Add image end token
result = append(result, &input.Input{Token: m.imageEndTokenID})
m.TextModel.positionCache = append(m.TextModel.positionCache, pos)
pos++
}
// Compute rope delta for continuation after the prefill segment:
// delta = (max_position_id + 1) - sequence_length
if len(m.TextModel.positionCache) > 0 {
last := m.TextModel.positionCache[len(m.TextModel.positionCache)-1]
m.TextModel.ropeDelta = last + 1 - int32(len(m.TextModel.positionCache))
}
return result, nil
}
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
// Initial token embedding
hiddenStates := m.TokenEmbedding.Forward(ctx, batch.Inputs).Duplicate(ctx)
ctx.Forward(hiddenStates)
// Build position slices for M-RoPE
positionSlice := func() [][]int32 {
s := [][]int32{
make([]int32, len(batch.Positions)), // temporal
make([]int32, len(batch.Positions)), // height
make([]int32, len(batch.Positions)), // width
make([]int32, len(batch.Positions)), // unused (zeros)
}
for i, position := range batch.Positions {
// Translate through position cache or continue sequence
if position < int32(len(m.TextModel.positionCache)) {
position = m.TextModel.positionCache[position]
} else if len(m.TextModel.positionCache) > 0 {
// Continue sequence after cached positions using ropeDelta
position = position + m.TextModel.ropeDelta
}
s[0][i] = position
s[1][i] = position
s[2][i] = position
}
return s
}()
// Inject vision embeddings and adjust positions for image tokens
for _, mi := range batch.Multimodal {
img := mi.Multimodal[0].Tensor
ctx.Forward(img.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), img.Dim(0)*img.Dim(1))))
if grid, ok := mi.Multimodal[0].Data.(*Grid); ok {
w := grid.Width / m.VisionModel.spatialMergeSize
for i := range img.Dim(1) {
positionSlice[1][mi.Index+i] += int32(i / w)
positionSlice[2][mi.Index+i] += int32(i % w)
}
}
}
positions := ctx.Input().FromInts(slices.Concat(positionSlice...), len(positionSlice[0])*len(positionSlice))
// Process through transformer layers
for i, layer := range m.TextModel.Layers {
m.Cache.SetLayer(i)
var lastLayerOutputs ml.Tensor
if i == len(m.TextModel.Layers)-1 {
lastLayerOutputs = batch.Outputs
}
hiddenStates = layer.Forward(ctx, hiddenStates, positions, lastLayerOutputs, m.Cache, m.TextModel.TextModelOptions)
}
hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, m.TextModel.eps)
return m.Output.Forward(ctx, hiddenStates), nil
}
func init() {
model.Register("glmocr", New)
}

View File

@@ -0,0 +1,180 @@
package glmocr
import (
"math"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/ml/nn/rope"
)
type TextModelOptions struct {
hiddenSize int
numHeads int
numKVHeads int
headDim int
rotaryDim int
intermediateSize int
eps float32
ropeBase float32
mropeSections []int
}
func (o *TextModelOptions) applyMRoPE(ctx ml.Context, states, positions ml.Tensor) ml.Tensor {
// GLM4 uses standard M-RoPE (not interleaved like Qwen3VL)
// With 4 sections for [temporal, height, width, unused]
return nn.RoPE(ctx, states, positions, o.rotaryDim, o.ropeBase, 1.0, rope.WithMRoPE(o.mropeSections))
}
type TextSelfAttention struct {
Query *nn.Linear `gguf:"attn_q"`
Key *nn.Linear `gguf:"attn_k"`
Value *nn.Linear `gguf:"attn_v"`
Output *nn.Linear `gguf:"attn_out"`
}
func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, cache kvcache.Cache, opts *TextModelOptions) ml.Tensor {
batchSize := hiddenStates.Dim(1)
// Separate Q, K, V projections
q := sa.Query.Forward(ctx, hiddenStates)
k := sa.Key.Forward(ctx, hiddenStates)
v := sa.Value.Forward(ctx, hiddenStates)
// Reshape for GQA
q = q.Reshape(ctx, opts.headDim, opts.numHeads, batchSize)
k = k.Reshape(ctx, opts.headDim, opts.numKVHeads, batchSize)
v = v.Reshape(ctx, opts.headDim, opts.numKVHeads, batchSize)
// Apply M-RoPE (multi-resolution rotary position embeddings)
q = opts.applyMRoPE(ctx, q, positions)
k = opts.applyMRoPE(ctx, k, positions)
// Scaled dot-product attention with KV cache
scaleFactor := 1.0 / math.Sqrt(float64(opts.headDim))
kqv := nn.Attention(ctx, q, k, v, scaleFactor, cache)
// Reshape attention output: [headDim, numHeads, batchSize] -> [numHeads*headDim, batchSize]
// Note: numHeads * headDim = 16 * 128 = 2048, which is the attention hidden size
kqv = kqv.Reshape(ctx, opts.numHeads*opts.headDim, batchSize)
return sa.Output.Forward(ctx, kqv)
}
type TextMLP struct {
Gate *nn.Linear `gguf:"ffn_gate"`
Up *nn.Linear `gguf:"ffn_up"`
Down *nn.Linear `gguf:"ffn_down"`
}
func (mlp *TextMLP) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *TextModelOptions) ml.Tensor {
// SwiGLU: down(silu(gate(x)) * up(x))
gate := mlp.Gate.Forward(ctx, hiddenStates).SILU(ctx, mlp.Up.Forward(ctx, hiddenStates))
return mlp.Down.Forward(ctx, gate)
}
type TextDecoderLayer struct {
// Input layernorm (before attention)
AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
SelfAttention *TextSelfAttention
// Post self-attention layernorm (after attention, before residual add)
PostAttnNorm *nn.RMSNorm `gguf:"post_attn_norm"`
// FFN input layernorm (after first residual, before MLP)
FFNNorm *nn.RMSNorm `gguf:"ffn_norm"`
MLP *TextMLP
// Post MLP layernorm (after MLP, before residual add)
PostFFNNorm *nn.RMSNorm `gguf:"post_ffn_norm"`
}
func (l *TextDecoderLayer) Forward(ctx ml.Context, hiddenStates, positions, outputs ml.Tensor, cache kvcache.Cache, opts *TextModelOptions) ml.Tensor {
// Attention block
residual := hiddenStates
hiddenStates = l.AttentionNorm.Forward(ctx, hiddenStates, opts.eps)
hiddenStates = l.SelfAttention.Forward(ctx, hiddenStates, positions, cache, opts)
hiddenStates = l.PostAttnNorm.Forward(ctx, hiddenStates, opts.eps)
// Prune to output positions in final layer
if outputs != nil {
hiddenStates = hiddenStates.Rows(ctx, outputs)
residual = residual.Rows(ctx, outputs)
}
hiddenStates = hiddenStates.Add(ctx, residual)
// MLP block
residual = hiddenStates
hiddenStates = l.FFNNorm.Forward(ctx, hiddenStates, opts.eps)
hiddenStates = l.MLP.Forward(ctx, hiddenStates, opts)
hiddenStates = l.PostFFNNorm.Forward(ctx, hiddenStates, opts.eps)
hiddenStates = hiddenStates.Add(ctx, residual)
return hiddenStates
}
type TextModel struct {
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
Layers []TextDecoderLayer `gguf:"blk"`
OutputNorm *nn.RMSNorm `gguf:"output_norm"`
Output *nn.Linear `gguf:"output,alt:token_embd"`
*TextModelOptions
// positionCache stores the M-RoPE position for each token in the sequence.
// This is needed because image tokens share the same base position but have
// different height/width offsets, and the end token position depends on the
// image grid dimensions.
positionCache []int32
ropeDelta int32
}
func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
// Clear position cache when KV cache shifts
m.positionCache = nil
m.ropeDelta = 0
return m.applyMRoPE(ctx, key, shift), nil
}
func newTextModel(c fs.Config) *TextModel {
hiddenSize := int(c.Uint("embedding_length", 1536))
numHeads := int(c.Uint("attention.head_count", 16))
numKVHeads := int(c.Uint("attention.head_count_kv", 8))
intermediateSize := int(c.Uint("feed_forward_length", 4608))
eps := c.Float("attention.layer_norm_rms_epsilon", 1e-5)
ropeBase := c.Float("rope.freq_base", 10000)
headDim := int(c.Uint("attention.key_length", uint32(hiddenSize/numHeads)))
mropeSections := c.Ints("rope.mrope_section")
var sectionInts []int
if len(mropeSections) > 0 {
sectionInts = make([]int, len(mropeSections))
for i, section := range mropeSections {
sectionInts[i] = int(section)
}
} else {
// Default: 3 sections like GLM-OCR
sectionInts = []int{16, 24, 24}
}
// rotaryDim = headDim (128) to rotate all dimensions
// GGML rope_multi: sector = (dim_pair) % sum(sections), mapping each pair to its position dim
rotaryDim := headDim
return &TextModel{
Layers: make([]TextDecoderLayer, c.Uint("block_count", 16)),
TextModelOptions: &TextModelOptions{
hiddenSize: hiddenSize,
numHeads: numHeads,
numKVHeads: numKVHeads,
headDim: headDim,
rotaryDim: rotaryDim,
intermediateSize: intermediateSize,
eps: eps,
ropeBase: ropeBase,
mropeSections: sectionInts,
},
}
}

View File

@@ -0,0 +1,348 @@
package glmocr
import (
"log/slog"
"math"
"slices"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/ml/nn/rope"
)
type Grid struct {
Height int // Number of patches in height direction
Width int // Number of patches in width direction
Temporal int
ImageHeight int // Full image height in pixels
ImageWidth int // Full image width in pixels
}
type VisionModelOptions struct {
hiddenSize int
numHeads int
headDim int
numChannels int
patchSize int
temporalPatchSize int
imageSize int
spatialMergeSize int
outHiddenSize int
intermediateSize int
eps float32
}
type VisionPatchEmbed struct {
Proj *nn.Conv2D `gguf:"patch_embd_0"`
Proj1 *nn.Conv2D `gguf:"patch_embd_1"`
Bias ml.Tensor `gguf:"patch_embd.bias"`
}
func (pe *VisionPatchEmbed) Forward(ctx ml.Context, pixelValues ml.Tensor, grid *Grid, opts *VisionModelOptions) ml.Tensor {
_ = grid // patches are already in merge-block order
// pixelValues shape: [patchDim, numPatches]
numPatches := pixelValues.Shape()[1]
// Reshape to [patchSize*patchSize, temporalPatchSize, numChannels, numPatches]
pixelValues = pixelValues.Reshape(ctx, opts.patchSize*opts.patchSize, opts.temporalPatchSize, opts.numChannels, numPatches)
// Permute to [temporalPatchSize, patchSize*patchSize, numChannels, numPatches]
pixelValues = pixelValues.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
// Slice temporal frames for Conv2D (simulate Conv3D)
in0 := pixelValues.View(ctx, 0, 1, pixelValues.Stride(1), pixelValues.Dim(1), pixelValues.Stride(2), pixelValues.Dim(2), pixelValues.Stride(3), pixelValues.Dim(3)).Contiguous(ctx)
in0 = in0.Reshape(ctx, opts.patchSize, opts.patchSize, opts.numChannels, numPatches)
s0, s1 := opts.patchSize, opts.patchSize
p0, p1 := 0, 0
d0, d1 := 1, 1
hiddenStates := pe.Proj.Forward(ctx, in0, s0, s1, p0, p1, d0, d1)
if pe.Proj1 != nil && opts.temporalPatchSize > 1 {
in1 := pixelValues.View(ctx, pixelValues.Stride(0), 1, pixelValues.Stride(1), pixelValues.Dim(1), pixelValues.Stride(2), pixelValues.Dim(2), pixelValues.Stride(3), pixelValues.Dim(3)).Contiguous(ctx)
in1 = in1.Reshape(ctx, opts.patchSize, opts.patchSize, opts.numChannels, numPatches)
out1 := pe.Proj1.Forward(ctx, in1, s0, s1, p0, p1, d0, d1)
hiddenStates = hiddenStates.Add(ctx, out1)
}
// Flatten to [hidden_size, num_patches]
hiddenStates = hiddenStates.Reshape(ctx, opts.hiddenSize, numPatches)
// Add patch bias - reshape from [hidden_size] to [hidden_size, 1] for broadcasting
if pe.Bias != nil {
hiddenStates = hiddenStates.Add(ctx, pe.Bias.Reshape(ctx, opts.hiddenSize, 1))
}
return hiddenStates
}
type VisionSelfAttention struct {
QKV *nn.Linear `gguf:"attn_qkv"`
QNorm *nn.RMSNorm `gguf:"attn_q_norm"`
KNorm *nn.RMSNorm `gguf:"attn_k_norm"`
Output *nn.Linear `gguf:"attn_out"`
}
func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, opts *VisionModelOptions) ml.Tensor {
batchSize := hiddenStates.Dim(1)
// Combined QKV projection: [3*hidden_size, batch_size]
qkv := sa.QKV.Forward(ctx, hiddenStates)
// Split using ChunkSections along dim 0 (handles byte offsets correctly)
// ChunkSections returns views - must make contiguous before further operations
chunks := qkv.ChunkSections(ctx, 0, opts.hiddenSize, opts.hiddenSize, opts.hiddenSize)
q := chunks[0].Contiguous(ctx)
k := chunks[1].Contiguous(ctx)
v := chunks[2].Contiguous(ctx)
// Reshape for multi-head attention: [hiddenSize, N] -> [headDim, numHeads, N]
q = q.Reshape(ctx, opts.headDim, opts.numHeads, batchSize)
k = k.Reshape(ctx, opts.headDim, opts.numHeads, batchSize)
v = v.Reshape(ctx, opts.headDim, opts.numHeads, batchSize)
// Apply Q-norm and K-norm after head reshape
// Weights are [headDim]=64, tensor is [headDim, numHeads, N]
q = sa.QNorm.Forward(ctx, q, opts.eps)
k = sa.KNorm.Forward(ctx, k, opts.eps)
// Apply rotary position embeddings with vision-style 2D positions
// Each section of headDim/4 pairs is assigned to one position dimension
// Positions are [height, width, height, width] repeated for rotation
ropeFreqBase := float32(10000.0)
sections := []int{opts.headDim / 4, opts.headDim / 4, opts.headDim / 4, opts.headDim / 4}
q = nn.RoPE(ctx, q, positions, opts.headDim/2, ropeFreqBase, 1.0, rope.WithVision(sections))
k = nn.RoPE(ctx, k, positions, opts.headDim/2, ropeFreqBase, 1.0, rope.WithVision(sections))
// Scale factor for scaled dot-product attention
scale := 1.0 / math.Sqrt(float64(opts.headDim))
// Try flash attention first (ScaledDotProductAttention), fall back to manual
if sdpa, ok := q.(ml.ScaledDotProductAttention); ok {
attention := sdpa.ScaledDotProductAttention(ctx, k, v, nil, nil, nil, scale, false)
attention = attention.Reshape(ctx, opts.hiddenSize, batchSize)
return sa.Output.Forward(ctx, attention)
}
slog.Warn("glmocr: vision attention falling back to manual attention",
"batchSize", batchSize, "numHeads", opts.numHeads,
"hint", "set OLLAMA_FLASH_ATTENTION=1 to enable flash attention")
// Manual attention fallback
// q, k, v are [headDim, numHeads, batchSize] - GGML treats as 4D with implicit dim 3 = 1
q = q.Permute(ctx, 0, 2, 1, 3)
k = k.Permute(ctx, 0, 2, 1, 3)
v = v.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
// Attention scores
kq := k.MulmatFullPrec(ctx, q)
kq = kq.Scale(ctx, scale)
kq = kq.Softmax(ctx)
// Attention output: v @ kq (note: v first)
kqv := v.Mulmat(ctx, kq)
attention := kqv.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
attention = attention.Reshape(ctx, opts.hiddenSize, batchSize)
return sa.Output.Forward(ctx, attention)
}
type VisionMLP struct {
Gate *nn.Linear `gguf:"ffn_gate"`
Up *nn.Linear `gguf:"ffn_up"`
Down *nn.Linear `gguf:"ffn_down"`
}
func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenStates ml.Tensor) ml.Tensor {
// SwiGLU: down(silu(gate(x)) * up(x))
gate := mlp.Gate.Forward(ctx, hiddenStates).SILU(ctx, mlp.Up.Forward(ctx, hiddenStates))
return mlp.Down.Forward(ctx, gate)
}
type VisionBlock struct {
Norm1 *nn.RMSNorm `gguf:"ln1"`
SelfAttention *VisionSelfAttention
Norm2 *nn.RMSNorm `gguf:"ln2"`
MLP *VisionMLP
}
func (b *VisionBlock) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, opts *VisionModelOptions) ml.Tensor {
// Pre-norm architecture
residual := hiddenStates
hiddenStates = b.Norm1.Forward(ctx, hiddenStates, opts.eps)
hiddenStates = b.SelfAttention.Forward(ctx, hiddenStates, positions, opts)
hiddenStates = hiddenStates.Add(ctx, residual)
residual = hiddenStates
hiddenStates = b.Norm2.Forward(ctx, hiddenStates, opts.eps)
hiddenStates = b.MLP.Forward(ctx, hiddenStates)
hiddenStates = hiddenStates.Add(ctx, residual)
return hiddenStates
}
type VisionDownsample struct {
*nn.Conv2D // Embedded to get mm.patch_merger.weight/bias directly
}
func (d *VisionDownsample) Forward(ctx ml.Context, hiddenStates ml.Tensor, grid *Grid, opts *VisionModelOptions) ml.Tensor {
// Apply spatial downsampling via Conv2D
// Input: [hidden_size, num_patches] where patches are in merge-block order
if d.Conv2D == nil || d.Weight == nil {
panic("VisionDownsample weights not loaded")
}
merge := opts.spatialMergeSize
numOutputTokens := (grid.Height / merge) * (grid.Width / merge)
// Step 1: Reshape to [hidden_size, merge, merge, num_output_tokens]
hiddenStates = hiddenStates.Reshape(ctx, opts.hiddenSize, merge, merge, numOutputTokens)
// Step 2: Permute to [merge, merge, hidden_size, num_output_tokens]
// ggml semantics: result.ne[perm[i]] = input.ne[i]
// So permute(2,0,1,3) on [1024,2,2,N] gives: ne[2]=1024, ne[0]=2, ne[1]=2, ne[3]=N -> [2,2,1024,N]
hiddenStates = hiddenStates.Permute(ctx, 2, 0, 1, 3).Contiguous(ctx)
// Step 3: Apply Conv2D without bias (bias added after reshape)
// Note: ggml_conv_2d takes (kernel, input) - kernel must be receiver in ollama
s0, s1 := merge, merge
p0, p1 := 0, 0
d0, d1 := 1, 1
hiddenStates = d.Weight.Conv2D(ctx, hiddenStates, s0, s1, p0, p1, d0, d1)
// Step 4: Reshape to [out_hidden_size, num_output_tokens]
hiddenStates = hiddenStates.Reshape(ctx, opts.outHiddenSize, numOutputTokens)
// Step 5: Add bias after reshape
// Reshape bias from [out_hidden_size] to [out_hidden_size, 1] for proper broadcasting
if d.Bias != nil {
hiddenStates = hiddenStates.Add(ctx, d.Bias.Reshape(ctx, opts.outHiddenSize, 1))
}
return hiddenStates
}
type PatchMerger struct {
// GGUF tags align with mm.* keys used by the model
Proj *nn.Linear `gguf:"model.fc"` // mm.model.fc.weight
PostLN *nn.LayerNorm `gguf:"post_norm"` // mm.post_norm.weight/bias
GateProj *nn.Linear `gguf:"gate"` // mm.gate.weight
UpProj *nn.Linear `gguf:"up"` // mm.up.weight
DownProj *nn.Linear `gguf:"down"` // mm.down.weight
}
func (m *PatchMerger) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *VisionModelOptions) ml.Tensor {
// Linear projection
hiddenStates = m.Proj.Forward(ctx, hiddenStates)
// Post-projection layer norm + GELU ERF
hiddenStates = m.PostLN.Forward(ctx, hiddenStates, opts.eps)
hiddenStates = hiddenStates.GELU_ERF(ctx)
// Force a copy to avoid in-place mutation issues with GELU_ERF
hiddenStates = hiddenStates.Contiguous(ctx)
// SwiGLU MLP: down(silu(gate(x)) * up(x))
gateOut := m.GateProj.Forward(ctx, hiddenStates)
upOut := m.UpProj.Forward(ctx, hiddenStates)
gate := gateOut.SILU(ctx, upOut)
return m.DownProj.Forward(ctx, gate)
}
type VisionModel struct {
PatchEmbed *VisionPatchEmbed
Blocks []VisionBlock `gguf:"blk"`
PostLN *nn.RMSNorm `gguf:"post_ln"`
// Note: Downsample is applied at the model level so mm.patch_merger stays separate
*VisionModelOptions
}
func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor, grid *Grid) ml.Tensor {
// Extract patch embeddings from flattened patches
hiddenStates := m.PatchEmbed.Forward(ctx, pixelValues, grid, m.VisionModelOptions)
// Create position IDs for RoPE (spatial grid)
// Patches are already in merge-block order from preprocessing
positions := m.createPositions(ctx, grid)
// Process through vision blocks
for _, block := range m.Blocks {
hiddenStates = block.Forward(ctx, hiddenStates, positions, m.VisionModelOptions)
}
// Post-layernorm
hiddenStates = m.PostLN.Forward(ctx, hiddenStates, m.eps)
// Note: Downsample is now applied separately in Model.EncodeMultimodal
// so mm.patch_merger remains a distinct module
return hiddenStates
}
func (m *VisionModel) createPositions(ctx ml.Context, grid *Grid) ml.Tensor {
// Create spatial position IDs for vision RoPE
// Position layout: [height, width, height, width] - 4 sections for mrope
// Patches are in MERGE-BLOCK order after VisionPatchEmbed interleaving
// This follows the GLM-OCR rot_pos_emb layout
numPatches := grid.Height * grid.Width
mergeRatio := m.spatialMergeSize
// Build position arrays in merge-block order
// Each merge_ratio x merge_ratio block of patches is grouped together
hpos := make([]int32, numPatches)
wpos := make([]int32, numPatches)
ptr := 0
for y := 0; y < grid.Height; y += mergeRatio {
for x := 0; x < grid.Width; x += mergeRatio {
for dy := range mergeRatio {
for dx := range mergeRatio {
hpos[ptr] = int32(y + dy)
wpos[ptr] = int32(x + dx)
ptr++
}
}
}
}
// Build position arrays for 4 sections (mrope)
s := [][]int32{
hpos, // Section 0: height
wpos, // Section 1: width
slices.Clone(hpos), // Section 2: height (repeated)
slices.Clone(wpos), // Section 3: width (repeated)
}
return ctx.Input().FromInts(slices.Concat(s...), numPatches*4)
}
func newVisionModel(c fs.Config) *VisionModel {
hiddenSize := int(c.Uint("vision.embedding_length", 1024))
numHeads := int(c.Uint("vision.attention.head_count", 16))
numChannels := int(c.Uint("vision.num_channels", 3))
patchSize := int(c.Uint("vision.patch_size", 14))
temporalPatchSize := int(c.Uint("vision.temporal_patch_size", 2))
imageSize := int(c.Uint("vision.image_size", 336))
spatialMergeSize := int(c.Uint("vision.spatial_merge_size", 2))
outHiddenSize := int(c.Uint("vision.out_hidden_size", 1536))
intermediateSize := int(c.Uint("vision.intermediate_size", 4096))
eps := c.Float("vision.attention.layer_norm_rms_epsilon", 1e-5)
return &VisionModel{
Blocks: make([]VisionBlock, c.Uint("vision.block_count", 24)),
VisionModelOptions: &VisionModelOptions{
hiddenSize: hiddenSize,
numHeads: numHeads,
headDim: hiddenSize / numHeads,
numChannels: numChannels,
patchSize: patchSize,
temporalPatchSize: temporalPatchSize,
imageSize: imageSize,
spatialMergeSize: spatialMergeSize,
outHiddenSize: outHiddenSize,
intermediateSize: intermediateSize,
eps: eps,
},
}
}

View File

@@ -8,6 +8,7 @@ import (
_ "github.com/ollama/ollama/model/models/gemma3"
_ "github.com/ollama/ollama/model/models/gemma3n"
_ "github.com/ollama/ollama/model/models/glm4moelite"
_ "github.com/ollama/ollama/model/models/glmocr"
_ "github.com/ollama/ollama/model/models/gptoss"
_ "github.com/ollama/ollama/model/models/lfm2"
_ "github.com/ollama/ollama/model/models/llama"

19
model/parsers/glmocr.go Normal file
View File

@@ -0,0 +1,19 @@
package parsers
import "github.com/ollama/ollama/api"
type GlmOcrParser struct {
GLM47Parser
}
func (p *GlmOcrParser) HasThinkingSupport() bool {
return false
}
func (p *GlmOcrParser) Init(tools []api.Tool, lastMessage *api.Message, thinkValue *api.ThinkValue) []api.Tool {
p.tools = tools
if thinkValue != nil && thinkValue.Bool() {
p.state = glm46ParserState_CollectingThinking
}
return tools
}

View File

@@ -70,6 +70,8 @@ func ParserForName(name string) Parser {
return &FunctionGemmaParser{}
case "glm-4.7":
return &GLM47Parser{}
case "glm-ocr":
return &GlmOcrParser{}
case "lfm2":
return &LFM2Parser{hasThinkingSupport: false}
case "lfm2-thinking":

109
model/renderers/glmocr.go Normal file
View File

@@ -0,0 +1,109 @@
package renderers
import (
"encoding/json"
"fmt"
"strings"
"github.com/ollama/ollama/api"
)
type GlmOcrRenderer struct{}
func (r *GlmOcrRenderer) Render(messages []api.Message, tools []api.Tool, thinkValue *api.ThinkValue) (string, error) {
var sb strings.Builder
sb.WriteString("[gMASK]<sop>")
if len(tools) > 0 {
sb.WriteString("<|system|>\n")
sb.WriteString("# Tools\n\n")
sb.WriteString("You may call one or more functions to assist with the user query.\n\n")
sb.WriteString("You are provided with function signatures within <tools></tools> XML tags:\n")
sb.WriteString("<tools>\n")
for _, tool := range tools {
d, _ := json.Marshal(tool)
sb.WriteString(formatGLM47ToolJSON(d))
sb.WriteString("\n")
}
sb.WriteString("</tools>\n\n")
sb.WriteString("For each function call, output the function name and arguments within the following XML format:\n")
sb.WriteString("<tool_call>{function-name}<arg_key>{arg-key-1}</arg_key><arg_value>{arg-value-1}</arg_value><arg_key>{arg-key-2}</arg_key><arg_value>{arg-value-2}</arg_value>...</tool_call>")
}
enableThinking := false
thinkingExplicitlySet := false
if thinkValue != nil {
enableThinking = thinkValue.Bool()
thinkingExplicitlySet = true
}
for i, message := range messages {
switch message.Role {
case "user":
sb.WriteString("<|user|>\n")
sb.WriteString(message.Content)
if thinkingExplicitlySet && !enableThinking && !strings.HasSuffix(message.Content, "/nothink") {
sb.WriteString("/nothink")
}
case "assistant":
sb.WriteString("<|assistant|>\n")
if message.Thinking != "" {
sb.WriteString("<think>" + strings.TrimSpace(message.Thinking) + "</think>")
} else {
sb.WriteString("<think></think>")
}
if message.Content != "" {
sb.WriteString("\n" + strings.TrimSpace(message.Content))
}
if len(message.ToolCalls) > 0 {
for _, toolCall := range message.ToolCalls {
sb.WriteString("\n<tool_call>" + toolCall.Function.Name)
sb.WriteString(renderGlmOcrToolArguments(toolCall.Function.Arguments))
sb.WriteString("</tool_call>")
}
}
sb.WriteString("\n")
case "tool":
if i == 0 || messages[i-1].Role != "tool" {
sb.WriteString("<|observation|>")
}
sb.WriteString("\n<tool_response>\n")
sb.WriteString(message.Content)
sb.WriteString("\n</tool_response>\n")
case "system":
sb.WriteString("<|system|>\n")
sb.WriteString(message.Content)
sb.WriteString("\n")
}
}
sb.WriteString("<|assistant|>\n")
if thinkingExplicitlySet && !enableThinking {
sb.WriteString("<think></think>\n")
}
return sb.String(), nil
}
func renderGlmOcrToolArguments(args api.ToolCallFunctionArguments) string {
var sb strings.Builder
for key, value := range args.All() {
sb.WriteString("<arg_key>" + key + "</arg_key>")
var valueStr string
if str, ok := value.(string); ok {
valueStr = str
} else {
jsonBytes, err := json.Marshal(value)
if err != nil {
valueStr = fmt.Sprintf("%v", value)
} else {
valueStr = string(jsonBytes)
}
}
sb.WriteString("<arg_value>" + valueStr + "</arg_value>")
}
return sb.String()
}

View File

@@ -82,6 +82,8 @@ func rendererForName(name string) Renderer {
return &FunctionGemmaRenderer{}
case "glm-4.7":
return &GLM47Renderer{}
case "glm-ocr":
return &GlmOcrRenderer{}
case "lfm2":
return &LFM2Renderer{IsThinking: false}
case "lfm2-thinking":