mirror of https://github.com/ollama/ollama.git
synced 2026-02-17 02:53:51 -05:00

Compare commits

5 Commits

| SHA1 |
|---|
| 3a88f7eb20 |
| 0d5da826d4 |
| 9b795698b8 |
| 041fb77639 |
| 8224cce583 |
@@ -16,7 +16,7 @@ Start building with open models.
curl -fsSL https://ollama.com/install.sh | sh
```

or [download manually](http://localhost:8080/download/Ollama.dmg)
or [download manually](https://ollama.com/download/Ollama.dmg)

### Windows
@@ -3,5 +3,7 @@
package mlxrunner

import (
	_ "github.com/ollama/ollama/x/models/gemma3"
	_ "github.com/ollama/ollama/x/models/glm4_moe_lite"
	_ "github.com/ollama/ollama/x/models/llama"
)
92 x/mlxrunner/model/linear.go Normal file
@@ -0,0 +1,92 @@
//go:build mlx

package model

import (
	"github.com/ollama/ollama/x/mlxrunner/mlx"
	"github.com/ollama/ollama/x/models/nn"
)

// LinearFactory builds linear layers using shared tensor maps and quant defaults.
type LinearFactory struct {
	tensors          map[string]*mlx.Array
	defaultGroupSize int
	defaultBits      int
	defaultMode      string
	tensorQuant      map[string]*TensorQuantInfo
}

// NewLinearFactory creates a reusable constructor for model linear layers.
func NewLinearFactory(
	tensors map[string]*mlx.Array,
	defaultGroupSize, defaultBits int,
	defaultMode string,
	tensorQuant map[string]*TensorQuantInfo,
) LinearFactory {
	return LinearFactory{
		tensors:          tensors,
		defaultGroupSize: defaultGroupSize,
		defaultBits:      defaultBits,
		defaultMode:      defaultMode,
		tensorQuant:      tensorQuant,
	}
}

// Make constructs a linear layer at path.
func (f LinearFactory) Make(path string) nn.LinearLayer {
	return MakeLinearLayer(
		f.tensors,
		path,
		f.defaultGroupSize,
		f.defaultBits,
		f.defaultMode,
		f.tensorQuant,
	)
}

// MakeLinearLayer constructs a linear layer from a tensor map.
//
// For quantized tensors (path.weight + path.weight_scale), it resolves per-tensor
// quant params via TensorQuant metadata (with shape-based affine fallback).
// For non-quantized tensors, it returns a standard nn.Linear.
func MakeLinearLayer(
	tensors map[string]*mlx.Array,
	path string,
	defaultGroupSize, defaultBits int,
	defaultMode string,
	tensorQuant map[string]*TensorQuantInfo,
) nn.LinearLayer {
	w := tensors[path+".weight"]
	if w == nil {
		return nil
	}

	scales := tensors[path+".weight_scale"]
	if scales != nil {
		qbiases := tensors[path+".weight_qbias"]
		bias := tensors[path+".bias"]

		groupSize, bits, mode := ResolveLinearQuantParams(
			defaultGroupSize,
			defaultBits,
			defaultMode,
			tensorQuant,
			path+".weight",
			w,
			scales,
		)

		return &nn.QuantizedLinear{
			Weight:    w,
			Scales:    scales,
			QBiases:   qbiases,
			Bias:      bias,
			GroupSize: groupSize,
			Bits:      bits,
			Mode:      mode,
		}
	}

	bias := tensors[path+".bias"]
	return nn.NewLinear(w, bias)
}
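As a rough usage sketch (the tensor map, layer path, and defaults below are illustrative, not taken from this change), a model's LoadWeights builds one factory from its config and then reuses it for every projection:

	// Illustrative only: assumes tensors came from the manifest loader and the
	// defaults came from model.QuantizationParams / Root metadata.
	linears := model.NewLinearFactory(tensors, 64, 8, "affine", root.AllTensorQuant())
	qProj := linears.Make("model.layers.0.self_attn.q_proj")
	if qProj == nil {
		// Make returns nil when path+".weight" is absent, so callers can treat
		// a missing projection as a load error.
	}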
130 x/mlxrunner/model/quant.go Normal file
@@ -0,0 +1,130 @@
//go:build mlx

package model

import (
	"strings"

	"github.com/ollama/ollama/x/mlxrunner/mlx"
)

// QuantizationParams returns default groupSize, bits, and mode for a quantization type.
func QuantizationParams(quantization string) (groupSize, bits int, mode string) {
	switch strings.ToUpper(quantization) {
	case "NVFP4":
		return 16, 4, "nvfp4"
	case "FP4", "Q4", "INT4":
		return 32, 4, "affine"
	case "MXFP8":
		return 32, 8, "mxfp8"
	case "FP8", "Q8", "INT8", "":
		return 64, 8, "affine"
	default:
		return 32, 8, "affine"
	}
}

// TensorQuantParams resolves quant params for a tensor using per-tensor metadata
// when available, otherwise falling back to the provided model defaults.
func TensorQuantParams(
	defaultGroupSize, defaultBits int,
	defaultMode string,
	tensorQuant map[string]*TensorQuantInfo,
	tensorName string,
) (groupSize, bits int, mode string, fromTensor bool) {
	if tensorQuant != nil {
		if tq := tensorQuant[tensorName]; tq != nil {
			groupSize, bits, mode = QuantizationParams(tq.QuantType)
			if tq.GroupSize > 0 {
				groupSize = tq.GroupSize
			}
			return groupSize, bits, mode, true
		}
	}
	return defaultGroupSize, defaultBits, defaultMode, false
}

// ResolveLinearQuantParams resolves quantization params for a quantized linear
// tensor, preferring per-tensor metadata and falling back to shape-based
// inference for affine packed tensors.
func ResolveLinearQuantParams(
	defaultGroupSize, defaultBits int,
	defaultMode string,
	tensorQuant map[string]*TensorQuantInfo,
	tensorName string,
	weight, scales *mlx.Array,
) (groupSize, bits int, mode string) {
	groupSize, bits, mode, fromTensor := TensorQuantParams(
		defaultGroupSize,
		defaultBits,
		defaultMode,
		tensorQuant,
		tensorName,
	)

	if mode == "affine" {
		if inferredGroupSize, inferredBits, ok := InferAffineQuantParamsFromShapes(weight, scales, bits); ok {
			if !fromTensor || groupSize == 0 || bits == 0 {
				groupSize = inferredGroupSize
				bits = inferredBits
			}
		}
	}

	return groupSize, bits, mode
}

// InferAffineQuantParamsFromShapes infers (groupSize,bits) for affine quantized
// tensors from packed weight and scale shapes.
func InferAffineQuantParamsFromShapes(weight, scales *mlx.Array, hintBits int) (groupSize, bits int, ok bool) {
	if weight == nil || scales == nil {
		return 0, 0, false
	}

	weightShape := weight.Dims()
	scaleShape := scales.Dims()
	if len(weightShape) == 0 || len(scaleShape) == 0 {
		return 0, 0, false
	}

	weightCols := weightShape[len(weightShape)-1]
	scalesCols := scaleShape[len(scaleShape)-1]
	if weightCols <= 0 || scalesCols <= 0 {
		return 0, 0, false
	}

	groupSize4 := weightCols * 8 / scalesCols
	groupSize8 := weightCols * 4 / scalesCols

	switch {
	case groupSize4 == 32:
		return 32, 4, true
	case groupSize8 == 64:
		return 64, 8, true
	case groupSize4 == 64 && groupSize8 == 32:
		if hintBits == 8 {
			return 32, 8, true
		}
		if hintBits == 4 {
			return 64, 4, true
		}
	}

	if isCommonGroupSize(groupSize4) && !isCommonGroupSize(groupSize8) {
		return groupSize4, 4, true
	}
	if isCommonGroupSize(groupSize8) && !isCommonGroupSize(groupSize4) {
		return groupSize8, 8, true
	}

	return 0, 0, false
}

func isCommonGroupSize(v int) bool {
	switch v {
	case 16, 32, 64, 128:
		return true
	default:
		return false
	}
}
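To make the shape-based fallback concrete, here is a small worked example; the shapes are hypothetical and only illustrate the arithmetic, which assumes affine weights packed into 32-bit words (8 values per word at 4 bits, 4 values per word at 8 bits):

	// Hypothetical packed weight of shape [4096, 512] with scales of shape [4096, 128].
	weightCols, scalesCols := 512, 128
	groupSize4 := weightCols * 8 / scalesCols // 32: candidate group size if the tensor were 4-bit
	groupSize8 := weightCols * 4 / scalesCols // 16: candidate group size if the tensor were 8-bit
	// groupSize4 == 32 matches the first switch case, so the tensor is inferred
	// as 4-bit affine with group size 32.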
@@ -8,42 +8,63 @@ import (
	"fmt"
	"io"
	"os"
	"sort"
	"strconv"
	"strings"

	"github.com/ollama/ollama/x/imagegen/manifest"
)

// Root wraps a ModelManifest with pre-scanned quantization metadata.
type Root struct {
	Manifest *manifest.ModelManifest
	quantType string
	groupSize int
// TensorQuantInfo describes per-tensor quantization metadata.
type TensorQuantInfo struct {
	QuantType string
	GroupSize int
}

// Open loads a manifest for the given model name and pre-scans the first
// tensor blob for quantization metadata (quant_type, group_size).
// Root wraps a ModelManifest with pre-scanned quantization metadata.
type Root struct {
	Manifest *manifest.ModelManifest

	// Backwards-compatible model-level quant metadata (first tensor blob).
	quantType string
	groupSize int

	// Per-tensor quantization metadata.
	tensorQuant map[string]*TensorQuantInfo
}

// Open loads a manifest for the given model name and scans tensor blobs for
// quantization metadata.
func Open(modelName string) (*Root, error) {
	m, err := manifest.LoadManifest(modelName)
	if err != nil {
		return nil, err
	}

	root := &Root{Manifest: m}
	root := &Root{
		Manifest:    m,
		tensorQuant: make(map[string]*TensorQuantInfo),
	}

	// Pre-scan first tensor blob for quantization metadata
	for _, layer := range m.GetTensorLayers("") {
		blobPath := m.BlobPath(layer.Digest)
		meta, err := readBlobMetadata(blobPath)
		if err != nil || meta == nil {

		infos, blobQuantType, blobGroupSize, err := readBlobTensorQuantInfo(blobPath)
		if err != nil {
			continue
		}
		if qt := meta["quant_type"]; qt != "" {
			root.quantType = strings.ToUpper(qt)

		for name, info := range infos {
			root.tensorQuant[name] = info
		}
		if gs := meta["group_size"]; gs != "" {
			fmt.Sscanf(gs, "%d", &root.groupSize)

		if root.quantType == "" && blobQuantType != "" {
			root.quantType = strings.ToUpper(blobQuantType)
			root.groupSize = blobGroupSize
			if root.groupSize == 0 {
				root.groupSize = defaultGroupSize(root.quantType)
			}
		}
		break // only check the first tensor blob
	}

	return root, nil
@@ -52,46 +73,180 @@ func Open(modelName string) (*Root, error) {
// Close is a no-op for now (future: release resources).
func (r *Root) Close() {}

// QuantType returns the quantization type detected from tensor metadata.
// QuantType returns the quantization type detected from the first tensor blob metadata.
func (r *Root) QuantType() string { return r.quantType }

// GroupSize returns the quantization group size detected from tensor metadata.
// GroupSize returns the quantization group size detected from the first tensor blob metadata.
func (r *Root) GroupSize() int { return r.groupSize }

// readBlobMetadata reads the __metadata__ from a safetensors blob header.
func readBlobMetadata(path string) (map[string]string, error) {
// TensorQuant returns per-tensor quantization metadata if available.
func (r *Root) TensorQuant(name string) *TensorQuantInfo {
	if r == nil {
		return nil
	}
	return r.tensorQuant[name]
}

// AllTensorQuant returns a copy of the per-tensor quantization metadata.
func (r *Root) AllTensorQuant() map[string]*TensorQuantInfo {
	out := make(map[string]*TensorQuantInfo, len(r.tensorQuant))
	for k, v := range r.tensorQuant {
		if v == nil {
			continue
		}
		copy := *v
		out[k] = &copy
	}
	return out
}

func defaultGroupSize(quantType string) int {
	groupSize, _, _ := QuantizationParams(quantType)
	return groupSize
}

func readBlobTensorQuantInfo(path string) (map[string]*TensorQuantInfo, string, int, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, err
		return nil, "", 0, err
	}
	defer f.Close()

	var headerSize uint64
	if err := binary.Read(f, binary.LittleEndian, &headerSize); err != nil {
		return nil, err
		return nil, "", 0, err
	}
	if headerSize > 1024*1024 {
		return nil, fmt.Errorf("header too large: %d", headerSize)
	if headerSize > 100*1024*1024 {
		return nil, "", 0, fmt.Errorf("header too large: %d", headerSize)
	}

	data := make([]byte, headerSize)
	if _, err := io.ReadFull(f, data); err != nil {
		return nil, err
		return nil, "", 0, err
	}

	var header map[string]json.RawMessage
	if err := json.Unmarshal(data, &header); err != nil {
		return nil, err
		return nil, "", 0, err
	}

	globalQuantType, globalGroupSize := parseGlobalQuantMetadata(header)
	globalQuantType = strings.ToUpper(globalQuantType)

	mainNames := mainTensorNames(header)
	infos := make(map[string]*TensorQuantInfo)
	for _, name := range mainNames {
		if _, ok := header[name+".scale"]; !ok {
			continue
		}

		quantType := globalQuantType
		groupSize := globalGroupSize

		inferredType, inferredGroup := inferQuantTypeFromShapes(header, name, quantType)
		if quantType == "" {
			quantType = inferredType
		}
		if groupSize == 0 {
			groupSize = inferredGroup
		}
		if quantType == "" {
			continue
		}
		if groupSize == 0 {
			groupSize = defaultGroupSize(quantType)
		}

		infos[name] = &TensorQuantInfo{QuantType: quantType, GroupSize: groupSize}
	}

	return infos, globalQuantType, globalGroupSize, nil
}

func parseGlobalQuantMetadata(header map[string]json.RawMessage) (quantType string, groupSize int) {
	metaRaw, ok := header["__metadata__"]
	if !ok {
		return nil, nil
		return "", 0
	}

	var meta map[string]string
	if err := json.Unmarshal(metaRaw, &meta); err != nil {
		return nil, err
		return "", 0
	}
	return meta, nil

	quantType = meta["quant_type"]
	if gs := meta["group_size"]; gs != "" {
		groupSize, _ = strconv.Atoi(gs)
	}
	return quantType, groupSize
}

func mainTensorNames(header map[string]json.RawMessage) []string {
	names := make([]string, 0, len(header))
	for name := range header {
		if name == "__metadata__" || strings.HasSuffix(name, ".scale") || strings.HasSuffix(name, ".bias") {
			continue
		}
		names = append(names, name)
	}
	sort.Strings(names)
	return names
}

func inferQuantTypeFromShapes(header map[string]json.RawMessage, tensorName string, hintQuantType string) (string, int) {
	type tensorShape struct {
		Shape []int64 `json:"shape"`
	}

	mainRaw, ok := header[tensorName]
	if !ok {
		return "", 0
	}
	scaleRaw, ok := header[tensorName+".scale"]
	if !ok {
		return "", 0
	}

	var mainInfo tensorShape
	if err := json.Unmarshal(mainRaw, &mainInfo); err != nil || len(mainInfo.Shape) == 0 {
		return "", 0
	}

	var scaleInfo tensorShape
	if err := json.Unmarshal(scaleRaw, &scaleInfo); err != nil || len(scaleInfo.Shape) == 0 {
		return "", 0
	}

	weightCols := int(mainInfo.Shape[len(mainInfo.Shape)-1])
	scalesCols := int(scaleInfo.Shape[len(scaleInfo.Shape)-1])
	if weightCols <= 0 || scalesCols <= 0 {
		return "", 0
	}

	groupSize4 := weightCols * 8 / scalesCols
	groupSize8 := weightCols * 4 / scalesCols

	switch {
	case groupSize4 == 32:
		return "INT4", 32
	case groupSize8 == 64:
		return "INT8", 64
	case groupSize4 == 64 && groupSize8 == 32:
		h := strings.ToUpper(hintQuantType)
		if strings.Contains(h, "8") {
			return "INT8", 32
		}
		if strings.Contains(h, "4") {
			return "INT4", 64
		}
	}

	if isCommonGroupSize(groupSize4) && !isCommonGroupSize(groupSize8) {
		return "INT4", groupSize4
	}
	if isCommonGroupSize(groupSize8) && !isCommonGroupSize(groupSize4) {
		return "INT8", groupSize8
	}

	return "", 0
}
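Downstream model constructors consume this scan through Root; a minimal sketch, assuming a model named "gemma3" is installed and using a hypothetical tensor path:

	root, err := model.Open("gemma3") // loads the manifest and scans tensor blobs
	if err != nil {
		// handle the error
	}
	defer root.Close()
	// Per-tensor metadata, when present, takes precedence over the model-level
	// QuantType()/GroupSize() defaults.
	if tq := root.TensorQuant("model.layers.0.mlp.up_proj.weight"); tq != nil {
		fmt.Println(tq.QuantType, tq.GroupSize)
	}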
@@ -24,9 +24,13 @@ func (r *Runner) TextGenerationPipeline(request Request) error {

	caches, tokens := r.FindNearestCache(inputs)
	if len(caches) == 0 {
		caches = make([]cache.Cache, r.Model.NumLayers())
		for i := range caches {
			caches[i] = cache.NewKVCache()
		if cacheFactory, ok := r.Model.(interface{ NewCaches() []cache.Cache }); ok {
			caches = cacheFactory.NewCaches()
		} else {
			caches = make([]cache.Cache, r.Model.NumLayers())
			for i := range caches {
				caches[i] = cache.NewKVCache()
			}
		}
	}
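The runner probes for a model-provided cache layout with a plain interface assertion, so models opt in simply by exposing the method; a minimal sketch of the shape it looks for (the interface name here is illustrative):

	// Any model implementing this optional interface controls its own cache
	// layout (for example, rotating caches for sliding-window layers); otherwise
	// the runner falls back to one KVCache per layer.
	type cacheFactory interface {
		NewCaches() []cache.Cache
	}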
521 x/models/gemma3/gemma3.go Normal file
@@ -0,0 +1,521 @@
|
||||
//go:build mlx
|
||||
|
||||
// Package gemma3 provides the Gemma 3 text model implementation for MLX.
|
||||
package gemma3
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"math"
|
||||
|
||||
"github.com/ollama/ollama/x/imagegen/tokenizer"
|
||||
"github.com/ollama/ollama/x/mlxrunner/cache"
|
||||
"github.com/ollama/ollama/x/mlxrunner/mlx"
|
||||
"github.com/ollama/ollama/x/mlxrunner/model"
|
||||
"github.com/ollama/ollama/x/mlxrunner/model/base"
|
||||
"github.com/ollama/ollama/x/models/nn"
|
||||
)
|
||||
|
||||
func init() {
|
||||
base.Register("Gemma3ForCausalLM", newModel)
|
||||
base.Register("Gemma3ForConditionalGeneration", newModel)
|
||||
}
|
||||
|
||||
// TextConfig holds configuration for the Gemma 3 text model.
|
||||
type TextConfig struct {
|
||||
HiddenSize int32 `json:"hidden_size"`
|
||||
NumHiddenLayers int32 `json:"num_hidden_layers"`
|
||||
IntermediateSize int32 `json:"intermediate_size"`
|
||||
NumAttentionHeads int32 `json:"num_attention_heads"`
|
||||
NumKeyValueHeads int32 `json:"num_key_value_heads"`
|
||||
HeadDim int32 `json:"head_dim"`
|
||||
VocabSize int32 `json:"vocab_size"`
|
||||
RMSNormEps float32 `json:"rms_norm_eps"`
|
||||
RopeTheta float32 `json:"rope_theta"`
|
||||
RopeLocalBaseFreq float32 `json:"rope_local_base_freq"`
|
||||
MaxPositionEmbeddings int32 `json:"max_position_embeddings"`
|
||||
SlidingWindow int32 `json:"sliding_window"`
|
||||
SlidingWindowPattern int32 `json:"sliding_window_pattern"`
|
||||
LayerTypes []string `json:"layer_types"`
|
||||
TieWordEmbeddings bool `json:"tie_word_embeddings"`
|
||||
|
||||
// Quantization parameters (set during load based on model quantization).
|
||||
QuantGroupSize int `json:"-"`
|
||||
QuantBits int `json:"-"`
|
||||
QuantMode string `json:"-"`
|
||||
TensorQuant map[string]*model.TensorQuantInfo `json:"-"`
|
||||
|
||||
// Computed fields.
|
||||
Scale float32 `json:"-"`
|
||||
}
|
||||
|
||||
// Attention implements Gemma 3 attention with Q/K normalization.
|
||||
type Attention struct {
|
||||
QProj nn.LinearLayer
|
||||
KProj nn.LinearLayer
|
||||
VProj nn.LinearLayer
|
||||
OProj nn.LinearLayer
|
||||
|
||||
QNorm *nn.RMSNorm
|
||||
KNorm *nn.RMSNorm
|
||||
|
||||
// Precomputed (1 + weight) for Gemma-style RMSNorm.
|
||||
QNormScaled *mlx.Array
|
||||
KNormScaled *mlx.Array
|
||||
}
|
||||
|
||||
// MLP is the feed-forward network with GELU activation.
|
||||
type MLP struct {
|
||||
GateProj nn.LinearLayer
|
||||
UpProj nn.LinearLayer
|
||||
DownProj nn.LinearLayer
|
||||
}
|
||||
|
||||
// DecoderLayer is a single transformer block.
|
||||
type DecoderLayer struct {
|
||||
InputNorm *nn.RMSNorm
|
||||
Attention *Attention
|
||||
PostAttnNorm *nn.RMSNorm
|
||||
PreFFNorm *nn.RMSNorm
|
||||
MLP *MLP
|
||||
PostFFNorm *nn.RMSNorm
|
||||
|
||||
// Precomputed (1 + weight) for Gemma-style RMSNorm.
|
||||
InputNormScaled *mlx.Array
|
||||
PostAttnNormScaled *mlx.Array
|
||||
PreFFNormScaled *mlx.Array
|
||||
PostFFNormScaled *mlx.Array
|
||||
|
||||
// Layer metadata.
|
||||
IsSliding bool
|
||||
LayerIdx int32
|
||||
}
|
||||
|
||||
// Model is the Gemma 3 text-only model.
|
||||
type Model struct {
|
||||
EmbedTokens *nn.Embedding
|
||||
Layers []*DecoderLayer
|
||||
Norm *nn.RMSNorm
|
||||
LMHead nn.LinearLayer
|
||||
|
||||
// Precomputed (1 + weight) for Gemma-style RMSNorm.
|
||||
NormScaled *mlx.Array
|
||||
|
||||
tok *tokenizer.Tokenizer
|
||||
*TextConfig
|
||||
|
||||
weightPrefix string
|
||||
}
|
||||
|
||||
func defaultHeads(numLayers int32) (numHeads, numKVHeads int32) {
|
||||
switch numLayers {
|
||||
case 34:
|
||||
return 8, 4
|
||||
case 48:
|
||||
return 16, 8
|
||||
case 62:
|
||||
return 32, 16
|
||||
default:
|
||||
return 8, 4
|
||||
}
|
||||
}
|
||||
|
||||
func parseTextConfig(configData []byte) (TextConfig, bool, error) {
|
||||
var cfg TextConfig
|
||||
if err := json.Unmarshal(configData, &cfg); err != nil {
|
||||
return TextConfig{}, false, fmt.Errorf("parse config: %w", err)
|
||||
}
|
||||
|
||||
var wrapped struct {
|
||||
TextConfig *TextConfig `json:"text_config"`
|
||||
}
|
||||
if err := json.Unmarshal(configData, &wrapped); err != nil {
|
||||
return TextConfig{}, false, fmt.Errorf("parse nested text config: %w", err)
|
||||
}
|
||||
|
||||
fromConditional := wrapped.TextConfig != nil
|
||||
if fromConditional {
|
||||
cfg = *wrapped.TextConfig
|
||||
|
||||
if cfg.HeadDim == 0 {
|
||||
cfg.HeadDim = 256
|
||||
}
|
||||
if cfg.NumAttentionHeads == 0 {
|
||||
cfg.NumAttentionHeads, cfg.NumKeyValueHeads = defaultHeads(cfg.NumHiddenLayers)
|
||||
}
|
||||
if cfg.NumKeyValueHeads == 0 {
|
||||
_, cfg.NumKeyValueHeads = defaultHeads(cfg.NumHiddenLayers)
|
||||
}
|
||||
if cfg.VocabSize == 0 {
|
||||
cfg.VocabSize = 262208
|
||||
}
|
||||
if cfg.SlidingWindowPattern == 0 && len(cfg.LayerTypes) == 0 {
|
||||
cfg.SlidingWindowPattern = 6
|
||||
}
|
||||
if cfg.MaxPositionEmbeddings == 0 {
|
||||
cfg.MaxPositionEmbeddings = 131072
|
||||
}
|
||||
}
|
||||
|
||||
if cfg.HeadDim == 0 {
|
||||
cfg.HeadDim = 256
|
||||
}
|
||||
if cfg.NumAttentionHeads == 0 {
|
||||
cfg.NumAttentionHeads, cfg.NumKeyValueHeads = defaultHeads(cfg.NumHiddenLayers)
|
||||
}
|
||||
if cfg.NumKeyValueHeads == 0 {
|
||||
cfg.NumKeyValueHeads = max(1, cfg.NumAttentionHeads/2)
|
||||
}
|
||||
if cfg.RopeTheta == 0 {
|
||||
cfg.RopeTheta = 1000000
|
||||
}
|
||||
if cfg.RopeLocalBaseFreq == 0 {
|
||||
cfg.RopeLocalBaseFreq = 10000
|
||||
}
|
||||
if cfg.RMSNormEps == 0 {
|
||||
cfg.RMSNormEps = 1e-6
|
||||
}
|
||||
if cfg.VocabSize == 0 {
|
||||
cfg.VocabSize = 262208
|
||||
}
|
||||
|
||||
cfg.Scale = float32(1.0 / math.Sqrt(float64(cfg.HeadDim)))
|
||||
|
||||
return cfg, fromConditional, nil
|
||||
}
|
||||
|
||||
func resolveWeightPrefix(tensors map[string]*mlx.Array) string {
|
||||
for _, prefix := range []string{"", "language_model."} {
|
||||
if tensors[prefix+"model.embed_tokens.weight"] != nil {
|
||||
return prefix
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func isLayerSliding(layerIdx int32, cfg *TextConfig) bool {
|
||||
if len(cfg.LayerTypes) > 0 && int(layerIdx) < len(cfg.LayerTypes) {
|
||||
return cfg.LayerTypes[layerIdx] == "sliding_attention"
|
||||
}
|
||||
if cfg.SlidingWindowPattern <= 0 {
|
||||
return false
|
||||
}
|
||||
return (layerIdx+1)%cfg.SlidingWindowPattern != 0
|
||||
}
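As a concrete reading of the fallback branch (the pattern value is just the config default set in parseTextConfig):

	// Illustration only: with SlidingWindowPattern == 6 and no layer_types list,
	// layers 0-4 use sliding attention, layer 5 is global, then the pattern repeats.
	isSliding := func(layerIdx, pattern int32) bool { return (layerIdx+1)%pattern != 0 }
	_ = isSliding(5, 6) // false: every sixth layer attends globally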
|
||||
|
||||
func precomputeGemmaScaledWeights(m *Model) {
|
||||
if m.Norm != nil {
|
||||
m.NormScaled = mlx.AddScalar(m.Norm.Weight, 1.0)
|
||||
}
|
||||
|
||||
var scaled []*mlx.Array
|
||||
if m.NormScaled != nil {
|
||||
scaled = append(scaled, m.NormScaled)
|
||||
}
|
||||
|
||||
for _, layer := range m.Layers {
|
||||
if layer == nil || layer.Attention == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
if layer.InputNorm != nil {
|
||||
layer.InputNormScaled = mlx.AddScalar(layer.InputNorm.Weight, 1.0)
|
||||
scaled = append(scaled, layer.InputNormScaled)
|
||||
}
|
||||
if layer.PostAttnNorm != nil {
|
||||
layer.PostAttnNormScaled = mlx.AddScalar(layer.PostAttnNorm.Weight, 1.0)
|
||||
scaled = append(scaled, layer.PostAttnNormScaled)
|
||||
}
|
||||
if layer.PreFFNorm != nil {
|
||||
layer.PreFFNormScaled = mlx.AddScalar(layer.PreFFNorm.Weight, 1.0)
|
||||
scaled = append(scaled, layer.PreFFNormScaled)
|
||||
}
|
||||
if layer.PostFFNorm != nil {
|
||||
layer.PostFFNormScaled = mlx.AddScalar(layer.PostFFNorm.Weight, 1.0)
|
||||
scaled = append(scaled, layer.PostFFNormScaled)
|
||||
}
|
||||
|
||||
if layer.Attention.QNorm != nil {
|
||||
layer.Attention.QNormScaled = mlx.AddScalar(layer.Attention.QNorm.Weight, 1.0)
|
||||
scaled = append(scaled, layer.Attention.QNormScaled)
|
||||
}
|
||||
if layer.Attention.KNorm != nil {
|
||||
layer.Attention.KNormScaled = mlx.AddScalar(layer.Attention.KNorm.Weight, 1.0)
|
||||
scaled = append(scaled, layer.Attention.KNormScaled)
|
||||
}
|
||||
}
|
||||
|
||||
if len(scaled) > 0 {
|
||||
mlx.Eval(scaled...)
|
||||
}
|
||||
}
|
||||
|
||||
func newModel(root *model.Root) (base.Model, error) {
|
||||
configData, err := root.Manifest.ReadConfig("config.json")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("load config: %w", err)
|
||||
}
|
||||
|
||||
cfg, _, err := parseTextConfig(configData)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if qt := root.QuantType(); qt != "" {
|
||||
cfg.QuantGroupSize, cfg.QuantBits, cfg.QuantMode = model.QuantizationParams(qt)
|
||||
if gs := root.GroupSize(); gs > 0 {
|
||||
cfg.QuantGroupSize = gs
|
||||
}
|
||||
} else {
|
||||
cfg.QuantGroupSize, cfg.QuantBits, cfg.QuantMode = model.QuantizationParams("")
|
||||
}
|
||||
cfg.TensorQuant = root.AllTensorQuant()
|
||||
|
||||
tokData, err := root.Manifest.ReadConfig("tokenizer.json")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("load tokenizer config: %w", err)
|
||||
}
|
||||
|
||||
tokConfig := &tokenizer.TokenizerConfig{ConfigJSON: configData}
|
||||
if genConfigData, err := root.Manifest.ReadConfig("generation_config.json"); err == nil {
|
||||
tokConfig.GenerationConfigJSON = genConfigData
|
||||
}
|
||||
if tokConfigData, err := root.Manifest.ReadConfig("tokenizer_config.json"); err == nil {
|
||||
tokConfig.TokenizerConfigJSON = tokConfigData
|
||||
}
|
||||
|
||||
tok, err := tokenizer.LoadFromBytesWithConfig(tokData, tokConfig)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("parse tokenizer: %w", err)
|
||||
}
|
||||
|
||||
m := &Model{
|
||||
Layers: make([]*DecoderLayer, cfg.NumHiddenLayers),
|
||||
TextConfig: &cfg,
|
||||
tok: tok,
|
||||
}
|
||||
|
||||
for i := range m.Layers {
|
||||
m.Layers[i] = &DecoderLayer{
|
||||
LayerIdx: int32(i),
|
||||
IsSliding: isLayerSliding(int32(i), m.TextConfig),
|
||||
}
|
||||
}
|
||||
|
||||
return m, nil
|
||||
}
|
||||
|
||||
// LoadWeights receives all tensors loaded from the manifest and assigns them
|
||||
// to model fields.
|
||||
func (m *Model) LoadWeights(tensors map[string]*mlx.Array) error {
|
||||
m.weightPrefix = resolveWeightPrefix(tensors)
|
||||
prefix := m.weightPrefix
|
||||
linears := model.NewLinearFactory(tensors, m.QuantGroupSize, m.QuantBits, m.QuantMode, m.TensorQuant)
|
||||
|
||||
embedWeight := tensors[prefix+"model.embed_tokens.weight"]
|
||||
if embedWeight == nil {
|
||||
return fmt.Errorf("missing embedding weight: %smodel.embed_tokens.weight", prefix)
|
||||
}
|
||||
m.EmbedTokens = nn.NewEmbedding(embedWeight)
|
||||
|
||||
normWeight := tensors[prefix+"model.norm.weight"]
|
||||
if normWeight == nil {
|
||||
return fmt.Errorf("missing final norm weight: %smodel.norm.weight", prefix)
|
||||
}
|
||||
m.Norm = nn.NewRMSNorm(normWeight, m.RMSNormEps)
|
||||
|
||||
if lmHead := linears.Make(prefix + "lm_head"); lmHead != nil {
|
||||
m.LMHead = lmHead
|
||||
} else if lmHead := linears.Make("lm_head"); lmHead != nil {
|
||||
m.LMHead = lmHead
|
||||
} else {
|
||||
// Gemma usually ties output projection to embeddings.
|
||||
m.LMHead = nn.NewLinear(embedWeight, nil)
|
||||
}
|
||||
|
||||
for i := int32(0); i < m.NumHiddenLayers; i++ {
|
||||
layerPrefix := fmt.Sprintf("%smodel.layers.%d", prefix, i)
|
||||
|
||||
layer := &DecoderLayer{
|
||||
LayerIdx: i,
|
||||
IsSliding: isLayerSliding(i, m.TextConfig),
|
||||
Attention: &Attention{},
|
||||
MLP: &MLP{},
|
||||
}
|
||||
|
||||
if w := tensors[layerPrefix+".input_layernorm.weight"]; w != nil {
|
||||
layer.InputNorm = nn.NewRMSNorm(w, m.RMSNormEps)
|
||||
}
|
||||
if w := tensors[layerPrefix+".post_attention_layernorm.weight"]; w != nil {
|
||||
layer.PostAttnNorm = nn.NewRMSNorm(w, m.RMSNormEps)
|
||||
}
|
||||
if w := tensors[layerPrefix+".pre_feedforward_layernorm.weight"]; w != nil {
|
||||
layer.PreFFNorm = nn.NewRMSNorm(w, m.RMSNormEps)
|
||||
}
|
||||
if w := tensors[layerPrefix+".post_feedforward_layernorm.weight"]; w != nil {
|
||||
layer.PostFFNorm = nn.NewRMSNorm(w, m.RMSNormEps)
|
||||
}
|
||||
|
||||
layer.Attention.QProj = linears.Make(layerPrefix + ".self_attn.q_proj")
|
||||
layer.Attention.KProj = linears.Make(layerPrefix + ".self_attn.k_proj")
|
||||
layer.Attention.VProj = linears.Make(layerPrefix + ".self_attn.v_proj")
|
||||
layer.Attention.OProj = linears.Make(layerPrefix + ".self_attn.o_proj")
|
||||
|
||||
if w := tensors[layerPrefix+".self_attn.q_norm.weight"]; w != nil {
|
||||
layer.Attention.QNorm = nn.NewRMSNorm(w, m.RMSNormEps)
|
||||
}
|
||||
if w := tensors[layerPrefix+".self_attn.k_norm.weight"]; w != nil {
|
||||
layer.Attention.KNorm = nn.NewRMSNorm(w, m.RMSNormEps)
|
||||
}
|
||||
|
||||
layer.MLP.GateProj = linears.Make(layerPrefix + ".mlp.gate_proj")
|
||||
layer.MLP.UpProj = linears.Make(layerPrefix + ".mlp.up_proj")
|
||||
layer.MLP.DownProj = linears.Make(layerPrefix + ".mlp.down_proj")
|
||||
|
||||
if layer.InputNorm == nil {
|
||||
return fmt.Errorf("layer %d: missing input_layernorm", i)
|
||||
}
|
||||
if layer.PostAttnNorm == nil {
|
||||
return fmt.Errorf("layer %d: missing post_attention_layernorm", i)
|
||||
}
|
||||
if layer.PreFFNorm == nil {
|
||||
return fmt.Errorf("layer %d: missing pre_feedforward_layernorm", i)
|
||||
}
|
||||
if layer.PostFFNorm == nil {
|
||||
return fmt.Errorf("layer %d: missing post_feedforward_layernorm", i)
|
||||
}
|
||||
if layer.Attention.QProj == nil || layer.Attention.KProj == nil || layer.Attention.VProj == nil || layer.Attention.OProj == nil {
|
||||
return fmt.Errorf("layer %d: missing attention projections", i)
|
||||
}
|
||||
if layer.Attention.QNorm == nil || layer.Attention.KNorm == nil {
|
||||
return fmt.Errorf("layer %d: missing attention q/k norms", i)
|
||||
}
|
||||
if layer.MLP.GateProj == nil || layer.MLP.UpProj == nil || layer.MLP.DownProj == nil {
|
||||
return fmt.Errorf("layer %d: missing mlp projections", i)
|
||||
}
|
||||
|
||||
m.Layers[i] = layer
|
||||
}
|
||||
|
||||
precomputeGemmaScaledWeights(m)
|
||||
if m.NormScaled == nil {
|
||||
return fmt.Errorf("missing precomputed final norm weight")
|
||||
}
|
||||
collected := mlx.Collect(m)
|
||||
mlx.Eval(collected...)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *Model) Forward(tokens *mlx.Array, caches []cache.Cache) *mlx.Array {
|
||||
dims := tokens.Dims()
|
||||
B, L := int32(dims[0]), int32(dims[1])
|
||||
|
||||
h := m.EmbedTokens.Forward(tokens)
|
||||
h = mlx.MulScalar(h, float32(math.Sqrt(float64(m.HiddenSize))))
|
||||
|
||||
for i, layer := range m.Layers {
|
||||
var c cache.Cache
|
||||
if caches != nil && i < len(caches) {
|
||||
c = caches[i]
|
||||
}
|
||||
h = layer.Forward(h, c, B, L, m.TextConfig)
|
||||
}
|
||||
|
||||
return mlx.RMSNormFn(h, m.NormScaled, m.RMSNormEps)
|
||||
}
|
||||
|
||||
func (m *Model) Unembed(x *mlx.Array) *mlx.Array {
|
||||
return m.LMHead.Forward(x)
|
||||
}
|
||||
|
||||
func (m *Model) NumLayers() int {
|
||||
return len(m.Layers)
|
||||
}
|
||||
|
||||
func (m *Model) Tokenizer() *tokenizer.Tokenizer {
|
||||
return m.tok
|
||||
}
|
||||
|
||||
// NewCaches creates cache objects for all layers.
|
||||
func (m *Model) NewCaches() []cache.Cache {
|
||||
caches := make([]cache.Cache, len(m.Layers))
|
||||
for i, layer := range m.Layers {
|
||||
if m.SlidingWindow > 0 && layer.IsSliding {
|
||||
caches[i] = cache.NewRotatingKVCache(int(m.SlidingWindow))
|
||||
} else {
|
||||
caches[i] = cache.NewKVCache()
|
||||
}
|
||||
}
|
||||
return caches
|
||||
}
|
||||
|
||||
// FormatPrompt applies the Gemma 3 chat template.
|
||||
func (m *Model) FormatPrompt(prompt string) string {
|
||||
return fmt.Sprintf("<start_of_turn>user\n%s<end_of_turn>\n<start_of_turn>model\n", prompt)
|
||||
}
|
||||
|
||||
func (l *DecoderLayer) Forward(x *mlx.Array, c cache.Cache, B, L int32, cfg *TextConfig) *mlx.Array {
|
||||
normed := mlx.RMSNormFn(x, l.InputNormScaled, cfg.RMSNormEps)
|
||||
|
||||
attnOut := l.Attention.Forward(normed, c, B, L, l.IsSliding, cfg)
|
||||
attnOut = mlx.RMSNormFn(attnOut, l.PostAttnNormScaled, cfg.RMSNormEps)
|
||||
h := mlx.Add(x, attnOut)
|
||||
|
||||
normed = mlx.RMSNormFn(h, l.PreFFNormScaled, cfg.RMSNormEps)
|
||||
|
||||
mlpOut := l.MLP.Forward(normed)
|
||||
mlpOut = mlx.RMSNormFn(mlpOut, l.PostFFNormScaled, cfg.RMSNormEps)
|
||||
|
||||
return mlx.Add(h, mlpOut)
|
||||
}
|
||||
|
||||
func (a *Attention) Forward(x *mlx.Array, c cache.Cache, B, L int32, isSliding bool, cfg *TextConfig) *mlx.Array {
|
||||
q := a.QProj.Forward(x)
|
||||
k := a.KProj.Forward(x)
|
||||
v := a.VProj.Forward(x)
|
||||
|
||||
q = mlx.Reshape(q, B, L, cfg.NumAttentionHeads, cfg.HeadDim)
|
||||
q = mlx.Transpose(q, 0, 2, 1, 3)
|
||||
|
||||
k = mlx.Reshape(k, B, L, cfg.NumKeyValueHeads, cfg.HeadDim)
|
||||
k = mlx.Transpose(k, 0, 2, 1, 3)
|
||||
|
||||
v = mlx.Reshape(v, B, L, cfg.NumKeyValueHeads, cfg.HeadDim)
|
||||
v = mlx.Transpose(v, 0, 2, 1, 3)
|
||||
|
||||
q = mlx.RMSNormFn(q, a.QNormScaled, cfg.RMSNormEps)
|
||||
k = mlx.RMSNormFn(k, a.KNormScaled, cfg.RMSNormEps)
|
||||
|
||||
ropeTheta := cfg.RopeTheta
|
||||
if isSliding {
|
||||
ropeTheta = cfg.RopeLocalBaseFreq
|
||||
}
|
||||
|
||||
offset := 0
|
||||
if c != nil {
|
||||
offset = c.Offset()
|
||||
}
|
||||
q = mlx.RoPEWithBase(q, int(cfg.HeadDim), false, ropeTheta, 1.0, offset)
|
||||
k = mlx.RoPEWithBase(k, int(cfg.HeadDim), false, ropeTheta, 1.0, offset)
|
||||
|
||||
if c != nil {
|
||||
k, v = c.Update(k, v)
|
||||
}
|
||||
|
||||
repeatFactor := cfg.NumAttentionHeads / cfg.NumKeyValueHeads
|
||||
if repeatFactor > 1 {
|
||||
k = nn.RepeatKV(k, repeatFactor)
|
||||
v = nn.RepeatKV(v, repeatFactor)
|
||||
}
|
||||
|
||||
out := mlx.ScaledDotProductAttentionCausal(q, k, v, cfg.Scale, L > 1)
|
||||
out = mlx.Reshape(mlx.Transpose(out, 0, 2, 1, 3), B, L, cfg.NumAttentionHeads*cfg.HeadDim)
|
||||
return a.OProj.Forward(out)
|
||||
}
|
||||
|
||||
func (m *MLP) Forward(x *mlx.Array) *mlx.Array {
|
||||
gate := mlx.GELUApprox(m.GateProj.Forward(x))
|
||||
up := m.UpProj.Forward(x)
|
||||
return m.DownProj.Forward(mlx.Mul(gate, up))
|
||||
}
|
||||
@@ -8,7 +8,6 @@ import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"math"
|
||||
"strings"
|
||||
|
||||
"github.com/ollama/ollama/x/imagegen/tokenizer"
|
||||
"github.com/ollama/ollama/x/mlxrunner/cache"
|
||||
@@ -64,9 +63,10 @@ type Config struct {
|
||||
RopeScaling *RopeScaling `json:"rope_scaling"`
|
||||
|
||||
// Quantization parameters (set during load based on model quantization)
|
||||
QuantGroupSize int `json:"-"` // Group size for quantization (default 64)
|
||||
QuantBits int `json:"-"` // Bits per weight (4 or 8)
|
||||
QuantMode string `json:"-"` // Quantization mode ("affine", etc.)
|
||||
QuantGroupSize int `json:"-"` // Group size for quantization (default 64)
|
||||
QuantBits int `json:"-"` // Bits per weight (4 or 8)
|
||||
QuantMode string `json:"-"` // Quantization mode ("affine", etc.)
|
||||
TensorQuant map[string]*model.TensorQuantInfo `json:"-"`
|
||||
|
||||
// Computed fields
|
||||
QHeadDim int32 `json:"-"` // qk_nope_head_dim + qk_rope_head_dim
|
||||
@@ -372,22 +372,6 @@ func supportsGatherQMM(mode string, bits int) bool {
|
||||
return mode == "affine" && (bits == 4 || bits == 8)
|
||||
}
|
||||
|
||||
// quantizationParams returns groupSize, bits, mode for a quantization type string.
|
||||
func quantizationParams(quantization string) (groupSize, bits int, mode string) {
|
||||
switch strings.ToUpper(quantization) {
|
||||
case "NVFP4":
|
||||
return 16, 4, "nvfp4"
|
||||
case "FP4", "Q4", "INT4":
|
||||
return 32, 4, "affine"
|
||||
case "MXFP8":
|
||||
return 32, 8, "mxfp8"
|
||||
case "FP8", "Q8", "INT8", "":
|
||||
return 64, 8, "affine"
|
||||
default:
|
||||
return 32, 8, "affine"
|
||||
}
|
||||
}
|
||||
|
||||
// ExpertWeight holds a single expert's weight with optional quantization components.
|
||||
type ExpertWeight struct {
|
||||
Weight *mlx.Array
|
||||
@@ -408,7 +392,15 @@ func loadExpertWeight(tensors map[string]*mlx.Array, path string, useQuantized b
|
||||
if scales != nil {
|
||||
qbiases := tensors[path+".weight_qbias"]
|
||||
|
||||
groupSize, bits, mode := cfg.QuantGroupSize, cfg.QuantBits, cfg.QuantMode
|
||||
groupSize, bits, mode := model.ResolveLinearQuantParams(
|
||||
cfg.QuantGroupSize,
|
||||
cfg.QuantBits,
|
||||
cfg.QuantMode,
|
||||
cfg.TensorQuant,
|
||||
path+".weight",
|
||||
w,
|
||||
scales,
|
||||
)
|
||||
|
||||
if useQuantized && supportsGatherQMM(mode, bits) {
|
||||
return &ExpertWeight{Weight: w, Scales: scales, Biases: qbiases, Bits: bits, GroupSize: groupSize}
|
||||
@@ -492,7 +484,16 @@ func sanitizeMLAWeights(tensors map[string]*mlx.Array, prefix string, cfg *Confi
|
||||
// Check if quantized and dequantize
|
||||
if scales := tensors[path+".weight_scale"]; scales != nil {
|
||||
qbiases := tensors[path+".weight_qbias"]
|
||||
w = mlx.Dequantize(w, scales, qbiases, cfg.QuantGroupSize, cfg.QuantBits, cfg.QuantMode)
|
||||
groupSize, bits, mode := model.ResolveLinearQuantParams(
|
||||
cfg.QuantGroupSize,
|
||||
cfg.QuantBits,
|
||||
cfg.QuantMode,
|
||||
cfg.TensorQuant,
|
||||
path+".weight",
|
||||
w,
|
||||
scales,
|
||||
)
|
||||
w = mlx.Dequantize(w, scales, qbiases, groupSize, bits, mode)
|
||||
}
|
||||
|
||||
headDim := cfg.QKNopeHeadDim + cfg.VHeadDim
|
||||
@@ -507,32 +508,6 @@ func sanitizeMLAWeights(tensors map[string]*mlx.Array, prefix string, cfg *Confi
|
||||
return embedQ, unembedOut
|
||||
}
|
||||
|
||||
// makeLinear creates a Linear or QuantizedLinear layer from the tensor map.
|
||||
func makeLinear(tensors map[string]*mlx.Array, path string, cfg *Config) nn.LinearLayer {
|
||||
w := tensors[path+".weight"]
|
||||
if w == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
scales := tensors[path+".weight_scale"]
|
||||
if scales != nil {
|
||||
qbiases := tensors[path+".weight_qbias"]
|
||||
bias := tensors[path+".bias"]
|
||||
return &nn.QuantizedLinear{
|
||||
Weight: w,
|
||||
Scales: scales,
|
||||
QBiases: qbiases,
|
||||
Bias: bias,
|
||||
GroupSize: cfg.QuantGroupSize,
|
||||
Bits: cfg.QuantBits,
|
||||
Mode: cfg.QuantMode,
|
||||
}
|
||||
}
|
||||
|
||||
bias := tensors[path+".bias"]
|
||||
return nn.NewLinear(w, bias)
|
||||
}
|
||||
|
||||
// newModel creates a new GLM4-MoE-Lite model from a Root (config + tokenizer,
|
||||
// no weights loaded yet). Called by the registry via base.New().
|
||||
func newModel(root *model.Root) (base.Model, error) {
|
||||
@@ -551,13 +526,14 @@ func newModel(root *model.Root) (base.Model, error) {
|
||||
|
||||
// Set up quantization parameters from pre-scanned metadata
|
||||
if qt := root.QuantType(); qt != "" {
|
||||
_, cfg.QuantBits, cfg.QuantMode = quantizationParams(qt)
|
||||
cfg.QuantGroupSize, cfg.QuantBits, cfg.QuantMode = model.QuantizationParams(qt)
|
||||
if gs := root.GroupSize(); gs > 0 {
|
||||
cfg.QuantGroupSize = gs
|
||||
} else {
|
||||
cfg.QuantGroupSize, _, _ = quantizationParams(qt)
|
||||
}
|
||||
} else {
|
||||
cfg.QuantGroupSize, cfg.QuantBits, cfg.QuantMode = model.QuantizationParams("")
|
||||
}
|
||||
cfg.TensorQuant = root.AllTensorQuant()
|
||||
|
||||
// Load tokenizer
|
||||
tokData, err := root.Manifest.ReadConfig("tokenizer.json")
|
||||
@@ -596,7 +572,20 @@ func newModel(root *model.Root) (base.Model, error) {
|
||||
// layer creation.
|
||||
func (m *Model) LoadWeights(tensors map[string]*mlx.Array) error {
|
||||
cfg := m.Config
|
||||
linears := model.NewLinearFactory(tensors, cfg.QuantGroupSize, cfg.QuantBits, cfg.QuantMode, cfg.TensorQuant)
|
||||
useQuantized := supportsGatherQMM(cfg.QuantMode, cfg.QuantBits)
|
||||
if !useQuantized && cfg.TensorQuant != nil {
|
||||
for _, tq := range cfg.TensorQuant {
|
||||
if tq == nil {
|
||||
continue
|
||||
}
|
||||
_, bits, mode := model.QuantizationParams(tq.QuantType)
|
||||
if supportsGatherQMM(mode, bits) {
|
||||
useQuantized = true
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Load embedding
|
||||
if w := tensors["model.embed_tokens.weight"]; w != nil {
|
||||
@@ -609,7 +598,7 @@ func (m *Model) LoadWeights(tensors map[string]*mlx.Array) error {
|
||||
}
|
||||
|
||||
// Load LM head
|
||||
m.LMHead = makeLinear(tensors, "lm_head", cfg)
|
||||
m.LMHead = linears.Make("lm_head")
|
||||
|
||||
// Load layers
|
||||
for i := int32(0); i < cfg.NumHiddenLayers; i++ {
|
||||
@@ -617,16 +606,16 @@ func (m *Model) LoadWeights(tensors map[string]*mlx.Array) error {
|
||||
|
||||
// Load attention (same for both block types)
|
||||
attn := &MLAAttention{}
|
||||
attn.QAProj = makeLinear(tensors, prefix+".self_attn.q_a_proj", cfg)
|
||||
attn.QAProj = linears.Make(prefix + ".self_attn.q_a_proj")
|
||||
if w := tensors[prefix+".self_attn.q_a_layernorm.weight"]; w != nil {
|
||||
attn.QALayerNorm = nn.NewRMSNorm(w, cfg.RMSNormEps)
|
||||
}
|
||||
attn.QBProj = makeLinear(tensors, prefix+".self_attn.q_b_proj", cfg)
|
||||
attn.KVAProjWithMQA = makeLinear(tensors, prefix+".self_attn.kv_a_proj_with_mqa", cfg)
|
||||
attn.QBProj = linears.Make(prefix + ".self_attn.q_b_proj")
|
||||
attn.KVAProjWithMQA = linears.Make(prefix + ".self_attn.kv_a_proj_with_mqa")
|
||||
if w := tensors[prefix+".self_attn.kv_a_layernorm.weight"]; w != nil {
|
||||
attn.KVALayerNorm = nn.NewRMSNorm(w, cfg.RMSNormEps)
|
||||
}
|
||||
attn.OProj = makeLinear(tensors, prefix+".self_attn.o_proj", cfg)
|
||||
attn.OProj = linears.Make(prefix + ".self_attn.o_proj")
|
||||
|
||||
// Sanitize MLA weights for absorbed attention
|
||||
embedQ, unembedOut := sanitizeMLAWeights(tensors, prefix, cfg)
|
||||
@@ -647,9 +636,9 @@ func (m *Model) LoadWeights(tensors map[string]*mlx.Array) error {
|
||||
}
|
||||
|
||||
block.MLP = &DenseMLP{
|
||||
GateProj: makeLinear(tensors, prefix+".mlp.gate_proj", cfg),
|
||||
UpProj: makeLinear(tensors, prefix+".mlp.up_proj", cfg),
|
||||
DownProj: makeLinear(tensors, prefix+".mlp.down_proj", cfg),
|
||||
GateProj: linears.Make(prefix + ".mlp.gate_proj"),
|
||||
UpProj: linears.Make(prefix + ".mlp.up_proj"),
|
||||
DownProj: linears.Make(prefix + ".mlp.down_proj"),
|
||||
}
|
||||
|
||||
m.Layers[i] = block
|
||||
@@ -690,7 +679,7 @@ func (m *Model) LoadWeights(tensors map[string]*mlx.Array) error {
|
||||
}
|
||||
|
||||
moeGate := &MoEGate{}
|
||||
moeGate.Gate = makeLinear(tensors, prefix+".mlp.gate", cfg)
|
||||
moeGate.Gate = linears.Make(prefix + ".mlp.gate")
|
||||
if bias := tensors[prefix+".mlp.gate.e_score_correction_bias"]; bias != nil {
|
||||
moeGate.EScoreCorrectionBias = bias
|
||||
}
|
||||
@@ -703,9 +692,9 @@ func (m *Model) LoadWeights(tensors map[string]*mlx.Array) error {
|
||||
// Load shared experts if present
|
||||
if cfg.NSharedExperts > 0 {
|
||||
block.MoE.SharedExperts = &SharedExperts{
|
||||
GateProj: makeLinear(tensors, prefix+".mlp.shared_experts.gate_proj", cfg),
|
||||
UpProj: makeLinear(tensors, prefix+".mlp.shared_experts.up_proj", cfg),
|
||||
DownProj: makeLinear(tensors, prefix+".mlp.shared_experts.down_proj", cfg),
|
||||
GateProj: linears.Make(prefix + ".mlp.shared_experts.gate_proj"),
|
||||
UpProj: linears.Make(prefix + ".mlp.shared_experts.up_proj"),
|
||||
DownProj: linears.Make(prefix + ".mlp.shared_experts.down_proj"),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
323 x/models/llama/llama.go Normal file
@@ -0,0 +1,323 @@
|
||||
//go:build mlx
|
||||
|
||||
// Package llama provides a Llama-style decoder-only transformer for MLX.
|
||||
package llama
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"math"
|
||||
|
||||
"github.com/ollama/ollama/x/imagegen/tokenizer"
|
||||
"github.com/ollama/ollama/x/mlxrunner/cache"
|
||||
"github.com/ollama/ollama/x/mlxrunner/mlx"
|
||||
"github.com/ollama/ollama/x/mlxrunner/model"
|
||||
"github.com/ollama/ollama/x/mlxrunner/model/base"
|
||||
"github.com/ollama/ollama/x/models/nn"
|
||||
)
|
||||
|
||||
func init() {
|
||||
base.Register("LlamaForCausalLM", newModel)
|
||||
}
|
||||
|
||||
// Config holds Llama model configuration.
|
||||
type Config struct {
|
||||
HiddenSize int32 `json:"hidden_size"`
|
||||
NumHiddenLayers int32 `json:"num_hidden_layers"`
|
||||
IntermediateSize int32 `json:"intermediate_size"`
|
||||
NumAttentionHeads int32 `json:"num_attention_heads"`
|
||||
NumKeyValueHeads int32 `json:"num_key_value_heads"`
|
||||
VocabSize int32 `json:"vocab_size"`
|
||||
RMSNormEps float32 `json:"rms_norm_eps"`
|
||||
RopeTheta float32 `json:"rope_theta"`
|
||||
MaxPositionEmbeddings int32 `json:"max_position_embeddings"`
|
||||
TieWordEmbeddings bool `json:"tie_word_embeddings"`
|
||||
|
||||
// Quantization parameters (set during load based on model quantization).
|
||||
QuantGroupSize int `json:"-"`
|
||||
QuantBits int `json:"-"`
|
||||
QuantMode string `json:"-"`
|
||||
TensorQuant map[string]*model.TensorQuantInfo `json:"-"`
|
||||
|
||||
// Computed fields.
|
||||
HeadDim int32 `json:"-"`
|
||||
Scale float32 `json:"-"`
|
||||
}
|
||||
|
||||
// Model is a Llama text model.
|
||||
type Model struct {
|
||||
EmbedTokens *nn.Embedding
|
||||
Layers []*Layer
|
||||
Norm *nn.RMSNorm
|
||||
LMHead nn.LinearLayer
|
||||
|
||||
tok *tokenizer.Tokenizer
|
||||
*Config
|
||||
|
||||
weightPrefix string
|
||||
}
|
||||
|
||||
type Layer struct {
|
||||
Attention *Attention
|
||||
MLP *MLP
|
||||
AttentionNorm *nn.RMSNorm
|
||||
MLPNorm *nn.RMSNorm
|
||||
}
|
||||
|
||||
type Attention struct {
|
||||
QProj nn.LinearLayer
|
||||
KProj nn.LinearLayer
|
||||
VProj nn.LinearLayer
|
||||
OProj nn.LinearLayer
|
||||
}
|
||||
|
||||
type MLP struct {
|
||||
GateProj nn.LinearLayer
|
||||
UpProj nn.LinearLayer
|
||||
DownProj nn.LinearLayer
|
||||
}
|
||||
|
||||
func resolveWeightPrefix(tensors map[string]*mlx.Array) string {
|
||||
for _, prefix := range []string{"", "language_model."} {
|
||||
if tensors[prefix+"model.embed_tokens.weight"] != nil {
|
||||
return prefix
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func newModel(root *model.Root) (base.Model, error) {
|
||||
configData, err := root.Manifest.ReadConfig("config.json")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("load config: %w", err)
|
||||
}
|
||||
|
||||
var cfg Config
|
||||
if err := json.Unmarshal(configData, &cfg); err != nil {
|
||||
return nil, fmt.Errorf("parse config: %w", err)
|
||||
}
|
||||
|
||||
if cfg.HiddenSize <= 0 {
|
||||
return nil, fmt.Errorf("invalid hidden_size: %d", cfg.HiddenSize)
|
||||
}
|
||||
if cfg.NumAttentionHeads <= 0 {
|
||||
return nil, fmt.Errorf("invalid num_attention_heads: %d", cfg.NumAttentionHeads)
|
||||
}
|
||||
if cfg.NumKeyValueHeads <= 0 {
|
||||
cfg.NumKeyValueHeads = cfg.NumAttentionHeads
|
||||
}
|
||||
if cfg.HiddenSize%cfg.NumAttentionHeads != 0 {
|
||||
return nil, fmt.Errorf("hidden_size (%d) must be divisible by num_attention_heads (%d)", cfg.HiddenSize, cfg.NumAttentionHeads)
|
||||
}
|
||||
if cfg.HeadDim == 0 {
|
||||
cfg.HeadDim = cfg.HiddenSize / cfg.NumAttentionHeads
|
||||
}
|
||||
if cfg.HeadDim <= 0 {
|
||||
return nil, fmt.Errorf("invalid head_dim: %d", cfg.HeadDim)
|
||||
}
|
||||
if cfg.NumAttentionHeads%cfg.NumKeyValueHeads != 0 {
|
||||
return nil, fmt.Errorf("num_attention_heads (%d) must be divisible by num_key_value_heads (%d)", cfg.NumAttentionHeads, cfg.NumKeyValueHeads)
|
||||
}
|
||||
if cfg.RopeTheta == 0 {
|
||||
cfg.RopeTheta = 10000
|
||||
}
|
||||
if cfg.RMSNormEps == 0 {
|
||||
cfg.RMSNormEps = 1e-5
|
||||
}
|
||||
cfg.Scale = float32(1.0 / math.Sqrt(float64(cfg.HeadDim)))
|
||||
|
||||
if qt := root.QuantType(); qt != "" {
|
||||
cfg.QuantGroupSize, cfg.QuantBits, cfg.QuantMode = model.QuantizationParams(qt)
|
||||
if gs := root.GroupSize(); gs > 0 {
|
||||
cfg.QuantGroupSize = gs
|
||||
}
|
||||
} else {
|
||||
cfg.QuantGroupSize, cfg.QuantBits, cfg.QuantMode = model.QuantizationParams("")
|
||||
}
|
||||
cfg.TensorQuant = root.AllTensorQuant()
|
||||
|
||||
tokData, err := root.Manifest.ReadConfig("tokenizer.json")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("load tokenizer config: %w", err)
|
||||
}
|
||||
|
||||
tokConfig := &tokenizer.TokenizerConfig{
|
||||
ConfigJSON: configData,
|
||||
}
|
||||
if genConfigData, err := root.Manifest.ReadConfig("generation_config.json"); err == nil {
|
||||
tokConfig.GenerationConfigJSON = genConfigData
|
||||
}
|
||||
if tokConfigData, err := root.Manifest.ReadConfig("tokenizer_config.json"); err == nil {
|
||||
tokConfig.TokenizerConfigJSON = tokConfigData
|
||||
}
|
||||
|
||||
tok, err := tokenizer.LoadFromBytesWithConfig(tokData, tokConfig)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("parse tokenizer: %w", err)
|
||||
}
|
||||
|
||||
m := &Model{
|
||||
Layers: make([]*Layer, cfg.NumHiddenLayers),
|
||||
Config: &cfg,
|
||||
tok: tok,
|
||||
}
|
||||
|
||||
return m, nil
|
||||
}
|
||||
|
||||
// LoadWeights receives all tensors loaded from the manifest and assigns them
|
||||
// to model fields.
|
||||
func (m *Model) LoadWeights(tensors map[string]*mlx.Array) error {
|
||||
m.weightPrefix = resolveWeightPrefix(tensors)
|
||||
prefix := m.weightPrefix
|
||||
linears := model.NewLinearFactory(tensors, m.QuantGroupSize, m.QuantBits, m.QuantMode, m.TensorQuant)
|
||||
|
||||
embedWeight := tensors[prefix+"model.embed_tokens.weight"]
|
||||
if embedWeight == nil {
|
||||
return fmt.Errorf("missing embedding weight: %smodel.embed_tokens.weight", prefix)
|
||||
}
|
||||
m.EmbedTokens = nn.NewEmbedding(embedWeight)
|
||||
|
||||
normWeight := tensors[prefix+"model.norm.weight"]
|
||||
if normWeight == nil {
|
||||
return fmt.Errorf("missing final norm weight: %smodel.norm.weight", prefix)
|
||||
}
|
||||
m.Norm = nn.NewRMSNorm(normWeight, m.RMSNormEps)
|
||||
|
||||
if m.TieWordEmbeddings {
|
||||
m.LMHead = nn.NewLinear(embedWeight, nil)
|
||||
} else if lmHead := linears.Make(prefix + "lm_head"); lmHead != nil {
|
||||
m.LMHead = lmHead
|
||||
} else if lmHead := linears.Make("lm_head"); lmHead != nil {
|
||||
m.LMHead = lmHead
|
||||
} else {
|
||||
// Fallback used by many Llama checkpoints where output is tied.
|
||||
m.LMHead = nn.NewLinear(embedWeight, nil)
|
||||
}
|
||||
|
||||
for i := int32(0); i < m.NumHiddenLayers; i++ {
|
||||
layerPrefix := fmt.Sprintf("%smodel.layers.%d", prefix, i)
|
||||
|
||||
layer := &Layer{
|
||||
Attention: &Attention{},
|
||||
MLP: &MLP{},
|
||||
}
|
||||
|
||||
if w := tensors[layerPrefix+".input_layernorm.weight"]; w != nil {
|
||||
layer.AttentionNorm = nn.NewRMSNorm(w, m.RMSNormEps)
|
||||
}
|
||||
if w := tensors[layerPrefix+".post_attention_layernorm.weight"]; w != nil {
|
||||
layer.MLPNorm = nn.NewRMSNorm(w, m.RMSNormEps)
|
||||
}
|
||||
|
||||
layer.Attention.QProj = linears.Make(layerPrefix + ".self_attn.q_proj")
|
||||
layer.Attention.KProj = linears.Make(layerPrefix + ".self_attn.k_proj")
|
||||
layer.Attention.VProj = linears.Make(layerPrefix + ".self_attn.v_proj")
|
||||
layer.Attention.OProj = linears.Make(layerPrefix + ".self_attn.o_proj")
|
||||
|
||||
layer.MLP.GateProj = linears.Make(layerPrefix + ".mlp.gate_proj")
|
||||
layer.MLP.UpProj = linears.Make(layerPrefix + ".mlp.up_proj")
|
||||
layer.MLP.DownProj = linears.Make(layerPrefix + ".mlp.down_proj")
|
||||
|
||||
if layer.AttentionNorm == nil {
|
||||
return fmt.Errorf("layer %d: missing input_layernorm", i)
|
||||
}
|
||||
if layer.MLPNorm == nil {
|
||||
return fmt.Errorf("layer %d: missing post_attention_layernorm", i)
|
||||
}
|
||||
if layer.Attention.QProj == nil || layer.Attention.KProj == nil || layer.Attention.VProj == nil || layer.Attention.OProj == nil {
|
||||
return fmt.Errorf("layer %d: missing attention projections", i)
|
||||
}
|
||||
if layer.MLP.GateProj == nil || layer.MLP.UpProj == nil || layer.MLP.DownProj == nil {
|
||||
return fmt.Errorf("layer %d: missing mlp projections", i)
|
||||
}
|
||||
|
||||
m.Layers[i] = layer
|
||||
}
|
||||
|
||||
collected := mlx.Collect(m)
|
||||
mlx.Eval(collected...)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *Model) Forward(tokens *mlx.Array, caches []cache.Cache) *mlx.Array {
|
||||
dims := tokens.Dims()
|
||||
B, L := int32(dims[0]), int32(dims[1])
|
||||
|
||||
h := m.EmbedTokens.Forward(tokens)
|
||||
for i, layer := range m.Layers {
|
||||
var c cache.Cache
|
||||
if caches != nil && i < len(caches) {
|
||||
c = caches[i]
|
||||
}
|
||||
h = layer.Forward(h, c, B, L, m.Config)
|
||||
}
|
||||
|
||||
return m.Norm.Forward(h, m.RMSNormEps)
|
||||
}
|
||||
|
||||
func (m *Model) Unembed(x *mlx.Array) *mlx.Array {
|
||||
return m.LMHead.Forward(x)
|
||||
}
|
||||
|
||||
func (m *Model) NumLayers() int {
|
||||
return len(m.Layers)
|
||||
}
|
||||
|
||||
func (m *Model) Tokenizer() *tokenizer.Tokenizer {
|
||||
return m.tok
|
||||
}
|
||||
|
||||
func (m *Model) NewCaches() []cache.Cache {
|
||||
caches := make([]cache.Cache, len(m.Layers))
|
||||
for i := range caches {
|
||||
caches[i] = cache.NewKVCache()
|
||||
}
|
||||
return caches
|
||||
}
|
||||
|
||||
func (l *Layer) Forward(x *mlx.Array, c cache.Cache, B, L int32, cfg *Config) *mlx.Array {
|
||||
h := mlx.Add(x, l.Attention.Forward(l.AttentionNorm.Forward(x, cfg.RMSNormEps), c, B, L, cfg))
|
||||
return mlx.Add(h, l.MLP.Forward(l.MLPNorm.Forward(h, cfg.RMSNormEps)))
|
||||
}
|
||||
|
||||
func (a *Attention) Forward(x *mlx.Array, c cache.Cache, B, L int32, cfg *Config) *mlx.Array {
|
||||
q := a.QProj.Forward(x)
|
||||
k := a.KProj.Forward(x)
|
||||
v := a.VProj.Forward(x)
|
||||
|
||||
q = mlx.Reshape(q, B, L, cfg.NumAttentionHeads, cfg.HeadDim)
|
||||
q = mlx.Transpose(q, 0, 2, 1, 3)
|
||||
|
||||
k = mlx.Reshape(k, B, L, cfg.NumKeyValueHeads, cfg.HeadDim)
|
||||
k = mlx.Transpose(k, 0, 2, 1, 3)
|
||||
|
||||
v = mlx.Reshape(v, B, L, cfg.NumKeyValueHeads, cfg.HeadDim)
|
||||
v = mlx.Transpose(v, 0, 2, 1, 3)
|
||||
|
||||
offset := 0
|
||||
if c != nil {
|
||||
offset = c.Offset()
|
||||
}
|
||||
q = mlx.RoPEWithBase(q, int(cfg.HeadDim), false, cfg.RopeTheta, 1.0, offset)
|
||||
k = mlx.RoPEWithBase(k, int(cfg.HeadDim), false, cfg.RopeTheta, 1.0, offset)
|
||||
|
||||
if c != nil {
|
||||
k, v = c.Update(k, v)
|
||||
}
|
||||
|
||||
repeatFactor := cfg.NumAttentionHeads / cfg.NumKeyValueHeads
|
||||
if repeatFactor > 1 {
|
||||
k = nn.RepeatKV(k, repeatFactor)
|
||||
v = nn.RepeatKV(v, repeatFactor)
|
||||
}
|
||||
|
||||
out := mlx.ScaledDotProductAttentionCausal(q, k, v, cfg.Scale, L > 1)
|
||||
out = mlx.Reshape(mlx.Transpose(out, 0, 2, 1, 3), B, L, cfg.NumAttentionHeads*cfg.HeadDim)
|
||||
return a.OProj.Forward(out)
|
||||
}
|
||||
|
||||
func (m *MLP) Forward(x *mlx.Array) *mlx.Array {
|
||||
return m.DownProj.Forward(mlx.Mul(mlx.SiLU(m.GateProj.Forward(x)), m.UpProj.Forward(x)))
|
||||
}
@@ -5,6 +5,7 @@ import (
	"encoding/json"
	"fmt"
	"io"
	"math"
	"os"
	"sort"
	"strings"
@@ -58,7 +59,15 @@ func GetSafetensorsLLMInfo(name model.Name) (map[string]any, error) {
		}
	}

	return buildModelInfo(config, totalBytes, tensorCount), nil
	info := buildModelInfo(config, totalBytes, tensorCount)

	// For quantized models, byte-based estimation can significantly undercount
	// parameters. Prefer exact counting from tensor shapes in safetensors headers.
	if paramCount, err := getParameterCountFromManifest(mf); err == nil && paramCount > 0 {
		info["general.parameter_count"] = paramCount
	}

	return info, nil
}

// buildModelInfo constructs the model info map from config and tensor stats.
@@ -151,6 +160,51 @@ func buildModelInfo(config modelConfig, totalTensorBytes, tensorCount int64) map
	return info
}

// getParameterCountFromManifest counts model parameters from tensor shapes.
// This accounts for quantized tensors by using unpacked shapes from
// getTensorInfoFromManifest.
func getParameterCountFromManifest(mf *manifest.Manifest) (int64, error) {
	tensors, err := getTensorInfoFromManifest(mf)
	if err != nil {
		return 0, err
	}

	var total int64
	for _, tensor := range tensors {
		if len(tensor.Shape) == 0 {
			continue
		}

		elements := int64(1)
		for _, dim := range tensor.Shape {
			if dim == 0 {
				elements = 0
				break
			}

			if dim > uint64(math.MaxInt64) {
				return 0, fmt.Errorf("tensor %s dimension too large: %d", tensor.Name, dim)
			}

			d := int64(dim)
			if elements > math.MaxInt64/d {
				return 0, fmt.Errorf("tensor %s element count overflow", tensor.Name)
			}
			elements *= d
		}

		if elements == 0 {
			continue
		}
		if total > math.MaxInt64-elements {
			return 0, fmt.Errorf("total parameter count overflow")
		}
		total += elements
	}

	return total, nil
}
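
The exact count matters because quantized weights are stored packed: each uint32 word carries eight int4 values or four int8 values, so a byte-based estimate misses most of the elements. A rough sketch of the unpacking arithmetic (hypothetical helper, shown only to illustrate the shape recovery that getTensorInfoFromManifest is relied on to perform):

```
// unpackedWidth recovers the logical last dimension of a weight whose values
// were packed into uint32 words: 32/bits values fit in each word.
// For example, an int4 weight stored as [10, 2] uint32 unpacks to [10, 16],
// i.e. 160 parameters rather than the 80 bytes it occupies on disk.
func unpackedWidth(packedWidth uint64, bits int) uint64 {
	return packedWidth * uint64(32/bits)
}
```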

// GetSafetensorsTensorInfo extracts tensor information from safetensors model layers.
// Each tensor is stored as a minimal safetensors file with an 88-byte header containing metadata.
func GetSafetensorsTensorInfo(name model.Name) ([]api.Tensor, error) {

@@ -714,6 +714,187 @@ func TestGetTensorInfoFromManifest_Quantized(t *testing.T) {
	}
}

func TestGetParameterCountFromManifest(t *testing.T) {
	// Create a temp directory for blobs and set OLLAMA_MODELS
	tempDir := t.TempDir()
	t.Setenv("OLLAMA_MODELS", tempDir)

	blobDir := filepath.Join(tempDir, "blobs")
	if err := os.MkdirAll(blobDir, 0o755); err != nil {
		t.Fatalf("failed to create blobs dir: %v", err)
	}

	// Unquantized tensor: [4,5] = 20 params
	header1 := map[string]any{
		"model.embed_tokens.weight": map[string]any{
			"dtype": "BF16",
			"shape": []int64{4, 5},
			"data_offsets": []int64{0, 40},
		},
	}
	header1JSON, _ := json.Marshal(header1)
	var buf1 bytes.Buffer
	binary.Write(&buf1, binary.LittleEndian, uint64(len(header1JSON)))
	buf1.Write(header1JSON)

	digest1 := "sha256:1111111111111111111111111111111111111111111111111111111111111111"
	blobPath1, err := manifest.BlobsPath(digest1)
	if err != nil {
		t.Fatalf("failed to get blob path: %v", err)
	}
	if err := os.WriteFile(blobPath1, buf1.Bytes(), 0o644); err != nil {
		t.Fatalf("failed to write blob1: %v", err)
	}

	// Quantized int4 tensor with packed shape [10,2] -> unpacked [10,16] = 160 params
	header2 := map[string]any{
		"__metadata__": map[string]string{
			"quant_type": "int4",
			"group_size": "32",
		},
		"model.layers.0.mlp.up_proj.weight": map[string]any{
			"dtype": "U32",
			"shape": []int64{10, 2},
			"data_offsets": []int64{0, 80},
		},
		"model.layers.0.mlp.up_proj.weight.scale": map[string]any{
			"dtype": "BF16",
			"shape": []int64{10, 1},
			"data_offsets": []int64{80, 100},
		},
		"model.layers.0.mlp.up_proj.weight.bias": map[string]any{
			"dtype": "BF16",
			"shape": []int64{10, 1},
			"data_offsets": []int64{100, 120},
		},
	}
	header2JSON, _ := json.Marshal(header2)
	var buf2 bytes.Buffer
	binary.Write(&buf2, binary.LittleEndian, uint64(len(header2JSON)))
	buf2.Write(header2JSON)

	digest2 := "sha256:2222222222222222222222222222222222222222222222222222222222222222"
	blobPath2, err := manifest.BlobsPath(digest2)
	if err != nil {
		t.Fatalf("failed to get blob path: %v", err)
	}
	if err := os.WriteFile(blobPath2, buf2.Bytes(), 0o644); err != nil {
		t.Fatalf("failed to write blob2: %v", err)
	}

	mf := &manifest.Manifest{
		SchemaVersion: 2,
		MediaType: "application/vnd.docker.distribution.manifest.v2+json",
		Layers: []manifest.Layer{
			{
				MediaType: manifest.MediaTypeImageTensor,
				Digest: digest1,
				Size: int64(buf1.Len() + 40),
				Name: "model.embed_tokens.weight",
			},
			{
				MediaType: manifest.MediaTypeImageTensor,
				Digest: digest2,
				Size: int64(buf2.Len() + 120),
				Name: "model.layers.0.mlp.up_proj.weight",
			},
		},
	}

	paramCount, err := getParameterCountFromManifest(mf)
	if err != nil {
		t.Fatalf("getParameterCountFromManifest() error = %v", err)
	}

	const want int64 = 180 // 20 + 160
	if paramCount != want {
		t.Errorf("parameter_count = %d, want %d", paramCount, want)
	}
}
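
The expected value of 180 follows directly from the packing arithmetic described earlier: the bf16 embedding contributes 4 * 5 = 20 parameters, and the int4 weight's packed shape [10, 2] unpacks to [10, 16] for another 160. As a worked check using the hypothetical helper sketched above (not part of the test):

```
// embed_tokens: 4 * 5                         = 20
// up_proj:      10 * unpackedWidth(2, 4) = 10 * 16 = 160
// total:                                          180
```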

func TestGetParameterCountFromManifest_MixedQuantizedPacked(t *testing.T) {
	// Create a temp directory for blobs and set OLLAMA_MODELS
	tempDir := t.TempDir()
	t.Setenv("OLLAMA_MODELS", tempDir)

	blobDir := filepath.Join(tempDir, "blobs")
	if err := os.MkdirAll(blobDir, 0o755); err != nil {
		t.Fatalf("failed to create blobs dir: %v", err)
	}

	// Packed mixed-precision blob (no global metadata):
	// - gate_proj: int4 packed [5,8] + scale [5,2] => unpacked [5,64] = 320 params
	// - down_proj: int8 packed [5,16] + scale [5,1] => unpacked [5,64] = 320 params
	header := map[string]any{
		"model.layers.0.mlp.experts.0.gate_proj.weight": map[string]any{
			"dtype": "U32",
			"shape": []int64{5, 8},
			"data_offsets": []int64{0, 160},
		},
		"model.layers.0.mlp.experts.0.gate_proj.weight.scale": map[string]any{
			"dtype": "BF16",
			"shape": []int64{5, 2},
			"data_offsets": []int64{160, 180},
		},
		"model.layers.0.mlp.experts.0.gate_proj.weight.bias": map[string]any{
			"dtype": "BF16",
			"shape": []int64{5, 2},
			"data_offsets": []int64{180, 200},
		},
		"model.layers.0.mlp.experts.0.down_proj.weight": map[string]any{
			"dtype": "U32",
			"shape": []int64{5, 16},
			"data_offsets": []int64{200, 520},
		},
		"model.layers.0.mlp.experts.0.down_proj.weight.scale": map[string]any{
			"dtype": "BF16",
			"shape": []int64{5, 1},
			"data_offsets": []int64{520, 530},
		},
		"model.layers.0.mlp.experts.0.down_proj.weight.bias": map[string]any{
			"dtype": "BF16",
			"shape": []int64{5, 1},
			"data_offsets": []int64{530, 540},
		},
	}
	headerJSON, _ := json.Marshal(header)
	var buf bytes.Buffer
	binary.Write(&buf, binary.LittleEndian, uint64(len(headerJSON)))
	buf.Write(headerJSON)

	digest := "sha256:3333333333333333333333333333333333333333333333333333333333333333"
	blobPath, err := manifest.BlobsPath(digest)
	if err != nil {
		t.Fatalf("failed to get blob path: %v", err)
	}
	if err := os.WriteFile(blobPath, buf.Bytes(), 0o644); err != nil {
		t.Fatalf("failed to write blob: %v", err)
	}

	mf := &manifest.Manifest{
		SchemaVersion: 2,
		MediaType: "application/vnd.docker.distribution.manifest.v2+json",
		Layers: []manifest.Layer{
			{
				MediaType: manifest.MediaTypeImageTensor,
				Digest: digest,
				Size: int64(buf.Len() + 540),
				Name: "model.layers.0.mlp.experts",
			},
		},
	}

	paramCount, err := getParameterCountFromManifest(mf)
	if err != nil {
		t.Fatalf("getParameterCountFromManifest() error = %v", err)
	}

	const want int64 = 640 // 320 + 320
	if paramCount != want {
		t.Errorf("parameter_count = %d, want %d", paramCount, want)
	}
}

func TestParseSafetensorsAllHeaders(t *testing.T) {
	tests := []struct {
		name string