mirror of
https://github.com/ollama/ollama.git
synced 2026-02-20 08:16:07 -05:00
556 lines
16 KiB
Go
556 lines
16 KiB
Go
package server
|
|
|
|
import (
|
|
"encoding/binary"
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"math"
|
|
"os"
|
|
"sort"
|
|
"strings"
|
|
|
|
"github.com/ollama/ollama/api"
|
|
"github.com/ollama/ollama/manifest"
|
|
"github.com/ollama/ollama/types/model"
|
|
)
|
|
|
|
// modelConfig represents the HuggingFace config.json structure.
// Only the fields needed for model-info reporting are decoded; all other
// config.json keys are ignored by encoding/json.
type modelConfig struct {
	// Architectures lists HuggingFace class names, e.g. "Gemma3ForCausalLM".
	Architectures []string `json:"architectures"`
	// ModelType is the short architecture name, e.g. "llama"; preferred over
	// Architectures when both are present.
	ModelType string `json:"model_type"`
	HiddenSize int `json:"hidden_size"`
	NumHiddenLayers int `json:"num_hidden_layers"`
	MaxPositionEmbeddings int `json:"max_position_embeddings"`
	IntermediateSize int `json:"intermediate_size"`
	NumAttentionHeads int `json:"num_attention_heads"`
	NumKeyValueHeads int `json:"num_key_value_heads"`
	VocabSize int `json:"vocab_size"`
	RMSNormEps float64 `json:"rms_norm_eps"`
	RopeTheta float64 `json:"rope_theta"`
	// TorchDtype is the storage dtype ("float16", "bfloat16", "float32", ...);
	// used to estimate bytes-per-parameter.
	TorchDtype string `json:"torch_dtype"`
	// TextConfig carries the text model's dimensions for multimodal models,
	// which nest them instead of placing them at the top level.
	TextConfig *struct {
		HiddenSize int `json:"hidden_size"`
		MaxPositionEmbeddings int `json:"max_position_embeddings"`
		NumHiddenLayers int `json:"num_hidden_layers"`
	} `json:"text_config"`
}
|
|
|
|
// GetSafetensorsLLMInfo extracts model information from safetensors LLM models.
|
|
// It reads the config.json layer and returns a map compatible with GGML's KV format.
|
|
func GetSafetensorsLLMInfo(name model.Name) (map[string]any, error) {
|
|
mf, err := manifest.ParseNamedManifest(name)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to load manifest: %w", err)
|
|
}
|
|
|
|
var config modelConfig
|
|
if err := mf.ReadConfigJSON("config.json", &config); err != nil {
|
|
return nil, fmt.Errorf("failed to read config.json: %w", err)
|
|
}
|
|
|
|
// Calculate total tensor bytes from manifest layers
|
|
var totalBytes int64
|
|
var tensorCount int64
|
|
for _, layer := range mf.Layers {
|
|
if layer.MediaType == manifest.MediaTypeImageTensor {
|
|
totalBytes += layer.Size
|
|
tensorCount++
|
|
}
|
|
}
|
|
|
|
info := buildModelInfo(config, totalBytes, tensorCount)
|
|
|
|
// For quantized models, byte-based estimation can significantly undercount
|
|
// parameters. Prefer exact counting from tensor shapes in safetensors headers.
|
|
if paramCount, err := getParameterCountFromManifest(mf); err == nil && paramCount > 0 {
|
|
info["general.parameter_count"] = paramCount
|
|
}
|
|
|
|
return info, nil
|
|
}
|
|
|
|
// buildModelInfo constructs the model info map from config and tensor stats.
|
|
// This is separated for testability.
|
|
func buildModelInfo(config modelConfig, totalTensorBytes, tensorCount int64) map[string]any {
|
|
// Determine architecture
|
|
arch := config.ModelType
|
|
if arch == "" && len(config.Architectures) > 0 {
|
|
// Convert HuggingFace architecture name to Ollama format
|
|
// e.g., "Gemma3ForCausalLM" -> "gemma3"
|
|
hfArch := config.Architectures[0]
|
|
arch = strings.ToLower(hfArch)
|
|
arch = strings.TrimSuffix(arch, "forcausallm")
|
|
arch = strings.TrimSuffix(arch, "forconditionalgeneration")
|
|
}
|
|
|
|
// Use text_config values if they exist (for multimodal models)
|
|
hiddenSize := config.HiddenSize
|
|
maxPosEmbed := config.MaxPositionEmbeddings
|
|
numLayers := config.NumHiddenLayers
|
|
|
|
if config.TextConfig != nil {
|
|
if config.TextConfig.HiddenSize > 0 {
|
|
hiddenSize = config.TextConfig.HiddenSize
|
|
}
|
|
if config.TextConfig.MaxPositionEmbeddings > 0 {
|
|
maxPosEmbed = config.TextConfig.MaxPositionEmbeddings
|
|
}
|
|
if config.TextConfig.NumHiddenLayers > 0 {
|
|
numLayers = config.TextConfig.NumHiddenLayers
|
|
}
|
|
}
|
|
|
|
// Get dtype to determine bytes per parameter for count calculation
|
|
dtype := config.TorchDtype
|
|
|
|
// Determine bytes per parameter based on dtype
|
|
var bytesPerParam int64 = 2 // default to float16/bfloat16
|
|
switch strings.ToLower(dtype) {
|
|
case "float32":
|
|
bytesPerParam = 4
|
|
case "float16", "bfloat16":
|
|
bytesPerParam = 2
|
|
case "int8", "uint8":
|
|
bytesPerParam = 1
|
|
}
|
|
|
|
// Subtract safetensors header overhead per tensor blob.
|
|
// Headers include __metadata__ with the tensor name, so overhead is ~150 bytes on average.
|
|
totalBytes := totalTensorBytes - tensorCount*150
|
|
|
|
paramCount := totalBytes / bytesPerParam
|
|
|
|
info := map[string]any{
|
|
"general.architecture": arch,
|
|
}
|
|
|
|
if maxPosEmbed > 0 {
|
|
info[fmt.Sprintf("%s.context_length", arch)] = maxPosEmbed
|
|
}
|
|
|
|
if hiddenSize > 0 {
|
|
info[fmt.Sprintf("%s.embedding_length", arch)] = hiddenSize
|
|
}
|
|
|
|
if numLayers > 0 {
|
|
info[fmt.Sprintf("%s.block_count", arch)] = numLayers
|
|
}
|
|
|
|
if config.NumAttentionHeads > 0 {
|
|
info[fmt.Sprintf("%s.attention.head_count", arch)] = config.NumAttentionHeads
|
|
}
|
|
|
|
if config.NumKeyValueHeads > 0 {
|
|
info[fmt.Sprintf("%s.attention.head_count_kv", arch)] = config.NumKeyValueHeads
|
|
}
|
|
|
|
if config.IntermediateSize > 0 {
|
|
info[fmt.Sprintf("%s.feed_forward_length", arch)] = config.IntermediateSize
|
|
}
|
|
|
|
if config.VocabSize > 0 {
|
|
info[fmt.Sprintf("%s.vocab_size", arch)] = config.VocabSize
|
|
}
|
|
|
|
if paramCount > 0 {
|
|
info["general.parameter_count"] = paramCount
|
|
}
|
|
|
|
return info
|
|
}
|
|
|
|
// getParameterCountFromManifest counts model parameters from tensor shapes.
|
|
// This accounts for quantized tensors by using unpacked shapes from
|
|
// getTensorInfoFromManifest.
|
|
func getParameterCountFromManifest(mf *manifest.Manifest) (int64, error) {
|
|
tensors, err := getTensorInfoFromManifest(mf)
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
|
|
var total int64
|
|
for _, tensor := range tensors {
|
|
if len(tensor.Shape) == 0 {
|
|
continue
|
|
}
|
|
|
|
elements := int64(1)
|
|
for _, dim := range tensor.Shape {
|
|
if dim == 0 {
|
|
elements = 0
|
|
break
|
|
}
|
|
|
|
if dim > uint64(math.MaxInt64) {
|
|
return 0, fmt.Errorf("tensor %s dimension too large: %d", tensor.Name, dim)
|
|
}
|
|
|
|
d := int64(dim)
|
|
if elements > math.MaxInt64/d {
|
|
return 0, fmt.Errorf("tensor %s element count overflow", tensor.Name)
|
|
}
|
|
elements *= d
|
|
}
|
|
|
|
if elements == 0 {
|
|
continue
|
|
}
|
|
if total > math.MaxInt64-elements {
|
|
return 0, fmt.Errorf("total parameter count overflow")
|
|
}
|
|
total += elements
|
|
}
|
|
|
|
return total, nil
|
|
}
|
|
|
|
// GetSafetensorsTensorInfo extracts tensor information from safetensors model layers.
|
|
// Each tensor is stored as a minimal safetensors file with an 88-byte header containing metadata.
|
|
func GetSafetensorsTensorInfo(name model.Name) ([]api.Tensor, error) {
|
|
mf, err := manifest.ParseNamedManifest(name)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to load manifest: %w", err)
|
|
}
|
|
|
|
return getTensorInfoFromManifest(mf)
|
|
}
|
|
|
|
// getTensorInfoFromManifest extracts tensor info from a manifest.
|
|
// This is separated for testability.
|
|
// For quantized tensors, reads quant_type from blob __metadata__.
|
|
// For packed blobs (multiple tensors per blob), enumerates all tensors in the blob.
|
|
func getTensorInfoFromManifest(mf *manifest.Manifest) ([]api.Tensor, error) {
|
|
var tensors []api.Tensor
|
|
|
|
for _, layer := range mf.Layers {
|
|
if layer.MediaType != manifest.MediaTypeImageTensor {
|
|
continue
|
|
}
|
|
|
|
// Read all tensor entries from the safetensors header
|
|
blobPath, err := manifest.BlobsPath(layer.Digest)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
|
|
f, err := os.Open(blobPath)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
|
|
allInfos, err := parseSafetensorsAllHeaders(f)
|
|
f.Close()
|
|
if err != nil {
|
|
continue
|
|
}
|
|
|
|
// Determine if this is a packed blob (multiple main tensors)
|
|
isPacked := len(allInfos) > 1
|
|
|
|
for _, info := range allInfos {
|
|
tensorName := layer.Name
|
|
if isPacked {
|
|
// For packed blobs, use the tensor name from the header
|
|
tensorName = info.Name
|
|
}
|
|
|
|
if info.QuantType != "" {
|
|
quantType := strings.ToUpper(info.QuantType)
|
|
|
|
shape := make([]uint64, len(info.Shape))
|
|
for i, s := range info.Shape {
|
|
shape[i] = uint64(s)
|
|
}
|
|
|
|
var packFactor int64
|
|
switch strings.ToLower(info.QuantType) {
|
|
case "int4", "nvfp4":
|
|
packFactor = 8
|
|
case "int8", "mxfp8":
|
|
packFactor = 4
|
|
}
|
|
if packFactor > 0 && len(shape) >= 2 {
|
|
shape[len(shape)-1] = uint64(info.Shape[len(info.Shape)-1] * packFactor)
|
|
}
|
|
|
|
tensors = append(tensors, api.Tensor{
|
|
Name: tensorName,
|
|
Type: quantType,
|
|
Shape: shape,
|
|
})
|
|
} else {
|
|
shape := make([]uint64, len(info.Shape))
|
|
for i, s := range info.Shape {
|
|
shape[i] = uint64(s)
|
|
}
|
|
|
|
tensors = append(tensors, api.Tensor{
|
|
Name: tensorName,
|
|
Type: info.Dtype,
|
|
Shape: shape,
|
|
})
|
|
}
|
|
}
|
|
}
|
|
|
|
sort.Slice(tensors, func(i, j int) bool {
|
|
return tensors[i].Name < tensors[j].Name
|
|
})
|
|
|
|
return tensors, nil
|
|
}
|
|
|
|
// GetSafetensorsDtype returns the quantization type for a safetensors model.
|
|
// Reads quant_type from the first tensor blob's __metadata__.
|
|
// Falls back to torch_dtype from config.json if no quant metadata.
|
|
func GetSafetensorsDtype(name model.Name) (string, error) {
|
|
mf, err := manifest.ParseNamedManifest(name)
|
|
if err != nil {
|
|
return "", fmt.Errorf("failed to load manifest: %w", err)
|
|
}
|
|
|
|
// Check first tensor blob for quant_type metadata
|
|
for _, layer := range mf.Layers {
|
|
if layer.MediaType != manifest.MediaTypeImageTensor {
|
|
continue
|
|
}
|
|
blobPath, err := manifest.BlobsPath(layer.Digest)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
info, err := readSafetensorsHeader(blobPath)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
if info.QuantType != "" {
|
|
return strings.ToUpper(info.QuantType), nil
|
|
}
|
|
// Only check the first tensor blob
|
|
break
|
|
}
|
|
|
|
// Not quantized - return torch_dtype from config.json
|
|
var cfg struct {
|
|
TorchDtype string `json:"torch_dtype"`
|
|
}
|
|
if err := mf.ReadConfigJSON("config.json", &cfg); err != nil {
|
|
return "", fmt.Errorf("failed to read config.json: %w", err)
|
|
}
|
|
|
|
return cfg.TorchDtype, nil
|
|
}
|
|
|
|
// safetensorsTensorInfo holds metadata about a tensor from a safetensors header.
// Dtype and Shape are decoded from the header JSON; Name, QuantType, and
// GroupSize have no JSON tags and are filled in by the parsing functions.
type safetensorsTensorInfo struct {
	Name string // tensor name from the header key (set by parseSafetensorsAllHeaders)
	Dtype string `json:"dtype"`
	Shape []int64 `json:"shape"`
	QuantType string // from __metadata__.quant_type (e.g., "int4", "int8", "nvfp4", "mxfp8")
	GroupSize string // from __metadata__.group_size (e.g., "32", "64")
}
|
|
|
|
// readSafetensorsHeader reads the JSON header from a safetensors file to get tensor metadata.
|
|
// Safetensors format: 8-byte header size (little endian) + JSON header + tensor data
|
|
func readSafetensorsHeader(path string) (*safetensorsTensorInfo, error) {
|
|
f, err := os.Open(path)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer f.Close()
|
|
|
|
return parseSafetensorsHeader(f)
|
|
}
|
|
|
|
// parseSafetensorsHeader parses a safetensors header from a reader.
|
|
// This is separated for testability.
|
|
// Parses __metadata__ for quant_type and group_size if present.
|
|
func parseSafetensorsHeader(r io.Reader) (*safetensorsTensorInfo, error) {
|
|
// Read header size (8 bytes, little endian)
|
|
var headerSize uint64
|
|
if err := binary.Read(r, binary.LittleEndian, &headerSize); err != nil {
|
|
return nil, fmt.Errorf("failed to read header size: %w", err)
|
|
}
|
|
|
|
// Sanity check - header shouldn't be too large
|
|
if headerSize > 1024*1024 {
|
|
return nil, fmt.Errorf("header size too large: %d", headerSize)
|
|
}
|
|
|
|
// Read header JSON
|
|
headerBytes := make([]byte, headerSize)
|
|
if _, err := io.ReadFull(r, headerBytes); err != nil {
|
|
return nil, fmt.Errorf("failed to read header: %w", err)
|
|
}
|
|
|
|
// Parse as map of tensor name -> info
|
|
var header map[string]json.RawMessage
|
|
if err := json.Unmarshal(headerBytes, &header); err != nil {
|
|
return nil, fmt.Errorf("failed to parse header: %w", err)
|
|
}
|
|
|
|
// Parse metadata if present
|
|
var quantType, groupSize string
|
|
if metaRaw, ok := header["__metadata__"]; ok {
|
|
var meta map[string]string
|
|
if json.Unmarshal(metaRaw, &meta) == nil {
|
|
quantType = meta["quant_type"]
|
|
groupSize = meta["group_size"]
|
|
}
|
|
}
|
|
|
|
// Find the main tensor entry (not __metadata__, .scale, or .bias)
|
|
for name, raw := range header {
|
|
if name == "__metadata__" || strings.HasSuffix(name, ".scale") || strings.HasSuffix(name, ".bias") {
|
|
continue
|
|
}
|
|
var info safetensorsTensorInfo
|
|
if err := json.Unmarshal(raw, &info); err != nil {
|
|
return nil, fmt.Errorf("failed to parse tensor info: %w", err)
|
|
}
|
|
info.QuantType = quantType
|
|
info.GroupSize = groupSize
|
|
return &info, nil
|
|
}
|
|
|
|
// Fall back to first non-metadata tensor entry
|
|
for name, raw := range header {
|
|
if name == "__metadata__" {
|
|
continue
|
|
}
|
|
var info safetensorsTensorInfo
|
|
if err := json.Unmarshal(raw, &info); err != nil {
|
|
return nil, fmt.Errorf("failed to parse tensor info: %w", err)
|
|
}
|
|
info.QuantType = quantType
|
|
info.GroupSize = groupSize
|
|
return &info, nil
|
|
}
|
|
|
|
return nil, fmt.Errorf("no tensor found in header")
|
|
}
|
|
|
|
// parseSafetensorsAllHeaders parses all tensor entries from a safetensors header.
|
|
// Returns one safetensorsTensorInfo per main tensor (skipping __metadata__, .scale, .bias).
|
|
// For packed blobs this returns multiple entries; for single-tensor blobs, one entry.
|
|
// Each tensor's quant type is inferred from its shape and the presence of .scale/.bias entries
|
|
// when no global __metadata__ quant_type is present.
|
|
func parseSafetensorsAllHeaders(r io.Reader) ([]safetensorsTensorInfo, error) {
|
|
var headerSize uint64
|
|
if err := binary.Read(r, binary.LittleEndian, &headerSize); err != nil {
|
|
return nil, fmt.Errorf("failed to read header size: %w", err)
|
|
}
|
|
|
|
if headerSize > 100*1024*1024 { // 100MB limit for packed blob headers
|
|
return nil, fmt.Errorf("header size too large: %d", headerSize)
|
|
}
|
|
|
|
headerBytes := make([]byte, headerSize)
|
|
if _, err := io.ReadFull(r, headerBytes); err != nil {
|
|
return nil, fmt.Errorf("failed to read header: %w", err)
|
|
}
|
|
|
|
var header map[string]json.RawMessage
|
|
if err := json.Unmarshal(headerBytes, &header); err != nil {
|
|
return nil, fmt.Errorf("failed to parse header: %w", err)
|
|
}
|
|
|
|
// Parse global metadata if present
|
|
var globalQuantType, globalGroupSize string
|
|
if metaRaw, ok := header["__metadata__"]; ok {
|
|
var meta map[string]string
|
|
if json.Unmarshal(metaRaw, &meta) == nil {
|
|
globalQuantType = meta["quant_type"]
|
|
globalGroupSize = meta["group_size"]
|
|
}
|
|
}
|
|
|
|
// Build a set of all keys for checking .scale/.bias presence
|
|
headerKeys := make(map[string]bool, len(header))
|
|
for k := range header {
|
|
headerKeys[k] = true
|
|
}
|
|
|
|
// Collect all main tensor entries (sorted for deterministic output)
|
|
var mainNames []string
|
|
for name := range header {
|
|
if name == "__metadata__" || strings.HasSuffix(name, ".scale") || strings.HasSuffix(name, ".bias") {
|
|
continue
|
|
}
|
|
mainNames = append(mainNames, name)
|
|
}
|
|
sort.Strings(mainNames)
|
|
|
|
var results []safetensorsTensorInfo
|
|
for _, name := range mainNames {
|
|
var info safetensorsTensorInfo
|
|
if err := json.Unmarshal(header[name], &info); err != nil {
|
|
return nil, fmt.Errorf("failed to parse tensor info for %s: %w", name, err)
|
|
}
|
|
info.Name = name
|
|
|
|
if globalQuantType != "" {
|
|
// Use global metadata
|
|
info.QuantType = globalQuantType
|
|
info.GroupSize = globalGroupSize
|
|
} else if headerKeys[name+".scale"] {
|
|
// No global metadata, but has .scale - infer quant type from shape
|
|
info.QuantType = inferQuantType(header, name)
|
|
}
|
|
|
|
results = append(results, info)
|
|
}
|
|
|
|
if len(results) == 0 {
|
|
return nil, fmt.Errorf("no tensor found in header")
|
|
}
|
|
|
|
return results, nil
|
|
}
|
|
|
|
// inferQuantType infers the quantization type for a tensor from its shape and scale shape.
|
|
// Returns "int4", "int8", etc. or "" if not quantized.
|
|
func inferQuantType(header map[string]json.RawMessage, name string) string {
|
|
// Parse the main tensor shape
|
|
var mainInfo struct {
|
|
Shape []int64 `json:"shape"`
|
|
}
|
|
if json.Unmarshal(header[name], &mainInfo) != nil || len(mainInfo.Shape) < 2 {
|
|
return ""
|
|
}
|
|
|
|
// Parse scale shape to determine group size
|
|
scaleRaw, ok := header[name+".scale"]
|
|
if !ok {
|
|
return ""
|
|
}
|
|
var scaleInfo struct {
|
|
Shape []int64 `json:"shape"`
|
|
}
|
|
if json.Unmarshal(scaleRaw, &scaleInfo) != nil || len(scaleInfo.Shape) < 2 {
|
|
return ""
|
|
}
|
|
|
|
// Calculate group size: main_cols * pack_factor / scale_cols
|
|
// Main dtype is U32, so we need to figure out the pack factor
|
|
// For int4: pack=8, group=32. scale_cols = original_cols / 32 = main_cols * 8 / 32 = main_cols / 4
|
|
// For int8: pack=4, group=64. scale_cols = original_cols / 64 = main_cols * 4 / 64 = main_cols / 16
|
|
mainCols := mainInfo.Shape[len(mainInfo.Shape)-1]
|
|
scaleCols := scaleInfo.Shape[len(scaleInfo.Shape)-1]
|
|
if scaleCols == 0 {
|
|
return ""
|
|
}
|
|
|
|
ratio := mainCols / scaleCols // main_packed_cols / scale_cols
|
|
// int4: ratio = (orig/8) / (orig/32) = 32/8 = 4
|
|
// int8: ratio = (orig/4) / (orig/64) = 64/4 = 16
|
|
switch ratio {
|
|
case 4:
|
|
return "int4"
|
|
case 16:
|
|
return "int8"
|
|
default:
|
|
return ""
|
|
}
|
|
}
|