// ollama/x/server/show.go

package server

import (
	"encoding/binary"
	"encoding/json"
	"fmt"
	"io"
	"math"
	"os"
	"sort"
	"strings"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/manifest"
	"github.com/ollama/ollama/types/model"
)
// modelConfig represents the HuggingFace config.json structure.
type modelConfig struct {
	Architectures         []string `json:"architectures"`
	ModelType             string   `json:"model_type"`
	HiddenSize            int      `json:"hidden_size"`
	NumHiddenLayers       int      `json:"num_hidden_layers"`
	MaxPositionEmbeddings int      `json:"max_position_embeddings"`
	IntermediateSize      int      `json:"intermediate_size"`
	NumAttentionHeads     int      `json:"num_attention_heads"`
	NumKeyValueHeads      int      `json:"num_key_value_heads"`
	VocabSize             int      `json:"vocab_size"`
	RMSNormEps            float64  `json:"rms_norm_eps"`
	RopeTheta             float64  `json:"rope_theta"`
	TorchDtype            string   `json:"torch_dtype"`
	TextConfig            *struct {
		HiddenSize            int `json:"hidden_size"`
		MaxPositionEmbeddings int `json:"max_position_embeddings"`
		NumHiddenLayers       int `json:"num_hidden_layers"`
	} `json:"text_config"`
}
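
// Illustrative only: decoding a trimmed-down, hypothetical config.json into
// modelConfig. The field values below are made up for the example and are not
// taken from any real model.
func exampleDecodeModelConfig() (modelConfig, error) {
	raw := []byte(`{
		"architectures": ["Gemma3ForCausalLM"],
		"hidden_size": 2048,
		"num_hidden_layers": 26,
		"max_position_embeddings": 32768,
		"torch_dtype": "bfloat16"
	}`)
	var cfg modelConfig
	err := json.Unmarshal(raw, &cfg)
	return cfg, err
}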

// GetSafetensorsLLMInfo extracts model information from safetensors LLM models.
// It reads the config.json layer and returns a map compatible with GGML's KV format.
func GetSafetensorsLLMInfo(name model.Name) (map[string]any, error) {
	mf, err := manifest.ParseNamedManifest(name)
	if err != nil {
		return nil, fmt.Errorf("failed to load manifest: %w", err)
	}

	var config modelConfig
	if err := mf.ReadConfigJSON("config.json", &config); err != nil {
		return nil, fmt.Errorf("failed to read config.json: %w", err)
	}

	// Calculate total tensor bytes from manifest layers.
	var totalBytes int64
	var tensorCount int64
	for _, layer := range mf.Layers {
		if layer.MediaType == manifest.MediaTypeImageTensor {
			totalBytes += layer.Size
			tensorCount++
		}
	}

	info := buildModelInfo(config, totalBytes, tensorCount)

	// For quantized models, byte-based estimation can significantly undercount
	// parameters. Prefer exact counting from tensor shapes in safetensors headers.
	if paramCount, err := getParameterCountFromManifest(mf); err == nil && paramCount > 0 {
		info["general.parameter_count"] = paramCount
	}

	return info, nil
}
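
// A minimal usage sketch, assuming a model named "gemma3" exists in the local
// store (the name is hypothetical). Keys mirror GGML KV metadata, e.g.
// "general.architecture" and "<arch>.context_length".
func exampleShowLLMInfo() {
	info, err := GetSafetensorsLLMInfo(model.ParseName("gemma3"))
	if err != nil {
		fmt.Println("show failed:", err)
		return
	}
	fmt.Println("arch:", info["general.architecture"])
	fmt.Println("params:", info["general.parameter_count"])
}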

// buildModelInfo constructs the model info map from config and tensor stats.
// This is separated for testability.
func buildModelInfo(config modelConfig, totalTensorBytes, tensorCount int64) map[string]any {
	// Determine architecture.
	arch := config.ModelType
	if arch == "" && len(config.Architectures) > 0 {
		// Convert HuggingFace architecture name to Ollama format,
		// e.g. "Gemma3ForCausalLM" -> "gemma3".
		hfArch := config.Architectures[0]
		arch = strings.ToLower(hfArch)
		arch = strings.TrimSuffix(arch, "forcausallm")
		arch = strings.TrimSuffix(arch, "forconditionalgeneration")
	}

	// Use text_config values if they exist (for multimodal models).
	hiddenSize := config.HiddenSize
	maxPosEmbed := config.MaxPositionEmbeddings
	numLayers := config.NumHiddenLayers
	if config.TextConfig != nil {
		if config.TextConfig.HiddenSize > 0 {
			hiddenSize = config.TextConfig.HiddenSize
		}
		if config.TextConfig.MaxPositionEmbeddings > 0 {
			maxPosEmbed = config.TextConfig.MaxPositionEmbeddings
		}
		if config.TextConfig.NumHiddenLayers > 0 {
			numLayers = config.TextConfig.NumHiddenLayers
		}
	}

	// Determine bytes per parameter from torch_dtype for the count estimate.
	dtype := config.TorchDtype
	var bytesPerParam int64 = 2 // default to float16/bfloat16
	switch strings.ToLower(dtype) {
	case "float32":
		bytesPerParam = 4
	case "float16", "bfloat16":
		bytesPerParam = 2
	case "int8", "uint8":
		bytesPerParam = 1
	}

	// Subtract safetensors header overhead per tensor blob.
	// Headers include __metadata__ with the tensor name, so overhead is ~150 bytes on average.
	totalBytes := totalTensorBytes - tensorCount*150
	paramCount := totalBytes / bytesPerParam

	info := map[string]any{
		"general.architecture": arch,
	}
	if maxPosEmbed > 0 {
		info[fmt.Sprintf("%s.context_length", arch)] = maxPosEmbed
	}
	if hiddenSize > 0 {
		info[fmt.Sprintf("%s.embedding_length", arch)] = hiddenSize
	}
	if numLayers > 0 {
		info[fmt.Sprintf("%s.block_count", arch)] = numLayers
	}
	if config.NumAttentionHeads > 0 {
		info[fmt.Sprintf("%s.attention.head_count", arch)] = config.NumAttentionHeads
	}
	if config.NumKeyValueHeads > 0 {
		info[fmt.Sprintf("%s.attention.head_count_kv", arch)] = config.NumKeyValueHeads
	}
	if config.IntermediateSize > 0 {
		info[fmt.Sprintf("%s.feed_forward_length", arch)] = config.IntermediateSize
	}
	if config.VocabSize > 0 {
		info[fmt.Sprintf("%s.vocab_size", arch)] = config.VocabSize
	}
	if paramCount > 0 {
		info["general.parameter_count"] = paramCount
	}

	return info
}
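
// A worked sketch of the byte-based estimate, with hypothetical numbers: a
// bfloat16 model (2 bytes/param) whose tensor layers total 2,000,000,000 bytes
// across 100 blobs yields (2,000,000,000 - 100*150) / 2 = 999,992,500 params.
func exampleBuildModelInfo() map[string]any {
	cfg := modelConfig{
		ModelType:             "llama",
		HiddenSize:            4096,
		NumHiddenLayers:       32,
		MaxPositionEmbeddings: 8192,
		TorchDtype:            "bfloat16",
	}
	return buildModelInfo(cfg, 2_000_000_000, 100)
}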

// getParameterCountFromManifest counts model parameters from tensor shapes.
// This accounts for quantized tensors by using unpacked shapes from
// getTensorInfoFromManifest.
func getParameterCountFromManifest(mf *manifest.Manifest) (int64, error) {
	tensors, err := getTensorInfoFromManifest(mf)
	if err != nil {
		return 0, err
	}

	var total int64
	for _, tensor := range tensors {
		if len(tensor.Shape) == 0 {
			continue
		}
		elements := int64(1)
		for _, dim := range tensor.Shape {
			if dim == 0 {
				elements = 0
				break
			}
			if dim > uint64(math.MaxInt64) {
				return 0, fmt.Errorf("tensor %s dimension too large: %d", tensor.Name, dim)
			}
			d := int64(dim)
			if elements > math.MaxInt64/d {
				return 0, fmt.Errorf("tensor %s element count overflow", tensor.Name)
			}
			elements *= d
		}
		if elements == 0 {
			continue
		}
		if total > math.MaxInt64-elements {
			return 0, fmt.Errorf("total parameter count overflow")
		}
		total += elements
	}

	return total, nil
}
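
// Counting sketch, with a hypothetical shape: a single [4096, 4096] tensor
// contributes 4096*4096 = 16,777,216 parameters to the running total.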

// GetSafetensorsTensorInfo extracts tensor information from safetensors model layers.
// Each tensor is stored as a minimal safetensors file with an 88-byte header containing metadata.
func GetSafetensorsTensorInfo(name model.Name) ([]api.Tensor, error) {
	mf, err := manifest.ParseNamedManifest(name)
	if err != nil {
		return nil, fmt.Errorf("failed to load manifest: %w", err)
	}
	return getTensorInfoFromManifest(mf)
}
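
// A minimal listing sketch, assuming a model named "llama3" exists locally
// (the name is hypothetical): prints each tensor's name, type, and unpacked shape.
func exampleListTensors() {
	tensors, err := GetSafetensorsTensorInfo(model.ParseName("llama3"))
	if err != nil {
		return
	}
	for _, t := range tensors {
		fmt.Println(t.Name, t.Type, t.Shape)
	}
}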

// getTensorInfoFromManifest extracts tensor info from a manifest.
// This is separated for testability.
// For quantized tensors, reads quant_type from blob __metadata__.
// For packed blobs (multiple tensors per blob), enumerates all tensors in the blob.
func getTensorInfoFromManifest(mf *manifest.Manifest) ([]api.Tensor, error) {
	var tensors []api.Tensor
	for _, layer := range mf.Layers {
		if layer.MediaType != manifest.MediaTypeImageTensor {
			continue
		}

		// Read all tensor entries from the safetensors header.
		blobPath, err := manifest.BlobsPath(layer.Digest)
		if err != nil {
			continue
		}
		f, err := os.Open(blobPath)
		if err != nil {
			continue
		}
		allInfos, err := parseSafetensorsAllHeaders(f)
		f.Close()
		if err != nil {
			continue
		}

		// Determine if this is a packed blob (multiple main tensors).
		isPacked := len(allInfos) > 1
		for _, info := range allInfos {
			tensorName := layer.Name
			if isPacked {
				// For packed blobs, use the tensor name from the header.
				tensorName = info.Name
			}

			if info.QuantType != "" {
				quantType := strings.ToUpper(info.QuantType)
				shape := make([]uint64, len(info.Shape))
				for i, s := range info.Shape {
					shape[i] = uint64(s)
				}
				var packFactor int64
				switch strings.ToLower(info.QuantType) {
				case "int4", "nvfp4":
					packFactor = 8
				case "int8", "mxfp8":
					packFactor = 4
				}
				if packFactor > 0 && len(shape) >= 2 {
					shape[len(shape)-1] = uint64(info.Shape[len(info.Shape)-1] * packFactor)
				}
				tensors = append(tensors, api.Tensor{
					Name:  tensorName,
					Type:  quantType,
					Shape: shape,
				})
			} else {
				shape := make([]uint64, len(info.Shape))
				for i, s := range info.Shape {
					shape[i] = uint64(s)
				}
				tensors = append(tensors, api.Tensor{
					Name:  tensorName,
					Type:  info.Dtype,
					Shape: shape,
				})
			}
		}
	}

	sort.Slice(tensors, func(i, j int) bool {
		return tensors[i].Name < tensors[j].Name
	})
	return tensors, nil
}
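
// Unpacking arithmetic sketch, with hypothetical shapes: int4 packs 8 values
// per U32 word, so a quantized weight stored as [4096, 512] words is reported
// above with an unpacked trailing dim of 512*8 = 4096.
func exampleUnpackedShape() []uint64 {
	stored := []int64{4096, 512}
	const packFactor = 8 // int4: eight 4-bit values per uint32
	return []uint64{uint64(stored[0]), uint64(stored[1] * packFactor)} // [4096, 4096]
}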

// GetSafetensorsDtype returns the quantization type for a safetensors model.
// Reads quant_type from the first tensor blob's __metadata__.
// Falls back to torch_dtype from config.json if no quant metadata.
func GetSafetensorsDtype(name model.Name) (string, error) {
	mf, err := manifest.ParseNamedManifest(name)
	if err != nil {
		return "", fmt.Errorf("failed to load manifest: %w", err)
	}

	// Check the first tensor blob for quant_type metadata.
	for _, layer := range mf.Layers {
		if layer.MediaType != manifest.MediaTypeImageTensor {
			continue
		}
		blobPath, err := manifest.BlobsPath(layer.Digest)
		if err != nil {
			continue
		}
		info, err := readSafetensorsHeader(blobPath)
		if err != nil {
			continue
		}
		if info.QuantType != "" {
			return strings.ToUpper(info.QuantType), nil
		}
		// Only check the first tensor blob.
		break
	}

	// Not quantized - return torch_dtype from config.json.
	var cfg struct {
		TorchDtype string `json:"torch_dtype"`
	}
	if err := mf.ReadConfigJSON("config.json", &cfg); err != nil {
		return "", fmt.Errorf("failed to read config.json: %w", err)
	}
	return cfg.TorchDtype, nil
}
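
// Dtype lookup sketch (hypothetical model name): yields an upper-cased quant
// type such as "INT4" for quantized models, otherwise the raw torch_dtype
// string from config.json, such as "bfloat16".
func exampleDtype() {
	if dtype, err := GetSafetensorsDtype(model.ParseName("llama3")); err == nil {
		fmt.Println("dtype:", dtype)
	}
}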

// safetensorsTensorInfo holds metadata about a tensor from a safetensors header.
type safetensorsTensorInfo struct {
	Name      string  // tensor name from the header key
	Dtype     string  `json:"dtype"`
	Shape     []int64 `json:"shape"`
	QuantType string  // from __metadata__.quant_type (e.g., "int4", "int8", "nvfp4", "mxfp8")
	GroupSize string  // from __metadata__.group_size (e.g., "32", "64")
}

// readSafetensorsHeader reads the JSON header from a safetensors file to get tensor metadata.
// Safetensors format: 8-byte header size (little endian) + JSON header + tensor data.
func readSafetensorsHeader(path string) (*safetensorsTensorInfo, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer f.Close()
	return parseSafetensorsHeader(f)
}

// parseSafetensorsHeader parses a safetensors header from a reader.
// This is separated for testability.
// Parses __metadata__ for quant_type and group_size if present.
func parseSafetensorsHeader(r io.Reader) (*safetensorsTensorInfo, error) {
	// Read header size (8 bytes, little endian).
	var headerSize uint64
	if err := binary.Read(r, binary.LittleEndian, &headerSize); err != nil {
		return nil, fmt.Errorf("failed to read header size: %w", err)
	}

	// Sanity check - the header shouldn't be too large.
	if headerSize > 1024*1024 {
		return nil, fmt.Errorf("header size too large: %d", headerSize)
	}

	// Read header JSON.
	headerBytes := make([]byte, headerSize)
	if _, err := io.ReadFull(r, headerBytes); err != nil {
		return nil, fmt.Errorf("failed to read header: %w", err)
	}

	// Parse as a map of tensor name -> info.
	var header map[string]json.RawMessage
	if err := json.Unmarshal(headerBytes, &header); err != nil {
		return nil, fmt.Errorf("failed to parse header: %w", err)
	}

	// Parse metadata if present.
	var quantType, groupSize string
	if metaRaw, ok := header["__metadata__"]; ok {
		var meta map[string]string
		if json.Unmarshal(metaRaw, &meta) == nil {
			quantType = meta["quant_type"]
			groupSize = meta["group_size"]
		}
	}

	// Find the main tensor entry (not __metadata__, .scale, or .bias).
	for name, raw := range header {
		if name == "__metadata__" || strings.HasSuffix(name, ".scale") || strings.HasSuffix(name, ".bias") {
			continue
		}
		var info safetensorsTensorInfo
		if err := json.Unmarshal(raw, &info); err != nil {
			return nil, fmt.Errorf("failed to parse tensor info: %w", err)
		}
		info.QuantType = quantType
		info.GroupSize = groupSize
		return &info, nil
	}

	// Fall back to the first non-metadata tensor entry.
	for name, raw := range header {
		if name == "__metadata__" {
			continue
		}
		var info safetensorsTensorInfo
		if err := json.Unmarshal(raw, &info); err != nil {
			return nil, fmt.Errorf("failed to parse tensor info: %w", err)
		}
		info.QuantType = quantType
		info.GroupSize = groupSize
		return &info, nil
	}

	return nil, fmt.Errorf("no tensor found in header")
}
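
// A round-trip sketch of the header format described above (8-byte little-endian
// header size, then the JSON header); the tensor entry and __metadata__ values
// are hypothetical. Parsing it yields QuantType "int4" and GroupSize "32".
func exampleParseHeader() (*safetensorsTensorInfo, error) {
	headerJSON := `{"__metadata__":{"quant_type":"int4","group_size":"32"},` +
		`"model.layers.0.mlp.up_proj.weight":{"dtype":"U32","shape":[4096,512],"data_offsets":[0,8388608]}}`
	var size [8]byte
	binary.LittleEndian.PutUint64(size[:], uint64(len(headerJSON)))
	return parseSafetensorsHeader(strings.NewReader(string(size[:]) + headerJSON))
}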

// parseSafetensorsAllHeaders parses all tensor entries from a safetensors header.
// Returns one safetensorsTensorInfo per main tensor (skipping __metadata__, .scale, .bias).
// For packed blobs this returns multiple entries; for single-tensor blobs, one entry.
// Each tensor's quant type is inferred from its shape and the presence of .scale/.bias
// entries when no global __metadata__ quant_type is present.
func parseSafetensorsAllHeaders(r io.Reader) ([]safetensorsTensorInfo, error) {
	var headerSize uint64
	if err := binary.Read(r, binary.LittleEndian, &headerSize); err != nil {
		return nil, fmt.Errorf("failed to read header size: %w", err)
	}
	if headerSize > 100*1024*1024 { // 100MB limit for packed blob headers
		return nil, fmt.Errorf("header size too large: %d", headerSize)
	}

	headerBytes := make([]byte, headerSize)
	if _, err := io.ReadFull(r, headerBytes); err != nil {
		return nil, fmt.Errorf("failed to read header: %w", err)
	}
	var header map[string]json.RawMessage
	if err := json.Unmarshal(headerBytes, &header); err != nil {
		return nil, fmt.Errorf("failed to parse header: %w", err)
	}

	// Parse global metadata if present.
	var globalQuantType, globalGroupSize string
	if metaRaw, ok := header["__metadata__"]; ok {
		var meta map[string]string
		if json.Unmarshal(metaRaw, &meta) == nil {
			globalQuantType = meta["quant_type"]
			globalGroupSize = meta["group_size"]
		}
	}

	// Build a set of all keys for checking .scale/.bias presence.
	headerKeys := make(map[string]bool, len(header))
	for k := range header {
		headerKeys[k] = true
	}

	// Collect all main tensor entries (sorted for deterministic output).
	var mainNames []string
	for name := range header {
		if name == "__metadata__" || strings.HasSuffix(name, ".scale") || strings.HasSuffix(name, ".bias") {
			continue
		}
		mainNames = append(mainNames, name)
	}
	sort.Strings(mainNames)

	var results []safetensorsTensorInfo
	for _, name := range mainNames {
		var info safetensorsTensorInfo
		if err := json.Unmarshal(header[name], &info); err != nil {
			return nil, fmt.Errorf("failed to parse tensor info for %s: %w", name, err)
		}
		info.Name = name
		if globalQuantType != "" {
			// Use global metadata.
			info.QuantType = globalQuantType
			info.GroupSize = globalGroupSize
		} else if headerKeys[name+".scale"] {
			// No global metadata, but has .scale - infer quant type from shape.
			info.QuantType = inferQuantType(header, name)
		}
		results = append(results, info)
	}
	if len(results) == 0 {
		return nil, fmt.Errorf("no tensor found in header")
	}
	return results, nil
}

// inferQuantType infers the quantization type for a tensor from its shape and scale shape.
// Returns "int4", "int8", etc. or "" if not quantized.
func inferQuantType(header map[string]json.RawMessage, name string) string {
	// Parse the main tensor shape.
	var mainInfo struct {
		Shape []int64 `json:"shape"`
	}
	if json.Unmarshal(header[name], &mainInfo) != nil || len(mainInfo.Shape) < 2 {
		return ""
	}

	// Parse the scale shape to determine group size.
	scaleRaw, ok := header[name+".scale"]
	if !ok {
		return ""
	}
	var scaleInfo struct {
		Shape []int64 `json:"shape"`
	}
	if json.Unmarshal(scaleRaw, &scaleInfo) != nil || len(scaleInfo.Shape) < 2 {
		return ""
	}

	// Calculate group size: main_cols * pack_factor / scale_cols.
	// The main dtype is U32, so we need to figure out the pack factor.
	// For int4: pack=8, group=32. scale_cols = original_cols / 32 = main_cols * 8 / 32 = main_cols / 4
	// For int8: pack=4, group=64. scale_cols = original_cols / 64 = main_cols * 4 / 64 = main_cols / 16
	mainCols := mainInfo.Shape[len(mainInfo.Shape)-1]
	scaleCols := scaleInfo.Shape[len(scaleInfo.Shape)-1]
	if scaleCols == 0 {
		return ""
	}
	ratio := mainCols / scaleCols // main_packed_cols / scale_cols
	// int4: ratio = (orig/8) / (orig/32) = 32/8 = 4
	// int8: ratio = (orig/4) / (orig/64) = 64/4 = 16
	switch ratio {
	case 4:
		return "int4"
	case 16:
		return "int8"
	default:
		return ""
	}
}
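
// Ratio check sketch, with hypothetical shapes: for int4 (group size 32), an
// original [4096, 4096] weight packs to [4096, 512] with scales [4096, 128],
// so mainCols/scaleCols = 512/128 = 4 and inferQuantType returns "int4".
func exampleInferQuantType() string {
	header := map[string]json.RawMessage{
		"w":       json.RawMessage(`{"dtype":"U32","shape":[4096,512]}`),
		"w.scale": json.RawMessage(`{"dtype":"BF16","shape":[4096,128]}`),
	}
	return inferQuantType(header, "w")
}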