Compare commits

..

1 commit

Author: Patrick Devine
SHA1: 313b6a6a32
Date: 2026-02-09 13:20:13 -08:00
Message: safetensors quantization for mlx

This change includes:
  - changes to the safetensors metadata format
  - changes to the create command to properly create the blobs with the new format
  - changes to load the new format
  - fixes ollama show to properly show each tensor
20 changed files with 1865 additions and 708 deletions

View File

@@ -147,7 +147,7 @@ ARG PARALLEL
WORKDIR /go/src/github.com/ollama/ollama
COPY CMakeLists.txt CMakePresets.json .
COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
COPY x/imagegen/mlx x/imagegen/mlx
COPY x/ml/backend/mlx x/ml/backend/mlx
COPY go.mod go.sum .
COPY MLX_VERSION .
RUN curl -fsSL https://golang.org/dl/go$(awk '/^go/ { print $2 }' go.mod).linux-$(case $(uname -m) in x86_64) echo amd64 ;; aarch64) echo arm64 ;; esac).tar.gz | tar xz -C /usr/local

View File

@@ -897,5 +897,11 @@ func countContentBlock(block any) int {
}
}
if source, ok := blockMap["source"].(map[string]any); ok {
if data, ok := source["data"].(string); ok {
total += len(data)
}
}
return total
}

View File

@@ -1826,18 +1826,6 @@ func NewCLI() *cobra.Command {
return
}
// If no args, run launch to show interactive app selector
if len(args) == 0 {
if err := checkServerHeartbeat(cmd, args); err != nil {
cobra.CheckErr(err)
return
}
if err := config.RunLaunch(cmd, args, "", false); err != nil {
cobra.CheckErr(err)
}
return
}
cmd.Print(cmd.UsageString())
},
}

View File

@@ -59,12 +59,6 @@ var integrations = map[string]Runner{
"openclaw": &Openclaw{},
}
// IsIntegration returns true if the given name is a valid integration.
func IsIntegration(name string) bool {
_, ok := integrations[strings.ToLower(name)]
return ok
}
// recommendedModels are shown when the user has no models or as suggestions.
// Order matters: local models first, then cloud models.
var recommendedModels = []selectItem{
@@ -82,7 +76,7 @@ var integrationAliases = map[string]bool{
func selectIntegration() (string, error) {
if len(integrations) == 0 {
return "", fmt.Errorf("no apps available")
return "", fmt.Errorf("no integrations available")
}
names := slices.Sorted(maps.Keys(integrations))
@@ -99,14 +93,14 @@ func selectIntegration() (string, error) {
items = append(items, selectItem{Name: name, Description: description})
}
return selectPrompt("Select app:", items)
return selectPrompt("Select integration:", items)
}
// selectModels lets the user select models for an integration
func selectModels(ctx context.Context, name, current string) ([]string, error) {
r, ok := integrations[name]
if !ok {
return nil, fmt.Errorf("unknown app: %s", name)
return nil, fmt.Errorf("unknown integration: %s", name)
}
client, err := api.ClientFromEnvironment()
@@ -312,7 +306,7 @@ func ensureAuth(ctx context.Context, client *api.Client, cloudModels map[string]
func runIntegration(name, modelName string, args []string) error {
r, ok := integrations[name]
if !ok {
return fmt.Errorf("unknown app: %s", name)
return fmt.Errorf("unknown integration: %s", name)
}
fmt.Fprintf(os.Stderr, "\nLaunching %s with %s...\n", r, modelName)
@@ -341,230 +335,17 @@ func syncAliases(ctx context.Context, client *api.Client, ac AliasConfigurer, na
return saveAliases(name, aliases)
}
// RunLaunch executes the launch logic for the given integration and arguments.
// This can be called directly from the root command (with empty modelFlag/configFlag)
// or via the launch subcommand.
func RunLaunch(cmd *cobra.Command, args []string, modelFlag string, configFlag bool) error {
// Extract integration name and args to pass through using -- separator
var name string
var passArgs []string
dashIdx := cmd.ArgsLenAtDash()
if dashIdx == -1 {
// No "--" separator: only allow 0 or 1 args (integration name)
if len(args) > 1 {
return fmt.Errorf("unexpected arguments: %v\nUse '--' to pass extra arguments to the app", args[1:])
}
if len(args) == 1 {
name = args[0]
}
} else {
// "--" was used: args before it = integration name, args after = passthrough
if dashIdx > 1 {
return fmt.Errorf("expected at most 1 app name before '--', got %d", dashIdx)
}
if dashIdx == 1 {
name = args[0]
}
passArgs = args[dashIdx:]
}
if name == "" {
var err error
name, err = selectIntegration()
if errors.Is(err, errCancelled) {
return nil
}
if err != nil {
return err
}
}
r, ok := integrations[strings.ToLower(name)]
if !ok {
return fmt.Errorf("unknown app: %s", name)
}
// Handle AliasConfigurer integrations (claude, codex)
if ac, ok := r.(AliasConfigurer); ok {
client, err := api.ClientFromEnvironment()
if err != nil {
return err
}
// Validate --model flag if provided
if modelFlag != "" {
if err := showOrPull(cmd.Context(), client, modelFlag); err != nil {
if errors.Is(err, errCancelled) {
return nil
}
return err
}
}
var model string
var existingAliases map[string]string
// Load saved config
if cfg, err := loadIntegration(name); err == nil {
existingAliases = cfg.Aliases
if len(cfg.Models) > 0 {
model = cfg.Models[0]
// AliasConfigurer integrations use single model; sanitize if multiple
if len(cfg.Models) > 1 {
_ = saveIntegration(name, []string{model})
}
}
}
// --model flag overrides saved model
if modelFlag != "" {
model = modelFlag
}
// Validate saved model still exists
if model != "" && modelFlag == "" {
if _, err := client.Show(cmd.Context(), &api.ShowRequest{Model: model}); err != nil {
fmt.Fprintf(os.Stderr, "%sConfigured model %q not found%s\n\n", ansiGray, model, ansiReset)
if err := showOrPull(cmd.Context(), client, model); err != nil {
model = ""
}
}
}
// If no valid model or --config flag, show picker
if model == "" || configFlag {
aliases, _, err := ac.ConfigureAliases(cmd.Context(), model, existingAliases, configFlag)
if errors.Is(err, errCancelled) {
return nil
}
if err != nil {
return err
}
model = aliases["primary"]
existingAliases = aliases
}
// Ensure cloud models are authenticated
if isCloudModel(cmd.Context(), client, model) {
if err := ensureAuth(cmd.Context(), client, map[string]bool{model: true}, []string{model}); err != nil {
return err
}
}
// Sync aliases and save
if err := syncAliases(cmd.Context(), client, ac, name, model, existingAliases); err != nil {
fmt.Fprintf(os.Stderr, "%sWarning: Could not sync aliases: %v%s\n", ansiGray, err, ansiReset)
}
if err := saveIntegration(name, []string{model}); err != nil {
return fmt.Errorf("failed to save: %w", err)
}
// Launch (unless --config without confirmation)
if configFlag {
if launch, _ := confirmPrompt(fmt.Sprintf("Launch %s now?", r)); launch {
return runIntegration(name, model, passArgs)
}
return nil
}
return runIntegration(name, model, passArgs)
}
// Validate --model flag for non-AliasConfigurer integrations
if modelFlag != "" {
client, err := api.ClientFromEnvironment()
if err != nil {
return err
}
if err := showOrPull(cmd.Context(), client, modelFlag); err != nil {
if errors.Is(err, errCancelled) {
return nil
}
return err
}
}
var models []string
if modelFlag != "" {
models = []string{modelFlag}
if existing, err := loadIntegration(name); err == nil && len(existing.Models) > 0 {
for _, m := range existing.Models {
if m != modelFlag {
models = append(models, m)
}
}
}
} else if saved, err := loadIntegration(name); err == nil && len(saved.Models) > 0 && !configFlag {
return runIntegration(name, saved.Models[0], passArgs)
} else {
var err error
models, err = selectModels(cmd.Context(), name, "")
if errors.Is(err, errCancelled) {
return nil
}
if err != nil {
return err
}
}
if editor, isEditor := r.(Editor); isEditor {
paths := editor.Paths()
if len(paths) > 0 {
fmt.Fprintf(os.Stderr, "This will modify your %s configuration:\n", r)
for _, p := range paths {
fmt.Fprintf(os.Stderr, " %s\n", p)
}
fmt.Fprintf(os.Stderr, "Backups will be saved to %s/\n\n", backupDir())
if ok, _ := confirmPrompt("Proceed?"); !ok {
return nil
}
}
}
if err := saveIntegration(name, models); err != nil {
return fmt.Errorf("failed to save: %w", err)
}
if editor, isEditor := r.(Editor); isEditor {
if err := editor.Edit(models); err != nil {
return fmt.Errorf("setup failed: %w", err)
}
}
if _, isEditor := r.(Editor); isEditor {
if len(models) == 1 {
fmt.Fprintf(os.Stderr, "Added %s to %s\n", models[0], r)
} else {
fmt.Fprintf(os.Stderr, "Added %d models to %s (default: %s)\n", len(models), r, models[0])
}
}
if configFlag {
if launch, _ := confirmPrompt(fmt.Sprintf("\nLaunch %s now?", r)); launch {
return runIntegration(name, models[0], passArgs)
}
return nil
}
if runner, isRunner := r.(Runner); isRunner {
return runner.Run(models[0], passArgs)
}
return nil
}
// LaunchCmd returns the cobra command for launching integrations.
func LaunchCmd(checkServerHeartbeat func(cmd *cobra.Command, args []string) error) *cobra.Command {
var modelFlag string
var configFlag bool
cmd := &cobra.Command{
Use: "launch [APP] [-- [EXTRA_ARGS...]]",
Short: "Launch an app with Ollama",
Long: `Launch an app configured with Ollama models.
Use: "launch [INTEGRATION] [-- [EXTRA_ARGS...]]",
Short: "Launch an integration with Ollama",
Long: `Launch an integration configured with Ollama models.
Supported apps:
Supported integrations:
claude Claude Code
codex Codex
droid Droid
@@ -576,12 +357,215 @@ Examples:
ollama launch claude
ollama launch claude --model <model>
ollama launch droid --config (does not auto-launch)
ollama launch codex -- -p myprofile (pass extra args to app)
ollama launch codex -- -p myprofile (pass extra args to integration)
ollama launch codex -- --sandbox workspace-write`,
Args: cobra.ArbitraryArgs,
PreRunE: checkServerHeartbeat,
RunE: func(cmd *cobra.Command, args []string) error {
return RunLaunch(cmd, args, modelFlag, configFlag)
// Extract integration name and args to pass through using -- separator
var name string
var passArgs []string
dashIdx := cmd.ArgsLenAtDash()
if dashIdx == -1 {
// No "--" separator: only allow 0 or 1 args (integration name)
if len(args) > 1 {
return fmt.Errorf("unexpected arguments: %v\nUse '--' to pass extra arguments to the integration", args[1:])
}
if len(args) == 1 {
name = args[0]
}
} else {
// "--" was used: args before it = integration name, args after = passthrough
if dashIdx > 1 {
return fmt.Errorf("expected at most 1 integration name before '--', got %d", dashIdx)
}
if dashIdx == 1 {
name = args[0]
}
passArgs = args[dashIdx:]
}
if name == "" {
var err error
name, err = selectIntegration()
if errors.Is(err, errCancelled) {
return nil
}
if err != nil {
return err
}
}
r, ok := integrations[strings.ToLower(name)]
if !ok {
return fmt.Errorf("unknown integration: %s", name)
}
// Handle AliasConfigurer integrations (claude, codex)
if ac, ok := r.(AliasConfigurer); ok {
client, err := api.ClientFromEnvironment()
if err != nil {
return err
}
// Validate --model flag if provided
if modelFlag != "" {
if err := showOrPull(cmd.Context(), client, modelFlag); err != nil {
if errors.Is(err, errCancelled) {
return nil
}
return err
}
}
var model string
var existingAliases map[string]string
// Load saved config
if cfg, err := loadIntegration(name); err == nil {
existingAliases = cfg.Aliases
if len(cfg.Models) > 0 {
model = cfg.Models[0]
// AliasConfigurer integrations use single model; sanitize if multiple
if len(cfg.Models) > 1 {
_ = saveIntegration(name, []string{model})
}
}
}
// --model flag overrides saved model
if modelFlag != "" {
model = modelFlag
}
// Validate saved model still exists
if model != "" && modelFlag == "" {
if _, err := client.Show(cmd.Context(), &api.ShowRequest{Model: model}); err != nil {
fmt.Fprintf(os.Stderr, "%sConfigured model %q not found%s\n\n", ansiGray, model, ansiReset)
if err := showOrPull(cmd.Context(), client, model); err != nil {
model = ""
}
}
}
// If no valid model or --config flag, show picker
if model == "" || configFlag {
aliases, _, err := ac.ConfigureAliases(cmd.Context(), model, existingAliases, configFlag)
if errors.Is(err, errCancelled) {
return nil
}
if err != nil {
return err
}
model = aliases["primary"]
existingAliases = aliases
}
// Ensure cloud models are authenticated
if isCloudModel(cmd.Context(), client, model) {
if err := ensureAuth(cmd.Context(), client, map[string]bool{model: true}, []string{model}); err != nil {
return err
}
}
// Sync aliases and save
if err := syncAliases(cmd.Context(), client, ac, name, model, existingAliases); err != nil {
fmt.Fprintf(os.Stderr, "%sWarning: Could not sync aliases: %v%s\n", ansiGray, err, ansiReset)
}
if err := saveIntegration(name, []string{model}); err != nil {
return fmt.Errorf("failed to save: %w", err)
}
// Launch (unless --config without confirmation)
if configFlag {
if launch, _ := confirmPrompt(fmt.Sprintf("Launch %s now?", r)); launch {
return runIntegration(name, model, passArgs)
}
return nil
}
return runIntegration(name, model, passArgs)
}
// Validate --model flag for non-AliasConfigurer integrations
if modelFlag != "" {
client, err := api.ClientFromEnvironment()
if err != nil {
return err
}
if err := showOrPull(cmd.Context(), client, modelFlag); err != nil {
if errors.Is(err, errCancelled) {
return nil
}
return err
}
}
var models []string
if modelFlag != "" {
models = []string{modelFlag}
if existing, err := loadIntegration(name); err == nil && len(existing.Models) > 0 {
for _, m := range existing.Models {
if m != modelFlag {
models = append(models, m)
}
}
}
} else if saved, err := loadIntegration(name); err == nil && len(saved.Models) > 0 && !configFlag {
return runIntegration(name, saved.Models[0], passArgs)
} else {
var err error
models, err = selectModels(cmd.Context(), name, "")
if errors.Is(err, errCancelled) {
return nil
}
if err != nil {
return err
}
}
if editor, isEditor := r.(Editor); isEditor {
paths := editor.Paths()
if len(paths) > 0 {
fmt.Fprintf(os.Stderr, "This will modify your %s configuration:\n", r)
for _, p := range paths {
fmt.Fprintf(os.Stderr, " %s\n", p)
}
fmt.Fprintf(os.Stderr, "Backups will be saved to %s/\n\n", backupDir())
if ok, _ := confirmPrompt("Proceed?"); !ok {
return nil
}
}
}
if err := saveIntegration(name, models); err != nil {
return fmt.Errorf("failed to save: %w", err)
}
if editor, isEditor := r.(Editor); isEditor {
if err := editor.Edit(models); err != nil {
return fmt.Errorf("setup failed: %w", err)
}
}
if _, isEditor := r.(Editor); isEditor {
if len(models) == 1 {
fmt.Fprintf(os.Stderr, "Added %s to %s\n", models[0], r)
} else {
fmt.Fprintf(os.Stderr, "Added %d models to %s (default: %s)\n", len(models), r, models[0])
}
}
if configFlag {
if launch, _ := confirmPrompt(fmt.Sprintf("\nLaunch %s now?", r)); launch {
return runIntegration(name, models[0], passArgs)
}
fmt.Fprintf(os.Stderr, "Run 'ollama launch %s' to start with %s\n", strings.ToLower(name), models[0])
return nil
}
return runIntegration(name, models[0], passArgs)
},
}

View File

@@ -98,8 +98,8 @@ func TestLaunchCmd(t *testing.T) {
cmd := LaunchCmd(mockCheck)
t.Run("command structure", func(t *testing.T) {
if cmd.Use != "launch [APP] [-- [EXTRA_ARGS...]]" {
t.Errorf("Use = %q, want %q", cmd.Use, "launch [APP] [-- [EXTRA_ARGS...]]")
if cmd.Use != "launch [INTEGRATION] [-- [EXTRA_ARGS...]]" {
t.Errorf("Use = %q, want %q", cmd.Use, "launch [INTEGRATION] [-- [EXTRA_ARGS...]]")
}
if cmd.Short == "" {
t.Error("Short description should not be empty")
@@ -133,8 +133,8 @@ func TestRunIntegration_UnknownIntegration(t *testing.T) {
if err == nil {
t.Error("expected error for unknown integration, got nil")
}
if !strings.Contains(err.Error(), "unknown app") {
t.Errorf("error should mention 'unknown app', got: %v", err)
if !strings.Contains(err.Error(), "unknown integration") {
t.Errorf("error should mention 'unknown integration', got: %v", err)
}
}

View File

@@ -312,7 +312,7 @@ Parallel request processing for a given model results in increasing the context
The following server settings may be used to adjust how Ollama handles concurrent requests on most platforms:
- `OLLAMA_MAX_LOADED_MODELS` - The maximum number of models that can be loaded concurrently provided they fit in available memory. The default is 3 \* the number of GPUs or 3 for CPU inference.
- `OLLAMA_NUM_PARALLEL` - The maximum number of parallel requests each model will process at the same time, default 1. Required RAM will scale by `OLLAMA_NUM_PARALLEL` * `OLLAMA_CONTEXT_LENGTH`.
- `OLLAMA_NUM_PARALLEL` - The maximum number of parallel requests each model will process at the same time. The default will auto-select either 4 or 1 based on available memory.
- `OLLAMA_MAX_QUEUE` - The maximum number of requests Ollama will queue when busy before rejecting additional requests. The default is 512
Note: Windows with Radeon GPUs currently defaults to 1 model maximum due to limitations in ROCm v5.7 for available VRAM reporting. Once ROCm v6.2 is available, Windows Radeon will follow the defaults above. You may enable concurrent model loads on Radeon on Windows, but ensure you don't load more models than will fit into your GPUs' VRAM.

View File

@@ -19,6 +19,7 @@ import (
"github.com/ollama/ollama/progress"
"github.com/ollama/ollama/types/model"
"github.com/ollama/ollama/x/create"
"github.com/ollama/ollama/x/imagegen/safetensors"
)
// MinOllamaVersion is the minimum Ollama version required for safetensors models.
@@ -35,7 +36,7 @@ type ModelfileConfig struct {
type CreateOptions struct {
ModelName string
ModelDir string
Quantize string // "q4", "q8", "nvfp4", or "mxfp8" for quantization
Quantize string // "int4", "int8", "nvfp4", or "mxfp8" for quantization
Modelfile *ModelfileConfig // template/system/license from Modelfile
}
@@ -94,6 +95,7 @@ func CreateModel(opts CreateOptions, p *progress.Progress) error {
newLayerCreator(), newTensorLayerCreator(),
newManifestWriter(opts, capabilities, parserName, rendererName),
progressFn,
newPackedTensorLayerCreator(),
)
} else {
err = create.CreateImageGenModel(
@@ -141,60 +143,33 @@ func newTensorLayerCreator() create.QuantizingTensorLayerCreator {
}
}
// createQuantizedLayers quantizes a tensor and returns the resulting layers.
// createQuantizedLayers quantizes a tensor and returns a single combined layer.
// The combined blob contains data, scale, and optional bias tensors with metadata.
func createQuantizedLayers(r io.Reader, name, dtype string, shape []int32, quantize string) ([]create.LayerInfo, error) {
if !QuantizeSupported() {
return nil, fmt.Errorf("quantization requires MLX support")
}
// Quantize the tensor
qweightData, scalesData, qbiasData, _, _, _, err := quantizeTensor(r, name, dtype, shape, quantize)
// Quantize the tensor into a single combined blob
blobData, err := quantizeTensor(r, name, dtype, shape, quantize)
if err != nil {
return nil, fmt.Errorf("failed to quantize %s: %w", name, err)
}
// Create layer for quantized weight
weightLayer, err := manifest.NewLayer(bytes.NewReader(qweightData), manifest.MediaTypeImageTensor)
// Create single layer for the combined blob
layer, err := manifest.NewLayer(bytes.NewReader(blobData), manifest.MediaTypeImageTensor)
if err != nil {
return nil, err
}
// Create layer for scales
scalesLayer, err := manifest.NewLayer(bytes.NewReader(scalesData), manifest.MediaTypeImageTensor)
if err != nil {
return nil, err
}
layers := []create.LayerInfo{
return []create.LayerInfo{
{
Digest: weightLayer.Digest,
Size: weightLayer.Size,
MediaType: weightLayer.MediaType,
Digest: layer.Digest,
Size: layer.Size,
MediaType: layer.MediaType,
Name: name,
},
{
Digest: scalesLayer.Digest,
Size: scalesLayer.Size,
MediaType: scalesLayer.MediaType,
Name: name + "_scale",
},
}
// Add qbiases layer if present (affine mode)
if qbiasData != nil {
qbiasLayer, err := manifest.NewLayer(bytes.NewReader(qbiasData), manifest.MediaTypeImageTensor)
if err != nil {
return nil, err
}
layers = append(layers, create.LayerInfo{
Digest: qbiasLayer.Digest,
Size: qbiasLayer.Size,
MediaType: qbiasLayer.MediaType,
Name: name + "_qbias",
})
}
return layers, nil
}, nil
}
// createUnquantizedLayer creates a single tensor layer without quantization.
@@ -214,6 +189,58 @@ func createUnquantizedLayer(r io.Reader, name string) ([]create.LayerInfo, error
}, nil
}
// newPackedTensorLayerCreator returns a PackedTensorLayerCreator callback for
// creating packed multi-tensor blob layers (used for expert groups).
func newPackedTensorLayerCreator() create.PackedTensorLayerCreator {
return func(groupName string, tensors []create.PackedTensorInput) (create.LayerInfo, error) {
// Check if any tensor in the group needs quantization
hasQuantize := false
for _, t := range tensors {
if t.Quantize != "" {
hasQuantize = true
break
}
}
var blobReader io.Reader
if hasQuantize {
if !QuantizeSupported() {
return create.LayerInfo{}, fmt.Errorf("quantization requires MLX support")
}
blobData, err := quantizePackedGroup(tensors)
if err != nil {
return create.LayerInfo{}, fmt.Errorf("failed to quantize packed group %s: %w", groupName, err)
}
blobReader = bytes.NewReader(blobData)
} else {
// Build unquantized packed blob using streaming reader
// Extract raw tensor data from safetensors-wrapped readers
var tds []*safetensors.TensorData
for _, t := range tensors {
rawData, err := safetensors.ExtractRawFromSafetensors(t.Reader)
if err != nil {
return create.LayerInfo{}, fmt.Errorf("failed to extract tensor %s: %w", t.Name, err)
}
td := safetensors.NewTensorDataFromBytes(t.Name, t.Dtype, t.Shape, rawData)
tds = append(tds, td)
}
blobReader = safetensors.BuildPackedSafetensorsReader(tds)
}
layer, err := manifest.NewLayer(blobReader, manifest.MediaTypeImageTensor)
if err != nil {
return create.LayerInfo{}, err
}
return create.LayerInfo{
Digest: layer.Digest,
Size: layer.Size,
MediaType: layer.MediaType,
Name: groupName,
}, nil
}
}
// newManifestWriter returns a ManifestWriter callback for writing the model manifest.
func newManifestWriter(opts CreateOptions, capabilities []string, parserName, rendererName string) create.ManifestWriter {
return func(modelName string, config create.LayerInfo, layers []create.LayerInfo) error {

View File

@@ -3,128 +3,195 @@
package client
import (
"encoding/binary"
"encoding/json"
"fmt"
"io"
"os"
"path/filepath"
"strconv"
"github.com/ollama/ollama/x/create"
"github.com/ollama/ollama/x/imagegen/mlx"
)
// quantizeTensor loads a tensor from safetensors format, quantizes it,
// and returns safetensors data for the quantized weights, scales, and biases.
// Supported quantization types:
// - "q4": affine 4-bit, group_size=32 (with qbiases)
// - "nvfp4": NVIDIA FP4, group_size=16 (no qbiases, E4M3 scales)
// - "q8": affine 8-bit, group_size=64 (with qbiases)
// - "mxfp8": Microsoft MX FP8, group_size=32 (no qbiases, E4M3 scales)
// Uses MLX's native SaveSafetensors to ensure correct dtype handling (especially uint32 for quantized weights).
func quantizeTensor(r io.Reader, name, dtype string, shape []int32, quantize string) (qweightData, scalesData, qbiasData []byte, qweightShape, scalesShape, qbiasShape []int32, err error) {
// quantizeParams maps quantization type names to MLX quantize parameters.
var quantizeParams = map[string]struct {
groupSize int
bits int
mode string
}{
"int4": {32, 4, "affine"},
"nvfp4": {16, 4, "nvfp4"},
"int8": {64, 8, "affine"},
"mxfp8": {32, 8, "mxfp8"},
}
// loadAndQuantizeArray writes a safetensors reader to a temp file, loads it with MLX,
// quantizes the tensor, and appends the resulting arrays (weight, scale, optional bias)
// to the provided maps. If quantize is empty, the tensor is kept as-is.
// Returns any temp file paths created (caller must clean up) and arrays needing eval.
func loadAndQuantizeArray(r io.Reader, name, quantize string, arrays map[string]*mlx.Array) (tmpPath string, toEval []*mlx.Array, nativeHandle *mlx.SafetensorsFile, err error) {
tmpDir := ensureTempDir()
// Read safetensors data to a temp file (LoadSafetensorsNative needs a path)
tmpFile, err := os.CreateTemp(tmpDir, "quant-input-*.safetensors")
tmpFile, err := os.CreateTemp(tmpDir, "quant-*.safetensors")
if err != nil {
return nil, nil, nil, nil, nil, nil, fmt.Errorf("failed to create temp file: %w", err)
return "", nil, nil, fmt.Errorf("failed to create temp file: %w", err)
}
tmpPath := tmpFile.Name()
defer os.Remove(tmpPath)
tmpPath = tmpFile.Name()
if _, err := io.Copy(tmpFile, r); err != nil {
tmpFile.Close()
return nil, nil, nil, nil, nil, nil, fmt.Errorf("failed to write temp file: %w", err)
return tmpPath, nil, nil, fmt.Errorf("failed to write temp file for %s: %w", name, err)
}
tmpFile.Close()
// Load the tensor using MLX's native loader
st, err := mlx.LoadSafetensorsNative(tmpPath)
if err != nil {
return nil, nil, nil, nil, nil, nil, fmt.Errorf("failed to load safetensors: %w", err)
return tmpPath, nil, nil, fmt.Errorf("failed to load safetensors for %s: %w", name, err)
}
defer st.Free()
// Get the tensor (it's stored as "data" in our minimal safetensors format)
arr := st.Get("data")
// Find the tensor key (may differ from name for single-tensor blobs)
inputKey, err := findSafetensorsKey(tmpPath)
if err != nil {
st.Free()
return tmpPath, nil, nil, fmt.Errorf("failed to read blob header for %s: %w", name, err)
}
arr := st.Get(inputKey)
if arr == nil {
return nil, nil, nil, nil, nil, nil, fmt.Errorf("tensor 'data' not found in safetensors")
st.Free()
return tmpPath, nil, nil, fmt.Errorf("tensor %q not found in safetensors", inputKey)
}
// Convert to BFloat16 if needed (quantize expects float type)
if quantize == "" {
arr = mlx.Contiguous(arr)
arrays[name] = arr
return tmpPath, []*mlx.Array{arr}, st, nil
}
// Convert to float type if needed (quantize expects float)
if arr.Dtype() != mlx.DtypeBFloat16 && arr.Dtype() != mlx.DtypeFloat32 && arr.Dtype() != mlx.DtypeFloat16 {
arr = mlx.AsType(arr, mlx.DtypeBFloat16)
mlx.Eval(arr)
}
// Quantize based on quantization type
var qweight, scales, qbiases *mlx.Array
switch quantize {
case "q4":
// affine mode: group_size=32, bits=4 (with qbiases for zero-point offset)
qweight, scales, qbiases = mlx.Quantize(arr, 32, 4, "affine")
case "nvfp4":
// NVIDIA FP4: group_size=16, bits=4 (no qbiases, E4M3 scales)
qweight, scales, qbiases = mlx.Quantize(arr, 16, 4, "nvfp4")
case "q8":
// affine mode: group_size=64, bits=8 (with qbiases for zero-point offset)
qweight, scales, qbiases = mlx.Quantize(arr, 64, 8, "affine")
case "mxfp8":
// Microsoft MX FP8: group_size=32, bits=8, E4M3 scales (no qbiases)
qweight, scales, qbiases = mlx.Quantize(arr, 32, 8, "mxfp8")
default:
return nil, nil, nil, nil, nil, nil, fmt.Errorf("unsupported quantization type: %s", quantize)
params, ok := quantizeParams[quantize]
if !ok {
st.Free()
return tmpPath, nil, nil, fmt.Errorf("unsupported quantization type: %s", quantize)
}
// Eval and make contiguous for data access
qweight, scales, qbiases := mlx.Quantize(arr, params.groupSize, params.bits, params.mode)
qweight = mlx.Contiguous(qweight)
scales = mlx.Contiguous(scales)
arrays[name] = qweight
arrays[name+".scale"] = scales
toEval = append(toEval, qweight, scales)
if qbiases != nil {
qbiases = mlx.Contiguous(qbiases)
mlx.Eval(qweight, scales, qbiases)
} else {
mlx.Eval(qweight, scales)
arrays[name+".bias"] = qbiases
toEval = append(toEval, qbiases)
}
// Get shapes
qweightShape = qweight.Shape()
scalesShape = scales.Shape()
return tmpPath, toEval, st, nil
}
// Save quantized weight using MLX's native safetensors (correctly handles uint32 dtype)
qweightPath := filepath.Join(tmpDir, "qweight.safetensors")
defer os.Remove(qweightPath)
if err := mlx.SaveSafetensors(qweightPath, map[string]*mlx.Array{"data": qweight}); err != nil {
return nil, nil, nil, nil, nil, nil, fmt.Errorf("failed to save quantized weight: %w", err)
// quantizeTensor loads a tensor from safetensors format, quantizes it,
// and returns a single combined safetensors blob with the quantized weight, scale, and optional bias.
// Tensor keys use the original tensor name: name, name.scale, name.bias.
// The blob includes __metadata__ with quant_type and group_size.
// Supported quantization types: "int4", "nvfp4", "int8", "mxfp8".
func quantizeTensor(r io.Reader, tensorName, dtype string, shape []int32, quantize string) (blobData []byte, err error) {
arrays := make(map[string]*mlx.Array)
tmpPath, toEval, st, err := loadAndQuantizeArray(r, tensorName, quantize, arrays)
if tmpPath != "" {
defer os.Remove(tmpPath)
}
if st != nil {
defer st.Free()
}
qweightData, err = os.ReadFile(qweightPath)
if err != nil {
return nil, nil, nil, nil, nil, nil, fmt.Errorf("failed to read quantized weight: %w", err)
return nil, err
}
// Save scales using MLX's native safetensors
scalesPath := filepath.Join(tmpDir, "scales.safetensors")
defer os.Remove(scalesPath)
if err := mlx.SaveSafetensors(scalesPath, map[string]*mlx.Array{"data": scales}); err != nil {
return nil, nil, nil, nil, nil, nil, fmt.Errorf("failed to save scales: %w", err)
}
scalesData, err = os.ReadFile(scalesPath)
if err != nil {
return nil, nil, nil, nil, nil, nil, fmt.Errorf("failed to read scales: %w", err)
mlx.Eval(toEval...)
// Build metadata for single-tensor blobs
params := quantizeParams[quantize]
metadata := map[string]string{
"quant_type": quantize,
"group_size": strconv.Itoa(params.groupSize),
}
// Affine mode returns qbiases for zero-point offset
if qbiases != nil {
qbiasShape = qbiases.Shape()
qbiasPath := filepath.Join(tmpDir, "qbias.safetensors")
defer os.Remove(qbiasPath)
if err := mlx.SaveSafetensors(qbiasPath, map[string]*mlx.Array{"data": qbiases}); err != nil {
return nil, nil, nil, nil, nil, nil, fmt.Errorf("failed to save qbiases: %w", err)
tmpDir := ensureTempDir()
outPath := filepath.Join(tmpDir, "combined.safetensors")
defer os.Remove(outPath)
if err := mlx.SaveSafetensorsWithMetadata(outPath, arrays, metadata); err != nil {
return nil, fmt.Errorf("failed to save combined blob: %w", err)
}
return os.ReadFile(outPath)
}
// quantizePackedGroup quantizes multiple tensors and saves them all into a single
// combined safetensors blob. Used for packing expert groups.
// Each tensor may have a different quantization type (mixed-precision).
// Returns the blob bytes. No __metadata__ is added because different tensors
// may use different quantization types.
func quantizePackedGroup(inputs []create.PackedTensorInput) ([]byte, error) {
allArrays := make(map[string]*mlx.Array)
var allToEval []*mlx.Array
var tmpPaths []string
var handles []*mlx.SafetensorsFile
for _, input := range inputs {
tmpPath, toEval, st, err := loadAndQuantizeArray(input.Reader, input.Name, input.Quantize, allArrays)
if tmpPath != "" {
tmpPaths = append(tmpPaths, tmpPath)
}
if st != nil {
handles = append(handles, st)
}
qbiasData, err = os.ReadFile(qbiasPath)
if err != nil {
return nil, nil, nil, nil, nil, nil, fmt.Errorf("failed to read qbiases: %w", err)
// Cleanup on error
for _, h := range handles {
h.Free()
}
for _, p := range tmpPaths {
os.Remove(p)
}
return nil, err
}
allToEval = append(allToEval, toEval...)
}
return qweightData, scalesData, qbiasData, qweightShape, scalesShape, qbiasShape, nil
mlx.Eval(allToEval...)
// Free native handles after eval
for _, h := range handles {
h.Free()
}
// Save combined blob (no global metadata for mixed-precision packed blobs)
tmpDir := ensureTempDir()
outPath := filepath.Join(tmpDir, "packed-combined.safetensors")
defer os.Remove(outPath)
if err := mlx.SaveSafetensorsWithMetadata(outPath, allArrays, nil); err != nil {
return nil, fmt.Errorf("failed to save packed blob: %w", err)
}
blobData, err := os.ReadFile(outPath)
if err != nil {
return nil, fmt.Errorf("failed to read packed blob: %w", err)
}
for _, p := range tmpPaths {
os.Remove(p)
}
return blobData, nil
}
// QuantizeSupported returns true if quantization is supported (MLX build)
@@ -138,3 +205,33 @@ func ensureTempDir() string {
os.MkdirAll(tmpDir, 0755)
return tmpDir
}
// findSafetensorsKey reads the first non-metadata tensor key from a safetensors file.
func findSafetensorsKey(path string) (string, error) {
f, err := os.Open(path)
if err != nil {
return "", err
}
defer f.Close()
var headerSize uint64
if err := binary.Read(f, binary.LittleEndian, &headerSize); err != nil {
return "", err
}
headerBytes := make([]byte, headerSize)
if _, err := io.ReadFull(f, headerBytes); err != nil {
return "", err
}
var header map[string]json.RawMessage
if err := json.Unmarshal(headerBytes, &header); err != nil {
return "", err
}
for k := range header {
if k != "__metadata__" {
return k, nil
}
}
return "", fmt.Errorf("no tensor found in safetensors header")
}

View File

@@ -5,11 +5,18 @@ package client
import (
"fmt"
"io"
"github.com/ollama/ollama/x/create"
)
// quantizeTensor is not available without MLX
func quantizeTensor(r io.Reader, name, dtype string, shape []int32, quantize string) (qweightData, scalesData, qbiasData []byte, qweightShape, scalesShape, qbiasShape []int32, err error) {
return nil, nil, nil, nil, nil, nil, fmt.Errorf("quantization requires MLX support (build with mlx tag)")
func quantizeTensor(r io.Reader, tensorName, dtype string, shape []int32, quantize string) (blobData []byte, err error) {
return nil, fmt.Errorf("quantization requires MLX support (build with mlx tag)")
}
// quantizePackedGroup is not available without MLX
func quantizePackedGroup(inputs []create.PackedTensorInput) ([]byte, error) {
return nil, fmt.Errorf("quantization requires MLX support (build with mlx tag)")
}
// QuantizeSupported returns false when MLX is not available

View File

@@ -6,7 +6,9 @@ import (
"io"
"os"
"path/filepath"
"regexp"
"slices"
"sort"
"strings"
"github.com/ollama/ollama/envconfig"
@@ -228,7 +230,7 @@ type LayerCreator func(r io.Reader, mediaType, name string) (LayerInfo, error)
type TensorLayerCreator func(r io.Reader, name, dtype string, shape []int32) (LayerInfo, error)
// QuantizingTensorLayerCreator creates tensor layers with optional quantization.
// When quantize is non-empty (e.g., "q8"), returns multiple layers (weight + scales + biases).
// When quantize is non-empty (e.g., "int8"), returns multiple layers (weight + scales + biases).
type QuantizingTensorLayerCreator func(r io.Reader, name, dtype string, shape []int32, quantize string) ([]LayerInfo, error)
// ManifestWriter writes the manifest file.
@@ -264,19 +266,19 @@ func ShouldQuantize(name, component string) bool {
// ShouldQuantizeTensor returns true if a tensor should be quantized based on name, shape, and quantize type.
// This is a more detailed check that also considers tensor dimensions.
// The quantize parameter specifies the quantization type (e.g., "q4", "nvfp4", "q8", "mxfp8").
// The quantize parameter specifies the quantization type (e.g., "int4", "nvfp4", "int8", "mxfp8").
func ShouldQuantizeTensor(name string, shape []int32, quantize string) bool {
return GetTensorQuantization(name, shape, quantize) != ""
}
// normalizeQuantType converts various quantization type aliases to canonical forms.
// Supports: q4/Q4/int4/INT4/fp4/FP4 -> q4, q8/Q8/int8/INT8/fp8/FP8 -> q8, nvfp4/NVFP4, mxfp8/MXFP8
// Supports: q4/Q4/int4/INT4/fp4/FP4 -> int4, q8/Q8/int8/INT8/fp8/FP8 -> int8, nvfp4/NVFP4, mxfp8/MXFP8
func normalizeQuantType(quantize string) string {
switch strings.ToUpper(quantize) {
case "Q4", "INT4", "FP4":
return "q4"
return "int4"
case "Q8", "INT8", "FP8":
return "q8"
return "int8"
case "NVFP4":
return "nvfp4"
case "MXFP8":
@@ -286,29 +288,12 @@ func normalizeQuantType(quantize string) string {
}
}
// getQuantGroupSize returns the group size for a given quantization type.
// These must match the values used in quantize.go when creating quantized models.
func getQuantGroupSize(quantize string) int {
switch normalizeQuantType(quantize) {
case "nvfp4":
return 16
case "q4":
return 32
case "mxfp8":
return 32
case "q8":
return 64
default:
return 32
}
}
// GetTensorQuantization returns the appropriate quantization type for a tensor.
// Returns "" if the tensor should not be quantized.
// This implements mixed-precision quantization:
// - Attention MLA weights (q_a, q_b, kv_a, kv_b): unquantized (most sensitive)
// - Output projection, gate/up weights: q4 (less sensitive)
// - Down projection weights: q8 (more sensitive, would be Q6 in GGML but no MLX kernel)
// - Output projection, gate/up weights: int4 (less sensitive)
// - Down projection weights: int8 (more sensitive, would be Q6 in GGML but no MLX kernel)
// - Norms, embeddings, biases, routing gates: no quantization
func GetTensorQuantization(name string, shape []int32, quantize string) string {
// Use basic name-based check first
@@ -330,12 +315,12 @@ func GetTensorQuantization(name string, shape []int32, quantize string) string {
quantNorm := normalizeQuantType(quantize)
// MLX quantization requires last dimension to be divisible by group size
// nvfp4: 16, q4/mxfp8: 32, q8: 64
// nvfp4: 16, int4/mxfp8: 32, int8: 64
groupSize := int32(32)
switch quantNorm {
case "nvfp4":
groupSize = 16
case "q8":
case "int8":
groupSize = 64
}
if shape[len(shape)-1]%groupSize != 0 {
@@ -363,13 +348,13 @@ func GetTensorQuantization(name string, shape []int32, quantize string) string {
return "" // No quantization - keep bf16
}
// Down projection weights - use Q8 (would be Q6_K in GGML, but MLX has no Q6 kernel)
// Down projection weights - use INT8 (would be Q6_K in GGML, but MLX has no Q6 kernel)
// mlp.down_proj, mlp.experts.X.down_proj, mlp.shared_experts.down_proj
if strings.Contains(name, "down_proj") {
return "q8"
return "int8"
}
// Output projection, gate/up weights - use requested quantization (Q4)
// Output projection, gate/up weights - use requested quantization (INT4)
// o_proj, gate_proj, up_proj
if strings.Contains(name, "o_proj") ||
strings.Contains(name, "gate_proj") ||
@@ -386,14 +371,69 @@ func GetTensorQuantization(name string, shape []int32, quantize string) string {
return quantNorm
}
// expertGroupRegexp matches expert tensor names and captures the group prefix.
// Matches: model.layers.{L}.mlp.experts.{E}.{proj}.weight (and .scale, .bias suffixes)
// Captures: model.layers.{L}.mlp.experts
var expertGroupRegexp = regexp.MustCompile(`^(model\.layers\.\d+\.mlp\.(?:shared_)?experts)\..*\.weight`)
// ExpertGroupPrefix returns the group prefix for expert tensors that should be packed together.
// For example:
// - "model.layers.1.mlp.experts.0.down_proj.weight" -> "model.layers.1.mlp.experts"
// - "model.layers.1.mlp.shared_experts.down_proj.weight" -> "model.layers.1.mlp.shared_experts"
// - "model.layers.0.mlp.down_proj.weight" -> "" (dense layer, no experts)
// - "model.layers.1.mlp.gate.weight" -> "" (routing gate, not an expert)
func ExpertGroupPrefix(tensorName string) string {
m := expertGroupRegexp.FindStringSubmatch(tensorName)
if m == nil {
return ""
}
return m[1]
}
// PackedTensorInput holds metadata for a tensor that will be packed into a multi-tensor blob.
type PackedTensorInput struct {
Name string
Dtype string
Shape []int32
Quantize string // per-tensor quantization type (may differ within group)
Reader io.Reader // safetensors-wrapped tensor data
}
// PackedTensorLayerCreator creates a single blob layer containing multiple packed tensors.
// groupName is the group prefix (e.g., "model.layers.1.mlp.experts").
type PackedTensorLayerCreator func(groupName string, tensors []PackedTensorInput) (LayerInfo, error)
// CreateSafetensorsModel imports a standard safetensors model from a directory.
// This handles Hugging Face style models with config.json and *.safetensors files.
// Stores each tensor as a separate blob for fine-grained deduplication.
// If quantize is non-empty (e.g., "q8"), eligible tensors will be quantized.
func CreateSafetensorsModel(modelName, modelDir, quantize string, createLayer LayerCreator, createTensorLayer QuantizingTensorLayerCreator, writeManifest ManifestWriter, fn func(status string)) error {
// Expert tensors are packed into per-layer blobs when createPackedLayer is non-nil.
// If quantize is non-empty (e.g., "int8"), eligible tensors will be quantized.
func CreateSafetensorsModel(modelName, modelDir, quantize string, createLayer LayerCreator, createTensorLayer QuantizingTensorLayerCreator, writeManifest ManifestWriter, fn func(status string), createPackedLayer ...PackedTensorLayerCreator) error {
var layers []LayerInfo
var configLayer LayerInfo
// Resolve the optional packed layer creator
var packedCreator PackedTensorLayerCreator
if len(createPackedLayer) > 0 {
packedCreator = createPackedLayer[0]
}
// Accumulate expert tensors by group prefix for packing.
// Readers reference file-backed SectionReaders, so we keep extractors
// open until each group is flushed to avoid buffering tensor data in memory.
expertGroups := make(map[string][]PackedTensorInput)
var expertGroupOrder []string
// Track open extractors so we can close them after flushing groups
var openExtractors []*safetensors.TensorExtractor
closeExtractors := func() {
for _, ext := range openExtractors {
ext.Close()
}
openExtractors = nil
}
entries, err := os.ReadDir(modelDir)
if err != nil {
return fmt.Errorf("failed to read directory: %w", err)
@@ -410,6 +450,7 @@ func CreateSafetensorsModel(modelName, modelDir, quantize string, createLayer La
// Extract individual tensors from safetensors file
extractor, err := safetensors.OpenForExtraction(stPath)
if err != nil {
closeExtractors()
return fmt.Errorf("failed to open %s: %w", stPath, err)
}
@@ -420,10 +461,14 @@ func CreateSafetensorsModel(modelName, modelDir, quantize string, createLayer La
}
fn(fmt.Sprintf("importing %s (%d tensors%s)", entry.Name(), len(tensorNames), quantizeMsg))
// Track whether this extractor has expert tensors that need to stay open
hasExpertTensors := false
for _, tensorName := range tensorNames {
td, err := extractor.GetTensor(tensorName)
if err != nil {
extractor.Close()
closeExtractors()
return fmt.Errorf("failed to get tensor %s: %w", tensorName, err)
}
@@ -434,20 +479,65 @@ func CreateSafetensorsModel(modelName, modelDir, quantize string, createLayer La
quantizeType = GetTensorQuantization(tensorName, td.Shape, quantize)
}
// Store as minimal safetensors format (88 bytes header overhead)
// This enables native mmap loading via mlx_load_safetensors
// createTensorLayer returns multiple layers if quantizing (weight + scales)
newLayers, err := createTensorLayer(td.SafetensorsReader(), tensorName, td.Dtype, td.Shape, quantizeType)
if err != nil {
extractor.Close()
return fmt.Errorf("failed to create layer for %s: %w", tensorName, err)
// Check if this tensor belongs to an expert group for packing
groupPrefix := ""
if packedCreator != nil {
groupPrefix = ExpertGroupPrefix(tensorName)
}
if groupPrefix != "" {
// Accumulate expert tensor for packed blob.
// The Reader uses a file-backed SectionReader, so we must
// keep the extractor open until this group is flushed.
hasExpertTensors = true
if _, exists := expertGroups[groupPrefix]; !exists {
expertGroupOrder = append(expertGroupOrder, groupPrefix)
}
expertGroups[groupPrefix] = append(expertGroups[groupPrefix], PackedTensorInput{
Name: tensorName,
Dtype: td.Dtype,
Shape: td.Shape,
Quantize: quantizeType,
Reader: td.SafetensorsReader(),
})
} else {
// Store as minimal safetensors format (88 bytes header overhead)
// This enables native mmap loading via mlx_load_safetensors
// createTensorLayer returns multiple layers if quantizing (weight + scales)
newLayers, err := createTensorLayer(td.SafetensorsReader(), tensorName, td.Dtype, td.Shape, quantizeType)
if err != nil {
extractor.Close()
closeExtractors()
return fmt.Errorf("failed to create layer for %s: %w", tensorName, err)
}
layers = append(layers, newLayers...)
}
layers = append(layers, newLayers...)
}
extractor.Close()
if hasExpertTensors {
// Keep extractor open - readers still reference its file handle
openExtractors = append(openExtractors, extractor)
} else {
extractor.Close()
}
}
// Process accumulated expert groups into packed blobs, then close extractors
if packedCreator != nil {
sort.Strings(expertGroupOrder)
for _, groupName := range expertGroupOrder {
tensors := expertGroups[groupName]
fn(fmt.Sprintf("packing %s (%d tensors)", groupName, len(tensors)))
layer, err := packedCreator(groupName, tensors)
if err != nil {
closeExtractors()
return fmt.Errorf("failed to create packed layer for %s: %w", groupName, err)
}
layers = append(layers, layer)
}
}
closeExtractors()
// Process all JSON config files
for _, entry := range entries {
if entry.IsDir() || !strings.HasSuffix(entry.Name(), ".json") {
@@ -487,23 +577,6 @@ func CreateSafetensorsModel(modelName, modelDir, quantize string, createLayer La
return fmt.Errorf("config.json not found in %s", modelDir)
}
// Create model_index.json with quantization info if quantizing
if quantize != "" {
modelIndex := map[string]any{
"quantization": strings.ToUpper(quantize),
"group_size": getQuantGroupSize(quantize),
}
indexData, err := json.MarshalIndent(modelIndex, "", " ")
if err != nil {
return fmt.Errorf("failed to marshal model_index.json: %w", err)
}
indexLayer, err := createLayer(strings.NewReader(string(indexData)), "application/vnd.ollama.image.json", "model_index.json")
if err != nil {
return fmt.Errorf("failed to create model_index.json layer: %w", err)
}
layers = append(layers, indexLayer)
}
fn(fmt.Sprintf("writing manifest for %s", modelName))
if err := writeManifest(modelName, configLayer, layers); err != nil {

View File

@@ -586,6 +586,39 @@ func TestShouldQuantizeTensor(t *testing.T) {
}
}
func TestExpertGroupPrefix(t *testing.T) {
tests := []struct {
name string
want string
}{
// Expert tensors should return the group prefix
{"model.layers.1.mlp.experts.0.down_proj.weight", "model.layers.1.mlp.experts"},
{"model.layers.1.mlp.experts.63.gate_proj.weight", "model.layers.1.mlp.experts"},
{"model.layers.0.mlp.experts.0.up_proj.weight", "model.layers.0.mlp.experts"},
// Shared expert tensors should return their own group prefix
{"model.layers.1.mlp.shared_experts.down_proj.weight", "model.layers.1.mlp.shared_experts"},
{"model.layers.2.mlp.shared_experts.gate_proj.weight", "model.layers.2.mlp.shared_experts"},
// Non-expert tensors should return empty string
{"model.layers.0.mlp.down_proj.weight", ""}, // dense layer, no experts
{"model.layers.1.mlp.gate.weight", ""}, // routing gate, not an expert
{"model.embed_tokens.weight", ""}, // embedding
{"model.layers.0.self_attn.q_proj.weight", ""}, // attention
{"model.norm.weight", ""}, // norm
{"lm_head.weight", ""}, // output head
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got := ExpertGroupPrefix(tt.name)
if got != tt.want {
t.Errorf("ExpertGroupPrefix(%q) = %q, want %q", tt.name, got, tt.want)
}
})
}
}
func TestCreateSafetensorsModel_WithQuantize(t *testing.T) {
dir := t.TempDir()
@@ -751,7 +784,7 @@ func TestCreateImageGenModel_WithQuantize(t *testing.T) {
progressFn := func(status string) {}
err := CreateImageGenModel("test-imagegen", dir, "q8", createLayer, createTensorLayer, writeManifest, progressFn)
err := CreateImageGenModel("test-imagegen", dir, "int8", createLayer, createTensorLayer, writeManifest, progressFn)
if err != nil {
t.Fatalf("CreateImageGenModel failed: %v", err)
}

View File

@@ -15,15 +15,15 @@ import (
// CreateImageGenModel imports an image generation model from a directory.
// Stores each tensor as a separate blob for fine-grained deduplication.
// If quantize is specified, linear weights in transformer/text_encoder are quantized.
// Supported quantization types: q4, q8, nvfp4, mxfp8 (or empty for no quantization).
// Supported quantization types: int4, int8, nvfp4, mxfp8 (or empty for no quantization).
// Layer creation and manifest writing are done via callbacks to avoid import cycles.
func CreateImageGenModel(modelName, modelDir, quantize string, createLayer LayerCreator, createTensorLayer QuantizingTensorLayerCreator, writeManifest ManifestWriter, fn func(status string)) error {
// Validate quantization type
switch quantize {
case "", "q4", "q8", "nvfp4", "mxfp8":
case "", "int4", "int8", "nvfp4", "mxfp8":
// valid
default:
return fmt.Errorf("unsupported quantization type %q: supported types are q4, q8, nvfp4, mxfp8", quantize)
return fmt.Errorf("unsupported quantization type %q: supported types are int4, int8, nvfp4, mxfp8", quantize)
}
var layers []LayerInfo
@@ -214,7 +214,7 @@ func CreateImageGenModel(modelName, modelDir, quantize string, createLayer Layer
// canQuantizeShape returns true if a tensor shape is compatible with MLX quantization.
// MLX requires the last dimension to be divisible by the group size.
// nvfp4: 16, q4/mxfp8: 32, q8: 64
// nvfp4: 16, int4/mxfp8: 32, int8: 64
func canQuantizeShape(shape []int32, quantize string) bool {
if len(shape) < 2 {
return false
@@ -223,7 +223,7 @@ func canQuantizeShape(shape []int32, quantize string) bool {
switch strings.ToUpper(quantize) {
case "NVFP4":
groupSize = 16
case "Q8":
case "INT8":
groupSize = 64
}
return shape[len(shape)-1]%groupSize == 0

View File

@@ -0,0 +1,158 @@
# Tensor Blob Format
Ollama stores model tensors as individual blobs in the safetensors format. Each blob contains either a single logical tensor (or a quantized tensor combined with its scale/bias components), or a group of logical tensors (e.g. the experts for a given layer, along with the scale/bias components for each tensor).
## Safetensors File Format
Every blob follows the [safetensors](https://github.com/huggingface/safetensors) layout:
```
[8 bytes: header_size (uint64 LE)] [header_size bytes: JSON header] [tensor data region]
```
The JSON header maps tensor names to their dtype, shape, and byte offsets within the data region. A special `__metadata__` key holds string-to-string metadata.
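For reference, the header can be parsed with a few lines of Go. The sketch below is illustrative only (it mirrors the `readBlobHeader` helper added in this change) and assumes a blob path on disk:

```go
package blob

import (
	"encoding/binary"
	"encoding/json"
	"io"
	"os"
)

// readHeader returns the JSON header of a safetensors blob:
// an 8-byte little-endian header size, then header_size bytes of JSON.
func readHeader(path string) (map[string]json.RawMessage, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	var headerSize uint64
	if err := binary.Read(f, binary.LittleEndian, &headerSize); err != nil {
		return nil, err
	}

	buf := make([]byte, headerSize)
	if _, err := io.ReadFull(f, buf); err != nil {
		return nil, err
	}

	// Keys are tensor names plus the optional "__metadata__" entry.
	var header map[string]json.RawMessage
	if err := json.Unmarshal(buf, &header); err != nil {
		return nil, err
	}
	return header, nil
}
```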
## Unquantized Blobs
An unquantized blob stores a single tensor keyed by its name:
```json
{
"model.layers.0.self_attn.q_proj.weight": {
"dtype": "BF16",
"shape": [2560, 2560],
"data_offsets": [0, 13107200]
}
}
```
The tensor key is the full tensor name. Dtype is typically `BF16` or `F32`.
## Quantized Blobs (Combined Format)
A quantized blob stores the packed weight, scaling factors, and optional zero-point biases in a single file. Tensor keys use the tensor name, with `.scale` and `.bias` suffixes for the auxiliary tensors:
```json
{
"__metadata__": {
"quant_type": "int4",
"group_size": "32"
},
"model.layers.0.mlp.up_proj.weight": {
"dtype": "U32",
"shape": [2560, 320],
"data_offsets": [0, 3276800]
},
"model.layers.0.mlp.up_proj.weight.scale": {
"dtype": "BF16",
"shape": [2560, 80],
"data_offsets": [3276800, 3686400]
},
"model.layers.0.mlp.up_proj.weight.bias": {
"dtype": "BF16",
"shape": [2560, 80],
"data_offsets": [3686400, 4096000]
}
}
```
### Metadata Fields
| Field | Description |
|---|---|
| `quant_type` | Quantization type: `int4`, `int8`, `nvfp4`, or `mxfp8` |
| `group_size` | Number of elements per quantization group (e.g., `32`, `64`) |
### Tensor Keys
| Key | Description |
|---|---|
| `{name}` | Packed quantized weights (dtype `U32`) |
| `{name}.scale` | Per-group scaling factors |
| `{name}.bias` | Per-group zero-point offsets (affine modes only) |
## Quantization Types
| Type | Bits | Group Size | Mode | Has Bias |
|---|---|---|---|---|
| `int4` | 4 | 32 | affine | yes |
| `int8` | 8 | 64 | affine | yes |
| `nvfp4` | 4 | 16 | nvfp4 | no |
| `mxfp8` | 8 | 32 | mxfp8 | no |
**Affine modes** (`int4`, `int8`) use `scale + bias` for dequantization. The bias tensor provides the zero-point offset.
**Non-affine modes** (`nvfp4`, `mxfp8`) use only `scale` with specialized E4M3 scale formats.
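Conceptually, for a group `g` with per-group `scale` (and `bias` for affine modes), each stored value `q` dequantizes roughly as follows. This is a sketch of the scheme described above; the exact kernels are provided by MLX:

```
ŵ ≈ scale_g * q + bias_g    # int4, int8 (affine)
ŵ ≈ scale_g * q             # nvfp4, mxfp8 (scale-only, E4M3 scales)
```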
### Packed Weight Shape
Quantized weights are packed into `uint32` values:
- **4-bit** (int4, nvfp4): 8 values per uint32, so `packed_cols = original_cols / 8`
- **8-bit** (int8, mxfp8): 4 values per uint32, so `packed_cols = original_cols / 4`
Scale shape: `[rows, original_cols / group_size]`
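As a worked example, the `int4` (group size 32) `up_proj` weight shown earlier has logical shape `[2560, 2560]`: the packed weight is `[2560, 2560/8] = [2560, 320]` `U32` values (3,276,800 bytes), and the scale is `[2560, 2560/32] = [2560, 80]` `BF16` values (409,600 bytes, and likewise for the bias), matching the data offsets in that header.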
## Manifest References
Blobs are referenced from the model manifest as layers:
```json
{
"mediaType": "application/vnd.ollama.image.tensor",
"digest": "sha256:abc123...",
"size": 4096150,
"name": "model.layers.0.mlp.up_proj.weight"
}
```
Each tensor (quantized or not) is one layer in the manifest. The layer name matches the tensor key in the blob header.
## Packed Blobs (Expert Groups)
For MoE (Mixture of Experts) models, expert tensors from the same layer are packed into a single blob to reduce blob count and improve loading efficiency. A packed blob is a standard safetensors file containing multiple tensor entries:
```json
{
"model.layers.1.mlp.experts.0.down_proj.weight": {
"dtype": "U32",
"shape": [2560, 640],
"data_offsets": [0, 6553600]
},
"model.layers.1.mlp.experts.0.down_proj.weight.scale": {
"dtype": "BF16",
"shape": [2560, 40],
"data_offsets": [6553600, 6963200]
},
"model.layers.1.mlp.experts.0.gate_proj.weight": {
"dtype": "U32",
"shape": [10240, 320],
"data_offsets": [6963200, 20070400]
},
"model.layers.1.mlp.experts.0.gate_proj.weight.scale": { "..." : "..." }
}
```
### Grouping Rules
- `model.layers.{L}.mlp.experts.*` tensors are packed into one blob per layer
- `model.layers.{L}.mlp.shared_experts.*` tensors are packed into one blob per layer
- All other tensors remain as individual blobs
### Manifest Representation
One manifest layer per packed group, using the group prefix as the layer name:
```json
{
"mediaType": "application/vnd.ollama.image.tensor",
"digest": "sha256:...",
"size": 123456789,
"name": "model.layers.1.mlp.experts"
}
```
## Loading
At load time, `mlx_load_safetensors` opens each blob via mmap for zero-copy access. For combined quantized blobs, the loader extracts `{name}`, `{name}.scale`, and `{name}.bias` tensors and caches them as `name`, `name + "_scale"`, and `name + "_qbias"` respectively, maintaining compatibility with the weight loading interface.
For packed blobs, if the manifest layer name (group prefix) is not found as a tensor key, the loader parses the blob header to discover all tensor names and loads each individually.
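A condensed sketch of this lookup order, using the `mlx` and manifest helpers introduced in this change (error handling, dtype conversion, batch eval, and handle cleanup are omitted; this is illustrative, not the full loader):

```go
// loadBlob caches tensors from one blob, falling back to header parsing
// for packed blobs whose manifest name is only a group prefix.
func loadBlob(path, layerName string, cache map[string]*mlx.Array) error {
	sf, err := mlx.LoadSafetensorsNative(path) // mmap, zero-copy
	if err != nil {
		return err
	}
	if arr := sf.Get(layerName); arr != nil {
		// Single-tensor or combined quantized blob: keyed by the manifest name.
		cache[layerName] = mlx.Contiguous(arr)
		if scale := sf.Get(layerName + ".scale"); scale != nil {
			cache[layerName+"_scale"] = mlx.Contiguous(scale)
		}
		if bias := sf.Get(layerName + ".bias"); bias != nil {
			cache[layerName+"_qbias"] = mlx.Contiguous(bias)
		}
		return nil
	}
	// Packed blob: discover the real tensor names from the blob header.
	names, err := ParseBlobTensorNames(path)
	if err != nil {
		return err
	}
	for _, name := range names {
		if arr := sf.Get(name); arr != nil {
			cache[name] = mlx.Contiguous(arr)
		}
		// ".scale"/".bias" companions are cached the same way as above.
	}
	return nil
}
```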

View File

@@ -1,11 +1,13 @@
package manifest
import (
"encoding/binary"
"encoding/json"
"fmt"
"io"
"os"
"path/filepath"
"sort"
"strings"
"github.com/ollama/ollama/envconfig"
@@ -205,17 +207,12 @@ func GetModelInfo(modelName string) (*ModelInfo, error) {
}
}
// Fallback: detect quantization from tensor names if not in config
// Fallback: detect quantization from first tensor blob's __metadata__
if info.Quantization == "" {
for _, layer := range manifest.Manifest.Layers {
if strings.HasSuffix(layer.Name, ".weight_scale") {
info.Quantization = "Q8"
break
}
}
if info.Quantization == "" {
info.Quantization = "BF16"
}
info.Quantization = detectQuantizationFromBlobs(manifest)
}
if info.Quantization == "" {
info.Quantization = "BF16"
}
// Fallback: estimate parameter count if not in config
@@ -223,9 +220,7 @@ func GetModelInfo(modelName string) (*ModelInfo, error) {
var totalSize int64
for _, layer := range manifest.Manifest.Layers {
if layer.MediaType == "application/vnd.ollama.image.tensor" {
if !strings.HasSuffix(layer.Name, "_scale") && !strings.HasSuffix(layer.Name, "_qbias") {
totalSize += layer.Size
}
totalSize += layer.Size
}
}
// Assume BF16 (2 bytes/param) as rough estimate
@@ -234,3 +229,79 @@ func GetModelInfo(modelName string) (*ModelInfo, error) {
return info, nil
}
// detectQuantizationFromBlobs reads __metadata__ from the first tensor blob
// to detect quantization type.
func detectQuantizationFromBlobs(manifest *ModelManifest) string {
for _, layer := range manifest.Manifest.Layers {
if layer.MediaType != "application/vnd.ollama.image.tensor" {
continue
}
data, err := readBlobHeader(manifest.BlobPath(layer.Digest))
if err != nil {
continue
}
var header map[string]json.RawMessage
if json.Unmarshal(data, &header) != nil {
continue
}
if metaRaw, ok := header["__metadata__"]; ok {
var meta map[string]string
if json.Unmarshal(metaRaw, &meta) == nil {
if qt, ok := meta["quant_type"]; ok && qt != "" {
return strings.ToUpper(qt)
}
}
}
// Only check the first tensor blob
break
}
return ""
}
// ParseBlobTensorNames reads a safetensors blob and returns all "main" tensor names.
// Filters out __metadata__, .scale, and .bias entries to return only primary weight tensors.
func ParseBlobTensorNames(path string) ([]string, error) {
data, err := readBlobHeader(path)
if err != nil {
return nil, err
}
var header map[string]json.RawMessage
if err := json.Unmarshal(data, &header); err != nil {
return nil, err
}
var names []string
for k := range header {
if k == "__metadata__" || strings.HasSuffix(k, ".scale") || strings.HasSuffix(k, ".bias") {
continue
}
names = append(names, k)
}
sort.Strings(names)
return names, nil
}
// readBlobHeader reads the JSON header bytes from a safetensors blob file.
func readBlobHeader(path string) ([]byte, error) {
f, err := os.Open(path)
if err != nil {
return nil, err
}
defer f.Close()
var headerSize uint64
if err := binary.Read(f, binary.LittleEndian, &headerSize); err != nil {
return nil, err
}
if headerSize > 1024*1024 {
return nil, fmt.Errorf("header too large: %d", headerSize)
}
data := make([]byte, headerSize)
if _, err := io.ReadFull(f, data); err != nil {
return nil, err
}
return data, nil
}

View File

@@ -5,6 +5,7 @@ package manifest
import (
"fmt"
"sort"
"strconv"
"strings"
"github.com/ollama/ollama/x/imagegen/mlx"
@@ -18,6 +19,8 @@ type ManifestWeights struct {
tensors map[string]ManifestLayer // name -> layer
cache map[string]*mlx.Array // name -> loaded array
nativeCache []*mlx.SafetensorsFile // keep native handles alive
quantType string // quantization type from blob metadata (e.g., "int4", "int8")
groupSize int // quantization group size from blob metadata
}
// LoadWeightsFromManifest creates a weight loader from manifest storage.
@@ -54,43 +57,115 @@ func LoadWeightsFromManifest(manifest *ModelManifest, component string) (*Manife
// Load loads all tensor blobs using native mmap (zero-copy).
// Blobs are stored in safetensors format for native mlx_load_safetensors mmap.
// If dtype is non-zero, tensors are converted to the specified dtype.
// Combined quantized blobs contain tensors keyed by name, name+".scale", and optional name+".bias"
// with quantization metadata. Scale and bias are stored in cache as name+"_scale"
// and name+"_qbias" for compatibility with downstream loading code.
// Packed blobs (e.g., for expert groups) contain multiple tensors; the manifest name
// is a group prefix and individual tensors are loaded by their actual names from the blob.
// If dtype is non-zero, non-quantized tensors are converted to the specified dtype.
func (mw *ManifestWeights) Load(dtype mlx.Dtype) error {
// Track native handles to free after batch eval
nativeHandles := make([]*mlx.SafetensorsFile, 0, len(mw.tensors))
arrays := make([]*mlx.Array, 0, len(mw.tensors))
// Group tensors by digest to avoid loading the same blob multiple times
type blobEntry struct {
name string
layer ManifestLayer
}
blobGroups := make(map[string][]blobEntry)
for name, layer := range mw.tensors {
path := mw.manifest.BlobPath(layer.Digest)
blobGroups[layer.Digest] = append(blobGroups[layer.Digest], blobEntry{name, layer})
}
for digest, entries := range blobGroups {
path := mw.manifest.BlobPath(digest)
// Load blob as safetensors (native mmap, zero-copy)
sf, err := mlx.LoadSafetensorsNative(path)
if err != nil {
// Free any handles we've accumulated
for _, h := range nativeHandles {
h.Free()
}
return fmt.Errorf("load %s: %w", name, err)
return fmt.Errorf("load %s: %w", entries[0].name, err)
}
nativeHandles = append(nativeHandles, sf)
// Blob contains single tensor named "data"
arr := sf.Get("data")
if arr == nil {
for _, h := range nativeHandles {
h.Free()
// Read quantization metadata from blob
if qt := sf.GetMetadata("quant_type"); qt != "" && mw.quantType == "" {
mw.quantType = qt
if gs := sf.GetMetadata("group_size"); gs != "" {
mw.groupSize, _ = strconv.Atoi(gs)
}
return fmt.Errorf("tensor 'data' not found in blob for %s", name)
}
// Convert dtype if needed
if dtype != 0 && arr.Dtype() != dtype {
arr = mlx.AsType(arr, dtype)
for _, entry := range entries {
name := entry.name
// Try to get tensor by manifest name
arr := sf.Get(name)
if arr != nil {
// Single-tensor blob or tensor found by name
if dtype != 0 && arr.Dtype() != dtype {
arr = mlx.AsType(arr, dtype)
}
arr = mlx.Contiguous(arr)
mw.cache[name] = arr
arrays = append(arrays, arr)
// Check for scale tensor
if scale := sf.Get(name + ".scale"); scale != nil {
scale = mlx.Contiguous(scale)
mw.cache[name+"_scale"] = scale
arrays = append(arrays, scale)
}
// Check for bias tensor
if bias := sf.Get(name + ".bias"); bias != nil {
bias = mlx.Contiguous(bias)
mw.cache[name+"_qbias"] = bias
arrays = append(arrays, bias)
}
} else {
// Packed blob: manifest name is a group prefix, not a tensor name.
// Load all individual tensors from the blob.
tensorNames, err := ParseBlobTensorNames(path)
if err != nil {
for _, h := range nativeHandles {
h.Free()
}
return fmt.Errorf("parse packed blob for %s: %w", name, err)
}
for _, tensorName := range tensorNames {
tArr := sf.Get(tensorName)
if tArr == nil {
continue
}
if dtype != 0 && tArr.Dtype() != dtype {
tArr = mlx.AsType(tArr, dtype)
}
tArr = mlx.Contiguous(tArr)
mw.cache[tensorName] = tArr
arrays = append(arrays, tArr)
// Check for scale tensor
if scale := sf.Get(tensorName + ".scale"); scale != nil {
scale = mlx.Contiguous(scale)
mw.cache[tensorName+"_scale"] = scale
arrays = append(arrays, scale)
}
// Check for bias tensor
if bias := sf.Get(tensorName + ".bias"); bias != nil {
bias = mlx.Contiguous(bias)
mw.cache[tensorName+"_qbias"] = bias
arrays = append(arrays, bias)
}
}
}
}
// Make contiguous copy to ensure independence from mmap
arr = mlx.Contiguous(arr)
mw.cache[name] = arr
arrays = append(arrays, arr)
}
// Batch evaluate all tensors at once (much faster than one at a time)
@@ -117,30 +192,50 @@ func (mw *ManifestWeights) GetTensor(name string) (*mlx.Array, error) {
}
// ListTensors returns all tensor names in sorted order.
// Includes both manifest tensor names and scale/bias entries from combined blobs.
func (mw *ManifestWeights) ListTensors() []string {
names := make([]string, 0, len(mw.tensors))
seen := make(map[string]bool, len(mw.tensors)+len(mw.cache))
for name := range mw.tensors {
seen[name] = true
}
// Also include cache entries (scale/bias from combined blobs)
for name := range mw.cache {
seen[name] = true
}
names := make([]string, 0, len(seen))
for name := range seen {
names = append(names, name)
}
sort.Strings(names)
return names
}
// HasTensor checks if a tensor exists.
// HasTensor checks if a tensor exists in the manifest or cache.
func (mw *ManifestWeights) HasTensor(name string) bool {
_, ok := mw.tensors[name]
return ok
if _, ok := mw.tensors[name]; ok {
return true
}
// Also check cache for scale/bias entries from combined blobs
if _, ok := mw.cache[name]; ok {
return true
}
return false
}
// Quantization returns the model's quantization type from model_index.json.
// Quantization returns the model's quantization type.
// Returns the quant_type from blob metadata (e.g., "int4", "int8", "nvfp4", "mxfp8").
// Returns empty string if not quantized.
// Falls back to detecting from tensor names and shapes if not in config.
// Falls back to model_index.json for image gen models.
func (mw *ManifestWeights) Quantization() string {
if mw.quantType != "" {
return strings.ToUpper(mw.quantType)
}
if mw.manifest == nil {
return ""
}
// Try to read from model_index.json first
// Fallback: read from model_index.json (for image gen models)
var index struct {
Quantization string `json:"quantization"`
}
@@ -148,89 +243,22 @@ func (mw *ManifestWeights) Quantization() string {
return index.Quantization
}
// Fallback: detect from tensor names
// Check if any tensors have _scale suffix (indicates quantization)
hasScales := false
hasQBias := false
for name := range mw.tensors {
if strings.HasSuffix(name, ".weight_scale") {
hasScales = true
}
if strings.HasSuffix(name, ".weight_qbias") {
hasQBias = true
}
}
if !hasScales {
// No scales = not quantized
return ""
}
// Has scales but no qbias = NVFP4 (or other non-affine mode)
if !hasQBias {
return "NVFP4"
}
// Has both scales and qbias = affine mode
// Need to determine FP4 vs FP8 from tensor shapes
// FP4: weight last dim is 1/8 of scales last dim * group_size
// FP8: weight last dim is 1/4 of scales last dim * group_size
//
// For affine mode with group_size=32:
// - FP4 (4 bits): 8 elements packed per uint32, so weight_dim = orig_dim / 8
// - FP8 (8 bits): 4 elements packed per uint32, so weight_dim = orig_dim / 4
// scales_dim = orig_dim / group_size
// So: weight_dim / scales_dim = group_size / pack_factor
// FP4: ratio = 32/8 = 4
// FP8: ratio = 32/4 = 8
// Find a weight/scale pair to check the ratio
for name := range mw.tensors {
if !strings.HasSuffix(name, ".weight") || strings.Contains(name, "_scale") || strings.Contains(name, "_qbias") {
continue
}
scaleName := name + "_scale"
if _, ok := mw.tensors[scaleName]; !ok {
continue
}
// Load both tensors to check shapes
weightLayer := mw.tensors[name]
scaleLayer := mw.tensors[scaleName]
// Get shapes from manifest layer metadata if available
// For now, default to FP4 since it's more common
// The actual shape check would require loading the tensor
// Simple heuristic: check if scale tensor is ~4x smaller than weight
// FP4: weight is packed 8 per uint32, scales are 1 per group (32)
// So scale size should be ~weight_size * 8 / 32 = weight_size / 4
// FP8: weight is packed 4 per uint32, scales are 1 per group (32)
// So scale size should be ~weight_size * 4 / 32 = weight_size / 8
// Rough size heuristic (assuming float16 scales)
// Q4: scale_bytes ≈ weight_bytes / 4 * 2 / 4 = weight_bytes / 8
// Q8: scale_bytes ≈ weight_bytes / 8 * 2 / 4 = weight_bytes / 16
ratio := float64(weightLayer.Size) / float64(scaleLayer.Size)
if ratio < 12 {
// Closer to 8 = Q4
return "Q4"
}
// Closer to 16 = Q8
return "Q8"
}
// Default to Q4 for affine mode (most common)
return "Q4"
return ""
}
// GroupSize returns the quantization group size from model_index.json.
// GroupSize returns the quantization group size.
// Returns the group_size from blob metadata.
// Returns 0 if not specified (caller should use default based on quantization type).
func (mw *ManifestWeights) GroupSize() int {
if mw.groupSize > 0 {
return mw.groupSize
}
if mw.manifest == nil {
return 0
}
// Fallback: read from model_index.json (for image gen models)
var index struct {
GroupSize int `json:"group_size"`
}

View File

@@ -1544,6 +1544,18 @@ func (s *SafetensorsFile) Count() int {
return 0
}
// GetMetadata retrieves a metadata value by key from the safetensors file
func (s *SafetensorsFile) GetMetadata(key string) string {
cKey := C.CString(key)
defer C.free(unsafe.Pointer(cKey))
var cValue *C.char
if C.mlx_map_string_to_string_get(&cValue, s.metadata, cKey) != 0 {
return ""
}
return C.GoString(cValue)
}
// Free releases the safetensors file
func (s *SafetensorsFile) Free() {
C.mlx_map_string_to_array_free(s.arrays)
@@ -1578,6 +1590,41 @@ func SaveSafetensors(path string, arrays map[string]*Array) error {
return nil
}
// SaveSafetensorsWithMetadata saves arrays to a safetensors file with metadata key/value pairs.
// This is like SaveSafetensors but inserts metadata into the __metadata__ section.
func SaveSafetensorsWithMetadata(path string, arrays map[string]*Array, metadata map[string]string) error {
cPath := C.CString(path)
defer C.free(unsafe.Pointer(cPath))
// Create the array map
cArrays := C.mlx_map_string_to_array_new()
defer C.mlx_map_string_to_array_free(cArrays)
for name, arr := range arrays {
cName := C.CString(name)
C.mlx_map_string_to_array_insert(cArrays, cName, arr.c)
C.free(unsafe.Pointer(cName))
}
// Create metadata map
cMeta := C.mlx_map_string_to_string_new()
defer C.mlx_map_string_to_string_free(cMeta)
for key, value := range metadata {
cKey := C.CString(key)
cValue := C.CString(value)
C.mlx_map_string_to_string_insert(cMeta, cKey, cValue)
C.free(unsafe.Pointer(cKey))
C.free(unsafe.Pointer(cValue))
}
// Save
if C.mlx_save_safetensors(cPath, cArrays, cMeta) != 0 {
return fmt.Errorf("failed to save safetensors: %s", path)
}
return nil
}
// ============ NPY Loading ============
// LoadNpy loads a numpy array from an npy file

View File

@@ -41,13 +41,11 @@ func (td *TensorData) Reader() io.Reader {
return td.reader
}
// SafetensorsReader returns a reader that outputs the tensor wrapped in
// minimal safetensors format. This allows using mlx_load_safetensors on
// individual tensor blobs for native zero-copy loading.
func (td *TensorData) SafetensorsReader() io.Reader {
// Build minimal safetensors header with tensor named "data"
header := map[string]tensorInfo{
"data": {
// safetensorsHeader builds the JSON header for a minimal safetensors blob
// containing a single tensor keyed by its name.
func (td *TensorData) safetensorsHeader() []byte {
header := map[string]any{
td.Name: tensorInfo{
Dtype: td.Dtype,
Shape: td.Shape,
DataOffsets: [2]int{0, int(td.Size)},
@@ -58,6 +56,15 @@ func (td *TensorData) SafetensorsReader() io.Reader {
// Pad header to 8-byte alignment
padding := (8 - len(headerJSON)%8) % 8
headerJSON = append(headerJSON, bytes.Repeat([]byte(" "), padding)...)
return headerJSON
}
// SafetensorsReader returns a reader that outputs the tensor wrapped in
// minimal safetensors format. This allows using mlx_load_safetensors on
// individual tensor blobs for native zero-copy loading.
// The tensor is keyed by its name in the safetensors header.
func (td *TensorData) SafetensorsReader() io.Reader {
headerJSON := td.safetensorsHeader()
// Build header with size prefix
headerBuf := new(bytes.Buffer)
@@ -71,16 +78,77 @@ func (td *TensorData) SafetensorsReader() io.Reader {
// SafetensorsSize returns the total size of the safetensors-wrapped tensor.
func (td *TensorData) SafetensorsSize() int64 {
header := map[string]tensorInfo{
"data": {
headerJSON := td.safetensorsHeader()
return 8 + int64(len(headerJSON)) + td.Size
}
// NewTensorDataFromBytes creates a TensorData from raw tensor bytes.
// This is useful for constructing packed blobs from already-extracted data.
func NewTensorDataFromBytes(name, dtype string, shape []int32, rawData []byte) *TensorData {
return &TensorData{
Name: name,
Dtype: dtype,
Shape: shape,
Size: int64(len(rawData)),
reader: io.NewSectionReader(bytes.NewReader(rawData), 0, int64(len(rawData))),
}
}
// ExtractRawFromSafetensors reads a safetensors-wrapped reader and extracts
// the raw tensor data bytes (stripping the header).
func ExtractRawFromSafetensors(r io.Reader) ([]byte, error) {
// Read header size (8 bytes, little endian)
var headerSize uint64
if err := binary.Read(r, binary.LittleEndian, &headerSize); err != nil {
return nil, fmt.Errorf("failed to read header size: %w", err)
}
// Skip header
if _, err := io.CopyN(io.Discard, r, int64(headerSize)); err != nil {
return nil, fmt.Errorf("failed to skip header: %w", err)
}
// Read remaining bytes (the raw tensor data)
return io.ReadAll(r)
}
// BuildPackedSafetensorsReader builds a streaming io.Reader that outputs a valid
// safetensors file containing multiple tensors. Used for packing expert tensors
// into a single blob without loading all data into memory.
// Each TensorData must have been obtained from GetTensor.
func BuildPackedSafetensorsReader(tensors []*TensorData) io.Reader {
// Build the header with sequential data offsets
header := make(map[string]tensorInfo, len(tensors))
var offset int
for _, td := range tensors {
header[td.Name] = tensorInfo{
Dtype: td.Dtype,
Shape: td.Shape,
DataOffsets: [2]int{0, int(td.Size)},
},
DataOffsets: [2]int{offset, offset + int(td.Size)},
}
offset += int(td.Size)
}
headerJSON, _ := json.Marshal(header)
// Pad header to 8-byte alignment
padding := (8 - len(headerJSON)%8) % 8
return 8 + int64(len(headerJSON)) + int64(padding) + td.Size
headerJSON = append(headerJSON, bytes.Repeat([]byte(" "), padding)...)
// Build header with size prefix
headerBuf := new(bytes.Buffer)
binary.Write(headerBuf, binary.LittleEndian, uint64(len(headerJSON)))
headerBuf.Write(headerJSON)
// Build multi-reader: header + all tensor data readers
readers := make([]io.Reader, 0, 1+len(tensors))
readers = append(readers, headerBuf)
for _, td := range tensors {
td.reader.Seek(0, io.SeekStart)
readers = append(readers, td.reader)
}
return io.MultiReader(readers...)
}
// OpenForExtraction opens a safetensors file for tensor extraction.

View File

@@ -17,7 +17,7 @@ type WeightSource interface {
GetTensor(name string) (*mlx.Array, error)
ListTensors() []string
HasTensor(name string) bool
Quantization() string // Returns "NVFP4", "Q4", "Q8", or ""
Quantization() string // Returns "NVFP4", "INT4", "INT8", or ""
GroupSize() int // Returns quantization group size, or 0 if not specified
}

View File

@@ -6,6 +6,7 @@ import (
"fmt"
"io"
"os"
"sort"
"strings"
"github.com/ollama/ollama/api"
@@ -105,9 +106,9 @@ func buildModelInfo(config modelConfig, totalTensorBytes, tensorCount int64) map
bytesPerParam = 1
}
// Subtract safetensors header overhead (88 bytes per tensor file)
// Each tensor is stored as a minimal safetensors file
totalBytes := totalTensorBytes - tensorCount*88
// Subtract safetensors header overhead per tensor blob.
// Headers are keyed by the tensor name and may carry __metadata__, so overhead is ~150 bytes on average.
totalBytes := totalTensorBytes - tensorCount*150
paramCount := totalBytes / bytesPerParam
@@ -163,24 +164,103 @@ func GetSafetensorsTensorInfo(name model.Name) ([]api.Tensor, error) {
// getTensorInfoFromManifest extracts tensor info from a manifest.
// This is separated for testability.
// For quantized models, groups weight/scale/qbias into single entries with detected quantization type.
// For quantized tensors, reads quant_type from blob __metadata__.
// For packed blobs (multiple tensors per blob), enumerates all tensors in the blob.
func getTensorInfoFromManifest(mf *manifest.Manifest) ([]api.Tensor, error) {
var tensors []api.Tensor
// First pass: collect all tensor info and identify scale tensors
type tensorData struct {
info *safetensorsTensorInfo
digest string
}
tensorMap := make(map[string]*tensorData)
scaleMap := make(map[string]*tensorData) // base name -> scale tensor info
for _, layer := range mf.Layers {
if layer.MediaType != manifest.MediaTypeImageTensor {
continue
}
// Read the safetensors header from the blob
// Read all tensor entries from the safetensors header
blobPath, err := manifest.BlobsPath(layer.Digest)
if err != nil {
continue
}
f, err := os.Open(blobPath)
if err != nil {
continue
}
allInfos, err := parseSafetensorsAllHeaders(f)
f.Close()
if err != nil {
continue
}
// Determine if this is a packed blob (multiple main tensors)
isPacked := len(allInfos) > 1
for _, info := range allInfos {
tensorName := layer.Name
if isPacked {
// For packed blobs, use the tensor name from the header
tensorName = info.Name
}
if info.QuantType != "" {
quantType := strings.ToUpper(info.QuantType)
shape := make([]uint64, len(info.Shape))
for i, s := range info.Shape {
shape[i] = uint64(s)
}
var packFactor int64
switch strings.ToLower(info.QuantType) {
case "int4", "nvfp4":
packFactor = 8
case "int8", "mxfp8":
packFactor = 4
}
if packFactor > 0 && len(shape) >= 2 {
shape[len(shape)-1] = uint64(info.Shape[len(info.Shape)-1] * packFactor)
}
tensors = append(tensors, api.Tensor{
Name: tensorName,
Type: quantType,
Shape: shape,
})
} else {
shape := make([]uint64, len(info.Shape))
for i, s := range info.Shape {
shape[i] = uint64(s)
}
tensors = append(tensors, api.Tensor{
Name: tensorName,
Type: info.Dtype,
Shape: shape,
})
}
}
}
sort.Slice(tensors, func(i, j int) bool {
return tensors[i].Name < tensors[j].Name
})
return tensors, nil
}
// GetSafetensorsDtype returns the quantization type for a safetensors model.
// Reads quant_type from the first tensor blob's __metadata__.
// Falls back to torch_dtype from config.json if no quant metadata.
func GetSafetensorsDtype(name model.Name) (string, error) {
mf, err := manifest.ParseNamedManifest(name)
if err != nil {
return "", fmt.Errorf("failed to load manifest: %w", err)
}
// Check first tensor blob for quant_type metadata
for _, layer := range mf.Layers {
if layer.MediaType != manifest.MediaTypeImageTensor {
continue
}
blobPath, err := manifest.BlobsPath(layer.Digest)
if err != nil {
continue
@@ -189,131 +269,11 @@ func getTensorInfoFromManifest(mf *manifest.Manifest) ([]api.Tensor, error) {
if err != nil {
continue
}
td := &tensorData{info: info, digest: layer.Digest}
if strings.HasSuffix(layer.Name, "_scale") {
baseName := strings.TrimSuffix(layer.Name, "_scale")
scaleMap[baseName] = td
} else if strings.HasSuffix(layer.Name, "_qbias") {
// Skip qbias tensors - they're included with the quantized weight
continue
} else {
tensorMap[layer.Name] = td
if info.QuantType != "" {
return strings.ToUpper(info.QuantType), nil
}
}
// Second pass: build tensor list with quantization info
for _, layer := range mf.Layers {
if layer.MediaType != manifest.MediaTypeImageTensor {
continue
}
// Skip scale and qbias tensors
if strings.HasSuffix(layer.Name, "_scale") || strings.HasSuffix(layer.Name, "_qbias") {
continue
}
td := tensorMap[layer.Name]
if td == nil {
continue
}
// Check if this tensor has a corresponding scale tensor (quantized)
scaleTd := scaleMap[layer.Name]
if scaleTd != nil && len(td.info.Shape) >= 2 && len(scaleTd.info.Shape) >= 2 {
// Quantized tensor - detect bits from shapes
weightCols := td.info.Shape[len(td.info.Shape)-1]
scaleCols := scaleTd.info.Shape[len(scaleTd.info.Shape)-1]
// Detect quantization: Q4 has pack_factor=8, Q8 has pack_factor=4
// Q4 uses group_size=32: weightCols * 8 / scaleCols = 32
// Q8 uses group_size=64: weightCols * 4 / scaleCols = 64
var bits int
var quantType string
if weightCols*8/scaleCols == 32 {
bits = 4
quantType = "Q4"
} else if weightCols*4/scaleCols == 64 {
bits = 8
quantType = "Q8"
} else {
// Unknown quantization, show raw
quantType = td.info.Dtype
}
// Calculate unpacked shape
shape := make([]uint64, len(td.info.Shape))
for i, s := range td.info.Shape {
shape[i] = uint64(s)
}
if bits > 0 {
packFactor := int64(32 / bits)
shape[len(shape)-1] = uint64(td.info.Shape[len(td.info.Shape)-1] * packFactor)
}
tensors = append(tensors, api.Tensor{
Name: layer.Name,
Type: quantType,
Shape: shape,
})
} else {
// Non-quantized tensor
shape := make([]uint64, len(td.info.Shape))
for i, s := range td.info.Shape {
shape[i] = uint64(s)
}
tensors = append(tensors, api.Tensor{
Name: layer.Name,
Type: td.info.Dtype,
Shape: shape,
})
}
}
return tensors, nil
}
// GetSafetensorsDtype returns the quantization type for a safetensors model.
// Reads from model_index.json first, falls back to detection from tensor names.
// Otherwise returns the torch_dtype from config.json.
func GetSafetensorsDtype(name model.Name) (string, error) {
mf, err := manifest.ParseNamedManifest(name)
if err != nil {
return "", fmt.Errorf("failed to load manifest: %w", err)
}
// First try to read quantization from model_index.json
var modelIndex struct {
Quantization string `json:"quantization"`
}
if err := mf.ReadConfigJSON("model_index.json", &modelIndex); err == nil && modelIndex.Quantization != "" {
return modelIndex.Quantization, nil
}
// Fallback: detect from tensor names
hasScales := false
hasQBias := false
for _, layer := range mf.Layers {
if layer.MediaType == manifest.MediaTypeImageTensor {
if strings.HasSuffix(layer.Name, "_scale") {
hasScales = true
}
if strings.HasSuffix(layer.Name, "_qbias") {
hasQBias = true
}
}
}
if hasScales {
if hasQBias {
// Affine mode (has scale + qbias) - could be Q4 or Q8
// Default to Q4 as it's more common
return "Q4", nil
}
// No qbias = NVFP4
return "NVFP4", nil
// Only check the first tensor blob
break
}
// Not quantized - return torch_dtype from config.json
@@ -329,8 +289,11 @@ func GetSafetensorsDtype(name model.Name) (string, error) {
// safetensorsTensorInfo holds metadata about a tensor from a safetensors header
type safetensorsTensorInfo struct {
Dtype string `json:"dtype"`
Shape []int64 `json:"shape"`
Name string // tensor name from the header key
Dtype string `json:"dtype"`
Shape []int64 `json:"shape"`
QuantType string // from __metadata__.quant_type (e.g., "int4", "int8", "nvfp4", "mxfp8")
GroupSize string // from __metadata__.group_size (e.g., "32", "64")
}
// readSafetensorsHeader reads the JSON header from a safetensors file to get tensor metadata.
@@ -347,6 +310,7 @@ func readSafetensorsHeader(path string) (*safetensorsTensorInfo, error) {
// parseSafetensorsHeader parses a safetensors header from a reader.
// This is separated for testability.
// Parses __metadata__ for quant_type and group_size if present.
func parseSafetensorsHeader(r io.Reader) (*safetensorsTensorInfo, error) {
// Read header size (8 bytes, little endian)
var headerSize uint64
@@ -371,7 +335,31 @@ func parseSafetensorsHeader(r io.Reader) (*safetensorsTensorInfo, error) {
return nil, fmt.Errorf("failed to parse header: %w", err)
}
// Find the first (and should be only) tensor entry
// Parse metadata if present
var quantType, groupSize string
if metaRaw, ok := header["__metadata__"]; ok {
var meta map[string]string
if json.Unmarshal(metaRaw, &meta) == nil {
quantType = meta["quant_type"]
groupSize = meta["group_size"]
}
}
// Find the main tensor entry (not __metadata__, .scale, or .bias)
for name, raw := range header {
if name == "__metadata__" || strings.HasSuffix(name, ".scale") || strings.HasSuffix(name, ".bias") {
continue
}
var info safetensorsTensorInfo
if err := json.Unmarshal(raw, &info); err != nil {
return nil, fmt.Errorf("failed to parse tensor info: %w", err)
}
info.QuantType = quantType
info.GroupSize = groupSize
return &info, nil
}
// Fall back to first non-metadata tensor entry
for name, raw := range header {
if name == "__metadata__" {
continue
@@ -380,8 +368,134 @@ func parseSafetensorsHeader(r io.Reader) (*safetensorsTensorInfo, error) {
if err := json.Unmarshal(raw, &info); err != nil {
return nil, fmt.Errorf("failed to parse tensor info: %w", err)
}
info.QuantType = quantType
info.GroupSize = groupSize
return &info, nil
}
return nil, fmt.Errorf("no tensor found in header")
}
// parseSafetensorsAllHeaders parses all tensor entries from a safetensors header.
// Returns one safetensorsTensorInfo per main tensor (skipping __metadata__, .scale, .bias).
// For packed blobs this returns multiple entries; for single-tensor blobs, one entry.
// Each tensor's quant type is inferred from its shape and the presence of .scale/.bias entries
// when no global __metadata__ quant_type is present.
func parseSafetensorsAllHeaders(r io.Reader) ([]safetensorsTensorInfo, error) {
var headerSize uint64
if err := binary.Read(r, binary.LittleEndian, &headerSize); err != nil {
return nil, fmt.Errorf("failed to read header size: %w", err)
}
if headerSize > 100*1024*1024 { // 100MB limit for packed blob headers
return nil, fmt.Errorf("header size too large: %d", headerSize)
}
headerBytes := make([]byte, headerSize)
if _, err := io.ReadFull(r, headerBytes); err != nil {
return nil, fmt.Errorf("failed to read header: %w", err)
}
var header map[string]json.RawMessage
if err := json.Unmarshal(headerBytes, &header); err != nil {
return nil, fmt.Errorf("failed to parse header: %w", err)
}
// Parse global metadata if present
var globalQuantType, globalGroupSize string
if metaRaw, ok := header["__metadata__"]; ok {
var meta map[string]string
if json.Unmarshal(metaRaw, &meta) == nil {
globalQuantType = meta["quant_type"]
globalGroupSize = meta["group_size"]
}
}
// Build a set of all keys for checking .scale/.bias presence
headerKeys := make(map[string]bool, len(header))
for k := range header {
headerKeys[k] = true
}
// Collect all main tensor entries (sorted for deterministic output)
var mainNames []string
for name := range header {
if name == "__metadata__" || strings.HasSuffix(name, ".scale") || strings.HasSuffix(name, ".bias") {
continue
}
mainNames = append(mainNames, name)
}
sort.Strings(mainNames)
var results []safetensorsTensorInfo
for _, name := range mainNames {
var info safetensorsTensorInfo
if err := json.Unmarshal(header[name], &info); err != nil {
return nil, fmt.Errorf("failed to parse tensor info for %s: %w", name, err)
}
info.Name = name
if globalQuantType != "" {
// Use global metadata
info.QuantType = globalQuantType
info.GroupSize = globalGroupSize
} else if headerKeys[name+".scale"] {
// No global metadata, but has .scale - infer quant type from shape
info.QuantType = inferQuantType(header, name)
}
results = append(results, info)
}
if len(results) == 0 {
return nil, fmt.Errorf("no tensor found in header")
}
return results, nil
}
// inferQuantType infers the quantization type for a tensor from its shape and scale shape.
// Returns "int4", "int8", etc. or "" if not quantized.
func inferQuantType(header map[string]json.RawMessage, name string) string {
// Parse the main tensor shape
var mainInfo struct {
Shape []int64 `json:"shape"`
}
if json.Unmarshal(header[name], &mainInfo) != nil || len(mainInfo.Shape) < 2 {
return ""
}
// Parse scale shape to determine group size
scaleRaw, ok := header[name+".scale"]
if !ok {
return ""
}
var scaleInfo struct {
Shape []int64 `json:"shape"`
}
if json.Unmarshal(scaleRaw, &scaleInfo) != nil || len(scaleInfo.Shape) < 2 {
return ""
}
// Calculate group size: main_cols * pack_factor / scale_cols
// Main dtype is U32, so we need to figure out the pack factor
// For int4: pack=8, group=32. scale_cols = original_cols / 32 = main_cols * 8 / 32 = main_cols / 4
// For int8: pack=4, group=64. scale_cols = original_cols / 64 = main_cols * 4 / 64 = main_cols / 16
mainCols := mainInfo.Shape[len(mainInfo.Shape)-1]
scaleCols := scaleInfo.Shape[len(scaleInfo.Shape)-1]
if scaleCols == 0 {
return ""
}
ratio := mainCols / scaleCols // main_packed_cols / scale_cols
// int4: ratio = (orig/8) / (orig/32) = 32/8 = 4
// int8: ratio = (orig/4) / (orig/64) = 64/4 = 16
switch ratio {
case 4:
return "int4"
case 16:
return "int8"
default:
return ""
}
}

View File

@@ -36,7 +36,7 @@ func TestBuildModelInfo(t *testing.T) {
VocabSize: 262144,
TorchDtype: "bfloat16",
},
totalTensorBytes: 8_600_000_088, // ~4.3B params * 2 bytes + 88 bytes header
totalTensorBytes: 8_600_000_150, // ~4.3B params * 2 bytes + 150 bytes header
tensorCount: 1,
wantArch: "gemma3",
wantContextLen: 131072,
@@ -57,7 +57,7 @@ func TestBuildModelInfo(t *testing.T) {
VocabSize: 32000,
TorchDtype: "float16",
},
totalTensorBytes: 14_000_000_088, // ~7B params * 2 bytes + 88 bytes header
totalTensorBytes: 14_000_000_150, // ~7B params * 2 bytes + 150 bytes header
tensorCount: 1,
wantArch: "llama",
wantContextLen: 4096,
@@ -84,7 +84,7 @@ func TestBuildModelInfo(t *testing.T) {
VocabSize: 262144,
TorchDtype: "bfloat16",
},
totalTensorBytes: 8_600_000_088,
totalTensorBytes: 8_600_000_150,
tensorCount: 1,
wantArch: "gemma3",
wantContextLen: 131072,
@@ -101,7 +101,7 @@ func TestBuildModelInfo(t *testing.T) {
MaxPositionEmbeddings: 2048,
TorchDtype: "float32",
},
totalTensorBytes: 400_000_088, // 100M params * 4 bytes + 88 bytes header
totalTensorBytes: 400_000_150, // 100M params * 4 bytes + 150 bytes header
tensorCount: 1,
wantArch: "test",
wantContextLen: 2048,
@@ -118,7 +118,7 @@ func TestBuildModelInfo(t *testing.T) {
MaxPositionEmbeddings: 1024,
TorchDtype: "bfloat16",
},
totalTensorBytes: 2_000_880, // 1M params * 2 bytes + 10 tensors * 88 bytes
totalTensorBytes: 2_001_500, // 1M params * 2 bytes + 10 tensors * 150 bytes
tensorCount: 10,
wantArch: "test",
wantContextLen: 1024,
@@ -230,42 +230,42 @@ func TestBuildModelInfo_BytesPerParam(t *testing.T) {
{
name: "bfloat16",
dtype: "bfloat16",
totalBytes: 2_000_088, // 1M * 2 + 88
totalBytes: 2_000_150, // 1M * 2 + 150
tensorCount: 1,
wantParamCount: 1_000_000,
},
{
name: "float16",
dtype: "float16",
totalBytes: 2_000_088,
totalBytes: 2_000_150,
tensorCount: 1,
wantParamCount: 1_000_000,
},
{
name: "float32",
dtype: "float32",
totalBytes: 4_000_088, // 1M * 4 + 88
totalBytes: 4_000_150, // 1M * 4 + 150
tensorCount: 1,
wantParamCount: 1_000_000,
},
{
name: "int8",
dtype: "int8",
totalBytes: 1_000_088, // 1M * 1 + 88
totalBytes: 1_000_150, // 1M * 1 + 150
tensorCount: 1,
wantParamCount: 1_000_000,
},
{
name: "unknown dtype defaults to 2 bytes",
dtype: "unknown",
totalBytes: 2_000_088,
totalBytes: 2_000_150,
tensorCount: 1,
wantParamCount: 1_000_000,
},
{
name: "empty dtype defaults to 2 bytes",
dtype: "",
totalBytes: 2_000_088,
totalBytes: 2_000_150,
tensorCount: 1,
wantParamCount: 1_000_000,
},
@@ -288,11 +288,13 @@ func TestBuildModelInfo_BytesPerParam(t *testing.T) {
func TestParseSafetensorsHeader(t *testing.T) {
tests := []struct {
name string
header map[string]any
wantDtype string
wantShape []int64
wantErr bool
name string
header map[string]any
wantDtype string
wantShape []int64
wantQuantType string
wantGroupSize string
wantErr bool
}{
{
name: "simple tensor",
@@ -307,7 +309,70 @@ func TestParseSafetensorsHeader(t *testing.T) {
wantShape: []int64{2560, 262144},
},
{
name: "with metadata",
name: "tensor keyed by name",
header: map[string]any{
"model.layers.0.weight": map[string]any{
"dtype": "BF16",
"shape": []int64{2560, 2560},
"data_offsets": []int64{0, 13107200},
},
},
wantDtype: "BF16",
wantShape: []int64{2560, 2560},
},
{
name: "with int4 quant metadata",
header: map[string]any{
"__metadata__": map[string]any{
"quant_type": "int4",
"group_size": "32",
},
"model.layers.0.mlp.up_proj.weight": map[string]any{
"dtype": "U32",
"shape": []int64{2560, 320},
"data_offsets": []int64{0, 3276800},
},
"model.layers.0.mlp.up_proj.weight.scale": map[string]any{
"dtype": "BF16",
"shape": []int64{2560, 80},
"data_offsets": []int64{3276800, 3686400},
},
"model.layers.0.mlp.up_proj.weight.bias": map[string]any{
"dtype": "BF16",
"shape": []int64{2560, 80},
"data_offsets": []int64{3686400, 4096000},
},
},
wantDtype: "U32",
wantShape: []int64{2560, 320},
wantQuantType: "int4",
wantGroupSize: "32",
},
{
name: "int8 quant metadata",
header: map[string]any{
"__metadata__": map[string]any{
"quant_type": "int8",
"group_size": "64",
},
"model.layers.0.mlp.down_proj.weight": map[string]any{
"dtype": "U32",
"shape": []int64{2560, 640},
"data_offsets": []int64{0, 6553600},
},
"model.layers.0.mlp.down_proj.weight.scale": map[string]any{
"dtype": "BF16",
"shape": []int64{2560, 40},
"data_offsets": []int64{6553600, 6963200},
},
},
wantDtype: "U32",
wantShape: []int64{2560, 640},
wantQuantType: "int8",
wantGroupSize: "64",
},
{
name: "with old-style format metadata",
header: map[string]any{
"__metadata__": map[string]any{
"format": "pt",
@@ -371,6 +436,13 @@ func TestParseSafetensorsHeader(t *testing.T) {
}
}
}
if info.QuantType != tt.wantQuantType {
t.Errorf("QuantType = %v, want %v", info.QuantType, tt.wantQuantType)
}
if info.GroupSize != tt.wantGroupSize {
t.Errorf("GroupSize = %v, want %v", info.GroupSize, tt.wantGroupSize)
}
})
}
}
@@ -460,7 +532,7 @@ func TestGetTensorInfoFromManifest(t *testing.T) {
t.Fatalf("failed to create blobs dir: %v", err)
}
// Create test tensor blobs
// Create test tensor blobs with __metadata__
tensors := []struct {
name string
digest string
@@ -487,10 +559,9 @@ func TestGetTensorInfoFromManifest(t *testing.T) {
},
}
// Create blob files
// Create blob files with tensor keyed by name
var layers []manifest.Layer
for _, tensor := range tensors {
// Create safetensors blob
header := map[string]any{
tensor.name: map[string]any{
"dtype": tensor.dtype,
@@ -561,6 +632,391 @@ func TestGetTensorInfoFromManifest(t *testing.T) {
}
}
func TestGetTensorInfoFromManifest_Quantized(t *testing.T) {
// Create a temp directory for blobs and set OLLAMA_MODELS
tempDir := t.TempDir()
t.Setenv("OLLAMA_MODELS", tempDir)
blobDir := filepath.Join(tempDir, "blobs")
if err := os.MkdirAll(blobDir, 0o755); err != nil {
t.Fatalf("failed to create blobs dir: %v", err)
}
// Create a combined quantized blob with __metadata__
header := map[string]any{
"__metadata__": map[string]string{
"quant_type": "int4",
"group_size": "32",
},
"model.layers.0.mlp.up_proj.weight": map[string]any{
"dtype": "U32",
"shape": []int64{2560, 320}, // packed: 2560 / 8 = 320
"data_offsets": []int64{0, 3276800},
},
"model.layers.0.mlp.up_proj.weight.scale": map[string]any{
"dtype": "BF16",
"shape": []int64{2560, 80}, // 2560 / 32 = 80
"data_offsets": []int64{3276800, 3686400},
},
"model.layers.0.mlp.up_proj.weight.bias": map[string]any{
"dtype": "BF16",
"shape": []int64{2560, 80},
"data_offsets": []int64{3686400, 4096000},
},
}
headerJSON, _ := json.Marshal(header)
var buf bytes.Buffer
binary.Write(&buf, binary.LittleEndian, uint64(len(headerJSON)))
buf.Write(headerJSON)
digest := "sha256:aabb11aabb11aabb11aabb11aabb11aabb11aabb11aabb11aabb11aabb11aabb"
blobPath, err := manifest.BlobsPath(digest)
if err != nil {
t.Fatalf("failed to get blob path: %v", err)
}
if err := os.WriteFile(blobPath, buf.Bytes(), 0o644); err != nil {
t.Fatalf("failed to write blob: %v", err)
}
mf := &manifest.Manifest{
SchemaVersion: 2,
MediaType: "application/vnd.docker.distribution.manifest.v2+json",
Layers: []manifest.Layer{
{
MediaType: manifest.MediaTypeImageTensor,
Digest: digest,
Size: int64(buf.Len() + 4096000),
Name: "model.layers.0.mlp.up_proj.weight",
},
},
}
result, err := getTensorInfoFromManifest(mf)
if err != nil {
t.Fatalf("getTensorInfoFromManifest() error = %v", err)
}
if len(result) != 1 {
t.Fatalf("got %d tensors, want 1", len(result))
}
tensor := result[0]
if tensor.Name != "model.layers.0.mlp.up_proj.weight" {
t.Errorf("Name = %v, want model.layers.0.mlp.up_proj.weight", tensor.Name)
}
if tensor.Type != "INT4" {
t.Errorf("Type = %v, want INT4", tensor.Type)
}
// Shape should be unpacked: 320 * 8 = 2560
if len(tensor.Shape) != 2 || tensor.Shape[0] != 2560 || tensor.Shape[1] != 2560 {
t.Errorf("Shape = %v, want [2560, 2560]", tensor.Shape)
}
}
func TestParseSafetensorsAllHeaders(t *testing.T) {
tests := []struct {
name string
header map[string]any
wantCount int
wantNames []string
wantDtypes []string
wantQuants []string
wantErr bool
}{
{
name: "single tensor blob",
header: map[string]any{
"model.layers.0.weight": map[string]any{
"dtype": "BF16",
"shape": []int64{2560, 2560},
"data_offsets": []int64{0, 13107200},
},
},
wantCount: 1,
wantNames: []string{"model.layers.0.weight"},
wantDtypes: []string{"BF16"},
wantQuants: []string{""},
},
{
name: "packed unquantized blob",
header: map[string]any{
"model.layers.0.mlp.experts.0.down_proj.weight": map[string]any{
"dtype": "BF16",
"shape": []int64{2560, 10240},
"data_offsets": []int64{0, 52428800},
},
"model.layers.0.mlp.experts.0.gate_proj.weight": map[string]any{
"dtype": "BF16",
"shape": []int64{10240, 2560},
"data_offsets": []int64{52428800, 104857600},
},
"model.layers.0.mlp.experts.0.up_proj.weight": map[string]any{
"dtype": "BF16",
"shape": []int64{10240, 2560},
"data_offsets": []int64{104857600, 157286400},
},
},
wantCount: 3,
wantNames: []string{
"model.layers.0.mlp.experts.0.down_proj.weight",
"model.layers.0.mlp.experts.0.gate_proj.weight",
"model.layers.0.mlp.experts.0.up_proj.weight",
},
wantDtypes: []string{"BF16", "BF16", "BF16"},
wantQuants: []string{"", "", ""},
},
{
name: "packed quantized blob with global metadata",
header: map[string]any{
"__metadata__": map[string]any{
"quant_type": "int4",
"group_size": "32",
},
"model.layers.0.mlp.experts.0.gate_proj.weight": map[string]any{
"dtype": "U32",
"shape": []int64{10240, 320},
"data_offsets": []int64{0, 13107200},
},
"model.layers.0.mlp.experts.0.gate_proj.weight.scale": map[string]any{
"dtype": "BF16",
"shape": []int64{10240, 80},
"data_offsets": []int64{13107200, 14745600},
},
"model.layers.0.mlp.experts.0.gate_proj.weight.bias": map[string]any{
"dtype": "BF16",
"shape": []int64{10240, 80},
"data_offsets": []int64{14745600, 16384000},
},
"model.layers.0.mlp.experts.0.up_proj.weight": map[string]any{
"dtype": "U32",
"shape": []int64{10240, 320},
"data_offsets": []int64{16384000, 29491200},
},
"model.layers.0.mlp.experts.0.up_proj.weight.scale": map[string]any{
"dtype": "BF16",
"shape": []int64{10240, 80},
"data_offsets": []int64{29491200, 31129600},
},
"model.layers.0.mlp.experts.0.up_proj.weight.bias": map[string]any{
"dtype": "BF16",
"shape": []int64{10240, 80},
"data_offsets": []int64{31129600, 32768000},
},
},
wantCount: 2,
wantNames: []string{
"model.layers.0.mlp.experts.0.gate_proj.weight",
"model.layers.0.mlp.experts.0.up_proj.weight",
},
wantDtypes: []string{"U32", "U32"},
wantQuants: []string{"int4", "int4"},
},
{
name: "packed mixed-precision blob (no global metadata)",
header: map[string]any{
"model.layers.0.mlp.experts.0.gate_proj.weight": map[string]any{
"dtype": "U32",
"shape": []int64{10240, 320},
"data_offsets": []int64{0, 13107200},
},
"model.layers.0.mlp.experts.0.gate_proj.weight.scale": map[string]any{
"dtype": "BF16",
"shape": []int64{10240, 80},
"data_offsets": []int64{13107200, 14745600},
},
"model.layers.0.mlp.experts.0.gate_proj.weight.bias": map[string]any{
"dtype": "BF16",
"shape": []int64{10240, 80},
"data_offsets": []int64{14745600, 16384000},
},
"model.layers.0.mlp.experts.0.down_proj.weight": map[string]any{
"dtype": "U32",
"shape": []int64{2560, 2560},
"data_offsets": []int64{16384000, 42598400},
},
"model.layers.0.mlp.experts.0.down_proj.weight.scale": map[string]any{
"dtype": "BF16",
"shape": []int64{2560, 160},
"data_offsets": []int64{42598400, 43417600},
},
},
wantCount: 2,
wantNames: []string{
"model.layers.0.mlp.experts.0.down_proj.weight",
"model.layers.0.mlp.experts.0.gate_proj.weight",
},
wantDtypes: []string{"U32", "U32"},
wantQuants: []string{"int8", "int4"},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
headerJSON, err := json.Marshal(tt.header)
if err != nil {
t.Fatalf("failed to marshal header: %v", err)
}
var buf bytes.Buffer
if err := binary.Write(&buf, binary.LittleEndian, uint64(len(headerJSON))); err != nil {
t.Fatalf("failed to write header size: %v", err)
}
buf.Write(headerJSON)
results, err := parseSafetensorsAllHeaders(&buf)
if (err != nil) != tt.wantErr {
t.Errorf("parseSafetensorsAllHeaders() error = %v, wantErr %v", err, tt.wantErr)
return
}
if tt.wantErr {
return
}
if len(results) != tt.wantCount {
t.Fatalf("got %d tensors, want %d", len(results), tt.wantCount)
}
for i, info := range results {
if info.Name != tt.wantNames[i] {
t.Errorf("tensor[%d].Name = %v, want %v", i, info.Name, tt.wantNames[i])
}
if info.Dtype != tt.wantDtypes[i] {
t.Errorf("tensor[%d].Dtype = %v, want %v", i, info.Dtype, tt.wantDtypes[i])
}
if info.QuantType != tt.wantQuants[i] {
t.Errorf("tensor[%d].QuantType = %v, want %v", i, info.QuantType, tt.wantQuants[i])
}
}
})
}
}
func TestGetTensorInfoFromManifest_Packed(t *testing.T) {
// Create a temp directory for blobs and set OLLAMA_MODELS
tempDir := t.TempDir()
t.Setenv("OLLAMA_MODELS", tempDir)
blobDir := filepath.Join(tempDir, "blobs")
if err := os.MkdirAll(blobDir, 0o755); err != nil {
t.Fatalf("failed to create blobs dir: %v", err)
}
// Create a packed blob with multiple expert tensors (mixed quantization)
header := map[string]any{
"model.layers.0.mlp.experts.0.gate_proj.weight": map[string]any{
"dtype": "U32",
"shape": []int64{10240, 320},
"data_offsets": []int64{0, 13107200},
},
"model.layers.0.mlp.experts.0.gate_proj.weight.scale": map[string]any{
"dtype": "BF16",
"shape": []int64{10240, 80},
"data_offsets": []int64{13107200, 14745600},
},
"model.layers.0.mlp.experts.0.gate_proj.weight.bias": map[string]any{
"dtype": "BF16",
"shape": []int64{10240, 80},
"data_offsets": []int64{14745600, 16384000},
},
"model.layers.0.mlp.experts.0.down_proj.weight": map[string]any{
"dtype": "U32",
"shape": []int64{2560, 2560},
"data_offsets": []int64{16384000, 42598400},
},
"model.layers.0.mlp.experts.0.down_proj.weight.scale": map[string]any{
"dtype": "BF16",
"shape": []int64{2560, 160},
"data_offsets": []int64{42598400, 43417600},
},
}
headerJSON, _ := json.Marshal(header)
var buf bytes.Buffer
binary.Write(&buf, binary.LittleEndian, uint64(len(headerJSON)))
buf.Write(headerJSON)
packedDigest := "sha256:aaaa000000000000000000000000000000000000000000000000000000000001"
blobPath, err := manifest.BlobsPath(packedDigest)
if err != nil {
t.Fatalf("failed to get blob path: %v", err)
}
if err := os.WriteFile(blobPath, buf.Bytes(), 0o644); err != nil {
t.Fatalf("failed to write packed blob: %v", err)
}
// Also create a regular (single-tensor) blob
singleHeader := map[string]any{
"model.embed_tokens.weight": map[string]any{
"dtype": "BF16",
"shape": []int64{262144, 2560},
"data_offsets": []int64{0, 1342177280},
},
}
singleHeaderJSON, _ := json.Marshal(singleHeader)
var singleBuf bytes.Buffer
binary.Write(&singleBuf, binary.LittleEndian, uint64(len(singleHeaderJSON)))
singleBuf.Write(singleHeaderJSON)
singleDigest := "sha256:bbbb000000000000000000000000000000000000000000000000000000000002"
singleBlobPath, err := manifest.BlobsPath(singleDigest)
if err != nil {
t.Fatalf("failed to get blob path: %v", err)
}
if err := os.WriteFile(singleBlobPath, singleBuf.Bytes(), 0o644); err != nil {
t.Fatalf("failed to write single blob: %v", err)
}
mf := &manifest.Manifest{
SchemaVersion: 2,
MediaType: "application/vnd.docker.distribution.manifest.v2+json",
Layers: []manifest.Layer{
{
MediaType: manifest.MediaTypeImageTensor,
Digest: singleDigest,
Size: int64(singleBuf.Len()),
Name: "model.embed_tokens.weight",
},
{
MediaType: manifest.MediaTypeImageTensor,
Digest: packedDigest,
Size: int64(buf.Len()),
Name: "model.layers.0.mlp.experts", // group prefix
},
},
}
result, err := getTensorInfoFromManifest(mf)
if err != nil {
t.Fatalf("getTensorInfoFromManifest() error = %v", err)
}
// Should have 3 tensors: 1 single + 2 packed main tensors
if len(result) != 3 {
t.Fatalf("got %d tensors, want 3. Tensors: %v", len(result), result)
}
// First tensor should be the single blob
if result[0].Name != "model.embed_tokens.weight" {
t.Errorf("tensor[0].Name = %v, want model.embed_tokens.weight", result[0].Name)
}
if result[0].Type != "BF16" {
t.Errorf("tensor[0].Type = %v, want BF16", result[0].Type)
}
// Packed tensors should have their actual names (sorted)
packedNames := make(map[string]bool)
for _, r := range result[1:] {
packedNames[r.Name] = true
}
if !packedNames["model.layers.0.mlp.experts.0.down_proj.weight"] {
t.Error("missing packed tensor: model.layers.0.mlp.experts.0.down_proj.weight")
}
if !packedNames["model.layers.0.mlp.experts.0.gate_proj.weight"] {
t.Error("missing packed tensor: model.layers.0.mlp.experts.0.gate_proj.weight")
}
}
func TestReadSafetensorsHeader(t *testing.T) {
// Create a temp file with a valid safetensors header
tempDir := t.TempDir()