// ollama/server/quantization_test.go

package server

import (
	"bytes"
	"io"
	"os"
	"path/filepath"
	"slices"
	"testing"

	fsggml "github.com/ollama/ollama/fs/ggml"
)

// TestLlamaQuantizeArgs checks the argument list llamaQuantizeArgs builds for
// a range of architectures and target file types. Every case passes "in.gguf"
// and "out.gguf" as the paths and, unless typeName is set on the case, the
// string form of fileType as the type name.
func TestLlamaQuantizeArgs(t *testing.T) {
	// Override patterns that appear in the expected argument lists.
	const (
		mtpProjQ8      = `^blk\.[0-9]+\.nextn\.eh_proj\.weight$=q8_0`
		perLayerEmbF16 = `^per_layer_token_embd\.weight$=f16`
		tokenEmbdF16   = `^token_embd\.weight$=f16`
		outputF16      = `^output\.weight$=f16`
	)

	// expect assembles an expected argument list: the requantize flag, one
	// "--tensor-type" pair per override (in the given order), then the input
	// path, the output path and the type name.
	expect := func(typeName string, overrides ...string) []string {
		args := []string{"--allow-requantize"}
		for _, override := range overrides {
			args = append(args, "--tensor-type", override)
		}
		return append(args, "in.gguf", "out.gguf", typeName)
	}

	cases := []struct {
		name     string
		arch     string
		fileType fsggml.FileType
		typeName string // optional; replaces fileType.String() when non-empty
		want     []string
	}{
		{
			name:     "default",
			arch:     "llama",
			fileType: fsggml.FileTypeQ4_K_M,
			want:     expect("Q4_K_M"),
		},
		{
			name:     "qwen35moe k quant keeps mtp projection q8",
			arch:     "qwen35moe",
			fileType: fsggml.FileTypeQ4_K_M,
			want:     expect("Q4_K_M", mtpProjQ8),
		},
		{
			name:     "qwen35 k quant keeps mtp projection q8",
			arch:     "qwen35",
			fileType: fsggml.FileTypeQ4_K_S,
			want:     expect("Q4_K_S", mtpProjQ8),
		},
		{
			name:     "qwen35moe f16 keeps mtp projection unquantized",
			arch:     "qwen35moe",
			fileType: fsggml.FileTypeF16,
			want:     expect("F16"),
		},
		{
			name:     "qwen35moe bf16 keeps mtp projection unquantized",
			arch:     "qwen35moe",
			fileType: fsggml.FileTypeBF16,
			want:     expect("BF16"),
		},
		{
			name:     "qwen35moe q8 already satisfies mtp projection floor",
			arch:     "qwen35moe",
			fileType: fsggml.FileTypeQ8_0,
			want:     expect("Q8_0"),
		},
		{
			name:     "gemma3n k quant keeps per layer token embedding f16",
			arch:     "gemma3n",
			fileType: fsggml.FileTypeQ4_K_M,
			want:     expect("Q4_K_M", perLayerEmbF16),
		},
		{
			name:     "deepseek2 k quant keeps mla tensors q8",
			arch:     "deepseek2",
			fileType: fsggml.FileTypeQ4_K_M,
			want: expect("Q4_K_M",
				`attn_k_b\.weight$=q8_0`,
				`attn_q_a\.weight$=q8_0`,
				`attn_q_b\.weight$=q8_0`,
				`attn_v_b\.weight$=q8_0`,
				`attn_kv_a_mqa\.weight$=q8_0`,
			),
		},
		{
			name:     "gemma3n q8 does not add k quant override",
			arch:     "gemma3n",
			fileType: fsggml.FileTypeQ8_0,
			want:     expect("Q8_0"),
		},
		{
			name:     "glmocr k quant keeps input and output embeddings f16",
			arch:     "glmocr",
			fileType: fsggml.FileTypeQ4_K_M,
			want:     expect("Q4_K_M", tokenEmbdF16, outputF16),
		},
		{
			name:     "glm4 k quant keeps glm-ocr split text embeddings f16",
			arch:     "glm4",
			fileType: fsggml.FileTypeQ4_K_M,
			want:     expect("Q4_K_M", tokenEmbdF16, outputF16),
		},
		{
			name:     "copy does not add quantization overrides",
			arch:     "gemma3n",
			fileType: fsggml.FileTypeQ4_K_M,
			typeName: "COPY",
			want:     expect("COPY"),
		},
		{
			name:     "qwen35moe copy does not add mtp projection override",
			arch:     "qwen35moe",
			fileType: fsggml.FileTypeQ4_K_M,
			typeName: "COPY",
			want:     expect("COPY"),
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			// Fall back to the file type's own name when the case does not
			// supply an explicit type name.
			typeName := tc.typeName
			if typeName == "" {
				typeName = tc.fileType.String()
			}

			got := llamaQuantizeArgs(tc.arch, tc.fileType, "in.gguf", "out.gguf", typeName)
			if slices.Equal(got, tc.want) {
				return
			}
			t.Fatalf("llamaQuantizeArgs = %#v, want %#v", got, tc.want)
		})
	}
}

func TestDisableLlamaCppCompat(t *testing.T) {
	got := disableLlamaCppCompat([]string{"A=1", llamaCppCompatEnv + "=1", "B=2"})
	want := []string{"A=1", "B=2", llamaCppCompatEnv + "=0"}
	if !slices.Equal(got, want) {
		t.Fatalf("disableLlamaCppCompat = %#v, want %#v", got, want)
	}
}

// TestLlamaQuantizeEnv runs llamaQuantizeEnv over one base environment that
// already contains a llamaCppCompatEnv "=0" entry, once with enableCompat
// false and once with it true, and compares the returned environment with
// the expected list for each mode.
func TestLlamaQuantizeEnv(t *testing.T) {
	compatOff := llamaCppCompatEnv + "=0"

	// The same base slice is handed to every subtest.
	base := []string{"A=1", compatOff, "B=2"}

	cases := []struct {
		name         string
		enableCompat bool
		want         []string
	}{
		{
			name:         "clean GGUF validation disables compat",
			enableCompat: false,
			want:         []string{"A=1", "B=2", compatOff},
		},
		{
			name:         "embedded compatibility quantization allows compat",
			enableCompat: true,
			want:         []string{"A=1", "B=2"},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := llamaQuantizeEnv(base, tc.enableCompat)
			if slices.Equal(got, tc.want) {
				return
			}
			t.Fatalf("llamaQuantizeEnv = %#v, want %#v", got, tc.want)
		})
	}
}

// TestRestoreEmbeddedCompatibilityTensorsReplacesExistingCopies writes two
// small GGUF files that both contain "v.pos_embed.weight" with different
// kinds and shapes, calls restoreEmbeddedCompatibilityTensors with the
// original and the output file, and then decodes the output again. It
// expects "blk.0.weight" to still be F32 and "v.pos_embed.weight" to carry
// the original file's kind (F16) and shape ([2]).
func TestRestoreEmbeddedCompatibilityTensorsReplacesExistingCopies(t *testing.T) {
	dir := t.TempDir()

	origFile, orig := writeQuantizationTestGGUF(t, dir, "orig.gguf", fsggml.KV{
		"general.architecture": "qwen35",
	}, []*fsggml.Tensor{
		testTensor("blk.0.weight", fsggml.TensorTypeF32, []uint64{1}, []byte{1, 2, 3, 4}),
		testTensor("v.pos_embed.weight", fsggml.TensorTypeF16, []uint64{2}, []byte{5, 6, 7, 8}),
	})
	defer origFile.Close()

	// The output deliberately holds a different kind and shape for
	// v.pos_embed.weight so a restore from the original is observable.
	outFile, _ := writeQuantizationTestGGUF(t, dir, "out.gguf", fsggml.KV{
		"general.architecture": "qwen35",
		"general.file_type":    fsggml.FileTypeQ4_K_M,
	}, []*fsggml.Tensor{
		testTensor("blk.0.weight", fsggml.TensorTypeF32, []uint64{1}, []byte{9, 10, 11, 12}),
		testTensor("v.pos_embed.weight", fsggml.TensorTypeF32, []uint64{1}, []byte{13, 14, 15, 16}),
	})
	defer outFile.Close()

	if err := restoreEmbeddedCompatibilityTensors(origFile, outFile, orig, fsggml.FileTypeQ4_K_M); err != nil {
		t.Fatal(err)
	}

	// Rewind before decoding the output from its beginning.
	if _, err := outFile.Seek(0, io.SeekStart); err != nil {
		t.Fatal(err)
	}
	decoded, err := fsggml.Decode(outFile, -1)
	if err != nil {
		t.Fatal(err)
	}

	byName := map[string]*fsggml.Tensor{}
	for _, tensor := range decoded.Tensors().Items() {
		byName[tensor.Name] = tensor
	}

	// lookup fails the test with a readable message when a tensor is absent,
	// rather than letting the assertions below dereference a nil pointer.
	lookup := func(name string) *fsggml.Tensor {
		t.Helper()
		tensor, ok := byName[name]
		if !ok || tensor == nil {
			t.Fatalf("%s missing from decoded output", name)
		}
		return tensor
	}

	if kind := lookup("blk.0.weight").Kind; kind != uint32(fsggml.TensorTypeF32) {
		t.Fatalf("blk.0.weight kind = %v, want F32", kind)
	}

	posEmbed := lookup("v.pos_embed.weight")
	if posEmbed.Kind != uint32(fsggml.TensorTypeF16) {
		t.Fatalf("v.pos_embed.weight kind = %v, want F16 from original", posEmbed.Kind)
	}
	if !slices.Equal(posEmbed.Shape, []uint64{2}) {
		t.Fatalf("v.pos_embed.weight shape = %v, want original shape [2]", posEmbed.Shape)
	}
}

// writeQuantizationTestGGUF creates dir/name, writes kv and tensors to it
// with fsggml.WriteGGUF, decodes the result, and returns the open file
// rewound to the start together with the decoded GGML. Any error fails the
// test immediately.
//
// The file is also closed through t.Cleanup, so it does not stay open when
// one of the steps below aborts the test before the caller has received the
// handle. Callers may still close the file themselves.
func writeQuantizationTestGGUF(t *testing.T, dir, name string, kv fsggml.KV, tensors []*fsggml.Tensor) (*os.File, *fsggml.GGML) {
	t.Helper()

	file, err := os.Create(filepath.Join(dir, name))
	if err != nil {
		t.Fatal(err)
	}
	// The Close error is ignored on purpose: when the caller has already
	// closed the file, this second Close only reports that fact.
	t.Cleanup(func() { _ = file.Close() })

	if err := fsggml.WriteGGUF(file, kv, tensors); err != nil {
		t.Fatal(err)
	}

	// Rewind so Decode reads from the beginning of what was just written.
	if _, err := file.Seek(0, io.SeekStart); err != nil {
		t.Fatal(err)
	}
	ggml, err := fsggml.Decode(file, -1)
	if err != nil {
		t.Fatal(err)
	}

	// Rewind again so the caller receives the file positioned at offset 0.
	if _, err := file.Seek(0, io.SeekStart); err != nil {
		t.Fatal(err)
	}
	return file, ggml
}

// testTensor returns a new fsggml.Tensor with the given name, kind (stored
// as uint32) and shape, whose WriterTo is a bytes.Reader over data.
func testTensor(name string, kind fsggml.TensorType, shape []uint64, data []byte) *fsggml.Tensor {
	tensor := new(fsggml.Tensor)
	tensor.Name = name
	tensor.Kind = uint32(kind)
	tensor.Shape = shape
	tensor.WriterTo = bytes.NewReader(data)
	return tensor
}