Files
ollama/server/quantization_test.go
Patrick Devine 0e93ccc2cd convert: fixes for qwen3next model conversion (#16354)
This change addresses some problems with GGUF conversion including:
 * correctly naming the MoE tensors
 * correctly quantizing the nextn.eh_proj.weight MTP tensor
2026-06-01 09:43:11 -07:00

261 lines
7.4 KiB
Go

package server
import (
"bytes"
"io"
"os"
"path/filepath"
"slices"
"testing"
fsggml "github.com/ollama/ollama/fs/ggml"
)
// TestLlamaQuantizeArgs checks the argument list that llamaQuantizeArgs builds
// for a set of architecture / file type combinations. Each case names the
// behaviour it pins: which "--tensor-type" overrides are expected (if any) and
// which type name is expected as the final argument.
func TestLlamaQuantizeArgs(t *testing.T) {
	const (
		inPath  = "in.gguf"
		outPath = "out.gguf"

		// Overrides that are expected by more than one case.
		mtpProjectionQ8 = `^blk\.[0-9]+\.nextn\.eh_proj\.weight$=q8_0`
		tokenEmbdF16    = `^token_embd\.weight$=f16`
		outputF16       = `^output\.weight$=f16`
	)

	// expect assembles a wanted argument list: the requantize flag, one
	// "--tensor-type" pair per override (in the given order), then the input
	// path, the output path, and the quantization type name.
	expect := func(typeName string, overrides ...string) []string {
		args := []string{"--allow-requantize"}
		for _, override := range overrides {
			args = append(args, "--tensor-type", override)
		}
		return append(args, inPath, outPath, typeName)
	}

	cases := []struct {
		name     string
		arch     string
		fileType fsggml.FileType
		typeName string // optional: replaces fileType.String() as the type name argument
		want     []string
	}{
		{
			name:     "default",
			arch:     "llama",
			fileType: fsggml.FileTypeQ4_K_M,
			want:     expect("Q4_K_M"),
		},
		{
			name:     "qwen35moe k quant keeps mtp projection q8",
			arch:     "qwen35moe",
			fileType: fsggml.FileTypeQ4_K_M,
			want:     expect("Q4_K_M", mtpProjectionQ8),
		},
		{
			name:     "qwen35 k quant keeps mtp projection q8",
			arch:     "qwen35",
			fileType: fsggml.FileTypeQ4_K_S,
			want:     expect("Q4_K_S", mtpProjectionQ8),
		},
		{
			name:     "qwen35moe f16 keeps mtp projection unquantized",
			arch:     "qwen35moe",
			fileType: fsggml.FileTypeF16,
			want:     expect("F16"),
		},
		{
			name:     "qwen35moe bf16 keeps mtp projection unquantized",
			arch:     "qwen35moe",
			fileType: fsggml.FileTypeBF16,
			want:     expect("BF16"),
		},
		{
			name:     "qwen35moe q8 already satisfies mtp projection floor",
			arch:     "qwen35moe",
			fileType: fsggml.FileTypeQ8_0,
			want:     expect("Q8_0"),
		},
		{
			name:     "gemma3n k quant keeps per layer token embedding f16",
			arch:     "gemma3n",
			fileType: fsggml.FileTypeQ4_K_M,
			want:     expect("Q4_K_M", `^per_layer_token_embd\.weight$=f16`),
		},
		{
			name:     "deepseek2 k quant keeps mla tensors q8",
			arch:     "deepseek2",
			fileType: fsggml.FileTypeQ4_K_M,
			want: expect("Q4_K_M",
				`attn_k_b\.weight$=q8_0`,
				`attn_q_a\.weight$=q8_0`,
				`attn_q_b\.weight$=q8_0`,
				`attn_v_b\.weight$=q8_0`,
				`attn_kv_a_mqa\.weight$=q8_0`,
			),
		},
		{
			name:     "gemma3n q8 does not add k quant override",
			arch:     "gemma3n",
			fileType: fsggml.FileTypeQ8_0,
			want:     expect("Q8_0"),
		},
		{
			name:     "glmocr k quant keeps input and output embeddings f16",
			arch:     "glmocr",
			fileType: fsggml.FileTypeQ4_K_M,
			want:     expect("Q4_K_M", tokenEmbdF16, outputF16),
		},
		{
			name:     "glm4 k quant keeps glm-ocr split text embeddings f16",
			arch:     "glm4",
			fileType: fsggml.FileTypeQ4_K_M,
			want:     expect("Q4_K_M", tokenEmbdF16, outputF16),
		},
		{
			name:     "copy does not add quantization overrides",
			arch:     "gemma3n",
			fileType: fsggml.FileTypeQ4_K_M,
			typeName: "COPY",
			want:     expect("COPY"),
		},
		{
			name:     "qwen35moe copy does not add mtp projection override",
			arch:     "qwen35moe",
			fileType: fsggml.FileTypeQ4_K_M,
			typeName: "COPY",
			want:     expect("COPY"),
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			// An explicit typeName in the case wins over the file type's own name.
			label := tc.typeName
			if label == "" {
				label = tc.fileType.String()
			}

			got := llamaQuantizeArgs(tc.arch, tc.fileType, inPath, outPath, label)
			if slices.Equal(got, tc.want) {
				return
			}
			t.Fatalf("llamaQuantizeArgs = %#v, want %#v", got, tc.want)
		})
	}
}
func TestDisableLlamaCppCompat(t *testing.T) {
got := disableLlamaCppCompat([]string{"A=1", llamaCppCompatEnv + "=1", "B=2"})
want := []string{"A=1", "B=2", llamaCppCompatEnv + "=0"}
if !slices.Equal(got, want) {
t.Fatalf("disableLlamaCppCompat = %#v, want %#v", got, want)
}
}
// TestLlamaQuantizeEnv checks the environment returned by llamaQuantizeEnv for
// both values of its enableCompat flag, starting from an environment in which
// llamaCppCompatEnv is already present.
func TestLlamaQuantizeEnv(t *testing.T) {
	baseEnv := []string{"A=1", llamaCppCompatEnv + "=0", "B=2"}

	cases := []struct {
		name         string
		enableCompat bool
		want         []string
	}{
		{
			// With compat disabled the variable is expected last, set to "0".
			name:         "clean GGUF validation disables compat",
			enableCompat: false,
			want:         []string{"A=1", "B=2", llamaCppCompatEnv + "=0"},
		},
		{
			// With compat enabled the variable is expected to be absent.
			name:         "embedded compatibility quantization allows compat",
			enableCompat: true,
			want:         []string{"A=1", "B=2"},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := llamaQuantizeEnv(baseEnv, tc.enableCompat)
			if slices.Equal(got, tc.want) {
				return
			}
			t.Fatalf("llamaQuantizeEnv = %#v, want %#v", got, tc.want)
		})
	}
}
// TestRestoreEmbeddedCompatibilityTensorsReplacesExistingCopies writes two
// small GGUF files that contain the same tensor names, runs
// restoreEmbeddedCompatibilityTensors from the "orig" file into the "out"
// file, and then decodes the "out" file again to check that:
//
//   - blk.0.weight is still F32, and
//   - v.pos_embed.weight has the kind (F16) and shape ([2]) it had in the
//     original file rather than the F32 / [1] copy written to the output.
func TestRestoreEmbeddedCompatibilityTensorsReplacesExistingCopies(t *testing.T) {
	dir := t.TempDir()

	origFile, orig := writeQuantizationTestGGUF(t, dir, "orig.gguf", fsggml.KV{
		"general.architecture": "qwen35",
	}, []*fsggml.Tensor{
		testTensor("blk.0.weight", fsggml.TensorTypeF32, []uint64{1}, []byte{1, 2, 3, 4}),
		testTensor("v.pos_embed.weight", fsggml.TensorTypeF16, []uint64{2}, []byte{5, 6, 7, 8}),
	})
	defer origFile.Close()

	// The output file starts with its own, differently typed and shaped copy
	// of v.pos_embed.weight.
	outFile, _ := writeQuantizationTestGGUF(t, dir, "out.gguf", fsggml.KV{
		"general.architecture": "qwen35",
		"general.file_type":    fsggml.FileTypeQ4_K_M,
	}, []*fsggml.Tensor{
		testTensor("blk.0.weight", fsggml.TensorTypeF32, []uint64{1}, []byte{9, 10, 11, 12}),
		testTensor("v.pos_embed.weight", fsggml.TensorTypeF32, []uint64{1}, []byte{13, 14, 15, 16}),
	})
	defer outFile.Close()

	if err := restoreEmbeddedCompatibilityTensors(origFile, outFile, orig, fsggml.FileTypeQ4_K_M); err != nil {
		t.Fatal(err)
	}

	// Rewind before decoding the rewritten output from the start.
	if _, err := outFile.Seek(0, io.SeekStart); err != nil {
		t.Fatal(err)
	}
	decoded, err := fsggml.Decode(outFile, -1)
	if err != nil {
		t.Fatal(err)
	}

	tensors := map[string]*fsggml.Tensor{}
	for _, tensor := range decoded.Tensors().Items() {
		tensors[tensor.Name] = tensor
	}

	// Indexing the map directly yields a nil pointer for an absent tensor and
	// the field access would then panic; fail with a readable message instead.
	lookup := func(name string) *fsggml.Tensor {
		t.Helper()
		tensor := tensors[name]
		if tensor == nil {
			t.Fatalf("tensor %q missing from restored output", name)
		}
		return tensor
	}

	blk := lookup("blk.0.weight")
	if got := blk.Kind; got != uint32(fsggml.TensorTypeF32) {
		t.Fatalf("blk.0.weight kind = %v, want F32", got)
	}

	posEmbed := lookup("v.pos_embed.weight")
	if got := posEmbed.Kind; got != uint32(fsggml.TensorTypeF16) {
		t.Fatalf("v.pos_embed.weight kind = %v, want F16 from original", got)
	}
	if got := posEmbed.Shape; !slices.Equal(got, []uint64{2}) {
		t.Fatalf("v.pos_embed.weight shape = %v, want original shape [2]", got)
	}
}
// writeQuantizationTestGGUF creates dir/name, writes a GGUF file with the
// given key/values and tensors into it, and decodes the result.
//
// It returns the still-open file, positioned at the start, together with the
// decoded GGML. On success the caller owns the file and must close it. Any
// error fails the test immediately via t.Fatal, in which case the file is
// closed here.
func writeQuantizationTestGGUF(t *testing.T, dir, name string, kv fsggml.KV, tensors []*fsggml.Tensor) (*os.File, *fsggml.GGML) {
	t.Helper()

	file, err := os.Create(filepath.Join(dir, name))
	if err != nil {
		t.Fatal(err)
	}

	// t.Fatal stops the goroutine with runtime.Goexit, which still runs
	// deferred calls. Close the handle on those failure paths so it does not
	// leak (an open handle can also prevent the temp dir from being removed
	// on platforms that refuse to delete open files).
	handedOff := false
	defer func() {
		if !handedOff {
			_ = file.Close() // best effort: the test is already failing
		}
	}()

	if err := fsggml.WriteGGUF(file, kv, tensors); err != nil {
		t.Fatal(err)
	}

	// Rewind so Decode reads what was just written.
	if _, err := file.Seek(0, io.SeekStart); err != nil {
		t.Fatal(err)
	}
	ggml, err := fsggml.Decode(file, -1)
	if err != nil {
		t.Fatal(err)
	}

	// Rewind again so the caller receives the file positioned at the start.
	if _, err := file.Seek(0, io.SeekStart); err != nil {
		t.Fatal(err)
	}

	handedOff = true
	return file, ggml
}
// testTensor builds an fsggml.Tensor with the given name, kind and shape whose
// WriterTo is a bytes.Reader over data.
func testTensor(name string, kind fsggml.TensorType, shape []uint64, data []byte) *fsggml.Tensor {
	tensor := new(fsggml.Tensor)
	tensor.Name = name
	tensor.Kind = uint32(kind)
	tensor.Shape = shape
	tensor.WriterTo = bytes.NewReader(data)
	return tensor
}