mirror of
https://github.com/ollama/ollama.git
synced 2026-06-03 13:59:06 -04:00
This change addresses several problems with GGUF conversion, including:
* correctly naming the MoE tensors
* correctly quantizing the nextn.eh_proj.weight MTP tensor
261 lines
7.4 KiB
Go
261 lines
7.4 KiB
Go
package server
|
|
|
|
import (
|
|
"bytes"
|
|
"io"
|
|
"os"
|
|
"path/filepath"
|
|
"slices"
|
|
"testing"
|
|
|
|
fsggml "github.com/ollama/ollama/fs/ggml"
|
|
)
|
|
|
|
func TestLlamaQuantizeArgs(t *testing.T) {
|
|
tests := []struct {
|
|
name string
|
|
arch string
|
|
fileType fsggml.FileType
|
|
typeName string
|
|
want []string
|
|
}{
|
|
{
|
|
name: "default",
|
|
arch: "llama",
|
|
fileType: fsggml.FileTypeQ4_K_M,
|
|
want: []string{"--allow-requantize", "in.gguf", "out.gguf", "Q4_K_M"},
|
|
},
|
|
{
|
|
name: "qwen35moe k quant keeps mtp projection q8",
|
|
arch: "qwen35moe",
|
|
fileType: fsggml.FileTypeQ4_K_M,
|
|
want: []string{
|
|
"--allow-requantize",
|
|
"--tensor-type", `^blk\.[0-9]+\.nextn\.eh_proj\.weight$=q8_0`,
|
|
"in.gguf", "out.gguf", "Q4_K_M",
|
|
},
|
|
},
|
|
{
|
|
name: "qwen35 k quant keeps mtp projection q8",
|
|
arch: "qwen35",
|
|
fileType: fsggml.FileTypeQ4_K_S,
|
|
want: []string{
|
|
"--allow-requantize",
|
|
"--tensor-type", `^blk\.[0-9]+\.nextn\.eh_proj\.weight$=q8_0`,
|
|
"in.gguf", "out.gguf", "Q4_K_S",
|
|
},
|
|
},
|
|
{
|
|
name: "qwen35moe f16 keeps mtp projection unquantized",
|
|
arch: "qwen35moe",
|
|
fileType: fsggml.FileTypeF16,
|
|
want: []string{"--allow-requantize", "in.gguf", "out.gguf", "F16"},
|
|
},
|
|
{
|
|
name: "qwen35moe bf16 keeps mtp projection unquantized",
|
|
arch: "qwen35moe",
|
|
fileType: fsggml.FileTypeBF16,
|
|
want: []string{"--allow-requantize", "in.gguf", "out.gguf", "BF16"},
|
|
},
|
|
{
|
|
name: "qwen35moe q8 already satisfies mtp projection floor",
|
|
arch: "qwen35moe",
|
|
fileType: fsggml.FileTypeQ8_0,
|
|
want: []string{"--allow-requantize", "in.gguf", "out.gguf", "Q8_0"},
|
|
},
|
|
{
|
|
name: "gemma3n k quant keeps per layer token embedding f16",
|
|
arch: "gemma3n",
|
|
fileType: fsggml.FileTypeQ4_K_M,
|
|
want: []string{
|
|
"--allow-requantize",
|
|
"--tensor-type", `^per_layer_token_embd\.weight$=f16`,
|
|
"in.gguf", "out.gguf", "Q4_K_M",
|
|
},
|
|
},
|
|
{
|
|
name: "deepseek2 k quant keeps mla tensors q8",
|
|
arch: "deepseek2",
|
|
fileType: fsggml.FileTypeQ4_K_M,
|
|
want: []string{
|
|
"--allow-requantize",
|
|
"--tensor-type", `attn_k_b\.weight$=q8_0`,
|
|
"--tensor-type", `attn_q_a\.weight$=q8_0`,
|
|
"--tensor-type", `attn_q_b\.weight$=q8_0`,
|
|
"--tensor-type", `attn_v_b\.weight$=q8_0`,
|
|
"--tensor-type", `attn_kv_a_mqa\.weight$=q8_0`,
|
|
"in.gguf", "out.gguf", "Q4_K_M",
|
|
},
|
|
},
|
|
{
|
|
name: "gemma3n q8 does not add k quant override",
|
|
arch: "gemma3n",
|
|
fileType: fsggml.FileTypeQ8_0,
|
|
want: []string{"--allow-requantize", "in.gguf", "out.gguf", "Q8_0"},
|
|
},
|
|
{
|
|
name: "glmocr k quant keeps input and output embeddings f16",
|
|
arch: "glmocr",
|
|
fileType: fsggml.FileTypeQ4_K_M,
|
|
want: []string{
|
|
"--allow-requantize",
|
|
"--tensor-type", `^token_embd\.weight$=f16`,
|
|
"--tensor-type", `^output\.weight$=f16`,
|
|
"in.gguf", "out.gguf", "Q4_K_M",
|
|
},
|
|
},
|
|
{
|
|
name: "glm4 k quant keeps glm-ocr split text embeddings f16",
|
|
arch: "glm4",
|
|
fileType: fsggml.FileTypeQ4_K_M,
|
|
want: []string{
|
|
"--allow-requantize",
|
|
"--tensor-type", `^token_embd\.weight$=f16`,
|
|
"--tensor-type", `^output\.weight$=f16`,
|
|
"in.gguf", "out.gguf", "Q4_K_M",
|
|
},
|
|
},
|
|
{
|
|
name: "copy does not add quantization overrides",
|
|
arch: "gemma3n",
|
|
fileType: fsggml.FileTypeQ4_K_M,
|
|
typeName: "COPY",
|
|
want: []string{"--allow-requantize", "in.gguf", "out.gguf", "COPY"},
|
|
},
|
|
{
|
|
name: "qwen35moe copy does not add mtp projection override",
|
|
arch: "qwen35moe",
|
|
fileType: fsggml.FileTypeQ4_K_M,
|
|
typeName: "COPY",
|
|
want: []string{"--allow-requantize", "in.gguf", "out.gguf", "COPY"},
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
typeName := tt.fileType.String()
|
|
if tt.typeName != "" {
|
|
typeName = tt.typeName
|
|
}
|
|
got := llamaQuantizeArgs(tt.arch, tt.fileType, "in.gguf", "out.gguf", typeName)
|
|
if !slices.Equal(got, tt.want) {
|
|
t.Fatalf("llamaQuantizeArgs = %#v, want %#v", got, tt.want)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestDisableLlamaCppCompat(t *testing.T) {
|
|
got := disableLlamaCppCompat([]string{"A=1", llamaCppCompatEnv + "=1", "B=2"})
|
|
want := []string{"A=1", "B=2", llamaCppCompatEnv + "=0"}
|
|
if !slices.Equal(got, want) {
|
|
t.Fatalf("disableLlamaCppCompat = %#v, want %#v", got, want)
|
|
}
|
|
}
|
|
|
|
func TestLlamaQuantizeEnv(t *testing.T) {
|
|
env := []string{"A=1", llamaCppCompatEnv + "=0", "B=2"}
|
|
tests := []struct {
|
|
name string
|
|
enableCompat bool
|
|
want []string
|
|
}{
|
|
{
|
|
name: "clean GGUF validation disables compat",
|
|
enableCompat: false,
|
|
want: []string{"A=1", "B=2", llamaCppCompatEnv + "=0"},
|
|
},
|
|
{
|
|
name: "embedded compatibility quantization allows compat",
|
|
enableCompat: true,
|
|
want: []string{"A=1", "B=2"},
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
if got := llamaQuantizeEnv(env, tt.enableCompat); !slices.Equal(got, tt.want) {
|
|
t.Fatalf("llamaQuantizeEnv = %#v, want %#v", got, tt.want)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestRestoreEmbeddedCompatibilityTensorsReplacesExistingCopies(t *testing.T) {
|
|
dir := t.TempDir()
|
|
|
|
origFile, orig := writeQuantizationTestGGUF(t, dir, "orig.gguf", fsggml.KV{
|
|
"general.architecture": "qwen35",
|
|
}, []*fsggml.Tensor{
|
|
testTensor("blk.0.weight", fsggml.TensorTypeF32, []uint64{1}, []byte{1, 2, 3, 4}),
|
|
testTensor("v.pos_embed.weight", fsggml.TensorTypeF16, []uint64{2}, []byte{5, 6, 7, 8}),
|
|
})
|
|
defer origFile.Close()
|
|
|
|
outFile, _ := writeQuantizationTestGGUF(t, dir, "out.gguf", fsggml.KV{
|
|
"general.architecture": "qwen35",
|
|
"general.file_type": fsggml.FileTypeQ4_K_M,
|
|
}, []*fsggml.Tensor{
|
|
testTensor("blk.0.weight", fsggml.TensorTypeF32, []uint64{1}, []byte{9, 10, 11, 12}),
|
|
testTensor("v.pos_embed.weight", fsggml.TensorTypeF32, []uint64{1}, []byte{13, 14, 15, 16}),
|
|
})
|
|
defer outFile.Close()
|
|
|
|
if err := restoreEmbeddedCompatibilityTensors(origFile, outFile, orig, fsggml.FileTypeQ4_K_M); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
if _, err := outFile.Seek(0, io.SeekStart); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
got, err := fsggml.Decode(outFile, -1)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
|
|
tensors := map[string]*fsggml.Tensor{}
|
|
for _, tensor := range got.Tensors().Items() {
|
|
tensors[tensor.Name] = tensor
|
|
}
|
|
if got := tensors["blk.0.weight"].Kind; got != uint32(fsggml.TensorTypeF32) {
|
|
t.Fatalf("blk.0.weight kind = %v, want F32", got)
|
|
}
|
|
if got := tensors["v.pos_embed.weight"].Kind; got != uint32(fsggml.TensorTypeF16) {
|
|
t.Fatalf("v.pos_embed.weight kind = %v, want F16 from original", got)
|
|
}
|
|
if got := tensors["v.pos_embed.weight"].Shape; !slices.Equal(got, []uint64{2}) {
|
|
t.Fatalf("v.pos_embed.weight shape = %v, want original shape [2]", got)
|
|
}
|
|
}
|
|
|
|
func writeQuantizationTestGGUF(t *testing.T, dir, name string, kv fsggml.KV, tensors []*fsggml.Tensor) (*os.File, *fsggml.GGML) {
|
|
t.Helper()
|
|
|
|
file, err := os.Create(filepath.Join(dir, name))
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
if err := fsggml.WriteGGUF(file, kv, tensors); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
if _, err := file.Seek(0, io.SeekStart); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
ggml, err := fsggml.Decode(file, -1)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
if _, err := file.Seek(0, io.SeekStart); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
return file, ggml
|
|
}
|
|
|
|
func testTensor(name string, kind fsggml.TensorType, shape []uint64, data []byte) *fsggml.Tensor {
|
|
return &fsggml.Tensor{
|
|
Name: name,
|
|
Kind: uint32(kind),
|
|
Shape: shape,
|
|
WriterTo: bytes.NewReader(data),
|
|
}
|
|
}
|