mirror of
https://github.com/ollama/ollama.git
synced 2026-06-03 05:53:55 -04:00
This change addresses some problems with GGUF conversion, including correctly naming the MoE tensors and correctly quantizing the nextn.eh_proj.weight MTP tensor.
336 lines
9.5 KiB
Go
336 lines
9.5 KiB
Go
package server
|
|
|
|
import (
|
|
"bufio"
|
|
"fmt"
|
|
"io"
|
|
"log/slog"
|
|
"maps"
|
|
"os"
|
|
"os/exec"
|
|
"path/filepath"
|
|
"regexp"
|
|
"strconv"
|
|
"strings"
|
|
|
|
fsggml "github.com/ollama/ollama/fs/ggml"
|
|
"github.com/ollama/ollama/llm"
|
|
"github.com/ollama/ollama/manifest"
|
|
)
|
|
|
|
// findLlamaQuantize locates the llama-quantize binary (installed alongside llama-server).
|
|
func findLlamaQuantize() (string, error) {
|
|
return llm.FindLlamaCppBinary("llama-quantize")
|
|
}
|
|
|
|
// progressRegex matches llama-quantize output lines like "[ 42/ 200]"
|
|
var progressRegex = regexp.MustCompile(`\[\s*(\d+)/\s*(\d+)\]`)
|
|
|
|
const llamaCppCompatEnv = "OLLAMA_LLAMA_CPP_COMPAT"
|
|
|
|
var runLlamaQuantize = runLlamaQuantizeCommand
|
|
|
|
// quantize re-quantizes a GGUF model by shelling out to llama-quantize.
|
|
// Embedded compatibility tensors are restored afterward because llama.cpp's
|
|
// text model loader intentionally does not claim those tensors.
|
|
func quantize(in, out *os.File, orig *fsggml.GGML, newFileType fsggml.FileType, progressFn func(n uint64)) error {
|
|
typeName := newFileType.String()
|
|
if typeName == "" {
|
|
return fmt.Errorf("unsupported quantization type: %v", newFileType)
|
|
}
|
|
if err := runLlamaQuantize(in, out, orig, newFileType, typeName, progressFn); err != nil {
|
|
return err
|
|
}
|
|
if hasEmbeddedCompatibilityTensors(orig) {
|
|
if err := restoreEmbeddedCompatibilityTensors(in, out, orig, newFileType); err != nil {
|
|
return fmt.Errorf("failed to restore embedded compatibility tensors: %w", err)
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func copyGGUFWithLlamaQuantize(in, out *os.File, orig *fsggml.GGML, progressFn func(n uint64)) error {
|
|
return runLlamaQuantize(in, out, orig, orig.KV().FileType(), "COPY", progressFn)
|
|
}
|
|
|
|
func needsDefaultLlavaProjectorType(ggml *fsggml.GGML) bool {
|
|
kv := ggml.KV()
|
|
if kv.Architecture() != "clip" || !kv.Bool("has_vision_encoder") {
|
|
return false
|
|
}
|
|
if _, ok := kv["clip.projector_type"]; ok {
|
|
return false
|
|
}
|
|
if _, ok := kv["clip.vision.projector_type"]; ok {
|
|
return false
|
|
}
|
|
return true
|
|
}
|
|
|
|
func addDefaultLlavaProjectorType(layer *layerGGML) (*layerGGML, error) {
|
|
blob, err := manifest.BlobsPath(layer.Digest)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
fp, err := os.Open(blob)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer fp.Close()
|
|
|
|
temp, err := os.CreateTemp(filepath.Dir(blob), "projector-metadata")
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer os.Remove(temp.Name())
|
|
defer temp.Close()
|
|
|
|
kv := maps.Clone(layer.GGML.KV())
|
|
kv["clip.projector_type"] = "mlp"
|
|
|
|
tensors := make([]*fsggml.Tensor, 0, len(layer.GGML.Tensors().Items()))
|
|
for _, tensor := range layer.GGML.Tensors().Items() {
|
|
tensors = append(tensors, tensorFromFile(fp, layer.GGML.Tensors().Offset+tensor.Offset, tensor))
|
|
}
|
|
|
|
if err := fsggml.WriteGGUF(temp, kv, tensors); err != nil {
|
|
return nil, err
|
|
}
|
|
if _, err := temp.Seek(0, io.SeekStart); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
newLayer, err := manifest.NewLayer(temp, layer.MediaType)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if _, err := temp.Seek(0, io.SeekStart); err != nil {
|
|
return nil, err
|
|
}
|
|
f, err := fsggml.Decode(temp, 1024)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return &layerGGML{Layer: newLayer, GGML: f}, nil
|
|
}
|
|
|
|
func runLlamaQuantizeCommand(in, out *os.File, orig *fsggml.GGML, newFileType fsggml.FileType, typeName string, progressFn func(n uint64)) error {
|
|
quantizeExe, err := findLlamaQuantize()
|
|
if err != nil {
|
|
return fmt.Errorf("llama-quantize unavailable: %w", err)
|
|
}
|
|
|
|
slog.Info("quantizing model", "type", typeName, "input", in.Name(), "output", out.Name())
|
|
|
|
args := llamaQuantizeArgs(orig.KV().Architecture(), newFileType, in.Name(), out.Name(), typeName)
|
|
cmd := exec.Command(quantizeExe, args...)
|
|
cmd.Env = llamaQuantizeEnv(os.Environ(), hasEmbeddedCompatibilityTensors(orig))
|
|
|
|
// Parse progress from stdout
|
|
stdout, err := cmd.StdoutPipe()
|
|
if err != nil {
|
|
return fmt.Errorf("failed to create stdout pipe: %w", err)
|
|
}
|
|
cmd.Stderr = os.Stderr
|
|
|
|
if err := cmd.Start(); err != nil {
|
|
return fmt.Errorf("failed to start llama-quantize: %w", err)
|
|
}
|
|
|
|
// Track total tensor size for progress reporting
|
|
totalSize := uint64(0)
|
|
for _, t := range orig.Tensors().Items() {
|
|
totalSize += t.Size()
|
|
}
|
|
|
|
var lastReported uint64
|
|
scanner := bufio.NewScanner(stdout)
|
|
for scanner.Scan() {
|
|
line := scanner.Text()
|
|
if matches := progressRegex.FindStringSubmatch(line); len(matches) == 3 {
|
|
current, _ := strconv.ParseUint(matches[1], 10, 64)
|
|
total, _ := strconv.ParseUint(matches[2], 10, 64)
|
|
if total > 0 && progressFn != nil {
|
|
// progressFn expects incremental byte deltas
|
|
done := totalSize * current / total
|
|
if done > lastReported {
|
|
progressFn(done - lastReported)
|
|
lastReported = done
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if err := cmd.Wait(); err != nil {
|
|
return fmt.Errorf("llama-quantize failed: %w", err)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func disableLlamaCppCompat(env []string) []string {
|
|
out := make([]string, 0, len(env)+1)
|
|
prefix := llamaCppCompatEnv + "="
|
|
for _, entry := range env {
|
|
if !strings.HasPrefix(entry, prefix) {
|
|
out = append(out, entry)
|
|
}
|
|
}
|
|
return append(out, llamaCppCompatEnv+"=0")
|
|
}
|
|
|
|
func llamaQuantizeEnv(env []string, enableCompat bool) []string {
|
|
if !enableCompat {
|
|
return disableLlamaCppCompat(env)
|
|
}
|
|
|
|
out := make([]string, 0, len(env))
|
|
prefix := llamaCppCompatEnv + "="
|
|
for _, entry := range env {
|
|
if !strings.HasPrefix(entry, prefix) {
|
|
out = append(out, entry)
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
// tensorSection wraps an *io.SectionReader so that a byte range of a file can
// be used wherever an io.WriterTo is required (see tensorFromFile, which
// stores it in a tensor's WriterTo field).
type tensorSection struct {
	*io.SectionReader
}

// WriteTo streams the bytes remaining in the section to w and returns the
// number of bytes copied together with any error reported by io.Copy.
func (r tensorSection) WriteTo(w io.Writer) (int64, error) {
	section := r.SectionReader

	written, err := io.Copy(w, section)
	return written, err
}
|
|
|
|
func restoreEmbeddedCompatibilityTensors(in, out *os.File, orig *fsggml.GGML, newFileType fsggml.FileType) error {
|
|
if _, err := out.Seek(0, io.SeekStart); err != nil {
|
|
return err
|
|
}
|
|
|
|
rewritten, err := fsggml.Decode(out, -1)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
kv := maps.Clone(orig.KV())
|
|
kv["general.file_type"] = newFileType
|
|
|
|
present := map[string]struct{}{}
|
|
tensors := make([]*fsggml.Tensor, 0, len(rewritten.Tensors().Items()))
|
|
for _, tensor := range rewritten.Tensors().Items() {
|
|
if isEmbeddedCompatibilityTensor(tensor.Name) {
|
|
continue
|
|
}
|
|
present[tensor.Name] = struct{}{}
|
|
tensors = append(tensors, tensorFromFile(out, rewritten.Tensors().Offset+tensor.Offset, tensor))
|
|
}
|
|
|
|
var restored int
|
|
for _, tensor := range orig.Tensors().Items() {
|
|
if !isEmbeddedCompatibilityTensor(tensor.Name) {
|
|
continue
|
|
}
|
|
if _, ok := present[tensor.Name]; ok {
|
|
continue
|
|
}
|
|
tensors = append(tensors, tensorFromFile(in, orig.Tensors().Offset+tensor.Offset, tensor))
|
|
restored++
|
|
}
|
|
if restored == 0 {
|
|
return nil
|
|
}
|
|
|
|
temp, err := os.CreateTemp(filepath.Dir(out.Name()), "compat-tensors")
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer os.Remove(temp.Name())
|
|
defer temp.Close()
|
|
|
|
if err := fsggml.WriteGGUF(temp, kv, tensors); err != nil {
|
|
return err
|
|
}
|
|
if _, err := temp.Seek(0, io.SeekStart); err != nil {
|
|
return err
|
|
}
|
|
if err := out.Truncate(0); err != nil {
|
|
return err
|
|
}
|
|
if _, err := out.Seek(0, io.SeekStart); err != nil {
|
|
return err
|
|
}
|
|
if _, err := io.Copy(out, temp); err != nil {
|
|
return err
|
|
}
|
|
|
|
slog.Info("restored embedded compatibility tensors after llama-quantize", "count", restored)
|
|
return nil
|
|
}
|
|
|
|
func tensorFromFile(file *os.File, offset uint64, tensor *fsggml.Tensor) *fsggml.Tensor {
|
|
return &fsggml.Tensor{
|
|
Name: tensor.Name,
|
|
Kind: tensor.Kind,
|
|
Shape: append([]uint64(nil), tensor.Shape...),
|
|
WriterTo: tensorSection{
|
|
SectionReader: io.NewSectionReader(file, int64(offset), int64(tensor.Size())),
|
|
},
|
|
}
|
|
}
|
|
|
|
func llamaQuantizeArgs(arch string, newFileType fsggml.FileType, input, output, typeName string) []string {
|
|
args := []string{"--allow-requantize"}
|
|
if typeName == "COPY" {
|
|
return append(args, input, output, typeName)
|
|
}
|
|
// Qwen3.5 MTP uses this projection to combine hidden and embedding states
|
|
// for the draft layer. Keep it at least Q8 when quantizing lower than Q8,
|
|
// while preserving unquantized outputs such as F16/BF16.
|
|
if arch == "qwen35" || arch == "qwen35moe" {
|
|
switch newFileType {
|
|
case fsggml.FileTypeQ4_K_S, fsggml.FileTypeQ4_K_M:
|
|
args = append(args, "--tensor-type", `^blk\.[0-9]+\.nextn\.eh_proj\.weight$=q8_0`)
|
|
}
|
|
}
|
|
// gemma3n's per_layer_token_embd is read on every layer for every token
|
|
// (not just once at input like token_embd), so it's far more quality-sensitive
|
|
// than a normal token embedding. Keep it at F16 on K-quants via an anchored
|
|
// regex so we don't also bump token_embd (which --token-embedding-type would).
|
|
if arch == "gemma3n" {
|
|
switch newFileType {
|
|
case fsggml.FileTypeQ4_K_S, fsggml.FileTypeQ4_K_M:
|
|
args = append(args, "--tensor-type", `^per_layer_token_embd\.weight$=f16`)
|
|
}
|
|
}
|
|
// deepseek2 MLA tensors are small, critical matrices in DeepSeek-V2-style
|
|
// multi-head latent attention. Keep the same higher-precision policy used
|
|
// by published library/glm-4.7-flash K-quant models.
|
|
if arch == "deepseek2" {
|
|
switch newFileType {
|
|
case fsggml.FileTypeQ4_K_S, fsggml.FileTypeQ4_K_M:
|
|
args = append(args,
|
|
"--tensor-type", `attn_k_b\.weight$=q8_0`,
|
|
"--tensor-type", `attn_q_a\.weight$=q8_0`,
|
|
"--tensor-type", `attn_q_b\.weight$=q8_0`,
|
|
"--tensor-type", `attn_v_b\.weight$=q8_0`,
|
|
"--tensor-type", `attn_kv_a_mqa\.weight$=q8_0`,
|
|
)
|
|
}
|
|
}
|
|
// GLM-OCR is a small multimodal OCR model; keeping the input/output
|
|
// embeddings high precision avoids degenerate text output on K-quants.
|
|
// Legacy Ollama GGUFs use "glmocr"; split text GGUFs use llama.cpp's
|
|
// native "glm4" architecture.
|
|
if arch == "glmocr" || arch == "glm4" {
|
|
switch newFileType {
|
|
case fsggml.FileTypeQ4_K_S, fsggml.FileTypeQ4_K_M:
|
|
args = append(args,
|
|
"--tensor-type", `^token_embd\.weight$=f16`,
|
|
"--tensor-type", `^output\.weight$=f16`,
|
|
)
|
|
}
|
|
}
|
|
return append(args, input, output, typeName)
|
|
}
|