mirror of
https://github.com/ollama/ollama.git
synced 2025-12-25 08:40:47 -05:00
Compare commits
21 Commits
royh-param
...
timeout
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d77a174eb4 | ||
|
|
2cc7d05012 | ||
|
|
123a722a6f | ||
|
|
4d311eb731 | ||
|
|
cb42e607c5 | ||
|
|
2aa91a937b | ||
|
|
ccef9431c8 | ||
|
|
9a9e7d83c4 | ||
|
|
189a43caa2 | ||
|
|
e835ef1836 | ||
|
|
7e7749224c | ||
|
|
c7c2f3bc22 | ||
|
|
54a79d6a8a | ||
|
|
5bf5aeec01 | ||
|
|
e01e535cbb | ||
|
|
0195d6a2f8 | ||
|
|
8e0641a9bf | ||
|
|
662568d453 | ||
|
|
4ebb66c662 | ||
|
|
23e899f32d | ||
|
|
1a1c99e334 |
10
README.md
10
README.md
@@ -53,8 +53,8 @@ Here are some example models that can be downloaded:
|
||||
| Llama 3 | 70B | 40GB | `ollama run llama3:70b` |
|
||||
| Phi 3 Mini | 3.8B | 2.3GB | `ollama run phi3` |
|
||||
| Phi 3 Medium | 14B | 7.9GB | `ollama run phi3:medium` |
|
||||
| Gemma | 2B | 1.4GB | `ollama run gemma:2b` |
|
||||
| Gemma | 7B | 4.8GB | `ollama run gemma:7b` |
|
||||
| Gemma 2 | 9B | 5.5GB | `ollama run gemma2` |
|
||||
| Gemma 2 | 27B | 16GB | `ollama run gemma2:27b` |
|
||||
| Mistral | 7B | 4.1GB | `ollama run mistral` |
|
||||
| Moondream 2 | 1.4B | 829MB | `ollama run moondream` |
|
||||
| Neural Chat | 7B | 4.1GB | `ollama run neural-chat` |
|
||||
@@ -182,6 +182,12 @@ $ ollama run llama3 "Summarize this file: $(cat README.md)"
|
||||
Ollama is a lightweight, extensible framework for building and running language models on the local machine. It provides a simple API for creating, running, and managing models, as well as a library of pre-built models that can be easily used in a variety of applications.
|
||||
```
|
||||
|
||||
### Show model information
|
||||
|
||||
```
|
||||
ollama show llama3
|
||||
```
|
||||
|
||||
### List models on your computer
|
||||
|
||||
```
|
||||
|
||||
13
api/types.go
13
api/types.go
@@ -608,6 +608,19 @@ func FormatParams(params map[string][]string) (map[string]interface{}, error) {
|
||||
} else {
|
||||
field := valueOpts.FieldByName(opt.Name)
|
||||
if field.IsValid() && field.CanSet() {
|
||||
if reflect.PointerTo(field.Type()) == reflect.TypeOf((*TriState)(nil)) {
|
||||
boolVal, err := strconv.ParseBool(vals[0])
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("invalid bool value %s", vals)
|
||||
}
|
||||
if boolVal {
|
||||
out[key] = TriStateTrue
|
||||
} else {
|
||||
out[key] = TriStateFalse
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
switch field.Kind() {
|
||||
case reflect.Float32:
|
||||
floatVal, err := strconv.ParseFloat(vals[0], 32)
|
||||
|
||||
@@ -2,6 +2,7 @@ package api
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"math"
|
||||
"testing"
|
||||
"time"
|
||||
@@ -141,3 +142,65 @@ func TestUseMmapParsingFromJSON(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestUseMmapFormatParams(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
req map[string][]string
|
||||
exp TriState
|
||||
err error
|
||||
}{
|
||||
{
|
||||
name: "True",
|
||||
req: map[string][]string{
|
||||
"use_mmap": []string{"true"},
|
||||
},
|
||||
exp: TriStateTrue,
|
||||
err: nil,
|
||||
},
|
||||
{
|
||||
name: "False",
|
||||
req: map[string][]string{
|
||||
"use_mmap": []string{"false"},
|
||||
},
|
||||
exp: TriStateFalse,
|
||||
err: nil,
|
||||
},
|
||||
{
|
||||
name: "Numeric True",
|
||||
req: map[string][]string{
|
||||
"use_mmap": []string{"1"},
|
||||
},
|
||||
exp: TriStateTrue,
|
||||
err: nil,
|
||||
},
|
||||
{
|
||||
name: "Numeric False",
|
||||
req: map[string][]string{
|
||||
"use_mmap": []string{"0"},
|
||||
},
|
||||
exp: TriStateFalse,
|
||||
err: nil,
|
||||
},
|
||||
{
|
||||
name: "invalid string",
|
||||
req: map[string][]string{
|
||||
"use_mmap": []string{"foo"},
|
||||
},
|
||||
exp: TriStateUndefined,
|
||||
err: fmt.Errorf("invalid bool value [foo]"),
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
t.Run(test.name, func(t *testing.T) {
|
||||
resp, err := FormatParams(test.req)
|
||||
require.Equal(t, err, test.err)
|
||||
respVal, ok := resp["use_mmap"]
|
||||
if test.exp != TriStateUndefined {
|
||||
assert.True(t, ok, "resp: %v", resp)
|
||||
assert.Equal(t, test.exp, respVal)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
71
cmd/cmd.go
71
cmd/cmd.go
@@ -162,9 +162,6 @@ func tempZipFiles(path string) (string, error) {
|
||||
}
|
||||
defer tempfile.Close()
|
||||
|
||||
zipfile := zip.NewWriter(tempfile)
|
||||
defer zipfile.Close()
|
||||
|
||||
detectContentType := func(path string) (string, error) {
|
||||
f, err := os.Open(path)
|
||||
if err != nil {
|
||||
@@ -233,6 +230,9 @@ func tempZipFiles(path string) (string, error) {
|
||||
files = append(files, tks...)
|
||||
}
|
||||
|
||||
zipfile := zip.NewWriter(tempfile)
|
||||
defer zipfile.Close()
|
||||
|
||||
for _, file := range files {
|
||||
f, err := os.Open(file)
|
||||
if err != nil {
|
||||
@@ -287,38 +287,12 @@ func createBlob(cmd *cobra.Command, client *api.Client, path string) (string, er
|
||||
}
|
||||
|
||||
func RunHandler(cmd *cobra.Command, args []string) error {
|
||||
client, err := api.ClientFromEnvironment()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
name := args[0]
|
||||
|
||||
// check if the model exists on the server
|
||||
show, err := client.Show(cmd.Context(), &api.ShowRequest{Name: name})
|
||||
var statusError api.StatusError
|
||||
switch {
|
||||
case errors.As(err, &statusError) && statusError.StatusCode == http.StatusNotFound:
|
||||
if err := PullHandler(cmd, []string{name}); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
show, err = client.Show(cmd.Context(), &api.ShowRequest{Name: name})
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
case err != nil:
|
||||
return err
|
||||
}
|
||||
|
||||
interactive := true
|
||||
|
||||
opts := runOptions{
|
||||
Model: args[0],
|
||||
WordWrap: os.Getenv("TERM") == "xterm-256color",
|
||||
Options: map[string]interface{}{},
|
||||
MultiModal: slices.Contains(show.Details.Families, "clip"),
|
||||
ParentModel: show.Details.ParentModel,
|
||||
Model: args[0],
|
||||
WordWrap: os.Getenv("TERM") == "xterm-256color",
|
||||
Options: map[string]interface{}{},
|
||||
}
|
||||
|
||||
format, err := cmd.Flags().GetString("format")
|
||||
@@ -362,11 +336,38 @@ func RunHandler(cmd *cobra.Command, args []string) error {
|
||||
}
|
||||
opts.WordWrap = !nowrap
|
||||
|
||||
if !interactive {
|
||||
return generate(cmd, opts)
|
||||
// Fill out the rest of the options based on information about the
|
||||
// model.
|
||||
client, err := api.ClientFromEnvironment()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return generateInteractive(cmd, opts)
|
||||
name := args[0]
|
||||
info, err := func() (*api.ShowResponse, error) {
|
||||
showReq := &api.ShowRequest{Name: name}
|
||||
info, err := client.Show(cmd.Context(), showReq)
|
||||
var se api.StatusError
|
||||
if errors.As(err, &se) && se.StatusCode == http.StatusNotFound {
|
||||
if err := PullHandler(cmd, []string{name}); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return client.Show(cmd.Context(), &api.ShowRequest{Name: name})
|
||||
}
|
||||
return info, err
|
||||
}()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
opts.MultiModal = slices.Contains(info.Details.Families, "clip")
|
||||
opts.ParentModel = info.Details.ParentModel
|
||||
opts.Messages = append(opts.Messages, info.Messages...)
|
||||
|
||||
if interactive {
|
||||
return generateInteractive(cmd, opts)
|
||||
}
|
||||
return generate(cmd, opts)
|
||||
}
|
||||
|
||||
func errFromUnknownKey(unknownKeyErr error) error {
|
||||
|
||||
@@ -31,65 +31,40 @@ const (
|
||||
)
|
||||
|
||||
func loadModel(cmd *cobra.Command, opts *runOptions) error {
|
||||
client, err := api.ClientFromEnvironment()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
p := progress.NewProgress(os.Stderr)
|
||||
defer p.StopAndClear()
|
||||
|
||||
spinner := progress.NewSpinner("")
|
||||
p.Add("", spinner)
|
||||
|
||||
showReq := api.ShowRequest{Name: opts.Model}
|
||||
showResp, err := client.Show(cmd.Context(), &showReq)
|
||||
client, err := api.ClientFromEnvironment()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
opts.MultiModal = slices.Contains(showResp.Details.Families, "clip")
|
||||
opts.ParentModel = showResp.Details.ParentModel
|
||||
|
||||
if len(showResp.Messages) > 0 {
|
||||
opts.Messages = append(opts.Messages, showResp.Messages...)
|
||||
}
|
||||
|
||||
chatReq := &api.ChatRequest{
|
||||
Model: opts.Model,
|
||||
Messages: []api.Message{},
|
||||
Model: opts.Model,
|
||||
KeepAlive: opts.KeepAlive,
|
||||
}
|
||||
|
||||
if opts.KeepAlive != nil {
|
||||
chatReq.KeepAlive = opts.KeepAlive
|
||||
}
|
||||
|
||||
err = client.Chat(cmd.Context(), chatReq, func(resp api.ChatResponse) error {
|
||||
return client.Chat(cmd.Context(), chatReq, func(resp api.ChatResponse) error {
|
||||
p.StopAndClear()
|
||||
if len(opts.Messages) > 0 {
|
||||
for _, msg := range opts.Messages {
|
||||
switch msg.Role {
|
||||
case "user":
|
||||
fmt.Printf(">>> %s\n", msg.Content)
|
||||
case "assistant":
|
||||
state := &displayResponseState{}
|
||||
displayResponse(msg.Content, opts.WordWrap, state)
|
||||
fmt.Println()
|
||||
fmt.Println()
|
||||
}
|
||||
for _, msg := range opts.Messages {
|
||||
switch msg.Role {
|
||||
case "user":
|
||||
fmt.Printf(">>> %s\n", msg.Content)
|
||||
case "assistant":
|
||||
state := &displayResponseState{}
|
||||
displayResponse(msg.Content, opts.WordWrap, state)
|
||||
fmt.Println()
|
||||
fmt.Println()
|
||||
}
|
||||
}
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func generateInteractive(cmd *cobra.Command, opts runOptions) error {
|
||||
opts.Messages = make([]api.Message, 0)
|
||||
|
||||
err := loadModel(cmd, &opts)
|
||||
if err != nil {
|
||||
return err
|
||||
|
||||
@@ -77,20 +77,27 @@ func cleanupTmpDirs() {
|
||||
continue
|
||||
}
|
||||
raw, err := os.ReadFile(filepath.Join(d, "ollama.pid"))
|
||||
if err == nil {
|
||||
pid, err := strconv.Atoi(string(raw))
|
||||
if err == nil {
|
||||
if proc, err := os.FindProcess(pid); err == nil && !errors.Is(proc.Signal(syscall.Signal(0)), os.ErrProcessDone) {
|
||||
// Another running ollama, ignore this tmpdir
|
||||
continue
|
||||
}
|
||||
}
|
||||
} else {
|
||||
slog.Debug("failed to open ollama.pid", "path", d, "error", err)
|
||||
}
|
||||
err = os.RemoveAll(d)
|
||||
if err != nil {
|
||||
slog.Debug("unable to cleanup stale tmpdir", "path", d, "error", err)
|
||||
slog.Warn("failed to read ollama.pid", "path", d, "error", err)
|
||||
// No pid, ignore this tmpdir
|
||||
continue
|
||||
}
|
||||
|
||||
pid, err := strconv.Atoi(string(raw))
|
||||
if err != nil {
|
||||
slog.Warn("failed to parse pid", "path", d, "error", err)
|
||||
continue
|
||||
}
|
||||
|
||||
proc, err := os.FindProcess(pid)
|
||||
if err == nil && !errors.Is(proc.Signal(syscall.Signal(0)), os.ErrProcessDone) {
|
||||
slog.Warn("found running ollama", "pid", pid, "path", d)
|
||||
// Another running ollama, ignore this tmpdir
|
||||
continue
|
||||
}
|
||||
|
||||
if err := os.Remove(d); err != nil {
|
||||
slog.Warn("unable to cleanup stale tmpdir", "path", d, "error", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
13
llm/ggla.go
13
llm/ggla.go
@@ -53,7 +53,7 @@ func (llm *ggla) Tensors() Tensors {
|
||||
return llm.tensors
|
||||
}
|
||||
|
||||
func (llm *ggla) decode(rs io.ReadSeeker) error {
|
||||
func (llm *ggla) decode(rs io.ReadSeeker) (retErr error) {
|
||||
var r uint32
|
||||
if err := binary.Read(rs, binary.LittleEndian, &r); err != nil {
|
||||
return err
|
||||
@@ -69,9 +69,18 @@ func (llm *ggla) decode(rs io.ReadSeeker) error {
|
||||
for {
|
||||
var dims uint32
|
||||
if err := binary.Read(rs, binary.LittleEndian, &dims); err != nil {
|
||||
if errors.Is(err, io.EOF) {
|
||||
return nil
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
defer func() {
|
||||
if errors.Is(retErr, io.EOF) {
|
||||
retErr = io.ErrUnexpectedEOF
|
||||
}
|
||||
}()
|
||||
|
||||
var namesize uint32
|
||||
if err := binary.Read(rs, binary.LittleEndian, &namesize); err != nil {
|
||||
return err
|
||||
@@ -108,7 +117,7 @@ func (llm *ggla) decode(rs io.ReadSeeker) error {
|
||||
return err
|
||||
}
|
||||
|
||||
if _, err := rs.Seek((offset+31)&-32, io.SeekStart); err != nil {
|
||||
if _, err := rs.Seek((offset+31)&-32-offset, io.SeekCurrent); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
|
||||
65
llm/ggml.go
65
llm/ggml.go
@@ -6,6 +6,8 @@ import (
|
||||
"fmt"
|
||||
"io"
|
||||
"strings"
|
||||
|
||||
"github.com/ollama/ollama/util/bufioutil"
|
||||
)
|
||||
|
||||
type GGML struct {
|
||||
@@ -69,6 +71,30 @@ func (kv KV) HeadCountKV() uint64 {
|
||||
return 1
|
||||
}
|
||||
|
||||
func (kv KV) EmbeddingHeadCount() uint64 {
|
||||
if heads := kv.HeadCount(); heads > 0 {
|
||||
return kv.EmbeddingLength() / kv.HeadCount()
|
||||
}
|
||||
|
||||
return 0
|
||||
}
|
||||
|
||||
func (kv KV) EmbeddingHeadCountK() uint64 {
|
||||
if k := kv.u64(fmt.Sprintf("%s.attention.key_length", kv.Architecture())); k > 0 {
|
||||
return k
|
||||
}
|
||||
|
||||
return kv.EmbeddingHeadCount()
|
||||
}
|
||||
|
||||
func (kv KV) EmbeddingHeadCountV() uint64 {
|
||||
if v := kv.u64(fmt.Sprintf("%s.attention.value_length", kv.Architecture())); v > 0 {
|
||||
return v
|
||||
}
|
||||
|
||||
return kv.EmbeddingHeadCount()
|
||||
}
|
||||
|
||||
func (kv KV) GQA() uint64 {
|
||||
return kv.HeadCount() / kv.HeadCountKV()
|
||||
}
|
||||
@@ -254,7 +280,18 @@ func DetectGGMLType(b []byte) string {
|
||||
}
|
||||
}
|
||||
|
||||
func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) {
|
||||
// DecodeGGML decodes a GGML model from the given reader.
|
||||
//
|
||||
// It collects array values for arrays with a size less than or equal to
|
||||
// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
|
||||
// the maxArraySize is negative, all arrays are collected.
|
||||
func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
|
||||
if maxArraySize == 0 {
|
||||
maxArraySize = 1024
|
||||
}
|
||||
|
||||
rs = bufioutil.NewBufferedSeeker(rs, 32<<10)
|
||||
|
||||
var magic uint32
|
||||
if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
|
||||
return nil, 0, err
|
||||
@@ -267,17 +304,15 @@ func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) {
|
||||
case FILE_MAGIC_GGLA:
|
||||
c = &containerGGLA{}
|
||||
case FILE_MAGIC_GGUF_LE:
|
||||
c = &containerGGUF{ByteOrder: binary.LittleEndian}
|
||||
c = &containerGGUF{ByteOrder: binary.LittleEndian, maxArraySize: maxArraySize}
|
||||
case FILE_MAGIC_GGUF_BE:
|
||||
c = &containerGGUF{ByteOrder: binary.BigEndian}
|
||||
c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize}
|
||||
default:
|
||||
return nil, 0, errors.New("invalid file magic")
|
||||
}
|
||||
|
||||
model, err := c.Decode(rs)
|
||||
if errors.Is(err, io.EOF) {
|
||||
// noop
|
||||
} else if err != nil {
|
||||
if err != nil {
|
||||
return nil, 0, err
|
||||
}
|
||||
|
||||
@@ -297,7 +332,10 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
|
||||
embedding := llm.KV().EmbeddingLength()
|
||||
heads := llm.KV().HeadCount()
|
||||
headsKV := llm.KV().HeadCountKV()
|
||||
vocab := uint64(len(llm.KV()["tokenizer.ggml.tokens"].([]any)))
|
||||
vocab := uint64(llm.KV()["tokenizer.ggml.tokens"].(*array).size)
|
||||
|
||||
embeddingHeads := llm.KV().EmbeddingHeadCount()
|
||||
embeddingHeadsK := llm.KV().EmbeddingHeadCountK()
|
||||
|
||||
layers := llm.Tensors().Layers()
|
||||
|
||||
@@ -308,7 +346,7 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
|
||||
partialOffload = 4 * batch * embedding
|
||||
partialOffload += max(
|
||||
// 4*batch*(4+6*embedding+context*(2*heads)+llm.KV().GQA()),
|
||||
4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embedding/heads*headsKV),
|
||||
4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embeddingHeads*headsKV),
|
||||
4*batch*(embedding+vocab)+embedding*vocab*105/128,
|
||||
)
|
||||
|
||||
@@ -316,15 +354,15 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
|
||||
// mixtral 8x22b
|
||||
ff := uint64(llm.KV()["llama.feed_forward_length"].(uint32))
|
||||
partialOffload = max(
|
||||
3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embedding/heads*headsKV),
|
||||
4*(context*batch*heads+context*embedding/heads*headsKV+batch*1024+embedding/heads*headsKV*batch),
|
||||
3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV),
|
||||
4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch),
|
||||
)
|
||||
} else if ffnGateWeight, ok := layers["blk.0"]["ffn_gate.0.weight"]; ok {
|
||||
// mixtral 8x7b
|
||||
ffnGateWeight1 := ffnGateWeight.Shape[1]
|
||||
fullOffload = 4 * batch * (2 + 3*embedding + context*(1+heads) + 2*headsKV + ffnGateWeight1)
|
||||
partialOffload = max(
|
||||
4*batch*(3+embedding/heads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16,
|
||||
4*batch*(3+embeddingHeads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16,
|
||||
4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
|
||||
)
|
||||
}
|
||||
@@ -368,15 +406,14 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
|
||||
fullOffload,
|
||||
)
|
||||
case "deepseek2":
|
||||
keys := uint64(llm.KV()["deepseek2.attention.key_length"].(uint32))
|
||||
fullOffload = max(
|
||||
4*batch*(3*embedding+vocab),
|
||||
4*batch*(3*embedding+2+context*(1+headsKV)+2*keys*headsKV),
|
||||
4*batch*(3*embedding+2+context*(1+headsKV)+2*embeddingHeadsK*headsKV),
|
||||
)
|
||||
|
||||
partialOffload = max(
|
||||
4*batch*(3*embedding+vocab)+embedding*vocab*105/128,
|
||||
4*batch*(2*embedding+1+2*keys*headsKV+context+context*headsKV)+4*keys*context*headsKV+embedding*keys*headsKV*9/16,
|
||||
4*batch*(2*embedding+1+2*embeddingHeadsK*headsKV+context+context*headsKV)+4*embeddingHeadsK*context*headsKV+embedding*embeddingHeadsK*headsKV*9/16,
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
1
llm/ggml_test.go
Normal file
1
llm/ggml_test.go
Normal file
@@ -0,0 +1 @@
|
||||
package llm
|
||||
130
llm/gguf.go
130
llm/gguf.go
@@ -3,11 +3,10 @@ package llm
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/binary"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"strings"
|
||||
|
||||
"log/slog"
|
||||
)
|
||||
|
||||
type containerGGUF struct {
|
||||
@@ -29,6 +28,12 @@ type containerGGUF struct {
|
||||
NumTensor uint64
|
||||
NumKV uint64
|
||||
}
|
||||
|
||||
maxArraySize int
|
||||
}
|
||||
|
||||
func (c *containerGGUF) canCollectArray(size int) bool {
|
||||
return c.maxArraySize < 0 || size <= c.maxArraySize
|
||||
}
|
||||
|
||||
func (c *containerGGUF) Name() string {
|
||||
@@ -54,7 +59,6 @@ func (c *containerGGUF) Decode(rs io.ReadSeeker) (model, error) {
|
||||
}
|
||||
|
||||
model := newGGUF(c)
|
||||
slog.Debug(fmt.Sprintf("model = %#v", model))
|
||||
if err := model.Decode(rs); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -85,6 +89,8 @@ type gguf struct {
|
||||
tensors []*Tensor
|
||||
|
||||
parameters uint64
|
||||
|
||||
scratch [16 << 10]byte
|
||||
}
|
||||
|
||||
func newGGUF(container *containerGGUF) *gguf {
|
||||
@@ -181,34 +187,34 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
|
||||
}
|
||||
|
||||
// decode tensors
|
||||
for i := 0; uint64(i) < llm.numTensor(); i++ {
|
||||
for range llm.numTensor() {
|
||||
name, err := readGGUFString(llm, rs)
|
||||
if err != nil {
|
||||
return err
|
||||
return fmt.Errorf("failed to read tensor name: %w", err)
|
||||
}
|
||||
|
||||
// dims is the number of dimensions in the tensor
|
||||
dims, err := readGGUF[uint32](llm, rs)
|
||||
if err != nil {
|
||||
return err
|
||||
return fmt.Errorf("failed to read tensor dimensions: %w", err)
|
||||
}
|
||||
|
||||
shape := [4]uint64{1, 1, 1, 1}
|
||||
for i := 0; uint32(i) < dims; i++ {
|
||||
shape[i], err = readGGUF[uint64](llm, rs)
|
||||
if err != nil {
|
||||
return err
|
||||
return fmt.Errorf("failed to read tensor shape: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
kind, err := readGGUF[uint32](llm, rs)
|
||||
if err != nil {
|
||||
return err
|
||||
return fmt.Errorf("failed to read tensor kind: %w", err)
|
||||
}
|
||||
|
||||
offset, err := readGGUF[uint64](llm, rs)
|
||||
if err != nil {
|
||||
return err
|
||||
return fmt.Errorf("failed to read tensor offset: %w", err)
|
||||
}
|
||||
|
||||
tensor := Tensor{
|
||||
@@ -230,24 +236,19 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
|
||||
alignment = 32
|
||||
}
|
||||
|
||||
offset, err := rs.Seek(0, io.SeekCurrent)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
padding := llm.padding(offset, int64(alignment))
|
||||
if _, err := rs.Seek(padding, io.SeekCurrent); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, tensor := range llm.tensors {
|
||||
if _, err := rs.Seek(int64(tensor.Size()), io.SeekCurrent); err != nil {
|
||||
return err
|
||||
offset, err := rs.Seek(0, io.SeekCurrent)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to get current offset: %w", err)
|
||||
}
|
||||
|
||||
padding := llm.padding(int64(tensor.Size()), int64(alignment))
|
||||
padding := llm.padding(offset, int64(alignment))
|
||||
if _, err := rs.Seek(padding, io.SeekCurrent); err != nil {
|
||||
return err
|
||||
return fmt.Errorf("failed to seek to init padding: %w", err)
|
||||
}
|
||||
|
||||
if _, err := rs.Seek(int64(tensor.Size()), io.SeekCurrent); err != nil {
|
||||
return fmt.Errorf("failed to seek to tensor: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -285,22 +286,48 @@ func readGGUFV1String(llm *gguf, r io.Reader) (string, error) {
|
||||
return b.String(), nil
|
||||
}
|
||||
|
||||
func discardGGUFString(llm *gguf, r io.Reader) error {
|
||||
buf := llm.scratch[:8]
|
||||
_, err := io.ReadFull(r, buf)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
size := int(llm.ByteOrder.Uint64(buf))
|
||||
for size > 0 {
|
||||
n, err := r.Read(llm.scratch[:min(size, cap(llm.scratch))])
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
size -= n
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func readGGUFString(llm *gguf, r io.Reader) (string, error) {
|
||||
if llm.Version == 1 {
|
||||
return readGGUFV1String(llm, r)
|
||||
}
|
||||
|
||||
var length uint64
|
||||
if err := binary.Read(r, llm.ByteOrder, &length); err != nil {
|
||||
buf := llm.scratch[:8]
|
||||
_, err := io.ReadFull(r, buf)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
var b bytes.Buffer
|
||||
if _, err := io.CopyN(&b, r, int64(length)); err != nil {
|
||||
length := int(llm.ByteOrder.Uint64(buf))
|
||||
if length > len(llm.scratch) {
|
||||
buf = make([]byte, length)
|
||||
} else {
|
||||
buf = llm.scratch[:length]
|
||||
}
|
||||
clear(buf)
|
||||
|
||||
_, err = io.ReadFull(r, buf)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
return b.String(), nil
|
||||
return string(buf), nil
|
||||
}
|
||||
|
||||
func writeGGUFString(llm *gguf, w io.Writer, s string) error {
|
||||
@@ -316,7 +343,16 @@ func writeGGUFString(llm *gguf, w io.Writer, s string) error {
|
||||
return err
|
||||
}
|
||||
|
||||
func readGGUFV1Array(llm *gguf, r io.Reader) (a []any, err error) {
|
||||
type array struct {
|
||||
size int
|
||||
values []any
|
||||
}
|
||||
|
||||
func (a *array) MarshalJSON() ([]byte, error) {
|
||||
return json.Marshal(a.values)
|
||||
}
|
||||
|
||||
func readGGUFV1Array(llm *gguf, r io.Reader) (*array, error) {
|
||||
t, err := readGGUF[uint32](llm, r)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
@@ -327,7 +363,12 @@ func readGGUFV1Array(llm *gguf, r io.Reader) (a []any, err error) {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for i := 0; uint32(i) < n; i++ {
|
||||
a := &array{size: int(n)}
|
||||
if llm.canCollectArray(int(n)) {
|
||||
a.values = make([]any, 0, int(n))
|
||||
}
|
||||
|
||||
for i := range n {
|
||||
var e any
|
||||
switch t {
|
||||
case ggufTypeUint8:
|
||||
@@ -361,13 +402,15 @@ func readGGUFV1Array(llm *gguf, r io.Reader) (a []any, err error) {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
a = append(a, e)
|
||||
if a.values != nil {
|
||||
a.values[i] = e
|
||||
}
|
||||
}
|
||||
|
||||
return
|
||||
return a, nil
|
||||
}
|
||||
|
||||
func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) {
|
||||
func readGGUFArray(llm *gguf, r io.Reader) (*array, error) {
|
||||
if llm.Version == 1 {
|
||||
return readGGUFV1Array(llm, r)
|
||||
}
|
||||
@@ -382,7 +425,12 @@ func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for i := 0; uint64(i) < n; i++ {
|
||||
a := &array{size: int(n)}
|
||||
if llm.canCollectArray(int(n)) {
|
||||
a.values = make([]any, int(n))
|
||||
}
|
||||
|
||||
for i := range n {
|
||||
var e any
|
||||
switch t {
|
||||
case ggufTypeUint8:
|
||||
@@ -408,7 +456,11 @@ func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) {
|
||||
case ggufTypeBool:
|
||||
e, err = readGGUF[bool](llm, r)
|
||||
case ggufTypeString:
|
||||
e, err = readGGUFString(llm, r)
|
||||
if a.values != nil {
|
||||
e, err = readGGUFString(llm, r)
|
||||
} else {
|
||||
err = discardGGUFString(llm, r)
|
||||
}
|
||||
default:
|
||||
return nil, fmt.Errorf("invalid array type: %d", t)
|
||||
}
|
||||
@@ -416,10 +468,12 @@ func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
a = append(a, e)
|
||||
if a.values != nil {
|
||||
a.values[i] = e
|
||||
}
|
||||
}
|
||||
|
||||
return
|
||||
return a, nil
|
||||
}
|
||||
|
||||
func writeGGUFArray[S ~[]E, E any](llm *gguf, w io.Writer, t uint32, s S) error {
|
||||
|
||||
@@ -115,8 +115,8 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
|
||||
slog.Warn("model missing blk.0 layer size")
|
||||
}
|
||||
|
||||
// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
|
||||
var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
|
||||
// fp16 k,v = sizeof(float16) * n_ctx * n_layer * (n_embd_head_k + n_embd_head_v) * n_head_kv
|
||||
var kv uint64 = 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * (ggml.KV().EmbeddingHeadCountK() + ggml.KV().EmbeddingHeadCountV()) * ggml.KV().HeadCountKV()
|
||||
|
||||
// KV is proportional to the number of layers
|
||||
layerSize += kv / ggml.KV().BlockCount()
|
||||
|
||||
@@ -22,13 +22,14 @@ func TestEstimateGPULayers(t *testing.T) {
|
||||
defer f.Close()
|
||||
gguf := NewGGUFV3(binary.LittleEndian)
|
||||
inputLayerCount := 5
|
||||
|
||||
tensors := []Tensor{
|
||||
{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
|
||||
{Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
|
||||
{Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
|
||||
{Name: "blk.3.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
|
||||
{Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
|
||||
{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
|
||||
{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
|
||||
{Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
|
||||
{Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
|
||||
{Name: "blk.3.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
|
||||
{Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
|
||||
{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
|
||||
}
|
||||
assert.Len(t, tensors, inputLayerCount+1)
|
||||
err = gguf.Encode(f, KV{
|
||||
@@ -45,8 +46,10 @@ func TestEstimateGPULayers(t *testing.T) {
|
||||
}, tensors)
|
||||
require.NoError(t, err)
|
||||
|
||||
ggml, err := LoadModel(f.Name())
|
||||
require.NoError(t, err)
|
||||
ggml, err := LoadModel(f.Name(), 0)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Simple CPU scenario
|
||||
gpus := []gpu.GpuInfo{
|
||||
|
||||
305
llm/patches/07-gemma.diff
Normal file
305
llm/patches/07-gemma.diff
Normal file
@@ -0,0 +1,305 @@
|
||||
From 5cadb45f39d001ffbad95b690d6cf0abcb4a6d96 Mon Sep 17 00:00:00 2001
|
||||
From: Ollama maintainers <hello@ollama.com>
|
||||
Date: Wed, 26 Jun 2024 16:18:09 -0700
|
||||
Subject: [PATCH] Architecture support
|
||||
|
||||
---
|
||||
llama.cpp | 194 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
|
||||
1 file changed, 193 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/llama.cpp b/llama.cpp
|
||||
index 61948751..3b4196f5 100644
|
||||
--- a/llama.cpp
|
||||
+++ b/llama.cpp
|
||||
@@ -217,6 +217,7 @@ enum llm_arch {
|
||||
LLM_ARCH_INTERNLM2,
|
||||
LLM_ARCH_MINICPM,
|
||||
LLM_ARCH_GEMMA,
|
||||
+ LLM_ARCH_GEMMA2,
|
||||
LLM_ARCH_STARCODER2,
|
||||
LLM_ARCH_MAMBA,
|
||||
LLM_ARCH_XVERSE,
|
||||
@@ -255,6 +256,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_INTERNLM2, "internlm2" },
|
||||
{ LLM_ARCH_MINICPM, "minicpm" },
|
||||
{ LLM_ARCH_GEMMA, "gemma" },
|
||||
+ { LLM_ARCH_GEMMA2, "gemma2" },
|
||||
{ LLM_ARCH_STARCODER2, "starcoder2" },
|
||||
{ LLM_ARCH_MAMBA, "mamba" },
|
||||
{ LLM_ARCH_XVERSE, "xverse" },
|
||||
@@ -464,10 +466,12 @@ enum llm_tensor {
|
||||
LLM_TENSOR_ATTN_NORM,
|
||||
LLM_TENSOR_ATTN_NORM_2,
|
||||
LLM_TENSOR_ATTN_OUT_NORM,
|
||||
+ LLM_TENSOR_ATTN_POST_NORM,
|
||||
LLM_TENSOR_ATTN_ROT_EMBD,
|
||||
LLM_TENSOR_FFN_GATE_INP,
|
||||
LLM_TENSOR_FFN_GATE_INP_SHEXP,
|
||||
LLM_TENSOR_FFN_NORM,
|
||||
+ LLM_TENSOR_FFN_POST_NORM,
|
||||
LLM_TENSOR_FFN_GATE,
|
||||
LLM_TENSOR_FFN_DOWN,
|
||||
LLM_TENSOR_FFN_UP,
|
||||
@@ -960,6 +964,24 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
},
|
||||
},
|
||||
+ {
|
||||
+ LLM_ARCH_GEMMA2,
|
||||
+ {
|
||||
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
+ { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
|
||||
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
+ { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
|
||||
+ },
|
||||
+ },
|
||||
{
|
||||
LLM_ARCH_STARCODER2,
|
||||
{
|
||||
@@ -1941,6 +1963,8 @@ enum e_model {
|
||||
MODEL_8x22B,
|
||||
MODEL_16x12B,
|
||||
MODEL_10B_128x3_66B,
|
||||
+ MODEL_9B,
|
||||
+ MODEL_27B,
|
||||
};
|
||||
|
||||
static const size_t kiB = 1024;
|
||||
@@ -2114,6 +2138,7 @@ struct llama_layer {
|
||||
struct ggml_tensor * attn_out_norm_b;
|
||||
struct ggml_tensor * attn_q_a_norm;
|
||||
struct ggml_tensor * attn_kv_a_norm;
|
||||
+ struct ggml_tensor * attn_post_norm;
|
||||
|
||||
// attention
|
||||
struct ggml_tensor * wq;
|
||||
@@ -2136,6 +2161,7 @@ struct llama_layer {
|
||||
// normalization
|
||||
struct ggml_tensor * ffn_norm;
|
||||
struct ggml_tensor * ffn_norm_b;
|
||||
+ struct ggml_tensor * ffn_post_norm;
|
||||
struct ggml_tensor * layer_out_norm;
|
||||
struct ggml_tensor * layer_out_norm_b;
|
||||
struct ggml_tensor * ffn_norm_exps;
|
||||
@@ -4529,6 +4555,16 @@ static void llm_load_hparams(
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_GEMMA:
|
||||
+ {
|
||||
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
+
|
||||
+ switch (hparams.n_layer) {
|
||||
+ case 18: model.type = e_model::MODEL_9B; break;
|
||||
+ case 28: model.type = e_model::MODEL_27B; break;
|
||||
+ default: model.type = e_model::MODEL_UNKNOWN;
|
||||
+ }
|
||||
+ } break;
|
||||
+ case LLM_ARCH_GEMMA2:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
|
||||
@@ -6305,6 +6341,40 @@ static bool llm_load_tensors(
|
||||
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
||||
}
|
||||
} break;
|
||||
+ case LLM_ARCH_GEMMA2:
|
||||
+ {
|
||||
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
||||
+
|
||||
+ // output
|
||||
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
||||
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
|
||||
+
|
||||
+ const int64_t n_ff = hparams.n_ff;
|
||||
+ const int64_t n_embd_head_k = hparams.n_embd_head_k;
|
||||
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
||||
+ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
||||
+
|
||||
+ for (uint32_t i = 0; i < n_layer; ++i) {
|
||||
+ ggml_context * ctx_layer = ctx_for_layer(i);
|
||||
+ ggml_context * ctx_split = ctx_for_layer_split(i);
|
||||
+
|
||||
+ auto & layer = model.layers[i];
|
||||
+
|
||||
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
||||
+
|
||||
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * hparams.n_head});
|
||||
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
|
||||
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
|
||||
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * hparams.n_head, n_embd});
|
||||
+ layer.attn_post_norm = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd});
|
||||
+
|
||||
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
||||
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
||||
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
||||
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
||||
+ layer.ffn_post_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd});
|
||||
+ }
|
||||
+ } break;
|
||||
case LLM_ARCH_STARCODER2:
|
||||
{
|
||||
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
||||
@@ -10614,6 +10684,123 @@ struct llm_build_context {
|
||||
return gf;
|
||||
}
|
||||
|
||||
+ struct ggml_cgraph * build_gemma2() {
|
||||
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
+
|
||||
+ const int64_t n_embd_head_k = hparams.n_embd_head_k;
|
||||
+
|
||||
+ struct ggml_tensor * cur;
|
||||
+ struct ggml_tensor * inpL;
|
||||
+
|
||||
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
||||
+
|
||||
+ inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
|
||||
+ cb(inpL, "inp_scaled", -1);
|
||||
+
|
||||
+ // inp_pos - contains the positions
|
||||
+ struct ggml_tensor * inp_pos = build_inp_pos();
|
||||
+
|
||||
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
||||
+
|
||||
+ for (int il = 0; il < n_layer; ++il) {
|
||||
+ // norm
|
||||
+ cur = llm_build_norm(ctx0, inpL, hparams,
|
||||
+ model.layers[il].attn_norm, NULL,
|
||||
+ LLM_NORM_RMS, cb, il);
|
||||
+ cb(cur, "attn_norm", il);
|
||||
+
|
||||
+ // self-attention
|
||||
+ {
|
||||
+ // compute Q and K and RoPE them
|
||||
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
||||
+ cb(Qcur, "Qcur", il);
|
||||
+
|
||||
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
||||
+ cb(Kcur, "Kcur", il);
|
||||
+
|
||||
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
||||
+ cb(Vcur, "Vcur", il);
|
||||
+
|
||||
+ Qcur = ggml_rope_ext(
|
||||
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
|
||||
+ n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
+ ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
+ cb(Qcur, "Qcur", il);
|
||||
+
|
||||
+ Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
|
||||
+ cb(Qcur, "Qcur_scaled", il);
|
||||
+
|
||||
+ Kcur = ggml_rope_ext(
|
||||
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
|
||||
+ n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
+ ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
+ cb(Kcur, "Kcur", il);
|
||||
+
|
||||
+ cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
||||
+ model.layers[il].wo, NULL,
|
||||
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
||||
+ }
|
||||
+
|
||||
+ if (il == n_layer - 1) {
|
||||
+ // skip computing output for unused tokens
|
||||
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
||||
+ }
|
||||
+
|
||||
+ cur = llm_build_norm(ctx0, cur, hparams,
|
||||
+ model.layers[il].attn_post_norm, NULL,
|
||||
+ LLM_NORM_RMS, cb, il);
|
||||
+ cb(cur, "attn_post_norm", il);
|
||||
+
|
||||
+ struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
|
||||
+ cb(sa_out, "sa_out", il);
|
||||
+
|
||||
+ cur = llm_build_norm(ctx0, sa_out, hparams,
|
||||
+ model.layers[il].ffn_norm, NULL,
|
||||
+ LLM_NORM_RMS, cb, il);
|
||||
+ cb(cur, "ffn_norm", il);
|
||||
+
|
||||
+ // feed-forward network
|
||||
+ {
|
||||
+ cur = llm_build_ffn(ctx0, cur,
|
||||
+ model.layers[il].ffn_up, NULL,
|
||||
+ model.layers[il].ffn_gate, NULL,
|
||||
+ model.layers[il].ffn_down, NULL,
|
||||
+ NULL,
|
||||
+ LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
|
||||
+ cb(cur, "ffn_out", il);
|
||||
+ }
|
||||
+
|
||||
+ cur = llm_build_norm(ctx0, cur, hparams,
|
||||
+ model.layers[il].ffn_post_norm, NULL,
|
||||
+ LLM_NORM_RMS, cb, -1);
|
||||
+ cb(cur, "ffn_post_norm", -1);
|
||||
+
|
||||
+ cur = ggml_add(ctx0, cur, sa_out);
|
||||
+ cb(cur, "l_out", il);
|
||||
+
|
||||
+ // input for next layer
|
||||
+ inpL = cur;
|
||||
+ }
|
||||
+
|
||||
+ cur = inpL;
|
||||
+
|
||||
+ cur = llm_build_norm(ctx0, cur, hparams,
|
||||
+ model.output_norm, NULL,
|
||||
+ LLM_NORM_RMS, cb, -1);
|
||||
+ cb(cur, "result_norm", -1);
|
||||
+
|
||||
+ // lm_head
|
||||
+ cur = ggml_mul_mat(ctx0, model.output, cur);
|
||||
+ cb(cur, "result_output", -1);
|
||||
+
|
||||
+ ggml_build_forward_expand(gf, cur);
|
||||
+
|
||||
+ return gf;
|
||||
+ }
|
||||
+
|
||||
struct ggml_cgraph * build_starcoder2() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
|
||||
@@ -11847,6 +12034,10 @@ static struct ggml_cgraph * llama_build_graph(
|
||||
{
|
||||
result = llm.build_gemma();
|
||||
} break;
|
||||
+ case LLM_ARCH_GEMMA2:
|
||||
+ {
|
||||
+ result = llm.build_gemma2();
|
||||
+ } break;
|
||||
case LLM_ARCH_STARCODER2:
|
||||
{
|
||||
result = llm.build_starcoder2();
|
||||
@@ -16671,6 +16862,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
||||
case LLM_ARCH_PHI2:
|
||||
case LLM_ARCH_PHI3:
|
||||
case LLM_ARCH_GEMMA:
|
||||
+ case LLM_ARCH_GEMMA2:
|
||||
case LLM_ARCH_STARCODER2:
|
||||
case LLM_ARCH_GPTNEOX:
|
||||
return LLAMA_ROPE_TYPE_NEOX;
|
||||
@@ -18551,7 +18743,7 @@ static int32_t llama_chat_apply_template_internal(
|
||||
if (add_ass) {
|
||||
ss << "<s>assistant\n";
|
||||
}
|
||||
- } else if (tmpl == "gemma" || tmpl.find("<start_of_turn>") != std::string::npos) {
|
||||
+ } else if (tmpl == "gemma" || tmpl == "gemma2" || tmpl.find("<start_of_turn>") != std::string::npos) {
|
||||
// google/gemma-7b-it
|
||||
std::string system_prompt = "";
|
||||
for (auto message : chat) {
|
||||
--
|
||||
2.45.2
|
||||
|
||||
@@ -60,7 +60,12 @@ type llmServer struct {
|
||||
sem *semaphore.Weighted
|
||||
}
|
||||
|
||||
func LoadModel(model string) (*GGML, error) {
|
||||
// LoadModel will load a model from disk. The model must be in the GGML format.
|
||||
//
|
||||
// It collects array values for arrays with a size less than or equal to
|
||||
// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
|
||||
// the maxArraySize is negative, all arrays are collected.
|
||||
func LoadModel(model string, maxArraySize int) (*GGML, error) {
|
||||
if _, err := os.Stat(model); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -71,7 +76,7 @@ func LoadModel(model string) (*GGML, error) {
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
ggml, _, err := DecodeGGML(f)
|
||||
ggml, _, err := DecodeGGML(f, maxArraySize)
|
||||
return ggml, err
|
||||
}
|
||||
|
||||
@@ -81,7 +86,17 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
|
||||
var err error
|
||||
var cpuRunner string
|
||||
var estimate MemoryEstimate
|
||||
var systemMemory uint64
|
||||
var systemTotalMemory uint64
|
||||
var systemFreeMemory uint64
|
||||
|
||||
systemMemInfo, err := gpu.GetCPUMem()
|
||||
if err != nil {
|
||||
slog.Error("failed to lookup system memory", "error", err)
|
||||
} else {
|
||||
systemTotalMemory = systemMemInfo.TotalMemory
|
||||
systemFreeMemory = systemMemInfo.FreeMemory
|
||||
slog.Debug("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", systemFreeMemory)
|
||||
}
|
||||
|
||||
// If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info
|
||||
if opts.NumGPU == 0 {
|
||||
@@ -91,19 +106,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
|
||||
cpuRunner = serverForCpu()
|
||||
estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
|
||||
} else {
|
||||
if gpus[0].Library == "metal" {
|
||||
memInfo, err := gpu.GetCPUMem()
|
||||
if err != nil {
|
||||
slog.Error("failed to lookup system memory", "error", err)
|
||||
} else {
|
||||
systemMemory = memInfo.TotalMemory
|
||||
slog.Debug("system memory", "total", format.HumanBytes2(systemMemory))
|
||||
}
|
||||
}
|
||||
estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
|
||||
|
||||
switch {
|
||||
case gpus[0].Library == "metal" && estimate.VRAMSize > systemMemory:
|
||||
case gpus[0].Library == "metal" && estimate.VRAMSize > systemTotalMemory:
|
||||
// disable partial offloading when model is greater than total system memory as this
|
||||
// can lead to locking up the system
|
||||
opts.NumGPU = 0
|
||||
@@ -160,6 +166,8 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
|
||||
|
||||
params = append(params, "--log-disable")
|
||||
|
||||
params = append(params, "--timeout", fmt.Sprintf("%d", 600))
|
||||
|
||||
if opts.NumGPU >= 0 {
|
||||
params = append(params, "--n-gpu-layers", fmt.Sprintf("%d", opts.NumGPU))
|
||||
}
|
||||
@@ -211,7 +219,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
|
||||
}
|
||||
|
||||
// Windows CUDA should not use mmap for best performance
|
||||
if (runtime.GOOS == "windows" && gpus[0].Library == "cuda") || opts.UseMMap == api.TriStateFalse {
|
||||
// Linux with a model larger than free space, mmap leads to thrashing
|
||||
if (runtime.GOOS == "windows" && gpus[0].Library == "cuda" && opts.UseMMap == api.TriStateUndefined) ||
|
||||
(runtime.GOOS == "linux" && systemFreeMemory < estimate.TotalSize && opts.UseMMap == api.TriStateUndefined) ||
|
||||
opts.UseMMap == api.TriStateFalse {
|
||||
params = append(params, "--no-mmap")
|
||||
}
|
||||
|
||||
@@ -408,7 +419,7 @@ func projectorMemoryRequirements(filename string) uint64 {
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
ggml, _, err := DecodeGGML(file)
|
||||
ggml, _, err := DecodeGGML(file, 0)
|
||||
if err != nil {
|
||||
return 0
|
||||
}
|
||||
|
||||
@@ -279,7 +279,7 @@ if ! check_gpu nvidia-smi || [ -z "$(nvidia-smi | grep -o "CUDA Version: [0-9]*\
|
||||
case $OS_NAME in
|
||||
centos|rhel) install_cuda_driver_yum 'rhel' $(echo $OS_VERSION | cut -d '.' -f 1) ;;
|
||||
rocky) install_cuda_driver_yum 'rhel' $(echo $OS_VERSION | cut -c1) ;;
|
||||
fedora) [ $OS_VERSION -lt '37' ] && install_cuda_driver_yum $OS_NAME $OS_VERSION || install_cuda_driver_yum $OS_NAME '37';;
|
||||
fedora) [ $OS_VERSION -lt '39' ] && install_cuda_driver_yum $OS_NAME $OS_VERSION || install_cuda_driver_yum $OS_NAME '39';;
|
||||
amzn) install_cuda_driver_yum 'fedora' '37' ;;
|
||||
debian) install_cuda_driver_apt $OS_NAME $OS_VERSION ;;
|
||||
ubuntu) install_cuda_driver_apt $OS_NAME $(echo $OS_VERSION | sed 's/\.//') ;;
|
||||
|
||||
@@ -414,17 +414,22 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
|
||||
return err
|
||||
}
|
||||
|
||||
layers, err := parseFromFile(ctx, temp, "", fn)
|
||||
layer, err := NewLayer(temp, baseLayer.MediaType)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if len(layers) != 1 {
|
||||
return errors.New("quantization failed")
|
||||
if _, err := temp.Seek(0, io.SeekStart); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
baseLayer.Layer = layers[0].Layer
|
||||
baseLayer.GGML = layers[0].GGML
|
||||
ggml, _, err := llm.DecodeGGML(temp, 0)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
baseLayer.Layer = layer
|
||||
baseLayer.GGML = ggml
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -11,6 +11,7 @@ import (
|
||||
"net/http"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/convert"
|
||||
@@ -63,7 +64,7 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe
|
||||
}
|
||||
defer blob.Close()
|
||||
|
||||
ggml, _, err := llm.DecodeGGML(blob)
|
||||
ggml, _, err := llm.DecodeGGML(blob, 0)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -77,62 +78,80 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe
|
||||
return layers, nil
|
||||
}
|
||||
|
||||
func parseFromZipFile(_ context.Context, file *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
|
||||
func extractFromZipFile(p string, file *os.File, fn func(api.ProgressResponse)) error {
|
||||
stat, err := file.Stat()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
return err
|
||||
}
|
||||
|
||||
r, err := zip.NewReader(file, stat.Size())
|
||||
if err != nil {
|
||||
return nil, err
|
||||
return err
|
||||
}
|
||||
|
||||
tempdir, err := os.MkdirTemp(filepath.Dir(file.Name()), "")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer os.RemoveAll(tempdir)
|
||||
|
||||
fn(api.ProgressResponse{Status: "unpacking model metadata"})
|
||||
for _, f := range r.File {
|
||||
n := filepath.Join(p, f.Name)
|
||||
if !strings.HasPrefix(n, p) {
|
||||
slog.Warn("skipped extracting file outside of context", "name", f.Name)
|
||||
continue
|
||||
}
|
||||
|
||||
if err := os.MkdirAll(filepath.Dir(n), 0o750); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// TODO(mxyng): this should not write out all files to disk
|
||||
outfile, err := os.Create(filepath.Join(tempdir, f.Name))
|
||||
outfile, err := os.Create(n)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
return err
|
||||
}
|
||||
defer outfile.Close()
|
||||
|
||||
infile, err := f.Open()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
return err
|
||||
}
|
||||
defer infile.Close()
|
||||
|
||||
if _, err = io.Copy(outfile, infile); err != nil {
|
||||
return nil, err
|
||||
return err
|
||||
}
|
||||
|
||||
if err := outfile.Close(); err != nil {
|
||||
return nil, err
|
||||
return err
|
||||
}
|
||||
|
||||
if err := infile.Close(); err != nil {
|
||||
return nil, err
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
mf, err := convert.GetModelFormat(tempdir)
|
||||
return nil
|
||||
}
|
||||
|
||||
func parseFromZipFile(_ context.Context, file *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
|
||||
tempDir, err := os.MkdirTemp(filepath.Dir(file.Name()), "")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer os.RemoveAll(tempDir)
|
||||
|
||||
if err := extractFromZipFile(tempDir, file, fn); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
mf, err := convert.GetModelFormat(tempDir)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
params, err := mf.GetParams(tempdir)
|
||||
params, err := mf.GetParams(tempDir)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
mArch, err := mf.GetModelArch("", tempdir, params)
|
||||
mArch, err := mf.GetModelArch("", tempDir, params)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -150,7 +169,7 @@ func parseFromZipFile(_ context.Context, file *os.File, digest string, fn func(a
|
||||
|
||||
// TODO(mxyng): this should write directly into a layer
|
||||
// e.g. NewLayer(arch.Reader(), "application/vnd.ollama.image.model")
|
||||
temp, err := os.CreateTemp(tempdir, "fp16")
|
||||
temp, err := os.CreateTemp(tempDir, "fp16")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -176,7 +195,7 @@ func parseFromZipFile(_ context.Context, file *os.File, digest string, fn func(a
|
||||
}
|
||||
defer bin.Close()
|
||||
|
||||
ggml, _, err := llm.DecodeGGML(bin)
|
||||
ggml, _, err := llm.DecodeGGML(bin, 0)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -210,7 +229,7 @@ func parseFromFile(ctx context.Context, file *os.File, digest string, fn func(ap
|
||||
|
||||
var offset int64
|
||||
for offset < stat.Size() {
|
||||
ggml, n, err := llm.DecodeGGML(file)
|
||||
ggml, n, err := llm.DecodeGGML(file, 0)
|
||||
if errors.Is(err, io.EOF) {
|
||||
break
|
||||
} else if err != nil {
|
||||
|
||||
92
server/model_test.go
Normal file
92
server/model_test.go
Normal file
@@ -0,0 +1,92 @@
|
||||
package server
|
||||
|
||||
import (
|
||||
"archive/zip"
|
||||
"bytes"
|
||||
"io"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"slices"
|
||||
"testing"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
)
|
||||
|
||||
func createZipFile(t *testing.T, name string) *os.File {
|
||||
t.Helper()
|
||||
|
||||
f, err := os.CreateTemp(t.TempDir(), "")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
zf := zip.NewWriter(f)
|
||||
defer zf.Close()
|
||||
|
||||
zh, err := zf.CreateHeader(&zip.FileHeader{Name: name})
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, err := io.Copy(zh, bytes.NewReader([]byte(""))); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
func TestExtractFromZipFile(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
expect []string
|
||||
}{
|
||||
{
|
||||
name: "good",
|
||||
expect: []string{"good"},
|
||||
},
|
||||
{
|
||||
name: filepath.Join("..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "bad"),
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range cases {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
f := createZipFile(t, tt.name)
|
||||
defer f.Close()
|
||||
|
||||
tempDir := t.TempDir()
|
||||
if err := extractFromZipFile(tempDir, f, func(api.ProgressResponse) {}); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
var matches []string
|
||||
if err := filepath.Walk(tempDir, func(p string, fi os.FileInfo, err error) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if !fi.IsDir() {
|
||||
matches = append(matches, p)
|
||||
}
|
||||
|
||||
return nil
|
||||
}); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
var actual []string
|
||||
for _, match := range matches {
|
||||
rel, err := filepath.Rel(tempDir, match)
|
||||
if err != nil {
|
||||
t.Error(err)
|
||||
}
|
||||
|
||||
actual = append(actual, rel)
|
||||
}
|
||||
|
||||
if !slices.Equal(actual, tt.expect) {
|
||||
t.Fatalf("expected %d files, got %d", len(tt.expect), len(matches))
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -754,7 +754,11 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
|
||||
}
|
||||
|
||||
func getKVData(digest string, verbose bool) (llm.KV, error) {
|
||||
kvData, err := llm.LoadModel(digest)
|
||||
maxArraySize := 0
|
||||
if verbose {
|
||||
maxArraySize = -1
|
||||
}
|
||||
kvData, err := llm.LoadModel(digest, maxArraySize)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -1101,11 +1105,20 @@ func Serve(ln net.Listener) error {
|
||||
schedCtx, schedDone := context.WithCancel(ctx)
|
||||
sched := InitScheduler(schedCtx)
|
||||
s := &Server{addr: ln.Addr(), sched: sched}
|
||||
r := s.GenerateRoutes()
|
||||
|
||||
http.Handle("/", s.GenerateRoutes())
|
||||
|
||||
slog.Info(fmt.Sprintf("Listening on %s (version %s)", ln.Addr(), version.Version))
|
||||
srvr := &http.Server{
|
||||
Handler: r,
|
||||
// Use http.DefaultServeMux so we get net/http/pprof for
|
||||
// free.
|
||||
//
|
||||
// TODO(bmizerany): Decide if we want to make this
|
||||
// configurable so it is not exposed by default, or allow
|
||||
// users to bind it to a different port. This was a quick
|
||||
// and easy way to get pprof, but it may not be the best
|
||||
// way.
|
||||
Handler: nil,
|
||||
}
|
||||
|
||||
// listen for a ctrl+c and stop any loaded llm
|
||||
|
||||
@@ -144,7 +144,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
|
||||
}
|
||||
|
||||
// Load model for fitting
|
||||
ggml, err := llm.LoadModel(pending.model.ModelPath)
|
||||
ggml, err := llm.LoadModel(pending.model.ModelPath, 0)
|
||||
if err != nil {
|
||||
pending.errCh <- err
|
||||
break
|
||||
|
||||
@@ -128,14 +128,14 @@ func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedV
|
||||
"tokenizer.ggml.scores": []float32{0},
|
||||
"tokenizer.ggml.token_type": []int32{0},
|
||||
}, []llm.Tensor{
|
||||
{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
|
||||
{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
|
||||
{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
|
||||
{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
fname := f.Name()
|
||||
model := &Model{Name: modelName, ModelPath: fname}
|
||||
scenario.ggml, err = llm.LoadModel(model.ModelPath)
|
||||
scenario.ggml, err = llm.LoadModel(model.ModelPath, 0)
|
||||
require.NoError(t, err)
|
||||
|
||||
scenario.req = &LlmRequest{
|
||||
|
||||
34
util/bufioutil/buffer_seeker.go
Normal file
34
util/bufioutil/buffer_seeker.go
Normal file
@@ -0,0 +1,34 @@
|
||||
package bufioutil
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"io"
|
||||
)
|
||||
|
||||
type BufferedSeeker struct {
|
||||
rs io.ReadSeeker
|
||||
br *bufio.Reader
|
||||
}
|
||||
|
||||
func NewBufferedSeeker(rs io.ReadSeeker, size int) *BufferedSeeker {
|
||||
return &BufferedSeeker{
|
||||
rs: rs,
|
||||
br: bufio.NewReaderSize(rs, size),
|
||||
}
|
||||
}
|
||||
|
||||
func (b *BufferedSeeker) Read(p []byte) (int, error) {
|
||||
return b.br.Read(p)
|
||||
}
|
||||
|
||||
func (b *BufferedSeeker) Seek(offset int64, whence int) (int64, error) {
|
||||
if whence == io.SeekCurrent {
|
||||
offset -= int64(b.br.Buffered())
|
||||
}
|
||||
n, err := b.rs.Seek(offset, whence)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
b.br.Reset(b.rs)
|
||||
return n, nil
|
||||
}
|
||||
64
util/bufioutil/buffer_seeker_test.go
Normal file
64
util/bufioutil/buffer_seeker_test.go
Normal file
@@ -0,0 +1,64 @@
|
||||
package bufioutil
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"io"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestBufferedSeeker(t *testing.T) {
|
||||
const alphabet = "abcdefghijklmnopqrstuvwxyz"
|
||||
|
||||
bs := NewBufferedSeeker(strings.NewReader(alphabet), 0) // minReadBufferSize = 16
|
||||
|
||||
checkRead := func(buf []byte, expected string) {
|
||||
t.Helper()
|
||||
_, err := bs.Read(buf)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if !bytes.Equal(buf, []byte(expected)) {
|
||||
t.Fatalf("expected %s, got %s", expected, buf)
|
||||
}
|
||||
}
|
||||
|
||||
// Read the first 5 bytes
|
||||
buf := make([]byte, 5)
|
||||
|
||||
checkRead(buf, "abcde")
|
||||
|
||||
// Seek back to the beginning
|
||||
_, err := bs.Seek(0, io.SeekStart)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// read 'a'
|
||||
checkRead(buf[:1], "a")
|
||||
|
||||
if bs.br.Buffered() == 0 {
|
||||
t.Fatalf("totally unexpected sanity check failed")
|
||||
}
|
||||
|
||||
// Seek past 'b'
|
||||
_, err = bs.Seek(1, io.SeekCurrent)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
checkRead(buf, "cdefg")
|
||||
|
||||
// Seek back to the beginning
|
||||
_, err = bs.Seek(0, io.SeekStart)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
checkRead(buf, "abcde")
|
||||
|
||||
// Seek to the end
|
||||
_, err = bs.Seek(-5, io.SeekEnd)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
checkRead(buf, "vwxyz")
|
||||
}
|
||||
Reference in New Issue
Block a user