mirror of
https://github.com/ollama/ollama.git
synced 2026-02-28 04:56:37 -05:00
Compare commits
4 Commits
v0.17.4
...
dongchen/r
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
241ac6e003 | ||
|
|
17dc6918cf | ||
|
|
dd5eb6337d | ||
|
|
79917cf80b |
14
api/types.go
14
api/types.go
@@ -15,6 +15,7 @@ import (
|
||||
"github.com/google/uuid"
|
||||
|
||||
"github.com/ollama/ollama/envconfig"
|
||||
"github.com/ollama/ollama/format"
|
||||
"github.com/ollama/ollama/internal/orderedmap"
|
||||
"github.com/ollama/ollama/types/model"
|
||||
)
|
||||
@@ -569,6 +570,7 @@ type DebugInfo struct {
|
||||
|
||||
type Metrics struct {
|
||||
TotalDuration time.Duration `json:"total_duration,omitempty"`
|
||||
PeakMemory uint64 `json:"peak_memory,omitempty"`
|
||||
LoadDuration time.Duration `json:"load_duration,omitempty"`
|
||||
PromptEvalCount int `json:"prompt_eval_count,omitempty"`
|
||||
PromptEvalDuration time.Duration `json:"prompt_eval_duration,omitempty"`
|
||||
@@ -934,6 +936,10 @@ func (m *Metrics) Summary() {
|
||||
fmt.Fprintf(os.Stderr, "total duration: %v\n", m.TotalDuration)
|
||||
}
|
||||
|
||||
if m.PeakMemory > 0 {
|
||||
fmt.Fprintf(os.Stderr, "peak memory: %s\n", formatPeakMemory(m.PeakMemory))
|
||||
}
|
||||
|
||||
if m.LoadDuration > 0 {
|
||||
fmt.Fprintf(os.Stderr, "load duration: %v\n", m.LoadDuration)
|
||||
}
|
||||
@@ -957,6 +963,14 @@ func (m *Metrics) Summary() {
|
||||
}
|
||||
}
|
||||
|
||||
func formatPeakMemory(b uint64) string {
|
||||
if b >= format.GibiByte {
|
||||
return fmt.Sprintf("%.3f GiB", float64(b)/float64(format.GibiByte))
|
||||
}
|
||||
|
||||
return format.HumanBytes2(b)
|
||||
}
|
||||
|
||||
func (opts *Options) FromMap(m map[string]any) error {
|
||||
valueOpts := reflect.ValueOf(opts).Elem() // names of the fields in the options struct
|
||||
typeOpts := reflect.TypeOf(opts).Elem() // types of the fields in the options struct
|
||||
|
||||
@@ -1518,6 +1518,7 @@ type CompletionResponse struct {
|
||||
PromptEvalDuration time.Duration `json:"prompt_eval_duration"`
|
||||
EvalCount int `json:"eval_count"`
|
||||
EvalDuration time.Duration `json:"eval_duration"`
|
||||
PeakMemory uint64 `json:"peak_memory,omitempty"`
|
||||
|
||||
// Logprobs contains log probability information if requested
|
||||
Logprobs []Logprob `json:"logprobs,omitempty"`
|
||||
|
||||
@@ -557,6 +557,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
|
||||
PromptEvalDuration: cr.PromptEvalDuration,
|
||||
EvalCount: cr.EvalCount,
|
||||
EvalDuration: cr.EvalDuration,
|
||||
PeakMemory: cr.PeakMemory,
|
||||
},
|
||||
Logprobs: toAPILogprobs(cr.Logprobs),
|
||||
}
|
||||
@@ -2309,6 +2310,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
|
||||
PromptEvalDuration: r.PromptEvalDuration,
|
||||
EvalCount: r.EvalCount,
|
||||
EvalDuration: r.EvalDuration,
|
||||
PeakMemory: r.PeakMemory,
|
||||
},
|
||||
Logprobs: toAPILogprobs(r.Logprobs),
|
||||
}
|
||||
|
||||
@@ -150,6 +150,14 @@ func (u *uploader) uploadOnce(ctx context.Context, blob Blob) (int64, error) {
|
||||
u.logger.Debug("uploading blob", "digest", blob.Digest, "size", blob.Size)
|
||||
}
|
||||
|
||||
// Check if blob already exists (may have been uploaded by another
|
||||
// concurrent process since the initial HEAD check)
|
||||
if exists, err := u.exists(ctx, blob); err != nil {
|
||||
return 0, err
|
||||
} else if exists {
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
// Init upload
|
||||
uploadURL, err := u.initUpload(ctx, blob)
|
||||
if err != nil {
|
||||
|
||||
@@ -78,6 +78,12 @@ func (c *kvCache) findRemaining(tokens []int32) []int32 {
|
||||
prefix++
|
||||
}
|
||||
|
||||
// Always keep at least one token to re-evaluate so the
|
||||
// pipeline can seed token generation from it.
|
||||
if prefix == len(tokens) && prefix > 0 {
|
||||
prefix--
|
||||
}
|
||||
|
||||
if prefix < len(c.tokens) {
|
||||
trim := len(c.tokens) - prefix
|
||||
for _, kv := range c.caches {
|
||||
|
||||
@@ -268,6 +268,7 @@ func (c *Client) Completion(ctx context.Context, req llm.CompletionRequest, fn f
|
||||
PromptEvalDuration int `json:"prompt_eval_duration,omitempty"`
|
||||
EvalCount int `json:"eval_count,omitempty"`
|
||||
EvalDuration int `json:"eval_duration,omitempty"`
|
||||
PeakMemory uint64 `json:"peak_memory,omitempty"`
|
||||
}
|
||||
if err := json.Unmarshal(scanner.Bytes(), &raw); err != nil {
|
||||
slog.Debug("mlx response parse error", "error", err, "line", string(scanner.Bytes()))
|
||||
@@ -282,6 +283,7 @@ func (c *Client) Completion(ctx context.Context, req llm.CompletionRequest, fn f
|
||||
PromptEvalDuration: time.Duration(raw.PromptEvalDuration),
|
||||
EvalCount: raw.EvalCount,
|
||||
EvalDuration: time.Duration(raw.EvalDuration),
|
||||
PeakMemory: raw.PeakMemory,
|
||||
}
|
||||
|
||||
fn(cresp)
|
||||
|
||||
@@ -64,6 +64,10 @@ func PeakMemory() int {
|
||||
return int(peak)
|
||||
}
|
||||
|
||||
// ResetPeakMemory calls the MLX C function mlx_reset_peak_memory.
// NOTE(review): presumably this clears the peak-memory counter that
// PeakMemory reports — confirm against the MLX C API.
func ResetPeakMemory() {
|
||||
C.mlx_reset_peak_memory()
|
||||
}
|
||||
|
||||
// Memory is a field-less type that carries a LogValue method returning a
// slog.Value. NOTE(review): presumably used to log MLX memory statistics —
// the method body is not visible here; confirm.
type Memory struct{}
|
||||
|
||||
func (Memory) LogValue() slog.Value {
|
||||
|
||||
@@ -44,8 +44,13 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
|
||||
} else {
|
||||
mlx.DisableCompile()
|
||||
}
|
||||
mlx.ResetPeakMemory()
|
||||
|
||||
inputs := r.Tokenizer.Encode(request.Prompt, true)
|
||||
if len(inputs) == 0 {
|
||||
return errors.New("empty prompt")
|
||||
}
|
||||
|
||||
session := r.cache.begin(r.Model, inputs)
|
||||
defer session.close()
|
||||
|
||||
@@ -53,7 +58,6 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
|
||||
tokens := session.remaining
|
||||
|
||||
total, processed := len(tokens), 0
|
||||
slog.Info("Prompt processing progress", "processed", processed, "total", total)
|
||||
for total-processed > 1 {
|
||||
if err := request.Ctx.Err(); err != nil {
|
||||
return err
|
||||
@@ -103,7 +107,6 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
|
||||
nextSample, nextLogprobs = step(sample)
|
||||
|
||||
if i == 0 {
|
||||
slog.Info("Prompt processing progress", "processed", total, "total", total)
|
||||
mlx.Eval(sample)
|
||||
final.PromptTokensDuration = time.Since(now)
|
||||
now = time.Now()
|
||||
@@ -138,6 +141,7 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
|
||||
}
|
||||
|
||||
final.CompletionTokensDuration = time.Since(now)
|
||||
final.PeakMemory = uint64(mlx.PeakMemory())
|
||||
select {
|
||||
case <-request.Ctx.Done():
|
||||
return request.Ctx.Err()
|
||||
|
||||
@@ -54,6 +54,7 @@ type Response struct {
|
||||
PromptTokensDuration time.Duration `json:"prompt_eval_duration,omitempty"`
|
||||
CompletionTokens int `json:"eval_count,omitempty"`
|
||||
CompletionTokensDuration time.Duration `json:"eval_duration,omitempty"`
|
||||
PeakMemory uint64 `json:"peak_memory,omitempty"`
|
||||
TotalTokens int `json:"total_tokens,omitempty"`
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user