Compare commits

...

1 Commits

Author SHA1 Message Date
Jesse Gross
7b82e60254 mlxrunner: Fix panic on full KV cache hit
When the entire prompt was already cached (e.g. repeated prompt),
findRemaining returned an empty slice, causing FromValues to panic
on an index-out-of-range accessing a zero-length byte slice.

Fix by always keeping at least one token to re-evaluate so the
pipeline can seed token generation. Also reject empty prompts
early rather than panicking.
2026-02-25 17:03:57 -08:00
2 changed files with 10 additions and 2 deletions

View File

@@ -78,6 +78,12 @@ func (c *kvCache) findRemaining(tokens []int32) []int32 {
prefix++
}
// Always keep at least one token to re-evaluate so the
// pipeline can seed token generation from it.
if prefix == len(tokens) && prefix > 0 {
prefix--
}
if prefix < len(c.tokens) {
trim := len(c.tokens) - prefix
for _, kv := range c.caches {

View File

@@ -46,6 +46,10 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
}
inputs := r.Tokenizer.Encode(request.Prompt, true)
if len(inputs) == 0 {
return errors.New("empty prompt")
}
session := r.cache.begin(r.Model, inputs)
defer session.close()
@@ -53,7 +57,6 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
tokens := session.remaining
total, processed := len(tokens), 0
slog.Info("Prompt processing progress", "processed", processed, "total", total)
for total-processed > 1 {
if err := request.Ctx.Err(); err != nil {
return err
@@ -103,7 +106,6 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
nextSample, nextLogprobs = step(sample)
if i == 0 {
slog.Info("Prompt processing progress", "processed", total, "total", total)
mlx.Eval(sample)
final.PromptTokensDuration = time.Since(now)
now = time.Now()