mlxrunner: Fix panic on full KV cache hit

When the entire prompt was already cached (e.g. repeated prompt), findRemaining returned an empty slice, causing FromValues to panic on an index-out-of-range accessing a zero-length byte slice. Fix by always keeping at least one token to re-evaluate so the pipeline can seed token generation. Also reject empty prompts early rather than panicking.
2026-02-26 03:57:15 -05:00 · 2026-02-25 17:03:57 -08:00
2 changed files with 10 additions and 2 deletions
--- a/x/mlxrunner/cache.go
+++ b/x/mlxrunner/cache.go
@@ -78,6 +78,12 @@ func (c *kvCache) findRemaining(tokens []int32) []int32 {
 		prefix++
 	}

+	// Always keep at least one token to re-evaluate so the
+	// pipeline can seed token generation from it.
+	if prefix == len(tokens) && prefix > 0 {
+		prefix--
+	}
+
 	if prefix < len(c.tokens) {
 		trim := len(c.tokens) - prefix
 		for _, kv := range c.caches {
--- a/x/mlxrunner/pipeline.go
+++ b/x/mlxrunner/pipeline.go
@@ -46,6 +46,10 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
 	}

 	inputs := r.Tokenizer.Encode(request.Prompt, true)
+	if len(inputs) == 0 {
+		return errors.New("empty prompt")
+	}
+
 	session := r.cache.begin(r.Model, inputs)
 	defer session.close()

@@ -53,7 +57,6 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
 	tokens := session.remaining

 	total, processed := len(tokens), 0
-	slog.Info("Prompt processing progress", "processed", processed, "total", total)
 	for total-processed > 1 {
 		if err := request.Ctx.Err(); err != nil {
 			return err
@@ -103,7 +106,6 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
 		nextSample, nextLogprobs = step(sample)

 		if i == 0 {
-			slog.Info("Prompt processing progress", "processed", total, "total", total)
 			mlx.Eval(sample)
 			final.PromptTokensDuration = time.Since(now)
 			now = time.Now()