mlxrunner: Fix panic on full KV cache hit

When the entire prompt was already cached (e.g. a repeated prompt), findRemaining returned an empty slice, and FromValues then panicked with an index-out-of-range error while accessing the resulting zero-length byte slice. Fix this by always keeping at least one token to re-evaluate so the pipeline can seed token generation. Also reject empty prompts early with an error rather than panicking.
2026-02-27 12:36:54 -05:00 · 2026-02-25 17:03:57 -08:00
8 changed files with 24 additions and 138 deletions
--- a/app/cmd/app/app.go
+++ b/app/cmd/app/app.go
@@ -296,15 +296,8 @@ func main() {

 	// Check for pending updates on startup (show tray notification if update is ready)
 	if updater.IsUpdatePending() {
-		// On Windows, the tray is initialized in osRun(). Calling UpdateAvailable
-		// before that would dereference a nil tray callback.
-		// TODO: refactor so the update check runs after platform init on all platforms.
-		if runtime.GOOS == "windows" {
-			slog.Debug("update pending on startup, deferring tray notification until tray initialization")
-		} else {
-			slog.Debug("update pending on startup, showing tray notification")
-			UpdateAvailable("")
-		}
+		slog.Debug("update pending on startup, showing tray notification")
+		UpdateAvailable("")
 	}

 	hasCompletedFirstRun, err := st.HasCompletedFirstRun()
--- a/app/cmd/app/app_windows.go
+++ b/app/cmd/app/app_windows.go
@@ -154,10 +154,6 @@ func handleURLSchemeRequest(urlScheme string) {
 }

 func UpdateAvailable(ver string) error {
-	if app.t == nil {
-		slog.Debug("tray not yet initialized, skipping update notification")
-		return nil
-	}
 	return app.t.UpdateAvailable(ver)
 }

@@ -169,14 +165,6 @@ func osRun(shutdown func(), hasCompletedFirstRun, startHidden bool) {
 		log.Fatalf("Failed to start: %s", err)
 	}

-	// Check for pending updates now that the tray is initialized.
-	// The platform-independent check in app.go fires before osRun,
-	// when app.t is still nil, so we must re-check here.
-	if updater.IsUpdatePending() {
-		slog.Debug("update pending on startup, showing tray notification")
-		UpdateAvailable("")
-	}
-
 	signals := make(chan os.Signal, 1)
 	signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM)

--- a/model/parsers/qwen3.go
+++ b/model/parsers/qwen3.go
@@ -204,24 +204,6 @@ func (p *Qwen3Parser) eat() ([]qwen3Event, bool) {
 			p.maybeThinkingOpenAtBOL = false
 		}

-		thinkingCloseIdx := strings.Index(acc, qwen3ThinkingCloseTag)
-		toolOpenIdx := strings.Index(acc, qwen3ToolOpenTag)
-
-		// If a tool call starts before </think>, treat that as the end of thinking
-		// for parsing purposes and continue in tool-call mode.
-		if toolOpenIdx != -1 && (thinkingCloseIdx == -1 || toolOpenIdx < thinkingCloseIdx) {
-			before, after := p.splitAtTag(qwen3ToolOpenTag, true)
-			if len(before) > 0 {
-				events = append(events, qwen3EventThinkingContent{content: before})
-			}
-			if after == "" {
-				p.state = qwen3ParserStateToolStartedEatingWhitespace
-			} else {
-				p.state = qwen3ParserStateCollectingToolContent
-			}
-			return events, true
-		}
-
 		if strings.Contains(acc, qwen3ThinkingCloseTag) {
 			thinking, remaining := p.splitAtTag(qwen3ThinkingCloseTag, true)
 			if len(thinking) > 0 {
@@ -233,7 +215,7 @@ func (p *Qwen3Parser) eat() ([]qwen3Event, bool) {
 				p.state = qwen3ParserStateCollectingContent
 			}
 			return events, true
-		} else if overlapLen := max(overlap(acc, qwen3ThinkingCloseTag), overlap(acc, qwen3ToolOpenTag)); overlapLen > 0 {
+		} else if overlapLen := overlap(acc, qwen3ThinkingCloseTag); overlapLen > 0 {
 			beforePartialTag := acc[:len(acc)-overlapLen]
 			trailingWsLen := trailingWhitespaceLen(beforePartialTag)
 			ambiguousStart := len(beforePartialTag) - trailingWsLen
--- a/model/parsers/qwen3_test.go
+++ b/model/parsers/qwen3_test.go
@@ -146,68 +146,6 @@ func TestQwen3ParserToolCall(t *testing.T) {
 	}
 }

-func TestQwen3ParserThinkingWithToolCallBeforeThinkingClose(t *testing.T) {
-	parser := &Qwen3Parser{hasThinkingSupport: true, defaultThinking: true}
-	parser.Init(nil, nil, &api.ThinkValue{Value: true})
-
-	input := "Let me think<tool_call>{\"name\":\"get_weather\",\"arguments\":{\"location\":\"San Francisco\",\"unit\":\"celsius\"}}</tool_call>"
-	content, thinking, calls, err := parser.Add(input, true)
-	if err != nil {
-		t.Fatalf("parse failed: %v", err)
-	}
-
-	if content != "" {
-		t.Fatalf("expected empty content, got %q", content)
-	}
-	if thinking != "Let me think" {
-		t.Fatalf("expected thinking %q, got %q", "Let me think", thinking)
-	}
-	if len(calls) != 1 {
-		t.Fatalf("expected 1 tool call, got %d", len(calls))
-	}
-	if calls[0].Function.Name != "get_weather" {
-		t.Fatalf("expected tool name %q, got %q", "get_weather", calls[0].Function.Name)
-	}
-}
-
-func TestQwen3ParserThinkingWithSplitToolOpenTag(t *testing.T) {
-	parser := &Qwen3Parser{hasThinkingSupport: true, defaultThinking: true}
-	parser.Init(nil, nil, &api.ThinkValue{Value: true})
-
-	content, thinking, calls, err := parser.Add("Let me think<tool_ca", false)
-	if err != nil {
-		t.Fatalf("parse failed on first chunk: %v", err)
-	}
-	if content != "" || thinking != "Let me think" || len(calls) != 0 {
-		t.Fatalf(
-			"expected content=%q thinking=%q calls=%d, got content=%q thinking=%q calls=%d",
-			"",
-			"Let me think",
-			0,
-			content,
-			thinking,
-			len(calls),
-		)
-	}
-
-	content, thinking, calls, err = parser.Add("ll>{\"name\":\"get_weather\",\"arguments\":{\"location\":\"SF\"}}</tool_call>", true)
-	if err != nil {
-		t.Fatalf("parse failed on second chunk: %v", err)
-	}
-	if content != "" {
-		t.Fatalf("expected empty content, got %q", content)
-	}
-	if thinking != "" {
-		t.Fatalf("expected no additional thinking on second chunk, got %q", thinking)
-	}
-	if len(calls) != 1 {
-		t.Fatalf("expected 1 tool call, got %d", len(calls))
-	}
-	if calls[0].Function.Name != "get_weather" {
-		t.Fatalf("expected tool name %q, got %q", "get_weather", calls[0].Function.Name)
-	}
-}
-
 func TestQwen35ParserRespectsNoThink(t *testing.T) {
 	parser := ParserForName("qwen3.5")
 	if parser == nil {
--- a/model/parsers/qwen3vl.go
+++ b/model/parsers/qwen3vl.go
@@ -180,22 +180,7 @@ func (p *Qwen3VLParser) eat() ([]qwenEvent, bool) {
 			return events, false
 		}
 	case CollectingThinkingContent:
-		acc := p.buffer.String()
-		thinkingCloseIdx := strings.Index(acc, thinkingCloseTag)
-		toolOpenIdx := strings.Index(acc, toolOpenTag)
-
-		// If a tool call starts before </think>, treat that as the end of thinking
-		// for parsing purposes and continue in tool-call mode.
-		if toolOpenIdx != -1 && (thinkingCloseIdx == -1 || toolOpenIdx < thinkingCloseIdx) {
-			before, _ := splitAtTag(&p.buffer, toolOpenTag, false)
-			if len(before) > 0 {
-				events = append(events, qwenEventThinkingContent{content: before})
-			}
-			p.state = CollectingToolContent
-			return events, true
-		}
-
-		if strings.Contains(acc, thinkingCloseTag) {
+		if strings.Contains(p.buffer.String(), thinkingCloseTag) {
 			thinking, remaining := splitAtTag(&p.buffer, thinkingCloseTag, true)
 			if len(thinking) > 0 {
 				events = append(events, qwenEventThinkingContent{content: thinking})
@@ -206,13 +191,13 @@ func (p *Qwen3VLParser) eat() ([]qwenEvent, bool) {
 				p.state = CollectingContent
 			}
 			return events, true
-		} else if overlapLen := max(overlap(acc, thinkingCloseTag), overlap(acc, toolOpenTag)); overlapLen > 0 {
-			beforePartialTag := acc[:len(acc)-overlapLen]
+		} else if overlapLen := overlap(p.buffer.String(), thinkingCloseTag); overlapLen > 0 {
+			beforePartialTag := p.buffer.String()[:len(p.buffer.String())-overlapLen]
 			trailingWhitespaceLen := trailingWhitespaceLen(beforePartialTag)
 			ambiguousStart := len(beforePartialTag) - trailingWhitespaceLen

-			unambiguous := acc[:ambiguousStart]
-			ambiguous := acc[ambiguousStart:]
+			unambiguous := p.buffer.String()[:ambiguousStart]
+			ambiguous := p.buffer.String()[ambiguousStart:]
 			p.buffer.Reset()
 			p.buffer.WriteString(ambiguous)
 			if len(unambiguous) > 0 {
@@ -220,11 +205,11 @@ func (p *Qwen3VLParser) eat() ([]qwenEvent, bool) {
 			}
 			return events, false
 		} else {
-			whitespaceLen := trailingWhitespaceLen(acc)
-			ambiguousStart := len(acc) - whitespaceLen
+			whitespaceLen := trailingWhitespaceLen(p.buffer.String())
+			ambiguousStart := len(p.buffer.String()) - whitespaceLen

-			unambiguous := acc[:ambiguousStart]
-			ambiguous := acc[ambiguousStart:]
+			unambiguous := p.buffer.String()[:ambiguousStart]
+			ambiguous := p.buffer.String()[ambiguousStart:]
 			p.buffer.Reset()
 			p.buffer.WriteString(ambiguous)
 			if len(unambiguous) > 0 {
--- a/model/parsers/qwen3vl_thinking_test.go
+++ b/model/parsers/qwen3vl_thinking_test.go
@@ -98,12 +98,8 @@ func TestQwen3VLThinkingParserStreaming(t *testing.T) {
 			desc: "nested thinking and tool call (outside thinking, inside tool call)",
 			steps: []step{
 				{
-					input: "I'm thinking<tool_call>I'm nested tool call</tool_call></think>",
-					wantEvents: []qwenEvent{
-						qwenEventThinkingContent{content: "I'm thinking"},
-						qwenEventRawToolCall{raw: "I'm nested tool call"},
-						qwenEventContent{content: "</think>"},
-					},
+					input:      "I'm thinking<tool_call>I'm nested tool call</tool_call></think>",
+					wantEvents: []qwenEvent{qwenEventThinkingContent{content: "I'm thinking<tool_call>I'm nested tool call</tool_call>"}},
 				},
 			},
 		},
@@ -113,7 +109,8 @@ func TestQwen3VLThinkingParserStreaming(t *testing.T) {
 				{
 					input: "<tool_call>I'm nested tool call<think>I'm thinking</think></tool_call>",
 					wantEvents: []qwenEvent{
-						qwenEventRawToolCall{raw: "I'm nested tool call<think>I'm thinking</think>"},
+						qwenEventThinkingContent{content: "<tool_call>I'm nested tool call<think>I'm thinking"},
+						qwenEventContent{content: "</tool_call>"},
 					},
 				},
 			},
@@ -124,8 +121,8 @@ func TestQwen3VLThinkingParserStreaming(t *testing.T) {
 				{
 					input: "I'm thinking<tool_call>I'm NOT a nested tool call</think></tool_call><tool_call>I'm nested tool call 2<think></tool_call></think>",
 					wantEvents: []qwenEvent{
-						qwenEventThinkingContent{content: "I'm thinking"},
-						qwenEventRawToolCall{raw: "I'm NOT a nested tool call</think>"},
+						qwenEventThinkingContent{content: "I'm thinking<tool_call>I'm NOT a nested tool call"},
+						qwenEventContent{content: "</tool_call>"},
 						qwenEventRawToolCall{raw: "I'm nested tool call 2<think>"},
 						qwenEventContent{content: "</think>"},
 					},
--- a/x/mlxrunner/cache.go
+++ b/x/mlxrunner/cache.go
@@ -78,8 +78,9 @@ func (c *kvCache) findRemaining(tokens []int32) []int32 {
 		prefix++
 	}

+	// Always keep at least one token to re-evaluate so the
+	// pipeline can seed token generation from it.
 	if prefix == len(tokens) && prefix > 0 {
-		// Leave one token to run through the model so we can sample a response.
 		prefix--
 	}

--- a/x/mlxrunner/pipeline.go
+++ b/x/mlxrunner/pipeline.go
@@ -46,6 +46,10 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
 	}

 	inputs := r.Tokenizer.Encode(request.Prompt, true)
+	if len(inputs) == 0 {
+		return errors.New("empty prompt")
+	}
+
 	session := r.cache.begin(r.Model, inputs)
 	defer session.close()

@@ -53,7 +57,6 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
 	tokens := session.remaining

 	total, processed := len(tokens), 0
-	slog.Info("Prompt processing progress", "processed", processed, "total", total)
 	for total-processed > 1 {
 		if err := request.Ctx.Err(); err != nil {
 			return err
@@ -103,7 +106,6 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
 		nextSample, nextLogprobs = step(sample)

 		if i == 0 {
-			slog.Info("Prompt processing progress", "processed", total, "total", total)
 			mlx.Eval(sample)
 			final.PromptTokensDuration = time.Since(now)
 			now = time.Now()