feat(dllm): gemma4 streaming parser emitting ChatDeltas

Fragment-safe state machine (content / channel header / thought / tool-call / done) classifying model output into content, reasoning_content and tool_calls deltas. Tool-call payload decoder is a non-partial port of vLLM's gemma4 parser grammar; ~25 of its test cases are ported with citations, plus a 2-split invariance property over every byte position. Recursion depth-capped against model-generated deep nesting; marker constants shared with the renderer. Assisted-by: Claude Code (Fable 5) Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-06-11 18:27:32 -04:00 · 2026-06-11 15:55:27 +00:00
parent 778f85c2a0
commit 294c04ae2f
4 changed files with 2527 additions and 0 deletions
--- a/backend/go/dllm/gemma4_parser.go
+++ b/backend/go/dllm/gemma4_parser.go
@@ -0,0 +1,562 @@
+// Gemma4 (DiffusionGemma) streaming output parser: raw model text, fed in
+// arbitrary fragments (per committed diffusion block; a fragment can split
+// anywhere, including mid-marker and mid-payload), is turned into
+// pb.ChatDelta events (content / reasoning_content / tool_calls).
+//
+// Normative sources:
+//   - The chat template embedded at the top of gemma4_renderer.go ("tpl L<n>"
+//     citations below refer to its numbered lines). The OUTPUT format mirrors
+//     what the template renders for assistant history: thought channels
+//     (<|channel>thought\n ... <channel|>, tpl L240), tool calls
+//     (<|tool_call>call:name{...}<tool_call|>, tpl L246-L257) and turn ends
+//     (<turn|>, tpl L351).
+//   - vLLM PR #45163: vllm/tool_parsers/gemma4_tool_parser.py (marker
+//     handling, the call:name{...} argument grammar and its decoder, ported
+//     below) and vllm/reasoning/gemma4_reasoning_parser.py (channel markers,
+//     the "thought\n" role label, is_reasoning_end semantics).
+//
+// Initial state (derived from the generation prompt, tpl L356-L362, see
+// RenderGemma4):
+//   - enable_thinking=false: the prompt ends with "<|turn>model\n" +
+//     "<|channel>thought\n<channel|>" - an EMPTY thought channel, pre-opened
+//     AND pre-closed by the template. The model's output therefore starts in
+//     plain content. Use NewGemma4Parser(false).
+//   - enable_thinking=true: the prompt ends at "<|turn>model\n" and the model
+//     opens and closes its own thought channel in the OUTPUT
+//     ("<|channel>thought\n...reasoning...<channel|>final answer", per the
+//     vLLM Gemma4ReasoningParser docstring). The parser still starts in
+//     content state - the channel markers in the output drive the switch.
+//     Use NewGemma4Parser(false) here too.
+//   - NewGemma4Parser(true) is for callers that pre-open the thought channel
+//     in the prompt themselves (appending "<|channel>thought\n" after the
+//     generation prompt to force thinking): the output then begins mid-thought
+//     and everything is reasoning until the first <channel|>.
+//
+// State diagram (markers are consumed, not emitted, except by the raw-content fallbacks for malformed or unterminated structures):
+//
+//	             <|channel>                  \n (channel name dropped: the
+//	[content] --------------> [chan-header] ----> [thought]   "thought\n" role
+//	   ^ |  <channel|> (stray close: swallowed,                label, stripped
+//	   +-+  strip_thinking semantics, tpl L148-L158)           like vLLM does)
+//	   ^                  <channel|>
+//	   +----------------------------------------- [thought]
+//	   ^                  <tool_call|>                 | <|tool_call> (implicit
+//	   +-------------- [tool-call] <-------------------+  reasoning end, vLLM
+//	   |  <|tool_call>     ^                               is_reasoning_end)
+//	   +-------------------+
+//	[content]/[thought] --- <turn|> ---> [done]  (everything after is dropped)
+//
+// Buffering rules:
+//   - content/thought states hold back at most len(longest marker)-1 bytes:
+//     the longest tail that is still a proper prefix of a watched marker.
+//     Content is otherwise emitted immediately (no unbounded buffering).
+//   - the tool-call state buffers the whole payload until <tool_call|> (and
+//     the chan-header state everything up to the first '\n'). This is
+//     unbounded in principle but bounded in practice by the model's diffusion
+//     canvas; the call:name{...} payload only becomes decodable (and
+//     trustworthy) once complete - why vLLM's parser accumulates too.
+//   - Close() flushes whatever is still held: partial markers come out as
+//     content/reasoning (per the state that held them); an unterminated
+//     channel header or tool-call payload is re-emitted RAW (including its
+//     opening marker) as content - malformed output is never silently
+//     dropped (mirrors vLLM extract_tool_calls returning the raw text as
+//     content when its regex does not match).
+//
+// Streaming granularity DIVERGENCE from vLLM: vLLM re-parses the partial
+// payload on every token and streams argument-JSON diffs (its `partial=True`
+// decoder mode plus withholding logic exist only for that). Our fragments are
+// whole committed diffusion blocks, so each completed tool call is emitted
+// once, as a single ToolCallDelta carrying index + id + name + the full
+// arguments JSON - exactly the shape backend/python/vllm/backend.py emits
+// per call and pkg/functions.ToolCallsFromChatDeltas re-accumulates.
+package main
+
+import (
+	"encoding/json"
+	"regexp"
+	"strconv"
+	"strings"
+
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+)
+
+// gemma4CallRE is vLLM's tool_call_regex
+// (`<\|tool_call>call:([\w\-\.]+)\{(.*?)\}<tool_call\|>`, DOTALL) anchored to
+// a single already-extracted payload: name charset [\w\-.], braces mandatory.
+var gemma4CallRE = regexp.MustCompile(`(?s)^call:([\w\-.]+)\{(.*)\}$`) // ^...$ pins the greedy (.*) to the payload's last '}'
+
+type g4State int // parser state; see the state diagram in the file header
+
+const (
+	g4Content g4State = iota // plain text, emitted as content
+	g4ChanHeader             // after <|channel>, waiting for the '\n' that ends the channel name
+	g4Thought                // inside a thought channel, emitted as reasoning_content
+	g4ToolCall               // after <|tool_call>, buffering the payload until <tool_call|>
+	g4Done                   // <turn|> seen or Close called; further input is ignored
+)
+
+// Markers watched per emitting state. A stray <tool_call|> outside a tool
+// call is deliberately NOT watched: it passes through verbatim, consistent
+// with the malformed-payload fallback re-emitting it as content.
+var (
+	gemma4ContentMarkers = []string{gemma4ChannelOpen, gemma4ChannelClose, gemma4ToolCallOpen, gemma4TurnEnd}
+	gemma4ThoughtMarkers = []string{gemma4ChannelClose, gemma4ToolCallOpen, gemma4TurnEnd} // no gemma4ChannelOpen: a <|channel> inside a thought stays reasoning text
+)
+
+type Gemma4Parser struct {
+	state g4State // current position in the state machine
+	// held is the per-state carry-over between Feed calls: a partial marker
+	// (content/thought), a partial channel header (chan-header) or the
+	// payload accumulated so far (tool-call).
+	held    string
+	toolIdx int // index given to the next decoded tool call; also forms its "call_<n>" id
+}
+
+// NewGemma4Parser returns a parser positioned per the initial-state rules in
+// the header comment: startInThought=true only when the caller pre-opened a
+// thought channel in the prompt.
+func NewGemma4Parser(startInThought bool) *Gemma4Parser {
+	state := g4Content // default: the output begins as plain content
+	if startInThought {
+		state = g4Thought // text is reasoning until a watched marker ends the thought
+	}
+	return &Gemma4Parser{state: state}
+}
+
+// Feed consumes the next output fragment and returns the deltas it completes (nil for empty text or once the turn is done).
+func (p *Gemma4Parser) Feed(text string) []*pb.ChatDelta {
+	if text == "" || p.state == g4Done {
+		return nil
+	}
+	pending := p.held + text // carry-over from earlier calls is re-scanned together with the new text
+	p.held = ""
+	var em g4Emitter
+	for pending != "" {
+		switch p.state {
+		case g4Content, g4Thought:
+			markers := gemma4ContentMarkers
+			if p.state == g4Thought {
+				markers = gemma4ThoughtMarkers
+			}
+			idx, marker := findEarliestGemma4Marker(pending, markers)
+			if idx == -1 {
+				hold := gemma4MarkerHoldback(pending, markers)
+				p.emitText(&em, pending[:len(pending)-hold])
+				p.held = pending[len(pending)-hold:]
+				pending = ""
+				continue
+			}
+			p.emitText(&em, pending[:idx])
+			pending = pending[idx+len(marker):]
+			switch marker {
+			case gemma4ChannelOpen:
+				p.state = g4ChanHeader
+			case gemma4ChannelClose:
+				// In thought: channel ends. In content: stray close,
+				// swallowed (strip_thinking keeps both sides, tpl L148-L158).
+				p.state = g4Content
+			case gemma4ToolCallOpen:
+				p.state = g4ToolCall
+			case gemma4TurnEnd:
+				p.state = g4Done
+			}
+		case g4ChanHeader:
+			// The channel header is "<name>\n"; the template only ever writes
+			// "thought" (tpl L240/L360), so the label is dropped, not emitted
+			// (as vLLM does). No marker is watched here: all is held until '\n'.
+			nl := strings.IndexByte(pending, '\n')
+			if nl == -1 {
+				p.held = pending
+				pending = ""
+				continue
+			}
+			pending = pending[nl+1:]
+			p.state = g4Thought
+		case g4ToolCall:
+			end := strings.Index(pending, gemma4ToolCallClose)
+			if end == -1 {
+				p.held = pending
+				pending = ""
+				continue
+			}
+			p.emitToolCall(&em, pending[:end])
+			pending = pending[end+len(gemma4ToolCallClose):]
+			p.state = g4Content
+		case g4Done:
+			pending = "" // the turn ended inside this fragment: discard the rest
+		}
+	}
+	return em.deltas
+}
+
+// Close flushes held-back partials. Incomplete structures (open channel
+// header, unterminated tool payload) are re-emitted raw as content rather
+// than dropped. The parser is finished afterwards: later Feed calls return nil.
+func (p *Gemma4Parser) Close() []*pb.ChatDelta {
+	var em g4Emitter
+	switch p.state {
+	case g4Content:
+		em.content(p.held)
+	case g4Thought:
+		em.reasoning(p.held)
+	case g4ChanHeader:
+		em.content(gemma4ChannelOpen + p.held) // the consumed opening marker is put back in front
+	case g4ToolCall:
+		em.content(gemma4ToolCallOpen + p.held) // same for an unterminated tool-call payload
+	case g4Done:
+	}
+	p.held = ""
+	p.state = g4Done
+	return em.deltas
+}
+
+func (p *Gemma4Parser) emitText(em *g4Emitter, s string) { // routes s by the current state
+	if p.state == g4Thought {
+		em.reasoning(s)
+		return
+	}
+	em.content(s) // every state other than thought emits plain content
+}
+
+// emitToolCall decodes one complete <|tool_call>...<tool_call|> payload. On a
+// payload that does not match call:name{...} the raw text (markers included)
+// is emitted as content, mirroring vLLM's extract_tool_calls fallback.
+func (p *Gemma4Parser) emitToolCall(em *g4Emitter, payload string) {
+	m := gemma4CallRE.FindStringSubmatch(payload)
+	if m == nil {
+		em.content(gemma4ToolCallOpen + payload + gemma4ToolCallClose)
+		return
+	}
+	// Index-based ids: deterministic (the split-invariance property relies
+	// on it) and matching the call_<n> convention of pkg/grpc/rich_test.go;
+	// core only needs ids to be non-empty and unique within the response.
+	em.tool(p.toolIdx, "call_"+strconv.Itoa(p.toolIdx), m[1], decodeGemma4Args(m[2], 0))
+	p.toolIdx++ // advances only for decoded calls; the raw fallback above takes no index
+}
+
+// g4Emitter collects ChatDeltas; empty text events are dropped.
+type g4Emitter struct {
+	deltas []*pb.ChatDelta // events in emission order; nil until the first append
+}
+
+func (e *g4Emitter) content(s string) {
+	if s != "" {
+		e.deltas = append(e.deltas, &pb.ChatDelta{Content: s})
+	}
+}
+
+func (e *g4Emitter) reasoning(s string) {
+	if s != "" {
+		e.deltas = append(e.deltas, &pb.ChatDelta{ReasoningContent: s})
+	}
+}
+
+func (e *g4Emitter) tool(index int, id, name, argsJSON string) { // unlike text events, a tool event is never dropped
+	e.deltas = append(e.deltas, &pb.ChatDelta{ToolCalls: []*pb.ToolCallDelta{{
+		Index:     int32(index),
+		Id:        id,
+		Name:      name,
+		Arguments: argsJSON,
+	}}})
+}
+
+// findEarliestGemma4Marker returns the position and value of the first
+// complete marker occurrence, or (-1, "").
+func findEarliestGemma4Marker(s string, markers []string) (int, string) {
+	best, bestMarker := -1, ""
+	for _, m := range markers {
+		if idx := strings.Index(s, m); idx >= 0 && (best == -1 || idx < best) { // strict '<': on equal positions the earlier-listed marker is kept
+			best, bestMarker = idx, m
+		}
+	}
+	return best, bestMarker
+}
+
+// gemma4MarkerHoldback returns the length of the longest suffix of s that is
+// a proper prefix of a watched marker - the only bytes that may still grow
+// into a marker and therefore must not be emitted yet (bounded by the
+// longest marker, so content is never buffered unboundedly).
+func gemma4MarkerHoldback(s string, markers []string) int {
+	maxHold := 0
+	for _, m := range markers {
+		if len(m)-1 > maxHold { // a proper prefix is at most one byte shorter than its marker
+			maxHold = len(m) - 1
+		}
+	}
+	if len(s) < maxHold {
+		maxHold = len(s) // the tail cannot be longer than s itself
+	}
+	for k := maxHold; k >= 1; k-- { // longest candidate first, so the first hit is the answer
+		tail := s[len(s)-k:]
+		for _, m := range markers {
+			if strings.HasPrefix(m, tail) {
+				return k
+			}
+		}
+	}
+	return 0
+}
+
+// ---------------------------------------------------------------------------
+// call:name{...} argument decoder
+//
+// Port of vLLM's _parse_gemma4_args / _parse_gemma4_array /
+// _parse_gemma4_value (gemma4_tool_parser.py) in non-partial mode only: this
+// parser decodes exclusively COMPLETE payloads (incomplete ones fall back to
+// raw content at Close), so vLLM's partial-withholding machinery
+// (trailing-dot floats, withheld bare tails) is intentionally not ported.
+//
+// Grammar (inverse of the renderer's formatGemma4Argument, tpl L118-L147):
+//
+//	args    := pair (',' pair)*
+//	pair    := key ':' value          (keys unquoted, up to the first ':')
+//	value   := string | object | array | bare
+//	string  := '<|"|>' ... '<|"|>'    (no escapes; unterminated -> rest)
+//	object  := '{' args '}'           (delimited strings skipped when
+//	array   := '[' value,* ']'         counting braces/brackets)
+//	bare    := true | false | null/none/nil | number | bare-string
+//
+// Output is a JSON object/array string with keys in payload order (Python
+// dict insertion order), built with HTML escaping off so payload text
+// survives byte-for-byte.
+// ---------------------------------------------------------------------------
+
+func isGemma4Space(c byte) bool { return c == ' ' || c == '\n' || c == '\t' } // only these three bytes; '\r' is not treated as space
+
+// gemma4MaxArgsDepth caps the mutual recursion between decodeGemma4Args and
+// decodeGemma4Array. Defense against model-generated deep nesting: a Go stack
+// overflow is a fatal process kill, not a recoverable error, so past the cap
+// a nested body gracefully degrades to a JSON string of its raw text.
+const gemma4MaxArgsDepth = 100 // compared with '>', so depths 0..100 are still decoded
+
+// decodeGemma4Args decodes one args body (the text between the outer braces
+// of call:name{...}) into a JSON object string. depth is the current nesting
+// level (0 at the payload root); see gemma4MaxArgsDepth.
+func decodeGemma4Args(s string, depth int) string {
+	if depth > gemma4MaxArgsDepth {
+		return gemma4JSONString(s)
+	}
+	var b strings.Builder
+	b.WriteString("{")
+	first := true
+	pair := func(key, val string) { // NOTE(review): a repeated key is written again, not merged
+		if !first {
+			b.WriteString(",")
+		}
+		first = false
+		b.WriteString(gemma4JSONString(key))
+		b.WriteString(":")
+		b.WriteString(val)
+	}
+	i, n := 0, len(s)
+	for i < n {
+		for i < n && (isGemma4Space(s[i]) || s[i] == ',') {
+			i++
+		}
+		if i >= n {
+			break
+		}
+		keyStart := i
+		for i < n && s[i] != ':' {
+			i++
+		}
+		if i >= n {
+			break // no ':' -> trailing junk, dropped (vLLM does the same)
+		}
+		key := strings.TrimSpace(s[keyStart:i])
+		i++ // skip ':'
+		for i < n && isGemma4Space(s[i]) {
+			i++
+		}
+		if i >= n {
+			pair(key, `""`) // "key:" with nothing after -> empty string
+			break
+		}
+		switch {
+		case strings.HasPrefix(s[i:], gemma4StringDelim):
+			i += len(gemma4StringDelim)
+			if end := strings.Index(s[i:], gemma4StringDelim); end == -1 {
+				pair(key, gemma4JSONString(s[i:])) // unterminated -> take rest
+				i = n
+			} else {
+				pair(key, gemma4JSONString(s[i:i+end]))
+				i += end + len(gemma4StringDelim)
+			}
+		case s[i] == '{':
+			inner, next := scanGemma4Balanced(s, i, '{', '}')
+			pair(key, decodeGemma4Args(inner, depth+1))
+			i = next
+		case s[i] == '[':
+			inner, next := scanGemma4Balanced(s, i, '[', ']')
+			pair(key, decodeGemma4Array(inner, depth+1))
+			i = next
+		default:
+			valStart := i
+			for i < n && s[i] != ',' && s[i] != '}' && s[i] != ']' { // a bare value ends at ',' or at any closer
+				i++
+			}
+			if i == valStart {
+				// No progress (value starts on a stray '}'/']'): abort on
+				// malformed input rather than loop, like vLLM.
+				i = n
+				continue
+			}
+			pair(key, decodeGemma4Bare(s[valStart:i]))
+		}
+	}
+	b.WriteString("}")
+	return b.String()
+}
+
+// decodeGemma4Array decodes one array body (the text between '[' and ']')
+// into a JSON array string. depth is the current nesting level; see
+// gemma4MaxArgsDepth.
+func decodeGemma4Array(s string, depth int) string {
+	if depth > gemma4MaxArgsDepth {
+		return gemma4JSONString(s) // past the cap: the raw body becomes one JSON string
+	}
+	var b strings.Builder
+	b.WriteString("[")
+	first := true
+	item := func(val string) {
+		if !first {
+			b.WriteString(",")
+		}
+		first = false
+		b.WriteString(val)
+	}
+	i, n := 0, len(s)
+	for i < n {
+		for i < n && (isGemma4Space(s[i]) || s[i] == ',') {
+			i++
+		}
+		if i >= n {
+			break
+		}
+		switch {
+		case strings.HasPrefix(s[i:], gemma4StringDelim):
+			i += len(gemma4StringDelim)
+			if end := strings.Index(s[i:], gemma4StringDelim); end == -1 {
+				item(gemma4JSONString(s[i:])) // unterminated -> take rest
+				i = n
+			} else {
+				item(gemma4JSONString(s[i : i+end]))
+				i += end + len(gemma4StringDelim)
+			}
+		case s[i] == '{':
+			inner, next := scanGemma4Balanced(s, i, '{', '}')
+			item(decodeGemma4Args(inner, depth+1))
+			i = next
+		case s[i] == '[':
+			inner, next := scanGemma4Balanced(s, i, '[', ']')
+			item(decodeGemma4Array(inner, depth+1))
+			i = next
+		default:
+			valStart := i
+			for i < n && s[i] != ',' && s[i] != ']' { // unlike object values, a bare item is not cut at '}'
+				i++
+			}
+			if i == valStart {
+				i = n // no progress: abort on malformed input, like vLLM
+				continue
+			}
+			item(decodeGemma4Bare(s[valStart:i]))
+		}
+	}
+	b.WriteString("]")
+	return b.String()
+}
+
+// scanGemma4Balanced scans a brace/bracket-balanced span starting at the
+// opener s[start], skipping over <|"|>-delimited strings so structural
+// characters inside them do not count (vLLM's depth scan). Returns the inner
+// text and the index just past the closer; an unterminated span yields the
+// rest of the string (the inner decoder still extracts what is there - this
+// path is only reachable from genuinely malformed complete payloads).
+func scanGemma4Balanced(s string, start int, open, close byte) (string, int) {
+	depth := 1 // the opener at s[start] is already counted
+	i := start + 1
+	innerStart := i
+	n := len(s)
+	for i < n && depth > 0 {
+		if strings.HasPrefix(s[i:], gemma4StringDelim) {
+			i += len(gemma4StringDelim)
+			if nd := strings.Index(s[i:], gemma4StringDelim); nd == -1 {
+				i = n // unterminated string: it swallows the rest of s
+			} else {
+				i += nd + len(gemma4StringDelim)
+			}
+			continue
+		}
+		switch s[i] { // only the requested pair is counted; the other bracket kind is ignored
+		case open:
+			depth++
+		case close:
+			depth--
+		}
+		i++
+	}
+	if depth > 0 {
+		return s[innerStart:], n
+	}
+	return s[innerStart : i-1], i
+}
+
+// decodeGemma4Bare maps an undelimited value to its JSON form: the exact
+// literals true/false, null aliases (null/none/nil, case-insensitive - the
+// renderer writes Python None as "None", tpl L144-L145 via format_argument's
+// else branch) and numbers (vLLM's rule: a '.' tries float, otherwise int).
+// Anything else, including an int outside the int64 range, becomes a string.
+func decodeGemma4Bare(raw string) string {
+	v := strings.TrimSpace(raw)
+	if v == "" {
+		return `""`
+	}
+	if v == "true" || v == "false" {
+		return v
+	}
+	switch strings.ToLower(v) {
+	case "null", "none", "nil":
+		return "null"
+	}
+	if strings.Contains(v, ".") {
+		if f, err := strconv.ParseFloat(v, 64); err == nil {
+			return formatGemma4Float(f)
+		}
+	} else if iv, err := strconv.ParseInt(v, 10, 64); err == nil {
+		return strconv.FormatInt(iv, 10)
+	}
+	return gemma4JSONString(v)
+}
+
+// formatGemma4Float keeps a ".0" suffix on integral floats, as Python's
+// json.dumps(float) does ("108." decodes to 108.0, not 108). NOTE(review): 'g'
+// may choose exponent form where Python would not; the value is the same.
+func formatGemma4Float(f float64) string {
+	s := strconv.FormatFloat(f, 'g', -1, 64)
+	if !strings.ContainsAny(s, ".eE") {
+		s += ".0"
+	}
+	return s
+}
+
+// gemma4JSONString encodes s as a JSON string WITHOUT HTML escaping:
+// json.Marshal would turn the angle brackets in "<div>" into \u003c / \u003e
+// sequences, while payload text should survive byte-for-byte, like Python's
+// json.dumps(ensure_ascii=False).
+func gemma4JSONString(s string) string {
+	var sb strings.Builder
+	enc := json.NewEncoder(&sb)
+	enc.SetEscapeHTML(false)
+	if err := enc.Encode(s); err != nil {
+		// Unreachable for plain strings; fall back to default escaping
+		// rather than emitting invalid JSON.
+		b, mErr := json.Marshal(s)
+		if mErr != nil {
+			return `""`
+		}
+		return string(b)
+	}
+	// Encode appends a trailing newline.
+	return strings.TrimSuffix(sb.String(), "\n")
+}
--- a/backend/go/dllm/gemma4_parser_test.go
+++ b/backend/go/dllm/gemma4_parser_test.go
@@ -0,0 +1,592 @@
+package main
+
+// Parser specs for Gemma4Parser (model output text -> pb.ChatDelta events).
+//
+// Fixture provenance:
+//   - Entries marked "vLLM: <name>" are direct ports of the named test from
+//     vLLM PR #45163, tests/tool_parsers/test_gemma4_tool_parser.py (the
+//     authoritative test-suite for the gemma4 tool-call wire format). The
+//     streaming tests' chunk lists are reused verbatim as Feed fragments.
+//   - Decoder entries port the TestParseGemma4Args / TestParseGemma4Array
+//     classes from the same file (non-partial mode only; this parser never
+//     decodes partial payloads, see the divergence note in gemma4_parser.go).
+//   - Channel/turn-marker expectations come from the chat template embedded
+//     in gemma4_renderer.go (tpl L356-L362 generation prompt, L148-L158
+//     strip_thinking) and vLLM's Gemma4ReasoningParser
+//     (vllm/reasoning/gemma4_reasoning_parser.py).
+
+import (
+	"encoding/json"
+	"fmt"
+	"strings"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+)
+
+// flatGemma4Tool is one accumulated tool call, mirroring how LocalAI core
+// folds ToolCallDelta streams (pkg/functions/chat_deltas.go
+// ToolCallsFromChatDeltas: name/id latch on first non-empty, arguments
+// concatenate per index). Tests flatten through the same rules so they
+// assert exactly what core will reconstruct.
+type flatGemma4Tool struct {
+	id   string
+	name string
+	args string // every Arguments chunk seen for this index, concatenated in arrival order
+}
+
+func flattenGemma4Deltas(deltas []*pb.ChatDelta) (string, string, []flatGemma4Tool) { // returns content, reasoning, tools
+	var content, reasoning strings.Builder
+	byIndex := map[int32]*flatGemma4Tool{}
+	maxIdx := int32(-1)
+	for _, d := range deltas {
+		content.WriteString(d.GetContent())
+		reasoning.WriteString(d.GetReasoningContent())
+		for _, tc := range d.GetToolCalls() {
+			acc, ok := byIndex[tc.GetIndex()]
+			if !ok {
+				acc = &flatGemma4Tool{}
+				byIndex[tc.GetIndex()] = acc
+			}
+			if tc.GetName() != "" {
+				acc.name = tc.GetName()
+			}
+			if tc.GetId() != "" {
+				acc.id = tc.GetId()
+			}
+			acc.args += tc.GetArguments()
+			if tc.GetIndex() > maxIdx {
+				maxIdx = tc.GetIndex()
+			}
+		}
+	}
+	var tools []flatGemma4Tool
+	for i := int32(0); i <= maxIdx; i++ { // ascending index order; indices never seen are skipped
+		if acc, ok := byIndex[i]; ok {
+			tools = append(tools, *acc)
+		}
+	}
+	return content.String(), reasoning.String(), tools
+}
+
+type wantGemma4Tool struct {
+	name     string // expected function name, compared with Equal
+	argsJSON string // compared with MatchJSON (key order irrelevant)
+}
+
+type parseGemma4Case struct {
+	startInThought bool     // handed to NewGemma4Parser
+	fragments      []string // passed to Feed one by one, in order
+	wantContent    string
+	wantReasoning  string
+	wantTools      []wantGemma4Tool
+}
+
+func parseGemma4Fragments(startInThought bool, fragments []string) []*pb.ChatDelta { // runs one full parse: Feed each fragment, then Close
+	p := NewGemma4Parser(startInThought)
+	var all []*pb.ChatDelta
+	for _, f := range fragments {
+		all = append(all, p.Feed(f)...)
+	}
+	return append(all, p.Close()...) // Close is always called, so held-back text is part of the result
+}
+
+var _ = Describe("Gemma4Parser", func() {
+	DescribeTable("parses streamed gemma4 output into ChatDeltas",
+		func(c parseGemma4Case) {
+			content, reasoning, tools := flattenGemma4Deltas(parseGemma4Fragments(c.startInThought, c.fragments))
+			Expect(content).To(Equal(c.wantContent))
+			Expect(reasoning).To(Equal(c.wantReasoning))
+			Expect(tools).To(HaveLen(len(c.wantTools)))
+			seenIDs := map[string]bool{}
+			for i, want := range c.wantTools {
+				Expect(tools[i].name).To(Equal(want.name), "tool %d name", i)
+				Expect(tools[i].args).To(MatchJSON(want.argsJSON), "tool %d arguments", i)
+				Expect(tools[i].id).ToNot(BeEmpty(), "tool %d id", i)
+				Expect(seenIDs).ToNot(HaveKey(tools[i].id), "tool %d id must be unique", i)
+				seenIDs[tools[i].id] = true
+			}
+		},
+
+		// --- (1) pure content -------------------------------------------------
+		// vLLM: test_no_tool_calls
+		Entry("pure content, single fragment", parseGemma4Case{
+			fragments:   []string{"Hello, how can I help you today?"},
+			wantContent: "Hello, how can I help you today?",
+		}),
+
+		// --- (2) thought -> final transition ----------------------------------
+		// enable_thinking render: prompt ends at <|turn>model\n and the model
+		// opens/closes its own thought channel in the OUTPUT (vLLM
+		// Gemma4ReasoningParser docstring; tpl L356-L362). The "thought\n"
+		// role label after <|channel> is structural and must be stripped
+		// (vLLM _THOUGHT_PREFIX handling).
+		Entry("thought channel then final content", parseGemma4Case{
+			fragments:     []string{"<|channel>thought\nLet me think about this.\n<channel|>The answer is 42."},
+			wantReasoning: "Let me think about this.\n",
+			wantContent:   "The answer is 42.",
+		}),
+
+		// --- (3) startInThought both ways -------------------------------------
+		Entry("startInThought=true routes initial text to reasoning until <channel|>", parseGemma4Case{
+			startInThought: true,
+			fragments:      []string{"I am thinking hard.<channel|>Done."},
+			wantReasoning:  "I am thinking hard.",
+			wantContent:    "Done.",
+		}),
+		// A stray <channel|> with no open channel is swallowed, matching the
+		// template's strip_thinking (tpl L148-L158: the marker is dropped,
+		// text on both sides is kept).
+		Entry("startInThought=false keeps the same text as content, stray <channel|> swallowed", parseGemma4Case{
+			startInThought: false,
+			fragments:      []string{"I am thinking hard.<channel|>Done."},
+			wantContent:    "I am thinking hard.Done.",
+		}),
+
+		// --- (4) one tool call, full payload type zoo --------------------------
+		Entry("single tool call: strings, numbers, bools, null, nested object and array", parseGemma4Case{
+			fragments: []string{`<|tool_call>call:complex_function{text:<|"|>with, comma and {braces}<|"|>,count:42,score:3.14,yes:true,no:false,nothing:null,obj:{inner:<|"|>v<|"|>,k:1},arr:[<|"|>a<|"|>,2,true]}<tool_call|>`},
+			wantTools: []wantGemma4Tool{{
+				name:     "complex_function",
+				argsJSON: `{"text":"with, comma and {braces}","count":42,"score":3.14,"yes":true,"no":false,"nothing":null,"obj":{"inner":"v","k":1},"arr":["a",2,true]}`,
+			}},
+		}),
+
+		// --- (5) payload split across 3 fragments ------------------------------
+		Entry("tool-call payload split across three fragments", parseGemma4Case{
+			fragments: []string{
+				"<|tool_call>call:get_weather{loc",
+				`ation:<|"|>Paris, Fra`,
+				`nce<|"|>}<tool_call|>`,
+			},
+			wantTools: []wantGemma4Tool{{name: "get_weather", argsJSON: `{"location":"Paris, France"}`}},
+		}),
+
+		// --- (6) marker split across fragments ----------------------------------
+		Entry("tool-call open marker split across fragments", parseGemma4Case{
+			fragments: []string{
+				"<|tool_ca",
+				`ll>call:get_weather{location:<|"|>London<|"|>}<tool_call|>`,
+			},
+			wantTools: []wantGemma4Tool{{name: "get_weather", argsJSON: `{"location":"London"}`}},
+		}),
+		Entry("channel open marker split across fragments", parseGemma4Case{
+			fragments: []string{
+				"<|chan",
+				"nel>thought\ndeep thought<channel|>final",
+			},
+			wantReasoning: "deep thought",
+			wantContent:   "final",
+		}),
+
+		// --- (7) trailing partial marker held, flushed by Close -----------------
+		Entry("trailing partial marker is held back and flushed by Close", parseGemma4Case{
+			fragments:   []string{"Hello <|tool"},
+			wantContent: "Hello <|tool",
+		}),
+
+		// --- (8) malformed/incomplete payload -> content fallback ---------------
+		// vLLM: test_incomplete_tool_call (no end marker: the whole text stays
+		// content, never silently dropped).
+		Entry("incomplete tool payload at Close is emitted as raw content", parseGemma4Case{
+			fragments:   []string{`<|tool_call>call:get_weather{location:<|"|>London`},
+			wantContent: `<|tool_call>call:get_weather{location:<|"|>London`,
+		}),
+		Entry("malformed complete payload is emitted as raw content, parsing continues", parseGemma4Case{
+			fragments:   []string{"<|tool_call>oops no call syntax<tool_call|> done"},
+			wantContent: "<|tool_call>oops no call syntax<tool_call|> done",
+		}),
+
+		// --- (9) <turn|> ends the turn -------------------------------------------
+		Entry("text after <turn|> is ignored, including later fragments", parseGemma4Case{
+			fragments: []string{
+				"before<turn|>after",
+				`more <|tool_call>call:f{}<tool_call|>`,
+			},
+			wantContent: "before",
+		}),
+		Entry("<turn|> inside a thought channel ends the turn", parseGemma4Case{
+			startInThought: true,
+			fragments:      []string{"thinking<turn|>ignored"},
+			wantReasoning:  "thinking",
+		}),
+
+		// --- (10) ported vLLM non-streaming cases ---------------------------------
+		// vLLM: test_single_tool_call
+		Entry("vLLM: test_single_tool_call", parseGemma4Case{
+			fragments: []string{`<|tool_call>call:get_weather{location:<|"|>London<|"|>}<tool_call|>`},
+			wantTools: []wantGemma4Tool{{name: "get_weather", argsJSON: `{"location":"London"}`}},
+		}),
+		// vLLM: test_multiple_arguments
+		Entry("vLLM: test_multiple_arguments", parseGemma4Case{
+			fragments: []string{`<|tool_call>call:get_weather{location:<|"|>San Francisco<|"|>,unit:<|"|>celsius<|"|>}<tool_call|>`},
+			wantTools: []wantGemma4Tool{{name: "get_weather", argsJSON: `{"location":"San Francisco","unit":"celsius"}`}},
+		}),
+		// vLLM: test_text_before_tool_call. DIVERGENCE: vLLM's non-streaming
+		// extractor trims the content ("...you."); a streaming parser cannot
+		// retroactively trim already-emitted text, so the trailing space is
+		// kept (vLLM's own streaming path keeps it too, see
+		// test_streaming_text_before_tool_call which only checks a prefix).
+		Entry("vLLM: test_text_before_tool_call (streaming semantics: no trim)", parseGemma4Case{
+			fragments:   []string{`Let me check the weather for you. <|tool_call>call:get_weather{location:<|"|>Paris<|"|>}<tool_call|>`},
+			wantContent: "Let me check the weather for you. ",
+			wantTools:   []wantGemma4Tool{{name: "get_weather", argsJSON: `{"location":"Paris"}`}},
+		}),
+		// vLLM: test_multiple_tool_calls (also covers case 11: multi-tool sequence)
+		Entry("vLLM: test_multiple_tool_calls", parseGemma4Case{
+			fragments: []string{`<|tool_call>call:get_weather{location:<|"|>London<|"|>}<tool_call|><|tool_call>call:get_time{location:<|"|>London<|"|>}<tool_call|>`},
+			wantTools: []wantGemma4Tool{
+				{name: "get_weather", argsJSON: `{"location":"London"}`},
+				{name: "get_time", argsJSON: `{"location":"London"}`},
+			},
+		}),
+		// vLLM: test_nested_arguments
+		Entry("vLLM: test_nested_arguments", parseGemma4Case{
+			fragments: []string{`<|tool_call>call:complex_function{nested:{inner:<|"|>value<|"|>},list:[<|"|>a<|"|>,<|"|>b<|"|>]}<tool_call|>`},
+			wantTools: []wantGemma4Tool{{name: "complex_function", argsJSON: `{"nested":{"inner":"value"},"list":["a","b"]}`}},
+		}),
+		// vLLM: test_tool_call_with_number_and_boolean
+		Entry("vLLM: test_tool_call_with_number_and_boolean", parseGemma4Case{
+			fragments: []string{`<|tool_call>call:set_status{is_active:true,count:42,score:3.14}<tool_call|>`},
+			wantTools: []wantGemma4Tool{{name: "set_status", argsJSON: `{"is_active":true,"count":42,"score":3.14}`}},
+		}),
+		// vLLM: test_hyphenated_function_name
+		Entry("vLLM: test_hyphenated_function_name", parseGemma4Case{
+			fragments: []string{`<|tool_call>call:get-weather{location:<|"|>London<|"|>}<tool_call|>`},
+			wantTools: []wantGemma4Tool{{name: "get-weather", argsJSON: `{"location":"London"}`}},
+		}),
+		// vLLM: test_dotted_function_name
+		Entry("vLLM: test_dotted_function_name", parseGemma4Case{
+			fragments: []string{`<|tool_call>call:weather.get{location:<|"|>London<|"|>}<tool_call|>`},
+			wantTools: []wantGemma4Tool{{name: "weather.get", argsJSON: `{"location":"London"}`}},
+		}),
+		// vLLM: test_no_arguments
+		Entry("vLLM: test_no_arguments", parseGemma4Case{
+			fragments: []string{"<|tool_call>call:get_status{}<tool_call|>"},
+			wantTools: []wantGemma4Tool{{name: "get_status", argsJSON: `{}`}},
+		}),
+
+		// --- ported vLLM streaming cases (chunk lists reused as fragments) --------
+		// vLLM: test_basic_streaming_single_tool
+		Entry("vLLM: test_basic_streaming_single_tool", parseGemma4Case{
+			fragments: []string{
+				"<|tool_call>",
+				"call:get_weather{",
+				`location:<|"|>Paris`,
+				", France",
+				`<|"|>}`,
+				"<tool_call|>",
+			},
+			wantTools: []wantGemma4Tool{{name: "get_weather", argsJSON: `{"location":"Paris, France"}`}},
+		}),
+		// vLLM: test_streaming_multi_arg
+		Entry("vLLM: test_streaming_multi_arg", parseGemma4Case{
+			fragments: []string{
+				"<|tool_call>",
+				"call:get_weather{",
+				`location:<|"|>Tokyo<|"|>,`,
+				`unit:<|"|>celsius<|"|>}`,
+				"<tool_call|>",
+			},
+			wantTools: []wantGemma4Tool{{name: "get_weather", argsJSON: `{"location":"Tokyo","unit":"celsius"}`}},
+		}),
+		// vLLM: test_streaming_text_before_tool_call
+		Entry("vLLM: test_streaming_text_before_tool_call", parseGemma4Case{
+			fragments: []string{
+				"Let me check ",
+				"the weather. ",
+				"<|tool_call>",
+				"call:get_weather{",
+				`location:<|"|>London<|"|>}`,
+				"<tool_call|>",
+			},
+			wantContent: "Let me check the weather. ",
+			wantTools:   []wantGemma4Tool{{name: "get_weather", argsJSON: `{"location":"London"}`}},
+		}),
+		// vLLM: test_streaming_numeric_args
+		Entry("vLLM: test_streaming_numeric_args", parseGemma4Case{
+			fragments: []string{
+				"<|tool_call>",
+				"call:set_config{",
+				"count:42,",
+				"active:true}",
+				"<tool_call|>",
+			},
+			wantTools: []wantGemma4Tool{{name: "set_config", argsJSON: `{"count":42,"active":true}`}},
+		}),
+		// vLLM: test_streaming_boolean_split_across_chunks
+		Entry("vLLM: test_streaming_boolean_split_across_chunks", parseGemma4Case{
+			fragments: []string{
+				"<|tool_call>",
+				"call:search{input:{all:tru",
+				"e}}",
+				"<tool_call|>",
+			},
+			wantTools: []wantGemma4Tool{{name: "search", argsJSON: `{"input":{"all":true}}`}},
+		}),
+		// vLLM: test_streaming_false_split_across_chunks
+		Entry("vLLM: test_streaming_false_split_across_chunks", parseGemma4Case{
+			fragments: []string{
+				"<|tool_call>",
+				"call:set{flag:fals",
+				"e}",
+				"<tool_call|>",
+			},
+			wantTools: []wantGemma4Tool{{name: "set", argsJSON: `{"flag":false}`}},
+		}),
+		// vLLM: test_streaming_number_split_across_chunks
+		Entry("vLLM: test_streaming_number_split_across_chunks", parseGemma4Case{
+			fragments: []string{
+				"<|tool_call>",
+				"call:set{count:4",
+				"2}",
+				"<tool_call|>",
+			},
+			wantTools: []wantGemma4Tool{{name: "set", argsJSON: `{"count":42}`}},
+		}),
+		// vLLM: test_streaming_empty_args
+		Entry("vLLM: test_streaming_empty_args", parseGemma4Case{
+			fragments: []string{
+				"<|tool_call>",
+				"call:get_status{}",
+				"<tool_call|>",
+			},
+			wantTools: []wantGemma4Tool{{name: "get_status", argsJSON: `{}`}},
+		}),
+		// vLLM: test_streaming_split_delimiter_no_invalid_json (a string
+		// delimiter <|"|> split across fragments must not leak into the value).
+		Entry("vLLM: test_streaming_split_delimiter_no_invalid_json", parseGemma4Case{
+			fragments: []string{
+				"<|tool_call>",
+				"call:todowrite{",
+				`content:<|"|>Buy milk<|`,
+				`"|>}`,
+				"<tool_call|>",
+			},
+			wantTools: []wantGemma4Tool{{name: "todowrite", argsJSON: `{"content":"Buy milk"}`}},
+		}),
+		// vLLM: test_streaming_does_not_duplicate_plain_text_after_tool_call
+		Entry("vLLM: test_streaming_does_not_duplicate_plain_text_after_tool_call", parseGemma4Case{
+			fragments: []string{
+				"<|tool_call>",
+				"call:get_weather{",
+				`location:<|"|>Paris<|"|>}`,
+				"<tool_call|><",
+				"div>",
+			},
+			wantContent: "<div>",
+			wantTools:   []wantGemma4Tool{{name: "get_weather", argsJSON: `{"location":"Paris"}`}},
+		}),
+		// vLLM: test_streaming_html_argument_does_not_duplicate_tag_prefixes
+		Entry("vLLM: test_streaming_html_argument_does_not_duplicate_tag_prefixes", parseGemma4Case{
+			fragments: []string{
+				"<|tool_call>",
+				"call:write_file{",
+				`path:<|"|>index.html<|"|>,`,
+				`content:<|"|><!DOCTYPE html>` + "\n<",
+				`html lang="zh-CN">` + "\n<",
+				"head>\n    <",
+				`meta charset="UTF-8">` + "\n    <",
+				`meta name="viewport" content="width=device-width">` + "\n",
+				`<|"|>}`,
+				"<tool_call|>",
+			},
+			wantTools: []wantGemma4Tool{{
+				name:     "write_file",
+				argsJSON: `{"path":"index.html","content":"<!DOCTYPE html>\n<html lang=\"zh-CN\">\n<head>\n    <meta charset=\"UTF-8\">\n    <meta name=\"viewport\" content=\"width=device-width\">\n"}`,
+			}},
+		}),
+		// vLLM: test_streaming_single_chunk_complete_tool_call
+		Entry("vLLM: test_streaming_single_chunk_complete_tool_call", parseGemma4Case{
+			fragments: []string{`<|tool_call>call:name_a_color{color_hex:<|"|>00ff11<|"|>}<tool_call|>`},
+			wantTools: []wantGemma4Tool{{name: "name_a_color", argsJSON: `{"color_hex":"00ff11"}`}},
+		}),
+		// vLLM: test_streaming_multi_chunk_batched_tool_calls (two complete
+		// calls in ONE fragment; both must come out with distinct indices)
+		Entry("vLLM: test_streaming_multi_chunk_batched_tool_calls", parseGemma4Case{
+			fragments: []string{
+				`<|tool_call>call:get_weather{location:<|"|>London<|"|>}<tool_call|>` +
+					`<|tool_call>call:get_time{timezone:<|"|>GMT<|"|>}<tool_call|>`,
+			},
+			wantTools: []wantGemma4Tool{
+				{name: "get_weather", argsJSON: `{"location":"London"}`},
+				{name: "get_time", argsJSON: `{"timezone":"GMT"}`},
+			},
+		}),
+		// vLLM: test_streaming_trailing_bare_bool_not_duplicated
+		Entry("vLLM: test_streaming_trailing_bare_bool_not_duplicated", parseGemma4Case{
+			fragments: []string{
+				"<|tool_call>",
+				"call:Edit{",
+				`file_path:<|"|>src/env.py<|"|>,`,
+				`old_string:<|"|>old_val<|"|>,`,
+				`new_string:<|"|>new_val<|"|>,`,
+				"replace_all:",
+				"false}",
+				"<tool_call|>",
+			},
+			wantTools: []wantGemma4Tool{{
+				name:     "Edit",
+				argsJSON: `{"file_path":"src/env.py","old_string":"old_val","new_string":"new_val","replace_all":false}`,
+			}},
+		}),
+
+		// --- implicit reasoning end on <|tool_call> (vLLM is_reasoning_end:
+		// a tool_call token means reasoning is over) -----------------------------
+		Entry("tool call inside an open thought channel ends the reasoning", parseGemma4Case{
+			startInThought: true,
+			fragments:      []string{`need the weather<|tool_call>call:get_weather{location:<|"|>Rome<|"|>}<tool_call|>`},
+			wantReasoning:  "need the weather",
+			wantTools:      []wantGemma4Tool{{name: "get_weather", argsJSON: `{"location":"Rome"}`}},
+		}),
+
+		// --- (12) empty fragments are no-ops --------------------------------------
+		Entry("empty fragments are no-ops", parseGemma4Case{
+			fragments:   []string{"", "Hello", "", "", " world", ""},
+			wantContent: "Hello world",
+		}),
+	)
+
+	It("returns no deltas for an empty fragment and after Close", func() {
+		// One parser instance is driven through Feed -> Close -> Feed -> Close.
+		parser := NewGemma4Parser(false)
+		Expect(parser.Feed("")).To(BeEmpty())      // empty input is a no-op
+		Expect(parser.Feed("hi")).NotTo(BeEmpty()) // plain text yields deltas
+		Expect(parser.Close()).To(BeEmpty())       // nothing was held back
+		Expect(parser.Feed("more")).To(BeEmpty())  // finished: input is dropped
+		Expect(parser.Close()).To(BeEmpty())       // a repeated Close emits nothing
+	})
+
+	It("generates index-based tool call ids (call_<index>)", func() {
+		// Ids are "call_" plus the call's position in the output. This follows
+		// the index-based id convention of pkg/grpc/rich_test.go and keeps
+		// ids deterministic for the split-invariance property below.
+		const twoCalls = `<|tool_call>call:a{}<tool_call|><|tool_call>call:b{}<tool_call|>`
+		_, _, calls := flattenGemma4Deltas(parseGemma4Fragments(false, []string{twoCalls}))
+		Expect(calls).To(HaveLen(2))
+		for pos, wantID := range []string{"call_0", "call_1"} {
+			Expect(calls[pos].id).To(Equal(wantID))
+		}
+	})
+
+	// Property: for a fixed full output, EVERY 2-split position must yield
+	// exactly the same flattened result as the unsplit parse. This kills
+	// fragment-boundary bugs (mid-marker, mid-delimiter, mid-payload splits).
+	DescribeTable("2-split fragment invariance",
+		func(startInThought bool, full string) {
+			// Baseline: the whole output fed to the parser as ONE fragment.
+			wantContent, wantReasoning, wantTools := flattenGemma4Deltas(parseGemma4Fragments(startInThought, []string{full}))
+			for cut := 0; cut <= len(full); cut++ {
+				halves := []string{full[:cut], full[cut:]} // cut is a byte offset; both ends are included
+				gotContent, gotReasoning, gotTools := flattenGemma4Deltas(parseGemma4Fragments(startInThought, halves))
+				Expect(gotContent).To(Equal(wantContent), fmt.Sprintf("content diverged at split %d", cut))
+				Expect(gotReasoning).To(Equal(wantReasoning), fmt.Sprintf("reasoning diverged at split %d", cut))
+				Expect(gotTools).To(Equal(wantTools), fmt.Sprintf("tool calls diverged at split %d", cut))
+			}
+		},
+		Entry("thought + content + two tool calls + turn end", false,
+			"<|channel>thought\nPondering the request...\n<channel|>Sure - calling tools now. "+
+				`<|tool_call>call:get_weather{location:<|"|>Paris, France<|"|>,unit:<|"|>celsius<|"|>,days:3,detailed:true}<tool_call|>`+
+				`<|tool_call>call:get_time{timezone:<|"|>Europe/Lisbon<|"|>,nested:{flag:false,vals:[1,2.5,<|"|>x<|"|>]}}<tool_call|>`+
+				"Done.<turn|>ignored tail"),
+		Entry("startInThought + tool call + trailing partial marker", true,
+			`Deep thought<channel|>final answer <|tool_call>call:noop{}<tool_call|> trailing <|tool`),
+		Entry("malformed payload fallback", false,
+			`pre <|tool_call>not a call<tool_call|> post`),
+	)
+})
+
+// Decoder-level ports of vLLM's TestParseGemma4Args / TestParseGemma4Array
+// (non-partial mode; the partial-withholding tests do not apply because this
+// parser only ever decodes COMPLETE payloads, see gemma4_parser.go).
+var _ = Describe("decodeGemma4Args", func() {
+	DescribeTable("decodes the gemma4 call syntax into JSON arguments",
+		func(payload, expectedJSON string) {
+			decoded := decodeGemma4Args(payload, 0)
+			Expect(decoded).To(MatchJSON(expectedJSON))
+		},
+		// Blank payloads give an empty object (vLLM: test_empty_string / test_whitespace_only).
+		Entry("empty string", "", `{}`),
+		Entry("whitespace only", "   ", `{}`),
+		// A single delimited string value (vLLM: test_single_string_value).
+		Entry("single string value", `location:<|"|>Paris<|"|>`, `{"location":"Paris"}`),
+		// A comma between the delimiters belongs to the value (vLLM: test_string_value_with_comma).
+		Entry("string value with comma", `location:<|"|>Paris, France<|"|>`, `{"location":"Paris, France"}`),
+		// Comma-separated key/value pairs (vLLM: test_multiple_string_values).
+		Entry("multiple string values", `location:<|"|>San Francisco<|"|>,unit:<|"|>celsius<|"|>`, `{"location":"San Francisco","unit":"celsius"}`),
+		// Bare numbers stay JSON numbers (vLLM: test_integer_value / test_float_value).
+		Entry("integer value", "count:42", `{"count":42}`),
+		Entry("float value", "score:3.14", `{"score":3.14}`),
+		// Bare booleans stay JSON booleans (vLLM: test_boolean_true / test_boolean_false).
+		Entry("boolean true", "flag:true", `{"flag":true}`),
+		Entry("boolean false", "flag:false", `{"flag":false}`),
+		// Bare null must become JSON null, not the string "null" (vLLM: test_null_value).
+		Entry("null value", "param:null", `{"param":null}`),
+		// Every scalar kind in one payload (vLLM: test_mixed_types).
+		Entry("mixed types", `name:<|"|>test<|"|>,count:42,active:true,score:3.14`,
+			`{"name":"test","count":42,"active":true,"score":3.14}`),
+		// Braces nest an object (vLLM: test_nested_object).
+		Entry("nested object", `nested:{inner:<|"|>value<|"|>}`, `{"nested":{"inner":"value"}}`),
+		// Brackets hold an array (vLLM: test_array_of_strings).
+		Entry("array of strings", `items:[<|"|>a<|"|>,<|"|>b<|"|>]`, `{"items":["a","b"]}`),
+		// No closing delimiter: everything after the opener is the value (vLLM: test_unterminated_string).
+		Entry("unterminated string", `key:<|"|>unterminated`, `{"key":"unterminated"}`),
+		// A key with nothing after the colon decodes to "" (vLLM: test_empty_value).
+		Entry("empty value", "key:", `{"key":""}`),
+		// Trailing-dot floats parse normally outside streaming (vLLM: test_trailing_dot_float_partial_withheld, non-partial branch).
+		Entry("trailing dot float, complete payload", "left:108.,right:22.8", `{"left":108.0,"right":22.8}`),
+	)
+
+	It("terminates and yields valid JSON on malformed input", func() {
+		// Port of vLLM test_malformed_partial_array. Upstream only asserts "returns
+		// a dict without hanging"; here the output must also unmarshal into a map.
+		var parsed map[string]any
+		raw := []byte(decodeGemma4Args(":[t:[]", 0))
+		Expect(json.Unmarshal(raw, &parsed)).To(Succeed())
+	})
+
+	It("degrades nesting beyond the recursion cap to a string value", func() {
+		// Input: a:{a:{...a:1...}} nested 200 levels deep. Without a depth cap
+		// the decoder's mutual recursion would grow the stack with the model's
+		// output, and a Go stack overflow is a fatal process kill. Levels past
+		// gemma4MaxArgsDepth must instead keep the raw inner text as a JSON string.
+		const nesting = 200
+		payload := strings.Repeat("a:{", nesting-1) + "a:1" + strings.Repeat("}", nesting-1)
+		var root map[string]any
+		Expect(json.Unmarshal([]byte(decodeGemma4Args(payload, 0)), &root)).To(Succeed())
+
+		// Walk down the "a" keys, counting how many levels are still objects.
+		objectLevels := 0
+		var node any = root
+		for {
+			obj, isObject := node.(map[string]any)
+			if !isObject {
+				break
+			}
+			Expect(obj).To(HaveKey("a"))
+			node, objectLevels = obj["a"], objectLevels+1
+		}
+		Expect(objectLevels).To(Equal(gemma4MaxArgsDepth + 1))
+		Expect(node).To(BeAssignableToTypeOf(""))
+		Expect(node).To(ContainSubstring("a:{"))
+	})
+})
+
+var _ = Describe("decodeGemma4Array", func() {
+	DescribeTable("decodes gemma4 array bodies into JSON arrays",
+		func(body, expectedJSON string) {
+			Expect(decodeGemma4Array(body, 0)).To(MatchJSON(expectedJSON))
+		},
+		// Basic shapes (vLLM: test_string_array / test_empty_array / test_bare_values).
+		Entry("string array", `<|"|>a<|"|>,<|"|>b<|"|>`, `["a","b"]`),
+		Entry("empty array", "", `[]`),
+		Entry("bare values", "42,true,3.14", `[42,true,3.14]`),
+		// A ']' between string delimiters is text and must not close the
+		// array (vLLM: test_string_element_with_closing_bracket).
+		Entry("string element with closing bracket", `[<|"|>a]b<|"|>,<|"|>c<|"|>],<|"|>tail<|"|>`, `[["a]b","c"],"tail"]`),
+		// An unmatched ']' aborts decoding; the elements read so far are kept (vLLM: test_stray_closing_bracket).
+		Entry("stray closing bracket", "42,]trailing", `[42]`),
+	)
+})
--- a/backend/go/dllm/gemma4_renderer.go
+++ b/backend/go/dllm/gemma4_renderer.go
--- a/backend/go/dllm/gemma4_renderer_test.go
+++ b/backend/go/dllm/gemma4_renderer_test.go
@@ -0,0 +1,347 @@
+package main
+
+// Renderer specs for RenderGemma4 against the canonical gemma4 chat template
+// (see the normative template comment in gemma4_renderer.go).
+//
+// Fixture provenance:
+//   - "single user message" and "enable_thinking" are the EXACT expected
+//     decodes from transformers tests/models/diffusion_gemma/
+//     test_modeling_diffusion_gemma.py (test_diffusion_gemma_chat_template
+//     and ..._with_thinking) with ONE difference: the transformers fixtures
+//     start with "<bos>" because apply_chat_template tokenizes the rendered
+//     text with add_bos. Our prompt goes through dllm_capi_generate, whose
+//     run_generate already tokenizes with prepend_bos = vocab.add_bos
+//     (dllm.cpp src/capi.cpp:230-231, true for gemma4), so the renderer must
+//     NOT emit a literal <bos> (it would double) and every expected string
+//     here drops that leading token.
+//   - All other expected strings were produced by rendering the verbatim
+//     GGUF template with jinja2 3.1.2 (bos_token="<bos>") and dropping the
+//     leading "<bos>" for the same reason.
+
+import (
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+)
+
+// testToolsJSON is the two-function tools array (get_weather, get_time) used by
+// the tool fixtures, in the OpenAI wire shape LocalAI passes through PredictOptions.Tools.
+const testToolsJSON = `[{"type":"function","function":{"name":"get_weather","description":"Get the current weather in a location.","parameters":{"type":"object","properties":{"location":{"type":"string","description":"The city name."},"unit":{"type":"string","enum":["celsius","fahrenheit"]}},"required":["location"]}}},{"type":"function","function":{"name":"get_time","description":"Get the current time in a timezone.","parameters":{"type":"object","properties":{"timezone":{"type":"string","description":"IANA timezone name."}},"required":["timezone"]}}}]`
+
+// testToolsBlock is the <|tool>...<tool|> text the template renders for
+// testToolsJSON inside the system turn (jinja2-verified).
+const testToolsBlock = `<|tool>declaration:get_weather{description:<|"|>Get the current weather in a location.<|"|>,parameters:{properties:{location:{description:<|"|>The city name.<|"|>,type:<|"|>STRING<|"|>},unit:{enum:[<|"|>celsius<|"|>,<|"|>fahrenheit<|"|>],type:<|"|>STRING<|"|>}},required:[<|"|>location<|"|>],type:<|"|>OBJECT<|"|>}}<tool|><|tool>declaration:get_time{description:<|"|>Get the current time in a timezone.<|"|>,parameters:{properties:{timezone:{description:<|"|>IANA timezone name.<|"|>,type:<|"|>STRING<|"|>}},required:[<|"|>timezone<|"|>],type:<|"|>OBJECT<|"|>}}<tool|>`
+
+// complexToolsJSON is a single tool that exercises the deep format_parameters
+// branches: array items (string-typed and nested-array), nullable, enum+nullable,
+// nested object properties/required, and a response declaration.
+const complexToolsJSON = `[{"type":"function","function":{"name":"complex_tool","description":"A complex tool.","parameters":{"type":"object","properties":{"tags":{"type":"array","description":"Tags.","items":{"type":"string"}},"matrix":{"type":"array","items":{"type":"array","items":{"type":"number"}}},"opts":{"type":"object","description":"Options.","properties":{"depth":{"type":"integer","nullable":true}},"required":["depth"]},"mode":{"type":"string","enum":["a","b"],"nullable":true}},"required":["tags","opts"]},"response":{"description":"The result.","type":"object"}}}]`
+
+// complexToolsBlock is the jinja2-verified render of complexToolsJSON. Template
+// quirks pinned here: nested array items go through format_argument with ESCAPED
+// keys and an un-uppercased type (<|"|>type<|"|>:<|"|>number<|"|>), while direct
+// item types are uppercased; properties are dictsorted case-insensitively.
+const complexToolsBlock = `<|tool>declaration:complex_tool{description:<|"|>A complex tool.<|"|>,parameters:{properties:{matrix:{items:{items:{<|"|>type<|"|>:<|"|>number<|"|>},type:<|"|>ARRAY<|"|>},type:<|"|>ARRAY<|"|>},mode:{enum:[<|"|>a<|"|>,<|"|>b<|"|>],nullable:true,type:<|"|>STRING<|"|>},opts:{description:<|"|>Options.<|"|>,properties:{depth:{nullable:true,type:<|"|>INTEGER<|"|>}},required:[<|"|>depth<|"|>],type:<|"|>OBJECT<|"|>},tags:{description:<|"|>Tags.<|"|>,items:{type:<|"|>STRING<|"|>},type:<|"|>ARRAY<|"|>}},required:[<|"|>tags<|"|>,<|"|>opts<|"|>],type:<|"|>OBJECT<|"|>},response:{description:<|"|>The result.<|"|>,type:<|"|>OBJECT<|"|>}}<tool|>`
+
+type renderGemma4Case struct {
+	msgs               []*pb.Message // conversation passed to RenderGemma4
+	toolsJSON          string        // tools JSON argument; left "" by entries without tools
+	enableThinking     bool          // forwarded as RenderGemma4's third argument
+	noGenerationPrompt bool          // inverted so the zero value is the common case
+	expected           string        // exact prompt the render must equal
+}
+
+var _ = Describe("RenderGemma4", func() {
+	DescribeTable("renders the canonical gemma4 prompt",
+		func(c renderGemma4Case) {
+			out, err := RenderGemma4(c.msgs, c.toolsJSON, c.enableThinking, !c.noGenerationPrompt)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(out).To(Equal(c.expected))
+			// The C-ABI generate prepends BOS itself, so a literal <bos>
+			// anywhere in the rendered prompt would yield a doubled BOS.
+			Expect(out).ToNot(ContainSubstring("<bos>"))
+		},
+
+		// transformers fixture (test_diffusion_gemma_chat_template), sans <bos>:
+		// with enable_thinking left false, the generation prompt pre-opens an
+		// EMPTY thought channel (<|channel>thought\n<channel|>).
+		Entry("single user message, default (no thinking)", renderGemma4Case{
+			msgs: []*pb.Message{
+				{Role: "user", Content: "Write a long essay about Portugal."},
+			},
+			expected: "<|turn>user\nWrite a long essay about Portugal.<turn|>\n<|turn>model\n<|channel>thought\n<channel|>",
+		}),
+
+		// transformers fixture (test_diffusion_gemma_chat_template_with_thinking),
+		// sans <bos>: enable_thinking=true adds a system turn carrying <|think|>
+		// and does NOT pre-open a thought channel.
+		Entry("enable_thinking=true", renderGemma4Case{
+			msgs: []*pb.Message{
+				{Role: "user", Content: "Write a long essay about Portugal."},
+			},
+			enableThinking: true,
+			expected:       "<|turn>system\n<|think|>\n<turn|>\n<|turn>user\nWrite a long essay about Portugal.<turn|>\n<|turn>model\n",
+		}),
+
+		Entry("multi-turn user/assistant/user", renderGemma4Case{ // assistant history renders as a closed model turn
+			msgs: []*pb.Message{
+				{Role: "user", Content: "Hello, who are you?"},
+				{Role: "assistant", Content: "I am Gemma, a helpful assistant."},
+				{Role: "user", Content: "Tell me a joke."},
+			},
+			expected: "<|turn>user\nHello, who are you?<turn|>\n<|turn>model\nI am Gemma, a helpful assistant.<turn|>\n<|turn>user\nTell me a joke.<turn|>\n<|turn>model\n<|channel>thought\n<channel|>",
+		}),
+
+		// tpl L178-L195: a leading system message is folded into the system
+		// turn (trimmed) and consumed from the loop.
+		Entry("system message folds into the system turn", renderGemma4Case{
+			msgs: []*pb.Message{
+				{Role: "system", Content: "You are a pirate."},
+				{Role: "user", Content: "Hello!"},
+			},
+			expected: "<|turn>system\nYou are a pirate.<turn|>\n<|turn>user\nHello!<turn|>\n<|turn>model\n<|channel>thought\n<channel|>",
+		}),
+
+		// tpl L182-L185: <|think|> goes at the very top of the SAME system
+		// turn, before the system prompt text.
+		Entry("system message with enable_thinking shares the turn", renderGemma4Case{
+			msgs: []*pb.Message{
+				{Role: "system", Content: "You are a pirate."},
+				{Role: "user", Content: "Hello!"},
+			},
+			enableThinking: true,
+			expected:       "<|turn>system\n<|think|>\nYou are a pirate.<turn|>\n<|turn>user\nHello!<turn|>\n<|turn>model\n",
+		}),
+
+		// tpl L196-L203: tool declarations render in the system turn, one
+		// <|tool>declaration:...<tool|> block per tool, no separators.
+		Entry("tools array (two functions)", renderGemma4Case{
+			msgs: []*pb.Message{
+				{Role: "user", Content: "What is the weather in Tokyo?"},
+			},
+			toolsJSON: testToolsJSON,
+			expected:  "<|turn>system\n" + testToolsBlock + "<turn|>\n<|turn>user\nWhat is the weather in Tokyo?<turn|>\n<|turn>model\n<|channel>thought\n<channel|>",
+		}),
+
+		// format_parameters deep branches (tpl L1-L85) + response declaration
+		// (tpl L106-L116).
+		Entry("complex tool schema (array items, nullable, nested object, response)", renderGemma4Case{
+			msgs: []*pb.Message{
+				{Role: "user", Content: "go"},
+			},
+			toolsJSON: complexToolsJSON,
+			expected:  "<|turn>system\n" + complexToolsBlock + "<turn|>\n<|turn>user\ngo<turn|>\n<|turn>model\n<|channel>thought\n<channel|>",
+		}),
+
+		// tpl L243-L313: assistant tool_calls render as
+		// <|tool_call>call:name{args}<tool_call|>; the following role=tool
+		// message renders inline as <|tool_response>response:name{value:..}
+		// <tool_response|>; the model turn stays OPEN (no <turn|>, no new
+		// generation prompt) so the model continues after the response.
+		Entry("assistant tool_calls + role=tool result", renderGemma4Case{
+			msgs: []*pb.Message{
+				{Role: "user", Content: "What is the weather in Tokyo?"},
+				{Role: "assistant", Content: "", ToolCalls: `[{"index":0,"id":"call_1","type":"function","function":{"name":"get_weather","arguments":"{\"location\":\"Tokyo\",\"unit\":\"celsius\"}"}}]`},
+				{Role: "tool", ToolCallId: "call_1", Content: "Sunny, 22 degrees celsius."},
+			},
+			toolsJSON: testToolsJSON,
+			expected:  "<|turn>system\n" + testToolsBlock + "<turn|>\n<|turn>user\nWhat is the weather in Tokyo?<turn|>\n<|turn>model\n" + `<|tool_call>call:get_weather{location:<|"|>Tokyo<|"|>,unit:<|"|>celsius<|"|>}<tool_call|><|tool_response>response:get_weather{value:<|"|>Sunny, 22 degrees celsius.<|"|>}<tool_response|>`,
+		}),
+
+		// tpl L348-L349: a tool_calls turn with no rendered responses ends
+		// on an OPEN <|tool_response> marker for the runtime to fill, and
+		// add_generation_prompt adds nothing (tpl L357).
+		Entry("assistant tool_calls without a result leaves <|tool_response> open", renderGemma4Case{
+			msgs: []*pb.Message{
+				{Role: "user", Content: "What is the weather in Tokyo?"},
+				{Role: "assistant", Content: "", ToolCalls: `[{"index":0,"id":"call_1","type":"function","function":{"name":"get_weather","arguments":"{\"location\":\"Tokyo\",\"unit\":\"celsius\"}"}}]`},
+			},
+			toolsJSON: testToolsJSON,
+			expected:  "<|turn>system\n" + testToolsBlock + "<turn|>\n<|turn>user\nWhat is the weather in Tokyo?<turn|>\n<|turn>model\n" + `<|tool_call>call:get_weather{location:<|"|>Tokyo<|"|>,unit:<|"|>celsius<|"|>}<tool_call|><|tool_response>`,
+		}),
+
+		// tpl L237-L241: reasoning_content renders as a thought channel only
+		// on a tool-calling turn after the last user message.
+		Entry("reasoning_content with tool_calls renders the thought channel", renderGemma4Case{
+			msgs: []*pb.Message{
+				{Role: "user", Content: "weather?"},
+				{Role: "assistant", Content: "", ReasoningContent: "I should call the tool", ToolCalls: `[{"index":0,"id":"c1","type":"function","function":{"name":"get_weather","arguments":"{\"location\":\"Tokyo\"}"}}]`},
+				{Role: "tool", ToolCallId: "c1", Content: "Sunny"},
+			},
+			expected: "<|turn>user\nweather?<turn|>\n<|turn>model\n<|channel>thought\nI should call the tool\n<channel|>" + `<|tool_call>call:get_weather{location:<|"|>Tokyo<|"|>}<tool_call|><|tool_response>response:get_weather{value:<|"|>Sunny<|"|>}<tool_response|>`,
+		}),
+
+		// tpl L220-L235: the assistant answer following its own tool round
+		// continues the SAME model turn (no second <|turn>model).
+		Entry("tool round then final assistant answer then user", renderGemma4Case{
+			msgs: []*pb.Message{
+				{Role: "user", Content: "weather?"},
+				{Role: "assistant", Content: "", ToolCalls: `[{"index":0,"id":"c1","type":"function","function":{"name":"get_weather","arguments":"{\"location\":\"Tokyo\"}"}}]`},
+				{Role: "tool", ToolCallId: "c1", Content: "Sunny"},
+				{Role: "assistant", Content: "It is sunny."},
+				{Role: "user", Content: "thanks"},
+			},
+			expected: "<|turn>user\nweather?<turn|>\n<|turn>model\n" + `<|tool_call>call:get_weather{location:<|"|>Tokyo<|"|>}<tool_call|><|tool_response>response:get_weather{value:<|"|>Sunny<|"|>}<tool_response|>` + "It is sunny.<turn|>\n<|turn>user\nthanks<turn|>\n<|turn>model\n<|channel>thought\n<channel|>",
+		}),
+
+		// format_argument (tpl L118-L147): numbers keep their JSON literal,
+		// booleans lower-case, nested maps have unquoted dictsorted keys,
+		// arrays bracketed; top-level args are dictsorted case-insensitively.
+		Entry("tool_call argument types (number/bool/nested/array)", renderGemma4Case{
+			msgs: []*pb.Message{
+				{Role: "user", Content: "go"},
+				{Role: "assistant", Content: "", ToolCalls: `[{"index":0,"id":"c1","type":"function","function":{"name":"f","arguments":"{\"count\":42,\"ratio\":3.5,\"flag\":true,\"off\":false,\"nested\":{\"x\":\"y\",\"n\":7},\"list\":[\"a\",1,true]}"}}]`},
+			},
+			expected: "<|turn>user\ngo<turn|>\n<|turn>model\n" + `<|tool_call>call:f{count:42,flag:true,list:[<|"|>a<|"|>,1,true],nested:{n:7,x:<|"|>y<|"|>},off:false,ratio:3.5}<tool_call|><|tool_response>`,
+		}),
+
+		// jinja dictsort is case-insensitive: alpha sorts before Beta.
+		Entry("tool_call argument dictsort is case-insensitive", renderGemma4Case{
+			msgs: []*pb.Message{
+				{Role: "user", Content: "go"},
+				{Role: "assistant", Content: "", ToolCalls: `[{"index":0,"id":"c1","type":"function","function":{"name":"f","arguments":"{\"Beta\":1,\"alpha\":2}"}}]`},
+			},
+			expected: "<|turn>user\ngo<turn|>\n<|turn>model\n<|tool_call>call:f{alpha:2,Beta:1}<tool_call|><|tool_response>",
+		}),
+
+		// jinja renders Python None as "None" (round-trips through vLLM's
+		// parser, which lowers "none" back to null).
+		Entry("tool_call null argument renders as None", renderGemma4Case{
+			msgs: []*pb.Message{
+				{Role: "user", Content: "go"},
+				{Role: "assistant", Content: "", ToolCalls: `[{"index":0,"id":"c1","type":"function","function":{"name":"f","arguments":"{\"maybe\":null}"}}]`},
+			},
+			expected: "<|turn>user\ngo<turn|>\n<|turn>model\n<|tool_call>call:f{maybe:None}<tool_call|><|tool_response>",
+		}),
+
+		Entry("tool_call empty arguments render empty braces", renderGemma4Case{ // "{}" arguments: nothing between the braces
+			msgs: []*pb.Message{
+				{Role: "user", Content: "go"},
+				{Role: "assistant", Content: "", ToolCalls: `[{"index":0,"id":"c1","type":"function","function":{"name":"f","arguments":"{}"}}]`},
+			},
+			expected: "<|turn>user\ngo<turn|>\n<|turn>model\n<|tool_call>call:f{}<tool_call|><|tool_response>",
+		}),
+
+		// tpl L253-L254: a non-object arguments string renders verbatim.
+		Entry("tool_call non-object string arguments render verbatim", renderGemma4Case{
+			msgs: []*pb.Message{
+				{Role: "user", Content: "go"},
+				{Role: "assistant", Content: "", ToolCalls: `[{"index":0,"id":"c1","type":"function","function":{"name":"f","arguments":"just text"}}]`},
+			},
+			expected: "<|turn>user\ngo<turn|>\n<|turn>model\n<|tool_call>call:f{just text}<tool_call|><|tool_response>",
+		}),
+
+		// tpl L278-L285: unmatched tool_call_id falls back to the tool
+		// message's own name.
+		Entry("tool result name falls back when tool_call_id does not match", renderGemma4Case{
+			msgs: []*pb.Message{
+				{Role: "user", Content: "go"},
+				{Role: "assistant", Content: "", ToolCalls: `[{"index":0,"id":"c1","type":"function","function":{"name":"f","arguments":"{}"}}]`},
+				{Role: "tool", ToolCallId: "OTHER", Name: "named_tool", Content: "out"},
+			},
+			expected: "<|turn>user\ngo<turn|>\n<|turn>model\n" + `<|tool_call>call:f{}<tool_call|><|tool_response>response:named_tool{value:<|"|>out<|"|>}<tool_response|>`,
+		}),
+
+		// strip_thinking (tpl L148-L158): historical assistant content loses
+		// its <|channel>...<channel|> spans.
+		Entry("assistant content thinking channels are stripped", renderGemma4Case{
+			msgs: []*pb.Message{
+				{Role: "user", Content: "hi"},
+				{Role: "assistant", Content: "<|channel>thought\nsecret\n<channel|>visible answer"},
+				{Role: "user", Content: "more"},
+			},
+			expected: "<|turn>user\nhi<turn|>\n<|turn>model\nvisible answer<turn|>\n<|turn>user\nmore<turn|>\n<|turn>model\n<|channel>thought\n<channel|>",
+		}),
+
+		// tpl L220-L235: consecutive assistant messages suppress the second
+		// <|turn>model (continuation), but each still closes with <turn|>.
+		Entry("consecutive assistant messages continue the model turn", renderGemma4Case{
+			msgs: []*pb.Message{
+				{Role: "user", Content: "hi"},
+				{Role: "assistant", Content: "part one"},
+				{Role: "assistant", Content: "part two"},
+				{Role: "user", Content: "ok"},
+			},
+			expected: "<|turn>user\nhi<turn|>\n<|turn>model\npart one<turn|>\npart two<turn|>\n<|turn>user\nok<turn|>\n<|turn>model\n<|channel>thought\n<channel|>",
+		}),
+
+		Entry("add_generation_prompt=false renders no model turn", renderGemma4Case{ // output stops after the user turn
+			msgs: []*pb.Message{
+				{Role: "user", Content: "hi"},
+			},
+			noGenerationPrompt: true,
+			expected:           "<|turn>user\nhi<turn|>\n",
+		}),
+	)
+
+	Describe("error handling", func() {
+		// userHi builds a fresh one-message conversation for the specs that
+		// only exercise the tools JSON argument.
+		userHi := func() []*pb.Message {
+			return []*pb.Message{{Role: "user", Content: "hi"}}
+		}
+
+		// expectRenderError renders with thinking disabled and the generation
+		// prompt enabled (the arguments every spec here uses) and requires an
+		// error whose message contains wantSubstr.
+		// A nil err fails the first Expect, which aborts the spec before err.Error().
+		expectRenderError := func(msgs []*pb.Message, toolsJSON, wantSubstr string) {
+			_, err := RenderGemma4(msgs, toolsJSON, false, true)
+			Expect(err).To(HaveOccurred())
+			Expect(err.Error()).To(ContainSubstring(wantSubstr))
+		}
+
+		It("fails loud on an unknown role", func() {
+			// "narrator" is not a role the renderer accepts.
+			expectRenderError([]*pb.Message{
+				{Role: "narrator", Content: "Meanwhile..."},
+			}, "", `unknown role "narrator"`)
+		})
+
+		It("fails on invalid tools JSON", func() {
+			// The tools argument is not JSON at all.
+			expectRenderError(userHi(), "{not json", "tools JSON")
+		})
+
+		It("fails on invalid tool_calls JSON", func() {
+			// The malformed JSON sits in the assistant message's ToolCalls field.
+			expectRenderError([]*pb.Message{
+				{Role: "user", Content: "hi"},
+				{Role: "assistant", Content: "", ToolCalls: "{not json"},
+			}, "", "tool_calls JSON")
+		})
+
+		It("fails on an orphan tool message, naming its index", func() {
+			// The jinja template would silently drop a role:tool message that
+			// has no preceding assistant tool_calls turn; the renderer fails
+			// loud instead and reports the message's index (1 here).
+			expectRenderError([]*pb.Message{
+				{Role: "user", Content: "hi"},
+				{Role: "tool", Content: `{"temp": 20}`, ToolCallId: "call_1"},
+			}, "", "orphan tool message 1")
+		})
+
+		It("fails on trailing garbage after the tools JSON array", func() {
+			// A valid JSON array followed by extra text.
+			expectRenderError(userHi(), "[] junk", "tools JSON")
+		})
+
+		It("fails when the tools JSON is not an array", func() {
+			// Valid JSON, but an object where an array is required.
+			expectRenderError(userHi(), `{"type":"function"}`, "tools JSON is not an array")
+		})
+
+		It("fails when a tools array element is not an object", func() {
+			// An array whose only element is a number.
+			expectRenderError(userHi(), `[42]`, "tools[0] is not an object")
+		})
+
+		It("rejects a nil message via the unknown-role check", func() {
+			// Pins current behavior: pb getters are nil-safe, so a nil message
+			// reads as role "" and trips the same fail-loud unknown-role
+			// guard.
+			expectRenderError([]*pb.Message{nil}, "", `unknown role "" in message 0`)
+		})
+	})
+})