mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-11 18:27:32 -04:00
feat(dllm): gemma4 streaming parser emitting ChatDeltas
Fragment-safe state machine (content / channel header / thought / tool-call / done) classifying model output into content, reasoning_content and tool_calls deltas. Tool-call payload decoder is a non-partial port of vLLM's gemma4 parser grammar; ~25 of its test cases are ported with citations, plus a 2-split invariance property over every byte position. Recursion depth-capped against model-generated deep nesting; marker constants shared with the renderer. Assisted-by: Claude Code (Fable 5) Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
562
backend/go/dllm/gemma4_parser.go
Executable file
562
backend/go/dllm/gemma4_parser.go
Executable file
@@ -0,0 +1,562 @@
|
||||
// Gemma4 (DiffusionGemma) streaming output parser: raw model text, fed in
|
||||
// arbitrary fragments (per committed diffusion block; a fragment can split
|
||||
// anywhere, including mid-marker and mid-payload), is turned into
|
||||
// pb.ChatDelta events (content / reasoning_content / tool_calls).
|
||||
//
|
||||
// Normative sources:
|
||||
// - The chat template embedded at the top of gemma4_renderer.go ("tpl L<n>"
|
||||
// citations below refer to its numbered lines). The OUTPUT format mirrors
|
||||
// what the template renders for assistant history: thought channels
|
||||
// (<|channel>thought\n ... <channel|>, tpl L240), tool calls
|
||||
// (<|tool_call>call:name{...}<tool_call|>, tpl L246-L257) and turn ends
|
||||
// (<turn|>, tpl L351).
|
||||
// - vLLM PR #45163: vllm/tool_parsers/gemma4_tool_parser.py (marker
|
||||
// handling, the call:name{...} argument grammar and its decoder, ported
|
||||
// below) and vllm/reasoning/gemma4_reasoning_parser.py (channel markers,
|
||||
// the "thought\n" role label, is_reasoning_end semantics).
|
||||
//
|
||||
// Initial state (derived from the generation prompt, tpl L356-L362, see
|
||||
// RenderGemma4):
|
||||
// - enable_thinking=false: the prompt ends with "<|turn>model\n" +
|
||||
// "<|channel>thought\n<channel|>" - an EMPTY thought channel, pre-opened
|
||||
// AND pre-closed by the template. The model's output therefore starts in
|
||||
// plain content. Use NewGemma4Parser(false).
|
||||
// - enable_thinking=true: the prompt ends at "<|turn>model\n" and the model
|
||||
// opens and closes its own thought channel in the OUTPUT
|
||||
// ("<|channel>thought\n...reasoning...<channel|>final answer", per the
|
||||
// vLLM Gemma4ReasoningParser docstring). The parser still starts in
|
||||
// content state - the channel markers in the output drive the switch.
|
||||
// Use NewGemma4Parser(false) here too.
|
||||
// - NewGemma4Parser(true) is for callers that pre-open the thought channel
|
||||
// in the prompt themselves (appending "<|channel>thought\n" after the
|
||||
// generation prompt to force thinking): the output then begins mid-thought
|
||||
// and everything is reasoning until the first <channel|>.
|
||||
//
|
||||
// State diagram (markers are consumed, never emitted):
|
||||
//
//	               <|channel>                  \n                (channel name dropped: the
//	[content] --------------> [chan-header] ----> [thought]     "thought\n" role
//	  ^ |  <channel|> (stray close: swallowed,                   label, stripped
//	  +-+  strip_thinking semantics, tpl L148-L158)              like vLLM does)
//	  ^                 <channel|>
//	  +----------------------------------------- [thought]
//	  ^  <tool_call|>                  | <|tool_call>   (implicit
//	  +-------------- [tool-call] <-------------------+   reasoning end, vLLM
//	  |  <|tool_call>      ^                              is_reasoning_end)
//	  +-------------------+
//	[content]/[thought] --- <turn|> ---> [done]   (everything after is dropped)
|
||||
//
|
||||
// Buffering rules:
|
||||
// - content/thought states hold back at most len(longest marker)-1 bytes:
|
||||
// the longest tail that is still a proper prefix of a watched marker.
|
||||
// Content is otherwise emitted immediately (no unbounded buffering).
|
||||
// - the tool-call state buffers the whole payload until <tool_call|>. This
|
||||
// is unbounded in principle but bounded in practice by the model's
|
||||
// diffusion canvas, and is required because the call:name{...} payload
|
||||
// only becomes decodable (and trustworthy) once complete - the same
|
||||
// reason vLLM's parser accumulates before parsing.
|
||||
// - Close() flushes whatever is still held: partial markers come out as
|
||||
// content/reasoning (per the state that held them); an unterminated
|
||||
// channel header or tool-call payload is re-emitted RAW (including its
|
||||
// opening marker) as content - malformed output is never silently
|
||||
// dropped (mirrors vLLM extract_tool_calls returning the raw text as
|
||||
// content when its regex does not match).
|
||||
//
|
||||
// Streaming granularity DIVERGENCE from vLLM: vLLM re-parses the partial
|
||||
// payload on every token and streams argument-JSON diffs (its `partial=True`
|
||||
// decoder mode plus withholding logic exist only for that). Our fragments are
|
||||
// whole committed diffusion blocks, so each completed tool call is emitted
|
||||
// once, as a single ToolCallDelta carrying index + id + name + the full
|
||||
// arguments JSON - exactly the shape backend/python/vllm/backend.py emits
|
||||
// per call and pkg/functions.ToolCallsFromChatDeltas re-accumulates.
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||
)
|
||||
|
||||
// gemma4CallRE is vLLM's tool_call_regex
// (`<\|tool_call>call:([\w\-\.]+)\{(.*?)\}<tool_call\|>`, DOTALL) anchored to
// a single already-extracted payload: name charset [\w\-.], braces mandatory.
//
// Group 1 captures the function name and group 2 the raw argument body.
// Unlike the vLLM pattern the body group here is greedy: because the
// expression is anchored with ^ and $, it runs up to the final '}' of the
// payload, so closing braces inside the arguments stay part of the body.
var gemma4CallRE = regexp.MustCompile(`(?s)^call:([\w\-.]+)\{(.*)\}$`)
|
||||
|
||||
// g4State identifies which part of the model output the parser is currently
// inside; it decides how the next bytes are classified.
type g4State int

const (
	// g4Content: plain answer text (also the zero value).
	g4Content g4State = iota
	// g4ChanHeader: <|channel> was consumed; waiting for the "\n" that ends
	// the channel name.
	g4ChanHeader
	// g4Thought: inside a thought channel; text is emitted as reasoning.
	g4Thought
	// g4ToolCall: <|tool_call> was consumed; the payload is buffered until
	// <tool_call|> arrives.
	g4ToolCall
	// g4Done: the turn ended (<turn|> seen or Close called); further input
	// is ignored.
	g4Done
)
|
||||
|
||||
// Markers watched per emitting state. A stray <tool_call|> outside a tool
// call is deliberately NOT watched: it passes through verbatim, consistent
// with the malformed-payload fallback re-emitting it as content.
var (
	// gemma4ContentMarkers are the markers searched for while in content.
	gemma4ContentMarkers = []string{gemma4ChannelOpen, gemma4ChannelClose, gemma4ToolCallOpen, gemma4TurnEnd}
	// gemma4ThoughtMarkers are the markers searched for while in a thought
	// channel. The channel-open marker is not in this list, so inside a
	// thought it is emitted like any other reasoning text.
	gemma4ThoughtMarkers = []string{gemma4ChannelClose, gemma4ToolCallOpen, gemma4TurnEnd}
)
|
||||
|
||||
// Gemma4Parser is the streaming state machine that turns raw model output,
// fed fragment by fragment through Feed and finished with Close, into
// pb.ChatDelta events.
type Gemma4Parser struct {
	// state is the part of the output the parser is currently inside.
	state g4State
	// held is the per-state carry-over between Feed calls: a partial marker
	// (content/thought), a partial channel header (chan-header) or the
	// payload accumulated so far (tool-call).
	held string
	// toolIdx is the index given to the next successfully decoded tool call;
	// the call id is derived from it as well.
	toolIdx int
}
|
||||
|
||||
// NewGemma4Parser returns a parser positioned per the initial-state rules in
|
||||
// the header comment: startInThought=true only when the caller pre-opened a
|
||||
// thought channel in the prompt.
|
||||
func NewGemma4Parser(startInThought bool) *Gemma4Parser {
|
||||
state := g4Content
|
||||
if startInThought {
|
||||
state = g4Thought
|
||||
}
|
||||
return &Gemma4Parser{state: state}
|
||||
}
|
||||
|
||||
// Feed consumes the next output fragment and returns the deltas it completes.
|
||||
func (p *Gemma4Parser) Feed(text string) []*pb.ChatDelta {
|
||||
if text == "" || p.state == g4Done {
|
||||
return nil
|
||||
}
|
||||
pending := p.held + text
|
||||
p.held = ""
|
||||
var em g4Emitter
|
||||
for pending != "" {
|
||||
switch p.state {
|
||||
case g4Content, g4Thought:
|
||||
markers := gemma4ContentMarkers
|
||||
if p.state == g4Thought {
|
||||
markers = gemma4ThoughtMarkers
|
||||
}
|
||||
idx, marker := findEarliestGemma4Marker(pending, markers)
|
||||
if idx == -1 {
|
||||
hold := gemma4MarkerHoldback(pending, markers)
|
||||
p.emitText(&em, pending[:len(pending)-hold])
|
||||
p.held = pending[len(pending)-hold:]
|
||||
pending = ""
|
||||
continue
|
||||
}
|
||||
p.emitText(&em, pending[:idx])
|
||||
pending = pending[idx+len(marker):]
|
||||
switch marker {
|
||||
case gemma4ChannelOpen:
|
||||
p.state = g4ChanHeader
|
||||
case gemma4ChannelClose:
|
||||
// In thought: channel ends. In content: stray close,
|
||||
// swallowed (strip_thinking keeps both sides, tpl L148-L158).
|
||||
p.state = g4Content
|
||||
case gemma4ToolCallOpen:
|
||||
p.state = g4ToolCall
|
||||
case gemma4TurnEnd:
|
||||
p.state = g4Done
|
||||
}
|
||||
case g4ChanHeader:
|
||||
// The channel header is "<name>\n"; the template only ever writes
|
||||
// "thought" (tpl L240/L360) and the label is structural, so it is
|
||||
// dropped, not emitted (vLLM strips the same "thought\n" prefix).
|
||||
nl := strings.IndexByte(pending, '\n')
|
||||
if nl == -1 {
|
||||
p.held = pending
|
||||
pending = ""
|
||||
continue
|
||||
}
|
||||
pending = pending[nl+1:]
|
||||
p.state = g4Thought
|
||||
case g4ToolCall:
|
||||
end := strings.Index(pending, gemma4ToolCallClose)
|
||||
if end == -1 {
|
||||
p.held = pending
|
||||
pending = ""
|
||||
continue
|
||||
}
|
||||
p.emitToolCall(&em, pending[:end])
|
||||
pending = pending[end+len(gemma4ToolCallClose):]
|
||||
p.state = g4Content
|
||||
case g4Done:
|
||||
pending = ""
|
||||
}
|
||||
}
|
||||
return em.deltas
|
||||
}
|
||||
|
||||
// Close flushes held-back partials. Incomplete structures (open channel
|
||||
// header, unterminated tool payload) are re-emitted raw as content rather
|
||||
// than dropped. The parser is finished afterwards.
|
||||
func (p *Gemma4Parser) Close() []*pb.ChatDelta {
|
||||
var em g4Emitter
|
||||
switch p.state {
|
||||
case g4Content:
|
||||
em.content(p.held)
|
||||
case g4Thought:
|
||||
em.reasoning(p.held)
|
||||
case g4ChanHeader:
|
||||
em.content(gemma4ChannelOpen + p.held)
|
||||
case g4ToolCall:
|
||||
em.content(gemma4ToolCallOpen + p.held)
|
||||
case g4Done:
|
||||
}
|
||||
p.held = ""
|
||||
p.state = g4Done
|
||||
return em.deltas
|
||||
}
|
||||
|
||||
func (p *Gemma4Parser) emitText(em *g4Emitter, s string) {
|
||||
if p.state == g4Thought {
|
||||
em.reasoning(s)
|
||||
return
|
||||
}
|
||||
em.content(s)
|
||||
}
|
||||
|
||||
// emitToolCall decodes one complete <|tool_call>...<tool_call|> payload. On a
|
||||
// payload that does not match call:name{...} the raw text (markers included)
|
||||
// is emitted as content, mirroring vLLM's extract_tool_calls fallback.
|
||||
func (p *Gemma4Parser) emitToolCall(em *g4Emitter, payload string) {
|
||||
m := gemma4CallRE.FindStringSubmatch(payload)
|
||||
if m == nil {
|
||||
em.content(gemma4ToolCallOpen + payload + gemma4ToolCallClose)
|
||||
return
|
||||
}
|
||||
// Index-based ids: deterministic (the split-invariance property relies
|
||||
// on it) and matching the call_<n> convention of pkg/grpc/rich_test.go;
|
||||
// core only needs ids to be non-empty and unique within the response.
|
||||
em.tool(p.toolIdx, "call_"+strconv.Itoa(p.toolIdx), m[1], decodeGemma4Args(m[2], 0))
|
||||
p.toolIdx++
|
||||
}
|
||||
|
||||
// g4Emitter collects ChatDeltas; empty text events are dropped.
|
||||
type g4Emitter struct {
|
||||
deltas []*pb.ChatDelta
|
||||
}
|
||||
|
||||
func (e *g4Emitter) content(s string) {
|
||||
if s != "" {
|
||||
e.deltas = append(e.deltas, &pb.ChatDelta{Content: s})
|
||||
}
|
||||
}
|
||||
|
||||
func (e *g4Emitter) reasoning(s string) {
|
||||
if s != "" {
|
||||
e.deltas = append(e.deltas, &pb.ChatDelta{ReasoningContent: s})
|
||||
}
|
||||
}
|
||||
|
||||
func (e *g4Emitter) tool(index int, id, name, argsJSON string) {
|
||||
e.deltas = append(e.deltas, &pb.ChatDelta{ToolCalls: []*pb.ToolCallDelta{{
|
||||
Index: int32(index),
|
||||
Id: id,
|
||||
Name: name,
|
||||
Arguments: argsJSON,
|
||||
}}})
|
||||
}
|
||||
|
||||
// findEarliestGemma4Marker looks for every marker in s and reports the one
// that starts first, as (byte offset, marker). When two markers start at the
// same offset the one listed first wins. Without any match it returns
// (-1, "").
func findEarliestGemma4Marker(s string, markers []string) (int, string) {
	pos, hit := -1, ""
	for _, cand := range markers {
		at := strings.Index(s, cand)
		if at < 0 {
			continue
		}
		if pos >= 0 && at >= pos {
			continue // an earlier (or equally early, earlier-listed) match stands
		}
		pos, hit = at, cand
	}
	return pos, hit
}
|
||||
|
||||
// gemma4MarkerHoldback reports how many trailing bytes of s must be kept
// back because they could still be the beginning of a watched marker. It is
// the length of the longest tail of s that some marker starts with, where
// the tail is never longer than s nor than the longest marker minus one
// byte. A result of 0 means all of s can be emitted.
func gemma4MarkerHoldback(s string, markers []string) int {
	// Upper bound on the tail length worth testing.
	limit := 0
	for _, m := range markers {
		if l := len(m) - 1; l > limit {
			limit = l
		}
	}
	if limit > len(s) {
		limit = len(s)
	}

	// Per marker, find its longest leading piece that s ends with; only
	// lengths beating the current best need to be tried.
	best := 0
	for _, m := range markers {
		top := len(m)
		if top > limit {
			top = limit
		}
		for k := top; k > best; k-- {
			if strings.HasSuffix(s, m[:k]) {
				best = k
				break
			}
		}
	}
	return best
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// call:name{...} argument decoder
|
||||
//
|
||||
// Port of vLLM's _parse_gemma4_args / _parse_gemma4_array /
|
||||
// _parse_gemma4_value (gemma4_tool_parser.py) in non-partial mode only: this
|
||||
// parser decodes exclusively COMPLETE payloads (incomplete ones fall back to
|
||||
// raw content at Close), so vLLM's partial-withholding machinery
|
||||
// (trailing-dot floats, withheld bare tails) is intentionally not ported.
|
||||
//
|
||||
// Grammar (inverse of the renderer's formatGemma4Argument, tpl L118-L147):
|
||||
//
|
||||
// args := pair (',' pair)*
|
||||
// pair := key ':' value (keys unquoted, up to the first ':')
|
||||
// value := string | object | array | bare
|
||||
// string := '<|"|>' ... '<|"|>' (no escapes; unterminated -> rest)
|
||||
// object := '{' args '}' (delimited strings skipped when
|
||||
// array := '[' value,* ']' counting braces/brackets)
|
||||
// bare := true | false | null/none/nil | number | bare-string
|
||||
//
|
||||
// Output is a JSON object/array string with keys in payload order (Python
|
||||
// dict insertion order), built with HTML escaping off so payload text
|
||||
// survives byte-for-byte.
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// isGemma4Space reports whether c is one of the three bytes the argument
// decoder skips as whitespace: space, newline or tab.
func isGemma4Space(c byte) bool {
	switch c {
	case ' ', '\n', '\t':
		return true
	}
	return false
}
|
||||
|
||||
// gemma4MaxArgsDepth caps the mutual recursion between decodeGemma4Args and
// decodeGemma4Array. Defense against model-generated deep nesting: a Go stack
// overflow is a fatal process kill, not a recoverable error, so past the cap
// a nested body gracefully degrades to a JSON string of its raw text.
//
// Both decoders test depth > gemma4MaxArgsDepth, so a body at exactly this
// depth is still decoded structurally; only deeper ones degrade.
const gemma4MaxArgsDepth = 100
|
||||
|
||||
// decodeGemma4Args decodes one args body (the text between the outer braces
|
||||
// of call:name{...}) into a JSON object string. depth is the current nesting
|
||||
// level (0 at the payload root); see gemma4MaxArgsDepth.
|
||||
func decodeGemma4Args(s string, depth int) string {
|
||||
if depth > gemma4MaxArgsDepth {
|
||||
return gemma4JSONString(s)
|
||||
}
|
||||
var b strings.Builder
|
||||
b.WriteString("{")
|
||||
first := true
|
||||
pair := func(key, val string) {
|
||||
if !first {
|
||||
b.WriteString(",")
|
||||
}
|
||||
first = false
|
||||
b.WriteString(gemma4JSONString(key))
|
||||
b.WriteString(":")
|
||||
b.WriteString(val)
|
||||
}
|
||||
i, n := 0, len(s)
|
||||
for i < n {
|
||||
for i < n && (isGemma4Space(s[i]) || s[i] == ',') {
|
||||
i++
|
||||
}
|
||||
if i >= n {
|
||||
break
|
||||
}
|
||||
keyStart := i
|
||||
for i < n && s[i] != ':' {
|
||||
i++
|
||||
}
|
||||
if i >= n {
|
||||
break // no ':' -> trailing junk, dropped (vLLM does the same)
|
||||
}
|
||||
key := strings.TrimSpace(s[keyStart:i])
|
||||
i++ // skip ':'
|
||||
for i < n && isGemma4Space(s[i]) {
|
||||
i++
|
||||
}
|
||||
if i >= n {
|
||||
pair(key, `""`) // "key:" with nothing after -> empty string
|
||||
break
|
||||
}
|
||||
switch {
|
||||
case strings.HasPrefix(s[i:], gemma4StringDelim):
|
||||
i += len(gemma4StringDelim)
|
||||
if end := strings.Index(s[i:], gemma4StringDelim); end == -1 {
|
||||
pair(key, gemma4JSONString(s[i:])) // unterminated -> take rest
|
||||
i = n
|
||||
} else {
|
||||
pair(key, gemma4JSONString(s[i:i+end]))
|
||||
i += end + len(gemma4StringDelim)
|
||||
}
|
||||
case s[i] == '{':
|
||||
inner, next := scanGemma4Balanced(s, i, '{', '}')
|
||||
pair(key, decodeGemma4Args(inner, depth+1))
|
||||
i = next
|
||||
case s[i] == '[':
|
||||
inner, next := scanGemma4Balanced(s, i, '[', ']')
|
||||
pair(key, decodeGemma4Array(inner, depth+1))
|
||||
i = next
|
||||
default:
|
||||
valStart := i
|
||||
for i < n && s[i] != ',' && s[i] != '}' && s[i] != ']' {
|
||||
i++
|
||||
}
|
||||
if i == valStart {
|
||||
// No progress (value starts on a stray '}'/']'): abort on
|
||||
// malformed input rather than loop, like vLLM.
|
||||
i = n
|
||||
continue
|
||||
}
|
||||
pair(key, decodeGemma4Bare(s[valStart:i]))
|
||||
}
|
||||
}
|
||||
b.WriteString("}")
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// decodeGemma4Array decodes one array body (the text between '[' and ']')
|
||||
// into a JSON array string. depth is the current nesting level; see
|
||||
// gemma4MaxArgsDepth.
|
||||
func decodeGemma4Array(s string, depth int) string {
|
||||
if depth > gemma4MaxArgsDepth {
|
||||
return gemma4JSONString(s)
|
||||
}
|
||||
var b strings.Builder
|
||||
b.WriteString("[")
|
||||
first := true
|
||||
item := func(val string) {
|
||||
if !first {
|
||||
b.WriteString(",")
|
||||
}
|
||||
first = false
|
||||
b.WriteString(val)
|
||||
}
|
||||
i, n := 0, len(s)
|
||||
for i < n {
|
||||
for i < n && (isGemma4Space(s[i]) || s[i] == ',') {
|
||||
i++
|
||||
}
|
||||
if i >= n {
|
||||
break
|
||||
}
|
||||
switch {
|
||||
case strings.HasPrefix(s[i:], gemma4StringDelim):
|
||||
i += len(gemma4StringDelim)
|
||||
if end := strings.Index(s[i:], gemma4StringDelim); end == -1 {
|
||||
item(gemma4JSONString(s[i:]))
|
||||
i = n
|
||||
} else {
|
||||
item(gemma4JSONString(s[i : i+end]))
|
||||
i += end + len(gemma4StringDelim)
|
||||
}
|
||||
case s[i] == '{':
|
||||
inner, next := scanGemma4Balanced(s, i, '{', '}')
|
||||
item(decodeGemma4Args(inner, depth+1))
|
||||
i = next
|
||||
case s[i] == '[':
|
||||
inner, next := scanGemma4Balanced(s, i, '[', ']')
|
||||
item(decodeGemma4Array(inner, depth+1))
|
||||
i = next
|
||||
default:
|
||||
valStart := i
|
||||
for i < n && s[i] != ',' && s[i] != ']' {
|
||||
i++
|
||||
}
|
||||
if i == valStart {
|
||||
i = n // no progress: abort on malformed input, like vLLM
|
||||
continue
|
||||
}
|
||||
item(decodeGemma4Bare(s[valStart:i]))
|
||||
}
|
||||
}
|
||||
b.WriteString("]")
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// scanGemma4Balanced scans a brace/bracket-balanced span starting at the
|
||||
// opener s[start], skipping over <|"|>-delimited strings so structural
|
||||
// characters inside them do not count (vLLM's depth scan). Returns the inner
|
||||
// text and the index just past the closer; an unterminated span yields the
|
||||
// rest of the string (the inner decoder still extracts what is there - this
|
||||
// path is only reachable from genuinely malformed complete payloads).
|
||||
func scanGemma4Balanced(s string, start int, open, close byte) (string, int) {
|
||||
depth := 1
|
||||
i := start + 1
|
||||
innerStart := i
|
||||
n := len(s)
|
||||
for i < n && depth > 0 {
|
||||
if strings.HasPrefix(s[i:], gemma4StringDelim) {
|
||||
i += len(gemma4StringDelim)
|
||||
if nd := strings.Index(s[i:], gemma4StringDelim); nd == -1 {
|
||||
i = n
|
||||
} else {
|
||||
i += nd + len(gemma4StringDelim)
|
||||
}
|
||||
continue
|
||||
}
|
||||
switch s[i] {
|
||||
case open:
|
||||
depth++
|
||||
case close:
|
||||
depth--
|
||||
}
|
||||
i++
|
||||
}
|
||||
if depth > 0 {
|
||||
return s[innerStart:], n
|
||||
}
|
||||
return s[innerStart : i-1], i
|
||||
}
|
||||
|
||||
// decodeGemma4Bare maps an undelimited value to its JSON form. After trimming
// surrounding whitespace, in this order:
//
//   - the empty string becomes "";
//   - true / false (exactly this spelling) stay booleans;
//   - null / none / nil, in any letter case, become null;
//   - a value containing '.' is tried as a decimal float;
//   - a value without '.' is tried as a decimal integer of any magnitude;
//   - everything else becomes a JSON string.
//
// Integers are not limited to int64: the source grammar comes from Python,
// whose int is arbitrary precision, so a value such as 18446744073709551615
// stays a JSON number instead of turning into a string. Hexadecimal floats
// (for example 0x1.8p1), which strconv.ParseFloat would accept but Python's
// float() does not, are left as strings.
func decodeGemma4Bare(raw string) string {
	v := strings.TrimSpace(raw)
	switch v {
	case "":
		return `""`
	case "true", "false":
		return v
	}
	switch strings.ToLower(v) {
	case "null", "none", "nil":
		return "null"
	}
	if strings.Contains(v, ".") {
		// Only hexadecimal float literals contain an 'x'; skip them.
		if !strings.ContainsAny(v, "xX") {
			if f, err := strconv.ParseFloat(v, 64); err == nil {
				return formatGemma4Float(f)
			}
		}
	} else if lit, ok := gemma4IntLiteral(v); ok {
		return lit
	}
	return gemma4JSONString(v)
}

// gemma4IntLiteral reports whether v is an optionally signed run of ASCII
// digits and, if so, returns it as a canonical JSON integer: no '+' sign, no
// leading zeros, and "0" for any spelling of zero. It works on the text
// itself, so the magnitude is unbounded.
func gemma4IntLiteral(v string) (string, bool) {
	digits, negative := v, false
	if digits != "" && (digits[0] == '+' || digits[0] == '-') {
		negative = digits[0] == '-'
		digits = digits[1:]
	}
	if digits == "" {
		return "", false
	}
	for i := 0; i < len(digits); i++ {
		if digits[i] < '0' || digits[i] > '9' {
			return "", false
		}
	}
	digits = strings.TrimLeft(digits, "0")
	switch {
	case digits == "":
		return "0", true
	case negative:
		return "-" + digits, true
	}
	return digits, true
}

// formatGemma4Float renders like Python's json.dumps(float): integral floats
// keep a ".0" suffix ("108." decodes to 108.0, not 108), so the arguments
// JSON matches what vLLM would have produced for the same payload. Values
// that format with an exponent are left as they are.
func formatGemma4Float(f float64) string {
	s := strconv.FormatFloat(f, 'g', -1, 64)
	if !strings.ContainsAny(s, ".eE") {
		s += ".0"
	}
	return s
}

// gemma4JSONString encodes s as a JSON string WITHOUT HTML escaping
// (json.Marshal would escape the angle brackets in "<div>" to \u003c / \u003e
// sequences; payload text should survive byte-for-byte, like Python's
// json.dumps(ensure_ascii=False)).
func gemma4JSONString(s string) string {
	var sb strings.Builder
	enc := json.NewEncoder(&sb)
	enc.SetEscapeHTML(false)
	if err := enc.Encode(s); err != nil {
		// Unreachable for plain strings; fall back to default escaping
		// rather than emitting invalid JSON.
		b, mErr := json.Marshal(s)
		if mErr != nil {
			return `""`
		}
		return string(b)
	}
	// Encode appends a trailing newline.
	return strings.TrimSuffix(sb.String(), "\n")
}
|
||||
592
backend/go/dllm/gemma4_parser_test.go
Executable file
592
backend/go/dllm/gemma4_parser_test.go
Executable file
@@ -0,0 +1,592 @@
|
||||
package main
|
||||
|
||||
// Parser specs for Gemma4Parser (model output text -> pb.ChatDelta events).
|
||||
//
|
||||
// Fixture provenance:
|
||||
// - Entries marked "vLLM: <name>" are direct ports of the named test from
|
||||
// vLLM PR #45163, tests/tool_parsers/test_gemma4_tool_parser.py (the
|
||||
// authoritative test-suite for the gemma4 tool-call wire format). The
|
||||
// streaming tests' chunk lists are reused verbatim as Feed fragments.
|
||||
// - Decoder entries port the TestParseGemma4Args / TestParseGemma4Array
|
||||
// classes from the same file (non-partial mode only; this parser never
|
||||
// decodes partial payloads, see the divergence note in gemma4_parser.go).
|
||||
// - Channel/turn-marker expectations come from the chat template embedded
|
||||
// in gemma4_renderer.go (tpl L356-L362 generation prompt, L148-L158
|
||||
// strip_thinking) and vLLM's Gemma4ReasoningParser
|
||||
// (vllm/reasoning/gemma4_reasoning_parser.py).
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
|
||||
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||
)
|
||||
|
||||
// flatGemma4Tool is one accumulated tool call, mirroring how LocalAI core
// folds ToolCallDelta streams (pkg/functions/chat_deltas.go
// ToolCallsFromChatDeltas: name/id latch on first non-empty, arguments
// concatenate per index). Tests flatten through the same rules so they
// assert exactly what core will reconstruct.
type flatGemma4Tool struct {
	id   string // non-empty id seen for this index
	name string // non-empty name seen for this index
	args string // all argument fragments for this index, concatenated in order
}
|
||||
|
||||
func flattenGemma4Deltas(deltas []*pb.ChatDelta) (string, string, []flatGemma4Tool) {
|
||||
var content, reasoning strings.Builder
|
||||
byIndex := map[int32]*flatGemma4Tool{}
|
||||
maxIdx := int32(-1)
|
||||
for _, d := range deltas {
|
||||
content.WriteString(d.GetContent())
|
||||
reasoning.WriteString(d.GetReasoningContent())
|
||||
for _, tc := range d.GetToolCalls() {
|
||||
acc, ok := byIndex[tc.GetIndex()]
|
||||
if !ok {
|
||||
acc = &flatGemma4Tool{}
|
||||
byIndex[tc.GetIndex()] = acc
|
||||
}
|
||||
if tc.GetName() != "" {
|
||||
acc.name = tc.GetName()
|
||||
}
|
||||
if tc.GetId() != "" {
|
||||
acc.id = tc.GetId()
|
||||
}
|
||||
acc.args += tc.GetArguments()
|
||||
if tc.GetIndex() > maxIdx {
|
||||
maxIdx = tc.GetIndex()
|
||||
}
|
||||
}
|
||||
}
|
||||
var tools []flatGemma4Tool
|
||||
for i := int32(0); i <= maxIdx; i++ {
|
||||
if acc, ok := byIndex[i]; ok {
|
||||
tools = append(tools, *acc)
|
||||
}
|
||||
}
|
||||
return content.String(), reasoning.String(), tools
|
||||
}
|
||||
|
||||
// wantGemma4Tool is the expected shape of one flattened tool call in a table
// entry.
type wantGemma4Tool struct {
	name     string // expected function name
	argsJSON string // compared with MatchJSON (key order irrelevant)
}
|
||||
|
||||
// parseGemma4Case is one table entry: the fragments fed to a fresh parser and
// the flattened result expected after Close.
type parseGemma4Case struct {
	startInThought bool             // passed to NewGemma4Parser
	fragments      []string         // fed to Feed in order
	wantContent    string           // expected concatenated content
	wantReasoning  string           // expected concatenated reasoning_content
	wantTools      []wantGemma4Tool // expected tool calls, in index order
}
|
||||
|
||||
func parseGemma4Fragments(startInThought bool, fragments []string) []*pb.ChatDelta {
|
||||
p := NewGemma4Parser(startInThought)
|
||||
var all []*pb.ChatDelta
|
||||
for _, f := range fragments {
|
||||
all = append(all, p.Feed(f)...)
|
||||
}
|
||||
return append(all, p.Close()...)
|
||||
}
|
||||
|
||||
var _ = Describe("Gemma4Parser", func() {
|
||||
DescribeTable("parses streamed gemma4 output into ChatDeltas",
|
||||
func(c parseGemma4Case) {
|
||||
content, reasoning, tools := flattenGemma4Deltas(parseGemma4Fragments(c.startInThought, c.fragments))
|
||||
Expect(content).To(Equal(c.wantContent))
|
||||
Expect(reasoning).To(Equal(c.wantReasoning))
|
||||
Expect(tools).To(HaveLen(len(c.wantTools)))
|
||||
seenIDs := map[string]bool{}
|
||||
for i, want := range c.wantTools {
|
||||
Expect(tools[i].name).To(Equal(want.name), "tool %d name", i)
|
||||
Expect(tools[i].args).To(MatchJSON(want.argsJSON), "tool %d arguments", i)
|
||||
Expect(tools[i].id).ToNot(BeEmpty(), "tool %d id", i)
|
||||
Expect(seenIDs).ToNot(HaveKey(tools[i].id), "tool %d id must be unique", i)
|
||||
seenIDs[tools[i].id] = true
|
||||
}
|
||||
},
|
||||
|
||||
// --- (1) pure content -------------------------------------------------
|
||||
// vLLM: test_no_tool_calls
|
||||
Entry("pure content, single fragment", parseGemma4Case{
|
||||
fragments: []string{"Hello, how can I help you today?"},
|
||||
wantContent: "Hello, how can I help you today?",
|
||||
}),
|
||||
|
||||
// --- (2) thought -> final transition ----------------------------------
|
||||
// enable_thinking render: prompt ends at <|turn>model\n and the model
|
||||
// opens/closes its own thought channel in the OUTPUT (vLLM
|
||||
// Gemma4ReasoningParser docstring; tpl L356-L362). The "thought\n"
|
||||
// role label after <|channel> is structural and must be stripped
|
||||
// (vLLM _THOUGHT_PREFIX handling).
|
||||
Entry("thought channel then final content", parseGemma4Case{
|
||||
fragments: []string{"<|channel>thought\nLet me think about this.\n<channel|>The answer is 42."},
|
||||
wantReasoning: "Let me think about this.\n",
|
||||
wantContent: "The answer is 42.",
|
||||
}),
|
||||
|
||||
// --- (3) startInThought both ways -------------------------------------
|
||||
Entry("startInThought=true routes initial text to reasoning until <channel|>", parseGemma4Case{
|
||||
startInThought: true,
|
||||
fragments: []string{"I am thinking hard.<channel|>Done."},
|
||||
wantReasoning: "I am thinking hard.",
|
||||
wantContent: "Done.",
|
||||
}),
|
||||
// A stray <channel|> with no open channel is swallowed, matching the
|
||||
// template's strip_thinking (tpl L148-L158: the marker is dropped,
|
||||
// text on both sides is kept).
|
||||
Entry("startInThought=false keeps the same text as content, stray <channel|> swallowed", parseGemma4Case{
|
||||
startInThought: false,
|
||||
fragments: []string{"I am thinking hard.<channel|>Done."},
|
||||
wantContent: "I am thinking hard.Done.",
|
||||
}),
|
||||
|
||||
// --- (4) one tool call, full payload type zoo --------------------------
|
||||
Entry("single tool call: strings, numbers, bools, null, nested object and array", parseGemma4Case{
|
||||
fragments: []string{`<|tool_call>call:complex_function{text:<|"|>with, comma and {braces}<|"|>,count:42,score:3.14,yes:true,no:false,nothing:null,obj:{inner:<|"|>v<|"|>,k:1},arr:[<|"|>a<|"|>,2,true]}<tool_call|>`},
|
||||
wantTools: []wantGemma4Tool{{
|
||||
name: "complex_function",
|
||||
argsJSON: `{"text":"with, comma and {braces}","count":42,"score":3.14,"yes":true,"no":false,"nothing":null,"obj":{"inner":"v","k":1},"arr":["a",2,true]}`,
|
||||
}},
|
||||
}),
|
||||
|
||||
// --- (5) payload split across 3 fragments ------------------------------
|
||||
Entry("tool-call payload split across three fragments", parseGemma4Case{
|
||||
fragments: []string{
|
||||
"<|tool_call>call:get_weather{loc",
|
||||
`ation:<|"|>Paris, Fra`,
|
||||
`nce<|"|>}<tool_call|>`,
|
||||
},
|
||||
wantTools: []wantGemma4Tool{{name: "get_weather", argsJSON: `{"location":"Paris, France"}`}},
|
||||
}),
|
||||
|
||||
// --- (6) marker split across fragments ----------------------------------
|
||||
Entry("tool-call open marker split across fragments", parseGemma4Case{
|
||||
fragments: []string{
|
||||
"<|tool_ca",
|
||||
`ll>call:get_weather{location:<|"|>London<|"|>}<tool_call|>`,
|
||||
},
|
||||
wantTools: []wantGemma4Tool{{name: "get_weather", argsJSON: `{"location":"London"}`}},
|
||||
}),
|
||||
Entry("channel open marker split across fragments", parseGemma4Case{
|
||||
fragments: []string{
|
||||
"<|chan",
|
||||
"nel>thought\ndeep thought<channel|>final",
|
||||
},
|
||||
wantReasoning: "deep thought",
|
||||
wantContent: "final",
|
||||
}),
|
||||
|
||||
// --- (7) trailing partial marker held, flushed by Close -----------------
|
||||
Entry("trailing partial marker is held back and flushed by Close", parseGemma4Case{
|
||||
fragments: []string{"Hello <|tool"},
|
||||
wantContent: "Hello <|tool",
|
||||
}),
|
||||
|
||||
// --- (8) malformed/incomplete payload -> content fallback ---------------
|
||||
// vLLM: test_incomplete_tool_call (no end marker: the whole text stays
|
||||
// content, never silently dropped).
|
||||
Entry("incomplete tool payload at Close is emitted as raw content", parseGemma4Case{
|
||||
fragments: []string{`<|tool_call>call:get_weather{location:<|"|>London`},
|
||||
wantContent: `<|tool_call>call:get_weather{location:<|"|>London`,
|
||||
}),
|
||||
Entry("malformed complete payload is emitted as raw content, parsing continues", parseGemma4Case{
|
||||
fragments: []string{"<|tool_call>oops no call syntax<tool_call|> done"},
|
||||
wantContent: "<|tool_call>oops no call syntax<tool_call|> done",
|
||||
}),
|
||||
|
||||
// --- (9) <turn|> ends the turn -------------------------------------------
|
||||
Entry("text after <turn|> is ignored, including later fragments", parseGemma4Case{
|
||||
fragments: []string{
|
||||
"before<turn|>after",
|
||||
`more <|tool_call>call:f{}<tool_call|>`,
|
||||
},
|
||||
wantContent: "before",
|
||||
}),
|
||||
Entry("<turn|> inside a thought channel ends the turn", parseGemma4Case{
|
||||
startInThought: true,
|
||||
fragments: []string{"thinking<turn|>ignored"},
|
||||
wantReasoning: "thinking",
|
||||
}),
|
||||
|
||||
// --- (10) ported vLLM non-streaming cases ---------------------------------
|
||||
// vLLM: test_single_tool_call
|
||||
Entry("vLLM: test_single_tool_call", parseGemma4Case{
|
||||
fragments: []string{`<|tool_call>call:get_weather{location:<|"|>London<|"|>}<tool_call|>`},
|
||||
wantTools: []wantGemma4Tool{{name: "get_weather", argsJSON: `{"location":"London"}`}},
|
||||
}),
|
||||
// vLLM: test_multiple_arguments
|
||||
Entry("vLLM: test_multiple_arguments", parseGemma4Case{
|
||||
fragments: []string{`<|tool_call>call:get_weather{location:<|"|>San Francisco<|"|>,unit:<|"|>celsius<|"|>}<tool_call|>`},
|
||||
wantTools: []wantGemma4Tool{{name: "get_weather", argsJSON: `{"location":"San Francisco","unit":"celsius"}`}},
|
||||
}),
|
||||
// vLLM: test_text_before_tool_call. DIVERGENCE: vLLM's non-streaming
|
||||
// extractor trims the content ("...you."); a streaming parser cannot
|
||||
// retroactively trim already-emitted text, so the trailing space is
|
||||
// kept (vLLM's own streaming path keeps it too, see
|
||||
// test_streaming_text_before_tool_call which only checks a prefix).
|
||||
Entry("vLLM: test_text_before_tool_call (streaming semantics: no trim)", parseGemma4Case{
|
||||
fragments: []string{`Let me check the weather for you. <|tool_call>call:get_weather{location:<|"|>Paris<|"|>}<tool_call|>`},
|
||||
wantContent: "Let me check the weather for you. ",
|
||||
wantTools: []wantGemma4Tool{{name: "get_weather", argsJSON: `{"location":"Paris"}`}},
|
||||
}),
|
||||
// vLLM: test_multiple_tool_calls (also covers case 11: multi-tool sequence)
|
||||
Entry("vLLM: test_multiple_tool_calls", parseGemma4Case{
|
||||
fragments: []string{`<|tool_call>call:get_weather{location:<|"|>London<|"|>}<tool_call|><|tool_call>call:get_time{location:<|"|>London<|"|>}<tool_call|>`},
|
||||
wantTools: []wantGemma4Tool{
|
||||
{name: "get_weather", argsJSON: `{"location":"London"}`},
|
||||
{name: "get_time", argsJSON: `{"location":"London"}`},
|
||||
},
|
||||
}),
|
||||
// vLLM: test_nested_arguments
|
||||
Entry("vLLM: test_nested_arguments", parseGemma4Case{
|
||||
fragments: []string{`<|tool_call>call:complex_function{nested:{inner:<|"|>value<|"|>},list:[<|"|>a<|"|>,<|"|>b<|"|>]}<tool_call|>`},
|
||||
wantTools: []wantGemma4Tool{{name: "complex_function", argsJSON: `{"nested":{"inner":"value"},"list":["a","b"]}`}},
|
||||
}),
|
||||
// vLLM: test_tool_call_with_number_and_boolean
|
||||
Entry("vLLM: test_tool_call_with_number_and_boolean", parseGemma4Case{
|
||||
fragments: []string{`<|tool_call>call:set_status{is_active:true,count:42,score:3.14}<tool_call|>`},
|
||||
wantTools: []wantGemma4Tool{{name: "set_status", argsJSON: `{"is_active":true,"count":42,"score":3.14}`}},
|
||||
}),
|
||||
// vLLM: test_hyphenated_function_name
|
||||
Entry("vLLM: test_hyphenated_function_name", parseGemma4Case{
|
||||
fragments: []string{`<|tool_call>call:get-weather{location:<|"|>London<|"|>}<tool_call|>`},
|
||||
wantTools: []wantGemma4Tool{{name: "get-weather", argsJSON: `{"location":"London"}`}},
|
||||
}),
|
||||
// vLLM: test_dotted_function_name
|
||||
Entry("vLLM: test_dotted_function_name", parseGemma4Case{
|
||||
fragments: []string{`<|tool_call>call:weather.get{location:<|"|>London<|"|>}<tool_call|>`},
|
||||
wantTools: []wantGemma4Tool{{name: "weather.get", argsJSON: `{"location":"London"}`}},
|
||||
}),
|
||||
// vLLM: test_no_arguments
|
||||
Entry("vLLM: test_no_arguments", parseGemma4Case{
|
||||
fragments: []string{"<|tool_call>call:get_status{}<tool_call|>"},
|
||||
wantTools: []wantGemma4Tool{{name: "get_status", argsJSON: `{}`}},
|
||||
}),
|
||||
|
||||
// --- ported vLLM streaming cases (chunk lists reused as fragments) --------
|
||||
// vLLM: test_basic_streaming_single_tool
|
||||
Entry("vLLM: test_basic_streaming_single_tool", parseGemma4Case{
|
||||
fragments: []string{
|
||||
"<|tool_call>",
|
||||
"call:get_weather{",
|
||||
`location:<|"|>Paris`,
|
||||
", France",
|
||||
`<|"|>}`,
|
||||
"<tool_call|>",
|
||||
},
|
||||
wantTools: []wantGemma4Tool{{name: "get_weather", argsJSON: `{"location":"Paris, France"}`}},
|
||||
}),
|
||||
// vLLM: test_streaming_multi_arg
|
||||
Entry("vLLM: test_streaming_multi_arg", parseGemma4Case{
|
||||
fragments: []string{
|
||||
"<|tool_call>",
|
||||
"call:get_weather{",
|
||||
`location:<|"|>Tokyo<|"|>,`,
|
||||
`unit:<|"|>celsius<|"|>}`,
|
||||
"<tool_call|>",
|
||||
},
|
||||
wantTools: []wantGemma4Tool{{name: "get_weather", argsJSON: `{"location":"Tokyo","unit":"celsius"}`}},
|
||||
}),
|
||||
// vLLM: test_streaming_text_before_tool_call
|
||||
Entry("vLLM: test_streaming_text_before_tool_call", parseGemma4Case{
|
||||
fragments: []string{
|
||||
"Let me check ",
|
||||
"the weather. ",
|
||||
"<|tool_call>",
|
||||
"call:get_weather{",
|
||||
`location:<|"|>London<|"|>}`,
|
||||
"<tool_call|>",
|
||||
},
|
||||
wantContent: "Let me check the weather. ",
|
||||
wantTools: []wantGemma4Tool{{name: "get_weather", argsJSON: `{"location":"London"}`}},
|
||||
}),
|
||||
// vLLM: test_streaming_numeric_args
|
||||
Entry("vLLM: test_streaming_numeric_args", parseGemma4Case{
|
||||
fragments: []string{
|
||||
"<|tool_call>",
|
||||
"call:set_config{",
|
||||
"count:42,",
|
||||
"active:true}",
|
||||
"<tool_call|>",
|
||||
},
|
||||
wantTools: []wantGemma4Tool{{name: "set_config", argsJSON: `{"count":42,"active":true}`}},
|
||||
}),
|
||||
// vLLM: test_streaming_boolean_split_across_chunks
|
||||
Entry("vLLM: test_streaming_boolean_split_across_chunks", parseGemma4Case{
|
||||
fragments: []string{
|
||||
"<|tool_call>",
|
||||
"call:search{input:{all:tru",
|
||||
"e}}",
|
||||
"<tool_call|>",
|
||||
},
|
||||
wantTools: []wantGemma4Tool{{name: "search", argsJSON: `{"input":{"all":true}}`}},
|
||||
}),
|
||||
// vLLM: test_streaming_false_split_across_chunks
|
||||
Entry("vLLM: test_streaming_false_split_across_chunks", parseGemma4Case{
|
||||
fragments: []string{
|
||||
"<|tool_call>",
|
||||
"call:set{flag:fals",
|
||||
"e}",
|
||||
"<tool_call|>",
|
||||
},
|
||||
wantTools: []wantGemma4Tool{{name: "set", argsJSON: `{"flag":false}`}},
|
||||
}),
|
||||
// vLLM: test_streaming_number_split_across_chunks
|
||||
Entry("vLLM: test_streaming_number_split_across_chunks", parseGemma4Case{
|
||||
fragments: []string{
|
||||
"<|tool_call>",
|
||||
"call:set{count:4",
|
||||
"2}",
|
||||
"<tool_call|>",
|
||||
},
|
||||
wantTools: []wantGemma4Tool{{name: "set", argsJSON: `{"count":42}`}},
|
||||
}),
|
||||
// vLLM: test_streaming_empty_args
|
||||
Entry("vLLM: test_streaming_empty_args", parseGemma4Case{
|
||||
fragments: []string{
|
||||
"<|tool_call>",
|
||||
"call:get_status{}",
|
||||
"<tool_call|>",
|
||||
},
|
||||
wantTools: []wantGemma4Tool{{name: "get_status", argsJSON: `{}`}},
|
||||
}),
|
||||
// vLLM: test_streaming_split_delimiter_no_invalid_json (string
|
||||
// delimiter <|"|> split across fragments must not leak fragments).
|
||||
Entry("vLLM: test_streaming_split_delimiter_no_invalid_json", parseGemma4Case{
|
||||
fragments: []string{
|
||||
"<|tool_call>",
|
||||
"call:todowrite{",
|
||||
`content:<|"|>Buy milk<|`,
|
||||
`"|>}`,
|
||||
"<tool_call|>",
|
||||
},
|
||||
wantTools: []wantGemma4Tool{{name: "todowrite", argsJSON: `{"content":"Buy milk"}`}},
|
||||
}),
|
||||
// vLLM: test_streaming_does_not_duplicate_plain_text_after_tool_call
|
||||
Entry("vLLM: test_streaming_does_not_duplicate_plain_text_after_tool_call", parseGemma4Case{
|
||||
fragments: []string{
|
||||
"<|tool_call>",
|
||||
"call:get_weather{",
|
||||
`location:<|"|>Paris<|"|>}`,
|
||||
"<tool_call|><",
|
||||
"div>",
|
||||
},
|
||||
wantContent: "<div>",
|
||||
wantTools: []wantGemma4Tool{{name: "get_weather", argsJSON: `{"location":"Paris"}`}},
|
||||
}),
|
||||
// vLLM: test_streaming_html_argument_does_not_duplicate_tag_prefixes
|
||||
Entry("vLLM: test_streaming_html_argument_does_not_duplicate_tag_prefixes", parseGemma4Case{
|
||||
fragments: []string{
|
||||
"<|tool_call>",
|
||||
"call:write_file{",
|
||||
`path:<|"|>index.html<|"|>,`,
|
||||
`content:<|"|><!DOCTYPE html>` + "\n<",
|
||||
`html lang="zh-CN">` + "\n<",
|
||||
"head>\n <",
|
||||
`meta charset="UTF-8">` + "\n <",
|
||||
`meta name="viewport" content="width=device-width">` + "\n",
|
||||
`<|"|>}`,
|
||||
"<tool_call|>",
|
||||
},
|
||||
wantTools: []wantGemma4Tool{{
|
||||
name: "write_file",
|
||||
argsJSON: `{"path":"index.html","content":"<!DOCTYPE html>\n<html lang=\"zh-CN\">\n<head>\n <meta charset=\"UTF-8\">\n <meta name=\"viewport\" content=\"width=device-width\">\n"}`,
|
||||
}},
|
||||
}),
|
||||
// vLLM: test_streaming_single_chunk_complete_tool_call
|
||||
Entry("vLLM: test_streaming_single_chunk_complete_tool_call", parseGemma4Case{
|
||||
fragments: []string{`<|tool_call>call:name_a_color{color_hex:<|"|>00ff11<|"|>}<tool_call|>`},
|
||||
wantTools: []wantGemma4Tool{{name: "name_a_color", argsJSON: `{"color_hex":"00ff11"}`}},
|
||||
}),
|
||||
// vLLM: test_streaming_multi_chunk_batched_tool_calls (two complete
|
||||
// calls in ONE fragment; both must come out with distinct indices)
|
||||
Entry("vLLM: test_streaming_multi_chunk_batched_tool_calls", parseGemma4Case{
|
||||
fragments: []string{
|
||||
`<|tool_call>call:get_weather{location:<|"|>London<|"|>}<tool_call|>` +
|
||||
`<|tool_call>call:get_time{timezone:<|"|>GMT<|"|>}<tool_call|>`,
|
||||
},
|
||||
wantTools: []wantGemma4Tool{
|
||||
{name: "get_weather", argsJSON: `{"location":"London"}`},
|
||||
{name: "get_time", argsJSON: `{"timezone":"GMT"}`},
|
||||
},
|
||||
}),
|
||||
// vLLM: test_streaming_trailing_bare_bool_not_duplicated
|
||||
Entry("vLLM: test_streaming_trailing_bare_bool_not_duplicated", parseGemma4Case{
|
||||
fragments: []string{
|
||||
"<|tool_call>",
|
||||
"call:Edit{",
|
||||
`file_path:<|"|>src/env.py<|"|>,`,
|
||||
`old_string:<|"|>old_val<|"|>,`,
|
||||
`new_string:<|"|>new_val<|"|>,`,
|
||||
"replace_all:",
|
||||
"false}",
|
||||
"<tool_call|>",
|
||||
},
|
||||
wantTools: []wantGemma4Tool{{
|
||||
name: "Edit",
|
||||
argsJSON: `{"file_path":"src/env.py","old_string":"old_val","new_string":"new_val","replace_all":false}`,
|
||||
}},
|
||||
}),
|
||||
|
||||
// --- implicit reasoning end on <|tool_call> (vLLM is_reasoning_end:
|
||||
// a tool_call token means reasoning is over) -----------------------------
|
||||
Entry("tool call inside an open thought channel ends the reasoning", parseGemma4Case{
|
||||
startInThought: true,
|
||||
fragments: []string{`need the weather<|tool_call>call:get_weather{location:<|"|>Rome<|"|>}<tool_call|>`},
|
||||
wantReasoning: "need the weather",
|
||||
wantTools: []wantGemma4Tool{{name: "get_weather", argsJSON: `{"location":"Rome"}`}},
|
||||
}),
|
||||
|
||||
// --- (12) empty fragments are no-ops --------------------------------------
|
||||
Entry("empty fragments are no-ops", parseGemma4Case{
|
||||
fragments: []string{"", "Hello", "", "", " world", ""},
|
||||
wantContent: "Hello world",
|
||||
}),
|
||||
)
|
||||
|
||||
It("returns no deltas for an empty fragment and after Close", func() {
|
||||
p := NewGemma4Parser(false)
|
||||
Expect(p.Feed("")).To(BeEmpty())
|
||||
Expect(p.Feed("hi")).ToNot(BeEmpty())
|
||||
Expect(p.Close()).To(BeEmpty()) // nothing held back
|
||||
// The parser is finished after Close: further input is dropped.
|
||||
Expect(p.Feed("more")).To(BeEmpty())
|
||||
Expect(p.Close()).To(BeEmpty())
|
||||
})
|
||||
|
||||
It("generates index-based tool call ids (call_<index>)", func() {
|
||||
// Mirrors the index-based id convention of pkg/grpc/rich_test.go and
|
||||
// keeps ids deterministic for the split-invariance property below.
|
||||
deltas := parseGemma4Fragments(false, []string{
|
||||
`<|tool_call>call:a{}<tool_call|><|tool_call>call:b{}<tool_call|>`,
|
||||
})
|
||||
_, _, tools := flattenGemma4Deltas(deltas)
|
||||
Expect(tools).To(HaveLen(2))
|
||||
Expect(tools[0].id).To(Equal("call_0"))
|
||||
Expect(tools[1].id).To(Equal("call_1"))
|
||||
})
|
||||
|
||||
// Property: for a fixed full output, EVERY 2-split position must yield
|
||||
// exactly the same flattened result as the unsplit parse. This kills
|
||||
// fragment-boundary bugs (mid-marker, mid-delimiter, mid-payload splits).
|
||||
DescribeTable("2-split fragment invariance",
|
||||
func(startInThought bool, full string) {
|
||||
refContent, refReasoning, refTools := flattenGemma4Deltas(
|
||||
parseGemma4Fragments(startInThought, []string{full}))
|
||||
for i := 0; i <= len(full); i++ {
|
||||
content, reasoning, tools := flattenGemma4Deltas(
|
||||
parseGemma4Fragments(startInThought, []string{full[:i], full[i:]}))
|
||||
Expect(content).To(Equal(refContent), fmt.Sprintf("content diverged at split %d", i))
|
||||
Expect(reasoning).To(Equal(refReasoning), fmt.Sprintf("reasoning diverged at split %d", i))
|
||||
Expect(tools).To(Equal(refTools), fmt.Sprintf("tool calls diverged at split %d", i))
|
||||
}
|
||||
},
|
||||
Entry("thought + content + two tool calls + turn end", false,
|
||||
"<|channel>thought\nPondering the request...\n<channel|>Sure - calling tools now. "+
|
||||
`<|tool_call>call:get_weather{location:<|"|>Paris, France<|"|>,unit:<|"|>celsius<|"|>,days:3,detailed:true}<tool_call|>`+
|
||||
`<|tool_call>call:get_time{timezone:<|"|>Europe/Lisbon<|"|>,nested:{flag:false,vals:[1,2.5,<|"|>x<|"|>]}}<tool_call|>`+
|
||||
"Done.<turn|>ignored tail"),
|
||||
Entry("startInThought + tool call + trailing partial marker", true,
|
||||
`Deep thought<channel|>final answer <|tool_call>call:noop{}<tool_call|> trailing <|tool`),
|
||||
Entry("malformed payload fallback", false,
|
||||
`pre <|tool_call>not a call<tool_call|> post`),
|
||||
)
|
||||
})
|
||||
|
||||
// Decoder-level ports of vLLM's TestParseGemma4Args / TestParseGemma4Array
|
||||
// (non-partial mode; the partial-withholding tests do not apply because this
|
||||
// parser only ever decodes COMPLETE payloads, see gemma4_parser.go).
|
||||
var _ = Describe("decodeGemma4Args", func() {
|
||||
DescribeTable("decodes the gemma4 call syntax into JSON arguments",
|
||||
func(in, wantJSON string) {
|
||||
Expect(decodeGemma4Args(in, 0)).To(MatchJSON(wantJSON))
|
||||
},
|
||||
// vLLM: test_empty_string / test_whitespace_only
|
||||
Entry("empty string", "", `{}`),
|
||||
Entry("whitespace only", " ", `{}`),
|
||||
// vLLM: test_single_string_value
|
||||
Entry("single string value", `location:<|"|>Paris<|"|>`, `{"location":"Paris"}`),
|
||||
// vLLM: test_string_value_with_comma
|
||||
Entry("string value with comma", `location:<|"|>Paris, France<|"|>`, `{"location":"Paris, France"}`),
|
||||
// vLLM: test_multiple_string_values
|
||||
Entry("multiple string values", `location:<|"|>San Francisco<|"|>,unit:<|"|>celsius<|"|>`, `{"location":"San Francisco","unit":"celsius"}`),
|
||||
// vLLM: test_integer_value / test_float_value
|
||||
Entry("integer value", "count:42", `{"count":42}`),
|
||||
Entry("float value", "score:3.14", `{"score":3.14}`),
|
||||
// vLLM: test_boolean_true / test_boolean_false
|
||||
Entry("boolean true", "flag:true", `{"flag":true}`),
|
||||
Entry("boolean false", "flag:false", `{"flag":false}`),
|
||||
// vLLM: test_null_value (bare null must become JSON null, not "null")
|
||||
Entry("null value", "param:null", `{"param":null}`),
|
||||
// vLLM: test_mixed_types
|
||||
Entry("mixed types", `name:<|"|>test<|"|>,count:42,active:true,score:3.14`,
|
||||
`{"name":"test","count":42,"active":true,"score":3.14}`),
|
||||
// vLLM: test_nested_object
|
||||
Entry("nested object", `nested:{inner:<|"|>value<|"|>}`, `{"nested":{"inner":"value"}}`),
|
||||
// vLLM: test_array_of_strings
|
||||
Entry("array of strings", `items:[<|"|>a<|"|>,<|"|>b<|"|>]`, `{"items":["a","b"]}`),
|
||||
// vLLM: test_unterminated_string (take everything after the delimiter)
|
||||
Entry("unterminated string", `key:<|"|>unterminated`, `{"key":"unterminated"}`),
|
||||
// vLLM: test_empty_value (key with no value after colon)
|
||||
Entry("empty value", "key:", `{"key":""}`),
|
||||
// vLLM: test_trailing_dot_float_partial_withheld, non-partial branch
|
||||
// (trailing-dot floats parse normally outside streaming).
|
||||
Entry("trailing dot float, complete payload", "left:108.,right:22.8", `{"left":108.0,"right":22.8}`),
|
||||
)
|
||||
|
||||
It("terminates and yields valid JSON on malformed input", func() {
|
||||
// vLLM: test_malformed_partial_array (the assertion there is only
|
||||
// "returns a dict without hanging"; ours is "valid JSON object").
|
||||
out := decodeGemma4Args(":[t:[]", 0)
|
||||
var v map[string]any
|
||||
Expect(json.Unmarshal([]byte(out), &v)).To(Succeed())
|
||||
})
|
||||
|
||||
It("degrades nesting beyond the recursion cap to a string value", func() {
|
||||
// 200 levels of a:{a:{...a:1...}}. Without the depth cap the mutual
|
||||
// recursion would grow the stack with the model's output; a Go stack
|
||||
// overflow is a fatal process kill, so levels past gemma4MaxArgsDepth
|
||||
// must gracefully fall back to the raw inner text as a JSON string.
|
||||
const depth = 200
|
||||
body := strings.Repeat("a:{", depth-1) + "a:1" + strings.Repeat("}", depth-1)
|
||||
out := decodeGemma4Args(body, 0)
|
||||
var v map[string]any
|
||||
Expect(json.Unmarshal([]byte(out), &v)).To(Succeed())
|
||||
levels := 0
|
||||
var cur any = v
|
||||
for {
|
||||
m, ok := cur.(map[string]any)
|
||||
if !ok {
|
||||
break
|
||||
}
|
||||
Expect(m).To(HaveKey("a"))
|
||||
cur = m["a"]
|
||||
levels++
|
||||
}
|
||||
Expect(levels).To(Equal(gemma4MaxArgsDepth + 1))
|
||||
Expect(cur).To(BeAssignableToTypeOf(""))
|
||||
Expect(cur).To(ContainSubstring("a:{"))
|
||||
})
|
||||
})
|
||||
|
||||
var _ = Describe("decodeGemma4Array", func() {
|
||||
DescribeTable("decodes gemma4 array bodies into JSON arrays",
|
||||
func(in, wantJSON string) {
|
||||
Expect(decodeGemma4Array(in, 0)).To(MatchJSON(wantJSON))
|
||||
},
|
||||
// vLLM: test_string_array / test_empty_array / test_bare_values
|
||||
Entry("string array", `<|"|>a<|"|>,<|"|>b<|"|>`, `["a","b"]`),
|
||||
Entry("empty array", "", `[]`),
|
||||
Entry("bare values", "42,true,3.14", `[42,true,3.14]`),
|
||||
// vLLM: test_string_element_with_closing_bracket (a ']' inside a
|
||||
// delimited string must not close the array)
|
||||
Entry("string element with closing bracket", `[<|"|>a]b<|"|>,<|"|>c<|"|>],<|"|>tail<|"|>`, `[["a]b","c"],"tail"]`),
|
||||
// vLLM: test_stray_closing_bracket (no-progress abort, keep prefix)
|
||||
Entry("stray closing bracket", "42,]trailing", `[42]`),
|
||||
)
|
||||
})
|
||||
1026
backend/go/dllm/gemma4_renderer.go
Executable file
1026
backend/go/dllm/gemma4_renderer.go
Executable file
File diff suppressed because it is too large
Load Diff
347
backend/go/dllm/gemma4_renderer_test.go
Executable file
347
backend/go/dllm/gemma4_renderer_test.go
Executable file
@@ -0,0 +1,347 @@
|
||||
package main
|
||||
|
||||
// Renderer specs for RenderGemma4 against the canonical gemma4 chat template
|
||||
// (see the normative template comment in gemma4_renderer.go).
|
||||
//
|
||||
// Fixture provenance:
|
||||
// - "single user message" and "enable_thinking" are the EXACT expected
|
||||
// decodes from transformers tests/models/diffusion_gemma/
|
||||
// test_modeling_diffusion_gemma.py (test_diffusion_gemma_chat_template
|
||||
// and ..._with_thinking) with ONE difference: the transformers fixtures
|
||||
// start with "<bos>" because apply_chat_template tokenizes the rendered
|
||||
// text with add_bos. Our prompt goes through dllm_capi_generate, whose
|
||||
// run_generate already tokenizes with prepend_bos = vocab.add_bos
|
||||
// (dllm.cpp src/capi.cpp:230-231, true for gemma4), so the renderer must
|
||||
// NOT emit a literal <bos> (it would double) and every expected string
|
||||
// here drops that leading token.
|
||||
// - All other expected strings were produced by rendering the verbatim
|
||||
// GGUF template with jinja2 3.1.2 (bos_token="<bos>") and dropping the
|
||||
// leading "<bos>" for the same reason.
|
||||
|
||||
import (
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
|
||||
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||
)
|
||||
|
||||
// Two-function tools array used by the tool fixtures (OpenAI wire shape, as
|
||||
// LocalAI passes it through PredictOptions.Tools).
|
||||
// testToolsJSON declares get_weather (string "location", required, plus a
// string "unit" restricted to celsius/fahrenheit) and get_time (string
// "timezone", required).
const testToolsJSON = `[{"type":"function","function":{"name":"get_weather","description":"Get the current weather in a location.","parameters":{"type":"object","properties":{"location":{"type":"string","description":"The city name."},"unit":{"type":"string","enum":["celsius","fahrenheit"]}},"required":["location"]}}},{"type":"function","function":{"name":"get_time","description":"Get the current time in a timezone.","parameters":{"type":"object","properties":{"timezone":{"type":"string","description":"IANA timezone name."}},"required":["timezone"]}}}]`
|
||||
|
||||
// The <|tool>...<tool|> block the template renders for testToolsJSON inside
|
||||
// the system turn (jinja2-verified).
|
||||
// testToolsBlock holds one <|tool>declaration:NAME{...}<tool|> block per
// function, back to back with no separator. In this expected text string
// values are wrapped in <|"|> delimiters, keys are bare, and the schema type
// names appear upper-cased (STRING, OBJECT) although testToolsJSON spells
// them in lower case.
const testToolsBlock = `<|tool>declaration:get_weather{description:<|"|>Get the current weather in a location.<|"|>,parameters:{properties:{location:{description:<|"|>The city name.<|"|>,type:<|"|>STRING<|"|>},unit:{enum:[<|"|>celsius<|"|>,<|"|>fahrenheit<|"|>],type:<|"|>STRING<|"|>}},required:[<|"|>location<|"|>],type:<|"|>OBJECT<|"|>}}<tool|><|tool>declaration:get_time{description:<|"|>Get the current time in a timezone.<|"|>,parameters:{properties:{timezone:{description:<|"|>IANA timezone name.<|"|>,type:<|"|>STRING<|"|>}},required:[<|"|>timezone<|"|>],type:<|"|>OBJECT<|"|>}}<tool|>`
|
||||
|
||||
// A single tool exercising the deep format_parameters branches: array items
|
||||
// (string-typed and nested-array), nullable, enum+nullable, nested object
|
||||
// properties/required, and a response declaration.
|
||||
// complexToolsJSON declares a single function, complex_tool, whose
// properties are written in the order tags (array of string), matrix (array
// of array of number), opts (object with a required, nullable integer
// "depth") and mode (nullable string enum a/b). The top-level required list
// is [tags, opts] and the function carries an object-typed "response".
const complexToolsJSON = `[{"type":"function","function":{"name":"complex_tool","description":"A complex tool.","parameters":{"type":"object","properties":{"tags":{"type":"array","description":"Tags.","items":{"type":"string"}},"matrix":{"type":"array","items":{"type":"array","items":{"type":"number"}}},"opts":{"type":"object","description":"Options.","properties":{"depth":{"type":"integer","nullable":true}},"required":["depth"]},"mode":{"type":"string","enum":["a","b"],"nullable":true}},"required":["tags","opts"]},"response":{"description":"The result.","type":"object"}}}]`
|
||||
|
||||
// jinja2-verified render of complexToolsJSON. Notable template quirks pinned
|
||||
// here: nested array items go through format_argument with ESCAPED keys and
|
||||
// an un-uppercased type (<|"|>type<|"|>:<|"|>number<|"|>), while direct item
|
||||
// types are uppercased; properties dictsort case-insensitively.
|
||||
// In complexToolsBlock the properties appear as matrix, mode, opts, tags,
// i.e. re-ordered relative to complexToolsJSON (tags, matrix, opts, mode),
// while the required list keeps its declared order (tags, opts). The
// response declaration follows the parameters inside the same block.
const complexToolsBlock = `<|tool>declaration:complex_tool{description:<|"|>A complex tool.<|"|>,parameters:{properties:{matrix:{items:{items:{<|"|>type<|"|>:<|"|>number<|"|>},type:<|"|>ARRAY<|"|>},type:<|"|>ARRAY<|"|>},mode:{enum:[<|"|>a<|"|>,<|"|>b<|"|>],nullable:true,type:<|"|>STRING<|"|>},opts:{description:<|"|>Options.<|"|>,properties:{depth:{nullable:true,type:<|"|>INTEGER<|"|>}},required:[<|"|>depth<|"|>],type:<|"|>OBJECT<|"|>},tags:{description:<|"|>Tags.<|"|>,items:{type:<|"|>STRING<|"|>},type:<|"|>ARRAY<|"|>}},required:[<|"|>tags<|"|>,<|"|>opts<|"|>],type:<|"|>OBJECT<|"|>},response:{description:<|"|>The result.<|"|>,type:<|"|>OBJECT<|"|>}}<tool|>`
|
||||
|
||||
type renderGemma4Case struct {
|
||||
msgs []*pb.Message
|
||||
toolsJSON string
|
||||
enableThinking bool
|
||||
noGenerationPrompt bool // inverted so the zero value is the common case
|
||||
expected string
|
||||
}
|
||||
|
||||
var _ = Describe("RenderGemma4", func() {
|
||||
DescribeTable("renders the canonical gemma4 prompt",
|
||||
func(c renderGemma4Case) {
|
||||
out, err := RenderGemma4(c.msgs, c.toolsJSON, c.enableThinking, !c.noGenerationPrompt)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(out).To(Equal(c.expected))
|
||||
// The C-ABI generate prepends BOS itself: a literal <bos>
|
||||
// anywhere in the rendered prompt would double-encode it.
|
||||
Expect(out).ToNot(ContainSubstring("<bos>"))
|
||||
},
|
||||
|
||||
// transformers fixture (test_diffusion_gemma_chat_template), sans <bos>:
|
||||
// thinking is off by default, which pre-opens an EMPTY thought channel in the
|
||||
// generation prompt.
|
||||
Entry("single user message, default (no thinking)", renderGemma4Case{
|
||||
msgs: []*pb.Message{
|
||||
{Role: "user", Content: "Write a long essay about Portugal."},
|
||||
},
|
||||
expected: "<|turn>user\nWrite a long essay about Portugal.<turn|>\n<|turn>model\n<|channel>thought\n<channel|>",
|
||||
}),
|
||||
|
||||
// transformers fixture (test_diffusion_gemma_chat_template_with_thinking),
|
||||
// sans <bos>: a system turn carrying <|think|> and NO auto-opened
|
||||
// thought channel.
|
||||
Entry("enable_thinking=true", renderGemma4Case{
|
||||
msgs: []*pb.Message{
|
||||
{Role: "user", Content: "Write a long essay about Portugal."},
|
||||
},
|
||||
enableThinking: true,
|
||||
expected: "<|turn>system\n<|think|>\n<turn|>\n<|turn>user\nWrite a long essay about Portugal.<turn|>\n<|turn>model\n",
|
||||
}),
|
||||
|
||||
Entry("multi-turn user/assistant/user", renderGemma4Case{
|
||||
msgs: []*pb.Message{
|
||||
{Role: "user", Content: "Hello, who are you?"},
|
||||
{Role: "assistant", Content: "I am Gemma, a helpful assistant."},
|
||||
{Role: "user", Content: "Tell me a joke."},
|
||||
},
|
||||
expected: "<|turn>user\nHello, who are you?<turn|>\n<|turn>model\nI am Gemma, a helpful assistant.<turn|>\n<|turn>user\nTell me a joke.<turn|>\n<|turn>model\n<|channel>thought\n<channel|>",
|
||||
}),
|
||||
|
||||
// tpl L178-L195: a leading system message is folded into the system
|
||||
// turn (trimmed) and consumed from the loop.
|
||||
Entry("system message folds into the system turn", renderGemma4Case{
|
||||
msgs: []*pb.Message{
|
||||
{Role: "system", Content: "You are a pirate."},
|
||||
{Role: "user", Content: "Hello!"},
|
||||
},
|
||||
expected: "<|turn>system\nYou are a pirate.<turn|>\n<|turn>user\nHello!<turn|>\n<|turn>model\n<|channel>thought\n<channel|>",
|
||||
}),
|
||||
|
||||
// tpl L182-L185: <|think|> goes at the very top of the SAME system
|
||||
// turn, before the system prompt text.
|
||||
Entry("system message with enable_thinking shares the turn", renderGemma4Case{
|
||||
msgs: []*pb.Message{
|
||||
{Role: "system", Content: "You are a pirate."},
|
||||
{Role: "user", Content: "Hello!"},
|
||||
},
|
||||
enableThinking: true,
|
||||
expected: "<|turn>system\n<|think|>\nYou are a pirate.<turn|>\n<|turn>user\nHello!<turn|>\n<|turn>model\n",
|
||||
}),
|
||||
|
||||
// tpl L196-L203: tool declarations render in the system turn, one
|
||||
// <|tool>declaration:...<tool|> block per tool, no separators.
|
||||
Entry("tools array (two functions)", renderGemma4Case{
|
||||
msgs: []*pb.Message{
|
||||
{Role: "user", Content: "What is the weather in Tokyo?"},
|
||||
},
|
||||
toolsJSON: testToolsJSON,
|
||||
expected: "<|turn>system\n" + testToolsBlock + "<turn|>\n<|turn>user\nWhat is the weather in Tokyo?<turn|>\n<|turn>model\n<|channel>thought\n<channel|>",
|
||||
}),
|
||||
|
||||
// format_parameters deep branches (tpl L1-L85) + response declaration
|
||||
// (tpl L106-L116).
|
||||
Entry("complex tool schema (array items, nullable, nested object, response)", renderGemma4Case{
|
||||
msgs: []*pb.Message{
|
||||
{Role: "user", Content: "go"},
|
||||
},
|
||||
toolsJSON: complexToolsJSON,
|
||||
expected: "<|turn>system\n" + complexToolsBlock + "<turn|>\n<|turn>user\ngo<turn|>\n<|turn>model\n<|channel>thought\n<channel|>",
|
||||
}),
|
||||
|
||||
// tpl L243-L313: assistant tool_calls render as
|
||||
// <|tool_call>call:name{args}<tool_call|>; the following role=tool
|
||||
// message renders inline as <|tool_response>response:name{value:..}
|
||||
// <tool_response|>; the model turn stays OPEN (no <turn|>, no new
|
||||
// generation prompt) so the model continues after the response.
|
||||
Entry("assistant tool_calls + role=tool result", renderGemma4Case{
|
||||
msgs: []*pb.Message{
|
||||
{Role: "user", Content: "What is the weather in Tokyo?"},
|
||||
{Role: "assistant", Content: "", ToolCalls: `[{"index":0,"id":"call_1","type":"function","function":{"name":"get_weather","arguments":"{\"location\":\"Tokyo\",\"unit\":\"celsius\"}"}}]`},
|
||||
{Role: "tool", ToolCallId: "call_1", Content: "Sunny, 22 degrees celsius."},
|
||||
},
|
||||
toolsJSON: testToolsJSON,
|
||||
expected: "<|turn>system\n" + testToolsBlock + "<turn|>\n<|turn>user\nWhat is the weather in Tokyo?<turn|>\n<|turn>model\n" + `<|tool_call>call:get_weather{location:<|"|>Tokyo<|"|>,unit:<|"|>celsius<|"|>}<tool_call|><|tool_response>response:get_weather{value:<|"|>Sunny, 22 degrees celsius.<|"|>}<tool_response|>`,
|
||||
}),
|
||||
|
||||
// tpl L348-L349: a tool_calls turn with no rendered responses ends
|
||||
// on an OPEN <|tool_response> marker for the runtime to fill, and
|
||||
// add_generation_prompt adds nothing (tpl L357).
|
||||
Entry("assistant tool_calls without a result leaves <|tool_response> open", renderGemma4Case{
|
||||
msgs: []*pb.Message{
|
||||
{Role: "user", Content: "What is the weather in Tokyo?"},
|
||||
{Role: "assistant", Content: "", ToolCalls: `[{"index":0,"id":"call_1","type":"function","function":{"name":"get_weather","arguments":"{\"location\":\"Tokyo\",\"unit\":\"celsius\"}"}}]`},
|
||||
},
|
||||
toolsJSON: testToolsJSON,
|
||||
expected: "<|turn>system\n" + testToolsBlock + "<turn|>\n<|turn>user\nWhat is the weather in Tokyo?<turn|>\n<|turn>model\n" + `<|tool_call>call:get_weather{location:<|"|>Tokyo<|"|>,unit:<|"|>celsius<|"|>}<tool_call|><|tool_response>`,
|
||||
}),
|
||||
|
||||
// tpl L237-L241: reasoning_content renders as a thought channel only
|
||||
// on a tool-calling turn after the last user message.
|
||||
Entry("reasoning_content with tool_calls renders the thought channel", renderGemma4Case{
|
||||
msgs: []*pb.Message{
|
||||
{Role: "user", Content: "weather?"},
|
||||
{Role: "assistant", Content: "", ReasoningContent: "I should call the tool", ToolCalls: `[{"index":0,"id":"c1","type":"function","function":{"name":"get_weather","arguments":"{\"location\":\"Tokyo\"}"}}]`},
|
||||
{Role: "tool", ToolCallId: "c1", Content: "Sunny"},
|
||||
},
|
||||
expected: "<|turn>user\nweather?<turn|>\n<|turn>model\n<|channel>thought\nI should call the tool\n<channel|>" + `<|tool_call>call:get_weather{location:<|"|>Tokyo<|"|>}<tool_call|><|tool_response>response:get_weather{value:<|"|>Sunny<|"|>}<tool_response|>`,
|
||||
}),
|
||||
|
||||
// tpl L220-L235: the assistant answer following its own tool round
|
||||
// continues the SAME model turn (no second <|turn>model).
|
||||
Entry("tool round then final assistant answer then user", renderGemma4Case{
|
||||
msgs: []*pb.Message{
|
||||
{Role: "user", Content: "weather?"},
|
||||
{Role: "assistant", Content: "", ToolCalls: `[{"index":0,"id":"c1","type":"function","function":{"name":"get_weather","arguments":"{\"location\":\"Tokyo\"}"}}]`},
|
||||
{Role: "tool", ToolCallId: "c1", Content: "Sunny"},
|
||||
{Role: "assistant", Content: "It is sunny."},
|
||||
{Role: "user", Content: "thanks"},
|
||||
},
|
||||
expected: "<|turn>user\nweather?<turn|>\n<|turn>model\n" + `<|tool_call>call:get_weather{location:<|"|>Tokyo<|"|>}<tool_call|><|tool_response>response:get_weather{value:<|"|>Sunny<|"|>}<tool_response|>` + "It is sunny.<turn|>\n<|turn>user\nthanks<turn|>\n<|turn>model\n<|channel>thought\n<channel|>",
|
||||
}),
|
||||
|
||||
// format_argument (tpl L118-L147): numbers keep their JSON literal,
|
||||
// booleans lower-case, nested maps have unquoted dictsorted keys,
|
||||
// arrays bracketed; top-level args are dictsorted case-insensitively.
|
||||
Entry("tool_call argument types (number/bool/nested/array)", renderGemma4Case{
|
||||
msgs: []*pb.Message{
|
||||
{Role: "user", Content: "go"},
|
||||
{Role: "assistant", Content: "", ToolCalls: `[{"index":0,"id":"c1","type":"function","function":{"name":"f","arguments":"{\"count\":42,\"ratio\":3.5,\"flag\":true,\"off\":false,\"nested\":{\"x\":\"y\",\"n\":7},\"list\":[\"a\",1,true]}"}}]`},
|
||||
},
|
||||
expected: "<|turn>user\ngo<turn|>\n<|turn>model\n" + `<|tool_call>call:f{count:42,flag:true,list:[<|"|>a<|"|>,1,true],nested:{n:7,x:<|"|>y<|"|>},off:false,ratio:3.5}<tool_call|><|tool_response>`,
|
||||
}),
|
||||
|
||||
// jinja dictsort is case-insensitive: alpha sorts before Beta.
|
||||
Entry("tool_call argument dictsort is case-insensitive", renderGemma4Case{
|
||||
msgs: []*pb.Message{
|
||||
{Role: "user", Content: "go"},
|
||||
{Role: "assistant", Content: "", ToolCalls: `[{"index":0,"id":"c1","type":"function","function":{"name":"f","arguments":"{\"Beta\":1,\"alpha\":2}"}}]`},
|
||||
},
|
||||
expected: "<|turn>user\ngo<turn|>\n<|turn>model\n<|tool_call>call:f{alpha:2,Beta:1}<tool_call|><|tool_response>",
|
||||
}),
|
||||
|
||||
// jinja renders Python None as "None" (round-trips through vLLM's
|
||||
// parser, which lowers "none" back to null).
|
||||
Entry("tool_call null argument renders as None", renderGemma4Case{
|
||||
msgs: []*pb.Message{
|
||||
{Role: "user", Content: "go"},
|
||||
{Role: "assistant", Content: "", ToolCalls: `[{"index":0,"id":"c1","type":"function","function":{"name":"f","arguments":"{\"maybe\":null}"}}]`},
|
||||
},
|
||||
expected: "<|turn>user\ngo<turn|>\n<|turn>model\n<|tool_call>call:f{maybe:None}<tool_call|><|tool_response>",
|
||||
}),
|
||||
|
||||
Entry("tool_call empty arguments render empty braces", renderGemma4Case{
|
||||
msgs: []*pb.Message{
|
||||
{Role: "user", Content: "go"},
|
||||
{Role: "assistant", Content: "", ToolCalls: `[{"index":0,"id":"c1","type":"function","function":{"name":"f","arguments":"{}"}}]`},
|
||||
},
|
||||
expected: "<|turn>user\ngo<turn|>\n<|turn>model\n<|tool_call>call:f{}<tool_call|><|tool_response>",
|
||||
}),
|
||||
|
||||
// tpl L253-L254: a non-object arguments string renders verbatim.
|
||||
Entry("tool_call non-object string arguments render verbatim", renderGemma4Case{
|
||||
msgs: []*pb.Message{
|
||||
{Role: "user", Content: "go"},
|
||||
{Role: "assistant", Content: "", ToolCalls: `[{"index":0,"id":"c1","type":"function","function":{"name":"f","arguments":"just text"}}]`},
|
||||
},
|
||||
expected: "<|turn>user\ngo<turn|>\n<|turn>model\n<|tool_call>call:f{just text}<tool_call|><|tool_response>",
|
||||
}),
|
||||
|
||||
// tpl L278-L285: unmatched tool_call_id falls back to the tool
|
||||
// message's own name.
|
||||
Entry("tool result name falls back when tool_call_id does not match", renderGemma4Case{
|
||||
msgs: []*pb.Message{
|
||||
{Role: "user", Content: "go"},
|
||||
{Role: "assistant", Content: "", ToolCalls: `[{"index":0,"id":"c1","type":"function","function":{"name":"f","arguments":"{}"}}]`},
|
||||
{Role: "tool", ToolCallId: "OTHER", Name: "named_tool", Content: "out"},
|
||||
},
|
||||
expected: "<|turn>user\ngo<turn|>\n<|turn>model\n" + `<|tool_call>call:f{}<tool_call|><|tool_response>response:named_tool{value:<|"|>out<|"|>}<tool_response|>`,
|
||||
}),
|
||||
|
||||
// strip_thinking (tpl L148-L158): historical assistant content loses
|
||||
// its <|channel>...<channel|> spans.
|
||||
Entry("assistant content thinking channels are stripped", renderGemma4Case{
|
||||
msgs: []*pb.Message{
|
||||
{Role: "user", Content: "hi"},
|
||||
{Role: "assistant", Content: "<|channel>thought\nsecret\n<channel|>visible answer"},
|
||||
{Role: "user", Content: "more"},
|
||||
},
|
||||
expected: "<|turn>user\nhi<turn|>\n<|turn>model\nvisible answer<turn|>\n<|turn>user\nmore<turn|>\n<|turn>model\n<|channel>thought\n<channel|>",
|
||||
}),
|
||||
|
||||
// tpl L220-L235: consecutive assistant messages suppress the second
|
||||
// <|turn>model (continuation), but each still closes with <turn|>.
|
||||
Entry("consecutive assistant messages continue the model turn", renderGemma4Case{
|
||||
msgs: []*pb.Message{
|
||||
{Role: "user", Content: "hi"},
|
||||
{Role: "assistant", Content: "part one"},
|
||||
{Role: "assistant", Content: "part two"},
|
||||
{Role: "user", Content: "ok"},
|
||||
},
|
||||
expected: "<|turn>user\nhi<turn|>\n<|turn>model\npart one<turn|>\npart two<turn|>\n<|turn>user\nok<turn|>\n<|turn>model\n<|channel>thought\n<channel|>",
|
||||
}),
|
||||
|
||||
Entry("add_generation_prompt=false renders no model turn", renderGemma4Case{
|
||||
msgs: []*pb.Message{
|
||||
{Role: "user", Content: "hi"},
|
||||
},
|
||||
noGenerationPrompt: true,
|
||||
expected: "<|turn>user\nhi<turn|>\n",
|
||||
}),
|
||||
)
|
||||
|
||||
Describe("error handling", func() {
|
||||
It("fails loud on an unknown role", func() {
	// A single message whose role is not one the renderer accepts; the
	// returned error must quote the offending role.
	history := []*pb.Message{
		{Role: "narrator", Content: "Meanwhile..."},
	}

	_, renderErr := RenderGemma4(history, "", false, true)

	Expect(renderErr).To(HaveOccurred())
	Expect(renderErr.Error()).To(ContainSubstring(`unknown role "narrator"`))
})
|
||||
|
||||
It("fails on invalid tools JSON", func() {
|
||||
_, err := RenderGemma4([]*pb.Message{
|
||||
{Role: "user", Content: "hi"},
|
||||
}, "{not json", false, true)
|
||||
Expect(err).To(HaveOccurred())
|
||||
Expect(err.Error()).To(ContainSubstring("tools JSON"))
|
||||
})
|
||||
|
||||
It("fails on invalid tool_calls JSON", func() {
	// The assistant message carries a ToolCalls payload that is not valid
	// JSON; the error must identify the tool_calls JSON as the culprit.
	history := []*pb.Message{
		{Role: "user", Content: "hi"},
		{Role: "assistant", Content: "", ToolCalls: "{not json"},
	}

	_, renderErr := RenderGemma4(history, "", false, true)

	Expect(renderErr).To(HaveOccurred())
	Expect(renderErr.Error()).To(ContainSubstring("tool_calls JSON"))
})
|
||||
|
||||
It("fails on an orphan tool message, naming its index", func() {
	// The tool message at index 1 has no preceding assistant tool_calls
	// turn. The jinja template would drop such a message silently; the
	// renderer is expected to fail loud and report the message index.
	history := []*pb.Message{
		{Role: "user", Content: "hi"},
		{Role: "tool", Content: `{"temp": 20}`, ToolCallId: "call_1"},
	}

	_, renderErr := RenderGemma4(history, "", false, true)

	Expect(renderErr).To(HaveOccurred())
	Expect(renderErr.Error()).To(ContainSubstring("orphan tool message 1"))
})
|
||||
|
||||
It("fails on trailing garbage after the tools JSON array", func() {
|
||||
_, err := RenderGemma4([]*pb.Message{
|
||||
{Role: "user", Content: "hi"},
|
||||
}, "[] junk", false, true)
|
||||
Expect(err).To(HaveOccurred())
|
||||
Expect(err.Error()).To(ContainSubstring("tools JSON"))
|
||||
})
|
||||
|
||||
It("fails when the tools JSON is not an array", func() {
|
||||
_, err := RenderGemma4([]*pb.Message{
|
||||
{Role: "user", Content: "hi"},
|
||||
}, `{"type":"function"}`, false, true)
|
||||
Expect(err).To(HaveOccurred())
|
||||
Expect(err.Error()).To(ContainSubstring("tools JSON is not an array"))
|
||||
})
|
||||
|
||||
It("fails when a tools array element is not an object", func() {
|
||||
_, err := RenderGemma4([]*pb.Message{
|
||||
{Role: "user", Content: "hi"},
|
||||
}, `[42]`, false, true)
|
||||
Expect(err).To(HaveOccurred())
|
||||
Expect(err.Error()).To(ContainSubstring("tools[0] is not an object"))
|
||||
})
|
||||
|
||||
It("rejects a nil message via the unknown-role check", func() {
	// Pins current behavior rather than a dedicated nil check: the pb
	// getters are nil-safe, so a nil entry reads as role "" and is caught
	// by the fail-loud unknown-role guard.
	history := []*pb.Message{nil}

	_, renderErr := RenderGemma4(history, "", false, true)

	Expect(renderErr).To(HaveOccurred())
	Expect(renderErr.Error()).To(ContainSubstring(`unknown role "" in message 0`))
})
|
||||
})
|
||||
})
|
||||
Reference in New Issue
Block a user