Compare commits

...

12 Commits

Author SHA1 Message Date
Michael Yang
d05fc26570 null truncate 2025-08-25 10:00:16 -07:00
Michael Yang
c457628090 null stream 2025-08-25 10:00:15 -07:00
Michael Yang
e914477bb6 types: add types.Null[T]
there's a common pattern where request fields may need to differentiate
between an unset value and a value set to the type's zero value. this is
commonly used to apply a different default value, e.g. stream, or to
omit a field entirely, e.g. think.

similar to sql.Null[T], types.Null[T] simplifies this by providing
utilities to quickly and easily apply this pattern to any type using
generics.
2025-08-25 09:49:02 -07:00
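A minimal sketch of the pattern this commit describes, using the Null[T] API added in types/null.go at the bottom of this diff (the request struct here is illustrative only):

```go
package main

import (
	"encoding/json"
	"fmt"

	"github.com/ollama/ollama/types"
)

// request is a hypothetical struct mirroring how the API types below use Null[T].
type request struct {
	Stream types.Null[bool] `json:"stream,omitempty"`
}

func main() {
	var unset, explicit request
	_ = json.Unmarshal([]byte(`{}`), &unset)                   // field never set
	_ = json.Unmarshal([]byte(`{"stream": false}`), &explicit) // explicitly false

	// Value(default) falls back to the default only when the field was
	// never set, so an absent field can mean "stream by default" while an
	// explicit false still disables streaming.
	fmt.Println(unset.Stream.Value(true))    // true
	fmt.Println(explicit.Stream.Value(true)) // false
}
```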
Jeffrey Morgan
4bcb04ad88 tools: avoid matching braces that are part of tool content (#12039) 2025-08-22 15:22:14 -07:00
Devon Rifkin
e3d5708754 Merge pull request #12021 from ollama/drifkin/thinking-double-emit
thinking: fix double emit when no opening tag
2025-08-22 12:01:37 -07:00
Jeffrey Morgan
4be4dc8717 server: skip parsing initial <think> if provided in the prompt (#12024) 2025-08-22 12:00:16 -07:00
zoupingshi
109d4fc3b4 chore: remove redundant words in comment (#12028)
Signed-off-by: zoupingshi <hangfachang@outlook.com>
2025-08-22 11:00:27 -07:00
Devon Rifkin
2cb0a580f3 thinking: fix double emit when no opening tag
The thinking parser will automatically transition to being a
pass-through if non-whitespace is seen before an opening tag. However,
we weren't clearing the buffer after the first non-whitespace input, so
in practice the first token would be emitted twice.

Added a test that demonstrated this, and then fixed the bug.
2025-08-21 21:03:12 -07:00
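A condensed, self-contained sketch of the fix (the real change is in the thinking parser diff further down; names here are simplified):

```go
package main

import (
	"fmt"
	"strings"
)

// flushPassthrough stands in for the parser's pass-through branch: once
// non-whitespace arrives before an opening tag, buffered input is emitted
// as plain content.
func flushPassthrough(acc *strings.Builder) string {
	out := acc.String()
	// The bug: without this Reset, the buffered prefix stayed behind and
	// was emitted again on the next flush, duplicating the first token.
	acc.Reset()
	return out
}

func main() {
	var acc strings.Builder
	acc.WriteString("Hello")
	fmt.Println(flushPassthrough(&acc)) // "Hello"
	acc.WriteString(" world")
	fmt.Println(flushPassthrough(&acc)) // " world" (not "Hello world" again)
}
```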
Parth Sareen
7cce5aac76 harmony: move harmony parsing into a package (#12016) 2025-08-21 13:56:22 -07:00
Michael Yang
4ae4f47b16 gpt-oss: convert from hugging face format (#11907) 2025-08-20 15:39:18 -07:00
Jesse Gross
073fa31df5 llm: Don't always evict models in CPU-only mode
With old memory estimates, it's currently impossible to load more
than one model at a time when no GPUs are available. This is because
the check for whether we need to evict a model looks to see if all
layers of the new model can be loaded onto GPUs, which is never true
if there are no GPUs. Before the memory management changes, there
was a special code path for CPU-only systems.

This problem does not exist with new memory estimates.

Fixes #11974
2025-08-20 14:31:02 -07:00
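The fix lands in the llm memory diff below; condensed here into a runnable sketch with simplified stand-in types for the GPU info and memory estimate:

```go
package main

import "fmt"

// gpuInfo and memEstimate are simplified stand-ins for the discover and
// memory-estimate types used by predictServerFit in the diff below.
type gpuInfo struct {
	Library    string
	FreeMemory uint64
}

type memEstimate struct{ TotalSize uint64 }

// cpuOnlyFits mirrors the added branch: with a single "cpu" device there
// are no GPU layers to count, so the model fits whenever its total size is
// within free system memory, and loading it need not evict another model.
func cpuOnlyFits(gpus []gpuInfo, est memEstimate) bool {
	return len(gpus) == 1 && gpus[0].Library == "cpu" && est.TotalSize <= gpus[0].FreeMemory
}

func main() {
	cpu := []gpuInfo{{Library: "cpu", FreeMemory: 16 << 30}}
	fmt.Println(cpuOnlyFits(cpu, memEstimate{TotalSize: 4 << 30}))  // true
	fmt.Println(cpuOnlyFits(cpu, memEstimate{TotalSize: 32 << 30})) // false
}
```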
Michael Yang
91fc3c48e3 openai: remove reasoning as an api.Options (#11993) 2025-08-20 12:21:42 -07:00
22 changed files with 725 additions and 173 deletions

View File

@@ -12,6 +12,7 @@ import (
"time"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/types"
"github.com/ollama/ollama/types/model"
)
@@ -64,7 +65,7 @@ type GenerateRequest struct {
Context []int `json:"context,omitempty"`
// Stream specifies whether the response is streaming; it is true by default.
Stream *bool `json:"stream,omitempty"`
Stream types.Null[bool] `json:"stream,omitempty"`
// Raw set to true means that no formatting will be applied to the prompt.
Raw bool `json:"raw,omitempty"`
@@ -105,7 +106,7 @@ type ChatRequest struct {
Messages []Message `json:"messages"`
// Stream enables streaming of returned responses; true by default.
Stream *bool `json:"stream,omitempty"`
Stream types.Null[bool] `json:"stream,omitempty"`
// Format is the format to return the response in (e.g. "json").
Format json.RawMessage `json:"format,omitempty"`
@@ -381,7 +382,7 @@ type EmbedRequest struct {
// this request.
KeepAlive *Duration `json:"keep_alive,omitempty"`
Truncate *bool `json:"truncate,omitempty"`
Truncate types.Null[bool] `json:"truncate,omitempty"`
// Options lists model-specific options.
Options map[string]any `json:"options"`
@@ -420,9 +421,9 @@ type EmbeddingResponse struct {
// CreateRequest is the request passed to [Client.Create].
type CreateRequest struct {
Model string `json:"model"`
Stream *bool `json:"stream,omitempty"`
Quantize string `json:"quantize,omitempty"`
Model string `json:"model"`
Stream types.Null[bool] `json:"stream,omitempty"`
Quantize string `json:"quantize,omitempty"`
From string `json:"from,omitempty"`
Files map[string]string `json:"files,omitempty"`
@@ -486,11 +487,11 @@ type CopyRequest struct {
// PullRequest is the request passed to [Client.Pull].
type PullRequest struct {
Model string `json:"model"`
Insecure bool `json:"insecure,omitempty"` // Deprecated: ignored
Username string `json:"username"` // Deprecated: ignored
Password string `json:"password"` // Deprecated: ignored
Stream *bool `json:"stream,omitempty"`
Model string `json:"model"`
Insecure bool `json:"insecure,omitempty"`
Username string `json:"username"` // Deprecated: ignored
Password string `json:"password"` // Deprecated: ignored
Stream types.Null[bool] `json:"stream,omitempty"`
// Deprecated: set the model name with Model instead
Name string `json:"name"`
@@ -507,11 +508,11 @@ type ProgressResponse struct {
// PushRequest is the request passed to [Client.Push].
type PushRequest struct {
Model string `json:"model"`
Insecure bool `json:"insecure,omitempty"`
Username string `json:"username"`
Password string `json:"password"`
Stream *bool `json:"stream,omitempty"`
Model string `json:"model"`
Insecure bool `json:"insecure,omitempty"`
Username string `json:"username"` // Deprecated: ignored
Password string `json:"password"` // Deprecated: ignored
Stream types.Null[bool] `json:"stream,omitempty"`
// Deprecated: set the model name with Model instead
Name string `json:"name"`

View File

@@ -15,19 +15,24 @@ import (
type gptossModel struct {
ModelParameters
HiddenLayers uint32 `json:"num_hidden_layers"`
HiddenSize uint32 `json:"hidden_size"`
IntermediateSize uint32 `json:"intermediate_size"`
AttentionHeads uint32 `json:"num_attention_heads"`
KeyValueHeads uint32 `json:"num_key_value_heads"`
HeadDim uint32 `json:"head_dim"`
Experts uint32 `json:"num_experts"`
ExpertsPerToken uint32 `json:"experts_per_token"`
RMSNormEpsilon float32 `json:"rms_norm_eps"`
InitialContextLength uint32 `json:"initial_context_length"`
RopeTheta float32 `json:"rope_theta"`
RopeScalingFactor float32 `json:"rope_scaling_factor"`
SlidingWindow uint32 `json:"sliding_window"`
HiddenLayers uint32 `json:"num_hidden_layers"`
MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
HiddenSize uint32 `json:"hidden_size"`
IntermediateSize uint32 `json:"intermediate_size"`
AttentionHeads uint32 `json:"num_attention_heads"`
KeyValueHeads uint32 `json:"num_key_value_heads"`
HeadDim uint32 `json:"head_dim"`
Experts uint32 `json:"num_experts"`
LocalExperts uint32 `json:"num_local_experts"`
ExpertsPerToken uint32 `json:"experts_per_token"`
RMSNormEpsilon float32 `json:"rms_norm_eps"`
InitialContextLength uint32 `json:"initial_context_length"`
RopeTheta float32 `json:"rope_theta"`
RopeScalingFactor float32 `json:"rope_scaling_factor"`
RopeScaling struct {
Factor float32 `json:"factor"`
} `json:"rope_scaling"`
SlidingWindow uint32 `json:"sliding_window"`
}
var _ ModelConverter = (*gptossModel)(nil)
@@ -36,11 +41,11 @@ func (m *gptossModel) KV(t *Tokenizer) ggml.KV {
kv := m.ModelParameters.KV(t)
kv["general.architecture"] = "gptoss"
kv["general.file_type"] = uint32(4)
kv["gptoss.context_length"] = uint32(m.RopeScalingFactor * float32(m.InitialContextLength))
kv["gptoss.context_length"] = cmp.Or(m.MaxPositionEmbeddings, uint32(m.RopeScalingFactor*float32(m.InitialContextLength)))
kv["gptoss.block_count"] = m.HiddenLayers
kv["gptoss.embedding_length"] = m.HiddenSize
kv["gptoss.feed_forward_length"] = m.IntermediateSize
kv["gptoss.expert_count"] = m.Experts
kv["gptoss.expert_count"] = cmp.Or(m.Experts, m.LocalExperts)
kv["gptoss.expert_used_count"] = m.ExpertsPerToken
kv["gptoss.attention.head_count"] = m.AttentionHeads
kv["gptoss.attention.head_count_kv"] = m.KeyValueHeads
@@ -49,7 +54,7 @@ func (m *gptossModel) KV(t *Tokenizer) ggml.KV {
kv["gptoss.attention.layer_norm_rms_epsilon"] = cmp.Or(m.RMSNormEpsilon, 1e-5)
kv["gptoss.attention.sliding_window"] = m.SlidingWindow
kv["gptoss.rope.freq_base"] = m.RopeTheta
kv["gptoss.rope.scaling.factor"] = m.RopeScalingFactor
kv["gptoss.rope.scaling.factor"] = cmp.Or(m.RopeScalingFactor, m.RopeScaling.Factor)
kv["gptoss.rope.scaling.original_context_length"] = m.InitialContextLength
kv["tokenizer.ggml.bos_token_id"] = uint32(199998) // <|startoftext|>
kv["tokenizer.ggml.add_bos_token"] = false
@@ -92,6 +97,11 @@ func (m *gptossModel) Tensors(ts []Tensor) []*ggml.Tensor {
for name, mxfp4 := range mxfp4s {
dims := mxfp4.blocks.Shape()
if !strings.HasSuffix(name, ".weight") {
name += ".weight"
}
out = append(out, &ggml.Tensor{
Name: name,
Kind: uint32(ggml.TensorTypeMXFP4),
@@ -104,25 +114,47 @@ func (m *gptossModel) Tensors(ts []Tensor) []*ggml.Tensor {
}
func (m *gptossModel) Replacements() []string {
return []string{
// noop replacements so other replacements will not be applied
".blocks", ".blocks",
".scales", ".scales",
// real replacements
"block", "blk",
"attn.norm", "attn_norm",
"attn.qkv", "attn_qkv",
"attn.sinks", "attn_sinks",
"attn.out", "attn_out",
"mlp.norm", "ffn_norm",
"mlp.gate", "ffn_gate_inp",
"mlp.mlp1_", "ffn_gate_up_exps.",
"mlp.mlp2_", "ffn_down_exps.",
"embedding", "token_embd",
"norm", "output_norm",
"unembedding", "output",
"scale", "weight",
var replacements []string
if m.MaxPositionEmbeddings > 0 {
// hf flavored model
replacements = []string{
"lm_head", "output",
"model.embed_tokens", "token_embd",
"model.layers", "blk",
"input_layernorm", "attn_norm",
"self_attn.q_proj", "attn_q",
"self_attn.k_proj", "attn_k",
"self_attn.v_proj", "attn_v",
"self_attn.o_proj", "attn_out",
"self_attn.sinks", "attn_sinks",
"post_attention_layernorm", "ffn_norm",
"mlp.router", "ffn_gate_inp",
"mlp.experts.gate_up_proj_", "ffn_gate_up_exps.",
"mlp.experts.down_proj_", "ffn_down_exps.",
"model.norm", "output_norm",
}
} else {
replacements = []string{
// noop replacements so other replacements will not be applied
".blocks", ".blocks",
".scales", ".scales",
// real replacements
"block", "blk",
"attn.norm", "attn_norm",
"attn.qkv", "attn_qkv",
"attn.sinks", "attn_sinks",
"attn.out", "attn_out",
"mlp.norm", "ffn_norm",
"mlp.gate", "ffn_gate_inp",
"mlp.mlp1_", "ffn_gate_up_exps.",
"mlp.mlp2_", "ffn_down_exps.",
"embedding", "token_embd",
"norm", "output_norm",
"unembedding", "output",
"scale", "weight",
}
}
return replacements
}
type mxfp4 struct {
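The Replacements pairs above are (old, new) substrings for tensor names; assuming they feed something like strings.NewReplacer (the consuming code is not part of this diff), an HF-flavored name maps to its GGUF name like so:

```go
package main

import (
	"fmt"
	"strings"
)

func main() {
	// Two pairs from the hf-flavored list above, applied to a typical
	// Hugging Face tensor name.
	r := strings.NewReplacer(
		"model.layers", "blk",
		"self_attn.q_proj", "attn_q",
	)
	fmt.Println(r.Replace("model.layers.0.self_attn.q_proj.weight"))
	// Output: blk.0.attn_q.weight
}
```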

View File

@@ -1,10 +1,9 @@
package server
package harmony
import (
"context"
"fmt"
"log/slog"
"slices"
"strings"
"unicode"
@@ -20,18 +19,6 @@ const (
harmonyParserState_ParsingContent
)
func shouldUseHarmony(model Model) bool {
if slices.Contains([]string{"gptoss", "gpt-oss"}, model.Config.ModelFamily) {
// heuristic to check whether the template expects to be parsed via harmony:
// search for harmony tags that are nearly always used
if model.Template.Contains("<|start|>") && model.Template.Contains("<|end|>") {
return true
}
}
return false
}
func (s harmonyParserState) String() string {
switch s {
// we're looking for the message start tag
@@ -277,20 +264,20 @@ const (
// This is a higher level interface that maps harmony concepts into ollama concepts
type HarmonyMessageHandler struct {
state harmonyMessageState
harmonyParser *HarmonyParser
functionNameMap *FunctionNameMap
HarmonyParser *HarmonyParser
FunctionNameMap *FunctionNameMap
}
// NewHarmonyMessageHandler creates a new message handler
func NewHarmonyMessageHandler() *HarmonyMessageHandler {
return &HarmonyMessageHandler{
state: harmonyMessageState_Normal,
harmonyParser: &HarmonyParser{
HarmonyParser: &HarmonyParser{
MessageStartTag: "<|start|>",
MessageEndTag: "<|end|>",
HeaderEndTag: "<|message|>",
},
functionNameMap: NewFunctionNameMap(),
FunctionNameMap: NewFunctionNameMap(),
}
}
@@ -301,7 +288,7 @@ func (h *HarmonyMessageHandler) AddContent(content string, toolParser *HarmonyTo
thinkingSb := strings.Builder{}
toolContentSb := strings.Builder{}
events := h.harmonyParser.AddContent(content)
events := h.HarmonyParser.AddContent(content)
for _, event := range events {
switch event := event.(type) {
case HarmonyEventHeaderComplete:

View File

@@ -1,4 +1,4 @@
package server
package harmony
import (
"fmt"

View File

@@ -30,7 +30,7 @@ func pickBestFullFitByLibrary(f *ggml.GGML, modelPath string, projectors []strin
// Try to pack into as few GPUs as possible, starting from 1 GPU
for numGPUs := 1; numGPUs <= len(sgl); numGPUs++ {
gpuSubset := sgl[:numGPUs]
ok, estimatedVRAM := PredictServerFit(gpuSubset, f, adapters, projectors, opts, numParallel)
ok, estimatedVRAM := predictServerFit(gpuSubset, f, adapters, projectors, opts, numParallel)
if ok {
slog.Info("new model will fit in available VRAM across minimum required GPUs, loading",
@@ -48,7 +48,7 @@ func pickBestFullFitByLibrary(f *ggml.GGML, modelPath string, projectors []strin
// - try subsets of GPUs instead of just falling back to 1 or all in a family
// Now try all the GPUS (OLLAMA_SCHED_SPREAD is set)
if ok, estimatedVRAM := PredictServerFit(sgl, f, adapters, projectors, opts, numParallel); ok {
if ok, estimatedVRAM := predictServerFit(sgl, f, adapters, projectors, opts, numParallel); ok {
slog.Info("new model will fit in available VRAM, loading",
"model", modelPath,
"library", sgl[0].Library,
@@ -71,7 +71,7 @@ func pickBestPartialFitByLibrary(f *ggml.GGML, projectors []string, adapters []s
var bestEstimate uint64
var bestFit int
for i, gl := range byLibrary {
_, estimatedVRAM := PredictServerFit(gl, f, adapters, projectors, opts, numParallel)
_, estimatedVRAM := predictServerFit(gl, f, adapters, projectors, opts, numParallel)
if estimatedVRAM > bestEstimate {
bestEstimate = estimatedVRAM
bestFit = i
@@ -81,7 +81,7 @@ func pickBestPartialFitByLibrary(f *ggml.GGML, projectors []string, adapters []s
}
// This algorithm looks for a complete fit to determine if we need to unload other models
func PredictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (bool, uint64) {
func predictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (bool, uint64) {
// Split up the GPUs by type and try them
var estimatedVRAM uint64
for _, gpus := range allGpus.ByLibrary() {
@@ -97,6 +97,10 @@ func PredictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, proj
return true, estimatedVRAM
}
}
if len(gpus) == 1 && gpus[0].Library == "cpu" && estimate.TotalSize <= gpus[0].FreeMemory {
return true, estimatedVRAM
}
}
return false, estimatedVRAM
}

View File

@@ -492,6 +492,7 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
if !requireFull {
g = pickBestPartialFitByLibrary(s.ggml, []string{s.loadRequest.ProjectorPath}, s.loadRequest.LoraPath, s.options, gpus, s.numParallel)
} else {
slog.Info("model requires more memory than is currently available, evicting a model to make space", "estimate", s.estimate)
return ErrLoadRequiredFull
}
}
@@ -524,10 +525,6 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
}
}
if requireFull && len(gpus) == 1 && gpus[0].Library == "cpu" && s.estimate.TotalSize > gpus[0].FreeMemory {
return ErrLoadRequiredFull
}
slog.Info("offload", "", s.estimate)
s.gpus = gpus

View File

@@ -17,6 +17,7 @@ import (
"github.com/gin-gonic/gin"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/types"
"github.com/ollama/ollama/types/model"
)
@@ -557,12 +558,10 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
var think *api.ThinkValue
if r.Reasoning != nil {
options["reasoning"] = *r.Reasoning.Effort
think = &api.ThinkValue{
Value: *r.Reasoning.Effort,
}
} else if r.ReasoningEffort != nil {
options["reasoning"] = *r.ReasoningEffort
think = &api.ThinkValue{
Value: *r.ReasoningEffort,
}
@@ -573,7 +572,7 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
Messages: messages,
Format: format,
Options: options,
Stream: &r.Stream,
Stream: types.NullWithValue(r.Stream),
Tools: r.Tools,
Think: think,
}, nil
@@ -652,7 +651,7 @@ func fromCompleteRequest(r CompletionRequest) (api.GenerateRequest, error) {
Model: r.Model,
Prompt: r.Prompt,
Options: options,
Stream: &r.Stream,
Stream: types.NullWithValue(r.Stream),
Suffix: r.Suffix,
}, nil
}

View File

@@ -46,7 +46,7 @@ func NewInputCache(lc *llama.Context, kvSize int, numSlots int, multiUserCache b
}
// Locking: Operations on InputCacheSlot (including finding one
// through LoadCacheSlot) require a lock to be be held that serializes
// through LoadCacheSlot) require a lock to be held that serializes
// these operations with each other and llama.Decode
type InputCacheSlot struct {

View File

@@ -78,7 +78,7 @@ func (c *InputCache) Close() {
}
// Locking: Operations on InputCacheSlot (including finding one
// through LoadCacheSlot) require a lock to be be held that serializes
// through LoadCacheSlot) require a lock to be held that serializes
// these operations with each other and processBatch
type InputCacheSlot struct {

View File

@@ -146,7 +146,7 @@ func (s *Server) CreateHandler(c *gin.Context) {
ch <- api.ProgressResponse{Status: "success"}
}()
if r.Stream != nil && !*r.Stream {
if !r.Stream.Value(true) {
waitForStream(c, ch)
return
}

View File

@@ -32,6 +32,7 @@ import (
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/harmony"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/logutil"
"github.com/ollama/ollama/openai"
@@ -45,6 +46,18 @@ import (
"github.com/ollama/ollama/version"
)
func shouldUseHarmony(model *Model) bool {
if slices.Contains([]string{"gptoss", "gpt-oss"}, model.Config.ModelFamily) {
// heuristic to check whether the template expects to be parsed via harmony:
// search for harmony tags that are nearly always used
if model.Template.Contains("<|start|>") && model.Template.Contains("<|end|>") {
return true
}
}
return false
}
func experimentEnabled(name string) bool {
return slices.Contains(strings.Split(os.Getenv("OLLAMA_EXPERIMENT"), ","), name)
}
@@ -194,12 +207,12 @@ func (s *Server) GenerateHandler(c *gin.Context) {
return
}
useHarmony := shouldUseHarmony(*m) && !req.Raw
var harmonyMessageHandler *HarmonyMessageHandler
var harmonyToolParser *HarmonyToolCallAccumulator
useHarmony := shouldUseHarmony(m) && !req.Raw
var harmonyMessageHandler *harmony.HarmonyMessageHandler
var harmonyToolParser *harmony.HarmonyToolCallAccumulator
if useHarmony {
harmonyMessageHandler = NewHarmonyMessageHandler()
harmonyMessageHandler.harmonyParser.AddImplicitStart()
harmonyMessageHandler = harmony.NewHarmonyMessageHandler()
harmonyMessageHandler.HarmonyParser.AddImplicitStart()
harmonyToolParser = harmonyMessageHandler.CreateToolParser()
}
@@ -427,7 +440,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
}
}()
if req.Stream != nil && !*req.Stream {
if !req.Stream.Value(true) {
var r api.GenerateResponse
var sbThinking strings.Builder
var sbContent strings.Builder
@@ -474,12 +487,6 @@ func (s *Server) EmbedHandler(c *gin.Context) {
return
}
truncate := true
if req.Truncate != nil && !*req.Truncate {
truncate = false
}
var input []string
switch i := req.Input.(type) {
@@ -528,6 +535,7 @@ func (s *Server) EmbedHandler(c *gin.Context) {
}
var count int
truncate := req.Truncate.Value(true)
for i, s := range input {
tokens, err := r.Tokenize(c.Request.Context(), s)
if err != nil {
@@ -688,7 +696,7 @@ func (s *Server) PullHandler(c *gin.Context) {
}
}()
if req.Stream != nil && !*req.Stream {
if !req.Stream.Value(true) {
waitForStream(c, ch)
return
}
@@ -743,7 +751,7 @@ func (s *Server) PushHandler(c *gin.Context) {
}
}()
if req.Stream != nil && !*req.Stream {
if !req.Stream.Value(true) {
waitForStream(c, ch)
return
}
@@ -1603,19 +1611,19 @@ func (s *Server) ChatHandler(c *gin.Context) {
}
msgs = filterThinkTags(msgs, m)
var harmonyMessageHandler *HarmonyMessageHandler
var harmonyToolParser *HarmonyToolCallAccumulator
var harmonyMessageHandler *harmony.HarmonyMessageHandler
var harmonyToolParser *harmony.HarmonyToolCallAccumulator
useHarmony := shouldUseHarmony(*m)
useHarmony := shouldUseHarmony(m)
processedTools := req.Tools
if useHarmony {
harmonyMessageHandler = NewHarmonyMessageHandler()
harmonyMessageHandler = harmony.NewHarmonyMessageHandler()
var lastMessage *api.Message
if len(msgs) > 0 {
lastMessage = &msgs[len(msgs)-1]
}
harmonyMessageHandler.harmonyParser.AddImplicitStartOrPrefill(lastMessage)
harmonyMessageHandler.HarmonyParser.AddImplicitStartOrPrefill(lastMessage)
harmonyToolParser = harmonyMessageHandler.CreateToolParser()
// make a copy of tools to pass to the chat prompt. Function names may be
@@ -1623,7 +1631,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
processedTools = make([]api.Tool, len(req.Tools))
copy(processedTools, req.Tools)
for i, tool := range processedTools {
processedTools[i].Function.Name = harmonyMessageHandler.functionNameMap.ConvertAndAdd(tool.Function.Name)
processedTools[i].Function.Name = harmonyMessageHandler.FunctionNameMap.ConvertAndAdd(tool.Function.Name)
}
}
@@ -1660,6 +1668,10 @@ func (s *Server) ChatHandler(c *gin.Context) {
OpeningTag: openingTag,
ClosingTag: closingTag,
}
if strings.HasSuffix(strings.TrimSpace(prompt), openingTag) {
thinkingState.AddContent(openingTag)
}
}
var toolParser *tools.Parser
@@ -1705,7 +1717,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
toolName, toolContent := harmonyToolParser.Drain()
if toolName != nil {
*toolName = strings.TrimPrefix(*toolName, "functions.")
*toolName = harmonyMessageHandler.functionNameMap.OriginalFromConverted(*toolName)
*toolName = harmonyMessageHandler.FunctionNameMap.OriginalFromConverted(*toolName)
var args api.ToolCallFunctionArguments
if err := json.Unmarshal([]byte(toolContent), &args); err != nil {
errStr := fmt.Sprintf("error parsing tool call: raw='%s', err=%s", toolContent, err.Error())
@@ -1758,7 +1770,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
}
}()
if req.Stream != nil && !*req.Stream {
if !req.Stream.Value(true) {
var resp api.ChatResponse
var toolCalls []api.ToolCall
var sbThinking strings.Builder

View File

@@ -22,8 +22,6 @@ import (
"github.com/ollama/ollama/fs/ggml"
)
var stream bool = false
func createBinFile(t *testing.T, kv map[string]any, ti []*ggml.Tensor) (string, string) {
t.Helper()
t.Setenv("OLLAMA_MODELS", cmp.Or(os.Getenv("OLLAMA_MODELS"), t.TempDir()))
@@ -118,7 +116,7 @@ func TestCreateFromBin(t *testing.T) {
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Name: "test",
Files: map[string]string{"test.gguf": digest},
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -148,7 +146,7 @@ func TestCreateFromModel(t *testing.T) {
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Name: "test",
Files: map[string]string{"test.gguf": digest},
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -162,7 +160,7 @@ func TestCreateFromModel(t *testing.T) {
w = createRequest(t, s.CreateHandler, api.CreateRequest{
Name: "test2",
From: "test",
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -192,7 +190,7 @@ func TestCreateRemovesLayers(t *testing.T) {
Name: "test",
Files: map[string]string{"test.gguf": digest},
Template: "{{ .Prompt }}",
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -213,7 +211,7 @@ func TestCreateRemovesLayers(t *testing.T) {
Name: "test",
Files: map[string]string{"test.gguf": digest},
Template: "{{ .System }} {{ .Prompt }}",
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -243,7 +241,7 @@ func TestCreateUnsetsSystem(t *testing.T) {
Name: "test",
Files: map[string]string{"test.gguf": digest},
System: "Say hi!",
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -264,7 +262,7 @@ func TestCreateUnsetsSystem(t *testing.T) {
Name: "test",
Files: map[string]string{"test.gguf": digest},
System: "",
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -297,7 +295,7 @@ func TestCreateMergeParameters(t *testing.T) {
"top_k": 10,
"stop": []string{"USER:", "ASSISTANT:"},
},
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -322,7 +320,7 @@ func TestCreateMergeParameters(t *testing.T) {
"temperature": 0.6,
"top_p": 0.7,
},
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -381,7 +379,7 @@ func TestCreateMergeParameters(t *testing.T) {
"top_p": 0.7,
"stop": []string{"<|endoftext|>"},
},
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -441,7 +439,7 @@ func TestCreateReplacesMessages(t *testing.T) {
Content: "Oh, my god.",
},
},
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -475,7 +473,7 @@ func TestCreateReplacesMessages(t *testing.T) {
Content: "A test. And a thumping good one at that, I'd wager.",
},
},
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -536,7 +534,7 @@ func TestCreateTemplateSystem(t *testing.T) {
Files: map[string]string{"test.gguf": digest},
Template: "{{ .System }} {{ .Prompt }}",
System: "Say bye!",
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -578,7 +576,7 @@ func TestCreateTemplateSystem(t *testing.T) {
Name: "test",
Files: map[string]string{"test.gguf": digest},
Template: "{{ .Prompt",
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusBadRequest {
@@ -592,7 +590,7 @@ func TestCreateTemplateSystem(t *testing.T) {
Name: "test",
Files: map[string]string{"test.gguf": digest},
Template: "{{ if .Prompt }}",
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusBadRequest {
@@ -606,7 +604,7 @@ func TestCreateTemplateSystem(t *testing.T) {
Name: "test",
Files: map[string]string{"test.gguf": digest},
Template: "{{ Prompt }}",
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusBadRequest {
@@ -627,7 +625,7 @@ func TestCreateLicenses(t *testing.T) {
Name: "test",
Files: map[string]string{"test.gguf": digest},
License: []string{"MIT", "Apache-2.0"},
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -678,7 +676,7 @@ func TestCreateDetectTemplate(t *testing.T) {
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Name: "test",
Files: map[string]string{"test.gguf": digest},
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -698,7 +696,7 @@ func TestCreateDetectTemplate(t *testing.T) {
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Name: "test",
Files: map[string]string{"test.gguf": digest},
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {

View File

@@ -12,6 +12,7 @@ import (
"github.com/ollama/ollama/discover"
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/types"
)
func TestGenerateDebugRenderOnly(t *testing.T) {
@@ -53,7 +54,6 @@ func TestGenerateDebugRenderOnly(t *testing.T) {
go s.sched.Run(t.Context())
// Create a test model
stream := false
_, digest := createBinFile(t, ggml.KV{
"general.architecture": "llama",
"llama.block_count": uint32(1),
@@ -82,7 +82,7 @@ func TestGenerateDebugRenderOnly(t *testing.T) {
Model: "test-model",
Files: map[string]string{"file.gguf": digest},
Template: "{{ .Prompt }}",
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -172,7 +172,7 @@ func TestGenerateDebugRenderOnly(t *testing.T) {
}
t.Run(tt.name+streamSuffix, func(t *testing.T) {
req := tt.request
req.Stream = &stream
req.Stream = types.NullWithValue(stream)
w := createRequest(t, s.GenerateHandler, req)
if tt.expectDebug {
@@ -246,7 +246,6 @@ func TestChatDebugRenderOnly(t *testing.T) {
go s.sched.Run(t.Context())
// Create a test model
stream := false
_, digest := createBinFile(t, ggml.KV{
"general.architecture": "llama",
"llama.block_count": uint32(1),
@@ -275,7 +274,7 @@ func TestChatDebugRenderOnly(t *testing.T) {
Model: "test-model",
Files: map[string]string{"file.gguf": digest},
Template: "{{ if .Tools }}{{ .Tools }}{{ end }}{{ range .Messages }}{{ .Role }}: {{ .Content }}\n{{ end }}",
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -377,7 +376,7 @@ func TestChatDebugRenderOnly(t *testing.T) {
}
t.Run(tt.name+streamSuffix, func(t *testing.T) {
req := tt.request
req.Stream = &stream
req.Stream = types.NullWithValue(stream)
w := createRequest(t, s.ChatHandler, req)
if tt.expectDebug {

View File

@@ -126,7 +126,7 @@ func TestGenerateChat(t *testing.T) {
{{- range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}}
{{- end }}
{{ end }}`,
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -182,7 +182,7 @@ func TestGenerateChat(t *testing.T) {
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Model: "bert",
Files: map[string]string{"bert.gguf": digest},
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -288,7 +288,7 @@ func TestGenerateChat(t *testing.T) {
Messages: []api.Message{
{Role: "user", Content: "Hello!"},
},
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -318,7 +318,7 @@ func TestGenerateChat(t *testing.T) {
Messages: []api.Message{
{Role: "user", Content: "Hello!"},
},
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -340,7 +340,7 @@ func TestGenerateChat(t *testing.T) {
{Role: "system", Content: "You can perform magic tricks."},
{Role: "user", Content: "Hello!"},
},
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -363,7 +363,7 @@ func TestGenerateChat(t *testing.T) {
{Role: "system", Content: "You can perform magic tricks."},
{Role: "user", Content: "Help me write tests."},
},
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -422,15 +422,13 @@ func TestGenerateChat(t *testing.T) {
EvalDuration: 1,
}
streamRequest := true
w := createRequest(t, s.ChatHandler, api.ChatRequest{
Model: "test-system",
Messages: []api.Message{
{Role: "user", Content: "What's the weather in Seattle?"},
},
Tools: tools,
Stream: &streamRequest,
Stream: streamTrue,
})
if w.Code != http.StatusOK {
@@ -551,7 +549,7 @@ func TestGenerateChat(t *testing.T) {
{Role: "user", Content: "What's the weather in Seattle?"},
},
Tools: tools,
Stream: &stream,
Stream: streamFalse,
})
wg.Wait()
@@ -666,7 +664,7 @@ func TestGenerate(t *testing.T) {
{{- if .Prompt }}User: {{ .Prompt }} {{ end }}
{{- if .Response }}Assistant: {{ .Response }} {{ end }}
`,
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -704,7 +702,7 @@ func TestGenerate(t *testing.T) {
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Model: "bert",
Files: map[string]string{"file.gguf": digest},
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -825,7 +823,7 @@ func TestGenerate(t *testing.T) {
w := createRequest(t, s.GenerateHandler, api.GenerateRequest{
Model: "test",
Prompt: "Hello!",
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -853,7 +851,7 @@ func TestGenerate(t *testing.T) {
w := createRequest(t, s.GenerateHandler, api.GenerateRequest{
Model: "test-system",
Prompt: "Hello!",
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -873,7 +871,7 @@ func TestGenerate(t *testing.T) {
Model: "test-system",
Prompt: "Hello!",
System: "You can perform magic tricks.",
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -895,7 +893,7 @@ func TestGenerate(t *testing.T) {
Template: `{{- if .System }}{{ .System }} {{ end }}
{{- if .Prompt }}### USER {{ .Prompt }} {{ end }}
{{- if .Response }}### ASSISTANT {{ .Response }} {{ end }}`,
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -957,7 +955,7 @@ func TestGenerate(t *testing.T) {
Model: "test-system",
Prompt: "Help me write tests.",
Raw: true,
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -969,3 +967,232 @@ func TestGenerate(t *testing.T) {
}
})
}
func TestChatWithPromptEndingInThinkTag(t *testing.T) {
gin.SetMode(gin.TestMode)
// Helper to create a standard thinking test setup
setupThinkingTest := func(t *testing.T) (*mockRunner, *Server) {
mock := &mockRunner{
CompletionResponse: llm.CompletionResponse{
Done: true,
DoneReason: llm.DoneReasonStop,
PromptEvalCount: 1,
PromptEvalDuration: 1,
EvalCount: 1,
EvalDuration: 1,
},
}
s := &Server{
sched: &Scheduler{
pendingReqCh: make(chan *LlmRequest, 1),
finishedReqCh: make(chan *LlmRequest, 1),
expiredCh: make(chan *runnerRef, 1),
unloadedCh: make(chan any, 1),
loaded: make(map[string]*runnerRef),
newServerFn: newMockServer(mock),
getGpuFn: discover.GetGPUInfo,
getCpuFn: discover.GetCPUInfo,
reschedDelay: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{llama: mock}
return false
},
},
}
go s.sched.Run(t.Context())
// Create a model with thinking support
_, digest := createBinFile(t, ggml.KV{
"general.architecture": "llama",
"llama.block_count": uint32(1),
"llama.context_length": uint32(8192),
"llama.embedding_length": uint32(4096),
"llama.attention.head_count": uint32(32),
"llama.attention.head_count_kv": uint32(8),
"tokenizer.ggml.tokens": []string{""},
"tokenizer.ggml.scores": []float32{0},
"tokenizer.ggml.token_type": []int32{0},
}, []*ggml.Tensor{
{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_gate.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_up.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_k.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_q.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_v.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
})
// Create model with thinking template that adds <think> at the end
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Model: "test-thinking",
Files: map[string]string{"file.gguf": digest},
Template: `{{- range .Messages }}
{{- if eq .Role "user" }}user: {{ .Content }}
{{ else if eq .Role "assistant" }}assistant: {{ if .Thinking }}<think>{{ .Thinking }}</think>{{ end }}{{ .Content }}
{{ end }}{{ end }}<think>`,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
t.Fatalf("expected status 200, got %d", w.Code)
}
return mock, s
}
mock, s := setupThinkingTest(t)
// Helper to test chat responses
testChatRequest := func(t *testing.T, name string, userContent string, modelResponse string, expectedThinking string, expectedContent string, think bool) {
t.Run(name, func(t *testing.T) {
mock.CompletionResponse = llm.CompletionResponse{
Content: modelResponse,
Done: true,
DoneReason: llm.DoneReasonStop,
PromptEvalCount: 1,
PromptEvalDuration: 1,
EvalCount: 1,
EvalDuration: 1,
}
mock.CompletionFn = nil
req := api.ChatRequest{
Model: "test-thinking",
Messages: []api.Message{
{Role: "user", Content: userContent},
},
Stream: streamFalse,
}
if think {
req.Think = &api.ThinkValue{Value: think}
}
w := createRequest(t, s.ChatHandler, req)
if w.Code != http.StatusOK {
t.Fatalf("expected status 200, got %d", w.Code)
}
var resp api.ChatResponse
if err := json.NewDecoder(w.Body).Decode(&resp); err != nil {
t.Fatal(err)
}
if resp.Message.Thinking != expectedThinking {
t.Errorf("expected thinking %q, got %q", expectedThinking, resp.Message.Thinking)
}
if resp.Message.Content != expectedContent {
t.Errorf("expected content %q, got %q", expectedContent, resp.Message.Content)
}
})
}
// Test cases - Note: Template adds <think> at the end, and leading whitespace after <think> is eaten by the parser
testChatRequest(t, "basic thinking response",
"Help me solve this problem",
" Let me think about this step by step... </think> The answer is 42.",
"Let me think about this step by step... ",
"The answer is 42.",
true)
testChatRequest(t, "thinking with multiple sentences",
"Explain quantum computing",
" First, I need to understand the basics. Quantum bits can be in superposition. </think> Quantum computing uses quantum mechanics principles.",
"First, I need to understand the basics. Quantum bits can be in superposition. ",
"Quantum computing uses quantum mechanics principles.",
true)
testChatRequest(t, "no thinking content",
"What is 2+2?",
"</think> The answer is 4.",
"",
"The answer is 4.",
true)
testChatRequest(t, "thinking disabled but template still adds think tag",
"Simple question",
" My thoughts </think> The answer.",
"",
" My thoughts </think> The answer.",
false)
// Test streaming response with template-added <think>
t.Run("streaming with thinking", func(t *testing.T) {
var wg sync.WaitGroup
wg.Add(1)
mock.CompletionFn = func(ctx context.Context, r llm.CompletionRequest, fn func(r llm.CompletionResponse)) error {
defer wg.Done()
// Verify the prompt ends with <think> due to template
if !strings.HasSuffix(r.Prompt, "<think>") {
t.Errorf("expected prompt to end with <think>, got: %q", r.Prompt)
}
// Simulate streaming chunks
responses := []llm.CompletionResponse{
{Content: " I need to consider", Done: false, PromptEvalCount: 1, PromptEvalDuration: 1},
{Content: " multiple factors here...", Done: false, PromptEvalCount: 1, PromptEvalDuration: 1},
{Content: " </think> Based on my analysis,", Done: false, PromptEvalCount: 1, PromptEvalDuration: 1},
{Content: " the solution is straightforward.", Done: true, DoneReason: llm.DoneReasonStop, PromptEvalCount: 1, PromptEvalDuration: 1, EvalCount: 1, EvalDuration: 1},
}
for _, resp := range responses {
select {
case <-ctx.Done():
return ctx.Err()
default:
fn(resp)
time.Sleep(10 * time.Millisecond)
}
}
return nil
}
think := true
w := createRequest(t, s.ChatHandler, api.ChatRequest{
Model: "test-thinking",
Messages: []api.Message{{Role: "user", Content: "Analyze this complex problem"}},
Think: &api.ThinkValue{Value: think},
Stream: streamFalse,
})
wg.Wait()
if w.Code != http.StatusOK {
t.Fatalf("expected status 200, got %d", w.Code)
}
// Parse streaming responses
decoder := json.NewDecoder(w.Body)
var allThinking, allContent strings.Builder
for {
var resp api.ChatResponse
if err := decoder.Decode(&resp); err == io.EOF {
break
} else if err != nil {
t.Fatal(err)
}
allThinking.WriteString(resp.Message.Thinking)
allContent.WriteString(resp.Message.Content)
}
// Note: Leading whitespace after <think> is eaten by the parser
if got := allThinking.String(); got != "I need to consider multiple factors here... " {
t.Errorf("expected thinking %q, got %q", "I need to consider multiple factors here... ", got)
}
if got := allContent.String(); got != "Based on my analysis, the solution is straightforward." {
t.Errorf("expected content %q, got %q", "Based on my analysis, the solution is straightforward.", got)
}
})
}

View File

@@ -291,12 +291,11 @@ func TestChatHarmonyParserStreamingRealtime(t *testing.T) {
// Create a simple test model
_, digest := createHarmonyTestModel(t)
streamFalse := false
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Model: "harmony-test-streaming",
Files: map[string]string{"test.gguf": digest},
Template: `<|start|><|end|>{{ with .Tools }}{{ end }}{{ .Prompt }}`,
Stream: &streamFalse,
Stream: streamFalse,
})
if w.Code != 200 {
@@ -304,11 +303,10 @@ func TestChatHarmonyParserStreamingRealtime(t *testing.T) {
}
// Test chat endpoint with streaming
streamTrue := true
w = createRequest(t, s.ChatHandler, api.ChatRequest{
Model: "harmony-test-streaming",
Messages: []api.Message{{Role: "user", Content: "Hello"}},
Stream: &streamTrue,
Stream: streamTrue,
Tools: getTestTools(),
})
@@ -441,12 +439,11 @@ func TestChatHarmonyParserStreamingSimple(t *testing.T) {
// Create model
_, digest := createHarmonyTestModel(t)
streamFalse := false
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Model: "gpt-oss",
Files: map[string]string{"test.gguf": digest},
Template: `<|start|><|end|>{{ .Tools }}{{ .Prompt }}`,
Stream: &streamFalse,
Stream: streamFalse,
})
if w.Code != 200 {
@@ -454,11 +451,10 @@ func TestChatHarmonyParserStreamingSimple(t *testing.T) {
}
// Test streaming
streamTrue := true
w = createRequest(t, s.ChatHandler, api.ChatRequest{
Model: "gpt-oss",
Messages: []api.Message{{Role: "user", Content: "Hello"}},
Stream: &streamTrue,
Stream: streamTrue,
Tools: getTestTools(),
})
@@ -625,12 +621,11 @@ func TestChatHarmonyParserStreaming(t *testing.T) {
_, digest := createHarmonyTestModel(t)
// Create model with passthrough template
stream := false
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Model: "harmony-test",
Files: map[string]string{"file.gguf": digest},
Template: `<|start|><|end|>{{ with .Tools }}{{ end }}{{ .Prompt }}`,
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -638,11 +633,10 @@ func TestChatHarmonyParserStreaming(t *testing.T) {
}
// Test chat endpoint with streaming
streamTrue := true
w = createRequest(t, s.ChatHandler, api.ChatRequest{
Model: "harmony-test",
Messages: []api.Message{{Role: "user", Content: "Hello"}},
Stream: &streamTrue,
Stream: streamTrue,
Tools: getTestTools(),
})

View File

@@ -28,10 +28,16 @@ import (
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/openai"
"github.com/ollama/ollama/server/internal/client/ollama"
"github.com/ollama/ollama/types"
"github.com/ollama/ollama/types/model"
"github.com/ollama/ollama/version"
)
var (
streamFalse = types.NullWithValue(false)
streamTrue = types.NullWithValue(true)
)
func createTestFile(t *testing.T, name string) (string, string) {
t.Helper()
@@ -332,11 +338,10 @@ func TestRoutes(t *testing.T) {
Path: "/api/create",
Setup: func(t *testing.T, req *http.Request) {
_, digest := createTestFile(t, "ollama-model")
stream := false
createReq := api.CreateRequest{
Name: "t-bone",
Files: map[string]string{"test.gguf": digest},
Stream: &stream,
Stream: streamFalse,
}
jsonData, err := json.Marshal(createReq)
if err != nil {
@@ -638,7 +643,7 @@ func TestManifestCaseSensitivity(t *testing.T) {
// version.
Name: wantStableName,
Files: map[string]string{"test.gguf": digest},
Stream: &stream,
Stream: streamFalse,
}))
checkManifestList()
@@ -646,14 +651,14 @@ func TestManifestCaseSensitivity(t *testing.T) {
checkOK(createRequest(t, s.CreateHandler, api.CreateRequest{
Name: name(),
Files: map[string]string{"test.gguf": digest},
Stream: &stream,
Stream: streamFalse,
}))
checkManifestList()
t.Logf("pulling")
checkOK(createRequest(t, s.PullHandler, api.PullRequest{
Name: name(),
Stream: &stream,
Stream: streamFalse,
Insecure: true,
}))
checkManifestList()

View File

@@ -103,7 +103,9 @@ func eat(s *Parser) (string, string, bool) {
// note that we use the original content, not the trimmed one because we
// don't want to eat any whitespace in the real content if there were no
// thinking tags
return "", s.acc.String(), false
untrimmed := s.acc.String()
s.acc.Reset()
return "", untrimmed, false
}
case thinkingState_ThinkingStartedEatingWhitespace:
trimmed := strings.TrimLeftFunc(s.acc.String(), unicode.IsSpace)

View File

@@ -58,6 +58,15 @@ func TestThinkingStreaming(t *testing.T) {
wantContent: " abc",
wantStateAfter: thinkingState_ThinkingDone,
},
// regression test for a bug where we were transitioning directly to
// ThinkingDone without clearing the buffer. This would cause the first
// step to be output twice
{
input: "def",
wantThinking: "",
wantContent: "def",
wantStateAfter: thinkingState_ThinkingDone,
},
},
},
{

View File

@@ -224,22 +224,45 @@ func findArguments(buffer []byte) (map[string]any, int) {
return nil, 0
}
start := -1
var braces int
var start int = -1
var inString, escaped bool
for i := range buffer {
c := buffer[i]
if escaped {
escaped = false
continue
}
if c == '\\' {
escaped = true
continue
}
if c == '"' {
inString = !inString
continue
}
if inString {
continue
}
for i, c := range buffer {
if c == '{' {
if braces == 0 {
start = i
}
braces++
} else if c == '}' && braces > 0 {
} else if c == '}' {
braces--
if braces == 0 && start != -1 {
object := buffer[start : i+1]
var data map[string]any
if err := json.Unmarshal(object, &data); err != nil {
// not a valid object, keep looking
start = -1
continue
}
@@ -282,6 +305,10 @@ func findArguments(buffer []byte) (map[string]any, int) {
return data, i
}
if braces < 0 {
braces = 0
}
}
}
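A self-contained sketch of the scanning approach in this hunk: track string and escape state so braces inside JSON string literals never touch the depth counter (the unmarshal-and-retry validation from findArguments is omitted):

```go
package main

import "fmt"

// topLevelObject returns the first balanced {...} span in buf, ignoring
// braces inside JSON string literals. Simplified from findArguments above.
func topLevelObject(buf []byte) []byte {
	start, braces := -1, 0
	inString, escaped := false, false
	for i := range buf {
		c := buf[i]
		if escaped {
			escaped = false
			continue
		}
		switch {
		case c == '\\':
			escaped = true
		case c == '"':
			inString = !inString
		case inString:
			// braces inside strings are content, not structure
		case c == '{':
			if braces == 0 {
				start = i
			}
			braces++
		case c == '}':
			braces--
			if braces == 0 && start != -1 {
				return buf[start : i+1]
			}
			if braces < 0 {
				braces = 0
			}
		}
	}
	return nil
}

func main() {
	buf := []byte(`{"code": "if (x > 0) { return true; }"}`)
	fmt.Printf("%s\n", topLevelObject(buf)) // prints the whole object
}
```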

View File

@@ -1,6 +1,7 @@
package tools
import (
"strings"
"testing"
"text/template"
@@ -1140,11 +1141,163 @@ func TestFindArguments(t *testing.T) {
},
{
name: "deepseek",
buffer: []byte(`", "arguments": {"location": "Tokyo"}}</tool_call>`),
buffer: []byte(`"arguments": {"location": "Tokyo"}}</tool_call>`),
want: map[string]any{
"location": "Tokyo",
},
},
{
name: "string with braces",
buffer: []byte(`{"name": "process_code", "arguments": {"code": "if (x > 0) { return true; }"}}`),
want: map[string]any{
"code": "if (x > 0) { return true; }",
},
},
{
name: "string with nested json",
buffer: []byte(`{"name": "send_data", "arguments": {"payload": "{\"nested\": {\"key\": \"value\"}}"}}`),
want: map[string]any{
"payload": `{"nested": {"key": "value"}}`,
},
},
{
name: "string with escaped quotes and braces",
buffer: []byte(`{"name": "analyze", "arguments": {"text": "The JSON is: {\"key\": \"val{ue}\"}"}}`),
want: map[string]any{
"text": `The JSON is: {"key": "val{ue}"}`,
},
},
{
name: "multiple objects with string containing braces",
buffer: []byte(`{"name": "test", "arguments": {"query": "find } in text"}} {"name": "other"}`),
want: map[string]any{
"query": "find } in text",
},
},
{
name: "unmatched closing brace in string",
buffer: []byte(`{"name": "search", "arguments": {"pattern": "regex: }"}}`),
want: map[string]any{
"pattern": "regex: }",
},
},
{
name: "complex nested with mixed braces",
buffer: []byte(`{"name": "analyze", "arguments": {"data": "{\"items\": [{\"value\": \"}\"}, {\"code\": \"if (x) { return y; }\"}]}"}}`),
want: map[string]any{
"data": `{"items": [{"value": "}"}, {"code": "if (x) { return y; }"}]}`,
},
},
{
name: "string with newline and braces",
buffer: []byte(`{"name": "format", "arguments": {"template": "{\n \"key\": \"value\"\n}"}}`),
want: map[string]any{
"template": "{\n \"key\": \"value\"\n}",
},
},
{
name: "string with unicode escape",
buffer: []byte(`{"name": "test", "arguments": {"text": "Unicode: \u007B and \u007D"}}`),
want: map[string]any{
"text": "Unicode: { and }",
},
},
{
name: "array arguments",
buffer: []byte(`{"name": "batch", "arguments": ["item1", "item2", "{\"nested\": true}"]}`),
want: nil, // This should return nil because arguments is not a map
},
{
name: "escaped backslash before quote",
buffer: []byte(`{"name": "path", "arguments": {"dir": "C:\\Program Files\\{App}\\"}}`),
want: map[string]any{
"dir": `C:\Program Files\{App}\`,
},
},
{
name: "single quotes not treated as string delimiters",
buffer: []byte(`{"name": "query", "arguments": {"sql": "SELECT * FROM users WHERE name = '{admin}'"}}`),
want: map[string]any{
"sql": "SELECT * FROM users WHERE name = '{admin}'",
},
},
{
name: "incomplete json at buffer end",
buffer: []byte(`{"name": "test", "arguments": {"data": "some {"`),
want: nil,
},
{
name: "multiple escaped quotes",
buffer: []byte(`{"name": "echo", "arguments": {"msg": "He said \"Hello {World}\" loudly"}}`),
want: map[string]any{
"msg": `He said "Hello {World}" loudly`,
},
},
{
name: "json with comments style string",
buffer: []byte(`{"name": "code", "arguments": {"snippet": "// This is a comment with { and }"}}`),
want: map[string]any{
"snippet": "// This is a comment with { and }",
},
},
{
name: "consecutive escaped backslashes",
buffer: []byte(`{"name": "test", "arguments": {"path": "C:\\\\{folder}\\\\"}}`),
want: map[string]any{
"path": `C:\\{folder}\\`,
},
},
{
name: "empty string with braces after",
buffer: []byte(`{"name": "test", "arguments": {"a": "", "b": "{value}"}}`),
want: map[string]any{
"a": "",
"b": "{value}",
},
},
{
name: "unicode in key names",
buffer: []byte(`{"name": "test", "arguments": {"key{": "value", "key}": "value2"}}`),
want: map[string]any{
"key{": "value",
"key}": "value2",
},
},
{
name: "very long string with braces",
buffer: []byte(`{"name": "test", "arguments": {"data": "` + strings.Repeat("a{b}c", 100) + `"}}`),
want: map[string]any{
"data": strings.Repeat("a{b}c", 100),
},
},
{
name: "tab characters and braces",
buffer: []byte(`{"name": "test", "arguments": {"code": "\tif (true) {\n\t\treturn;\n\t}"}}`),
want: map[string]any{
"code": "\tif (true) {\n\t\treturn;\n\t}",
},
},
{
name: "null byte in string",
buffer: []byte(`{"name": "test", "arguments": {"data": "before\u0000{after}"}}`),
want: map[string]any{
"data": "before\x00{after}",
},
},
{
name: "escaped quote at end of string",
buffer: []byte(`{"name": "test", "arguments": {"data": "text with quote at end\\\""}}`),
want: map[string]any{
"data": `text with quote at end\"`,
},
},
{
name: "mixed array and object in arguments",
buffer: []byte(`{"name": "test", "arguments": {"items": ["{", "}", {"key": "value"}]}}`),
want: map[string]any{
"items": []any{"{", "}", map[string]any{"key": "value"}},
},
},
}
for _, tt := range tests {

types/null.go (new file, 53 lines)
View File

@@ -0,0 +1,53 @@
package types
import (
"encoding/json"
)
// Null represents a value of any type T that may be null.
type Null[T any] struct {
value T
valid bool
}
// NullWithValue creates a new, valid Null[T].
func NullWithValue[T any](value T) Null[T] {
return Null[T]{value: value, valid: true}
}
// Value returns the value of the Null[T] if set, otherwise it returns the provided default value or the zero value of T.
func (n Null[T]) Value(defaultValue ...T) T {
if n.valid {
return n.value
}
if len(defaultValue) > 0 {
return defaultValue[0]
}
var zero T
return zero
}
// SetValue sets the value of the Null[T].
func (n *Null[T]) SetValue(t T) {
n.value = t
n.valid = true
}
// MarshalJSON implements [json.Marshaler].
func (n Null[T]) MarshalJSON() ([]byte, error) {
if n.valid {
return json.Marshal(n.value)
}
return []byte("null"), nil
}
// UnmarshalJSON implements [json.Unmarshaler].
func (n *Null[T]) UnmarshalJSON(data []byte) error {
if string(data) != "null" {
if err := json.Unmarshal(data, &n.value); err != nil {
return err
}
n.valid = true
}
return nil
}

types/null_test.go (new file, 53 lines)
View File

@@ -0,0 +1,53 @@
package types_test
import (
"encoding/json"
"testing"
"github.com/ollama/ollama/types"
)
func TestNull(t *testing.T) {
var s types.Null[string]
if val := s.Value(); val != "" {
t.Errorf("expected Value to return zero value '', got '%s'", val)
}
if val := s.Value("default"); val != "default" {
t.Errorf("expected Value to return default value 'default', got '%s'", val)
}
if bts, err := json.Marshal(s); err != nil {
t.Errorf("unexpected error during MarshalJSON: %v", err)
} else if want := "null"; string(bts) != want {
t.Errorf("expected marshaled JSON to be %s, got %s", want, string(bts))
}
s.SetValue("foo")
if val := s.Value(); val != "foo" {
t.Errorf("expected Value to return 'foo', got '%s'", val)
}
s = types.NullWithValue("bar")
if val := s.Value(); val != "bar" {
t.Errorf("expected Value to return 'bar', got '%s'", val)
}
if bts, err := json.Marshal(s); err != nil {
t.Errorf("unexpected error during MarshalJSON: %v", err)
} else if want := `"bar"`; string(bts) != want {
t.Errorf("expected marshaled JSON to be %s, got %s", want, string(bts))
}
if err := json.Unmarshal([]byte(`null`), &s); err != nil {
t.Errorf("unexpected error during UnmarshalJSON: %v", err)
}
if err := json.Unmarshal([]byte(`"baz"`), &s); err != nil {
t.Errorf("unexpected error during UnmarshalJSON: %v", err)
}
if err := json.Unmarshal([]byte(`1.2345`), &s); err == nil {
t.Error("expected error during UnmarshalJSON with invalid JSON, got nil")
}
}