Compare commits

...

12 Commits

Author SHA1 Message Date
Michael Yang
d05fc26570 null truncate 2025-08-25 10:00:16 -07:00
Michael Yang
c457628090 null stream 2025-08-25 10:00:15 -07:00
Michael Yang
e914477bb6 types: add types.Null[T]
there's a common pattern where request fields may need to differentiate
between an unset value and a value set to the type's zero value. this is
commonly used to apply a different default value, e.g. stream, or to
omit a field entirely, e.g. think.

similar to sql.Null[T], types.Null[T] simplifies this by providing
utilities to quickly and easily apply this pattern to any type using
generics.
2025-08-25 09:49:02 -07:00
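A minimal sketch of the pattern this commit describes, using the Null[T] API added in types/null.go at the bottom of this diff (the request struct here is illustrative only):

```go
package main

import (
	"encoding/json"
	"fmt"

	"github.com/ollama/ollama/types"
)

// request is a hypothetical struct mirroring how the API types below use Null[T].
type request struct {
	Stream types.Null[bool] `json:"stream,omitempty"`
}

func main() {
	var unset, explicit request
	_ = json.Unmarshal([]byte(`{}`), &unset)                   // field never set
	_ = json.Unmarshal([]byte(`{"stream": false}`), &explicit) // explicitly false

	// Value(default) falls back to the default only when the field was
	// never set, so an absent field can mean "stream by default" while an
	// explicit false still disables streaming.
	fmt.Println(unset.Stream.Value(true))    // true
	fmt.Println(explicit.Stream.Value(true)) // false
}
```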
Jeffrey Morgan
4bcb04ad88 tools: avoid matching braces that are part of tool content (#12039) 2025-08-22 15:22:14 -07:00
Devon Rifkin
e3d5708754 Merge pull request #12021 from ollama/drifkin/thinking-double-emit
thinking: fix double emit when no opening tag
2025-08-22 12:01:37 -07:00
Jeffrey Morgan
4be4dc8717 server: skip parsing initial <think> if provided in the prompt (#12024) 2025-08-22 12:00:16 -07:00
zoupingshi
109d4fc3b4 chore: remove redundant words in comment (#12028)
Signed-off-by: zoupingshi <hangfachang@outlook.com>
2025-08-22 11:00:27 -07:00
Devon Rifkin
2cb0a580f3 thinking: fix double emit when no opening tag
The thinking parser will automatically transition to being a
pass-through if non-whitespace is seen before an opening tag. However,
we weren't clearing the buffer after the first non-whitespace input, so
in practice the first token would be emitted twice.

Added a test that demonstrated this, and then fixed the bug.
2025-08-21 21:03:12 -07:00
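A condensed, self-contained sketch of the fix (the real change is in the thinking parser diff further down; names here are simplified):

```go
package main

import (
	"fmt"
	"strings"
)

// flushPassthrough stands in for the parser's pass-through branch: once
// non-whitespace arrives before an opening tag, buffered input is emitted
// as plain content.
func flushPassthrough(acc *strings.Builder) string {
	out := acc.String()
	// The bug: without this Reset, the buffered prefix stayed behind and
	// was emitted again on the next flush, duplicating the first token.
	acc.Reset()
	return out
}

func main() {
	var acc strings.Builder
	acc.WriteString("Hello")
	fmt.Println(flushPassthrough(&acc)) // "Hello"
	acc.WriteString(" world")
	fmt.Println(flushPassthrough(&acc)) // " world" (not "Hello world" again)
}
```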
Parth Sareen
7cce5aac76 harmony: move harmony parsing into a package (#12016) 2025-08-21 13:56:22 -07:00
Michael Yang
4ae4f47b16 gpt-oss: convert from hugging face format (#11907) 2025-08-20 15:39:18 -07:00
Jesse Gross
073fa31df5 llm: Don't always evict models in CPU-only mode
With old memory estimates, it's currently impossible to load more
than one model at a time when no GPUs are available. This is because
the check for whether we need to evict a model looks to see if all
layers of the new model can be loaded onto GPUs, which is never true
if there are no GPUs. Before the memory management changes, there
was a special code path for CPU-only systems.

This problem does not exist with new memory estimates.

Fixes #11974
2025-08-20 14:31:02 -07:00
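The fix lands in the llm memory diff below; condensed here into a runnable sketch with simplified stand-in types for the GPU info and memory estimate:

```go
package main

import "fmt"

// gpuInfo and memEstimate are simplified stand-ins for the discover and
// memory-estimate types used by predictServerFit in the diff below.
type gpuInfo struct {
	Library    string
	FreeMemory uint64
}

type memEstimate struct{ TotalSize uint64 }

// cpuOnlyFits mirrors the added branch: with a single "cpu" device there
// are no GPU layers to count, so the model fits whenever its total size is
// within free system memory, and loading it need not evict another model.
func cpuOnlyFits(gpus []gpuInfo, est memEstimate) bool {
	return len(gpus) == 1 && gpus[0].Library == "cpu" && est.TotalSize <= gpus[0].FreeMemory
}

func main() {
	cpu := []gpuInfo{{Library: "cpu", FreeMemory: 16 << 30}}
	fmt.Println(cpuOnlyFits(cpu, memEstimate{TotalSize: 4 << 30}))  // true
	fmt.Println(cpuOnlyFits(cpu, memEstimate{TotalSize: 32 << 30})) // false
}
```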
Michael Yang
91fc3c48e3 openai: remove reasoning as an api.Options (#11993) 2025-08-20 12:21:42 -07:00
22 changed files with 725 additions and 173 deletions

View File

@@ -12,6 +12,7 @@ import (
"time"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/types"
"github.com/ollama/ollama/types/model"
)
@@ -64,7 +65,7 @@ type GenerateRequest struct {
Context []int `json:"context,omitempty"`
// Stream specifies whether the response is streaming; it is true by default.
Stream *bool `json:"stream,omitempty"`
Stream types.Null[bool] `json:"stream,omitempty"`
// Raw set to true means that no formatting will be applied to the prompt.
Raw bool `json:"raw,omitempty"`
@@ -105,7 +106,7 @@ type ChatRequest struct {
Messages []Message `json:"messages"`
// Stream enables streaming of returned responses; true by default.
Stream *bool `json:"stream,omitempty"`
Stream types.Null[bool] `json:"stream,omitempty"`
// Format is the format to return the response in (e.g. "json").
Format json.RawMessage `json:"format,omitempty"`
@@ -381,7 +382,7 @@ type EmbedRequest struct {
// this request.
KeepAlive *Duration `json:"keep_alive,omitempty"`
Truncate *bool `json:"truncate,omitempty"`
Truncate types.Null[bool] `json:"truncate,omitempty"`
// Options lists model-specific options.
Options map[string]any `json:"options"`
@@ -420,9 +421,9 @@ type EmbeddingResponse struct {
// CreateRequest is the request passed to [Client.Create].
type CreateRequest struct {
Model string `json:"model"`
Stream *bool `json:"stream,omitempty"`
Quantize string `json:"quantize,omitempty"`
Model string `json:"model"`
Stream types.Null[bool] `json:"stream,omitempty"`
Quantize string `json:"quantize,omitempty"`
From string `json:"from,omitempty"`
Files map[string]string `json:"files,omitempty"`
@@ -486,11 +487,11 @@ type CopyRequest struct {
// PullRequest is the request passed to [Client.Pull].
type PullRequest struct {
Model string `json:"model"`
Insecure bool `json:"insecure,omitempty"` // Deprecated: ignored
Username string `json:"username"` // Deprecated: ignored
Password string `json:"password"` // Deprecated: ignored
Stream *bool `json:"stream,omitempty"`
Model string `json:"model"`
Insecure bool `json:"insecure,omitempty"`
Username string `json:"username"` // Deprecated: ignored
Password string `json:"password"` // Deprecated: ignored
Stream types.Null[bool] `json:"stream,omitempty"`
// Deprecated: set the model name with Model instead
Name string `json:"name"`
@@ -507,11 +508,11 @@ type ProgressResponse struct {
// PushRequest is the request passed to [Client.Push].
type PushRequest struct {
Model string `json:"model"`
Insecure bool `json:"insecure,omitempty"`
Username string `json:"username"`
Password string `json:"password"`
Stream *bool `json:"stream,omitempty"`
Model string `json:"model"`
Insecure bool `json:"insecure,omitempty"`
Username string `json:"username"` // Deprecated: ignored
Password string `json:"password"` // Deprecated: ignored
Stream types.Null[bool] `json:"stream,omitempty"`
// Deprecated: set the model name with Model instead
Name string `json:"name"`

View File

@@ -15,19 +15,24 @@ import (
type gptossModel struct {
ModelParameters
HiddenLayers uint32 `json:"num_hidden_layers"`
HiddenSize uint32 `json:"hidden_size"`
IntermediateSize uint32 `json:"intermediate_size"`
AttentionHeads uint32 `json:"num_attention_heads"`
KeyValueHeads uint32 `json:"num_key_value_heads"`
HeadDim uint32 `json:"head_dim"`
Experts uint32 `json:"num_experts"`
ExpertsPerToken uint32 `json:"experts_per_token"`
RMSNormEpsilon float32 `json:"rms_norm_eps"`
InitialContextLength uint32 `json:"initial_context_length"`
RopeTheta float32 `json:"rope_theta"`
RopeScalingFactor float32 `json:"rope_scaling_factor"`
SlidingWindow uint32 `json:"sliding_window"`
HiddenLayers uint32 `json:"num_hidden_layers"`
MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
HiddenSize uint32 `json:"hidden_size"`
IntermediateSize uint32 `json:"intermediate_size"`
AttentionHeads uint32 `json:"num_attention_heads"`
KeyValueHeads uint32 `json:"num_key_value_heads"`
HeadDim uint32 `json:"head_dim"`
Experts uint32 `json:"num_experts"`
LocalExperts uint32 `json:"num_local_experts"`
ExpertsPerToken uint32 `json:"experts_per_token"`
RMSNormEpsilon float32 `json:"rms_norm_eps"`
InitialContextLength uint32 `json:"initial_context_length"`
RopeTheta float32 `json:"rope_theta"`
RopeScalingFactor float32 `json:"rope_scaling_factor"`
RopeScaling struct {
Factor float32 `json:"factor"`
} `json:"rope_scaling"`
SlidingWindow uint32 `json:"sliding_window"`
}
var _ ModelConverter = (*gptossModel)(nil)
@@ -36,11 +41,11 @@ func (m *gptossModel) KV(t *Tokenizer) ggml.KV {
kv := m.ModelParameters.KV(t)
kv["general.architecture"] = "gptoss"
kv["general.file_type"] = uint32(4)
kv["gptoss.context_length"] = uint32(m.RopeScalingFactor * float32(m.InitialContextLength))
kv["gptoss.context_length"] = cmp.Or(m.MaxPositionEmbeddings, uint32(m.RopeScalingFactor*float32(m.InitialContextLength)))
kv["gptoss.block_count"] = m.HiddenLayers
kv["gptoss.embedding_length"] = m.HiddenSize
kv["gptoss.feed_forward_length"] = m.IntermediateSize
kv["gptoss.expert_count"] = m.Experts
kv["gptoss.expert_count"] = cmp.Or(m.Experts, m.LocalExperts)
kv["gptoss.expert_used_count"] = m.ExpertsPerToken
kv["gptoss.attention.head_count"] = m.AttentionHeads
kv["gptoss.attention.head_count_kv"] = m.KeyValueHeads
@@ -49,7 +54,7 @@ func (m *gptossModel) KV(t *Tokenizer) ggml.KV {
kv["gptoss.attention.layer_norm_rms_epsilon"] = cmp.Or(m.RMSNormEpsilon, 1e-5)
kv["gptoss.attention.sliding_window"] = m.SlidingWindow
kv["gptoss.rope.freq_base"] = m.RopeTheta
kv["gptoss.rope.scaling.factor"] = m.RopeScalingFactor
kv["gptoss.rope.scaling.factor"] = cmp.Or(m.RopeScalingFactor, m.RopeScaling.Factor)
kv["gptoss.rope.scaling.original_context_length"] = m.InitialContextLength
kv["tokenizer.ggml.bos_token_id"] = uint32(199998) // <|startoftext|>
kv["tokenizer.ggml.add_bos_token"] = false
@@ -92,6 +97,11 @@ func (m *gptossModel) Tensors(ts []Tensor) []*ggml.Tensor {
for name, mxfp4 := range mxfp4s {
dims := mxfp4.blocks.Shape()
if !strings.HasSuffix(name, ".weight") {
name += ".weight"
}
out = append(out, &ggml.Tensor{
Name: name,
Kind: uint32(ggml.TensorTypeMXFP4),
@@ -104,25 +114,47 @@ func (m *gptossModel) Tensors(ts []Tensor) []*ggml.Tensor {
}
func (m *gptossModel) Replacements() []string {
return []string{
// noop replacements so other replacements will not be applied
".blocks", ".blocks",
".scales", ".scales",
// real replacements
"block", "blk",
"attn.norm", "attn_norm",
"attn.qkv", "attn_qkv",
"attn.sinks", "attn_sinks",
"attn.out", "attn_out",
"mlp.norm", "ffn_norm",
"mlp.gate", "ffn_gate_inp",
"mlp.mlp1_", "ffn_gate_up_exps.",
"mlp.mlp2_", "ffn_down_exps.",
"embedding", "token_embd",
"norm", "output_norm",
"unembedding", "output",
"scale", "weight",
var replacements []string
if m.MaxPositionEmbeddings > 0 {
// hf flavored model
replacements = []string{
"lm_head", "output",
"model.embed_tokens", "token_embd",
"model.layers", "blk",
"input_layernorm", "attn_norm",
"self_attn.q_proj", "attn_q",
"self_attn.k_proj", "attn_k",
"self_attn.v_proj", "attn_v",
"self_attn.o_proj", "attn_out",
"self_attn.sinks", "attn_sinks",
"post_attention_layernorm", "ffn_norm",
"mlp.router", "ffn_gate_inp",
"mlp.experts.gate_up_proj_", "ffn_gate_up_exps.",
"mlp.experts.down_proj_", "ffn_down_exps.",
"model.norm", "output_norm",
}
} else {
replacements = []string{
// noop replacements so other replacements will not be applied
".blocks", ".blocks",
".scales", ".scales",
// real replacements
"block", "blk",
"attn.norm", "attn_norm",
"attn.qkv", "attn_qkv",
"attn.sinks", "attn_sinks",
"attn.out", "attn_out",
"mlp.norm", "ffn_norm",
"mlp.gate", "ffn_gate_inp",
"mlp.mlp1_", "ffn_gate_up_exps.",
"mlp.mlp2_", "ffn_down_exps.",
"embedding", "token_embd",
"norm", "output_norm",
"unembedding", "output",
"scale", "weight",
}
}
return replacements
}
type mxfp4 struct {
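The Replacements pairs above are (old, new) substrings for tensor names; assuming they feed something like strings.NewReplacer (the consuming code is not part of this diff), an HF-flavored name maps to its GGUF name like so:

```go
package main

import (
	"fmt"
	"strings"
)

func main() {
	// Two pairs from the hf-flavored list above, applied to a typical
	// Hugging Face tensor name.
	r := strings.NewReplacer(
		"model.layers", "blk",
		"self_attn.q_proj", "attn_q",
	)
	fmt.Println(r.Replace("model.layers.0.self_attn.q_proj.weight"))
	// Output: blk.0.attn_q.weight
}
```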

View File

@@ -1,10 +1,9 @@
package server
package harmony
import (
"context"
"fmt"
"log/slog"
"slices"
"strings"
"unicode"
@@ -20,18 +19,6 @@ const (
harmonyParserState_ParsingContent
)
func shouldUseHarmony(model Model) bool {
if slices.Contains([]string{"gptoss", "gpt-oss"}, model.Config.ModelFamily) {
// heuristic to check whether the template expects to be parsed via harmony:
// search for harmony tags that are nearly always used
if model.Template.Contains("<|start|>") && model.Template.Contains("<|end|>") {
return true
}
}
return false
}
func (s harmonyParserState) String() string {
switch s {
// we're looking for the message start tag
@@ -277,20 +264,20 @@ const (
// This is a higher level interface that maps harmony concepts into ollama concepts
type HarmonyMessageHandler struct {
state harmonyMessageState
harmonyParser *HarmonyParser
functionNameMap *FunctionNameMap
HarmonyParser *HarmonyParser
FunctionNameMap *FunctionNameMap
}
// NewHarmonyMessageHandler creates a new message handler
func NewHarmonyMessageHandler() *HarmonyMessageHandler {
return &HarmonyMessageHandler{
state: harmonyMessageState_Normal,
harmonyParser: &HarmonyParser{
HarmonyParser: &HarmonyParser{
MessageStartTag: "<|start|>",
MessageEndTag: "<|end|>",
HeaderEndTag: "<|message|>",
},
functionNameMap: NewFunctionNameMap(),
FunctionNameMap: NewFunctionNameMap(),
}
}
@@ -301,7 +288,7 @@ func (h *HarmonyMessageHandler) AddContent(content string, toolParser *HarmonyTo
thinkingSb := strings.Builder{}
toolContentSb := strings.Builder{}
events := h.harmonyParser.AddContent(content)
events := h.HarmonyParser.AddContent(content)
for _, event := range events {
switch event := event.(type) {
case HarmonyEventHeaderComplete:

View File

@@ -1,4 +1,4 @@
package server
package harmony
import (
"fmt"

View File

@@ -30,7 +30,7 @@ func pickBestFullFitByLibrary(f *ggml.GGML, modelPath string, projectors []strin
// Try to pack into as few GPUs as possible, starting from 1 GPU
for numGPUs := 1; numGPUs <= len(sgl); numGPUs++ {
gpuSubset := sgl[:numGPUs]
ok, estimatedVRAM := PredictServerFit(gpuSubset, f, adapters, projectors, opts, numParallel)
ok, estimatedVRAM := predictServerFit(gpuSubset, f, adapters, projectors, opts, numParallel)
if ok {
slog.Info("new model will fit in available VRAM across minimum required GPUs, loading",
@@ -48,7 +48,7 @@ func pickBestFullFitByLibrary(f *ggml.GGML, modelPath string, projectors []strin
// - try subsets of GPUs instead of just falling back to 1 or all in a family
// Now try all the GPUS (OLLAMA_SCHED_SPREAD is set)
if ok, estimatedVRAM := PredictServerFit(sgl, f, adapters, projectors, opts, numParallel); ok {
if ok, estimatedVRAM := predictServerFit(sgl, f, adapters, projectors, opts, numParallel); ok {
slog.Info("new model will fit in available VRAM, loading",
"model", modelPath,
"library", sgl[0].Library,
@@ -71,7 +71,7 @@ func pickBestPartialFitByLibrary(f *ggml.GGML, projectors []string, adapters []s
var bestEstimate uint64
var bestFit int
for i, gl := range byLibrary {
_, estimatedVRAM := PredictServerFit(gl, f, adapters, projectors, opts, numParallel)
_, estimatedVRAM := predictServerFit(gl, f, adapters, projectors, opts, numParallel)
if estimatedVRAM > bestEstimate {
bestEstimate = estimatedVRAM
bestFit = i
@@ -81,7 +81,7 @@ func pickBestPartialFitByLibrary(f *ggml.GGML, projectors []string, adapters []s
}
// This algorithm looks for a complete fit to determine if we need to unload other models
func PredictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (bool, uint64) {
func predictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (bool, uint64) {
// Split up the GPUs by type and try them
var estimatedVRAM uint64
for _, gpus := range allGpus.ByLibrary() {
@@ -97,6 +97,10 @@ func PredictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, proj
return true, estimatedVRAM
}
}
if len(gpus) == 1 && gpus[0].Library == "cpu" && estimate.TotalSize <= gpus[0].FreeMemory {
return true, estimatedVRAM
}
}
return false, estimatedVRAM
}

View File

@@ -492,6 +492,7 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
if !requireFull {
g = pickBestPartialFitByLibrary(s.ggml, []string{s.loadRequest.ProjectorPath}, s.loadRequest.LoraPath, s.options, gpus, s.numParallel)
} else {
slog.Info("model requires more memory than is currently available, evicting a model to make space", "estimate", s.estimate)
return ErrLoadRequiredFull
}
}
@@ -524,10 +525,6 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
}
}
if requireFull && len(gpus) == 1 && gpus[0].Library == "cpu" && s.estimate.TotalSize > gpus[0].FreeMemory {
return ErrLoadRequiredFull
}
slog.Info("offload", "", s.estimate)
s.gpus = gpus

View File

@@ -17,6 +17,7 @@ import (
"github.com/gin-gonic/gin"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/types"
"github.com/ollama/ollama/types/model"
)
@@ -557,12 +558,10 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
var think *api.ThinkValue
if r.Reasoning != nil {
options["reasoning"] = *r.Reasoning.Effort
think = &api.ThinkValue{
Value: *r.Reasoning.Effort,
}
} else if r.ReasoningEffort != nil {
options["reasoning"] = *r.ReasoningEffort
think = &api.ThinkValue{
Value: *r.ReasoningEffort,
}
@@ -573,7 +572,7 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
Messages: messages,
Format: format,
Options: options,
Stream: &r.Stream,
Stream: types.NullWithValue(r.Stream),
Tools: r.Tools,
Think: think,
}, nil
@@ -652,7 +651,7 @@ func fromCompleteRequest(r CompletionRequest) (api.GenerateRequest, error) {
Model: r.Model,
Prompt: r.Prompt,
Options: options,
Stream: &r.Stream,
Stream: types.NullWithValue(r.Stream),
Suffix: r.Suffix,
}, nil
}

View File

@@ -46,7 +46,7 @@ func NewInputCache(lc *llama.Context, kvSize int, numSlots int, multiUserCache b
}
// Locking: Operations on InputCacheSlot (including finding one
// through LoadCacheSlot) require a lock to be be held that serializes
// through LoadCacheSlot) require a lock to be held that serializes
// these operations with each other and llama.Decode
type InputCacheSlot struct {

View File

@@ -78,7 +78,7 @@ func (c *InputCache) Close() {
}
// Locking: Operations on InputCacheSlot (including finding one
// through LoadCacheSlot) require a lock to be be held that serializes
// through LoadCacheSlot) require a lock to be held that serializes
// these operations with each other and processBatch
type InputCacheSlot struct {

View File

@@ -146,7 +146,7 @@ func (s *Server) CreateHandler(c *gin.Context) {
ch <- api.ProgressResponse{Status: "success"}
}()
if r.Stream != nil && !*r.Stream {
if !r.Stream.Value(true) {
waitForStream(c, ch)
return
}

View File

@@ -32,6 +32,7 @@ import (
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/harmony"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/logutil"
"github.com/ollama/ollama/openai"
@@ -45,6 +46,18 @@ import (
"github.com/ollama/ollama/version"
)
func shouldUseHarmony(model *Model) bool {
if slices.Contains([]string{"gptoss", "gpt-oss"}, model.Config.ModelFamily) {
// heuristic to check whether the template expects to be parsed via harmony:
// search for harmony tags that are nearly always used
if model.Template.Contains("<|start|>") && model.Template.Contains("<|end|>") {
return true
}
}
return false
}
func experimentEnabled(name string) bool {
return slices.Contains(strings.Split(os.Getenv("OLLAMA_EXPERIMENT"), ","), name)
}
@@ -194,12 +207,12 @@ func (s *Server) GenerateHandler(c *gin.Context) {
return
}
useHarmony := shouldUseHarmony(*m) && !req.Raw
var harmonyMessageHandler *HarmonyMessageHandler
var harmonyToolParser *HarmonyToolCallAccumulator
useHarmony := shouldUseHarmony(m) && !req.Raw
var harmonyMessageHandler *harmony.HarmonyMessageHandler
var harmonyToolParser *harmony.HarmonyToolCallAccumulator
if useHarmony {
harmonyMessageHandler = NewHarmonyMessageHandler()
harmonyMessageHandler.harmonyParser.AddImplicitStart()
harmonyMessageHandler = harmony.NewHarmonyMessageHandler()
harmonyMessageHandler.HarmonyParser.AddImplicitStart()
harmonyToolParser = harmonyMessageHandler.CreateToolParser()
}
@@ -427,7 +440,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
}
}()
if req.Stream != nil && !*req.Stream {
if !req.Stream.Value(true) {
var r api.GenerateResponse
var sbThinking strings.Builder
var sbContent strings.Builder
@@ -474,12 +487,6 @@ func (s *Server) EmbedHandler(c *gin.Context) {
return
}
truncate := true
if req.Truncate != nil && !*req.Truncate {
truncate = false
}
var input []string
switch i := req.Input.(type) {
@@ -528,6 +535,7 @@ func (s *Server) EmbedHandler(c *gin.Context) {
}
var count int
truncate := req.Truncate.Value(true)
for i, s := range input {
tokens, err := r.Tokenize(c.Request.Context(), s)
if err != nil {
@@ -688,7 +696,7 @@ func (s *Server) PullHandler(c *gin.Context) {
}
}()
if req.Stream != nil && !*req.Stream {
if !req.Stream.Value(true) {
waitForStream(c, ch)
return
}
@@ -743,7 +751,7 @@ func (s *Server) PushHandler(c *gin.Context) {
}
}()
if req.Stream != nil && !*req.Stream {
if !req.Stream.Value(true) {
waitForStream(c, ch)
return
}
@@ -1603,19 +1611,19 @@ func (s *Server) ChatHandler(c *gin.Context) {
}
msgs = filterThinkTags(msgs, m)
var harmonyMessageHandler *HarmonyMessageHandler
var harmonyToolParser *HarmonyToolCallAccumulator
var harmonyMessageHandler *harmony.HarmonyMessageHandler
var harmonyToolParser *harmony.HarmonyToolCallAccumulator
useHarmony := shouldUseHarmony(*m)
useHarmony := shouldUseHarmony(m)
processedTools := req.Tools
if useHarmony {
harmonyMessageHandler = NewHarmonyMessageHandler()
harmonyMessageHandler = harmony.NewHarmonyMessageHandler()
var lastMessage *api.Message
if len(msgs) > 0 {
lastMessage = &msgs[len(msgs)-1]
}
harmonyMessageHandler.harmonyParser.AddImplicitStartOrPrefill(lastMessage)
harmonyMessageHandler.HarmonyParser.AddImplicitStartOrPrefill(lastMessage)
harmonyToolParser = harmonyMessageHandler.CreateToolParser()
// make a copy of tools to pass to the chat prompt. Function names may be
@@ -1623,7 +1631,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
processedTools = make([]api.Tool, len(req.Tools))
copy(processedTools, req.Tools)
for i, tool := range processedTools {
processedTools[i].Function.Name = harmonyMessageHandler.functionNameMap.ConvertAndAdd(tool.Function.Name)
processedTools[i].Function.Name = harmonyMessageHandler.FunctionNameMap.ConvertAndAdd(tool.Function.Name)
}
}
@@ -1660,6 +1668,10 @@ func (s *Server) ChatHandler(c *gin.Context) {
OpeningTag: openingTag,
ClosingTag: closingTag,
}
if strings.HasSuffix(strings.TrimSpace(prompt), openingTag) {
thinkingState.AddContent(openingTag)
}
}
var toolParser *tools.Parser
@@ -1705,7 +1717,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
toolName, toolContent := harmonyToolParser.Drain()
if toolName != nil {
*toolName = strings.TrimPrefix(*toolName, "functions.")
*toolName = harmonyMessageHandler.functionNameMap.OriginalFromConverted(*toolName)
*toolName = harmonyMessageHandler.FunctionNameMap.OriginalFromConverted(*toolName)
var args api.ToolCallFunctionArguments
if err := json.Unmarshal([]byte(toolContent), &args); err != nil {
errStr := fmt.Sprintf("error parsing tool call: raw='%s', err=%s", toolContent, err.Error())
@@ -1758,7 +1770,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
}
}()
if req.Stream != nil && !*req.Stream {
if !req.Stream.Value(true) {
var resp api.ChatResponse
var toolCalls []api.ToolCall
var sbThinking strings.Builder

View File

@@ -22,8 +22,6 @@ import (
"github.com/ollama/ollama/fs/ggml"
)
var stream bool = false
func createBinFile(t *testing.T, kv map[string]any, ti []*ggml.Tensor) (string, string) {
t.Helper()
t.Setenv("OLLAMA_MODELS", cmp.Or(os.Getenv("OLLAMA_MODELS"), t.TempDir()))
@@ -118,7 +116,7 @@ func TestCreateFromBin(t *testing.T) {
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Name: "test",
Files: map[string]string{"test.gguf": digest},
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -148,7 +146,7 @@ func TestCreateFromModel(t *testing.T) {
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Name: "test",
Files: map[string]string{"test.gguf": digest},
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -162,7 +160,7 @@ func TestCreateFromModel(t *testing.T) {
w = createRequest(t, s.CreateHandler, api.CreateRequest{
Name: "test2",
From: "test",
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -192,7 +190,7 @@ func TestCreateRemovesLayers(t *testing.T) {
Name: "test",
Files: map[string]string{"test.gguf": digest},
Template: "{{ .Prompt }}",
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -213,7 +211,7 @@ func TestCreateRemovesLayers(t *testing.T) {
Name: "test",
Files: map[string]string{"test.gguf": digest},
Template: "{{ .System }} {{ .Prompt }}",
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -243,7 +241,7 @@ func TestCreateUnsetsSystem(t *testing.T) {
Name: "test",
Files: map[string]string{"test.gguf": digest},
System: "Say hi!",
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -264,7 +262,7 @@ func TestCreateUnsetsSystem(t *testing.T) {
Name: "test",
Files: map[string]string{"test.gguf": digest},
System: "",
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -297,7 +295,7 @@ func TestCreateMergeParameters(t *testing.T) {
"top_k": 10,
"stop": []string{"USER:", "ASSISTANT:"},
},
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -322,7 +320,7 @@ func TestCreateMergeParameters(t *testing.T) {
"temperature": 0.6,
"top_p": 0.7,
},
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -381,7 +379,7 @@ func TestCreateMergeParameters(t *testing.T) {
"top_p": 0.7,
"stop": []string{"<|endoftext|>"},
},
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -441,7 +439,7 @@ func TestCreateReplacesMessages(t *testing.T) {
Content: "Oh, my god.",
},
},
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -475,7 +473,7 @@ func TestCreateReplacesMessages(t *testing.T) {
Content: "A test. And a thumping good one at that, I'd wager.",
},
},
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -536,7 +534,7 @@ func TestCreateTemplateSystem(t *testing.T) {
Files: map[string]string{"test.gguf": digest},
Template: "{{ .System }} {{ .Prompt }}",
System: "Say bye!",
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -578,7 +576,7 @@ func TestCreateTemplateSystem(t *testing.T) {
Name: "test",
Files: map[string]string{"test.gguf": digest},
Template: "{{ .Prompt",
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusBadRequest {
@@ -592,7 +590,7 @@ func TestCreateTemplateSystem(t *testing.T) {
Name: "test",
Files: map[string]string{"test.gguf": digest},
Template: "{{ if .Prompt }}",
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusBadRequest {
@@ -606,7 +604,7 @@ func TestCreateTemplateSystem(t *testing.T) {
Name: "test",
Files: map[string]string{"test.gguf": digest},
Template: "{{ Prompt }}",
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusBadRequest {
@@ -627,7 +625,7 @@ func TestCreateLicenses(t *testing.T) {
Name: "test",
Files: map[string]string{"test.gguf": digest},
License: []string{"MIT", "Apache-2.0"},
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -678,7 +676,7 @@ func TestCreateDetectTemplate(t *testing.T) {
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Name: "test",
Files: map[string]string{"test.gguf": digest},
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -698,7 +696,7 @@ func TestCreateDetectTemplate(t *testing.T) {
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Name: "test",
Files: map[string]string{"test.gguf": digest},
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {

View File

@@ -12,6 +12,7 @@ import (
"github.com/ollama/ollama/discover"
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/types"
)
func TestGenerateDebugRenderOnly(t *testing.T) {
@@ -53,7 +54,6 @@ func TestGenerateDebugRenderOnly(t *testing.T) {
go s.sched.Run(t.Context())
// Create a test model
stream := false
_, digest := createBinFile(t, ggml.KV{
"general.architecture": "llama",
"llama.block_count": uint32(1),
@@ -82,7 +82,7 @@ func TestGenerateDebugRenderOnly(t *testing.T) {
Model: "test-model",
Files: map[string]string{"file.gguf": digest},
Template: "{{ .Prompt }}",
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -172,7 +172,7 @@ func TestGenerateDebugRenderOnly(t *testing.T) {
}
t.Run(tt.name+streamSuffix, func(t *testing.T) {
req := tt.request
req.Stream = &stream
req.Stream = types.NullWithValue(stream)
w := createRequest(t, s.GenerateHandler, req)
if tt.expectDebug {
@@ -246,7 +246,6 @@ func TestChatDebugRenderOnly(t *testing.T) {
go s.sched.Run(t.Context())
// Create a test model
stream := false
_, digest := createBinFile(t, ggml.KV{
"general.architecture": "llama",
"llama.block_count": uint32(1),
@@ -275,7 +274,7 @@ func TestChatDebugRenderOnly(t *testing.T) {
Model: "test-model",
Files: map[string]string{"file.gguf": digest},
Template: "{{ if .Tools }}{{ .Tools }}{{ end }}{{ range .Messages }}{{ .Role }}: {{ .Content }}\n{{ end }}",
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -377,7 +376,7 @@ func TestChatDebugRenderOnly(t *testing.T) {
}
t.Run(tt.name+streamSuffix, func(t *testing.T) {
req := tt.request
req.Stream = &stream
req.Stream = types.NullWithValue(stream)
w := createRequest(t, s.ChatHandler, req)
if tt.expectDebug {

View File

@@ -126,7 +126,7 @@ func TestGenerateChat(t *testing.T) {
{{- range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}}
{{- end }}
{{ end }}`,
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -182,7 +182,7 @@ func TestGenerateChat(t *testing.T) {
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Model: "bert",
Files: map[string]string{"bert.gguf": digest},
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -288,7 +288,7 @@ func TestGenerateChat(t *testing.T) {
Messages: []api.Message{
{Role: "user", Content: "Hello!"},
},
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -318,7 +318,7 @@ func TestGenerateChat(t *testing.T) {
Messages: []api.Message{
{Role: "user", Content: "Hello!"},
},
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -340,7 +340,7 @@ func TestGenerateChat(t *testing.T) {
{Role: "system", Content: "You can perform magic tricks."},
{Role: "user", Content: "Hello!"},
},
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -363,7 +363,7 @@ func TestGenerateChat(t *testing.T) {
{Role: "system", Content: "You can perform magic tricks."},
{Role: "user", Content: "Help me write tests."},
},
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -422,15 +422,13 @@ func TestGenerateChat(t *testing.T) {
EvalDuration: 1,
}
streamRequest := true
w := createRequest(t, s.ChatHandler, api.ChatRequest{
Model: "test-system",
Messages: []api.Message{
{Role: "user", Content: "What's the weather in Seattle?"},
},
Tools: tools,
Stream: &streamRequest,
Stream: streamTrue,
})
if w.Code != http.StatusOK {
@@ -551,7 +549,7 @@ func TestGenerateChat(t *testing.T) {
{Role: "user", Content: "What's the weather in Seattle?"},
},
Tools: tools,
Stream: &stream,
Stream: streamFalse,
})
wg.Wait()
@@ -666,7 +664,7 @@ func TestGenerate(t *testing.T) {
{{- if .Prompt }}User: {{ .Prompt }} {{ end }}
{{- if .Response }}Assistant: {{ .Response }} {{ end }}
`,
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -704,7 +702,7 @@ func TestGenerate(t *testing.T) {
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Model: "bert",
Files: map[string]string{"file.gguf": digest},
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -825,7 +823,7 @@ func TestGenerate(t *testing.T) {
w := createRequest(t, s.GenerateHandler, api.GenerateRequest{
Model: "test",
Prompt: "Hello!",
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -853,7 +851,7 @@ func TestGenerate(t *testing.T) {
w := createRequest(t, s.GenerateHandler, api.GenerateRequest{
Model: "test-system",
Prompt: "Hello!",
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -873,7 +871,7 @@ func TestGenerate(t *testing.T) {
Model: "test-system",
Prompt: "Hello!",
System: "You can perform magic tricks.",
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -895,7 +893,7 @@ func TestGenerate(t *testing.T) {
Template: `{{- if .System }}{{ .System }} {{ end }}
{{- if .Prompt }}### USER {{ .Prompt }} {{ end }}
{{- if .Response }}### ASSISTANT {{ .Response }} {{ end }}`,
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -957,7 +955,7 @@ func TestGenerate(t *testing.T) {
Model: "test-system",
Prompt: "Help me write tests.",
Raw: true,
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -969,3 +967,232 @@ func TestGenerate(t *testing.T) {
}
})
}
func TestChatWithPromptEndingInThinkTag(t *testing.T) {
gin.SetMode(gin.TestMode)
// Helper to create a standard thinking test setup
setupThinkingTest := func(t *testing.T) (*mockRunner, *Server) {
mock := &mockRunner{
CompletionResponse: llm.CompletionResponse{
Done: true,
DoneReason: llm.DoneReasonStop,
PromptEvalCount: 1,
PromptEvalDuration: 1,
EvalCount: 1,
EvalDuration: 1,
},
}
s := &Server{
sched: &Scheduler{
pendingReqCh: make(chan *LlmRequest, 1),
finishedReqCh: make(chan *LlmRequest, 1),
expiredCh: make(chan *runnerRef, 1),
unloadedCh: make(chan any, 1),
loaded: make(map[string]*runnerRef),
newServerFn: newMockServer(mock),
getGpuFn: discover.GetGPUInfo,
getCpuFn: discover.GetCPUInfo,
reschedDelay: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{llama: mock}
return false
},
},
}
go s.sched.Run(t.Context())
// Create a model with thinking support
_, digest := createBinFile(t, ggml.KV{
"general.architecture": "llama",
"llama.block_count": uint32(1),
"llama.context_length": uint32(8192),
"llama.embedding_length": uint32(4096),
"llama.attention.head_count": uint32(32),
"llama.attention.head_count_kv": uint32(8),
"tokenizer.ggml.tokens": []string{""},
"tokenizer.ggml.scores": []float32{0},
"tokenizer.ggml.token_type": []int32{0},
}, []*ggml.Tensor{
{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_gate.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_up.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_k.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_q.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_v.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
})
// Create model with thinking template that adds <think> at the end
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Model: "test-thinking",
Files: map[string]string{"file.gguf": digest},
Template: `{{- range .Messages }}
{{- if eq .Role "user" }}user: {{ .Content }}
{{ else if eq .Role "assistant" }}assistant: {{ if .Thinking }}<think>{{ .Thinking }}</think>{{ end }}{{ .Content }}
{{ end }}{{ end }}<think>`,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
t.Fatalf("expected status 200, got %d", w.Code)
}
return mock, s
}
mock, s := setupThinkingTest(t)
// Helper to test chat responses
testChatRequest := func(t *testing.T, name string, userContent string, modelResponse string, expectedThinking string, expectedContent string, think bool) {
t.Run(name, func(t *testing.T) {
mock.CompletionResponse = llm.CompletionResponse{
Content: modelResponse,
Done: true,
DoneReason: llm.DoneReasonStop,
PromptEvalCount: 1,
PromptEvalDuration: 1,
EvalCount: 1,
EvalDuration: 1,
}
mock.CompletionFn = nil
req := api.ChatRequest{
Model: "test-thinking",
Messages: []api.Message{
{Role: "user", Content: userContent},
},
Stream: streamFalse,
}
if think {
req.Think = &api.ThinkValue{Value: think}
}
w := createRequest(t, s.ChatHandler, req)
if w.Code != http.StatusOK {
t.Fatalf("expected status 200, got %d", w.Code)
}
var resp api.ChatResponse
if err := json.NewDecoder(w.Body).Decode(&resp); err != nil {
t.Fatal(err)
}
if resp.Message.Thinking != expectedThinking {
t.Errorf("expected thinking %q, got %q", expectedThinking, resp.Message.Thinking)
}
if resp.Message.Content != expectedContent {
t.Errorf("expected content %q, got %q", expectedContent, resp.Message.Content)
}
})
}
// Test cases - Note: Template adds <think> at the end, and leading whitespace after <think> is eaten by the parser
testChatRequest(t, "basic thinking response",
"Help me solve this problem",
" Let me think about this step by step... </think> The answer is 42.",
"Let me think about this step by step... ",
"The answer is 42.",
true)
testChatRequest(t, "thinking with multiple sentences",
"Explain quantum computing",
" First, I need to understand the basics. Quantum bits can be in superposition. </think> Quantum computing uses quantum mechanics principles.",
"First, I need to understand the basics. Quantum bits can be in superposition. ",
"Quantum computing uses quantum mechanics principles.",
true)
testChatRequest(t, "no thinking content",
"What is 2+2?",
"</think> The answer is 4.",
"",
"The answer is 4.",
true)
testChatRequest(t, "thinking disabled but template still adds think tag",
"Simple question",
" My thoughts </think> The answer.",
"",
" My thoughts </think> The answer.",
false)
// Test streaming response with template-added <think>
t.Run("streaming with thinking", func(t *testing.T) {
var wg sync.WaitGroup
wg.Add(1)
mock.CompletionFn = func(ctx context.Context, r llm.CompletionRequest, fn func(r llm.CompletionResponse)) error {
defer wg.Done()
// Verify the prompt ends with <think> due to template
if !strings.HasSuffix(r.Prompt, "<think>") {
t.Errorf("expected prompt to end with <think>, got: %q", r.Prompt)
}
// Simulate streaming chunks
responses := []llm.CompletionResponse{
{Content: " I need to consider", Done: false, PromptEvalCount: 1, PromptEvalDuration: 1},
{Content: " multiple factors here...", Done: false, PromptEvalCount: 1, PromptEvalDuration: 1},
{Content: " </think> Based on my analysis,", Done: false, PromptEvalCount: 1, PromptEvalDuration: 1},
{Content: " the solution is straightforward.", Done: true, DoneReason: llm.DoneReasonStop, PromptEvalCount: 1, PromptEvalDuration: 1, EvalCount: 1, EvalDuration: 1},
}
for _, resp := range responses {
select {
case <-ctx.Done():
return ctx.Err()
default:
fn(resp)
time.Sleep(10 * time.Millisecond)
}
}
return nil
}
think := true
w := createRequest(t, s.ChatHandler, api.ChatRequest{
Model: "test-thinking",
Messages: []api.Message{{Role: "user", Content: "Analyze this complex problem"}},
Think: &api.ThinkValue{Value: think},
Stream: streamFalse,
})
wg.Wait()
if w.Code != http.StatusOK {
t.Fatalf("expected status 200, got %d", w.Code)
}
// Parse streaming responses
decoder := json.NewDecoder(w.Body)
var allThinking, allContent strings.Builder
for {
var resp api.ChatResponse
if err := decoder.Decode(&resp); err == io.EOF {
break
} else if err != nil {
t.Fatal(err)
}
allThinking.WriteString(resp.Message.Thinking)
allContent.WriteString(resp.Message.Content)
}
// Note: Leading whitespace after <think> is eaten by the parser
if got := allThinking.String(); got != "I need to consider multiple factors here... " {
t.Errorf("expected thinking %q, got %q", "I need to consider multiple factors here... ", got)
}
if got := allContent.String(); got != "Based on my analysis, the solution is straightforward." {
t.Errorf("expected content %q, got %q", "Based on my analysis, the solution is straightforward.", got)
}
})
}

View File

@@ -291,12 +291,11 @@ func TestChatHarmonyParserStreamingRealtime(t *testing.T) {
// Create a simple test model
_, digest := createHarmonyTestModel(t)
streamFalse := false
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Model: "harmony-test-streaming",
Files: map[string]string{"test.gguf": digest},
Template: `<|start|><|end|>{{ with .Tools }}{{ end }}{{ .Prompt }}`,
Stream: &streamFalse,
Stream: streamFalse,
})
if w.Code != 200 {
@@ -304,11 +303,10 @@ func TestChatHarmonyParserStreamingRealtime(t *testing.T) {
}
// Test chat endpoint with streaming
streamTrue := true
w = createRequest(t, s.ChatHandler, api.ChatRequest{
Model: "harmony-test-streaming",
Messages: []api.Message{{Role: "user", Content: "Hello"}},
Stream: &streamTrue,
Stream: streamTrue,
Tools: getTestTools(),
})
@@ -441,12 +439,11 @@ func TestChatHarmonyParserStreamingSimple(t *testing.T) {
// Create model
_, digest := createHarmonyTestModel(t)
streamFalse := false
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Model: "gpt-oss",
Files: map[string]string{"test.gguf": digest},
Template: `<|start|><|end|>{{ .Tools }}{{ .Prompt }}`,
Stream: &streamFalse,
Stream: streamFalse,
})
if w.Code != 200 {
@@ -454,11 +451,10 @@ func TestChatHarmonyParserStreamingSimple(t *testing.T) {
}
// Test streaming
streamTrue := true
w = createRequest(t, s.ChatHandler, api.ChatRequest{
Model: "gpt-oss",
Messages: []api.Message{{Role: "user", Content: "Hello"}},
Stream: &streamTrue,
Stream: streamTrue,
Tools: getTestTools(),
})
@@ -625,12 +621,11 @@ func TestChatHarmonyParserStreaming(t *testing.T) {
_, digest := createHarmonyTestModel(t)
// Create model with passthrough template
stream := false
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Model: "harmony-test",
Files: map[string]string{"file.gguf": digest},
Template: `<|start|><|end|>{{ with .Tools }}{{ end }}{{ .Prompt }}`,
Stream: &stream,
Stream: streamFalse,
})
if w.Code != http.StatusOK {
@@ -638,11 +633,10 @@ func TestChatHarmonyParserStreaming(t *testing.T) {
}
// Test chat endpoint with streaming
streamTrue := true
w = createRequest(t, s.ChatHandler, api.ChatRequest{
Model: "harmony-test",
Messages: []api.Message{{Role: "user", Content: "Hello"}},
Stream: &streamTrue,
Stream: streamTrue,
Tools: getTestTools(),
})

View File

@@ -28,10 +28,16 @@ import (
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/openai"
"github.com/ollama/ollama/server/internal/client/ollama"
"github.com/ollama/ollama/types"
"github.com/ollama/ollama/types/model"
"github.com/ollama/ollama/version"
)
var (
streamFalse = types.NullWithValue(false)
streamTrue = types.NullWithValue(true)
)
func createTestFile(t *testing.T, name string) (string, string) {
t.Helper()
@@ -332,11 +338,10 @@ func TestRoutes(t *testing.T) {
Path: "/api/create",
Setup: func(t *testing.T, req *http.Request) {
_, digest := createTestFile(t, "ollama-model")
stream := false
createReq := api.CreateRequest{
Name: "t-bone",
Files: map[string]string{"test.gguf": digest},
Stream: &stream,
Stream: streamFalse,
}
jsonData, err := json.Marshal(createReq)
if err != nil {
@@ -638,7 +643,7 @@ func TestManifestCaseSensitivity(t *testing.T) {
// version.
Name: wantStableName,
Files: map[string]string{"test.gguf": digest},
Stream: &stream,
Stream: streamFalse,
}))
checkManifestList()
@@ -646,14 +651,14 @@ func TestManifestCaseSensitivity(t *testing.T) {
checkOK(createRequest(t, s.CreateHandler, api.CreateRequest{
Name: name(),
Files: map[string]string{"test.gguf": digest},
Stream: &stream,
Stream: streamFalse,
}))
checkManifestList()
t.Logf("pulling")
checkOK(createRequest(t, s.PullHandler, api.PullRequest{
Name: name(),
Stream: &stream,
Stream: streamFalse,
Insecure: true,
}))
checkManifestList()

View File

@@ -103,7 +103,9 @@ func eat(s *Parser) (string, string, bool) {
// note that we use the original content, not the trimmed one because we
// don't want to eat any whitespace in the real content if there were no
// thinking tags
return "", s.acc.String(), false
untrimmed := s.acc.String()
s.acc.Reset()
return "", untrimmed, false
}
case thinkingState_ThinkingStartedEatingWhitespace:
trimmed := strings.TrimLeftFunc(s.acc.String(), unicode.IsSpace)

View File

@@ -58,6 +58,15 @@ func TestThinkingStreaming(t *testing.T) {
wantContent: " abc",
wantStateAfter: thinkingState_ThinkingDone,
},
// regression test for a bug where we were transitioning directly to
// ThinkingDone without clearing the buffer. This would cause the first
// step to be output twice
{
input: "def",
wantThinking: "",
wantContent: "def",
wantStateAfter: thinkingState_ThinkingDone,
},
},
},
{

View File

@@ -224,22 +224,45 @@ func findArguments(buffer []byte) (map[string]any, int) {
return nil, 0
}
start := -1
var braces int
var start int = -1
var inString, escaped bool
for i := range buffer {
c := buffer[i]
if escaped {
escaped = false
continue
}
if c == '\\' {
escaped = true
continue
}
if c == '"' {
inString = !inString
continue
}
if inString {
continue
}
for i, c := range buffer {
if c == '{' {
if braces == 0 {
start = i
}
braces++
} else if c == '}' && braces > 0 {
} else if c == '}' {
braces--
if braces == 0 && start != -1 {
object := buffer[start : i+1]
var data map[string]any
if err := json.Unmarshal(object, &data); err != nil {
// not a valid object, keep looking
start = -1
continue
}
@@ -282,6 +305,10 @@ func findArguments(buffer []byte) (map[string]any, int) {
return data, i
}
if braces < 0 {
braces = 0
}
}
}
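A self-contained sketch of the scanning approach in this hunk: track string and escape state so braces inside JSON string literals never touch the depth counter (the unmarshal-and-retry validation from findArguments is omitted):

```go
package main

import "fmt"

// topLevelObject returns the first balanced {...} span in buf, ignoring
// braces inside JSON string literals. Simplified from findArguments above.
func topLevelObject(buf []byte) []byte {
	start, braces := -1, 0
	inString, escaped := false, false
	for i := range buf {
		c := buf[i]
		if escaped {
			escaped = false
			continue
		}
		switch {
		case c == '\\':
			escaped = true
		case c == '"':
			inString = !inString
		case inString:
			// braces inside strings are content, not structure
		case c == '{':
			if braces == 0 {
				start = i
			}
			braces++
		case c == '}':
			braces--
			if braces == 0 && start != -1 {
				return buf[start : i+1]
			}
			if braces < 0 {
				braces = 0
			}
		}
	}
	return nil
}

func main() {
	buf := []byte(`{"code": "if (x > 0) { return true; }"}`)
	fmt.Printf("%s\n", topLevelObject(buf)) // prints the whole object
}
```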

View File

@@ -1,6 +1,7 @@
package tools
import (
"strings"
"testing"
"text/template"
@@ -1140,11 +1141,163 @@ func TestFindArguments(t *testing.T) {
},
{
name: "deepseek",
buffer: []byte(`", "arguments": {"location": "Tokyo"}}</tool_call>`),
buffer: []byte(`"arguments": {"location": "Tokyo"}}</tool_call>`),
want: map[string]any{
"location": "Tokyo",
},
},
{
name: "string with braces",
buffer: []byte(`{"name": "process_code", "arguments": {"code": "if (x > 0) { return true; }"}}`),
want: map[string]any{
"code": "if (x > 0) { return true; }",
},
},
{
name: "string with nested json",
buffer: []byte(`{"name": "send_data", "arguments": {"payload": "{\"nested\": {\"key\": \"value\"}}"}}`),
want: map[string]any{
"payload": `{"nested": {"key": "value"}}`,
},
},
{
name: "string with escaped quotes and braces",
buffer: []byte(`{"name": "analyze", "arguments": {"text": "The JSON is: {\"key\": \"val{ue}\"}"}}`),
want: map[string]any{
"text": `The JSON is: {"key": "val{ue}"}`,
},
},
{
name: "multiple objects with string containing braces",
buffer: []byte(`{"name": "test", "arguments": {"query": "find } in text"}} {"name": "other"}`),
want: map[string]any{
"query": "find } in text",
},
},
{
name: "unmatched closing brace in string",
buffer: []byte(`{"name": "search", "arguments": {"pattern": "regex: }"}}`),
want: map[string]any{
"pattern": "regex: }",
},
},
{
name: "complex nested with mixed braces",
buffer: []byte(`{"name": "analyze", "arguments": {"data": "{\"items\": [{\"value\": \"}\"}, {\"code\": \"if (x) { return y; }\"}]}"}}`),
want: map[string]any{
"data": `{"items": [{"value": "}"}, {"code": "if (x) { return y; }"}]}`,
},
},
{
name: "string with newline and braces",
buffer: []byte(`{"name": "format", "arguments": {"template": "{\n \"key\": \"value\"\n}"}}`),
want: map[string]any{
"template": "{\n \"key\": \"value\"\n}",
},
},
{
name: "string with unicode escape",
buffer: []byte(`{"name": "test", "arguments": {"text": "Unicode: \u007B and \u007D"}}`),
want: map[string]any{
"text": "Unicode: { and }",
},
},
{
name: "array arguments",
buffer: []byte(`{"name": "batch", "arguments": ["item1", "item2", "{\"nested\": true}"]}`),
want: nil, // This should return nil because arguments is not a map
},
{
name: "escaped backslash before quote",
buffer: []byte(`{"name": "path", "arguments": {"dir": "C:\\Program Files\\{App}\\"}}`),
want: map[string]any{
"dir": `C:\Program Files\{App}\`,
},
},
{
name: "single quotes not treated as string delimiters",
buffer: []byte(`{"name": "query", "arguments": {"sql": "SELECT * FROM users WHERE name = '{admin}'"}}`),
want: map[string]any{
"sql": "SELECT * FROM users WHERE name = '{admin}'",
},
},
{
name: "incomplete json at buffer end",
buffer: []byte(`{"name": "test", "arguments": {"data": "some {"`),
want: nil,
},
{
name: "multiple escaped quotes",
buffer: []byte(`{"name": "echo", "arguments": {"msg": "He said \"Hello {World}\" loudly"}}`),
want: map[string]any{
"msg": `He said "Hello {World}" loudly`,
},
},
{
name: "json with comments style string",
buffer: []byte(`{"name": "code", "arguments": {"snippet": "// This is a comment with { and }"}}`),
want: map[string]any{
"snippet": "// This is a comment with { and }",
},
},
{
name: "consecutive escaped backslashes",
buffer: []byte(`{"name": "test", "arguments": {"path": "C:\\\\{folder}\\\\"}}`),
want: map[string]any{
"path": `C:\\{folder}\\`,
},
},
{
name: "empty string with braces after",
buffer: []byte(`{"name": "test", "arguments": {"a": "", "b": "{value}"}}`),
want: map[string]any{
"a": "",
"b": "{value}",
},
},
{
name: "unicode in key names",
buffer: []byte(`{"name": "test", "arguments": {"key{": "value", "key}": "value2"}}`),
want: map[string]any{
"key{": "value",
"key}": "value2",
},
},
{
name: "very long string with braces",
buffer: []byte(`{"name": "test", "arguments": {"data": "` + strings.Repeat("a{b}c", 100) + `"}}`),
want: map[string]any{
"data": strings.Repeat("a{b}c", 100),
},
},
{
name: "tab characters and braces",
buffer: []byte(`{"name": "test", "arguments": {"code": "\tif (true) {\n\t\treturn;\n\t}"}}`),
want: map[string]any{
"code": "\tif (true) {\n\t\treturn;\n\t}",
},
},
{
name: "null byte in string",
buffer: []byte(`{"name": "test", "arguments": {"data": "before\u0000{after}"}}`),
want: map[string]any{
"data": "before\x00{after}",
},
},
{
name: "escaped quote at end of string",
buffer: []byte(`{"name": "test", "arguments": {"data": "text with quote at end\\\""}}`),
want: map[string]any{
"data": `text with quote at end\"`,
},
},
{
name: "mixed array and object in arguments",
buffer: []byte(`{"name": "test", "arguments": {"items": ["{", "}", {"key": "value"}]}}`),
want: map[string]any{
"items": []any{"{", "}", map[string]any{"key": "value"}},
},
},
}
for _, tt := range tests {

types/null.go (new file, 53 lines)
View File

@@ -0,0 +1,53 @@
package types
import (
"encoding/json"
)
// Null represents a value of any type T that may be null.
type Null[T any] struct {
value T
valid bool
}
// NullWithValue creates a new, valid Null[T].
func NullWithValue[T any](value T) Null[T] {
return Null[T]{value: value, valid: true}
}
// Value returns the value of the Null[T] if set, otherwise it returns the provided default value or the zero value of T.
func (n Null[T]) Value(defaultValue ...T) T {
if n.valid {
return n.value
}
if len(defaultValue) > 0 {
return defaultValue[0]
}
var zero T
return zero
}
// SetValue sets the value of the Null[T].
func (n *Null[T]) SetValue(t T) {
n.value = t
n.valid = true
}
// MarshalJSON implements [json.Marshaler].
func (n Null[T]) MarshalJSON() ([]byte, error) {
if n.valid {
return json.Marshal(n.value)
}
return []byte("null"), nil
}
// UnmarshalJSON implements [json.Unmarshaler].
func (n *Null[T]) UnmarshalJSON(data []byte) error {
if string(data) != "null" {
if err := json.Unmarshal(data, &n.value); err != nil {
return err
}
n.valid = true
}
return nil
}

types/null_test.go (new file, 53 lines)
View File

@@ -0,0 +1,53 @@
package types_test
import (
"encoding/json"
"testing"
"github.com/ollama/ollama/types"
)
func TestNull(t *testing.T) {
var s types.Null[string]
if val := s.Value(); val != "" {
t.Errorf("expected Value to return zero value '', got '%s'", val)
}
if val := s.Value("default"); val != "default" {
t.Errorf("expected Value to return default value 'default', got '%s'", val)
}
if bts, err := json.Marshal(s); err != nil {
t.Errorf("unexpected error during MarshalJSON: %v", err)
} else if want := "null"; string(bts) != want {
t.Errorf("expected marshaled JSON to be %s, got %s", want, string(bts))
}
s.SetValue("foo")
if val := s.Value(); val != "foo" {
t.Errorf("expected Value to return 'foo', got '%s'", val)
}
s = types.NullWithValue("bar")
if val := s.Value(); val != "bar" {
t.Errorf("expected Value to return 'bar', got '%s'", val)
}
if bts, err := json.Marshal(s); err != nil {
t.Errorf("unexpected error during MarshalJSON: %v", err)
} else if want := `"bar"`; string(bts) != want {
t.Errorf("expected marshaled JSON to be %s, got %s", want, string(bts))
}
if err := json.Unmarshal([]byte(`null`), &s); err != nil {
t.Errorf("unexpected error during UnmarshalJSON: %v", err)
}
if err := json.Unmarshal([]byte(`"baz"`), &s); err != nil {
t.Errorf("unexpected error during UnmarshalJSON: %v", err)
}
if err := json.Unmarshal([]byte(`1.2345`), &s); err == nil {
t.Error("expected error during UnmarshalJSON with invalid JSON, got nil")
}
}