add min vram for glm-4.7-flash

revert doc changes for recommended models
launch: add recommended models with install flow and sorting
2026-02-02 19:54:33 -05:00 · 2026-02-02 13:53:27 -08:00 · 2026-02-02 13:47:07 -08:00 · 2026-02-02 13:45:46 -08:00 · 2026-02-02 13:29:45 -08:00 · 2026-02-02 11:13:21 -08:00
13 changed files with 577 additions and 78 deletions
--- a/cmd/config/integrations.go
+++ b/cmd/config/integrations.go
@@ -13,6 +13,7 @@ import (
 	"time"

 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/progress"
 	"github.com/spf13/cobra"
 )

@@ -49,6 +50,14 @@ var integrations = map[string]Runner{
 	"openclaw": &Openclaw{},
 }

+// recommendedModels are shown when the user has no models or as suggestions.
+// Order matters: local models first, then cloud models.
+var recommendedModels = []selectItem{
+	{Name: "glm-4.7-flash", Description: "Recommended (requires ~25GB VRAM)"},
+	{Name: "glm-4.7:cloud", Description: "recommended"},
+	{Name: "kimi-k2.5:cloud", Description: "recommended"},
+}
+
 // integrationAliases are hidden from the interactive selector but work as CLI arguments.
 var integrationAliases = map[string]bool{
 	"clawdbot": true,
@@ -94,62 +103,25 @@ func selectModels(ctx context.Context, name, current string) ([]string, error) {
 		return nil, err
 	}

-	if len(models.Models) == 0 {
-		return nil, fmt.Errorf("no models available, run 'ollama pull <model>' first")
-	}
-
-	var items []selectItem
-	cloudModels := make(map[string]bool)
+	var existing []modelInfo
 	for _, m := range models.Models {
-		if m.RemoteModel != "" {
-			cloudModels[m.Name] = true
-		}
-		items = append(items, selectItem{Name: m.Name})
+		existing = append(existing, modelInfo{Name: m.Name, Remote: m.RemoteModel != ""})
 	}

-	if len(items) == 0 {
-		return nil, fmt.Errorf("no local models available, run 'ollama pull <model>' first")
-	}
-
-	// Get previously configured models (saved config takes precedence)
 	var preChecked []string
 	if saved, err := loadIntegration(name); err == nil {
 		preChecked = saved.Models
 	} else if editor, ok := r.(Editor); ok {
 		preChecked = editor.Models()
 	}
-	checked := make(map[string]bool, len(preChecked))
-	for _, n := range preChecked {
-		checked[n] = true
-	}

-	// Resolve current to full name (e.g., "llama3.2" -> "llama3.2:latest")
-	for _, item := range items {
-		if item.Name == current || strings.HasPrefix(item.Name, current+":") {
-			current = item.Name
-			break
-		}
-	}
+	items, preChecked, existingModels, cloudModels := buildModelList(existing, preChecked, current)

-	// If current model is configured, move to front of preChecked
-	if checked[current] {
-		preChecked = append([]string{current}, slices.DeleteFunc(preChecked, func(m string) bool { return m == current })...)
+	if len(items) == 0 {
+		return nil, fmt.Errorf("no models available")
 	}

-	// Sort: checked first, then alphabetical
-	slices.SortFunc(items, func(a, b selectItem) int {
-		ac, bc := checked[a.Name], checked[b.Name]
-		if ac != bc {
-			if ac {
-				return -1
-			}
-			return 1
-		}
-		return strings.Compare(strings.ToLower(a.Name), strings.ToLower(b.Name))
-	})
-
 	var selected []string
-	// only editors support multi-model selection
 	if _, ok := r.(Editor); ok {
 		selected, err = multiSelectPrompt(fmt.Sprintf("Select models for %s:", r), items, preChecked)
 		if err != nil {
@@ -163,7 +135,27 @@ func selectModels(ctx context.Context, name, current string) ([]string, error) {
 		selected = []string{model}
 	}

-	// if any model in selected is a cloud model, ensure signed in
+	var toPull []string
+	for _, m := range selected {
+		if !existingModels[m] {
+			toPull = append(toPull, m)
+		}
+	}
+	if len(toPull) > 0 {
+		msg := fmt.Sprintf("Download %s?", strings.Join(toPull, ", "))
+		if ok, err := confirmPrompt(msg); err != nil {
+			return nil, err
+		} else if !ok {
+			return nil, errCancelled
+		}
+		for _, m := range toPull {
+			fmt.Fprintf(os.Stderr, "\n")
+			if err := pullModel(ctx, client, m); err != nil {
+				return nil, fmt.Errorf("failed to pull %s: %w", m, err)
+			}
+		}
+	}
+
 	var selectedCloudModels []string
 	for _, m := range selected {
 		if cloudModels[m] {
@@ -286,7 +278,6 @@ Examples:
 				return fmt.Errorf("unknown integration: %s", name)
 			}

-			// If launching without --model, use saved config if available
 			if !configFlag && modelFlag == "" {
 				if config, err := loadIntegration(name); err == nil && len(config.Models) > 0 {
 					return runIntegration(name, config.Models[0])
@@ -295,7 +286,6 @@ Examples:

 			var models []string
 			if modelFlag != "" {
-				// When --model is specified, merge with existing models (new model becomes default)
 				models = []string{modelFlag}
 				if existing, err := loadIntegration(name); err == nil && len(existing.Models) > 0 {
 					for _, m := range existing.Models {
@@ -364,3 +354,150 @@ Examples:
 	cmd.Flags().BoolVar(&configFlag, "config", false, "Configure without launching")
 	return cmd
 }
+
+type modelInfo struct {
+	Name   string
+	Remote bool
+}
+
+// buildModelList merges existing models with recommendations, sorts them, and returns
+// the ordered items along with maps of existing and cloud model names.
+func buildModelList(existing []modelInfo, preChecked []string, current string) (items []selectItem, orderedChecked []string, existingModels, cloudModels map[string]bool) {
+	existingModels = make(map[string]bool)
+	cloudModels = make(map[string]bool)
+	recommended := make(map[string]bool)
+	var hasLocalModel, hasCloudModel bool
+
+	for _, rec := range recommendedModels {
+		recommended[rec.Name] = true
+	}
+
+	for _, m := range existing {
+		existingModels[m.Name] = true
+		if m.Remote {
+			cloudModels[m.Name] = true
+			hasCloudModel = true
+		} else {
+			hasLocalModel = true
+		}
+		displayName := strings.TrimSuffix(m.Name, ":latest")
+		existingModels[displayName] = true
+		item := selectItem{Name: displayName}
+		if recommended[displayName] {
+			item.Description = "recommended"
+		}
+		items = append(items, item)
+	}
+
+	for _, rec := range recommendedModels {
+		if existingModels[rec.Name] || existingModels[rec.Name+":latest"] {
+			continue
+		}
+		items = append(items, rec)
+		if isCloudModel(rec.Name) {
+			cloudModels[rec.Name] = true
+		}
+	}
+
+	checked := make(map[string]bool, len(preChecked))
+	for _, n := range preChecked {
+		checked[n] = true
+	}
+
+	// Resolve current to full name (e.g., "llama3.2" -> "llama3.2:latest")
+	for _, item := range items {
+		if item.Name == current || strings.HasPrefix(item.Name, current+":") {
+			current = item.Name
+			break
+		}
+	}
+
+	if checked[current] {
+		preChecked = append([]string{current}, slices.DeleteFunc(preChecked, func(m string) bool { return m == current })...)
+	}
+
+	// Non-existing models get "install?" suffix and are pushed to the bottom.
+	// When user has no models, preserve recommended order.
+	notInstalled := make(map[string]bool)
+	for i := range items {
+		if !existingModels[items[i].Name] {
+			notInstalled[items[i].Name] = true
+			items[i].Description = "recommended, install?"
+		}
+	}
+
+	if hasLocalModel || hasCloudModel {
+		slices.SortStableFunc(items, func(a, b selectItem) int {
+			ac, bc := checked[a.Name], checked[b.Name]
+			aNew, bNew := notInstalled[a.Name], notInstalled[b.Name]
+
+			if ac != bc {
+				if ac {
+					return -1
+				}
+				return 1
+			}
+			if !ac && !bc && aNew != bNew {
+				if aNew {
+					return 1
+				}
+				return -1
+			}
+			return strings.Compare(strings.ToLower(a.Name), strings.ToLower(b.Name))
+		})
+	}
+
+	return items, preChecked, existingModels, cloudModels
+}
+
+func isCloudModel(name string) bool {
+	return strings.HasSuffix(name, ":cloud")
+}
+
+func pullModel(ctx context.Context, client *api.Client, model string) error {
+	p := progress.NewProgress(os.Stderr)
+	defer p.Stop()
+
+	bars := make(map[string]*progress.Bar)
+	var status string
+	var spinner *progress.Spinner
+
+	fn := func(resp api.ProgressResponse) error {
+		if resp.Digest != "" {
+			if resp.Completed == 0 {
+				return nil
+			}
+
+			if spinner != nil {
+				spinner.Stop()
+			}
+
+			bar, ok := bars[resp.Digest]
+			if !ok {
+				name, isDigest := strings.CutPrefix(resp.Digest, "sha256:")
+				name = strings.TrimSpace(name)
+				if isDigest {
+					name = name[:min(12, len(name))]
+				}
+				bar = progress.NewBar(fmt.Sprintf("pulling %s:", name), resp.Total, resp.Completed)
+				bars[resp.Digest] = bar
+				p.Add(resp.Digest, bar)
+			}
+
+			bar.Set(resp.Completed)
+		} else if status != resp.Status {
+			if spinner != nil {
+				spinner.Stop()
+			}
+
+			status = resp.Status
+			spinner = progress.NewSpinner(status)
+			p.Add(status, spinner)
+		}
+
+		return nil
+	}
+
+	request := api.PullRequest{Name: model}
+	return client.Pull(ctx, &request, fn)
+}
--- a/cmd/config/integrations_test.go
+++ b/cmd/config/integrations_test.go
@@ -5,6 +5,7 @@ import (
 	"strings"
 	"testing"

+	"github.com/google/go-cmp/cmp"
 	"github.com/spf13/cobra"
 )

@@ -174,15 +175,226 @@ func TestLaunchCmd_NilHeartbeat(t *testing.T) {
 func TestAllIntegrations_HaveRequiredMethods(t *testing.T) {
 	for name, r := range integrations {
 		t.Run(name, func(t *testing.T) {
-			// Test String() doesn't panic and returns non-empty
 			displayName := r.String()
 			if displayName == "" {
 				t.Error("String() should not return empty")
 			}
-
-			// Test Run() exists (we can't call it without actually running the command)
-			// Just verify the method is available
 			var _ func(string) error = r.Run
 		})
 	}
 }
+
+func TestIsCloudModel(t *testing.T) {
+	tests := []struct {
+		name string
+		want bool
+	}{
+		{"glm-4.7:cloud", true},
+		{"kimi-k2.5:cloud", true},
+		{"glm-4.7-flash", false},
+		{"glm-4.7-flash:latest", false},
+		{"cloud-model", false},
+		{"model:cloudish", false},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if got := isCloudModel(tt.name); got != tt.want {
+				t.Errorf("isCloudModel(%q) = %v, want %v", tt.name, got, tt.want)
+			}
+		})
+	}
+}
+
+func names(items []selectItem) []string {
+	var out []string
+	for _, item := range items {
+		out = append(out, item.Name)
+	}
+	return out
+}
+
+func TestBuildModelList_NoExistingModels(t *testing.T) {
+	items, _, _, _ := buildModelList(nil, nil, "")
+
+	want := []string{"glm-4.7-flash", "glm-4.7:cloud", "kimi-k2.5:cloud"}
+	if diff := cmp.Diff(want, names(items)); diff != "" {
+		t.Errorf("with no existing models, items should be recommended in order (-want +got):\n%s", diff)
+	}
+
+	for _, item := range items {
+		if item.Description != "recommended, install?" {
+			t.Errorf("item %q should have description 'install?', got %q", item.Name, item.Description)
+		}
+	}
+}
+
+func TestBuildModelList_OnlyLocalModels_CloudRecsAtBottom(t *testing.T) {
+	existing := []modelInfo{
+		{Name: "llama3.2:latest", Remote: false},
+		{Name: "qwen2.5:latest", Remote: false},
+	}
+
+	items, _, _, _ := buildModelList(existing, nil, "")
+	got := names(items)
+
+	want := []string{"llama3.2", "qwen2.5", "glm-4.7-flash", "glm-4.7:cloud", "kimi-k2.5:cloud"}
+	if diff := cmp.Diff(want, got); diff != "" {
+		t.Errorf("cloud recs should be at bottom (-want +got):\n%s", diff)
+	}
+}
+
+func TestBuildModelList_BothCloudAndLocal_RegularSort(t *testing.T) {
+	existing := []modelInfo{
+		{Name: "llama3.2:latest", Remote: false},
+		{Name: "glm-4.7:cloud", Remote: true},
+	}
+
+	items, _, _, _ := buildModelList(existing, nil, "")
+	got := names(items)
+
+	want := []string{"glm-4.7:cloud", "llama3.2", "glm-4.7-flash", "kimi-k2.5:cloud"}
+	if diff := cmp.Diff(want, got); diff != "" {
+		t.Errorf("mixed models should be alphabetical (-want +got):\n%s", diff)
+	}
+}
+
+func TestBuildModelList_PreCheckedFirst(t *testing.T) {
+	existing := []modelInfo{
+		{Name: "llama3.2:latest", Remote: false},
+		{Name: "glm-4.7:cloud", Remote: true},
+	}
+
+	items, _, _, _ := buildModelList(existing, []string{"llama3.2"}, "")
+	got := names(items)
+
+	if got[0] != "llama3.2" {
+		t.Errorf("pre-checked model should be first, got %v", got)
+	}
+}
+
+func TestBuildModelList_ExistingRecommendedMarked(t *testing.T) {
+	existing := []modelInfo{
+		{Name: "glm-4.7-flash", Remote: false},
+		{Name: "glm-4.7:cloud", Remote: true},
+	}
+
+	items, _, _, _ := buildModelList(existing, nil, "")
+
+	for _, item := range items {
+		switch item.Name {
+		case "glm-4.7-flash", "glm-4.7:cloud":
+			if item.Description != "recommended" {
+				t.Errorf("installed recommended %q should have description 'recommended', got %q", item.Name, item.Description)
+			}
+		case "kimi-k2.5:cloud":
+			if item.Description != "recommended, install?" {
+				t.Errorf("non-installed recommended %q should have description 'install?', got %q", item.Name, item.Description)
+			}
+		}
+	}
+}
+
+func TestBuildModelList_ExistingCloudModelsNotPushedToBottom(t *testing.T) {
+	existing := []modelInfo{
+		{Name: "glm-4.7-flash", Remote: false},
+		{Name: "glm-4.7:cloud", Remote: true},
+	}
+
+	items, _, _, _ := buildModelList(existing, nil, "")
+	got := names(items)
+
+	// glm-4.7-flash and glm-4.7:cloud are installed so they sort normally;
+	// kimi-k2.5:cloud and qwen3:0.6b are not installed so they go to the bottom
+	want := []string{"glm-4.7-flash", "glm-4.7:cloud", "kimi-k2.5:cloud"}
+	if diff := cmp.Diff(want, got); diff != "" {
+		t.Errorf("existing cloud models should sort normally (-want +got):\n%s", diff)
+	}
+}
+
+func TestBuildModelList_HasRecommendedCloudModel_OnlyNonInstalledAtBottom(t *testing.T) {
+	existing := []modelInfo{
+		{Name: "llama3.2:latest", Remote: false},
+		{Name: "kimi-k2.5:cloud", Remote: true},
+	}
+
+	items, _, _, _ := buildModelList(existing, nil, "")
+	got := names(items)
+
+	// kimi-k2.5:cloud is installed so it sorts normally;
+	// the rest of the recommendations are not installed so they go to the bottom
+	want := []string{"kimi-k2.5:cloud", "llama3.2", "glm-4.7-flash", "glm-4.7:cloud"}
+	if diff := cmp.Diff(want, got); diff != "" {
+		t.Errorf("only non-installed models should be at bottom (-want +got):\n%s", diff)
+	}
+
+	// Non-installed models should have "recommended, install?" description
+	for _, item := range items {
+		if !slices.Contains([]string{"kimi-k2.5:cloud", "llama3.2"}, item.Name) {
+			if item.Description != "recommended, install?" {
+				t.Errorf("non-installed %q should have description 'install?', got %q", item.Name, item.Description)
+			}
+		}
+	}
+}
+
+func TestBuildModelList_LatestTagStripped(t *testing.T) {
+	existing := []modelInfo{
+		{Name: "glm-4.7-flash:latest", Remote: false},
+		{Name: "llama3.2:latest", Remote: false},
+	}
+
+	items, _, existingModels, _ := buildModelList(existing, nil, "")
+	got := names(items)
+
+	// :latest should be stripped from display names
+	for _, name := range got {
+		if strings.HasSuffix(name, ":latest") {
+			t.Errorf("name %q should not have :latest suffix", name)
+		}
+	}
+
+	// glm-4.7-flash should not be duplicated (existing :latest matches the recommendation)
+	count := 0
+	for _, name := range got {
+		if name == "glm-4.7-flash" {
+			count++
+		}
+	}
+	if count != 1 {
+		t.Errorf("glm-4.7-flash should appear exactly once, got %d in %v", count, got)
+	}
+
+	// Stripped name should be in existingModels so it won't be pulled
+	if !existingModels["glm-4.7-flash"] {
+		t.Error("glm-4.7-flash should be in existingModels")
+	}
+}
+
+func TestBuildModelList_ReturnsExistingAndCloudMaps(t *testing.T) {
+	existing := []modelInfo{
+		{Name: "llama3.2:latest", Remote: false},
+		{Name: "glm-4.7:cloud", Remote: true},
+	}
+
+	_, _, existingModels, cloudModels := buildModelList(existing, nil, "")
+
+	if !existingModels["llama3.2"] {
+		t.Error("llama3.2 should be in existingModels")
+	}
+	if !existingModels["glm-4.7:cloud"] {
+		t.Error("glm-4.7:cloud should be in existingModels")
+	}
+	if existingModels["glm-4.7-flash"] {
+		t.Error("glm-4.7-flash should not be in existingModels (it's a recommendation)")
+	}
+
+	if !cloudModels["glm-4.7:cloud"] {
+		t.Error("glm-4.7:cloud should be in cloudModels")
+	}
+	if !cloudModels["kimi-k2.5:cloud"] {
+		t.Error("kimi-k2.5:cloud should be in cloudModels (recommended cloud)")
+	}
+	if cloudModels["llama3.2"] {
+		t.Error("llama3.2 should not be in cloudModels")
+	}
+}
--- a/cmd/config/selector.go
+++ b/cmd/config/selector.go
@@ -353,10 +353,15 @@ func renderMultiSelect(w io.Writer, prompt string, s *multiSelectState) int {
 				suffix = " " + ansiGray + "(default)" + ansiReset
 			}

+			desc := ""
+			if item.Description != "" {
+				desc = " " + ansiGray + "- " + item.Description + ansiReset
+			}
+
 			if idx == s.highlighted && !s.focusOnButton {
-				fmt.Fprintf(w, "  %s%s %s %s%s%s\r\n", ansiBold, prefix, checkbox, item.Name, ansiReset, suffix)
+				fmt.Fprintf(w, "  %s%s %s %s%s%s%s\r\n", ansiBold, prefix, checkbox, item.Name, ansiReset, desc, suffix)
 			} else {
-				fmt.Fprintf(w, "  %s %s %s%s\r\n", prefix, checkbox, item.Name, suffix)
+				fmt.Fprintf(w, "  %s %s %s%s%s\r\n", prefix, checkbox, item.Name, desc, suffix)
 			}
 			lineCount++
 		}
--- a/envconfig/config.go
+++ b/envconfig/config.go
@@ -201,7 +201,7 @@ var (
 	// Enable the new Ollama engine
 	NewEngine = Bool("OLLAMA_NEW_ENGINE")
 	// ContextLength sets the default context length
-	ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 4096)
+	ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 0)
 	// Auth enables authentication between the Ollama client and server
 	UseAuth = Bool("OLLAMA_AUTH")
 	// Enable Vulkan backend
@@ -290,7 +290,7 @@ func AsMap() map[string]EnvVar {
 		"OLLAMA_ORIGINS":           {"OLLAMA_ORIGINS", AllowedOrigins(), "A comma separated list of allowed origins"},
 		"OLLAMA_SCHED_SPREAD":      {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
 		"OLLAMA_MULTIUSER_CACHE":   {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"},
-		"OLLAMA_CONTEXT_LENGTH":    {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default: 4096)"},
+		"OLLAMA_CONTEXT_LENGTH":    {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default: 4k/32k/256k based on VRAM)"},
 		"OLLAMA_NEW_ENGINE":        {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"},
 		"OLLAMA_REMOTES":           {"OLLAMA_REMOTES", Remotes(), "Allowed hosts for remote models (default \"ollama.com\")"},

--- a/envconfig/config_test.go
+++ b/envconfig/config_test.go
@@ -282,7 +282,7 @@ func TestVar(t *testing.T) {

 func TestContextLength(t *testing.T) {
 	cases := map[string]uint{
-		"":     4096,
+		"":     0,
 		"2048": 2048,
 	}

--- a/llm/server.go
+++ b/llm/server.go
@@ -80,6 +80,7 @@ type LlamaServer interface {
 	GetPort() int
 	GetDeviceInfos(ctx context.Context) []ml.DeviceInfo
 	HasExited() bool
+	ContextLength() int
 }

 // llmServer is an instance of a runner hosting a single model
@@ -1200,7 +1201,8 @@ func (s *llmServer) initModel(ctx context.Context, req LoadRequest, operation Lo

 	resp, err := http.DefaultClient.Do(r)
 	if err != nil {
-		return nil, fmt.Errorf("do load request: %w", err)
+		slog.Error("do load request", "error", err)
+		return nil, errors.New("model failed to load, this may be due to resource limitations or an internal error, check ollama server logs for details")
 	}
 	defer resp.Body.Close()

@@ -1901,6 +1903,10 @@ func (s *llmServer) VRAMByGPU(id ml.DeviceID) uint64 {
 	return 0
 }

+func (s *llmServer) ContextLength() int {
+	return s.options.NumCtx
+}
+
 func (s *ollamaServer) GetDeviceInfos(ctx context.Context) []ml.DeviceInfo {
 	devices, err := ml.GetDevicesFromRunner(ctx, s)
 	if err != nil {
--- a/server/routes.go
+++ b/server/routes.go
@@ -75,16 +75,12 @@ func experimentEnabled(name string) bool {

 var useClient2 = experimentEnabled("client2")

-// Low VRAM mode is based on the sum of total VRAM (not free) and triggers
-// reduced context length on some models
-var lowVRAMThreshold uint64 = 20 * format.GibiByte
-
 var mode string = gin.DebugMode

 type Server struct {
-	addr    net.Addr
-	sched   *Scheduler
-	lowVRAM bool
+	addr          net.Addr
+	sched         *Scheduler
+	defaultNumCtx int
 }

 func init() {
@@ -107,8 +103,12 @@ var (
 	errBadTemplate = errors.New("template error")
 )

-func modelOptions(model *Model, requestOpts map[string]any) (api.Options, error) {
+func (s *Server) modelOptions(model *Model, requestOpts map[string]any) (api.Options, error) {
 	opts := api.DefaultOptions()
+	if opts.NumCtx == 0 {
+		opts.NumCtx = s.defaultNumCtx
+	}
+
 	if err := opts.FromMap(model.Options); err != nil {
 		return api.Options{}, err
 	}
@@ -140,20 +140,11 @@ func (s *Server) scheduleRunner(ctx context.Context, name string, caps []model.C
 		return nil, nil, nil, fmt.Errorf("%s %w", name, err)
 	}

-	opts, err := modelOptions(model, requestOpts)
+	opts, err := s.modelOptions(model, requestOpts)
 	if err != nil {
 		return nil, nil, nil, err
 	}

-	// This model is much more capable with a larger context, so set that
-	// unless it would penalize performance too much
-	if !s.lowVRAM && slices.Contains([]string{
-		"gptoss", "gpt-oss",
-		"qwen3vl", "qwen3vlmoe",
-	}, model.Config.ModelFamily) {
-		opts.NumCtx = max(opts.NumCtx, 8192)
-	}
-
 	runnerCh, errCh := s.sched.GetRunner(ctx, model, opts, keepAlive)
 	var runner *runnerRef
 	select {
@@ -1720,10 +1711,18 @@ func Serve(ln net.Listener) error {
 	for _, gpu := range gpus {
 		totalVRAM += gpu.TotalMemory - envconfig.GpuOverhead()
 	}
-	if totalVRAM < lowVRAMThreshold {
-		s.lowVRAM = true
-		slog.Info("entering low vram mode", "total vram", format.HumanBytes2(totalVRAM), "threshold", format.HumanBytes2(lowVRAMThreshold))
+
+	// Set default context based on VRAM tier
+	// Use slightly lower thresholds (47/23 GiB vs. 48/24 GiB) to account for small differences in the exact value
+	switch {
+	case totalVRAM >= 47*format.GibiByte:
+		s.defaultNumCtx = 262144
+	case totalVRAM >= 23*format.GibiByte:
+		s.defaultNumCtx = 32768
+	default:
+		s.defaultNumCtx = 4096
 	}
+	slog.Info("vram-based default context", "total_vram", format.HumanBytes2(totalVRAM), "default_num_ctx", s.defaultNumCtx)

 	err = srvr.Serve(ln)
 	// If server is closed from the signal handler, wait for the ctx to be done
@@ -1897,8 +1896,8 @@ func (s *Server) PsHandler(c *gin.Context) {
 			Details:   modelDetails,
 			ExpiresAt: v.expiresAt,
 		}
-		if v.Options != nil {
-			mr.ContextLength = v.Options.NumCtx
+		if v.llama != nil {
+			mr.ContextLength = v.llama.ContextLength()
 		}
 		// The scheduler waits to set expiresAt, so if a model is loading it's
 		// possible that it will be set to the unix epoch. For those cases, just
--- a/server/routes_debug_test.go
+++ b/server/routes_debug_test.go
@@ -15,6 +15,7 @@ import (
 )

 func TestGenerateDebugRenderOnly(t *testing.T) {
+	t.Setenv("OLLAMA_CONTEXT_LENGTH", "4096")
 	gin.SetMode(gin.TestMode)

 	mock := mockRunner{
@@ -208,6 +209,7 @@ func TestGenerateDebugRenderOnly(t *testing.T) {
 }

 func TestChatDebugRenderOnly(t *testing.T) {
+	t.Setenv("OLLAMA_CONTEXT_LENGTH", "4096")
 	gin.SetMode(gin.TestMode)

 	mock := mockRunner{
--- a/server/routes_generate_renderer_test.go
+++ b/server/routes_generate_renderer_test.go
@@ -20,6 +20,7 @@ import (
 // TestGenerateWithBuiltinRenderer tests that api/generate uses built-in renderers
 // when in chat-like flow (messages present, no suffix, no template)
 func TestGenerateWithBuiltinRenderer(t *testing.T) {
+	t.Setenv("OLLAMA_CONTEXT_LENGTH", "4096")
 	gin.SetMode(gin.TestMode)

 	mock := mockRunner{
@@ -204,6 +205,7 @@ func TestGenerateWithBuiltinRenderer(t *testing.T) {

 // TestGenerateWithDebugRenderOnly tests that debug_render_only works with built-in renderers
 func TestGenerateWithDebugRenderOnly(t *testing.T) {
+	t.Setenv("OLLAMA_CONTEXT_LENGTH", "4096")
 	gin.SetMode(gin.TestMode)

 	mock := mockRunner{
--- a/server/routes_generate_test.go
+++ b/server/routes_generate_test.go
@@ -162,6 +162,7 @@ func TestGenerateChatRemote(t *testing.T) {
 }

 func TestGenerateChat(t *testing.T) {
+	t.Setenv("OLLAMA_CONTEXT_LENGTH", "4096")
 	gin.SetMode(gin.TestMode)

 	mock := mockRunner{
@@ -878,6 +879,7 @@ func TestGenerateChat(t *testing.T) {
 }

 func TestGenerate(t *testing.T) {
+	t.Setenv("OLLAMA_CONTEXT_LENGTH", "4096")
 	gin.SetMode(gin.TestMode)

 	mock := mockRunner{
@@ -2355,6 +2357,7 @@ func TestGenerateWithImages(t *testing.T) {
 // TestImageGenerateStreamFalse tests that image generation respects stream=false
 // and returns a single JSON response instead of streaming ndjson.
 func TestImageGenerateStreamFalse(t *testing.T) {
+	t.Setenv("OLLAMA_CONTEXT_LENGTH", "4096")
 	gin.SetMode(gin.TestMode)

 	p := t.TempDir()
--- a/server/routes_options_test.go
+++ b/server/routes_options_test.go
@@ -0,0 +1,127 @@
+package server
+
+import (
+	"testing"
+)
+
+func TestModelOptionsNumCtxPriority(t *testing.T) {
+	tests := []struct {
+		name           string
+		envContextLen  string // empty means not set (uses 0 sentinel)
+		defaultNumCtx  int    // VRAM-based default
+		modelNumCtx    int    // 0 means not set in model
+		requestNumCtx  int    // 0 means not set in request
+		expectedNumCtx int
+	}{
+		{
+			name:           "vram default when nothing else set",
+			envContextLen:  "",
+			defaultNumCtx:  32768,
+			modelNumCtx:    0,
+			requestNumCtx:  0,
+			expectedNumCtx: 32768,
+		},
+		{
+			name:           "env var overrides vram default",
+			envContextLen:  "8192",
+			defaultNumCtx:  32768,
+			modelNumCtx:    0,
+			requestNumCtx:  0,
+			expectedNumCtx: 8192,
+		},
+		{
+			name:           "model overrides vram default",
+			envContextLen:  "",
+			defaultNumCtx:  32768,
+			modelNumCtx:    16384,
+			requestNumCtx:  0,
+			expectedNumCtx: 16384,
+		},
+		{
+			name:           "model overrides env var",
+			envContextLen:  "8192",
+			defaultNumCtx:  32768,
+			modelNumCtx:    16384,
+			requestNumCtx:  0,
+			expectedNumCtx: 16384,
+		},
+		{
+			name:           "request overrides everything",
+			envContextLen:  "8192",
+			defaultNumCtx:  32768,
+			modelNumCtx:    16384,
+			requestNumCtx:  4096,
+			expectedNumCtx: 4096,
+		},
+		{
+			name:           "request overrides vram default",
+			envContextLen:  "",
+			defaultNumCtx:  32768,
+			modelNumCtx:    0,
+			requestNumCtx:  4096,
+			expectedNumCtx: 4096,
+		},
+		{
+			name:           "request overrides model",
+			envContextLen:  "",
+			defaultNumCtx:  32768,
+			modelNumCtx:    16384,
+			requestNumCtx:  4096,
+			expectedNumCtx: 4096,
+		},
+		{
+			name:           "low vram tier default",
+			envContextLen:  "",
+			defaultNumCtx:  4096,
+			modelNumCtx:    0,
+			requestNumCtx:  0,
+			expectedNumCtx: 4096,
+		},
+		{
+			name:           "high vram tier default",
+			envContextLen:  "",
+			defaultNumCtx:  262144,
+			modelNumCtx:    0,
+			requestNumCtx:  0,
+			expectedNumCtx: 262144,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			// Set or clear environment variable
+			if tt.envContextLen != "" {
+				t.Setenv("OLLAMA_CONTEXT_LENGTH", tt.envContextLen)
+			}
+
+			// Create server with VRAM-based default
+			s := &Server{
+				defaultNumCtx: tt.defaultNumCtx,
+			}
+
+			// Create model options (use float64 as FromMap expects JSON-style numbers)
+			var modelOpts map[string]any
+			if tt.modelNumCtx != 0 {
+				modelOpts = map[string]any{"num_ctx": float64(tt.modelNumCtx)}
+			}
+			model := &Model{
+				Options: modelOpts,
+			}
+
+			// Create request options (use float64 as FromMap expects JSON-style numbers)
+			var requestOpts map[string]any
+			if tt.requestNumCtx != 0 {
+				requestOpts = map[string]any{"num_ctx": float64(tt.requestNumCtx)}
+			}
+
+			opts, err := s.modelOptions(model, requestOpts)
+			if err != nil {
+				t.Fatalf("modelOptions failed: %v", err)
+			}
+
+			if opts.NumCtx != tt.expectedNumCtx {
+				t.Errorf("NumCtx = %d, want %d", opts.NumCtx, tt.expectedNumCtx)
+			}
+		})
+	}
+}
--- a/server/sched_test.go
+++ b/server/sched_test.go
@@ -804,6 +804,7 @@ func (s *mockLlm) GetPort() int                                       { return -
 func (s *mockLlm) GetDeviceInfos(ctx context.Context) []ml.DeviceInfo { return nil }
 func (s *mockLlm) HasExited() bool                                    { return false }
 func (s *mockLlm) GetActiveDeviceIDs() []ml.DeviceID                  { return nil }
+func (s *mockLlm) ContextLength() int                                 { return 0 }

 // TestImageGenRunnerCanBeEvicted verifies that an image generation model
 // loaded in the scheduler can be evicted when idle.
--- a/x/imagegen/server.go
+++ b/x/imagegen/server.go
@@ -347,6 +347,11 @@ func (s *Server) VRAMByGPU(id ml.DeviceID) uint64 {
 	return s.vramSize
 }

+// Context length is not applicable for image generation.
+func (s *Server) ContextLength() int {
+	return 0
+}
+
 func (s *Server) Embedding(ctx context.Context, input string) ([]float32, int, error) {
 	return nil, 0, errors.New("not supported")
 }
Author	SHA1	Message	Date
ParthSareen	d8b954f8e2	add min vram for glm-4.7-flash	2026-02-02 13:53:27 -08:00
ParthSareen	e4c0575f8f	revert doc changes for recommended models	2026-02-02 13:47:07 -08:00
ParthSareen	970621df27	launch: add recommended models with install flow and sorting Show recommended models (glm-4.7-flash, glm-4.7:cloud, kimi-k2.5:cloud) in the model selector. Non-installed models are labeled "recommended, install?" and sorted to the bottom. Selecting a non-installed model prompts for confirmation before pulling. Strip :latest suffix from display names. Update docs to match new recommendations.	2026-02-02 13:45:46 -08:00
ParthSareen	a3024436a8	cmd: launch default models	2026-02-02 13:29:45 -08:00
Richard Lyons	6582f6da5c	llm: Make "do load request" error message more informative	2026-02-02 11:13:21 -08:00
Jesse Gross	0334ffa625	server: use tiered VRAM-based default context length Replace binary low VRAM mode with tiered VRAM thresholds that set default context lengths for all models: - < 24 GiB VRAM: 4,096 context - 24-48 GiB VRAM: 32,768 context - >= 48 GiB VRAM: 262,144 context	2026-02-02 10:47:09 -08:00
Jesse Gross	d11fbd2c60	server: fix ollama ps showing configured instead of actual context length When context length is clamped to the model's trained context length, ollama ps now shows the actual clamped value instead of the originally configured value.	2026-02-02 10:47:09 -08:00