cmd: fix context limits for droid and add qwen3-coder-next ctx (#14112)

Revert "move tokenizers to separate package (#13825)" (#14111)
2026-02-06 05:34:21 -05:00 · 2026-02-05 22:29:53 -08:00 · 2026-02-05 20:49:08 -08:00
6 changed files with 125 additions and 3 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -182,7 +182,7 @@ option(MLX_ENGINE "Enable MLX backend" OFF)

 if(MLX_ENGINE)
    message(STATUS "Setting up MLX (this takes a while...)")
-    add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/x/ml/backend/mlx)
+    add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/x/imagegen/mlx)

    # Find CUDA toolkit if MLX is built with CUDA support
    find_package(CUDAToolkit)
@@ -216,4 +216,4 @@ if(MLX_ENGINE)
                COMPONENT MLX)
        endif()
    endif()
-endif()
+endif()
--- a/cmd/config/droid.go
+++ b/cmd/config/droid.go
@@ -1,6 +1,7 @@
 package config

 import (
+	"context"
 	"encoding/json"
 	"fmt"
 	"os"
@@ -8,6 +9,7 @@ import (
 	"path/filepath"
 	"slices"

+	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/envconfig"
 )

@@ -112,9 +114,17 @@ func (d *Droid) Edit(models []string) error {
 	}

 	// Build new Ollama model entries with sequential indices (0, 1, 2, ...)
+	client, _ := api.ClientFromEnvironment()
+
 	var newModels []any
 	var defaultModelID string
 	for i, model := range models {
+		maxOutput := 64000
+		if isCloudModel(context.Background(), client, model) {
+			if l, ok := lookupCloudModelLimit(model); ok {
+				maxOutput = l.Output
+			}
+		}
 		modelID := fmt.Sprintf("custom:%s-%d", model, i)
 		newModels = append(newModels, modelEntry{
 			Model:           model,
@@ -122,7 +132,7 @@ func (d *Droid) Edit(models []string) error {
 			BaseURL:         envconfig.Host().String() + "/v1",
 			APIKey:          "ollama",
 			Provider:        "generic-chat-completion-api",
-			MaxOutputTokens: 64000,
+			MaxOutputTokens: maxOutput,
 			SupportsImages:  false,
 			ID:              modelID,
 			Index:           i,
--- a/cmd/config/droid_test.go
+++ b/cmd/config/droid_test.go
@@ -1251,6 +1251,68 @@ func TestDroidEdit_LargeNumberOfModels(t *testing.T) {
 	}
 }
 
+// TestDroidEdit_LocalModelDefaultMaxOutput checks that Edit writes the default
+// maxOutputTokens of 64000 for a model that has no cloud limit applied.
+func TestDroidEdit_LocalModelDefaultMaxOutput(t *testing.T) {
+	d := &Droid{}
+	tmpDir := t.TempDir()
+	setTestHome(t, tmpDir)
+
+	settingsPath := filepath.Join(tmpDir, ".factory", "settings.json")
+
+	if err := d.Edit([]string{"llama3.2"}); err != nil {
+		t.Fatal(err)
+	}
+
+	// Report a missing or malformed settings file as a test failure instead of panicking below.
+	data, err := os.ReadFile(settingsPath)
+	if err != nil {
+		t.Fatalf("reading %s: %v", settingsPath, err)
+	}
+	var settings map[string]any
+	if err := json.Unmarshal(data, &settings); err != nil {
+		t.Fatalf("parsing %s: %v", settingsPath, err)
+	}
+
+	models, ok := settings["customModels"].([]any)
+	if !ok || len(models) == 0 {
+		t.Fatalf("customModels = %v, want a non-empty array", settings["customModels"])
+	}
+	entry, ok := models[0].(map[string]any)
+	if !ok {
+		t.Fatalf("customModels[0] = %v, want an object", models[0])
+	}
+	if entry["maxOutputTokens"] != float64(64000) {
+		t.Errorf("local model maxOutputTokens = %v, want 64000", entry["maxOutputTokens"])
+	}
+}
+
+func TestDroidEdit_CloudModelLimitsUsed(t *testing.T) {
+	// Verify that every cloud model in cloudModelLimits has a valid output
+	// value that would be used for maxOutputTokens when isCloudModel returns true.
+	// :cloud suffix stripping must also work since that's how users specify them.
+	for name, expected := range cloudModelLimits {
+		t.Run(name, func(t *testing.T) {
+			l, ok := lookupCloudModelLimit(name)
+			if !ok {
+				t.Fatalf("lookupCloudModelLimit(%q) returned false", name)
+			}
+			if l.Output != expected.Output {
+				t.Errorf("output = %d, want %d", l.Output, expected.Output)
+			}
+			// Also verify :cloud suffix lookup
+			cloudName := name + ":cloud"
+			l2, ok := lookupCloudModelLimit(cloudName)
+			if !ok {
+				t.Fatalf("lookupCloudModelLimit(%q) returned false", cloudName)
+			}
+			if l2.Output != expected.Output {
+				t.Errorf(":cloud output = %d, want %d", l2.Output, expected.Output)
+			}
+		})
+	}
+}
+
 func TestDroidEdit_ArraysWithMixedTypes(t *testing.T) {
 	d := &Droid{}
 	tmpDir := t.TempDir()
--- a/cmd/config/opencode.go
+++ b/cmd/config/opencode.go
@@ -39,6 +39,7 @@ var cloudModelLimits = map[string]cloudModelLimit{
 	"kimi-k2-thinking":    {Context: 262_144, Output: 262_144},
 	"nemotron-3-nano:30b": {Context: 1_048_576, Output: 131_072},
 	"qwen3-coder:480b":    {Context: 262_144, Output: 65_536},
+	"qwen3-coder-next":    {Context: 262_144, Output: 32_768},
 	"qwen3-next:80b":      {Context: 262_144, Output: 32_768},
 }

--- a/cmd/config/opencode_test.go
+++ b/cmd/config/opencode_test.go
@@ -633,6 +633,7 @@ func TestLookupCloudModelLimit(t *testing.T) {
 		{"deepseek-v3.2", true, 163_840, 65_536},
 		{"deepseek-v3.2:cloud", true, 163_840, 65_536},
 		{"qwen3-coder:480b", true, 262_144, 65_536},
+		{"qwen3-coder-next:cloud", true, 262_144, 32_768},
 		{"llama3.2", false, 0, 0},
 		{"unknown-model:cloud", false, 0, 0},
 	}
--- a/x/imagegen/mlx/CMakeLists.txt
+++ b/x/imagegen/mlx/CMakeLists.txt
@@ -0,0 +1,77 @@
+include(FetchContent)
+
+# Read the mlx-c git tag from the top-level MLX_VERSION file (shared with Dockerfile)
+set(mlx_version_file "${CMAKE_SOURCE_DIR}/MLX_VERSION")
+# file(READ) alone does not make CMake re-configure when the file changes, so
+# register it explicitly; otherwise a version bump would keep building the old tag.
+set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS "${mlx_version_file}")
+file(READ "${mlx_version_file}" MLX_C_GIT_TAG)
+string(STRIP "${MLX_C_GIT_TAG}" MLX_C_GIT_TAG)
+# An empty tag would reach FetchContent as a GIT_TAG keyword with no value.
+if(MLX_C_GIT_TAG STREQUAL "")
+    message(FATAL_ERROR "MLX_VERSION is empty: ${mlx_version_file}")
+endif()
+
+# Build switches set ahead of FetchContent_MakeAvailable(mlx-c) below;
+# presumably consumed as options by the fetched mlx-c / MLX projects.
+set(MLX_C_BUILD_EXAMPLES OFF)
+
+set(MLX_BUILD_GGUF OFF)
+set(MLX_BUILD_SAFETENSORS ON)
+
+# Send the runtime, library and archive outputs of <_target> to OLLAMA_BUILD_DIR.
+# Silently does nothing when <_target> is not a known target.
+function(set_target_output_directory _target)
+    if(TARGET ${_target})
+        set_target_properties(${_target} PROPERTIES
+            RUNTIME_OUTPUT_DIRECTORY "${OLLAMA_BUILD_DIR}"
+            LIBRARY_OUTPUT_DIRECTORY "${OLLAMA_BUILD_DIR}"
+            ARCHIVE_OUTPUT_DIRECTORY "${OLLAMA_BUILD_DIR}"
+        )
+    endif()
+endfunction()
+
+# Check for Metal support (macOS only)
+if(CMAKE_SYSTEM_NAME MATCHES "Darwin")
+    # Pipe the __METAL_VERSION__ macro through `xcrun metal -E` and keep the last output line.
+    execute_process(
+      COMMAND
+        zsh "-c"
+        "echo \"__METAL_VERSION__\" | xcrun -sdk macosx metal ${XCRUN_FLAGS} -E -x metal -P - | tail -1 | tr -d '\n'"
+      OUTPUT_VARIABLE MLX_METAL_VERSION COMMAND_ERROR_IS_FATAL ANY)
+
+    if(NOT MLX_METAL_VERSION)
+        message(STATUS "`xcrun metal` error. Setting MLX_BUILD_METAL=OFF")
+        set(MLX_BUILD_METAL OFF)
+    endif()
+else()
+    # On non-macOS platforms, disable the Metal backend
+    message(STATUS "Non-macOS platform detected. Setting MLX_BUILD_METAL=OFF")
+    set(MLX_BUILD_METAL OFF)
+endif()
+
+# Map CMAKE_CUDA_ARCHITECTURES to MLX_CUDA_ARCHITECTURES if not explicitly set
+if(NOT MLX_CUDA_ARCHITECTURES AND CMAKE_CUDA_ARCHITECTURES)
+    set(MLX_CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES})
+    message(STATUS "Using CMAKE_CUDA_ARCHITECTURES for MLX: ${MLX_CUDA_ARCHITECTURES}")
+endif()
+
+# Enable CUDA backend if CUDA architectures are specified and CUDA compiler is available
+if(MLX_CUDA_ARCHITECTURES AND CMAKE_CUDA_COMPILER)
+    # FORCE overwrites any MLX_BUILD_CUDA value already present in the cache.
+    set(MLX_BUILD_CUDA ON CACHE BOOL "Build CUDA backend for MLX" FORCE)
+    message(STATUS "Enabling MLX CUDA backend with architectures: ${MLX_CUDA_ARCHITECTURES}")
+elseif(MLX_CUDA_ARCHITECTURES)
+    message(WARNING "MLX_CUDA_ARCHITECTURES specified but CUDA compiler not found, CUDA backend will be disabled")
+endif()
+
+# Fetch mlx-c at the tag read from MLX_VERSION and add it to the build.
+FetchContent_Declare(
+  mlx-c
+  GIT_REPOSITORY "https://github.com/ml-explore/mlx-c.git"
+  GIT_TAG "${MLX_C_GIT_TAG}")
+FetchContent_MakeAvailable(mlx-c)
+
+# Redirect the outputs of the mlx and mlxc targets (skipped for any target that does not exist).
+set_target_output_directory(mlx)
+set_target_output_directory(mlxc)
Author	SHA1	Message	Date
Parth Sareen	42e1d49fbe	cmd: fix context limits for droid and add qwen3-coder-next ctx (#14112)	2026-02-05 22:29:53 -08:00
Michael Yang	814630ca60	Revert "move tokenizers to separate package (#13825)" (#14111)	2026-02-05 20:49:08 -08:00