llama: fix fattn-tile shared memory overflow on sm_50/52 (#13872)

Use nthreads=128 for the ncols=4 configurations in the flash attention tile kernel to keep shared memory usage below the 48KB limit on Maxwell architectures (sm_50/52). With nthreads=256 and ncols=4, np=2, which caused shared memory to exceed 48KB. With nthreads=128 and ncols=4, np=1, which keeps shared memory under the limit.
cmd: rename ollama config to ollama launch (#13871)
2026-01-23 23:09:15 -05:00 · 2026-01-23 19:22:32 -08:00 · 2026-01-23 18:40:40 -08:00 · 2026-01-23 18:40:28 -08:00 · 2026-01-23 17:14:15 -08:00
6 changed files with 63 additions and 70 deletions
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -2031,7 +2031,7 @@ func NewCLI() *cobra.Command {
 		copyCmd,
 		deleteCmd,
 		runnerCmd,
-		config.ConfigCmd(checkServerHeartbeat),
+		config.LaunchCmd(checkServerHeartbeat),
 	)

 	return rootCmd
--- a/cmd/config/integrations.go
+++ b/cmd/config/integrations.go
@@ -230,15 +230,15 @@ func runIntegration(name, modelName string) error {
 	return r.Run(modelName)
 }

-// ConfigCmd returns the cobra command for configuring integrations.
-func ConfigCmd(checkServerHeartbeat func(cmd *cobra.Command, args []string) error) *cobra.Command {
+// LaunchCmd returns the cobra command for launching integrations.
+func LaunchCmd(checkServerHeartbeat func(cmd *cobra.Command, args []string) error) *cobra.Command {
 	var modelFlag string
-	var launchFlag bool
+	var configFlag bool

 	cmd := &cobra.Command{
-		Use:   "config [INTEGRATION]",
-		Short: "Configure an external integration to use Ollama",
-		Long: `Configure an external application to use Ollama models.
+		Use:   "launch [INTEGRATION]",
+		Short: "Launch an integration with Ollama",
+		Long: `Launch an integration configured with Ollama models.

 Supported integrations:
  claude    Claude Code
@@ -247,9 +247,10 @@ Supported integrations:
  opencode  OpenCode

 Examples:
-  ollama config
-  ollama config claude
-  ollama config droid --launch`,
+  ollama launch
+  ollama launch claude
+  ollama launch claude --model <model>
+  ollama launch droid --config (does not auto-launch)`,
 		Args:    cobra.MaximumNArgs(1),
 		PreRunE: checkServerHeartbeat,
 		RunE: func(cmd *cobra.Command, args []string) error {
@@ -272,8 +273,8 @@ Examples:
 				return fmt.Errorf("unknown integration: %s", name)
 			}

-			// If --launch without --model, use saved config if available
-			if launchFlag && modelFlag == "" {
+			// If launching without --model, use saved config if available
+			if !configFlag && modelFlag == "" {
 				if config, err := loadIntegration(name); err == nil && len(config.Models) > 0 {
 					return runIntegration(name, config.Models[0])
 				}
@@ -334,29 +335,19 @@ Examples:
 				}
 			}

-			if slices.ContainsFunc(models, func(m string) bool {
-				return !strings.HasSuffix(m, "cloud")
-			}) {
-				fmt.Fprintln(os.Stderr)
-				fmt.Fprintln(os.Stderr, "Coding agents work best with at least 64k context. Either:")
-				fmt.Fprintln(os.Stderr, "  - Set the context slider in Ollama app settings")
-				fmt.Fprintln(os.Stderr, "  - Run: OLLAMA_CONTEXT_LENGTH=64000 ollama serve")
+			if configFlag {
+				if launch, _ := confirmPrompt(fmt.Sprintf("\nLaunch %s now?", r)); launch {
+					return runIntegration(name, models[0])
+				}
+				fmt.Fprintf(os.Stderr, "Run 'ollama launch %s' to start with %s\n", strings.ToLower(name), models[0])
+				return nil
 			}

-			if launchFlag {
-				return runIntegration(name, models[0])
-			}
-
-			if launch, _ := confirmPrompt(fmt.Sprintf("\nLaunch %s now?", r)); launch {
-				return runIntegration(name, models[0])
-			}
-
-			fmt.Fprintf(os.Stderr, "Run 'ollama config %s --launch' to start with %s\n", strings.ToLower(name), models[0])
-			return nil
+			return runIntegration(name, models[0])
 		},
 	}

 	cmd.Flags().StringVar(&modelFlag, "model", "", "Model to use")
-	cmd.Flags().BoolVar(&launchFlag, "launch", false, "Launch the integration after configuring")
+	cmd.Flags().BoolVar(&configFlag, "config", false, "Configure without launching")
 	return cmd
 }
--- a/cmd/config/integrations_test.go
+++ b/cmd/config/integrations_test.go
@@ -81,17 +81,17 @@ func TestHasLocalModel(t *testing.T) {
 	}
 }

-func TestConfigCmd(t *testing.T) {
+func TestLaunchCmd(t *testing.T) {
 	// Mock checkServerHeartbeat that always succeeds
 	mockCheck := func(cmd *cobra.Command, args []string) error {
 		return nil
 	}

-	cmd := ConfigCmd(mockCheck)
+	cmd := LaunchCmd(mockCheck)

 	t.Run("command structure", func(t *testing.T) {
-		if cmd.Use != "config [INTEGRATION]" {
-			t.Errorf("Use = %q, want %q", cmd.Use, "config [INTEGRATION]")
+		if cmd.Use != "launch [INTEGRATION]" {
+			t.Errorf("Use = %q, want %q", cmd.Use, "launch [INTEGRATION]")
 		}
 		if cmd.Short == "" {
 			t.Error("Short description should not be empty")
@@ -107,9 +107,9 @@ func TestConfigCmd(t *testing.T) {
 			t.Error("--model flag should exist")
 		}

-		launchFlag := cmd.Flags().Lookup("launch")
-		if launchFlag == nil {
-			t.Error("--launch flag should exist")
+		configFlag := cmd.Flags().Lookup("config")
+		if configFlag == nil {
+			t.Error("--config flag should exist")
 		}
 	})

@@ -158,11 +158,11 @@ func TestHasLocalModel_DocumentsHeuristic(t *testing.T) {
 	}
 }

-func TestConfigCmd_NilHeartbeat(t *testing.T) {
+func TestLaunchCmd_NilHeartbeat(t *testing.T) {
 	// This should not panic - cmd creation should work even with nil
-	cmd := ConfigCmd(nil)
+	cmd := LaunchCmd(nil)
 	if cmd == nil {
-		t.Fatal("ConfigCmd returned nil")
+		t.Fatal("LaunchCmd returned nil")
 	}

 	// PreRunE should be nil when passed nil
--- a/cmd/config/selector.go
+++ b/cmd/config/selector.go
@@ -465,7 +465,7 @@ func confirmPrompt(prompt string) (bool, error) {
 	}
 	defer term.Restore(fd, oldState)

-	fmt.Fprintf(os.Stderr, "%s [y/n] ", prompt)
+	fmt.Fprintf(os.Stderr, "%s (\033[1my\033[0m/n) ", prompt)

 	buf := make([]byte, 1)
 	for {
--- a/llama/patches/0032-ggml-enable-MLA-flash-attention-for-GLM-4.7-flash.patch
+++ b/llama/patches/0032-ggml-enable-MLA-flash-attention-for-GLM-4.7-flash.patch
@@ -1,6 +1,6 @@
 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
 From: nobody <>
-Date: Fri, 23 Jan 2026 12:42:53 -0800
+Date: Sat, 24 Jan 2026 02:31:01 +0000
 Subject: [PATCH] ggml: enable MLA flash attention for GLM-4.7-flash

 Add support for gqa_ratio 4 in MLA flash attention kernels. GLM-4.7-flash
@@ -18,17 +18,17 @@ CUDA changes:
 - Add MMA config cases for ncols 4
 - Add template instances for ncols2=4
 ---
- ggml/src/ggml-cuda/fattn-mma-f16.cuh             | 15 ++++++++++++---
- ggml/src/ggml-cuda/fattn-tile.cuh                | 16 ++++++++++++++++
- ggml/src/ggml-cuda/fattn.cu                      | 12 ++++++++----
- .../fattn-mma-f16-instance-ncols1_16-ncols2_4.cu |  1 +
- .../fattn-mma-f16-instance-ncols1_2-ncols2_4.cu  |  1 +
- .../fattn-mma-f16-instance-ncols1_4-ncols2_4.cu  |  1 +
- .../fattn-mma-f16-instance-ncols1_8-ncols2_4.cu  |  1 +
- ggml/src/ggml-metal/ggml-metal-device.m          |  8 ++------
- ggml/src/ggml-metal/ggml-metal-ops.cpp           |  2 +-
- ggml/src/ggml-metal/ggml-metal.metal             |  1 +
- 10 files changed, 44 insertions(+), 14 deletions(-)
+ ggml/src/ggml-cuda/fattn-mma-f16.cuh           | 15 ++++++++++++---
+ ggml/src/ggml-cuda/fattn-tile.cuh              | 18 +++++++++++++++++-
+ ggml/src/ggml-cuda/fattn.cu                    | 12 ++++++++----
+ ...attn-mma-f16-instance-ncols1_16-ncols2_4.cu |  1 +
+ ...fattn-mma-f16-instance-ncols1_2-ncols2_4.cu |  1 +
+ ...fattn-mma-f16-instance-ncols1_4-ncols2_4.cu |  1 +
+ ...fattn-mma-f16-instance-ncols1_8-ncols2_4.cu |  1 +
+ ggml/src/ggml-metal/ggml-metal-device.m        |  8 ++------
+ ggml/src/ggml-metal/ggml-metal-ops.cpp         |  2 +-
+ ggml/src/ggml-metal/ggml-metal.metal           |  1 +
+ 10 files changed, 45 insertions(+), 15 deletions(-)

 diff --git a/ggml/src/ggml-cuda/fattn-mma-f16.cuh b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
 index 7bd1044c1..a627302f9 100644
@@ -75,42 +75,44 @@ index 7bd1044c1..a627302f9 100644
 +extern DECL_FATTN_MMA_F16_CASE(576, 512, 8,  4);
 +extern DECL_FATTN_MMA_F16_CASE(576, 512, 16, 4);
 diff --git a/ggml/src/ggml-cuda/fattn-tile.cuh b/ggml/src/ggml-cuda/fattn-tile.cuh
-index 7c4d6fe67..682fb366e 100644
+index 7c4d6fe67..6389ba5c4 100644
 --- a/ggml/src/ggml-cuda/fattn-tile.cuh
 +++ b/ggml/src/ggml-cuda/fattn-tile.cuh
@@ -68,6 +68,8 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
     GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2,  64,  64)
     GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2,  64,  64)
 
-+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 256, 2,  64,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 128, 2,  64,  64)
 +    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  8, 256, 2,  64,  64)
     GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2,  64,  64)
- 
+
     return 0;
-@@ -122,6 +124,8 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
+@@ -122,7 +124,9 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
     GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2,  32, 128)
     GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2,  32,  64)
- 
-+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 256, 2,  32,  64)
+
+-    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2,  32,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 128, 2,  32,  64)
 +    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  8, 256, 2,  32,  64)
-     GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2,  32,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2,  32,  64)
 
     return 0;
+ }
@@ -183,6 +187,8 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am
     GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2,  32, 128)
     GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2,  32, 128)
 
-+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 256, 2,  64,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 128, 2,  64,  64)
 +    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  8, 256, 2,  64,  64)
     GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2,  64,  64)
     GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 32, 512, 1, 128,  64)
- 
+
@@ -245,6 +251,8 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am
     GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 5,  32, 256)
     GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 3,  64, 128)
- 
-+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 256, 4,  64,  64)
-+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  8, 256, 4,  64,  64)
+
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 128, 2,  64,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  8, 256, 2,  64,  64)
     GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 4,  64,  64)
     GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 32, 256, 2, 128,  64)
 
--- a/ml/backend/ggml/ggml/src/ggml-cuda/fattn-tile.cuh
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-tile.cuh
@@ -68,7 +68,7 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2,  64,  64)

-    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 256, 2,  64,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 128, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  8, 256, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2,  64,  64)

@@ -124,7 +124,7 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2,  32, 128)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2,  32,  64)

-    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 256, 2,  32,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 128, 2,  32,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  8, 256, 2,  32,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2,  32,  64)

@@ -187,7 +187,7 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2,  32, 128)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2,  32, 128)

-    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 256, 2,  64,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 128, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  8, 256, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 32, 512, 1, 128,  64)
@@ -251,8 +251,8 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 5,  32, 256)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 3,  64, 128)

-    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 256, 4,  64,  64)
-    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  8, 256, 4,  64,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 128, 2,  64,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  8, 256, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 4,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 32, 256, 2, 128,  64)
Author	SHA1	Message	Date
Jeffrey Morgan	912d984346	llama: fix fattn-tile shared memory overflow on sm_50/52 (#13872) — Use nthreads=128 for the ncols=4 configurations in the flash attention tile kernel to keep shared memory usage below the 48KB limit on Maxwell architectures (sm_50/52). With nthreads=256 and ncols=4, np=2, which caused shared memory to exceed 48KB. With nthreads=128 and ncols=4, np=1, which keeps shared memory under the limit.	2026-01-23 19:22:32 -08:00
Parth Sareen	aae6ecbaff	cmd: rename ollama config to ollama launch (#13871)	2026-01-23 18:40:40 -08:00
Jeffrey Morgan	64737330a4	Re-apply "model: add MLA absorption for glm4moelite" with fix (#13870) — The nvidia_fp32 config for (576, 512) head sizes had nbatch_fa=32, which caused zero-sized arrays when computing array dimensions: nbatch_fa / (np * warp_size) = 32 / (2 * 32) = 0. This resulted in CUDA compilation failures on CUDA 12 (Windows and Linux arm64): "static assertion failed with nbatch_fa % (np*warp_size) != 0" and "the size of an array must be greater than zero". Fix by changing nbatch_fa from 32 to 64 for all (576, 512) configs in the nvidia_fp32 function, matching the nvidia_fp16 and AMD configs.	2026-01-23 18:40:28 -08:00
Jeffrey Morgan	2eda97f1c3	Revert "model: add MLA absorption for glm4moelite (#13810)" (#13869) — This reverts commit `1044b0419a`.	2026-01-23 17:14:15 -08:00