Compare commits

...

4 Commits

Author SHA1 Message Date
Jeffrey Morgan
912d984346 llama: fix fattn-tile shared memory overflow on sm_50/52 (#13872)
Use nthreads=128 for ncols=4 configurations in flash attention tile
kernel to reduce shared memory usage below 48KB limit on Maxwell
architectures (sm_50/52).

With nthreads=256 and ncols=4, np=2 which caused shared memory to
exceed 48KB. With nthreads=128 and ncols=4, np=1 keeps shared memory
under the limit.
2026-01-23 19:22:32 -08:00
Parth Sareen
aae6ecbaff cmd: rename ollama config to ollama launch (#13871) 2026-01-23 18:40:40 -08:00
Jeffrey Morgan
64737330a4 Re-apply "model: add MLA absorption for glm4moelite" with fix (#13870)
The nvidia_fp32 config for (576, 512) head sizes had nbatch_fa=32,
which caused zero-sized arrays when computing array dimensions:
  nbatch_fa / (np * warp_size) = 32 / (2 * 32) = 0

This resulted in CUDA compilation failures on CUDA 12 (Windows and
Linux arm64):
- "static assertion failed with nbatch_fa % (np*warp_size) != 0"
- "the size of an array must be greater than zero"

Fix by changing nbatch_fa from 32 to 64 for all (576, 512) configs
in the nvidia_fp32 function, matching the nvidia_fp16 and AMD configs.
2026-01-23 18:40:28 -08:00
Jeffrey Morgan
2eda97f1c3 Revert "model: add MLA absorption for glm4moelite (#13810)" (#13869)
This reverts commit 1044b0419a.
2026-01-23 17:14:15 -08:00
6 changed files with 63 additions and 70 deletions

View File

@@ -2031,7 +2031,7 @@ func NewCLI() *cobra.Command {
copyCmd,
deleteCmd,
runnerCmd,
config.ConfigCmd(checkServerHeartbeat),
config.LaunchCmd(checkServerHeartbeat),
)
return rootCmd

View File

@@ -230,15 +230,15 @@ func runIntegration(name, modelName string) error {
return r.Run(modelName)
}
// ConfigCmd returns the cobra command for configuring integrations.
func ConfigCmd(checkServerHeartbeat func(cmd *cobra.Command, args []string) error) *cobra.Command {
// LaunchCmd returns the cobra command for launching integrations.
func LaunchCmd(checkServerHeartbeat func(cmd *cobra.Command, args []string) error) *cobra.Command {
var modelFlag string
var launchFlag bool
var configFlag bool
cmd := &cobra.Command{
Use: "config [INTEGRATION]",
Short: "Configure an external integration to use Ollama",
Long: `Configure an external application to use Ollama models.
Use: "launch [INTEGRATION]",
Short: "Launch an integration with Ollama",
Long: `Launch an integration configured with Ollama models.
Supported integrations:
claude Claude Code
@@ -247,9 +247,10 @@ Supported integrations:
opencode OpenCode
Examples:
ollama config
ollama config claude
ollama config droid --launch`,
ollama launch
ollama launch claude
ollama launch claude --model <model>
ollama launch droid --config (does not auto-launch)`,
Args: cobra.MaximumNArgs(1),
PreRunE: checkServerHeartbeat,
RunE: func(cmd *cobra.Command, args []string) error {
@@ -272,8 +273,8 @@ Examples:
return fmt.Errorf("unknown integration: %s", name)
}
// If --launch without --model, use saved config if available
if launchFlag && modelFlag == "" {
// If launching without --model, use saved config if available
if !configFlag && modelFlag == "" {
if config, err := loadIntegration(name); err == nil && len(config.Models) > 0 {
return runIntegration(name, config.Models[0])
}
@@ -334,29 +335,19 @@ Examples:
}
}
if slices.ContainsFunc(models, func(m string) bool {
return !strings.HasSuffix(m, "cloud")
}) {
fmt.Fprintln(os.Stderr)
fmt.Fprintln(os.Stderr, "Coding agents work best with at least 64k context. Either:")
fmt.Fprintln(os.Stderr, " - Set the context slider in Ollama app settings")
fmt.Fprintln(os.Stderr, " - Run: OLLAMA_CONTEXT_LENGTH=64000 ollama serve")
if configFlag {
if launch, _ := confirmPrompt(fmt.Sprintf("\nLaunch %s now?", r)); launch {
return runIntegration(name, models[0])
}
fmt.Fprintf(os.Stderr, "Run 'ollama launch %s' to start with %s\n", strings.ToLower(name), models[0])
return nil
}
if launchFlag {
return runIntegration(name, models[0])
}
if launch, _ := confirmPrompt(fmt.Sprintf("\nLaunch %s now?", r)); launch {
return runIntegration(name, models[0])
}
fmt.Fprintf(os.Stderr, "Run 'ollama config %s --launch' to start with %s\n", strings.ToLower(name), models[0])
return nil
return runIntegration(name, models[0])
},
}
cmd.Flags().StringVar(&modelFlag, "model", "", "Model to use")
cmd.Flags().BoolVar(&launchFlag, "launch", false, "Launch the integration after configuring")
cmd.Flags().BoolVar(&configFlag, "config", false, "Configure without launching")
return cmd
}

View File

@@ -81,17 +81,17 @@ func TestHasLocalModel(t *testing.T) {
}
}
func TestConfigCmd(t *testing.T) {
func TestLaunchCmd(t *testing.T) {
// Mock checkServerHeartbeat that always succeeds
mockCheck := func(cmd *cobra.Command, args []string) error {
return nil
}
cmd := ConfigCmd(mockCheck)
cmd := LaunchCmd(mockCheck)
t.Run("command structure", func(t *testing.T) {
if cmd.Use != "config [INTEGRATION]" {
t.Errorf("Use = %q, want %q", cmd.Use, "config [INTEGRATION]")
if cmd.Use != "launch [INTEGRATION]" {
t.Errorf("Use = %q, want %q", cmd.Use, "launch [INTEGRATION]")
}
if cmd.Short == "" {
t.Error("Short description should not be empty")
@@ -107,9 +107,9 @@ func TestConfigCmd(t *testing.T) {
t.Error("--model flag should exist")
}
launchFlag := cmd.Flags().Lookup("launch")
if launchFlag == nil {
t.Error("--launch flag should exist")
configFlag := cmd.Flags().Lookup("config")
if configFlag == nil {
t.Error("--config flag should exist")
}
})
@@ -158,11 +158,11 @@ func TestHasLocalModel_DocumentsHeuristic(t *testing.T) {
}
}
func TestConfigCmd_NilHeartbeat(t *testing.T) {
func TestLaunchCmd_NilHeartbeat(t *testing.T) {
// This should not panic - cmd creation should work even with nil
cmd := ConfigCmd(nil)
cmd := LaunchCmd(nil)
if cmd == nil {
t.Fatal("ConfigCmd returned nil")
t.Fatal("LaunchCmd returned nil")
}
// PreRunE should be nil when passed nil

View File

@@ -465,7 +465,7 @@ func confirmPrompt(prompt string) (bool, error) {
}
defer term.Restore(fd, oldState)
fmt.Fprintf(os.Stderr, "%s [y/n] ", prompt)
fmt.Fprintf(os.Stderr, "%s (\033[1my\033[0m/n) ", prompt)
buf := make([]byte, 1)
for {

View File

@@ -1,6 +1,6 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: nobody <>
Date: Fri, 23 Jan 2026 12:42:53 -0800
Date: Sat, 24 Jan 2026 02:31:01 +0000
Subject: [PATCH] ggml: enable MLA flash attention for GLM-4.7-flash
Add support for gqa_ratio 4 in MLA flash attention kernels. GLM-4.7-flash
@@ -18,17 +18,17 @@ CUDA changes:
- Add MMA config cases for ncols 4
- Add template instances for ncols2=4
---
ggml/src/ggml-cuda/fattn-mma-f16.cuh | 15 ++++++++++++---
ggml/src/ggml-cuda/fattn-tile.cuh | 16 ++++++++++++++++
ggml/src/ggml-cuda/fattn.cu | 12 ++++++++----
.../fattn-mma-f16-instance-ncols1_16-ncols2_4.cu | 1 +
.../fattn-mma-f16-instance-ncols1_2-ncols2_4.cu | 1 +
.../fattn-mma-f16-instance-ncols1_4-ncols2_4.cu | 1 +
.../fattn-mma-f16-instance-ncols1_8-ncols2_4.cu | 1 +
ggml/src/ggml-metal/ggml-metal-device.m | 8 ++------
ggml/src/ggml-metal/ggml-metal-ops.cpp | 2 +-
ggml/src/ggml-metal/ggml-metal.metal | 1 +
10 files changed, 44 insertions(+), 14 deletions(-)
ggml/src/ggml-cuda/fattn-mma-f16.cuh | 15 ++++++++++++---
ggml/src/ggml-cuda/fattn-tile.cuh | 18 +++++++++++++++++-
ggml/src/ggml-cuda/fattn.cu | 12 ++++++++----
...attn-mma-f16-instance-ncols1_16-ncols2_4.cu | 1 +
...fattn-mma-f16-instance-ncols1_2-ncols2_4.cu | 1 +
...fattn-mma-f16-instance-ncols1_4-ncols2_4.cu | 1 +
...fattn-mma-f16-instance-ncols1_8-ncols2_4.cu | 1 +
ggml/src/ggml-metal/ggml-metal-device.m | 8 ++------
ggml/src/ggml-metal/ggml-metal-ops.cpp | 2 +-
ggml/src/ggml-metal/ggml-metal.metal | 1 +
10 files changed, 45 insertions(+), 15 deletions(-)
diff --git a/ggml/src/ggml-cuda/fattn-mma-f16.cuh b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
index 7bd1044c1..a627302f9 100644
@@ -75,42 +75,44 @@ index 7bd1044c1..a627302f9 100644
+extern DECL_FATTN_MMA_F16_CASE(576, 512, 8, 4);
+extern DECL_FATTN_MMA_F16_CASE(576, 512, 16, 4);
diff --git a/ggml/src/ggml-cuda/fattn-tile.cuh b/ggml/src/ggml-cuda/fattn-tile.cuh
index 7c4d6fe67..682fb366e 100644
index 7c4d6fe67..6389ba5c4 100644
--- a/ggml/src/ggml-cuda/fattn-tile.cuh
+++ b/ggml/src/ggml-cuda/fattn-tile.cuh
@@ -68,6 +68,8 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2, 64, 64)
+ GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 4, 256, 2, 64, 64)
+ GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 4, 128, 2, 64, 64)
+ GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 8, 256, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2, 64, 64)
return 0;
@@ -122,6 +124,8 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
@@ -122,7 +124,9 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2, 32, 128)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2, 32, 64)
+ GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 4, 256, 2, 32, 64)
- GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2, 32, 64)
+ GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 4, 128, 2, 32, 64)
+ GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 8, 256, 2, 32, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2, 32, 64)
+ GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2, 32, 64)
return 0;
}
@@ -183,6 +187,8 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2, 32, 128)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2, 32, 128)
+ GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 4, 256, 2, 64, 64)
+ GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 4, 128, 2, 64, 64)
+ GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 8, 256, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 32, 512, 1, 128, 64)
@@ -245,6 +251,8 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 5, 32, 256)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 3, 64, 128)
+ GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 4, 256, 4, 64, 64)
+ GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 8, 256, 4, 64, 64)
+ GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 4, 128, 2, 64, 64)
+ GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 8, 256, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 4, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 32, 256, 2, 128, 64)

View File

@@ -68,7 +68,7 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 4, 256, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 4, 128, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 8, 256, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2, 64, 64)
@@ -124,7 +124,7 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2, 32, 128)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2, 32, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 4, 256, 2, 32, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 4, 128, 2, 32, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 8, 256, 2, 32, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2, 32, 64)
@@ -187,7 +187,7 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2, 32, 128)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2, 32, 128)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 4, 256, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 4, 128, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 8, 256, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 32, 512, 1, 128, 64)
@@ -251,8 +251,8 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 5, 32, 256)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 3, 64, 128)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 4, 256, 4, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 8, 256, 4, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 4, 128, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 8, 256, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 4, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 32, 256, 2, 128, 64)