fix: add -O3 optimization to CGO flags

CGO_CFLAGS and CGO_CXXFLAGS were being set without optimization flags, which overrides Go's default -O2 and results in unoptimized C++ code. This caused significant performance degradation in release builds compared to local `go build` which uses the default optimization. - build_darwin.sh: add -O3 to CGO_CFLAGS and CGO_CXXFLAGS exports - Dockerfile: preserve CGO_CFLAGS/CGO_CXXFLAGS from build args instead of overwriting them - app/README.md: update documentation to include -O3
README: Update the "Ollama for ruby" to the most popular and maintained ruby gem. (#13855 )
2026-01-24 07:20:57 -05:00 · 2026-01-24 01:43:11 -08:00 · 2026-01-24 01:24:52 -05:00 · 2026-01-23 20:10:04 -08:00 · 2026-01-23 19:22:32 -08:00
7 changed files with 128 additions and 53 deletions
--- a/4
+++ b/4
@@ -169,8 +169,10 @@ COPY . .
 RUN git clone --depth 1 --branch "$(cat MLX_VERSION)" https://github.com/ml-explore/mlx-c.git build/_deps/mlx-c-src
 ARG GOFLAGS="'-ldflags=-w -s'"
 ENV CGO_ENABLED=1
-ENV CGO_CFLAGS="-I/go/src/github.com/ollama/ollama/build/_deps/mlx-c-src"
+ARG CGO_CFLAGS
 ARG CGO_CXXFLAGS
+ENV CGO_CFLAGS="${CGO_CFLAGS} -I/go/src/github.com/ollama/ollama/build/_deps/mlx-c-src"
+ENV CGO_CXXFLAGS="${CGO_CXXFLAGS}"
 RUN --mount=type=cache,target=/root/.cache/go-build \
    go build -tags mlx -trimpath -buildmode=pie -o /bin/ollama .

--- a/README.md
+++ b/README.md
@@ -558,7 +558,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [LiteLLM](https://github.com/BerriAI/litellm)
 - [OllamaFarm for Go](https://github.com/presbrey/ollamafarm)
 - [OllamaSharp for .NET](https://github.com/awaescher/OllamaSharp)
- [Ollama for Ruby](https://github.com/gbaptista/ollama-ai)
+- [Ollama for Ruby](https://github.com/crmne/ruby_llm)
 - [Ollama-rs for Rust](https://github.com/pepperoni21/ollama-rs)
 - [Ollama-hpp for C++](https://github.com/jmont-dev/ollama-hpp)
 - [Ollama4j for Java](https://github.com/ollama4j/ollama4j)
--- a/app/README.md
+++ b/app/README.md
@@ -75,9 +75,9 @@ The `-dev` flag enables:
 CI builds with Xcode 14.1 for OS compatibility prior to v13.  If you want to manually build v11+ support, you can download the older Xcode [here](https://developer.apple.com/services-account/download?path=/Developer_Tools/Xcode_14.1/Xcode_14.1.xip), extract, then `mv ./Xcode.app /Applications/Xcode_14.1.0.app` then activate with:

 ```
-export CGO_CFLAGS=-mmacosx-version-min=12.0
-export CGO_CXXFLAGS=-mmacosx-version-min=12.0
-export CGO_LDFLAGS=-mmacosx-version-min=12.0
+export CGO_CFLAGS="-O3 -mmacosx-version-min=12.0"
+export CGO_CXXFLAGS="-O3 -mmacosx-version-min=12.0"
+export CGO_LDFLAGS="-mmacosx-version-min=12.0"
 export SDKROOT=/Applications/Xcode_14.1.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk
 export DEVELOPER_DIR=/Applications/Xcode_14.1.0.app/Contents/Developer
 ```
--- a/llama/patches/0032-ggml-enable-MLA-flash-attention-for-GLM-4.7-flash.patch
+++ b/llama/patches/0032-ggml-enable-MLA-flash-attention-for-GLM-4.7-flash.patch
@@ -19,20 +19,20 @@ CUDA changes:
 - Add template instances for ncols2=4
 - Fix nbatch_fa values in nvidia_fp32 config (32->64)
 ---
- ggml/src/ggml-cuda/fattn-mma-f16.cuh           | 15 ++++++++++++---
- ggml/src/ggml-cuda/fattn-tile.cuh              | 18 +++++++++++++++++-
- ggml/src/ggml-cuda/fattn.cu                    | 12 ++++++++----
- ...attn-mma-f16-instance-ncols1_16-ncols2_4.cu |  1 +
- ...fattn-mma-f16-instance-ncols1_2-ncols2_4.cu |  1 +
- ...fattn-mma-f16-instance-ncols1_4-ncols2_4.cu |  1 +
- ...fattn-mma-f16-instance-ncols1_8-ncols2_4.cu |  1 +
- ggml/src/ggml-metal/ggml-metal-device.m        |  8 ++------
- ggml/src/ggml-metal/ggml-metal-ops.cpp         |  2 +-
- ggml/src/ggml-metal/ggml-metal.metal           |  1 +
- 10 files changed, 45 insertions(+), 15 deletions(-)
+ ggml/src/ggml-cuda/fattn-mma-f16.cuh          | 40 +++++++++++++++----
+ ggml/src/ggml-cuda/fattn-tile.cuh             | 16 ++++++++
+ ggml/src/ggml-cuda/fattn.cu                   | 12 ++++--
+ ...ttn-mma-f16-instance-ncols1_16-ncols2_4.cu |  1 +
+ ...attn-mma-f16-instance-ncols1_2-ncols2_4.cu |  1 +
+ ...attn-mma-f16-instance-ncols1_4-ncols2_4.cu |  1 +
+ ...attn-mma-f16-instance-ncols1_8-ncols2_4.cu |  1 +
+ ggml/src/ggml-metal/ggml-metal-device.m       |  8 +---
+ ggml/src/ggml-metal/ggml-metal-ops.cpp        |  2 +-
+ ggml/src/ggml-metal/ggml-metal.metal          |  1 +
+ 10 files changed, 64 insertions(+), 19 deletions(-)

 diff --git a/ggml/src/ggml-cuda/fattn-mma-f16.cuh b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
-index 7bd1044c1..a627302f9 100644
+index 7bd1044c1..3dea2205e 100644
 --- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh
 +++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
@@ -66,7 +66,8 @@ static constexpr __host__ __device__ fattn_mma_config ggml_cuda_fattn_mma_get_co
@@ -65,45 +65,103 @@ index 7bd1044c1..a627302f9 100644
     GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 16,  64, 4,  32, 288, 256,  64, 1, false);
     GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 32, 128, 2,  32, 160, 128,  64, 1, false);
     GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 64, 256, 1,  32, 160, 128,  64, 1, false);
-@@ -1585,3 +1588,9 @@ DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 256,  64)
+@@ -397,7 +400,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
+     constexpr int  ncols           = ncols1 * ncols2;
+     constexpr int  cols_per_warp   = T_B_KQ::I;
+     constexpr int  cols_per_thread = 2; // This is specifically KQ columns, Volta only has a single VKQ column.
+-    constexpr int  np              = nwarps * (cols_per_warp/ncols2) / ncols1; // Number of parallel CUDA warps per Q column.
+    constexpr int  np              = cols_per_warp > ncols ? nwarps : nwarps * cols_per_warp/ncols; // Number of parallel CUDA warps per Q column.
+     constexpr int  nbatch_fa       = ggml_cuda_fattn_mma_get_nbatch_fa(DKQ, DV, ncols);
+     constexpr int  nbatch_K2       = ggml_cuda_fattn_mma_get_nbatch_K2(DKQ, DV, ncols);
+     constexpr int  nbatch_V2       = ggml_cuda_fattn_mma_get_nbatch_V2(DKQ, DV, ncols);
+@@ -467,7 +470,6 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
+                 }
+             }
+         } else {
+-            static_assert(cols_per_warp != 8, "cols_per_warp == 8 not implemented");
+ #pragma unroll
+             for (int k_KQ_0 = k0_start; k_KQ_0 < k0_stop; k_KQ_0 += T_A_KQ::J) {
+                 load_ldmatrix(Q_B[0], tile_Q + (threadIdx.y / np)*(T_B_KQ::I*stride_tile_Q) + k_KQ_0, stride_tile_Q);
+@@ -479,8 +481,18 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
+                     T_A_KQ K_A;
+                     load_ldmatrix(K_A, tile_K + i_KQ_0*stride_tile_K + (k_KQ_0 - k0_start), stride_tile_K);
+ 
+-                    // Wide version of KQ_C is column-major => swap A and B.
+-                    mma(KQ_C[i_KQ_00/(np*T_A_KQ::I)], Q_B[0], K_A);
+                    if constexpr (cols_per_warp == 8) {
+                        mma(KQ_C[i_KQ_00/(np*T_A_KQ::I)], K_A, Q_B[0]);
+                    } else {
+                        // Wide version of KQ_C is column-major
+#if defined(AMD_WMMA_AVAILABLE)
+                        // RDNA matrix C is column-major.
+                        mma(KQ_C[i_KQ_00/(np*T_A_KQ::I)], K_A, Q_B[0]);
+#else
+                        // swap A and B for CUDA.
+                        mma(KQ_C[i_KQ_00/(np*T_A_KQ::I)], Q_B[0], K_A);
+#endif // defined(AMD_WMMA_AVAILABLE)
+                    }
+                 }
+             }
+         }
+@@ -841,7 +853,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
+ 
+     constexpr int  cols_per_warp   = T_B_KQ::I;
+     constexpr int  cols_per_thread = 2; // This is specifically KQ columns, Volta only has a single VKQ column.
+-    constexpr int  np              = nwarps * (cols_per_warp/ncols2) / ncols1; // Number of parallel CUDA warps per Q column.
+    constexpr int  np              = cols_per_warp > ncols ? nwarps : nwarps * cols_per_warp/ncols; // Number of parallel CUDA warps per Q column.
+     constexpr int  nbatch_fa       = ggml_cuda_fattn_mma_get_nbatch_fa     (DKQ, DV, ncols);
+     constexpr int  nbatch_K2       = ggml_cuda_fattn_mma_get_nbatch_K2     (DKQ, DV, ncols);
+     constexpr int  nbatch_V2       = ggml_cuda_fattn_mma_get_nbatch_V2     (DKQ, DV, ncols);
+@@ -1353,6 +1365,13 @@ static __global__ void flash_attn_ext_f16(
+         NO_DEVICE_CODE;
+         return;
+     }
+#ifdef VOLTA_MMA_AVAILABLE
+    if (ncols1*ncols2 < 32) {
+        NO_DEVICE_CODE;
+        return;
+    }
+#endif // VOLTA_MMA_AVAILABLE
+
+ #if __CUDA_ARCH__ == GGML_CUDA_CC_TURING
+     if (ncols1*ncols2 > 32) {
+         NO_DEVICE_CODE;
+@@ -1585,3 +1604,8 @@ DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 256,  64)
 extern DECL_FATTN_MMA_F16_CASE(576, 512, 1, 16);
 extern DECL_FATTN_MMA_F16_CASE(576, 512, 2, 16);
 extern DECL_FATTN_MMA_F16_CASE(576, 512, 4, 16);
 +
-+// GLM 4.7 Flash uses gqa_ratio 4:
-+extern DECL_FATTN_MMA_F16_CASE(576, 512, 2,  4);
-+extern DECL_FATTN_MMA_F16_CASE(576, 512, 4,  4);
-+extern DECL_FATTN_MMA_F16_CASE(576, 512, 8,  4);
-+extern DECL_FATTN_MMA_F16_CASE(576, 512, 16, 4);
+// For GLM 4.7 Flash
+extern DECL_FATTN_MMA_F16_CASE(576, 512,  4,  4);
+extern DECL_FATTN_MMA_F16_CASE(576, 512,  8,  4);
+extern DECL_FATTN_MMA_F16_CASE(576, 512, 16,  4);
 diff --git a/ggml/src/ggml-cuda/fattn-tile.cuh b/ggml/src/ggml-cuda/fattn-tile.cuh
-index 7c4d6fe67..6389ba5c4 100644
+index 7c4d6fe67..371be7442 100644
 --- a/ggml/src/ggml-cuda/fattn-tile.cuh
 +++ b/ggml/src/ggml-cuda/fattn-tile.cuh
@@ -68,6 +68,8 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
     GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2,  64,  64)
     GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2,  64,  64)
 
-+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 256, 2,  64,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 128, 2,  64,  64)
 +    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  8, 256, 2,  64,  64)
     GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2,  64,  64)
 
     return 0;
-@@ -122,7 +124,9 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
+@@ -122,6 +124,8 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
     GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2,  32, 128)
     GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2,  32,  64)
 
-    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2,  32,  64)
-+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 256, 2,  64,  64)
-+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  8, 256, 2,  64,  64)
-+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2,  64,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 128, 2,  32,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  8, 256, 2,  32,  64)
+     GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2,  32,  64)
 
     return 0;
- }
@@ -183,6 +187,8 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am
     GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2,  32, 128)
     GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2,  32, 128)
 
-+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 256, 2,  64,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 128, 2,  64,  64)
 +    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  8, 256, 2,  64,  64)
     GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2,  64,  64)
     GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 32, 512, 1, 128,  64)
@@ -112,8 +170,8 @@ index 7c4d6fe67..6389ba5c4 100644
     GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 5,  32, 256)
     GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 3,  64, 128)
 
-+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 256, 4,  64,  64)
-+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  8, 256, 4,  64,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 128, 2,  64,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  8, 256, 2,  64,  64)
     GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 4,  64,  64)
     GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 32, 256, 2, 128,  64)
 
--- a/ml/backend/ggml/ggml/src/ggml-cuda/fattn-mma-f16.cuh
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-mma-f16.cuh
@@ -400,7 +400,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
    constexpr int  ncols           = ncols1 * ncols2;
    constexpr int  cols_per_warp   = T_B_KQ::I;
    constexpr int  cols_per_thread = 2; // This is specifically KQ columns, Volta only has a single VKQ column.
-    constexpr int  np              = nwarps * (cols_per_warp/ncols2) / ncols1; // Number of parallel CUDA warps per Q column.
+    constexpr int  np              = cols_per_warp > ncols ? nwarps : nwarps * cols_per_warp/ncols; // Number of parallel CUDA warps per Q column.
    constexpr int  nbatch_fa       = ggml_cuda_fattn_mma_get_nbatch_fa(DKQ, DV, ncols);
    constexpr int  nbatch_K2       = ggml_cuda_fattn_mma_get_nbatch_K2(DKQ, DV, ncols);
    constexpr int  nbatch_V2       = ggml_cuda_fattn_mma_get_nbatch_V2(DKQ, DV, ncols);
@@ -470,7 +470,6 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
                }
            }
        } else {
-            static_assert(cols_per_warp != 8, "cols_per_warp == 8 not implemented");
 #pragma unroll
            for (int k_KQ_0 = k0_start; k_KQ_0 < k0_stop; k_KQ_0 += T_A_KQ::J) {
                load_ldmatrix(Q_B[0], tile_Q + (threadIdx.y / np)*(T_B_KQ::I*stride_tile_Q) + k_KQ_0, stride_tile_Q);
@@ -482,8 +481,18 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
                    T_A_KQ K_A;
                    load_ldmatrix(K_A, tile_K + i_KQ_0*stride_tile_K + (k_KQ_0 - k0_start), stride_tile_K);

-                    // Wide version of KQ_C is column-major => swap A and B.
-                    mma(KQ_C[i_KQ_00/(np*T_A_KQ::I)], Q_B[0], K_A);
+                    if constexpr (cols_per_warp == 8) {
+                        mma(KQ_C[i_KQ_00/(np*T_A_KQ::I)], K_A, Q_B[0]);
+                    } else {
+                        // Wide version of KQ_C is column-major
+#if defined(AMD_WMMA_AVAILABLE)
+                        // RDNA matrix C is column-major.
+                        mma(KQ_C[i_KQ_00/(np*T_A_KQ::I)], K_A, Q_B[0]);
+#else
+                        // swap A and B for CUDA.
+                        mma(KQ_C[i_KQ_00/(np*T_A_KQ::I)], Q_B[0], K_A);
+#endif // defined(AMD_WMMA_AVAILABLE)
+                    }
                }
            }
        }
@@ -844,7 +853,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(

    constexpr int  cols_per_warp   = T_B_KQ::I;
    constexpr int  cols_per_thread = 2; // This is specifically KQ columns, Volta only has a single VKQ column.
-    constexpr int  np              = nwarps * (cols_per_warp/ncols2) / ncols1; // Number of parallel CUDA warps per Q column.
+    constexpr int  np              = cols_per_warp > ncols ? nwarps : nwarps * cols_per_warp/ncols; // Number of parallel CUDA warps per Q column.
    constexpr int  nbatch_fa       = ggml_cuda_fattn_mma_get_nbatch_fa     (DKQ, DV, ncols);
    constexpr int  nbatch_K2       = ggml_cuda_fattn_mma_get_nbatch_K2     (DKQ, DV, ncols);
    constexpr int  nbatch_V2       = ggml_cuda_fattn_mma_get_nbatch_V2     (DKQ, DV, ncols);
@@ -1356,6 +1365,13 @@ static __global__ void flash_attn_ext_f16(
        NO_DEVICE_CODE;
        return;
    }
+#ifdef VOLTA_MMA_AVAILABLE
+    if (ncols1*ncols2 < 32) {
+        NO_DEVICE_CODE;
+        return;
+    }
+#endif // VOLTA_MMA_AVAILABLE
+
 #if __CUDA_ARCH__ == GGML_CUDA_CC_TURING
    if (ncols1*ncols2 > 32) {
        NO_DEVICE_CODE;
@@ -1589,8 +1605,7 @@ extern DECL_FATTN_MMA_F16_CASE(576, 512, 1, 16);
 extern DECL_FATTN_MMA_F16_CASE(576, 512, 2, 16);
 extern DECL_FATTN_MMA_F16_CASE(576, 512, 4, 16);

-// GLM 4.7 Flash uses gqa_ratio 4:
-extern DECL_FATTN_MMA_F16_CASE(576, 512, 2,  4);
-extern DECL_FATTN_MMA_F16_CASE(576, 512, 4,  4);
-extern DECL_FATTN_MMA_F16_CASE(576, 512, 8,  4);
-extern DECL_FATTN_MMA_F16_CASE(576, 512, 16, 4);
+// For GLM 4.7 Flash
+extern DECL_FATTN_MMA_F16_CASE(576, 512,  4,  4);
+extern DECL_FATTN_MMA_F16_CASE(576, 512,  8,  4);
+extern DECL_FATTN_MMA_F16_CASE(576, 512, 16,  4);
--- a/ml/backend/ggml/ggml/src/ggml-cuda/fattn-tile.cuh
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-tile.cuh
@@ -68,7 +68,7 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2,  64,  64)

-    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 256, 2,  64,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 128, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  8, 256, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2,  64,  64)

@@ -124,9 +124,9 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2,  32, 128)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2,  32,  64)

-    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 256, 2,  64,  64)
-    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  8, 256, 2,  64,  64)
-    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2,  64,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 128, 2,  32,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  8, 256, 2,  32,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2,  32,  64)

    return 0;
 }
@@ -187,7 +187,7 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2,  32, 128)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2,  32, 128)

-    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 256, 2,  64,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 128, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  8, 256, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 32, 512, 1, 128,  64)
@@ -251,8 +251,8 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 5,  32, 256)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 3,  64, 128)

-    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 256, 4,  64,  64)
-    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  8, 256, 4,  64,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  4, 128, 2,  64,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512,  8, 256, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 4,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(576, 512, 32, 256, 2, 128,  64)

--- a/scripts/build_darwin.sh
+++ b/scripts/build_darwin.sh
@@ -14,8 +14,8 @@
 VOL_NAME=${VOL_NAME:-"Ollama"}
 export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")}
 export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${VERSION#v}\" \"-X=github.com/ollama/ollama/server.mode=release\"'"
-export CGO_CFLAGS="-mmacosx-version-min=14.0"
-export CGO_CXXFLAGS="-mmacosx-version-min=14.0"
+export CGO_CFLAGS="-O3 -mmacosx-version-min=14.0"
+export CGO_CXXFLAGS="-O3 -mmacosx-version-min=14.0"
 export CGO_LDFLAGS="-mmacosx-version-min=14.0"

 set -e
Author	SHA1	Message	Date
jmorganca	134cb6720a	fix: add -O3 optimization to CGO flags CGO_CFLAGS and CGO_CXXFLAGS were being set without optimization flags, which overrides Go's default -O2 and results in unoptimized C++ code. This caused significant performance degradation in release builds compared to local `go build` which uses the default optimization. - build_darwin.sh: add -O3 to CGO_CFLAGS and CGO_CXXFLAGS exports - Dockerfile: preserve CGO_CFLAGS/CGO_CXXFLAGS from build args instead of overwriting them - app/README.md: update documentation to include -O3	2026-01-24 01:43:11 -08:00
Stillhart	b44f56319f	README: Update the "Ollama for ruby" to the most popular and maintained ruby gem. (#13855 ) * update README ruby link the ollama-ai ruby gem is vastly less popular and seems unmaintained https://rubygems.org/gems/ollama-ai the defacto standard with the most downloads in the ruby ecosystem is ruby_llm https://rubygems.org/gems/ruby_llm I would link to that to avoid complication and guarantee feature compatibility with ollama. * Update gem link ruby_llm from website to GitHub ollama links mostly to github, not project websites, hence link to ruby_llm github.	2026-01-24 01:24:52 -05:00
Jeffrey Morgan	0209c268bb	llama: fix CUDA MMA errors in release build (#13874 )	2026-01-23 20:10:04 -08:00
Jeffrey Morgan	912d984346	llama: fix fattn-tile shared memory overflow on sm_50/52 (#13872 ) Use nthreads=128 for ncols=4 configurations in flash attention tile kernel to reduce shared memory usage below 48KB limit on Maxwell architectures (sm_50/52). With nthreads=256 and ncols=4, np=2 which caused shared memory to exceed 48KB. With nthreads=128 and ncols=4, np=1 keeps shared memory under the limit.	2026-01-23 19:22:32 -08:00