server: add tests and fix isHuggingFaceURL edge case

- Add comprehensive tests for isHuggingFaceURL and getNumDownloadParts
- Fix bug where domains ending in huggingface.co (like nothuggingface.co) would incorrectly match as HuggingFace URLs
- Improve code comments with more detailed documentation
server: reduce download concurrency for HuggingFace URLs
2026-01-19 04:51:17 -05:00 · 2026-01-18 16:45:17 -08:00 · 2026-01-18 16:38:49 -08:00 · 2026-01-13 11:25:31 -08:00 · 2026-01-13 09:13:09 -08:00 · 2026-01-12 22:38:10 -08:00
14 changed files with 440 additions and 84 deletions
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -372,13 +372,17 @@ jobs:
          outputs: type=local,dest=dist/${{ matrix.os }}-${{ matrix.arch }}
          cache-from: type=registry,ref=${{ vars.DOCKER_REPO }}:latest
          cache-to: type=inline
+      - name: Deduplicate CUDA libraries
+        run: |
+          ./scripts/deduplicate_cuda_libs.sh dist/${{ matrix.os }}-${{ matrix.arch }}
      - run: |
          for COMPONENT in bin/* lib/ollama/*; do
            case "$COMPONENT" in
-              bin/ollama)                echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
+              bin/ollama*)               echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
              lib/ollama/*.so*)          echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
              lib/ollama/cuda_v*)        echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
              lib/ollama/vulkan*)        echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
+              lib/ollama/mlx*)           echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
              lib/ollama/cuda_jetpack5)  echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;;
              lib/ollama/cuda_jetpack6)  echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;;
              lib/ollama/rocm)           echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-rocm.tar.in ;;
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -48,9 +48,10 @@ if((CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
    set(GGML_CPU_ALL_VARIANTS ON)
 endif()

-if (CMAKE_OSX_ARCHITECTURES MATCHES "x86_64")
+if(APPLE)
    set(CMAKE_BUILD_RPATH "@loader_path")
    set(CMAKE_INSTALL_RPATH "@loader_path")
+    set(CMAKE_BUILD_WITH_INSTALL_RPATH ON)
 endif()

 set(OLLAMA_BUILD_DIR ${CMAKE_BINARY_DIR}/lib/ollama)
@@ -196,6 +197,14 @@ if(MLX_ENGINE)
        FRAMEWORK DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT MLX
    )

+    # Install the Metal library for macOS arm64 (must be colocated with the binary)
+    # Metal backend is only built for arm64, not x86_64
+    if(APPLE AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
+        install(FILES ${CMAKE_BINARY_DIR}/_deps/mlx-build/mlx/backend/metal/kernels/mlx.metallib
+            DESTINATION ${OLLAMA_INSTALL_DIR}
+            COMPONENT MLX)
+    endif()
+
    # Manually install cudart and cublas since they might not be picked up as direct dependencies
    if(CUDAToolkit_FOUND)
        file(GLOB CUDART_LIBS
--- a/Dockerfile
+++ b/Dockerfile
@@ -161,6 +161,9 @@ ARG GOFLAGS="'-ldflags=-w -s'"
 ENV CGO_ENABLED=1
 ARG CGO_CFLAGS
 ARG CGO_CXXFLAGS
+RUN mkdir -p dist/bin
+RUN --mount=type=cache,target=/root/.cache/go-build \
+    go build -tags mlx -trimpath -buildmode=pie -o dist/bin/ollama-mlx .

 FROM base AS build
 WORKDIR /go/src/github.com/ollama/ollama
@@ -182,6 +185,7 @@ COPY --from=cuda-12 dist/lib/ollama /lib/ollama/
 COPY --from=cuda-13 dist/lib/ollama /lib/ollama/
 COPY --from=vulkan  dist/lib/ollama  /lib/ollama/
 COPY --from=mlx     /go/src/github.com/ollama/ollama/dist/lib/ollama /lib/ollama/
+COPY --from=mlx     /go/src/github.com/ollama/ollama/dist/bin/ /bin/

 FROM --platform=linux/arm64 scratch AS arm64
 # COPY --from=cuda-11 dist/lib/ollama/ /lib/ollama/
--- a/middleware/anthropic.go
+++ b/middleware/anthropic.go
@@ -118,6 +118,9 @@ func AnthropicMessagesMiddleware() gin.HandlerFunc {
 			return
 		}

+		// Flag the request so ChatHandler relaxes think to nil instead of rejecting it for models without thinking support (needed for Anthropic API tools like claude code)
+		c.Set("relax_thinking", true)
+
 		var b bytes.Buffer
 		if err := json.NewEncoder(&b).Encode(chatReq); err != nil {
 			c.AbortWithStatusJSON(http.StatusInternalServerError, anthropic.NewError(http.StatusInternalServerError, err.Error()))
--- a/middleware/anthropic_test.go
+++ b/middleware/anthropic_test.go
@@ -582,3 +582,26 @@ func TestAnthropicWriter_ErrorFromRoutes(t *testing.T) {
 		})
 	}
 }
+
+// TestAnthropicMessagesMiddleware_SetsRelaxThinkingFlag verifies that the middleware
+// stores the "relax_thinking" key in the gin context before the next handler runs.
+func TestAnthropicMessagesMiddleware_SetsRelaxThinkingFlag(t *testing.T) {
+	gin.SetMode(gin.TestMode)
+	var flagSet bool
+	router := gin.New()
+	router.Use(AnthropicMessagesMiddleware())
+	router.POST("/v1/messages", func(c *gin.Context) {
+		// Only the presence of the key is recorded, mirroring how it is consumed.
+		_, flagSet = c.Get("relax_thinking")
+		c.Status(http.StatusOK)
+	})
+
+	body := `{"model": "test-model", "max_tokens": 100, "messages": [{"role": "user", "content": "Hi"}]}`
+	// httptest.NewRequest panics on a malformed request rather than returning an error to discard.
+	req := httptest.NewRequest(http.MethodPost, "/v1/messages", strings.NewReader(body))
+	req.Header.Set("Content-Type", "application/json")
+	router.ServeHTTP(httptest.NewRecorder(), req)
+	if !flagSet {
+		t.Error("expected relax_thinking flag to be set in context")
+	}
+}
--- a/scripts/build_darwin.sh
+++ b/scripts/build_darwin.sh
@@ -73,7 +73,7 @@ _build_darwin() {
            MLX_CGO_CFLAGS="-O3 -I$(pwd)/$BUILD_DIR/_deps/mlx-c-src -mmacosx-version-min=14.0"
            MLX_CGO_LDFLAGS="-L$(pwd)/$BUILD_DIR/lib/ollama -lmlxc -lmlx -Wl,-rpath,@executable_path -lc++ -framework Metal -framework Foundation -framework Accelerate -mmacosx-version-min=14.0"
        fi
-        GOOS=darwin GOARCH=$ARCH CGO_ENABLED=1 CGO_CFLAGS="$MLX_CGO_CFLAGS" CGO_LDFLAGS="$MLX_CGO_LDFLAGS" go build -tags mlx -o $INSTALL_PREFIX/imagegen ./x/imagegen/cmd/engine
+        GOOS=darwin GOARCH=$ARCH CGO_ENABLED=1 CGO_CFLAGS="$MLX_CGO_CFLAGS" CGO_LDFLAGS="$MLX_CGO_LDFLAGS" go build -tags mlx -o $INSTALL_PREFIX/ollama-mlx .
        GOOS=darwin GOARCH=$ARCH CGO_ENABLED=1 go build -o $INSTALL_PREFIX .
    done
 }
@@ -82,19 +82,19 @@ _sign_darwin() {
    status "Creating universal binary..."
    mkdir -p dist/darwin
    lipo -create -output dist/darwin/ollama dist/darwin-*/ollama
-    lipo -create -output dist/darwin/imagegen dist/darwin-*/imagegen
+    lipo -create -output dist/darwin/ollama-mlx dist/darwin-*/ollama-mlx
    chmod +x dist/darwin/ollama
-    chmod +x dist/darwin/imagegen
+    chmod +x dist/darwin/ollama-mlx

    if [ -n "$APPLE_IDENTITY" ]; then
-        for F in dist/darwin/ollama dist/darwin-*/lib/ollama/* dist/darwin/imagegen; do
+        for F in dist/darwin/ollama dist/darwin-*/lib/ollama/* dist/darwin/ollama-mlx; do
            codesign -f --timestamp -s "$APPLE_IDENTITY" --identifier ai.ollama.ollama --options=runtime $F
        done

        # create a temporary zip for notarization
        TEMP=$(mktemp -u).zip
        ditto -c -k --keepParent dist/darwin/ollama "$TEMP"
-        xcrun notarytool submit "$TEMP" --wait --timeout 10m --apple-id $APPLE_ID --password $APPLE_PASSWORD --team-id $APPLE_TEAM_ID
+        xcrun notarytool submit "$TEMP" --wait --timeout 20m --apple-id $APPLE_ID --password $APPLE_PASSWORD --team-id $APPLE_TEAM_ID
        rm -f "$TEMP"
    fi

@@ -154,23 +154,25 @@ _build_macapp() {
    mkdir -p dist/Ollama.app/Contents/Resources
    if [ -d dist/darwin-amd64 ]; then
        lipo -create -output dist/Ollama.app/Contents/Resources/ollama dist/darwin-amd64/ollama dist/darwin-arm64/ollama
-        lipo -create -output dist/Ollama.app/Contents/Resources/imagegen dist/darwin-amd64/imagegen dist/darwin-arm64/imagegen
+        lipo -create -output dist/Ollama.app/Contents/Resources/ollama-mlx dist/darwin-amd64/ollama-mlx dist/darwin-arm64/ollama-mlx
        for F in dist/darwin-amd64/lib/ollama/*mlx*.dylib ; do
            lipo -create -output dist/darwin/$(basename $F) $F dist/darwin-arm64/lib/ollama/$(basename $F)
        done
        cp dist/darwin-*/lib/ollama/*.so dist/darwin-*/lib/ollama/*.dylib dist/Ollama.app/Contents/Resources/
        cp dist/darwin/*.dylib dist/Ollama.app/Contents/Resources/
+        # Copy MLX metallib (architecture-independent, just use arm64 version)
+        cp dist/darwin-arm64/lib/ollama/*.metallib dist/Ollama.app/Contents/Resources/ 2>/dev/null || true
    else
        cp -a dist/darwin/ollama dist/Ollama.app/Contents/Resources/ollama
        cp dist/darwin/*.so dist/darwin/*.dylib dist/Ollama.app/Contents/Resources/
    fi
-    cp -a dist/darwin/imagegen dist/Ollama.app/Contents/Resources/imagegen
+    cp -a dist/darwin/ollama-mlx dist/Ollama.app/Contents/Resources/ollama-mlx
    chmod a+x dist/Ollama.app/Contents/Resources/ollama

    # Sign
    if [ -n "$APPLE_IDENTITY" ]; then
        codesign -f --timestamp -s "$APPLE_IDENTITY" --identifier ai.ollama.ollama --options=runtime dist/Ollama.app/Contents/Resources/ollama
-        for lib in dist/Ollama.app/Contents/Resources/*.so dist/Ollama.app/Contents/Resources/*.dylib dist/Ollama.app/Contents/Resources/imagegen ; do
+        for lib in dist/Ollama.app/Contents/Resources/*.so dist/Ollama.app/Contents/Resources/*.dylib dist/Ollama.app/Contents/Resources/*.metallib dist/Ollama.app/Contents/Resources/ollama-mlx ; do
            codesign -f --timestamp -s "$APPLE_IDENTITY" --identifier ai.ollama.ollama --options=runtime ${lib}
        done
        codesign -f --timestamp -s "$APPLE_IDENTITY" --identifier com.electron.ollama --deep --options=runtime dist/Ollama.app
@@ -178,11 +180,11 @@ _build_macapp() {

    rm -f dist/Ollama-darwin.zip
    ditto -c -k --keepParent dist/Ollama.app dist/Ollama-darwin.zip
-    (cd dist/Ollama.app/Contents/Resources/; tar -cf - ollama imagegen *.so *.dylib) | gzip -9vc > dist/ollama-darwin.tgz
+    (cd dist/Ollama.app/Contents/Resources/; tar -cf - ollama ollama-mlx *.so *.dylib *.metallib 2>/dev/null) | gzip -9vc > dist/ollama-darwin.tgz

    # Notarize and Staple
    if [ -n "$APPLE_IDENTITY" ]; then
-        $(xcrun -f notarytool) submit dist/Ollama-darwin.zip --wait --timeout 10m --apple-id "$APPLE_ID" --password "$APPLE_PASSWORD" --team-id "$APPLE_TEAM_ID"
+        $(xcrun -f notarytool) submit dist/Ollama-darwin.zip --wait --timeout 20m --apple-id "$APPLE_ID" --password "$APPLE_PASSWORD" --team-id "$APPLE_TEAM_ID"
        rm -f dist/Ollama-darwin.zip
        $(xcrun -f stapler) staple dist/Ollama.app
        ditto -c -k --keepParent dist/Ollama.app dist/Ollama-darwin.zip
@@ -206,7 +208,7 @@ _build_macapp() {
        rm -f dist/rw*.dmg

        codesign -f --timestamp -s "$APPLE_IDENTITY" --identifier ai.ollama.ollama --options=runtime dist/Ollama.dmg
-        $(xcrun -f notarytool) submit dist/Ollama.dmg --wait --timeout 10m --apple-id "$APPLE_ID" --password "$APPLE_PASSWORD" --team-id "$APPLE_TEAM_ID"
+        $(xcrun -f notarytool) submit dist/Ollama.dmg --wait --timeout 20m --apple-id "$APPLE_ID" --password "$APPLE_PASSWORD" --team-id "$APPLE_TEAM_ID"
        $(xcrun -f stapler) staple dist/Ollama.dmg
    else
        echo "WARNING: Code signing disabled, this bundle will not work for upgrade testing"
--- a/scripts/build_linux.sh
+++ b/scripts/build_linux.sh
@@ -48,53 +48,12 @@ if echo $PLATFORM | grep "amd64" > /dev/null; then
        .
 fi

-# Deduplicate CUDA libraries across mlx_* and cuda_* directories
-deduplicate_cuda_libs() {
-    local base_dir="$1"
-    echo "Deduplicating CUDA libraries in ${base_dir}..."
-
-    # Find all mlx_cuda_* directories
-    for mlx_dir in "${base_dir}"/lib/ollama/mlx_cuda_*; do
-        [ -d "${mlx_dir}" ] || continue
-
-        # Extract CUDA version (e.g., v12, v13)
-        cuda_version=$(basename "${mlx_dir}" | sed 's/mlx_cuda_//')
-        cuda_dir="${base_dir}/lib/ollama/cuda_${cuda_version}"
-
-        # Skip if corresponding cuda_* directory doesn't exist
-        [ -d "${cuda_dir}" ] || continue
-
-        echo "  Checking ${mlx_dir} against ${cuda_dir}..."
-
-        # Find all .so* files in mlx directory
-        find "${mlx_dir}" -type f -name "*.so*" | while read mlx_file; do
-            filename=$(basename "${mlx_file}")
-            cuda_file="${cuda_dir}/${filename}"
-
-            # Skip if file doesn't exist in cuda directory
-            [ -f "${cuda_file}" ] || continue
-
-            # Compare checksums
-            mlx_sum=$(sha256sum "${mlx_file}" | awk '{print $1}')
-            cuda_sum=$(sha256sum "${cuda_file}" | awk '{print $1}')
-
-            if [ "${mlx_sum}" = "${cuda_sum}" ]; then
-                echo "    Deduplicating ${filename}"
-                # Calculate relative path from mlx_dir to cuda_dir
-                rel_path="../cuda_${cuda_version}/${filename}"
-                rm -f "${mlx_file}"
-                ln -s "${rel_path}" "${mlx_file}"
-            fi
-        done
-    done
-}
-
 # Run deduplication for each platform output directory
 if echo $PLATFORM | grep "," > /dev/null ; then
-    deduplicate_cuda_libs "./dist/linux_amd64"
-    deduplicate_cuda_libs "./dist/linux_arm64"
+    $(dirname $0)/deduplicate_cuda_libs.sh "./dist/linux_amd64"
+    $(dirname $0)/deduplicate_cuda_libs.sh "./dist/linux_arm64"
 elif echo $PLATFORM | grep "amd64\|arm64" > /dev/null ; then
-    deduplicate_cuda_libs "./dist"
+    $(dirname $0)/deduplicate_cuda_libs.sh "./dist"
 fi

 # buildx behavior changes for single vs. multiplatform
--- a/scripts/deduplicate_cuda_libs.sh
+++ b/scripts/deduplicate_cuda_libs.sh
@@ -0,0 +1,60 @@
+#!/bin/sh
+#
+# Deduplicate CUDA libraries across mlx_* and cuda_* directories
+# This script finds identical .so* files in mlx_cuda_* directories that exist
+# in corresponding cuda_* directories and replaces them with symlinks.
+# Usage: deduplicate_cuda_libs.sh <base_directory>
+
+set -eu
+
+if [ $# -eq 0 ]; then
+    echo "ERROR: No directory specified" >&2
+    echo "Usage: $0 <base_directory>" >&2
+    exit 1
+fi
+
+base_dir="$1"
+
+if [ ! -d "${base_dir}" ]; then
+    echo "ERROR: Directory ${base_dir} does not exist" >&2
+    exit 1
+fi
+
+echo "Deduplicating CUDA libraries in ${base_dir}..."
+
+# Find all mlx_cuda_* directories
+for mlx_dir in "${base_dir}"/lib/ollama/mlx_cuda_*; do
+    [ -d "${mlx_dir}" ] || continue
+
+    # Extract CUDA version (e.g., v12, v13)
+    cuda_version=$(basename "${mlx_dir}" | sed 's/mlx_cuda_//')
+    cuda_dir="${base_dir}/lib/ollama/cuda_${cuda_version}"
+
+    # Skip if corresponding cuda_* directory doesn't exist
+    [ -d "${cuda_dir}" ] || continue
+
+    echo "  Checking ${mlx_dir} against ${cuda_dir}..."
+
+    # Find all .so* files in mlx directory (IFS=/-r keep each path byte-for-byte)
+    find "${mlx_dir}" -type f -name "*.so*" | while IFS= read -r mlx_file; do
+        filename=$(basename "${mlx_file}")
+        cuda_file="${cuda_dir}/${filename}"
+
+        # Skip if file doesn't exist in cuda directory
+        [ -f "${cuda_file}" ] || continue
+
+        # Compare checksums
+        mlx_sum=$(sha256sum "${mlx_file}" | awk '{print $1}')
+        cuda_sum=$(sha256sum "${cuda_file}" | awk '{print $1}')
+
+        if [ "${mlx_sum}" = "${cuda_sum}" ]; then
+            echo "    Deduplicating ${filename}"
+            # Relative link target so the tree stays valid wherever it is unpacked
+            rel_path="../cuda_${cuda_version}/${filename}"
+            rm -f "${mlx_file}"
+            ln -s "${rel_path}" "${mlx_file}"
+        fi
+    done
+done
+
+echo "Deduplication complete"
--- a/server/download.go
+++ b/server/download.go
@@ -95,11 +95,48 @@ func (p *blobDownloadPart) UnmarshalJSON(b []byte) error {
 }

 const (
-	numDownloadParts          = 16
+	// numDownloadParts is the default number of concurrent download parts for standard downloads
+	numDownloadParts = 16
+	// numHFDownloadParts is the reduced number of concurrent download parts for HuggingFace
+	// downloads to avoid triggering rate limits (HTTP 429 errors). See GitHub issue #13297.
+	numHFDownloadParts        = 4
 	minDownloadPartSize int64 = 100 * format.MegaByte
 	maxDownloadPartSize int64 = 1000 * format.MegaByte
 )

+// isHuggingFaceURL reports whether u points at a HuggingFace host. The hostname
+// is compared case-insensitively and matches when it is one of the base domains
+// below or a subdomain of one (e.g. cdn-lfs.huggingface.co, cdn-lfs3.hf.co).
+// A nil URL never matches.
+func isHuggingFaceURL(u *url.URL) bool {
+	if u == nil {
+		return false
+	}
+	host := strings.ToLower(u.Hostname())
+	for _, domain := range []string{"huggingface.co", "hf.co"} {
+		if host == domain || strings.HasSuffix(host, "."+domain) {
+			return true
+		}
+	}
+	return false
+}
+
+// getNumDownloadParts picks the number of concurrent download parts for u.
+// Non-HuggingFace URLs always get numDownloadParts. HuggingFace URLs get
+// numHFDownloadParts unless OLLAMA_HF_CONCURRENCY holds a positive integer,
+// in which case that value is returned; an unset, empty, non-numeric, zero
+// or negative value is ignored (strconv.Atoi rejects the empty string, so
+// no separate emptiness check is needed).
+func getNumDownloadParts(u *url.URL) int {
+	if !isHuggingFaceURL(u) {
+		return numDownloadParts
+	}
+	if n, err := strconv.Atoi(os.Getenv("OLLAMA_HF_CONCURRENCY")); err == nil && n > 0 {
+		return n
+	}
+	return numHFDownloadParts
+}
+
 func (p *blobDownloadPart) Name() string {
 	return strings.Join([]string{
 		p.blobDownload.Name, "partial", strconv.Itoa(p.N),
@@ -271,7 +308,11 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *regis
 	}

 	g, inner := errgroup.WithContext(ctx)
-	g.SetLimit(numDownloadParts)
+	concurrency := getNumDownloadParts(directURL)
+	if concurrency != numDownloadParts {
+		slog.Info(fmt.Sprintf("using reduced concurrency (%d) for HuggingFace download", concurrency))
+	}
+	g.SetLimit(concurrency)
 	for i := range b.Parts {
 		part := b.Parts[i]
 		if part.Completed.Load() == part.Size {
--- a/server/download_test.go
+++ b/server/download_test.go
@@ -0,0 +1,194 @@
+package server
+
+import (
+	"net/url"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestIsHuggingFaceURL(t *testing.T) {
+	cases := []struct {
+		name   string
+		rawURL string
+		want   bool
+	}{
+		{
+			name:   "nil url",
+			rawURL: "",
+			want:   false,
+		},
+		{
+			name:   "huggingface.co main domain",
+			rawURL: "https://huggingface.co/some/model",
+			want:   true,
+		},
+		{
+			name:   "cdn-lfs.huggingface.co subdomain",
+			rawURL: "https://cdn-lfs.huggingface.co/repos/abc/123",
+			want:   true,
+		},
+		{
+			name:   "cdn-lfs3.hf.co CDN domain",
+			rawURL: "https://cdn-lfs3.hf.co/repos/abc/123",
+			want:   true,
+		},
+		{
+			name:   "hf.co shortlink domain",
+			rawURL: "https://hf.co/model",
+			want:   true,
+		},
+		{
+			name:   "uppercase HuggingFace domain",
+			rawURL: "https://HUGGINGFACE.CO/model",
+			want:   true,
+		},
+		{
+			name:   "mixed case HF domain",
+			rawURL: "https://Cdn-Lfs.HF.Co/repos",
+			want:   true,
+		},
+		{
+			name:   "ollama registry",
+			rawURL: "https://registry.ollama.ai/v2/library/llama3",
+			want:   false,
+		},
+		{
+			name:   "github.com",
+			rawURL: "https://github.com/ollama/ollama",
+			want:   false,
+		},
+		{
+			name:   "fake huggingface domain",
+			rawURL: "https://nothuggingface.co/model",
+			want:   false,
+		},
+		{
+			name:   "fake hf domain",
+			rawURL: "https://nothf.co/model",
+			want:   false,
+		},
+		{
+			name:   "huggingface in path not host",
+			rawURL: "https://example.com/huggingface.co/model",
+			want:   false,
+		},
+	}
+
+	for _, tt := range cases {
+		t.Run(tt.name, func(t *testing.T) {
+			// An empty rawURL stands in for a nil *url.URL.
+			var parsed *url.URL
+			if tt.rawURL != "" {
+				p, err := url.Parse(tt.rawURL)
+				if err != nil {
+					t.Fatalf("failed to parse URL: %v", err)
+				}
+				parsed = p
+			}
+			assert.Equal(t, tt.want, isHuggingFaceURL(parsed))
+		})
+	}
+}
+
+// TestGetNumDownloadParts checks the per-host concurrency choice and the OLLAMA_HF_CONCURRENCY override.
+func TestGetNumDownloadParts(t *testing.T) {
+	tests := []struct {
+		name        string
+		url         string
+		envValue    string
+		expected    int
+		description string
+	}{
+		{
+			name:        "nil url returns default",
+			url:         "",
+			envValue:    "",
+			expected:    numDownloadParts,
+			description: "nil URL should return standard concurrency",
+		},
+		{
+			name:        "ollama registry returns default",
+			url:         "https://registry.ollama.ai/v2/library/llama3",
+			envValue:    "",
+			expected:    numDownloadParts,
+			description: "Ollama registry should use standard concurrency",
+		},
+		{
+			name:        "huggingface returns reduced default",
+			url:         "https://huggingface.co/model/repo",
+			envValue:    "",
+			expected:    numHFDownloadParts,
+			description: "HuggingFace should use reduced concurrency",
+		},
+		{
+			name:        "hf.co CDN returns reduced default",
+			url:         "https://cdn-lfs3.hf.co/repos/abc/123",
+			envValue:    "",
+			expected:    numHFDownloadParts,
+			description: "HuggingFace CDN should use reduced concurrency",
+		},
+		{
+			name:        "huggingface with env override",
+			url:         "https://huggingface.co/model/repo",
+			envValue:    "2",
+			expected:    2,
+			description: "OLLAMA_HF_CONCURRENCY should override default",
+		},
+		{
+			name:        "huggingface with higher env override",
+			url:         "https://huggingface.co/model/repo",
+			envValue:    "8",
+			expected:    8,
+			description: "OLLAMA_HF_CONCURRENCY can be set higher than default",
+		},
+		{
+			name:        "huggingface with invalid env (non-numeric)",
+			url:         "https://huggingface.co/model/repo",
+			envValue:    "invalid",
+			expected:    numHFDownloadParts,
+			description: "Invalid OLLAMA_HF_CONCURRENCY should fall back to default",
+		},
+		{
+			name:        "huggingface with invalid env (zero)",
+			url:         "https://huggingface.co/model/repo",
+			envValue:    "0",
+			expected:    numHFDownloadParts,
+			description: "Zero OLLAMA_HF_CONCURRENCY should fall back to default",
+		},
+		{
+			name:        "huggingface with invalid env (negative)",
+			url:         "https://huggingface.co/model/repo",
+			envValue:    "-1",
+			expected:    numHFDownloadParts,
+			description: "Negative OLLAMA_HF_CONCURRENCY should fall back to default",
+		},
+		{
+			name:        "non-huggingface ignores env",
+			url:         "https://registry.ollama.ai/v2/library/llama3",
+			envValue:    "2",
+			expected:    numDownloadParts,
+			description: "OLLAMA_HF_CONCURRENCY should not affect non-HF URLs",
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			// Always set the variable so a value inherited from the developer's
+			// shell or CI cannot leak into the "unset" cases; getNumDownloadParts
+			// treats an empty value the same as an unset one.
+			t.Setenv("OLLAMA_HF_CONCURRENCY", tc.envValue)
+
+			var u *url.URL
+			if tc.url != "" {
+				var err error
+				u, err = url.Parse(tc.url)
+				if err != nil {
+					t.Fatalf("failed to parse URL: %v", err)
+				}
+			}
+			got := getNumDownloadParts(u)
+			assert.Equal(t, tc.expected, got, tc.description)
+		})
+	}
+}
--- a/server/routes.go
+++ b/server/routes.go
@@ -2072,8 +2072,14 @@ func (s *Server) ChatHandler(c *gin.Context) {
 		}
 	} else {
 		if req.Think != nil && req.Think.Bool() {
-			c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("%q does not support thinking", req.Model)})
-			return
+			// Set think to nil when being used with Anthropic API to connect to tools like claude code
+			if _, ok := c.Get("relax_thinking"); ok {
+				slog.Warn("model does not support thinking, relaxing thinking to nil", "model", req.Model)
+				req.Think = nil
+			} else {
+				c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("%q does not support thinking", req.Model)})
+				return
+			}
 		}
 	}

--- a/x/README.md
+++ b/x/README.md
@@ -1,24 +1,50 @@
-# Experimental Features 
+# Experimental Features

 ## MLX Backend

 We're working on a new experimental backend based on the [MLX project](https://github.com/ml-explore/mlx)

-Support is currently limited to MacOS and Linux with CUDA GPUs.  We're looking to add support for Windows CUDA soon, and other GPU vendors.  To build:
+Support is currently limited to macOS and Linux with CUDA GPUs. We're looking to add support for Windows CUDA soon, and other GPU vendors.

-```
+### Building ollama-mlx
+
+The `ollama-mlx` binary is a separate build of Ollama with MLX support enabled. This enables experimental features like image generation.
+
+#### macOS (Apple Silicon and Intel)
+
+```bash
+# Build MLX backend libraries
 cmake --preset MLX
 cmake --build --preset MLX --parallel
 cmake --install build --component MLX
-go build -tags mlx .
+
+# Build ollama-mlx binary
+go build -tags mlx -o ollama-mlx .
 ```

-On linux, use the preset "MLX CUDA 13" or "MLX CUDA 12" to enable CUDA with the default Ollama NVIDIA GPU architectures enabled. 
+#### Linux (CUDA)
+
+On Linux, use the preset "MLX CUDA 13" or "MLX CUDA 12" to enable CUDA with the default Ollama NVIDIA GPU architectures enabled:
+
+```bash
+# Build MLX backend libraries with CUDA support
+cmake --preset 'MLX CUDA 13'
+cmake --build --preset 'MLX CUDA 13' --parallel
+cmake --install build --component MLX
+
+# Build ollama-mlx binary
+CGO_CFLAGS="-O3 -I$(pwd)/build/_deps/mlx-c-src" \
+CGO_LDFLAGS="-L$(pwd)/build/lib/ollama -lmlxc -lmlx" \
+go build -tags mlx -o ollama-mlx .
+```
+
+#### Using build scripts
+
+The build scripts automatically create the `ollama-mlx` binary:
+
+- **macOS**: `./scripts/build_darwin.sh` produces `dist/darwin/ollama-mlx`
+- **Linux**: `./scripts/build_linux.sh` produces `ollama-mlx` in the output archives

 ## Image Generation

-Based on the experimental MLX backend, we're working on adding imagegen support.  After running the cmake commands above:
-
-```
-go build -o imagegen ./x/imagegen/cmd/engine
-```
+Image generation is built into the `ollama-mlx` binary. Run `ollama-mlx serve` to start the server with image generation support enabled.
--- a/x/imagegen/cli.go
+++ b/x/imagegen/cli.go
@@ -123,11 +123,6 @@ func RegisterFlags(cmd *cobra.Command) {
 // Returns true if it handled the request, false if the caller should continue with normal flow.
 // Supports flags: --width, --height, --steps, --seed, --negative
 func RunCLI(cmd *cobra.Command, name string, prompt string, interactive bool, keepAlive *api.Duration) error {
-	// Verify it's a valid image gen model
-	if ResolveModelName(name) == "" {
-		return fmt.Errorf("unknown image generation model: %s", name)
-	}
-
 	// Get options from flags (with env var defaults)
 	opts := DefaultOptions()
 	if cmd != nil && cmd.Flags() != nil {
@@ -511,10 +506,7 @@ func displayImageInTerminal(imagePath string) bool {
 		// Send in chunks for large images
 		const chunkSize = 4096
 		for i := 0; i < len(encoded); i += chunkSize {
-			end := i + chunkSize
-			if end > len(encoded) {
-				end = len(encoded)
-			}
+			end := min(i+chunkSize, len(encoded))
 			chunk := encoded[i:end]

 			if i == 0 {
--- a/x/imagegen/server.go
+++ b/x/imagegen/server.go
@@ -14,7 +14,9 @@ import (
 	"os"
 	"os/exec"
 	"path/filepath"
+	"runtime"
 	"strconv"
+	"strings"
 	"sync"
 	"time"

@@ -70,7 +72,7 @@ func NewServer(modelName string) (*Server, error) {
 		port = rand.Intn(65535-49152) + 49152
 	}

-	// Get the ollama executable path
+	// Get the ollama-mlx executable path (in same directory as current executable)
 	exe, err := os.Executable()
 	if err != nil {
 		return nil, fmt.Errorf("unable to lookup executable path: %w", err)
@@ -78,11 +80,42 @@ func NewServer(modelName string) (*Server, error) {
 	if eval, err := filepath.EvalSymlinks(exe); err == nil {
 		exe = eval
 	}
+	mlxExe := filepath.Join(filepath.Dir(exe), "ollama-mlx")

-	// Spawn subprocess: ollama runner --image-engine --model <path> --port <port>
-	cmd := exec.Command(exe, "runner", "--image-engine", "--model", modelName, "--port", strconv.Itoa(port))
+	// Spawn subprocess: ollama-mlx runner --image-engine --model <path> --port <port>
+	cmd := exec.Command(mlxExe, "runner", "--image-engine", "--model", modelName, "--port", strconv.Itoa(port))
 	cmd.Env = os.Environ()

+	// On Linux, set LD_LIBRARY_PATH to include MLX library directories
+	if runtime.GOOS == "linux" {
+		// Build library paths: start with LibOllamaPath, then add any mlx_* subdirectories
+		libraryPaths := []string{ml.LibOllamaPath}
+		if mlxDirs, err := filepath.Glob(filepath.Join(ml.LibOllamaPath, "mlx_*")); err == nil {
+			libraryPaths = append(libraryPaths, mlxDirs...)
+		}
+
+		// Append existing LD_LIBRARY_PATH if set
+		if existingPath, ok := os.LookupEnv("LD_LIBRARY_PATH"); ok {
+			libraryPaths = append(libraryPaths, filepath.SplitList(existingPath)...)
+		}
+
+		pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))
+
+		// Update or add LD_LIBRARY_PATH in cmd.Env
+		found := false
+		for i := range cmd.Env {
+			if strings.HasPrefix(cmd.Env[i], "LD_LIBRARY_PATH=") {
+				cmd.Env[i] = "LD_LIBRARY_PATH=" + pathEnvVal
+				found = true
+				break
+			}
+		}
+		if !found {
+			cmd.Env = append(cmd.Env, "LD_LIBRARY_PATH="+pathEnvVal)
+		}
+		slog.Debug("mlx subprocess library path", "LD_LIBRARY_PATH", pathEnvVal)
+	}
+
 	s := &Server{
 		cmd:       cmd,
 		port:      port,
@@ -113,7 +146,7 @@ func NewServer(modelName string) (*Server, error) {
 		}
 	}()

-	slog.Info("starting image runner subprocess", "model", modelName, "port", port)
+	slog.Info("starting ollama-mlx image runner subprocess", "exe", mlxExe, "model", modelName, "port", port)
 	if err := cmd.Start(); err != nil {
 		return nil, fmt.Errorf("failed to start image runner: %w", err)
 	}
Author	SHA1	Message	Date
Parth Sareen	6b2abfb433	server: add tests and fix isHuggingFaceURL edge case - Add comprehensive tests for isHuggingFaceURL and getNumDownloadParts - Fix bug where domains ending in huggingface.co (like nothuggingface.co) would incorrectly match as HuggingFace URLs - Improve code comments with more detailed documentation	2026-01-18 16:45:17 -08:00
Parth Sareen	805ed4644c	server: reduce download concurrency for HuggingFace URLs Reduces concurrent download parts from 16 to 4 for HuggingFace URLs to avoid triggering rate limits (HTTP 429 errors). Adds OLLAMA_HF_CONCURRENCY environment variable for users who want to customize the concurrency level. Fixes #13297	2026-01-18 16:38:49 -08:00
Daniel Hiltgen	e4b488a7b5	CI: dedup cuda libraries to reduce payload size (#13704 )	2026-01-13 11:25:31 -08:00
Daniel Hiltgen	98079ddd79	ci: add missing mlx components to release build (#13702 )	2026-01-13 09:13:09 -08:00
Jeffrey Morgan	d70942f47b	x/imagegen/cli: skip local model check (#13699 )	2026-01-12 22:38:10 -08:00
Jeffrey Morgan	58e4701557	scripts: increase notarization timeout to 20m (#13697 ) The 100MB mlx.metallib file significantly increased the app bundle size, causing Apple's notarization service to timeout with the previous 10m limit.	2026-01-12 20:38:38 -08:00
Jeffrey Morgan	dbf47ee55a	cmake: use CMAKE_SYSTEM_PROCESSOR instead of CMAKE_OSX_ARCHITECTURES for mlx.metallib install (#13696 ) The CMake condition for installing mlx.metallib checks CMAKE_OSX_ARCHITECTURES, but this variable is only set when explicitly passed - not auto-detected. The arm64 build was missing this flag, causing the metallib to not be installed, which then caused codesign to fail on the unexpanded glob pattern.	2026-01-12 20:05:11 -08:00
Jeffrey Morgan	af7ea6e96e	x/imagegen: install mlx.metallib and fix macOS rpath handling, add mlx library directories to LD_LIBRARY_PATH (#13695 ) - Install mlx.metallib for arm64 builds (required for Metal GPU acceleration) - Apply rpath settings to all macOS builds, not just x86_64 - Add CMAKE_BUILD_WITH_INSTALL_RPATH to avoid install_name_tool errors - Update build_darwin.sh to copy, sign, and package the metallib	2026-01-12 19:03:11 -08:00
Jeffrey Morgan	8f1e0140e7	x/imagegen: fix mlx build in Dockerfile and macOS build script (#13693 )	2026-01-12 15:52:43 -08:00
Parth Sareen	35c3c9e3c2	anthropic: allow non-thinking models when using Anthropic API (#13692 )	2026-01-12 15:13:26 -08:00