Compare commits

..

2 Commits

Author SHA1 Message Date
Parth Sareen
6b2abfb433 server: add tests and fix isHuggingFaceURL edge case
- Add comprehensive tests for isHuggingFaceURL and getNumDownloadParts
- Fix bug where domains ending in huggingface.co (like nothuggingface.co)
  would incorrectly match as HuggingFace URLs
- Improve code comments with more detailed documentation
2026-01-18 16:45:17 -08:00
Parth Sareen
805ed4644c server: reduce download concurrency for HuggingFace URLs
Reduces concurrent download parts from 16 to 4 for HuggingFace URLs
to avoid triggering rate limits (HTTP 429 errors).

Adds OLLAMA_HF_CONCURRENCY environment variable for users who want
to customize the concurrency level.

Fixes #13297
2026-01-18 16:38:49 -08:00
7 changed files with 255 additions and 25 deletions

View File

@@ -190,7 +190,7 @@ if(MLX_ENGINE)
install(TARGETS mlx mlxc
RUNTIME_DEPENDENCIES
DIRECTORIES ${CUDAToolkit_BIN_DIR} ${CUDAToolkit_BIN_DIR}/x64 ${CUDAToolkit_LIBRARY_DIR}
PRE_INCLUDE_REGEXES cublas cublasLt cudart nvrtc nvrtc-builtins cudnn nccl openblas gfortran
PRE_INCLUDE_REGEXES cublas cublasLt cudart nvrtc cudnn nccl
PRE_EXCLUDE_REGEXES ".*"
RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT MLX
LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT MLX

View File

@@ -110,7 +110,7 @@ More Ollama [Python example](https://github.com/ollama/ollama-python/blob/main/e
import { Ollama } from "ollama";
const client = new Ollama();
const results = await client.webSearch("what is ollama?");
const results = await client.webSearch({ query: "what is ollama?" });
console.log(JSON.stringify(results, null, 2));
```
@@ -213,7 +213,7 @@ models](https://ollama.com/models)\n\nAvailable for macOS, Windows, and Linux',
import { Ollama } from "ollama";
const client = new Ollama();
const fetchResult = await client.webFetch("https://ollama.com");
const fetchResult = await client.webFetch({ url: "https://ollama.com" });
console.log(JSON.stringify(fetchResult, null, 2));
```

View File

@@ -22,7 +22,7 @@ Please refer to the [GPU docs](./gpu).
## How can I specify the context window size?
By default, Ollama uses a context window size of 4096 tokens.
By default, Ollama uses a context window size of 2048 tokens.
This can be overridden with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context window to 8K, use:

View File

@@ -1,5 +1,5 @@
---
title: Linux
title: "Linux"
---
## Install
@@ -13,15 +13,14 @@ curl -fsSL https://ollama.com/install.sh | sh
## Manual install
<Note>
If you are upgrading from a prior version, you should remove the old libraries
with `sudo rm -rf /usr/lib/ollama` first.
If you are upgrading from a prior version, you should remove the old libraries with `sudo rm -rf /usr/lib/ollama` first.
</Note>
Download and extract the package:
```shell
curl -fsSL https://ollama.com/download/ollama-linux-amd64.tar.zst \
| sudo tar x -C /usr
curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz \
| sudo tar zx -C /usr
```
Start Ollama:
@@ -41,8 +40,8 @@ ollama -v
If you have an AMD GPU, also download and extract the additional ROCm package:
```shell
curl -fsSL https://ollama.com/download/ollama-linux-amd64-rocm.tar.zst \
| sudo tar x -C /usr
curl -fsSL https://ollama.com/download/ollama-linux-amd64-rocm.tgz \
| sudo tar zx -C /usr
```
### ARM64 install
@@ -50,8 +49,8 @@ curl -fsSL https://ollama.com/download/ollama-linux-amd64-rocm.tar.zst \
Download and extract the ARM64-specific package:
```shell
curl -fsSL https://ollama.com/download/ollama-linux-arm64.tar.zst \
| sudo tar x -C /usr
curl -fsSL https://ollama.com/download/ollama-linux-arm64.tgz \
| sudo tar zx -C /usr
```
### Adding Ollama as a startup service (recommended)
@@ -113,11 +112,7 @@ sudo systemctl status ollama
```
<Note>
While AMD has contributed the `amdgpu` driver upstream to the official linux
kernel source, the version is older and may not support all ROCm features. We
recommend you install the latest driver from
https://www.amd.com/en/support/linux-drivers for best support of your Radeon
GPU.
While AMD has contributed the `amdgpu` driver upstream to the official linux kernel source, the version is older and may not support all ROCm features. We recommend you install the latest driver from https://www.amd.com/en/support/linux-drivers for best support of your Radeon GPU.
</Note>
## Customizing
@@ -146,8 +141,8 @@ curl -fsSL https://ollama.com/install.sh | sh
Or by re-downloading Ollama:
```shell
curl -fsSL https://ollama.com/download/ollama-linux-amd64.tar.zst \
| sudo tar x -C /usr
curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz \
| sudo tar zx -C /usr
```
## Installing specific versions
@@ -196,4 +191,4 @@ Remove the downloaded models and Ollama service user and group:
sudo userdel ollama
sudo groupdel ollama
sudo rm -r /usr/share/ollama
```
```

View File

@@ -179,7 +179,7 @@ _build_macapp() {
fi
rm -f dist/Ollama-darwin.zip
ditto -c -k --norsrc --keepParent dist/Ollama.app dist/Ollama-darwin.zip
ditto -c -k --keepParent dist/Ollama.app dist/Ollama-darwin.zip
(cd dist/Ollama.app/Contents/Resources/; tar -cf - ollama ollama-mlx *.so *.dylib *.metallib 2>/dev/null) | gzip -9vc > dist/ollama-darwin.tgz
# Notarize and Staple
@@ -187,7 +187,7 @@ _build_macapp() {
$(xcrun -f notarytool) submit dist/Ollama-darwin.zip --wait --timeout 20m --apple-id "$APPLE_ID" --password "$APPLE_PASSWORD" --team-id "$APPLE_TEAM_ID"
rm -f dist/Ollama-darwin.zip
$(xcrun -f stapler) staple dist/Ollama.app
ditto -c -k --norsrc --keepParent dist/Ollama.app dist/Ollama-darwin.zip
ditto -c -k --keepParent dist/Ollama.app dist/Ollama-darwin.zip
rm -f dist/Ollama.dmg

View File

@@ -95,11 +95,48 @@ func (p *blobDownloadPart) UnmarshalJSON(b []byte) error {
}
const (
numDownloadParts = 16
// numDownloadParts is the default number of concurrent download parts for standard downloads
numDownloadParts = 16
// numHFDownloadParts is the reduced number of concurrent download parts for HuggingFace
// downloads to avoid triggering rate limits (HTTP 429 errors). See GitHub issue #13297.
numHFDownloadParts = 4
minDownloadPartSize int64 = 100 * format.MegaByte
maxDownloadPartSize int64 = 1000 * format.MegaByte
)
// isHuggingFaceURL returns true if the URL is from a HuggingFace domain.
// This includes:
// - huggingface.co (main domain)
// - *.huggingface.co (subdomains like cdn-lfs.huggingface.co)
// - hf.co (shortlink domain)
// - *.hf.co (CDN domains like cdn-lfs.hf.co, cdn-lfs3.hf.co)
func isHuggingFaceURL(u *url.URL) bool {
if u == nil {
return false
}
host := strings.ToLower(u.Hostname())
return host == "huggingface.co" ||
strings.HasSuffix(host, ".huggingface.co") ||
host == "hf.co" ||
strings.HasSuffix(host, ".hf.co")
}
// getNumDownloadParts picks the download-part concurrency for the given URL.
// HuggingFace hosts default to the reduced numHFDownloadParts to avoid
// triggering their rate limits; a positive integer in the
// OLLAMA_HF_CONCURRENCY environment variable overrides that default. Every
// other host (including a nil URL) gets the standard numDownloadParts.
func getNumDownloadParts(u *url.URL) int {
	if !isHuggingFaceURL(u) {
		return numDownloadParts
	}
	// User override: accepted only when it parses as an integer > 0;
	// empty, malformed, zero, or negative values keep the safe default.
	if raw := os.Getenv("OLLAMA_HF_CONCURRENCY"); raw != "" {
		if n, err := strconv.Atoi(raw); err == nil && n > 0 {
			return n
		}
	}
	return numHFDownloadParts
}
func (p *blobDownloadPart) Name() string {
return strings.Join([]string{
p.blobDownload.Name, "partial", strconv.Itoa(p.N),
@@ -271,7 +308,11 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *regis
}
g, inner := errgroup.WithContext(ctx)
g.SetLimit(numDownloadParts)
concurrency := getNumDownloadParts(directURL)
if concurrency != numDownloadParts {
slog.Info(fmt.Sprintf("using reduced concurrency (%d) for HuggingFace download", concurrency))
}
g.SetLimit(concurrency)
for i := range b.Parts {
part := b.Parts[i]
if part.Completed.Load() == part.Size {

194
server/download_test.go Normal file
View File

@@ -0,0 +1,194 @@
package server
import (
"net/url"
"testing"
"github.com/stretchr/testify/assert"
)
func TestIsHuggingFaceURL(t *testing.T) {
tests := []struct {
name string
url string
expected bool
}{
{
name: "nil url",
url: "",
expected: false,
},
{
name: "huggingface.co main domain",
url: "https://huggingface.co/some/model",
expected: true,
},
{
name: "cdn-lfs.huggingface.co subdomain",
url: "https://cdn-lfs.huggingface.co/repos/abc/123",
expected: true,
},
{
name: "cdn-lfs3.hf.co CDN domain",
url: "https://cdn-lfs3.hf.co/repos/abc/123",
expected: true,
},
{
name: "hf.co shortlink domain",
url: "https://hf.co/model",
expected: true,
},
{
name: "uppercase HuggingFace domain",
url: "https://HUGGINGFACE.CO/model",
expected: true,
},
{
name: "mixed case HF domain",
url: "https://Cdn-Lfs.HF.Co/repos",
expected: true,
},
{
name: "ollama registry",
url: "https://registry.ollama.ai/v2/library/llama3",
expected: false,
},
{
name: "github.com",
url: "https://github.com/ollama/ollama",
expected: false,
},
{
name: "fake huggingface domain",
url: "https://nothuggingface.co/model",
expected: false,
},
{
name: "fake hf domain",
url: "https://nothf.co/model",
expected: false,
},
{
name: "huggingface in path not host",
url: "https://example.com/huggingface.co/model",
expected: false,
},
}
for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
var u *url.URL
if tc.url != "" {
var err error
u, err = url.Parse(tc.url)
if err != nil {
t.Fatalf("failed to parse URL: %v", err)
}
}
got := isHuggingFaceURL(u)
assert.Equal(t, tc.expected, got)
})
}
}
// TestGetNumDownloadParts verifies concurrency selection for downloads:
// the standard part count for non-HuggingFace URLs (and nil), the reduced
// default for HuggingFace URLs, the OLLAMA_HF_CONCURRENCY override, and
// rejection of invalid (non-numeric, zero, negative) override values.
func TestGetNumDownloadParts(t *testing.T) {
	tests := []struct {
		name        string
		url         string
		envValue    string
		expected    int
		description string
	}{
		{
			name:        "nil url returns default",
			url:         "",
			envValue:    "",
			expected:    numDownloadParts,
			description: "nil URL should return standard concurrency",
		},
		{
			name:        "ollama registry returns default",
			url:         "https://registry.ollama.ai/v2/library/llama3",
			envValue:    "",
			expected:    numDownloadParts,
			description: "Ollama registry should use standard concurrency",
		},
		{
			name:        "huggingface returns reduced default",
			url:         "https://huggingface.co/model/repo",
			envValue:    "",
			expected:    numHFDownloadParts,
			description: "HuggingFace should use reduced concurrency",
		},
		{
			name:        "hf.co CDN returns reduced default",
			url:         "https://cdn-lfs3.hf.co/repos/abc/123",
			envValue:    "",
			expected:    numHFDownloadParts,
			description: "HuggingFace CDN should use reduced concurrency",
		},
		{
			name:        "huggingface with env override",
			url:         "https://huggingface.co/model/repo",
			envValue:    "2",
			expected:    2,
			description: "OLLAMA_HF_CONCURRENCY should override default",
		},
		{
			name:        "huggingface with higher env override",
			url:         "https://huggingface.co/model/repo",
			envValue:    "8",
			expected:    8,
			description: "OLLAMA_HF_CONCURRENCY can be set higher than default",
		},
		{
			name:        "huggingface with invalid env (non-numeric)",
			url:         "https://huggingface.co/model/repo",
			envValue:    "invalid",
			expected:    numHFDownloadParts,
			description: "Invalid OLLAMA_HF_CONCURRENCY should fall back to default",
		},
		{
			name:        "huggingface with invalid env (zero)",
			url:         "https://huggingface.co/model/repo",
			envValue:    "0",
			expected:    numHFDownloadParts,
			description: "Zero OLLAMA_HF_CONCURRENCY should fall back to default",
		},
		{
			name:        "huggingface with invalid env (negative)",
			url:         "https://huggingface.co/model/repo",
			envValue:    "-1",
			expected:    numHFDownloadParts,
			description: "Negative OLLAMA_HF_CONCURRENCY should fall back to default",
		},
		{
			name:        "non-huggingface ignores env",
			url:         "https://registry.ollama.ai/v2/library/llama3",
			envValue:    "2",
			expected:    numDownloadParts,
			description: "OLLAMA_HF_CONCURRENCY should not affect non-HF URLs",
		},
	}
	for _, tc := range tests {
		t.Run(tc.name, func(t *testing.T) {
			// Always pin the variable — even to "" — so a value leaking in
			// from the ambient test environment cannot skew cases that expect
			// no override. getNumDownloadParts treats "" the same as unset,
			// and t.Setenv restores the previous value after the subtest.
			t.Setenv("OLLAMA_HF_CONCURRENCY", tc.envValue)
			var u *url.URL
			if tc.url != "" {
				var err error
				u, err = url.Parse(tc.url)
				if err != nil {
					t.Fatalf("failed to parse URL: %v", err)
				}
			}
			got := getNumDownloadParts(u)
			assert.Equal(t, tc.expected, got, tc.description)
		})
	}
}