Compare commits


2 Commits

Author SHA1 Message Date
Parth Sareen
6b2abfb433 server: add tests and fix isHuggingFaceURL edge case
- Add comprehensive tests for isHuggingFaceURL and getNumDownloadParts
- Fix bug where domains ending in huggingface.co (like nothuggingface.co)
  would incorrectly match as HuggingFace URLs
- Improve code comments with more detailed documentation
2026-01-18 16:45:17 -08:00
Parth Sareen
805ed4644c server: reduce download concurrency for HuggingFace URLs
Reduces concurrent download parts from 16 to 4 for HuggingFace URLs
to avoid triggering rate limits (HTTP 429 errors).

Adds OLLAMA_HF_CONCURRENCY environment variable for users who want
to customize the concurrency level.

Fixes #13297
2026-01-18 16:38:49 -08:00
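For example, a user who still hits 429s (or wants faster downloads) can override the new default at pull time; the repository path below is a placeholder:

```shell
# Override the reduced HuggingFace download concurrency (defaults to 4)
OLLAMA_HF_CONCURRENCY=8 ollama pull hf.co/<user>/<repo>
```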
26 changed files with 381 additions and 255 deletions

View File

@@ -190,7 +190,7 @@ if(MLX_ENGINE)
install(TARGETS mlx mlxc
RUNTIME_DEPENDENCIES
DIRECTORIES ${CUDAToolkit_BIN_DIR} ${CUDAToolkit_BIN_DIR}/x64 ${CUDAToolkit_LIBRARY_DIR}
PRE_INCLUDE_REGEXES cublas cublasLt cudart nvrtc nvrtc-builtins cudnn nccl openblas gfortran
PRE_INCLUDE_REGEXES cublas cublasLt cudart nvrtc cudnn nccl
PRE_EXCLUDE_REGEXES ".*"
RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT MLX
LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT MLX

View File

@@ -48,7 +48,7 @@ ollama run gemma3
## Model library
Ollama supports a list of models available on [ollama.com/library](https://ollama.com/library "ollama model library")
Ollama supports a list of models available on [ollama.com/library](https://ollama.com/library 'ollama model library')
Here are some example models that can be downloaded:
@@ -79,7 +79,7 @@ Here are some example models that can be downloaded:
| Code Llama | 7B | 3.8GB | `ollama run codellama` |
| Llama 2 Uncensored | 7B | 3.8GB | `ollama run llama2-uncensored` |
| LLaVA | 7B | 4.5GB | `ollama run llava` |
| Granite-3.3 | 8B | 4.9GB | `ollama run granite3.3` |
| Granite-3.3 | 8B | 4.9GB | `ollama run granite3.3` |
> [!NOTE]
> You should have at least 8 GB of RAM available to run the 7B models, 16 GB to run the 13B models, and 32 GB to run the 33B models.
@@ -260,38 +260,6 @@ Finally, in a separate shell, run a model:
./ollama run llama3.2
```
## Building with MLX (experimental)
First build the MLX libraries:
```shell
cmake --preset MLX
cmake --build --preset MLX --parallel
cmake --install build --component MLX
```
Next, build the `ollama-mlx` binary, which is a separate build of the Ollama runtime with MLX support enabled (needs to be in the same directory as `ollama`):
```shell
go build -tags mlx -o ollama-mlx .
```
Finally, start the server:
```
./ollama serve
```
### Building MLX with CUDA
When building with CUDA, use the preset "MLX CUDA 13" or "MLX CUDA 12" to enable CUDA with default architectures:
```shell
cmake --preset 'MLX CUDA 13'
cmake --build --preset 'MLX CUDA 13' --parallel
cmake --install build --component MLX
```
## REST API
Ollama has a REST API for running and managing models.
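For instance, a minimal generate request looks roughly like this (the model name is only an example):

```shell
curl http://localhost:11434/api/generate -d '{
  "model": "llama3.2",
  "prompt": "Why is the sky blue?"
}'
```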
@@ -322,7 +290,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
### Web & Desktop
- [Onyx](https://github.com/onyx-dot-app/onyx)
- [Open WebUI](https://github.com/open-webui/open-webui)
- [SwiftChat (macOS with ReactNative)](https://github.com/aws-samples/swift-chat)
- [Enchanted (macOS native)](https://github.com/AugustDev/enchanted)
@@ -454,7 +421,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [AppFlowy](https://github.com/AppFlowy-IO/AppFlowy) (AI collaborative workspace with Ollama, cross-platform and self-hostable)
- [Lumina](https://github.com/cushydigit/lumina.git) (A lightweight, minimal React.js frontend for interacting with Ollama servers)
- [Tiny Notepad](https://pypi.org/project/tiny-notepad) (A lightweight, notepad-like interface to chat with ollama available on PyPI)
- [macLlama (macOS native)](https://github.com/hellotunamayo/macLlama) (A native macOS GUI application for interacting with Ollama models, featuring a chat interface.)
- [macLlama (macOS native)](https://github.com/hellotunamayo/macLlama) (A native macOS GUI application for interacting with Ollama models, featuring a chat interface.)
- [GPTranslate](https://github.com/philberndt/GPTranslate) (A fast and lightweight, AI powered desktop translation application written with Rust and Tauri. Features real-time translation with OpenAI/Azure/Ollama.)
- [ollama launcher](https://github.com/NGC13009/ollama-launcher) (A launcher for Ollama, aiming to provide users with convenient functions such as ollama server launching, management, or configuration.)
- [ai-hub](https://github.com/Aj-Seven/ai-hub) (AI Hub supports multiple models via API keys and Chat support via Ollama API.)
@@ -526,7 +493,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
### Database
- [pgai](https://github.com/timescale/pgai) - PostgreSQL as a vector database (Create and search embeddings from Ollama models using pgvector)
- [Get started guide](https://github.com/timescale/pgai/blob/main/docs/vectorizer-quick-start.md)
- [Get started guide](https://github.com/timescale/pgai/blob/main/docs/vectorizer-quick-start.md)
- [MindsDB](https://github.com/mindsdb/mindsdb/blob/staging/mindsdb/integrations/handlers/ollama_handler/README.md) (Connects Ollama models with nearly 200 data platforms and apps)
- [chromem-go](https://github.com/philippgille/chromem-go/blob/v0.5.0/embed_ollama.go) with [example](https://github.com/philippgille/chromem-go/tree/v0.5.0/examples/rag-wikipedia-ollama)
- [Kangaroo](https://github.com/dbkangaroo/kangaroo) (AI-powered SQL client and admin tool for popular databases)
@@ -669,7 +636,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [llama.cpp](https://github.com/ggml-org/llama.cpp) project founded by Georgi Gerganov.
### Observability
- [Opik](https://www.comet.com/docs/opik/cookbook/ollama) is an open-source platform to debug, evaluate, and monitor your LLM applications, RAG systems, and agentic workflows with comprehensive tracing, automated evaluations, and production-ready dashboards. Opik supports native integration to Ollama.
- [Lunary](https://lunary.ai/docs/integrations/ollama) is the leading open-source LLM observability platform. It provides a variety of enterprise-grade features such as real-time analytics, prompt templates management, PII masking, and comprehensive agent tracing.
- [OpenLIT](https://github.com/openlit/openlit) is an OpenTelemetry-native tool for monitoring Ollama Applications & GPUs using traces and metrics.
@@ -678,5 +644,4 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [MLflow Tracing](https://mlflow.org/docs/latest/llms/tracing/index.html#automatic-tracing) is an open source LLM observability tool with a convenient API to log and visualize traces, making it easy to debug and evaluate GenAI applications.
### Security
- [Ollama Fortress](https://github.com/ParisNeo/ollama_proxy_server)

View File

@@ -116,7 +116,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
Prompt: ">>> ",
AltPrompt: "... ",
Placeholder: "Send a message (/? for help)",
AltPlaceholder: "Press Enter to send",
AltPlaceholder: `Use """ to end multi-line input`,
})
if err != nil {
return err

View File

@@ -21,7 +21,6 @@ ollama pull glm-4.7:cloud
To use Ollama with tools that expect the Anthropic API (like Claude Code), set these environment variables:
```shell
export ANTHROPIC_AUTH_TOKEN=ollama # required but ignored
export ANTHROPIC_BASE_URL=http://localhost:11434
export ANTHROPIC_API_KEY=ollama # required but ignored
```
@@ -248,13 +247,12 @@ curl -X POST http://localhost:11434/v1/messages \
[Claude Code](https://code.claude.com/docs/en/overview) can be configured to use Ollama as its backend:
```shell
ANTHROPIC_AUTH_TOKEN=ollama ANTHROPIC_BASE_URL=http://localhost:11434 ANTHROPIC_API_KEY=ollama claude --model qwen3-coder
ANTHROPIC_BASE_URL=http://localhost:11434 ANTHROPIC_API_KEY=ollama claude --model qwen3-coder
```
Or set the environment variables in your shell profile:
```shell
export ANTHROPIC_AUTH_TOKEN=ollama
export ANTHROPIC_BASE_URL=http://localhost:11434
export ANTHROPIC_API_KEY=ollama
```

View File

@@ -110,7 +110,7 @@ More Ollama [Python example](https://github.com/ollama/ollama-python/blob/main/e
import { Ollama } from "ollama";
const client = new Ollama();
const results = await client.webSearch("what is ollama?");
const results = await client.webSearch({ query: "what is ollama?" });
console.log(JSON.stringify(results, null, 2));
```
@@ -213,7 +213,7 @@ models](https://ollama.com/models)\n\nAvailable for macOS, Windows, and Linux',
import { Ollama } from "ollama";
const client = new Ollama();
const fetchResult = await client.webFetch("https://ollama.com");
const fetchResult = await client.webFetch({ url: "https://ollama.com" });
console.log(JSON.stringify(fetchResult, null, 2));
```

View File

@@ -111,9 +111,7 @@
"/integrations/zed",
"/integrations/roo-code",
"/integrations/n8n",
"/integrations/xcode",
"/integrations/onyx",
"/integrations/marimo"
"/integrations/xcode"
]
},
{

View File

@@ -22,7 +22,7 @@ Please refer to the [GPU docs](./gpu).
## How can I specify the context window size?
By default, Ollama uses a context window size of 4096 tokens.
By default, Ollama uses a context window size of 2048 tokens.
This can be overridden with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context window to 8K, use:
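The override would look something like this when starting the server:

```shell
OLLAMA_CONTEXT_LENGTH=8192 ollama serve
```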

Nine binary image files removed (80–306 KiB each): the marimo and Onyx screenshots referenced by the deleted integration docs below. Binary contents not shown.

View File

@@ -25,7 +25,6 @@ Claude Code connects to Ollama using the Anthropic-compatible API.
1. Set the environment variables:
```shell
export ANTHROPIC_AUTH_TOKEN=ollama
export ANTHROPIC_BASE_URL=http://localhost:11434
export ANTHROPIC_API_KEY=ollama
```
@@ -39,7 +38,7 @@ claude --model qwen3-coder
Or run with environment variables inline:
```shell
ANTHROPIC_AUTH_TOKEN=ollama ANTHROPIC_BASE_URL=http://localhost:11434 ANTHROPIC_API_KEY=ollama claude --model qwen3-coder
ANTHROPIC_BASE_URL=http://localhost:11434 ANTHROPIC_API_KEY=ollama claude --model qwen3-coder
```
## Connecting to ollama.com

View File

@@ -1,73 +0,0 @@
---
title: marimo
---
## Install
Install [marimo](https://marimo.io). You can use `pip` or `uv` for this. You
can also use `uv` to create a sandboxed environment for marimo by running:
```
uvx marimo edit --sandbox notebook.py
```
## Usage with Ollama
1. In marimo, go to the user settings and go to the AI tab. From here
you can find and configure Ollama as an AI provider. For local use you
would typically point the base url to `http://localhost:11434/v1`.
<div style={{ display: 'flex', justifyContent: 'center' }}>
<img
src="/images/marimo-settings.png"
alt="Ollama settings in marimo"
width="50%"
/>
</div>
2. Once the AI provider is set up, you can turn on/off specific AI models you'd like to access.
<div style={{ display: 'flex', justifyContent: 'center' }}>
<img
src="/images/marimo-models.png"
alt="Selecting an Ollama model"
width="50%"
/>
</div>
3. You can also add a model to the list of available models by scrolling to the bottom and using the UI there.
<div style={{ display: 'flex', justifyContent: 'center' }}>
<img
src="/images/marimo-add-model.png"
alt="Adding a new Ollama model"
width="50%"
/>
</div>
4. Once configured, you can now use Ollama for AI chats in marimo.
<div style={{ display: 'flex', justifyContent: 'center' }}>
<img
src="/images/marimo-chat.png"
alt="Configure code completion"
width="50%"
/>
</div>
4. Alternatively, you can now use Ollama for **inline code completion** in marimo. This can be configured in the "AI Features" tab.
<div style={{ display: 'flex', justifyContent: 'center' }}>
<img
src="/images/marimo-code-completion.png"
alt="Configure code completion"
width="50%"
/>
</div>
## Connecting to ollama.com
1. Sign in to ollama cloud via `ollama signin`
2. In the ollama model settings add a model that ollama hosts, like `gpt-oss:120b`.
3. You can now refer to this model in marimo!

View File

@@ -1,63 +0,0 @@
---
title: Onyx
---
## Overview
[Onyx](http://onyx.app/) is a self-hostable Chat UI that integrates with all Ollama models. Features include:
- Creating custom Agents
- Web search
- Deep Research
- RAG over uploaded documents and connected apps
- Connectors to applications like Google Drive, Email, Slack, etc.
- MCP and OpenAPI Actions support
- Image generation
- User/Groups management, RBAC, SSO, etc.
Onyx can be deployed for single users or large organizations.
## Install Onyx
Deploy Onyx with the [quickstart guide](https://docs.onyx.app/deployment/getting_started/quickstart).
<Info>
Resourcing/scaling docs [here](https://docs.onyx.app/deployment/getting_started/resourcing).
</Info>
## Usage with Ollama
1. Login to your Onyx deployment (create an account first).
<div style={{ display: 'flex', justifyContent: 'center' }}>
<img
src="/images/onyx-login.png"
alt="Onyx Login Page"
width="75%"
/>
</div>
2. In the set-up process select `Ollama` as the LLM provider.
<div style={{ display: 'flex', justifyContent: 'center' }}>
<img
src="/images/onyx-ollama-llm.png"
alt="Onyx Set Up Form"
width="75%"
/>
</div>
3. Provide your **Ollama API URL** and select your models.
<Note>If you're running Onyx in Docker, to access your computer's local network use `http://host.docker.internal` instead of `http://127.0.0.1`.</Note>
<div style={{ display: 'flex', justifyContent: 'center' }}>
<img
src="/images/onyx-ollama-form.png"
alt="Selecting Ollama Models"
width="75%"
/>
</div>
You can also easily connect up Onyx Cloud with the `Ollama Cloud` tab of the setup.
## Send your first query
<div style={{ display: 'flex', justifyContent: 'center' }}>
<img
src="/images/onyx-query.png"
alt="Onyx Query Example"
width="75%"
/>
</div>

View File

@@ -1,5 +1,5 @@
---
title: Linux
title: "Linux"
---
## Install
@@ -13,15 +13,14 @@ curl -fsSL https://ollama.com/install.sh | sh
## Manual install
<Note>
If you are upgrading from a prior version, you should remove the old libraries
with `sudo rm -rf /usr/lib/ollama` first.
If you are upgrading from a prior version, you should remove the old libraries with `sudo rm -rf /usr/lib/ollama` first.
</Note>
Download and extract the package:
```shell
curl -fsSL https://ollama.com/download/ollama-linux-amd64.tar.zst \
| sudo tar x -C /usr
curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz \
| sudo tar zx -C /usr
```
Start Ollama:
@@ -41,8 +40,8 @@ ollama -v
If you have an AMD GPU, also download and extract the additional ROCm package:
```shell
curl -fsSL https://ollama.com/download/ollama-linux-amd64-rocm.tar.zst \
| sudo tar x -C /usr
curl -fsSL https://ollama.com/download/ollama-linux-amd64-rocm.tgz \
| sudo tar zx -C /usr
```
### ARM64 install
@@ -50,8 +49,8 @@ curl -fsSL https://ollama.com/download/ollama-linux-amd64-rocm.tar.zst \
Download and extract the ARM64-specific package:
```shell
curl -fsSL https://ollama.com/download/ollama-linux-arm64.tar.zst \
| sudo tar x -C /usr
curl -fsSL https://ollama.com/download/ollama-linux-arm64.tgz \
| sudo tar zx -C /usr
```
### Adding Ollama as a startup service (recommended)
@@ -113,11 +112,7 @@ sudo systemctl status ollama
```
<Note>
While AMD has contributed the `amdgpu` driver upstream to the official linux
kernel source, the version is older and may not support all ROCm features. We
recommend you install the latest driver from
https://www.amd.com/en/support/linux-drivers for best support of your Radeon
GPU.
While AMD has contributed the `amdgpu` driver upstream to the official linux kernel source, the version is older and may not support all ROCm features. We recommend you install the latest driver from https://www.amd.com/en/support/linux-drivers for best support of your Radeon GPU.
</Note>
## Customizing
@@ -146,8 +141,8 @@ curl -fsSL https://ollama.com/install.sh | sh
Or by re-downloading Ollama:
```shell
curl -fsSL https://ollama.com/download/ollama-linux-amd64.tar.zst \
| sudo tar x -C /usr
curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz \
| sudo tar zx -C /usr
```
## Installing specific versions
@@ -196,4 +191,4 @@ Remove the downloaded models and Ollama service user and group:
sudo userdel ollama
sudo groupdel ollama
sudo rm -r /usr/share/ollama
```
```

View File

@@ -5,7 +5,6 @@ import (
"fmt"
"io"
"os"
"strings"
)
type Prompt struct {
@@ -37,11 +36,10 @@ type Terminal struct {
}
type Instance struct {
Prompt *Prompt
Terminal *Terminal
History *History
Pasting bool
pastedLines []string
Prompt *Prompt
Terminal *Terminal
History *History
Pasting bool
}
func New(prompt Prompt) (*Instance, error) {
@@ -176,8 +174,6 @@ func (i *Instance) Readline() (string, error) {
case CharEsc:
esc = true
case CharInterrupt:
i.pastedLines = nil
i.Prompt.UseAlt = false
return "", ErrInterrupt
case CharPrev:
i.historyPrev(buf, &currentLineBuf)
@@ -192,23 +188,7 @@ func (i *Instance) Readline() (string, error) {
case CharForward:
buf.MoveRight()
case CharBackspace, CharCtrlH:
if buf.IsEmpty() && len(i.pastedLines) > 0 {
lastIdx := len(i.pastedLines) - 1
prevLine := i.pastedLines[lastIdx]
i.pastedLines = i.pastedLines[:lastIdx]
fmt.Print(CursorBOL + ClearToEOL + CursorUp + CursorBOL + ClearToEOL)
if len(i.pastedLines) == 0 {
fmt.Print(i.Prompt.Prompt)
i.Prompt.UseAlt = false
} else {
fmt.Print(i.Prompt.AltPrompt)
}
for _, r := range prevLine {
buf.Add(r)
}
} else {
buf.Remove()
}
buf.Remove()
case CharTab:
// todo: convert back to real tabs
for range 8 {
@@ -231,28 +211,13 @@ func (i *Instance) Readline() (string, error) {
case CharCtrlZ:
fd := os.Stdin.Fd()
return handleCharCtrlZ(fd, i.Terminal.termios)
case CharCtrlJ:
i.pastedLines = append(i.pastedLines, buf.String())
buf.Buf.Clear()
buf.Pos = 0
buf.DisplayPos = 0
buf.LineHasSpace.Clear()
fmt.Println()
fmt.Print(i.Prompt.AltPrompt)
i.Prompt.UseAlt = true
continue
case CharEnter:
case CharEnter, CharCtrlJ:
output := buf.String()
if len(i.pastedLines) > 0 {
output = strings.Join(i.pastedLines, "\n") + "\n" + output
i.pastedLines = nil
}
if output != "" {
i.History.Add(output)
}
buf.MoveToEnd()
fmt.Println()
i.Prompt.UseAlt = false
return output, nil
default:

View File

@@ -179,7 +179,7 @@ _build_macapp() {
fi
rm -f dist/Ollama-darwin.zip
ditto -c -k --norsrc --keepParent dist/Ollama.app dist/Ollama-darwin.zip
ditto -c -k --keepParent dist/Ollama.app dist/Ollama-darwin.zip
(cd dist/Ollama.app/Contents/Resources/; tar -cf - ollama ollama-mlx *.so *.dylib *.metallib 2>/dev/null) | gzip -9vc > dist/ollama-darwin.tgz
# Notarize and Staple
@@ -187,7 +187,7 @@ _build_macapp() {
$(xcrun -f notarytool) submit dist/Ollama-darwin.zip --wait --timeout 20m --apple-id "$APPLE_ID" --password "$APPLE_PASSWORD" --team-id "$APPLE_TEAM_ID"
rm -f dist/Ollama-darwin.zip
$(xcrun -f stapler) staple dist/Ollama.app
ditto -c -k --norsrc --keepParent dist/Ollama.app dist/Ollama-darwin.zip
ditto -c -k --keepParent dist/Ollama.app dist/Ollama-darwin.zip
rm -f dist/Ollama.dmg

View File

@@ -95,11 +95,48 @@ func (p *blobDownloadPart) UnmarshalJSON(b []byte) error {
}
const (
numDownloadParts = 16
// numDownloadParts is the default number of concurrent download parts for standard downloads
numDownloadParts = 16
// numHFDownloadParts is the reduced number of concurrent download parts for HuggingFace
// downloads to avoid triggering rate limits (HTTP 429 errors). See GitHub issue #13297.
numHFDownloadParts = 4
minDownloadPartSize int64 = 100 * format.MegaByte
maxDownloadPartSize int64 = 1000 * format.MegaByte
)
// isHuggingFaceURL returns true if the URL is from a HuggingFace domain.
// This includes:
// - huggingface.co (main domain)
// - *.huggingface.co (subdomains like cdn-lfs.huggingface.co)
// - hf.co (shortlink domain)
// - *.hf.co (CDN domains like cdn-lfs.hf.co, cdn-lfs3.hf.co)
func isHuggingFaceURL(u *url.URL) bool {
if u == nil {
return false
}
host := strings.ToLower(u.Hostname())
return host == "huggingface.co" ||
strings.HasSuffix(host, ".huggingface.co") ||
host == "hf.co" ||
strings.HasSuffix(host, ".hf.co")
}
// getNumDownloadParts returns the number of concurrent download parts to use
// for the given URL. HuggingFace URLs use reduced concurrency (default 4) to
// avoid triggering rate limits. This can be overridden via the OLLAMA_HF_CONCURRENCY
// environment variable. For non-HuggingFace URLs, returns the standard concurrency (16).
func getNumDownloadParts(u *url.URL) int {
if isHuggingFaceURL(u) {
if v := os.Getenv("OLLAMA_HF_CONCURRENCY"); v != "" {
if n, err := strconv.Atoi(v); err == nil && n > 0 {
return n
}
}
return numHFDownloadParts
}
return numDownloadParts
}
func (p *blobDownloadPart) Name() string {
return strings.Join([]string{
p.blobDownload.Name, "partial", strconv.Itoa(p.N),
@@ -271,7 +308,11 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *regis
}
g, inner := errgroup.WithContext(ctx)
g.SetLimit(numDownloadParts)
concurrency := getNumDownloadParts(directURL)
if concurrency != numDownloadParts {
slog.Info(fmt.Sprintf("using reduced concurrency (%d) for HuggingFace download", concurrency))
}
g.SetLimit(concurrency)
for i := range b.Parts {
part := b.Parts[i]
if part.Completed.Load() == part.Size {

server/download_test.go (new file, 194 lines)
View File

@@ -0,0 +1,194 @@
package server
import (
"net/url"
"testing"
"github.com/stretchr/testify/assert"
)
func TestIsHuggingFaceURL(t *testing.T) {
tests := []struct {
name string
url string
expected bool
}{
{
name: "nil url",
url: "",
expected: false,
},
{
name: "huggingface.co main domain",
url: "https://huggingface.co/some/model",
expected: true,
},
{
name: "cdn-lfs.huggingface.co subdomain",
url: "https://cdn-lfs.huggingface.co/repos/abc/123",
expected: true,
},
{
name: "cdn-lfs3.hf.co CDN domain",
url: "https://cdn-lfs3.hf.co/repos/abc/123",
expected: true,
},
{
name: "hf.co shortlink domain",
url: "https://hf.co/model",
expected: true,
},
{
name: "uppercase HuggingFace domain",
url: "https://HUGGINGFACE.CO/model",
expected: true,
},
{
name: "mixed case HF domain",
url: "https://Cdn-Lfs.HF.Co/repos",
expected: true,
},
{
name: "ollama registry",
url: "https://registry.ollama.ai/v2/library/llama3",
expected: false,
},
{
name: "github.com",
url: "https://github.com/ollama/ollama",
expected: false,
},
{
name: "fake huggingface domain",
url: "https://nothuggingface.co/model",
expected: false,
},
{
name: "fake hf domain",
url: "https://nothf.co/model",
expected: false,
},
{
name: "huggingface in path not host",
url: "https://example.com/huggingface.co/model",
expected: false,
},
}
for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
var u *url.URL
if tc.url != "" {
var err error
u, err = url.Parse(tc.url)
if err != nil {
t.Fatalf("failed to parse URL: %v", err)
}
}
got := isHuggingFaceURL(u)
assert.Equal(t, tc.expected, got)
})
}
}
func TestGetNumDownloadParts(t *testing.T) {
tests := []struct {
name string
url string
envValue string
expected int
description string
}{
{
name: "nil url returns default",
url: "",
envValue: "",
expected: numDownloadParts,
description: "nil URL should return standard concurrency",
},
{
name: "ollama registry returns default",
url: "https://registry.ollama.ai/v2/library/llama3",
envValue: "",
expected: numDownloadParts,
description: "Ollama registry should use standard concurrency",
},
{
name: "huggingface returns reduced default",
url: "https://huggingface.co/model/repo",
envValue: "",
expected: numHFDownloadParts,
description: "HuggingFace should use reduced concurrency",
},
{
name: "hf.co CDN returns reduced default",
url: "https://cdn-lfs3.hf.co/repos/abc/123",
envValue: "",
expected: numHFDownloadParts,
description: "HuggingFace CDN should use reduced concurrency",
},
{
name: "huggingface with env override",
url: "https://huggingface.co/model/repo",
envValue: "2",
expected: 2,
description: "OLLAMA_HF_CONCURRENCY should override default",
},
{
name: "huggingface with higher env override",
url: "https://huggingface.co/model/repo",
envValue: "8",
expected: 8,
description: "OLLAMA_HF_CONCURRENCY can be set higher than default",
},
{
name: "huggingface with invalid env (non-numeric)",
url: "https://huggingface.co/model/repo",
envValue: "invalid",
expected: numHFDownloadParts,
description: "Invalid OLLAMA_HF_CONCURRENCY should fall back to default",
},
{
name: "huggingface with invalid env (zero)",
url: "https://huggingface.co/model/repo",
envValue: "0",
expected: numHFDownloadParts,
description: "Zero OLLAMA_HF_CONCURRENCY should fall back to default",
},
{
name: "huggingface with invalid env (negative)",
url: "https://huggingface.co/model/repo",
envValue: "-1",
expected: numHFDownloadParts,
description: "Negative OLLAMA_HF_CONCURRENCY should fall back to default",
},
{
name: "non-huggingface ignores env",
url: "https://registry.ollama.ai/v2/library/llama3",
envValue: "2",
expected: numDownloadParts,
description: "OLLAMA_HF_CONCURRENCY should not affect non-HF URLs",
},
}
for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
// Set or clear the environment variable
if tc.envValue != "" {
t.Setenv("OLLAMA_HF_CONCURRENCY", tc.envValue)
}
var u *url.URL
if tc.url != "" {
var err error
u, err = url.Parse(tc.url)
if err != nil {
t.Fatalf("failed to parse URL: %v", err)
}
}
got := getNumDownloadParts(u)
assert.Equal(t, tc.expected, got, tc.description)
})
}
}

x/README.md (new file, 50 lines)
View File

@@ -0,0 +1,50 @@
# Experimental Features
## MLX Backend
We're working on a new experimental backend based on the [MLX project](https://github.com/ml-explore/mlx).
Support is currently limited to macOS and to Linux with CUDA GPUs. We're looking to add support for Windows CUDA soon, as well as other GPU vendors.
### Building ollama-mlx
The `ollama-mlx` binary is a separate build of Ollama with MLX support enabled. This enables experimental features like image generation.
#### macOS (Apple Silicon and Intel)
```bash
# Build MLX backend libraries
cmake --preset MLX
cmake --build --preset MLX --parallel
cmake --install build --component MLX
# Build ollama-mlx binary
go build -tags mlx -o ollama-mlx .
```
#### Linux (CUDA)
On Linux, use the preset "MLX CUDA 13" or "MLX CUDA 12" to enable CUDA with the default Ollama NVIDIA GPU architectures enabled:
```bash
# Build MLX backend libraries with CUDA support
cmake --preset 'MLX CUDA 13'
cmake --build --preset 'MLX CUDA 13' --parallel
cmake --install build --component MLX
# Build ollama-mlx binary
CGO_CFLAGS="-O3 -I$(pwd)/build/_deps/mlx-c-src" \
CGO_LDFLAGS="-L$(pwd)/build/lib/ollama -lmlxc -lmlx" \
go build -tags mlx -o ollama-mlx .
```
#### Using build scripts
The build scripts automatically create the `ollama-mlx` binary:
- **macOS**: `./scripts/build_darwin.sh` produces `dist/darwin/ollama-mlx`
- **Linux**: `./scripts/build_linux.sh` produces `ollama-mlx` in the output archives
## Image Generation
Image generation is built into the `ollama-mlx` binary. Run `ollama-mlx serve` to start the server with image generation support enabled.

View File

@@ -25,6 +25,14 @@ import (
"github.com/ollama/ollama/x/tools"
)
// MultilineState tracks the state of multiline input
type MultilineState int
const (
MultilineNone MultilineState = iota
MultilineSystem
)
// Tool output capping constants
const (
// localModelTokenLimit is the token limit for local models (smaller context).
@@ -648,7 +656,7 @@ func GenerateInteractive(cmd *cobra.Command, modelName string, wordWrap bool, op
Prompt: ">>> ",
AltPrompt: "... ",
Placeholder: "Send a message (/? for help)",
AltPlaceholder: "Press Enter to send",
AltPlaceholder: `Use """ to end multi-line input`,
})
if err != nil {
return err
@@ -699,6 +707,7 @@ func GenerateInteractive(cmd *cobra.Command, modelName string, wordWrap bool, op
var sb strings.Builder
var format string
var system string
var multiline MultilineState = MultilineNone
for {
line, err := scanner.Readline()
@@ -712,12 +721,37 @@ func GenerateInteractive(cmd *cobra.Command, modelName string, wordWrap bool, op
}
scanner.Prompt.UseAlt = false
sb.Reset()
multiline = MultilineNone
continue
case err != nil:
return err
}
switch {
case multiline != MultilineNone:
// check if there's a multiline terminating string
before, ok := strings.CutSuffix(line, `"""`)
sb.WriteString(before)
if !ok {
fmt.Fprintln(&sb)
continue
}
switch multiline {
case MultilineSystem:
system = sb.String()
newMessage := api.Message{Role: "system", Content: system}
if len(messages) > 0 && messages[len(messages)-1].Role == "system" {
messages[len(messages)-1] = newMessage
} else {
messages = append(messages, newMessage)
}
fmt.Println("Set system message.")
sb.Reset()
}
multiline = MultilineNone
scanner.Prompt.UseAlt = false
case strings.HasPrefix(line, "/exit"), strings.HasPrefix(line, "/bye"):
return nil
case strings.HasPrefix(line, "/clear"):
@@ -826,18 +860,41 @@ func GenerateInteractive(cmd *cobra.Command, modelName string, wordWrap bool, op
options[args[2]] = fp[args[2]]
case "system":
if len(args) < 3 {
fmt.Println("Usage: /set system <message>")
fmt.Println("Usage: /set system <message> or /set system \"\"\"<multi-line message>\"\"\"")
continue
}
system = strings.Join(args[2:], " ")
newMessage := api.Message{Role: "system", Content: system}
multiline = MultilineSystem
line := strings.Join(args[2:], " ")
line, ok := strings.CutPrefix(line, `"""`)
if !ok {
multiline = MultilineNone
} else {
// only cut suffix if the line is multiline
line, ok = strings.CutSuffix(line, `"""`)
if ok {
multiline = MultilineNone
}
}
sb.WriteString(line)
if multiline != MultilineNone {
scanner.Prompt.UseAlt = true
continue
}
system = sb.String()
newMessage := api.Message{Role: "system", Content: sb.String()}
// Check if the slice is not empty and the last message is from 'system'
if len(messages) > 0 && messages[len(messages)-1].Role == "system" {
// Replace the last message
messages[len(messages)-1] = newMessage
} else {
messages = append(messages, newMessage)
}
fmt.Println("Set system message.")
sb.Reset()
continue
default:
fmt.Printf("Unknown command '/set %s'. Type /? for help\n", args[1])
@@ -1024,7 +1081,7 @@ func GenerateInteractive(cmd *cobra.Command, modelName string, wordWrap bool, op
sb.WriteString(line)
}
if sb.Len() > 0 {
if sb.Len() > 0 && multiline == MultilineNone {
newMessage := api.Message{Role: "user", Content: sb.String()}
messages = append(messages, newMessage)