num parallel embed

2025-12-31 19:50:04 -05:00 · 2024-07-26 15:18:35 -07:00
76 changed files with 733 additions and 1055 deletions
--- a/README.md
+++ b/README.md
@@ -35,10 +35,10 @@ The official [Ollama Docker image](https://hub.docker.com/r/ollama/ollama) `olla

 ## Quickstart

-To run and chat with [Llama 3.1](https://ollama.com/library/llama3.1):
+To run and chat with [Llama 3](https://ollama.com/library/llama3):

 ```
-ollama run llama3.1
+ollama run llama3
 ```

 ## Model library
@@ -49,9 +49,8 @@ Here are some example models that can be downloaded:

 | Model              | Parameters | Size  | Download                       |
 | ------------------ | ---------- | ----- | ------------------------------ |
-| Llama 3.1          | 8B         | 4.7GB | `ollama run llama3.1`          |
-| Llama 3.1          | 70B        | 40GB  | `ollama run llama3.1:70b`      |
-| Llama 3.1          | 405B       | 231GB | `ollama run llama3.1:405b`     |
+| Llama 3            | 8B         | 4.7GB | `ollama run llama3`            |
+| Llama 3            | 70B        | 40GB  | `ollama run llama3:70b`        |
 | Phi 3 Mini         | 3.8B       | 2.3GB | `ollama run phi3`              |
 | Phi 3 Medium       | 14B        | 7.9GB | `ollama run phi3:medium`       |
 | Gemma 2            | 9B         | 5.5GB | `ollama run gemma2`            |
@@ -98,16 +97,16 @@ See the [guide](docs/import.md) on importing models for more information.

 ### Customize a prompt

-Models from the Ollama library can be customized with a prompt. For example, to customize the `llama3.1` model:
+Models from the Ollama library can be customized with a prompt. For example, to customize the `llama3` model:

 ```
-ollama pull llama3.1
+ollama pull llama3
 ```

 Create a `Modelfile`:

 ```
-FROM llama3.1
+FROM llama3

 # set the temperature to 1 [higher is more creative, lower is more coherent]
 PARAMETER temperature 1
@@ -142,7 +141,7 @@ ollama create mymodel -f ./Modelfile
 ### Pull a model

 ```
-ollama pull llama3.1
+ollama pull llama3
 ```

 > This command can also be used to update a local model. Only the diff will be pulled.
@@ -150,13 +149,13 @@ ollama pull llama3.1
 ### Remove a model

 ```
-ollama rm llama3.1
+ollama rm llama3
 ```

 ### Copy a model

 ```
-ollama cp llama3.1 my-model
+ollama cp llama3 my-model
 ```

 ### Multiline input
@@ -173,21 +172,21 @@ I'm a basic program that prints the famous "Hello, world!" message to the consol
 ### Multimodal models

 ```
-ollama run llava "What's in this image? /Users/jmorgan/Desktop/smile.png"
+>>> What's in this image? /Users/jmorgan/Desktop/smile.png
 The image features a yellow smiley face, which is likely the central focus of the picture.
 ```

 ### Pass the prompt as an argument

 ```
-$ ollama run llama3.1 "Summarize this file: $(cat README.md)"
+$ ollama run llama3 "Summarize this file: $(cat README.md)"
 Ollama is a lightweight, extensible framework for building and running language models on the local machine. It provides a simple API for creating, running, and managing models, as well as a library of pre-built models that can be easily used in a variety of applications.
 ```

 ### Show model information

 ```
-ollama show llama3.1
+ollama show llama3
 ```

 ### List models on your computer
@@ -215,7 +214,7 @@ Next, start the server:
 Finally, in a separate shell, run a model:

 ```
-./ollama run llama3.1
+./ollama run llama3
 ```

 ## REST API
@@ -226,7 +225,7 @@ Ollama has a REST API for running and managing models.

 ```
 curl http://localhost:11434/api/generate -d '{
-  "model": "llama3.1",
+  "model": "llama3",
  "prompt":"Why is the sky blue?"
 }'
 ```
@@ -235,7 +234,7 @@ curl http://localhost:11434/api/generate -d '{

 ```
 curl http://localhost:11434/api/chat -d '{
-  "model": "llama3.1",
+  "model": "llama3",
  "messages": [
    { "role": "user", "content": "why is the sky blue?" }
  ]
@@ -299,7 +298,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [AI Studio](https://github.com/MindWorkAI/AI-Studio)
 - [Sidellama](https://github.com/gyopak/sidellama) (browser-based LLM client)
 - [LLMStack](https://github.com/trypromptly/LLMStack) (No-code multi-agent framework to build LLM agents and workflows)
-- [BoltAI for Mac](https://boltai.com) (AI Chat Client for Mac)

 ### Terminal

@@ -338,7 +336,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
 ### Libraries

 - [LangChain](https://python.langchain.com/docs/integrations/llms/ollama) and [LangChain.js](https://js.langchain.com/docs/modules/model_io/models/llms/integrations/ollama) with [example](https://js.langchain.com/docs/use_cases/question_answering/local_retrieval_qa)
-- [Firebase Genkit](https://firebase.google.com/docs/genkit/plugins/ollama)
 - [LangChainGo](https://github.com/tmc/langchaingo/) with [example](https://github.com/tmc/langchaingo/tree/main/examples/ollama-completion-example)
 - [LangChain4j](https://github.com/langchain4j/langchain4j) with [example](https://github.com/langchain4j/langchain4j-examples/tree/main/ollama-examples/src/main/java)
 - [LangChainRust](https://github.com/Abraxas-365/langchain-rust) with [example](https://github.com/Abraxas-365/langchain-rust/blob/main/examples/llm_ollama.rs)
@@ -392,7 +389,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Llama Coder](https://github.com/ex3ndr/llama-coder) (Copilot alternative using Ollama)
 - [Ollama Copilot](https://github.com/bernardo-bruning/ollama-copilot) (Proxy that allows you to use ollama as a copilot like Github copilot)
 - [twinny](https://github.com/rjmacarthy/twinny) (Copilot and Copilot chat alternative using Ollama)
-- [Wingman-AI](https://github.com/RussellCanfield/wingman-ai) (Copilot code and chat alternative using Ollama and Hugging Face)
+- [Wingman-AI](https://github.com/RussellCanfield/wingman-ai) (Copilot code and chat alternative using Ollama and HuggingFace)
 - [Page Assist](https://github.com/n4ze3m/page-assist) (Chrome Extension)
 - [AI Telegram Bot](https://github.com/tusharhero/aitelegrambot) (Telegram bot using Ollama in backend)
 - [AI ST Completion](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (Sublime Text 4 AI assistant plugin with Ollama support)
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -1,25 +0,0 @@
-# Security
-
-The Ollama maintainer team takes security seriously and will actively work to resolve security issues.
-
-## Reporting a vulnerability
-
-If you discover a security vulnerability, please do not open a public issue. Instead, please report it by emailing hello@ollama.com. We ask that you give us sufficient time to investigate and address the vulnerability before disclosing it publicly.
-
-Please include the following details in your report:
-- A description of the vulnerability
-- Steps to reproduce the issue
-- Your assessment of the potential impact
-- Any possible mitigations
-
-## Security best practices
-
-While the maintainer team does their best to secure Ollama, users are encouraged to implement their own security best practices, such as:
-
-- Regularly updating to the latest version of Ollama
-- Securing access to hosted instances of Ollama
-- Monitoring systems for unusual activity
-
-## Contact
-
-For any other questions or concerns related to security, please contact us at hello@ollama.com
--- a/api/client.go
+++ b/api/client.go
@@ -20,6 +20,7 @@ import (
 	"encoding/json"
 	"fmt"
 	"io"
+	"net"
 	"net/http"
 	"net/url"
 	"runtime"
@@ -62,8 +63,13 @@ func checkError(resp *http.Response, body []byte) error {
 // If the variable is not specified, a default ollama host and port will be
 // used.
 func ClientFromEnvironment() (*Client, error) {
+	ollamaHost := envconfig.Host
+
 	return &Client{
-		base: envconfig.Host(),
+		base: &url.URL{
+			Scheme: ollamaHost.Scheme,
+			Host:   net.JoinHostPort(ollamaHost.Host, ollamaHost.Port),
+		},
 		http: http.DefaultClient,
 	}, nil
 }
--- a/api/client_test.go
+++ b/api/client_test.go
@@ -2,6 +2,8 @@ package api

 import (
 	"testing"
+
+	"github.com/ollama/ollama/envconfig"
 )

 func TestClientFromEnvironment(t *testing.T) {
@@ -31,6 +33,7 @@ func TestClientFromEnvironment(t *testing.T) {
 	for k, v := range testCases {
 		t.Run(k, func(t *testing.T) {
 			t.Setenv("OLLAMA_HOST", v.value)
+			envconfig.LoadConfig()

 			client, err := ClientFromEnvironment()
 			if err != v.err {
--- a/api/types.go
+++ b/api/types.go
@@ -114,11 +114,6 @@ func (t Tools) String() string {
 	return string(bts)
 }

-func (t Tool) String() string {
-	bts, _ := json.Marshal(t)
-	return string(bts)
-}
-
 // Message is a single message in a chat sequence. The message contains the
 // role ("system", "user", or "assistant"), the content and an optional list
 // of images.
@@ -214,7 +209,6 @@ type Options struct {
 	NumPredict       int      `json:"num_predict,omitempty"`
 	TopK             int      `json:"top_k,omitempty"`
 	TopP             float32  `json:"top_p,omitempty"`
-	MinP             float32  `json:"min_p,omitempty"`
 	TFSZ             float32  `json:"tfs_z,omitempty"`
 	TypicalP         float32  `json:"typical_p,omitempty"`
 	RepeatLastN      int      `json:"repeat_last_n,omitempty"`
@@ -267,10 +261,6 @@ type EmbedRequest struct {
 type EmbedResponse struct {
 	Model      string      `json:"model"`
 	Embeddings [][]float32 `json:"embeddings"`
-
-	TotalDuration   time.Duration `json:"total_duration,omitempty"`
-	LoadDuration    time.Duration `json:"load_duration,omitempty"`
-	PromptEvalCount int           `json:"prompt_eval_count,omitempty"`
 }

 // EmbeddingRequest is the request passed to [Client.Embeddings].
--- a/app/lifecycle/logging.go
+++ b/app/lifecycle/logging.go
@@ -14,7 +14,7 @@ import (
 func InitLogging() {
 	level := slog.LevelInfo

-	if envconfig.Debug() {
+	if envconfig.Debug {
 		level = slog.LevelDebug
 	}

--- a/app/ollama.iss
+++ b/app/ollama.iss
@@ -138,7 +138,7 @@ SetupAppRunningError=Another Ollama installer is running.%n%nPlease cancel or fi


 ;FinishedHeadingLabel=Run your first model
-;FinishedLabel=%nRun this command in a PowerShell or cmd terminal.%n%n%n    ollama run llama3.1
+;FinishedLabel=%nRun this command in a PowerShell or cmd terminal.%n%n%n    ollama run llama3
 ;ClickFinish=%n

 [Registry]
--- a/app/ollama_welcome.ps1
+++ b/app/ollama_welcome.ps1
@@ -4,5 +4,5 @@ write-host "Welcome to Ollama!"
 write-host ""
 write-host "Run your first model:"
 write-host ""
-write-host "`tollama run llama3.1"
+write-host "`tollama run llama3"
 write-host ""
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -362,24 +362,9 @@ func RunHandler(cmd *cobra.Command, args []string) error {

 	opts.MultiModal = slices.Contains(info.Details.Families, "clip")
 	opts.ParentModel = info.Details.ParentModel
+	opts.Messages = append(opts.Messages, info.Messages...)

 	if interactive {
-		if err := loadModel(cmd, &opts); err != nil {
-			return err
-		}
-
-		for _, msg := range info.Messages {
-			switch msg.Role {
-			case "user":
-				fmt.Printf(">>> %s\n", msg.Content)
-			case "assistant":
-				state := &displayResponseState{}
-				displayResponse(msg.Content, opts.WordWrap, state)
-				fmt.Println()
-				fmt.Println()
-			}
-		}
-
 		return generateInteractive(cmd, opts)
 	}
 	return generate(cmd, opts)
@@ -1091,7 +1076,7 @@ func RunServer(cmd *cobra.Command, _ []string) error {
 		return err
 	}

-	ln, err := net.Listen("tcp", envconfig.Host().Host)
+	ln, err := net.Listen("tcp", net.JoinHostPort(envconfig.Host.Host, envconfig.Host.Port))
 	if err != nil {
 		return err
 	}
@@ -1356,7 +1341,6 @@ func NewCLI() *cobra.Command {
 				envVars["OLLAMA_NUM_PARALLEL"],
 				envVars["OLLAMA_NOPRUNE"],
 				envVars["OLLAMA_ORIGINS"],
-				envVars["OLLAMA_SCHED_SPREAD"],
 				envVars["OLLAMA_TMPDIR"],
 				envVars["OLLAMA_FLASH_ATTENTION"],
 				envVars["OLLAMA_LLM_LIBRARY"],
--- a/cmd/interactive.go
+++ b/cmd/interactive.go
@@ -1,7 +1,6 @@
 package cmd

 import (
-	"cmp"
 	"errors"
 	"fmt"
 	"io"
@@ -10,14 +9,13 @@ import (
 	"path/filepath"
 	"regexp"
 	"slices"
+	"sort"
 	"strings"

 	"github.com/spf13/cobra"
-	"golang.org/x/exp/maps"

 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/envconfig"
-	"github.com/ollama/ollama/parser"
 	"github.com/ollama/ollama/progress"
 	"github.com/ollama/ollama/readline"
 	"github.com/ollama/ollama/types/errtypes"
@@ -48,10 +46,29 @@ func loadModel(cmd *cobra.Command, opts *runOptions) error {
 		KeepAlive: opts.KeepAlive,
 	}

-	return client.Chat(cmd.Context(), chatReq, func(api.ChatResponse) error { return nil })
+	return client.Chat(cmd.Context(), chatReq, func(resp api.ChatResponse) error {
+		p.StopAndClear()
+		for _, msg := range opts.Messages {
+			switch msg.Role {
+			case "user":
+				fmt.Printf(">>> %s\n", msg.Content)
+			case "assistant":
+				state := &displayResponseState{}
+				displayResponse(msg.Content, opts.WordWrap, state)
+				fmt.Println()
+				fmt.Println()
+			}
+		}
+		return nil
+	})
 }

 func generateInteractive(cmd *cobra.Command, opts runOptions) error {
+	err := loadModel(cmd, &opts)
+	if err != nil {
+		return err
+	}
+
 	usage := func() {
 		fmt.Fprintln(os.Stderr, "Available Commands:")
 		fmt.Fprintln(os.Stderr, "  /set            Set session variables")
@@ -121,7 +138,6 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 		fmt.Fprintln(os.Stderr, "  /set parameter num_predict <int>      Max number of tokens to predict")
 		fmt.Fprintln(os.Stderr, "  /set parameter top_k <int>            Pick from top k num of tokens")
 		fmt.Fprintln(os.Stderr, "  /set parameter top_p <float>          Pick token based on sum of probabilities")
-		fmt.Fprintln(os.Stderr, "  /set parameter min_p <float>          Pick token based on top token probability * min_p")
 		fmt.Fprintln(os.Stderr, "  /set parameter num_ctx <int>          Set the context size")
 		fmt.Fprintln(os.Stderr, "  /set parameter temperature <float>    Set creativity level")
 		fmt.Fprintln(os.Stderr, "  /set parameter repeat_penalty <float> How strongly to penalize repetitions")
@@ -141,7 +157,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 		return err
 	}

-	if envconfig.NoHistory() {
+	if envconfig.NoHistory {
 		scanner.HistoryDisable()
 	}

@@ -359,9 +375,9 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 					return err
 				}
 				req := &api.ShowRequest{
-					Name:    opts.Model,
-					System:  opts.System,
-					Options: opts.Options,
+					Name:     opts.Model,
+					System:   opts.System,
+					Options:  opts.Options,
 				}
 				resp, err := client.Show(cmd.Context(), req)
 				if err != nil {
@@ -490,35 +506,31 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 }

 func buildModelfile(opts runOptions) string {
-	var f parser.File
-	f.Commands = append(f.Commands, parser.Command{Name: "model", Args: cmp.Or(opts.ParentModel, opts.Model)})
-
+	var mf strings.Builder
+	model := opts.ParentModel
+	if model == "" {
+		model = opts.Model
+	}
+	fmt.Fprintf(&mf, "FROM %s\n", model)
 	if opts.System != "" {
-		f.Commands = append(f.Commands, parser.Command{Name: "system", Args: opts.System})
+		fmt.Fprintf(&mf, "SYSTEM \"\"\"%s\"\"\"\n", opts.System)
 	}

-	keys := maps.Keys(opts.Options)
-	slices.Sort(keys)
+	keys := make([]string, 0)
+	for k := range opts.Options {
+		keys = append(keys, k)
+	}
+	sort.Strings(keys)
 	for _, k := range keys {
-		v := opts.Options[k]
-		var cmds []parser.Command
-		switch t := v.(type) {
-		case []string:
-			for _, s := range t {
-				cmds = append(cmds, parser.Command{Name: k, Args: s})
-			}
-		default:
-			cmds = append(cmds, parser.Command{Name: k, Args: fmt.Sprintf("%v", t)})
-		}
-
-		f.Commands = append(f.Commands, cmds...)
+		fmt.Fprintf(&mf, "PARAMETER %s %v\n", k, opts.Options[k])
 	}
+	fmt.Fprintln(&mf)

 	for _, msg := range opts.Messages {
-		f.Commands = append(f.Commands, parser.Command{Name: "message", Args: fmt.Sprintf("%s: %s", msg.Role, msg.Content)})
+		fmt.Fprintf(&mf, "MESSAGE %s \"\"\"%s\"\"\"\n", msg.Role, msg.Content)
 	}

-	return f.String()
+	return mf.String()
 }

 func normalizeFilePath(fp string) string {
--- a/cmd/interactive_test.go
+++ b/cmd/interactive_test.go
@@ -1,10 +1,12 @@
 package cmd

 import (
+	"bytes"
 	"testing"
+	"text/template"

-	"github.com/google/go-cmp/cmp"
 	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"

 	"github.com/ollama/ollama/api"
 )
@@ -55,53 +57,58 @@ d:\path with\spaces\seven.svg inbetween7 c:\users\jdoe\eight.png inbetween8

 func TestModelfileBuilder(t *testing.T) {
 	opts := runOptions{
-		Model:  "hork",
-		System: "You are part horse and part shark, but all hork. Do horklike things",
+		Model:    "hork",
+		System:   "You are part horse and part shark, but all hork. Do horklike things",
 		Messages: []api.Message{
 			{Role: "user", Content: "Hey there hork!"},
 			{Role: "assistant", Content: "Yes it is true, I am half horse, half shark."},
 		},
-		Options: map[string]any{
-			"temperature":      0.9,
-			"seed":             42,
-			"penalize_newline": false,
-			"stop":             []string{"hi", "there"},
-		},
+		Options: map[string]interface{}{},
 	}

-	t.Run("model", func(t *testing.T) {
-		expect := `FROM hork
-SYSTEM You are part horse and part shark, but all hork. Do horklike things
+	opts.Options["temperature"] = 0.9
+	opts.Options["seed"] = 42
+	opts.Options["penalize_newline"] = false
+	opts.Options["stop"] = []string{"hi", "there"}
+
+	mf := buildModelfile(opts)
+	expectedModelfile := `FROM {{.Model}}
+SYSTEM """{{.System}}"""
 PARAMETER penalize_newline false
 PARAMETER seed 42
-PARAMETER stop hi
-PARAMETER stop there
+PARAMETER stop [hi there]
 PARAMETER temperature 0.9
-MESSAGE user Hey there hork!
-MESSAGE assistant Yes it is true, I am half horse, half shark.
+
+MESSAGE user """Hey there hork!"""
+MESSAGE assistant """Yes it is true, I am half horse, half shark."""
 `

-		actual := buildModelfile(opts)
-		if diff := cmp.Diff(expect, actual); diff != "" {
-			t.Errorf("mismatch (-want +got):\n%s", diff)
-		}
-	})
+	tmpl, err := template.New("").Parse(expectedModelfile)
+	require.NoError(t, err)

-	t.Run("parent model", func(t *testing.T) {
-		opts.ParentModel = "horseshark"
-		expect := `FROM horseshark
-SYSTEM You are part horse and part shark, but all hork. Do horklike things
+	var buf bytes.Buffer
+	err = tmpl.Execute(&buf, opts)
+	require.NoError(t, err)
+	assert.Equal(t, buf.String(), mf)
+
+	opts.ParentModel = "horseshark"
+	mf = buildModelfile(opts)
+	expectedModelfile = `FROM {{.ParentModel}}
+SYSTEM """{{.System}}"""
 PARAMETER penalize_newline false
 PARAMETER seed 42
-PARAMETER stop hi
-PARAMETER stop there
+PARAMETER stop [hi there]
 PARAMETER temperature 0.9
-MESSAGE user Hey there hork!
-MESSAGE assistant Yes it is true, I am half horse, half shark.
+
+MESSAGE user """Hey there hork!"""
+MESSAGE assistant """Yes it is true, I am half horse, half shark."""
 `
-		actual := buildModelfile(opts)
-		if diff := cmp.Diff(expect, actual); diff != "" {
-			t.Errorf("mismatch (-want +got):\n%s", diff)
-		}
-	})
+
+	tmpl, err = template.New("").Parse(expectedModelfile)
+	require.NoError(t, err)
+
+	var parentBuf bytes.Buffer
+	err = tmpl.Execute(&parentBuf, opts)
+	require.NoError(t, err)
+	assert.Equal(t, parentBuf.String(), mf)
 }
--- a/docs/api.md
+++ b/docs/api.md
@@ -336,7 +336,6 @@ curl http://localhost:11434/api/generate -d '{
    "num_predict": 100,
    "top_k": 20,
    "top_p": 0.9,
-    "min_p": 0.0,
    "tfs_z": 0.5,
    "typical_p": 0.7,
    "repeat_last_n": 33,
@@ -587,7 +586,7 @@ Final response:

 ##### Request

-Send a chat message with images. The images should be provided as an array, with the individual images encoded in Base64.
+Send a chat message with a conversation history.

 ```shell
 curl http://localhost:11434/api/chat -d '{
--- a/docs/docker.md
+++ b/docs/docker.md
@@ -63,7 +63,7 @@ docker run -d --device /dev/kfd --device /dev/dri -v ollama:/root/.ollama -p 114
 Now you can run a model:

 ```
-docker exec -it ollama ollama run llama3.1
+docker exec -it ollama ollama run llama3
 ```

 ### Try different models
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -227,7 +227,7 @@ curl http://localhost:11434/api/chat -d '{"model": "mistral"}'

 To preload a model using the CLI, use the command:
 ```shell
-ollama run llama3.1 ""
+ollama run llama3 ""
 ```

 ## How do I keep a model loaded in memory or make it unload immediately?
@@ -272,8 +272,4 @@ The following server settings may be used to adjust how Ollama handles concurren
 - `OLLAMA_NUM_PARALLEL` - The maximum number of parallel requests each model will process at the same time.  The default will auto-select either 4 or 1 based on available memory.
 - `OLLAMA_MAX_QUEUE` - The maximum number of requests Ollama will queue when busy before rejecting additional requests. The default is 512

-Note: Windows with Radeon GPUs currently default to 1 model maximum due to limitations in ROCm v5.7 for available VRAM reporting.  Once ROCm v6.2 is available, Windows Radeon will follow the defaults above.  You may enable concurrent model loads on Radeon on Windows, but ensure you don't load more models than will fit into your GPUs VRAM.
-
-## How does Ollama load models on multiple GPUs?
-
-Installing multiple GPUs of the same brand can be a great way to increase your available VRAM to load larger models.  When you load a new model, Ollama evaluates the required VRAM for the model against what is currently available.  If the model will entirely fit on any single GPU, Ollama will load the model on that GPU.  This typically provides the best performance as it reduces the amount of data transfering across the PCI bus during inference.  If the model does not fit entirely on one GPU, then it will be spread across all the available GPUs.
+Note: Windows with Radeon GPUs currently default to 1 model maximum due to limitations in ROCm v5.7 for available VRAM reporting.  Once ROCm v6.2 is available, Windows Radeon will follow the defaults above.  You may enable concurrent model loads on Radeon on Windows, but ensure you don't load more models than will fit into your GPU's VRAM.
--- a/docs/modelfile.md
+++ b/docs/modelfile.md
@@ -141,7 +141,6 @@ PARAMETER <parameter> <parametervalue>
 | num_predict    | Maximum number of tokens to predict when generating text. (Default: 128, -1 = infinite generation, -2 = fill context)                                                                                                                                   | int        | num_predict 42       |
 | top_k          | Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)                                                                        | int        | top_k 40             |
 | top_p          | Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)                                                                 | float      | top_p 0.9            |
-| min_p          | Alternative to the top_p, and aims to ensure a balance of quality and variety. The parameter *p* represents the minimum probability for a token to be considered, relative to the probability of the most likely token. For example, with *p*=0.05 and the most likely token having a probability of 0.9, logits with a value less than 0.045 are filtered out. (Default: 0.0) | float      | min_p 0.05            |

 ### TEMPLATE

--- a/docs/tutorials/langchainjs.md
+++ b/docs/tutorials/langchainjs.md
@@ -15,7 +15,7 @@ import { Ollama } from "@langchain/community/llms/ollama";

 const ollama = new Ollama({
  baseUrl: "http://localhost:11434",
-  model: "llama3.1",
+  model: "llama3",
 });

 const answer = await ollama.invoke(`why is the sky blue?`);
@@ -23,7 +23,7 @@ const answer = await ollama.invoke(`why is the sky blue?`);
 console.log(answer);
 ```

-That will get us the same thing as if we ran `ollama run llama3.1 "why is the sky blue"` in the terminal. But we want to load a document from the web to ask a question against. **Cheerio** is a great library for ingesting a webpage, and **LangChain** uses it in their **CheerioWebBaseLoader**. So let's install **Cheerio** and build that part of the app.
+That will get us the same thing as if we ran `ollama run llama3 "why is the sky blue"` in the terminal. But we want to load a document from the web to ask a question against. **Cheerio** is a great library for ingesting a webpage, and **LangChain** uses it in their **CheerioWebBaseLoader**. So let's install **Cheerio** and build that part of the app.

 ```bash
 npm install cheerio
--- a/docs/windows.md
+++ b/docs/windows.md
@@ -23,8 +23,6 @@ Logs will often be helpful in diagnosing the problem (see
 * NVIDIA 452.39 or newer Drivers if you have an NVIDIA card
 * AMD Radeon Driver https://www.amd.com/en/support if you have a Radeon card

-Ollama uses unicode characters for progress indication, which may render as unknown squares in some older terminal fonts in Windows 10. If you see this, try changing your terminal font settings.
-
 ## API Access

 Here's a quick example showing API access from `powershell`
--- a/envconfig/config.go
+++ b/envconfig/config.go
@@ -1,11 +1,11 @@
 package envconfig

 import (
+	"errors"
 	"fmt"
 	"log/slog"
 	"math"
 	"net"
-	"net/url"
 	"os"
 	"path/filepath"
 	"runtime"
@@ -14,16 +14,296 @@ import (
 	"time"
 )

-// Host returns the scheme and host. Host can be configured via the OLLAMA_HOST environment variable.
-// Default is scheme "http" and host "127.0.0.1:11434"
-func Host() *url.URL {
+type OllamaHost struct {
+	Scheme string
+	Host   string
+	Port   string
+}
+
+func (o OllamaHost) String() string {
+	return fmt.Sprintf("%s://%s:%s", o.Scheme, o.Host, o.Port)
+}
+
+var ErrInvalidHostPort = errors.New("invalid port specified in OLLAMA_HOST")
+
+var (
+	// Set via OLLAMA_ORIGINS in the environment
+	AllowOrigins []string
+	// Set via OLLAMA_DEBUG in the environment
+	Debug bool
+	// Experimental flash attention, set via OLLAMA_FLASH_ATTENTION in the environment
+	FlashAttention bool
+	// Set via OLLAMA_HOST in the environment
+	Host *OllamaHost
+	// Set via OLLAMA_KEEP_ALIVE in the environment
+	KeepAlive time.Duration
+	// Set via OLLAMA_LLM_LIBRARY in the environment
+	LLMLibrary string
+	// Set via OLLAMA_MAX_LOADED_MODELS in the environment
+	MaxRunners int
+	// Set via OLLAMA_MAX_QUEUE in the environment
+	MaxQueuedRequests int
+	// Set via OLLAMA_MODELS in the environment
+	ModelsDir string
+	// Set via OLLAMA_NOHISTORY in the environment
+	NoHistory bool
+	// Set via OLLAMA_NOPRUNE in the environment
+	NoPrune bool
+	// Set via OLLAMA_NUM_PARALLEL in the environment
+	NumParallel int
+	// Set via OLLAMA_RUNNERS_DIR in the environment
+	RunnersDir string
+	// Set via OLLAMA_SCHED_SPREAD in the environment
+	SchedSpread bool
+	// Set via OLLAMA_TMPDIR in the environment
+	TmpDir string
+	// Set via OLLAMA_INTEL_GPU in the environment
+	IntelGpu bool
+
+	// Set via CUDA_VISIBLE_DEVICES in the environment
+	CudaVisibleDevices string
+	// Set via HIP_VISIBLE_DEVICES in the environment
+	HipVisibleDevices string
+	// Set via ROCR_VISIBLE_DEVICES in the environment
+	RocrVisibleDevices string
+	// Set via GPU_DEVICE_ORDINAL in the environment
+	GpuDeviceOrdinal string
+	// Set via HSA_OVERRIDE_GFX_VERSION in the environment
+	HsaOverrideGfxVersion string
+)
+
+type EnvVar struct {
+	Name        string
+	Value       any
+	Description string
+}
+
+func AsMap() map[string]EnvVar {
+	ret := map[string]EnvVar{
+		"OLLAMA_DEBUG":             {"OLLAMA_DEBUG", Debug, "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
+		"OLLAMA_FLASH_ATTENTION":   {"OLLAMA_FLASH_ATTENTION", FlashAttention, "Enabled flash attention"},
+		"OLLAMA_HOST":              {"OLLAMA_HOST", Host, "IP Address for the ollama server (default 127.0.0.1:11434)"},
+		"OLLAMA_KEEP_ALIVE":        {"OLLAMA_KEEP_ALIVE", KeepAlive, "The duration that models stay loaded in memory (default \"5m\")"},
+		"OLLAMA_LLM_LIBRARY":       {"OLLAMA_LLM_LIBRARY", LLMLibrary, "Set LLM library to bypass autodetection"},
+		"OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners, "Maximum number of loaded models per GPU"},
+		"OLLAMA_MAX_QUEUE":         {"OLLAMA_MAX_QUEUE", MaxQueuedRequests, "Maximum number of queued requests"},
+		"OLLAMA_MODELS":            {"OLLAMA_MODELS", ModelsDir, "The path to the models directory"},
+		"OLLAMA_NOHISTORY":         {"OLLAMA_NOHISTORY", NoHistory, "Do not preserve readline history"},
+		"OLLAMA_NOPRUNE":           {"OLLAMA_NOPRUNE", NoPrune, "Do not prune model blobs on startup"},
+		"OLLAMA_NUM_PARALLEL":      {"OLLAMA_NUM_PARALLEL", NumParallel, "Maximum number of parallel requests"},
+		"OLLAMA_ORIGINS":           {"OLLAMA_ORIGINS", AllowOrigins, "A comma separated list of allowed origins"},
+		"OLLAMA_RUNNERS_DIR":       {"OLLAMA_RUNNERS_DIR", RunnersDir, "Location for runners"},
+		"OLLAMA_SCHED_SPREAD":      {"OLLAMA_SCHED_SPREAD", SchedSpread, "Always schedule model across all GPUs"},
+		"OLLAMA_TMPDIR":            {"OLLAMA_TMPDIR", TmpDir, "Location for temporary files"},
+	}
+	if runtime.GOOS != "darwin" {
+		ret["CUDA_VISIBLE_DEVICES"] = EnvVar{"CUDA_VISIBLE_DEVICES", CudaVisibleDevices, "Set which NVIDIA devices are visible"}
+		ret["HIP_VISIBLE_DEVICES"] = EnvVar{"HIP_VISIBLE_DEVICES", HipVisibleDevices, "Set which AMD devices are visible"}
+		ret["ROCR_VISIBLE_DEVICES"] = EnvVar{"ROCR_VISIBLE_DEVICES", RocrVisibleDevices, "Set which AMD devices are visible"}
+		ret["GPU_DEVICE_ORDINAL"] = EnvVar{"GPU_DEVICE_ORDINAL", GpuDeviceOrdinal, "Set which AMD devices are visible"}
+		ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", HsaOverrideGfxVersion, "Override the gfx used for all detected AMD GPUs"}
+		ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGpu, "Enable experimental Intel GPU detection"}
+	}
+	return ret
+}
+
+func Values() map[string]string {
+	vals := make(map[string]string)
+	for k, v := range AsMap() {
+		vals[k] = fmt.Sprintf("%v", v.Value)
+	}
+	return vals
+}
+
+var defaultAllowOrigins = []string{
+	"localhost",
+	"127.0.0.1",
+	"0.0.0.0",
+}
+
+// clean returns the value of the environment variable key with surrounding double quotes, single quotes, and spaces trimmed.
+func clean(key string) string {
+	return strings.Trim(os.Getenv(key), "\"' ")
+}
+
+func init() {
+	// default values
+	NumParallel = 0 // Autoselect
+	MaxRunners = 0  // Autoselect
+	MaxQueuedRequests = 512
+	KeepAlive = 5 * time.Minute
+
+	LoadConfig()
+}
+
+func LoadConfig() {
+	if debug := clean("OLLAMA_DEBUG"); debug != "" {
+		d, err := strconv.ParseBool(debug)
+		if err == nil {
+			Debug = d
+		} else {
+			Debug = true
+		}
+	}
+
+	if fa := clean("OLLAMA_FLASH_ATTENTION"); fa != "" {
+		d, err := strconv.ParseBool(fa)
+		if err == nil {
+			FlashAttention = d
+		}
+	}
+
+	RunnersDir = clean("OLLAMA_RUNNERS_DIR")
+	if runtime.GOOS == "windows" && RunnersDir == "" {
+		// On Windows we do not carry the payloads inside the main executable
+		appExe, err := os.Executable()
+		if err != nil {
+			slog.Error("failed to lookup executable path", "error", err)
+		}
+
+		cwd, err := os.Getwd()
+		if err != nil {
+			slog.Error("failed to lookup working directory", "error", err)
+		}
+
+		var paths []string
+		for _, root := range []string{filepath.Dir(appExe), cwd} {
+			paths = append(paths,
+				root,
+				filepath.Join(root, "windows-"+runtime.GOARCH),
+				filepath.Join(root, "dist", "windows-"+runtime.GOARCH),
+			)
+		}
+
+		// Try a few variations to improve developer experience when building from source in the local tree
+		for _, p := range paths {
+			candidate := filepath.Join(p, "ollama_runners")
+			_, err := os.Stat(candidate)
+			if err == nil {
+				RunnersDir = candidate
+				break
+			}
+		}
+		if RunnersDir == "" {
+			slog.Error("unable to locate llm runner directory.  Set OLLAMA_RUNNERS_DIR to the location of 'ollama_runners'")
+		}
+	}
+
+	TmpDir = clean("OLLAMA_TMPDIR")
+
+	LLMLibrary = clean("OLLAMA_LLM_LIBRARY")
+
+	if onp := clean("OLLAMA_NUM_PARALLEL"); onp != "" {
+		val, err := strconv.Atoi(onp)
+		if err != nil {
+			slog.Error("invalid setting, ignoring", "OLLAMA_NUM_PARALLEL", onp, "error", err)
+		} else {
+			NumParallel = val
+		}
+	}
+
+	if nohistory := clean("OLLAMA_NOHISTORY"); nohistory != "" {
+		NoHistory = true
+	}
+
+	if spread := clean("OLLAMA_SCHED_SPREAD"); spread != "" {
+		s, err := strconv.ParseBool(spread)
+		if err == nil {
+			SchedSpread = s
+		} else {
+			SchedSpread = true
+		}
+	}
+
+	if noprune := clean("OLLAMA_NOPRUNE"); noprune != "" {
+		NoPrune = true
+	}
+
+	if origins := clean("OLLAMA_ORIGINS"); origins != "" {
+		AllowOrigins = strings.Split(origins, ",")
+	}
+	for _, allowOrigin := range defaultAllowOrigins {
+		AllowOrigins = append(AllowOrigins,
+			fmt.Sprintf("http://%s", allowOrigin),
+			fmt.Sprintf("https://%s", allowOrigin),
+			fmt.Sprintf("http://%s", net.JoinHostPort(allowOrigin, "*")),
+			fmt.Sprintf("https://%s", net.JoinHostPort(allowOrigin, "*")),
+		)
+	}
+
+	AllowOrigins = append(AllowOrigins,
+		"app://*",
+		"file://*",
+		"tauri://*",
+	)
+
+	maxRunners := clean("OLLAMA_MAX_LOADED_MODELS")
+	if maxRunners != "" {
+		m, err := strconv.Atoi(maxRunners)
+		if err != nil {
+			slog.Error("invalid setting, ignoring", "OLLAMA_MAX_LOADED_MODELS", maxRunners, "error", err)
+		} else {
+			MaxRunners = m
+		}
+	}
+
+	if onp := os.Getenv("OLLAMA_MAX_QUEUE"); onp != "" {
+		p, err := strconv.Atoi(onp)
+		if err != nil || p <= 0 {
+			slog.Error("invalid setting, ignoring", "OLLAMA_MAX_QUEUE", onp, "error", err)
+		} else {
+			MaxQueuedRequests = p
+		}
+	}
+
+	ka := clean("OLLAMA_KEEP_ALIVE")
+	if ka != "" {
+		loadKeepAlive(ka)
+	}
+
+	var err error
+	ModelsDir, err = getModelsDir()
+	if err != nil {
+		slog.Error("invalid setting", "OLLAMA_MODELS", ModelsDir, "error", err)
+	}
+
+	Host, err = getOllamaHost()
+	if err != nil {
+		slog.Error("invalid setting", "OLLAMA_HOST", Host, "error", err, "using default port", Host.Port)
+	}
+
+	if set, err := strconv.ParseBool(clean("OLLAMA_INTEL_GPU")); err == nil {
+		IntelGpu = set
+	}
+
+	CudaVisibleDevices = clean("CUDA_VISIBLE_DEVICES")
+	HipVisibleDevices = clean("HIP_VISIBLE_DEVICES")
+	RocrVisibleDevices = clean("ROCR_VISIBLE_DEVICES")
+	GpuDeviceOrdinal = clean("GPU_DEVICE_ORDINAL")
+	HsaOverrideGfxVersion = clean("HSA_OVERRIDE_GFX_VERSION")
+}
+
+func getModelsDir() (string, error) {
+	if models, exists := os.LookupEnv("OLLAMA_MODELS"); exists {
+		return models, nil
+	}
+	home, err := os.UserHomeDir()
+	if err != nil {
+		return "", err
+	}
+	return filepath.Join(home, ".ollama", "models"), nil
+}
+
+func getOllamaHost() (*OllamaHost, error) {
 	defaultPort := "11434"

-	s := strings.TrimSpace(Var("OLLAMA_HOST"))
-	scheme, hostport, ok := strings.Cut(s, "://")
+	hostVar := os.Getenv("OLLAMA_HOST")
+	hostVar = strings.TrimSpace(strings.Trim(strings.TrimSpace(hostVar), "\"'"))
+
+	scheme, hostport, ok := strings.Cut(hostVar, "://")
 	switch {
 	case !ok:
-		scheme, hostport = "http", s
+		scheme, hostport = "http", hostVar
 	case scheme == "http":
 		defaultPort = "80"
 	case scheme == "https":
@@ -43,242 +323,38 @@ func Host() *url.URL {
 		}
 	}

-	if n, err := strconv.ParseInt(port, 10, 32); err != nil || n > 65535 || n < 0 {
-		slog.Warn("invalid port, using default", "port", port, "default", defaultPort)
-		return &url.URL{
+	if portNum, err := strconv.ParseInt(port, 10, 32); err != nil || portNum > 65535 || portNum < 0 {
+		return &OllamaHost{
 			Scheme: scheme,
-			Host:   net.JoinHostPort(host, defaultPort),
-		}
+			Host:   host,
+			Port:   defaultPort,
+		}, ErrInvalidHostPort
 	}

-	return &url.URL{
+	return &OllamaHost{
 		Scheme: scheme,
-		Host:   net.JoinHostPort(host, port),
-	}
+		Host:   host,
+		Port:   port,
+	}, nil
 }

-// Origins returns a list of allowed origins. Origins can be configured via the OLLAMA_ORIGINS environment variable.
-func Origins() (origins []string) {
-	if s := Var("OLLAMA_ORIGINS"); s != "" {
-		origins = strings.Split(s, ",")
-	}
-
-	for _, origin := range []string{"localhost", "127.0.0.1", "0.0.0.0"} {
-		origins = append(origins,
-			fmt.Sprintf("http://%s", origin),
-			fmt.Sprintf("https://%s", origin),
-			fmt.Sprintf("http://%s", net.JoinHostPort(origin, "*")),
-			fmt.Sprintf("https://%s", net.JoinHostPort(origin, "*")),
-		)
-	}
-
-	origins = append(origins,
-		"app://*",
-		"file://*",
-		"tauri://*",
-	)
-
-	return origins
-}
-
-// Models returns the path to the models directory. Models directory can be configured via the OLLAMA_MODELS environment variable.
-// Default is $HOME/.ollama/models
-func Models() string {
-	if s := Var("OLLAMA_MODELS"); s != "" {
-		return s
-	}
-
-	home, err := os.UserHomeDir()
+func loadKeepAlive(ka string) {
+	v, err := strconv.Atoi(ka)
 	if err != nil {
-		panic(err)
-	}
-
-	return filepath.Join(home, ".ollama", "models")
-}
-
-// KeepAlive returns the duration that models stay loaded in memory. KeepAlive can be configured via the OLLAMA_KEEP_ALIVE environment variable.
-// Negative values are treated as infinite. Zero is treated as no keep alive.
-// Default is 5 minutes.
-func KeepAlive() (keepAlive time.Duration) {
-	keepAlive = 5 * time.Minute
-	if s := Var("OLLAMA_KEEP_ALIVE"); s != "" {
-		if d, err := time.ParseDuration(s); err == nil {
-			keepAlive = d
-		} else if n, err := strconv.ParseInt(s, 10, 64); err == nil {
-			keepAlive = time.Duration(n) * time.Second
-		}
-	}
-
-	if keepAlive < 0 {
-		return time.Duration(math.MaxInt64)
-	}
-
-	return keepAlive
-}
-
-func Bool(k string) func() bool {
-	return func() bool {
-		if s := Var(k); s != "" {
-			b, err := strconv.ParseBool(s)
-			if err != nil {
-				return true
-			}
-
-			return b
-		}
-
-		return false
-	}
-}
-
-var (
-	// Debug enabled additional debug information.
-	Debug = Bool("OLLAMA_DEBUG")
-	// FlashAttention enables the experimental flash attention feature.
-	FlashAttention = Bool("OLLAMA_FLASH_ATTENTION")
-	// NoHistory disables readline history.
-	NoHistory = Bool("OLLAMA_NOHISTORY")
-	// NoPrune disables pruning of model blobs on startup.
-	NoPrune = Bool("OLLAMA_NOPRUNE")
-	// SchedSpread allows scheduling models across all GPUs.
-	SchedSpread = Bool("OLLAMA_SCHED_SPREAD")
-	// IntelGPU enables experimental Intel GPU detection.
-	IntelGPU = Bool("OLLAMA_INTEL_GPU")
-)
-
-func String(s string) func() string {
-	return func() string {
-		return Var(s)
-	}
-}
-
-var (
-	LLMLibrary = String("OLLAMA_LLM_LIBRARY")
-	TmpDir     = String("OLLAMA_TMPDIR")
-
-	CudaVisibleDevices    = String("CUDA_VISIBLE_DEVICES")
-	HipVisibleDevices     = String("HIP_VISIBLE_DEVICES")
-	RocrVisibleDevices    = String("ROCR_VISIBLE_DEVICES")
-	GpuDeviceOrdinal      = String("GPU_DEVICE_ORDINAL")
-	HsaOverrideGfxVersion = String("HSA_OVERRIDE_GFX_VERSION")
-)
-
-func RunnersDir() (p string) {
-	if p := Var("OLLAMA_RUNNERS_DIR"); p != "" {
-		return p
-	}
-
-	if runtime.GOOS != "windows" {
-		return
-	}
-
-	defer func() {
-		if p == "" {
-			slog.Error("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama_runners'")
-		}
-	}()
-
-	// On Windows we do not carry the payloads inside the main executable
-	exe, err := os.Executable()
-	if err != nil {
-		return
-	}
-
-	cwd, err := os.Getwd()
-	if err != nil {
-		return
-	}
-
-	var paths []string
-	for _, root := range []string{filepath.Dir(exe), cwd} {
-		paths = append(paths,
-			root,
-			filepath.Join(root, "windows-"+runtime.GOARCH),
-			filepath.Join(root, "dist", "windows-"+runtime.GOARCH),
-		)
-	}
-
-	// Try a few variations to improve developer experience when building from source in the local tree
-	for _, path := range paths {
-		candidate := filepath.Join(path, "ollama_runners")
-		if _, err := os.Stat(candidate); err == nil {
-			p = candidate
-			break
-		}
-	}
-
-	return p
-}
-
-func Uint(key string, defaultValue uint) func() uint {
-	return func() uint {
-		if s := Var(key); s != "" {
-			if n, err := strconv.ParseUint(s, 10, 64); err != nil {
-				slog.Warn("invalid environment variable, using default", "key", key, "value", s, "default", defaultValue)
+		d, err := time.ParseDuration(ka)
+		if err == nil {
+			if d < 0 {
+				KeepAlive = time.Duration(math.MaxInt64)
 			} else {
-				return uint(n)
+				KeepAlive = d
 			}
 		}
-
-		return defaultValue
+	} else {
+		d := time.Duration(v) * time.Second
+		if d < 0 {
+			KeepAlive = time.Duration(math.MaxInt64)
+		} else {
+			KeepAlive = d
+		}
 	}
 }
-
-var (
-	// NumParallel sets the number of parallel model requests. NumParallel can be configured via the OLLAMA_NUM_PARALLEL environment variable.
-	NumParallel = Uint("OLLAMA_NUM_PARALLEL", 0)
-	// MaxRunners sets the maximum number of loaded models. MaxRunners can be configured via the OLLAMA_MAX_LOADED_MODELS environment variable.
-	MaxRunners = Uint("OLLAMA_MAX_LOADED_MODELS", 0)
-	// MaxQueue sets the maximum number of queued requests. MaxQueue can be configured via the OLLAMA_MAX_QUEUE environment variable.
-	MaxQueue = Uint("OLLAMA_MAX_QUEUE", 512)
-	// MaxVRAM sets a maximum VRAM override in bytes. MaxVRAM can be configured via the OLLAMA_MAX_VRAM environment variable.
-	MaxVRAM = Uint("OLLAMA_MAX_VRAM", 0)
-)
-
-type EnvVar struct {
-	Name        string
-	Value       any
-	Description string
-}
-
-func AsMap() map[string]EnvVar {
-	ret := map[string]EnvVar{
-		"OLLAMA_DEBUG":             {"OLLAMA_DEBUG", Debug(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
-		"OLLAMA_FLASH_ATTENTION":   {"OLLAMA_FLASH_ATTENTION", FlashAttention(), "Enabled flash attention"},
-		"OLLAMA_HOST":              {"OLLAMA_HOST", Host(), "IP Address for the ollama server (default 127.0.0.1:11434)"},
-		"OLLAMA_KEEP_ALIVE":        {"OLLAMA_KEEP_ALIVE", KeepAlive(), "The duration that models stay loaded in memory (default \"5m\")"},
-		"OLLAMA_LLM_LIBRARY":       {"OLLAMA_LLM_LIBRARY", LLMLibrary(), "Set LLM library to bypass autodetection"},
-		"OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners(), "Maximum number of loaded models per GPU"},
-		"OLLAMA_MAX_QUEUE":         {"OLLAMA_MAX_QUEUE", MaxQueue(), "Maximum number of queued requests"},
-		"OLLAMA_MODELS":            {"OLLAMA_MODELS", Models(), "The path to the models directory"},
-		"OLLAMA_NOHISTORY":         {"OLLAMA_NOHISTORY", NoHistory(), "Do not preserve readline history"},
-		"OLLAMA_NOPRUNE":           {"OLLAMA_NOPRUNE", NoPrune(), "Do not prune model blobs on startup"},
-		"OLLAMA_NUM_PARALLEL":      {"OLLAMA_NUM_PARALLEL", NumParallel(), "Maximum number of parallel requests"},
-		"OLLAMA_ORIGINS":           {"OLLAMA_ORIGINS", Origins(), "A comma separated list of allowed origins"},
-		"OLLAMA_RUNNERS_DIR":       {"OLLAMA_RUNNERS_DIR", RunnersDir(), "Location for runners"},
-		"OLLAMA_SCHED_SPREAD":      {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
-		"OLLAMA_TMPDIR":            {"OLLAMA_TMPDIR", TmpDir(), "Location for temporary files"},
-	}
-	if runtime.GOOS != "darwin" {
-		ret["CUDA_VISIBLE_DEVICES"] = EnvVar{"CUDA_VISIBLE_DEVICES", CudaVisibleDevices(), "Set which NVIDIA devices are visible"}
-		ret["HIP_VISIBLE_DEVICES"] = EnvVar{"HIP_VISIBLE_DEVICES", HipVisibleDevices(), "Set which AMD devices are visible"}
-		ret["ROCR_VISIBLE_DEVICES"] = EnvVar{"ROCR_VISIBLE_DEVICES", RocrVisibleDevices(), "Set which AMD devices are visible"}
-		ret["GPU_DEVICE_ORDINAL"] = EnvVar{"GPU_DEVICE_ORDINAL", GpuDeviceOrdinal(), "Set which AMD devices are visible"}
-		ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", HsaOverrideGfxVersion(), "Override the gfx used for all detected AMD GPUs"}
-		ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGPU(), "Enable experimental Intel GPU detection"}
-	}
-	return ret
-}
-
-func Values() map[string]string {
-	vals := make(map[string]string)
-	for k, v := range AsMap() {
-		vals[k] = fmt.Sprintf("%v", v.Value)
-	}
-	return vals
-}
-
-// Var returns an environment variable stripped of leading and trailing quotes or spaces
-func Var(key string) string {
-	return strings.Trim(strings.TrimSpace(os.Getenv(key)), "\"'")
-}
--- a/envconfig/config_test.go
+++ b/envconfig/config_test.go
@@ -1,234 +1,87 @@
 package envconfig

 import (
+	"fmt"
 	"math"
+	"net"
 	"testing"
 	"time"

-	"github.com/google/go-cmp/cmp"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
 )

-func TestHost(t *testing.T) {
-	cases := map[string]struct {
+func TestConfig(t *testing.T) {
+	Debug = false // Reset whatever was loaded in init()
+	t.Setenv("OLLAMA_DEBUG", "")
+	LoadConfig()
+	require.False(t, Debug)
+	t.Setenv("OLLAMA_DEBUG", "false")
+	LoadConfig()
+	require.False(t, Debug)
+	t.Setenv("OLLAMA_DEBUG", "1")
+	LoadConfig()
+	require.True(t, Debug)
+	t.Setenv("OLLAMA_FLASH_ATTENTION", "1")
+	LoadConfig()
+	require.True(t, FlashAttention)
+	t.Setenv("OLLAMA_KEEP_ALIVE", "")
+	LoadConfig()
+	require.Equal(t, 5*time.Minute, KeepAlive)
+	t.Setenv("OLLAMA_KEEP_ALIVE", "3")
+	LoadConfig()
+	require.Equal(t, 3*time.Second, KeepAlive)
+	t.Setenv("OLLAMA_KEEP_ALIVE", "1h")
+	LoadConfig()
+	require.Equal(t, 1*time.Hour, KeepAlive)
+	t.Setenv("OLLAMA_KEEP_ALIVE", "-1s")
+	LoadConfig()
+	require.Equal(t, time.Duration(math.MaxInt64), KeepAlive)
+	t.Setenv("OLLAMA_KEEP_ALIVE", "-1")
+	LoadConfig()
+	require.Equal(t, time.Duration(math.MaxInt64), KeepAlive)
+}
+
+func TestClientFromEnvironment(t *testing.T) {
+	type testCase struct {
 		value  string
 		expect string
-	}{
-		"empty":               {"", "127.0.0.1:11434"},
-		"only address":        {"1.2.3.4", "1.2.3.4:11434"},
-		"only port":           {":1234", ":1234"},
-		"address and port":    {"1.2.3.4:1234", "1.2.3.4:1234"},
-		"hostname":            {"example.com", "example.com:11434"},
-		"hostname and port":   {"example.com:1234", "example.com:1234"},
-		"zero port":           {":0", ":0"},
-		"too large port":      {":66000", ":11434"},
-		"too small port":      {":-1", ":11434"},
-		"ipv6 localhost":      {"[::1]", "[::1]:11434"},
-		"ipv6 world open":     {"[::]", "[::]:11434"},
-		"ipv6 no brackets":    {"::1", "[::1]:11434"},
-		"ipv6 + port":         {"[::1]:1337", "[::1]:1337"},
-		"extra space":         {" 1.2.3.4 ", "1.2.3.4:11434"},
-		"extra quotes":        {"\"1.2.3.4\"", "1.2.3.4:11434"},
-		"extra space+quotes":  {" \" 1.2.3.4 \" ", "1.2.3.4:11434"},
-		"extra single quotes": {"'1.2.3.4'", "1.2.3.4:11434"},
-		"http":                {"http://1.2.3.4", "1.2.3.4:80"},
-		"http port":           {"http://1.2.3.4:4321", "1.2.3.4:4321"},
-		"https":               {"https://1.2.3.4", "1.2.3.4:443"},
-		"https port":          {"https://1.2.3.4:4321", "1.2.3.4:4321"},
+		err    error
 	}

-	for name, tt := range cases {
-		t.Run(name, func(t *testing.T) {
-			t.Setenv("OLLAMA_HOST", tt.value)
-			if host := Host(); host.Host != tt.expect {
-				t.Errorf("%s: expected %s, got %s", name, tt.expect, host.Host)
-			}
-		})
-	}
-}
-
-func TestOrigins(t *testing.T) {
-	cases := []struct {
-		value  string
-		expect []string
-	}{
-		{"", []string{
-			"http://localhost",
-			"https://localhost",
-			"http://localhost:*",
-			"https://localhost:*",
-			"http://127.0.0.1",
-			"https://127.0.0.1",
-			"http://127.0.0.1:*",
-			"https://127.0.0.1:*",
-			"http://0.0.0.0",
-			"https://0.0.0.0",
-			"http://0.0.0.0:*",
-			"https://0.0.0.0:*",
-			"app://*",
-			"file://*",
-			"tauri://*",
-		}},
-		{"http://10.0.0.1", []string{
-			"http://10.0.0.1",
-			"http://localhost",
-			"https://localhost",
-			"http://localhost:*",
-			"https://localhost:*",
-			"http://127.0.0.1",
-			"https://127.0.0.1",
-			"http://127.0.0.1:*",
-			"https://127.0.0.1:*",
-			"http://0.0.0.0",
-			"https://0.0.0.0",
-			"http://0.0.0.0:*",
-			"https://0.0.0.0:*",
-			"app://*",
-			"file://*",
-			"tauri://*",
-		}},
-		{"http://172.16.0.1,https://192.168.0.1", []string{
-			"http://172.16.0.1",
-			"https://192.168.0.1",
-			"http://localhost",
-			"https://localhost",
-			"http://localhost:*",
-			"https://localhost:*",
-			"http://127.0.0.1",
-			"https://127.0.0.1",
-			"http://127.0.0.1:*",
-			"https://127.0.0.1:*",
-			"http://0.0.0.0",
-			"https://0.0.0.0",
-			"http://0.0.0.0:*",
-			"https://0.0.0.0:*",
-			"app://*",
-			"file://*",
-			"tauri://*",
-		}},
-		{"http://totally.safe,http://definitely.legit", []string{
-			"http://totally.safe",
-			"http://definitely.legit",
-			"http://localhost",
-			"https://localhost",
-			"http://localhost:*",
-			"https://localhost:*",
-			"http://127.0.0.1",
-			"https://127.0.0.1",
-			"http://127.0.0.1:*",
-			"https://127.0.0.1:*",
-			"http://0.0.0.0",
-			"https://0.0.0.0",
-			"http://0.0.0.0:*",
-			"https://0.0.0.0:*",
-			"app://*",
-			"file://*",
-			"tauri://*",
-		}},
-	}
-	for _, tt := range cases {
-		t.Run(tt.value, func(t *testing.T) {
-			t.Setenv("OLLAMA_ORIGINS", tt.value)
-
-			if diff := cmp.Diff(Origins(), tt.expect); diff != "" {
-				t.Errorf("%s: mismatch (-want +got):\n%s", tt.value, diff)
-			}
-		})
-	}
-}
-
-func TestBool(t *testing.T) {
-	cases := map[string]bool{
-		"":      false,
-		"true":  true,
-		"false": false,
-		"1":     true,
-		"0":     false,
-		// invalid values
-		"random":    true,
-		"something": true,
+	hostTestCases := map[string]*testCase{
+		"empty":               {value: "", expect: "127.0.0.1:11434"},
+		"only address":        {value: "1.2.3.4", expect: "1.2.3.4:11434"},
+		"only port":           {value: ":1234", expect: ":1234"},
+		"address and port":    {value: "1.2.3.4:1234", expect: "1.2.3.4:1234"},
+		"hostname":            {value: "example.com", expect: "example.com:11434"},
+		"hostname and port":   {value: "example.com:1234", expect: "example.com:1234"},
+		"zero port":           {value: ":0", expect: ":0"},
+		"too large port":      {value: ":66000", err: ErrInvalidHostPort},
+		"too small port":      {value: ":-1", err: ErrInvalidHostPort},
+		"ipv6 localhost":      {value: "[::1]", expect: "[::1]:11434"},
+		"ipv6 world open":     {value: "[::]", expect: "[::]:11434"},
+		"ipv6 no brackets":    {value: "::1", expect: "[::1]:11434"},
+		"ipv6 + port":         {value: "[::1]:1337", expect: "[::1]:1337"},
+		"extra space":         {value: " 1.2.3.4 ", expect: "1.2.3.4:11434"},
+		"extra quotes":        {value: "\"1.2.3.4\"", expect: "1.2.3.4:11434"},
+		"extra space+quotes":  {value: " \" 1.2.3.4 \" ", expect: "1.2.3.4:11434"},
+		"extra single quotes": {value: "'1.2.3.4'", expect: "1.2.3.4:11434"},
 	}

-	for k, v := range cases {
+	for k, v := range hostTestCases {
 		t.Run(k, func(t *testing.T) {
-			t.Setenv("OLLAMA_BOOL", k)
-			if b := Bool("OLLAMA_BOOL")(); b != v {
-				t.Errorf("%s: expected %t, got %t", k, v, b)
-			}
-		})
-	}
-}
-
-func TestUint(t *testing.T) {
-	cases := map[string]uint{
-		"0":    0,
-		"1":    1,
-		"1337": 1337,
-		// default values
-		"":       11434,
-		"-1":     11434,
-		"0o10":   11434,
-		"0x10":   11434,
-		"string": 11434,
-	}
-
-	for k, v := range cases {
-		t.Run(k, func(t *testing.T) {
-			t.Setenv("OLLAMA_UINT", k)
-			if i := Uint("OLLAMA_UINT", 11434)(); i != v {
-				t.Errorf("%s: expected %d, got %d", k, v, i)
-			}
-		})
-	}
-}
-
-func TestKeepAlive(t *testing.T) {
-	cases := map[string]time.Duration{
-		"":       5 * time.Minute,
-		"1s":     time.Second,
-		"1m":     time.Minute,
-		"1h":     time.Hour,
-		"5m0s":   5 * time.Minute,
-		"1h2m3s": 1*time.Hour + 2*time.Minute + 3*time.Second,
-		"0":      time.Duration(0),
-		"60":     60 * time.Second,
-		"120":    2 * time.Minute,
-		"3600":   time.Hour,
-		"-0":     time.Duration(0),
-		"-1":     time.Duration(math.MaxInt64),
-		"-1m":    time.Duration(math.MaxInt64),
-		// invalid values
-		" ":   5 * time.Minute,
-		"???": 5 * time.Minute,
-		"1d":  5 * time.Minute,
-		"1y":  5 * time.Minute,
-		"1w":  5 * time.Minute,
-	}
-
-	for tt, expect := range cases {
-		t.Run(tt, func(t *testing.T) {
-			t.Setenv("OLLAMA_KEEP_ALIVE", tt)
-			if actual := KeepAlive(); actual != expect {
-				t.Errorf("%s: expected %s, got %s", tt, expect, actual)
-			}
-		})
-	}
-}
-
-func TestVar(t *testing.T) {
-	cases := map[string]string{
-		"value":       "value",
-		" value ":     "value",
-		" 'value' ":   "value",
-		` "value" `:   "value",
-		" ' value ' ": " value ",
-		` " value " `: " value ",
-	}
-
-	for k, v := range cases {
-		t.Run(k, func(t *testing.T) {
-			t.Setenv("OLLAMA_VAR", k)
-			if s := Var("OLLAMA_VAR"); s != v {
-				t.Errorf("%s: expected %q, got %q", k, v, s)
+			t.Setenv("OLLAMA_HOST", v.value)
+			LoadConfig()
+
+			oh, err := getOllamaHost()
+			if err != v.err {
+				t.Fatalf("expected %s, got %s", v.err, err)
+			}
+
+			if err == nil {
+				host := net.JoinHostPort(oh.Host, oh.Port)
+				assert.Equal(t, v.expect, host, fmt.Sprintf("%s: expected %s, got %s", k, v.expect, host))
 			}
 		})
 	}
--- a/examples/go-chat/main.go
+++ b/examples/go-chat/main.go
@@ -35,7 +35,7 @@ func main() {

 	ctx := context.Background()
 	req := &api.ChatRequest{
-		Model:    "llama3.1",
+		Model:    "llama3",
 		Messages: messages,
 	}

--- a/examples/go-generate-streaming/main.go
+++ b/examples/go-generate-streaming/main.go
@@ -16,7 +16,7 @@ func main() {

 	// By default, GenerateRequest is streaming.
 	req := &api.GenerateRequest{
-		Model:  "gemma2",
+		Model:  "gemma",
 		Prompt: "how many planets are there?",
 	}

--- a/examples/go-generate/main.go
+++ b/examples/go-generate/main.go
@@ -15,7 +15,7 @@ func main() {
 	}

 	req := &api.GenerateRequest{
-		Model:  "gemma2",
+		Model:  "gemma",
 		Prompt: "how many planets are there?",

 		// set streaming to false
--- a/examples/go-http-generate/README.md
+++ b/examples/go-http-generate/README.md
--- a/examples/langchain-python-rag-document/README.md
+++ b/examples/langchain-python-rag-document/README.md
@@ -4,14 +4,6 @@ This example provides an interface for asking questions to a PDF document.

 ## Setup

-1. Ensure you have the `llama3.1` model installed:
-
-```
-ollama pull llama3.1
-```
-
-2. Install the Python Requirements.
-
 ```
 pip install -r requirements.txt
 ```
--- a/examples/langchain-python-rag-document/main.py
+++ b/examples/langchain-python-rag-document/main.py
@@ -51,7 +51,7 @@ while True:
        template=template,
    )

-    llm = Ollama(model="llama3.1", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]))
+    llm = Ollama(model="llama3:8b", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]))
    qa_chain = RetrievalQA.from_chain_type(
        llm,
        retriever=vectorstore.as_retriever(),
--- a/examples/langchain-python-rag-websummary/README.md
+++ b/examples/langchain-python-rag-websummary/README.md
@@ -4,10 +4,10 @@ This example summarizes the website, [https://ollama.com/blog/run-llama2-uncenso

 ## Running the Example

-1. Ensure you have the `llama3.1` model installed:
+1. Ensure you have the `llama2` model installed:

   ```bash
-   ollama pull llama3.1
+   ollama pull llama2
   ```

 2. Install the Python Requirements.
--- a/examples/langchain-python-rag-websummary/main.py
+++ b/examples/langchain-python-rag-websummary/main.py
@@ -5,8 +5,8 @@ from langchain.chains.summarize import load_summarize_chain
 loader = WebBaseLoader("https://ollama.com/blog/run-llama2-uncensored-locally")
 docs = loader.load()

-llm = Ollama(model="llama3.1")
+llm = Ollama(model="llama3")
 chain = load_summarize_chain(llm, chain_type="stuff")

-result = chain.invoke(docs)
+result = chain.invoke(docs) 
 print(result)
--- a/examples/langchain-python-simple/README.md
+++ b/examples/langchain-python-simple/README.md
@@ -4,10 +4,10 @@ This example is a basic "hello world" of using LangChain with Ollama.

 ## Running the Example

-1. Ensure you have the `llama3.1` model installed:
+1. Ensure you have the `llama3` model installed:

   ```bash
-   ollama pull llama3.1
+   ollama pull llama3
   ```

 2. Install the Python Requirements.
--- a/examples/langchain-python-simple/main.py
+++ b/examples/langchain-python-simple/main.py
@@ -1,6 +1,6 @@
 from langchain.llms import Ollama

 input = input("What is your question?")
-llm = Ollama(model="llama3.1")
+llm = Ollama(model="llama3")
 res = llm.predict(input)
 print (res)
--- a/examples/modelfile-mario/Modelfile
+++ b/examples/modelfile-mario/Modelfile
@@ -1,4 +1,4 @@
-FROM llama3.1
+FROM llama3
 PARAMETER temperature 1
 SYSTEM """
 You are Mario from super mario bros, acting as an assistant.
--- a/examples/modelfile-mario/readme.md
+++ b/examples/modelfile-mario/readme.md
@@ -2,12 +2,12 @@

 # Example character: Mario

-This example shows how to create a basic character using Llama3.1 as the base model.
+This example shows how to create a basic character using Llama3 as the base model.

 To run this example:

 1. Download the Modelfile
-2. `ollama pull llama3.1` to get the base model used in the model file.
+2. `ollama pull llama3` to get the base model used in the model file.
 3. `ollama create NAME -f ./Modelfile`
 4. `ollama run NAME`

@@ -18,7 +18,7 @@ Ask it some questions like "Who are you?" or "Is Peach in trouble again?"
 What the model file looks like:

 ```
-FROM llama3.1
+FROM llama3
 PARAMETER temperature 1
 SYSTEM """
 You are Mario from Super Mario Bros, acting as an assistant.
--- a/examples/python-dockerit/dockerit.py
+++ b/examples/python-dockerit/dockerit.py
@@ -4,7 +4,7 @@ imageName = input("Enter the name of the image: ")
 client = docker.from_env()
 s = requests.Session()
 output=""
-with s.post('http://localhost:11434/api/generate', json={'model': 'mattw/dockerit', 'prompt': inputDescription}, stream=True) as r:
+with s.post('http://localhost:11434/api/generate', json={'model': 'dockerit', 'prompt': inputDescription}, stream=True) as r:
  for line in r.iter_lines():
    if line:
      j = json.loads(line)
--- a/examples/python-json-datagenerator/predefinedschema.py
+++ b/examples/python-json-datagenerator/predefinedschema.py
@@ -2,7 +2,7 @@ import requests
 import json
 import random

-model = "llama3.1"
+model = "llama3"
 template = {
  "firstName": "",
  "lastName": "",
--- a/examples/python-json-datagenerator/randomaddresses.py
+++ b/examples/python-json-datagenerator/randomaddresses.py
@@ -12,7 +12,7 @@ countries = [
    "France",
 ]
 country = random.choice(countries)
-model = "llama3.1"
+model = "llama3"

 prompt = f"generate one realistically believable sample data set of a persons first name, last name, address in {country}, and phone number. Do not use common names. Respond using JSON. Key names should have no backslashes, values should use plain ascii with no special characters."

--- a/examples/python-json-datagenerator/readme.md
+++ b/examples/python-json-datagenerator/readme.md
@@ -6,10 +6,10 @@ There are two python scripts in this example. `randomaddresses.py` generates ran

 ## Running the Example

-1. Ensure you have the `llama3.1` model installed:
+1. Ensure you have the `llama3` model installed:

   ```bash
-   ollama pull llama3.1
+   ollama pull llama3
   ```

 2. Install the Python Requirements.
--- a/examples/python-simplechat/client.py
+++ b/examples/python-simplechat/client.py
@@ -2,7 +2,7 @@ import json
 import requests

 # NOTE: ollama must be running for this to work, start the ollama app or run `ollama serve`
-model = "llama3.1"  # TODO: update this for whatever model you wish to use
+model = "llama3"  # TODO: update this for whatever model you wish to use


 def chat(messages):
--- a/examples/python-simplechat/readme.md
+++ b/examples/python-simplechat/readme.md
@@ -4,10 +4,10 @@ The **chat** endpoint is one of two ways to generate text from an LLM with Ollam

 ## Running the Example

-1. Ensure you have the `llama3.1` model installed:
+1. Ensure you have the `llama3` model installed:

   ```bash
-   ollama pull llama3.1
+   ollama pull llama3
   ```

 2. Install the Python Requirements.
--- a/examples/typescript-simplechat/client.ts
+++ b/examples/typescript-simplechat/client.ts
@@ -1,6 +1,6 @@
 import * as readline from "readline";

-const model = "llama3.1";
+const model = "llama3";
 type Message = {
  role: "assistant" | "user" | "system";
  content: string;
--- a/gpu/amd_linux.go
+++ b/gpu/amd_linux.go
@@ -10,7 +10,6 @@ import (
 	"path/filepath"
 	"regexp"
 	"slices"
-	"sort"
 	"strconv"
 	"strings"

@@ -61,9 +60,9 @@ func AMDGetGPUInfo() []RocmGPUInfo {

 	// Determine if the user has already pre-selected which GPUs to look at, then ignore the others
 	var visibleDevices []string
-	hipVD := envconfig.HipVisibleDevices()   // zero based index only
-	rocrVD := envconfig.RocrVisibleDevices() // zero based index or UUID, but consumer cards seem to not support UUID
-	gpuDO := envconfig.GpuDeviceOrdinal()    // zero based index
+	hipVD := envconfig.HipVisibleDevices   // zero based index only
+	rocrVD := envconfig.RocrVisibleDevices // zero based index or UUID, but consumer cards seem to not support UUID
+	gpuDO := envconfig.GpuDeviceOrdinal    // zero based index
 	switch {
 	// TODO is this priorty order right?
 	case hipVD != "":
@@ -76,27 +75,13 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 		visibleDevices = strings.Split(gpuDO, ",")
 	}

-	gfxOverride := envconfig.HsaOverrideGfxVersion()
+	gfxOverride := envconfig.HsaOverrideGfxVersion
 	var supported []string
 	libDir := ""

 	// The amdgpu driver always exposes the host CPU(s) first, but we have to skip them and subtract
 	// from the other IDs to get alignment with the HIP libraries expectations (zero is the first GPU, not the CPU)
 	matches, _ := filepath.Glob(GPUPropertiesFileGlob)
-	sort.Slice(matches, func(i, j int) bool {
-		// /sys/class/kfd/kfd/topology/nodes/<number>/properties
-		a, err := strconv.ParseInt(filepath.Base(filepath.Dir(matches[i])), 10, 64)
-		if err != nil {
-			slog.Debug("parse err", "error", err, "match", matches[i])
-			return false
-		}
-		b, err := strconv.ParseInt(filepath.Base(filepath.Dir(matches[j])), 10, 64)
-		if err != nil {
-			slog.Debug("parse err", "error", err, "match", matches[i])
-			return false
-		}
-		return a < b
-	})
 	cpuCount := 0
 	for _, match := range matches {
 		slog.Debug("evaluating amdgpu node " + match)
--- a/gpu/amd_windows.go
+++ b/gpu/amd_windows.go
@@ -53,7 +53,7 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 	}

 	var supported []string
-	gfxOverride := envconfig.HsaOverrideGfxVersion()
+	gfxOverride := envconfig.HsaOverrideGfxVersion
 	if gfxOverride == "" {
 		supported, err = GetSupportedGFX(libDir)
 		if err != nil {
--- a/gpu/assets.go
+++ b/gpu/assets.go
@@ -26,7 +26,7 @@ func PayloadsDir() (string, error) {
 	defer lock.Unlock()
 	var err error
 	if payloadsDir == "" {
-		runnersDir := envconfig.RunnersDir()
+		runnersDir := envconfig.RunnersDir

 		if runnersDir != "" {
 			payloadsDir = runnersDir
@@ -35,7 +35,7 @@ func PayloadsDir() (string, error) {

 		// The remainder only applies on non-windows where we still carry payloads in the main executable
 		cleanupTmpDirs()
-		tmpDir := envconfig.TmpDir()
+		tmpDir := envconfig.TmpDir
 		if tmpDir == "" {
 			tmpDir, err = os.MkdirTemp("", "ollama")
 			if err != nil {
@@ -105,7 +105,7 @@ func cleanupTmpDirs() {
 func Cleanup() {
 	lock.Lock()
 	defer lock.Unlock()
-	runnersDir := envconfig.RunnersDir()
+	runnersDir := envconfig.RunnersDir
 	if payloadsDir != "" && runnersDir == "" && runtime.GOOS != "windows" {
 		// We want to fully clean up the tmpdir parent of the payloads dir
 		tmpDir := filepath.Clean(filepath.Join(payloadsDir, ".."))
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -230,8 +230,8 @@ func GetGPUInfo() GpuInfoList {

 		// On windows we bundle the nvidia library one level above the runner dir
 		depPath := ""
-		if runtime.GOOS == "windows" && envconfig.RunnersDir() != "" {
-			depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir()), "cuda")
+		if runtime.GOOS == "windows" && envconfig.RunnersDir != "" {
+			depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir), "cuda")
 		}

 		// Load ALL libraries
@@ -302,12 +302,12 @@ func GetGPUInfo() GpuInfoList {
 		}

 		// Intel
-		if envconfig.IntelGPU() {
+		if envconfig.IntelGpu {
 			oHandles = initOneAPIHandles()
 			// On windows we bundle the oneapi library one level above the runner dir
 			depPath = ""
-			if runtime.GOOS == "windows" && envconfig.RunnersDir() != "" {
-				depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir()), "oneapi")
+			if runtime.GOOS == "windows" && envconfig.RunnersDir != "" {
+				depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir), "oneapi")
 			}

 			for d := range oHandles.oneapi.num_drivers {
@@ -611,7 +611,7 @@ func LoadOneapiMgmt(oneapiLibPaths []string) (int, *C.oneapi_handle_t, string) {
 }

 func getVerboseState() C.uint16_t {
-	if envconfig.Debug() {
+	if envconfig.Debug {
 		return C.uint16_t(1)
 	}
 	return C.uint16_t(0)
--- a/integration/basic_test.go
+++ b/integration/basic_test.go
@@ -45,7 +45,14 @@ func TestUnicodeModelDir(t *testing.T) {
 	defer os.RemoveAll(modelDir)
 	slog.Info("unicode", "OLLAMA_MODELS", modelDir)

-	t.Setenv("OLLAMA_MODELS", modelDir)
+	oldModelsDir := os.Getenv("OLLAMA_MODELS")
+	if oldModelsDir == "" {
+		defer os.Unsetenv("OLLAMA_MODELS")
+	} else {
+		defer os.Setenv("OLLAMA_MODELS", oldModelsDir)
+	}
+	err = os.Setenv("OLLAMA_MODELS", modelDir)
+	require.NoError(t, err)

 	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
 	defer cancel()
--- a/integration/concurrency_test.go
+++ b/integration/concurrency_test.go
@@ -5,16 +5,14 @@ package integration
 import (
 	"context"
 	"log/slog"
+	"os"
 	"strconv"
 	"sync"
 	"testing"
 	"time"

-	"github.com/stretchr/testify/require"
-
 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/envconfig"
-	"github.com/ollama/ollama/format"
+	"github.com/stretchr/testify/require"
 )

 func TestMultiModelConcurrency(t *testing.T) {
@@ -108,16 +106,13 @@ func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) {

 // Stress the system if we know how much VRAM it has, and attempt to load more models than will fit
 func TestMultiModelStress(t *testing.T) {
-	s := os.Getenv("OLLAMA_MAX_VRAM") // TODO - discover actual VRAM
-	if s == "" {
+	vram := os.Getenv("OLLAMA_MAX_VRAM") // TODO - discover actual VRAM
+	if vram == "" {
 		t.Skip("OLLAMA_MAX_VRAM not specified, can't pick the right models for the stress test")
 	}
-
-	maxVram, err := strconv.ParseUint(s, 10, 64)
-	if err != nil {
-		t.Fatal(err)
-	}
-
+	max, err := strconv.ParseUint(vram, 10, 64)
+	require.NoError(t, err)
+	const MB = uint64(1024 * 1024)
 	type model struct {
 		name string
 		size uint64 // Approximate amount of VRAM they typically use when fully loaded in VRAM
@@ -126,82 +121,83 @@ func TestMultiModelStress(t *testing.T) {
 	smallModels := []model{
 		{
 			name: "orca-mini",
-			size: 2992 * format.MebiByte,
+			size: 2992 * MB,
 		},
 		{
 			name: "phi",
-			size: 2616 * format.MebiByte,
+			size: 2616 * MB,
 		},
 		{
 			name: "gemma:2b",
-			size: 2364 * format.MebiByte,
+			size: 2364 * MB,
 		},
 		{
 			name: "stable-code:3b",
-			size: 2608 * format.MebiByte,
+			size: 2608 * MB,
 		},
 		{
 			name: "starcoder2:3b",
-			size: 2166 * format.MebiByte,
+			size: 2166 * MB,
 		},
 	}
 	mediumModels := []model{
 		{
 			name: "llama2",
-			size: 5118 * format.MebiByte,
+			size: 5118 * MB,
 		},
 		{
 			name: "mistral",
-			size: 4620 * format.MebiByte,
+			size: 4620 * MB,
 		},
 		{
 			name: "orca-mini:7b",
-			size: 5118 * format.MebiByte,
+			size: 5118 * MB,
 		},
 		{
 			name: "dolphin-mistral",
-			size: 4620 * format.MebiByte,
+			size: 4620 * MB,
 		},
 		{
 			name: "gemma:7b",
-			size: 5000 * format.MebiByte,
-		},
-		{
-			name: "codellama:7b",
-			size: 5118 * format.MebiByte,
+			size: 5000 * MB,
 		},
+		// TODO - uncomment this once #3565 is merged and this is rebased on it
+		// {
+		// 	name: "codellama:7b",
+		// 	size: 5118 * MB,
+		// },
 	}

 	// These seem to be too slow to be useful...
 	// largeModels := []model{
 	// 	{
 	// 		name: "llama2:13b",
-	// 		size: 7400 * format.MebiByte,
+	// 		size: 7400 * MB,
 	// 	},
 	// 	{
 	// 		name: "codellama:13b",
-	// 		size: 7400 * format.MebiByte,
+	// 		size: 7400 * MB,
 	// 	},
 	// 	{
 	// 		name: "orca-mini:13b",
-	// 		size: 7400 * format.MebiByte,
+	// 		size: 7400 * MB,
 	// 	},
 	// 	{
 	// 		name: "gemma:7b",
-	// 		size: 5000 * format.MebiByte,
+	// 		size: 5000 * MB,
 	// 	},
 	// 	{
 	// 		name: "starcoder2:15b",
-	// 		size: 9100 * format.MebiByte,
+	// 		size: 9100 * MB,
 	// 	},
 	// }

 	var chosenModels []model
 	switch {
-	case maxVram < 10000*format.MebiByte:
+	case max < 10000*MB:
 		slog.Info("selecting small models")
 		chosenModels = smallModels
-	// case maxVram < 30000*format.MebiByte:
+	// case max < 30000*MB:
 	default:
 		slog.Info("selecting medium models")
 		chosenModels = mediumModels
@@ -230,15 +226,15 @@ func TestMultiModelStress(t *testing.T) {
 	}

 	var wg sync.WaitGroup
-	consumed := uint64(256 * format.MebiByte) // Assume some baseline usage
+	consumed := uint64(256 * MB) // Assume some baseline usage
 	for i := 0; i < len(req); i++ {
 		// Always get at least 2 models, but dont' overshoot VRAM too much or we'll take too long
-		if i > 1 && consumed > vram {
-			slog.Info("achieved target vram exhaustion", "count", i, "vram", format.HumanBytes2(vram), "models", format.HumanBytes2(consumed))
+		if i > 1 && consumed > max {
+			slog.Info("achieved target vram exhaustion", "count", i, "vramMB", max/1024/1024, "modelsMB", consumed/1024/1024)
 			break
 		}
 		consumed += chosenModels[i].size
-		slog.Info("target vram", "count", i, "vram", format.HumanBytes2(vram), "models", format.HumanBytes2(consumed))
+		slog.Info("target vram", "count", i, "vramMB", max/1024/1024, "modelsMB", consumed/1024/1024)

 		wg.Add(1)
 		go func(i int) {
--- a/integration/embed_test.go
+++ b/integration/embed_test.go
@@ -69,10 +69,6 @@ func TestAllMiniLMEmbed(t *testing.T) {
 	if !floatsEqual32(res.Embeddings[0][0], 0.010071031) {
 		t.Fatalf("expected 0.010071031, got %.8f", res.Embeddings[0][0])
 	}
-
-	if res.PromptEvalCount != 8 {
-		t.Fatalf("expected 8 prompt tokens, got %d", res.PromptEvalCount)
-	}
 }

 func TestAllMiniLMBatchEmbed(t *testing.T) {
@@ -101,10 +97,6 @@ func TestAllMiniLMBatchEmbed(t *testing.T) {
 	if !floatsEqual32(res.Embeddings[0][0], 0.010071031) || !floatsEqual32(res.Embeddings[1][0], -0.009802706) {
 		t.Fatalf("expected 0.010071031 and -0.009802706, got %.8f and %.8f", res.Embeddings[0][0], res.Embeddings[1][0])
 	}
-
-	if res.PromptEvalCount != 16 {
-		t.Fatalf("expected 16 prompt tokens, got %d", res.PromptEvalCount)
-	}
 }

 func TestAllMiniLMEmbedTruncate(t *testing.T) {
--- a/integration/max_queue_test.go
+++ b/integration/max_queue_test.go
@@ -5,6 +5,7 @@ package integration
 import (
 	"context"
 	"errors"
+	"fmt"
 	"log/slog"
 	"os"
 	"strconv"
@@ -13,10 +14,8 @@ import (
 	"testing"
 	"time"

-	"github.com/stretchr/testify/require"
-
 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/envconfig"
+	"github.com/stretchr/testify/require"
 )

 func TestMaxQueue(t *testing.T) {
@@ -28,10 +27,13 @@ func TestMaxQueue(t *testing.T) {
 	// Note: This test can be quite slow when running in CPU mode, so keep the threadCount low unless your on GPU
 	// Also note that by default Darwin can't sustain > ~128 connections without adjusting limits
 	threadCount := 32
-	if maxQueue := envconfig.MaxQueue(); maxQueue != 0 {
-		threadCount = maxQueue
+	mq := os.Getenv("OLLAMA_MAX_QUEUE")
+	if mq != "" {
+		var err error
+		threadCount, err = strconv.Atoi(mq)
+		require.NoError(t, err)
 	} else {
-		t.Setenv("OLLAMA_MAX_QUEUE", strconv.Itoa(threadCount))
+		os.Setenv("OLLAMA_MAX_QUEUE", fmt.Sprintf("%d", threadCount))
 	}

 	req := api.GenerateRequest{
--- a/llm/ext_server/server.cpp
+++ b/llm/ext_server/server.cpp
@@ -41,7 +41,6 @@

 #if defined(_WIN32)
 #include <windows.h>
-#include <errhandlingapi.h>
 #endif

 #include <cstddef>
@@ -1221,7 +1220,6 @@ struct llama_server_context
                res.result_json = json
                {
                    {"embedding", std::vector<float>(embd, embd + n_embd)},
-                    {"timings",             slot.get_formated_timings()},
                };
            }
        }
@@ -2439,6 +2437,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
            params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
            params.use_mmap = false;
        }
+        else if (arg == "--lora-base")
+        {
+            if (++i >= argc)
+            {
+                invalid_param = true;
+                break;
+            }
+            params.lora_base = argv[i];
+        }
        else if (arg == "-v" || arg == "--verbose")
        {
            server_verbose = true;
@@ -2730,9 +2737,6 @@ int wmain(int argc, wchar_t **wargv) {
    for (int i = 0; i < argc; ++i) {
        argv[i] = wchar_to_char(wargv[i]);
    }
-
-    // Adjust error mode to avoid error dialog after we start.
-    SetErrorMode(SEM_FAILCRITICALERRORS);
 #else
 int main(int argc, char **argv) {
 #endif
@@ -3204,15 +3208,11 @@ int main(int argc, char **argv) {

                    responses = result.result_json.value("results", std::vector<json>{result.result_json});
                    json embeddings = json::array();
-
-                    int prompt_n = 0;
                    for (auto & elem : responses) {
                        embeddings.push_back(elem.at("embedding"));
-                        prompt_n += elem.at("timings").at("prompt_n").get<int>();
                    }
-
                    // send the result
-                    json embedding_res = json{{"embedding", embeddings}, {"prompt_n", prompt_n}};
+                    json embedding_res = json{{"embedding", embeddings}};
                    return res.set_content(embedding_res.dump(), "application/json; charset=utf-8");
                }
            });
--- a/llm/llama.cpp
+++ b/llm/llama.cpp
--- a/llm/llm_darwin_amd64.go
+++ b/llm/llm_darwin_amd64.go
@@ -2,10 +2,7 @@ package llm

 import (
 	"embed"
-	"syscall"
 )

 //go:embed build/darwin/x86_64/*/bin/*
 var libEmbed embed.FS
-
-var LlamaServerSysProcAttr = &syscall.SysProcAttr{}
--- a/llm/llm_darwin_arm64.go
+++ b/llm/llm_darwin_arm64.go
@@ -2,10 +2,7 @@ package llm

 import (
 	"embed"
-	"syscall"
 )

 //go:embed build/darwin/arm64/*/bin/*
 var libEmbed embed.FS
-
-var LlamaServerSysProcAttr = &syscall.SysProcAttr{}
--- a/llm/llm_linux.go
+++ b/llm/llm_linux.go
@@ -1,11 +1,6 @@
 package llm

-import (
-	"embed"
-	"syscall"
-)
+import "embed"

 //go:embed build/linux/*/*/bin/*
 var libEmbed embed.FS
-
-var LlamaServerSysProcAttr = &syscall.SysProcAttr{}
--- a/llm/llm_windows.go
+++ b/llm/llm_windows.go
@@ -1,20 +1,6 @@
 package llm

-import (
-	"embed"
-	"syscall"
-)
+import "embed"

 // unused on windows
 var libEmbed embed.FS
-
-const CREATE_DEFAULT_ERROR_MODE = 0x04000000
-
-var LlamaServerSysProcAttr = &syscall.SysProcAttr{
-	// Wire up the default error handling logic If for some reason a DLL is
-	// missing in the path this will pop up a GUI Dialog explaining the fault so
-	// the user can either fix their PATH, or report a bug. Without this
-	// setting, the process exits immediately with a generic exit status but no
-	// way to (easily) figure out what the actual missing DLL was.
-	CreationFlags: CREATE_DEFAULT_ERROR_MODE,
-}
--- a/llm/memory_test.go
+++ b/llm/memory_test.go
@@ -8,14 +8,14 @@ import (
 	"testing"

 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/gpu"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 )

 func TestEstimateGPULayers(t *testing.T) {
-	t.Setenv("OLLAMA_DEBUG", "1")
-
+	envconfig.Debug = true
 	modelName := "dummy"
 	f, err := os.CreateTemp(t.TempDir(), modelName)
 	require.NoError(t, err)
--- a/llm/patches/05-default-pretokenizer.diff
+++ b/llm/patches/05-default-pretokenizer.diff
@@ -1,8 +1,8 @@
 diff --git a/src/llama.cpp b/src/llama.cpp
-index a207451f..2ddf431d 100644
+index 8fe51971..7113ba64 100644
 --- a/src/llama.cpp
 +++ b/src/llama.cpp
-@@ -5347,16 +5347,7 @@ static void llm_load_vocab(
+@@ -5433,16 +5433,7 @@ static void llm_load_vocab(
         if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
             vocab.tokenizer_add_space_prefix = false;
             vocab.tokenizer_clean_spaces = true;
@@ -20,9 +20,9 @@ index a207451f..2ddf431d 100644
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
             } else if (
                     tokenizer_pre == "llama3"   ||
-@@ -5443,7 +5434,8 @@ static void llm_load_vocab(
-                 tokenizer_pre == "codeshell") {
-                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
+@@ -5526,7 +5517,8 @@ static void llm_load_vocab(
+                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMOLLM;
+                 vocab.tokenizer_clean_spaces = false;
             } else {
 -                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
 +                LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
--- a/llm/patches/09-lora.diff
+++ b/llm/patches/09-lora.diff
@@ -2,7 +2,7 @@ diff --git a/common/common.cpp b/common/common.cpp
 index dbb724fb..c26fe6ee 100644
 --- a/common/common.cpp
 +++ b/common/common.cpp
-@@ -2087,14 +2087,27 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
+@@ -2087,14 +2087,29 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
     for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
         const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
         float lora_scale = std::get<1>(params.lora_adapter[i]);
@@ -20,7 +20,9 @@ index dbb724fb..c26fe6ee 100644
 +            int err = llama_model_apply_lora_from_file(model,
 +                                                    lora_adapter.c_str(),
 +                                                    lora_scale,
-+                                                    nullptr,
++                                                    ((i > 0) || params.lora_base.empty())
++                                                        ? NULL
++                                                        : params.lora_base.c_str(),
 +                                                    params.n_threads);
 +            if (err != 0) {
 +                fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
--- a/llm/patches/10-params.diff
+++ b/llm/patches/10-params.diff
@@ -1,20 +0,0 @@
-diff --git a/src/llama.cpp b/src/llama.cpp
-index a207451f..fba6b175 100644
---- a/src/llama.cpp
-+++ b/src/llama.cpp
-@@ -4969,6 +4969,7 @@ static void llm_load_hparams(
-                 hparams.attn_soft_cap = true;
- 
-                 switch (hparams.n_layer) {
-+                    case 26: model.type = e_model::MODEL_2B; break;
-                     case 42: model.type = e_model::MODEL_9B; break;
-                     case 46: model.type = e_model::MODEL_27B; break;
-                     default: model.type = e_model::MODEL_UNKNOWN;
-@@ -11736,6 +11737,7 @@ struct llm_build_context {
- 
-                 // ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
-                 switch (model.type) {
-+                    case e_model::MODEL_2B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break;
-                     case e_model::MODEL_9B:  Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));   break;
-                     case e_model::MODEL_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break;
-                     default: GGML_ABORT("fatal error");
--- a/llm/patches/11-phi3-sliding-window.diff
+++ b/llm/patches/11-phi3-sliding-window.diff
@@ -1,43 +0,0 @@
-From 6eedae4cf2fcc8015dac79cb3f28f61fcabacab2 Mon Sep 17 00:00:00 2001
-From: Michael Yang <mxyng@pm.me>
-Date: Wed, 31 Jul 2024 14:57:04 -0700
-Subject: [PATCH] phi3 sliding window
-
----
- src/llama.cpp | 6 +++---
- 1 file changed, 3 insertions(+), 3 deletions(-)
-
-diff --git a/src/llama.cpp b/src/llama.cpp
-index a207451f..f2872d4e 100644
---- a/src/llama.cpp
-+++ b/src/llama.cpp
-@@ -4893,7 +4893,7 @@ static void llm_load_hparams(
-             } break;
-         case LLM_ARCH_PHI3:
-             {
--                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
-+                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
-                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- 
-                 switch (hparams.n_layer) {
-@@ -10762,7 +10762,7 @@ struct llm_build_context {
-         struct ggml_tensor * inp_pos = build_inp_pos();
- 
-         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
--        struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa();
-+        struct ggml_tensor * KQ_mask = hparams.n_swa > 0 ? build_inp_KQ_mask_swa() : build_inp_KQ_mask();
- 
-         for (int il = 0; il < n_layer; ++il) {
-             auto residual = inpL;
-@@ -10820,7 +10820,7 @@ struct llm_build_context {
- 
-                 cur = llm_build_kv(ctx0, lctx, kv_self, gf,
-                         model.layers[il].wo, model.layers[il].bo,
--                        Kcur, Vcur, Qcur, KQ_mask_swa, n_tokens, kv_head, n_kv, 1.0f, cb, il);
-+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
-             }
- 
-             if (il == n_layer - 1) {
--- 
-2.45.2
-
--- a/llm/server.go
+++ b/llm/server.go
@@ -33,7 +33,7 @@ type LlamaServer interface {
 	Ping(ctx context.Context) error
 	WaitUntilRunning(ctx context.Context) error
 	Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error
-	Embed(ctx context.Context, input []string) (*EmbedResponse, error)
+	Embed(ctx context.Context, input []string) ([][]float32, error)
 	Tokenize(ctx context.Context, content string) ([]int, error)
 	Detokenize(ctx context.Context, tokens []int) (string, error)
 	Close() error
@@ -163,7 +163,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 	} else {
 		servers = serversForGpu(gpus[0]) // All GPUs in the list are matching Library and Variant
 	}
-	demandLib := envconfig.LLMLibrary()
+	demandLib := envconfig.LLMLibrary
 	if demandLib != "" {
 		serverPath := availableServers[demandLib]
 		if serverPath == "" {
@@ -195,7 +195,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		params = append(params, "--n-gpu-layers", fmt.Sprintf("%d", opts.NumGPU))
 	}

-	if envconfig.Debug() {
+	if envconfig.Debug {
 		params = append(params, "--verbose")
 	}

@@ -221,7 +221,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		params = append(params, "--memory-f32")
 	}

-	flashAttnEnabled := envconfig.FlashAttention()
+	flashAttnEnabled := envconfig.FlashAttention

 	for _, g := range gpus {
 		// only cuda (compute capability 7+) and metal support flash attention
@@ -346,7 +346,6 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		s.cmd.Env = os.Environ()
 		s.cmd.Stdout = os.Stdout
 		s.cmd.Stderr = s.status
-		s.cmd.SysProcAttr = LlamaServerSysProcAttr

 		envWorkarounds := [][2]string{}
 		for _, gpu := range gpus {
@@ -382,7 +381,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		}

 		slog.Info("starting llama server", "cmd", s.cmd.String())
-		if envconfig.Debug() {
+		if envconfig.Debug {
 			filteredEnv := []string{}
 			for _, ev := range s.cmd.Env {
 				if strings.HasPrefix(ev, "CUDA_") ||
@@ -727,7 +726,6 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
 		"temperature":       req.Options.Temperature,
 		"top_k":             req.Options.TopK,
 		"top_p":             req.Options.TopP,
-		"min_p":             req.Options.MinP,
 		"tfs_z":             req.Options.TFSZ,
 		"typical_p":         req.Options.TypicalP,
 		"repeat_last_n":     req.Options.RepeatLastN,
@@ -879,11 +877,10 @@ type EmbedRequest struct {
 }

 type EmbedResponse struct {
-	Embedding       [][]float32 `json:"embedding"`
-	PromptEvalCount int         `json:"prompt_n"`
+	Embedding [][]float32 `json:"embedding"`
 }

-func (s *llmServer) Embed(ctx context.Context, input []string) (*EmbedResponse, error) {
+func (s *llmServer) Embed(ctx context.Context, input []string) ([][]float32, error) {
 	if err := s.sem.Acquire(ctx, 1); err != nil {
 		slog.Error("Failed to acquire semaphore", "error", err)
 		return nil, err
@@ -925,12 +922,12 @@ func (s *llmServer) Embed(ctx context.Context, input []string) (*EmbedResponse,
 		return nil, fmt.Errorf("%s", body)
 	}

-	var e EmbedResponse
-	if err := json.Unmarshal(body, &e); err != nil {
+	var embedding EmbedResponse
+	if err := json.Unmarshal(body, &embedding); err != nil {
 		return nil, fmt.Errorf("unmarshal tokenize response: %w", err)
 	}

-	return &e, nil
+	return embedding.Embedding, nil
 }

 type TokenizeRequest struct {
--- a/macapp/src/app.tsx
+++ b/macapp/src/app.tsx
@@ -19,7 +19,7 @@ export default function () {
  const [step, setStep] = useState<Step>(Step.WELCOME)
  const [commandCopied, setCommandCopied] = useState<boolean>(false)

-  const command = 'ollama run llama3.1'
+  const command = 'ollama run llama3'

  return (
    <div className='drag'>
--- a/openai/openai.go
+++ b/openai/openai.go
@@ -218,9 +218,6 @@ func toChatCompletion(id string, r api.ChatResponse) ChatCompletion {
 			Index:   0,
 			Message: Message{Role: r.Message.Role, Content: r.Message.Content, ToolCalls: toolCalls},
 			FinishReason: func(reason string) *string {
-				if len(toolCalls) > 0 {
-					reason = "tool_calls"
-				}
 				if len(reason) > 0 {
 					return &reason
 				}
--- a/parser/parser_test.go
+++ b/parser/parser_test.go
@@ -451,7 +451,6 @@ func TestParseFileParameters(t *testing.T) {
 		"num_predict 1":                {"num_predict", "1"},
 		"top_k 1":                      {"top_k", "1"},
 		"top_p 1.0":                    {"top_p", "1.0"},
-		"min_p 0.05":                   {"min_p", "0.05"},
 		"tfs_z 1.0":                    {"tfs_z", "1.0"},
 		"typical_p 1.0":                {"typical_p", "1.0"},
 		"repeat_last_n 1":              {"repeat_last_n", "1"},
--- a/scripts/install.sh
+++ b/scripts/install.sh
@@ -198,29 +198,19 @@ if check_gpu lspci amdgpu || check_gpu lshw amdgpu; then
    exit 0
 fi

-CUDA_REPO_ERR_MSG="NVIDIA GPU detected, but your OS and Architecture are not supported by NVIDIA.  Please install the CUDA driver manually https://docs.nvidia.com/cuda/cuda-installation-guide-linux/"
 # ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#rhel-7-centos-7
 # ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#rhel-8-rocky-8
 # ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#rhel-9-rocky-9
 # ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#fedora
 install_cuda_driver_yum() {
    status 'Installing NVIDIA repository...'
-    
    case $PACKAGE_MANAGER in
        yum)
            $SUDO $PACKAGE_MANAGER -y install yum-utils
-            if curl -I --silent --fail --location "https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo" >/dev/null ; then
-                $SUDO $PACKAGE_MANAGER-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo
-            else
-                error $CUDA_REPO_ERR_MSG
-            fi
+            $SUDO $PACKAGE_MANAGER-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo
            ;;
        dnf)
-            if curl -I --silent --fail --location "https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo" >/dev/null ; then
-                $SUDO $PACKAGE_MANAGER config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo
-            else
-                error $CUDA_REPO_ERR_MSG
-            fi
+            $SUDO $PACKAGE_MANAGER config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo
            ;;
    esac

@@ -245,11 +235,7 @@ install_cuda_driver_yum() {
 # ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#debian
 install_cuda_driver_apt() {
    status 'Installing NVIDIA repository...'
-    if curl -I --silent --fail --location "https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-keyring_1.1-1_all.deb" >/dev/null ; then
-        curl -fsSL -o $TEMP_DIR/cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-keyring_1.1-1_all.deb
-    else
-        error $CUDA_REPO_ERR_MSG
-    fi
+    curl -fsSL -o $TEMP_DIR/cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-keyring_1.1-1_all.deb

    case $1 in
        debian)
--- a/server/auth.go
+++ b/server/auth.go
@@ -67,7 +67,7 @@ func getAuthorizationToken(ctx context.Context, challenge registryChallenge) (st

 	headers.Add("Authorization", signature)

-	response, err := makeRequest(ctx, http.MethodGet, redirectURL, headers, nil, &registryOptions{})
+	response, err := makeRequest(ctx, http.MethodGet, redirectURL, headers, nil, nil)
 	if err != nil {
 		return "", err
 	}
--- a/server/download.go
+++ b/server/download.go
@@ -44,53 +44,21 @@ type blobDownload struct {

 	context.CancelFunc

-	done       chan struct{}
+	done       bool
 	err        error
 	references atomic.Int32
 }

 type blobDownloadPart struct {
-	N         int
-	Offset    int64
-	Size      int64
-	Completed atomic.Int64
-
-	lastUpdatedMu sync.Mutex
-	lastUpdated   time.Time
+	N           int
+	Offset      int64
+	Size        int64
+	Completed   int64
+	lastUpdated time.Time

 	*blobDownload `json:"-"`
 }

-type jsonBlobDownloadPart struct {
-	N         int
-	Offset    int64
-	Size      int64
-	Completed int64
-}
-
-func (p *blobDownloadPart) MarshalJSON() ([]byte, error) {
-	return json.Marshal(jsonBlobDownloadPart{
-		N:         p.N,
-		Offset:    p.Offset,
-		Size:      p.Size,
-		Completed: p.Completed.Load(),
-	})
-}
-
-func (p *blobDownloadPart) UnmarshalJSON(b []byte) error {
-	var j jsonBlobDownloadPart
-	if err := json.Unmarshal(b, &j); err != nil {
-		return err
-	}
-	*p = blobDownloadPart{
-		N:      j.N,
-		Offset: j.Offset,
-		Size:   j.Size,
-	}
-	p.Completed.Store(j.Completed)
-	return nil
-}
-
 const (
 	numDownloadParts          = 64
 	minDownloadPartSize int64 = 100 * format.MegaByte
@@ -104,7 +72,7 @@ func (p *blobDownloadPart) Name() string {
 }

 func (p *blobDownloadPart) StartsAt() int64 {
-	return p.Offset + p.Completed.Load()
+	return p.Offset + p.Completed
 }

 func (p *blobDownloadPart) StopsAt() int64 {
@@ -114,9 +82,7 @@ func (p *blobDownloadPart) StopsAt() int64 {
 func (p *blobDownloadPart) Write(b []byte) (n int, err error) {
 	n = len(b)
 	p.blobDownload.Completed.Add(int64(n))
-	p.lastUpdatedMu.Lock()
 	p.lastUpdated = time.Now()
-	p.lastUpdatedMu.Unlock()
 	return n, nil
 }

@@ -126,8 +92,6 @@ func (b *blobDownload) Prepare(ctx context.Context, requestURL *url.URL, opts *r
 		return err
 	}

-	b.done = make(chan struct{})
-
 	for _, partFilePath := range partFilePaths {
 		part, err := b.readPart(partFilePath)
 		if err != nil {
@@ -135,7 +99,7 @@ func (b *blobDownload) Prepare(ctx context.Context, requestURL *url.URL, opts *r
 		}

 		b.Total += part.Size
-		b.Completed.Add(part.Completed.Load())
+		b.Completed.Add(part.Completed)
 		b.Parts = append(b.Parts, part)
 	}

@@ -175,7 +139,6 @@ func (b *blobDownload) Prepare(ctx context.Context, requestURL *url.URL, opts *r
 }

 func (b *blobDownload) Run(ctx context.Context, requestURL *url.URL, opts *registryOptions) {
-	defer close(b.done)
 	b.err = b.run(ctx, requestURL, opts)
 }

@@ -267,7 +230,7 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *regis
 	g.SetLimit(numDownloadParts)
 	for i := range b.Parts {
 		part := b.Parts[i]
-		if part.Completed.Load() == part.Size {
+		if part.Completed == part.Size {
 			continue
 		}

@@ -275,7 +238,7 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *regis
 			var err error
 			for try := 0; try < maxRetries; try++ {
 				w := io.NewOffsetWriter(file, part.StartsAt())
-				err = b.downloadChunk(inner, directURL, w, part)
+				err = b.downloadChunk(inner, directURL, w, part, opts)
 				switch {
 				case errors.Is(err, context.Canceled), errors.Is(err, syscall.ENOSPC):
 					// return immediately if the context is canceled or the device is out of space
@@ -316,31 +279,29 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *regis
 		return err
 	}

+	b.done = true
 	return nil
 }

-func (b *blobDownload) downloadChunk(ctx context.Context, requestURL *url.URL, w io.Writer, part *blobDownloadPart) error {
+func (b *blobDownload) downloadChunk(ctx context.Context, requestURL *url.URL, w io.Writer, part *blobDownloadPart, opts *registryOptions) error {
 	g, ctx := errgroup.WithContext(ctx)
 	g.Go(func() error {
-		req, err := http.NewRequestWithContext(ctx, http.MethodGet, requestURL.String(), nil)
-		if err != nil {
-			return err
-		}
-		req.Header.Set("Range", fmt.Sprintf("bytes=%d-%d", part.StartsAt(), part.StopsAt()-1))
-		resp, err := http.DefaultClient.Do(req)
+		headers := make(http.Header)
+		headers.Set("Range", fmt.Sprintf("bytes=%d-%d", part.StartsAt(), part.StopsAt()-1))
+		resp, err := makeRequestWithRetry(ctx, http.MethodGet, requestURL, headers, nil, opts)
 		if err != nil {
 			return err
 		}
 		defer resp.Body.Close()

-		n, err := io.CopyN(w, io.TeeReader(resp.Body, part), part.Size-part.Completed.Load())
+		n, err := io.CopyN(w, io.TeeReader(resp.Body, part), part.Size-part.Completed)
 		if err != nil && !errors.Is(err, context.Canceled) && !errors.Is(err, io.ErrUnexpectedEOF) {
 			// rollback progress
 			b.Completed.Add(-n)
 			return err
 		}

-		part.Completed.Add(n)
+		part.Completed += n
 		if err := b.writePart(part.Name(), part); err != nil {
 			return err
 		}
@@ -354,21 +315,15 @@ func (b *blobDownload) downloadChunk(ctx context.Context, requestURL *url.URL, w
 		for {
 			select {
 			case <-ticker.C:
-				if part.Completed.Load() >= part.Size {
+				if part.Completed >= part.Size {
 					return nil
 				}

-				part.lastUpdatedMu.Lock()
-				lastUpdated := part.lastUpdated
-				part.lastUpdatedMu.Unlock()
-
-				if !lastUpdated.IsZero() && time.Since(lastUpdated) > 5*time.Second {
+				if !part.lastUpdated.IsZero() && time.Since(part.lastUpdated) > 5*time.Second {
 					const msg = "%s part %d stalled; retrying. If this persists, press ctrl-c to exit, then 'ollama pull' to find a faster connection."
 					slog.Info(fmt.Sprintf(msg, b.Digest[7:19], part.N))
 					// reset last updated
-					part.lastUpdatedMu.Lock()
 					part.lastUpdated = time.Time{}
-					part.lastUpdatedMu.Unlock()
 					return errPartStalled
 				}
 			case <-ctx.Done():
@@ -433,8 +388,6 @@ func (b *blobDownload) Wait(ctx context.Context, fn func(api.ProgressResponse))
 	ticker := time.NewTicker(60 * time.Millisecond)
 	for {
 		select {
-		case <-b.done:
-			return b.err
 		case <-ticker.C:
 			fn(api.ProgressResponse{
 				Status:    fmt.Sprintf("pulling %s", b.Digest[7:19]),
@@ -442,6 +395,10 @@ func (b *blobDownload) Wait(ctx context.Context, fn func(api.ProgressResponse))
 				Total:     b.Total,
 				Completed: b.Completed.Load(),
 			})
+
+			if b.done || b.err != nil {
+				return b.err
+			}
 		case <-ctx.Done():
 			return ctx.Err()
 		}
--- a/server/images.go
+++ b/server/images.go
@@ -70,7 +70,7 @@ type Model struct {
 	License        []string
 	Digest         string
 	Options        map[string]interface{}
-	Messages       []api.Message
+	Messages       []Message

 	Template *template.Template
 }
@@ -191,6 +191,11 @@ func (m *Model) String() string {
 	return modelfile.String()
 }

+type Message struct {
+	Role    string `json:"role"`
+	Content string `json:"content"`
+}
+
 type ConfigV2 struct {
 	ModelFormat   string   `json:"model_format"`
 	ModelFamily   string   `json:"model_family"`
@@ -641,7 +646,7 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
 		return err
 	}

-	if !envconfig.NoPrune() && old != nil {
+	if !envconfig.NoPrune && old != nil {
 		if err := old.RemoveLayers(); err != nil {
 			return err
 		}
@@ -880,7 +885,7 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
 	// build deleteMap to prune unused layers
 	deleteMap := make(map[string]struct{})

-	if !envconfig.NoPrune() {
+	if !envconfig.NoPrune {
 		manifest, _, err = GetManifest(mp)
 		if err != nil && !errors.Is(err, os.ErrNotExist) {
 			return err
--- a/server/manifest_test.go
+++ b/server/manifest_test.go
@@ -7,6 +7,7 @@ import (
 	"slices"
 	"testing"

+	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/types/model"
 )

@@ -107,6 +108,7 @@ func TestManifests(t *testing.T) {
 		t.Run(n, func(t *testing.T) {
 			d := t.TempDir()
 			t.Setenv("OLLAMA_MODELS", d)
+			envconfig.LoadConfig()

 			for _, p := range wants.ps {
 				createManifest(t, d, p)
--- a/server/modelpath.go
+++ b/server/modelpath.go
@@ -105,7 +105,9 @@ func (mp ModelPath) GetShortTagname() string {

 // GetManifestPath returns the path to the manifest file for the given model path, it is up to the caller to create the directory if it does not exist.
 func (mp ModelPath) GetManifestPath() (string, error) {
-	return filepath.Join(envconfig.Models(), "manifests", mp.Registry, mp.Namespace, mp.Repository, mp.Tag), nil
+	dir := envconfig.ModelsDir
+
+	return filepath.Join(dir, "manifests", mp.Registry, mp.Namespace, mp.Repository, mp.Tag), nil
 }

 func (mp ModelPath) BaseURL() *url.URL {
@@ -116,7 +118,9 @@ func (mp ModelPath) BaseURL() *url.URL {
 }

 func GetManifestPath() (string, error) {
-	path := filepath.Join(envconfig.Models(), "manifests")
+	dir := envconfig.ModelsDir
+
+	path := filepath.Join(dir, "manifests")
 	if err := os.MkdirAll(path, 0o755); err != nil {
 		return "", err
 	}
@@ -125,6 +129,8 @@ func GetManifestPath() (string, error) {
 }

 func GetBlobsPath(digest string) (string, error) {
+	dir := envconfig.ModelsDir
+
 	// only accept actual sha256 digests
 	pattern := "^sha256[:-][0-9a-fA-F]{64}$"
 	re := regexp.MustCompile(pattern)
@@ -134,7 +140,7 @@ func GetBlobsPath(digest string) (string, error) {
 	}

 	digest = strings.ReplaceAll(digest, ":", "-")
-	path := filepath.Join(envconfig.Models(), "blobs", digest)
+	path := filepath.Join(dir, "blobs", digest)
 	dirPath := filepath.Dir(path)
 	if digest == "" {
 		dirPath = path
--- a/server/modelpath_test.go
+++ b/server/modelpath_test.go
@@ -7,6 +7,8 @@ import (

 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
+
+	"github.com/ollama/ollama/envconfig"
 )

 func TestGetBlobsPath(t *testing.T) {
@@ -61,6 +63,7 @@ func TestGetBlobsPath(t *testing.T) {
 	for _, tc := range tests {
 		t.Run(tc.name, func(t *testing.T) {
 			t.Setenv("OLLAMA_MODELS", dir)
+			envconfig.LoadConfig()

 			got, err := GetBlobsPath(tc.digest)

--- a/server/routes.go
+++ b/server/routes.go
@@ -164,6 +164,17 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 			}
 		}

+		var b bytes.Buffer
+		if req.Context != nil {
+			s, err := r.Detokenize(c.Request.Context(), req.Context)
+			if err != nil {
+				c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+				return
+			}
+
+			b.WriteString(s)
+		}
+
 		var values template.Values
 		if req.Suffix != "" {
 			values.Prompt = prompt
@@ -176,10 +187,6 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 				msgs = append(msgs, api.Message{Role: "system", Content: m.System})
 			}

-			if req.Context == nil {
-				msgs = append(msgs, m.Messages...)
-			}
-
 			for _, i := range images {
 				msgs = append(msgs, api.Message{Role: "user", Content: fmt.Sprintf("[img-%d]", i.ID)})
 			}
@@ -187,22 +194,11 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 			values.Messages = append(msgs, api.Message{Role: "user", Content: req.Prompt})
 		}

-		var b bytes.Buffer
 		if err := tmpl.Execute(&b, values); err != nil {
 			c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 			return
 		}

-		if req.Context != nil {
-			s, err := r.Detokenize(c.Request.Context(), req.Context)
-			if err != nil {
-				c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
-				return
-			}
-
-			b.WriteString(s)
-		}
-
 		prompt = b.String()
 	}

@@ -288,7 +284,6 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 }

 func (s *Server) EmbedHandler(c *gin.Context) {
-	checkpointStart := time.Now()
 	var req api.EmbedRequest
 	err := c.ShouldBindJSON(&req)
 	switch {
@@ -337,8 +332,6 @@ func (s *Server) EmbedHandler(c *gin.Context) {
 		return
 	}

-	checkpointLoaded := time.Now()
-
 	kvData, err := getKVData(m.ModelPath, false)
 	if err != nil {
 		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
@@ -377,16 +370,13 @@ func (s *Server) EmbedHandler(c *gin.Context) {
 		return
 	}

-	for i, e := range embeddings.Embedding {
-		embeddings.Embedding[i] = normalize(e)
+	for i, e := range embeddings {
+		embeddings[i] = normalize(e)
 	}

 	resp := api.EmbedResponse{
-		Model:           req.Model,
-		Embeddings:      embeddings.Embedding,
-		TotalDuration:   time.Since(checkpointStart),
-		LoadDuration:    checkpointLoaded.Sub(checkpointStart),
-		PromptEvalCount: embeddings.PromptEvalCount,
+		Model:      req.Model,
+		Embeddings: embeddings,
 	}
 	c.JSON(http.StatusOK, resp)
 }
@@ -438,9 +428,9 @@ func (s *Server) EmbeddingsHandler(c *gin.Context) {
 		return
 	}

-	embedding := make([]float64, len(embeddings.Embedding[0]))
+	embedding := make([]float64, len(embeddings[0]))

-	for i, v := range embeddings.Embedding[0] {
+	for i, v := range embeddings[0] {
 		embedding[i] = float64(v)
 	}

@@ -1057,7 +1047,7 @@ func (s *Server) GenerateRoutes() http.Handler {
 	for _, prop := range openAIProperties {
 		config.AllowHeaders = append(config.AllowHeaders, "x-stainless-"+prop)
 	}
-	config.AllowOrigins = envconfig.Origins()
+	config.AllowOrigins = envconfig.AllowOrigins

 	r := gin.Default()
 	r.Use(
@@ -1102,7 +1092,7 @@ func (s *Server) GenerateRoutes() http.Handler {

 func Serve(ln net.Listener) error {
 	level := slog.LevelInfo
-	if envconfig.Debug() {
+	if envconfig.Debug {
 		level = slog.LevelDebug
 	}

@@ -1130,7 +1120,7 @@ func Serve(ln net.Listener) error {
 		return err
 	}

-	if !envconfig.NoPrune() {
+	if !envconfig.NoPrune {
 		// clean up unused layers and manifests
 		if err := PruneLayers(); err != nil {
 			return err
@@ -1333,12 +1323,11 @@ func (s *Server) ChatHandler(c *gin.Context) {
 		return
 	}

-	msgs := append(m.Messages, req.Messages...)
 	if req.Messages[0].Role != "system" && m.System != "" {
-		msgs = append([]api.Message{{Role: "system", Content: m.System}}, msgs...)
+		req.Messages = append([]api.Message{{Role: "system", Content: m.System}}, req.Messages...)
 	}

-	prompt, images, err := chatPrompt(c.Request.Context(), m, r.Tokenize, opts, msgs, req.Tools)
+	prompt, images, err := chatPrompt(c.Request.Context(), m, r.Tokenize, opts, req.Messages, req.Tools)
 	if err != nil {
 		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		return
--- a/server/routes_create_test.go
+++ b/server/routes_create_test.go
@@ -15,6 +15,7 @@ import (

 	"github.com/gin-gonic/gin"
 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/llm"
 )

@@ -88,6 +89,7 @@ func TestCreateFromBin(t *testing.T) {

 	p := t.TempDir()
 	t.Setenv("OLLAMA_MODELS", p)
+	envconfig.LoadConfig()

 	var s Server
 	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -115,6 +117,7 @@ func TestCreateFromModel(t *testing.T) {

 	p := t.TempDir()
 	t.Setenv("OLLAMA_MODELS", p)
+	envconfig.LoadConfig()
 	var s Server

 	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -157,6 +160,7 @@ func TestCreateRemovesLayers(t *testing.T) {

 	p := t.TempDir()
 	t.Setenv("OLLAMA_MODELS", p)
+	envconfig.LoadConfig()
 	var s Server

 	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -205,6 +209,7 @@ func TestCreateUnsetsSystem(t *testing.T) {

 	p := t.TempDir()
 	t.Setenv("OLLAMA_MODELS", p)
+	envconfig.LoadConfig()
 	var s Server

 	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -262,6 +267,7 @@ func TestCreateMergeParameters(t *testing.T) {

 	p := t.TempDir()
 	t.Setenv("OLLAMA_MODELS", p)
+	envconfig.LoadConfig()
 	var s Server

 	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -366,6 +372,7 @@ func TestCreateReplacesMessages(t *testing.T) {

 	p := t.TempDir()
 	t.Setenv("OLLAMA_MODELS", p)
+	envconfig.LoadConfig()
 	var s Server

 	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -443,6 +450,7 @@ func TestCreateTemplateSystem(t *testing.T) {

 	p := t.TempDir()
 	t.Setenv("OLLAMA_MODELS", p)
+	envconfig.LoadConfig()
 	var s Server

 	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -526,6 +534,7 @@ func TestCreateLicenses(t *testing.T) {

 	p := t.TempDir()
 	t.Setenv("OLLAMA_MODELS", p)
+	envconfig.LoadConfig()
 	var s Server

 	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -573,6 +582,7 @@ func TestCreateDetectTemplate(t *testing.T) {

 	p := t.TempDir()
 	t.Setenv("OLLAMA_MODELS", p)
+	envconfig.LoadConfig()
 	var s Server

 	t.Run("matched", func(t *testing.T) {
--- a/server/routes_delete_test.go
+++ b/server/routes_delete_test.go
@@ -10,6 +10,7 @@ import (

 	"github.com/gin-gonic/gin"
 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/types/model"
 )

@@ -18,6 +19,7 @@ func TestDelete(t *testing.T) {

 	p := t.TempDir()
 	t.Setenv("OLLAMA_MODELS", p)
+	envconfig.LoadConfig()

 	var s Server

--- a/server/routes_list_test.go
+++ b/server/routes_list_test.go
@@ -9,12 +9,14 @@ import (

 	"github.com/gin-gonic/gin"
 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/envconfig"
 )

 func TestList(t *testing.T) {
 	gin.SetMode(gin.TestMode)

 	t.Setenv("OLLAMA_MODELS", t.TempDir())
+	envconfig.LoadConfig()

 	expectNames := []string{
 		"mistral:7b-instruct-q4_0",
--- a/server/routes_test.go
+++ b/server/routes_test.go
@@ -19,6 +19,7 @@ import (
 	"github.com/stretchr/testify/require"

 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/openai"
 	"github.com/ollama/ollama/parser"
@@ -346,6 +347,7 @@ func Test_Routes(t *testing.T) {
 	}

 	t.Setenv("OLLAMA_MODELS", t.TempDir())
+	envconfig.LoadConfig()

 	s := &Server{}
 	router := s.GenerateRoutes()
@@ -376,6 +378,7 @@ func Test_Routes(t *testing.T) {

 func TestCase(t *testing.T) {
 	t.Setenv("OLLAMA_MODELS", t.TempDir())
+	envconfig.LoadConfig()

 	cases := []string{
 		"mistral",
@@ -455,6 +458,7 @@ func TestCase(t *testing.T) {

 func TestShow(t *testing.T) {
 	t.Setenv("OLLAMA_MODELS", t.TempDir())
+	envconfig.LoadConfig()

 	var s Server

--- a/server/sched.go
+++ b/server/sched.go
@@ -5,11 +5,9 @@ import (
 	"errors"
 	"fmt"
 	"log/slog"
-	"os"
 	"reflect"
 	"runtime"
 	"sort"
-	"strconv"
 	"strings"
 	"sync"
 	"time"
@@ -61,12 +59,11 @@ var defaultParallel = 4
 var ErrMaxQueue = fmt.Errorf("server busy, please try again.  maximum pending requests exceeded")

 func InitScheduler(ctx context.Context) *Scheduler {
-	maxQueue := envconfig.MaxQueue()
 	sched := &Scheduler{
-		pendingReqCh:  make(chan *LlmRequest, maxQueue),
-		finishedReqCh: make(chan *LlmRequest, maxQueue),
-		expiredCh:     make(chan *runnerRef, maxQueue),
-		unloadedCh:    make(chan interface{}, maxQueue),
+		pendingReqCh:  make(chan *LlmRequest, envconfig.MaxQueuedRequests),
+		finishedReqCh: make(chan *LlmRequest, envconfig.MaxQueuedRequests),
+		expiredCh:     make(chan *runnerRef, envconfig.MaxQueuedRequests),
+		unloadedCh:    make(chan interface{}, envconfig.MaxQueuedRequests),
 		loaded:        make(map[string]*runnerRef),
 		newServerFn:   llm.NewLlamaServer,
 		getGpuFn:      gpu.GetGPUInfo,
@@ -129,12 +126,14 @@ func (s *Scheduler) processPending(ctx context.Context) {
 				slog.Debug("pending request cancelled or timed out, skipping scheduling")
 				continue
 			}
-			numParallel := int(envconfig.NumParallel())
+			numParallel := envconfig.NumParallel
 			// TODO (jmorganca): multimodal models don't support parallel yet
 			// see https://github.com/ollama/ollama/issues/4165
 			if len(pending.model.ProjectorPaths) > 0 && numParallel != 1 {
 				numParallel = 1
 				slog.Warn("multimodal models don't support parallel requests yet")
+			} else if strings.Contains(pending.model.Config.ModelFamily, "bert") {
+				numParallel = runtime.NumCPU()
 			}

 			for {
@@ -151,7 +150,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 						pending.useLoadedRunner(runner, s.finishedReqCh)
 						break
 					}
-				} else if envconfig.MaxRunners() > 0 && loadedCount >= int(envconfig.MaxRunners()) {
+				} else if envconfig.MaxRunners > 0 && loadedCount >= envconfig.MaxRunners {
 					slog.Debug("max runners achieved, unloading one to make room", "runner_count", loadedCount)
 					runnerToExpire = s.findRunnerToUnload()
 				} else {
@@ -164,7 +163,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 						gpus = s.getGpuFn()
 					}

-					if envconfig.MaxRunners() <= 0 {
+					if envconfig.MaxRunners <= 0 {
 						// No user specified MaxRunners, so figure out what automatic setting to use
 						// If all GPUs have reliable free memory reporting, defaultModelsPerGPU * the number of GPUs
 						// if any GPU has unreliable free memory reporting, 1x the number of GPUs
@@ -176,13 +175,11 @@ func (s *Scheduler) processPending(ctx context.Context) {
 							}
 						}
 						if allReliable {
-							// HACK
-							os.Setenv("OLLAMA_MAX_LOADED_MODELS", strconv.Itoa(defaultModelsPerGPU*len(gpus)))
+							envconfig.MaxRunners = defaultModelsPerGPU * len(gpus)
 							slog.Debug("updating default concurrency", "OLLAMA_MAX_LOADED_MODELS", envconfig.MaxRunners, "gpu_count", len(gpus))
 						} else {
-							// HACK
-							os.Setenv("OLLAMA_MAX_LOADED_MODELS", strconv.Itoa(len(gpus)))
 							slog.Info("one or more GPUs detected that are unable to accurately report free memory - disabling default concurrency")
+							envconfig.MaxRunners = len(gpus)
 						}
 					}

@@ -217,12 +214,9 @@ func (s *Scheduler) processPending(ctx context.Context) {
 					} else if loadedCount == 0 {
 						// No models loaded. Load the model but prefer the best fit.
 						slog.Debug("loading first model", "model", pending.model.ModelPath)
-						g := pickBestFullFitByLibrary(pending, ggml, gpus, &numParallel)
+						g := pickBestFitGPUs(pending, ggml, gpus, &numParallel)
 						if g != nil {
 							gpus = g
-						} else {
-							// Only allow partial loads when this is the first model
-							gpus = pickBestPartialFitByLibrary(pending, ggml, gpus, &numParallel)
 						}
 						s.loadFn(pending, ggml, gpus, numParallel)
 						break
@@ -239,7 +233,7 @@ func (s *Scheduler) processPending(ctx context.Context) {

 						// Update free memory from currently loaded models
 						s.updateFreeSpace(availGpus)
-						fitGpus := pickBestFullFitByLibrary(pending, ggml, availGpus, &numParallel)
+						fitGpus := pickBestFitGPUs(pending, ggml, availGpus, &numParallel)
 						if fitGpus != nil {
 							slog.Debug("new model fits with existing models, loading")
 							s.loadFn(pending, ggml, fitGpus, numParallel)
@@ -409,7 +403,7 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList,
 	if numParallel < 1 {
 		numParallel = 1
 	}
-	sessionDuration := envconfig.KeepAlive()
+	sessionDuration := envconfig.KeepAlive
 	if req.sessionDuration != nil {
 		sessionDuration = req.sessionDuration.Duration
 	}
@@ -676,12 +670,11 @@ func (a ByDuration) Less(i, j int) bool {
 // func (a BySize) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
 // func (a BySize) Less(i, j int) bool { return a[i].estimatedVRAM < a[j].estimatedVRAM }

-// pickBestFullFitByLibrary will try to find the optimal placement of the model in the available GPUs where the model fully fits
-// The list of GPUs returned will always be the same brand (library)
+// pickBestFitGPUs will try to find the optimal placement of the model in the available GPUs where the model fully fits
 // If the model can not be fit fully within the available GPU(s) nil is returned
 // If numParallel is <= 0, this will attempt try to optimize parallism based on available VRAM, and adjust
 // opts.NumCtx accordingly
-func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
+func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
 	var estimatedVRAM uint64

 	var numParallelToTry []int
@@ -704,7 +697,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL
 		// First attempt to fit the model into a single GPU
 		for _, p := range numParallelToTry {
 			req.opts.NumCtx = req.origNumCtx * p
-			if !envconfig.SchedSpread() {
+			if !envconfig.SchedSpread {
 				for _, g := range sgl {
 					if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
 						slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
@@ -732,25 +725,6 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL
 	return nil
 }

-// If multiple Libraries are detected, pick the Library which loads the most layers for the model
-func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
-	*numParallel = 1
-	byLibrary := gpus.ByLibrary()
-	if len(byLibrary) <= 1 {
-		return gpus
-	}
-	var bestEstimate uint64
-	var bestFit int
-	for i, gl := range byLibrary {
-		_, estimatedVRAM := llm.PredictServerFit(gl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts)
-		if estimatedVRAM > bestEstimate {
-			bestEstimate = estimatedVRAM
-			bestFit = i
-		}
-	}
-	return byLibrary[bestFit]
-}
-
 // findRunnerToUnload finds a runner to unload to make room for a new model
 func (s *Scheduler) findRunnerToUnload() *runnerRef {
 	s.loadedMu.Lock()
--- a/server/sched_test.go
+++ b/server/sched_test.go
@@ -12,6 +12,7 @@ import (

 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/app/lifecycle"
+	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/gpu"
 	"github.com/ollama/ollama/llm"
@@ -271,7 +272,7 @@ func TestRequestsMultipleLoadedModels(t *testing.T) {
 	c.req.opts.NumGPU = 0                                       // CPU load, will be allowed
 	d := newScenarioRequest(t, ctx, "ollama-model-3c", 30, nil) // Needs prior unloaded

-	t.Setenv("OLLAMA_MAX_LOADED_MODELS", "1")
+	envconfig.MaxRunners = 1
 	s.newServerFn = a.newServer
 	slog.Info("a")
 	s.pendingReqCh <- a.req
@@ -290,7 +291,7 @@ func TestRequestsMultipleLoadedModels(t *testing.T) {
 	require.Len(t, s.loaded, 1)
 	s.loadedMu.Unlock()

-	t.Setenv("OLLAMA_MAX_LOADED_MODELS", "0")
+	envconfig.MaxRunners = 0
 	s.newServerFn = b.newServer
 	slog.Info("b")
 	s.pendingReqCh <- b.req
@@ -361,7 +362,7 @@ func TestGetRunner(t *testing.T) {
 	a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, &api.Duration{Duration: 2 * time.Millisecond})
 	b := newScenarioRequest(t, ctx, "ollama-model-1b", 10, &api.Duration{Duration: 2 * time.Millisecond})
 	c := newScenarioRequest(t, ctx, "ollama-model-1c", 10, &api.Duration{Duration: 2 * time.Millisecond})
-	t.Setenv("OLLAMA_MAX_QUEUE", "1")
+	envconfig.MaxQueuedRequests = 1
 	s := InitScheduler(ctx)
 	s.getGpuFn = getGpuFn
 	s.getCpuFn = getCpuFn
@@ -665,50 +666,11 @@ func TestAlreadyCanceled(t *testing.T) {
 	require.Empty(t, scenario1a.req.successCh)
 }

-func TestHomogeneousGPUs(t *testing.T) {
-	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
-	defer done()
-	s := InitScheduler(ctx)
-
-	s.getGpuFn = func() gpu.GpuInfoList {
-		// Set memory values to require the model to be spread
-		gpus := []gpu.GpuInfo{
-			{Library: "cuda"},
-			{Library: "rocm"},
-		}
-		gpus[0].TotalMemory = 1 * format.GibiByte
-		gpus[0].FreeMemory = 256 * format.MebiByte
-		gpus[1].TotalMemory = 1 * format.GibiByte
-		gpus[1].FreeMemory = 256 * format.MebiByte
-		return gpus
-	}
-	s.getCpuFn = getCpuFn
-	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
-	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
-		require.Len(t, gpus, 1)
-		return a.newServer(gpus, model, ggml, adapters, projectors, opts, numParallel)
-	}
-	slog.Info("a")
-	s.pendingReqCh <- a.req
-	require.Len(t, s.pendingReqCh, 1)
-	s.Run(ctx)
-	select {
-	case resp := <-a.req.successCh:
-		require.Equal(t, resp.llama, a.srv)
-		require.Empty(t, s.pendingReqCh)
-		require.Empty(t, a.req.errCh)
-	case err := <-a.req.errCh:
-		t.Fatal(err.Error())
-	case <-ctx.Done():
-		t.Fatal("timeout")
-	}
-}
-
 type mockLlm struct {
 	pingResp           error
 	waitResp           error
 	completionResp     error
-	embedResp          *llm.EmbedResponse
+	embedResp          [][]float32
 	embedRespErr       error
 	tokenizeResp       []int
 	tokenizeRespErr    error
@@ -726,7 +688,7 @@ func (s *mockLlm) WaitUntilRunning(ctx context.Context) error { return s.waitRes
 func (s *mockLlm) Completion(ctx context.Context, req llm.CompletionRequest, fn func(llm.CompletionResponse)) error {
 	return s.completionResp
 }
-func (s *mockLlm) Embed(ctx context.Context, input []string) (*llm.EmbedResponse, error) {
+func (s *mockLlm) Embed(ctx context.Context, input []string) ([][]float32, error) {
 	return s.embedResp, s.embedRespErr
 }
 func (s *mockLlm) Tokenize(ctx context.Context, content string) ([]int, error) {
--- a/server/upload.go
+++ b/server/upload.go
@@ -254,7 +254,7 @@ func (b *blobUpload) uploadPart(ctx context.Context, method string, requestURL *

 		// retry uploading to the redirect URL
 		for try := range maxRetries {
-			err = b.uploadPart(ctx, http.MethodPut, redirectURL, part, &registryOptions{})
+			err = b.uploadPart(ctx, http.MethodPut, redirectURL, part, nil)
 			switch {
 			case errors.Is(err, context.Canceled):
 				return err