fix the cpu estimatedTotal memory + get the expiry time for loading models (#4461 )

Merge pull request #4459 from dhiltgen/sanitize_env_log
Sanitize the env var debug log
2025-12-24 08:10:54 -05:00 · 2024-05-15 15:43:16 -07:00 · 2024-05-15 14:58:55 -07:00 · 2024-05-15 14:42:57 -07:00 · 2024-05-14 15:34:29 -07:00 · 2024-05-14 15:17:04 -07:00
116 changed files with 5068 additions and 2461 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -11,4 +11,5 @@ ggml-metal.metal
 .idea
 test_data
 *.crt
-llm/build
+llm/build
+__debug_bin*
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
 <div align="center">
-  <img alt="ollama" height="200px" src="https://github.com/ollama/ollama/assets/3325447/0d0b44e2-8f4a-4e99-9b52-a5c1c741c8f7">
+ <img alt="ollama" height="200px" src="https://github.com/ollama/ollama/assets/3325447/0d0b44e2-8f4a-4e99-9b52-a5c1c741c8f7">
 </div>

 # Ollama
@@ -51,7 +51,7 @@ Here are some example models that can be downloaded:
 | ------------------ | ---------- | ----- | ------------------------------ |
 | Llama 3            | 8B         | 4.7GB | `ollama run llama3`            |
 | Llama 3            | 70B        | 40GB  | `ollama run llama3:70b`        |
-| Phi-3              | 3,8B       | 2.3GB | `ollama run phi3`              |
+| Phi-3              | 3.8B       | 2.3GB | `ollama run phi3`              |
 | Mistral            | 7B         | 4.1GB | `ollama run mistral`           |
 | Neural Chat        | 7B         | 4.1GB | `ollama run neural-chat`       |
 | Starling           | 7B         | 4.1GB | `ollama run starling-lm`       |
@@ -173,7 +173,7 @@ I'm a basic program that prints the famous "Hello, world!" message to the consol
 The image features a yellow smiley face, which is likely the central focus of the picture.
 ```

-### Pass in prompt as arguments
+### Pass the prompt as an argument

 ```
 $ ollama run llama3 "Summarize this file: $(cat README.md)"
@@ -258,6 +258,7 @@ See the [API documentation](./docs/api.md) for all endpoints.

 - [Open WebUI](https://github.com/open-webui/open-webui)
 - [Enchanted (macOS native)](https://github.com/AugustDev/enchanted)
+- [Hollama](https://github.com/fmaclen/hollama)
 - [Lollms-Webui](https://github.com/ParisNeo/lollms-webui)
 - [LibreChat](https://github.com/danny-avila/LibreChat)
 - [Bionic GPT](https://github.com/bionic-gpt/bionic-gpt)
@@ -284,17 +285,20 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [OllamaGUI](https://github.com/enoch1118/ollamaGUI)
 - [OpenAOE](https://github.com/InternLM/OpenAOE)
 - [Odin Runes](https://github.com/leonid20000/OdinRunes)
- [LLM-X: Progressive Web App](https://github.com/mrdjohnson/llm-x)
+- [LLM-X](https://github.com/mrdjohnson/llm-x) (Progressive Web App)
 - [AnythingLLM (Docker + MacOs/Windows/Linux native app)](https://github.com/Mintplex-Labs/anything-llm)
 - [Ollama Basic Chat: Uses HyperDiv Reactive UI](https://github.com/rapidarchitect/ollama_basic_chat)
 - [Ollama-chats RPG](https://github.com/drazdra/ollama-chats)
- [QA-Pilot: Chat with Code Repository](https://github.com/reid41/QA-Pilot)
- [ChatOllama: Open Source Chatbot based on Ollama with Knowledge Bases](https://github.com/sugarforever/chat-ollama)
- [CRAG Ollama Chat: Simple Web Search with Corrective RAG](https://github.com/Nagi-ovo/CRAG-Ollama-Chat)
- [RAGFlow: Open-source Retrieval-Augmented Generation engine based on deep document understanding](https://github.com/infiniflow/ragflow)
- [chat: chat web app for teams](https://github.com/swuecho/chat)
+- [QA-Pilot](https://github.com/reid41/QA-Pilot) (Chat with Code Repository)
+- [ChatOllama](https://github.com/sugarforever/chat-ollama) (Open Source Chatbot based on Ollama with Knowledge Bases)
+- [CRAG Ollama Chat](https://github.com/Nagi-ovo/CRAG-Ollama-Chat) (Simple Web Search with Corrective RAG)
+- [RAGFlow](https://github.com/infiniflow/ragflow) (Open-source Retrieval-Augmented Generation engine based on deep document understanding)
+- [StreamDeploy](https://github.com/StreamDeploy-DevRel/streamdeploy-llm-app-scaffold) (LLM Application Scaffold)
+- [chat](https://github.com/swuecho/chat) (chat web app for teams)
 - [Lobe Chat](https://github.com/lobehub/lobe-chat) with [Integrating Doc](https://lobehub.com/docs/self-hosting/examples/ollama)
- [Ollama RAG Chatbot: Local Chat with multiples PDFs using Ollama and RAG.](https://github.com/datvodinh/rag-chatbot.git)
+- [Ollama RAG Chatbot](https://github.com/datvodinh/rag-chatbot.git) (Local Chat with multiple PDFs using Ollama and RAG)
+- [BrainSoup](https://www.nurgo-software.com/products/brainsoup) (Flexible native client with RAG & multi-agent automation)
+- [macai](https://github.com/Renset/macai) (macOS client for Ollama, ChatGPT, and other compatible API back-ends)

 ### Terminal

@@ -327,6 +331,7 @@ See the [API documentation](./docs/api.md) for all endpoints.

 - [Pacman](https://archlinux.org/packages/extra/x86_64/ollama/)
 - [Helm Chart](https://artifacthub.io/packages/helm/ollama-helm/ollama)
+- [Guix channel](https://codeberg.org/tusharhero/ollama-guix)

 ### Libraries

@@ -348,10 +353,13 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Haystack](https://github.com/deepset-ai/haystack-integrations/blob/main/integrations/ollama.md)
 - [Elixir LangChain](https://github.com/brainlid/langchain)
 - [Ollama for R - rollama](https://github.com/JBGruber/rollama)
+- [Ollama for R - ollama-r](https://github.com/hauselin/ollama-r)
 - [Ollama-ex for Elixir](https://github.com/lebrunel/ollama-ex)
 - [Ollama Connector for SAP ABAP](https://github.com/b-tocs/abap_btocs_ollama)
 - [Testcontainers](https://testcontainers.com/modules/ollama/)
-
+- [Portkey](https://portkey.ai/docs/welcome/integration-guides/ollama)
+- [PromptingTools.jl](https://github.com/svilupp/PromptingTools.jl) with an [example](https://svilupp.github.io/PromptingTools.jl/dev/examples/working_with_ollama)
+- [LlamaScript](https://github.com/Project-Llama/llamascript)
 ### Mobile

 - [Enchanted](https://github.com/AugustDev/enchanted)
@@ -370,12 +378,13 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Ollama Telegram Bot](https://github.com/ruecat/ollama-telegram)
 - [Hass Ollama Conversation](https://github.com/ej52/hass-ollama-conversation)
 - [Rivet plugin](https://github.com/abrenneke/rivet-plugin-ollama)
- [Llama Coder](https://github.com/ex3ndr/llama-coder) (Copilot alternative using Ollama)
 - [Obsidian BMO Chatbot plugin](https://github.com/longy2k/obsidian-bmo-chatbot)
 - [Cliobot](https://github.com/herval/cliobot) (Telegram bot with Ollama support)
 - [Copilot for Obsidian plugin](https://github.com/logancyang/obsidian-copilot)
 - [Obsidian Local GPT plugin](https://github.com/pfrankov/obsidian-local-gpt)
 - [Open Interpreter](https://docs.openinterpreter.com/language-model-setup/local-models/ollama)
+- [Llama Coder](https://github.com/ex3ndr/llama-coder) (Copilot alternative using Ollama)
+- [Ollama Copilot](https://github.com/bernardo-bruning/ollama-copilot) (Proxy that allows you to use ollama as a copilot like Github copilot)
 - [twinny](https://github.com/rjmacarthy/twinny) (Copilot and Copilot chat alternative using Ollama)
 - [Wingman-AI](https://github.com/RussellCanfield/wingman-ai) (Copilot code and chat alternative using Ollama and HuggingFace)
 - [Page Assist](https://github.com/n4ze3m/page-assist) (Chrome Extension)
@@ -384,4 +393,5 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Discord-Ollama Chat Bot](https://github.com/kevinthedang/discord-ollama) (Generalized TypeScript Discord Bot w/ Tuning Documentation)

 ### Supported backends 
- [llama.cpp](https://github.com/ggerganov/llama.cpp) project founded by Georgi Gerganov. 
+- [llama.cpp](https://github.com/ggerganov/llama.cpp) project founded by Georgi Gerganov.
+
--- a/api/client.go
+++ b/api/client.go
@@ -1,9 +1,16 @@
 // Package api implements the client-side API for code wishing to interact
 // with the ollama service. The methods of the [Client] type correspond to
-// the ollama REST API as described in https://github.com/ollama/ollama/blob/main/docs/api.md
-//
+// the ollama REST API as described in [the API documentation].
 // The ollama command-line client itself uses this package to interact with
 // the backend service.
+//
+// # Examples
+//
+// Several examples of using this package are available [in the GitHub
+// repository].
+//
+// [the API documentation]: https://github.com/ollama/ollama/blob/main/docs/api.md
+// [in the GitHub repository]: https://github.com/ollama/ollama/tree/main/examples
 package api

 import (
@@ -18,6 +25,7 @@ import (
 	"net/url"
 	"os"
 	"runtime"
+	"strconv"
 	"strings"

 	"github.com/ollama/ollama/format"
@@ -57,12 +65,36 @@ func checkError(resp *http.Response, body []byte) error {
 // If the variable is not specified, a default ollama host and port will be
 // used.
 func ClientFromEnvironment() (*Client, error) {
+	ollamaHost, err := GetOllamaHost()
+	if err != nil {
+		return nil, err
+	}
+
+	return &Client{
+		base: &url.URL{
+			Scheme: ollamaHost.Scheme,
+			Host:   net.JoinHostPort(ollamaHost.Host, ollamaHost.Port),
+		},
+		http: http.DefaultClient,
+	}, nil
+}
+
+type OllamaHost struct {
+	Scheme string
+	Host   string
+	Port   string
+}
+
+func GetOllamaHost() (OllamaHost, error) {
 	defaultPort := "11434"

-	scheme, hostport, ok := strings.Cut(os.Getenv("OLLAMA_HOST"), "://")
+	hostVar := os.Getenv("OLLAMA_HOST")
+	hostVar = strings.TrimSpace(strings.Trim(strings.TrimSpace(hostVar), "\"'"))
+
+	scheme, hostport, ok := strings.Cut(hostVar, "://")
 	switch {
 	case !ok:
-		scheme, hostport = "http", os.Getenv("OLLAMA_HOST")
+		scheme, hostport = "http", hostVar
 	case scheme == "http":
 		defaultPort = "80"
 	case scheme == "https":
@@ -82,12 +114,14 @@ func ClientFromEnvironment() (*Client, error) {
 		}
 	}

-	return &Client{
-		base: &url.URL{
-			Scheme: scheme,
-			Host:   net.JoinHostPort(host, port),
-		},
-		http: http.DefaultClient,
+	if portNum, err := strconv.ParseInt(port, 10, 32); err != nil || portNum > 65535 || portNum < 0 {
+		return OllamaHost{}, ErrInvalidHostPort
+	}
+
+	return OllamaHost{
+		Scheme: scheme,
+		Host:   host,
+		Port:   port,
 	}, nil
 }

@@ -272,8 +306,14 @@ func (c *Client) Pull(ctx context.Context, req *PullRequest, fn PullProgressFunc
 	})
 }

+// PushProgressFunc is a function that [Client.Push] invokes when progress is
+// made.
+// It's similar to other progress function types like [PullProgressFunc].
 type PushProgressFunc func(ProgressResponse) error

+// Push uploads a model to the model library; requires registering for ollama.ai
+// and adding a public key first. fn is called each time progress is made on
+// the request and can be used to display a progress bar, etc.
 func (c *Client) Push(ctx context.Context, req *PushRequest, fn PushProgressFunc) error {
 	return c.stream(ctx, http.MethodPost, "/api/push", req, func(bts []byte) error {
 		var resp ProgressResponse
@@ -285,8 +325,15 @@ func (c *Client) Push(ctx context.Context, req *PushRequest, fn PushProgressFunc
 	})
 }

+// CreateProgressFunc is a function that [Client.Create] invokes when progress
+// is made.
+// It's similar to other progress function types like [PullProgressFunc].
 type CreateProgressFunc func(ProgressResponse) error

+// Create creates a model from a [Modelfile]. fn is a progress function that
+// behaves similarly to other methods (see [Client.Pull]).
+//
+// [Modelfile]: https://github.com/ollama/ollama/blob/main/docs/modelfile.md
 func (c *Client) Create(ctx context.Context, req *CreateRequest, fn CreateProgressFunc) error {
 	return c.stream(ctx, http.MethodPost, "/api/create", req, func(bts []byte) error {
 		var resp ProgressResponse
@@ -298,6 +345,7 @@ func (c *Client) Create(ctx context.Context, req *CreateRequest, fn CreateProgre
 	})
 }

+// List lists models that are available locally.
 func (c *Client) List(ctx context.Context) (*ListResponse, error) {
 	var lr ListResponse
 	if err := c.do(ctx, http.MethodGet, "/api/tags", nil, &lr); err != nil {
@@ -306,6 +354,17 @@ func (c *Client) List(ctx context.Context) (*ListResponse, error) {
 	return &lr, nil
 }

+// List running models.
+func (c *Client) ListRunning(ctx context.Context) (*ListResponse, error) {
+	var lr ListResponse
+	if err := c.do(ctx, http.MethodGet, "/api/ps", nil, &lr); err != nil {
+		return nil, err
+	}
+	return &lr, nil
+}
+
+// Copy copies a model - creating a model with another name from an existing
+// model.
 func (c *Client) Copy(ctx context.Context, req *CopyRequest) error {
 	if err := c.do(ctx, http.MethodPost, "/api/copy", req, nil); err != nil {
 		return err
@@ -313,6 +372,7 @@ func (c *Client) Copy(ctx context.Context, req *CopyRequest) error {
 	return nil
 }

+// Delete deletes a model and its data.
 func (c *Client) Delete(ctx context.Context, req *DeleteRequest) error {
 	if err := c.do(ctx, http.MethodDelete, "/api/delete", req, nil); err != nil {
 		return err
@@ -320,6 +380,7 @@ func (c *Client) Delete(ctx context.Context, req *DeleteRequest) error {
 	return nil
 }

+// Show obtains model information, including details, modelfile, license etc.
 func (c *Client) Show(ctx context.Context, req *ShowRequest) (*ShowResponse, error) {
 	var resp ShowResponse
 	if err := c.do(ctx, http.MethodPost, "/api/show", req, &resp); err != nil {
@@ -328,12 +389,16 @@ func (c *Client) Show(ctx context.Context, req *ShowRequest) (*ShowResponse, err
 	return &resp, nil
 }

+// Hearbeat checks if the server has started and is responsive; if yes, it
+// returns nil, otherwise an error.
 func (c *Client) Heartbeat(ctx context.Context) error {
 	if err := c.do(ctx, http.MethodHead, "/", nil, nil); err != nil {
 		return err
 	}
 	return nil
 }
+
+// Embeddings generates embeddings from a model.
 func (c *Client) Embeddings(ctx context.Context, req *EmbeddingRequest) (*EmbeddingResponse, error) {
 	var resp EmbeddingResponse
 	if err := c.do(ctx, http.MethodPost, "/api/embeddings", req, &resp); err != nil {
@@ -342,10 +407,13 @@ func (c *Client) Embeddings(ctx context.Context, req *EmbeddingRequest) (*Embedd
 	return &resp, nil
 }

+// CreateBlob creates a blob from a file on the server. digest is the
+// expected SHA256 digest of the file, and r represents the file.
 func (c *Client) CreateBlob(ctx context.Context, digest string, r io.Reader) error {
 	return c.do(ctx, http.MethodPost, fmt.Sprintf("/api/blobs/%s", digest), r, nil)
 }

+// Version returns the Ollama server version as a string.
 func (c *Client) Version(ctx context.Context) (string, error) {
 	var version struct {
 		Version string `json:"version"`
--- a/api/client_test.go
+++ b/api/client_test.go
@@ -1,6 +1,12 @@
 package api

-import "testing"
+import (
+	"fmt"
+	"net"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)

 func TestClientFromEnvironment(t *testing.T) {
 	type testCase struct {
@@ -40,4 +46,40 @@ func TestClientFromEnvironment(t *testing.T) {
 			}
 		})
 	}
+
+	hostTestCases := map[string]*testCase{
+		"empty":               {value: "", expect: "127.0.0.1:11434"},
+		"only address":        {value: "1.2.3.4", expect: "1.2.3.4:11434"},
+		"only port":           {value: ":1234", expect: ":1234"},
+		"address and port":    {value: "1.2.3.4:1234", expect: "1.2.3.4:1234"},
+		"hostname":            {value: "example.com", expect: "example.com:11434"},
+		"hostname and port":   {value: "example.com:1234", expect: "example.com:1234"},
+		"zero port":           {value: ":0", expect: ":0"},
+		"too large port":      {value: ":66000", err: ErrInvalidHostPort},
+		"too small port":      {value: ":-1", err: ErrInvalidHostPort},
+		"ipv6 localhost":      {value: "[::1]", expect: "[::1]:11434"},
+		"ipv6 world open":     {value: "[::]", expect: "[::]:11434"},
+		"ipv6 no brackets":    {value: "::1", expect: "[::1]:11434"},
+		"ipv6 + port":         {value: "[::1]:1337", expect: "[::1]:1337"},
+		"extra space":         {value: " 1.2.3.4 ", expect: "1.2.3.4:11434"},
+		"extra quotes":        {value: "\"1.2.3.4\"", expect: "1.2.3.4:11434"},
+		"extra space+quotes":  {value: " \" 1.2.3.4 \" ", expect: "1.2.3.4:11434"},
+		"extra single quotes": {value: "'1.2.3.4'", expect: "1.2.3.4:11434"},
+	}
+
+	for k, v := range hostTestCases {
+		t.Run(k, func(t *testing.T) {
+			t.Setenv("OLLAMA_HOST", v.value)
+
+			oh, err := GetOllamaHost()
+			if err != v.err {
+				t.Fatalf("expected %s, got %s", v.err, err)
+			}
+
+			if err == nil {
+				host := net.JoinHostPort(oh.Host, oh.Port)
+				assert.Equal(t, v.expect, host, fmt.Sprintf("%s: expected %s, got %s", k, v.expect, host))
+			}
+		})
+	}
 }
--- a/api/types.go
+++ b/api/types.go
@@ -4,6 +4,7 @@ import (
 	"encoding/json"
 	"errors"
 	"fmt"
+	"log/slog"
 	"math"
 	"os"
 	"reflect"
@@ -12,6 +13,7 @@ import (
 	"time"
 )

+// StatusError is an error with and HTTP status code.
 type StatusError struct {
 	StatusCode   int
 	Status       string
@@ -32,6 +34,7 @@ func (e StatusError) Error() string {
 	}
 }

+// ImageData represents the raw binary data of an image file.
 type ImageData []byte

 // GenerateRequest describes a request sent by [Client.Generate]. While you
@@ -77,26 +80,44 @@ type GenerateRequest struct {
 	Options map[string]interface{} `json:"options"`
 }

+// ChatRequest describes a request sent by [Client.Chat].
 type ChatRequest struct {
-	Model     string    `json:"model"`
-	Messages  []Message `json:"messages"`
-	Stream    *bool     `json:"stream,omitempty"`
-	Format    string    `json:"format"`
+	// Model is the model name, as in [GenerateRequest].
+	Model string `json:"model"`
+
+	// Messages is the messages of the chat - can be used to keep a chat memory.
+	Messages []Message `json:"messages"`
+
+	// Stream enable streaming of returned response; true by default.
+	Stream *bool `json:"stream,omitempty"`
+
+	// Format is the format to return the response in (e.g. "json").
+	Format string `json:"format"`
+
+	// KeepAlive controls how long the model will stay loaded into memory
+	// followin the request.
 	KeepAlive *Duration `json:"keep_alive,omitempty"`

+	// Options lists model-specific options.
 	Options map[string]interface{} `json:"options"`
 }

+// Message is a single message in a chat sequence. The message contains the
+// role ("system", "user", or "assistant"), the content and an optional list
+// of images.
 type Message struct {
-	Role    string      `json:"role"` // one of ["system", "user", "assistant"]
+	Role    string      `json:"role"`
 	Content string      `json:"content"`
 	Images  []ImageData `json:"images,omitempty"`
 }

+// ChatResponse is the response returned by [Client.Chat]. Its fields are
+// similar to [GenerateResponse].
 type ChatResponse struct {
-	Model     string    `json:"model"`
-	CreatedAt time.Time `json:"created_at"`
-	Message   Message   `json:"message"`
+	Model      string    `json:"model"`
+	CreatedAt  time.Time `json:"created_at"`
+	Message    Message   `json:"message"`
+	DoneReason string    `json:"done_reason,omitempty"`

 	Done bool `json:"done"`

@@ -112,7 +133,8 @@ type Metrics struct {
 	EvalDuration       time.Duration `json:"eval_duration,omitempty"`
 }

-// Options specified in GenerateRequest, if you add a new option here add it to the API docs also
+// Options specified in [GenerateRequest], if you add a new option here add it
+// to the API docs also.
 type Options struct {
 	Runner

@@ -141,7 +163,6 @@ type Runner struct {
 	UseNUMA   bool `json:"numa,omitempty"`
 	NumCtx    int  `json:"num_ctx,omitempty"`
 	NumBatch  int  `json:"num_batch,omitempty"`
-	NumGQA    int  `json:"num_gqa,omitempty"`
 	NumGPU    int  `json:"num_gpu,omitempty"`
 	MainGPU   int  `json:"main_gpu,omitempty"`
 	LowVRAM   bool `json:"low_vram,omitempty"`
@@ -151,36 +172,45 @@ type Runner struct {
 	UseMMap   bool `json:"use_mmap,omitempty"`
 	UseMLock  bool `json:"use_mlock,omitempty"`
 	NumThread int  `json:"num_thread,omitempty"`
-
-	// Unused: RopeFrequencyBase is ignored. Instead the value in the model will be used
-	RopeFrequencyBase float32 `json:"rope_frequency_base,omitempty"`
-	// Unused: RopeFrequencyScale is ignored. Instead the value in the model will be used
-	RopeFrequencyScale float32 `json:"rope_frequency_scale,omitempty"`
 }

+// EmbeddingRequest is the request passed to [Client.Embeddings].
 type EmbeddingRequest struct {
-	Model     string    `json:"model"`
-	Prompt    string    `json:"prompt"`
+	// Model is the model name.
+	Model string `json:"model"`
+
+	// Prompt is the textual prompt to embed.
+	Prompt string `json:"prompt"`
+
+	// KeepAlive controls how long the model will stay loaded in memory following
+	// this request.
 	KeepAlive *Duration `json:"keep_alive,omitempty"`

+	// Options lists model-specific options.
 	Options map[string]interface{} `json:"options"`
 }

+// EmbeddingResponse is the response from [Client.Embeddings].
 type EmbeddingResponse struct {
 	Embedding []float64 `json:"embedding"`
 }

+// CreateRequest is the request passed to [Client.Create].
 type CreateRequest struct {
-	Model        string `json:"model"`
-	Path         string `json:"path"`
-	Modelfile    string `json:"modelfile"`
-	Stream       *bool  `json:"stream,omitempty"`
-	Quantization string `json:"quantization,omitempty"`
+	Model     string `json:"model"`
+	Path      string `json:"path"`
+	Modelfile string `json:"modelfile"`
+	Stream    *bool  `json:"stream,omitempty"`
+	Quantize  string `json:"quantize,omitempty"`

 	// Name is deprecated, see Model
 	Name string `json:"name"`
+
+	// Quantization is deprecated, see Quantize
+	Quantization string `json:"quantization,omitempty"`
 }

+// DeleteRequest is the request passed to [Client.Delete].
 type DeleteRequest struct {
 	Model string `json:"model"`

@@ -188,6 +218,7 @@ type DeleteRequest struct {
 	Name string `json:"name"`
 }

+// ShowRequest is the request passed to [Client.Show].
 type ShowRequest struct {
 	Model    string `json:"model"`
 	System   string `json:"system"`
@@ -199,6 +230,7 @@ type ShowRequest struct {
 	Name string `json:"name"`
 }

+// ShowResponse is the response returned from [Client.Show].
 type ShowResponse struct {
 	License    string       `json:"license,omitempty"`
 	Modelfile  string       `json:"modelfile,omitempty"`
@@ -209,11 +241,13 @@ type ShowResponse struct {
 	Messages   []Message    `json:"messages,omitempty"`
 }

+// CopyRequest is the request passed to [Client.Copy].
 type CopyRequest struct {
 	Source      string `json:"source"`
 	Destination string `json:"destination"`
 }

+// PullRequest is the request passed to [Client.Pull].
 type PullRequest struct {
 	Model    string `json:"model"`
 	Insecure bool   `json:"insecure,omitempty"`
@@ -225,6 +259,8 @@ type PullRequest struct {
 	Name string `json:"name"`
 }

+// ProgressResponse is the response passed to progress functions like
+// [PullProgressFunc] and [PushProgressFunc].
 type ProgressResponse struct {
 	Status    string `json:"status"`
 	Digest    string `json:"digest,omitempty"`
@@ -232,6 +268,7 @@ type ProgressResponse struct {
 	Completed int64  `json:"completed,omitempty"`
 }

+// PushRequest is the request passed to [Client.Push].
 type PushRequest struct {
 	Model    string `json:"model"`
 	Insecure bool   `json:"insecure,omitempty"`
@@ -243,34 +280,52 @@ type PushRequest struct {
 	Name string `json:"name"`
 }

+// ListResponse is the response from [Client.List].
 type ListResponse struct {
 	Models []ModelResponse `json:"models"`
 }

+// ModelResponse is a single model description in [ListResponse].
 type ModelResponse struct {
 	Name       string       `json:"name"`
 	Model      string       `json:"model"`
-	ModifiedAt time.Time    `json:"modified_at"`
+	ModifiedAt time.Time    `json:"modified_at,omitempty"`
 	Size       int64        `json:"size"`
 	Digest     string       `json:"digest"`
 	Details    ModelDetails `json:"details,omitempty"`
+	ExpiresAt  time.Time    `json:"expires_at,omitempty"`
+	SizeVRAM   int64        `json:"size_vram,omitempty"`
 }

 type TokenResponse struct {
 	Token string `json:"token"`
 }

+// GenerateResponse is the response passed into [GenerateResponseFunc].
 type GenerateResponse struct {
-	Model     string    `json:"model"`
-	CreatedAt time.Time `json:"created_at"`
-	Response  string    `json:"response"`
+	// Model is the model name that generated the response.
+	Model string `json:"model"`

-	Done    bool  `json:"done"`
+	//CreatedAt is the timestamp of the response.
+	CreatedAt time.Time `json:"created_at"`
+
+	// Response is the textual response itself.
+	Response string `json:"response"`
+
+	// Done specifies if the response is complete.
+	Done bool `json:"done"`
+
+	// DoneReason is the reason the model stopped generating text.
+	DoneReason string `json:"done_reason,omitempty"`
+
+	// Context is an encoding of the conversation used in this response; this
+	// can be sent in the next request to keep a conversational memory.
 	Context []int `json:"context,omitempty"`

 	Metrics
 }

+// ModelDetails provides details about a model.
 type ModelDetails struct {
 	ParentModel       string   `json:"parent_model"`
 	Format            string   `json:"format"`
@@ -308,7 +363,7 @@ func (m *Metrics) Summary() {
 	}
 }

-var ErrInvalidOpts = errors.New("invalid options")
+var ErrInvalidHostPort = errors.New("invalid port specified in OLLAMA_HOST")

 func (opts *Options) FromMap(m map[string]interface{}) error {
 	valueOpts := reflect.ValueOf(opts).Elem() // names of the fields in the options struct
@@ -323,76 +378,76 @@ func (opts *Options) FromMap(m map[string]interface{}) error {
 		}
 	}

-	invalidOpts := []string{}
 	for key, val := range m {
-		if opt, ok := jsonOpts[key]; ok {
-			field := valueOpts.FieldByName(opt.Name)
-			if field.IsValid() && field.CanSet() {
-				if val == nil {
-					continue
-				}
+		opt, ok := jsonOpts[key]
+		if !ok {
+			slog.Warn("invalid option provided", "option", opt.Name)
+			continue
+		}

-				switch field.Kind() {
-				case reflect.Int:
-					switch t := val.(type) {
-					case int64:
-						field.SetInt(t)
-					case float64:
-						// when JSON unmarshals numbers, it uses float64, not int
-						field.SetInt(int64(t))
-					default:
-						return fmt.Errorf("option %q must be of type integer", key)
-					}
-				case reflect.Bool:
-					val, ok := val.(bool)
-					if !ok {
-						return fmt.Errorf("option %q must be of type boolean", key)
-					}
-					field.SetBool(val)
-				case reflect.Float32:
-					// JSON unmarshals to float64
-					val, ok := val.(float64)
-					if !ok {
-						return fmt.Errorf("option %q must be of type float32", key)
-					}
-					field.SetFloat(val)
-				case reflect.String:
-					val, ok := val.(string)
-					if !ok {
-						return fmt.Errorf("option %q must be of type string", key)
-					}
-					field.SetString(val)
-				case reflect.Slice:
-					// JSON unmarshals to []interface{}, not []string
-					val, ok := val.([]interface{})
-					if !ok {
-						return fmt.Errorf("option %q must be of type array", key)
-					}
-					// convert []interface{} to []string
-					slice := make([]string, len(val))
-					for i, item := range val {
-						str, ok := item.(string)
-						if !ok {
-							return fmt.Errorf("option %q must be of an array of strings", key)
-						}
-						slice[i] = str
-					}
-					field.Set(reflect.ValueOf(slice))
-				default:
-					return fmt.Errorf("unknown type loading config params: %v", field.Kind())
-				}
+		field := valueOpts.FieldByName(opt.Name)
+		if field.IsValid() && field.CanSet() {
+			if val == nil {
+				continue
+			}
+
+			switch field.Kind() {
+			case reflect.Int:
+				switch t := val.(type) {
+				case int64:
+					field.SetInt(t)
+				case float64:
+					// when JSON unmarshals numbers, it uses float64, not int
+					field.SetInt(int64(t))
+				default:
+					return fmt.Errorf("option %q must be of type integer", key)
+				}
+			case reflect.Bool:
+				val, ok := val.(bool)
+				if !ok {
+					return fmt.Errorf("option %q must be of type boolean", key)
+				}
+				field.SetBool(val)
+			case reflect.Float32:
+				// JSON unmarshals to float64
+				val, ok := val.(float64)
+				if !ok {
+					return fmt.Errorf("option %q must be of type float32", key)
+				}
+				field.SetFloat(val)
+			case reflect.String:
+				val, ok := val.(string)
+				if !ok {
+					return fmt.Errorf("option %q must be of type string", key)
+				}
+				field.SetString(val)
+			case reflect.Slice:
+				// JSON unmarshals to []interface{}, not []string
+				val, ok := val.([]interface{})
+				if !ok {
+					return fmt.Errorf("option %q must be of type array", key)
+				}
+				// convert []interface{} to []string
+				slice := make([]string, len(val))
+				for i, item := range val {
+					str, ok := item.(string)
+					if !ok {
+						return fmt.Errorf("option %q must be of an array of strings", key)
+					}
+					slice[i] = str
+				}
+				field.Set(reflect.ValueOf(slice))
+			default:
+				return fmt.Errorf("unknown type loading config params: %v", field.Kind())
 			}
-		} else {
-			invalidOpts = append(invalidOpts, key)
 		}
 	}

-	if len(invalidOpts) > 0 {
-		return fmt.Errorf("%w: %v", ErrInvalidOpts, strings.Join(invalidOpts, ", "))
-	}
 	return nil
 }

+// DefaultOptions is the default set of options for [GenerateRequest]; these
+// values are used unless the user specifies other values explicitly.
 func DefaultOptions() Options {
 	return Options{
 		// options set on request to runner
@@ -420,8 +475,7 @@ func DefaultOptions() Options {
 			NumCtx:    2048,
 			NumBatch:  512,
 			NumGPU:    -1, // -1 here indicates that NumGPU should be set dynamically
-			NumGQA:    1,
-			NumThread: 0, // let the runtime decide
+			NumThread: 0,  // let the runtime decide
 			LowVRAM:   false,
 			F16KV:     true,
 			UseMLock:  false,
@@ -435,6 +489,13 @@ type Duration struct {
 	time.Duration
 }

+func (d Duration) MarshalJSON() ([]byte, error) {
+	if d.Duration < 0 {
+		return []byte("-1"), nil
+	}
+	return []byte("\"" + d.Duration.String() + "\""), nil
+}
+
 func (d *Duration) UnmarshalJSON(b []byte) (err error) {
 	var v any
 	if err := json.Unmarshal(b, &v); err != nil {
@@ -448,7 +509,7 @@ func (d *Duration) UnmarshalJSON(b []byte) (err error) {
 		if t < 0 {
 			d.Duration = time.Duration(math.MaxInt64)
 		} else {
-			d.Duration = time.Duration(t * float64(time.Second))
+			d.Duration = time.Duration(int(t) * int(time.Second))
 		}
 	case string:
 		d.Duration, err = time.ParseDuration(t)
@@ -458,6 +519,8 @@ func (d *Duration) UnmarshalJSON(b []byte) (err error) {
 		if d.Duration < 0 {
 			d.Duration = time.Duration(math.MaxInt64)
 		}
+	default:
+		return fmt.Errorf("Unsupported type: '%s'", reflect.TypeOf(v))
 	}

 	return nil
--- a/api/types_test.go
+++ b/api/types_test.go
@@ -21,6 +21,11 @@ func TestKeepAliveParsingFromJSON(t *testing.T) {
 			req:  `{ "keep_alive": 42 }`,
 			exp:  &Duration{42 * time.Second},
 		},
+		{
+			name: "Positive Float",
+			req:  `{ "keep_alive": 42.5 }`,
+			exp:  &Duration{42 * time.Second},
+		},
 		{
 			name: "Positive Integer String",
 			req:  `{ "keep_alive": "42m" }`,
@@ -31,6 +36,11 @@ func TestKeepAliveParsingFromJSON(t *testing.T) {
 			req:  `{ "keep_alive": -1 }`,
 			exp:  &Duration{math.MaxInt64},
 		},
+		{
+			name: "Negative Float",
+			req:  `{ "keep_alive": -3.14 }`,
+			exp:  &Duration{math.MaxInt64},
+		},
 		{
 			name: "Negative Integer String",
 			req:  `{ "keep_alive": "-1m" }`,
@@ -48,3 +58,50 @@ func TestKeepAliveParsingFromJSON(t *testing.T) {
 		})
 	}
 }
+
+func TestDurationMarshalUnmarshal(t *testing.T) {
+	tests := []struct {
+		name     string
+		input    time.Duration
+		expected time.Duration
+	}{
+		{
+			"negative duration",
+			time.Duration(-1),
+			time.Duration(math.MaxInt64),
+		},
+		{
+			"positive duration",
+			time.Duration(42 * time.Second),
+			time.Duration(42 * time.Second),
+		},
+		{
+			"another positive duration",
+			time.Duration(42 * time.Minute),
+			time.Duration(42 * time.Minute),
+		},
+		{
+			"zero duration",
+			time.Duration(0),
+			time.Duration(0),
+		},
+		{
+			"max duration",
+			time.Duration(math.MaxInt64),
+			time.Duration(math.MaxInt64),
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			b, err := json.Marshal(Duration{test.input})
+			require.NoError(t, err)
+
+			var d Duration
+			err = json.Unmarshal(b, &d)
+			require.NoError(t, err)
+
+			assert.Equal(t, test.expected, d.Duration, "input %v, marshalled %v, got %v", test.input, string(b), d.Duration)
+		})
+	}
+}
--- a/app/lifecycle/logging.go
+++ b/app/lifecycle/logging.go
@@ -5,12 +5,14 @@ import (
 	"log/slog"
 	"os"
 	"path/filepath"
+
+	"github.com/ollama/ollama/server/envconfig"
 )

 func InitLogging() {
 	level := slog.LevelInfo

-	if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
+	if envconfig.Debug {
 		level = slog.LevelDebug
 	}

--- a/app/lifecycle/server.go
+++ b/app/lifecycle/server.go
@@ -43,37 +43,36 @@ func getCLIFullPath(command string) string {
 	return command
 }

-func SpawnServer(ctx context.Context, command string) (chan int, error) {
-	done := make(chan int)
-
-	logDir := filepath.Dir(ServerLogFile)
-	_, err := os.Stat(logDir)
-	if errors.Is(err, os.ErrNotExist) {
-		if err := os.MkdirAll(logDir, 0o755); err != nil {
-			return done, fmt.Errorf("create ollama server log dir %s: %v", logDir, err)
-		}
-	}
-
+func start(ctx context.Context, command string) (*exec.Cmd, error) {
 	cmd := getCmd(ctx, getCLIFullPath(command))
-	// send stdout and stderr to a file
 	stdout, err := cmd.StdoutPipe()
 	if err != nil {
-		return done, fmt.Errorf("failed to spawn server stdout pipe %s", err)
+		return nil, fmt.Errorf("failed to spawn server stdout pipe: %w", err)
 	}
 	stderr, err := cmd.StderrPipe()
 	if err != nil {
-		return done, fmt.Errorf("failed to spawn server stderr pipe %s", err)
-	}
-	stdin, err := cmd.StdinPipe()
-	if err != nil {
-		return done, fmt.Errorf("failed to spawn server stdin pipe %s", err)
+		return nil, fmt.Errorf("failed to spawn server stderr pipe: %w", err)
 	}

 	// TODO - rotation
 	logFile, err := os.OpenFile(ServerLogFile, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0755)
 	if err != nil {
-		return done, fmt.Errorf("failed to create server log %w", err)
+		return nil, fmt.Errorf("failed to create server log: %w", err)
 	}
+
+	logDir := filepath.Dir(ServerLogFile)
+	_, err = os.Stat(logDir)
+	if err != nil {
+		if !errors.Is(err, os.ErrNotExist) {
+			return nil, fmt.Errorf("stat ollama server log dir %s: %v", logDir, err)
+
+		}
+
+		if err := os.MkdirAll(logDir, 0o755); err != nil {
+			return nil, fmt.Errorf("create ollama server log dir %s: %v", logDir, err)
+		}
+	}
+
 	go func() {
 		defer logFile.Close()
 		io.Copy(logFile, stdout) //nolint:errcheck
@@ -117,19 +116,33 @@ func SpawnServer(ctx context.Context, command string) (chan int, error) {

 	// run the command and wait for it to finish
 	if err := cmd.Start(); err != nil {
-		return done, fmt.Errorf("failed to start server %w", err)
+		return nil, fmt.Errorf("failed to start server %w", err)
 	}
 	if cmd.Process != nil {
 		slog.Info(fmt.Sprintf("started ollama server with pid %d", cmd.Process.Pid))
 	}
 	slog.Info(fmt.Sprintf("ollama server logs %s", ServerLogFile))

+	return cmd, nil
+}
+
+func SpawnServer(ctx context.Context, command string) (chan int, error) {
+	done := make(chan int)
+
 	go func() {
 		// Keep the server running unless we're shuttind down the app
 		crashCount := 0
 		for {
+			slog.Info("starting server...")
+			cmd, err := start(ctx, command)
+			if err != nil {
+				crashCount++
+				slog.Error(fmt.Sprintf("failed to start server %s", err))
+				time.Sleep(500 * time.Millisecond * time.Duration(crashCount))
+				continue
+			}
+
 			cmd.Wait() //nolint:errcheck
-			stdin.Close()
 			var code int
 			if cmd.ProcessState != nil {
 				code = cmd.ProcessState.ExitCode()
@@ -143,15 +156,12 @@ func SpawnServer(ctx context.Context, command string) (chan int, error) {
 			default:
 				crashCount++
 				slog.Warn(fmt.Sprintf("server crash %d - exit code %d - respawning", crashCount, code))
-				time.Sleep(500 * time.Millisecond)
-				if err := cmd.Start(); err != nil {
-					slog.Error(fmt.Sprintf("failed to restart server %s", err))
-					// Keep trying, but back off if we keep failing
-					time.Sleep(time.Duration(crashCount) * time.Second)
-				}
+				time.Sleep(500 * time.Millisecond * time.Duration(crashCount))
+				break
 			}
 		}
 	}()
+
 	return done, nil
 }

--- a/app/lifecycle/updater_windows.go
+++ b/app/lifecycle/updater_windows.go
@@ -31,16 +31,13 @@ func DoUpgrade(cancel context.CancelFunc, done chan int) error {
 		"/LOG=" + filepath.Base(UpgradeLogFile), // Only relative seems reliable, so set pwd
 		"/FORCECLOSEAPPLICATIONS",               // Force close the tray app - might be needed
 	}
-	// When we're not in debug mode, make the upgrade as quiet as possible (no GUI, no prompts)
-	// TODO - temporarily disable since we're pinning in debug mode for the preview
-	// if debug := os.Getenv("OLLAMA_DEBUG"); debug == "" {
+	// make the upgrade as quiet as possible (no GUI, no prompts)
 	installArgs = append(installArgs,
 		"/SP", // Skip the "This will install... Do you wish to continue" prompt
 		"/SUPPRESSMSGBOXES",
 		"/SILENT",
 		"/VERYSILENT",
 	)
-	// }

 	// Safeguard in case we have requests in flight that need to drain...
 	slog.Info("Waiting for server to shutdown")
--- a/app/ollama.iss
+++ b/app/ollama.iss
@@ -88,8 +88,8 @@ DialogFontSize=12
 [Files]
 Source: ".\app.exe"; DestDir: "{app}"; DestName: "{#MyAppExeName}" ; Flags: ignoreversion 64bit
 Source: "..\ollama.exe"; DestDir: "{app}"; Flags: ignoreversion 64bit
-Source: "..\dist\windows-amd64\*.dll"; DestDir: "{app}"; Flags: ignoreversion 64bit
-Source: "..\dist\windows-amd64\ollama_runners\*"; DestDir: "{app}\ollama_runners"; Flags: ignoreversion 64bit recursesubdirs
+Source: "..\dist\windows-{#ARCH}\*.dll"; DestDir: "{app}"; Flags: ignoreversion 64bit
+Source: "..\dist\windows-{#ARCH}\ollama_runners\*"; DestDir: "{app}\ollama_runners"; Flags: ignoreversion 64bit recursesubdirs
 Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion
 Source: ".\assets\app.ico"; DestDir: "{app}"; Flags: ignoreversion
 #if DirExists("..\dist\windows-amd64\rocm")
--- a/app/tray/wintray/menus.go
+++ b/app/tray/wintray/menus.go
@@ -1,71 +1,71 @@
-//go:build windows
-
-package wintray
-
-import (
-	"fmt"
-	"log/slog"
-	"unsafe"
-
-	"golang.org/x/sys/windows"
-)
-
-const (
-	updatAvailableMenuID = 1
-	updateMenuID         = updatAvailableMenuID + 1
-	separatorMenuID      = updateMenuID + 1
-	diagLogsMenuID       = separatorMenuID + 1
-	diagSeparatorMenuID  = diagLogsMenuID + 1
-	quitMenuID           = diagSeparatorMenuID + 1
-)
-
-func (t *winTray) initMenus() error {
-	if err := t.addOrUpdateMenuItem(diagLogsMenuID, 0, diagLogsMenuTitle, false); err != nil {
-		return fmt.Errorf("unable to create menu entries %w\n", err)
-	}
-	if err := t.addSeparatorMenuItem(diagSeparatorMenuID, 0); err != nil {
-		return fmt.Errorf("unable to create menu entries %w", err)
-	}
-	if err := t.addOrUpdateMenuItem(quitMenuID, 0, quitMenuTitle, false); err != nil {
-		return fmt.Errorf("unable to create menu entries %w\n", err)
-	}
-	return nil
-}
-
-func (t *winTray) UpdateAvailable(ver string) error {
-	if !t.updateNotified {
-		slog.Debug("updating menu and sending notification for new update")
-		if err := t.addOrUpdateMenuItem(updatAvailableMenuID, 0, updateAvailableMenuTitle, true); err != nil {
-			return fmt.Errorf("unable to create menu entries %w", err)
-		}
-		if err := t.addOrUpdateMenuItem(updateMenuID, 0, updateMenutTitle, false); err != nil {
-			return fmt.Errorf("unable to create menu entries %w", err)
-		}
-		if err := t.addSeparatorMenuItem(separatorMenuID, 0); err != nil {
-			return fmt.Errorf("unable to create menu entries %w", err)
-		}
-		iconFilePath, err := iconBytesToFilePath(wt.updateIcon)
-		if err != nil {
-			return fmt.Errorf("unable to write icon data to temp file: %w", err)
-		}
-		if err := wt.setIcon(iconFilePath); err != nil {
-			return fmt.Errorf("unable to set icon: %w", err)
-		}
-		t.updateNotified = true
-
-		t.pendingUpdate = true
-		// Now pop up the notification
-		t.muNID.Lock()
-		defer t.muNID.Unlock()
-		copy(t.nid.InfoTitle[:], windows.StringToUTF16(updateTitle))
-		copy(t.nid.Info[:], windows.StringToUTF16(fmt.Sprintf(updateMessage, ver)))
-		t.nid.Flags |= NIF_INFO
-		t.nid.Timeout = 10
-		t.nid.Size = uint32(unsafe.Sizeof(*wt.nid))
-		err = t.nid.modify()
-		if err != nil {
-			return err
-		}
-	}
-	return nil
-}
+//go:build windows
+
+package wintray
+
+import (
+	"fmt"
+	"log/slog"
+	"unsafe"
+
+	"golang.org/x/sys/windows"
+)
+
+const (
+	updatAvailableMenuID = 1
+	updateMenuID         = updatAvailableMenuID + 1
+	separatorMenuID      = updateMenuID + 1
+	diagLogsMenuID       = separatorMenuID + 1
+	diagSeparatorMenuID  = diagLogsMenuID + 1
+	quitMenuID           = diagSeparatorMenuID + 1
+)
+
+func (t *winTray) initMenus() error {
+	if err := t.addOrUpdateMenuItem(diagLogsMenuID, 0, diagLogsMenuTitle, false); err != nil {
+		return fmt.Errorf("unable to create menu entries %w\n", err)
+	}
+	if err := t.addSeparatorMenuItem(diagSeparatorMenuID, 0); err != nil {
+		return fmt.Errorf("unable to create menu entries %w", err)
+	}
+	if err := t.addOrUpdateMenuItem(quitMenuID, 0, quitMenuTitle, false); err != nil {
+		return fmt.Errorf("unable to create menu entries %w\n", err)
+	}
+	return nil
+}
+
+func (t *winTray) UpdateAvailable(ver string) error {
+	if !t.updateNotified {
+		slog.Debug("updating menu and sending notification for new update")
+		if err := t.addOrUpdateMenuItem(updatAvailableMenuID, 0, updateAvailableMenuTitle, true); err != nil {
+			return fmt.Errorf("unable to create menu entries %w", err)
+		}
+		if err := t.addOrUpdateMenuItem(updateMenuID, 0, updateMenutTitle, false); err != nil {
+			return fmt.Errorf("unable to create menu entries %w", err)
+		}
+		if err := t.addSeparatorMenuItem(separatorMenuID, 0); err != nil {
+			return fmt.Errorf("unable to create menu entries %w", err)
+		}
+		iconFilePath, err := iconBytesToFilePath(wt.updateIcon)
+		if err != nil {
+			return fmt.Errorf("unable to write icon data to temp file: %w", err)
+		}
+		if err := wt.setIcon(iconFilePath); err != nil {
+			return fmt.Errorf("unable to set icon: %w", err)
+		}
+		t.updateNotified = true
+
+		t.pendingUpdate = true
+		// Now pop up the notification
+		t.muNID.Lock()
+		defer t.muNID.Unlock()
+		copy(t.nid.InfoTitle[:], windows.StringToUTF16(updateTitle))
+		copy(t.nid.Info[:], windows.StringToUTF16(fmt.Sprintf(updateMessage, ver)))
+		t.nid.Flags |= NIF_INFO
+		t.nid.Timeout = 10
+		t.nid.Size = uint32(unsafe.Sizeof(*wt.nid))
+		err = t.nid.modify()
+		if err != nil {
+			return err
+		}
+	}
+	return nil
+}
--- a/auth/auth.go
+++ b/auth/auth.go
@@ -10,12 +10,44 @@ import (
 	"log/slog"
 	"os"
 	"path/filepath"
+	"strings"

 	"golang.org/x/crypto/ssh"
 )

 const defaultPrivateKey = "id_ed25519"

+func keyPath() (string, error) {
+	home, err := os.UserHomeDir()
+	if err != nil {
+		return "", err
+	}
+
+	return filepath.Join(home, ".ollama", defaultPrivateKey), nil
+}
+
+func GetPublicKey() (string, error) {
+	keyPath, err := keyPath()
+	if err != nil {
+		return "", err
+	}
+
+	privateKeyFile, err := os.ReadFile(keyPath)
+	if err != nil {
+		slog.Info(fmt.Sprintf("Failed to load private key: %v", err))
+		return "", err
+	}
+
+	privateKey, err := ssh.ParsePrivateKey(privateKeyFile)
+	if err != nil {
+		return "", err
+	}
+
+	publicKey := ssh.MarshalAuthorizedKey(privateKey.PublicKey())
+
+	return strings.TrimSpace(string(publicKey)), nil
+}
+
 func NewNonce(r io.Reader, length int) (string, error) {
 	nonce := make([]byte, length)
 	if _, err := io.ReadFull(r, nonce); err != nil {
@@ -26,13 +58,11 @@ func NewNonce(r io.Reader, length int) (string, error) {
 }

 func Sign(ctx context.Context, bts []byte) (string, error) {
-	home, err := os.UserHomeDir()
+	keyPath, err := keyPath()
 	if err != nil {
 		return "", err
 	}

-	keyPath := filepath.Join(home, ".ollama", defaultPrivateKey)
-
 	privateKeyFile, err := os.ReadFile(keyPath)
 	if err != nil {
 		slog.Info(fmt.Sprintf("Failed to load private key: %v", err))
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -12,6 +12,7 @@ import (
 	"fmt"
 	"io"
 	"log"
+	"math"
 	"net"
 	"net/http"
 	"os"
@@ -32,10 +33,12 @@ import (
 	"golang.org/x/term"

 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/auth"
 	"github.com/ollama/ollama/format"
-	"github.com/ollama/ollama/parser"
 	"github.com/ollama/ollama/progress"
 	"github.com/ollama/ollama/server"
+	"github.com/ollama/ollama/types/errtypes"
+	"github.com/ollama/ollama/types/model"
 	"github.com/ollama/ollama/version"
 )

@@ -54,12 +57,13 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
 	p := progress.NewProgress(os.Stderr)
 	defer p.Stop()

-	modelfile, err := os.ReadFile(filename)
+	f, err := os.Open(filename)
 	if err != nil {
 		return err
 	}
+	defer f.Close()

-	commands, err := parser.Parse(bytes.NewReader(modelfile))
+	modelfile, err := model.ParseFile(f)
 	if err != nil {
 		return err
 	}
@@ -73,10 +77,10 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
 	spinner := progress.NewSpinner(status)
 	p.Add(status, spinner)

-	for _, c := range commands {
-		switch c.Name {
+	for i := range modelfile.Commands {
+		switch modelfile.Commands[i].Name {
 		case "model", "adapter":
-			path := c.Args
+			path := modelfile.Commands[i].Args
 			if path == "~" {
 				path = home
 			} else if strings.HasPrefix(path, "~/") {
@@ -88,7 +92,7 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
 			}

 			fi, err := os.Stat(path)
-			if errors.Is(err, os.ErrNotExist) && c.Name == "model" {
+			if errors.Is(err, os.ErrNotExist) && modelfile.Commands[i].Name == "model" {
 				continue
 			} else if err != nil {
 				return err
@@ -111,13 +115,7 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
 				return err
 			}

-			name := c.Name
-			if c.Name == "model" {
-				name = "from"
-			}
-
-			re := regexp.MustCompile(fmt.Sprintf(`(?im)^(%s)\s+%s\s*$`, name, c.Args))
-			modelfile = re.ReplaceAll(modelfile, []byte("$1 @"+digest))
+			modelfile.Commands[i].Args = "@" + digest
 		}
 	}

@@ -145,9 +143,9 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
 		return nil
 	}

-	quantization, _ := cmd.Flags().GetString("quantization")
+	quantize, _ := cmd.Flags().GetString("quantize")

-	request := api.CreateRequest{Name: args[0], Modelfile: string(modelfile), Quantization: quantization}
+	request := api.CreateRequest{Name: args[0], Modelfile: modelfile.String(), Quantize: quantize}
 	if err := client.Create(cmd.Context(), &request, fn); err != nil {
 		return err
 	}
@@ -327,6 +325,18 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 	}
 	opts.Format = format

+	keepAlive, err := cmd.Flags().GetString("keepalive")
+	if err != nil {
+		return err
+	}
+	if keepAlive != "" {
+		d, err := time.ParseDuration(keepAlive)
+		if err != nil {
+			return err
+		}
+		opts.KeepAlive = &api.Duration{Duration: d}
+	}
+
 	prompts := args[1:]
 	// prepend stdin to the prompt if provided
 	if !term.IsTerminal(int(os.Stdin.Fd())) {
@@ -357,6 +367,47 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 	return generateInteractive(cmd, opts)
 }

+func errFromUnknownKey(unknownKeyErr error) error {
+	// find SSH public key in the error message
+	sshKeyPattern := `ssh-\w+ [^\s"]+`
+	re := regexp.MustCompile(sshKeyPattern)
+	matches := re.FindStringSubmatch(unknownKeyErr.Error())
+
+	if len(matches) > 0 {
+		serverPubKey := matches[0]
+
+		localPubKey, err := auth.GetPublicKey()
+		if err != nil {
+			return unknownKeyErr
+		}
+
+		if runtime.GOOS == "linux" && serverPubKey != localPubKey {
+			// try the ollama service public key
+			svcPubKey, err := os.ReadFile("/usr/share/ollama/.ollama/id_ed25519.pub")
+			if err != nil {
+				return unknownKeyErr
+			}
+			localPubKey = strings.TrimSpace(string(svcPubKey))
+		}
+
+		// check if the returned public key matches the local public key, this prevents adding a remote key to the user's account
+		if serverPubKey != localPubKey {
+			return unknownKeyErr
+		}
+
+		var msg strings.Builder
+		msg.WriteString(unknownKeyErr.Error())
+		msg.WriteString("\n\nYour ollama key is:\n")
+		msg.WriteString(localPubKey)
+		msg.WriteString("\nAdd your key at:\n")
+		msg.WriteString("https://ollama.com/settings/keys")
+
+		return errors.New(msg.String())
+	}
+
+	return unknownKeyErr
+}
+
 func PushHandler(cmd *cobra.Command, args []string) error {
 	client, err := api.ClientFromEnvironment()
 	if err != nil {
@@ -404,6 +455,20 @@ func PushHandler(cmd *cobra.Command, args []string) error {

 	request := api.PushRequest{Name: args[0], Insecure: insecure}
 	if err := client.Push(cmd.Context(), &request, fn); err != nil {
+		if spinner != nil {
+			spinner.Stop()
+		}
+		if strings.Contains(err.Error(), "access denied") {
+			return errors.New("you are not authorized to push to this namespace, create the model under a namespace you own")
+		}
+		host := model.ParseName(args[0]).Host
+		isOllamaHost := strings.HasSuffix(host, ".ollama.ai") || strings.HasSuffix(host, ".ollama.com")
+		if strings.Contains(err.Error(), errtypes.UnknownOllamaKeyErrMsg) && isOllamaHost {
+			// the user has not added their ollama key to ollama.com
+			// re-throw an error with a more user-friendly message
+			return errFromUnknownKey(err)
+		}
+
 		return err
 	}

@@ -444,6 +509,52 @@ func ListHandler(cmd *cobra.Command, args []string) error {
 	return nil
 }

+func ListRunningHandler(cmd *cobra.Command, args []string) error {
+	client, err := api.ClientFromEnvironment()
+	if err != nil {
+		return err
+	}
+
+	models, err := client.ListRunning(cmd.Context())
+	if err != nil {
+		return err
+	}
+
+	var data [][]string
+
+	for _, m := range models.Models {
+		if len(args) == 0 || strings.HasPrefix(m.Name, args[0]) {
+			var procStr string
+			switch {
+			case m.SizeVRAM == 0:
+				procStr = "100% CPU"
+			case m.SizeVRAM == m.Size:
+				procStr = "100% GPU"
+			case m.SizeVRAM > m.Size || m.Size == 0:
+				procStr = "Unknown"
+			default:
+				sizeCPU := m.Size - m.SizeVRAM
+				cpuPercent := math.Round(float64(sizeCPU) / float64(m.Size) * 100)
+				procStr = fmt.Sprintf("%d%%/%d%% CPU/GPU", int(cpuPercent), int(100-cpuPercent))
+			}
+			data = append(data, []string{m.Name, m.Digest[:12], format.HumanBytes(m.Size), procStr, format.HumanTime(m.ExpiresAt, "Never")})
+		}
+	}
+
+	table := tablewriter.NewWriter(os.Stdout)
+	table.SetHeader([]string{"NAME", "ID", "SIZE", "PROCESSOR", "UNTIL"})
+	table.SetHeaderAlignment(tablewriter.ALIGN_LEFT)
+	table.SetAlignment(tablewriter.ALIGN_LEFT)
+	table.SetHeaderLine(false)
+	table.SetBorder(false)
+	table.SetNoWhiteSpace(true)
+	table.SetTablePadding("\t")
+	table.AppendBulk(data)
+	table.Render()
+
+	return nil
+}
+
 func DeleteHandler(cmd *cobra.Command, args []string) error {
 	client, err := api.ClientFromEnvironment()
 	if err != nil {
@@ -620,6 +731,7 @@ type runOptions struct {
 	Images      []api.ImageData
 	Options     map[string]interface{}
 	MultiModal  bool
+	KeepAlive   *api.Duration
 }

 type displayResponseState struct {
@@ -714,6 +826,10 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
 		Options:  opts.Options,
 	}

+	if opts.KeepAlive != nil {
+		req.KeepAlive = opts.KeepAlive
+	}
+
 	if err := client.Chat(cancelCtx, req, fn); err != nil {
 		if errors.Is(err, context.Canceled) {
 			return nil, nil
@@ -789,14 +905,15 @@ func generate(cmd *cobra.Command, opts runOptions) error {
 	}

 	request := api.GenerateRequest{
-		Model:    opts.Model,
-		Prompt:   opts.Prompt,
-		Context:  generateContext,
-		Images:   opts.Images,
-		Format:   opts.Format,
-		System:   opts.System,
-		Template: opts.Template,
-		Options:  opts.Options,
+		Model:     opts.Model,
+		Prompt:    opts.Prompt,
+		Context:   generateContext,
+		Images:    opts.Images,
+		Format:    opts.Format,
+		System:    opts.System,
+		Template:  opts.Template,
+		Options:   opts.Options,
+		KeepAlive: opts.KeepAlive,
 	}

 	if err := client.Generate(ctx, &request, fn); err != nil {
@@ -831,24 +948,27 @@ func generate(cmd *cobra.Command, opts runOptions) error {
 }

 func RunServer(cmd *cobra.Command, _ []string) error {
-	host, port, err := net.SplitHostPort(strings.Trim(os.Getenv("OLLAMA_HOST"), "\"'"))
+	// retrieve the OLLAMA_HOST environment variable
+	ollamaHost, err := api.GetOllamaHost()
 	if err != nil {
-		host, port = "127.0.0.1", "11434"
-		if ip := net.ParseIP(strings.Trim(os.Getenv("OLLAMA_HOST"), "[]")); ip != nil {
-			host = ip.String()
-		}
+		return err
 	}

 	if err := initializeKeypair(); err != nil {
 		return err
 	}

-	ln, err := net.Listen("tcp", net.JoinHostPort(host, port))
+	ln, err := net.Listen("tcp", net.JoinHostPort(ollamaHost.Host, ollamaHost.Port))
 	if err != nil {
 		return err
 	}

-	return server.Serve(ln)
+	err = server.Serve(ln)
+	if errors.Is(err, http.ErrServerClosed) {
+		return nil
+	}
+
+	return err
 }

 func initializeKeypair() error {
@@ -995,8 +1115,8 @@ func NewCLI() *cobra.Command {
 		RunE:    CreateHandler,
 	}

-	createCmd.Flags().StringP("file", "f", "Modelfile", "Name of the Modelfile (default \"Modelfile\")")
-	createCmd.Flags().StringP("quantization", "q", "", "Quantization level.")
+	createCmd.Flags().StringP("file", "f", "Modelfile", "Name of the Modelfile")
+	createCmd.Flags().StringP("quantize", "q", "", "Quantize model to this level (e.g. q4_0)")

 	showCmd := &cobra.Command{
 		Use:     "show MODEL",
@@ -1020,6 +1140,7 @@ func NewCLI() *cobra.Command {
 		RunE:    RunHandler,
 	}

+	runCmd.Flags().String("keepalive", "", "Duration to keep a model loaded (e.g. 5m)")
 	runCmd.Flags().Bool("verbose", false, "Show timings for response")
 	runCmd.Flags().Bool("insecure", false, "Use an insecure registry")
 	runCmd.Flags().Bool("nowordwrap", false, "Don't wrap words to the next line automatically")
@@ -1035,9 +1156,9 @@ func NewCLI() *cobra.Command {
 Environment Variables:

    OLLAMA_HOST         The host:port to bind to (default "127.0.0.1:11434")
-    OLLAMA_ORIGINS      A comma separated list of allowed origins.
-    OLLAMA_MODELS       The path to the models directory (default is "~/.ollama/models")
-    OLLAMA_KEEP_ALIVE   The duration that models stay loaded in memory (default is "5m")
+    OLLAMA_ORIGINS      A comma separated list of allowed origins
+    OLLAMA_MODELS       The path to the models directory (default "~/.ollama/models")
+    OLLAMA_KEEP_ALIVE   The duration that models stay loaded in memory (default "5m")
    OLLAMA_DEBUG        Set to 1 to enable additional debug logging
 `)

@@ -1068,8 +1189,16 @@ Environment Variables:
 		PreRunE: checkServerHeartbeat,
 		RunE:    ListHandler,
 	}
+
+	psCmd := &cobra.Command{
+		Use:     "ps",
+		Short:   "List running models",
+		PreRunE: checkServerHeartbeat,
+		RunE:    ListRunningHandler,
+	}
+
 	copyCmd := &cobra.Command{
-		Use:     "cp SOURCE TARGET",
+		Use:     "cp SOURCE DESTINATION",
 		Short:   "Copy a model",
 		Args:    cobra.ExactArgs(2),
 		PreRunE: checkServerHeartbeat,
@@ -1091,6 +1220,7 @@ Environment Variables:
 		pullCmd,
 		pushCmd,
 		listCmd,
+		psCmd,
 		copyCmd,
 		deleteCmd,
 	} {
@@ -1105,6 +1235,7 @@ Environment Variables:
 		pullCmd,
 		pushCmd,
 		listCmd,
+		psCmd,
 		copyCmd,
 		deleteCmd,
 	)
--- a/cmd/interactive.go
+++ b/cmd/interactive.go
@@ -17,6 +17,7 @@ import (
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/progress"
 	"github.com/ollama/ollama/readline"
+	"github.com/ollama/ollama/types/errtypes"
 )

 type MultilineState int
@@ -56,6 +57,11 @@ func loadModel(cmd *cobra.Command, opts *runOptions) error {
 		Model:    opts.Model,
 		Messages: []api.Message{},
 	}
+
+	if opts.KeepAlive != nil {
+		chatReq.KeepAlive = opts.KeepAlive
+	}
+
 	err = client.Chat(cmd.Context(), chatReq, func(resp api.ChatResponse) error {
 		p.StopAndClear()
 		if len(opts.Messages) > 0 {
@@ -94,6 +100,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 		fmt.Fprintln(os.Stderr, "  /show           Show model information")
 		fmt.Fprintln(os.Stderr, "  /load <model>   Load a session or model")
 		fmt.Fprintln(os.Stderr, "  /save <model>   Save your current session")
+		fmt.Fprintln(os.Stderr, "  /clear          Clear session context")
 		fmt.Fprintln(os.Stderr, "  /bye            Exit")
 		fmt.Fprintln(os.Stderr, "  /?, /help       Help for a command")
 		fmt.Fprintln(os.Stderr, "  /? shortcuts    Help for keyboard shortcuts")
@@ -161,7 +168,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 		fmt.Fprintln(os.Stderr, "  /set parameter repeat_penalty <float> How strongly to penalize repetitions")
 		fmt.Fprintln(os.Stderr, "  /set parameter repeat_last_n <int>    Set how far back to look for repetitions")
 		fmt.Fprintln(os.Stderr, "  /set parameter num_gpu <int>          The number of layers to send to the GPU")
-		fmt.Fprintln(os.Stderr, "  /set parameter stop \"<string>\", ...   Set the stop parameters")
+		fmt.Fprintln(os.Stderr, "  /set parameter stop <string> <string> ...   Set the stop parameters")
 		fmt.Fprintln(os.Stderr, "")
 	}

@@ -275,11 +282,22 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 			fn := func(resp api.ProgressResponse) error { return nil }
 			err = client.Create(cmd.Context(), req, fn)
 			if err != nil {
-				fmt.Println("error: couldn't save model")
+				if strings.Contains(err.Error(), errtypes.InvalidModelNameErrMsg) {
+					fmt.Printf("error: The model name '%s' is invalid\n", args[1])
+					continue
+				}
 				return err
 			}
 			fmt.Printf("Created new model '%s'\n", args[1])
 			continue
+		case strings.HasPrefix(line, "/clear"):
+			opts.Messages = []api.Message{}
+			if opts.System != "" {
+				newMessage := api.Message{Role: "system", Content: opts.System}
+				opts.Messages = append(opts.Messages, newMessage)
+			}
+			fmt.Println("Cleared session context")
+			continue
 		case strings.HasPrefix(line, "/set"):
 			args := strings.Fields(line)
 			if len(args) > 1 {
--- a/convert/convert.go
+++ b/convert/convert.go
@@ -5,6 +5,7 @@ import (
 	"encoding/binary"
 	"encoding/json"
 	"fmt"
+	"io"
 	"log/slog"
 	"os"
 	"path/filepath"
@@ -47,7 +48,7 @@ type ByteOrder interface {
 type ModelArch interface {
 	GetTensors() error
 	LoadVocab() error
-	WriteGGUF() (string, error)
+	WriteGGUF(io.WriteSeeker) error
 }

 type ModelFormat interface {
--- a/convert/gemma.go
+++ b/convert/gemma.go
@@ -94,7 +94,7 @@ func (m *GemmaModel) LoadVocab() error {
 	return nil
 }

-func (m *GemmaModel) WriteGGUF() (string, error) {
+func (m *GemmaModel) WriteGGUF(ws io.WriteSeeker) error {
 	kv := llm.KV{
 		"general.architecture":                   "gemma",
 		"general.name":                           m.Name,
@@ -122,16 +122,5 @@ func (m *GemmaModel) WriteGGUF() (string, error) {
 		"tokenizer.ggml.add_eos_token":    false,
 	}

-	f, err := os.CreateTemp("", "ollama-gguf")
-	if err != nil {
-		return "", err
-	}
-	defer f.Close()
-
-	mod := llm.NewGGUFV3(m.Params.ByteOrder)
-	if err := mod.Encode(f, kv, m.Tensors); err != nil {
-		return "", err
-	}
-
-	return f.Name(), nil
+	return llm.NewGGUFV3(m.Params.ByteOrder).Encode(ws, kv, m.Tensors)
 }
--- a/convert/llama.go
+++ b/convert/llama.go
@@ -5,7 +5,6 @@ import (
 	"fmt"
 	"io"
 	"log/slog"
-	"os"
 	"regexp"
 	"strings"

@@ -132,7 +131,7 @@ func (m *LlamaModel) LoadVocab() error {
 	return nil
 }

-func (m *LlamaModel) WriteGGUF() (string, error) {
+func (m *LlamaModel) WriteGGUF(ws io.WriteSeeker) error {
 	kv := llm.KV{
 		"general.architecture":                   "llama",
 		"general.name":                           m.Name,
@@ -159,18 +158,5 @@ func (m *LlamaModel) WriteGGUF() (string, error) {
 		"tokenizer.ggml.add_eos_token":    false,
 	}

-	f, err := os.CreateTemp("", "ollama-gguf")
-	if err != nil {
-		return "", err
-	}
-	defer f.Close()
-
-	mod := llm.NewGGUFV3(m.Params.ByteOrder)
-	if err := mod.Encode(f, kv, m.Tensors); err != nil {
-		return "", err
-	}
-
-	slog.Debug(fmt.Sprintf("gguf file = %s", f.Name()))
-
-	return f.Name(), nil
+	return llm.NewGGUFV3(m.Params.ByteOrder).Encode(ws, kv, m.Tensors)
 }
--- a/convert/mistral.go
+++ b/convert/mistral.go
@@ -132,7 +132,7 @@ func (m *MistralModel) LoadVocab() error {
 	return nil
 }

-func (m *MistralModel) WriteGGUF() (string, error) {
+func (m *MistralModel) WriteGGUF(ws io.WriteSeeker) error {
 	kv := llm.KV{
 		"general.architecture":                   "llama",
 		"general.name":                           m.Name,
@@ -158,16 +158,5 @@ func (m *MistralModel) WriteGGUF() (string, error) {
 		"tokenizer.ggml.unknown_token_id": uint32(0),
 	}

-	f, err := os.CreateTemp("", "ollama-gguf")
-	if err != nil {
-		return "", err
-	}
-	defer f.Close()
-
-	mod := llm.NewGGUFV3(m.Params.ByteOrder)
-	if err := mod.Encode(f, kv, m.Tensors); err != nil {
-		return "", err
-	}
-
-	return f.Name(), nil
+	return llm.NewGGUFV3(m.Params.ByteOrder).Encode(ws, kv, m.Tensors)
 }
--- a/convert/mixtral.go
+++ b/convert/mixtral.go
@@ -1,7 +1,7 @@
 package convert

 import (
-	"os"
+	"io"
 	"regexp"

 	"github.com/ollama/ollama/llm"
@@ -47,7 +47,7 @@ func (m *MixtralModel) LoadVocab() error {
 	return nil
 }

-func (m *MixtralModel) WriteGGUF() (string, error) {
+func (m *MixtralModel) WriteGGUF(ws io.WriteSeeker) error {
 	kv := llm.KV{
 		"general.architecture":          "llama",
 		"general.name":                  m.Name,
@@ -81,16 +81,5 @@ func (m *MixtralModel) WriteGGUF() (string, error) {
 		"tokenizer.ggml.add_eos_token":    false,
 	}

-	f, err := os.CreateTemp("", "ollama-gguf")
-	if err != nil {
-		return "", err
-	}
-	defer f.Close()
-
-	mod := llm.NewGGUFV3(m.Params.ByteOrder)
-	if err := mod.Encode(f, kv, m.Tensors); err != nil {
-		return "", err
-	}
-
-	return f.Name(), nil
+	return llm.NewGGUFV3(m.Params.ByteOrder).Encode(ws, kv, m.Tensors)
 }
--- a/convert/safetensors.go
+++ b/convert/safetensors.go
@@ -53,7 +53,7 @@ func (m *SafetensorFormat) GetTensors(dirpath string, params *Params) ([]llm.Ten
 		var err error
 		t, offset, err = m.readTensors(f, offset, params)
 		if err != nil {
-			slog.Error("%v", err)
+			slog.Error(err.Error())
 			return nil, err
 		}
 		tensors = append(tensors, t...)
@@ -122,7 +122,7 @@ func (m *SafetensorFormat) readTensors(fn string, offset uint64, params *Params)

 		ggufName, err := m.GetLayerName(k)
 		if err != nil {
-			slog.Error("%v", err)
+			slog.Error(err.Error())
 			return nil, 0, err
 		}

--- a/convert/torch.go
+++ b/convert/torch.go
@@ -74,7 +74,7 @@ func (tf *TorchFormat) GetTensors(dirpath string, params *Params) ([]llm.Tensor,

 			ggufName, err := tf.GetLayerName(k.(string))
 			if err != nil {
-				slog.Error("%v", err)
+				slog.Error(err.Error())
 				return nil, err
 			}
 			slog.Debug(fmt.Sprintf("finding name for '%s' -> '%s'", k.(string), ggufName))
--- a/docs/README.md
+++ b/docs/README.md
@@ -6,7 +6,7 @@
 * [Importing models](./import.md)
 * [Linux Documentation](./linux.md)
 * [Windows Documentation](./windows.md)
-* [Docker Documentation](https://hub.docker.com/r/ollama/ollama)
+* [Docker Documentation](./docker.md)

 ### Reference

--- a/docs/api.md
+++ b/docs/api.md
@@ -17,7 +17,7 @@

 ### Model names

-Model names follow a `model:tag` format, where `model` can have an optional namespace such as `example/model`. Some examples are `orca-mini:3b-q4_1` and `llama2:70b`. The tag is optional and, if not provided, will default to `latest`. The tag is used to identify a specific version.
+Model names follow a `model:tag` format, where `model` can have an optional namespace such as `example/model`. Some examples are `orca-mini:3b-q4_1` and `llama3:70b`. The tag is optional and, if not provided, will default to `latest`. The tag is used to identify a specific version.

 ### Durations

@@ -66,7 +66,7 @@ Enable JSON mode by setting the `format` parameter to `json`. This will structur

 ```shell
 curl http://localhost:11434/api/generate -d '{
-  "model": "llama2",
+  "model": "llama3",
  "prompt": "Why is the sky blue?"
 }'
 ```
@@ -77,7 +77,7 @@ A stream of JSON objects is returned:

 ```json
 {
-  "model": "llama2",
+  "model": "llama3",
  "created_at": "2023-08-04T08:52:19.385406455-07:00",
  "response": "The",
  "done": false
@@ -95,11 +95,11 @@ The final response in the stream also includes additional data about the generat
 - `context`: an encoding of the conversation used in this response, this can be sent in the next request to keep a conversational memory
 - `response`: empty if the response was streamed, if not streamed, this will contain the full response

-To calculate how fast the response is generated in tokens per second (token/s), divide `eval_count` / `eval_duration`.
+To calculate how fast the response is generated in tokens per second (token/s), divide `eval_count` / `eval_duration` * `10^9`.

 ```json
 {
-  "model": "llama2",
+  "model": "llama3",
  "created_at": "2023-08-04T19:22:45.499127Z",
  "response": "",
  "done": true,
@@ -121,7 +121,7 @@ A response can be received in one reply when streaming is off.

 ```shell
 curl http://localhost:11434/api/generate -d '{
-  "model": "llama2",
+  "model": "llama3",
  "prompt": "Why is the sky blue?",
  "stream": false
 }'
@@ -133,7 +133,7 @@ If `stream` is set to `false`, the response will be a single JSON object:

 ```json
 {
-  "model": "llama2",
+  "model": "llama3",
  "created_at": "2023-08-04T19:22:45.499127Z",
  "response": "The sky is blue because it is the color of the sky.",
  "done": true,
@@ -155,7 +155,7 @@ If `stream` is set to `false`, the response will be a single JSON object:

 ```shell
 curl http://localhost:11434/api/generate -d '{
-  "model": "llama2",
+  "model": "llama3",
  "prompt": "What color is the sky at different times of the day? Respond using JSON",
  "format": "json",
  "stream": false
@@ -166,7 +166,7 @@ curl http://localhost:11434/api/generate -d '{

 ```json
 {
-  "model": "llama2",
+  "model": "llama3",
  "created_at": "2023-11-09T21:07:55.186497Z",
  "response": "{\n\"morning\": {\n\"color\": \"blue\"\n},\n\"noon\": {\n\"color\": \"blue-gray\"\n},\n\"afternoon\": {\n\"color\": \"warm gray\"\n},\n\"evening\": {\n\"color\": \"orange\"\n}\n}\n",
  "done": true,
@@ -289,7 +289,7 @@ If you want to set custom options for the model at runtime rather than in the Mo

 ```shell
 curl http://localhost:11434/api/generate -d '{
-  "model": "llama2",
+  "model": "llama3",
  "prompt": "Why is the sky blue?",
  "stream": false,
  "options": {
@@ -313,7 +313,6 @@ curl http://localhost:11434/api/generate -d '{
    "numa": false,
    "num_ctx": 1024,
    "num_batch": 2,
-    "num_gqa": 1,
    "num_gpu": 1,
    "main_gpu": 0,
    "low_vram": false,
@@ -321,8 +320,6 @@ curl http://localhost:11434/api/generate -d '{
    "vocab_only": false,
    "use_mmap": true,
    "use_mlock": false,
-    "rope_frequency_base": 1.1,
-    "rope_frequency_scale": 0.8,
    "num_thread": 8
  }
 }'
@@ -332,7 +329,7 @@ curl http://localhost:11434/api/generate -d '{

 ```json
 {
-  "model": "llama2",
+  "model": "llama3",
  "created_at": "2023-08-04T19:22:45.499127Z",
  "response": "The sky is blue because it is the color of the sky.",
  "done": true,
@@ -354,7 +351,7 @@ If an empty prompt is provided, the model will be loaded into memory.

 ```shell
 curl http://localhost:11434/api/generate -d '{
-  "model": "llama2"
+  "model": "llama3"
 }'
 ```

@@ -364,7 +361,7 @@ A single JSON object is returned:

 ```json
 {
-  "model": "llama2",
+  "model": "llama3",
  "created_at": "2023-12-18T19:52:07.071755Z",
  "response": "",
  "done": true
@@ -407,7 +404,7 @@ Send a chat message with a streaming response.

 ```shell
 curl http://localhost:11434/api/chat -d '{
-  "model": "llama2",
+  "model": "llama3",
  "messages": [
    {
      "role": "user",
@@ -423,7 +420,7 @@ A stream of JSON objects is returned:

 ```json
 {
-  "model": "llama2",
+  "model": "llama3",
  "created_at": "2023-08-04T08:52:19.385406455-07:00",
  "message": {
    "role": "assistant",
@@ -438,7 +435,7 @@ Final response:

 ```json
 {
-  "model": "llama2",
+  "model": "llama3",
  "created_at": "2023-08-04T19:22:45.499127Z",
  "done": true,
  "total_duration": 4883583458,
@@ -456,7 +453,7 @@ Final response:

 ```shell
 curl http://localhost:11434/api/chat -d '{
-  "model": "llama2",
+  "model": "llama3",
  "messages": [
    {
      "role": "user",
@@ -471,7 +468,7 @@ curl http://localhost:11434/api/chat -d '{

 ```json
 {
-  "model": "registry.ollama.ai/library/llama2:latest",
+  "model": "registry.ollama.ai/library/llama3:latest",
  "created_at": "2023-12-12T14:13:43.416799Z",
  "message": {
    "role": "assistant",
@@ -495,7 +492,7 @@ Send a chat message with a conversation history. You can use this same approach

 ```shell
 curl http://localhost:11434/api/chat -d '{
-  "model": "llama2",
+  "model": "llama3",
  "messages": [
    {
      "role": "user",
@@ -519,7 +516,7 @@ A stream of JSON objects is returned:

 ```json
 {
-  "model": "llama2",
+  "model": "llama3",
  "created_at": "2023-08-04T08:52:19.385406455-07:00",
  "message": {
    "role": "assistant",
@@ -533,7 +530,7 @@ Final response:

 ```json
 {
-  "model": "llama2",
+  "model": "llama3",
  "created_at": "2023-08-04T19:22:45.499127Z",
  "done": true,
  "total_duration": 8113331500,
@@ -591,7 +588,7 @@ curl http://localhost:11434/api/chat -d '{

 ```shell
 curl http://localhost:11434/api/chat -d '{
-  "model": "llama2",
+  "model": "llama3",
  "messages": [
    {
      "role": "user",
@@ -609,7 +606,7 @@ curl http://localhost:11434/api/chat -d '{

 ```json
 {
-  "model": "registry.ollama.ai/library/llama2:latest",
+  "model": "registry.ollama.ai/library/llama3:latest",
  "created_at": "2023-12-12T14:13:43.416799Z",
  "message": {
    "role": "assistant",
@@ -651,7 +648,7 @@ Create a new model from a `Modelfile`.
 ```shell
 curl http://localhost:11434/api/create -d '{
  "name": "mario",
-  "modelfile": "FROM llama2\nSYSTEM You are mario from Super Mario Bros."
+  "modelfile": "FROM llama3\nSYSTEM You are mario from Super Mario Bros."
 }'
 ```

@@ -758,7 +755,7 @@ A single JSON object will be returned.
      }
    },
    {
-      "name": "llama2:latest",
+      "name": "llama3:latest",
      "modified_at": "2023-12-07T09:32:18.757212583-08:00",
      "size": 3825819519,
      "digest": "fe938a131f40e6f6d40083c9f0f430a515233eb2edaa6d72eb85c50d64f2300e",
@@ -792,7 +789,7 @@ Show information about a model including details, modelfile, template, parameter

 ```shell
 curl http://localhost:11434/api/show -d '{
-  "name": "llama2"
+  "name": "llama3"
 }'
 ```

@@ -800,9 +797,9 @@ curl http://localhost:11434/api/show -d '{

 ```json
 {
-  "modelfile": "# Modelfile generated by \"ollama show\"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llava:latest\n\nFROM /Users/matt/.ollama/models/blobs/sha256:200765e1283640ffbd013184bf496e261032fa75b99498a9613be4e94d63ad52\nTEMPLATE \"\"\"{{ .System }}\nUSER: {{ .Prompt }}\nASSSISTANT: \"\"\"\nPARAMETER num_ctx 4096\nPARAMETER stop \"\u003c/s\u003e\"\nPARAMETER stop \"USER:\"\nPARAMETER stop \"ASSSISTANT:\"",
-  "parameters": "num_ctx                        4096\nstop                           \u003c/s\u003e\nstop                           USER:\nstop                           ASSSISTANT:",
-  "template": "{{ .System }}\nUSER: {{ .Prompt }}\nASSSISTANT: ",
+  "modelfile": "# Modelfile generated by \"ollama show\"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llava:latest\n\nFROM /Users/matt/.ollama/models/blobs/sha256:200765e1283640ffbd013184bf496e261032fa75b99498a9613be4e94d63ad52\nTEMPLATE \"\"\"{{ .System }}\nUSER: {{ .Prompt }}\nASSISTANT: \"\"\"\nPARAMETER num_ctx 4096\nPARAMETER stop \"\u003c/s\u003e\"\nPARAMETER stop \"USER:\"\nPARAMETER stop \"ASSISTANT:\"",
+  "parameters": "num_ctx                        4096\nstop                           \u003c/s\u003e\nstop                           USER:\nstop                           ASSISTANT:",
+  "template": "{{ .System }}\nUSER: {{ .Prompt }}\nASSISTANT: ",
  "details": {
    "format": "gguf",
    "family": "llama",
@@ -827,8 +824,8 @@ Copy a model. Creates a model with another name from an existing model.

 ```shell
 curl http://localhost:11434/api/copy -d '{
-  "source": "llama2",
-  "destination": "llama2-backup"
+  "source": "llama3",
+  "destination": "llama3-backup"
 }'
 ```

@@ -854,7 +851,7 @@ Delete a model and its data.

 ```shell
 curl -X DELETE http://localhost:11434/api/delete -d '{
-  "name": "llama2:13b"
+  "name": "llama3:13b"
 }'
 ```

@@ -882,7 +879,7 @@ Download a model from the ollama library. Cancelled pulls are resumed from where

 ```shell
 curl http://localhost:11434/api/pull -d '{
-  "name": "llama2"
+  "name": "llama3"
 }'
 ```

--- a/docs/development.md
+++ b/docs/development.md
@@ -51,7 +51,7 @@ Typically the build scripts will auto-detect CUDA, however, if your Linux distro
 or installation approach uses unusual paths, you can specify the location by
 specifying an environment variable `CUDA_LIB_DIR` to the location of the shared
 libraries, and `CUDACXX` to the location of the nvcc compiler. You can customize
-set set of target CUDA architectues by setting `CMAKE_CUDA_ARCHITECTURES` (e.g. "50;60;70")
+a set of target CUDA architectures by setting `CMAKE_CUDA_ARCHITECTURES` (e.g. "50;60;70")

 Then generate dependencies:

@@ -142,4 +142,4 @@ In addition to the common Windows development tools described above, install AMD
 - [AMD HIP](https://www.amd.com/en/developer/resources/rocm-hub/hip-sdk.html)
 - [Strawberry Perl](https://strawberryperl.com/)

-Lastly, add `ninja.exe` included with MSVC to the system path (e.g. `C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\Common7\IDE\CommonExtensions\Microsoft\CMake\Ninja`).
+Lastly, add `ninja.exe` included with MSVC to the system path (e.g. `C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\Common7\IDE\CommonExtensions\Microsoft\CMake\Ninja`).
--- a/docs/docker.md
+++ b/docs/docker.md
@@ -0,0 +1,71 @@
+# Ollama Docker image
+
+### CPU only
+
+```bash
+docker run -d -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama
+```
+
+### Nvidia GPU
+Install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installation).
+
+#### Install with Apt
+1.  Configure the repository
+```bash
+curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
+    | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
+curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \
+    | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \
+    | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
+sudo apt-get update
+```
+2.  Install the NVIDIA Container Toolkit packages
+```bash
+sudo apt-get install -y nvidia-container-toolkit
+```
+
+#### Install with Yum or Dnf
+1.  Configure the repository
+    
+```bash
+curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo \
+    | sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo
+```
+    
+2. Install the NVIDIA Container Toolkit packages
+    
+```bash
+sudo yum install -y nvidia-container-toolkit
+```
+
+#### Configure Docker to use Nvidia driver 
+```
+sudo nvidia-ctk runtime configure --runtime=docker
+sudo systemctl restart docker
+```
+
+#### Start the container
+
+```bash
+docker run -d --gpus=all -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama
+```
+
+### AMD GPU
+
+To run Ollama using Docker with AMD GPUs, use the `rocm` tag and the following command:
+
+```
+docker run -d --device /dev/kfd --device /dev/dri -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama:rocm
+```
+
+### Run model locally
+
+Now you can run a model:
+
+```
+docker exec -it ollama ollama run llama3
+```
+
+### Try different models
+
+More models can be found on the [Ollama library](https://ollama.com/library).
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -32,7 +32,7 @@ When using the API, specify the `num_ctx` parameter:

 ```
 curl http://localhost:11434/api/generate -d '{
-  "model": "llama2",
+  "model": "llama3",
  "prompt": "Why is the sky blue?",
  "options": {
    "num_ctx": 4096
@@ -80,17 +80,19 @@ If Ollama is run as a systemd service, environment variables should be set using

 ### Setting environment variables on Windows

-On windows, Ollama inherits your user and system environment variables.
+On Windows, Ollama inherits your user and system environment variables.

-1. First Quit Ollama by clicking on it in the task bar
+1. First Quit Ollama by clicking on it in the task bar.

-2. Edit system environment variables from the control panel
+2. Start the Settings (Windows 11) or Control Panel (Windows 10) application and search for _environment variables_.

-3. Edit or create New variable(s) for your user account for `OLLAMA_HOST`, `OLLAMA_MODELS`, etc.
+3. Click on _Edit environment variables for your account_.

-4. Click OK/Apply to save 
+4. Edit or create a new variable for your user account for `OLLAMA_HOST`, `OLLAMA_MODELS`, etc.

-5. Run `ollama` from a new terminal window 
+5. Click OK/Apply to save.
+
+6. Start the Ollama application from the Windows Start menu.


 ## How can I expose Ollama on my network?
@@ -140,7 +142,7 @@ Refer to the section [above](#how-do-i-configure-ollama-server) for how to set e

 - macOS: `~/.ollama/models`
 - Linux: `/usr/share/ollama/.ollama/models`
- Windows: `C:\Users\<username>\.ollama\models`
+- Windows: `C:\Users\%username%\.ollama\models`

 ### How do I set them to a different location?

@@ -221,14 +223,20 @@ The `keep_alive` parameter can be set to:

 For example, to preload a model and leave it in memory use:
 ```shell
-curl http://localhost:11434/api/generate -d '{"model": "llama2", "keep_alive": -1}'
+curl http://localhost:11434/api/generate -d '{"model": "llama3", "keep_alive": -1}'
 ```

 To unload the model and free up memory use:
 ```shell
-curl http://localhost:11434/api/generate -d '{"model": "llama2", "keep_alive": 0}'
+curl http://localhost:11434/api/generate -d '{"model": "llama3", "keep_alive": 0}'
 ```

 Alternatively, you can change the amount of time all models are loaded into memory by setting the `OLLAMA_KEEP_ALIVE` environment variable when starting the Ollama server. The `OLLAMA_KEEP_ALIVE` variable uses the same parameter types as the `keep_alive` parameter types mentioned above. Refer to section explaining [how to configure the Ollama server](#how-do-i-configure-ollama-server) to correctly set the environment variable.

 If you wish to override the `OLLAMA_KEEP_ALIVE` setting, use the `keep_alive` API parameter with the `/api/generate` or `/api/chat` API endpoints.
+
+## How do I manage the maximum number of requests the server can queue
+
+If too many requests are sent to the server, it will respond with a 503 error
+indicating the server is overloaded.  You can adjust how many requests may be
+queue by setting `OLLAMA_MAX_QUEUE`
--- a/docs/import.md
+++ b/docs/import.md
@@ -125,7 +125,7 @@ Publishing models is in early alpha. If you'd like to publish your model to shar

 1. Create [an account](https://ollama.com/signup)
 2. Copy your Ollama public key:
-  - macOS: `cat ~/.ollama/id_ed25519.pub`
+  - macOS: `cat ~/.ollama/id_ed25519.pub | pbcopy`
  - Windows: `type %USERPROFILE%\.ollama\id_ed25519.pub`
  - Linux: `cat /usr/share/ollama/.ollama/id_ed25519.pub`
 3. Add your public key to your [Ollama account](https://ollama.com/settings/keys)
@@ -136,6 +136,8 @@ Next, copy your model to your username's namespace:
 ollama cp example <your username>/example
 ```

+> Note: model names may only contain lowercase letters, digits, and the characters `.`, `-`, and `_`.
+
 Then push the model:

 ```
--- a/docs/linux.md
+++ b/docs/linux.md
@@ -105,7 +105,7 @@ sudo chmod +x /usr/bin/ollama
 To view logs of Ollama running as a startup service, run:

 ```bash
-journalctl -u ollama
+journalctl -e -u ollama
 ```

 ## Uninstall
--- a/docs/modelfile.md
+++ b/docs/modelfile.md
@@ -10,7 +10,7 @@ A model file is the blueprint to create and share models with Ollama.
 - [Examples](#examples)
 - [Instructions](#instructions)
  - [FROM (Required)](#from-required)
-    - [Build from llama2](#build-from-llama2)
+    - [Build from llama3](#build-from-llama3)
    - [Build from a bin file](#build-from-a-bin-file)
  - [PARAMETER](#parameter)
    - [Valid Parameters and Values](#valid-parameters-and-values)
@@ -48,7 +48,7 @@ INSTRUCTION arguments
 An example of a `Modelfile` creating a mario blueprint:

 ```modelfile
-FROM llama2
+FROM llama3
 # sets the temperature to 1 [higher is more creative, lower is more coherent]
 PARAMETER temperature 1
 # sets the context window size to 4096, this controls how many tokens the LLM can use as context to generate the next token
@@ -67,33 +67,25 @@ To use this:

 More examples are available in the [examples directory](../examples).

-### `Modelfile`s in [ollama.com/library][1]
-
-There are two ways to view `Modelfile`s underlying the models in [ollama.com/library][1]:
-
- Option 1: view a details page from a model's tags page:
-  1.  Go to a particular model's tags (e.g. https://ollama.com/library/llama2/tags)
-  2.  Click on a tag (e.g. https://ollama.com/library/llama2:13b)
-  3.  Scroll down to "Layers"
-      - Note: if the [`FROM` instruction](#from-required) is not present,
-        it means the model was created from a local file
- Option 2: use `ollama show` to print the `Modelfile` for any local models like so:
+To view the Modelfile of a given model, use the `ollama show --modelfile` command.

  ```bash
-  > ollama show --modelfile llama2:13b
+  > ollama show --modelfile llama3
  # Modelfile generated by "ollama show"
  # To build a new Modelfile based on this one, replace the FROM line with:
-  # FROM llama2:13b
+  # FROM llama3:latest
+  FROM /Users/pdevine/.ollama/models/blobs/sha256-00e1317cbf74d901080d7100f57580ba8dd8de57203072dc6f668324ba545f29
+  TEMPLATE """{{ if .System }}<|start_header_id|>system<|end_header_id|>

-  FROM /root/.ollama/models/blobs/sha256:123abc
-  TEMPLATE """[INST] {{ if .System }}<<SYS>>{{ .System }}<</SYS>>
+  {{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>

-  {{ end }}{{ .Prompt }} [/INST] """
-  SYSTEM """"""
-  PARAMETER stop [INST]
-  PARAMETER stop [/INST]
-  PARAMETER stop <<SYS>>
-  PARAMETER stop <</SYS>>
+  {{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>
+
+  {{ .Response }}<|eot_id|>"""
+  PARAMETER stop "<|start_header_id|>"
+  PARAMETER stop "<|end_header_id|>"
+  PARAMETER stop "<|eot_id|>"
+  PARAMETER stop "<|reserved_special_token"
  ```

 ## Instructions
@@ -106,10 +98,10 @@ The `FROM` instruction defines the base model to use when creating a model.
 FROM <model name>:<tag>
 ```

-#### Build from llama2
+#### Build from llama3

 ```modelfile
-FROM llama2
+FROM llama3
 ```

 A list of available base models:
--- a/docs/openai.md
+++ b/docs/openai.md
@@ -25,7 +25,7 @@ chat_completion = client.chat.completions.create(
            'content': 'Say this is a test',
        }
    ],
-    model='llama2',
+    model='llama3',
 )
 ```

@@ -43,7 +43,7 @@ const openai = new OpenAI({

 const chatCompletion = await openai.chat.completions.create({
  messages: [{ role: 'user', content: 'Say this is a test' }],
-  model: 'llama2',
+  model: 'llama3',
 })
 ```

@@ -53,7 +53,7 @@ const chatCompletion = await openai.chat.completions.create({
 curl http://localhost:11434/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
-        "model": "llama2",
+        "model": "llama3",
        "messages": [
            {
                "role": "system",
@@ -113,7 +113,7 @@ curl http://localhost:11434/v1/chat/completions \
 Before using a model, pull it locally `ollama pull`:

 ```shell
-ollama pull llama2
+ollama pull llama3
 ```

 ### Default model names
@@ -121,7 +121,7 @@ ollama pull llama2
 For tooling that relies on default OpenAI model names such as `gpt-3.5-turbo`, use `ollama cp` to copy an existing model name to a temporary name:

 ```
-ollama cp llama2 gpt-3.5-turbo
+ollama cp llama3 gpt-3.5-turbo
 ```

 Afterwards, this new model name can be specified the `model` field:
--- a/docs/troubleshooting.md
+++ b/docs/troubleshooting.md
@@ -82,4 +82,23 @@ curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION="0.1.29" sh
 If your system is configured with the "noexec" flag where Ollama stores its
 temporary executable files, you can specify an alternate location by setting
 OLLAMA_TMPDIR to a location writable by the user ollama runs as.  For example
-OLLAMA_TMPDIR=/usr/share/ollama/
+OLLAMA_TMPDIR=/usr/share/ollama/
+
+## Container fails to run on NVIDIA GPU
+
+Make sure you've set up the conatiner runtime first as described in [docker.md](./docker.md)
+
+Sometimes the container runtime can have difficulties initializing the GPU.
+When you check the server logs, this can show up as various error codes, such
+as "3" (not initialized), "46" (device unavailable), "100" (no device), "999"
+(unknown), or others.  The following troubleshooting techniques may help resolve
+the problem
+
+- Is the uvm driver not loaded? `sudo nvidia-modprobe -u`
+- Try reloading the nvidia_uvm driver - `sudo rmmod nvidia_uvm` then `sudo modprobe nvidia_uvm`
+- Try rebooting
+- Make sure you're running the latest nvidia drivers
+
+If none of those resolve the problem, gather additional information and file an issue:
+- Set `CUDA_ERROR_LEVEL=50` and try again to get more diagnostic logs
+- Check dmesg for any errors `sudo dmesg | grep -i nvrm` and `sudo dmesg | grep -i nvidia`
--- a/docs/tutorials/langchainjs.md
+++ b/docs/tutorials/langchainjs.md
@@ -5,17 +5,17 @@ In this tutorial, we are going to use JavaScript with LangChain and Ollama to le
 To get started, let's just use **LangChain** to ask a simple question to a model. To do this with JavaScript, we need to install **LangChain**:

 ```bash
-npm install langchain
+npm install @langchain/community
 ```

 Now we can start building out our JavaScript:

 ```javascript
-import { Ollama } from "langchain/llms/ollama";
+import { Ollama } from "@langchain/community/llms/ollama";

 const ollama = new Ollama({
  baseUrl: "http://localhost:11434",
-  model: "llama2",
+  model: "llama3",
 });

 const answer = await ollama.invoke(`why is the sky blue?`);
@@ -23,10 +23,10 @@ const answer = await ollama.invoke(`why is the sky blue?`);
 console.log(answer);
 ```

-That will get us the same thing as if we ran `ollama run llama2 "why is the sky blue"` in the terminal. But we want to load a document from the web to ask a question against. **Cheerio** is a great library for ingesting a webpage, and **LangChain** uses it in their **CheerioWebBaseLoader**. So let's install **Cheerio** and build that part of the app.
+That will get us the same thing as if we ran `ollama run llama3 "why is the sky blue"` in the terminal. But we want to load a document from the web to ask a question against. **Cheerio** is a great library for ingesting a webpage, and **LangChain** uses it in their **CheerioWebBaseLoader**. So let's install **Cheerio** and build that part of the app.

 ```bash
-npm install cheerio 
+npm install cheerio
 ```

 ```javascript
--- a/docs/tutorials/langchainpy.md
+++ b/docs/tutorials/langchainpy.md
@@ -12,15 +12,17 @@ So let's figure out how we can use **LangChain** with Ollama to ask our question

 Let's start by asking a simple question that we can get an answer to from the **Llama2** model using **Ollama**. First, we need to install the **LangChain** package:

-`pip install langchain`
+`pip install langchain_community`

 Then we can create a model and ask the question:

 ```python
-from langchain.llms import Ollama
-ollama = Ollama(base_url='http://localhost:11434',
-model="llama2")
-print(ollama("why is the sky blue"))
+from langchain_community.llms import Ollama
+ollama = Ollama(
+    base_url='http://localhost:11434',
+    model="llama3"
+)
+print(ollama.invoke("why is the sky blue"))
 ```

 Notice that we are defining the model and the base URL for Ollama.
--- a/docs/windows.md
+++ b/docs/windows.md
@@ -1,47 +1,61 @@
-# Ollama Windows Preview
-
-Welcome to the Ollama Windows preview.
-
-No more WSL required!
-
-Ollama now runs as a native Windows application, including NVIDIA and AMD Radeon GPU support.
-After installing Ollama Windows Preview, Ollama will run in the background and
-the `ollama` command line is available in `cmd`, `powershell` or your favorite
-terminal application. As usual the Ollama [api](./api.md) will be served on
-`http://localhost:11434`.
-
-As this is a preview release, you should expect a few bugs here and there.  If
-you run into a problem you can reach out on
-[Discord](https://discord.gg/ollama), or file an 
-[issue](https://github.com/ollama/ollama/issues).
-Logs will often be helpful in diagnosing the problem (see
-[Troubleshooting](#troubleshooting) below)
-
-## System Requirements
-
-* Windows 10 or newer, Home or Pro
-* NVIDIA 452.39 or newer Drivers if you have an NVIDIA card
-* AMD Radeon Driver https://www.amd.com/en/support if you have a Radeon card
-
-## API Access
-
-Here's a quick example showing API access from `powershell`
-```powershell
-(Invoke-WebRequest -method POST -Body '{"model":"llama2", "prompt":"Why is the sky blue?", "stream": false}' -uri http://localhost:11434/api/generate ).Content | ConvertFrom-json
-```
-
-## Troubleshooting
-
-While we're in preview, `OLLAMA_DEBUG` is always enabled, which adds
-a "view logs" menu item to the app, and increses logging for the GUI app and
-server.
-
-Ollama on Windows stores files in a few different locations.  You can view them in
-the explorer window by hitting `<cmd>+R` and type in:
- `explorer %LOCALAPPDATA%\Ollama` contains logs, and downloaded updates
-    - *app.log* contains logs from the GUI application
-    - *server.log* contains the server logs
-    - *upgrade.log* contains log output for upgrades
- `explorer %LOCALAPPDATA%\Programs\Ollama` contains the binaries (The installer adds this to your user PATH)
- `explorer %HOMEPATH%\.ollama` contains models and configuration
- `explorer %TEMP%` contains temporary executable files in one or more `ollama*` directories
+# Ollama Windows Preview
+
+Welcome to the Ollama Windows preview.
+
+No more WSL required!
+
+Ollama now runs as a native Windows application, including NVIDIA and AMD Radeon GPU support.
+After installing Ollama Windows Preview, Ollama will run in the background and
+the `ollama` command line is available in `cmd`, `powershell` or your favorite
+terminal application. As usual the Ollama [api](./api.md) will be served on
+`http://localhost:11434`.
+
+As this is a preview release, you should expect a few bugs here and there.  If
+you run into a problem you can reach out on
+[Discord](https://discord.gg/ollama), or file an
+[issue](https://github.com/ollama/ollama/issues).
+Logs will often be helpful in diagnosing the problem (see
+[Troubleshooting](#troubleshooting) below)
+
+## System Requirements
+
+* Windows 10 or newer, Home or Pro
+* NVIDIA 452.39 or newer Drivers if you have an NVIDIA card
+* AMD Radeon Driver https://www.amd.com/en/support if you have a Radeon card
+
+## API Access
+
+Here's a quick example showing API access from `powershell`
+```powershell
+(Invoke-WebRequest -method POST -Body '{"model":"llama3", "prompt":"Why is the sky blue?", "stream": false}' -uri http://localhost:11434/api/generate ).Content | ConvertFrom-json
+```
+
+## Troubleshooting
+
+While we're in preview, `OLLAMA_DEBUG` is always enabled, which adds
+a "view logs" menu item to the app, and increses logging for the GUI app and
+server.
+
+Ollama on Windows stores files in a few different locations.  You can view them in
+the explorer window by hitting `<cmd>+R` and type in:
+- `explorer %LOCALAPPDATA%\Ollama` contains logs, and downloaded updates
+    - *app.log* contains logs from the GUI application
+    - *server.log* contains the server logs
+    - *upgrade.log* contains log output for upgrades
+- `explorer %LOCALAPPDATA%\Programs\Ollama` contains the binaries (The installer adds this to your user PATH)
+- `explorer %HOMEPATH%\.ollama` contains models and configuration
+- `explorer %TEMP%` contains temporary executable files in one or more `ollama*` directories
+
+
+## Standalone CLI
+
+The easiest way to install Ollama on Windows is to use the `OllamaSetup.exe`
+installer. It installs in your account without requiring Administrator rights.
+We update Ollama regularly to support the latest models, and this installer will
+help you keep up to date.
+
+If you'd like to install or integrate Ollama as a service, a standalone
+`ollama-windows-amd64.zip` zip file is available containing only the Ollama CLI
+and GPU library dependencies for Nvidia and AMD. This allows for embedding
+Ollama in existing applications, or running it as a system service via `ollama
+serve` with tools such as [NSSM](https://nssm.cc/).
--- a/examples/bash-comparemodels/README.md
+++ b/examples/bash-comparemodels/README.md
@@ -1,10 +0,0 @@
-# Bash Shell examples
-
-When calling `ollama`, you can pass it a file to run all the prompts in the file, one after the other:
-
-`ollama run llama2 < sourcequestions.txt`
-
-This concept is used in the following example.
-
-## Compare Models
-`comparemodels.sh` is a script that runs all the questions in `sourcequestions.txt` using any 4 models you choose that you have already pulled from the Ollama library or have created locally.
--- a/examples/bash-comparemodels/comparemodels.sh
+++ b/examples/bash-comparemodels/comparemodels.sh
@@ -1,64 +0,0 @@
-#! /usr/bin/env bash
-# Compare multiple models by running them with the same questions
-
-NUMBEROFCHOICES=4
-SELECTIONS=()
-declare -a SUMS=()
-
-# Get the list of models
-CHOICES=$(ollama list | awk '{print $1}')
-
-# Select which models to run as a comparison
-echo "Select $NUMBEROFCHOICES models to compare:"
-select ITEM in $CHOICES; do
-    if [[ -n $ITEM ]]; then
-        echo "You have selected $ITEM"
-        SELECTIONS+=("$ITEM")
-        ((COUNT++))
-        if [[ $COUNT -eq $NUMBEROFCHOICES ]]; then
-            break
-        fi
-    else
-        echo "Invalid selection"
-    fi
-done
-
-# Loop through each of the selected models
-for ITEM in "${SELECTIONS[@]}"; do
-    echo "--------------------------------------------------------------"
-    echo "Loading the model $ITEM into memory"
-    ollama run "$ITEM" ""
-    echo "--------------------------------------------------------------"
-    echo "Running the questions through the model $ITEM"
-    COMMAND_OUTPUT=$(ollama run "$ITEM" --verbose < sourcequestions.txt 2>&1| tee /dev/stderr)
-
-    # eval duration is sometimes listed in seconds and sometimes in milliseconds. 
-    # Add up the values for each model
-    SUM=$(echo "$COMMAND_OUTPUT" | awk '
-    /eval duration:/ {
-        value = $3
-        if (index(value, "ms") > 0) {
-            gsub("ms", "", value)
-            value /= 1000
-        } else {
-            gsub("s", "", value)
-        }
-        sum += value
-    }
-    END { print sum }')
-
-
-    SUMS+=("All questions for $ITEM completed in $SUM seconds")
-done
-
-echo ""
-echo "--------------------------------------------------------------"
-echo -e "Sums of eval durations for each run:"
-for val in "${SUMS[@]}"; do
-    echo "$val"
-done
-
-echo "--------------------------------------------------------------"
-echo "Comparison complete. Now you can decide"
-echo "which model is best."
-echo "--------------------------------------------------------------"
--- a/examples/bash-comparemodels/sourcequestions.txt
+++ b/examples/bash-comparemodels/sourcequestions.txt
@@ -1,7 +0,0 @@
-Why is the sky blue
-What is a black hole
-Explain the big bang theory like I am 5?
-What is the quickest way to win a game of Monopoly with 3 others?
-Why does a vacuum bottle keep my coffee hot and my milkshake cold?
-What is the difference between a meteor, a meteorite, and a meteoroid?
-Create an array with 5 items and print to the console. Do this in Python, C#, Typescript, and Rust.
--- a/examples/flyio/.gitignore
+++ b/examples/flyio/.gitignore
@@ -0,0 +1 @@
+fly.toml
--- a/examples/flyio/README.md
+++ b/examples/flyio/README.md
@@ -0,0 +1,67 @@
+# Deploy Ollama to Fly.io
+
+> Note: this example exposes a public endpoint and does not configure authentication. Use with care.
+
+## Prerequisites
+
+- Ollama: https://ollama.com/download
+- Fly.io account. Sign up for a free account: https://fly.io/app/sign-up
+
+## Steps
+
+1. Login to Fly.io
+
+    ```bash
+    fly auth login
+    ```
+
+1. Create a new Fly app
+
+    ```bash
+    fly launch --name <name> --image ollama/ollama --internal-port 11434 --vm-size shared-cpu-8x --now
+    ```
+
+1. Pull and run `orca-mini:3b`
+
+    ```bash
+    OLLAMA_HOST=https://<name>.fly.dev ollama run orca-mini:3b
+    ```
+
+`shared-cpu-8x` is a free-tier eligible machine type. For better performance, switch to a `performance` or `dedicated` machine type or attach a GPU for hardware acceleration (see below).
+
+## (Optional) Persistent Volume
+
+By default Fly Machines use ephemeral storage which is problematic if you want to use the same model across restarts without pulling it again. Create and attach a persistent volume to store the downloaded models:
+
+1. Create the Fly Volume
+
+    ```bash
+    fly volume create ollama
+    ```
+
+1. Update `fly.toml` and add `[mounts]`
+
+    ```toml
+    [mounts]
+      source = "ollama"
+      destination = "/mnt/ollama/models"
+    ```
+
+1. Update `fly.toml` and add `[env]`
+
+    ```toml
+    [env]
+      OLLAMA_MODELS = "/mnt/ollama/models"
+    ```
+
+1. Deploy your app
+
+    ```bash
+    fly deploy
+    ```
+
+## (Optional) Hardware Acceleration
+
+Fly.io GPU is currently in waitlist. Sign up for the waitlist: https://fly.io/gpu
+
+Once you've been accepted, create the app with the additional flags `--vm-gpu-kind a100-pcie-40gb` or `--vm-gpu-kind a100-pcie-80gb`.
--- a/examples/go-chat/main.go
+++ b/examples/go-chat/main.go
@@ -35,7 +35,7 @@ func main() {

 	ctx := context.Background()
 	req := &api.ChatRequest{
-		Model:    "llama2",
+		Model:    "llama3",
 		Messages: messages,
 	}

--- a/examples/go-http-generate/main.go
+++ b/examples/go-http-generate/main.go
@@ -19,7 +19,7 @@ func main() {
 	}

 	defer resp.Body.Close()
-	
+
 	responseData, err := io.ReadAll(resp.Body)
 	if err != nil {
 		log.Fatal(err)
--- a/examples/kubernetes/README.md
+++ b/examples/kubernetes/README.md
@@ -7,12 +7,24 @@

 ## Steps

-1. Create the Ollama namespace, daemon set, and service
+1. Create the Ollama namespace, deployment, and service

   ```bash
   kubectl apply -f cpu.yaml
   ```

+## (Optional) Hardware Acceleration
+
+Hardware acceleration in Kubernetes requires NVIDIA's [`k8s-device-plugin`](https://github.com/NVIDIA/k8s-device-plugin) which is deployed in Kubernetes in form of daemonset. Follow the link for more details.
+
+Once configured, create a GPU enabled Ollama deployment.
+
+```bash
+kubectl apply -f gpu.yaml
+```
+
+## Test
+
 1. Port forward the Ollama service to connect and use it locally

   ```bash
@@ -23,14 +35,4 @@

   ```bash
   ollama run orca-mini:3b
-   ```
-
-## (Optional) Hardware Acceleration
-
-Hardware acceleration in Kubernetes requires NVIDIA's [`k8s-device-plugin`](https://github.com/NVIDIA/k8s-device-plugin). Follow the link for more details.
-
-Once configured, create a GPU enabled Ollama deployment.
-
-```bash
-kubectl apply -f gpu.yaml
-```
+   ```
--- a/examples/langchain-python-rag-document/main.py
+++ b/examples/langchain-python-rag-document/main.py
@@ -40,9 +40,9 @@ while True:
        continue

    # Prompt
-    template = """Use the following pieces of context to answer the question at the end. 
-    If you don't know the answer, just say that you don't know, don't try to make up an answer. 
-    Use three sentences maximum and keep the answer as concise as possible. 
+    template = """Use the following pieces of context to answer the question at the end.
+    If you don't know the answer, just say that you don't know, don't try to make up an answer.
+    Use three sentences maximum and keep the answer as concise as possible.
    {context}
    Question: {question}
    Helpful Answer:"""
@@ -51,11 +51,11 @@ while True:
        template=template,
    )

-    llm = Ollama(model="llama2:13b", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]))
+    llm = Ollama(model="llama3:8b", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]))
    qa_chain = RetrievalQA.from_chain_type(
        llm,
        retriever=vectorstore.as_retriever(),
        chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
    )

-    result = qa_chain({"query": query})
+    result = qa_chain({"query": query})
--- a/examples/langchain-python-rag-websummary/main.py
+++ b/examples/langchain-python-rag-websummary/main.py
@@ -1,12 +1,12 @@
-from langchain.llms import Ollama
-from langchain.document_loaders import WebBaseLoader
+from langchain_community.llms import Ollama
+from langchain_community.document_loaders import WebBaseLoader
 from langchain.chains.summarize import load_summarize_chain

 loader = WebBaseLoader("https://ollama.com/blog/run-llama2-uncensored-locally")
 docs = loader.load()

-llm = Ollama(model="llama2")
+llm = Ollama(model="llama3")
 chain = load_summarize_chain(llm, chain_type="stuff")

-result = chain.run(docs)
+result = chain.invoke(docs) 
 print(result)
--- a/examples/langchain-python-simple/README.md
+++ b/examples/langchain-python-simple/README.md
@@ -4,10 +4,10 @@ This example is a basic "hello world" of using LangChain with Ollama.

 ## Running the Example

-1. Ensure you have the `llama2` model installed:
+1. Ensure you have the `llama3` model installed:

   ```bash
-   ollama pull llama2
+   ollama pull llama3
   ```

 2. Install the Python Requirements.
@@ -21,4 +21,3 @@ This example is a basic "hello world" of using LangChain with Ollama.
   ```bash
   python main.py
   ```
-  
--- a/examples/langchain-python-simple/main.py
+++ b/examples/langchain-python-simple/main.py
@@ -1,6 +1,6 @@
 from langchain.llms import Ollama

 input = input("What is your question?")
-llm = Ollama(model="llama2")
+llm = Ollama(model="llama3")
 res = llm.predict(input)
 print (res)
--- a/examples/modelfile-mario/Modelfile
+++ b/examples/modelfile-mario/Modelfile
@@ -1,4 +1,4 @@
-FROM llama2
+FROM llama3
 PARAMETER temperature 1
 SYSTEM """
 You are Mario from super mario bros, acting as an assistant.
--- a/examples/modelfile-mario/readme.md
+++ b/examples/modelfile-mario/readme.md
@@ -2,12 +2,12 @@

 # Example character: Mario

-This example shows how to create a basic character using Llama2 as the base model.
+This example shows how to create a basic character using Llama3 as the base model.

 To run this example:

 1. Download the Modelfile
-2. `ollama pull llama2` to get the base model used in the model file.
+2. `ollama pull llama3` to get the base model used in the model file.
 3. `ollama create NAME -f ./Modelfile`
 4. `ollama run NAME`

@@ -18,7 +18,7 @@ Ask it some questions like "Who are you?" or "Is Peach in trouble again?"
 What the model file looks like:

 ```
-FROM llama2
+FROM llama3
 PARAMETER temperature 1
 SYSTEM """
 You are Mario from Super Mario Bros, acting as an assistant.
--- a/examples/python-json-datagenerator/predefinedschema.py
+++ b/examples/python-json-datagenerator/predefinedschema.py
@@ -2,16 +2,16 @@ import requests
 import json
 import random

-model = "llama2"
+model = "llama3"
 template = {
-  "firstName": "", 
-  "lastName": "", 
+  "firstName": "",
+  "lastName": "",
  "address": {
-    "street": "", 
-    "city": "", 
-    "state": "", 
+    "street": "",
+    "city": "",
+    "state": "",
    "zipCode": ""
-  }, 
+  },
  "phoneNumber": ""
 }

--- a/examples/python-json-datagenerator/randomaddresses.py
+++ b/examples/python-json-datagenerator/randomaddresses.py
@@ -12,7 +12,7 @@ countries = [
    "France",
 ]
 country = random.choice(countries)
-model = "llama2"
+model = "llama3"

 prompt = f"generate one realistically believable sample data set of a persons first name, last name, address in {country}, and phone number. Do not use common names. Respond using JSON. Key names should have no backslashes, values should use plain ascii with no special characters."

--- a/examples/python-json-datagenerator/readme.md
+++ b/examples/python-json-datagenerator/readme.md
@@ -6,10 +6,10 @@ There are two python scripts in this example. `randomaddresses.py` generates ran

 ## Running the Example

-1. Ensure you have the `llama2` model installed:
+1. Ensure you have the `llama3` model installed:

   ```bash
-   ollama pull llama2
+   ollama pull llama3
   ```

 2. Install the Python Requirements.
--- a/examples/python-simplechat/client.py
+++ b/examples/python-simplechat/client.py
@@ -2,7 +2,7 @@ import json
 import requests

 # NOTE: ollama must be running for this to work, start the ollama app or run `ollama serve`
-model = "llama2"  # TODO: update this for whatever model you wish to use
+model = "llama3"  # TODO: update this for whatever model you wish to use


 def chat(messages):
--- a/examples/python-simplechat/readme.md
+++ b/examples/python-simplechat/readme.md
@@ -4,10 +4,10 @@ The **chat** endpoint is one of two ways to generate text from an LLM with Ollam

 ## Running the Example

-1. Ensure you have the `llama2` model installed:
+1. Ensure you have the `llama3` model installed:

   ```bash
-   ollama pull llama2
+   ollama pull llama3
   ```

 2. Install the Python Requirements.
--- a/examples/typescript-mentors/README.md
+++ b/examples/typescript-mentors/README.md
@@ -4,10 +4,10 @@ This example demonstrates how one would create a set of 'mentors' you can have a

 ## Usage

-1. Add llama2 to have the mentors ask your questions:
+1. Add llama3 to have the mentors ask your questions:

   ```bash
-   ollama pull llama2
+   ollama pull llama3
   ```

 2. Install prerequisites:
--- a/examples/typescript-mentors/character-generator.ts
+++ b/examples/typescript-mentors/character-generator.ts
@@ -15,7 +15,7 @@ async function characterGenerator() {
  ollama.setModel("stablebeluga2:70b-q4_K_M");
  const bio = await ollama.generate(`create a bio of ${character} in a single long paragraph. Instead of saying '${character} is...' or '${character} was...' use language like 'You are...' or 'You were...'. Then create a paragraph describing the speaking mannerisms and style of ${character}. Don't include anything about how ${character} looked or what they sounded like, just focus on the words they said. Instead of saying '${character} would say...' use language like 'You should say...'. If you use quotes, always use single quotes instead of double quotes. If there are any specific words or phrases you used a lot, show how you used them. `);

-  const thecontents = `FROM llama2\nSYSTEM """\n${bio.response.replace(/(\r\n|\n|\r)/gm, " ").replace('would', 'should')} All answers to questions should be related back to what you are most known for.\n"""`;
+  const thecontents = `FROM llama3\nSYSTEM """\n${bio.response.replace(/(\r\n|\n|\r)/gm, " ").replace('would', 'should')} All answers to questions should be related back to what you are most known for.\n"""`;

  fs.writeFile(path.join(directory, 'Modelfile'), thecontents, (err: any) => {
    if (err) throw err;
@@ -23,4 +23,4 @@ async function characterGenerator() {
  });
 }

-characterGenerator();
+characterGenerator();
--- a/examples/typescript-simplechat/client.ts
+++ b/examples/typescript-simplechat/client.ts
@@ -1,6 +1,6 @@
 import * as readline from "readline";

-const model = "llama2";
+const model = "llama3";
 type Message = {
  role: "assistant" | "user" | "system";
  content: string;
@@ -74,4 +74,4 @@ async function main() {

 }

-main();
+main();
--- a/format/bytes.go
+++ b/format/bytes.go
@@ -53,6 +53,8 @@ func HumanBytes(b int64) string {

 func HumanBytes2(b uint64) string {
 	switch {
+	case b >= GibiByte:
+		return fmt.Sprintf("%.1f GiB", float64(b)/GibiByte)
 	case b >= MebiByte:
 		return fmt.Sprintf("%.1f MiB", float64(b)/MebiByte)
 	case b >= KibiByte:
--- a/format/format.go
+++ b/format/format.go
@@ -13,12 +13,20 @@ const (

 func HumanNumber(b uint64) string {
 	switch {
-	case b > Billion:
-		return fmt.Sprintf("%.0fB", math.Round(float64(b)/Billion))
-	case b > Million:
-		return fmt.Sprintf("%.0fM", math.Round(float64(b)/Million))
-	case b > Thousand:
-		return fmt.Sprintf("%.0fK", math.Round(float64(b)/Thousand))
+	case b >= Billion:
+		number := float64(b) / Billion
+		if number == math.Floor(number) {
+			return fmt.Sprintf("%.0fB", number) // no decimals if whole number
+		}
+		return fmt.Sprintf("%.1fB", number) // one decimal if not a whole number
+	case b >= Million:
+		number := float64(b) / Million
+		if number == math.Floor(number) {
+			return fmt.Sprintf("%.0fM", number) // no decimals if whole number
+		}
+		return fmt.Sprintf("%.2fM", number) // two decimals if not a whole number
+	case b >= Thousand:
+		return fmt.Sprintf("%.0fK", float64(b)/Thousand)
 	default:
 		return fmt.Sprintf("%d", b)
 	}
--- a/format/format_test.go
+++ b/format/format_test.go
@@ -0,0 +1,34 @@
+package format
+
+import (
+	"testing"
+)
+
+func TestHumanNumber(t *testing.T) {
+
+	type testCase struct {
+		input    uint64
+		expected string
+	}
+
+	testCases := []testCase{
+		{0, "0"},
+		{1000000, "1M"},
+		{125000000, "125M"},
+		{500500000, "500.50M"},
+		{500550000, "500.55M"},
+		{1000000000, "1B"},
+		{2800000000, "2.8B"},
+		{2850000000, "2.9B"},
+		{1000000000000, "1000B"},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.expected, func(t *testing.T) {
+			result := HumanNumber(tc.input)
+			if result != tc.expected {
+				t.Errorf("Expected %s, got %s", tc.expected, result)
+			}
+		})
+	}
+}
--- a/format/time.go
+++ b/format/time.go
@@ -60,7 +60,9 @@ func humanTime(t time.Time, zeroValue string) string {
 	}

 	delta := time.Since(t)
-	if delta < 0 {
+	if int(delta.Hours())/24/365 < -20 {
+		return "Forever"
+	} else if delta < 0 {
 		return humanDuration(-delta) + " from now"
 	}

--- a/format/time_test.go
+++ b/format/time_test.go
@@ -32,4 +32,14 @@ func TestHumanTime(t *testing.T) {
 		v := now.Add(800 * time.Millisecond)
 		assertEqual(t, HumanTime(v, ""), "Less than a second from now")
 	})
+
+	t.Run("time way in the future", func(t *testing.T) {
+		v := now.Add(24 * time.Hour * 365 * 200)
+		assertEqual(t, HumanTime(v, ""), "Forever")
+	})
+
+	t.Run("time way in the future lowercase", func(t *testing.T) {
+		v := now.Add(24 * time.Hour * 365 * 200)
+		assertEqual(t, HumanTimeLower(v, ""), "forever")
+	})
 }
--- a/go.mod
+++ b/go.mod
@@ -1,36 +1,38 @@
 module github.com/ollama/ollama

-go 1.22
-
-toolchain go1.22.0
+go 1.22.0

 require (
 	github.com/containerd/console v1.0.3
 	github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1
 	github.com/emirpasic/gods v1.18.1
-	github.com/gin-gonic/gin v1.9.1
-	github.com/golang/protobuf v1.5.0 // indirect
-	github.com/google/uuid v1.0.0
+	github.com/gin-gonic/gin v1.10.0
+	github.com/golang/protobuf v1.5.4 // indirect
+	github.com/google/uuid v1.1.2
 	github.com/mitchellh/mapstructure v1.5.0
 	github.com/olekukonko/tablewriter v0.0.5
 	github.com/spf13/cobra v1.7.0
-	github.com/stretchr/testify v1.8.4
+	github.com/stretchr/testify v1.9.0
 	github.com/x448/float16 v0.8.4
 	golang.org/x/sync v0.3.0
 )

 require (
 	github.com/nlpodyssey/gopickle v0.3.0
-	github.com/pdevine/tensor v0.0.0-20240228013915-64ccaa8d9ca9
+	github.com/pdevine/tensor v0.0.0-20240510204454-f88f4562727c
 )

 require (
-	github.com/apache/arrow/go/arrow v0.0.0-20201229220542-30ce2eb5d4dc // indirect
+	github.com/apache/arrow/go/arrow v0.0.0-20211112161151-bc219186db40 // indirect
+	github.com/bytedance/sonic/loader v0.1.1 // indirect
 	github.com/chewxy/hm v1.0.0 // indirect
-	github.com/chewxy/math32 v1.0.8 // indirect
+	github.com/chewxy/math32 v1.10.1 // indirect
+	github.com/cloudwego/base64x v0.1.4 // indirect
+	github.com/cloudwego/iasm v0.2.0 // indirect
 	github.com/davecgh/go-spew v1.1.1 // indirect
 	github.com/gogo/protobuf v1.3.2 // indirect
-	github.com/google/flatbuffers v1.12.0 // indirect
+	github.com/google/flatbuffers v24.3.25+incompatible // indirect
+	github.com/kr/text v0.2.0 // indirect
 	github.com/mattn/go-runewidth v0.0.14 // indirect
 	github.com/pkg/errors v0.9.1 // indirect
 	github.com/pmezard/go-difflib v1.0.0 // indirect
@@ -38,40 +40,38 @@ require (
 	github.com/xtgo/set v1.0.0 // indirect
 	go4.org/unsafe/assume-no-moving-gc v0.0.0-20231121144256-b99613f794b6 // indirect
 	golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect
-	gonum.org/v1/gonum v0.8.2 // indirect
+	gonum.org/v1/gonum v0.15.0 // indirect
 	gorgonia.org/vecf32 v0.9.0 // indirect
 	gorgonia.org/vecf64 v0.9.0 // indirect
 )

 require (
-	github.com/bytedance/sonic v1.9.1 // indirect
-	github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311 // indirect
-	github.com/gabriel-vasile/mimetype v1.4.2 // indirect
-	github.com/gin-contrib/cors v1.4.0
+	github.com/bytedance/sonic v1.11.6 // indirect
+	github.com/gabriel-vasile/mimetype v1.4.3 // indirect
+	github.com/gin-contrib/cors v1.7.2
 	github.com/gin-contrib/sse v0.1.0 // indirect
 	github.com/go-playground/locales v0.14.1 // indirect
 	github.com/go-playground/universal-translator v0.18.1 // indirect
-	github.com/go-playground/validator/v10 v10.14.0 // indirect
+	github.com/go-playground/validator/v10 v10.20.0 // indirect
 	github.com/goccy/go-json v0.10.2 // indirect
-	github.com/google/go-cmp v0.5.9 // indirect
 	github.com/inconshreveable/mousetrap v1.1.0 // indirect
 	github.com/json-iterator/go v1.1.12 // indirect
-	github.com/klauspost/cpuid/v2 v2.2.4 // indirect
-	github.com/leodido/go-urn v1.2.4 // indirect
-	github.com/mattn/go-isatty v0.0.19 // indirect
+	github.com/klauspost/cpuid/v2 v2.2.7 // indirect
+	github.com/leodido/go-urn v1.4.0 // indirect
+	github.com/mattn/go-isatty v0.0.20 // indirect
 	github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
 	github.com/modern-go/reflect2 v1.0.2 // indirect
-	github.com/pelletier/go-toml/v2 v2.0.8 // indirect
+	github.com/pelletier/go-toml/v2 v2.2.2 // indirect
 	github.com/spf13/pflag v1.0.5 // indirect
 	github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
-	github.com/ugorji/go/codec v1.2.11 // indirect
-	golang.org/x/arch v0.3.0 // indirect
-	golang.org/x/crypto v0.14.0
-	golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63
-	golang.org/x/net v0.17.0 // indirect
-	golang.org/x/sys v0.13.0
-	golang.org/x/term v0.13.0
-	golang.org/x/text v0.14.0 // indirect
-	google.golang.org/protobuf v1.30.0
+	github.com/ugorji/go/codec v1.2.12 // indirect
+	golang.org/x/arch v0.8.0 // indirect
+	golang.org/x/crypto v0.23.0
+	golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa
+	golang.org/x/net v0.25.0 // indirect
+	golang.org/x/sys v0.20.0
+	golang.org/x/term v0.20.0
+	golang.org/x/text v0.15.0 // indirect
+	google.golang.org/protobuf v1.34.1
 	gopkg.in/yaml.v3 v3.0.1 // indirect
 )
--- a/go.sum
+++ b/go.sum
@@ -1,22 +1,32 @@
 cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
+cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
+dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU=
+gioui.org v0.0.0-20210308172011-57750fc8a0a6/go.mod h1:RSH6KIUZ0p2xy5zHDxgAM4zumjgTw83q2ge/PI+yyw8=
 github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
+github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo=
 github.com/ajstarks/svgo v0.0.0-20180226025133-644b8db467af/go.mod h1:K08gAheRH3/J6wwsYMMT4xOr94bZjxIelGM0+d/wbFw=
-github.com/apache/arrow/go/arrow v0.0.0-20201229220542-30ce2eb5d4dc h1:zvQ6w7KwtQWgMQiewOF9tFtundRMVZFSAksNV6ogzuY=
-github.com/apache/arrow/go/arrow v0.0.0-20201229220542-30ce2eb5d4dc/go.mod h1:c9sxoIT3YgLxH4UhLOCKaBlEojuMhVYpk4Ntv3opUTQ=
-github.com/bytedance/sonic v1.5.0/go.mod h1:ED5hyg4y6t3/9Ku1R6dU/4KyJ48DZ4jPhfY1O2AihPM=
-github.com/bytedance/sonic v1.9.1 h1:6iJ6NqdoxCDr6mbY8h18oSO+cShGSMRGCEo7F2h0x8s=
-github.com/bytedance/sonic v1.9.1/go.mod h1:i736AoUSYt75HyZLoJW9ERYxcy6eaN6h4BZXU064P/U=
+github.com/antihax/optional v1.0.0/go.mod h1:uupD/76wgC+ih3iEmQUL+0Ugr19nfwCT1kdvxnR2qWY=
+github.com/apache/arrow/go/arrow v0.0.0-20211112161151-bc219186db40 h1:q4dksr6ICHXqG5hm0ZW5IHyeEJXoIJSOZeBLmWPNeIQ=
+github.com/apache/arrow/go/arrow v0.0.0-20211112161151-bc219186db40/go.mod h1:Q7yQnSMnLvcXlZ8RV+jwz/6y1rQTqbX6C82SndT52Zs=
+github.com/boombuler/barcode v1.0.0/go.mod h1:paBWMcWSl3LHKBqUq+rly7CNSldXjb2rDl3JlRe0mD8=
+github.com/bytedance/sonic v1.11.6 h1:oUp34TzMlL+OY1OUWxHqsdkgC/Zfc85zGqw9siXjrc0=
+github.com/bytedance/sonic v1.11.6/go.mod h1:LysEHSvpvDySVdC2f87zGWf6CIKJcAvqab1ZaiQtds4=
+github.com/bytedance/sonic/loader v0.1.1 h1:c+e5Pt1k/cy5wMveRDyk2X4B9hF4g7an8N3zCYjJFNM=
+github.com/bytedance/sonic/loader v0.1.1/go.mod h1:ncP89zfokxS5LZrJxl5z0UJcsk4M4yY2JpfqGeCtNLU=
 github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
-github.com/chenzhuoyu/base64x v0.0.0-20211019084208-fb5309c8db06/go.mod h1:DH46F32mSOjUmXrMHnKwZdA8wcEefY7UVqBKYGjpdQY=
-github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311 h1:qSGYFH7+jGhDF8vLC+iwCD4WpbV1EBDSzWkJODFLams=
-github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311/go.mod h1:b583jCggY9gE99b6G5LEC39OIiVsWj+R97kbl5odCEk=
 github.com/chewxy/hm v1.0.0 h1:zy/TSv3LV2nD3dwUEQL2VhXeoXbb9QkpmdRAVUFiA6k=
 github.com/chewxy/hm v1.0.0/go.mod h1:qg9YI4q6Fkj/whwHR1D+bOGeF7SniIP40VweVepLjg0=
 github.com/chewxy/math32 v1.0.0/go.mod h1:Miac6hA1ohdDUTagnvJy/q+aNnEk16qWUdb8ZVhvCN0=
-github.com/chewxy/math32 v1.0.8 h1:fU5E4Ec4Z+5RtRAi3TovSxUjQPkgRh+HbP7tKB2OFbM=
-github.com/chewxy/math32 v1.0.8/go.mod h1:dOB2rcuFrCn6UHrze36WSLVPKtzPMRAQvBvUwkSsLqs=
+github.com/chewxy/math32 v1.10.1 h1:LFpeY0SLJXeaiej/eIp2L40VYfscTvKh/FSEZ68uMkU=
+github.com/chewxy/math32 v1.10.1/go.mod h1:dOB2rcuFrCn6UHrze36WSLVPKtzPMRAQvBvUwkSsLqs=
 github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
+github.com/cloudwego/base64x v0.1.4 h1:jwCgWpFanWmN8xoIUHa2rtzmkd5J2plF/dnLS6Xd/0Y=
+github.com/cloudwego/base64x v0.1.4/go.mod h1:0zlkT4Wn5C6NdauXdJRhSKRlJvmclQ1hhJgA0rcu/8w=
+github.com/cloudwego/iasm v0.2.0 h1:1KNIy1I1H9hNNFEEH3DVnI4UujN+1zjpuk6gwHLTssg=
+github.com/cloudwego/iasm v0.2.0/go.mod h1:8rXZaNYT2n95jn+zTI1sDr+IgcD2GVs0nlbbQPiEFhY=
 github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc=
+github.com/cncf/udpa/go v0.0.0-20201120205902-5459f2c99403/go.mod h1:WmhPx2Nbnhtbo57+VJT5O0JRkEi1Wbu0z5j0R8u5Hbk=
+github.com/cncf/xds/go v0.0.0-20210312221358-fbca930ec8ed/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs=
 github.com/containerd/console v1.0.3 h1:lIr7SlA5PxZyMV30bDW0MGbiOPXwc63yRuCP0ARubLw=
 github.com/containerd/console v1.0.3/go.mod h1:7LqA/THxQ86k76b8c/EMSiaJ3h1eZkMkXar0TQ1gf3U=
 github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
@@ -31,30 +41,35 @@ github.com/emirpasic/gods v1.18.1/go.mod h1:8tpGGwCnJ5H4r6BWwaV6OrWmMoPhUl5jm/FM
 github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
 github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
 github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98=
+github.com/envoyproxy/go-control-plane v0.9.9-0.20201210154907-fd9021fe5dad/go.mod h1:cXg6YxExXjJnVBQHBLXeUAgxn2UodCpnH306RInaBQk=
+github.com/envoyproxy/go-control-plane v0.9.9-0.20210217033140-668b12f5399d/go.mod h1:cXg6YxExXjJnVBQHBLXeUAgxn2UodCpnH306RInaBQk=
+github.com/envoyproxy/go-control-plane v0.9.9-0.20210512163311-63b5d3c536b0/go.mod h1:hliV/p42l8fGbc6Y9bQ70uLwIvmJyVE5k4iMKlh8wCQ=
 github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c=
 github.com/fogleman/gg v1.2.1-0.20190220221249-0403632d5b90/go.mod h1:R/bRT+9gY/C5z7JzPU0zXsXHKM4/ayA+zqcVNZzPa1k=
-github.com/gabriel-vasile/mimetype v1.4.2 h1:w5qFW6JKBz9Y393Y4q372O9A7cUSequkh1Q7OhCmWKU=
-github.com/gabriel-vasile/mimetype v1.4.2/go.mod h1:zApsH/mKG4w07erKIaJPFiX0Tsq9BFQgN3qGY5GnNgA=
-github.com/gin-contrib/cors v1.4.0 h1:oJ6gwtUl3lqV0WEIwM/LxPF1QZ5qe2lGWdY2+bz7y0g=
-github.com/gin-contrib/cors v1.4.0/go.mod h1:bs9pNM0x/UsmHPBWT2xZz9ROh8xYjYkiURUfmBoMlcs=
+github.com/fogleman/gg v1.3.0/go.mod h1:R/bRT+9gY/C5z7JzPU0zXsXHKM4/ayA+zqcVNZzPa1k=
+github.com/gabriel-vasile/mimetype v1.4.3 h1:in2uUcidCuFcDKtdcBxlR0rJ1+fsokWf+uqxgUFjbI0=
+github.com/gabriel-vasile/mimetype v1.4.3/go.mod h1:d8uq/6HKRL6CGdk+aubisF/M5GcPfT7nKyLpA0lbSSk=
+github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04=
+github.com/gin-contrib/cors v1.7.2 h1:oLDHxdg8W/XDoN/8zamqk/Drgt4oVZDvaV0YmvVICQw=
+github.com/gin-contrib/cors v1.7.2/go.mod h1:SUJVARKgQ40dmrzgXEVxj2m7Ig1v1qIboQkPDTQ9t2E=
 github.com/gin-contrib/sse v0.1.0 h1:Y/yl/+YNO8GZSjAhjMsSuLt29uWRFHdHYUb5lYOV9qE=
 github.com/gin-contrib/sse v0.1.0/go.mod h1:RHrZQHXnP2xjPF+u1gW/2HnVO7nvIa9PG3Gm+fLHvGI=
-github.com/gin-gonic/gin v1.8.1/go.mod h1:ji8BvRH1azfM+SYow9zQ6SZMvR8qOMZHmsCuWR9tTTk=
-github.com/gin-gonic/gin v1.9.1 h1:4idEAncQnU5cB7BeOkPtxjfCSye0AAm1R0RVIqJ+Jmg=
-github.com/gin-gonic/gin v1.9.1/go.mod h1:hPrL7YrpYKXt5YId3A/Tnip5kqbEAP+KLuI3SUcPTeU=
-github.com/go-playground/assert/v2 v2.0.1/go.mod h1:VDjEfimB/XKnb+ZQfWdccd7VUvScMdVu0Titje2rxJ4=
+github.com/gin-gonic/gin v1.10.0 h1:nTuyha1TYqgedzytsKYqna+DfLos46nTv2ygFy86HFU=
+github.com/gin-gonic/gin v1.10.0/go.mod h1:4PMNQiOhvDRa013RKVbsiNwoyezlm2rm0uX/T7kzp5Y=
+github.com/go-fonts/dejavu v0.1.0/go.mod h1:4Wt4I4OU2Nq9asgDCteaAaWZOV24E+0/Pwo0gppep4g=
+github.com/go-fonts/latin-modern v0.2.0/go.mod h1:rQVLdDMK+mK1xscDwsqM5J8U2jrRa3T0ecnM9pNujks=
+github.com/go-fonts/liberation v0.1.1/go.mod h1:K6qoJYypsmfVjWg8KOVDQhLc8UDgIK2HYqyqAO9z7GY=
+github.com/go-fonts/stix v0.1.0/go.mod h1:w/c1f0ldAUlJmLBvlbkvVXLAD+tAMqobIIQpmnUIzUY=
+github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9AVAgeJqvqgH9Q5CA+iKCZ2gyEVpxRU=
+github.com/go-latex/latex v0.0.0-20210118124228-b3d85cf34e07/go.mod h1:CO1AlKB2CSIqUrmQPqA0gdRIlnLEY0gK5JGjh37zN5U=
 github.com/go-playground/assert/v2 v2.2.0 h1:JvknZsQTYeFEAhQwI4qEt9cyV5ONwRHC+lYKSsYSR8s=
 github.com/go-playground/assert/v2 v2.2.0/go.mod h1:VDjEfimB/XKnb+ZQfWdccd7VUvScMdVu0Titje2rxJ4=
-github.com/go-playground/locales v0.14.0/go.mod h1:sawfccIbzZTqEDETgFXqTho0QybSa7l++s0DH+LDiLs=
 github.com/go-playground/locales v0.14.1 h1:EWaQ/wswjilfKLTECiXz7Rh+3BjFhfDFKv/oXslEjJA=
 github.com/go-playground/locales v0.14.1/go.mod h1:hxrqLVvrK65+Rwrd5Fc6F2O76J/NuW9t0sjnWqG1slY=
-github.com/go-playground/universal-translator v0.18.0/go.mod h1:UvRDBj+xPUEGrFYl+lu/H90nyDXpg0fqeB/AQUGNTVA=
 github.com/go-playground/universal-translator v0.18.1 h1:Bcnm0ZwsGyWbCzImXv+pAJnYK9S473LQFuzCbDbfSFY=
 github.com/go-playground/universal-translator v0.18.1/go.mod h1:xekY+UJKNuX9WP91TpwSH2VMlDf28Uj24BCp08ZFTUY=
-github.com/go-playground/validator/v10 v10.10.0/go.mod h1:74x4gJWsvQexRdW8Pn3dXSGrTK4nAUsbPlLADvpJkos=
-github.com/go-playground/validator/v10 v10.14.0 h1:vgvQWe3XCz3gIeFDm/HnTIbj6UGmg/+t63MyGU2n5js=
-github.com/go-playground/validator/v10 v10.14.0/go.mod h1:9iXMNT7sEkjXb0I+enO7QXmzG6QCsPWY4zveKFVRSyU=
-github.com/goccy/go-json v0.9.7/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I=
+github.com/go-playground/validator/v10 v10.20.0 h1:K9ISHbSaI0lyB2eWMPJo+kOS/FBExVwjEviJTixqxL8=
+github.com/go-playground/validator/v10 v10.20.0/go.mod h1:dbuPbCMFw/DrkbEynArYaCwl3amGuJotoKCe95atGMM=
 github.com/goccy/go-json v0.10.2 h1:CrxCmQqYDkv1z7lO7Wbh2HN93uovUHgrECaO5ZrCXAU=
 github.com/goccy/go-json v0.10.2/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I=
 github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q=
@@ -72,46 +87,51 @@ github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:W
 github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0=
 github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QDs8UjoX8=
 github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI=
-github.com/golang/protobuf v1.5.0 h1:LUVKkCeviFUMKqHa4tXIIij/lbhnMbP7Fn5wKdKkRh4=
+github.com/golang/protobuf v1.4.3/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI=
 github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk=
-github.com/google/flatbuffers v1.11.0/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8=
-github.com/google/flatbuffers v1.12.0 h1:/PtAHvnBY4Kqnx/xCQ3OIV9uYcSFGScBsWI3Oogeh6w=
-github.com/google/flatbuffers v1.12.0/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8=
+github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY=
+github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek=
+github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps=
+github.com/golang/snappy v0.0.3 h1:fHPg5GQYlCeLIPB9BZqMVR5nR9A+IM5zcgeTdjMYmLA=
+github.com/golang/snappy v0.0.3/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
+github.com/google/flatbuffers v2.0.0+incompatible/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8=
+github.com/google/flatbuffers v24.3.25+incompatible h1:CX395cjN9Kke9mmalRoL3d81AtFUxJM+yDthflgJGkI=
+github.com/google/flatbuffers v24.3.25+incompatible/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8=
 github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M=
 github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
 github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
 github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
 github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
 github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
-github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38=
-github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
+github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
+github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
+github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
 github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
-github.com/google/uuid v1.0.0 h1:b4Gk+7WdP/d3HZH8EJsZpvV7EtDOgaZLtnaNGIu1adA=
-github.com/google/uuid v1.0.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
+github.com/google/uuid v1.1.2 h1:EVhdT+1Kseyi1/pUmXKaFxYsDNy9RQYkMWRH68J/W7Y=
+github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
+github.com/grpc-ecosystem/grpc-gateway v1.16.0/go.mod h1:BDjrQk3hbvj6Nolgz8mAMFbcEtjT1g+wF4CSlocrBnw=
 github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
 github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
 github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
 github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
+github.com/jung-kurt/gofpdf v1.0.0/go.mod h1:7Id9E/uU8ce6rXgefFLlgrJj/GYY22cpxn+r32jIOes=
 github.com/jung-kurt/gofpdf v1.0.3-0.20190309125859-24315acbbda5/go.mod h1:7Id9E/uU8ce6rXgefFLlgrJj/GYY22cpxn+r32jIOes=
 github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
 github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
+github.com/klauspost/compress v1.13.1 h1:wXr2uRxZTJXHLly6qhJabee5JqIhTRoLBhDOA74hDEQ=
+github.com/klauspost/compress v1.13.1/go.mod h1:8dP1Hq4DHOhN9w426knH3Rhby4rFm6D8eO+e+Dq5Gzg=
 github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
-github.com/klauspost/cpuid/v2 v2.2.4 h1:acbojRNwl3o09bUq+yDCtZFc1aiwaAAxtcn8YkZXnvk=
-github.com/klauspost/cpuid/v2 v2.2.4/go.mod h1:RVVoqg1df56z8g3pUjL/3lE5UfnlrJX8tyFgg4nqhuY=
-github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
-github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
+github.com/klauspost/cpuid/v2 v2.2.7 h1:ZWSB3igEs+d0qvnxR/ZBzXVmxkgt8DdzP6m9pfuVLDM=
+github.com/klauspost/cpuid/v2 v2.2.7/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws=
+github.com/knz/go-libedit v1.10.1/go.mod h1:MZTVkCWyz0oBc7JOWP3wNAzd002ZbM/5hgShxwh4x8M=
 github.com/kr/pretty v0.3.0 h1:WgNl7dwNpEZ6jJ9k1snq4pZsg7DOEN8hP9Xw0Tsjwk0=
 github.com/kr/pretty v0.3.0/go.mod h1:640gp4NfQd8pI5XOwp5fnNeVWj67G7CFk/SaSQn7NBk=
-github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
-github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
 github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
 github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
-github.com/leodido/go-urn v1.2.1/go.mod h1:zt4jvISO2HfUBqxjfIshjdMTYS56ZS/qv49ictyFfxY=
-github.com/leodido/go-urn v1.2.4 h1:XlAE/cm/ms7TE/VMVoduSpNBoyc2dOxHs5MZSwAN63Q=
-github.com/leodido/go-urn v1.2.4/go.mod h1:7ZrI8mTSeBSHl/UaRyKQW1qZeMgak41ANeCNaVckg+4=
-github.com/mattn/go-isatty v0.0.14/go.mod h1:7GGIvUiUoEMVVmxf/4nioHXj79iQHKdU27kJ6hsGG94=
-github.com/mattn/go-isatty v0.0.19 h1:JITubQf0MOLdlGRuRq+jtsDlekdYPia9ZFsB8h/APPA=
-github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
+github.com/leodido/go-urn v1.4.0 h1:WT9HwE9SGECu3lg4d/dIA+jxlljEa1/ffXKmRjqdmIQ=
+github.com/leodido/go-urn v1.4.0/go.mod h1:bvxc+MVxLKB4z00jd1z+Dvzr47oO32F/QSNjSBOlFxI=
+github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
+github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
 github.com/mattn/go-runewidth v0.0.9/go.mod h1:H031xJmbD/WCDINGzjvQ9THkh0rPKHF+m2gUSrubnMI=
 github.com/mattn/go-runewidth v0.0.14 h1:+xnbZSEeDbOIg5/mE6JF0w6n9duR1l3/WmbinWVwUuU=
 github.com/mattn/go-runewidth v0.0.14/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
@@ -126,12 +146,15 @@ github.com/nlpodyssey/gopickle v0.3.0 h1:BLUE5gxFLyyNOPzlXxt6GoHEMMxD0qhsE4p0CIQ
 github.com/nlpodyssey/gopickle v0.3.0/go.mod h1:f070HJ/yR+eLi5WmM1OXJEGaTpuJEUiib19olXgYha0=
 github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N7AbDhec=
 github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY=
-github.com/pdevine/tensor v0.0.0-20240228013915-64ccaa8d9ca9 h1:DV4iXjNn6fGeDl1AkZ1I0QB/0DBjrc7kPpxHrmuDzW4=
-github.com/pdevine/tensor v0.0.0-20240228013915-64ccaa8d9ca9/go.mod h1:nR7l3gM6ubiOm+mCkmmUyIBUcBAyiUmW6dQrDZhugFE=
-github.com/pelletier/go-toml/v2 v2.0.1/go.mod h1:r9LEWfGN8R5k0VXJ+0BkIe7MYkRdwZOjgMj2KwnJFUo=
-github.com/pelletier/go-toml/v2 v2.0.8 h1:0ctb6s9mE31h0/lhu+J6OPmVeDxJn+kYnJc2jZR9tGQ=
-github.com/pelletier/go-toml/v2 v2.0.8/go.mod h1:vuYfssBdrU2XDZ9bYydBu6t+6a6PYNcZljzZR9VXg+4=
-github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA=
+github.com/pdevine/tensor v0.0.0-20240510204454-f88f4562727c h1:GwiUUjKefgvSNmv3NCvI/BL0kDebW6Xa+kcdpdc1mTY=
+github.com/pdevine/tensor v0.0.0-20240510204454-f88f4562727c/go.mod h1:PSojXDXF7TbgQiD6kkd98IHOS0QqTyUEaWRiS8+BLu8=
+github.com/pelletier/go-toml/v2 v2.2.2 h1:aYUidT7k73Pcl9nb2gScu7NSrKCSHIDE89b3+6Wq+LM=
+github.com/pelletier/go-toml/v2 v2.2.2/go.mod h1:1t835xjRzz80PqgE6HHgN2JOsmgYu/h4qDAS4n929Rs=
+github.com/phpdave11/gofpdf v1.4.2/go.mod h1:zpO6xFn9yxo3YLyMvW8HcKWVdbNqgIfOOp2dXMnm1mY=
+github.com/phpdave11/gofpdi v1.0.12/go.mod h1:vBmVV0Do6hSBHC8uKUQ71JGW+ZGQq74llk/7bXwjDoI=
+github.com/pierrec/lz4/v4 v4.1.8 h1:ieHkV+i2BRzngO4Wd/3HGowuZStgq6QkPsD1eolNAO4=
+github.com/pierrec/lz4/v4 v4.1.8/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4=
+github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
 github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
 github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
@@ -139,10 +162,11 @@ github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZN
 github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
 github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY=
 github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
-github.com/rogpeppe/go-internal v1.6.1/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc=
+github.com/rogpeppe/fastuuid v1.2.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ=
 github.com/rogpeppe/go-internal v1.8.0 h1:FCbCCtXNOY3UtUuHUYaghJg4y7Fd14rXifAYUAtL9R8=
 github.com/rogpeppe/go-internal v1.8.0/go.mod h1:WmiCO8CzOY8rg0OYDC4/i/2WRWAB6poM+XZ2dLUbcbE=
 github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
+github.com/ruudk/golang-pdf417 v0.0.0-20181029194003-1af4ab5afa58/go.mod h1:6lfFZQK844Gfx8o5WFuvpxWRwnSoipWe/p622j1v06w=
 github.com/spf13/cobra v1.7.0 h1:hyqWnYt1ZQShIddO5kBpj3vu05/++x6tJ6dg8EC572I=
 github.com/spf13/cobra v1.7.0/go.mod h1:uLxZILRyS/50WlhOIKD7W6V5bgeIt+4sICxh6uRMrb0=
 github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
@@ -150,96 +174,119 @@ github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An
 github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
 github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
 github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
+github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA=
 github.com/stretchr/testify v1.1.4/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
-github.com/stretchr/testify v1.2.0/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
+github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
 github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
-github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
+github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA=
 github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
 github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
 github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
 github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
-github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
-github.com/stretchr/testify v1.8.3/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
-github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
 github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
+github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
+github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
 github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS4MhqMhdFk5YI=
 github.com/twitchyliquid64/golang-asm v0.15.1/go.mod h1:a1lVb/DtPvCB8fslRZhAngC2+aY1QWCk3Cedj/Gdt08=
-github.com/ugorji/go v1.2.7/go.mod h1:nF9osbDWLy6bDVv/Rtoh6QgnvNDpmCalQV5urGCCS6M=
-github.com/ugorji/go/codec v1.2.7/go.mod h1:WGN1fab3R1fzQlVQTkfxVtIBhWDRqOviHU95kRgeqEY=
-github.com/ugorji/go/codec v1.2.11 h1:BMaWp1Bb6fHwEtbplGBGJ498wD+LKlNSl25MjdZY4dU=
-github.com/ugorji/go/codec v1.2.11/go.mod h1:UNopzCgEMSXjBc6AOMqYvWC1ktqTAfzJZUZgYf6w6lg=
+github.com/ugorji/go/codec v1.2.12 h1:9LC83zGrHhuUA9l16C9AHXAqEV/2wBQ4nkvumAE65EE=
+github.com/ugorji/go/codec v1.2.12/go.mod h1:UNopzCgEMSXjBc6AOMqYvWC1ktqTAfzJZUZgYf6w6lg=
 github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM=
 github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg=
 github.com/xtgo/set v1.0.0 h1:6BCNBRv3ORNDQ7fyoJXRv+tstJz3m1JVFQErfeZz2pY=
 github.com/xtgo/set v1.0.0/go.mod h1:d3NHzGzSa0NmB2NhFyECA+QdRp29oEn2xbT+TpeFoM8=
 github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
 github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
+github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k=
+go.opentelemetry.io/proto/otlp v0.7.0/go.mod h1:PqfVotwruBrMGOCsRd/89rSnXhoiJIqeYNgFYFoEGnI=
 go4.org/unsafe/assume-no-moving-gc v0.0.0-20231121144256-b99613f794b6 h1:lGdhQUN/cnWdSH3291CUuxSEqc+AsGTiDxPP3r2J0l4=
 go4.org/unsafe/assume-no-moving-gc v0.0.0-20231121144256-b99613f794b6/go.mod h1:FftLjUGFEDu5k8lt0ddY+HcrH/qU/0qk+H8j9/nTl3E=
 golang.org/x/arch v0.0.0-20210923205945-b76863e36670/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8=
-golang.org/x/arch v0.3.0 h1:02VY4/ZcO/gBOH6PUaoiptASxtXU10jazRCP865E97k=
-golang.org/x/arch v0.3.0/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8=
+golang.org/x/arch v0.8.0 h1:3wRIsP3pM4yUptoR96otTUOXI367OS0+c9eeRi9doIc=
+golang.org/x/arch v0.8.0/go.mod h1:FEVrYAQjsQXMVJ1nsMoVVXPZg6p2JE2mx8psSWTDQys=
 golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
+golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
 golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
 golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
-golang.org/x/crypto v0.0.0-20210711020723-a769d52b0f97/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
-golang.org/x/crypto v0.14.0 h1:wBqGXzWJW6m1XrIKlAH0Hs1JJ7+9KBwnIO8v66Q9cHc=
-golang.org/x/crypto v0.14.0/go.mod h1:MVFd36DqK4CsrnJYDkBA3VC4m2GkXAM0PvzMCn4JQf4=
+golang.org/x/crypto v0.23.0 h1:dIJU/v2J8Mdglj/8rJ6UUOM3Zc9zLZxVZwwxMooUSAI=
+golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8=
 golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
 golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
 golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
 golang.org/x/exp v0.0.0-20190125153040-c74c464bbbf2/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
-golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63 h1:m64FZMko/V45gv0bNmrNYoDEq8U5YUhetc9cBWKS1TQ=
-golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63/go.mod h1:0v4NqG35kSWCMzLaMeX+IQrlSnVE/bqGSyC2cz/9Le8=
+golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
+golang.org/x/exp v0.0.0-20191002040644-a1355ae1e2c3/go.mod h1:NOZ3BPKG0ec/BKJQgnvsSFpcKLM5xXVWnvZS97DWHgE=
+golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa h1:FRnLl4eNAQl8hwxVVC17teOw8kdjVDVAiFMtgUdTSRQ=
+golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa/go.mod h1:zk2irFbV9DP96SEBUUAy67IdHUaZuSnrz1n472HUCLE=
 golang.org/x/image v0.0.0-20180708004352-c73c2afc3b81/go.mod h1:ux5Hcp/YLpHSI86hEcLt0YII63i6oz57MZXIpbrjZUs=
+golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js=
+golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=
+golang.org/x/image v0.0.0-20190910094157-69e4b8554b2a/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=
+golang.org/x/image v0.0.0-20200119044424-58c23975cae1/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=
+golang.org/x/image v0.0.0-20200430140353-33d19683fad8/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=
+golang.org/x/image v0.0.0-20200618115811-c13761719519/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=
+golang.org/x/image v0.0.0-20201208152932-35266b937fa6/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=
+golang.org/x/image v0.0.0-20210216034530-4410531fe030/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=
 golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE=
 golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU=
 golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc=
+golang.org/x/lint v0.0.0-20210508222113-6edffad5e616/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY=
+golang.org/x/mobile v0.0.0-20190719004257-d2bd2a29d028/go.mod h1:E/iHnbuqvinMTCcRqshq8CkpyQDoeVncDDYHnLhea+o=
+golang.org/x/mod v0.1.0/go.mod h1:0QHyrYULN0/3qlju5TqG8bIK38QM8yzMo5ekMj3DlcY=
+golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg=
 golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
 golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
+golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
 golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
 golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
+golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
 golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
 golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
 golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
 golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
 golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
-golang.org/x/net v0.0.0-20200904194848-62affa334b73/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA=
+golang.org/x/net v0.0.0-20200822124328-c89045814202/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA=
 golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
-golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
-golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM=
-golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE=
+golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96bSt6lcn1PtDYWL6XObtHCRCNQM=
+golang.org/x/net v0.0.0-20210614182718-04defd469f4e/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
+golang.org/x/net v0.25.0 h1:d/OCCoBEUq33pjydKrGQhw7IlUPI2Oylr+8qLx49kac=
+golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
 golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
+golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
 golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.3.0 h1:ftCYgMx6zT/asHUrPw8BLLscYtGznsLAnjq5RH9P66E=
 golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y=
 golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
-golang.org/x/sys v0.0.0-20200909081042-eff7692f9009/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
-golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20210304124612-50617c2ba197/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20210330210617-4fbd30eecc44/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20210510120138-977fb7262007/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.0.0-20210806184541-e5e7981a1069/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.0.0-20220704084225-05e143d24a9e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.13.0 h1:Af8nKPmuFypiUBjVoU9V20FiaFXOcuZI21p0ycVYYGE=
-golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.20.0 h1:Od9JTbYCk261bKm4M/mw7AklTlFYIa0bIp9BgSm1S8Y=
+golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
 golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
-golang.org/x/term v0.13.0 h1:bb+I9cTfFazGW51MZqBVmZy7+JEJMouUHTUSKVQLBek=
-golang.org/x/term v0.13.0/go.mod h1:LTmsnFJwVN6bCy1rVCoS+qHT1HhALEFxKncY3WNNh4U=
+golang.org/x/term v0.20.0 h1:VnkxpohqXaOBYJtBmEppKUG6mXpi+4O6purfc2+sMhw=
+golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY=
 golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
 golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
+golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
 golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
-golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ=
-golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
+golang.org/x/text v0.15.0 h1:h1V/4gjBv8v9cjcR6+AR5+/cIYK5N/WAgiv4xlsEtAk=
+golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
 golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
@@ -247,34 +294,40 @@ golang.org/x/tools v0.0.0-20190206041539-40960b6deb8e/go.mod h1:n7NCudcB/nEzxVGm
 golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY=
 golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
 golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q=
+golang.org/x/tools v0.0.0-20190927191325-030b2cf1153e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
 golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
+golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28=
 golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
 golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
+golang.org/x/tools v0.1.4/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk=
 golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
 golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
 golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
 golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE=
 golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
 gonum.org/v1/gonum v0.0.0-20180816165407-929014505bf4/go.mod h1:Y+Yx5eoAFn32cQvJDxZx5Dpnq+c3wtXuadVZAcxbbBo=
-gonum.org/v1/gonum v0.8.2 h1:CCXrcPKiGGotvnN6jfUsKk4rRqm7q09/YbKb5xCEvtM=
 gonum.org/v1/gonum v0.8.2/go.mod h1:oe/vMfY3deqTw+1EZJhuvEW2iwGF1bW9wwu7XCu0+v0=
-gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0 h1:OE9mWmgKkjJyEmDAAtGMPjXu+YNeGvK9VTSHY6+Qihc=
+gonum.org/v1/gonum v0.9.3/go.mod h1:TZumC3NeyVQskjXqmyWt4S3bINhy7B4eYwW69EbyX+0=
+gonum.org/v1/gonum v0.15.0 h1:2lYxjRbTYyxkJxlhC+LvJIx3SsANPdRybu1tGj9/OrQ=
+gonum.org/v1/gonum v0.15.0/go.mod h1:xzZVBJBtS+Mz4q0Yl2LJTk+OxOg4jiXZ7qBoM0uISGo=
 gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0/go.mod h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw=
 gonum.org/v1/plot v0.0.0-20190515093506-e2840ee46a6b/go.mod h1:Wt8AAjI+ypCyYX3nZBvf6cAIx93T+c/OS2HFAYskSZc=
+gonum.org/v1/plot v0.9.0/go.mod h1:3Pcqqmp6RHvJI72kgb8fThyUnav364FOsdDo2aGW5lY=
 google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM=
 google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
 google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc=
 google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc=
+google.golang.org/genproto v0.0.0-20200513103714-09dca8ec2884/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c=
 google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo=
-google.golang.org/genproto v0.0.0-20200911024640-645f7a48b24f h1:Yv4xsIx7HZOoyUGSJ2ksDyWE2qIBXROsZKt2ny3hCGM=
-google.golang.org/genproto v0.0.0-20200911024640-645f7a48b24f/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no=
+google.golang.org/genproto v0.0.0-20210630183607-d20f26d13c79/go.mod h1:yiaVoXHpRzHGyxV3o4DktVWY4mSUErTKaeEOq6C3t3U=
 google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c=
 google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg=
 google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY=
 google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk=
-google.golang.org/grpc v1.32.0 h1:zWTV+LMdc3kaiJMSTOFz2UgSBgx8RNQoTGiZu3fR9S0=
-google.golang.org/grpc v1.32.0/go.mod h1:N36X2cJ7JwdamYAgDz+s+rVMFjt3numwzf/HckM8pak=
-google.golang.org/grpc/cmd/protoc-gen-go-grpc v0.0.0-20200910201057-6591123024b3/go.mod h1:6Kw0yEErY5E/yWrBtf03jp27GLLJujG4z/JK95pnjjw=
+google.golang.org/grpc v1.33.1/go.mod h1:fr5YgcSWrqhRRxogOsw7RzIpsmvOZ6IcH4kBYTpR3n0=
+google.golang.org/grpc v1.36.0/go.mod h1:qjiiYl8FncCW8feJPdyg3v6XW24KsRHe+dy9BAGRRjU=
+google.golang.org/grpc v1.38.0/go.mod h1:NREThFqKR1f3iQ6oBuvc5LadQuXVGo9rkm5ZGrQdJfM=
+google.golang.org/grpc v1.39.0/go.mod h1:PImNr+rS9TWYb2O4/emRugxiyHZ5JyHW5F+RPnDzfrE=
 google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8=
 google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0=
 google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM=
@@ -283,20 +336,18 @@ google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzi
 google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
 google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
 google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
-google.golang.org/protobuf v1.24.0/go.mod h1:r/3tXBNzIEhYS9I1OUVjXDlt8tc493IdKGjtUeSXeh4=
 google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c=
 google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=
-google.golang.org/protobuf v1.28.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I=
-google.golang.org/protobuf v1.30.0 h1:kPPoIgf3TsEvrm0PFe15JQ+570QVxYzEvvHqChK+cng=
-google.golang.org/protobuf v1.30.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I=
+google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc=
+google.golang.org/protobuf v1.27.1/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc=
+google.golang.org/protobuf v1.34.1 h1:9ddQBjfCyZPOHPUiPxpYESBLc+T8P3E+Vo4IbKZgFWg=
+google.golang.org/protobuf v1.34.1/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos=
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
-gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
 gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
 gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
-gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI=
-gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=
+gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
+gopkg.in/yaml.v2 v2.2.3/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
 gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
-gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
 gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
 gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
 gorgonia.org/vecf32 v0.9.0 h1:PClazic1r+JVJ1dEzRXgeiVl4g1/Hf/w+wUSqnco1Xg=
@@ -305,4 +356,5 @@ gorgonia.org/vecf64 v0.9.0 h1:bgZDP5x0OzBF64PjMGC3EvTdOoMEcmfAh1VCUnZFm1A=
 gorgonia.org/vecf64 v0.9.0/go.mod h1:hp7IOWCnRiVQKON73kkC/AUMtEXyf9kGlVrtPQ9ccVA=
 honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
 honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
+nullprogram.com/x/optparse v1.0.0/go.mod h1:KdyPE+Igbe0jQUrVfMqDMeJQIJZEuyV7pjYmp6pbG50=
 rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4=
--- a/gpu/amd_common.go
+++ b/gpu/amd_common.go
@@ -81,8 +81,10 @@ func commonAMDValidateLibDir() (string, error) {
 	}

 	// Well known location(s)
-	if rocmLibUsable(RocmStandardLocation) {
-		return RocmStandardLocation, nil
+	for _, path := range RocmStandardLocations {
+		if rocmLibUsable(path) {
+			return path, nil
+		}
 	}

 	// Installer payload location if we're running the installed binary
--- a/gpu/amd_hip_windows.go
+++ b/gpu/amd_hip_windows.go
@@ -3,7 +3,6 @@ package gpu
 import (
 	"fmt"
 	"log/slog"
-	"strconv"
 	"syscall"
 	"unsafe"

@@ -74,16 +73,22 @@ func (hl *HipLib) Release() {
 	hl.dll = 0
 }

-func (hl *HipLib) AMDDriverVersion() (string, error) {
+func (hl *HipLib) AMDDriverVersion() (driverMajor, driverMinor int, err error) {
 	if hl.dll == 0 {
-		return "", fmt.Errorf("dll has been unloaded")
+		return 0, 0, fmt.Errorf("dll has been unloaded")
 	}
 	var version int
 	status, _, err := syscall.SyscallN(hl.hipDriverGetVersion, uintptr(unsafe.Pointer(&version)))
 	if status != hipSuccess {
-		return "", fmt.Errorf("failed call to hipDriverGetVersion: %d %s", status, err)
+		return 0, 0, fmt.Errorf("failed call to hipDriverGetVersion: %d %s", status, err)
 	}
-	return strconv.Itoa(version), nil
+
+	slog.Debug("hipDriverGetVersion", "version", version)
+	// TODO - this isn't actually right, but the docs claim hipDriverGetVersion isn't accurate anyway...
+	driverMajor = version / 1000
+	driverMinor = (version - (driverMajor * 1000)) / 10
+
+	return driverMajor, driverMinor, nil
 }

 func (hl *HipLib) HipGetDeviceCount() int {
--- a/gpu/amd_linux.go
+++ b/gpu/amd_linux.go
@@ -8,6 +8,7 @@ import (
 	"log/slog"
 	"os"
 	"path/filepath"
+	"regexp"
 	"slices"
 	"strconv"
 	"strings"
@@ -25,12 +26,12 @@ const (
 	// Prefix with the node dir
 	GPUTotalMemoryFileGlob = "mem_banks/*/properties" // size_in_bytes line
 	GPUUsedMemoryFileGlob  = "mem_banks/*/used_memory"
-	RocmStandardLocation   = "/opt/rocm/lib"
 )

 var (
 	// Used to validate if the given ROCm lib is usable
-	ROCmLibGlobs = []string{"libhipblas.so.2*", "rocblas"} // TODO - probably include more coverage of files here...
+	ROCmLibGlobs          = []string{"libhipblas.so.2*", "rocblas"} // TODO - probably include more coverage of files here...
+	RocmStandardLocations = []string{"/opt/rocm/lib", "/usr/lib64"}
 )

 // Gather GPU information from the amdgpu driver if any supported GPUs are detected
@@ -41,10 +42,8 @@ func AMDGetGPUInfo() []GpuInfo {
 	}

 	// Opportunistic logging of driver version to aid in troubleshooting
-	ver, err := AMDDriverVersion()
-	if err == nil {
-		slog.Info("AMD Driver: " + ver)
-	} else {
+	driverMajor, driverMinor, err := AMDDriverVersion()
+	if err != nil {
 		// TODO - if we see users crash and burn with the upstreamed kernel this can be adjusted to hard-fail rocm support and fallback to CPU
 		slog.Warn("ollama recommends running the https://www.amd.com/en/support/linux-drivers", "error", err)
 	}
@@ -91,6 +90,7 @@ func AMDGetGPUInfo() []GpuInfo {
 		scanner := bufio.NewScanner(fp)
 		isCPU := false
 		var major, minor, patch uint64
+		var vendor, device uint64
 		for scanner.Scan() {
 			line := strings.TrimSpace(scanner.Text())
 			// Note: we could also use "cpu_cores_count X" where X is greater than zero to detect CPUs
@@ -118,6 +118,26 @@ func AMDGetGPUInfo() []GpuInfo {
 					slog.Debug("malformed int " + line)
 					continue
 				}
+			} else if strings.HasPrefix(line, "vendor_id") {
+				ver := strings.Fields(line)
+				if len(ver) != 2 {
+					slog.Debug("malformed vendor_id", "vendor_id", line)
+					continue
+				}
+				vendor, err = strconv.ParseUint(ver[1], 10, 32)
+				if err != nil {
+					slog.Debug("malformed vendor_id" + line)
+				}
+			} else if strings.HasPrefix(line, "device_id") {
+				ver := strings.Fields(line)
+				if len(ver) != 2 {
+					slog.Debug("malformed device_id", "device_id", line)
+					continue
+				}
+				device, err = strconv.ParseUint(ver[1], 10, 32)
+				if err != nil {
+					slog.Debug("malformed device_id" + line)
+				}
 			}

 			// TODO - any other properties we want to extract and record?
@@ -140,7 +160,7 @@ func AMDGetGPUInfo() []GpuInfo {
 		}

 		if int(major) < RocmComputeMin {
-			slog.Warn(fmt.Sprintf("amdgpu too old gfx%d%d%x", major, minor, patch), "gpu", gpuID)
+			slog.Warn(fmt.Sprintf("amdgpu too old gfx%d%x%x", major, minor, patch), "gpu", gpuID)
 			continue
 		}

@@ -210,24 +230,29 @@ func AMDGetGPUInfo() []GpuInfo {

 		// iGPU detection, remove this check once we can support an iGPU variant of the rocm library
 		if totalMemory < IGPUMemLimit {
-			slog.Info("amdgpu appears to be an iGPU, skipping", "gpu", gpuID, "total", format.HumanBytes2(totalMemory))
+			slog.Info("unsupported Radeon iGPU detected skipping", "id", gpuID, "total", format.HumanBytes2(totalMemory))
 			continue
 		}
+		var name string
+		// TODO - PCI ID lookup
+		if vendor > 0 && device > 0 {
+			name = fmt.Sprintf("%04x:%04x", vendor, device)
+		}

-		slog.Info("amdgpu memory", "gpu", gpuID, "total", format.HumanBytes2(totalMemory))
-		slog.Info("amdgpu memory", "gpu", gpuID, "available", format.HumanBytes2(totalMemory-usedMemory))
+		slog.Debug("amdgpu memory", "gpu", gpuID, "total", format.HumanBytes2(totalMemory))
+		slog.Debug("amdgpu memory", "gpu", gpuID, "available", format.HumanBytes2(totalMemory-usedMemory))
 		gpuInfo := GpuInfo{
 			Library: "rocm",
 			memInfo: memInfo{
 				TotalMemory: totalMemory,
 				FreeMemory:  (totalMemory - usedMemory),
 			},
-			ID: fmt.Sprintf("%d", gpuID),
-			// Name: not exposed in sysfs directly, would require pci device id lookup
-			Major:         int(major),
-			Minor:         int(minor),
-			Patch:         int(patch),
+			ID:            fmt.Sprintf("%d", gpuID),
+			Name:          name,
+			Compute:       fmt.Sprintf("gfx%d%x%x", major, minor, patch),
 			MinimumMemory: rocmMinimumMemory,
+			DriverMajor:   driverMajor,
+			DriverMinor:   driverMinor,
 		}

 		// If the user wants to filter to a subset of devices, filter out if we aren't a match
@@ -266,7 +291,7 @@ func AMDGetGPUInfo() []GpuInfo {
 				}
 				slog.Debug("rocm supported GPUs", "types", supported)
 			}
-			gfx := fmt.Sprintf("gfx%d%d%x", gpuInfo.Major, gpuInfo.Minor, gpuInfo.Patch)
+			gfx := gpuInfo.Compute
 			if !slices.Contains[[]string, string](supported, gfx) {
 				slog.Warn("amdgpu is not supported", "gpu", gpuInfo.ID, "gpu_type", gfx, "library", libDir, "supported_types", supported)
 				// TODO - consider discrete markdown just for ROCM troubleshooting?
@@ -276,7 +301,7 @@ func AMDGetGPUInfo() []GpuInfo {
 				slog.Info("amdgpu is supported", "gpu", gpuInfo.ID, "gpu_type", gfx)
 			}
 		} else {
-			slog.Debug("skipping rocm gfx compatibility check with HSA_OVERRIDE_GFX_VERSION=" + gfxOverride)
+			slog.Info("skipping rocm gfx compatibility check", "HSA_OVERRIDE_GFX_VERSION", gfxOverride)
 		}

 		// The GPU has passed all the verification steps and is supported
@@ -322,19 +347,34 @@ func AMDValidateLibDir() (string, error) {
 	return "", fmt.Errorf("no suitable rocm found, falling back to CPU")
 }

-func AMDDriverVersion() (string, error) {
-	_, err := os.Stat(DriverVersionFile)
+func AMDDriverVersion() (driverMajor, driverMinor int, err error) {
+	_, err = os.Stat(DriverVersionFile)
 	if err != nil {
-		return "", fmt.Errorf("amdgpu version file missing: %s %w", DriverVersionFile, err)
+		return 0, 0, fmt.Errorf("amdgpu version file missing: %s %w", DriverVersionFile, err)
 	}
 	fp, err := os.Open(DriverVersionFile)
 	if err != nil {
-		return "", err
+		return 0, 0, err
 	}
 	defer fp.Close()
 	verString, err := io.ReadAll(fp)
 	if err != nil {
-		return "", err
+		return 0, 0, err
 	}
-	return strings.TrimSpace(string(verString)), nil
+
+	pattern := `\A(\d+)\.(\d+).*`
+	regex := regexp.MustCompile(pattern)
+	match := regex.FindStringSubmatch(string(verString))
+	if len(match) < 2 {
+		return 0, 0, fmt.Errorf("malformed version string %s", string(verString))
+	}
+	driverMajor, err = strconv.Atoi(match[1])
+	if err != nil {
+		return 0, 0, err
+	}
+	driverMinor, err = strconv.Atoi(match[2])
+	if err != nil {
+		return 0, 0, err
+	}
+	return driverMajor, driverMinor, nil
 }
--- a/gpu/amd_windows.go
+++ b/gpu/amd_windows.go
@@ -7,14 +7,12 @@ import (
 	"os"
 	"path/filepath"
 	"slices"
-	"strconv"
 	"strings"

 	"github.com/ollama/ollama/format"
 )

 const (
-	RocmStandardLocation = "C:\\Program Files\\AMD\\ROCm\\5.7\\bin" // TODO glob?

 	// TODO  We're lookinng for this exact name to detect iGPUs since hipGetDeviceProperties never reports integrated==true
 	iGPUName = "AMD Radeon(TM) Graphics"
@@ -22,7 +20,8 @@ const (

 var (
 	// Used to validate if the given ROCm lib is usable
-	ROCmLibGlobs = []string{"hipblas.dll", "rocblas"} // TODO - probably include more coverage of files here...
+	ROCmLibGlobs          = []string{"hipblas.dll", "rocblas"}                 // TODO - probably include more coverage of files here...
+	RocmStandardLocations = []string{"C:\\Program Files\\AMD\\ROCm\\5.7\\bin"} // TODO glob?
 )

 func AMDGetGPUInfo() []GpuInfo {
@@ -34,13 +33,12 @@ func AMDGetGPUInfo() []GpuInfo {
 	}
 	defer hl.Release()

-	ver, err := hl.AMDDriverVersion()
-	if err == nil {
-		slog.Info("AMD Driver: " + ver)
-	} else {
-		// For now this is benign, but we may eventually need to fail compatibility checks
-		slog.Debug("error looking up amd driver version", "error", err)
-	}
+	// TODO - this reports incorrect version information, so omitting for now
+	// driverMajor, driverMinor, err := hl.AMDDriverVersion()
+	// if err != nil {
+	// 	// For now this is benign, but we may eventually need to fail compatibility checks
+	// 	slog.Debug("error looking up amd driver version", "error", err)
+	// }

 	// Note: the HIP library automatically handles subsetting to any HIP_VISIBLE_DEVICES the user specified
 	count := hl.HipGetDeviceCount()
@@ -62,10 +60,10 @@ func AMDGetGPUInfo() []GpuInfo {
 			return nil
 		}
 	} else {
-		slog.Debug("skipping rocm gfx compatibility check with HSA_OVERRIDE_GFX_VERSION=" + gfxOverride)
+		slog.Info("skipping rocm gfx compatibility check", "HSA_OVERRIDE_GFX_VERSION", gfxOverride)
 	}

-	slog.Info("detected hip devices", "count", count)
+	slog.Debug("detected hip devices", "count", count)
 	// TODO how to determine the underlying device ID when visible devices is causing this to subset?
 	for i := 0; i < count; i++ {
 		err = hl.HipSetDevice(i)
@@ -85,18 +83,11 @@ func AMDGetGPUInfo() []GpuInfo {
 		// Can luid be used on windows for setting visible devices (and is it actually set?)
 		n = bytes.IndexByte(props.GcnArchName[:], 0)
 		gfx := string(props.GcnArchName[:n])
-		slog.Info("hip device", "id", i, "name", name, "gfx", gfx)
-		var major, minor, patch string
-		switch len(gfx) {
-		case 6:
-			major, minor, patch = gfx[3:4], gfx[4:5], gfx[5:]
-		case 7:
-			major, minor, patch = gfx[3:5], gfx[5:6], gfx[6:]
-		}
+		slog.Debug("hip device", "id", i, "name", name, "gfx", gfx)
 		//slog.Info(fmt.Sprintf("[%d] Integrated: %d", i, props.iGPU)) // DOESN'T REPORT CORRECTLY!  Always 0
 		// TODO  Why isn't props.iGPU accurate!?
 		if strings.EqualFold(name, iGPUName) {
-			slog.Info("iGPU detected skipping", "id", i)
+			slog.Info("unsupported Radeon iGPU detected skipping", "id", i, "name", name, "gfx", gfx)
 			continue
 		}
 		if gfxOverride == "" {
@@ -106,7 +97,7 @@ func AMDGetGPUInfo() []GpuInfo {
 				slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for HSA_OVERRIDE_GFX_VERSION usage")
 				continue
 			} else {
-				slog.Info("amdgpu is supported", "gpu", i, "gpu_type", gfx)
+				slog.Debug("amdgpu is supported", "gpu", i, "gpu_type", gfx)
 			}
 		}

@@ -124,8 +115,8 @@ func AMDGetGPUInfo() []GpuInfo {

 		// TODO revisit this once ROCm v6 is available on windows.
 		// v5.7 only reports VRAM used by this process, so it's completely wrong and unusable
-		slog.Info("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory))
-		slog.Info("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory))
+		slog.Debug("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory))
+		slog.Debug("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory))
 		gpuInfo := GpuInfo{
 			Library: "rocm",
 			memInfo: memInfo{
@@ -135,31 +126,12 @@ func AMDGetGPUInfo() []GpuInfo {
 			ID:             fmt.Sprintf("%d", i), // TODO this is probably wrong if we specify visible devices
 			DependencyPath: libDir,
 			MinimumMemory:  rocmMinimumMemory,
-		}
-		if major != "" {
-			gpuInfo.Major, err = strconv.Atoi(major)
-			if err != nil {
-				slog.Info("failed to parse version", "version", gfx, "error", err)
-			}
-		}
-		if minor != "" {
-			gpuInfo.Minor, err = strconv.Atoi(minor)
-			if err != nil {
-				slog.Info("failed to parse version", "version", gfx, "error", err)
-			}
-		}
-		if patch != "" {
-			// Patch rev is hex; e.g. gfx90a
-			p, err := strconv.ParseInt(patch, 16, 0)
-			if err != nil {
-				slog.Info("failed to parse version", "version", gfx, "error", err)
-			} else {
-				gpuInfo.Patch = int(p)
-			}
-		}
-		if gpuInfo.Major < RocmComputeMin {
-			slog.Warn(fmt.Sprintf("amdgpu [%s] too old gfx%d%d%x", gpuInfo.ID, gpuInfo.Major, gpuInfo.Minor, gpuInfo.Patch))
-			continue
+			Name:           name,
+			Compute:        gfx,
+
+			// TODO - this information isn't accurate on windows, so don't report it until we find the right way to retrieve
+			// DriverMajor:    driverMajor,
+			// DriverMinor:    driverMinor,
 		}

 		resp = append(resp, gpuInfo)
--- a/gpu/assets.go
+++ b/gpu/assets.go
@@ -12,6 +12,8 @@ import (
 	"sync"
 	"syscall"
 	"time"
+
+	"github.com/ollama/ollama/server/envconfig"
 )

 var (
@@ -24,45 +26,8 @@ func PayloadsDir() (string, error) {
 	defer lock.Unlock()
 	var err error
 	if payloadsDir == "" {
-		runnersDir := os.Getenv("OLLAMA_RUNNERS_DIR")
-		// On Windows we do not carry the payloads inside the main executable
-		if runtime.GOOS == "windows" && runnersDir == "" {
-			appExe, err := os.Executable()
-			if err != nil {
-				slog.Error("failed to lookup executable path", "error", err)
-				return "", err
-			}
+		runnersDir := envconfig.RunnersDir

-			cwd, err := os.Getwd()
-			if err != nil {
-				slog.Error("failed to lookup working directory", "error", err)
-				return "", err
-			}
-
-			var paths []string
-			for _, root := range []string{appExe, cwd} {
-				paths = append(paths,
-					filepath.Join(root),
-					filepath.Join(root, "windows-"+runtime.GOARCH),
-					filepath.Join(root, "dist", "windows-"+runtime.GOARCH),
-				)
-			}
-
-			// Try a few variations to improve developer experience when building from source in the local tree
-			for _, p := range paths {
-				candidate := filepath.Join(p, "ollama_runners")
-				_, err := os.Stat(candidate)
-				if err == nil {
-					runnersDir = candidate
-					break
-				}
-			}
-			if runnersDir == "" {
-				err = fmt.Errorf("unable to locate llm runner directory.  Set OLLAMA_RUNNERS_DIR to the location of 'ollama_runners'")
-				slog.Error("incomplete distribution", "error", err)
-				return "", err
-			}
-		}
 		if runnersDir != "" {
 			payloadsDir = runnersDir
 			return payloadsDir, nil
@@ -70,7 +35,7 @@ func PayloadsDir() (string, error) {

 		// The remainder only applies on non-windows where we still carry payloads in the main executable
 		cleanupTmpDirs()
-		tmpDir := os.Getenv("OLLAMA_TMPDIR")
+		tmpDir := envconfig.TmpDir
 		if tmpDir == "" {
 			tmpDir, err = os.MkdirTemp("", "ollama")
 			if err != nil {
@@ -133,7 +98,7 @@ func cleanupTmpDirs() {
 func Cleanup() {
 	lock.Lock()
 	defer lock.Unlock()
-	runnersDir := os.Getenv("OLLAMA_RUNNERS_DIR")
+	runnersDir := envconfig.RunnersDir
 	if payloadsDir != "" && runnersDir == "" && runtime.GOOS != "windows" {
 		// We want to fully clean up the tmpdir parent of the payloads dir
 		tmpDir := filepath.Clean(filepath.Join(payloadsDir, ".."))
--- a/gpu/cpu_common.go
+++ b/gpu/cpu_common.go
@@ -8,14 +8,14 @@ import (

 func GetCPUVariant() string {
 	if cpu.X86.HasAVX2 {
-		slog.Info("CPU has AVX2")
+		slog.Debug("CPU has AVX2")
 		return "avx2"
 	}
 	if cpu.X86.HasAVX {
-		slog.Info("CPU has AVX")
+		slog.Debug("CPU has AVX")
 		return "avx"
 	}
-	slog.Info("CPU does not have vector extensions")
+	slog.Debug("CPU does not have vector extensions")
 	// else LCD
 	return ""
 }
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -21,11 +21,13 @@ import (
 	"unsafe"

 	"github.com/ollama/ollama/format"
+	"github.com/ollama/ollama/server/envconfig"
 )

 type handles struct {
 	deviceCount int
 	cudart      *C.cudart_handle_t
+	nvcuda      *C.nvcuda_handle_t
 }

 const (
@@ -62,6 +64,22 @@ var CudartWindowsGlobs = []string{
 	"c:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\bin\\cudart64_*.dll",
 }

+var NvcudaLinuxGlobs = []string{
+	"/usr/local/cuda*/targets/*/lib/libcuda.so*",
+	"/usr/lib/*-linux-gnu/nvidia/current/libcuda.so*",
+	"/usr/lib/*-linux-gnu/libcuda.so*",
+	"/usr/lib/wsl/lib/libcuda.so*",
+	"/usr/lib/wsl/drivers/*/libcuda.so*",
+	"/opt/cuda/lib*/libcuda.so*",
+	"/usr/local/cuda/lib*/libcuda.so*",
+	"/usr/lib*/libcuda.so*",
+	"/usr/local/lib*/libcuda.so*",
+}
+
+var NvcudaWindowsGlobs = []string{
+	"c:\\windows\\system*\\nvcuda.dll",
+}
+
 // Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
 // Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
 var CudaTegra string = os.Getenv("JETSON_JETPACK")
@@ -74,6 +92,8 @@ func initGPUHandles() *handles {
 	gpuHandles := &handles{}
 	var cudartMgmtName string
 	var cudartMgmtPatterns []string
+	var nvcudaMgmtName string
+	var nvcudaMgmtPatterns []string

 	tmpDir, _ := PayloadsDir()
 	switch runtime.GOOS {
@@ -82,6 +102,9 @@ func initGPUHandles() *handles {
 		localAppData := os.Getenv("LOCALAPPDATA")
 		cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", cudartMgmtName)}
 		cudartMgmtPatterns = append(cudartMgmtPatterns, CudartWindowsGlobs...)
+		// Aligned with driver, we can't carry as payloads
+		nvcudaMgmtName = "nvcuda.dll"
+		nvcudaMgmtPatterns = NvcudaWindowsGlobs
 	case "linux":
 		cudartMgmtName = "libcudart.so*"
 		if tmpDir != "" {
@@ -89,16 +112,30 @@ func initGPUHandles() *handles {
 			cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", cudartMgmtName)}
 		}
 		cudartMgmtPatterns = append(cudartMgmtPatterns, CudartLinuxGlobs...)
+		// Aligned with driver, we can't carry as payloads
+		nvcudaMgmtName = "libcuda.so*"
+		nvcudaMgmtPatterns = NvcudaLinuxGlobs
 	default:
 		return gpuHandles
 	}

-	slog.Info("Detecting GPUs")
+	slog.Debug("Detecting GPUs")
+	nvcudaLibPaths := FindGPULibs(nvcudaMgmtName, nvcudaMgmtPatterns)
+	if len(nvcudaLibPaths) > 0 {
+		deviceCount, nvcuda, libPath := LoadNVCUDAMgmt(nvcudaLibPaths)
+		if nvcuda != nil {
+			slog.Debug("detected GPUs", "count", deviceCount, "library", libPath)
+			gpuHandles.nvcuda = nvcuda
+			gpuHandles.deviceCount = deviceCount
+			return gpuHandles
+		}
+	}
+
 	cudartLibPaths := FindGPULibs(cudartMgmtName, cudartMgmtPatterns)
 	if len(cudartLibPaths) > 0 {
 		deviceCount, cudart, libPath := LoadCUDARTMgmt(cudartLibPaths)
 		if cudart != nil {
-			slog.Info("detected GPUs", "library", libPath, "count", deviceCount)
+			slog.Debug("detected GPUs", "library", libPath, "count", deviceCount)
 			gpuHandles.cudart = cudart
 			gpuHandles.deviceCount = deviceCount
 			return gpuHandles
@@ -118,6 +155,9 @@ func GetGPUInfo() GpuInfoList {
 		if gpuHandles.cudart != nil {
 			C.cudart_release(*gpuHandles.cudart)
 		}
+		if gpuHandles.nvcuda != nil {
+			C.nvcuda_release(*gpuHandles.nvcuda)
+		}
 	}()

 	// All our GPU builds on x86 have AVX enabled, so fallback to CPU if we don't detect at least AVX
@@ -126,6 +166,12 @@ func GetGPUInfo() GpuInfoList {
 		slog.Warn("CPU does not have AVX or AVX2, disabling GPU support.")
 	}

+	// On windows we bundle the nvidia library one level above the runner dir
+	depPath := ""
+	if runtime.GOOS == "windows" && envconfig.RunnersDir != "" {
+		depPath = filepath.Dir(envconfig.RunnersDir)
+	}
+
 	var memInfo C.mem_info_t
 	resp := []GpuInfo{}

@@ -138,7 +184,15 @@ func GetGPUInfo() GpuInfoList {
 		gpuInfo := GpuInfo{
 			Library: "cuda",
 		}
-		C.cudart_check_vram(*gpuHandles.cudart, C.int(i), &memInfo)
+		var driverMajor int
+		var driverMinor int
+		if gpuHandles.cudart != nil {
+			C.cudart_check_vram(*gpuHandles.cudart, C.int(i), &memInfo)
+		} else {
+			C.nvcuda_check_vram(*gpuHandles.nvcuda, C.int(i), &memInfo)
+			driverMajor = int(gpuHandles.nvcuda.driver_major)
+			driverMinor = int(gpuHandles.nvcuda.driver_minor)
+		}
 		if memInfo.err != nil {
 			slog.Info("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
 			C.free(unsafe.Pointer(memInfo.err))
@@ -151,9 +205,12 @@ func GetGPUInfo() GpuInfoList {
 		gpuInfo.TotalMemory = uint64(memInfo.total)
 		gpuInfo.FreeMemory = uint64(memInfo.free)
 		gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
-		gpuInfo.Major = int(memInfo.major)
-		gpuInfo.Minor = int(memInfo.minor)
+		gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor)
 		gpuInfo.MinimumMemory = cudaMinimumMemory
+		gpuInfo.DependencyPath = depPath
+		gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
+		gpuInfo.DriverMajor = int(driverMajor)
+		gpuInfo.DriverMinor = int(driverMinor)

 		// TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
 		resp = append(resp, gpuInfo)
@@ -196,9 +253,10 @@ func GetCPUMem() (memInfo, error) {
 	return ret, nil
 }

-func FindGPULibs(baseLibName string, patterns []string) []string {
+func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
 	// Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them
 	var ldPaths []string
+	var patterns []string
 	gpuLibPaths := []string{}
 	slog.Debug("Searching for GPU library", "name", baseLibName)

@@ -218,8 +276,14 @@ func FindGPULibs(baseLibName string, patterns []string) []string {
 		}
 		patterns = append(patterns, filepath.Join(d, baseLibName+"*"))
 	}
+	patterns = append(patterns, defaultPatterns...)
 	slog.Debug("gpu library search", "globs", patterns)
 	for _, pattern := range patterns {
+
+		// Nvidia PhysX known to return bogus results
+		if strings.Contains(pattern, "PhysX") {
+			slog.Debug("skipping PhysX cuda library path", "path", pattern)
+		}
 		// Ignore glob discovery errors
 		matches, _ := filepath.Glob(pattern)
 		for _, match := range matches {
@@ -267,8 +331,25 @@ func LoadCUDARTMgmt(cudartLibPaths []string) (int, *C.cudart_handle_t, string) {
 	return 0, nil, ""
 }

+func LoadNVCUDAMgmt(nvcudaLibPaths []string) (int, *C.nvcuda_handle_t, string) {
+	var resp C.nvcuda_init_resp_t
+	resp.ch.verbose = getVerboseState()
+	for _, libPath := range nvcudaLibPaths {
+		lib := C.CString(libPath)
+		defer C.free(unsafe.Pointer(lib))
+		C.nvcuda_init(lib, &resp)
+		if resp.err != nil {
+			slog.Debug("Unable to load nvcuda", "library", libPath, "error", C.GoString(resp.err))
+			C.free(unsafe.Pointer(resp.err))
+		} else {
+			return int(resp.num_devices), &resp.ch, libPath
+		}
+	}
+	return 0, nil, ""
+}
+
 func getVerboseState() C.uint16_t {
-	if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
+	if envconfig.Debug {
 		return C.uint16_t(1)
 	}
 	return C.uint16_t(0)
--- a/gpu/gpu_darwin.go
+++ b/gpu/gpu_darwin.go
@@ -10,6 +10,12 @@ package gpu
 import "C"
 import (
 	"runtime"
+
+	"github.com/ollama/ollama/format"
+)
+
+const (
+	metalMinimumMemory = 512 * format.MebiByte
 )

 func GetGPUInfo() GpuInfoList {
@@ -32,7 +38,7 @@ func GetGPUInfo() GpuInfoList {
 	// TODO is there a way to gather actual allocated video memory? (currentAllocatedSize doesn't work)
 	info.FreeMemory = info.TotalMemory

-	info.MinimumMemory = 0
+	info.MinimumMemory = metalMinimumMemory
 	return []GpuInfo{info}
 }

--- a/gpu/gpu_info.h
+++ b/gpu/gpu_info.h
@@ -39,16 +39,19 @@ extern "C" {
 #endif

 #define GPU_ID_LEN 64
+#define GPU_NAME_LEN 96

 typedef struct mem_info {
  char *err;  // If non-nill, caller responsible for freeing
  char gpu_id[GPU_ID_LEN];
+  char gpu_name[GPU_NAME_LEN];
  uint64_t total;
  uint64_t free;

  // Compute Capability
  int major; 
  int minor;
+  int patch;
 } mem_info_t;

 void cpu_check_ram(mem_info_t *resp);
@@ -58,6 +61,7 @@ void cpu_check_ram(mem_info_t *resp);
 #endif

 #include "gpu_info_cudart.h"
+#include "gpu_info_nvcuda.h"

 #endif  // __GPU_INFO_H__
 #endif  // __APPLE__
--- a/gpu/gpu_info_cpu.c
+++ b/gpu/gpu_info_cpu.c
@@ -10,8 +10,6 @@ void cpu_check_ram(mem_info_t *resp) {
  if (GlobalMemoryStatusEx(&info) != 0) {
    resp->total = info.ullTotalPhys;
    resp->free = info.ullAvailPhys;
-    resp->major = 0;
-    resp->minor = 0;
    snprintf(&resp->gpu_id[0], GPU_ID_LEN, "0");
  } else {
    resp->err = LOAD_ERR();
@@ -31,8 +29,6 @@ void cpu_check_ram(mem_info_t *resp) {
  } else {
    resp->total = info.totalram * info.mem_unit;
    resp->free = info.freeram * info.mem_unit;
-    resp->major = 0;
-    resp->minor = 0;
    snprintf(&resp->gpu_id[0], GPU_ID_LEN, "0");
  }
  return;
--- a/gpu/gpu_info_cudart.h
+++ b/gpu/gpu_info_cudart.h
@@ -6,9 +6,9 @@
 // Just enough typedef's to dlopen/dlsym for memory information
 typedef enum cudartReturn_enum {
  CUDART_SUCCESS = 0,
-  CUDA_ERROR_INVALID_VALUE = 1,
-  CUDA_ERROR_MEMORY_ALLOCATION = 2,
-  CUDA_ERROR_INSUFFICIENT_DRIVER = 35,
+  CUDART_ERROR_INVALID_VALUE = 1,
+  CUDART_ERROR_MEMORY_ALLOCATION = 2,
+  CUDART_ERROR_INSUFFICIENT_DRIVER = 35,
  // Other values omitted for now...
 } cudartReturn_t;

--- a/gpu/gpu_info_nvcuda.c
+++ b/gpu/gpu_info_nvcuda.c
@@ -0,0 +1,207 @@
+#ifndef __APPLE__  // TODO - maybe consider nvidia support on intel macs?
+
+#include <string.h>
+#include "gpu_info_nvcuda.h"
+
+void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
+  CUresult ret;
+  resp->err = NULL;
+  resp->num_devices = 0;
+  const int buflen = 256;
+  char buf[buflen + 1];
+  int i;
+
+  struct lookup {
+    char *s;
+    void **p;
+  } l[] = {
+   
+      {"cuInit", (void *)&resp->ch.cuInit},
+      {"cuDriverGetVersion", (void *)&resp->ch.cuDriverGetVersion},
+      {"cuDeviceGetCount", (void *)&resp->ch.cuDeviceGetCount},
+      {"cuDeviceGet", (void *)&resp->ch.cuDeviceGet},
+      {"cuDeviceGetAttribute", (void *)&resp->ch.cuDeviceGetAttribute},
+      {"cuDeviceGetUuid", (void *)&resp->ch.cuDeviceGetUuid},
+      {"cuDeviceGetName", (void *)&resp->ch.cuDeviceGetName},
+      {"cuCtxCreate_v3", (void *)&resp->ch.cuCtxCreate_v3},
+      {"cuMemGetInfo_v2", (void *)&resp->ch.cuMemGetInfo_v2},
+      {"cuCtxDestroy", (void *)&resp->ch.cuCtxDestroy},
+      {NULL, NULL},
+  };
+
+  resp->ch.handle = LOAD_LIBRARY(nvcuda_lib_path, RTLD_LAZY);
+  if (!resp->ch.handle) {
+    char *msg = LOAD_ERR();
+    LOG(resp->ch.verbose, "library %s load err: %s\n", nvcuda_lib_path, msg);
+    snprintf(buf, buflen,
+            "Unable to load %s library to query for Nvidia GPUs: %s",
+            nvcuda_lib_path, msg);
+    free(msg);
+    resp->err = strdup(buf);
+    return;
+  }
+
+  for (i = 0; l[i].s != NULL; i++) {
+    *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
+    if (!*l[i].p) {
+      char *msg = LOAD_ERR();
+      LOG(resp->ch.verbose, "dlerr: %s\n", msg);
+      UNLOAD_LIBRARY(resp->ch.handle);
+      resp->ch.handle = NULL;
+      snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
+              msg);
+      free(msg);
+      resp->err = strdup(buf);
+      return;
+    }
+  }
+
+  ret = (*resp->ch.cuInit)(0);
+  if (ret != CUDA_SUCCESS) {
+    LOG(resp->ch.verbose, "cuInit err: %d\n", ret);
+    UNLOAD_LIBRARY(resp->ch.handle);
+    resp->ch.handle = NULL;
+    if (ret == CUDA_ERROR_INSUFFICIENT_DRIVER) {
+      resp->err = strdup("your nvidia driver is too old or missing.  If you have a CUDA GPU please upgrade to run ollama");
+      return;
+    }
+    snprintf(buf, buflen, "nvcuda init failure: %d", ret);
+    resp->err = strdup(buf);
+    return;
+  }
+
+  int version = 0;
+  resp->ch.driver_major = 0;
+  resp->ch.driver_minor = 0;
+
+  // Report driver version if we're in verbose mode, ignore errors
+  ret = (*resp->ch.cuDriverGetVersion)(&version);
+  if (ret != CUDA_SUCCESS) {
+    LOG(resp->ch.verbose, "cuDriverGetVersion failed: %d\n", ret);
+  } else {
+    resp->ch.driver_major = version / 1000;
+    resp->ch.driver_minor = (version - (resp->ch.driver_major * 1000)) / 10;
+    LOG(resp->ch.verbose, "CUDA driver version: %d.%d\n", resp->ch.driver_major, resp->ch.driver_minor);
+  }
+
+  ret = (*resp->ch.cuDeviceGetCount)(&resp->num_devices);
+  if (ret != CUDA_SUCCESS) {
+    LOG(resp->ch.verbose, "cuDeviceGetCount err: %d\n", ret);
+    UNLOAD_LIBRARY(resp->ch.handle);
+    resp->ch.handle = NULL;
+    snprintf(buf, buflen, "unable to get device count: %d", ret);
+    resp->err = strdup(buf);
+    return;
+  }
+}
+
+const int buflen = 256;
+void nvcuda_check_vram(nvcuda_handle_t h, int i, mem_info_t *resp) {
+  resp->err = NULL;
+  nvcudaMemory_t memInfo = {0,0};
+  CUresult ret;
+  CUdevice device = -1;
+  CUcontext ctx = NULL;
+  char buf[buflen + 1];
+  CUuuid uuid = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+
+  if (h.handle == NULL) {
+    resp->err = strdup("nvcuda handle isn't initialized");
+    return;
+  }
+
+  ret = (*h.cuDeviceGet)(&device, i);
+  if (ret != CUDA_SUCCESS) {
+    snprintf(buf, buflen, "nvcuda device failed to initialize");
+    resp->err = strdup(buf);
+    return;
+  }
+
+  int major = 0;
+  int minor = 0;
+  ret = (*h.cuDeviceGetAttribute)(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
+  if (ret != CUDA_SUCCESS) {
+    LOG(h.verbose, "[%d] device major lookup failure: %d\n", i, ret);
+  } else {
+    ret = (*h.cuDeviceGetAttribute)(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device);
+    if (ret != CUDA_SUCCESS) {
+      LOG(h.verbose, "[%d] device minor lookup failure: %d\n", i, ret);
+    } else {
+      resp->minor = minor;  
+      resp->major = major;  
+    }
+  }
+
+  ret = (*h.cuDeviceGetUuid)(&uuid, device);
+  if (ret != CUDA_SUCCESS) {
+    LOG(h.verbose, "[%d] device uuid lookup failure: %d\n", i, ret);
+    snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", i);
+  } else {
+    // GPU-d110a105-ac29-1d54-7b49-9c90440f215b
+    snprintf(&resp->gpu_id[0], GPU_ID_LEN,
+        "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
+        uuid.bytes[0],
+        uuid.bytes[1],
+        uuid.bytes[2],
+        uuid.bytes[3],
+        uuid.bytes[4],
+        uuid.bytes[5],
+        uuid.bytes[6],
+        uuid.bytes[7],
+        uuid.bytes[8],
+        uuid.bytes[9],
+        uuid.bytes[10],
+        uuid.bytes[11],
+        uuid.bytes[12],
+        uuid.bytes[13],
+        uuid.bytes[14],
+        uuid.bytes[15]
+      );
+  }
+
+  ret = (*h.cuDeviceGetName)(&resp->gpu_name[0], GPU_NAME_LEN, device);
+  if (ret != CUDA_SUCCESS) {
+    LOG(h.verbose, "[%d] device name lookup failure: %d\n", i, ret);
+    resp->gpu_name[0] = '\0';
+  }
+
+  // To get memory we have to set (and release) a context
+  ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
+  if (ret != CUDA_SUCCESS) {
+    snprintf(buf, buflen, "nvcuda failed to get primary device context %d", ret);
+    resp->err = strdup(buf);
+    return;
+  }
+
+  ret = (*h.cuMemGetInfo_v2)(&memInfo.free, &memInfo.total);
+  if (ret != CUDA_SUCCESS) {
+    snprintf(buf, buflen, "nvcuda device memory info lookup failure %d", ret);
+    resp->err = strdup(buf);
+    // Best effort on failure...
+    (*h.cuCtxDestroy)(ctx);
+    return;
+  }
+
+  resp->total = memInfo.total;
+  resp->free = memInfo.free;
+
+  LOG(h.verbose, "[%s] CUDA totalMem %lu mb\n", resp->gpu_id, resp->total / 1024 / 1024);
+  LOG(h.verbose, "[%s] CUDA freeMem %lu mb\n", resp->gpu_id, resp->free / 1024 / 1024);
+  LOG(h.verbose, "[%s] Compute Capability %d.%d\n", resp->gpu_id, resp->major, resp->minor);
+
+  
+
+  ret = (*h.cuCtxDestroy)(ctx);
+  if (ret != CUDA_SUCCESS) {
+    LOG(1, "nvcuda failed to release primary device context %d", ret);
+  }
+}
+
+void nvcuda_release(nvcuda_handle_t h) {
+  LOG(h.verbose, "releasing nvcuda library\n");
+  UNLOAD_LIBRARY(h.handle);
+  // TODO and other context release logic?
+  h.handle = NULL;
+}
+
+#endif  // __APPLE__
--- a/gpu/gpu_info_nvcuda.h
+++ b/gpu/gpu_info_nvcuda.h
@@ -0,0 +1,74 @@
+#ifndef __APPLE__
+#ifndef __GPU_INFO_NVCUDA_H__
+#define __GPU_INFO_NVCUDA_H__
+#include "gpu_info.h"
+
+// Just enough typedef's to dlopen/dlsym for memory information
+typedef enum cudaError_enum {
+  CUDA_SUCCESS = 0,
+  CUDA_ERROR_INVALID_VALUE = 1,
+  CUDA_ERROR_MEMORY_ALLOCATION = 2,
+  CUDA_ERROR_NOT_INITIALIZED = 3,
+  CUDA_ERROR_INSUFFICIENT_DRIVER = 35,
+  // Other values omitted for now...
+} CUresult;
+
+typedef enum CUdevice_attribute_enum {
+  CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75,
+  CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76,
+
+  // TODO - not yet wired up but may be useful for Jetson or other
+  // integrated GPU scenarios with shared memory
+  CU_DEVICE_ATTRIBUTE_INTEGRATED = 18
+
+} CUdevice_attribute;
+
+typedef void *nvcudaDevice_t;  // Opaque is sufficient
+typedef struct nvcudaMemory_st {
+  uint64_t total;
+  uint64_t free;
+} nvcudaMemory_t;
+
+typedef struct nvcudaDriverVersion {
+  int major;
+  int minor;
+} nvcudaDriverVersion_t;
+
+typedef struct CUuuid_st {
+    unsigned char bytes[16];
+} CUuuid;
+
+typedef int CUdevice;
+typedef void* CUcontext;
+
+typedef struct nvcuda_handle {
+  void *handle;
+  uint16_t verbose;
+  int driver_major;
+  int driver_minor;
+  CUresult (*cuInit)(unsigned int Flags);
+  CUresult (*cuDriverGetVersion)(int *driverVersion);
+  CUresult (*cuDeviceGetCount)(int *);
+  CUresult (*cuDeviceGet)(CUdevice* device, int ordinal);
+  CUresult (*cuDeviceGetAttribute)(int* pi, CUdevice_attribute attrib, CUdevice dev);
+  CUresult (*cuDeviceGetUuid)(CUuuid* uuid, CUdevice dev); // signature compatible with cuDeviceGetUuid_v2
+  CUresult (*cuDeviceGetName)(char *name, int len, CUdevice dev);
+
+  // Context specific aspects
+  CUresult (*cuCtxCreate_v3)(CUcontext* pctx, void *params, int len, unsigned int flags, CUdevice dev);
+  CUresult (*cuMemGetInfo_v2)(uint64_t* free, uint64_t* total);
+  CUresult (*cuCtxDestroy)(CUcontext ctx);
+} nvcuda_handle_t;
+
+typedef struct nvcuda_init_resp {
+  char *err;  // If err is non-null handle is invalid
+  nvcuda_handle_t ch;
+  int num_devices;
+} nvcuda_init_resp_t;
+
+void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp);
+void nvcuda_check_vram(nvcuda_handle_t ch, int device_id, mem_info_t *resp);
+void nvcuda_release(nvcuda_handle_t ch);
+
+#endif  // __GPU_INFO_NVCUDA_H__
+#endif  // __APPLE__
--- a/gpu/types.go
+++ b/gpu/types.go
@@ -1,5 +1,12 @@
 package gpu

+import (
+	"fmt"
+	"log/slog"
+
+	"github.com/ollama/ollama/format"
+)
+
 type memInfo struct {
 	TotalMemory uint64 `json:"total_memory,omitempty"`
 	FreeMemory  uint64 `json:"free_memory,omitempty"`
@@ -20,11 +27,13 @@ type GpuInfo struct {
 	DependencyPath string `json:"lib_path,omitempty"`

 	// GPU information
-	ID    string `json:"gpu_id"`          // string to use for selection of this specific GPU
-	Name  string `json:"name"`            // user friendly name if available
-	Major int    `json:"major,omitempty"` // Major compatibility version (CC or gfx)
-	Minor int    `json:"minor,omitempty"` // Minor compatibility version (CC or gfx)
-	Patch int    `json:"patch,omitempty"` // Patch compatibility only matters on AMD
+	ID      string `json:"gpu_id"`  // string to use for selection of this specific GPU
+	Name    string `json:"name"`    // user friendly name if available
+	Compute string `json:"compute"` // Compute Capability or gfx
+
+	// Driver Information - TODO no need to put this on each GPU
+	DriverMajor int `json:"driver_major,omitempty"`
+	DriverMinor int `json:"driver_minor,omitempty"`

 	// TODO other performance capability info to help in scheduling decisions
 }
@@ -56,6 +65,21 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList {
 	return resp
 }

+// Report the GPU information into the log an Info level
+func (l GpuInfoList) LogDetails() {
+	for _, g := range l {
+		slog.Info("inference compute",
+			"id", g.ID,
+			"library", g.Library,
+			"compute", g.Compute,
+			"driver", fmt.Sprintf("%d.%d", g.DriverMajor, g.DriverMinor),
+			"name", g.Name,
+			"total", format.HumanBytes2(g.TotalMemory),
+			"available", format.HumanBytes2(g.FreeMemory),
+		)
+	}
+}
+
 // Sort by Free Space
 type ByFreeMemory []GpuInfo

--- a/integration/concurrency_test.go
+++ b/integration/concurrency_test.go
@@ -217,7 +217,7 @@ func TestMultiModelStress(t *testing.T) {
 			defer wg.Done()
 			for j := 0; j < 3; j++ {
 				slog.Info("Starting", "req", i, "iter", j, "model", req[i].Model)
-				DoGenerate(ctx, t, client, req[i], resp[i], 90*time.Second, 5*time.Second)
+				DoGenerate(ctx, t, client, req[i], resp[i], 120*time.Second, 5*time.Second)
 			}
 		}(i)
 	}
--- a/integration/max_queue_test.go
+++ b/integration/max_queue_test.go
@@ -0,0 +1,117 @@
+//go:build integration
+
+package integration
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"log/slog"
+	"os"
+	"strconv"
+	"strings"
+	"sync"
+	"testing"
+	"time"
+
+	"github.com/ollama/ollama/api"
+	"github.com/stretchr/testify/require"
+)
+
+func TestMaxQueue(t *testing.T) {
+	// Note: This test can be quite slow when running in CPU mode, so keep the threadCount low unless your on GPU
+	// Also note that by default Darwin can't sustain > ~128 connections without adjusting limits
+	threadCount := 32
+	mq := os.Getenv("OLLAMA_MAX_QUEUE")
+	if mq != "" {
+		var err error
+		threadCount, err = strconv.Atoi(mq)
+		require.NoError(t, err)
+	} else {
+		os.Setenv("OLLAMA_MAX_QUEUE", fmt.Sprintf("%d", threadCount))
+	}
+
+	req := api.GenerateRequest{
+		Model:  "orca-mini",
+		Prompt: "write a long historical fiction story about christopher columbus.  use at least 10 facts from his actual journey",
+		Options: map[string]interface{}{
+			"seed":        42,
+			"temperature": 0.0,
+		},
+	}
+	resp := []string{"explore", "discover", "ocean"}
+
+	// CPU mode takes much longer at the limit with a large queue setting
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
+	defer cancel()
+	client, _, cleanup := InitServerConnection(ctx, t)
+	defer cleanup()
+
+	require.NoError(t, PullIfMissing(ctx, client, req.Model))
+
+	// Context for the worker threads so we can shut them down
+	// embedCtx, embedCancel := context.WithCancel(ctx)
+	embedCtx := ctx
+
+	var genwg sync.WaitGroup
+	go func() {
+		genwg.Add(1)
+		defer genwg.Done()
+		slog.Info("Starting generate request")
+		DoGenerate(ctx, t, client, req, resp, 45*time.Second, 5*time.Second)
+		slog.Info("generate completed")
+	}()
+
+	// Give the generate a chance to get started before we start hammering on embed requests
+	time.Sleep(5 * time.Millisecond)
+
+	threadCount += 10 // Add a few extra to ensure we push the queue past its limit
+	busyCount := 0
+	resetByPeerCount := 0
+	canceledCount := 0
+	succesCount := 0
+	counterMu := sync.Mutex{}
+	var embedwg sync.WaitGroup
+	for i := 0; i < threadCount; i++ {
+		go func(i int) {
+			embedwg.Add(1)
+			defer embedwg.Done()
+			slog.Info("embed started", "id", i)
+			embedReq := api.EmbeddingRequest{
+				Model:   req.Model,
+				Prompt:  req.Prompt,
+				Options: req.Options,
+			}
+			// Fresh client for every request
+			client, _ = GetTestEndpoint()
+
+			resp, genErr := client.Embeddings(embedCtx, &embedReq)
+			counterMu.Lock()
+			defer counterMu.Unlock()
+			switch {
+			case genErr == nil:
+				succesCount++
+				require.Greater(t, len(resp.Embedding), 5) // somewhat arbitrary, but sufficient to be reasonable
+			case errors.Is(genErr, context.Canceled):
+				canceledCount++
+			case strings.Contains(genErr.Error(), "busy"):
+				busyCount++
+			case strings.Contains(genErr.Error(), "connection reset by peer"):
+				resetByPeerCount++
+			default:
+				require.NoError(t, genErr, "%d request failed", i)
+			}
+
+			slog.Info("embed finished", "id", i)
+		}(i)
+	}
+	genwg.Wait()
+	slog.Info("generate done, waiting for embeds")
+	embedwg.Wait()
+
+	require.Equal(t, resetByPeerCount, 0, "Connections reset by peer, have you updated your fd and socket limits?")
+	require.True(t, busyCount > 0, "no requests hit busy error but some should have")
+	require.True(t, canceledCount == 0, "no requests should have been canceled due to timeout")
+
+	slog.Info("embeds completed", "success", succesCount, "busy", busyCount, "reset", resetByPeerCount, "canceled", canceledCount)
+}
--- a/integration/utils_test.go
+++ b/integration/utils_test.go
@@ -85,7 +85,7 @@ func GetTestEndpoint() (*api.Client, string) {
 var serverMutex sync.Mutex
 var serverReady bool

-func startServer(ctx context.Context, ollamaHost string) error {
+func startServer(t *testing.T, ctx context.Context, ollamaHost string) error {
 	// Make sure the server has been built
 	CLIName, err := filepath.Abs("../ollama")
 	if err != nil {
@@ -107,7 +107,7 @@ func startServer(ctx context.Context, ollamaHost string) error {

 	if tmp := os.Getenv("OLLAMA_HOST"); tmp != ollamaHost {
 		slog.Info("setting env", "OLLAMA_HOST", ollamaHost)
-		os.Setenv("OLLAMA_HOST", ollamaHost)
+		t.Setenv("OLLAMA_HOST", ollamaHost)
 	}

 	slog.Info("starting server", "url", ollamaHost)
@@ -200,7 +200,7 @@ func InitServerConnection(ctx context.Context, t *testing.T) (*api.Client, strin
 		}
 		lifecycle.ServerLogFile = fp.Name()
 		fp.Close()
-		require.NoError(t, startServer(ctx, testEndpoint))
+		require.NoError(t, startServer(t, ctx, testEndpoint))
 	}

 	return client, testEndpoint, func() {
--- a/llm/ext_server/server.cpp
+++ b/llm/ext_server/server.cpp
@@ -66,7 +66,7 @@ struct server_params {
 };

 bool server_verbose = false;
-bool server_log_json = true;
+bool server_log_json = false;

 enum stop_type {
    STOP_FULL,
@@ -266,7 +266,7 @@ struct server_slot {
        sprintf(buffer, "prompt eval time     = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
                t_prompt_processing, n_prompt_tokens_processed,
                t_token, n_tokens_second);
-        LOG_INFO(buffer, {
+        LOG_DEBUG(buffer, {
            {"slot_id",                   id},
            {"task_id",                   task_id},
            {"t_prompt_processing",       t_prompt_processing},
@@ -280,7 +280,7 @@ struct server_slot {
        sprintf(buffer, "generation eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)",
                t_token_generation, n_decoded,
                t_token, n_tokens_second);
-        LOG_INFO(buffer, {
+        LOG_DEBUG(buffer, {
            {"slot_id",            id},
            {"task_id",            task_id},
            {"t_token_generation", t_token_generation},
@@ -290,7 +290,7 @@ struct server_slot {
        });

        sprintf(buffer, "          total time = %10.2f ms", t_prompt_processing + t_token_generation);
-        LOG_INFO(buffer, {
+        LOG_DEBUG(buffer, {
            {"slot_id",             id},
            {"task_id",             task_id},
            {"t_prompt_processing", t_prompt_processing},
@@ -371,7 +371,7 @@ struct llama_server_context
    {
        if (clp_ctx)
        {
-            LOG_INFO("freeing clip model", {});
+            LOG_DEBUG("freeing clip model", {});
            clip_free(clp_ctx);
            clp_ctx = nullptr;
        }
@@ -392,7 +392,7 @@ struct llama_server_context
        params = params_;
        if (!params.mmproj.empty()) {
            multimodal = true;
-            LOG_INFO("Multi Modal Mode Enabled", {});
+            LOG_DEBUG("Multi Modal Mode Enabled", {});
            clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
            if(clp_ctx == nullptr) {
                LOG_ERROR("unable to load clip model", {{"model", params.mmproj}});
@@ -445,7 +445,7 @@ struct llama_server_context

        const int32_t n_ctx_slot = n_ctx / params.n_parallel;

-        LOG_INFO("initializing slots", {{"n_slots", params.n_parallel}});
+        LOG_DEBUG("initializing slots", {{"n_slots", params.n_parallel}});
        for (int i = 0; i < params.n_parallel; i++)
        {
            server_slot slot;
@@ -454,7 +454,7 @@ struct llama_server_context
            slot.n_ctx = n_ctx_slot;
            slot.n_predict = params.n_predict;

-            LOG_INFO("new slot", {
+            LOG_DEBUG("new slot", {
                {"slot_id",    slot.id},
                {"n_ctx_slot", slot.n_ctx}
            });
@@ -468,7 +468,7 @@ struct llama_server_context
                //GGML_ASSERT(n_ctx_train % ga_w == 0     && "n_ctx_train must be a multiple of ga_w");    // NOLINT
                //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT

-                LOG_INFO("slot self-extend", {
+                LOG_DEBUG("slot self-extend", {
                    {"slot_id",   slot.id},
                    {"ga_n",      ga_n},
                    {"ga_w",      ga_w}
@@ -827,7 +827,7 @@ struct llama_server_context

        all_slots_are_idle = false;

-        LOG_INFO("slot is processing task", {
+        LOG_DEBUG("slot is processing task", {
            {"slot_id", slot->id},
            {"task_id", slot->task_id},
        });
@@ -1032,7 +1032,7 @@ struct llama_server_context
            slot.has_next_token = false;
        }

-        if (!slot.cache_tokens.empty() && result.tok == llama_token_eos(model))
+        if (!slot.cache_tokens.empty() && llama_token_is_eog(model, result.tok))
        {
            slot.stopped_eos = true;
            slot.has_next_token = false;
@@ -1144,12 +1144,15 @@ struct llama_server_context

        res.result_json = json
        {
-            {"content",    tkn.text_to_send},
            {"stop",       false},
            {"slot_id",    slot.id},
            {"multimodal", multimodal}
        };

+        if (!llama_token_is_eog(model, tkn.tok)) {
+            res.result_json["content"] = tkn.text_to_send;
+        }
+
        if (slot.sparams.n_probs > 0)
        {
            std::vector<completion_token_output> probs_output = {};
@@ -1183,8 +1186,6 @@ struct llama_server_context
            {"model",               params.model_alias},
            {"tokens_predicted",    slot.n_decoded},
            {"tokens_evaluated",    slot.n_prompt_tokens},
-            {"generation_settings", get_formated_generation(slot)},
-            {"prompt",              slot.prompt},
            {"truncated",           slot.truncated},
            {"stopped_eos",         slot.stopped_eos},
            {"stopped_word",        slot.stopped_word},
@@ -1503,7 +1504,7 @@ struct llama_server_context
                    }
                    slots_data.push_back(slot_data);
                }
-                LOG_INFO("slot data", {
+                LOG_DEBUG("slot data", {
                    {"task_id",            task.id},
                    {"n_idle_slots",       n_idle_slots},
                    {"n_processing_slots", n_processing_slots}
@@ -1565,7 +1566,7 @@ struct llama_server_context
    bool update_slots() {
        if (system_need_update)
        {
-            LOG_INFO("updating system prompt", {});
+            LOG_DEBUG("updating system prompt", {});
            system_prompt_update();
        }

@@ -1575,7 +1576,7 @@ struct llama_server_context
        {
            if (system_prompt.empty() && clean_kv_cache)
            {
-                LOG_INFO("all slots are idle and system prompt is empty, clear the KV cache", {});
+                LOG_DEBUG("all slots are idle and system prompt is empty, clear the KV cache", {});
                kv_cache_clear();
            }
            return true;
@@ -1598,7 +1599,7 @@ struct llama_server_context
                    const int n_left    = (int) system_tokens.size() + slot.n_past - n_keep;
                    const int n_discard = n_left / 2;

-                    LOG_INFO("slot context shift", {
+                    LOG_DEBUG("slot context shift", {
                        {"slot_id",         slot.id},
                        {"task_id",         slot.task_id},
                        {"n_keep",          n_keep},
@@ -1637,7 +1638,7 @@ struct llama_server_context
                slot.command = NONE;
                slot.t_last_used = ggml_time_us();

-                LOG_INFO("slot released", {
+                LOG_DEBUG("slot released", {
                    {"slot_id",         slot.id},
                    {"task_id",         slot.task_id},
                    {"n_ctx",           n_ctx},
@@ -1806,7 +1807,7 @@ struct llama_server_context
                            slot.ga_i = ga_i;
                        }

-                        LOG_INFO("slot progression", {
+                        LOG_DEBUG("slot progression", {
                            { "slot_id",    slot.id },
                            { "task_id",    slot.task_id },
                            { "n_past",     slot.n_past },
@@ -1821,7 +1822,7 @@ struct llama_server_context
                    if (slot.n_past == slot.n_prompt_tokens && slot.n_past > 0)
                    {
                        // we have to evaluate at least 1 token to generate logits.
-                        LOG_INFO("we have to evaluate at least 1 token to generate logits", {
+                        LOG_DEBUG("we have to evaluate at least 1 token to generate logits", {
                            { "slot_id", slot.id },
                            { "task_id", slot.task_id }
                        });
@@ -1833,7 +1834,7 @@ struct llama_server_context
                    }

                    int p0 = (int) system_tokens.size() + slot.n_past;
-                    LOG_INFO("kv cache rm [p0, end)", {
+                    LOG_DEBUG("kv cache rm [p0, end)", {
                        { "slot_id", slot.id },
                        { "task_id", slot.task_id },
                        { "p0",      p0 }
@@ -2490,11 +2491,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
        }
        else if (arg == "-v" || arg == "--verbose")
        {
-#if SERVER_VERBOSE != 1
-            LOG_WARNING("server.cpp is not built with verbose logging.", {});
-#else
            server_verbose = true;
-#endif
        }
        else if (arg == "--mlock")
        {
@@ -2600,7 +2597,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
        else if (arg == "--log-disable")
        {
            log_set_target(stdout);
-            LOG_INFO("logging to file is disabled.", {});
+            LOG_DEBUG("logging to file is disabled.", {});
        }
        else if (arg == "--slots-endpoint-disable")
        {
@@ -2644,18 +2641,18 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
            if (strncmp(sep, "int:", 4) == 0) {
                sep += 4;
                kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
-                kvo.int_value = std::atol(sep);
+                kvo.val_i64 = std::atol(sep);
            } else if (strncmp(sep, "float:", 6) == 0) {
                sep += 6;
                kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
-                kvo.float_value = std::atof(sep);
+                kvo.val_f64 = std::atof(sep);
            } else if (strncmp(sep, "bool:", 5) == 0) {
                sep += 5;
                kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
                if (std::strcmp(sep, "true") == 0) {
-                    kvo.bool_value = true;
+                    kvo.val_bool = true;
                } else if (std::strcmp(sep, "false") == 0) {
-                    kvo.bool_value = false;
+                    kvo.val_bool = false;
                } else {
                    fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]);
                    invalid_param = true;
@@ -2726,12 +2723,12 @@ static json format_detokenized_response(std::string content)
 static void log_server_request(const httplib::Request &req, const httplib::Response &res)
 {
    // skip GH copilot requests when using default port
-    if (req.path == "/v1/health" || req.path == "/v1/completions")
+    if (req.path == "/health" || req.path == "/v1/health" || req.path == "/v1/completions")
    {
        return;
    }

-    LOG_INFO("request", {
+    LOG_DEBUG("request", {
        {"remote_addr", req.remote_addr},
        {"remote_port", req.remote_port},
        {"status",      res.status},
@@ -3053,6 +3050,26 @@ int main(int argc, char **argv) {
        log_data["api_key"] = "api_key: " + std::to_string(sparams.api_keys.size()) + " keys loaded";
    }

+    if (sparams.n_threads_http < 1) {
+        // +2 threads for monitoring endpoints
+        sparams.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1);
+    }
+    log_data["n_threads_http"] =  std::to_string(sparams.n_threads_http);
+    svr.new_task_queue = [&sparams] { return new httplib::ThreadPool(sparams.n_threads_http); };
+
+    LOG_INFO("HTTP server listening", log_data);
+    // run the HTTP server in a thread - see comment below
+    std::thread t([&]()
+            {
+                if (!svr.listen_after_bind())
+                {
+                    state.store(SERVER_STATE_ERROR);
+                    return 1;
+                }
+
+                return 0;
+            });
+
    // load the model
    if (!llama.load_model(params))
    {
@@ -3257,26 +3274,6 @@ int main(int argc, char **argv) {
    }*/
    //);

-    if (sparams.n_threads_http < 1) {
-        // +2 threads for monitoring endpoints
-        sparams.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1);
-    }
-    log_data["n_threads_http"] =  std::to_string(sparams.n_threads_http);
-    svr.new_task_queue = [&sparams] { return new httplib::ThreadPool(sparams.n_threads_http); };
-
-    LOG_INFO("HTTP server listening", log_data);
-    // run the HTTP server in a thread - see comment below
-    std::thread t([&]()
-            {
-                if (!svr.listen_after_bind())
-                {
-                    state.store(SERVER_STATE_ERROR);
-                    return 1;
-                }
-
-                return 0;
-            });
-
    llama.queue_tasks.on_new_task(std::bind(
        &llama_server_context::process_single_task, &llama, std::placeholders::_1));
    llama.queue_tasks.on_finish_multitask(std::bind(
--- a/llm/ext_server/utils.hpp
+++ b/llm/ext_server/utils.hpp
@@ -55,9 +55,10 @@ extern bool server_log_json;
    } while (0)
 #endif

-#define LOG_ERROR(  MSG, ...) server_log("ERR",  __func__, __LINE__, MSG, __VA_ARGS__)
+#define LOG_ERROR(  MSG, ...) server_log("ERROR",  __func__, __LINE__, MSG, __VA_ARGS__)
 #define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
 #define LOG_INFO(   MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
+#define LOG_DEBUG(  MSG, ...) server_log("DEBUG", __func__, __LINE__, MSG, __VA_ARGS__)

 enum server_state {
    SERVER_STATE_LOADING_MODEL,  // Server is starting up, model not fully loaded yet
@@ -123,6 +124,10 @@ static inline void server_log(const char *level, const char *function, int line,
        {"timestamp", time(nullptr)},
    };

+    if (strncmp("DEBUG", level, strlen(level)) == 0 && !server_verbose) {
+        return;
+    }
+
    if (server_log_json) {
        log.merge_patch(
                {
@@ -137,14 +142,12 @@ static inline void server_log(const char *level, const char *function, int line,

        std::cout << log.dump(-1, ' ', false, json::error_handler_t::replace) << "\n" << std::flush;
    } else {
-        char buf[1024];
-        snprintf(buf, 1024, "%4s [%24s] %s", level, function, message);
-
        if (!extra.empty()) {
            log.merge_patch(extra);
        }
+
        std::stringstream ss;
-        ss << buf << " |";
+        ss << level << " [" << function << "] " << message << " |";
        for (const auto& el : log.items())
        {
            const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace);
--- a/llm/filetype.go
+++ b/llm/filetype.go
@@ -0,0 +1,140 @@
+package llm
+
+import "fmt"
+
+type fileType uint32
+
+const (
+	fileTypeF32 fileType = iota
+	fileTypeF16
+	fileTypeQ4_0
+	fileTypeQ4_1
+	fileTypeQ4_1_F16
+	fileTypeQ4_2 // unused
+	fileTypeQ4_3 // unused
+	fileTypeQ8_0
+	fileTypeQ5_0
+	fileTypeQ5_1
+	fileTypeQ2_K
+	fileTypeQ3_K_S
+	fileTypeQ3_K_M
+	fileTypeQ3_K_L
+	fileTypeQ4_K_S
+	fileTypeQ4_K_M
+	fileTypeQ5_K_S
+	fileTypeQ5_K_M
+	fileTypeQ6_K
+	fileTypeIQ2_XXS
+	fileTypeIQ2_XS
+	fileTypeQ2_K_S
+	fileTypeQ3_K_XS
+	fileTypeIQ3_XXS
+
+	fileTypeUnknown
+)
+
+func ParseFileType(s string) (fileType, error) {
+	switch s {
+	case "F32":
+		return fileTypeF32, nil
+	case "F16":
+		return fileTypeF16, nil
+	case "Q4_0":
+		return fileTypeQ4_0, nil
+	case "Q4_1":
+		return fileTypeQ4_1, nil
+	case "Q4_1_F16":
+		return fileTypeQ4_1_F16, nil
+	case "Q8_0":
+		return fileTypeQ8_0, nil
+	case "Q5_0":
+		return fileTypeQ5_0, nil
+	case "Q5_1":
+		return fileTypeQ5_1, nil
+	case "Q2_K":
+		return fileTypeQ2_K, nil
+	case "Q3_K_S":
+		return fileTypeQ3_K_S, nil
+	case "Q3_K_M":
+		return fileTypeQ3_K_M, nil
+	case "Q3_K_L":
+		return fileTypeQ3_K_L, nil
+	case "Q4_K_S":
+		return fileTypeQ4_K_S, nil
+	case "Q4_K_M":
+		return fileTypeQ4_K_M, nil
+	case "Q5_K_S":
+		return fileTypeQ5_K_S, nil
+	case "Q5_K_M":
+		return fileTypeQ5_K_M, nil
+	case "Q6_K":
+		return fileTypeQ6_K, nil
+	case "IQ2_XXS":
+		return fileTypeIQ2_XXS, nil
+	case "IQ2_XS":
+		return fileTypeIQ2_XS, nil
+	case "Q2_K_S":
+		return fileTypeQ2_K_S, nil
+	case "Q3_K_XS":
+		return fileTypeQ3_K_XS, nil
+	case "IQ3_XXS":
+		return fileTypeIQ3_XXS, nil
+	default:
+		return fileTypeUnknown, fmt.Errorf("unknown fileType: %s", s)
+	}
+}
+
+func (t fileType) String() string {
+	switch t {
+	case fileTypeF32:
+		return "F32"
+	case fileTypeF16:
+		return "F16"
+	case fileTypeQ4_0:
+		return "Q4_0"
+	case fileTypeQ4_1:
+		return "Q4_1"
+	case fileTypeQ4_1_F16:
+		return "Q4_1_F16"
+	case fileTypeQ8_0:
+		return "Q8_0"
+	case fileTypeQ5_0:
+		return "Q5_0"
+	case fileTypeQ5_1:
+		return "Q5_1"
+	case fileTypeQ2_K:
+		return "Q2_K"
+	case fileTypeQ3_K_S:
+		return "Q3_K_S"
+	case fileTypeQ3_K_M:
+		return "Q3_K_M"
+	case fileTypeQ3_K_L:
+		return "Q3_K_L"
+	case fileTypeQ4_K_S:
+		return "Q4_K_S"
+	case fileTypeQ4_K_M:
+		return "Q4_K_M"
+	case fileTypeQ5_K_S:
+		return "Q5_K_S"
+	case fileTypeQ5_K_M:
+		return "Q5_K_M"
+	case fileTypeQ6_K:
+		return "Q6_K"
+	case fileTypeIQ2_XXS:
+		return "IQ2_XXS"
+	case fileTypeIQ2_XS:
+		return "IQ2_XS"
+	case fileTypeQ2_K_S:
+		return "Q2_K_S"
+	case fileTypeQ3_K_XS:
+		return "Q3_K_XS"
+	case fileTypeIQ3_XXS:
+		return "IQ3_XXS"
+	default:
+		return "unknown"
+	}
+}
+
+func (t fileType) Value() uint32 {
+	return uint32(t)
+}
--- a/llm/generate/gen_windows.ps1
+++ b/llm/generate/gen_windows.ps1
@@ -42,7 +42,7 @@ function init_vars {
        "-DLLAMA_NATIVE=off"
        )
    $script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")
-    $script:ARCH = "amd64" # arm not yet supported.
+    $script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower()
    $script:DIST_BASE = "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_runners"
    md "$script:DIST_BASE" -ea 0 > $null
    if ($env:CGO_CFLAGS -contains "-g") {
@@ -213,11 +213,11 @@ function build_static() {
    }
 }

-function build_cpu() {
+function build_cpu($gen_arch) {
    if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu"))) {
        # remaining llama.cpp builds use MSVC 
        init_vars
-        $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
+        $script:cmakeDefs = $script:commonCpuDefs + @("-A", $gen_arch, "-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
        $script:buildDir="../build/windows/${script:ARCH}/cpu"
        $script:distDir="$script:DIST_BASE\cpu"
        write-host "Building LCD CPU"
@@ -349,11 +349,15 @@ if ($($args.count) -eq 0) {
    git_module_setup
    apply_patches
    build_static
-    build_cpu
-    build_cpu_avx
-    build_cpu_avx2
-    build_cuda
-    build_rocm
+    if ($script:ARCH -eq "arm64") {
+        build_cpu("ARM64")
+    } else { # amd64
+        build_cpu("x64")
+        build_cpu_avx
+        build_cpu_avx2
+        build_cuda
+        build_rocm
+    }

    cleanup
    write-host "`ngo generate completed.  LLM runners: $(get-childitem -path $script:DIST_BASE)"
--- a/llm/ggml.go
+++ b/llm/ggml.go
@@ -13,82 +13,6 @@ type GGML struct {
 	model
 }

-const (
-	fileTypeF32 uint32 = iota
-	fileTypeF16
-	fileTypeQ4_0
-	fileTypeQ4_1
-	fileTypeQ4_1_F16
-	fileTypeQ8_0 uint32 = iota + 2
-	fileTypeQ5_0
-	fileTypeQ5_1
-	fileTypeQ2_K
-	fileTypeQ3_K_S
-	fileTypeQ3_K_M
-	fileTypeQ3_K_L
-	fileTypeQ4_K_S
-	fileTypeQ4_K_M
-	fileTypeQ5_K_S
-	fileTypeQ5_K_M
-	fileTypeQ6_K
-	fileTypeIQ2_XXS
-	fileTypeIQ2_XS
-	fileTypeQ2_K_S
-	fileTypeQ3_K_XS
-	fileTypeIQ3_XXS
-)
-
-func fileType(fileType uint32) string {
-	switch fileType {
-	case fileTypeF32:
-		return "F32"
-	case fileTypeF16:
-		return "F16"
-	case fileTypeQ4_0:
-		return "Q4_0"
-	case fileTypeQ4_1:
-		return "Q4_1"
-	case fileTypeQ4_1_F16:
-		return "Q4_1_F16"
-	case fileTypeQ8_0:
-		return "Q8_0"
-	case fileTypeQ5_0:
-		return "Q5_0"
-	case fileTypeQ5_1:
-		return "Q5_1"
-	case fileTypeQ2_K:
-		return "Q2_K"
-	case fileTypeQ3_K_S:
-		return "Q3_K_S"
-	case fileTypeQ3_K_M:
-		return "Q3_K_M"
-	case fileTypeQ3_K_L:
-		return "Q3_K_L"
-	case fileTypeQ4_K_S:
-		return "Q4_K_S"
-	case fileTypeQ4_K_M:
-		return "Q4_K_M"
-	case fileTypeQ5_K_S:
-		return "Q5_K_S"
-	case fileTypeQ5_K_M:
-		return "Q5_K_M"
-	case fileTypeQ6_K:
-		return "Q6_K"
-	case fileTypeIQ2_XXS:
-		return "IQ2_XXS"
-	case fileTypeIQ2_XS:
-		return "IQ2_XS"
-	case fileTypeQ2_K_S:
-		return "Q2_K_S"
-	case fileTypeQ3_K_XS:
-		return "Q3_K_XS"
-	case fileTypeIQ3_XXS:
-		return "IQ3_XXS"
-	default:
-		return "unknown"
-	}
-}
-
 type model interface {
 	KV() KV
 	Tensors() Tensors
@@ -121,12 +45,12 @@ func (kv KV) ParameterCount() uint64 {
 	return kv.u64("general.parameter_count")
 }

-func (kv KV) FileType() string {
+func (kv KV) FileType() fileType {
 	if u64 := kv.u64("general.file_type"); u64 > 0 {
 		return fileType(uint32(u64))
 	}

-	return "unknown"
+	return fileTypeUnknown
 }

 func (kv KV) BlockCount() uint64 {
@@ -286,6 +210,23 @@ const (

 var ErrUnsupportedFormat = errors.New("unsupported model format")

+func DetectGGMLType(b []byte) string {
+	switch binary.LittleEndian.Uint32(b[:4]) {
+	case FILE_MAGIC_GGML:
+		return "ggml"
+	case FILE_MAGIC_GGMF:
+		return "ggmf"
+	case FILE_MAGIC_GGJT:
+		return "ggjt"
+	case FILE_MAGIC_GGLA:
+		return "ggla"
+	case FILE_MAGIC_GGUF_LE, FILE_MAGIC_GGUF_BE:
+		return "gguf"
+	default:
+		return ""
+	}
+}
+
 func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) {
 	var magic uint32
 	if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
@@ -388,7 +329,10 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
 			4*batch*(1+4*embedding+context+context*heads),
 		)

-		partialOffload = 4*batch*(2*embedding+vocab) + embedding*vocab*105/128
+		partialOffload = max(
+			4*batch*(2*embedding+vocab)+embedding*vocab*105/128,
+			4*batch*(2+3*embedding+context+context*heads),
+		)
 	case "stablelm":
 		fullOffload = 4 * batch * (context*(1+heads) + 3*embedding + 2)
 		partialOffload = max(
--- a/llm/llama.cpp
+++ b/llm/llama.cpp
--- a/llm/llm.go
+++ b/llm/llm.go
@@ -4,6 +4,7 @@ package llm
 // #cgo darwin,arm64 LDFLAGS: ${SRCDIR}/build/darwin/arm64_static/libllama.a -lstdc++
 // #cgo darwin,amd64 LDFLAGS: ${SRCDIR}/build/darwin/x86_64_static/libllama.a -lstdc++
 // #cgo windows,amd64 LDFLAGS: ${SRCDIR}/build/windows/amd64_static/libllama.a -static -lstdc++
+// #cgo windows,arm64 LDFLAGS: ${SRCDIR}/build/windows/arm64_static/libllama.a -static -lstdc++
 // #cgo linux,amd64 LDFLAGS: ${SRCDIR}/build/linux/x86_64_static/libllama.a -lstdc++
 // #cgo linux,arm64 LDFLAGS: ${SRCDIR}/build/linux/arm64_static/libllama.a -lstdc++
 // #include <stdlib.h>
@@ -19,7 +20,7 @@ func SystemInfo() string {
 	return C.GoString(C.llama_print_system_info())
 }

-func Quantize(infile, outfile, filetype string) error {
+func Quantize(infile, outfile string, ftype fileType) error {
 	cinfile := C.CString(infile)
 	defer C.free(unsafe.Pointer(cinfile))

@@ -28,58 +29,10 @@ func Quantize(infile, outfile, filetype string) error {

 	params := C.llama_model_quantize_default_params()
 	params.nthread = -1
+	params.ftype = ftype.Value()

-	switch filetype {
-	case "F32":
-		params.ftype = fileTypeF32
-	case "F16":
-		params.ftype = fileTypeF16
-	case "Q4_0":
-		params.ftype = fileTypeQ4_0
-	case "Q4_1":
-		params.ftype = fileTypeQ4_1
-	case "Q4_1_F16":
-		params.ftype = fileTypeQ4_1_F16
-	case "Q8_0":
-		params.ftype = fileTypeQ8_0
-	case "Q5_0":
-		params.ftype = fileTypeQ5_0
-	case "Q5_1":
-		params.ftype = fileTypeQ5_1
-	case "Q2_K":
-		params.ftype = fileTypeQ2_K
-	case "Q3_K_S":
-		params.ftype = fileTypeQ3_K_S
-	case "Q3_K_M":
-		params.ftype = fileTypeQ3_K_M
-	case "Q3_K_L":
-		params.ftype = fileTypeQ3_K_L
-	case "Q4_K_S":
-		params.ftype = fileTypeQ4_K_S
-	case "Q4_K_M":
-		params.ftype = fileTypeQ4_K_M
-	case "Q5_K_S":
-		params.ftype = fileTypeQ5_K_S
-	case "Q5_K_M":
-		params.ftype = fileTypeQ5_K_M
-	case "Q6_K":
-		params.ftype = fileTypeQ6_K
-	case "IQ2_XXS":
-		params.ftype = fileTypeIQ2_XXS
-	case "IQ2_XS":
-		params.ftype = fileTypeIQ2_XS
-	case "Q2_K_S":
-		params.ftype = fileTypeQ2_K_S
-	case "Q3_K_XS":
-		params.ftype = fileTypeQ3_K_XS
-	case "IQ3_XXS":
-		params.ftype = fileTypeIQ3_XXS
-	default:
-		return fmt.Errorf("unknown filetype: %s", filetype)
-	}
-
-	if retval := C.llama_model_quantize(cinfile, coutfile, &params); retval != 0 {
-		return fmt.Errorf("llama_model_quantize: %d", retval)
+	if rc := C.llama_model_quantize(cinfile, coutfile, &params); rc != 0 {
+		return fmt.Errorf("llama_model_quantize: %d", rc)
 	}

 	return nil
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -3,30 +3,20 @@ package llm
 import (
 	"fmt"
 	"log/slog"
-	"os"
-	"strconv"

 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/gpu"
+	"github.com/ollama/ollama/server/envconfig"
 )

 // This algorithm looks for a complete fit to determine if we need to unload other models
 func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors []string, opts api.Options) (bool, uint64) {
-	var estimatedVRAM uint64
-	if opts.NumCtx > int(ggml.KV().ContextLength()) {
-		slog.Warn("requested context length is greater than model max context length", "requested", opts.NumCtx, "model", ggml.KV().ContextLength())
-		opts.NumCtx = int(ggml.KV().ContextLength())
-	}
-
-	if opts.NumCtx < 4 {
-		opts.NumCtx = 4
-	}
-
 	// Split up the GPUs by type and try them
+	var estimatedVRAM uint64
 	for _, gpus := range allGpus.ByLibrary() {
 		var layerCount int
-		layerCount, estimatedVRAM = EstimateGPULayers(gpus, ggml, projectors, opts)
+		layerCount, estimatedVRAM, _ = EstimateGPULayers(gpus, ggml, projectors, opts)
 		if opts.NumGPU < 0 {
 			if layerCount > 0 && layerCount >= int(ggml.KV().BlockCount()+1) {
 				return true, estimatedVRAM
@@ -40,25 +30,15 @@ func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors
 	return false, estimatedVRAM
 }

-// Given a model and one or more GPU targets, predict how many layers and bytes we can load
+// Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
 // The GPUs provided must all be the same Library
-func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) (int, uint64) {
-	if gpus[0].Library == "cpu" {
-		return 0, 0
-	}
+func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) (int, uint64, uint64) {
 	var memoryAvailable uint64
 	for _, info := range gpus {
 		memoryAvailable += info.FreeMemory
 	}
-	userLimit := os.Getenv("OLLAMA_MAX_VRAM")
-	if userLimit != "" {
-		avail, err := strconv.ParseUint(userLimit, 10, 64)
-		if err != nil {
-			slog.Error("invalid setting, ignoring", "OLLAMA_MAX_VRAM", userLimit, "error", err)
-		} else {
-			slog.Info("user override memory limit", "OLLAMA_MAX_VRAM", avail, "actual", memoryAvailable)
-			memoryAvailable = avail
-		}
+	if envconfig.MaxVRAM > 0 {
+		memoryAvailable = envconfig.MaxVRAM
 	}

 	slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", format.HumanBytes2(memoryAvailable))
@@ -73,6 +53,12 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		opts.NumCtx = max(opts.NumCtx, 2048)
 	}

+	layers := ggml.Tensors().Layers()
+	// add one layer worth of memory as a buffer
+	if blk0, ok := layers["blk.0"]; ok {
+		memoryMinimum += blk0.size()
+	}
+
 	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
 	var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()

@@ -88,19 +74,17 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	graphFullOffload *= uint64(len(gpus))
 	graphPartialOffload *= uint64(len(gpus))

+	// on metal there's no partial offload overhead
+	if gpus[0].Library == "metal" {
+		graphPartialOffload = graphFullOffload
+	}
+
 	// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
 	memoryRequiredTotal := memoryMinimum + graphFullOffload

 	// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
 	memoryRequiredPartial := memoryMinimum + graphPartialOffload

-	if memoryRequiredPartial > memoryAvailable {
-		slog.Debug("insufficient VRAM to load any model layers")
-		return 0, 0
-	}
-
-	layers := ggml.Tensors().Layers()
-
 	var memoryLayerOutput uint64
 	if layer, ok := layers["output_norm"]; ok {
 		memoryLayerOutput += layer.size()
@@ -120,15 +104,17 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts

 	var layerCount int
 	for i := 0; i < int(ggml.KV().BlockCount()); i++ {
-		memoryLayer := layers[fmt.Sprintf("blk.%d", i)].size()
+		if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
+			memoryLayer := blk.size()

-		// KV is proportional to the number of layers
-		memoryLayer += kv / ggml.KV().BlockCount()
+			// KV is proportional to the number of layers
+			memoryLayer += kv / ggml.KV().BlockCount()

-		memoryRequiredTotal += memoryLayer
-		if memoryAvailable > memoryRequiredPartial+memoryLayer {
-			memoryRequiredPartial += memoryLayer
-			layerCount++
+			memoryRequiredTotal += memoryLayer
+			if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredPartial+memoryLayer) {
+				memoryRequiredPartial += memoryLayer
+				layerCount++
+			}
 		}
 	}

@@ -137,7 +123,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		memoryRequiredTotal += memoryLayerOutput
 	}

-	if memoryAvailable > memoryRequiredTotal {
+	if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredTotal) {
 		layerCount = int(ggml.KV().BlockCount()) + 1
 		memoryRequiredPartial = memoryRequiredTotal
 	}
@@ -148,10 +134,10 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		"offload to gpu",
 		slog.Group(
 			"layers",
-			// actual number of layers offloaded
-			"real", opts.NumGPU,
+			// requested number of layers to offload
+			"requested", opts.NumGPU,
 			// estimated number of layers that can be offloaded
-			"estimate", layerCount,
+			"real", layerCount,
 		),
 		slog.Group(
 			"memory",
@@ -184,5 +170,13 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 			),
 		),
 	)
-	return layerCount, uint64(memoryRequiredPartial)
+	if gpus[0].Library == "cpu" {
+		return 0, 0, memoryRequiredTotal
+	}
+	if memoryRequiredPartial > memoryAvailable {
+		slog.Debug("insufficient VRAM to load any model layers")
+		return 0, 0, memoryRequiredTotal
+	}
+
+	return layerCount, memoryRequiredPartial, memoryRequiredTotal
 }
--- a/llm/patches/05-clip-fix.diff
+++ b/llm/patches/05-clip-fix.diff
@@ -0,0 +1,24 @@
+diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
+index e3c9bcd4..b43f892d 100644
+--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
+@@ -573,14 +573,16 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
+     struct ggml_tensor * embeddings = inp;
+     if (ctx->has_class_embedding) {
+         embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
+    }
+    ggml_set_name(embeddings, "embeddings");
+    ggml_set_input(embeddings);
+
+    if (ctx->has_class_embedding) {
+         embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
+                 embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
+         embeddings = ggml_acc(ctx0, embeddings, inp,
+                 embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
+     }
+-    ggml_set_name(embeddings, "embeddings");
+-    ggml_set_input(embeddings);
+-
+ 
+     struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
+     ggml_set_name(positions, "positions");
--- a/llm/server.go
+++ b/llm/server.go
@@ -26,6 +26,7 @@ import (
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/gpu"
+	"github.com/ollama/ollama/server/envconfig"
 )

 type LlamaServer interface {
@@ -37,6 +38,7 @@ type LlamaServer interface {
 	Detokenize(ctx context.Context, tokens []int) (string, error)
 	Close() error
 	EstimatedVRAM() uint64
+	EstimatedTotal() uint64
 }

 // llmServer is an instance of the llama.cpp server
@@ -48,7 +50,11 @@ type llmServer struct {
 	options api.Options

 	// TODO - this should be broken down by GPU
-	estimatedVRAM uint64 // Estimated usage of VRAM by the loaded model
+	estimatedVRAM  uint64 // Estimated usage of VRAM by the loaded model
+	estimatedTotal uint64 // Total size of model
+	totalLayers    uint64
+	gpuCount       int
+	loadDuration   time.Duration // Record how long it took the model to load

 	sem *semaphore.Weighted
 }
@@ -72,23 +78,18 @@ func LoadModel(model string) (*GGML, error) {
 // The gpu list must be a single family.
 func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options) (LlamaServer, error) {
 	var err error
-	if opts.NumCtx > int(ggml.KV().ContextLength()) {
-		slog.Warn("requested context length is greater than model max context length", "requested", opts.NumCtx, "model", ggml.KV().ContextLength())
-		opts.NumCtx = int(ggml.KV().ContextLength())
-	}
-
-	if opts.NumCtx < 4 {
-		opts.NumCtx = 4
-	}
-
-	cpuRunner := ""
+	var cpuRunner string
 	var estimatedVRAM uint64
+	var estimatedTotal uint64
 	var systemMemory uint64
+	gpuCount := len(gpus)
 	if (len(gpus) == 1 && gpus[0].Library == "cpu") || opts.NumGPU == 0 {

 		// TODO evaluate system memory to see if we should block the load, or force an unload of another CPU runner

 		cpuRunner = serverForCpu()
+		gpuCount = 0
+		_, _, estimatedTotal = EstimateGPULayers(gpus, ggml, projectors, opts)
 	} else {
 		if gpus[0].Library == "metal" {
 			memInfo, err := gpu.GetCPUMem()
@@ -100,12 +101,16 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 			}
 		}
 		var layers int
-		layers, estimatedVRAM = EstimateGPULayers(gpus, ggml, projectors, opts)
+		layers, estimatedVRAM, estimatedTotal = EstimateGPULayers(gpus, ggml, projectors, opts)

 		if gpus[0].Library == "metal" && estimatedVRAM > systemMemory {
 			// disable partial offloading when model is greater than total system memory as this
 			// can lead to locking up the system
 			opts.NumGPU = 0
+		} else if gpus[0].Library != "metal" && layers == 0 {
+			// Don't bother loading into the GPU if no layers can fit
+			cpuRunner = serverForCpu()
+			gpuCount = 0
 		} else if opts.NumGPU < 0 && layers > 0 && gpus[0].Library != "cpu" {
 			opts.NumGPU = layers
 		}
@@ -125,7 +130,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 	} else {
 		servers = serversForGpu(gpus[0]) // All GPUs in the list are matching Library and Variant
 	}
-	demandLib := strings.Trim(os.Getenv("OLLAMA_LLM_LIBRARY"), "\"' ")
+	demandLib := envconfig.LLMLibrary
 	if demandLib != "" {
 		serverPath := availableServers[demandLib]
 		if serverPath == "" {
@@ -133,6 +138,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		} else {
 			slog.Info("user override", "OLLAMA_LLM_LIBRARY", demandLib, "path", serverPath)
 			servers = []string{demandLib}
+			if strings.HasPrefix(demandLib, "cpu") {
+				// Omit the GPU flag to silence the warning
+				opts.NumGPU = -1
+			}
 		}
 	}

@@ -146,17 +155,14 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		"--batch-size", fmt.Sprintf("%d", opts.NumBatch),
 		"--embedding",
 	}
-	if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
-		params = append(params, "--log-format", "json")
-	} else {
-		params = append(params, "--log-disable")
-	}
+
+	params = append(params, "--log-disable")

 	if opts.NumGPU >= 0 {
 		params = append(params, "--n-gpu-layers", fmt.Sprintf("%d", opts.NumGPU))
 	}

-	if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
+	if envconfig.Debug {
 		params = append(params, "--verbose")
 	}

@@ -194,16 +200,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		params = append(params, "--numa")
 	}

-	// "--cont-batching", // TODO - doesn't seem to have any noticeable perf change for multiple requests
-	numParallel := 1
-	if onp := os.Getenv("OLLAMA_NUM_PARALLEL"); onp != "" {
-		numParallel, err = strconv.Atoi(onp)
-		if err != nil || numParallel <= 0 {
-			err = fmt.Errorf("invalid OLLAMA_NUM_PARALLEL=%s must be greater than zero - %w", onp, err)
-			slog.Error("misconfiguration", "error", err)
-			return nil, err
-		}
+	numParallel := envconfig.NumParallel
+
+	// TODO (jmorganca): multimodal models don't support parallel yet
+	// see https://github.com/ollama/ollama/issues/4165
+	if len(projectors) > 0 {
+		numParallel = 1
+		slog.Warn("multimodal models don't support parallel requests yet")
 	}
+
 	params = append(params, "--parallel", fmt.Sprintf("%d", numParallel))

 	for i := 0; i < len(servers); i++ {
@@ -211,10 +216,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		if dir == "" {
 			// Shouldn't happen
 			finalErr = fmt.Errorf("[%d] server %s not listed in available servers %v", i, servers[i], availableServers)
-			slog.Error("sever list inconsistent", "error", finalErr)
+			slog.Error("server list inconsistent", "error", finalErr)
 			continue
 		}

+		if strings.HasPrefix(servers[i], "cpu") {
+			// TODO if we tried a gpu runner first, and it failed, record the error and bubble that back up
+			gpuCount = 0
+		}
+
 		// Find an availableServers  port, retry on each iterration in case the failure was a port conflict race
 		port := 0
 		if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
@@ -234,13 +244,13 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		if runtime.GOOS == "windows" {
 			pathEnv = "PATH"
 		}
-		// append the server directory to LD_LIBRARY_PATH/PATH
+		// prepend the server directory to LD_LIBRARY_PATH/PATH
 		libraryPaths := []string{dir}

 		if libraryPath, ok := os.LookupEnv(pathEnv); ok {
 			// Append our runner directory to the path
 			// This will favor system libraries over our bundled library dependencies
-			libraryPaths = append(filepath.SplitList(libraryPath), libraryPaths...)
+			libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)
 		}

 		// Note: we always put the dependency path first
@@ -268,30 +278,69 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		}

 		s := &llmServer{
-			port:          port,
-			cmd:           exec.Command(server, finalParams...),
-			status:        NewStatusWriter(os.Stderr),
-			options:       opts,
-			estimatedVRAM: estimatedVRAM,
-			sem:           semaphore.NewWeighted(int64(numParallel)),
+			port:           port,
+			cmd:            exec.Command(server, finalParams...),
+			status:         NewStatusWriter(os.Stderr),
+			options:        opts,
+			estimatedVRAM:  estimatedVRAM,
+			estimatedTotal: estimatedTotal,
+			sem:            semaphore.NewWeighted(int64(numParallel)),
+			totalLayers:    ggml.KV().BlockCount() + 1,
+			gpuCount:       gpuCount,
+			done:           make(chan error, 1),
 		}

-		libEnv := fmt.Sprintf("%s=%s", pathEnv, strings.Join(libraryPaths, string(filepath.ListSeparator)))
-		s.cmd.Env = append(os.Environ(), libEnv)
+		s.cmd.Env = os.Environ()
 		s.cmd.Stdout = os.Stdout
 		s.cmd.Stderr = s.status

-		// TODO - multiple GPU selection logic...
-		key, val := gpu.GpuInfoList(gpus).GetVisibleDevicesEnv()
-		if key != "" {
-			s.cmd.Env = append(s.cmd.Env, key+"="+val)
+		visibleDevicesEnv, visibleDevicesEnvVal := gpu.GpuInfoList(gpus).GetVisibleDevicesEnv()
+		pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))
+
+		// Update or add the path and visible devices variable with our adjusted version
+		pathNeeded := true
+		devicesNeeded := visibleDevicesEnv != ""
+		for i := range s.cmd.Env {
+			cmp := strings.SplitN(s.cmd.Env[i], "=", 2)
+			if strings.EqualFold(cmp[0], pathEnv) {
+				s.cmd.Env[i] = pathEnv + "=" + pathEnvVal
+				pathNeeded = false
+			} else if devicesNeeded && strings.EqualFold(cmp[0], visibleDevicesEnv) {
+				s.cmd.Env[i] = visibleDevicesEnv + "=" + visibleDevicesEnvVal
+				devicesNeeded = false
+			}
+		}
+		if pathNeeded {
+			s.cmd.Env = append(s.cmd.Env, pathEnv+"="+pathEnvVal)
+		}
+		if devicesNeeded {
+			s.cmd.Env = append(s.cmd.Env, visibleDevicesEnv+"="+visibleDevicesEnvVal)
 		}

 		slog.Info("starting llama server", "cmd", s.cmd.String())
-		// Log at debug as the environment is inherited and might contain sensitive information
-		slog.Debug("subprocess", "environment", s.cmd.Env)
+		if envconfig.Debug {
+			filteredEnv := []string{}
+			for _, ev := range s.cmd.Env {
+				if strings.HasPrefix(ev, "CUDA_") ||
+					strings.HasPrefix(ev, "ROCM_") ||
+					strings.HasPrefix(ev, "HIP_") ||
+					strings.HasPrefix(ev, "HSA_") ||
+					strings.HasPrefix(ev, "GGML_") ||
+					strings.HasPrefix(ev, "PATH=") ||
+					strings.HasPrefix(ev, "LD_LIBRARY_PATH=") {
+					filteredEnv = append(filteredEnv, ev)
+				}
+			}
+			// Log at debug as the environment is inherited and might contain sensitive information
+			slog.Debug("subprocess", "environment", filteredEnv)
+		}

 		if err = s.cmd.Start(); err != nil {
+			// Detect permission denied and augment them essage about noexec
+			if errors.Is(err, os.ErrPermission) {
+				finalErr = fmt.Errorf("unable to start server %w.  %s may have noexec set.  Set OLLAMA_TMPDIR for server to a writable executable directory", err, dir)
+				continue
+			}
 			msg := ""
 			if s.status != nil && s.status.LastErrMsg != "" {
 				msg = s.status.LastErrMsg
@@ -303,17 +352,9 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr

 		// reap subprocess when it exits
 		go func() {
-			// Exit status managed via getServerStatus
-			_ = s.cmd.Wait()
+			s.done <- s.cmd.Wait()
 		}()

-		// TODO - make sure this is all wired up correctly
-		// if err = s.WaitUntilRunning(); err != nil {
-		// 	slog.Error("error starting llama server", "server", servers[i], "error", err)
-		// 	s.Close()
-		// 	finalErr = err
-		// 	continue
-		// }
 		return s, nil
 	}

@@ -345,7 +386,7 @@ type ServerStatus int

 const ( // iota is reset to 0
 	ServerStatusReady ServerStatus = iota
-	ServerStatusNoSlotsAvaialble
+	ServerStatusNoSlotsAvailable
 	ServerStatusLoadingModel
 	ServerStatusNotResponding
 	ServerStatusError
@@ -355,7 +396,7 @@ func (s ServerStatus) ToString() string {
 	switch s {
 	case ServerStatusReady:
 		return "llm server ready"
-	case ServerStatusNoSlotsAvaialble:
+	case ServerStatusNoSlotsAvailable:
 		return "llm busy - no slots available"
 	case ServerStatusLoadingModel:
 		return "llm server loading model"
@@ -380,6 +421,10 @@ func (s *llmServer) getServerStatus(ctx context.Context) (ServerStatus, error) {
 		if s.status != nil && s.status.LastErrMsg != "" {
 			msg = s.status.LastErrMsg
 		}
+		if s.cmd.ProcessState.ExitCode() == -1 {
+			// Most likely a signal killed it, log some more details to try to help troubleshoot
+			slog.Warn("llama runner process no longer running", "sys", s.cmd.ProcessState.Sys(), "string", s.cmd.ProcessState.String())
+		}
 		return ServerStatusError, fmt.Errorf("llama runner process no longer running: %d %s", s.cmd.ProcessState.ExitCode(), msg)
 	}

@@ -412,7 +457,7 @@ func (s *llmServer) getServerStatus(ctx context.Context) (ServerStatus, error) {
 	case "ok":
 		return ServerStatusReady, nil
 	case "no slot available":
-		return ServerStatusNoSlotsAvaialble, nil
+		return ServerStatusNoSlotsAvailable, nil
 	case "loading model":
 		return ServerStatusLoadingModel, nil
 	default:
@@ -420,6 +465,29 @@ func (s *llmServer) getServerStatus(ctx context.Context) (ServerStatus, error) {
 	}
 }

+// getServerStatusRetry will retry if ServerStatusNoSlotsAvailable is received
+func (s *llmServer) getServerStatusRetry(ctx context.Context) (ServerStatus, error) {
+	var retries int
+	for {
+		status, err := s.getServerStatus(ctx)
+		if err != nil {
+			return status, err
+		}
+
+		if status == ServerStatusNoSlotsAvailable {
+			if retries >= 10 {
+				return status, fmt.Errorf("no slots available after %d retries", retries)
+			}
+
+			time.Sleep(5 * time.Millisecond)
+			retries++
+			continue
+		}
+
+		return status, nil
+	}
+}
+
 func (s *llmServer) Ping(ctx context.Context) error {
 	_, err := s.getServerStatus(ctx)
 	if err != nil {
@@ -431,13 +499,11 @@ func (s *llmServer) Ping(ctx context.Context) error {

 func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
 	start := time.Now()
-	// TODO we need to wire up a better way to detect hangs during model load and startup of the server
 	expiresAt := time.Now().Add(10 * time.Minute) // be generous with timeout, large models can take a while to load
-	ticker := time.NewTicker(50 * time.Millisecond)
-	defer ticker.Stop()

 	slog.Info("waiting for llama runner to start responding")
 	var lastStatus ServerStatus = -1
+
 	for {
 		select {
 		case <-ctx.Done():
@@ -449,41 +515,39 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
 				msg = s.status.LastErrMsg
 			}
 			return fmt.Errorf("llama runner process has terminated: %v %s", err, msg)
-		case <-ticker.C:
-			if time.Now().After(expiresAt) {
-				// timeout
-				msg := ""
-				if s.status != nil && s.status.LastErrMsg != "" {
-					msg = s.status.LastErrMsg
-				}
-				return fmt.Errorf("timed out waiting for llama runner to start: %s", msg)
+		default:
+		}
+		if time.Now().After(expiresAt) {
+			// timeout
+			msg := ""
+			if s.status != nil && s.status.LastErrMsg != "" {
+				msg = s.status.LastErrMsg
 			}
-			if s.cmd.ProcessState != nil {
-				msg := ""
-				if s.status != nil && s.status.LastErrMsg != "" {
-					msg = s.status.LastErrMsg
-				}
-				return fmt.Errorf("llama runner process no longer running: %d %s", s.cmd.ProcessState.ExitCode(), msg)
-			}
-
-			c, cancel := context.WithTimeout(ctx, 200*time.Millisecond)
-			defer cancel()
-			status, err := s.getServerStatus(c)
-			if err != nil && lastStatus != status {
-				slog.Debug("server not yet available", "error", err)
-				lastStatus = status
-				continue
-			}
-
-			switch status {
-			case ServerStatusLoadingModel:
-				// TODO - this state never seems to happen with the current server.cpp code (bug?)
-				// it doesn't respond to the health endpoint until after the model is loaded
-				slog.Debug("loading model")
-			case ServerStatusReady:
-				slog.Debug(fmt.Sprintf("llama runner started in %f seconds", time.Since(start).Seconds()))
-				return nil
+			return fmt.Errorf("timed out waiting for llama runner to start: %s", msg)
+		}
+		if s.cmd.ProcessState != nil {
+			msg := ""
+			if s.status != nil && s.status.LastErrMsg != "" {
+				msg = s.status.LastErrMsg
 			}
+			return fmt.Errorf("llama runner process no longer running: %d %s", s.cmd.ProcessState.ExitCode(), msg)
+		}
+		ctx, cancel := context.WithTimeout(ctx, 200*time.Millisecond)
+		defer cancel()
+		status, _ := s.getServerStatus(ctx)
+		if lastStatus != status && status != ServerStatusReady {
+			// Only log on status changes
+			slog.Info("waiting for server to become available", "status", status.ToString())
+		}
+		switch status {
+		case ServerStatusReady:
+			s.loadDuration = time.Since(start)
+			slog.Info(fmt.Sprintf("llama runner started in %0.2f seconds", s.loadDuration.Seconds()))
+			return nil
+		default:
+			lastStatus = status
+			time.Sleep(time.Millisecond * 250)
+			continue
 		}
 	}
 }
@@ -517,7 +581,6 @@ ws ::= ([ \t\n] ws)?
 `

 const maxBufferSize = 512 * format.KiloByte
-const maxRetries = 3

 type ImageData struct {
 	Data []byte `json:"data"`
@@ -525,10 +588,11 @@ type ImageData struct {
 }

 type completion struct {
-	Content string `json:"content"`
-	Model   string `json:"model"`
-	Prompt  string `json:"prompt"`
-	Stop    bool   `json:"stop"`
+	Content      string `json:"content"`
+	Model        string `json:"model"`
+	Prompt       string `json:"prompt"`
+	Stop         bool   `json:"stop"`
+	StoppedLimit bool   `json:"stopped_limit"`

 	Timings struct {
 		PredictedN  int     `json:"predicted_n"`
@@ -547,6 +611,7 @@ type CompletionRequest struct {

 type CompletionResponse struct {
 	Content            string
+	DoneReason         string
 	Done               bool
 	PromptEvalCount    int
 	PromptEvalDuration time.Duration
@@ -593,7 +658,7 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
 	}

 	// Make sure the server is ready
-	status, err := s.getServerStatus(ctx)
+	status, err := s.getServerStatusRetry(ctx)
 	if err != nil {
 		return err
 	} else if status != ServerStatusReady {
@@ -607,133 +672,119 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
 		}
 	}

-	retryDelay := 100 * time.Microsecond
-	for retries := 0; retries < maxRetries; retries++ {
-		if retries > 0 {
-			time.Sleep(retryDelay) // wait before retrying
-			retryDelay *= 2        // exponential backoff
-		}
+	// Handling JSON marshaling with special characters unescaped.
+	buffer := &bytes.Buffer{}
+	enc := json.NewEncoder(buffer)
+	enc.SetEscapeHTML(false)

-		// Handling JSON marshaling with special characters unescaped.
-		buffer := &bytes.Buffer{}
-		enc := json.NewEncoder(buffer)
-		enc.SetEscapeHTML(false)
+	if err := enc.Encode(request); err != nil {
+		return fmt.Errorf("failed to marshal data: %v", err)
+	}

-		if err := enc.Encode(request); err != nil {
-			return fmt.Errorf("failed to marshal data: %v", err)
-		}
+	endpoint := fmt.Sprintf("http://127.0.0.1:%d/completion", s.port)
+	serverReq, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, buffer)
+	if err != nil {
+		return fmt.Errorf("error creating POST request: %v", err)
+	}
+	serverReq.Header.Set("Content-Type", "application/json")

-		endpoint := fmt.Sprintf("http://127.0.0.1:%d/completion", s.port)
-		req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, buffer)
+	res, err := http.DefaultClient.Do(serverReq)
+	if err != nil {
+		return fmt.Errorf("POST predict: %v", err)
+	}
+	defer res.Body.Close()
+
+	if res.StatusCode >= 400 {
+		bodyBytes, err := io.ReadAll(res.Body)
 		if err != nil {
-			return fmt.Errorf("error creating POST request: %v", err)
+			return fmt.Errorf("failed reading llm error response: %w", err)
 		}
-		req.Header.Set("Content-Type", "application/json")
+		log.Printf("llm predict error: %s", bodyBytes)
+		return fmt.Errorf("%s", bodyBytes)
+	}

-		resp, err := http.DefaultClient.Do(req)
-		if err != nil {
-			return fmt.Errorf("POST predict: %v", err)
-		}
-		defer resp.Body.Close()
+	scanner := bufio.NewScanner(res.Body)
+	buf := make([]byte, 0, maxBufferSize)
+	scanner.Buffer(buf, maxBufferSize)

-		if resp.StatusCode >= 400 {
-			bodyBytes, err := io.ReadAll(resp.Body)
-			if err != nil {
-				return fmt.Errorf("failed reading llm error response: %w", err)
+	// keep track of the last token generated, this is used to abort if the model starts looping
+	var lastToken string
+	var tokenRepeat int
+
+	for scanner.Scan() {
+		select {
+		case <-ctx.Done():
+			// This handles the request cancellation
+			return ctx.Err()
+		default:
+			line := scanner.Bytes()
+			if len(line) == 0 {
+				continue
 			}
-			log.Printf("llm predict error: %s", bodyBytes)
-			return fmt.Errorf("%s", bodyBytes)
-		}

-		scanner := bufio.NewScanner(resp.Body)
-		buf := make([]byte, 0, maxBufferSize)
-		scanner.Buffer(buf, maxBufferSize)
+			evt, ok := bytes.CutPrefix(line, []byte("data: "))
+			if !ok {
+				return fmt.Errorf("error parsing llm response stream: %s", line)
+			}

-		retryNeeded := false
-		// keep track of the last token generated, this is used to abort if the model starts looping
-		var lastToken string
-		var tokenRepeat int
+			var c completion
+			if err := json.Unmarshal(evt, &c); err != nil {
+				return fmt.Errorf("error unmarshaling llm prediction response: %v", err)
+			}

-		for scanner.Scan() {
-			select {
-			case <-ctx.Done():
-				// This handles the request cancellation
-				return ctx.Err()
+			switch {
+			case strings.TrimSpace(c.Content) == lastToken:
+				tokenRepeat++
 			default:
-				line := scanner.Bytes()
-				if len(line) == 0 {
-					continue
-				}
-
-				// try again on slot unavailable
-				if bytes.Contains(line, []byte("slot unavailable")) {
-					retryNeeded = true
-					break
-				}
-
-				evt, ok := bytes.CutPrefix(line, []byte("data: "))
-				if !ok {
-					return fmt.Errorf("error parsing llm response stream: %s", line)
-				}
-
-				var c completion
-				if err := json.Unmarshal(evt, &c); err != nil {
-					return fmt.Errorf("error unmarshaling llm prediction response: %v", err)
-				}
-
-				switch {
-				case strings.TrimSpace(c.Content) == lastToken:
-					tokenRepeat++
-				default:
-					lastToken = strings.TrimSpace(c.Content)
-					tokenRepeat = 0
-				}
-
-				// 30 picked as an arbitrary max token repeat limit, modify as needed
-				if tokenRepeat > 30 {
-					slog.Debug("prediction aborted, token repeat limit reached")
-					return ctx.Err()
-				}
-
-				if c.Content != "" {
-					fn(CompletionResponse{
-						Content: c.Content,
-					})
-				}
-
-				if c.Stop {
-					fn(CompletionResponse{
-						Done:               true,
-						PromptEvalCount:    c.Timings.PromptN,
-						PromptEvalDuration: parseDurationMs(c.Timings.PromptMS),
-						EvalCount:          c.Timings.PredictedN,
-						EvalDuration:       parseDurationMs(c.Timings.PredictedMS),
-					})
-					return nil
-				}
+				lastToken = strings.TrimSpace(c.Content)
+				tokenRepeat = 0
 			}
-		}

-		if err := scanner.Err(); err != nil {
-			if strings.Contains(err.Error(), "unexpected EOF") {
-				s.Close()
-				msg := ""
-				if s.status != nil && s.status.LastErrMsg != "" {
-					msg = s.status.LastErrMsg
+			// 30 picked as an arbitrary max token repeat limit, modify as needed
+			if tokenRepeat > 30 {
+				slog.Debug("prediction aborted, token repeat limit reached")
+				return ctx.Err()
+			}
+
+			if c.Content != "" {
+				fn(CompletionResponse{
+					Content: c.Content,
+				})
+			}
+
+			if c.Stop {
+				doneReason := "stop"
+				if c.StoppedLimit {
+					doneReason = "length"
 				}

-				return fmt.Errorf("an unknown error was encountered while running the model %s", msg)
+				fn(CompletionResponse{
+					Done:               true,
+					DoneReason:         doneReason,
+					PromptEvalCount:    c.Timings.PromptN,
+					PromptEvalDuration: parseDurationMs(c.Timings.PromptMS),
+					EvalCount:          c.Timings.PredictedN,
+					EvalDuration:       parseDurationMs(c.Timings.PredictedMS),
+				})
+				return nil
 			}
-			return fmt.Errorf("error reading llm response: %v", err)
-		}
-
-		if !retryNeeded {
-			return nil // success
 		}
 	}

-	// should never reach here ideally
-	return fmt.Errorf("max retries exceeded")
+	if err := scanner.Err(); err != nil {
+		if strings.Contains(err.Error(), "unexpected EOF") {
+			s.Close()
+			msg := ""
+			if s.status != nil && s.status.LastErrMsg != "" {
+				msg = s.status.LastErrMsg
+			}
+			return fmt.Errorf("an unknown error was encountered while running the model %s", msg)
+		}
+
+		return fmt.Errorf("error reading llm response: %v", err)
+	}
+
+	return nil
 }

 type EmbeddingRequest struct {
@@ -750,8 +801,9 @@ func (s *llmServer) Embedding(ctx context.Context, prompt string) ([]float64, er
 		return nil, err
 	}
 	defer s.sem.Release(1)
+
 	// Make sure the server is ready
-	status, err := s.getServerStatus(ctx)
+	status, err := s.getServerStatusRetry(ctx)
 	if err != nil {
 		return nil, err
 	} else if status != ServerStatusReady {
@@ -806,7 +858,7 @@ func (s *llmServer) Tokenize(ctx context.Context, content string) ([]int, error)
 	status, err := s.getServerStatus(ctx)
 	if err != nil {
 		return nil, err
-	} else if status != ServerStatusReady && status != ServerStatusNoSlotsAvaialble {
+	} else if status != ServerStatusReady && status != ServerStatusNoSlotsAvailable {
 		return nil, fmt.Errorf("unexpected server status: %s", status.ToString())
 	}

@@ -858,7 +910,7 @@ func (s *llmServer) Detokenize(ctx context.Context, tokens []int) (string, error
 	status, err := s.getServerStatus(ctx)
 	if err != nil {
 		return "", err
-	} else if status != ServerStatusReady && status != ServerStatusNoSlotsAvaialble {
+	} else if status != ServerStatusReady && status != ServerStatusNoSlotsAvailable {
 		return "", fmt.Errorf("unexpected server status: %s", status.ToString())
 	}

@@ -900,7 +952,16 @@ func (s *llmServer) Detokenize(ctx context.Context, tokens []int) (string, error
 func (s *llmServer) Close() error {
 	if s.cmd != nil {
 		slog.Debug("stopping llama server")
-		return s.cmd.Process.Kill()
+		if err := s.cmd.Process.Kill(); err != nil {
+			return err
+		}
+		// if ProcessState is already populated, Wait already completed, no need to wait again
+		if s.cmd.ProcessState == nil {
+			slog.Debug("waiting for llama server to exit")
+			<-s.done
+		}
+
+		slog.Debug("llama server stopped")
 	}

 	return nil
@@ -910,6 +971,10 @@ func (s *llmServer) EstimatedVRAM() uint64 {
 	return s.estimatedVRAM
 }

+func (s *llmServer) EstimatedTotal() uint64 {
+	return s.estimatedTotal
+}
+
 func parseDurationMs(ms float64) time.Duration {
 	dur, err := time.ParseDuration(fmt.Sprintf("%fms", ms))
 	if err != nil {
--- a/macapp/src/app.tsx
+++ b/macapp/src/app.tsx
@@ -19,7 +19,7 @@ export default function () {
  const [step, setStep] = useState<Step>(Step.WELCOME)
  const [commandCopied, setCommandCopied] = useState<boolean>(false)

-  const command = 'ollama run llama2'
+  const command = 'ollama run llama3'

  return (
    <div className='drag'>
--- a/openai/openai.go
+++ b/openai/openai.go
@@ -109,13 +109,12 @@ func toChatCompletion(id string, r api.ChatResponse) ChatCompletion {
 		Choices: []Choice{{
 			Index:   0,
 			Message: Message{Role: r.Message.Role, Content: r.Message.Content},
-			FinishReason: func(done bool) *string {
-				if done {
-					reason := "stop"
+			FinishReason: func(reason string) *string {
+				if len(reason) > 0 {
 					return &reason
 				}
 				return nil
-			}(r.Done),
+			}(r.DoneReason),
 		}},
 		Usage: Usage{
 			// TODO: ollama returns 0 for prompt eval if the prompt was cached, but openai returns the actual count
@@ -133,19 +132,16 @@ func toChunk(id string, r api.ChatResponse) ChatCompletionChunk {
 		Created:           time.Now().Unix(),
 		Model:             r.Model,
 		SystemFingerprint: "fp_ollama",
-		Choices: []ChunkChoice{
-			{
-				Index: 0,
-				Delta: Message{Role: "assistant", Content: r.Message.Content},
-				FinishReason: func(done bool) *string {
-					if done {
-						reason := "stop"
-						return &reason
-					}
-					return nil
-				}(r.Done),
-			},
-		},
+		Choices: []ChunkChoice{{
+			Index: 0,
+			Delta: Message{Role: "assistant", Content: r.Message.Content},
+			FinishReason: func(reason string) *string {
+				if len(reason) > 0 {
+					return &reason
+				}
+				return nil
+			}(r.DoneReason),
+		}},
 	}
 }

--- a/parser/parser.go
+++ b/parser/parser.go
@@ -1,132 +0,0 @@
-package parser
-
-import (
-	"bufio"
-	"bytes"
-	"errors"
-	"fmt"
-	"io"
-	"log/slog"
-	"slices"
-)
-
-type Command struct {
-	Name string
-	Args string
-}
-
-func (c *Command) Reset() {
-	c.Name = ""
-	c.Args = ""
-}
-
-func Parse(reader io.Reader) ([]Command, error) {
-	var commands []Command
-	var command, modelCommand Command
-
-	scanner := bufio.NewScanner(reader)
-	scanner.Buffer(make([]byte, 0, bufio.MaxScanTokenSize), bufio.MaxScanTokenSize)
-	scanner.Split(scanModelfile)
-	for scanner.Scan() {
-		line := scanner.Bytes()
-
-		fields := bytes.SplitN(line, []byte(" "), 2)
-		if len(fields) == 0 || len(fields[0]) == 0 {
-			continue
-		}
-
-		switch string(bytes.ToUpper(fields[0])) {
-		case "FROM":
-			command.Name = "model"
-			command.Args = string(bytes.TrimSpace(fields[1]))
-			// copy command for validation
-			modelCommand = command
-		case "ADAPTER":
-			command.Name = string(bytes.ToLower(fields[0]))
-			command.Args = string(bytes.TrimSpace(fields[1]))
-		case "LICENSE", "TEMPLATE", "SYSTEM", "PROMPT":
-			command.Name = string(bytes.ToLower(fields[0]))
-			command.Args = string(fields[1])
-		case "PARAMETER":
-			fields = bytes.SplitN(fields[1], []byte(" "), 2)
-			if len(fields) < 2 {
-				return nil, fmt.Errorf("missing value for %s", fields)
-			}
-
-			command.Name = string(fields[0])
-			command.Args = string(bytes.TrimSpace(fields[1]))
-		case "EMBED":
-			return nil, fmt.Errorf("deprecated command: EMBED is no longer supported, use the /embed API endpoint instead")
-		case "MESSAGE":
-			command.Name = string(bytes.ToLower(fields[0]))
-			fields = bytes.SplitN(fields[1], []byte(" "), 2)
-			if len(fields) < 2 {
-				return nil, fmt.Errorf("should be in the format <role> <message>")
-			}
-			if !slices.Contains([]string{"system", "user", "assistant"}, string(bytes.ToLower(fields[0]))) {
-				return nil, fmt.Errorf("role must be one of \"system\", \"user\", or \"assistant\"")
-			}
-			command.Args = fmt.Sprintf("%s: %s", string(bytes.ToLower(fields[0])), string(fields[1]))
-		default:
-			if !bytes.HasPrefix(fields[0], []byte("#")) {
-				// log a warning for unknown commands
-				slog.Warn(fmt.Sprintf("Unknown command: %s", fields[0]))
-			}
-			continue
-		}
-
-		commands = append(commands, command)
-		command.Reset()
-	}
-
-	if modelCommand.Args == "" {
-		return nil, errors.New("no FROM line for the model was specified")
-	}
-
-	return commands, scanner.Err()
-}
-
-func scanModelfile(data []byte, atEOF bool) (advance int, token []byte, err error) {
-	advance, token, err = scan([]byte(`"""`), []byte(`"""`), data, atEOF)
-	if err != nil {
-		return 0, nil, err
-	}
-
-	if advance > 0 && token != nil {
-		return advance, token, nil
-	}
-
-	advance, token, err = scan([]byte(`"`), []byte(`"`), data, atEOF)
-	if err != nil {
-		return 0, nil, err
-	}
-
-	if advance > 0 && token != nil {
-		return advance, token, nil
-	}
-
-	return bufio.ScanLines(data, atEOF)
-}
-
-func scan(openBytes, closeBytes, data []byte, atEOF bool) (advance int, token []byte, err error) {
-	newline := bytes.IndexByte(data, '\n')
-
-	if start := bytes.Index(data, openBytes); start >= 0 && start < newline {
-		end := bytes.Index(data[start+len(openBytes):], closeBytes)
-		if end < 0 {
-			if atEOF {
-				return 0, nil, fmt.Errorf("unterminated %s: expecting %s", openBytes, closeBytes)
-			} else {
-				return 0, nil, nil
-			}
-		}
-
-		n := start + len(openBytes) + end + len(closeBytes)
-
-		newData := data[:start]
-		newData = append(newData, data[start+len(openBytes):n-len(closeBytes)]...)
-		return n, newData, nil
-	}
-
-	return 0, nil, nil
-}
--- a/parser/parser_test.go
+++ b/parser/parser_test.go
@@ -1,98 +0,0 @@
-package parser
-
-import (
-	"strings"
-	"testing"
-
-	"github.com/stretchr/testify/assert"
-)
-
-func Test_Parser(t *testing.T) {
-
-	input := `
-FROM model1
-ADAPTER adapter1
-LICENSE MIT
-PARAMETER param1 value1
-PARAMETER param2 value2
-TEMPLATE template1
-`
-
-	reader := strings.NewReader(input)
-
-	commands, err := Parse(reader)
-	assert.Nil(t, err)
-
-	expectedCommands := []Command{
-		{Name: "model", Args: "model1"},
-		{Name: "adapter", Args: "adapter1"},
-		{Name: "license", Args: "MIT"},
-		{Name: "param1", Args: "value1"},
-		{Name: "param2", Args: "value2"},
-		{Name: "template", Args: "template1"},
-	}
-
-	assert.Equal(t, expectedCommands, commands)
-}
-
-func Test_Parser_NoFromLine(t *testing.T) {
-
-	input := `
-PARAMETER param1 value1
-PARAMETER param2 value2
-`
-
-	reader := strings.NewReader(input)
-
-	_, err := Parse(reader)
-	assert.ErrorContains(t, err, "no FROM line")
-}
-
-func Test_Parser_MissingValue(t *testing.T) {
-
-	input := `
-FROM foo
-PARAMETER param1
-`
-
-	reader := strings.NewReader(input)
-
-	_, err := Parse(reader)
-	assert.ErrorContains(t, err, "missing value for [param1]")
-
-}
-
-func Test_Parser_Messages(t *testing.T) {
-
-	input := `
-FROM foo
-MESSAGE system You are a Parser. Always Parse things.
-MESSAGE user Hey there!
-MESSAGE assistant Hello, I want to parse all the things!
-`
-
-	reader := strings.NewReader(input)
-	commands, err := Parse(reader)
-	assert.Nil(t, err)
-
-	expectedCommands := []Command{
-		{Name: "model", Args: "foo"},
-		{Name: "message", Args: "system: You are a Parser. Always Parse things."},
-		{Name: "message", Args: "user: Hey there!"},
-		{Name: "message", Args: "assistant: Hello, I want to parse all the things!"},
-	}
-
-	assert.Equal(t, expectedCommands, commands)
-}
-
-func Test_Parser_Messages_BadRole(t *testing.T) {
-
-	input := `
-FROM foo
-MESSAGE badguy I'm a bad guy!
-`
-
-	reader := strings.NewReader(input)
-	_, err := Parse(reader)
-	assert.ErrorContains(t, err, "role must be one of \"system\", \"user\", or \"assistant\"")
-}
--- a/readline/readline.go
+++ b/readline/readline.go
@@ -218,7 +218,7 @@ func (i *Instance) Readline() (string, error) {
 		case CharCtrlZ:
 			fd := int(syscall.Stdin)
 			return handleCharCtrlZ(fd, i.Terminal.termios)
-		case CharEnter:
+		case CharEnter, CharCtrlJ:
 			output := buf.String()
 			if output != "" {
 				i.History.Add([]rune(output))
@@ -232,7 +232,7 @@ func (i *Instance) Readline() (string, error) {
 				metaDel = false
 				continue
 			}
-			if r >= CharSpace || r == CharEnter {
+			if r >= CharSpace || r == CharEnter || r == CharCtrlJ {
 				buf.Add(r)
 			}
 		}
--- a/scripts/build_windows.ps1
+++ b/scripts/build_windows.ps1
@@ -7,6 +7,8 @@
 $ErrorActionPreference = "Stop"

 function checkEnv() {
+    $script:TARGET_ARCH=$Env:PROCESSOR_ARCHITECTURE.ToLower()
+    Write-host "Building for ${script:TARGET_ARCH}"
    write-host "Locating required tools and paths"
    $script:SRC_DIR=$PWD
    if (!$env:VCToolsRedistDir) {
@@ -30,7 +32,7 @@ function checkEnv() {
    
    $script:INNO_SETUP_DIR=(get-item "C:\Program Files*\Inno Setup*\")[0]

-    $script:DEPS_DIR="${script:SRC_DIR}\dist\windows-amd64"
+    $script:DEPS_DIR="${script:SRC_DIR}\dist\windows-${script:TARGET_ARCH}"
    $env:CGO_ENABLED="1"
    echo "Checking version"
    if (!$env:VERSION) {
@@ -81,8 +83,8 @@ function buildOllama() {
            /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} ollama.exe
        if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
    }
-    New-Item -ItemType Directory -Path .\dist\windows-amd64\ -Force
-    cp .\ollama.exe .\dist\windows-amd64\
+    New-Item -ItemType Directory -Path .\dist\windows-${script:TARGET_ARCH}\ -Force
+    cp .\ollama.exe .\dist\windows-${script:TARGET_ARCH}\
 }

 function buildApp() {
@@ -127,16 +129,16 @@ function buildInstaller() {
    cd "${script:SRC_DIR}\app"
    $env:PKG_VERSION=$script:PKG_VERSION
    if ("${env:KEY_CONTAINER}") {
-        & "${script:INNO_SETUP_DIR}\ISCC.exe" /SMySignTool="${script:SignTool} sign /fd sha256 /t http://timestamp.digicert.com /f ${script:OLLAMA_CERT} /csp `$qGoogle Cloud KMS Provider`$q /kc ${env:KEY_CONTAINER} `$f" .\ollama.iss
+        & "${script:INNO_SETUP_DIR}\ISCC.exe" /DARCH=$script:TARGET_ARCH /SMySignTool="${script:SignTool} sign /fd sha256 /t http://timestamp.digicert.com /f ${script:OLLAMA_CERT} /csp `$qGoogle Cloud KMS Provider`$q /kc ${env:KEY_CONTAINER} `$f" .\ollama.iss
    } else {
-        & "${script:INNO_SETUP_DIR}\ISCC.exe" .\ollama.iss
+        & "${script:INNO_SETUP_DIR}\ISCC.exe" /DARCH=$script:TARGET_ARCH .\ollama.iss
    }
    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
 }

 function distZip() {
-    write-host "Generating stand-alone distribution zip file ${script:SRC_DIR}\dist\ollama-windows-amd64.zip"
-    Compress-Archive -Path "${script:SRC_DIR}\dist\windows-amd64\*" -DestinationPath "${script:SRC_DIR}\dist\ollama-windows-amd64.zip" -Force
+    write-host "Generating stand-alone distribution zip file ${script:SRC_DIR}\dist\ollama-windows-${script:TARGET_ARCH}.zip"
+    Compress-Archive -Path "${script:SRC_DIR}\dist\windows-${script:TARGET_ARCH}\*" -DestinationPath "${script:SRC_DIR}\dist\ollama-windows-${script:TARGET_ARCH}.zip" -Force
 }

 try {
--- a/scripts/install.sh
+++ b/scripts/install.sh
@@ -166,8 +166,8 @@ fi

 if check_gpu lspci amdgpu || check_gpu lshw amdgpu; then
    # Look for pre-existing ROCm v6 before downloading the dependencies
-    for search in "${HIP_PATH:-''}" "${ROCM_PATH:-''}" "/opt/rocm"; do
-        if [ -n "${search}" ] && [ -e "${search}/lib/libhipblas.so.2" ]; then
+    for search in "${HIP_PATH:-''}" "${ROCM_PATH:-''}" "/opt/rocm" "/usr/lib64"; do
+        if [ -n "${search}" ] && [ -e "${search}/libhipblas.so.2" -o -e "${search}/lib/libhipblas.so.2" ]; then
            status "Compatible AMD GPU ROCm library detected at ${search}"
            install_success
            exit 0
--- a/server/envconfig/config.go
+++ b/server/envconfig/config.go
@@ -0,0 +1,174 @@
+package envconfig
+
+import (
+	"fmt"
+	"log/slog"
+	"os"
+	"path/filepath"
+	"runtime"
+	"strconv"
+	"strings"
+)
+
+var (
+	// Set via OLLAMA_ORIGINS in the environment
+	AllowOrigins []string
+	// Set via OLLAMA_DEBUG in the environment
+	Debug bool
+	// Set via OLLAMA_LLM_LIBRARY in the environment
+	LLMLibrary string
+	// Set via OLLAMA_MAX_LOADED_MODELS in the environment
+	MaxRunners int
+	// Set via OLLAMA_MAX_QUEUE in the environment
+	MaxQueuedRequests int
+	// Set via OLLAMA_MAX_VRAM in the environment
+	MaxVRAM uint64
+	// Set via OLLAMA_NOPRUNE in the environment
+	NoPrune bool
+	// Set via OLLAMA_NUM_PARALLEL in the environment
+	NumParallel int
+	// Set via OLLAMA_RUNNERS_DIR in the environment
+	RunnersDir string
+	// Set via OLLAMA_TMPDIR in the environment
+	TmpDir string
+)
+
+func AsMap() map[string]string {
+	return map[string]string{
+		"OLLAMA_ORIGINS":           fmt.Sprintf("%v", AllowOrigins),
+		"OLLAMA_DEBUG":             fmt.Sprintf("%v", Debug),
+		"OLLAMA_LLM_LIBRARY":       fmt.Sprintf("%v", LLMLibrary),
+		"OLLAMA_MAX_LOADED_MODELS": fmt.Sprintf("%v", MaxRunners),
+		"OLLAMA_MAX_QUEUE":         fmt.Sprintf("%v", MaxQueuedRequests),
+		"OLLAMA_MAX_VRAM":          fmt.Sprintf("%v", MaxVRAM),
+		"OLLAMA_NOPRUNE":           fmt.Sprintf("%v", NoPrune),
+		"OLLAMA_NUM_PARALLEL":      fmt.Sprintf("%v", NumParallel),
+		"OLLAMA_RUNNERS_DIR":       fmt.Sprintf("%v", RunnersDir),
+		"OLLAMA_TMPDIR":            fmt.Sprintf("%v", TmpDir),
+	}
+}
+
+var defaultAllowOrigins = []string{
+	"localhost",
+	"127.0.0.1",
+	"0.0.0.0",
+}
+
+// Clean quotes and spaces from the value
+func clean(key string) string {
+	return strings.Trim(os.Getenv(key), "\"' ")
+}
+
+func init() {
+	// default values
+	NumParallel = 1
+	MaxRunners = 1
+	MaxQueuedRequests = 512
+
+	LoadConfig()
+}
+
+func LoadConfig() {
+	if debug := clean("OLLAMA_DEBUG"); debug != "" {
+		d, err := strconv.ParseBool(debug)
+		if err == nil {
+			Debug = d
+		} else {
+			Debug = true
+		}
+	}
+
+	RunnersDir = clean("OLLAMA_RUNNERS_DIR")
+	if runtime.GOOS == "windows" && RunnersDir == "" {
+		// On Windows we do not carry the payloads inside the main executable
+		appExe, err := os.Executable()
+		if err != nil {
+			slog.Error("failed to lookup executable path", "error", err)
+		}
+
+		cwd, err := os.Getwd()
+		if err != nil {
+			slog.Error("failed to lookup working directory", "error", err)
+		}
+
+		var paths []string
+		for _, root := range []string{filepath.Dir(appExe), cwd} {
+			paths = append(paths,
+				filepath.Join(root),
+				filepath.Join(root, "windows-"+runtime.GOARCH),
+				filepath.Join(root, "dist", "windows-"+runtime.GOARCH),
+			)
+		}
+
+		// Try a few variations to improve developer experience when building from source in the local tree
+		for _, p := range paths {
+			candidate := filepath.Join(p, "ollama_runners")
+			_, err := os.Stat(candidate)
+			if err == nil {
+				RunnersDir = candidate
+				break
+			}
+		}
+		if RunnersDir == "" {
+			slog.Error("unable to locate llm runner directory.  Set OLLAMA_RUNNERS_DIR to the location of 'ollama_runners'")
+		}
+	}
+
+	TmpDir = clean("OLLAMA_TMPDIR")
+
+	userLimit := clean("OLLAMA_MAX_VRAM")
+	if userLimit != "" {
+		avail, err := strconv.ParseUint(userLimit, 10, 64)
+		if err != nil {
+			slog.Error("invalid setting, ignoring", "OLLAMA_MAX_VRAM", userLimit, "error", err)
+		} else {
+			MaxVRAM = avail
+		}
+	}
+
+	LLMLibrary = clean("OLLAMA_LLM_LIBRARY")
+
+	if onp := clean("OLLAMA_NUM_PARALLEL"); onp != "" {
+		val, err := strconv.Atoi(onp)
+		if err != nil || val <= 0 {
+			slog.Error("invalid setting must be greater than zero", "OLLAMA_NUM_PARALLEL", onp, "error", err)
+		} else {
+			NumParallel = val
+		}
+	}
+
+	if noprune := clean("OLLAMA_NOPRUNE"); noprune != "" {
+		NoPrune = true
+	}
+
+	if origins := clean("OLLAMA_ORIGINS"); origins != "" {
+		AllowOrigins = strings.Split(origins, ",")
+	}
+	for _, allowOrigin := range defaultAllowOrigins {
+		AllowOrigins = append(AllowOrigins,
+			fmt.Sprintf("http://%s", allowOrigin),
+			fmt.Sprintf("https://%s", allowOrigin),
+			fmt.Sprintf("http://%s:*", allowOrigin),
+			fmt.Sprintf("https://%s:*", allowOrigin),
+		)
+	}
+
+	maxRunners := clean("OLLAMA_MAX_LOADED_MODELS")
+	if maxRunners != "" {
+		m, err := strconv.Atoi(maxRunners)
+		if err != nil {
+			slog.Error("invalid setting", "OLLAMA_MAX_LOADED_MODELS", maxRunners, "error", err)
+		} else {
+			MaxRunners = m
+		}
+	}
+
+	if onp := os.Getenv("OLLAMA_MAX_QUEUE"); onp != "" {
+		p, err := strconv.Atoi(onp)
+		if err != nil || p <= 0 {
+			slog.Error("invalid setting", "OLLAMA_MAX_QUEUE", onp, "error", err)
+		} else {
+			MaxQueuedRequests = p
+		}
+	}
+}
--- a/server/envconfig/config_test.go
+++ b/server/envconfig/config_test.go
@@ -0,0 +1,20 @@
+package envconfig
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/require"
+)
+
+func TestConfig(t *testing.T) {
+	Debug = false // Reset whatever was loaded in init()
+	t.Setenv("OLLAMA_DEBUG", "")
+	LoadConfig()
+	require.False(t, Debug)
+	t.Setenv("OLLAMA_DEBUG", "false")
+	LoadConfig()
+	require.False(t, Debug)
+	t.Setenv("OLLAMA_DEBUG", "1")
+	LoadConfig()
+	require.True(t, Debug)
+}
--- a/server/images.go
+++ b/server/images.go
@@ -1,16 +1,16 @@
 package server

 import (
-	"archive/zip"
 	"bytes"
+	"cmp"
 	"context"
 	"crypto/sha256"
+	"encoding/base64"
 	"encoding/hex"
 	"encoding/json"
 	"errors"
 	"fmt"
 	"io"
-	"io/fs"
 	"log"
 	"log/slog"
 	"net/http"
@@ -20,15 +20,15 @@ import (
 	"runtime"
 	"strconv"
 	"strings"
-	"text/template"

 	"golang.org/x/exp/slices"

 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/convert"
+	"github.com/ollama/ollama/auth"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/llm"
-	"github.com/ollama/ollama/parser"
+	"github.com/ollama/ollama/server/envconfig"
+	"github.com/ollama/ollama/types/errtypes"
 	"github.com/ollama/ollama/types/model"
 	"github.com/ollama/ollama/version"
 )
@@ -52,7 +52,6 @@ type Model struct {
 	System         string
 	License        []string
 	Digest         string
-	Size           int64
 	Options        map[string]interface{}
 	Messages       []Message
 }
@@ -61,6 +60,76 @@ func (m *Model) IsEmbedding() bool {
 	return slices.Contains(m.Config.ModelFamilies, "bert") || slices.Contains(m.Config.ModelFamilies, "nomic-bert")
 }

+func (m *Model) String() string {
+	var modelfile model.File
+
+	modelfile.Commands = append(modelfile.Commands, model.Command{
+		Name: "model",
+		Args: m.ModelPath,
+	})
+
+	for _, adapter := range m.AdapterPaths {
+		modelfile.Commands = append(modelfile.Commands, model.Command{
+			Name: "adapter",
+			Args: adapter,
+		})
+	}
+
+	for _, projector := range m.ProjectorPaths {
+		modelfile.Commands = append(modelfile.Commands, model.Command{
+			Name: "model",
+			Args: projector,
+		})
+	}
+
+	if m.Template != "" {
+		modelfile.Commands = append(modelfile.Commands, model.Command{
+			Name: "template",
+			Args: m.Template,
+		})
+	}
+
+	if m.System != "" {
+		modelfile.Commands = append(modelfile.Commands, model.Command{
+			Name: "system",
+			Args: m.System,
+		})
+	}
+
+	for k, v := range m.Options {
+		switch v := v.(type) {
+		case []any:
+			for _, s := range v {
+				modelfile.Commands = append(modelfile.Commands, model.Command{
+					Name: k,
+					Args: fmt.Sprintf("%v", s),
+				})
+			}
+		default:
+			modelfile.Commands = append(modelfile.Commands, model.Command{
+				Name: k,
+				Args: fmt.Sprintf("%v", v),
+			})
+		}
+	}
+
+	for _, license := range m.License {
+		modelfile.Commands = append(modelfile.Commands, model.Command{
+			Name: "license",
+			Args: license,
+		})
+	}
+
+	for _, msg := range m.Messages {
+		modelfile.Commands = append(modelfile.Commands, model.Command{
+			Name: "message",
+			Args: fmt.Sprintf("%s %s", msg.Role, msg.Content),
+		})
+	}
+
+	return modelfile.String()
+}
+
 type Message struct {
 	Role    string `json:"role"`
 	Content string `json:"content"`
@@ -86,50 +155,11 @@ type ConfigV2 struct {
 	RootFS       RootFS `json:"rootfs"`
 }

-func (c *ConfigV2) SetModelFormat(format string) {
-	if c.ModelFormat == "" {
-		c.ModelFormat = format
-	}
-}
-
-func (c *ConfigV2) SetModelFamily(families ...string) {
-	for _, family := range families {
-		if c.ModelFamily == "" {
-			c.ModelFamily = family
-		}
-
-		if !slices.Contains(c.ModelFamilies, family) {
-			c.ModelFamilies = append(c.ModelFamilies, family)
-		}
-	}
-}
-
-func (c *ConfigV2) SetModelType(modelType string) {
-	if c.ModelType == "" {
-		c.ModelType = modelType
-	}
-}
-
-func (c *ConfigV2) SetFileType(fileType string) {
-	if c.FileType == "" {
-		c.FileType = fileType
-	}
-}
-
 type RootFS struct {
 	Type    string   `json:"type"`
 	DiffIDs []string `json:"diff_ids"`
 }

-func (m *ManifestV2) GetTotalSize() (total int64) {
-	for _, layer := range m.Layers {
-		total += layer.Size
-	}
-
-	total += m.Config.Size
-	return total
-}
-
 func GetManifest(mp ModelPath) (*ManifestV2, string, error) {
 	fp, err := mp.GetManifestPath()
 	if err != nil {
@@ -170,7 +200,6 @@ func GetModel(name string) (*Model, error) {
 		Digest:    digest,
 		Template:  "{{ .Prompt }}",
 		License:   []string{},
-		Size:      manifest.GetTotalSize(),
 	}

 	filename, err := GetBlobsPath(manifest.Config.Digest)
@@ -260,7 +289,7 @@ func GetModel(name string) (*Model, error) {
 	return model, nil
 }

-func realpath(mfDir, from string) string {
+func realpath(rel, from string) string {
 	abspath, err := filepath.Abs(from)
 	if err != nil {
 		return from
@@ -277,22 +306,15 @@ func realpath(mfDir, from string) string {
 		return filepath.Join(home, from[2:])
 	}

-	if _, err := os.Stat(filepath.Join(mfDir, from)); err == nil {
+	if _, err := os.Stat(filepath.Join(rel, from)); err == nil {
 		// this is a file relative to the Modelfile
-		return filepath.Join(mfDir, from)
+		return filepath.Join(rel, from)
 	}

 	return abspath
 }

-func CreateModel(ctx context.Context, name, modelFileDir, quantization string, commands []parser.Command, fn func(resp api.ProgressResponse)) error {
-	deleteMap := make(map[string]struct{})
-	if manifest, _, err := GetManifest(ParseModelPath(name)); err == nil {
-		for _, layer := range append(manifest.Layers, manifest.Config) {
-			deleteMap[layer.Digest] = struct{}{}
-		}
-	}
-
+func CreateModel(ctx context.Context, name, modelFileDir, quantization string, modelfile *model.File, fn func(resp api.ProgressResponse)) (err error) {
 	config := ConfigV2{
 		OS:           "linux",
 		Architecture: "amd64",
@@ -301,250 +323,181 @@ func CreateModel(ctx context.Context, name, modelFileDir, quantization string, c
 		},
 	}

-	var layers Layers
-	messages := []string{}
+	var messages []*api.Message
+	parameters := make(map[string]any)

-	params := make(map[string][]string)
-	fromParams := make(map[string]any)
-
-	for _, c := range commands {
+	var layers []*Layer
+	for _, c := range modelfile.Commands {
 		mediatype := fmt.Sprintf("application/vnd.ollama.image.%s", c.Name)

 		switch c.Name {
-		case "model":
-			if strings.HasPrefix(c.Args, "@") {
-				blobPath, err := GetBlobsPath(strings.TrimPrefix(c.Args, "@"))
+		case "model", "adapter":
+			var baseLayers []*layerWithGGML
+			if name := model.ParseName(c.Args); name.IsValid() {
+				baseLayers, err = parseFromModel(ctx, name, fn)
+				if err != nil {
+					return err
+				}
+			} else if strings.HasPrefix(c.Args, "@") {
+				blobpath, err := GetBlobsPath(strings.TrimPrefix(c.Args, "@"))
 				if err != nil {
 					return err
 				}

-				c.Args = blobPath
-			}
-
-			pathName := realpath(modelFileDir, c.Args)
-
-			ggufName, err := convertModel(name, pathName, fn)
-			if err != nil {
-				var pathErr *fs.PathError
-				switch {
-				case errors.Is(err, zip.ErrFormat):
-					// it's not a safetensor archive
-				case errors.As(err, &pathErr):
-					// it's not a file on disk, could be a model reference
-				default:
+				blob, err := os.Open(blobpath)
+				if err != nil {
 					return err
 				}
+				defer blob.Close()
+
+				baseLayers, err = parseFromFile(ctx, blob, fn)
+				if err != nil {
+					return err
+				}
+			} else if file, err := os.Open(realpath(modelFileDir, c.Args)); err == nil {
+				defer file.Close()
+
+				baseLayers, err = parseFromFile(ctx, file, fn)
+				if err != nil {
+					return err
+				}
+			} else {
+				return fmt.Errorf("invalid model reference: %s", c.Args)
 			}

-			if ggufName != "" {
-				pathName = ggufName
-				defer os.RemoveAll(ggufName)
-
-				if quantization != "" {
-					quantization = strings.ToUpper(quantization)
-					fn(api.ProgressResponse{Status: fmt.Sprintf("quantizing %s model to %s", "F16", quantization)})
-					tempfile, err := os.CreateTemp(filepath.Dir(ggufName), quantization)
+			for _, baseLayer := range baseLayers {
+				if quantization != "" &&
+					baseLayer.MediaType == "application/vnd.ollama.image.model" &&
+					baseLayer.GGML != nil &&
+					baseLayer.GGML.Name() == "gguf" {
+					want, err := llm.ParseFileType(quantization)
 					if err != nil {
 						return err
 					}
-					defer os.RemoveAll(tempfile.Name())

-					if err := llm.Quantize(ggufName, tempfile.Name(), quantization); err != nil {
-						return err
-					}
+					ft := baseLayer.GGML.KV().FileType()
+					if !slices.Contains([]string{"F16", "F32"}, ft.String()) {
+						return errors.New("quantization is only supported for F16 and F32 models")
+					} else if want != ft {
+						fn(api.ProgressResponse{Status: fmt.Sprintf("quantizing %s model to %s", ft, quantization)})

-					if err := tempfile.Close(); err != nil {
-						return err
-					}
-
-					pathName = tempfile.Name()
-				}
-			}
-
-			bin, err := os.Open(pathName)
-			if err != nil {
-				// not a file on disk so must be a model reference
-				modelpath := ParseModelPath(c.Args)
-				manifest, _, err := GetManifest(modelpath)
-				switch {
-				case errors.Is(err, os.ErrNotExist):
-					fn(api.ProgressResponse{Status: "pulling model"})
-					if err := PullModel(ctx, c.Args, &registryOptions{}, fn); err != nil {
-						return err
-					}
-
-					manifest, _, err = GetManifest(modelpath)
-					if err != nil {
-						return err
-					}
-				case err != nil:
-					return err
-				}
-
-				fn(api.ProgressResponse{Status: "reading model metadata"})
-				fromConfigPath, err := GetBlobsPath(manifest.Config.Digest)
-				if err != nil {
-					return err
-				}
-
-				fromConfigFile, err := os.Open(fromConfigPath)
-				if err != nil {
-					return err
-				}
-				defer fromConfigFile.Close()
-
-				var fromConfig ConfigV2
-				if err := json.NewDecoder(fromConfigFile).Decode(&fromConfig); err != nil {
-					return err
-				}
-
-				// if the model is still not in gguf format, error out
-				if fromConfig.ModelFormat != "gguf" {
-					return fmt.Errorf("%s is not in gguf format, this base model is not compatible with this version of ollama", c.Args)
-				}
-
-				config.SetModelFormat(fromConfig.ModelFormat)
-				config.SetModelFamily(append(fromConfig.ModelFamilies, fromConfig.ModelFamily)...)
-				config.SetModelType(fromConfig.ModelType)
-				config.SetFileType(fromConfig.FileType)
-
-				for _, layer := range manifest.Layers {
-					deleteMap[layer.Digest] = struct{}{}
-					if layer.MediaType == "application/vnd.ollama.image.params" {
-						fromParamsPath, err := GetBlobsPath(layer.Digest)
+						blob, err := GetBlobsPath(baseLayer.Digest)
 						if err != nil {
 							return err
 						}

-						fromParamsFile, err := os.Open(fromParamsPath)
+						temp, err := os.CreateTemp(filepath.Dir(blob), quantization)
 						if err != nil {
 							return err
 						}
-						defer fromParamsFile.Close()
+						defer temp.Close()
+						defer os.Remove(temp.Name())

-						if err := json.NewDecoder(fromParamsFile).Decode(&fromParams); err != nil {
+						if err := llm.Quantize(blob, temp.Name(), want); err != nil {
+							return err
+						}
+
+						baseLayer.Layer, err = NewLayer(temp, baseLayer.Layer.MediaType)
+						if err != nil {
 							return err
 						}
 					}
-
-					layer, err := NewLayerFromLayer(layer.Digest, layer.MediaType, modelpath.GetShortTagname())
-					if err != nil {
-						return err
-					}
-
-					layers.Add(layer)
 				}

-				deleteMap[manifest.Config.Digest] = struct{}{}
-				continue
+				if baseLayer.GGML != nil {
+					config.ModelFormat = cmp.Or(config.ModelFormat, baseLayer.GGML.Name())
+					config.ModelFamily = cmp.Or(config.ModelFamily, baseLayer.GGML.KV().Architecture())
+					config.ModelType = cmp.Or(config.ModelType, format.HumanNumber(baseLayer.GGML.KV().ParameterCount()))
+					config.FileType = cmp.Or(config.FileType, baseLayer.GGML.KV().FileType().String())
+					config.ModelFamilies = append(config.ModelFamilies, baseLayer.GGML.KV().Architecture())
+				}
+
+				layers = append(layers, baseLayer.Layer)
 			}
-			defer bin.Close()
-
-			var offset int64
-			for {
-				fn(api.ProgressResponse{Status: "creating model layer"})
-				if _, err := bin.Seek(offset, io.SeekStart); err != nil {
-					return err
-				}
-
-				ggml, size, err := llm.DecodeGGML(bin)
-				if errors.Is(err, io.EOF) {
-					break
-				} else if errors.Is(err, llm.ErrUnsupportedFormat) {
-					return fmt.Errorf("model binary specified in FROM field is not a valid gguf format model, %w", err)
-				} else if err != nil {
-					return err
-				}
-
-				config.SetModelFormat(ggml.Name())
-				config.SetModelFamily(ggml.KV().Architecture())
-				config.SetModelType(format.HumanNumber(ggml.KV().ParameterCount()))
-				config.SetFileType(ggml.KV().FileType())
-
-				mediatype := mediatype
-				if ggml.KV().Architecture() == "clip" {
-					mediatype = "application/vnd.ollama.image.projector"
-				}
-
-				sr := io.NewSectionReader(bin, offset, size)
-				layer, err := NewLayer(sr, mediatype)
-				if err != nil {
-					return err
-				}
-
-				layers.Add(layer)
-
-				offset += size
-			}
-		case "adapter":
-			if strings.HasPrefix(c.Args, "@") {
-				blobPath, err := GetBlobsPath(strings.TrimPrefix(c.Args, "@"))
-				if err != nil {
-					return err
-				}
-
-				c.Args = blobPath
-			}
-
-			fn(api.ProgressResponse{Status: "creating adapter layer"})
-			bin, err := os.Open(realpath(modelFileDir, c.Args))
-			if err != nil {
-				return err
-			}
-			defer bin.Close()
-
-			_, size, err := llm.DecodeGGML(bin)
+		case "license", "template", "system":
+			blob := strings.NewReader(c.Args)
+			layer, err := NewLayer(blob, mediatype)
 			if err != nil {
 				return err
 			}

-			sr := io.NewSectionReader(bin, 0, size)
-			layer, err := NewLayer(sr, mediatype)
-			if err != nil {
-				return err
+			if c.Name != "license" {
+				// replace
+				layers = slices.DeleteFunc(layers, func(layer *Layer) bool {
+					return layer.MediaType == mediatype
+				})
 			}

-			layers.Add(layer)
-		case "license":
-			fn(api.ProgressResponse{Status: "creating license layer"})
-
-			bin := strings.NewReader(c.Args)
-			layer, err := NewLayer(bin, mediatype)
-			if err != nil {
-				return err
-			}
-
-			layers.Add(layer)
-		case "template", "system":
-			fn(api.ProgressResponse{Status: fmt.Sprintf("creating %s layer", c.Name)})
-
-			bin := strings.NewReader(c.Args)
-			layer, err := NewLayer(bin, mediatype)
-			if err != nil {
-				return err
-			}
-
-			layers.Replace(layer)
+			layers = append(layers, layer)
 		case "message":
-			messages = append(messages, c.Args)
+			role, content, ok := strings.Cut(c.Args, ": ")
+			if !ok {
+				return fmt.Errorf("invalid message: %s", c.Args)
+			}
+
+			messages = append(messages, &api.Message{Role: role, Content: content})
 		default:
-			params[c.Name] = append(params[c.Name], c.Args)
+			ps, err := api.FormatParams(map[string][]string{c.Name: {c.Args}})
+			if err != nil {
+				return err
+			}
+
+			for k, v := range ps {
+				if ks, ok := parameters[k].([]string); ok {
+					parameters[k] = append(ks, v.([]string)...)
+				} else if vs, ok := v.([]string); ok {
+					parameters[k] = vs
+				} else {
+					parameters[k] = v
+				}
+			}
 		}
 	}

-	if len(messages) > 0 {
-		fn(api.ProgressResponse{Status: "creating parameters layer"})
+	var err2 error
+	layers = slices.DeleteFunc(layers, func(layer *Layer) bool {
+		switch layer.MediaType {
+		case "application/vnd.ollama.image.message":
+			// if there are new messages, remove the inherited ones
+			if len(messages) > 0 {
+				return true
+			}

-		msgs := make([]api.Message, 0)
+			return false
+		case "application/vnd.ollama.image.params":
+			// merge inherited parameters with new ones
+			r, err := layer.Open()
+			if err != nil {
+				err2 = err
+				return false
+			}
+			defer r.Close()

-		for _, m := range messages {
-			// todo: handle images
-			msg := strings.SplitN(m, ": ", 2)
-			msgs = append(msgs, api.Message{Role: msg[0], Content: msg[1]})
+			var ps map[string]any
+			if err := json.NewDecoder(r).Decode(&ps); err != nil {
+				err2 = err
+				return false
+			}
+
+			for k, v := range ps {
+				if _, ok := parameters[k]; !ok {
+					parameters[k] = v
+				}
+			}
+
+			return true
+		default:
+			return false
 		}
+	})

+	if err2 != nil {
+		return err2
+	}
+
+	if len(messages) > 0 {
 		var b bytes.Buffer
-		if err := json.NewEncoder(&b).Encode(msgs); err != nil {
+		if err := json.NewEncoder(&b).Encode(messages); err != nil {
 			return err
 		}

@@ -553,39 +506,25 @@ func CreateModel(ctx context.Context, name, modelFileDir, quantization string, c
 			return err
 		}

-		layers.Replace(layer)
+		layers = append(layers, layer)
 	}

-	if len(params) > 0 {
-		fn(api.ProgressResponse{Status: "creating parameters layer"})
-
-		formattedParams, err := api.FormatParams(params)
-		if err != nil {
-			return err
-		}
-
-		for k, v := range fromParams {
-			if _, ok := formattedParams[k]; !ok {
-				formattedParams[k] = v
-			}
-		}
-
+	if len(parameters) > 0 {
 		var b bytes.Buffer
-		if err := json.NewEncoder(&b).Encode(formattedParams); err != nil {
+		if err := json.NewEncoder(&b).Encode(parameters); err != nil {
 			return err
 		}

-		fn(api.ProgressResponse{Status: "creating config layer"})
 		layer, err := NewLayer(&b, "application/vnd.ollama.image.params")
 		if err != nil {
 			return err
 		}

-		layers.Replace(layer)
+		layers = append(layers, layer)
 	}

-	digests := make([]string, len(layers.items))
-	for i, layer := range layers.items {
+	digests := make([]string, len(layers))
+	for i, layer := range layers {
 		digests[i] = layer.Digest
 	}

@@ -596,36 +535,37 @@ func CreateModel(ctx context.Context, name, modelFileDir, quantization string, c
 		return err
 	}

-	configLayer, err := NewLayer(&b, "application/vnd.docker.container.image.v1+json")
+	layer, err := NewLayer(&b, "application/vnd.docker.container.image.v1+json")
 	if err != nil {
 		return err
 	}

-	delete(deleteMap, configLayer.Digest)
+	for _, layer := range append(layers, layer) {
+		if layer.status != "" {
+			fn(api.ProgressResponse{Status: layer.status})
+		}
+	}

-	for _, layer := range append(layers.items, configLayer) {
-		committed, err := layer.Commit()
-		if err != nil {
-			return err
+	unref := make(map[string]struct{})
+	if manifest, _, err := GetManifest(ParseModelPath(name)); err == nil {
+		for _, layer := range manifest.Layers {
+			if !slices.Contains(digests, layer.Digest) {
+				unref[layer.Digest] = struct{}{}
+			}
 		}

-		status := "writing layer"
-		if !committed {
-			status = "using already created layer"
+		if manifest.Config.Digest != layer.Digest {
+			unref[manifest.Config.Digest] = struct{}{}
 		}
-
-		fn(api.ProgressResponse{Status: fmt.Sprintf("%s %s", status, layer.Digest)})
-
-		delete(deleteMap, layer.Digest)
 	}

 	fn(api.ProgressResponse{Status: "writing manifest"})
-	if err := WriteManifest(name, configLayer, layers.items); err != nil {
+	if err := WriteManifest(name, layer, layers); err != nil {
 		return err
 	}

-	if noprune := os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
-		if err := deleteUnusedLayers(nil, deleteMap, false); err != nil {
+	if !envconfig.NoPrune {
+		if err := deleteUnusedLayers(nil, unref); err != nil {
 			return err
 		}
 	}
@@ -634,74 +574,6 @@ func CreateModel(ctx context.Context, name, modelFileDir, quantization string, c
 	return nil
 }

-func convertModel(name, path string, fn func(resp api.ProgressResponse)) (string, error) {
-	r, err := zip.OpenReader(path)
-	if err != nil {
-		return "", err
-	}
-	defer r.Close()
-
-	tempDir, err := os.MkdirTemp("", "ollama-convert")
-	if err != nil {
-		return "", err
-	}
-	defer os.RemoveAll(tempDir)
-
-	fn(api.ProgressResponse{Status: "unpacking model metadata"})
-	for _, f := range r.File {
-		fpath := filepath.Join(tempDir, f.Name)
-		outFile, err := os.OpenFile(fpath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, f.Mode())
-		if err != nil {
-			return "", err
-		}
-
-		rc, err := f.Open()
-		if err != nil {
-			return "", err
-		}
-
-		_, err = io.Copy(outFile, rc)
-		if err != nil {
-			return "", err
-		}
-
-		outFile.Close()
-		rc.Close()
-	}
-
-	mf, err := convert.GetModelFormat(tempDir)
-	if err != nil {
-		return "", err
-	}
-
-	params, err := mf.GetParams(tempDir)
-	if err != nil {
-		return "", err
-	}
-
-	mArch, err := mf.GetModelArch(name, tempDir, params)
-	if err != nil {
-		return "", err
-	}
-
-	fn(api.ProgressResponse{Status: "processing tensors"})
-	if err := mArch.GetTensors(); err != nil {
-		return "", err
-	}
-
-	if err := mArch.LoadVocab(); err != nil {
-		return "", err
-	}
-
-	fn(api.ProgressResponse{Status: "converting model"})
-	path, err = mArch.WriteGGUF()
-	if err != nil {
-		return "", err
-	}
-
-	return path, nil
-}
-
 func CopyModel(src, dst model.Name) error {
 	if !dst.IsFullyQualified() {
 		return model.Unqualified(dst)
@@ -710,6 +582,10 @@ func CopyModel(src, dst model.Name) error {
 		return model.Unqualified(src)
 	}

+	if src.Filepath() == dst.Filepath() {
+		return nil
+	}
+
 	manifests, err := GetManifestPath()
 	if err != nil {
 		return err
@@ -737,7 +613,7 @@ func CopyModel(src, dst model.Name) error {
 	return err
 }

-func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]struct{}, dryRun bool) error {
+func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]struct{}) error {
 	fp, err := GetManifestPath()
 	if err != nil {
 		return err
@@ -784,13 +660,9 @@ func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]struct{},
 			slog.Info(fmt.Sprintf("couldn't get file path for '%s': %v", k, err))
 			continue
 		}
-		if !dryRun {
-			if err := os.Remove(fp); err != nil {
-				slog.Info(fmt.Sprintf("couldn't remove file '%s': %v", fp, err))
-				continue
-			}
-		} else {
-			slog.Info(fmt.Sprintf("wanted to remove: %s", fp))
+		if err := os.Remove(fp); err != nil {
+			slog.Info(fmt.Sprintf("couldn't remove file '%s': %v", fp, err))
+			continue
 		}
 	}

@@ -813,14 +685,25 @@ func PruneLayers() error {
 	for _, blob := range blobs {
 		name := blob.Name()
 		name = strings.ReplaceAll(name, "-", ":")
-		if strings.HasPrefix(name, "sha256:") {
-			deleteMap[name] = struct{}{}
+
+		_, err := GetBlobsPath(name)
+		if err != nil {
+			if errors.Is(err, ErrInvalidDigestFormat) {
+				// remove invalid blobs (e.g. partial downloads)
+				if err := os.Remove(filepath.Join(p, blob.Name())); err != nil {
+					slog.Error("couldn't remove blob", "blob", blob.Name(), "error", err)
+				}
+			}
+
+			continue
 		}
+
+		deleteMap[name] = struct{}{}
 	}

 	slog.Info(fmt.Sprintf("total blobs: %d", len(deleteMap)))

-	err = deleteUnusedLayers(nil, deleteMap, false)
+	err = deleteUnusedLayers(nil, deleteMap)
 	if err != nil {
 		return err
 	}
@@ -876,7 +759,7 @@ func DeleteModel(name string) error {
 	}
 	deleteMap[manifest.Config.Digest] = struct{}{}

-	err = deleteUnusedLayers(&mp, deleteMap, false)
+	err = deleteUnusedLayers(&mp, deleteMap)
 	if err != nil {
 		return err
 	}
@@ -894,67 +777,6 @@ func DeleteModel(name string) error {
 	return nil
 }

-func ShowModelfile(model *Model) (string, error) {
-	var mt struct {
-		*Model
-		From       string
-		Parameters map[string][]any
-	}
-
-	mt.Parameters = make(map[string][]any)
-	for k, v := range model.Options {
-		if s, ok := v.([]any); ok {
-			mt.Parameters[k] = s
-			continue
-		}
-
-		mt.Parameters[k] = []any{v}
-	}
-
-	mt.Model = model
-	mt.From = model.ModelPath
-
-	if model.ParentModel != "" {
-		mt.From = model.ParentModel
-	}
-
-	modelFile := `# Modelfile generated by "ollama show"
-# To build a new Modelfile based on this one, replace the FROM line with:
-# FROM {{ .ShortName }}
-
-FROM {{ .From }}
-TEMPLATE """{{ .Template }}"""
-
-{{- if .System }}
-SYSTEM """{{ .System }}"""
-{{- end }}
-
-{{- range $adapter := .AdapterPaths }}
-ADAPTER {{ $adapter }}
-{{- end }}
-
-{{- range $k, $v := .Parameters }}
-{{- range $parameter := $v }}
-PARAMETER {{ $k }} {{ printf "%#v" $parameter }}
-{{- end }}
-{{- end }}`
-
-	tmpl, err := template.New("").Parse(modelFile)
-	if err != nil {
-		slog.Info(fmt.Sprintf("error parsing template: %q", err))
-		return "", err
-	}
-
-	var buf bytes.Buffer
-
-	if err = tmpl.Execute(&buf, mt); err != nil {
-		slog.Info(fmt.Sprintf("error executing template: %q", err))
-		return "", err
-	}
-
-	return buf.String(), nil
-}
-
 func PushModel(ctx context.Context, name string, regOpts *registryOptions, fn func(api.ProgressResponse)) error {
 	mp := ParseModelPath(name)
 	fn(api.ProgressResponse{Status: "retrieving manifest"})
@@ -976,9 +798,6 @@ func PushModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
 	for _, layer := range layers {
 		if err := uploadBlob(ctx, mp, layer, regOpts, fn); err != nil {
 			slog.Info(fmt.Sprintf("error uploading blob: %v", err))
-			if errors.Is(err, errUnauthorized) {
-				return fmt.Errorf("unable to push %s, make sure this namespace exists and you are authorized to push to it", ParseModelPath(name).GetNamespaceRepository())
-			}
 			return err
 		}
 	}
@@ -1015,7 +834,7 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
 	// build deleteMap to prune unused layers
 	deleteMap := make(map[string]struct{})

-	if noprune = os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
+	if !envconfig.NoPrune {
 		manifest, _, err = GetManifest(mp)
 		if err != nil && !errors.Is(err, os.ErrNotExist) {
 			return err
@@ -1100,7 +919,7 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu

 	if noprune == "" {
 		fn(api.ProgressResponse{Status: "removing any unused layers"})
-		err = deleteUnusedLayers(nil, deleteMap, false)
+		err = deleteUnusedLayers(nil, deleteMap)
 		if err != nil {
 			return err
 		}
@@ -1141,9 +960,40 @@ func GetSHA256Digest(r io.Reader) (string, int64) {
 	return fmt.Sprintf("sha256:%x", h.Sum(nil)), n
 }

-var errUnauthorized = errors.New("unauthorized")
+var errUnauthorized = fmt.Errorf("unauthorized: access denied")
+
+// getTokenSubject returns the subject of a JWT token, it does not validate the token
+func getTokenSubject(token string) string {
+	parts := strings.Split(token, ".")
+	if len(parts) != 3 {
+		slog.Error("jwt token does not contain 3 parts")
+		return ""
+	}
+
+	payload := parts[1]
+	payloadBytes, err := base64.RawURLEncoding.DecodeString(payload)
+	if err != nil {
+		slog.Error(fmt.Sprintf("failed to decode jwt payload: %v", err))
+		return ""
+	}
+
+	var payloadMap map[string]interface{}
+	if err := json.Unmarshal(payloadBytes, &payloadMap); err != nil {
+		slog.Error(fmt.Sprintf("failed to unmarshal payload JSON: %v", err))
+		return ""
+	}
+
+	sub, ok := payloadMap["sub"]
+	if !ok {
+		slog.Error("jwt does not contain 'sub' field")
+		return ""
+	}
+
+	return fmt.Sprintf("%s", sub)
+}

 func makeRequestWithRetry(ctx context.Context, method string, requestURL *url.URL, headers http.Header, body io.ReadSeeker, regOpts *registryOptions) (*http.Response, error) {
+	anonymous := true // access will default to anonymous if no user is found associated with the public key
 	for i := 0; i < 2; i++ {
 		resp, err := makeRequest(ctx, method, requestURL, headers, body, regOpts)
 		if err != nil {
@@ -1162,6 +1012,7 @@ func makeRequestWithRetry(ctx context.Context, method string, requestURL *url.UR
 			if err != nil {
 				return nil, err
 			}
+			anonymous = getTokenSubject(token) == "anonymous"
 			regOpts.Token = token
 			if body != nil {
 				_, err = body.Seek(0, io.SeekStart)
@@ -1182,6 +1033,16 @@ func makeRequestWithRetry(ctx context.Context, method string, requestURL *url.UR
 		}
 	}

+	if anonymous {
+		// no user is associated with the public key, and the request requires non-anonymous access
+		pubKey, nestedErr := auth.GetPublicKey()
+		if nestedErr != nil {
+			slog.Error(fmt.Sprintf("couldn't get public key: %v", nestedErr))
+			return nil, errUnauthorized
+		}
+		return nil, &errtypes.UnknownOllamaKey{Key: pubKey}
+	}
+	// user is associated with the public key, but is not authorized to make the request
 	return nil, errUnauthorized
 }

--- a/Show More
+++ b/Show More