Compare commits


4 Commits

Author      SHA1          Message                                       Date
jmorganca   201a987ff9    some more menu options...                     2024-04-28 12:40:52 -04:00
jmorganca   2d8125042a    Touch ID for cli install; server restarts    2024-04-27 22:42:38 -04:00
jmorganca   776e7bb5e4    app: fix status item icons                    2024-04-27 15:57:57 -04:00
jmorganca   b8d7ca1a7b    Native implementation of macOS app            2024-04-27 14:20:10 -04:00
169 changed files with 2899 additions and 22514 deletions

1
.gitignore vendored
View File

@@ -12,4 +12,3 @@ ggml-metal.metal
test_data
*.crt
llm/build
__debug_bin*

View File

@@ -1,5 +1,5 @@
<div align="center">
 <img alt="ollama" height="200px" src="https://github.com/ollama/ollama/assets/3325447/0d0b44e2-8f4a-4e99-9b52-a5c1c741c8f7">
<img alt="ollama" height="200px" src="https://github.com/ollama/ollama/assets/3325447/0d0b44e2-8f4a-4e99-9b52-a5c1c741c8f7">
</div>
# Ollama
@@ -51,7 +51,7 @@ Here are some example models that can be downloaded:
| ------------------ | ---------- | ----- | ------------------------------ |
| Llama 3 | 8B | 4.7GB | `ollama run llama3` |
| Llama 3 | 70B | 40GB | `ollama run llama3:70b` |
| Phi-3 | 3.8B | 2.3GB | `ollama run phi3` |
| Phi-3 | 3,8B | 2.3GB | `ollama run phi3` |
| Mistral | 7B | 4.1GB | `ollama run mistral` |
| Neural Chat | 7B | 4.1GB | `ollama run neural-chat` |
| Starling | 7B | 4.1GB | `ollama run starling-lm` |
@@ -173,7 +173,7 @@ I'm a basic program that prints the famous "Hello, world!" message to the consol
The image features a yellow smiley face, which is likely the central focus of the picture.
```
### Pass the prompt as an argument
### Pass in prompt as arguments
```
$ ollama run llama3 "Summarize this file: $(cat README.md)"
@@ -258,7 +258,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Open WebUI](https://github.com/open-webui/open-webui)
- [Enchanted (macOS native)](https://github.com/AugustDev/enchanted)
- [Hollama](https://github.com/fmaclen/hollama)
- [Lollms-Webui](https://github.com/ParisNeo/lollms-webui)
- [LibreChat](https://github.com/danny-avila/LibreChat)
- [Bionic GPT](https://github.com/bionic-gpt/bionic-gpt)
@@ -285,20 +284,17 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [OllamaGUI](https://github.com/enoch1118/ollamaGUI)
- [OpenAOE](https://github.com/InternLM/OpenAOE)
- [Odin Runes](https://github.com/leonid20000/OdinRunes)
- [LLM-X](https://github.com/mrdjohnson/llm-x) (Progressive Web App)
- [LLM-X: Progressive Web App](https://github.com/mrdjohnson/llm-x)
- [AnythingLLM (Docker + MacOs/Windows/Linux native app)](https://github.com/Mintplex-Labs/anything-llm)
- [Ollama Basic Chat: Uses HyperDiv Reactive UI](https://github.com/rapidarchitect/ollama_basic_chat)
- [Ollama-chats RPG](https://github.com/drazdra/ollama-chats)
- [QA-Pilot](https://github.com/reid41/QA-Pilot) (Chat with Code Repository)
- [ChatOllama](https://github.com/sugarforever/chat-ollama) (Open Source Chatbot based on Ollama with Knowledge Bases)
- [CRAG Ollama Chat](https://github.com/Nagi-ovo/CRAG-Ollama-Chat) (Simple Web Search with Corrective RAG)
- [RAGFlow](https://github.com/infiniflow/ragflow) (Open-source Retrieval-Augmented Generation engine based on deep document understanding)
- [StreamDeploy](https://github.com/StreamDeploy-DevRel/streamdeploy-llm-app-scaffold) (LLM Application Scaffold)
- [chat](https://github.com/swuecho/chat) (chat web app for teams)
- [QA-Pilot: Chat with Code Repository](https://github.com/reid41/QA-Pilot)
- [ChatOllama: Open Source Chatbot based on Ollama with Knowledge Bases](https://github.com/sugarforever/chat-ollama)
- [CRAG Ollama Chat: Simple Web Search with Corrective RAG](https://github.com/Nagi-ovo/CRAG-Ollama-Chat)
- [RAGFlow: Open-source Retrieval-Augmented Generation engine based on deep document understanding](https://github.com/infiniflow/ragflow)
- [chat: chat web app for teams](https://github.com/swuecho/chat)
- [Lobe Chat](https://github.com/lobehub/lobe-chat) with [Integrating Doc](https://lobehub.com/docs/self-hosting/examples/ollama)
- [Ollama RAG Chatbot](https://github.com/datvodinh/rag-chatbot.git) (Local Chat with multiple PDFs using Ollama and RAG)
- [BrainSoup](https://www.nurgo-software.com/products/brainsoup) (Flexible native client with RAG & multi-agent automation)
- [macai](https://github.com/Renset/macai) (macOS client for Ollama, ChatGPT, and other compatible API back-ends)
- [Ollama RAG Chatbot: Local Chat with multiples PDFs using Ollama and RAG.](https://github.com/datvodinh/rag-chatbot.git)
### Terminal
@@ -331,7 +327,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Pacman](https://archlinux.org/packages/extra/x86_64/ollama/)
- [Helm Chart](https://artifacthub.io/packages/helm/ollama-helm/ollama)
- [Guix channel](https://codeberg.org/tusharhero/ollama-guix)
### Libraries
@@ -353,13 +348,10 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Haystack](https://github.com/deepset-ai/haystack-integrations/blob/main/integrations/ollama.md)
- [Elixir LangChain](https://github.com/brainlid/langchain)
- [Ollama for R - rollama](https://github.com/JBGruber/rollama)
- [Ollama for R - ollama-r](https://github.com/hauselin/ollama-r)
- [Ollama-ex for Elixir](https://github.com/lebrunel/ollama-ex)
- [Ollama Connector for SAP ABAP](https://github.com/b-tocs/abap_btocs_ollama)
- [Testcontainers](https://testcontainers.com/modules/ollama/)
- [Portkey](https://portkey.ai/docs/welcome/integration-guides/ollama)
- [PromptingTools.jl](https://github.com/svilupp/PromptingTools.jl) with an [example](https://svilupp.github.io/PromptingTools.jl/dev/examples/working_with_ollama)
- [LlamaScript](https://github.com/WolfTheDeveloper/llamascript)
### Mobile
- [Enchanted](https://github.com/AugustDev/enchanted)
@@ -378,13 +370,12 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Ollama Telegram Bot](https://github.com/ruecat/ollama-telegram)
- [Hass Ollama Conversation](https://github.com/ej52/hass-ollama-conversation)
- [Rivet plugin](https://github.com/abrenneke/rivet-plugin-ollama)
- [Llama Coder](https://github.com/ex3ndr/llama-coder) (Copilot alternative using Ollama)
- [Obsidian BMO Chatbot plugin](https://github.com/longy2k/obsidian-bmo-chatbot)
- [Cliobot](https://github.com/herval/cliobot) (Telegram bot with Ollama support)
- [Copilot for Obsidian plugin](https://github.com/logancyang/obsidian-copilot)
- [Obsidian Local GPT plugin](https://github.com/pfrankov/obsidian-local-gpt)
- [Open Interpreter](https://docs.openinterpreter.com/language-model-setup/local-models/ollama)
- [Llama Coder](https://github.com/ex3ndr/llama-coder) (Copilot alternative using Ollama)
- [Ollama Copilot](https://github.com/bernardo-bruning/ollama-copilot) (Proxy that allows you to use ollama as a copilot like Github copilot)
- [twinny](https://github.com/rjmacarthy/twinny) (Copilot and Copilot chat alternative using Ollama)
- [Wingman-AI](https://github.com/RussellCanfield/wingman-ai) (Copilot code and chat alternative using Ollama and HuggingFace)
- [Page Assist](https://github.com/n4ze3m/page-assist) (Chrome Extension)
@@ -393,5 +384,4 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Discord-Ollama Chat Bot](https://github.com/kevinthedang/discord-ollama) (Generalized TypeScript Discord Bot w/ Tuning Documentation)
### Supported backends
- [llama.cpp](https://github.com/ggerganov/llama.cpp) project founded by Georgi Gerganov.
- [llama.cpp](https://github.com/ggerganov/llama.cpp) project founded by Georgi Gerganov.

View File

@@ -1,16 +1,9 @@
// Package api implements the client-side API for code wishing to interact
// with the ollama service. The methods of the [Client] type correspond to
// the ollama REST API as described in [the API documentation].
// the ollama REST API as described in https://github.com/ollama/ollama/blob/main/docs/api.md
//
// The ollama command-line client itself uses this package to interact with
// the backend service.
//
// # Examples
//
// Several examples of using this package are available [in the GitHub
// repository].
//
// [the API documentation]: https://github.com/ollama/ollama/blob/main/docs/api.md
// [in the GitHub repository]: https://github.com/ollama/ollama/tree/main/examples
package api
import (
@@ -25,7 +18,6 @@ import (
"net/url"
"os"
"runtime"
"strconv"
"strings"
"github.com/ollama/ollama/format"
@@ -65,36 +57,12 @@ func checkError(resp *http.Response, body []byte) error {
// If the variable is not specified, a default ollama host and port will be
// used.
func ClientFromEnvironment() (*Client, error) {
ollamaHost, err := GetOllamaHost()
if err != nil {
return nil, err
}
return &Client{
base: &url.URL{
Scheme: ollamaHost.Scheme,
Host: net.JoinHostPort(ollamaHost.Host, ollamaHost.Port),
},
http: http.DefaultClient,
}, nil
}
type OllamaHost struct {
Scheme string
Host string
Port string
}
func GetOllamaHost() (OllamaHost, error) {
defaultPort := "11434"
hostVar := os.Getenv("OLLAMA_HOST")
hostVar = strings.TrimSpace(strings.Trim(strings.TrimSpace(hostVar), "\"'"))
scheme, hostport, ok := strings.Cut(hostVar, "://")
scheme, hostport, ok := strings.Cut(os.Getenv("OLLAMA_HOST"), "://")
switch {
case !ok:
scheme, hostport = "http", hostVar
scheme, hostport = "http", os.Getenv("OLLAMA_HOST")
case scheme == "http":
defaultPort = "80"
case scheme == "https":
@@ -114,14 +82,12 @@ func GetOllamaHost() (OllamaHost, error) {
}
}
if portNum, err := strconv.ParseInt(port, 10, 32); err != nil || portNum > 65535 || portNum < 0 {
return OllamaHost{}, ErrInvalidHostPort
}
return OllamaHost{
Scheme: scheme,
Host: host,
Port: port,
return &Client{
base: &url.URL{
Scheme: scheme,
Host: net.JoinHostPort(host, port),
},
http: http.DefaultClient,
}, nil
}
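
The doc comment above notes that ClientFromEnvironment reads OLLAMA_HOST and falls back to a default host and port (127.0.0.1:11434, per the parsing shown here). A minimal sketch of using the package from the caller's side, assuming only the exported names that appear in this diff (ClientFromEnvironment, List, ListResponse):

```go
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	// Builds a client from OLLAMA_HOST; when unset it targets http://127.0.0.1:11434.
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	// List locally available models (GET /api/tags).
	models, err := client.List(context.Background())
	if err != nil {
		log.Fatal(err)
	}
	for _, m := range models.Models {
		fmt.Println(m.Name)
	}
}
```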
@@ -306,14 +272,8 @@ func (c *Client) Pull(ctx context.Context, req *PullRequest, fn PullProgressFunc
})
}
// PushProgressFunc is a function that [Client.Push] invokes when progress is
// made.
// It's similar to other progress function types like [PullProgressFunc].
type PushProgressFunc func(ProgressResponse) error
// Push uploads a model to the model library; requires registering for ollama.ai
// and adding a public key first. fn is called each time progress is made on
// the request and can be used to display a progress bar, etc.
func (c *Client) Push(ctx context.Context, req *PushRequest, fn PushProgressFunc) error {
return c.stream(ctx, http.MethodPost, "/api/push", req, func(bts []byte) error {
var resp ProgressResponse
@@ -325,15 +285,8 @@ func (c *Client) Push(ctx context.Context, req *PushRequest, fn PushProgressFunc
})
}
// CreateProgressFunc is a function that [Client.Create] invokes when progress
// is made.
// It's similar to other progress function types like [PullProgressFunc].
type CreateProgressFunc func(ProgressResponse) error
// Create creates a model from a [Modelfile]. fn is a progress function that
// behaves similarly to other methods (see [Client.Pull]).
//
// [Modelfile]: https://github.com/ollama/ollama/blob/main/docs/modelfile.md
func (c *Client) Create(ctx context.Context, req *CreateRequest, fn CreateProgressFunc) error {
return c.stream(ctx, http.MethodPost, "/api/create", req, func(bts []byte) error {
var resp ProgressResponse
@@ -345,7 +298,6 @@ func (c *Client) Create(ctx context.Context, req *CreateRequest, fn CreateProgre
})
}
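
Pull, Push, and Create all stream ProgressResponse values into the supplied callback. A hedged sketch of pulling a model with a progress callback, reusing a client built as in the earlier example (the model name is illustrative):

```go
req := &api.PullRequest{Model: "llama3"}
err := client.Pull(context.Background(), req, func(p api.ProgressResponse) error {
	// Status, Total, and Completed are the ProgressResponse fields shown in types.go.
	fmt.Printf("%s %d/%d\n", p.Status, p.Completed, p.Total)
	return nil
})
if err != nil {
	log.Fatal(err)
}
```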
// List lists models that are available locally.
func (c *Client) List(ctx context.Context) (*ListResponse, error) {
var lr ListResponse
if err := c.do(ctx, http.MethodGet, "/api/tags", nil, &lr); err != nil {
@@ -354,8 +306,6 @@ func (c *Client) List(ctx context.Context) (*ListResponse, error) {
return &lr, nil
}
// Copy copies a model - creating a model with another name from an existing
// model.
func (c *Client) Copy(ctx context.Context, req *CopyRequest) error {
if err := c.do(ctx, http.MethodPost, "/api/copy", req, nil); err != nil {
return err
@@ -363,7 +313,6 @@ func (c *Client) Copy(ctx context.Context, req *CopyRequest) error {
return nil
}
// Delete deletes a model and its data.
func (c *Client) Delete(ctx context.Context, req *DeleteRequest) error {
if err := c.do(ctx, http.MethodDelete, "/api/delete", req, nil); err != nil {
return err
@@ -371,7 +320,6 @@ func (c *Client) Delete(ctx context.Context, req *DeleteRequest) error {
return nil
}
// Show obtains model information, including details, modelfile, license etc.
func (c *Client) Show(ctx context.Context, req *ShowRequest) (*ShowResponse, error) {
var resp ShowResponse
if err := c.do(ctx, http.MethodPost, "/api/show", req, &resp); err != nil {
@@ -380,16 +328,12 @@ func (c *Client) Show(ctx context.Context, req *ShowRequest) (*ShowResponse, err
return &resp, nil
}
// Heartbeat checks if the server has started and is responsive; if yes, it
// returns nil, otherwise an error.
func (c *Client) Heartbeat(ctx context.Context) error {
if err := c.do(ctx, http.MethodHead, "/", nil, nil); err != nil {
return err
}
return nil
}
// Embeddings generates embeddings from a model.
func (c *Client) Embeddings(ctx context.Context, req *EmbeddingRequest) (*EmbeddingResponse, error) {
var resp EmbeddingResponse
if err := c.do(ctx, http.MethodPost, "/api/embeddings", req, &resp); err != nil {
@@ -398,13 +342,10 @@ func (c *Client) Embeddings(ctx context.Context, req *EmbeddingRequest) (*Embedd
return &resp, nil
}
// CreateBlob creates a blob from a file on the server. digest is the
// expected SHA256 digest of the file, and r represents the file.
func (c *Client) CreateBlob(ctx context.Context, digest string, r io.Reader) error {
return c.do(ctx, http.MethodPost, fmt.Sprintf("/api/blobs/%s", digest), r, nil)
}
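
CreateBlob streams a file to the server along with its expected SHA256 digest. A hedged sketch of computing that digest before uploading; the "sha256:<hex>" digest format and the file name are assumptions, and the snippet presumes the usual crypto/sha256, io, and os imports plus a client as above:

```go
f, err := os.Open("model.gguf") // illustrative path
if err != nil {
	log.Fatal(err)
}
defer f.Close()

h := sha256.New()
if _, err := io.Copy(h, f); err != nil {
	log.Fatal(err)
}
digest := fmt.Sprintf("sha256:%x", h.Sum(nil)) // assumed digest format

// Rewind so CreateBlob streams the same bytes that were hashed.
if _, err := f.Seek(0, io.SeekStart); err != nil {
	log.Fatal(err)
}
if err := client.CreateBlob(context.Background(), digest, f); err != nil {
	log.Fatal(err)
}
```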
// Version returns the Ollama server version as a string.
func (c *Client) Version(ctx context.Context) (string, error) {
var version struct {
Version string `json:"version"`

View File

@@ -1,12 +1,6 @@
package api
import (
"fmt"
"net"
"testing"
"github.com/stretchr/testify/assert"
)
import "testing"
func TestClientFromEnvironment(t *testing.T) {
type testCase struct {
@@ -46,40 +40,4 @@ func TestClientFromEnvironment(t *testing.T) {
}
})
}
hostTestCases := map[string]*testCase{
"empty": {value: "", expect: "127.0.0.1:11434"},
"only address": {value: "1.2.3.4", expect: "1.2.3.4:11434"},
"only port": {value: ":1234", expect: ":1234"},
"address and port": {value: "1.2.3.4:1234", expect: "1.2.3.4:1234"},
"hostname": {value: "example.com", expect: "example.com:11434"},
"hostname and port": {value: "example.com:1234", expect: "example.com:1234"},
"zero port": {value: ":0", expect: ":0"},
"too large port": {value: ":66000", err: ErrInvalidHostPort},
"too small port": {value: ":-1", err: ErrInvalidHostPort},
"ipv6 localhost": {value: "[::1]", expect: "[::1]:11434"},
"ipv6 world open": {value: "[::]", expect: "[::]:11434"},
"ipv6 no brackets": {value: "::1", expect: "[::1]:11434"},
"ipv6 + port": {value: "[::1]:1337", expect: "[::1]:1337"},
"extra space": {value: " 1.2.3.4 ", expect: "1.2.3.4:11434"},
"extra quotes": {value: "\"1.2.3.4\"", expect: "1.2.3.4:11434"},
"extra space+quotes": {value: " \" 1.2.3.4 \" ", expect: "1.2.3.4:11434"},
"extra single quotes": {value: "'1.2.3.4'", expect: "1.2.3.4:11434"},
}
for k, v := range hostTestCases {
t.Run(k, func(t *testing.T) {
t.Setenv("OLLAMA_HOST", v.value)
oh, err := GetOllamaHost()
if err != v.err {
t.Fatalf("expected %s, got %s", v.err, err)
}
if err == nil {
host := net.JoinHostPort(oh.Host, oh.Port)
assert.Equal(t, v.expect, host, fmt.Sprintf("%s: expected %s, got %s", k, v.expect, host))
}
})
}
}

View File

@@ -4,7 +4,6 @@ import (
"encoding/json"
"errors"
"fmt"
"log/slog"
"math"
"os"
"reflect"
@@ -13,7 +12,6 @@ import (
"time"
)
// StatusError is an error with an HTTP status code.
type StatusError struct {
StatusCode int
Status string
@@ -34,7 +32,6 @@ func (e StatusError) Error() string {
}
}
// ImageData represents the raw binary data of an image file.
type ImageData []byte
// GenerateRequest describes a request sent by [Client.Generate]. While you
@@ -80,44 +77,26 @@ type GenerateRequest struct {
Options map[string]interface{} `json:"options"`
}
// ChatRequest describes a request sent by [Client.Chat].
type ChatRequest struct {
// Model is the model name, as in [GenerateRequest].
Model string `json:"model"`
// Messages is the messages of the chat - can be used to keep a chat memory.
Messages []Message `json:"messages"`
// Stream enable streaming of returned response; true by default.
Stream *bool `json:"stream,omitempty"`
// Format is the format to return the response in (e.g. "json").
Format string `json:"format"`
// KeepAlive controls how long the model will stay loaded into memory
// following the request.
Model string `json:"model"`
Messages []Message `json:"messages"`
Stream *bool `json:"stream,omitempty"`
Format string `json:"format"`
KeepAlive *Duration `json:"keep_alive,omitempty"`
// Options lists model-specific options.
Options map[string]interface{} `json:"options"`
}
// Message is a single message in a chat sequence. The message contains the
// role ("system", "user", or "assistant"), the content and an optional list
// of images.
type Message struct {
Role string `json:"role"`
Role string `json:"role"` // one of ["system", "user", "assistant"]
Content string `json:"content"`
Images []ImageData `json:"images,omitempty"`
}
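
ChatRequest and Message above form the payload sent by Client.Chat. A hedged sketch of building one, using only the fields visible in this diff (model name and prompts are illustrative; the time package is assumed, and streaming is on by default per the removed comment):

```go
stream := false
req := &api.ChatRequest{
	Model: "llama3",
	Messages: []api.Message{
		{Role: "system", Content: "You are a terse assistant."},
		{Role: "user", Content: "Why is the sky blue?"},
	},
	// A *bool is needed to turn streaming off explicitly.
	Stream:    &stream,
	KeepAlive: &api.Duration{Duration: 5 * time.Minute},
}
```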
// ChatResponse is the response returned by [Client.Chat]. Its fields are
// similar to [GenerateResponse].
type ChatResponse struct {
Model string `json:"model"`
CreatedAt time.Time `json:"created_at"`
Message Message `json:"message"`
DoneReason string `json:"done_reason,omitempty"`
Model string `json:"model"`
CreatedAt time.Time `json:"created_at"`
Message Message `json:"message"`
Done bool `json:"done"`
@@ -133,8 +112,7 @@ type Metrics struct {
EvalDuration time.Duration `json:"eval_duration,omitempty"`
}
// Options specified in [GenerateRequest], if you add a new option here add it
// to the API docs also.
// Options specified in GenerateRequest, if you add a new option here add it to the API docs also
type Options struct {
Runner
@@ -163,6 +141,7 @@ type Runner struct {
UseNUMA bool `json:"numa,omitempty"`
NumCtx int `json:"num_ctx,omitempty"`
NumBatch int `json:"num_batch,omitempty"`
NumGQA int `json:"num_gqa,omitempty"`
NumGPU int `json:"num_gpu,omitempty"`
MainGPU int `json:"main_gpu,omitempty"`
LowVRAM bool `json:"low_vram,omitempty"`
@@ -172,30 +151,14 @@ type Runner struct {
UseMMap bool `json:"use_mmap,omitempty"`
UseMLock bool `json:"use_mlock,omitempty"`
NumThread int `json:"num_thread,omitempty"`
// Unused: RopeFrequencyBase is ignored. Instead the value in the model will be used
RopeFrequencyBase float32 `json:"rope_frequency_base,omitempty"`
// Unused: RopeFrequencyScale is ignored. Instead the value in the model will be used
RopeFrequencyScale float32 `json:"rope_frequency_scale,omitempty"`
}
// EmbeddingRequest is the request passed to [Client.Embeddings].
type EmbeddingRequest struct {
// Model is the model name.
Model string `json:"model"`
// Prompt is the textual prompt to embed.
Prompt string `json:"prompt"`
// KeepAlive controls how long the model will stay loaded in memory following
// this request.
KeepAlive *Duration `json:"keep_alive,omitempty"`
// Options lists model-specific options.
Options map[string]interface{} `json:"options"`
}
// EmbeddingResponse is the response from [Client.Embeddings].
type EmbeddingResponse struct {
Embedding []float64 `json:"embedding"`
}
type TokenizeRequest struct {
Model string `json:"model"`
Prompt string `json:"prompt"`
KeepAlive *Duration `json:"keep_alive,omitempty"`
@@ -203,11 +166,10 @@ type TokenizeRequest struct {
Options map[string]interface{} `json:"options"`
}
type TokenizeResponse struct {
Tokens []int `json:"tokens"`
type EmbeddingResponse struct {
Embedding []float64 `json:"embedding"`
}
// CreateRequest is the request passed to [Client.Create].
type CreateRequest struct {
Model string `json:"model"`
Path string `json:"path"`
@@ -219,7 +181,6 @@ type CreateRequest struct {
Name string `json:"name"`
}
// DeleteRequest is the request passed to [Client.Delete].
type DeleteRequest struct {
Model string `json:"model"`
@@ -227,7 +188,6 @@ type DeleteRequest struct {
Name string `json:"name"`
}
// ShowRequest is the request passed to [Client.Show].
type ShowRequest struct {
Model string `json:"model"`
System string `json:"system"`
@@ -239,7 +199,6 @@ type ShowRequest struct {
Name string `json:"name"`
}
// ShowResponse is the response returned from [Client.Show].
type ShowResponse struct {
License string `json:"license,omitempty"`
Modelfile string `json:"modelfile,omitempty"`
@@ -250,13 +209,11 @@ type ShowResponse struct {
Messages []Message `json:"messages,omitempty"`
}
// CopyRequest is the request passed to [Client.Copy].
type CopyRequest struct {
Source string `json:"source"`
Destination string `json:"destination"`
}
// PullRequest is the request passed to [Client.Pull].
type PullRequest struct {
Model string `json:"model"`
Insecure bool `json:"insecure,omitempty"`
@@ -268,8 +225,6 @@ type PullRequest struct {
Name string `json:"name"`
}
// ProgressResponse is the response passed to progress functions like
// [PullProgressFunc] and [PushProgressFunc].
type ProgressResponse struct {
Status string `json:"status"`
Digest string `json:"digest,omitempty"`
@@ -277,7 +232,6 @@ type ProgressResponse struct {
Completed int64 `json:"completed,omitempty"`
}
// PushRequest is the request passed to [Client.Push].
type PushRequest struct {
Model string `json:"model"`
Insecure bool `json:"insecure,omitempty"`
@@ -289,12 +243,10 @@ type PushRequest struct {
Name string `json:"name"`
}
// ListResponse is the response from [Client.List].
type ListResponse struct {
Models []ModelResponse `json:"models"`
}
// ModelResponse is a single model description in [ListResponse].
type ModelResponse struct {
Name string `json:"name"`
Model string `json:"model"`
@@ -308,31 +260,17 @@ type TokenResponse struct {
Token string `json:"token"`
}
// GenerateResponse is the response passed into [GenerateResponseFunc].
type GenerateResponse struct {
// Model is the model name that generated the response.
Model string `json:"model"`
//CreatedAt is the timestamp of the response.
Model string `json:"model"`
CreatedAt time.Time `json:"created_at"`
Response string `json:"response"`
// Response is the textual response itself.
Response string `json:"response"`
// Done specifies if the response is complete.
Done bool `json:"done"`
// DoneReason is the reason the model stopped generating text.
DoneReason string `json:"done_reason,omitempty"`
// Context is an encoding of the conversation used in this response; this
// can be sent in the next request to keep a conversational memory.
Done bool `json:"done"`
Context []int `json:"context,omitempty"`
Metrics
}
// ModelDetails provides details about a model.
type ModelDetails struct {
ParentModel string `json:"parent_model"`
Format string `json:"format"`
@@ -370,7 +308,7 @@ func (m *Metrics) Summary() {
}
}
var ErrInvalidHostPort = errors.New("invalid port specified in OLLAMA_HOST")
var ErrInvalidOpts = errors.New("invalid options")
func (opts *Options) FromMap(m map[string]interface{}) error {
valueOpts := reflect.ValueOf(opts).Elem() // names of the fields in the options struct
@@ -385,76 +323,76 @@ func (opts *Options) FromMap(m map[string]interface{}) error {
}
}
invalidOpts := []string{}
for key, val := range m {
opt, ok := jsonOpts[key]
if !ok {
slog.Warn("invalid option provided", "option", opt.Name)
continue
}
if opt, ok := jsonOpts[key]; ok {
field := valueOpts.FieldByName(opt.Name)
if field.IsValid() && field.CanSet() {
if val == nil {
continue
}
field := valueOpts.FieldByName(opt.Name)
if field.IsValid() && field.CanSet() {
if val == nil {
continue
}
switch field.Kind() {
case reflect.Int:
switch t := val.(type) {
case int64:
field.SetInt(t)
case float64:
// when JSON unmarshals numbers, it uses float64, not int
field.SetInt(int64(t))
default:
return fmt.Errorf("option %q must be of type integer", key)
}
case reflect.Bool:
val, ok := val.(bool)
if !ok {
return fmt.Errorf("option %q must be of type boolean", key)
}
field.SetBool(val)
case reflect.Float32:
// JSON unmarshals to float64
val, ok := val.(float64)
if !ok {
return fmt.Errorf("option %q must be of type float32", key)
}
field.SetFloat(val)
case reflect.String:
val, ok := val.(string)
if !ok {
return fmt.Errorf("option %q must be of type string", key)
}
field.SetString(val)
case reflect.Slice:
// JSON unmarshals to []interface{}, not []string
val, ok := val.([]interface{})
if !ok {
return fmt.Errorf("option %q must be of type array", key)
}
// convert []interface{} to []string
slice := make([]string, len(val))
for i, item := range val {
str, ok := item.(string)
if !ok {
return fmt.Errorf("option %q must be of an array of strings", key)
switch field.Kind() {
case reflect.Int:
switch t := val.(type) {
case int64:
field.SetInt(t)
case float64:
// when JSON unmarshals numbers, it uses float64, not int
field.SetInt(int64(t))
default:
return fmt.Errorf("option %q must be of type integer", key)
}
slice[i] = str
case reflect.Bool:
val, ok := val.(bool)
if !ok {
return fmt.Errorf("option %q must be of type boolean", key)
}
field.SetBool(val)
case reflect.Float32:
// JSON unmarshals to float64
val, ok := val.(float64)
if !ok {
return fmt.Errorf("option %q must be of type float32", key)
}
field.SetFloat(val)
case reflect.String:
val, ok := val.(string)
if !ok {
return fmt.Errorf("option %q must be of type string", key)
}
field.SetString(val)
case reflect.Slice:
// JSON unmarshals to []interface{}, not []string
val, ok := val.([]interface{})
if !ok {
return fmt.Errorf("option %q must be of type array", key)
}
// convert []interface{} to []string
slice := make([]string, len(val))
for i, item := range val {
str, ok := item.(string)
if !ok {
return fmt.Errorf("option %q must be of an array of strings", key)
}
slice[i] = str
}
field.Set(reflect.ValueOf(slice))
default:
return fmt.Errorf("unknown type loading config params: %v", field.Kind())
}
field.Set(reflect.ValueOf(slice))
default:
return fmt.Errorf("unknown type loading config params: %v", field.Kind())
}
} else {
invalidOpts = append(invalidOpts, key)
}
}
if len(invalidOpts) > 0 {
return fmt.Errorf("%w: %v", ErrInvalidOpts, strings.Join(invalidOpts, ", "))
}
return nil
}
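
FromMap copies a JSON-decoded map onto Options via reflection, converting the float64 numbers that encoding/json produces into the integer fields. A hedged sketch using only option keys that appear in this diff:

```go
opts := api.DefaultOptions()
err := opts.FromMap(map[string]interface{}{
	"num_ctx":  float64(4096), // JSON numbers arrive as float64; FromMap converts to int
	"num_gpu":  float64(1),
	"use_mmap": true,
})
if err != nil {
	log.Fatal(err)
}
```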
// DefaultOptions is the default set of options for [GenerateRequest]; these
// values are used unless the user specifies other values explicitly.
func DefaultOptions() Options {
return Options{
// options set on request to runner
@@ -482,7 +420,8 @@ func DefaultOptions() Options {
NumCtx: 2048,
NumBatch: 512,
NumGPU: -1, // -1 here indicates that NumGPU should be set dynamically
NumThread: 0, // let the runtime decide
NumGQA: 1,
NumThread: 0, // let the runtime decide
LowVRAM: false,
F16KV: true,
UseMLock: false,
@@ -496,13 +435,6 @@ type Duration struct {
time.Duration
}
func (d Duration) MarshalJSON() ([]byte, error) {
if d.Duration < 0 {
return []byte("-1"), nil
}
return []byte("\"" + d.Duration.String() + "\""), nil
}
func (d *Duration) UnmarshalJSON(b []byte) (err error) {
var v any
if err := json.Unmarshal(b, &v); err != nil {
@@ -516,7 +448,7 @@ func (d *Duration) UnmarshalJSON(b []byte) (err error) {
if t < 0 {
d.Duration = time.Duration(math.MaxInt64)
} else {
d.Duration = time.Duration(int(t) * int(time.Second))
d.Duration = time.Duration(t * float64(time.Second))
}
case string:
d.Duration, err = time.ParseDuration(t)
@@ -526,8 +458,6 @@ func (d *Duration) UnmarshalJSON(b []byte) (err error) {
if d.Duration < 0 {
d.Duration = time.Duration(math.MaxInt64)
}
default:
return fmt.Errorf("Unsupported type: '%s'", reflect.TypeOf(v))
}
return nil

View File

@@ -21,11 +21,6 @@ func TestKeepAliveParsingFromJSON(t *testing.T) {
req: `{ "keep_alive": 42 }`,
exp: &Duration{42 * time.Second},
},
{
name: "Positive Float",
req: `{ "keep_alive": 42.5 }`,
exp: &Duration{42 * time.Second},
},
{
name: "Positive Integer String",
req: `{ "keep_alive": "42m" }`,
@@ -36,11 +31,6 @@ func TestKeepAliveParsingFromJSON(t *testing.T) {
req: `{ "keep_alive": -1 }`,
exp: &Duration{math.MaxInt64},
},
{
name: "Negative Float",
req: `{ "keep_alive": -3.14 }`,
exp: &Duration{math.MaxInt64},
},
{
name: "Negative Integer String",
req: `{ "keep_alive": "-1m" }`,
@@ -58,50 +48,3 @@ func TestKeepAliveParsingFromJSON(t *testing.T) {
})
}
}
func TestDurationMarshalUnmarshal(t *testing.T) {
tests := []struct {
name string
input time.Duration
expected time.Duration
}{
{
"negative duration",
time.Duration(-1),
time.Duration(math.MaxInt64),
},
{
"positive duration",
time.Duration(42 * time.Second),
time.Duration(42 * time.Second),
},
{
"another positive duration",
time.Duration(42 * time.Minute),
time.Duration(42 * time.Minute),
},
{
"zero duration",
time.Duration(0),
time.Duration(0),
},
{
"max duration",
time.Duration(math.MaxInt64),
time.Duration(math.MaxInt64),
},
}
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
b, err := json.Marshal(Duration{test.input})
require.NoError(t, err)
var d Duration
err = json.Unmarshal(b, &d)
require.NoError(t, err)
assert.Equal(t, test.expected, d.Duration, "input %v, marshalled %v, got %v", test.input, string(b), d.Duration)
})
}
}

1
app/.gitignore vendored
View File

@@ -1 +1,2 @@
ollama.syso
app

7
app/AppDelegate.h Normal file
View File

@@ -0,0 +1,7 @@
#import <Cocoa/Cocoa.h>
@interface AppDelegate : NSObject <NSApplicationDelegate>
- (void)applicationDidFinishLaunching:(NSNotification *)aNotification;
@end

View File

@@ -1,10 +1,6 @@
# Ollama App
## Linux
TODO
## MacOS
## macOS
TODO

76
app/app_darwin.go Normal file
View File

@@ -0,0 +1,76 @@
package main
// #cgo CFLAGS: -x objective-c
// #cgo LDFLAGS: -framework Cocoa -framework LocalAuthentication -framework ServiceManagement
// #include "app_darwin.h"
import "C"
import (
"context"
"fmt"
"log/slog"
"os"
"path/filepath"
"syscall"
)
func init() {
home, err := os.UserHomeDir()
if err != nil {
panic(err)
}
ServerLogFile = filepath.Join(home, ".ollama", "logs", "server.log")
}
func run() {
initLogging()
slog.Info("ollama macOS app started")
// Ask to move to applications directory
moving := C.askToMoveToApplications()
if moving {
return
}
C.killOtherInstances()
code := C.installSymlink()
if code != 0 {
slog.Error("Failed to install symlink")
}
exe, err := os.Executable()
if err != nil {
panic(err)
}
var options ServerOptions
ctx, cancel := context.WithCancel(context.Background())
var done chan int
done, err = SpawnServer(ctx, filepath.Join(filepath.Dir(exe), "..", "Resources", "ollama"), options)
if err != nil {
slog.Error(fmt.Sprintf("Failed to spawn ollama server %s", err))
done = make(chan int, 1)
done <- 1
}
// Run the native macOS app
// Note: this will block until the app is closed
C.run()
slog.Info("ollama macOS app closed")
cancel()
slog.Info("Waiting for ollama server to shutdown...")
if done != nil {
<-done
}
slog.Info("Ollama app exiting")
}
//export Quit
func Quit() {
syscall.Kill(os.Getpid(), syscall.SIGTERM)
}

13
app/app_darwin.h Normal file
View File

@@ -0,0 +1,13 @@
#import <Cocoa/Cocoa.h>
@interface AppDelegate : NSObject <NSApplicationDelegate>
- (void)applicationDidFinishLaunching:(NSNotification *)aNotification;
@end
void run();
void killOtherInstances();
bool askToMoveToApplications();
int createSymlinkWithAuthorization();
int installSymlink();
extern void Restart();
extern void Quit();

282
app/app_darwin.m Normal file
View File

@@ -0,0 +1,282 @@
#import <AppKit/AppKit.h>
#import <Cocoa/Cocoa.h>
#import <CoreServices/CoreServices.h>
#import <Security/Security.h>
#import <ServiceManagement/ServiceManagement.h>
#import "app_darwin.h"
@interface AppDelegate ()
@property (strong, nonatomic) NSStatusItem *statusItem;
@end
@implementation AppDelegate
- (void)applicationDidFinishLaunching:(NSNotification *)aNotification {
// show status menu
NSMenu *menu = [[NSMenu alloc] init];
NSMenuItem *aboutMenuItem = [[NSMenuItem alloc] initWithTitle:@"About Ollama" action:@selector(aboutOllama) keyEquivalent:@""];
[aboutMenuItem setTarget:self];
[menu addItem:aboutMenuItem];
// Settings submenu
NSMenu *settingsMenu = [[NSMenu alloc] initWithTitle:@"Settings"];
// Submenu items
NSMenuItem *chooseModelDirectoryItem = [[NSMenuItem alloc] initWithTitle:@"Choose model directory..." action:@selector(chooseModelDirectory) keyEquivalent:@""];
[chooseModelDirectoryItem setTarget:self];
[chooseModelDirectoryItem setEnabled:YES];
[settingsMenu addItem:chooseModelDirectoryItem];
NSMenuItem *exposeExternallyItem = [[NSMenuItem alloc] initWithTitle:@"Allow external connections" action:@selector(toggleExposeExternally:) keyEquivalent:@""];
[exposeExternallyItem setTarget:self];
[exposeExternallyItem setState:NSOffState]; // Set initial state to off
[exposeExternallyItem setEnabled:YES];
[settingsMenu addItem:exposeExternallyItem];
NSMenuItem *allowCrossOriginItem = [[NSMenuItem alloc] initWithTitle:@"Allow browser requests" action:@selector(toggleCrossOrigin:) keyEquivalent:@""];
[allowCrossOriginItem setTarget:self];
[allowCrossOriginItem setState:NSOffState]; // Set initial state to off
[allowCrossOriginItem setEnabled:YES];
[settingsMenu addItem:allowCrossOriginItem];
NSMenuItem *settingsMenuItem = [[NSMenuItem alloc] initWithTitle:@"Settings" action:nil keyEquivalent:@""];
[settingsMenuItem setSubmenu:settingsMenu];
[menu addItem:settingsMenuItem];
[menu addItemWithTitle:@"Quit Ollama" action:@selector(quit) keyEquivalent:@"q"];
self.statusItem = [[NSStatusBar systemStatusBar] statusItemWithLength:NSVariableStatusItemLength];
[self.statusItem addObserver:self forKeyPath:@"button.effectiveAppearance" options:NSKeyValueObservingOptionNew|NSKeyValueObservingOptionInitial context:nil];
self.statusItem.menu = menu;
[self showIcon];
}
- (void)aboutOllama {
[[NSApplication sharedApplication] orderFrontStandardAboutPanel:nil];
}
- (void)toggleCrossOrigin:(id)sender {
NSMenuItem *item = (NSMenuItem *)sender;
if ([item state] == NSOffState) {
// Do something when cross-origin requests are allowed
[item setState:NSOnState];
} else {
// Do something when cross-origin requests are disallowed
[item setState:NSOffState];
}
}
- (void)toggleExposeExternally:(id)sender {
NSMenuItem *item = (NSMenuItem *)sender;
if ([item state] == NSOffState) {
// Do something when Ollama is exposed externally
[item setState:NSOnState];
} else {
// Do something when Ollama is not exposed externally
[item setState:NSOffState];
}
}
- (void)chooseModelDirectory {
NSOpenPanel *openPanel = [NSOpenPanel openPanel];
[openPanel setCanChooseFiles:NO];
[openPanel setCanChooseDirectories:YES];
[openPanel setAllowsMultipleSelection:NO];
NSInteger result = [openPanel runModal];
if (result == NSModalResponseOK) {
NSURL *selectedDirectoryURL = [openPanel URLs].firstObject;
// Do something with the selected directory URL
}
}
-(void) showIcon {
NSAppearance* appearance = self.statusItem.button.effectiveAppearance;
NSString* appearanceName = (NSString*)(appearance.name);
NSString* iconName = [[appearanceName lowercaseString] containsString:@"dark"] ? @"iconDark" : @"icon";
NSImage* statusImage = [NSImage imageNamed:iconName];
[statusImage setTemplate:YES];
self.statusItem.button.image = statusImage;
}
-(void)observeValueForKeyPath:(NSString *)keyPath ofObject:(id)object change:(NSDictionary<NSKeyValueChangeKey,id> *)change context:(void *)context {
[self showIcon];
}
- (void)quit {
[NSApp stop:nil];
}
@end
void run() {
@autoreleasepool {
[NSApplication sharedApplication];
AppDelegate *appDelegate = [[AppDelegate alloc] init];
[NSApp setDelegate:appDelegate];
[NSApp run];
}
}
// killOtherInstances kills all other instances of the app currently
// running. This way we can ensure that only the most recently started
// instance of Ollama is running
void killOtherInstances() {
pid_t pid = getpid();
NSArray *all = [[NSWorkspace sharedWorkspace] runningApplications];
NSMutableArray *apps = [NSMutableArray array];
for (NSRunningApplication *app in all) {
if ([app.bundleIdentifier isEqualToString:[[NSBundle mainBundle] bundleIdentifier]] ||
[app.bundleIdentifier isEqualToString:@"ai.ollama.ollama"] ||
[app.bundleIdentifier isEqualToString:@"com.electron.ollama"]) {
if (app.processIdentifier != pid) {
[apps addObject:app];
}
}
}
for (NSRunningApplication *app in apps) {
kill(app.processIdentifier, SIGTERM);
}
NSDate *startTime = [NSDate date];
for (NSRunningApplication *app in apps) {
while (!app.terminated) {
if (-[startTime timeIntervalSinceNow] >= 5) {
kill(app.processIdentifier, SIGKILL);
break;
}
[[NSRunLoop currentRunLoop] runUntilDate:[NSDate dateWithTimeIntervalSinceNow:0.1]];
}
}
}
bool askToMoveToApplications() {
NSString *bundlePath = [[NSBundle mainBundle] bundlePath];
if ([bundlePath hasPrefix:@"/Applications"]) {
return false;
}
NSAlert *alert = [[NSAlert alloc] init];
[alert setMessageText:@"Move to Applications?"];
[alert setInformativeText:@"Ollama works best when run from the Applications directory."];
[alert addButtonWithTitle:@"Move to Applications"];
[alert addButtonWithTitle:@"Don't move"];
[NSApp activateIgnoringOtherApps:YES];
if ([alert runModal] != NSAlertFirstButtonReturn) {
return false;
}
// move to applications
NSString *applicationsPath = @"/Applications";
NSString *newPath = [applicationsPath stringByAppendingPathComponent:@"Ollama.app"];
NSFileManager *fileManager = [NSFileManager defaultManager];
// Check if the newPath already exists
if ([fileManager fileExistsAtPath:newPath]) {
NSError *removeError = nil;
[fileManager removeItemAtPath:newPath error:&removeError];
if (removeError) {
NSLog(@"Error removing file at %@: %@", newPath, removeError);
return false; // or handle the error
}
}
NSError *moveError = nil;
[fileManager moveItemAtPath:bundlePath toPath:newPath error:&moveError];
if (moveError) {
NSLog(@"Error moving file from %@ to %@: %@", bundlePath, newPath, moveError);
return false;
}
NSLog(@"Opening %@", newPath);
NSError *error = nil;
NSWorkspace *workspace = [NSWorkspace sharedWorkspace];
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
[workspace launchApplicationAtURL:[NSURL fileURLWithPath:newPath]
options:NSWorkspaceLaunchNewInstance | NSWorkspaceLaunchDefault
configuration:@{}
error:&error];
return true;
}
int installSymlink() {
NSString *linkPath = @"/usr/local/bin/ollama";
NSError *error = nil;
NSFileManager *fileManager = [NSFileManager defaultManager];
NSString *symlinkPath = [fileManager destinationOfSymbolicLinkAtPath:linkPath error:&error];
NSString *bundlePath = [[NSBundle mainBundle] bundlePath];
NSString *execPath = [[NSBundle mainBundle] executablePath];
NSString *resPath = [[NSBundle mainBundle] pathForResource:@"ollama" ofType:nil];
// if the symlink already exists and points to the right place, don't prompt
if ([symlinkPath isEqualToString:resPath]) {
NSLog(@"symbolic link already exists and points to the right place");
return 0;
}
NSString *authorizationPrompt = @"Ollama is trying to install its command line interface (CLI) tool.";
AuthorizationRef auth = NULL;
OSStatus createStatus = AuthorizationCreate(NULL, kAuthorizationEmptyEnvironment, kAuthorizationFlagDefaults, &auth);
if (createStatus != errAuthorizationSuccess) {
NSLog(@"Error creating authorization");
return -1;
}
NSString * bundleIdentifier = [[NSBundle mainBundle] bundleIdentifier];
NSString *rightNameString = [NSString stringWithFormat:@"%@.%@", bundleIdentifier, @"auth3"];
const char *rightName = rightNameString.UTF8String;
OSStatus getRightResult = AuthorizationRightGet(rightName, NULL);
if (getRightResult == errAuthorizationDenied) {
if (AuthorizationRightSet(auth, rightName, (__bridge CFTypeRef _Nonnull)(@(kAuthorizationRuleAuthenticateAsAdmin)), (__bridge CFStringRef _Nullable)(authorizationPrompt), NULL, NULL) != errAuthorizationSuccess) {
NSLog(@"Failed to set right");
return -1;
}
}
AuthorizationItem right = { .name = rightName, .valueLength = 0, .value = NULL, .flags = 0 };
AuthorizationRights rights = { .count = 1, .items = &right };
AuthorizationFlags flags = (AuthorizationFlags)(kAuthorizationFlagExtendRights | kAuthorizationFlagInteractionAllowed);
AuthorizationItem iconAuthorizationItem = {.name = kAuthorizationEnvironmentIcon, .valueLength = 0, .value = NULL, .flags = 0};
AuthorizationEnvironment authorizationEnvironment = {.count = 0, .items = NULL};
BOOL failedToUseSystemDomain = NO;
OSStatus copyStatus = AuthorizationCopyRights(auth, &rights, &authorizationEnvironment, flags, NULL);
if (copyStatus != errAuthorizationSuccess) {
failedToUseSystemDomain = YES;
if (copyStatus == errAuthorizationCanceled) {
NSLog(@"User cancelled authorization");
return -1;
} else {
NSLog(@"Failed copying system domain rights: %d", copyStatus);
return -1;
}
}
const char *toolPath = "/bin/ln";
const char *args[] = {"-s", "-F", [resPath UTF8String], "/usr/local/bin/ollama", NULL};
FILE *pipe = NULL;
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
OSStatus status = AuthorizationExecuteWithPrivileges(auth, toolPath, kAuthorizationFlagDefaults, (char *const *)args, &pipe);
if (status != errAuthorizationSuccess) {
NSLog(@"Failed to create symlink");
return -1;
}
AuthorizationFree(auth, kAuthorizationFlagDestroyRights);
return 0;
}

166
app/app_windows.go Normal file
View File

@@ -0,0 +1,166 @@
package main
import (
"context"
"errors"
"fmt"
"log"
"log/slog"
"os"
"os/exec"
"os/signal"
"path/filepath"
"strings"
"syscall"
"github.com/ollama/ollama/app/lifecycle"
"github.com/ollama/ollama/app/store"
"github.com/ollama/ollama/app/tray"
"github.com/ollama/ollama/app/updater"
)
func init() {
AppName += ".exe"
CLIName += ".exe"
// Logs, configs, downloads go to LOCALAPPDATA
localAppData := os.Getenv("LOCALAPPDATA")
AppDataDir = filepath.Join(localAppData, "Ollama")
AppLogFile = filepath.Join(AppDataDir, "app.log")
ServerLogFile = filepath.Join(AppDataDir, "server.log")
// Executables are stored in APPDATA
AppDir = filepath.Join(localAppData, "Programs", "Ollama")
// Make sure we have PATH set correctly for any spawned children
paths := strings.Split(os.Getenv("PATH"), ";")
// Start with whatever we find in the PATH/LD_LIBRARY_PATH
found := false
for _, path := range paths {
d, err := filepath.Abs(path)
if err != nil {
continue
}
if strings.EqualFold(AppDir, d) {
found = true
}
}
if !found {
paths = append(paths, AppDir)
pathVal := strings.Join(paths, ";")
slog.Debug("setting PATH=" + pathVal)
err := os.Setenv("PATH", pathVal)
if err != nil {
slog.Error(fmt.Sprintf("failed to update PATH: %s", err))
}
}
// Make sure our logging dir exists
_, err := os.Stat(AppDataDir)
if errors.Is(err, os.ErrNotExist) {
if err := os.MkdirAll(AppDataDir, 0o755); err != nil {
slog.Error(fmt.Sprintf("create ollama dir %s: %v", AppDataDir, err))
}
}
}
func ShowLogs() {
cmd_path := "c:\\Windows\\system32\\cmd.exe"
slog.Debug(fmt.Sprintf("viewing logs with start %s", AppDataDir))
cmd := exec.Command(cmd_path, "/c", "start", AppDataDir)
cmd.SysProcAttr = &syscall.SysProcAttr{HideWindow: false, CreationFlags: 0x08000000}
err := cmd.Start()
if err != nil {
slog.Error(fmt.Sprintf("Failed to open log dir: %s", err))
}
}
func Start() {
cmd_path := "c:\\Windows\\system32\\cmd.exe"
slog.Debug(fmt.Sprintf("viewing logs with start %s", AppDataDir))
cmd := exec.Command(cmd_path, "/c", "start", AppDataDir)
cmd.SysProcAttr = &syscall.SysProcAttr{HideWindow: false, CreationFlags: 0x08000000}
err := cmd.Start()
if err != nil {
slog.Error(fmt.Sprintf("Failed to open log dir: %s", err))
}
}
func run() {
initLogging()
slog.Info("ollama windows app started")
ctx, cancel := context.WithCancel(context.Background())
var done chan int
t, err := tray.NewTray()
if err != nil {
log.Fatalf("Failed to start: %s", err)
}
callbacks := t.GetCallbacks()
signals := make(chan os.Signal, 1)
signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM)
go func() {
slog.Debug("starting callback loop")
for {
select {
case <-callbacks.Quit:
slog.Debug("quit called")
t.Quit()
case <-signals:
slog.Debug("shutting down due to signal")
t.Quit()
case <-callbacks.Update:
err := updater.DoUpgrade(cancel, done)
if err != nil {
slog.Warn(fmt.Sprintf("upgrade attempt failed: %s", err))
}
case <-callbacks.ShowLogs:
ShowLogs()
case <-callbacks.DoFirstUse:
err := lifecycle.GetStarted()
if err != nil {
slog.Warn(fmt.Sprintf("Failed to launch getting started shell: %s", err))
}
}
}
}()
if !store.GetFirstTimeRun() {
slog.Debug("First time run")
err = t.DisplayFirstUseNotification()
if err != nil {
slog.Debug(fmt.Sprintf("XXX failed to display first use notification %v", err))
}
store.SetFirstTimeRun(true)
} else {
slog.Debug("Not first time, skipping first run notification")
}
if isServerRunning(ctx) {
slog.Info("Detected another instance of ollama running, exiting")
os.Exit(1)
}
done, err = SpawnServer(ctx, CLIName)
if err != nil {
// TODO - should we retry in a backoff loop?
// TODO - should we pop up a warning and maybe add a menu item to view application logs?
slog.Error(fmt.Sprintf("Failed to spawn ollama server %s", err))
done = make(chan int, 1)
done <- 1
}
updater.StartBackgroundUpdaterChecker(ctx, t.UpdateAvailable)
t.Run()
cancel()
slog.Info("Waiting for ollama server to shutdown...")
if done != nil {
<-done
}
slog.Info("Ollama app exiting")
}

View File

@@ -0,0 +1,40 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>CFBundleDisplayName</key>
<string>Ollama</string>
<key>CFBundleExecutable</key>
<string>Ollama</string>
<key>CFBundleIconFile</key>
<string>icon.icns</string>
<key>CFBundleIdentifier</key>
<string>com.ollama.ollama</string>
<key>CFBundleInfoDictionaryVersion</key>
<string>6.0</string>
<key>CFBundleName</key>
<string>Ollama</string>
<key>CFBundlePackageType</key>
<string>APPL</string>
<key>CFBundleShortVersionString</key>
<string>0.0.0</string>
<key>CFBundleVersion</key>
<string>0.0.0</string>
<key>DTCompiler</key>
<string>com.apple.compilers.llvm.clang.1_0</string>
<key>DTSDKBuild</key>
<string>22E245</string>
<key>DTSDKName</key>
<string>macosx13.3</string>
<key>DTXcode</key>
<string>1431</string>
<key>DTXcodeBuild</key>
<string>14E300c</string>
<key>LSApplicationCategoryType</key>
<string>public.app-category.developer-tools</string>
<key>LSMinimumSystemVersion</key>
<string>11.0</string>
<key>LSUIElement</key>
<true/>
</dict>
</plist>

Binary image files not shown (four new image assets added: 382 B, 691 B, 382 B, 721 B).
View File

@@ -1,5 +1,3 @@
//go:build !windows
package lifecycle
import "fmt"

View File

@@ -1,92 +0,0 @@
package lifecycle
import (
"context"
"fmt"
"log"
"log/slog"
"os"
"os/signal"
"syscall"
"github.com/ollama/ollama/app/store"
"github.com/ollama/ollama/app/tray"
)
func Run() {
InitLogging()
ctx, cancel := context.WithCancel(context.Background())
var done chan int
t, err := tray.NewTray()
if err != nil {
log.Fatalf("Failed to start: %s", err)
}
callbacks := t.GetCallbacks()
signals := make(chan os.Signal, 1)
signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM)
go func() {
slog.Debug("starting callback loop")
for {
select {
case <-callbacks.Quit:
slog.Debug("quit called")
t.Quit()
case <-signals:
slog.Debug("shutting down due to signal")
t.Quit()
case <-callbacks.Update:
err := DoUpgrade(cancel, done)
if err != nil {
slog.Warn(fmt.Sprintf("upgrade attempt failed: %s", err))
}
case <-callbacks.ShowLogs:
ShowLogs()
case <-callbacks.DoFirstUse:
err := GetStarted()
if err != nil {
slog.Warn(fmt.Sprintf("Failed to launch getting started shell: %s", err))
}
}
}
}()
// Are we first use?
if !store.GetFirstTimeRun() {
slog.Debug("First time run")
err = t.DisplayFirstUseNotification()
if err != nil {
slog.Debug(fmt.Sprintf("XXX failed to display first use notification %v", err))
}
store.SetFirstTimeRun(true)
} else {
slog.Debug("Not first time, skipping first run notification")
}
if IsServerRunning(ctx) {
slog.Info("Detected another instance of ollama running, exiting")
os.Exit(1)
} else {
done, err = SpawnServer(ctx, CLIName)
if err != nil {
// TODO - should we retry in a backoff loop?
// TODO - should we pop up a warning and maybe add a menu item to view application logs?
slog.Error(fmt.Sprintf("Failed to spawn ollama server %s", err))
done = make(chan int, 1)
done <- 1
}
}
StartBackgroundUpdaterChecker(ctx, t.UpdateAvailable)
t.Run()
cancel()
slog.Info("Waiting for ollama server to shutdown...")
if done != nil {
<-done
}
slog.Info("Ollama app exiting")
}

View File

@@ -1,9 +0,0 @@
//go:build !windows
package lifecycle
import "log/slog"
func ShowLogs() {
slog.Warn("ShowLogs not yet implemented")
}

View File

@@ -1,19 +0,0 @@
package lifecycle
import (
"fmt"
"log/slog"
"os/exec"
"syscall"
)
func ShowLogs() {
cmd_path := "c:\\Windows\\system32\\cmd.exe"
slog.Debug(fmt.Sprintf("viewing logs with start %s", AppDataDir))
cmd := exec.Command(cmd_path, "/c", "start", AppDataDir)
cmd.SysProcAttr = &syscall.SysProcAttr{HideWindow: false, CreationFlags: 0x08000000}
err := cmd.Start()
if err != nil {
slog.Error(fmt.Sprintf("Failed to open log dir: %s", err))
}
}

View File

@@ -70,10 +70,5 @@ func init() {
}
}
} else if runtime.GOOS == "darwin" {
// TODO
AppName += ".app"
// } else if runtime.GOOS == "linux" {
// TODO
}
}

View File

@@ -1,18 +1,16 @@
package lifecycle
package main
import (
"fmt"
"log/slog"
"os"
"path/filepath"
"github.com/ollama/ollama/server/envconfig"
)
func InitLogging() {
func initLogging() {
level := slog.LevelInfo
if envconfig.Debug {
if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
level = slog.LevelDebug
}
@@ -43,6 +41,4 @@ func InitLogging() {
})
slog.SetDefault(slog.New(handler))
slog.Info("ollama app started")
}

View File

@@ -2,11 +2,15 @@ package main
// Compile with the following to get rid of the cmd pop up on windows
// go build -ldflags="-H windowsgui" .
import (
"github.com/ollama/ollama/app/lifecycle"
var (
AppName string
CLIName string
AppDir string
AppDataDir string
AppLogFile string
ServerLogFile string
)
func main() {
lifecycle.Run()
run()
}

View File

@@ -1,4 +1,4 @@
package lifecycle
package main
import (
"context"
@@ -14,37 +14,28 @@ import (
"github.com/ollama/ollama/api"
)
func getCLIFullPath(command string) string {
cmdPath := ""
appExe, err := os.Executable()
if err == nil {
cmdPath = filepath.Join(filepath.Dir(appExe), command)
_, err := os.Stat(cmdPath)
if err == nil {
return cmdPath
}
}
cmdPath, err = exec.LookPath(command)
if err == nil {
_, err := os.Stat(cmdPath)
if err == nil {
return cmdPath
}
}
pwd, err := os.Getwd()
if err == nil {
cmdPath = filepath.Join(pwd, command)
_, err = os.Stat(cmdPath)
if err == nil {
return cmdPath
}
}
return command
type ServerOptions struct {
Cors bool
Expose bool
ModelsPath string
}
func start(ctx context.Context, command string) (*exec.Cmd, error) {
cmd := getCmd(ctx, getCLIFullPath(command))
func start(ctx context.Context, command string, options ServerOptions) (*exec.Cmd, error) {
cmd := getCmd(ctx, command)
// set environment variables
if options.ModelsPath != "" {
cmd.Env = append(cmd.Env, fmt.Sprintf("OLLAMA_MODELS=%s", options.ModelsPath))
}
if options.Cors {
cmd.Env = append(cmd.Env, "OLLAMA_ORIGINS=*")
}
if options.Expose {
cmd.Env = append(cmd.Env, "OLLAMA_HOST=0.0.0.0")
}
stdout, err := cmd.StdoutPipe()
if err != nil {
return nil, fmt.Errorf("failed to spawn server stdout pipe: %w", err)
@@ -59,20 +50,6 @@ func start(ctx context.Context, command string) (*exec.Cmd, error) {
if err != nil {
return nil, fmt.Errorf("failed to create server log: %w", err)
}
logDir := filepath.Dir(ServerLogFile)
_, err = os.Stat(logDir)
if err != nil {
if !errors.Is(err, os.ErrNotExist) {
return nil, fmt.Errorf("stat ollama server log dir %s: %v", logDir, err)
}
if err := os.MkdirAll(logDir, 0o755); err != nil {
return nil, fmt.Errorf("create ollama server log dir %s: %v", logDir, err)
}
}
go func() {
defer logFile.Close()
io.Copy(logFile, stdout) //nolint:errcheck
@@ -126,20 +103,25 @@ func start(ctx context.Context, command string) (*exec.Cmd, error) {
return cmd, nil
}
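
start translates ServerOptions into environment variables for the child process: ModelsPath becomes OLLAMA_MODELS, Cors becomes OLLAMA_ORIGINS=*, and Expose becomes OLLAMA_HOST=0.0.0.0. A hedged sketch of how app code might call SpawnServer with those options (the binary path is illustrative), mirroring the error handling used elsewhere in this diff:

```go
opts := ServerOptions{
	Cors:       true,          // OLLAMA_ORIGINS=*
	ModelsPath: "/tmp/models", // OLLAMA_MODELS
}
done, err := SpawnServer(ctx, "/usr/local/bin/ollama", opts)
if err != nil {
	slog.Error(fmt.Sprintf("Failed to spawn ollama server %s", err))
	done = make(chan int, 1)
	done <- 1
}
// ... after cancel(), wait for the server goroutine to exit
<-done
```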
func SpawnServer(ctx context.Context, command string) (chan int, error) {
func SpawnServer(ctx context.Context, command string, options ServerOptions) (chan int, error) {
logDir := filepath.Dir(ServerLogFile)
_, err := os.Stat(logDir)
if errors.Is(err, os.ErrNotExist) {
if err := os.MkdirAll(logDir, 0o755); err != nil {
return nil, fmt.Errorf("create ollama server log dir %s: %v", logDir, err)
}
}
done := make(chan int)
go func() {
// Keep the server running unless we're shutting down the app
crashCount := 0
for {
slog.Info("starting server...")
cmd, err := start(ctx, command)
slog.Info(fmt.Sprintf("starting server..."))
cmd, err := start(ctx, command, options)
if err != nil {
crashCount++
slog.Error(fmt.Sprintf("failed to start server %s", err))
time.Sleep(500 * time.Millisecond * time.Duration(crashCount))
continue
}
cmd.Wait() //nolint:errcheck
@@ -165,7 +147,7 @@ func SpawnServer(ctx context.Context, command string) (chan int, error) {
return done, nil
}
func IsServerRunning(ctx context.Context) bool {
func isServerRunning(ctx context.Context) bool {
client, err := api.ClientFromEnvironment()
if err != nil {
slog.Info("unable to connect to server")

View File

@@ -1,6 +1,4 @@
//go:build !windows
package lifecycle
package main
import (
"context"

View File

@@ -1,4 +1,4 @@
package lifecycle
package main
import (
"context"

View File

@@ -1,5 +1,3 @@
//go:build !windows
package tray
import (

View File

@@ -1,71 +1,71 @@
//go:build windows
package wintray
import (
"fmt"
"log/slog"
"unsafe"
"golang.org/x/sys/windows"
)
const (
updatAvailableMenuID = 1
updateMenuID = updatAvailableMenuID + 1
separatorMenuID = updateMenuID + 1
diagLogsMenuID = separatorMenuID + 1
diagSeparatorMenuID = diagLogsMenuID + 1
quitMenuID = diagSeparatorMenuID + 1
)
func (t *winTray) initMenus() error {
if err := t.addOrUpdateMenuItem(diagLogsMenuID, 0, diagLogsMenuTitle, false); err != nil {
return fmt.Errorf("unable to create menu entries %w\n", err)
}
if err := t.addSeparatorMenuItem(diagSeparatorMenuID, 0); err != nil {
return fmt.Errorf("unable to create menu entries %w", err)
}
if err := t.addOrUpdateMenuItem(quitMenuID, 0, quitMenuTitle, false); err != nil {
return fmt.Errorf("unable to create menu entries %w\n", err)
}
return nil
}
func (t *winTray) UpdateAvailable(ver string) error {
if !t.updateNotified {
slog.Debug("updating menu and sending notification for new update")
if err := t.addOrUpdateMenuItem(updatAvailableMenuID, 0, updateAvailableMenuTitle, true); err != nil {
return fmt.Errorf("unable to create menu entries %w", err)
}
if err := t.addOrUpdateMenuItem(updateMenuID, 0, updateMenutTitle, false); err != nil {
return fmt.Errorf("unable to create menu entries %w", err)
}
if err := t.addSeparatorMenuItem(separatorMenuID, 0); err != nil {
return fmt.Errorf("unable to create menu entries %w", err)
}
iconFilePath, err := iconBytesToFilePath(wt.updateIcon)
if err != nil {
return fmt.Errorf("unable to write icon data to temp file: %w", err)
}
if err := wt.setIcon(iconFilePath); err != nil {
return fmt.Errorf("unable to set icon: %w", err)
}
t.updateNotified = true
t.pendingUpdate = true
// Now pop up the notification
t.muNID.Lock()
defer t.muNID.Unlock()
copy(t.nid.InfoTitle[:], windows.StringToUTF16(updateTitle))
copy(t.nid.Info[:], windows.StringToUTF16(fmt.Sprintf(updateMessage, ver)))
t.nid.Flags |= NIF_INFO
t.nid.Timeout = 10
t.nid.Size = uint32(unsafe.Sizeof(*wt.nid))
err = t.nid.modify()
if err != nil {
return err
}
}
return nil
}

View File

@@ -1,4 +1,4 @@
package lifecycle
package updater
import (
"context"
@@ -22,6 +22,10 @@ import (
"github.com/ollama/ollama/version"
)
var (
UpdateStageDir string
)
var (
UpdateCheckURLBase = "https://ollama.com/api/update"
UpdateDownloaded = false
@@ -123,7 +127,7 @@ func DownloadNewRelease(ctx context.Context, updateResp UpdateResponse) error {
slog.Debug("no etag detected, falling back to filename based dedup")
etag = "_"
}
filename := Installer
filename := "OllamaSetup.exe"
_, params, err := mime.ParseMediaType(resp.Header.Get("content-disposition"))
if err == nil {
filename = params["filename"]

View File

@@ -1,6 +1,4 @@
//go:build !windows
package lifecycle
package updater
import (
"context"

View File

@@ -1,4 +1,4 @@
package lifecycle
package updater
import (
"context"
@@ -9,7 +9,13 @@ import (
"path/filepath"
)
func init() {
UpdateStageDir = filepath.Join(os.Getenv("LOCALAPPDATA"), "Ollama", "updates")
}
func DoUpgrade(cancel context.CancelFunc, done chan int) error {
logFile := filepath.Join(os.Getenv("LOCALAPPDATA"), "Ollama", "upgrade.log")
files, err := filepath.Glob(filepath.Join(UpdateStageDir, "*", "*.exe")) // TODO generalize for multiplatform
if err != nil {
return fmt.Errorf("failed to lookup downloads: %s", err)
@@ -23,21 +29,24 @@ func DoUpgrade(cancel context.CancelFunc, done chan int) error {
installerExe := files[0]
slog.Info("starting upgrade with " + installerExe)
slog.Info("upgrade log file " + UpgradeLogFile)
slog.Info("upgrade log file " + logFile)
// When running in debug mode, we'll be "verbose" and let the installer pop up and prompt
installArgs := []string{
"/CLOSEAPPLICATIONS", // Quit the tray app if it's still running
"/LOG=" + filepath.Base(UpgradeLogFile), // Only relative seems reliable, so set pwd
"/FORCECLOSEAPPLICATIONS", // Force close the tray app - might be needed
"/CLOSEAPPLICATIONS", // Quit the tray app if it's still running
"/LOG=" + filepath.Base(logFile), // Only relative seems reliable, so set pwd
"/FORCECLOSEAPPLICATIONS", // Force close the tray app - might be needed
}
// make the upgrade as quiet as possible (no GUI, no prompts)
// When we're not in debug mode, make the upgrade as quiet as possible (no GUI, no prompts)
// TODO - temporarily disable since we're pinning in debug mode for the preview
// if debug := os.Getenv("OLLAMA_DEBUG"); debug == "" {
installArgs = append(installArgs,
"/SP", // Skip the "This will install... Do you wish to continue" prompt
"/SUPPRESSMSGBOXES",
"/SILENT",
"/VERYSILENT",
)
// }
// Safeguard in case we have requests in flight that need to drain...
slog.Info("Waiting for server to shutdown")
@@ -50,7 +59,7 @@ func DoUpgrade(cancel context.CancelFunc, done chan int) error {
}
slog.Debug(fmt.Sprintf("starting installer: %s %v", installerExe, installArgs))
os.Chdir(filepath.Dir(UpgradeLogFile)) //nolint:errcheck
os.Chdir(filepath.Dir(logFile)) //nolint:errcheck
cmd := exec.Command(installerExe, installArgs...)
if err := cmd.Start(); err != nil {

View File

@@ -88,8 +88,8 @@ DialogFontSize=12
[Files]
Source: ".\app.exe"; DestDir: "{app}"; DestName: "{#MyAppExeName}" ; Flags: ignoreversion 64bit
Source: "..\ollama.exe"; DestDir: "{app}"; Flags: ignoreversion 64bit
Source: "..\dist\windows-{#ARCH}\*.dll"; DestDir: "{app}"; Flags: ignoreversion 64bit
Source: "..\dist\windows-{#ARCH}\ollama_runners\*"; DestDir: "{app}\ollama_runners"; Flags: ignoreversion 64bit recursesubdirs
Source: "..\dist\windows-amd64\*.dll"; DestDir: "{app}"; Flags: ignoreversion 64bit
Source: "..\dist\windows-amd64\ollama_runners\*"; DestDir: "{app}\ollama_runners"; Flags: ignoreversion 64bit recursesubdirs
Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion
Source: ".\assets\app.ico"; DestDir: "{app}"; Flags: ignoreversion
#if DirExists("..\dist\windows-amd64\rocm")

View File

View File

@@ -10,44 +10,12 @@ import (
"log/slog"
"os"
"path/filepath"
"strings"
"golang.org/x/crypto/ssh"
)
const defaultPrivateKey = "id_ed25519"
func keyPath() (string, error) {
home, err := os.UserHomeDir()
if err != nil {
return "", err
}
return filepath.Join(home, ".ollama", defaultPrivateKey), nil
}
func GetPublicKey() (string, error) {
keyPath, err := keyPath()
if err != nil {
return "", err
}
privateKeyFile, err := os.ReadFile(keyPath)
if err != nil {
slog.Info(fmt.Sprintf("Failed to load private key: %v", err))
return "", err
}
privateKey, err := ssh.ParsePrivateKey(privateKeyFile)
if err != nil {
return "", err
}
publicKey := ssh.MarshalAuthorizedKey(privateKey.PublicKey())
return strings.TrimSpace(string(publicKey)), nil
}
func NewNonce(r io.Reader, length int) (string, error) {
nonce := make([]byte, length)
if _, err := io.ReadFull(r, nonce); err != nil {
@@ -58,11 +26,13 @@ func NewNonce(r io.Reader, length int) (string, error) {
}
func Sign(ctx context.Context, bts []byte) (string, error) {
keyPath, err := keyPath()
home, err := os.UserHomeDir()
if err != nil {
return "", err
}
keyPath := filepath.Join(home, ".ollama", defaultPrivateKey)
privateKeyFile, err := os.ReadFile(keyPath)
if err != nil {
slog.Info(fmt.Sprintf("Failed to load private key: %v", err))

View File

@@ -32,12 +32,10 @@ import (
"golang.org/x/term"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/auth"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/parser"
"github.com/ollama/ollama/progress"
"github.com/ollama/ollama/server"
"github.com/ollama/ollama/types/errtypes"
"github.com/ollama/ollama/types/model"
"github.com/ollama/ollama/version"
)
@@ -56,13 +54,12 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
p := progress.NewProgress(os.Stderr)
defer p.Stop()
f, err := os.Open(filename)
modelfile, err := os.ReadFile(filename)
if err != nil {
return err
}
defer f.Close()
modelfile, err := model.ParseFile(f)
commands, err := parser.Parse(bytes.NewReader(modelfile))
if err != nil {
return err
}
@@ -76,10 +73,10 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
spinner := progress.NewSpinner(status)
p.Add(status, spinner)
for i := range modelfile.Commands {
switch modelfile.Commands[i].Name {
for _, c := range commands {
switch c.Name {
case "model", "adapter":
path := modelfile.Commands[i].Args
path := c.Args
if path == "~" {
path = home
} else if strings.HasPrefix(path, "~/") {
@@ -91,7 +88,7 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
}
fi, err := os.Stat(path)
if errors.Is(err, os.ErrNotExist) && modelfile.Commands[i].Name == "model" {
if errors.Is(err, os.ErrNotExist) && c.Name == "model" {
continue
} else if err != nil {
return err
@@ -114,7 +111,13 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
return err
}
modelfile.Commands[i].Args = "@" + digest
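// Rewrite the matching FROM/ADAPTER line in the Modelfile so it references the created blob by its digest instead of the local path.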
name := c.Name
if c.Name == "model" {
name = "from"
}
re := regexp.MustCompile(fmt.Sprintf(`(?im)^(%s)\s+%s\s*$`, name, c.Args))
modelfile = re.ReplaceAll(modelfile, []byte("$1 @"+digest))
}
}
@@ -144,7 +147,7 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
quantization, _ := cmd.Flags().GetString("quantization")
request := api.CreateRequest{Name: args[0], Modelfile: modelfile.String(), Quantization: quantization}
request := api.CreateRequest{Name: args[0], Modelfile: string(modelfile), Quantization: quantization}
if err := client.Create(cmd.Context(), &request, fn); err != nil {
return err
}
@@ -354,47 +357,6 @@ func RunHandler(cmd *cobra.Command, args []string) error {
return generateInteractive(cmd, opts)
}
func errFromUnknownKey(unknownKeyErr error) error {
// find SSH public key in the error message
sshKeyPattern := `ssh-\w+ [^\s"]+`
re := regexp.MustCompile(sshKeyPattern)
matches := re.FindStringSubmatch(unknownKeyErr.Error())
if len(matches) > 0 {
serverPubKey := matches[0]
localPubKey, err := auth.GetPublicKey()
if err != nil {
return unknownKeyErr
}
if runtime.GOOS == "linux" && serverPubKey != localPubKey {
// try the ollama service public key
svcPubKey, err := os.ReadFile("/usr/share/ollama/.ollama/id_ed25519.pub")
if err != nil {
return unknownKeyErr
}
localPubKey = strings.TrimSpace(string(svcPubKey))
}
// check if the returned public key matches the local public key, this prevents adding a remote key to the user's account
if serverPubKey != localPubKey {
return unknownKeyErr
}
var msg strings.Builder
msg.WriteString(unknownKeyErr.Error())
msg.WriteString("\n\nYour ollama key is:\n")
msg.WriteString(localPubKey)
msg.WriteString("\nAdd your key at:\n")
msg.WriteString("https://ollama.com/settings/keys")
return errors.New(msg.String())
}
return unknownKeyErr
}
func PushHandler(cmd *cobra.Command, args []string) error {
client, err := api.ClientFromEnvironment()
if err != nil {
@@ -442,20 +404,6 @@ func PushHandler(cmd *cobra.Command, args []string) error {
request := api.PushRequest{Name: args[0], Insecure: insecure}
if err := client.Push(cmd.Context(), &request, fn); err != nil {
if spinner != nil {
spinner.Stop()
}
if strings.Contains(err.Error(), "access denied") {
return errors.New("you are not authorized to push to this namespace, create the model under a namespace you own")
}
host := model.ParseName(args[0]).Host
isOllamaHost := strings.HasSuffix(host, ".ollama.ai") || strings.HasSuffix(host, ".ollama.com")
if strings.Contains(err.Error(), errtypes.UnknownOllamaKeyErrMsg) && isOllamaHost {
// the user has not added their ollama key to ollama.com
// re-throw an error with a more user-friendly message
return errFromUnknownKey(err)
}
return err
}
@@ -883,27 +831,24 @@ func generate(cmd *cobra.Command, opts runOptions) error {
}
func RunServer(cmd *cobra.Command, _ []string) error {
// retrieve the OLLAMA_HOST environment variable
ollamaHost, err := api.GetOllamaHost()
host, port, err := net.SplitHostPort(strings.Trim(os.Getenv("OLLAMA_HOST"), "\"'"))
if err != nil {
return err
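// OLLAMA_HOST is unset or not in host:port form; fall back to the defaults, but still honor a bare IP address if one was given.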
host, port = "127.0.0.1", "11434"
if ip := net.ParseIP(strings.Trim(os.Getenv("OLLAMA_HOST"), "[]")); ip != nil {
host = ip.String()
}
}
if err := initializeKeypair(); err != nil {
return err
}
ln, err := net.Listen("tcp", net.JoinHostPort(ollamaHost.Host, ollamaHost.Port))
ln, err := net.Listen("tcp", net.JoinHostPort(host, port))
if err != nil {
return err
}
err = server.Serve(ln)
if errors.Is(err, http.ErrServerClosed) {
return nil
}
return err
return server.Serve(ln)
}
func initializeKeypair() error {
@@ -1124,7 +1069,7 @@ Environment Variables:
RunE: ListHandler,
}
copyCmd := &cobra.Command{
Use: "cp SOURCE DESTINATION",
Use: "cp SOURCE TARGET",
Short: "Copy a model",
Args: cobra.ExactArgs(2),
PreRunE: checkServerHeartbeat,

View File

@@ -94,7 +94,6 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
fmt.Fprintln(os.Stderr, " /show Show model information")
fmt.Fprintln(os.Stderr, " /load <model> Load a session or model")
fmt.Fprintln(os.Stderr, " /save <model> Save your current session")
fmt.Fprintln(os.Stderr, " /clear Clear session context")
fmt.Fprintln(os.Stderr, " /bye Exit")
fmt.Fprintln(os.Stderr, " /?, /help Help for a command")
fmt.Fprintln(os.Stderr, " /? shortcuts Help for keyboard shortcuts")
@@ -162,7 +161,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
fmt.Fprintln(os.Stderr, " /set parameter repeat_penalty <float> How strongly to penalize repetitions")
fmt.Fprintln(os.Stderr, " /set parameter repeat_last_n <int> Set how far back to look for repetitions")
fmt.Fprintln(os.Stderr, " /set parameter num_gpu <int> The number of layers to send to the GPU")
fmt.Fprintln(os.Stderr, " /set parameter stop <string> <string> ... Set the stop parameters")
fmt.Fprintln(os.Stderr, " /set parameter stop \"<string>\", ... Set the stop parameters")
fmt.Fprintln(os.Stderr, "")
}
@@ -281,10 +280,6 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
}
fmt.Printf("Created new model '%s'\n", args[1])
continue
case strings.HasPrefix(line, "/clear"):
opts.Messages = []api.Message{}
fmt.Println("Cleared session context")
continue
case strings.HasPrefix(line, "/set"):
args := strings.Fields(line)
if len(args) > 1 {

View File

@@ -5,7 +5,6 @@ import (
"encoding/binary"
"encoding/json"
"fmt"
"io"
"log/slog"
"os"
"path/filepath"
@@ -48,7 +47,7 @@ type ByteOrder interface {
type ModelArch interface {
GetTensors() error
LoadVocab() error
WriteGGUF(io.WriteSeeker) error
WriteGGUF() (string, error)
}
type ModelFormat interface {

View File

@@ -94,7 +94,7 @@ func (m *GemmaModel) LoadVocab() error {
return nil
}
func (m *GemmaModel) WriteGGUF(ws io.WriteSeeker) error {
func (m *GemmaModel) WriteGGUF() (string, error) {
kv := llm.KV{
"general.architecture": "gemma",
"general.name": m.Name,
@@ -122,5 +122,16 @@ func (m *GemmaModel) WriteGGUF(ws io.WriteSeeker) error {
"tokenizer.ggml.add_eos_token": false,
}
return llm.NewGGUFV3(m.Params.ByteOrder).Encode(ws, kv, m.Tensors)
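// Encode the converted model into a temporary GGUF file and return its path to the caller.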
f, err := os.CreateTemp("", "ollama-gguf")
if err != nil {
return "", err
}
defer f.Close()
mod := llm.NewGGUFV3(m.Params.ByteOrder)
if err := mod.Encode(f, kv, m.Tensors); err != nil {
return "", err
}
return f.Name(), nil
}

View File

@@ -5,6 +5,7 @@ import (
"fmt"
"io"
"log/slog"
"os"
"regexp"
"strings"
@@ -131,7 +132,7 @@ func (m *LlamaModel) LoadVocab() error {
return nil
}
func (m *LlamaModel) WriteGGUF(ws io.WriteSeeker) error {
func (m *LlamaModel) WriteGGUF() (string, error) {
kv := llm.KV{
"general.architecture": "llama",
"general.name": m.Name,
@@ -158,5 +159,18 @@ func (m *LlamaModel) WriteGGUF(ws io.WriteSeeker) error {
"tokenizer.ggml.add_eos_token": false,
}
return llm.NewGGUFV3(m.Params.ByteOrder).Encode(ws, kv, m.Tensors)
f, err := os.CreateTemp("", "ollama-gguf")
if err != nil {
return "", err
}
defer f.Close()
mod := llm.NewGGUFV3(m.Params.ByteOrder)
if err := mod.Encode(f, kv, m.Tensors); err != nil {
return "", err
}
slog.Debug(fmt.Sprintf("gguf file = %s", f.Name()))
return f.Name(), nil
}

View File

@@ -132,7 +132,7 @@ func (m *MistralModel) LoadVocab() error {
return nil
}
func (m *MistralModel) WriteGGUF(ws io.WriteSeeker) error {
func (m *MistralModel) WriteGGUF() (string, error) {
kv := llm.KV{
"general.architecture": "llama",
"general.name": m.Name,
@@ -158,5 +158,16 @@ func (m *MistralModel) WriteGGUF(ws io.WriteSeeker) error {
"tokenizer.ggml.unknown_token_id": uint32(0),
}
return llm.NewGGUFV3(m.Params.ByteOrder).Encode(ws, kv, m.Tensors)
f, err := os.CreateTemp("", "ollama-gguf")
if err != nil {
return "", err
}
defer f.Close()
mod := llm.NewGGUFV3(m.Params.ByteOrder)
if err := mod.Encode(f, kv, m.Tensors); err != nil {
return "", err
}
return f.Name(), nil
}

View File

@@ -1,7 +1,7 @@
package convert
import (
"io"
"os"
"regexp"
"github.com/ollama/ollama/llm"
@@ -47,7 +47,7 @@ func (m *MixtralModel) LoadVocab() error {
return nil
}
func (m *MixtralModel) WriteGGUF(ws io.WriteSeeker) error {
func (m *MixtralModel) WriteGGUF() (string, error) {
kv := llm.KV{
"general.architecture": "llama",
"general.name": m.Name,
@@ -81,5 +81,16 @@ func (m *MixtralModel) WriteGGUF(ws io.WriteSeeker) error {
"tokenizer.ggml.add_eos_token": false,
}
return llm.NewGGUFV3(m.Params.ByteOrder).Encode(ws, kv, m.Tensors)
f, err := os.CreateTemp("", "ollama-gguf")
if err != nil {
return "", err
}
defer f.Close()
mod := llm.NewGGUFV3(m.Params.ByteOrder)
if err := mod.Encode(f, kv, m.Tensors); err != nil {
return "", err
}
return f.Name(), nil
}

View File

@@ -53,7 +53,7 @@ func (m *SafetensorFormat) GetTensors(dirpath string, params *Params) ([]llm.Ten
var err error
t, offset, err = m.readTensors(f, offset, params)
if err != nil {
slog.Error(err.Error())
slog.Error("%v", err)
return nil, err
}
tensors = append(tensors, t...)
@@ -122,7 +122,7 @@ func (m *SafetensorFormat) readTensors(fn string, offset uint64, params *Params)
ggufName, err := m.GetLayerName(k)
if err != nil {
slog.Error(err.Error())
slog.Error("%v", err)
return nil, 0, err
}

View File

@@ -74,7 +74,7 @@ func (tf *TorchFormat) GetTensors(dirpath string, params *Params) ([]llm.Tensor,
ggufName, err := tf.GetLayerName(k.(string))
if err != nil {
slog.Error(err.Error())
slog.Error("%v", err)
return nil, err
}
slog.Debug(fmt.Sprintf("finding name for '%s' -> '%s'", k.(string), ggufName))

View File

@@ -6,7 +6,7 @@
* [Importing models](./import.md)
* [Linux Documentation](./linux.md)
* [Windows Documentation](./windows.md)
* [Docker Documentation](./docker.md)
* [Docker Documentation](https://hub.docker.com/r/ollama/ollama)
### Reference

View File

@@ -17,7 +17,7 @@
### Model names
Model names follow a `model:tag` format, where `model` can have an optional namespace such as `example/model`. Some examples are `orca-mini:3b-q4_1` and `llama3:70b`. The tag is optional and, if not provided, will default to `latest`. The tag is used to identify a specific version.
Model names follow a `model:tag` format, where `model` can have an optional namespace such as `example/model`. Some examples are `orca-mini:3b-q4_1` and `llama2:70b`. The tag is optional and, if not provided, will default to `latest`. The tag is used to identify a specific version.
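For illustration, a rough Go sketch of how such a name breaks into its parts (a simplified reading of the format, not the server's own parser):

```go
package main

import (
	"fmt"
	"strings"
)

// splitModelName breaks a "namespace/model:tag" reference into its parts,
// defaulting the tag to "latest" when it is omitted. This is a simplified
// illustration of the naming format only.
func splitModelName(name string) (namespace, model, tag string) {
	if ns, rest, ok := strings.Cut(name, "/"); ok {
		namespace, model = ns, rest
	} else {
		model = name
	}
	if m, t, ok := strings.Cut(model, ":"); ok {
		model, tag = m, t
	} else {
		tag = "latest"
	}
	return namespace, model, tag
}

func main() {
	fmt.Println(splitModelName("example/model"))     // example model latest
	fmt.Println(splitModelName("orca-mini:3b-q4_1")) // orca-mini 3b-q4_1 (empty namespace)
}
```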
### Durations
@@ -66,7 +66,7 @@ Enable JSON mode by setting the `format` parameter to `json`. This will structur
```shell
curl http://localhost:11434/api/generate -d '{
"model": "llama3",
"model": "llama2",
"prompt": "Why is the sky blue?"
}'
```
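For reference, the same request can be issued from Go using the repository's `api` client; this is a minimal sketch, with the streaming callback and field names assumed from that package:

```go
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	// Uses OLLAMA_HOST if set, otherwise the default http://localhost:11434
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	req := &api.GenerateRequest{
		Model:  "llama2",
		Prompt: "Why is the sky blue?",
	}

	// The callback runs once per streamed JSON object in the response.
	err = client.Generate(context.Background(), req, func(resp api.GenerateResponse) error {
		fmt.Print(resp.Response)
		return nil
	})
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println()
}
```

Run it with `go run .` while the Ollama server is running locally.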
@@ -77,7 +77,7 @@ A stream of JSON objects is returned:
```json
{
"model": "llama3",
"model": "llama2",
"created_at": "2023-08-04T08:52:19.385406455-07:00",
"response": "The",
"done": false
@@ -95,11 +95,11 @@ The final response in the stream also includes additional data about the generat
- `context`: an encoding of the conversation used in this response, this can be sent in the next request to keep a conversational memory
- `response`: empty if the response was streamed, if not streamed, this will contain the full response
To calculate how fast the response is generated in tokens per second (token/s), divide `eval_count` / `eval_duration` * `10^9`.
To calculate how fast the response is generated in tokens per second (token/s), divide `eval_count` / `eval_duration`.
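As a small illustration of the arithmetic (assuming `eval_duration` is reported in nanoseconds, as in the example responses):

```go
package main

import "fmt"

// tokensPerSecond converts a final response's eval_count and eval_duration
// (reported in nanoseconds) into a generation rate in tokens per second.
func tokensPerSecond(evalCount int, evalDurationNs int64) float64 {
	return float64(evalCount) / float64(evalDurationNs) * 1e9
}

func main() {
	// Hypothetical values: 290 tokens generated over 2.9 seconds of eval time.
	fmt.Printf("%.1f token/s\n", tokensPerSecond(290, 2_900_000_000))
}
```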
```json
{
"model": "llama3",
"model": "llama2",
"created_at": "2023-08-04T19:22:45.499127Z",
"response": "",
"done": true,
@@ -121,7 +121,7 @@ A response can be received in one reply when streaming is off.
```shell
curl http://localhost:11434/api/generate -d '{
"model": "llama3",
"model": "llama2",
"prompt": "Why is the sky blue?",
"stream": false
}'
@@ -133,7 +133,7 @@ If `stream` is set to `false`, the response will be a single JSON object:
```json
{
"model": "llama3",
"model": "llama2",
"created_at": "2023-08-04T19:22:45.499127Z",
"response": "The sky is blue because it is the color of the sky.",
"done": true,
@@ -155,7 +155,7 @@ If `stream` is set to `false`, the response will be a single JSON object:
```shell
curl http://localhost:11434/api/generate -d '{
"model": "llama3",
"model": "llama2",
"prompt": "What color is the sky at different times of the day? Respond using JSON",
"format": "json",
"stream": false
@@ -166,7 +166,7 @@ curl http://localhost:11434/api/generate -d '{
```json
{
"model": "llama3",
"model": "llama2",
"created_at": "2023-11-09T21:07:55.186497Z",
"response": "{\n\"morning\": {\n\"color\": \"blue\"\n},\n\"noon\": {\n\"color\": \"blue-gray\"\n},\n\"afternoon\": {\n\"color\": \"warm gray\"\n},\n\"evening\": {\n\"color\": \"orange\"\n}\n}\n",
"done": true,
@@ -289,7 +289,7 @@ If you want to set custom options for the model at runtime rather than in the Mo
```shell
curl http://localhost:11434/api/generate -d '{
"model": "llama3",
"model": "llama2",
"prompt": "Why is the sky blue?",
"stream": false,
"options": {
@@ -313,6 +313,7 @@ curl http://localhost:11434/api/generate -d '{
"numa": false,
"num_ctx": 1024,
"num_batch": 2,
"num_gqa": 1,
"num_gpu": 1,
"main_gpu": 0,
"low_vram": false,
@@ -320,6 +321,8 @@ curl http://localhost:11434/api/generate -d '{
"vocab_only": false,
"use_mmap": true,
"use_mlock": false,
"rope_frequency_base": 1.1,
"rope_frequency_scale": 0.8,
"num_thread": 8
}
}'
@@ -329,7 +332,7 @@ curl http://localhost:11434/api/generate -d '{
```json
{
"model": "llama3",
"model": "llama2",
"created_at": "2023-08-04T19:22:45.499127Z",
"response": "The sky is blue because it is the color of the sky.",
"done": true,
@@ -351,7 +354,7 @@ If an empty prompt is provided, the model will be loaded into memory.
```shell
curl http://localhost:11434/api/generate -d '{
"model": "llama3"
"model": "llama2"
}'
```
@@ -361,7 +364,7 @@ A single JSON object is returned:
```json
{
"model": "llama3",
"model": "llama2",
"created_at": "2023-12-18T19:52:07.071755Z",
"response": "",
"done": true
@@ -404,7 +407,7 @@ Send a chat message with a streaming response.
```shell
curl http://localhost:11434/api/chat -d '{
"model": "llama3",
"model": "llama2",
"messages": [
{
"role": "user",
@@ -420,7 +423,7 @@ A stream of JSON objects is returned:
```json
{
"model": "llama3",
"model": "llama2",
"created_at": "2023-08-04T08:52:19.385406455-07:00",
"message": {
"role": "assistant",
@@ -435,7 +438,7 @@ Final response:
```json
{
"model": "llama3",
"model": "llama2",
"created_at": "2023-08-04T19:22:45.499127Z",
"done": true,
"total_duration": 4883583458,
@@ -453,7 +456,7 @@ Final response:
```shell
curl http://localhost:11434/api/chat -d '{
"model": "llama3",
"model": "llama2",
"messages": [
{
"role": "user",
@@ -468,7 +471,7 @@ curl http://localhost:11434/api/chat -d '{
```json
{
"model": "registry.ollama.ai/library/llama3:latest",
"model": "registry.ollama.ai/library/llama2:latest",
"created_at": "2023-12-12T14:13:43.416799Z",
"message": {
"role": "assistant",
@@ -492,7 +495,7 @@ Send a chat message with a conversation history. You can use this same approach
```shell
curl http://localhost:11434/api/chat -d '{
"model": "llama3",
"model": "llama2",
"messages": [
{
"role": "user",
@@ -516,7 +519,7 @@ A stream of JSON objects is returned:
```json
{
"model": "llama3",
"model": "llama2",
"created_at": "2023-08-04T08:52:19.385406455-07:00",
"message": {
"role": "assistant",
@@ -530,7 +533,7 @@ Final response:
```json
{
"model": "llama3",
"model": "llama2",
"created_at": "2023-08-04T19:22:45.499127Z",
"done": true,
"total_duration": 8113331500,
@@ -588,7 +591,7 @@ curl http://localhost:11434/api/chat -d '{
```shell
curl http://localhost:11434/api/chat -d '{
"model": "llama3",
"model": "llama2",
"messages": [
{
"role": "user",
@@ -606,7 +609,7 @@ curl http://localhost:11434/api/chat -d '{
```json
{
"model": "registry.ollama.ai/library/llama3:latest",
"model": "registry.ollama.ai/library/llama2:latest",
"created_at": "2023-12-12T14:13:43.416799Z",
"message": {
"role": "assistant",
@@ -648,7 +651,7 @@ Create a new model from a `Modelfile`.
```shell
curl http://localhost:11434/api/create -d '{
"name": "mario",
"modelfile": "FROM llama3\nSYSTEM You are mario from Super Mario Bros."
"modelfile": "FROM llama2\nSYSTEM You are mario from Super Mario Bros."
}'
```
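A minimal Go sketch of the same call using the repository's `api` client (the progress callback type is assumed from that package):

```go
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	req := &api.CreateRequest{
		Name:      "mario",
		Modelfile: "FROM llama2\nSYSTEM You are mario from Super Mario Bros.",
	}

	// Status updates are streamed back while the model is created.
	err = client.Create(context.Background(), req, func(resp api.ProgressResponse) error {
		fmt.Println(resp.Status)
		return nil
	})
	if err != nil {
		log.Fatal(err)
	}
}
```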
@@ -755,7 +758,7 @@ A single JSON object will be returned.
}
},
{
"name": "llama3:latest",
"name": "llama2:latest",
"modified_at": "2023-12-07T09:32:18.757212583-08:00",
"size": 3825819519,
"digest": "fe938a131f40e6f6d40083c9f0f430a515233eb2edaa6d72eb85c50d64f2300e",
@@ -789,7 +792,7 @@ Show information about a model including details, modelfile, template, parameter
```shell
curl http://localhost:11434/api/show -d '{
"name": "llama3"
"name": "llama2"
}'
```
@@ -824,8 +827,8 @@ Copy a model. Creates a model with another name from an existing model.
```shell
curl http://localhost:11434/api/copy -d '{
"source": "llama3",
"destination": "llama3-backup"
"source": "llama2",
"destination": "llama2-backup"
}'
```
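A minimal Go sketch of the same operation, assuming the `api` client exposes a `Copy` call mirroring this endpoint:

```go
package main

import (
	"context"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	// Copy duplicates an existing model under a new name.
	err = client.Copy(context.Background(), &api.CopyRequest{
		Source:      "llama2",
		Destination: "llama2-backup",
	})
	if err != nil {
		log.Fatal(err)
	}
}
```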
@@ -851,7 +854,7 @@ Delete a model and its data.
```shell
curl -X DELETE http://localhost:11434/api/delete -d '{
"name": "llama3:13b"
"name": "llama2:13b"
}'
```
@@ -879,7 +882,7 @@ Download a model from the ollama library. Cancelled pulls are resumed from where
```shell
curl http://localhost:11434/api/pull -d '{
"name": "llama3"
"name": "llama2"
}'
```

View File

@@ -51,7 +51,7 @@ Typically the build scripts will auto-detect CUDA, however, if your Linux distro
or installation approach uses unusual paths, you can specify the location by
specifying an environment variable `CUDA_LIB_DIR` to the location of the shared
libraries, and `CUDACXX` to the location of the nvcc compiler. You can customize
a set of target CUDA architectures by setting `CMAKE_CUDA_ARCHITECTURES` (e.g. "50;60;70")
set set of target CUDA architectues by setting `CMAKE_CUDA_ARCHITECTURES` (e.g. "50;60;70")
Then generate dependencies:
@@ -142,4 +142,4 @@ In addition to the common Windows development tools described above, install AMD
- [AMD HIP](https://www.amd.com/en/developer/resources/rocm-hub/hip-sdk.html)
- [Strawberry Perl](https://strawberryperl.com/)
Lastly, add `ninja.exe` included with MSVC to the system path (e.g. `C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\Common7\IDE\CommonExtensions\Microsoft\CMake\Ninja`).

View File

@@ -1,71 +0,0 @@
# Ollama Docker image
### CPU only
```bash
docker run -d -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama
```
### Nvidia GPU
Install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installation).
#### Install with Apt
1. Configure the repository
```bash
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
| sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \
| sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \
| sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
sudo apt-get update
```
2. Install the NVIDIA Container Toolkit packages
```bash
sudo apt-get install -y nvidia-container-toolkit
```
#### Install with Yum or Dnf
1. Configure the repository
```bash
curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo \
| sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo
```
2. Install the NVIDIA Container Toolkit packages
```bash
sudo yum install -y nvidia-container-toolkit
```
#### Configure Docker to use Nvidia driver
```
sudo nvidia-ctk runtime configure --runtime=docker
sudo systemctl restart docker
```
#### Start the container
```bash
docker run -d --gpus=all -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama
```
### AMD GPU
To run Ollama using Docker with AMD GPUs, use the `rocm` tag and the following command:
```
docker run -d --device /dev/kfd --device /dev/dri -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama:rocm
```
### Run model locally
Now you can run a model:
```
docker exec -it ollama ollama run llama3
```
### Try different models
More models can be found on the [Ollama library](https://ollama.com/library).

View File

@@ -32,7 +32,7 @@ When using the API, specify the `num_ctx` parameter:
```
curl http://localhost:11434/api/generate -d '{
"model": "llama3",
"model": "llama2",
"prompt": "Why is the sky blue?",
"options": {
"num_ctx": 4096
@@ -88,9 +88,9 @@ On windows, Ollama inherits your user and system environment variables.
3. Edit or create New variable(s) for your user account for `OLLAMA_HOST`, `OLLAMA_MODELS`, etc.
4. Click OK/Apply to save
5. Run `ollama` from a new terminal window
## How can I expose Ollama on my network?
@@ -140,7 +140,7 @@ Refer to the section [above](#how-do-i-configure-ollama-server) for how to set e
- macOS: `~/.ollama/models`
- Linux: `/usr/share/ollama/.ollama/models`
- Windows: `C:\Users\%username%\.ollama\models`
- Windows: `C:\Users\<username>\.ollama\models`
### How do I set them to a different location?
@@ -221,20 +221,14 @@ The `keep_alive` parameter can be set to:
For example, to preload a model and leave it in memory use:
```shell
curl http://localhost:11434/api/generate -d '{"model": "llama3", "keep_alive": -1}'
curl http://localhost:11434/api/generate -d '{"model": "llama2", "keep_alive": -1}'
```
To unload the model and free up memory use:
```shell
curl http://localhost:11434/api/generate -d '{"model": "llama3", "keep_alive": 0}'
curl http://localhost:11434/api/generate -d '{"model": "llama2", "keep_alive": 0}'
```
Alternatively, you can change the amount of time all models are loaded into memory by setting the `OLLAMA_KEEP_ALIVE` environment variable when starting the Ollama server. The `OLLAMA_KEEP_ALIVE` variable uses the same parameter types as the `keep_alive` parameter mentioned above. Refer to the section explaining [how to configure the Ollama server](#how-do-i-configure-ollama-server) to correctly set the environment variable.
If you wish to override the `OLLAMA_KEEP_ALIVE` setting, use the `keep_alive` API parameter with the `/api/generate` or `/api/chat` API endpoints.
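The same preload request can be sent from any HTTP client. For example, a small Go sketch against the default local endpoint:

```go
package main

import (
	"bytes"
	"log"
	"net/http"
)

func main() {
	// keep_alive: -1 keeps the model loaded indefinitely; 0 unloads it immediately.
	body := []byte(`{"model": "llama2", "keep_alive": -1}`)

	resp, err := http.Post("http://localhost:11434/api/generate", "application/json", bytes.NewReader(body))
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()
	log.Println("status:", resp.Status)
}
```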
## How do I manage the maximum number of requests the server can queue?
If too many requests are sent to the server, it will respond with a 503 error
indicating the server is overloaded. You can adjust how many requests may be
queued by setting `OLLAMA_MAX_QUEUE`.

View File

@@ -125,7 +125,7 @@ Publishing models is in early alpha. If you'd like to publish your model to shar
1. Create [an account](https://ollama.com/signup)
2. Copy your Ollama public key:
- macOS: `cat ~/.ollama/id_ed25519.pub | pbcopy`
- macOS: `cat ~/.ollama/id_ed25519.pub`
- Windows: `type %USERPROFILE%\.ollama\id_ed25519.pub`
- Linux: `cat /usr/share/ollama/.ollama/id_ed25519.pub`
3. Add your public key to your [Ollama account](https://ollama.com/settings/keys)
@@ -136,8 +136,6 @@ Next, copy your model to your username's namespace:
ollama cp example <your username>/example
```
> Note: model names may only contain lowercase letters, digits, and the characters `.`, `-`, and `_`.
Then push the model:
```

View File

@@ -105,7 +105,7 @@ sudo chmod +x /usr/bin/ollama
To view logs of Ollama running as a startup service, run:
```bash
journalctl -e -u ollama
journalctl -u ollama
```
## Uninstall

View File

@@ -10,7 +10,7 @@ A model file is the blueprint to create and share models with Ollama.
- [Examples](#examples)
- [Instructions](#instructions)
- [FROM (Required)](#from-required)
- [Build from llama3](#build-from-llama3)
- [Build from llama2](#build-from-llama2)
- [Build from a bin file](#build-from-a-bin-file)
- [PARAMETER](#parameter)
- [Valid Parameters and Values](#valid-parameters-and-values)
@@ -48,7 +48,7 @@ INSTRUCTION arguments
An example of a `Modelfile` creating a mario blueprint:
```modelfile
FROM llama3
FROM llama2
# sets the temperature to 1 [higher is more creative, lower is more coherent]
PARAMETER temperature 1
# sets the context window size to 4096, this controls how many tokens the LLM can use as context to generate the next token
@@ -67,25 +67,33 @@ To use this:
More examples are available in the [examples directory](../examples).
To view the Modelfile of a given model, use the `ollama show --modelfile` command.
### `Modelfile`s in [ollama.com/library][1]
There are two ways to view `Modelfile`s underlying the models in [ollama.com/library][1]:
- Option 1: view a details page from a model's tags page:
1. Go to a particular model's tags (e.g. https://ollama.com/library/llama2/tags)
2. Click on a tag (e.g. https://ollama.com/library/llama2:13b)
3. Scroll down to "Layers"
- Note: if the [`FROM` instruction](#from-required) is not present,
it means the model was created from a local file
- Option 2: use `ollama show` to print the `Modelfile` for any local models like so:
```bash
> ollama show --modelfile llama3
> ollama show --modelfile llama2:13b
# Modelfile generated by "ollama show"
# To build a new Modelfile based on this one, replace the FROM line with:
# FROM llama3:latest
FROM /Users/pdevine/.ollama/models/blobs/sha256-00e1317cbf74d901080d7100f57580ba8dd8de57203072dc6f668324ba545f29
TEMPLATE """{{ if .System }}<|start_header_id|>system<|end_header_id|>
# FROM llama2:13b
{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>
FROM /root/.ollama/models/blobs/sha256:123abc
TEMPLATE """[INST] {{ if .System }}<<SYS>>{{ .System }}<</SYS>>
{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>
{{ .Response }}<|eot_id|>"""
PARAMETER stop "<|start_header_id|>"
PARAMETER stop "<|end_header_id|>"
PARAMETER stop "<|eot_id|>"
PARAMETER stop "<|reserved_special_token"
{{ end }}{{ .Prompt }} [/INST] """
SYSTEM """"""
PARAMETER stop [INST]
PARAMETER stop [/INST]
PARAMETER stop <<SYS>>
PARAMETER stop <</SYS>>
```
## Instructions
@@ -98,10 +106,10 @@ The `FROM` instruction defines the base model to use when creating a model.
FROM <model name>:<tag>
```
#### Build from llama3
#### Build from llama2
```modelfile
FROM llama3
FROM llama2
```
A list of available base models:

View File

@@ -25,7 +25,7 @@ chat_completion = client.chat.completions.create(
'content': 'Say this is a test',
}
],
model='llama3',
model='llama2',
)
```
@@ -43,7 +43,7 @@ const openai = new OpenAI({
const chatCompletion = await openai.chat.completions.create({
messages: [{ role: 'user', content: 'Say this is a test' }],
model: 'llama3',
model: 'llama2',
})
```
@@ -53,7 +53,7 @@ const chatCompletion = await openai.chat.completions.create({
curl http://localhost:11434/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "llama3",
"model": "llama2",
"messages": [
{
"role": "system",
@@ -113,7 +113,7 @@ curl http://localhost:11434/v1/chat/completions \
Before using a model, pull it locally `ollama pull`:
```shell
ollama pull llama3
ollama pull llama2
```
### Default model names
@@ -121,7 +121,7 @@ ollama pull llama3
For tooling that relies on default OpenAI model names such as `gpt-3.5-turbo`, use `ollama cp` to copy an existing model name to a temporary name:
```
ollama cp llama3 gpt-3.5-turbo
ollama cp llama2 gpt-3.5-turbo
```
Afterwards, this new model name can be specified the `model` field:

View File

@@ -82,23 +82,4 @@ curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION="0.1.29" sh
If your system is configured with the "noexec" flag where Ollama stores its
temporary executable files, you can specify an alternate location by setting
OLLAMA_TMPDIR to a location writable by the user ollama runs as. For example
OLLAMA_TMPDIR=/usr/share/ollama/
## Container fails to run on NVIDIA GPU
Make sure you've set up the container runtime first as described in [docker.md](./docker.md)
Sometimes the container runtime can have difficulties initializing the GPU.
When you check the server logs, this can show up as various error codes, such
as "3" (not initialized), "46" (device unavailable), "100" (no device), "999"
(unknown), or others. The following troubleshooting techniques may help resolve
the problem:
- Is the uvm driver not loaded? `sudo nvidia-modprobe -u`
- Try reloading the nvidia_uvm driver - `sudo rmmod nvidia_uvm` then `sudo modprobe nvidia_uvm`
- Try rebooting
- Make sure you're running the latest nvidia drivers
If none of those resolve the problem, gather additional information and file an issue:
- Set `CUDA_ERROR_LEVEL=50` and try again to get more diagnostic logs
- Check dmesg for any errors `sudo dmesg | grep -i nvrm` and `sudo dmesg | grep -i nvidia`
OLLAMA_TMPDIR=/usr/share/ollama/

View File

@@ -5,17 +5,17 @@ In this tutorial, we are going to use JavaScript with LangChain and Ollama to le
To get started, let's just use **LangChain** to ask a simple question to a model. To do this with JavaScript, we need to install **LangChain**:
```bash
npm install @langchain/community
npm install langchain
```
Now we can start building out our JavaScript:
```javascript
import { Ollama } from "@langchain/community/llms/ollama";
import { Ollama } from "langchain/llms/ollama";
const ollama = new Ollama({
baseUrl: "http://localhost:11434",
model: "llama3",
model: "llama2",
});
const answer = await ollama.invoke(`why is the sky blue?`);
@@ -23,10 +23,10 @@ const answer = await ollama.invoke(`why is the sky blue?`);
console.log(answer);
```
That will get us the same thing as if we ran `ollama run llama3 "why is the sky blue"` in the terminal. But we want to load a document from the web to ask a question against. **Cheerio** is a great library for ingesting a webpage, and **LangChain** uses it in their **CheerioWebBaseLoader**. So let's install **Cheerio** and build that part of the app.
That will get us the same thing as if we ran `ollama run llama2 "why is the sky blue"` in the terminal. But we want to load a document from the web to ask a question against. **Cheerio** is a great library for ingesting a webpage, and **LangChain** uses it in their **CheerioWebBaseLoader**. So let's install **Cheerio** and build that part of the app.
```bash
npm install cheerio
```
```javascript

View File

@@ -12,17 +12,15 @@ So let's figure out how we can use **LangChain** with Ollama to ask our question
Let's start by asking a simple question that we can get an answer to from the **Llama2** model using **Ollama**. First, we need to install the **LangChain** package:
`pip install langchain_community`
`pip install langchain`
Then we can create a model and ask the question:
```python
from langchain_community.llms import Ollama
ollama = Ollama(
base_url='http://localhost:11434',
model="llama3"
)
print(ollama.invoke("why is the sky blue"))
from langchain.llms import Ollama
ollama = Ollama(base_url='http://localhost:11434',
model="llama2")
print(ollama("why is the sky blue"))
```
Notice that we are defining the model and the base URL for Ollama.

View File

@@ -1,61 +1,47 @@
# Ollama Windows Preview
Welcome to the Ollama Windows preview.
No more WSL required!
Ollama now runs as a native Windows application, including NVIDIA and AMD Radeon GPU support.
After installing Ollama Windows Preview, Ollama will run in the background and
the `ollama` command line is available in `cmd`, `powershell` or your favorite
terminal application. As usual the Ollama [api](./api.md) will be served on
`http://localhost:11434`.
As this is a preview release, you should expect a few bugs here and there. If
you run into a problem you can reach out on
[Discord](https://discord.gg/ollama), or file an
[issue](https://github.com/ollama/ollama/issues).
Logs will often be helpful in diagnosing the problem (see
[Troubleshooting](#troubleshooting) below)
## System Requirements
* Windows 10 or newer, Home or Pro
* NVIDIA 452.39 or newer Drivers if you have an NVIDIA card
* AMD Radeon Driver https://www.amd.com/en/support if you have a Radeon card
## API Access
Here's a quick example showing API access from `powershell`
```powershell
(Invoke-WebRequest -method POST -Body '{"model":"llama3", "prompt":"Why is the sky blue?", "stream": false}' -uri http://localhost:11434/api/generate ).Content | ConvertFrom-json
```
## Troubleshooting
While we're in preview, `OLLAMA_DEBUG` is always enabled, which adds
a "view logs" menu item to the app, and increses logging for the GUI app and
server.
Ollama on Windows stores files in a few different locations. You can view them in
the explorer window by hitting `<cmd>+R` and typing in:
- `explorer %LOCALAPPDATA%\Ollama` contains logs, and downloaded updates
- *app.log* contains logs from the GUI application
- *server.log* contains the server logs
- *upgrade.log* contains log output for upgrades
- `explorer %LOCALAPPDATA%\Programs\Ollama` contains the binaries (The installer adds this to your user PATH)
- `explorer %HOMEPATH%\.ollama` contains models and configuration
- `explorer %TEMP%` contains temporary executable files in one or more `ollama*` directories
## Standalone CLI
The easiest way to install Ollama on Windows is to use the `OllamaSetup.exe`
installer. It installs in your account without requiring Administrator rights.
We update Ollama regularly to support the latest models, and this installer will
help you keep up to date.
If you'd like to install or integrate Ollama as a service, a standalone
`ollama-windows-amd64.zip` zip file is available containing only the Ollama CLI
and GPU library dependencies for Nvidia and AMD. This allows for embedding
Ollama in existing applications, or running it as a system service via `ollama
serve` with tools such as [NSSM](https://nssm.cc/).
# Ollama Windows Preview
Welcome to the Ollama Windows preview.
No more WSL required!
Ollama now runs as a native Windows application, including NVIDIA and AMD Radeon GPU support.
After installing Ollama Windows Preview, Ollama will run in the background and
the `ollama` command line is available in `cmd`, `powershell` or your favorite
terminal application. As usual the Ollama [api](./api.md) will be served on
`http://localhost:11434`.
As this is a preview release, you should expect a few bugs here and there. If
you run into a problem you can reach out on
[Discord](https://discord.gg/ollama), or file an
[issue](https://github.com/ollama/ollama/issues).
Logs will often be helpful in diagnosing the problem (see
[Troubleshooting](#troubleshooting) below)
## System Requirements
* Windows 10 or newer, Home or Pro
* NVIDIA 452.39 or newer Drivers if you have an NVIDIA card
* AMD Radeon Driver https://www.amd.com/en/support if you have a Radeon card
## API Access
Here's a quick example showing API access from `powershell`
```powershell
(Invoke-WebRequest -method POST -Body '{"model":"llama2", "prompt":"Why is the sky blue?", "stream": false}' -uri http://localhost:11434/api/generate ).Content | ConvertFrom-json
```
## Troubleshooting
While we're in preview, `OLLAMA_DEBUG` is always enabled, which adds
a "view logs" menu item to the app, and increses logging for the GUI app and
server.
Ollama on Windows stores files in a few different locations. You can view them in
the explorer window by hitting `<cmd>+R` and typing in:
- `explorer %LOCALAPPDATA%\Ollama` contains logs, and downloaded updates
- *app.log* contains logs from the GUI application
- *server.log* contains the server logs
- *upgrade.log* contains log output for upgrades
- `explorer %LOCALAPPDATA%\Programs\Ollama` contains the binaries (The installer adds this to your user PATH)
- `explorer %HOMEPATH%\.ollama` contains models and configuration
- `explorer %TEMP%` contains temporary executable files in one or more `ollama*` directories

View File

@@ -0,0 +1,10 @@
# Bash Shell examples
When calling `ollama`, you can pass it a file to run all the prompts in the file, one after the other:
`ollama run llama2 < sourcequestions.txt`
This concept is used in the following example.
## Compare Models
`comparemodels.sh` is a script that runs all the questions in `sourcequestions.txt` using any 4 models you choose that you have already pulled from the Ollama library or have created locally.

View File

@@ -0,0 +1,64 @@
#! /usr/bin/env bash
# Compare multiple models by running them with the same questions
NUMBEROFCHOICES=4
SELECTIONS=()
declare -a SUMS=()
# Get the list of models
CHOICES=$(ollama list | awk '{print $1}')
# Select which models to run as a comparison
echo "Select $NUMBEROFCHOICES models to compare:"
select ITEM in $CHOICES; do
if [[ -n $ITEM ]]; then
echo "You have selected $ITEM"
SELECTIONS+=("$ITEM")
((COUNT++))
if [[ $COUNT -eq $NUMBEROFCHOICES ]]; then
break
fi
else
echo "Invalid selection"
fi
done
# Loop through each of the selected models
for ITEM in "${SELECTIONS[@]}"; do
echo "--------------------------------------------------------------"
echo "Loading the model $ITEM into memory"
ollama run "$ITEM" ""
echo "--------------------------------------------------------------"
echo "Running the questions through the model $ITEM"
COMMAND_OUTPUT=$(ollama run "$ITEM" --verbose < sourcequestions.txt 2>&1| tee /dev/stderr)
# eval duration is sometimes listed in seconds and sometimes in milliseconds.
# Add up the values for each model
SUM=$(echo "$COMMAND_OUTPUT" | awk '
/eval duration:/ {
value = $3
if (index(value, "ms") > 0) {
gsub("ms", "", value)
value /= 1000
} else {
gsub("s", "", value)
}
sum += value
}
END { print sum }')
SUMS+=("All questions for $ITEM completed in $SUM seconds")
done
echo ""
echo "--------------------------------------------------------------"
echo -e "Sums of eval durations for each run:"
for val in "${SUMS[@]}"; do
echo "$val"
done
echo "--------------------------------------------------------------"
echo "Comparison complete. Now you can decide"
echo "which model is best."
echo "--------------------------------------------------------------"

View File

@@ -0,0 +1,7 @@
Why is the sky blue
What is a black hole
Explain the big bang theory like I am 5?
What is the quickest way to win a game of Monopoly with 3 others?
Why does a vacuum bottle keep my coffee hot and my milkshake cold?
What is the difference between a meteor, a meteorite, and a meteoroid?
Create an array with 5 items and print to the console. Do this in Python, C#, Typescript, and Rust.

View File

@@ -1 +0,0 @@
fly.toml

View File

@@ -1,67 +0,0 @@
# Deploy Ollama to Fly.io
> Note: this example exposes a public endpoint and does not configure authentication. Use with care.
## Prerequisites
- Ollama: https://ollama.com/download
- Fly.io account. Sign up for a free account: https://fly.io/app/sign-up
## Steps
1. Login to Fly.io
```bash
fly auth login
```
1. Create a new Fly app
```bash
fly launch --name <name> --image ollama/ollama --internal-port 11434 --vm-size shared-cpu-8x --now
```
1. Pull and run `orca-mini:3b`
```bash
OLLAMA_HOST=https://<name>.fly.dev ollama run orca-mini:3b
```
`shared-cpu-8x` is a free-tier eligible machine type. For better performance, switch to a `performance` or `dedicated` machine type or attach a GPU for hardware acceleration (see below).
## (Optional) Persistent Volume
By default Fly Machines use ephemeral storage which is problematic if you want to use the same model across restarts without pulling it again. Create and attach a persistent volume to store the downloaded models:
1. Create the Fly Volume
```bash
fly volume create ollama
```
1. Update `fly.toml` and add `[mounts]`
```toml
[mounts]
source = "ollama"
destination = "/mnt/ollama/models"
```
1. Update `fly.toml` and add `[env]`
```toml
[env]
OLLAMA_MODELS = "/mnt/ollama/models"
```
1. Deploy your app
```bash
fly deploy
```
## (Optional) Hardware Acceleration
Fly.io GPU is currently in waitlist. Sign up for the waitlist: https://fly.io/gpu
Once you've been accepted, create the app with the additional flags `--vm-gpu-kind a100-pcie-40gb` or `--vm-gpu-kind a100-pcie-80gb`.

View File

@@ -35,7 +35,7 @@ func main() {
ctx := context.Background()
req := &api.ChatRequest{
Model: "llama3",
Model: "llama2",
Messages: messages,
}

View File

@@ -19,7 +19,7 @@ func main() {
}
defer resp.Body.Close()
responseData, err := io.ReadAll(resp.Body)
if err != nil {
log.Fatal(err)

View File

@@ -7,24 +7,12 @@
## Steps
1. Create the Ollama namespace, deployment, and service
1. Create the Ollama namespace, daemon set, and service
```bash
kubectl apply -f cpu.yaml
```
## (Optional) Hardware Acceleration
Hardware acceleration in Kubernetes requires NVIDIA's [`k8s-device-plugin`](https://github.com/NVIDIA/k8s-device-plugin), which is deployed in Kubernetes as a daemonset. Follow the link for more details.
Once configured, create a GPU enabled Ollama deployment.
```bash
kubectl apply -f gpu.yaml
```
## Test
1. Port forward the Ollama service to connect and use it locally
```bash
@@ -35,4 +23,14 @@ kubectl apply -f gpu.yaml
```bash
ollama run orca-mini:3b
```
```
## (Optional) Hardware Acceleration
Hardware acceleration in Kubernetes requires NVIDIA's [`k8s-device-plugin`](https://github.com/NVIDIA/k8s-device-plugin). Follow the link for more details.
Once configured, create a GPU enabled Ollama deployment.
```bash
kubectl apply -f gpu.yaml
```

View File

@@ -40,9 +40,9 @@ while True:
continue
# Prompt
template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use three sentences maximum and keep the answer as concise as possible.
template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use three sentences maximum and keep the answer as concise as possible.
{context}
Question: {question}
Helpful Answer:"""
@@ -51,11 +51,11 @@ while True:
template=template,
)
llm = Ollama(model="llama3:8b", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]))
llm = Ollama(model="llama2:13b", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]))
qa_chain = RetrievalQA.from_chain_type(
llm,
retriever=vectorstore.as_retriever(),
chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
)
result = qa_chain({"query": query})

View File

@@ -1,12 +1,12 @@
from langchain_community.llms import Ollama
from langchain_community.document_loaders import WebBaseLoader
from langchain.llms import Ollama
from langchain.document_loaders import WebBaseLoader
from langchain.chains.summarize import load_summarize_chain
loader = WebBaseLoader("https://ollama.com/blog/run-llama2-uncensored-locally")
docs = loader.load()
llm = Ollama(model="llama3")
llm = Ollama(model="llama2")
chain = load_summarize_chain(llm, chain_type="stuff")
result = chain.invoke(docs)
result = chain.run(docs)
print(result)

View File

@@ -4,10 +4,10 @@ This example is a basic "hello world" of using LangChain with Ollama.
## Running the Example
1. Ensure you have the `llama3` model installed:
1. Ensure you have the `llama2` model installed:
```bash
ollama pull llama3
ollama pull llama2
```
2. Install the Python Requirements.
@@ -21,3 +21,4 @@ This example is a basic "hello world" of using LangChain with Ollama.
```bash
python main.py
```

View File

@@ -1,6 +1,6 @@
from langchain.llms import Ollama
input = input("What is your question?")
llm = Ollama(model="llama3")
llm = Ollama(model="llama2")
res = llm.predict(input)
print (res)

View File

@@ -1,4 +1,4 @@
FROM llama3
FROM llama2
PARAMETER temperature 1
SYSTEM """
You are Mario from super mario bros, acting as an assistant.

View File

@@ -2,12 +2,12 @@
# Example character: Mario
This example shows how to create a basic character using Llama3 as the base model.
This example shows how to create a basic character using Llama2 as the base model.
To run this example:
1. Download the Modelfile
2. `ollama pull llama3` to get the base model used in the model file.
2. `ollama pull llama2` to get the base model used in the model file.
3. `ollama create NAME -f ./Modelfile`
4. `ollama run NAME`
@@ -18,7 +18,7 @@ Ask it some questions like "Who are you?" or "Is Peach in trouble again?"
What the model file looks like:
```
FROM llama3
FROM llama2
PARAMETER temperature 1
SYSTEM """
You are Mario from Super Mario Bros, acting as an assistant.

View File

@@ -2,16 +2,16 @@ import requests
import json
import random
model = "llama3"
model = "llama2"
template = {
"firstName": "",
"lastName": "",
"firstName": "",
"lastName": "",
"address": {
"street": "",
"city": "",
"state": "",
"street": "",
"city": "",
"state": "",
"zipCode": ""
},
},
"phoneNumber": ""
}

View File

@@ -12,7 +12,7 @@ countries = [
"France",
]
country = random.choice(countries)
model = "llama3"
model = "llama2"
prompt = f"generate one realistically believable sample data set of a persons first name, last name, address in {country}, and phone number. Do not use common names. Respond using JSON. Key names should have no backslashes, values should use plain ascii with no special characters."

View File

@@ -6,10 +6,10 @@ There are two python scripts in this example. `randomaddresses.py` generates ran
## Running the Example
1. Ensure you have the `llama3` model installed:
1. Ensure you have the `llama2` model installed:
```bash
ollama pull llama3
ollama pull llama2
```
2. Install the Python Requirements.

View File

@@ -2,7 +2,7 @@ import json
import requests
# NOTE: ollama must be running for this to work, start the ollama app or run `ollama serve`
model = "llama3" # TODO: update this for whatever model you wish to use
model = "llama2" # TODO: update this for whatever model you wish to use
def chat(messages):

View File

@@ -4,10 +4,10 @@ The **chat** endpoint is one of two ways to generate text from an LLM with Ollam
## Running the Example
1. Ensure you have the `llama3` model installed:
1. Ensure you have the `llama2` model installed:
```bash
ollama pull llama3
ollama pull llama2
```
2. Install the Python Requirements.

View File

@@ -4,10 +4,10 @@ This example demonstrates how one would create a set of 'mentors' you can have a
## Usage
1. Add llama3 to have the mentors ask your questions:
1. Add llama2 to have the mentors ask your questions:
```bash
ollama pull llama3
ollama pull llama2
```
2. Install prerequisites:

View File

@@ -15,7 +15,7 @@ async function characterGenerator() {
ollama.setModel("stablebeluga2:70b-q4_K_M");
const bio = await ollama.generate(`create a bio of ${character} in a single long paragraph. Instead of saying '${character} is...' or '${character} was...' use language like 'You are...' or 'You were...'. Then create a paragraph describing the speaking mannerisms and style of ${character}. Don't include anything about how ${character} looked or what they sounded like, just focus on the words they said. Instead of saying '${character} would say...' use language like 'You should say...'. If you use quotes, always use single quotes instead of double quotes. If there are any specific words or phrases you used a lot, show how you used them. `);
const thecontents = `FROM llama3\nSYSTEM """\n${bio.response.replace(/(\r\n|\n|\r)/gm, " ").replace('would', 'should')} All answers to questions should be related back to what you are most known for.\n"""`;
const thecontents = `FROM llama2\nSYSTEM """\n${bio.response.replace(/(\r\n|\n|\r)/gm, " ").replace('would', 'should')} All answers to questions should be related back to what you are most known for.\n"""`;
fs.writeFile(path.join(directory, 'Modelfile'), thecontents, (err: any) => {
if (err) throw err;
@@ -23,4 +23,4 @@ async function characterGenerator() {
});
}
characterGenerator();

View File

@@ -1,6 +1,6 @@
import * as readline from "readline";
const model = "llama3";
const model = "llama2";
type Message = {
role: "assistant" | "user" | "system";
content: string;
@@ -74,4 +74,4 @@ async function main() {
}
main();

View File

@@ -53,8 +53,6 @@ func HumanBytes(b int64) string {
func HumanBytes2(b uint64) string {
switch {
case b >= GibiByte:
return fmt.Sprintf("%.1f GiB", float64(b)/GibiByte)
case b >= MebiByte:
return fmt.Sprintf("%.1f MiB", float64(b)/MebiByte)
case b >= KibiByte:

View File

@@ -13,20 +13,12 @@ const (
func HumanNumber(b uint64) string {
switch {
case b >= Billion:
number := float64(b) / Billion
if number == math.Floor(number) {
return fmt.Sprintf("%.0fB", number) // no decimals if whole number
}
return fmt.Sprintf("%.1fB", number) // one decimal if not a whole number
case b >= Million:
number := float64(b) / Million
if number == math.Floor(number) {
return fmt.Sprintf("%.0fM", number) // no decimals if whole number
}
return fmt.Sprintf("%.2fM", number) // two decimals if not a whole number
case b >= Thousand:
return fmt.Sprintf("%.0fK", float64(b)/Thousand)
case b > Billion:
return fmt.Sprintf("%.0fB", math.Round(float64(b)/Billion))
case b > Million:
return fmt.Sprintf("%.0fM", math.Round(float64(b)/Million))
case b > Thousand:
return fmt.Sprintf("%.0fK", math.Round(float64(b)/Thousand))
default:
return fmt.Sprintf("%d", b)
}

View File

@@ -1,34 +0,0 @@
package format
import (
"testing"
)
func TestHumanNumber(t *testing.T) {
type testCase struct {
input uint64
expected string
}
testCases := []testCase{
{0, "0"},
{1000000, "1M"},
{125000000, "125M"},
{500500000, "500.50M"},
{500550000, "500.55M"},
{1000000000, "1B"},
{2800000000, "2.8B"},
{2850000000, "2.9B"},
{1000000000000, "1000B"},
}
for _, tc := range testCases {
t.Run(tc.expected, func(t *testing.T) {
result := HumanNumber(tc.input)
if result != tc.expected {
t.Errorf("Expected %s, got %s", tc.expected, result)
}
})
}
}

View File

@@ -81,10 +81,8 @@ func commonAMDValidateLibDir() (string, error) {
}
// Well known location(s)
for _, path := range RocmStandardLocations {
if rocmLibUsable(path) {
return path, nil
}
if rocmLibUsable(RocmStandardLocation) {
return RocmStandardLocation, nil
}
// Installer payload location if we're running the installed binary

View File

@@ -3,6 +3,7 @@ package gpu
import (
"fmt"
"log/slog"
"strconv"
"syscall"
"unsafe"
@@ -73,22 +74,16 @@ func (hl *HipLib) Release() {
hl.dll = 0
}
func (hl *HipLib) AMDDriverVersion() (driverMajor, driverMinor int, err error) {
func (hl *HipLib) AMDDriverVersion() (string, error) {
if hl.dll == 0 {
return 0, 0, fmt.Errorf("dll has been unloaded")
return "", fmt.Errorf("dll has been unloaded")
}
var version int
status, _, err := syscall.SyscallN(hl.hipDriverGetVersion, uintptr(unsafe.Pointer(&version)))
if status != hipSuccess {
return 0, 0, fmt.Errorf("failed call to hipDriverGetVersion: %d %s", status, err)
return "", fmt.Errorf("failed call to hipDriverGetVersion: %d %s", status, err)
}
slog.Debug("hipDriverGetVersion", "version", version)
// TODO - this isn't actually right, but the docs claim hipDriverGetVersion isn't accurate anyway...
driverMajor = version / 1000
driverMinor = (version - (driverMajor * 1000)) / 10
return driverMajor, driverMinor, nil
return strconv.Itoa(version), nil
}
func (hl *HipLib) HipGetDeviceCount() int {

View File

@@ -8,7 +8,6 @@ import (
"log/slog"
"os"
"path/filepath"
"regexp"
"slices"
"strconv"
"strings"
@@ -26,12 +25,12 @@ const (
// Prefix with the node dir
GPUTotalMemoryFileGlob = "mem_banks/*/properties" // size_in_bytes line
GPUUsedMemoryFileGlob = "mem_banks/*/used_memory"
RocmStandardLocation = "/opt/rocm/lib"
)
var (
// Used to validate if the given ROCm lib is usable
ROCmLibGlobs = []string{"libhipblas.so.2*", "rocblas"} // TODO - probably include more coverage of files here...
RocmStandardLocations = []string{"/opt/rocm/lib", "/usr/lib64"}
ROCmLibGlobs = []string{"libhipblas.so.2*", "rocblas"} // TODO - probably include more coverage of files here...
)
// Gather GPU information from the amdgpu driver if any supported GPUs are detected
@@ -42,8 +41,10 @@ func AMDGetGPUInfo() []GpuInfo {
}
// Opportunistic logging of driver version to aid in troubleshooting
driverMajor, driverMinor, err := AMDDriverVersion()
if err != nil {
ver, err := AMDDriverVersion()
if err == nil {
slog.Info("AMD Driver: " + ver)
} else {
// TODO - if we see users crash and burn with the upstreamed kernel this can be adjusted to hard-fail rocm support and fallback to CPU
slog.Warn("ollama recommends running the https://www.amd.com/en/support/linux-drivers", "error", err)
}
@@ -90,7 +91,6 @@ func AMDGetGPUInfo() []GpuInfo {
scanner := bufio.NewScanner(fp)
isCPU := false
var major, minor, patch uint64
var vendor, device uint64
for scanner.Scan() {
line := strings.TrimSpace(scanner.Text())
// Note: we could also use "cpu_cores_count X" where X is greater than zero to detect CPUs
@@ -118,26 +118,6 @@ func AMDGetGPUInfo() []GpuInfo {
slog.Debug("malformed int " + line)
continue
}
} else if strings.HasPrefix(line, "vendor_id") {
ver := strings.Fields(line)
if len(ver) != 2 {
slog.Debug("malformed vendor_id", "vendor_id", line)
continue
}
vendor, err = strconv.ParseUint(ver[1], 10, 32)
if err != nil {
slog.Debug("malformed vendor_id" + line)
}
} else if strings.HasPrefix(line, "device_id") {
ver := strings.Fields(line)
if len(ver) != 2 {
slog.Debug("malformed device_id", "device_id", line)
continue
}
device, err = strconv.ParseUint(ver[1], 10, 32)
if err != nil {
slog.Debug("malformed device_id" + line)
}
}
// TODO - any other properties we want to extract and record?
@@ -160,7 +140,7 @@ func AMDGetGPUInfo() []GpuInfo {
}
if int(major) < RocmComputeMin {
slog.Warn(fmt.Sprintf("amdgpu too old gfx%d%x%x", major, minor, patch), "gpu", gpuID)
slog.Warn(fmt.Sprintf("amdgpu too old gfx%d%d%x", major, minor, patch), "gpu", gpuID)
continue
}
@@ -230,29 +210,24 @@ func AMDGetGPUInfo() []GpuInfo {
// iGPU detection, remove this check once we can support an iGPU variant of the rocm library
if totalMemory < IGPUMemLimit {
slog.Info("unsupported Radeon iGPU detected skipping", "id", gpuID, "total", format.HumanBytes2(totalMemory))
slog.Info("amdgpu appears to be an iGPU, skipping", "gpu", gpuID, "total", format.HumanBytes2(totalMemory))
continue
}
var name string
// TODO - PCI ID lookup
if vendor > 0 && device > 0 {
name = fmt.Sprintf("%04x:%04x", vendor, device)
}
slog.Debug("amdgpu memory", "gpu", gpuID, "total", format.HumanBytes2(totalMemory))
slog.Debug("amdgpu memory", "gpu", gpuID, "available", format.HumanBytes2(totalMemory-usedMemory))
slog.Info("amdgpu memory", "gpu", gpuID, "total", format.HumanBytes2(totalMemory))
slog.Info("amdgpu memory", "gpu", gpuID, "available", format.HumanBytes2(totalMemory-usedMemory))
gpuInfo := GpuInfo{
Library: "rocm",
memInfo: memInfo{
TotalMemory: totalMemory,
FreeMemory: (totalMemory - usedMemory),
},
ID: fmt.Sprintf("%d", gpuID),
Name: name,
Compute: fmt.Sprintf("gfx%d%x%x", major, minor, patch),
ID: fmt.Sprintf("%d", gpuID),
// Name: not exposed in sysfs directly, would require pci device id lookup
Major: int(major),
Minor: int(minor),
Patch: int(patch),
MinimumMemory: rocmMinimumMemory,
DriverMajor: driverMajor,
DriverMinor: driverMinor,
}
// If the user wants to filter to a subset of devices, filter out if we aren't a match
@@ -291,7 +266,7 @@ func AMDGetGPUInfo() []GpuInfo {
}
slog.Debug("rocm supported GPUs", "types", supported)
}
gfx := gpuInfo.Compute
gfx := fmt.Sprintf("gfx%d%d%x", gpuInfo.Major, gpuInfo.Minor, gpuInfo.Patch)
if !slices.Contains[[]string, string](supported, gfx) {
slog.Warn("amdgpu is not supported", "gpu", gpuInfo.ID, "gpu_type", gfx, "library", libDir, "supported_types", supported)
// TODO - consider discrete markdown just for ROCM troubleshooting?
@@ -301,7 +276,7 @@ func AMDGetGPUInfo() []GpuInfo {
slog.Info("amdgpu is supported", "gpu", gpuInfo.ID, "gpu_type", gfx)
}
} else {
slog.Info("skipping rocm gfx compatibility check", "HSA_OVERRIDE_GFX_VERSION", gfxOverride)
slog.Debug("skipping rocm gfx compatibility check with HSA_OVERRIDE_GFX_VERSION=" + gfxOverride)
}
// The GPU has passed all the verification steps and is supported
@@ -347,34 +322,19 @@ func AMDValidateLibDir() (string, error) {
return "", fmt.Errorf("no suitable rocm found, falling back to CPU")
}
func AMDDriverVersion() (driverMajor, driverMinor int, err error) {
_, err = os.Stat(DriverVersionFile)
func AMDDriverVersion() (string, error) {
_, err := os.Stat(DriverVersionFile)
if err != nil {
return 0, 0, fmt.Errorf("amdgpu version file missing: %s %w", DriverVersionFile, err)
return "", fmt.Errorf("amdgpu version file missing: %s %w", DriverVersionFile, err)
}
fp, err := os.Open(DriverVersionFile)
if err != nil {
return 0, 0, err
return "", err
}
defer fp.Close()
verString, err := io.ReadAll(fp)
if err != nil {
return 0, 0, err
return "", err
}
pattern := `\A(\d+)\.(\d+).*`
regex := regexp.MustCompile(pattern)
match := regex.FindStringSubmatch(string(verString))
if len(match) < 2 {
return 0, 0, fmt.Errorf("malformed version string %s", string(verString))
}
driverMajor, err = strconv.Atoi(match[1])
if err != nil {
return 0, 0, err
}
driverMinor, err = strconv.Atoi(match[2])
if err != nil {
return 0, 0, err
}
return driverMajor, driverMinor, nil
return strings.TrimSpace(string(verString)), nil
}
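
For context on the hunk above, here is a small standalone sketch of the regexp-based major/minor split that the removed lines performed; the helper name and the sample version string are hypothetical and not part of the repository.

```go
// Standalone sketch of the regexp-based driver-version split shown above;
// parseDriverVersion and the sample input are hypothetical.
package main

import (
	"fmt"
	"regexp"
	"strconv"
	"strings"
)

func parseDriverVersion(verString string) (major, minor int, err error) {
	pattern := `\A(\d+)\.(\d+).*`
	match := regexp.MustCompile(pattern).FindStringSubmatch(strings.TrimSpace(verString))
	if len(match) < 3 {
		return 0, 0, fmt.Errorf("malformed version string %s", verString)
	}
	if major, err = strconv.Atoi(match[1]); err != nil {
		return 0, 0, err
	}
	if minor, err = strconv.Atoi(match[2]); err != nil {
		return 0, 0, err
	}
	return major, minor, nil
}

func main() {
	major, minor, err := parseDriverVersion("6.3.6\n")
	fmt.Println(major, minor, err) // 6 3 <nil>
}
```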

View File

@@ -7,12 +7,14 @@ import (
"os"
"path/filepath"
"slices"
"strconv"
"strings"
"github.com/ollama/ollama/format"
)
const (
RocmStandardLocation = "C:\\Program Files\\AMD\\ROCm\\5.7\\bin" // TODO glob?
// TODO We're looking for this exact name to detect iGPUs since hipGetDeviceProperties never reports integrated==true
iGPUName = "AMD Radeon(TM) Graphics"
@@ -20,8 +22,7 @@ const (
var (
// Used to validate if the given ROCm lib is usable
ROCmLibGlobs = []string{"hipblas.dll", "rocblas"} // TODO - probably include more coverage of files here...
RocmStandardLocations = []string{"C:\\Program Files\\AMD\\ROCm\\5.7\\bin"} // TODO glob?
ROCmLibGlobs = []string{"hipblas.dll", "rocblas"} // TODO - probably include more coverage of files here...
)
func AMDGetGPUInfo() []GpuInfo {
@@ -33,12 +34,13 @@ func AMDGetGPUInfo() []GpuInfo {
}
defer hl.Release()
// TODO - this reports incorrect version information, so omitting for now
// driverMajor, driverMinor, err := hl.AMDDriverVersion()
// if err != nil {
// // For now this is benign, but we may eventually need to fail compatibility checks
// slog.Debug("error looking up amd driver version", "error", err)
// }
ver, err := hl.AMDDriverVersion()
if err == nil {
slog.Info("AMD Driver: " + ver)
} else {
// For now this is benign, but we may eventually need to fail compatibility checks
slog.Debug("error looking up amd driver version", "error", err)
}
// Note: the HIP library automatically handles subsetting to any HIP_VISIBLE_DEVICES the user specified
count := hl.HipGetDeviceCount()
@@ -60,10 +62,10 @@ func AMDGetGPUInfo() []GpuInfo {
return nil
}
} else {
slog.Info("skipping rocm gfx compatibility check", "HSA_OVERRIDE_GFX_VERSION", gfxOverride)
slog.Debug("skipping rocm gfx compatibility check with HSA_OVERRIDE_GFX_VERSION=" + gfxOverride)
}
slog.Debug("detected hip devices", "count", count)
slog.Info("detected hip devices", "count", count)
// TODO how to determine the underlying device ID when visible devices is causing this to subset?
for i := 0; i < count; i++ {
err = hl.HipSetDevice(i)
@@ -83,11 +85,18 @@ func AMDGetGPUInfo() []GpuInfo {
// Can luid be used on windows for setting visible devices (and is it actually set?)
n = bytes.IndexByte(props.GcnArchName[:], 0)
gfx := string(props.GcnArchName[:n])
slog.Debug("hip device", "id", i, "name", name, "gfx", gfx)
slog.Info("hip device", "id", i, "name", name, "gfx", gfx)
var major, minor, patch string
switch len(gfx) {
case 6:
major, minor, patch = gfx[3:4], gfx[4:5], gfx[5:]
case 7:
major, minor, patch = gfx[3:5], gfx[5:6], gfx[6:]
}
//slog.Info(fmt.Sprintf("[%d] Integrated: %d", i, props.iGPU)) // DOESN'T REPORT CORRECTLY! Always 0
// TODO Why isn't props.iGPU accurate!?
if strings.EqualFold(name, iGPUName) {
slog.Info("unsupported Radeon iGPU detected skipping", "id", i, "name", name, "gfx", gfx)
slog.Info("iGPU detected skipping", "id", i)
continue
}
if gfxOverride == "" {
@@ -97,7 +106,7 @@ func AMDGetGPUInfo() []GpuInfo {
slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for HSA_OVERRIDE_GFX_VERSION usage")
continue
} else {
slog.Debug("amdgpu is supported", "gpu", i, "gpu_type", gfx)
slog.Info("amdgpu is supported", "gpu", i, "gpu_type", gfx)
}
}
@@ -115,8 +124,8 @@ func AMDGetGPUInfo() []GpuInfo {
// TODO revisit this once ROCm v6 is available on windows.
// v5.7 only reports VRAM used by this process, so it's completely wrong and unusable
slog.Debug("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory))
slog.Debug("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory))
slog.Info("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory))
slog.Info("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory))
gpuInfo := GpuInfo{
Library: "rocm",
memInfo: memInfo{
@@ -126,12 +135,31 @@ func AMDGetGPUInfo() []GpuInfo {
ID: fmt.Sprintf("%d", i), // TODO this is probably wrong if we specify visible devices
DependencyPath: libDir,
MinimumMemory: rocmMinimumMemory,
Name: name,
Compute: gfx,
// TODO - this information isn't accurate on windows, so don't report it until we find the right way to retrieve
// DriverMajor: driverMajor,
// DriverMinor: driverMinor,
}
if major != "" {
gpuInfo.Major, err = strconv.Atoi(major)
if err != nil {
slog.Info("failed to parse version", "version", gfx, "error", err)
}
}
if minor != "" {
gpuInfo.Minor, err = strconv.Atoi(minor)
if err != nil {
slog.Info("failed to parse version", "version", gfx, "error", err)
}
}
if patch != "" {
// Patch rev is hex; e.g. gfx90a
p, err := strconv.ParseInt(patch, 16, 0)
if err != nil {
slog.Info("failed to parse version", "version", gfx, "error", err)
} else {
gpuInfo.Patch = int(p)
}
}
if gpuInfo.Major < RocmComputeMin {
slog.Warn(fmt.Sprintf("amdgpu [%s] too old gfx%d%d%x", gpuInfo.ID, gpuInfo.Major, gpuInfo.Minor, gpuInfo.Patch))
continue
}
resp = append(resp, gpuInfo)
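
The gfx-name parsing added above splits the architecture string positionally, with the patch revision read as hex. A self-contained sketch of that split, using illustrative names such as gfx90a and gfx1030:

```go
// Sketch of the gfx-name split used in the hunk above; the sample names are
// illustrative and the helper is not part of the repository.
package main

import (
	"fmt"
	"strconv"
)

func splitGfx(gfx string) (major, minor, patch int64, err error) {
	var ma, mi, pa string
	switch len(gfx) {
	case 6: // e.g. gfx90a
		ma, mi, pa = gfx[3:4], gfx[4:5], gfx[5:]
	case 7: // e.g. gfx1030
		ma, mi, pa = gfx[3:5], gfx[5:6], gfx[6:]
	default:
		return 0, 0, 0, fmt.Errorf("unexpected gfx name %q", gfx)
	}
	if major, err = strconv.ParseInt(ma, 10, 0); err != nil {
		return 0, 0, 0, err
	}
	if minor, err = strconv.ParseInt(mi, 10, 0); err != nil {
		return 0, 0, 0, err
	}
	// The patch revision is hexadecimal, e.g. the trailing "a" in gfx90a.
	if patch, err = strconv.ParseInt(pa, 16, 0); err != nil {
		return 0, 0, 0, err
	}
	return major, minor, patch, nil
}

func main() {
	fmt.Println(splitGfx("gfx90a"))  // 9 0 10 <nil>
	fmt.Println(splitGfx("gfx1030")) // 10 3 0 <nil>
}
```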

View File

@@ -12,8 +12,6 @@ import (
"sync"
"syscall"
"time"
"github.com/ollama/ollama/server/envconfig"
)
var (
@@ -26,8 +24,45 @@ func PayloadsDir() (string, error) {
defer lock.Unlock()
var err error
if payloadsDir == "" {
runnersDir := envconfig.RunnersDir
runnersDir := os.Getenv("OLLAMA_RUNNERS_DIR")
// On Windows we do not carry the payloads inside the main executable
if runtime.GOOS == "windows" && runnersDir == "" {
appExe, err := os.Executable()
if err != nil {
slog.Error("failed to lookup executable path", "error", err)
return "", err
}
cwd, err := os.Getwd()
if err != nil {
slog.Error("failed to lookup working directory", "error", err)
return "", err
}
var paths []string
for _, root := range []string{appExe, cwd} {
paths = append(paths,
filepath.Join(root),
filepath.Join(root, "windows-"+runtime.GOARCH),
filepath.Join(root, "dist", "windows-"+runtime.GOARCH),
)
}
// Try a few variations to improve developer experience when building from source in the local tree
for _, p := range paths {
candidate := filepath.Join(p, "ollama_runners")
_, err := os.Stat(candidate)
if err == nil {
runnersDir = candidate
break
}
}
if runnersDir == "" {
err = fmt.Errorf("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama_runners'")
slog.Error("incomplete distribution", "error", err)
return "", err
}
}
if runnersDir != "" {
payloadsDir = runnersDir
return payloadsDir, nil
@@ -35,7 +70,7 @@ func PayloadsDir() (string, error) {
// The remainder only applies on non-windows where we still carry payloads in the main executable
cleanupTmpDirs()
tmpDir := envconfig.TmpDir
tmpDir := os.Getenv("OLLAMA_TMPDIR")
if tmpDir == "" {
tmpDir, err = os.MkdirTemp("", "ollama")
if err != nil {
@@ -98,7 +133,7 @@ func cleanupTmpDirs() {
func Cleanup() {
lock.Lock()
defer lock.Unlock()
runnersDir := envconfig.RunnersDir
runnersDir := os.Getenv("OLLAMA_RUNNERS_DIR")
if payloadsDir != "" && runnersDir == "" && runtime.GOOS != "windows" {
// We want to fully clean up the tmpdir parent of the payloads dir
tmpDir := filepath.Clean(filepath.Join(payloadsDir, ".."))
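
The Windows branch added above probes a fixed set of candidate directories for an `ollama_runners` payload. A rough standalone sketch of that search order follows; the helper name `findRunnersDir` is hypothetical, and the sketch searches the executable's directory rather than the executable path itself.

```go
// Sketch of the Windows runner-directory search order shown above; the helper
// name is hypothetical and not part of the repository.
package main

import (
	"fmt"
	"os"
	"path/filepath"
	"runtime"
)

func findRunnersDir() (string, error) {
	appExe, err := os.Executable()
	if err != nil {
		return "", err
	}
	cwd, err := os.Getwd()
	if err != nil {
		return "", err
	}
	var paths []string
	for _, root := range []string{filepath.Dir(appExe), cwd} {
		paths = append(paths,
			root,
			filepath.Join(root, "windows-"+runtime.GOARCH),
			filepath.Join(root, "dist", "windows-"+runtime.GOARCH),
		)
	}
	// Pick the first candidate that actually contains an ollama_runners dir.
	for _, p := range paths {
		candidate := filepath.Join(p, "ollama_runners")
		if _, err := os.Stat(candidate); err == nil {
			return candidate, nil
		}
	}
	return "", fmt.Errorf("unable to locate 'ollama_runners'; set OLLAMA_RUNNERS_DIR")
}

func main() {
	dir, err := findRunnersDir()
	fmt.Println(dir, err)
}
```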

View File

@@ -8,14 +8,14 @@ import (
func GetCPUVariant() string {
if cpu.X86.HasAVX2 {
slog.Debug("CPU has AVX2")
slog.Info("CPU has AVX2")
return "avx2"
}
if cpu.X86.HasAVX {
slog.Debug("CPU has AVX")
slog.Info("CPU has AVX")
return "avx"
}
slog.Debug("CPU does not have vector extensions")
slog.Info("CPU does not have vector extensions")
// else LCD
return ""
}

View File

@@ -21,13 +21,11 @@ import (
"unsafe"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/server/envconfig"
)
type handles struct {
deviceCount int
cudart *C.cudart_handle_t
nvcuda *C.nvcuda_handle_t
}
const (
@@ -64,22 +62,6 @@ var CudartWindowsGlobs = []string{
"c:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\bin\\cudart64_*.dll",
}
var NvcudaLinuxGlobs = []string{
"/usr/local/cuda*/targets/*/lib/libcuda.so*",
"/usr/lib/*-linux-gnu/nvidia/current/libcuda.so*",
"/usr/lib/*-linux-gnu/libcuda.so*",
"/usr/lib/wsl/lib/libcuda.so*",
"/usr/lib/wsl/drivers/*/libcuda.so*",
"/opt/cuda/lib*/libcuda.so*",
"/usr/local/cuda/lib*/libcuda.so*",
"/usr/lib*/libcuda.so*",
"/usr/local/lib*/libcuda.so*",
}
var NvcudaWindowsGlobs = []string{
"c:\\windows\\system*\\nvcuda.dll",
}
// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
var CudaTegra string = os.Getenv("JETSON_JETPACK")
@@ -92,8 +74,6 @@ func initGPUHandles() *handles {
gpuHandles := &handles{}
var cudartMgmtName string
var cudartMgmtPatterns []string
var nvcudaMgmtName string
var nvcudaMgmtPatterns []string
tmpDir, _ := PayloadsDir()
switch runtime.GOOS {
@@ -102,9 +82,6 @@ func initGPUHandles() *handles {
localAppData := os.Getenv("LOCALAPPDATA")
cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", cudartMgmtName)}
cudartMgmtPatterns = append(cudartMgmtPatterns, CudartWindowsGlobs...)
// Aligned with driver, we can't carry as payloads
nvcudaMgmtName = "nvcuda.dll"
nvcudaMgmtPatterns = NvcudaWindowsGlobs
case "linux":
cudartMgmtName = "libcudart.so*"
if tmpDir != "" {
@@ -112,30 +89,16 @@ func initGPUHandles() *handles {
cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", cudartMgmtName)}
}
cudartMgmtPatterns = append(cudartMgmtPatterns, CudartLinuxGlobs...)
// Aligned with driver, we can't carry as payloads
nvcudaMgmtName = "libcuda.so*"
nvcudaMgmtPatterns = NvcudaLinuxGlobs
default:
return gpuHandles
}
slog.Debug("Detecting GPUs")
nvcudaLibPaths := FindGPULibs(nvcudaMgmtName, nvcudaMgmtPatterns)
if len(nvcudaLibPaths) > 0 {
deviceCount, nvcuda, libPath := LoadNVCUDAMgmt(nvcudaLibPaths)
if nvcuda != nil {
slog.Debug("detected GPUs", "count", deviceCount, "library", libPath)
gpuHandles.nvcuda = nvcuda
gpuHandles.deviceCount = deviceCount
return gpuHandles
}
}
slog.Info("Detecting GPUs")
cudartLibPaths := FindGPULibs(cudartMgmtName, cudartMgmtPatterns)
if len(cudartLibPaths) > 0 {
deviceCount, cudart, libPath := LoadCUDARTMgmt(cudartLibPaths)
if cudart != nil {
slog.Debug("detected GPUs", "library", libPath, "count", deviceCount)
slog.Info("detected GPUs", "library", libPath, "count", deviceCount)
gpuHandles.cudart = cudart
gpuHandles.deviceCount = deviceCount
return gpuHandles
@@ -155,9 +118,6 @@ func GetGPUInfo() GpuInfoList {
if gpuHandles.cudart != nil {
C.cudart_release(*gpuHandles.cudart)
}
if gpuHandles.nvcuda != nil {
C.nvcuda_release(*gpuHandles.nvcuda)
}
}()
// All our GPU builds on x86 have AVX enabled, so fallback to CPU if we don't detect at least AVX
@@ -166,12 +126,6 @@ func GetGPUInfo() GpuInfoList {
slog.Warn("CPU does not have AVX or AVX2, disabling GPU support.")
}
// On windows we bundle the nvidia library one level above the runner dir
depPath := ""
if runtime.GOOS == "windows" && envconfig.RunnersDir != "" {
depPath = filepath.Dir(envconfig.RunnersDir)
}
var memInfo C.mem_info_t
resp := []GpuInfo{}
@@ -184,15 +138,7 @@ func GetGPUInfo() GpuInfoList {
gpuInfo := GpuInfo{
Library: "cuda",
}
var driverMajor int
var driverMinor int
if gpuHandles.cudart != nil {
C.cudart_check_vram(*gpuHandles.cudart, C.int(i), &memInfo)
} else {
C.nvcuda_check_vram(*gpuHandles.nvcuda, C.int(i), &memInfo)
driverMajor = int(gpuHandles.nvcuda.driver_major)
driverMinor = int(gpuHandles.nvcuda.driver_minor)
}
C.cudart_check_vram(*gpuHandles.cudart, C.int(i), &memInfo)
if memInfo.err != nil {
slog.Info("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
C.free(unsafe.Pointer(memInfo.err))
@@ -205,12 +151,9 @@ func GetGPUInfo() GpuInfoList {
gpuInfo.TotalMemory = uint64(memInfo.total)
gpuInfo.FreeMemory = uint64(memInfo.free)
gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor)
gpuInfo.Major = int(memInfo.major)
gpuInfo.Minor = int(memInfo.minor)
gpuInfo.MinimumMemory = cudaMinimumMemory
gpuInfo.DependencyPath = depPath
gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
gpuInfo.DriverMajor = int(driverMajor)
gpuInfo.DriverMinor = int(driverMinor)
// TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
resp = append(resp, gpuInfo)
@@ -253,10 +196,9 @@ func GetCPUMem() (memInfo, error) {
return ret, nil
}
func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
func FindGPULibs(baseLibName string, patterns []string) []string {
// Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them
var ldPaths []string
var patterns []string
gpuLibPaths := []string{}
slog.Debug("Searching for GPU library", "name", baseLibName)
@@ -276,14 +218,8 @@ func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
}
patterns = append(patterns, filepath.Join(d, baseLibName+"*"))
}
patterns = append(patterns, defaultPatterns...)
slog.Debug("gpu library search", "globs", patterns)
for _, pattern := range patterns {
// Nvidia PhysX known to return bogus results
if strings.Contains(pattern, "PhysX") {
slog.Debug("skipping PhysX cuda library path", "path", pattern)
}
// Ignore glob discovery errors
matches, _ := filepath.Glob(pattern)
for _, match := range matches {
@@ -331,25 +267,8 @@ func LoadCUDARTMgmt(cudartLibPaths []string) (int, *C.cudart_handle_t, string) {
return 0, nil, ""
}
func LoadNVCUDAMgmt(nvcudaLibPaths []string) (int, *C.nvcuda_handle_t, string) {
var resp C.nvcuda_init_resp_t
resp.ch.verbose = getVerboseState()
for _, libPath := range nvcudaLibPaths {
lib := C.CString(libPath)
defer C.free(unsafe.Pointer(lib))
C.nvcuda_init(lib, &resp)
if resp.err != nil {
slog.Debug("Unable to load nvcuda", "library", libPath, "error", C.GoString(resp.err))
C.free(unsafe.Pointer(resp.err))
} else {
return int(resp.num_devices), &resp.ch, libPath
}
}
return 0, nil, ""
}
func getVerboseState() C.uint16_t {
if envconfig.Debug {
if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
return C.uint16_t(1)
}
return C.uint16_t(0)
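
FindGPULibs, as changed above, takes its default glob patterns from the caller and appends them after the loader-path candidates before globbing. A Linux-oriented sketch of that search, with an illustrative pattern list:

```go
// Sketch of the glob-based library search performed by FindGPULibs above;
// the pattern list is illustrative, not the project's full set.
package main

import (
	"fmt"
	"os"
	"path/filepath"
	"strings"
)

func findLibs(baseLibName string, defaultPatterns []string) []string {
	var patterns []string
	// Directories from the loader path are searched first.
	for _, d := range strings.Split(os.Getenv("LD_LIBRARY_PATH"), ":") {
		if d == "" {
			continue
		}
		patterns = append(patterns, filepath.Join(d, baseLibName+"*"))
	}
	patterns = append(patterns, defaultPatterns...)
	var found []string
	for _, pattern := range patterns {
		// Glob only errors on malformed patterns, so failures are ignored here.
		matches, _ := filepath.Glob(pattern)
		found = append(found, matches...)
	}
	return found
}

func main() {
	libs := findLibs("libcudart.so", []string{"/usr/local/cuda/lib64/libcudart.so*"})
	fmt.Println(libs)
}
```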

View File

@@ -1,5 +1,3 @@
//go:build darwin
package gpu
/*
@@ -10,12 +8,6 @@ package gpu
import "C"
import (
"runtime"
"github.com/ollama/ollama/format"
)
const (
metalMinimumMemory = 512 * format.MebiByte
)
func GetGPUInfo() GpuInfoList {
@@ -38,7 +30,7 @@ func GetGPUInfo() GpuInfoList {
// TODO is there a way to gather actual allocated video memory? (currentAllocatedSize doesn't work)
info.FreeMemory = info.TotalMemory
info.MinimumMemory = metalMinimumMemory
info.MinimumMemory = 0
return []GpuInfo{info}
}

View File

@@ -39,19 +39,16 @@ extern "C" {
#endif
#define GPU_ID_LEN 64
#define GPU_NAME_LEN 96
typedef struct mem_info {
char *err; // If non-null, caller responsible for freeing
char gpu_id[GPU_ID_LEN];
char gpu_name[GPU_NAME_LEN];
uint64_t total;
uint64_t free;
// Compute Capability
int major;
int minor;
int patch;
} mem_info_t;
void cpu_check_ram(mem_info_t *resp);
@@ -61,7 +58,6 @@ void cpu_check_ram(mem_info_t *resp);
#endif
#include "gpu_info_cudart.h"
#include "gpu_info_nvcuda.h"
#endif // __GPU_INFO_H__
#endif // __APPLE__

View File

@@ -10,6 +10,8 @@ void cpu_check_ram(mem_info_t *resp) {
if (GlobalMemoryStatusEx(&info) != 0) {
resp->total = info.ullTotalPhys;
resp->free = info.ullAvailPhys;
resp->major = 0;
resp->minor = 0;
snprintf(&resp->gpu_id[0], GPU_ID_LEN, "0");
} else {
resp->err = LOAD_ERR();
@@ -29,6 +31,8 @@ void cpu_check_ram(mem_info_t *resp) {
} else {
resp->total = info.totalram * info.mem_unit;
resp->free = info.freeram * info.mem_unit;
resp->major = 0;
resp->minor = 0;
snprintf(&resp->gpu_id[0], GPU_ID_LEN, "0");
}
return;

View File

@@ -6,9 +6,9 @@
// Just enough typedef's to dlopen/dlsym for memory information
typedef enum cudartReturn_enum {
CUDART_SUCCESS = 0,
CUDART_ERROR_INVALID_VALUE = 1,
CUDART_ERROR_MEMORY_ALLOCATION = 2,
CUDART_ERROR_INSUFFICIENT_DRIVER = 35,
CUDA_ERROR_INVALID_VALUE = 1,
CUDA_ERROR_MEMORY_ALLOCATION = 2,
CUDA_ERROR_INSUFFICIENT_DRIVER = 35,
// Other values omitted for now...
} cudartReturn_t;

View File

@@ -1,207 +0,0 @@
#ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?
#include <string.h>
#include "gpu_info_nvcuda.h"
void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
CUresult ret;
resp->err = NULL;
resp->num_devices = 0;
const int buflen = 256;
char buf[buflen + 1];
int i;
struct lookup {
char *s;
void **p;
} l[] = {
{"cuInit", (void *)&resp->ch.cuInit},
{"cuDriverGetVersion", (void *)&resp->ch.cuDriverGetVersion},
{"cuDeviceGetCount", (void *)&resp->ch.cuDeviceGetCount},
{"cuDeviceGet", (void *)&resp->ch.cuDeviceGet},
{"cuDeviceGetAttribute", (void *)&resp->ch.cuDeviceGetAttribute},
{"cuDeviceGetUuid", (void *)&resp->ch.cuDeviceGetUuid},
{"cuDeviceGetName", (void *)&resp->ch.cuDeviceGetName},
{"cuCtxCreate_v3", (void *)&resp->ch.cuCtxCreate_v3},
{"cuMemGetInfo_v2", (void *)&resp->ch.cuMemGetInfo_v2},
{"cuCtxDestroy", (void *)&resp->ch.cuCtxDestroy},
{NULL, NULL},
};
resp->ch.handle = LOAD_LIBRARY(nvcuda_lib_path, RTLD_LAZY);
if (!resp->ch.handle) {
char *msg = LOAD_ERR();
LOG(resp->ch.verbose, "library %s load err: %s\n", nvcuda_lib_path, msg);
snprintf(buf, buflen,
"Unable to load %s library to query for Nvidia GPUs: %s",
nvcuda_lib_path, msg);
free(msg);
resp->err = strdup(buf);
return;
}
for (i = 0; l[i].s != NULL; i++) {
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
if (!*l[i].p) {
char *msg = LOAD_ERR();
LOG(resp->ch.verbose, "dlerr: %s\n", msg);
UNLOAD_LIBRARY(resp->ch.handle);
resp->ch.handle = NULL;
snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
msg);
free(msg);
resp->err = strdup(buf);
return;
}
}
ret = (*resp->ch.cuInit)(0);
if (ret != CUDA_SUCCESS) {
LOG(resp->ch.verbose, "cuInit err: %d\n", ret);
UNLOAD_LIBRARY(resp->ch.handle);
resp->ch.handle = NULL;
if (ret == CUDA_ERROR_INSUFFICIENT_DRIVER) {
resp->err = strdup("your nvidia driver is too old or missing. If you have a CUDA GPU please upgrade to run ollama");
return;
}
snprintf(buf, buflen, "nvcuda init failure: %d", ret);
resp->err = strdup(buf);
return;
}
int version = 0;
resp->ch.driver_major = 0;
resp->ch.driver_minor = 0;
// Report driver version if we're in verbose mode, ignore errors
ret = (*resp->ch.cuDriverGetVersion)(&version);
if (ret != CUDA_SUCCESS) {
LOG(resp->ch.verbose, "cuDriverGetVersion failed: %d\n", ret);
} else {
resp->ch.driver_major = version / 1000;
resp->ch.driver_minor = (version - (resp->ch.driver_major * 1000)) / 10;
LOG(resp->ch.verbose, "CUDA driver version: %d.%d\n", resp->ch.driver_major, resp->ch.driver_minor);
}
ret = (*resp->ch.cuDeviceGetCount)(&resp->num_devices);
if (ret != CUDA_SUCCESS) {
LOG(resp->ch.verbose, "cuDeviceGetCount err: %d\n", ret);
UNLOAD_LIBRARY(resp->ch.handle);
resp->ch.handle = NULL;
snprintf(buf, buflen, "unable to get device count: %d", ret);
resp->err = strdup(buf);
return;
}
}
const int buflen = 256;
void nvcuda_check_vram(nvcuda_handle_t h, int i, mem_info_t *resp) {
resp->err = NULL;
nvcudaMemory_t memInfo = {0,0};
CUresult ret;
CUdevice device = -1;
CUcontext ctx = NULL;
char buf[buflen + 1];
CUuuid uuid = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
if (h.handle == NULL) {
resp->err = strdup("nvcuda handle isn't initialized");
return;
}
ret = (*h.cuDeviceGet)(&device, i);
if (ret != CUDA_SUCCESS) {
snprintf(buf, buflen, "nvcuda device failed to initialize");
resp->err = strdup(buf);
return;
}
int major = 0;
int minor = 0;
ret = (*h.cuDeviceGetAttribute)(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
if (ret != CUDA_SUCCESS) {
LOG(h.verbose, "[%d] device major lookup failure: %d\n", i, ret);
} else {
ret = (*h.cuDeviceGetAttribute)(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device);
if (ret != CUDA_SUCCESS) {
LOG(h.verbose, "[%d] device minor lookup failure: %d\n", i, ret);
} else {
resp->minor = minor;
resp->major = major;
}
}
ret = (*h.cuDeviceGetUuid)(&uuid, device);
if (ret != CUDA_SUCCESS) {
LOG(h.verbose, "[%d] device uuid lookup failure: %d\n", i, ret);
snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", i);
} else {
// GPU-d110a105-ac29-1d54-7b49-9c90440f215b
snprintf(&resp->gpu_id[0], GPU_ID_LEN,
"GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
uuid.bytes[0],
uuid.bytes[1],
uuid.bytes[2],
uuid.bytes[3],
uuid.bytes[4],
uuid.bytes[5],
uuid.bytes[6],
uuid.bytes[7],
uuid.bytes[8],
uuid.bytes[9],
uuid.bytes[10],
uuid.bytes[11],
uuid.bytes[12],
uuid.bytes[13],
uuid.bytes[14],
uuid.bytes[15]
);
}
ret = (*h.cuDeviceGetName)(&resp->gpu_name[0], GPU_NAME_LEN, device);
if (ret != CUDA_SUCCESS) {
LOG(h.verbose, "[%d] device name lookup failure: %d\n", i, ret);
resp->gpu_name[0] = '\0';
}
// To get memory we have to set (and release) a context
ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
if (ret != CUDA_SUCCESS) {
snprintf(buf, buflen, "nvcuda failed to get primary device context %d", ret);
resp->err = strdup(buf);
return;
}
ret = (*h.cuMemGetInfo_v2)(&memInfo.free, &memInfo.total);
if (ret != CUDA_SUCCESS) {
snprintf(buf, buflen, "nvcuda device memory info lookup failure %d", ret);
resp->err = strdup(buf);
// Best effort on failure...
(*h.cuCtxDestroy)(ctx);
return;
}
resp->total = memInfo.total;
resp->free = memInfo.free;
LOG(h.verbose, "[%s] CUDA totalMem %lu mb\n", resp->gpu_id, resp->total / 1024 / 1024);
LOG(h.verbose, "[%s] CUDA freeMem %lu mb\n", resp->gpu_id, resp->free / 1024 / 1024);
LOG(h.verbose, "[%s] Compute Capability %d.%d\n", resp->gpu_id, resp->major, resp->minor);
ret = (*h.cuCtxDestroy)(ctx);
if (ret != CUDA_SUCCESS) {
LOG(1, "nvcuda failed to release primary device context %d", ret);
}
}
void nvcuda_release(nvcuda_handle_t h) {
LOG(h.verbose, "releasing nvcuda library\n");
UNLOAD_LIBRARY(h.handle);
// TODO and other context release logic?
h.handle = NULL;
}
#endif // __APPLE__

View File

@@ -1,74 +0,0 @@
#ifndef __APPLE__
#ifndef __GPU_INFO_NVCUDA_H__
#define __GPU_INFO_NVCUDA_H__
#include "gpu_info.h"
// Just enough typedef's to dlopen/dlsym for memory information
typedef enum cudaError_enum {
CUDA_SUCCESS = 0,
CUDA_ERROR_INVALID_VALUE = 1,
CUDA_ERROR_MEMORY_ALLOCATION = 2,
CUDA_ERROR_NOT_INITIALIZED = 3,
CUDA_ERROR_INSUFFICIENT_DRIVER = 35,
// Other values omitted for now...
} CUresult;
typedef enum CUdevice_attribute_enum {
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75,
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76,
// TODO - not yet wired up but may be useful for Jetson or other
// integrated GPU scenarios with shared memory
CU_DEVICE_ATTRIBUTE_INTEGRATED = 18
} CUdevice_attribute;
typedef void *nvcudaDevice_t; // Opaque is sufficient
typedef struct nvcudaMemory_st {
uint64_t total;
uint64_t free;
} nvcudaMemory_t;
typedef struct nvcudaDriverVersion {
int major;
int minor;
} nvcudaDriverVersion_t;
typedef struct CUuuid_st {
unsigned char bytes[16];
} CUuuid;
typedef int CUdevice;
typedef void* CUcontext;
typedef struct nvcuda_handle {
void *handle;
uint16_t verbose;
int driver_major;
int driver_minor;
CUresult (*cuInit)(unsigned int Flags);
CUresult (*cuDriverGetVersion)(int *driverVersion);
CUresult (*cuDeviceGetCount)(int *);
CUresult (*cuDeviceGet)(CUdevice* device, int ordinal);
CUresult (*cuDeviceGetAttribute)(int* pi, CUdevice_attribute attrib, CUdevice dev);
CUresult (*cuDeviceGetUuid)(CUuuid* uuid, CUdevice dev); // signature compatible with cuDeviceGetUuid_v2
CUresult (*cuDeviceGetName)(char *name, int len, CUdevice dev);
// Context specific aspects
CUresult (*cuCtxCreate_v3)(CUcontext* pctx, void *params, int len, unsigned int flags, CUdevice dev);
CUresult (*cuMemGetInfo_v2)(uint64_t* free, uint64_t* total);
CUresult (*cuCtxDestroy)(CUcontext ctx);
} nvcuda_handle_t;
typedef struct nvcuda_init_resp {
char *err; // If err is non-null handle is invalid
nvcuda_handle_t ch;
int num_devices;
} nvcuda_init_resp_t;
void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp);
void nvcuda_check_vram(nvcuda_handle_t ch, int device_id, mem_info_t *resp);
void nvcuda_release(nvcuda_handle_t ch);
#endif // __GPU_INFO_NVCUDA_H__
#endif // __APPLE__

View File

@@ -1,12 +1,5 @@
package gpu
import (
"fmt"
"log/slog"
"github.com/ollama/ollama/format"
)
type memInfo struct {
TotalMemory uint64 `json:"total_memory,omitempty"`
FreeMemory uint64 `json:"free_memory,omitempty"`
@@ -27,13 +20,11 @@ type GpuInfo struct {
DependencyPath string `json:"lib_path,omitempty"`
// GPU information
ID string `json:"gpu_id"` // string to use for selection of this specific GPU
Name string `json:"name"` // user friendly name if available
Compute string `json:"compute"` // Compute Capability or gfx
// Driver Information - TODO no need to put this on each GPU
DriverMajor int `json:"driver_major,omitempty"`
DriverMinor int `json:"driver_minor,omitempty"`
ID string `json:"gpu_id"` // string to use for selection of this specific GPU
Name string `json:"name"` // user friendly name if available
Major int `json:"major,omitempty"` // Major compatibility version (CC or gfx)
Minor int `json:"minor,omitempty"` // Minor compatibility version (CC or gfx)
Patch int `json:"patch,omitempty"` // Patch compatibility only matters on AMD
// TODO other performance capability info to help in scheduling decisions
}
@@ -65,21 +56,6 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList {
return resp
}
// Report the GPU information into the log at Info level
func (l GpuInfoList) LogDetails() {
for _, g := range l {
slog.Info("inference compute",
"id", g.ID,
"library", g.Library,
"compute", g.Compute,
"driver", fmt.Sprintf("%d.%d", g.DriverMajor, g.DriverMinor),
"name", g.Name,
"total", format.HumanBytes2(g.TotalMemory),
"available", format.HumanBytes2(g.FreeMemory),
)
}
}
// Sort by Free Space
type ByFreeMemory []GpuInfo
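
Tying this struct change back to the earlier hunks: the removed Compute string is now derived from the split Major/Minor/Patch fields and is formatted differently per library. A small sketch, with a hypothetical helper name:

```go
// Sketch: rebuilding a per-library compatibility string from the split
// Major/Minor/Patch fields introduced above. The helper name is hypothetical.
package main

import "fmt"

type gpuInfo struct {
	Library             string
	Major, Minor, Patch int
}

func computeString(g gpuInfo) string {
	if g.Library == "rocm" {
		// AMD targets are spelled gfx<major><minor><patch>, patch in hex (e.g. gfx90a).
		return fmt.Sprintf("gfx%d%d%x", g.Major, g.Minor, g.Patch)
	}
	// CUDA compute capability is simply major.minor.
	return fmt.Sprintf("%d.%d", g.Major, g.Minor)
}

func main() {
	fmt.Println(computeString(gpuInfo{Library: "rocm", Major: 9, Minor: 0, Patch: 0xa})) // gfx90a
	fmt.Println(computeString(gpuInfo{Library: "cuda", Major: 8, Minor: 6}))             // 8.6
}
```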

View File

@@ -1,117 +0,0 @@
//go:build integration
package integration
import (
"context"
"errors"
"fmt"
"log/slog"
"os"
"strconv"
"strings"
"sync"
"testing"
"time"
"github.com/ollama/ollama/api"
"github.com/stretchr/testify/require"
)
func TestMaxQueue(t *testing.T) {
// Note: This test can be quite slow when running in CPU mode, so keep the threadCount low unless you're on GPU
// Also note that by default Darwin can't sustain > ~128 connections without adjusting limits
threadCount := 32
mq := os.Getenv("OLLAMA_MAX_QUEUE")
if mq != "" {
var err error
threadCount, err = strconv.Atoi(mq)
require.NoError(t, err)
} else {
os.Setenv("OLLAMA_MAX_QUEUE", fmt.Sprintf("%d", threadCount))
}
req := api.GenerateRequest{
Model: "orca-mini",
Prompt: "write a long historical fiction story about christopher columbus. use at least 10 facts from his actual journey",
Options: map[string]interface{}{
"seed": 42,
"temperature": 0.0,
},
}
resp := []string{"explore", "discover", "ocean"}
// CPU mode takes much longer at the limit with a large queue setting
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
require.NoError(t, PullIfMissing(ctx, client, req.Model))
// Context for the worker threads so we can shut them down
// embedCtx, embedCancel := context.WithCancel(ctx)
embedCtx := ctx
var genwg sync.WaitGroup
go func() {
genwg.Add(1)
defer genwg.Done()
slog.Info("Starting generate request")
DoGenerate(ctx, t, client, req, resp, 45*time.Second, 5*time.Second)
slog.Info("generate completed")
}()
// Give the generate a chance to get started before we start hammering on embed requests
time.Sleep(5 * time.Millisecond)
threadCount += 10 // Add a few extra to ensure we push the queue past its limit
busyCount := 0
resetByPeerCount := 0
canceledCount := 0
succesCount := 0
counterMu := sync.Mutex{}
var embedwg sync.WaitGroup
for i := 0; i < threadCount; i++ {
go func(i int) {
embedwg.Add(1)
defer embedwg.Done()
slog.Info("embed started", "id", i)
embedReq := api.EmbeddingRequest{
Model: req.Model,
Prompt: req.Prompt,
Options: req.Options,
}
// Fresh client for every request
client, _ = GetTestEndpoint()
resp, genErr := client.Embeddings(embedCtx, &embedReq)
counterMu.Lock()
defer counterMu.Unlock()
switch {
case genErr == nil:
succesCount++
require.Greater(t, len(resp.Embedding), 5) // somewhat arbitrary, but sufficient to be reasonable
case errors.Is(genErr, context.Canceled):
canceledCount++
case strings.Contains(genErr.Error(), "busy"):
busyCount++
case strings.Contains(genErr.Error(), "connection reset by peer"):
resetByPeerCount++
default:
require.NoError(t, genErr, "%d request failed", i)
}
slog.Info("embed finished", "id", i)
}(i)
}
genwg.Wait()
slog.Info("generate done, waiting for embeds")
embedwg.Wait()
require.Equal(t, resetByPeerCount, 0, "Connections reset by peer, have you updated your fd and socket limits?")
require.True(t, busyCount > 0, "no requests hit busy error but some should have")
require.True(t, canceledCount == 0, "no requests should have been canceled due to timeout")
slog.Info("embeds completed", "success", succesCount, "busy", busyCount, "reset", resetByPeerCount, "canceled", canceledCount)
}

Some files were not shown because too many files have changed in this diff.