Add TODO

add a /tokenize endpoint
Merge pull request #4316 from dhiltgen/more_buffer
2026-01-19 04:51:17 -05:00 · 2024-05-10 10:15:22 -07:00 · 2024-05-10 10:13:50 -07:00 · 2024-05-10 10:02:34 -07:00 · 2024-05-10 09:15:28 -07:00 · 2024-05-10 08:58:16 -07:00
53 changed files with 1759 additions and 1203 deletions
--- a/README.md
+++ b/README.md
@@ -258,6 +258,7 @@ See the [API documentation](./docs/api.md) for all endpoints.

 - [Open WebUI](https://github.com/open-webui/open-webui)
 - [Enchanted (macOS native)](https://github.com/AugustDev/enchanted)
+- [Hollama](https://github.com/fmaclen/hollama)
 - [Lollms-Webui](https://github.com/ParisNeo/lollms-webui)
 - [LibreChat](https://github.com/danny-avila/LibreChat)
 - [Bionic GPT](https://github.com/bionic-gpt/bionic-gpt)
@@ -297,6 +298,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Lobe Chat](https://github.com/lobehub/lobe-chat) with [Integrating Doc](https://lobehub.com/docs/self-hosting/examples/ollama)
 - [Ollama RAG Chatbot](https://github.com/datvodinh/rag-chatbot.git) (Local Chat with multiple PDFs using Ollama and RAG)
 - [BrainSoup](https://www.nurgo-software.com/products/brainsoup) (Flexible native client with RAG & multi-agent automation)
+- [macai](https://github.com/Renset/macai) (macOS client for Ollama, ChatGPT, and other compatible API back-ends)

 ### Terminal

@@ -329,6 +331,7 @@ See the [API documentation](./docs/api.md) for all endpoints.

 - [Pacman](https://archlinux.org/packages/extra/x86_64/ollama/)
 - [Helm Chart](https://artifacthub.io/packages/helm/ollama-helm/ollama)
+- [Guix channel](https://codeberg.org/tusharhero/ollama-guix)

 ### Libraries

@@ -355,7 +358,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Ollama Connector for SAP ABAP](https://github.com/b-tocs/abap_btocs_ollama)
 - [Testcontainers](https://testcontainers.com/modules/ollama/)
 - [Portkey](https://portkey.ai/docs/welcome/integration-guides/ollama)
-
+- [PromptingTools.jl](https://github.com/svilupp/PromptingTools.jl) with an [example](https://svilupp.github.io/PromptingTools.jl/dev/examples/working_with_ollama)
+- [LlamaScript](https://github.com/WolfTheDeveloper/llamascript)
 ### Mobile

 - [Enchanted](https://github.com/AugustDev/enchanted)
--- a/api/client.go
+++ b/api/client.go
@@ -1,9 +1,16 @@
 // Package api implements the client-side API for code wishing to interact
 // with the ollama service. The methods of the [Client] type correspond to
-// the ollama REST API as described in https://github.com/ollama/ollama/blob/main/docs/api.md
-//
+// the ollama REST API as described in [the API documentation].
 // The ollama command-line client itself uses this package to interact with
 // the backend service.
+//
+// # Examples
+//
+// Several examples of using this package are available [in the GitHub
+// repository].
+//
+// [the API documentation]: https://github.com/ollama/ollama/blob/main/docs/api.md
+// [in the GitHub repository]: https://github.com/ollama/ollama/tree/main/examples
 package api

 import (
@@ -299,8 +306,14 @@ func (c *Client) Pull(ctx context.Context, req *PullRequest, fn PullProgressFunc
 	})
 }

+// PushProgressFunc is a function that [Client.Push] invokes when progress is
+// made.
+// It's similar to other progress function types like [PullProgressFunc].
 type PushProgressFunc func(ProgressResponse) error

+// Push uploads a model to the model library; requires registering for ollama.ai
+// and adding a public key first. fn is called each time progress is made on
+// the request and can be used to display a progress bar, etc.
 func (c *Client) Push(ctx context.Context, req *PushRequest, fn PushProgressFunc) error {
 	return c.stream(ctx, http.MethodPost, "/api/push", req, func(bts []byte) error {
 		var resp ProgressResponse
@@ -312,8 +325,15 @@ func (c *Client) Push(ctx context.Context, req *PushRequest, fn PushProgressFunc
 	})
 }

+// CreateProgressFunc is a function that [Client.Create] invokes when progress
+// is made.
+// It's similar to other progress function types like [PullProgressFunc].
 type CreateProgressFunc func(ProgressResponse) error

+// Create creates a model from a [Modelfile]. fn is a progress function that
+// behaves similarly to other methods (see [Client.Pull]).
+//
+// [Modelfile]: https://github.com/ollama/ollama/blob/main/docs/modelfile.md
 func (c *Client) Create(ctx context.Context, req *CreateRequest, fn CreateProgressFunc) error {
 	return c.stream(ctx, http.MethodPost, "/api/create", req, func(bts []byte) error {
 		var resp ProgressResponse
@@ -325,6 +345,7 @@ func (c *Client) Create(ctx context.Context, req *CreateRequest, fn CreateProgre
 	})
 }

+// List lists models that are available locally.
 func (c *Client) List(ctx context.Context) (*ListResponse, error) {
 	var lr ListResponse
 	if err := c.do(ctx, http.MethodGet, "/api/tags", nil, &lr); err != nil {
@@ -333,6 +354,8 @@ func (c *Client) List(ctx context.Context) (*ListResponse, error) {
 	return &lr, nil
 }

+// Copy copies a model - creating a model with another name from an existing
+// model.
 func (c *Client) Copy(ctx context.Context, req *CopyRequest) error {
 	if err := c.do(ctx, http.MethodPost, "/api/copy", req, nil); err != nil {
 		return err
@@ -340,6 +363,7 @@ func (c *Client) Copy(ctx context.Context, req *CopyRequest) error {
 	return nil
 }

+// Delete deletes a model and its data.
 func (c *Client) Delete(ctx context.Context, req *DeleteRequest) error {
 	if err := c.do(ctx, http.MethodDelete, "/api/delete", req, nil); err != nil {
 		return err
@@ -347,6 +371,7 @@ func (c *Client) Delete(ctx context.Context, req *DeleteRequest) error {
 	return nil
 }

+// Show obtains model information, including details, modelfile, license etc.
 func (c *Client) Show(ctx context.Context, req *ShowRequest) (*ShowResponse, error) {
 	var resp ShowResponse
 	if err := c.do(ctx, http.MethodPost, "/api/show", req, &resp); err != nil {
@@ -355,12 +380,16 @@ func (c *Client) Show(ctx context.Context, req *ShowRequest) (*ShowResponse, err
 	return &resp, nil
 }

+// Hearbeat checks if the server has started and is responsive; if yes, it
+// returns nil, otherwise an error.
 func (c *Client) Heartbeat(ctx context.Context) error {
 	if err := c.do(ctx, http.MethodHead, "/", nil, nil); err != nil {
 		return err
 	}
 	return nil
 }
+
+// Embeddings generates embeddings from a model.
 func (c *Client) Embeddings(ctx context.Context, req *EmbeddingRequest) (*EmbeddingResponse, error) {
 	var resp EmbeddingResponse
 	if err := c.do(ctx, http.MethodPost, "/api/embeddings", req, &resp); err != nil {
@@ -369,10 +398,13 @@ func (c *Client) Embeddings(ctx context.Context, req *EmbeddingRequest) (*Embedd
 	return &resp, nil
 }

+// CreateBlob creates a blob from a file on the server. digest is the
+// expected SHA256 digest of the file, and r represents the file.
 func (c *Client) CreateBlob(ctx context.Context, digest string, r io.Reader) error {
 	return c.do(ctx, http.MethodPost, fmt.Sprintf("/api/blobs/%s", digest), r, nil)
 }

+// Version returns the Ollama server version as a string.
 func (c *Client) Version(ctx context.Context) (string, error) {
 	var version struct {
 		Version string `json:"version"`
--- a/api/types.go
+++ b/api/types.go
@@ -4,6 +4,7 @@ import (
 	"encoding/json"
 	"errors"
 	"fmt"
+	"log/slog"
 	"math"
 	"os"
 	"reflect"
@@ -12,6 +13,7 @@ import (
 	"time"
 )

+// StatusError is an error with and HTTP status code.
 type StatusError struct {
 	StatusCode   int
 	Status       string
@@ -32,6 +34,7 @@ func (e StatusError) Error() string {
 	}
 }

+// ImageData represents the raw binary data of an image file.
 type ImageData []byte

 // GenerateRequest describes a request sent by [Client.Generate]. While you
@@ -77,26 +80,44 @@ type GenerateRequest struct {
 	Options map[string]interface{} `json:"options"`
 }

+// ChatRequest describes a request sent by [Client.Chat].
 type ChatRequest struct {
-	Model     string    `json:"model"`
-	Messages  []Message `json:"messages"`
-	Stream    *bool     `json:"stream,omitempty"`
-	Format    string    `json:"format"`
+	// Model is the model name, as in [GenerateRequest].
+	Model string `json:"model"`
+
+	// Messages is the messages of the chat - can be used to keep a chat memory.
+	Messages []Message `json:"messages"`
+
+	// Stream enable streaming of returned response; true by default.
+	Stream *bool `json:"stream,omitempty"`
+
+	// Format is the format to return the response in (e.g. "json").
+	Format string `json:"format"`
+
+	// KeepAlive controls how long the model will stay loaded into memory
+	// followin the request.
 	KeepAlive *Duration `json:"keep_alive,omitempty"`

+	// Options lists model-specific options.
 	Options map[string]interface{} `json:"options"`
 }

+// Message is a single message in a chat sequence. The message contains the
+// role ("system", "user", or "assistant"), the content and an optional list
+// of images.
 type Message struct {
-	Role    string      `json:"role"` // one of ["system", "user", "assistant"]
+	Role    string      `json:"role"`
 	Content string      `json:"content"`
 	Images  []ImageData `json:"images,omitempty"`
 }

+// ChatResponse is the response returned by [Client.Chat]. Its fields are
+// similar to [GenerateResponse].
 type ChatResponse struct {
-	Model     string    `json:"model"`
-	CreatedAt time.Time `json:"created_at"`
-	Message   Message   `json:"message"`
+	Model      string    `json:"model"`
+	CreatedAt  time.Time `json:"created_at"`
+	Message    Message   `json:"message"`
+	DoneReason string    `json:"done_reason,omitempty"`

 	Done bool `json:"done"`

@@ -112,7 +133,8 @@ type Metrics struct {
 	EvalDuration       time.Duration `json:"eval_duration,omitempty"`
 }

-// Options specified in GenerateRequest, if you add a new option here add it to the API docs also
+// Options specified in [GenerateRequest], if you add a new option here add it
+// to the API docs also.
 type Options struct {
 	Runner

@@ -141,7 +163,6 @@ type Runner struct {
 	UseNUMA   bool `json:"numa,omitempty"`
 	NumCtx    int  `json:"num_ctx,omitempty"`
 	NumBatch  int  `json:"num_batch,omitempty"`
-	NumGQA    int  `json:"num_gqa,omitempty"`
 	NumGPU    int  `json:"num_gpu,omitempty"`
 	MainGPU   int  `json:"main_gpu,omitempty"`
 	LowVRAM   bool `json:"low_vram,omitempty"`
@@ -151,14 +172,30 @@ type Runner struct {
 	UseMMap   bool `json:"use_mmap,omitempty"`
 	UseMLock  bool `json:"use_mlock,omitempty"`
 	NumThread int  `json:"num_thread,omitempty"`
-
-	// Unused: RopeFrequencyBase is ignored. Instead the value in the model will be used
-	RopeFrequencyBase float32 `json:"rope_frequency_base,omitempty"`
-	// Unused: RopeFrequencyScale is ignored. Instead the value in the model will be used
-	RopeFrequencyScale float32 `json:"rope_frequency_scale,omitempty"`
 }

+// EmbeddingRequest is the request passed to [Client.Embeddings].
 type EmbeddingRequest struct {
+	// Model is the model name.
+	Model string `json:"model"`
+
+	// Prompt is the textual prompt to embed.
+	Prompt string `json:"prompt"`
+
+	// KeepAlive controls how long the model will stay loaded in memory following
+	// this request.
+	KeepAlive *Duration `json:"keep_alive,omitempty"`
+
+	// Options lists model-specific options.
+	Options map[string]interface{} `json:"options"`
+}
+
+// EmbeddingResponse is the response from [Client.Embeddings].
+type EmbeddingResponse struct {
+	Embedding []float64 `json:"embedding"`
+}
+
+type TokenizeRequest struct {
 	Model     string    `json:"model"`
 	Prompt    string    `json:"prompt"`
 	KeepAlive *Duration `json:"keep_alive,omitempty"`
@@ -166,10 +203,11 @@ type EmbeddingRequest struct {
 	Options map[string]interface{} `json:"options"`
 }

-type EmbeddingResponse struct {
-	Embedding []float64 `json:"embedding"`
+type TokenizeResponse struct {
+	Tokens []int `json:"tokens"`
 }

+// CreateRequest is the request passed to [Client.Create].
 type CreateRequest struct {
 	Model        string `json:"model"`
 	Path         string `json:"path"`
@@ -181,6 +219,7 @@ type CreateRequest struct {
 	Name string `json:"name"`
 }

+// DeleteRequest is the request passed to [Client.Delete].
 type DeleteRequest struct {
 	Model string `json:"model"`

@@ -188,6 +227,7 @@ type DeleteRequest struct {
 	Name string `json:"name"`
 }

+// ShowRequest is the request passed to [Client.Show].
 type ShowRequest struct {
 	Model    string `json:"model"`
 	System   string `json:"system"`
@@ -199,6 +239,7 @@ type ShowRequest struct {
 	Name string `json:"name"`
 }

+// ShowResponse is the response returned from [Client.Show].
 type ShowResponse struct {
 	License    string       `json:"license,omitempty"`
 	Modelfile  string       `json:"modelfile,omitempty"`
@@ -209,11 +250,13 @@ type ShowResponse struct {
 	Messages   []Message    `json:"messages,omitempty"`
 }

+// CopyRequest is the request passed to [Client.Copy].
 type CopyRequest struct {
 	Source      string `json:"source"`
 	Destination string `json:"destination"`
 }

+// PullRequest is the request passed to [Client.Pull].
 type PullRequest struct {
 	Model    string `json:"model"`
 	Insecure bool   `json:"insecure,omitempty"`
@@ -225,6 +268,8 @@ type PullRequest struct {
 	Name string `json:"name"`
 }

+// ProgressResponse is the response passed to progress functions like
+// [PullProgressFunc] and [PushProgressFunc].
 type ProgressResponse struct {
 	Status    string `json:"status"`
 	Digest    string `json:"digest,omitempty"`
@@ -232,6 +277,7 @@ type ProgressResponse struct {
 	Completed int64  `json:"completed,omitempty"`
 }

+// PushRequest is the request passed to [Client.Push].
 type PushRequest struct {
 	Model    string `json:"model"`
 	Insecure bool   `json:"insecure,omitempty"`
@@ -243,10 +289,12 @@ type PushRequest struct {
 	Name string `json:"name"`
 }

+// ListResponse is the response from [Client.List].
 type ListResponse struct {
 	Models []ModelResponse `json:"models"`
 }

+// ModelResponse is a single model description in [ListResponse].
 type ModelResponse struct {
 	Name       string       `json:"name"`
 	Model      string       `json:"model"`
@@ -260,17 +308,31 @@ type TokenResponse struct {
 	Token string `json:"token"`
 }

+// GenerateResponse is the response passed into [GenerateResponseFunc].
 type GenerateResponse struct {
-	Model     string    `json:"model"`
-	CreatedAt time.Time `json:"created_at"`
-	Response  string    `json:"response"`
+	// Model is the model name that generated the response.
+	Model string `json:"model"`

-	Done    bool  `json:"done"`
+	//CreatedAt is the timestamp of the response.
+	CreatedAt time.Time `json:"created_at"`
+
+	// Response is the textual response itself.
+	Response string `json:"response"`
+
+	// Done specifies if the response is complete.
+	Done bool `json:"done"`
+
+	// DoneReason is the reason the model stopped generating text.
+	DoneReason string `json:"done_reason,omitempty"`
+
+	// Context is an encoding of the conversation used in this response; this
+	// can be sent in the next request to keep a conversational memory.
 	Context []int `json:"context,omitempty"`

 	Metrics
 }

+// ModelDetails provides details about a model.
 type ModelDetails struct {
 	ParentModel       string   `json:"parent_model"`
 	Format            string   `json:"format"`
@@ -308,7 +370,6 @@ func (m *Metrics) Summary() {
 	}
 }

-var ErrInvalidOpts = errors.New("invalid options")
 var ErrInvalidHostPort = errors.New("invalid port specified in OLLAMA_HOST")

 func (opts *Options) FromMap(m map[string]interface{}) error {
@@ -324,76 +385,76 @@ func (opts *Options) FromMap(m map[string]interface{}) error {
 		}
 	}

-	invalidOpts := []string{}
 	for key, val := range m {
-		if opt, ok := jsonOpts[key]; ok {
-			field := valueOpts.FieldByName(opt.Name)
-			if field.IsValid() && field.CanSet() {
-				if val == nil {
-					continue
-				}
+		opt, ok := jsonOpts[key]
+		if !ok {
+			slog.Warn("invalid option provided", "option", opt.Name)
+			continue
+		}

-				switch field.Kind() {
-				case reflect.Int:
-					switch t := val.(type) {
-					case int64:
-						field.SetInt(t)
-					case float64:
-						// when JSON unmarshals numbers, it uses float64, not int
-						field.SetInt(int64(t))
-					default:
-						return fmt.Errorf("option %q must be of type integer", key)
-					}
-				case reflect.Bool:
-					val, ok := val.(bool)
-					if !ok {
-						return fmt.Errorf("option %q must be of type boolean", key)
-					}
-					field.SetBool(val)
-				case reflect.Float32:
-					// JSON unmarshals to float64
-					val, ok := val.(float64)
-					if !ok {
-						return fmt.Errorf("option %q must be of type float32", key)
-					}
-					field.SetFloat(val)
-				case reflect.String:
-					val, ok := val.(string)
-					if !ok {
-						return fmt.Errorf("option %q must be of type string", key)
-					}
-					field.SetString(val)
-				case reflect.Slice:
-					// JSON unmarshals to []interface{}, not []string
-					val, ok := val.([]interface{})
-					if !ok {
-						return fmt.Errorf("option %q must be of type array", key)
-					}
-					// convert []interface{} to []string
-					slice := make([]string, len(val))
-					for i, item := range val {
-						str, ok := item.(string)
-						if !ok {
-							return fmt.Errorf("option %q must be of an array of strings", key)
-						}
-						slice[i] = str
-					}
-					field.Set(reflect.ValueOf(slice))
-				default:
-					return fmt.Errorf("unknown type loading config params: %v", field.Kind())
-				}
+		field := valueOpts.FieldByName(opt.Name)
+		if field.IsValid() && field.CanSet() {
+			if val == nil {
+				continue
+			}
+
+			switch field.Kind() {
+			case reflect.Int:
+				switch t := val.(type) {
+				case int64:
+					field.SetInt(t)
+				case float64:
+					// when JSON unmarshals numbers, it uses float64, not int
+					field.SetInt(int64(t))
+				default:
+					return fmt.Errorf("option %q must be of type integer", key)
+				}
+			case reflect.Bool:
+				val, ok := val.(bool)
+				if !ok {
+					return fmt.Errorf("option %q must be of type boolean", key)
+				}
+				field.SetBool(val)
+			case reflect.Float32:
+				// JSON unmarshals to float64
+				val, ok := val.(float64)
+				if !ok {
+					return fmt.Errorf("option %q must be of type float32", key)
+				}
+				field.SetFloat(val)
+			case reflect.String:
+				val, ok := val.(string)
+				if !ok {
+					return fmt.Errorf("option %q must be of type string", key)
+				}
+				field.SetString(val)
+			case reflect.Slice:
+				// JSON unmarshals to []interface{}, not []string
+				val, ok := val.([]interface{})
+				if !ok {
+					return fmt.Errorf("option %q must be of type array", key)
+				}
+				// convert []interface{} to []string
+				slice := make([]string, len(val))
+				for i, item := range val {
+					str, ok := item.(string)
+					if !ok {
+						return fmt.Errorf("option %q must be of an array of strings", key)
+					}
+					slice[i] = str
+				}
+				field.Set(reflect.ValueOf(slice))
+			default:
+				return fmt.Errorf("unknown type loading config params: %v", field.Kind())
 			}
-		} else {
-			invalidOpts = append(invalidOpts, key)
 		}
 	}

-	if len(invalidOpts) > 0 {
-		return fmt.Errorf("%w: %v", ErrInvalidOpts, strings.Join(invalidOpts, ", "))
-	}
 	return nil
 }

+// DefaultOptions is the default set of options for [GenerateRequest]; these
+// values are used unless the user specifies other values explicitly.
 func DefaultOptions() Options {
 	return Options{
 		// options set on request to runner
@@ -421,8 +482,7 @@ func DefaultOptions() Options {
 			NumCtx:    2048,
 			NumBatch:  512,
 			NumGPU:    -1, // -1 here indicates that NumGPU should be set dynamically
-			NumGQA:    1,
-			NumThread: 0, // let the runtime decide
+			NumThread: 0,  // let the runtime decide
 			LowVRAM:   false,
 			F16KV:     true,
 			UseMLock:  false,
--- a/cmd/interactive.go
+++ b/cmd/interactive.go
@@ -162,7 +162,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 		fmt.Fprintln(os.Stderr, "  /set parameter repeat_penalty <float> How strongly to penalize repetitions")
 		fmt.Fprintln(os.Stderr, "  /set parameter repeat_last_n <int>    Set how far back to look for repetitions")
 		fmt.Fprintln(os.Stderr, "  /set parameter num_gpu <int>          The number of layers to send to the GPU")
-		fmt.Fprintln(os.Stderr, "  /set parameter stop \"<string>\", ...   Set the stop parameters")
+		fmt.Fprintln(os.Stderr, "  /set parameter stop <string> <string> ...   Set the stop parameters")
 		fmt.Fprintln(os.Stderr, "")
 	}

--- a/convert/convert.go
+++ b/convert/convert.go
@@ -5,6 +5,7 @@ import (
 	"encoding/binary"
 	"encoding/json"
 	"fmt"
+	"io"
 	"log/slog"
 	"os"
 	"path/filepath"
@@ -47,7 +48,7 @@ type ByteOrder interface {
 type ModelArch interface {
 	GetTensors() error
 	LoadVocab() error
-	WriteGGUF() (string, error)
+	WriteGGUF(io.WriteSeeker) error
 }

 type ModelFormat interface {
--- a/convert/gemma.go
+++ b/convert/gemma.go
@@ -94,7 +94,7 @@ func (m *GemmaModel) LoadVocab() error {
 	return nil
 }

-func (m *GemmaModel) WriteGGUF() (string, error) {
+func (m *GemmaModel) WriteGGUF(ws io.WriteSeeker) error {
 	kv := llm.KV{
 		"general.architecture":                   "gemma",
 		"general.name":                           m.Name,
@@ -122,16 +122,5 @@ func (m *GemmaModel) WriteGGUF() (string, error) {
 		"tokenizer.ggml.add_eos_token":    false,
 	}

-	f, err := os.CreateTemp("", "ollama-gguf")
-	if err != nil {
-		return "", err
-	}
-	defer f.Close()
-
-	mod := llm.NewGGUFV3(m.Params.ByteOrder)
-	if err := mod.Encode(f, kv, m.Tensors); err != nil {
-		return "", err
-	}
-
-	return f.Name(), nil
+	return llm.NewGGUFV3(m.Params.ByteOrder).Encode(ws, kv, m.Tensors)
 }
--- a/convert/llama.go
+++ b/convert/llama.go
@@ -5,7 +5,6 @@ import (
 	"fmt"
 	"io"
 	"log/slog"
-	"os"
 	"regexp"
 	"strings"

@@ -132,7 +131,7 @@ func (m *LlamaModel) LoadVocab() error {
 	return nil
 }

-func (m *LlamaModel) WriteGGUF() (string, error) {
+func (m *LlamaModel) WriteGGUF(ws io.WriteSeeker) error {
 	kv := llm.KV{
 		"general.architecture":                   "llama",
 		"general.name":                           m.Name,
@@ -159,18 +158,5 @@ func (m *LlamaModel) WriteGGUF() (string, error) {
 		"tokenizer.ggml.add_eos_token":    false,
 	}

-	f, err := os.CreateTemp("", "ollama-gguf")
-	if err != nil {
-		return "", err
-	}
-	defer f.Close()
-
-	mod := llm.NewGGUFV3(m.Params.ByteOrder)
-	if err := mod.Encode(f, kv, m.Tensors); err != nil {
-		return "", err
-	}
-
-	slog.Debug(fmt.Sprintf("gguf file = %s", f.Name()))
-
-	return f.Name(), nil
+	return llm.NewGGUFV3(m.Params.ByteOrder).Encode(ws, kv, m.Tensors)
 }
--- a/convert/mistral.go
+++ b/convert/mistral.go
@@ -132,7 +132,7 @@ func (m *MistralModel) LoadVocab() error {
 	return nil
 }

-func (m *MistralModel) WriteGGUF() (string, error) {
+func (m *MistralModel) WriteGGUF(ws io.WriteSeeker) error {
 	kv := llm.KV{
 		"general.architecture":                   "llama",
 		"general.name":                           m.Name,
@@ -158,16 +158,5 @@ func (m *MistralModel) WriteGGUF() (string, error) {
 		"tokenizer.ggml.unknown_token_id": uint32(0),
 	}

-	f, err := os.CreateTemp("", "ollama-gguf")
-	if err != nil {
-		return "", err
-	}
-	defer f.Close()
-
-	mod := llm.NewGGUFV3(m.Params.ByteOrder)
-	if err := mod.Encode(f, kv, m.Tensors); err != nil {
-		return "", err
-	}
-
-	return f.Name(), nil
+	return llm.NewGGUFV3(m.Params.ByteOrder).Encode(ws, kv, m.Tensors)
 }
--- a/convert/mixtral.go
+++ b/convert/mixtral.go
@@ -1,7 +1,7 @@
 package convert

 import (
-	"os"
+	"io"
 	"regexp"

 	"github.com/ollama/ollama/llm"
@@ -47,7 +47,7 @@ func (m *MixtralModel) LoadVocab() error {
 	return nil
 }

-func (m *MixtralModel) WriteGGUF() (string, error) {
+func (m *MixtralModel) WriteGGUF(ws io.WriteSeeker) error {
 	kv := llm.KV{
 		"general.architecture":          "llama",
 		"general.name":                  m.Name,
@@ -81,16 +81,5 @@ func (m *MixtralModel) WriteGGUF() (string, error) {
 		"tokenizer.ggml.add_eos_token":    false,
 	}

-	f, err := os.CreateTemp("", "ollama-gguf")
-	if err != nil {
-		return "", err
-	}
-	defer f.Close()
-
-	mod := llm.NewGGUFV3(m.Params.ByteOrder)
-	if err := mod.Encode(f, kv, m.Tensors); err != nil {
-		return "", err
-	}
-
-	return f.Name(), nil
+	return llm.NewGGUFV3(m.Params.ByteOrder).Encode(ws, kv, m.Tensors)
 }
--- a/docs/README.md
+++ b/docs/README.md
@@ -6,7 +6,7 @@
 * [Importing models](./import.md)
 * [Linux Documentation](./linux.md)
 * [Windows Documentation](./windows.md)
-* [Docker Documentation](https://hub.docker.com/r/ollama/ollama)
+* [Docker Documentation](./docker.md)

 ### Reference

--- a/docs/api.md
+++ b/docs/api.md
@@ -313,7 +313,6 @@ curl http://localhost:11434/api/generate -d '{
    "numa": false,
    "num_ctx": 1024,
    "num_batch": 2,
-    "num_gqa": 1,
    "num_gpu": 1,
    "main_gpu": 0,
    "low_vram": false,
@@ -321,8 +320,6 @@ curl http://localhost:11434/api/generate -d '{
    "vocab_only": false,
    "use_mmap": true,
    "use_mlock": false,
-    "rope_frequency_base": 1.1,
-    "rope_frequency_scale": 0.8,
    "num_thread": 8
  }
 }'
--- a/docs/docker.md
+++ b/docs/docker.md
@@ -0,0 +1,71 @@
+# Ollama Docker image
+
+### CPU only
+
+```bash
+docker run -d -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama
+```
+
+### Nvidia GPU
+Install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installation).
+
+#### Install with Apt
+1.  Configure the repository
+```bash
+curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
+    | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
+curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \
+    | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \
+    | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
+sudo apt-get update
+```
+2.  Install the NVIDIA Container Toolkit packages
+```bash
+sudo apt-get install -y nvidia-container-toolkit
+```
+
+#### Install with Yum or Dnf
+1.  Configure the repository
+    
+```bash
+curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo \
+    | sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo
+```
+    
+2. Install the NVIDIA Container Toolkit packages
+    
+```bash
+sudo yum install -y nvidia-container-toolkit
+```
+
+#### Configure Docker to use Nvidia driver 
+```
+sudo nvidia-ctk runtime configure --runtime=docker
+sudo systemctl restart docker
+```
+
+#### Start the container
+
+```bash
+docker run -d --gpus=all -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama
+```
+
+### AMD GPU
+
+To run Ollama using Docker with AMD GPUs, use the `rocm` tag and the following command:
+
+```
+docker run -d --device /dev/kfd --device /dev/dri -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama:rocm
+```
+
+### Run model locally
+
+Now you can run a model:
+
+```
+docker exec -it ollama ollama run llama3
+```
+
+### Try different models
+
+More models can be found on the [Ollama library](https://ollama.com/library).
--- a/docs/troubleshooting.md
+++ b/docs/troubleshooting.md
@@ -82,4 +82,23 @@ curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION="0.1.29" sh
 If your system is configured with the "noexec" flag where Ollama stores its
 temporary executable files, you can specify an alternate location by setting
 OLLAMA_TMPDIR to a location writable by the user ollama runs as.  For example
-OLLAMA_TMPDIR=/usr/share/ollama/
+OLLAMA_TMPDIR=/usr/share/ollama/
+
+## Container fails to run on NVIDIA GPU
+
+Make sure you've set up the conatiner runtime first as described in [docker.md](./docker.md)
+
+Sometimes the container runtime can have difficulties initializing the GPU.
+When you check the server logs, this can show up as various error codes, such
+as "3" (not initialized), "46" (device unavailable), "100" (no device), "999"
+(unknown), or others.  The following troubleshooting techniques may help resolve
+the problem
+
+- Is the uvm driver not loaded? `sudo nvidia-modprobe -u`
+- Try reloading the nvidia_uvm driver - `sudo rmmod nvidia_uvm` then `sudo modprobe nvidia_uvm`
+- Try rebooting
+- Make sure you're running the latest nvidia drivers
+
+If none of those resolve the problem, gather additional information and file an issue:
+- Set `CUDA_ERROR_LEVEL=50` and try again to get more diagnostic logs
+- Check dmesg for any errors `sudo dmesg | grep -i nvrm` and `sudo dmesg | grep -i nvidia`
--- a/docs/tutorials/langchainjs.md
+++ b/docs/tutorials/langchainjs.md
@@ -5,13 +5,13 @@ In this tutorial, we are going to use JavaScript with LangChain and Ollama to le
 To get started, let's just use **LangChain** to ask a simple question to a model. To do this with JavaScript, we need to install **LangChain**:

 ```bash
-npm install langchain
+npm install @langchain/community
 ```

 Now we can start building out our JavaScript:

 ```javascript
-import { Ollama } from "langchain/llms/ollama";
+import { Ollama } from "@langchain/community/llms/ollama";

 const ollama = new Ollama({
  baseUrl: "http://localhost:11434",
--- a/docs/tutorials/langchainpy.md
+++ b/docs/tutorials/langchainpy.md
@@ -12,7 +12,7 @@ So let's figure out how we can use **LangChain** with Ollama to ask our question

 Let's start by asking a simple question that we can get an answer to from the **Llama2** model using **Ollama**. First, we need to install the **LangChain** package:

-`pip install langchain`
+`pip install langchain_community`

 Then we can create a model and ask the question:

--- a/examples/bash-comparemodels/README.md
+++ b/examples/bash-comparemodels/README.md
@@ -1,10 +0,0 @@
-# Bash Shell examples
-
-When calling `ollama`, you can pass it a file to run all the prompts in the file, one after the other:
-
-`ollama run llama3 < sourcequestions.txt`
-
-This concept is used in the following example.
-
-## Compare Models
-`comparemodels.sh` is a script that runs all the questions in `sourcequestions.txt` using any 4 models you choose that you have already pulled from the Ollama library or have created locally.
--- a/examples/bash-comparemodels/comparemodels.sh
+++ b/examples/bash-comparemodels/comparemodels.sh
@@ -1,64 +0,0 @@
-#! /usr/bin/env bash
-# Compare multiple models by running them with the same questions
-
-NUMBEROFCHOICES=4
-SELECTIONS=()
-declare -a SUMS=()
-
-# Get the list of models
-CHOICES=$(ollama list | awk '{print $1}')
-
-# Select which models to run as a comparison
-echo "Select $NUMBEROFCHOICES models to compare:"
-select ITEM in $CHOICES; do
-    if [[ -n $ITEM ]]; then
-        echo "You have selected $ITEM"
-        SELECTIONS+=("$ITEM")
-        ((COUNT++))
-        if [[ $COUNT -eq $NUMBEROFCHOICES ]]; then
-            break
-        fi
-    else
-        echo "Invalid selection"
-    fi
-done
-
-# Loop through each of the selected models
-for ITEM in "${SELECTIONS[@]}"; do
-    echo "--------------------------------------------------------------"
-    echo "Loading the model $ITEM into memory"
-    ollama run "$ITEM" ""
-    echo "--------------------------------------------------------------"
-    echo "Running the questions through the model $ITEM"
-    COMMAND_OUTPUT=$(ollama run "$ITEM" --verbose < sourcequestions.txt 2>&1| tee /dev/stderr)
-
-    # eval duration is sometimes listed in seconds and sometimes in milliseconds. 
-    # Add up the values for each model
-    SUM=$(echo "$COMMAND_OUTPUT" | awk '
-    /eval duration:/ {
-        value = $3
-        if (index(value, "ms") > 0) {
-            gsub("ms", "", value)
-            value /= 1000
-        } else {
-            gsub("s", "", value)
-        }
-        sum += value
-    }
-    END { print sum }')
-
-
-    SUMS+=("All questions for $ITEM completed in $SUM seconds")
-done
-
-echo ""
-echo "--------------------------------------------------------------"
-echo -e "Sums of eval durations for each run:"
-for val in "${SUMS[@]}"; do
-    echo "$val"
-done
-
-echo "--------------------------------------------------------------"
-echo "Comparison complete. Now you can decide"
-echo "which model is best."
-echo "--------------------------------------------------------------"
--- a/examples/bash-comparemodels/sourcequestions.txt
+++ b/examples/bash-comparemodels/sourcequestions.txt
@@ -1,7 +0,0 @@
-Why is the sky blue
-What is a black hole
-Explain the big bang theory like I am 5?
-What is the quickest way to win a game of Monopoly with 3 others?
-Why does a vacuum bottle keep my coffee hot and my milkshake cold?
-What is the difference between a meteor, a meteorite, and a meteoroid?
-Create an array with 5 items and print to the console. Do this in Python, C#, Typescript, and Rust.
--- a/format/bytes.go
+++ b/format/bytes.go
@@ -53,6 +53,8 @@ func HumanBytes(b int64) string {

 func HumanBytes2(b uint64) string {
 	switch {
+	case b >= GibiByte:
+		return fmt.Sprintf("%.1f GiB", float64(b)/GibiByte)
 	case b >= MebiByte:
 		return fmt.Sprintf("%.1f MiB", float64(b)/MebiByte)
 	case b >= KibiByte:
--- a/format/format.go
+++ b/format/format.go
@@ -13,12 +13,20 @@ const (

 func HumanNumber(b uint64) string {
 	switch {
-	case b > Billion:
-		return fmt.Sprintf("%.0fB", math.Round(float64(b)/Billion))
-	case b > Million:
-		return fmt.Sprintf("%.0fM", math.Round(float64(b)/Million))
-	case b > Thousand:
-		return fmt.Sprintf("%.0fK", math.Round(float64(b)/Thousand))
+	case b >= Billion:
+		number := float64(b) / Billion
+		if number == math.Floor(number) {
+			return fmt.Sprintf("%.0fB", number) // no decimals if whole number
+		}
+		return fmt.Sprintf("%.1fB", number) // one decimal if not a whole number
+	case b >= Million:
+		number := float64(b) / Million
+		if number == math.Floor(number) {
+			return fmt.Sprintf("%.0fM", number) // no decimals if whole number
+		}
+		return fmt.Sprintf("%.2fM", number) // two decimals if not a whole number
+	case b >= Thousand:
+		return fmt.Sprintf("%.0fK", float64(b)/Thousand)
 	default:
 		return fmt.Sprintf("%d", b)
 	}
--- a/format/format_test.go
+++ b/format/format_test.go
@@ -0,0 +1,34 @@
+package format
+
+import (
+	"testing"
+)
+
+func TestHumanNumber(t *testing.T) {
+
+	type testCase struct {
+		input    uint64
+		expected string
+	}
+
+	testCases := []testCase{
+		{0, "0"},
+		{1000000, "1M"},
+		{125000000, "125M"},
+		{500500000, "500.50M"},
+		{500550000, "500.55M"},
+		{1000000000, "1B"},
+		{2800000000, "2.8B"},
+		{2850000000, "2.9B"},
+		{1000000000000, "1000B"},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.expected, func(t *testing.T) {
+			result := HumanNumber(tc.input)
+			if result != tc.expected {
+				t.Errorf("Expected %s, got %s", tc.expected, result)
+			}
+		})
+	}
+}
--- a/gpu/amd_hip_windows.go
+++ b/gpu/amd_hip_windows.go
@@ -3,7 +3,6 @@ package gpu
 import (
 	"fmt"
 	"log/slog"
-	"strconv"
 	"syscall"
 	"unsafe"

@@ -74,16 +73,22 @@ func (hl *HipLib) Release() {
 	hl.dll = 0
 }

-func (hl *HipLib) AMDDriverVersion() (string, error) {
+func (hl *HipLib) AMDDriverVersion() (driverMajor, driverMinor int, err error) {
 	if hl.dll == 0 {
-		return "", fmt.Errorf("dll has been unloaded")
+		return 0, 0, fmt.Errorf("dll has been unloaded")
 	}
 	var version int
 	status, _, err := syscall.SyscallN(hl.hipDriverGetVersion, uintptr(unsafe.Pointer(&version)))
 	if status != hipSuccess {
-		return "", fmt.Errorf("failed call to hipDriverGetVersion: %d %s", status, err)
+		return 0, 0, fmt.Errorf("failed call to hipDriverGetVersion: %d %s", status, err)
 	}
-	return strconv.Itoa(version), nil
+
+	slog.Debug("hipDriverGetVersion", "version", version)
+	// TODO - this isn't actually right, but the docs claim hipDriverGetVersion isn't accurate anyway...
+	driverMajor = version / 1000
+	driverMinor = (version - (driverMajor * 1000)) / 10
+
+	return driverMajor, driverMinor, nil
 }

 func (hl *HipLib) HipGetDeviceCount() int {
--- a/gpu/amd_linux.go
+++ b/gpu/amd_linux.go
@@ -8,6 +8,7 @@ import (
 	"log/slog"
 	"os"
 	"path/filepath"
+	"regexp"
 	"slices"
 	"strconv"
 	"strings"
@@ -41,10 +42,8 @@ func AMDGetGPUInfo() []GpuInfo {
 	}

 	// Opportunistic logging of driver version to aid in troubleshooting
-	ver, err := AMDDriverVersion()
-	if err == nil {
-		slog.Info("AMD Driver: " + ver)
-	} else {
+	driverMajor, driverMinor, err := AMDDriverVersion()
+	if err != nil {
 		// TODO - if we see users crash and burn with the upstreamed kernel this can be adjusted to hard-fail rocm support and fallback to CPU
 		slog.Warn("ollama recommends running the https://www.amd.com/en/support/linux-drivers", "error", err)
 	}
@@ -91,6 +90,7 @@ func AMDGetGPUInfo() []GpuInfo {
 		scanner := bufio.NewScanner(fp)
 		isCPU := false
 		var major, minor, patch uint64
+		var vendor, device uint64
 		for scanner.Scan() {
 			line := strings.TrimSpace(scanner.Text())
 			// Note: we could also use "cpu_cores_count X" where X is greater than zero to detect CPUs
@@ -118,6 +118,26 @@ func AMDGetGPUInfo() []GpuInfo {
 					slog.Debug("malformed int " + line)
 					continue
 				}
+			} else if strings.HasPrefix(line, "vendor_id") {
+				ver := strings.Fields(line)
+				if len(ver) != 2 {
+					slog.Debug("malformed vendor_id", "vendor_id", line)
+					continue
+				}
+				vendor, err = strconv.ParseUint(ver[1], 10, 32)
+				if err != nil {
+					slog.Debug("malformed vendor_id" + line)
+				}
+			} else if strings.HasPrefix(line, "device_id") {
+				ver := strings.Fields(line)
+				if len(ver) != 2 {
+					slog.Debug("malformed device_id", "device_id", line)
+					continue
+				}
+				device, err = strconv.ParseUint(ver[1], 10, 32)
+				if err != nil {
+					slog.Debug("malformed device_id" + line)
+				}
 			}

 			// TODO - any other properties we want to extract and record?
@@ -140,7 +160,7 @@ func AMDGetGPUInfo() []GpuInfo {
 		}

 		if int(major) < RocmComputeMin {
-			slog.Warn(fmt.Sprintf("amdgpu too old gfx%d%d%x", major, minor, patch), "gpu", gpuID)
+			slog.Warn(fmt.Sprintf("amdgpu too old gfx%d%x%x", major, minor, patch), "gpu", gpuID)
 			continue
 		}

@@ -210,24 +230,29 @@ func AMDGetGPUInfo() []GpuInfo {

 		// iGPU detection, remove this check once we can support an iGPU variant of the rocm library
 		if totalMemory < IGPUMemLimit {
-			slog.Info("amdgpu appears to be an iGPU, skipping", "gpu", gpuID, "total", format.HumanBytes2(totalMemory))
+			slog.Info("unsupported Radeon iGPU detected skipping", "id", gpuID, "total", format.HumanBytes2(totalMemory))
 			continue
 		}
+		var name string
+		// TODO - PCI ID lookup
+		if vendor > 0 && device > 0 {
+			name = fmt.Sprintf("%04x:%04x", vendor, device)
+		}

-		slog.Info("amdgpu memory", "gpu", gpuID, "total", format.HumanBytes2(totalMemory))
-		slog.Info("amdgpu memory", "gpu", gpuID, "available", format.HumanBytes2(totalMemory-usedMemory))
+		slog.Debug("amdgpu memory", "gpu", gpuID, "total", format.HumanBytes2(totalMemory))
+		slog.Debug("amdgpu memory", "gpu", gpuID, "available", format.HumanBytes2(totalMemory-usedMemory))
 		gpuInfo := GpuInfo{
 			Library: "rocm",
 			memInfo: memInfo{
 				TotalMemory: totalMemory,
 				FreeMemory:  (totalMemory - usedMemory),
 			},
-			ID: fmt.Sprintf("%d", gpuID),
-			// Name: not exposed in sysfs directly, would require pci device id lookup
-			Major:         int(major),
-			Minor:         int(minor),
-			Patch:         int(patch),
+			ID:            fmt.Sprintf("%d", gpuID),
+			Name:          name,
+			Compute:       fmt.Sprintf("gfx%d%x%x", major, minor, patch),
 			MinimumMemory: rocmMinimumMemory,
+			DriverMajor:   driverMajor,
+			DriverMinor:   driverMinor,
 		}

 		// If the user wants to filter to a subset of devices, filter out if we aren't a match
@@ -266,7 +291,7 @@ func AMDGetGPUInfo() []GpuInfo {
 				}
 				slog.Debug("rocm supported GPUs", "types", supported)
 			}
-			gfx := fmt.Sprintf("gfx%d%d%x", gpuInfo.Major, gpuInfo.Minor, gpuInfo.Patch)
+			gfx := gpuInfo.Compute
 			if !slices.Contains[[]string, string](supported, gfx) {
 				slog.Warn("amdgpu is not supported", "gpu", gpuInfo.ID, "gpu_type", gfx, "library", libDir, "supported_types", supported)
 				// TODO - consider discrete markdown just for ROCM troubleshooting?
@@ -276,7 +301,7 @@ func AMDGetGPUInfo() []GpuInfo {
 				slog.Info("amdgpu is supported", "gpu", gpuInfo.ID, "gpu_type", gfx)
 			}
 		} else {
-			slog.Debug("skipping rocm gfx compatibility check with HSA_OVERRIDE_GFX_VERSION=" + gfxOverride)
+			slog.Info("skipping rocm gfx compatibility check", "HSA_OVERRIDE_GFX_VERSION", gfxOverride)
 		}

 		// The GPU has passed all the verification steps and is supported
@@ -322,19 +347,34 @@ func AMDValidateLibDir() (string, error) {
 	return "", fmt.Errorf("no suitable rocm found, falling back to CPU")
 }

-func AMDDriverVersion() (string, error) {
-	_, err := os.Stat(DriverVersionFile)
+func AMDDriverVersion() (driverMajor, driverMinor int, err error) {
+	_, err = os.Stat(DriverVersionFile)
 	if err != nil {
-		return "", fmt.Errorf("amdgpu version file missing: %s %w", DriverVersionFile, err)
+		return 0, 0, fmt.Errorf("amdgpu version file missing: %s %w", DriverVersionFile, err)
 	}
 	fp, err := os.Open(DriverVersionFile)
 	if err != nil {
-		return "", err
+		return 0, 0, err
 	}
 	defer fp.Close()
 	verString, err := io.ReadAll(fp)
 	if err != nil {
-		return "", err
+		return 0, 0, err
 	}
-	return strings.TrimSpace(string(verString)), nil
+
+	pattern := `\A(\d+)\.(\d+).*`
+	regex := regexp.MustCompile(pattern)
+	match := regex.FindStringSubmatch(string(verString))
+	if len(match) < 2 {
+		return 0, 0, fmt.Errorf("malformed version string %s", string(verString))
+	}
+	driverMajor, err = strconv.Atoi(match[1])
+	if err != nil {
+		return 0, 0, err
+	}
+	driverMinor, err = strconv.Atoi(match[2])
+	if err != nil {
+		return 0, 0, err
+	}
+	return driverMajor, driverMinor, nil
 }
--- a/gpu/amd_windows.go
+++ b/gpu/amd_windows.go
@@ -7,7 +7,6 @@ import (
 	"os"
 	"path/filepath"
 	"slices"
-	"strconv"
 	"strings"

 	"github.com/ollama/ollama/format"
@@ -34,13 +33,12 @@ func AMDGetGPUInfo() []GpuInfo {
 	}
 	defer hl.Release()

-	ver, err := hl.AMDDriverVersion()
-	if err == nil {
-		slog.Info("AMD Driver: " + ver)
-	} else {
-		// For now this is benign, but we may eventually need to fail compatibility checks
-		slog.Debug("error looking up amd driver version", "error", err)
-	}
+	// TODO - this reports incorrect version information, so omitting for now
+	// driverMajor, driverMinor, err := hl.AMDDriverVersion()
+	// if err != nil {
+	// 	// For now this is benign, but we may eventually need to fail compatibility checks
+	// 	slog.Debug("error looking up amd driver version", "error", err)
+	// }

 	// Note: the HIP library automatically handles subsetting to any HIP_VISIBLE_DEVICES the user specified
 	count := hl.HipGetDeviceCount()
@@ -62,10 +60,10 @@ func AMDGetGPUInfo() []GpuInfo {
 			return nil
 		}
 	} else {
-		slog.Debug("skipping rocm gfx compatibility check with HSA_OVERRIDE_GFX_VERSION=" + gfxOverride)
+		slog.Info("skipping rocm gfx compatibility check", "HSA_OVERRIDE_GFX_VERSION", gfxOverride)
 	}

-	slog.Info("detected hip devices", "count", count)
+	slog.Debug("detected hip devices", "count", count)
 	// TODO how to determine the underlying device ID when visible devices is causing this to subset?
 	for i := 0; i < count; i++ {
 		err = hl.HipSetDevice(i)
@@ -85,18 +83,11 @@ func AMDGetGPUInfo() []GpuInfo {
 		// Can luid be used on windows for setting visible devices (and is it actually set?)
 		n = bytes.IndexByte(props.GcnArchName[:], 0)
 		gfx := string(props.GcnArchName[:n])
-		slog.Info("hip device", "id", i, "name", name, "gfx", gfx)
-		var major, minor, patch string
-		switch len(gfx) {
-		case 6:
-			major, minor, patch = gfx[3:4], gfx[4:5], gfx[5:]
-		case 7:
-			major, minor, patch = gfx[3:5], gfx[5:6], gfx[6:]
-		}
+		slog.Debug("hip device", "id", i, "name", name, "gfx", gfx)
 		//slog.Info(fmt.Sprintf("[%d] Integrated: %d", i, props.iGPU)) // DOESN'T REPORT CORRECTLY!  Always 0
 		// TODO  Why isn't props.iGPU accurate!?
 		if strings.EqualFold(name, iGPUName) {
-			slog.Info("iGPU detected skipping", "id", i)
+			slog.Info("unsupported Radeon iGPU detected skipping", "id", i, "name", name, "gfx", gfx)
 			continue
 		}
 		if gfxOverride == "" {
@@ -106,7 +97,7 @@ func AMDGetGPUInfo() []GpuInfo {
 				slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for HSA_OVERRIDE_GFX_VERSION usage")
 				continue
 			} else {
-				slog.Info("amdgpu is supported", "gpu", i, "gpu_type", gfx)
+				slog.Debug("amdgpu is supported", "gpu", i, "gpu_type", gfx)
 			}
 		}

@@ -124,8 +115,8 @@ func AMDGetGPUInfo() []GpuInfo {

 		// TODO revisit this once ROCm v6 is available on windows.
 		// v5.7 only reports VRAM used by this process, so it's completely wrong and unusable
-		slog.Info("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory))
-		slog.Info("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory))
+		slog.Debug("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory))
+		slog.Debug("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory))
 		gpuInfo := GpuInfo{
 			Library: "rocm",
 			memInfo: memInfo{
@@ -135,31 +126,12 @@ func AMDGetGPUInfo() []GpuInfo {
 			ID:             fmt.Sprintf("%d", i), // TODO this is probably wrong if we specify visible devices
 			DependencyPath: libDir,
 			MinimumMemory:  rocmMinimumMemory,
-		}
-		if major != "" {
-			gpuInfo.Major, err = strconv.Atoi(major)
-			if err != nil {
-				slog.Info("failed to parse version", "version", gfx, "error", err)
-			}
-		}
-		if minor != "" {
-			gpuInfo.Minor, err = strconv.Atoi(minor)
-			if err != nil {
-				slog.Info("failed to parse version", "version", gfx, "error", err)
-			}
-		}
-		if patch != "" {
-			// Patch rev is hex; e.g. gfx90a
-			p, err := strconv.ParseInt(patch, 16, 0)
-			if err != nil {
-				slog.Info("failed to parse version", "version", gfx, "error", err)
-			} else {
-				gpuInfo.Patch = int(p)
-			}
-		}
-		if gpuInfo.Major < RocmComputeMin {
-			slog.Warn(fmt.Sprintf("amdgpu [%s] too old gfx%d%d%x", gpuInfo.ID, gpuInfo.Major, gpuInfo.Minor, gpuInfo.Patch))
-			continue
+			Name:           name,
+			Compute:        gfx,
+
+			// TODO - this information isn't accurate on windows, so don't report it until we find the right way to retrieve
+			// DriverMajor:    driverMajor,
+			// DriverMinor:    driverMinor,
 		}

 		resp = append(resp, gpuInfo)
--- a/gpu/cpu_common.go
+++ b/gpu/cpu_common.go
@@ -8,14 +8,14 @@ import (

 func GetCPUVariant() string {
 	if cpu.X86.HasAVX2 {
-		slog.Info("CPU has AVX2")
+		slog.Debug("CPU has AVX2")
 		return "avx2"
 	}
 	if cpu.X86.HasAVX {
-		slog.Info("CPU has AVX")
+		slog.Debug("CPU has AVX")
 		return "avx"
 	}
-	slog.Info("CPU does not have vector extensions")
+	slog.Debug("CPU does not have vector extensions")
 	// else LCD
 	return ""
 }
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -31,8 +31,8 @@ type handles struct {
 }

 const (
-	cudaMinimumMemory = 256 * format.MebiByte
-	rocmMinimumMemory = 256 * format.MebiByte
+	cudaMinimumMemory = 457 * format.MebiByte
+	rocmMinimumMemory = 457 * format.MebiByte
 )

 var gpuMutex sync.Mutex
@@ -119,12 +119,12 @@ func initGPUHandles() *handles {
 		return gpuHandles
 	}

-	slog.Info("Detecting GPUs")
+	slog.Debug("Detecting GPUs")
 	nvcudaLibPaths := FindGPULibs(nvcudaMgmtName, nvcudaMgmtPatterns)
 	if len(nvcudaLibPaths) > 0 {
 		deviceCount, nvcuda, libPath := LoadNVCUDAMgmt(nvcudaLibPaths)
 		if nvcuda != nil {
-			slog.Info("detected GPUs", "count", deviceCount, "library", libPath)
+			slog.Debug("detected GPUs", "count", deviceCount, "library", libPath)
 			gpuHandles.nvcuda = nvcuda
 			gpuHandles.deviceCount = deviceCount
 			return gpuHandles
@@ -135,7 +135,7 @@ func initGPUHandles() *handles {
 	if len(cudartLibPaths) > 0 {
 		deviceCount, cudart, libPath := LoadCUDARTMgmt(cudartLibPaths)
 		if cudart != nil {
-			slog.Info("detected GPUs", "library", libPath, "count", deviceCount)
+			slog.Debug("detected GPUs", "library", libPath, "count", deviceCount)
 			gpuHandles.cudart = cudart
 			gpuHandles.deviceCount = deviceCount
 			return gpuHandles
@@ -184,10 +184,14 @@ func GetGPUInfo() GpuInfoList {
 		gpuInfo := GpuInfo{
 			Library: "cuda",
 		}
+		var driverMajor int
+		var driverMinor int
 		if gpuHandles.cudart != nil {
 			C.cudart_check_vram(*gpuHandles.cudart, C.int(i), &memInfo)
 		} else {
 			C.nvcuda_check_vram(*gpuHandles.nvcuda, C.int(i), &memInfo)
+			driverMajor = int(gpuHandles.nvcuda.driver_major)
+			driverMinor = int(gpuHandles.nvcuda.driver_minor)
 		}
 		if memInfo.err != nil {
 			slog.Info("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
@@ -201,10 +205,12 @@ func GetGPUInfo() GpuInfoList {
 		gpuInfo.TotalMemory = uint64(memInfo.total)
 		gpuInfo.FreeMemory = uint64(memInfo.free)
 		gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
-		gpuInfo.Major = int(memInfo.major)
-		gpuInfo.Minor = int(memInfo.minor)
+		gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor)
 		gpuInfo.MinimumMemory = cudaMinimumMemory
 		gpuInfo.DependencyPath = depPath
+		gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
+		gpuInfo.DriverMajor = int(driverMajor)
+		gpuInfo.DriverMinor = int(driverMinor)

 		// TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
 		resp = append(resp, gpuInfo)
--- a/gpu/gpu_darwin.go
+++ b/gpu/gpu_darwin.go
@@ -15,7 +15,7 @@ import (
 )

 const (
-	metalMinimumMemory = 384 * format.MebiByte
+	metalMinimumMemory = 512 * format.MebiByte
 )

 func GetGPUInfo() GpuInfoList {
--- a/gpu/gpu_info.h
+++ b/gpu/gpu_info.h
@@ -39,16 +39,19 @@ extern "C" {
 #endif

 #define GPU_ID_LEN 64
+#define GPU_NAME_LEN 96

 typedef struct mem_info {
  char *err;  // If non-nill, caller responsible for freeing
  char gpu_id[GPU_ID_LEN];
+  char gpu_name[GPU_NAME_LEN];
  uint64_t total;
  uint64_t free;

  // Compute Capability
  int major; 
  int minor;
+  int patch;
 } mem_info_t;

 void cpu_check_ram(mem_info_t *resp);
--- a/gpu/gpu_info_cpu.c
+++ b/gpu/gpu_info_cpu.c
@@ -10,8 +10,6 @@ void cpu_check_ram(mem_info_t *resp) {
  if (GlobalMemoryStatusEx(&info) != 0) {
    resp->total = info.ullTotalPhys;
    resp->free = info.ullAvailPhys;
-    resp->major = 0;
-    resp->minor = 0;
    snprintf(&resp->gpu_id[0], GPU_ID_LEN, "0");
  } else {
    resp->err = LOAD_ERR();
@@ -31,8 +29,6 @@ void cpu_check_ram(mem_info_t *resp) {
  } else {
    resp->total = info.totalram * info.mem_unit;
    resp->free = info.freeram * info.mem_unit;
-    resp->major = 0;
-    resp->minor = 0;
    snprintf(&resp->gpu_id[0], GPU_ID_LEN, "0");
  }
  return;
--- a/gpu/gpu_info_nvcuda.c
+++ b/gpu/gpu_info_nvcuda.c
@@ -22,6 +22,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
      {"cuDeviceGet", (void *)&resp->ch.cuDeviceGet},
      {"cuDeviceGetAttribute", (void *)&resp->ch.cuDeviceGetAttribute},
      {"cuDeviceGetUuid", (void *)&resp->ch.cuDeviceGetUuid},
+      {"cuDeviceGetName", (void *)&resp->ch.cuDeviceGetName},
      {"cuCtxCreate_v3", (void *)&resp->ch.cuCtxCreate_v3},
      {"cuMemGetInfo_v2", (void *)&resp->ch.cuMemGetInfo_v2},
      {"cuCtxDestroy", (void *)&resp->ch.cuCtxDestroy},
@@ -70,18 +71,17 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
  }

  int version = 0;
-  nvcudaDriverVersion_t driverVersion;
-  driverVersion.major = 0;
-  driverVersion.minor = 0;
+  resp->ch.driver_major = 0;
+  resp->ch.driver_minor = 0;

  // Report driver version if we're in verbose mode, ignore errors
  ret = (*resp->ch.cuDriverGetVersion)(&version);
  if (ret != CUDA_SUCCESS) {
    LOG(resp->ch.verbose, "cuDriverGetVersion failed: %d\n", ret);
  } else {
-    driverVersion.major = version / 1000;
-    driverVersion.minor = (version - (driverVersion.major * 1000)) / 10;
-    LOG(resp->ch.verbose, "CUDA driver version: %d-%d\n", driverVersion.major, driverVersion.minor);
+    resp->ch.driver_major = version / 1000;
+    resp->ch.driver_minor = (version - (resp->ch.driver_major * 1000)) / 10;
+    LOG(resp->ch.verbose, "CUDA driver version: %d.%d\n", resp->ch.driver_major, resp->ch.driver_minor);
  }

  ret = (*resp->ch.cuDeviceGetCount)(&resp->num_devices);
@@ -117,8 +117,6 @@ void nvcuda_check_vram(nvcuda_handle_t h, int i, mem_info_t *resp) {
    return;
  }

-  resp->major = 0;
-  resp->minor = 0;
  int major = 0;
  int minor = 0;
  ret = (*h.cuDeviceGetAttribute)(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
@@ -161,6 +159,12 @@ void nvcuda_check_vram(nvcuda_handle_t h, int i, mem_info_t *resp) {
      );
  }

+  ret = (*h.cuDeviceGetName)(&resp->gpu_name[0], GPU_NAME_LEN, device);
+  if (ret != CUDA_SUCCESS) {
+    LOG(h.verbose, "[%d] device name lookup failure: %d\n", i, ret);
+    resp->gpu_name[0] = '\0';
+  }
+
  // To get memory we have to set (and release) a context
  ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
  if (ret != CUDA_SUCCESS) {
--- a/gpu/gpu_info_nvcuda.h
+++ b/gpu/gpu_info_nvcuda.h
@@ -44,12 +44,15 @@ typedef void* CUcontext;
 typedef struct nvcuda_handle {
  void *handle;
  uint16_t verbose;
+  int driver_major;
+  int driver_minor;
  CUresult (*cuInit)(unsigned int Flags);
  CUresult (*cuDriverGetVersion)(int *driverVersion);
  CUresult (*cuDeviceGetCount)(int *);
  CUresult (*cuDeviceGet)(CUdevice* device, int ordinal);
  CUresult (*cuDeviceGetAttribute)(int* pi, CUdevice_attribute attrib, CUdevice dev);
  CUresult (*cuDeviceGetUuid)(CUuuid* uuid, CUdevice dev); // signature compatible with cuDeviceGetUuid_v2
+  CUresult (*cuDeviceGetName)(char *name, int len, CUdevice dev);

  // Context specific aspects
  CUresult (*cuCtxCreate_v3)(CUcontext* pctx, void *params, int len, unsigned int flags, CUdevice dev);
--- a/gpu/types.go
+++ b/gpu/types.go
@@ -1,5 +1,12 @@
 package gpu

+import (
+	"fmt"
+	"log/slog"
+
+	"github.com/ollama/ollama/format"
+)
+
 type memInfo struct {
 	TotalMemory uint64 `json:"total_memory,omitempty"`
 	FreeMemory  uint64 `json:"free_memory,omitempty"`
@@ -20,11 +27,13 @@ type GpuInfo struct {
 	DependencyPath string `json:"lib_path,omitempty"`

 	// GPU information
-	ID    string `json:"gpu_id"`          // string to use for selection of this specific GPU
-	Name  string `json:"name"`            // user friendly name if available
-	Major int    `json:"major,omitempty"` // Major compatibility version (CC or gfx)
-	Minor int    `json:"minor,omitempty"` // Minor compatibility version (CC or gfx)
-	Patch int    `json:"patch,omitempty"` // Patch compatibility only matters on AMD
+	ID      string `json:"gpu_id"`  // string to use for selection of this specific GPU
+	Name    string `json:"name"`    // user friendly name if available
+	Compute string `json:"compute"` // Compute Capability or gfx
+
+	// Driver Information - TODO no need to put this on each GPU
+	DriverMajor int `json:"driver_major,omitempty"`
+	DriverMinor int `json:"driver_minor,omitempty"`

 	// TODO other performance capability info to help in scheduling decisions
 }
@@ -56,6 +65,21 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList {
 	return resp
 }

+// Report the GPU information into the log an Info level
+func (l GpuInfoList) LogDetails() {
+	for _, g := range l {
+		slog.Info("inference compute",
+			"id", g.ID,
+			"library", g.Library,
+			"compute", g.Compute,
+			"driver", fmt.Sprintf("%d.%d", g.DriverMajor, g.DriverMinor),
+			"name", g.Name,
+			"total", format.HumanBytes2(g.TotalMemory),
+			"available", format.HumanBytes2(g.FreeMemory),
+		)
+	}
+}
+
 // Sort by Free Space
 type ByFreeMemory []GpuInfo

--- a/integration/utils_test.go
+++ b/integration/utils_test.go
@@ -107,7 +107,7 @@ func startServer(ctx context.Context, ollamaHost string) error {

 	if tmp := os.Getenv("OLLAMA_HOST"); tmp != ollamaHost {
 		slog.Info("setting env", "OLLAMA_HOST", ollamaHost)
-		os.Setenv("OLLAMA_HOST", ollamaHost)
+		t.Setenv("OLLAMA_HOST", ollamaHost)
 	}

 	slog.Info("starting server", "url", ollamaHost)
--- a/llm/ext_server/server.cpp
+++ b/llm/ext_server/server.cpp
@@ -66,7 +66,7 @@ struct server_params {
 };

 bool server_verbose = false;
-bool server_log_json = true;
+bool server_log_json = false;

 enum stop_type {
    STOP_FULL,
@@ -266,7 +266,7 @@ struct server_slot {
        sprintf(buffer, "prompt eval time     = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
                t_prompt_processing, n_prompt_tokens_processed,
                t_token, n_tokens_second);
-        LOG_INFO(buffer, {
+        LOG_DEBUG(buffer, {
            {"slot_id",                   id},
            {"task_id",                   task_id},
            {"t_prompt_processing",       t_prompt_processing},
@@ -280,7 +280,7 @@ struct server_slot {
        sprintf(buffer, "generation eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)",
                t_token_generation, n_decoded,
                t_token, n_tokens_second);
-        LOG_INFO(buffer, {
+        LOG_DEBUG(buffer, {
            {"slot_id",            id},
            {"task_id",            task_id},
            {"t_token_generation", t_token_generation},
@@ -290,7 +290,7 @@ struct server_slot {
        });

        sprintf(buffer, "          total time = %10.2f ms", t_prompt_processing + t_token_generation);
-        LOG_INFO(buffer, {
+        LOG_DEBUG(buffer, {
            {"slot_id",             id},
            {"task_id",             task_id},
            {"t_prompt_processing", t_prompt_processing},
@@ -371,7 +371,7 @@ struct llama_server_context
    {
        if (clp_ctx)
        {
-            LOG_INFO("freeing clip model", {});
+            LOG_DEBUG("freeing clip model", {});
            clip_free(clp_ctx);
            clp_ctx = nullptr;
        }
@@ -392,7 +392,7 @@ struct llama_server_context
        params = params_;
        if (!params.mmproj.empty()) {
            multimodal = true;
-            LOG_INFO("Multi Modal Mode Enabled", {});
+            LOG_DEBUG("Multi Modal Mode Enabled", {});
            clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
            if(clp_ctx == nullptr) {
                LOG_ERROR("unable to load clip model", {{"model", params.mmproj}});
@@ -445,7 +445,7 @@ struct llama_server_context

        const int32_t n_ctx_slot = n_ctx / params.n_parallel;

-        LOG_INFO("initializing slots", {{"n_slots", params.n_parallel}});
+        LOG_DEBUG("initializing slots", {{"n_slots", params.n_parallel}});
        for (int i = 0; i < params.n_parallel; i++)
        {
            server_slot slot;
@@ -454,7 +454,7 @@ struct llama_server_context
            slot.n_ctx = n_ctx_slot;
            slot.n_predict = params.n_predict;

-            LOG_INFO("new slot", {
+            LOG_DEBUG("new slot", {
                {"slot_id",    slot.id},
                {"n_ctx_slot", slot.n_ctx}
            });
@@ -468,7 +468,7 @@ struct llama_server_context
                //GGML_ASSERT(n_ctx_train % ga_w == 0     && "n_ctx_train must be a multiple of ga_w");    // NOLINT
                //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT

-                LOG_INFO("slot self-extend", {
+                LOG_DEBUG("slot self-extend", {
                    {"slot_id",   slot.id},
                    {"ga_n",      ga_n},
                    {"ga_w",      ga_w}
@@ -827,7 +827,7 @@ struct llama_server_context

        all_slots_are_idle = false;

-        LOG_INFO("slot is processing task", {
+        LOG_DEBUG("slot is processing task", {
            {"slot_id", slot->id},
            {"task_id", slot->task_id},
        });
@@ -1504,7 +1504,7 @@ struct llama_server_context
                    }
                    slots_data.push_back(slot_data);
                }
-                LOG_INFO("slot data", {
+                LOG_DEBUG("slot data", {
                    {"task_id",            task.id},
                    {"n_idle_slots",       n_idle_slots},
                    {"n_processing_slots", n_processing_slots}
@@ -1566,7 +1566,7 @@ struct llama_server_context
    bool update_slots() {
        if (system_need_update)
        {
-            LOG_INFO("updating system prompt", {});
+            LOG_DEBUG("updating system prompt", {});
            system_prompt_update();
        }

@@ -1576,7 +1576,7 @@ struct llama_server_context
        {
            if (system_prompt.empty() && clean_kv_cache)
            {
-                LOG_INFO("all slots are idle and system prompt is empty, clear the KV cache", {});
+                LOG_DEBUG("all slots are idle and system prompt is empty, clear the KV cache", {});
                kv_cache_clear();
            }
            return true;
@@ -1599,7 +1599,7 @@ struct llama_server_context
                    const int n_left    = (int) system_tokens.size() + slot.n_past - n_keep;
                    const int n_discard = n_left / 2;

-                    LOG_INFO("slot context shift", {
+                    LOG_DEBUG("slot context shift", {
                        {"slot_id",         slot.id},
                        {"task_id",         slot.task_id},
                        {"n_keep",          n_keep},
@@ -1638,7 +1638,7 @@ struct llama_server_context
                slot.command = NONE;
                slot.t_last_used = ggml_time_us();

-                LOG_INFO("slot released", {
+                LOG_DEBUG("slot released", {
                    {"slot_id",         slot.id},
                    {"task_id",         slot.task_id},
                    {"n_ctx",           n_ctx},
@@ -1807,7 +1807,7 @@ struct llama_server_context
                            slot.ga_i = ga_i;
                        }

-                        LOG_INFO("slot progression", {
+                        LOG_DEBUG("slot progression", {
                            { "slot_id",    slot.id },
                            { "task_id",    slot.task_id },
                            { "n_past",     slot.n_past },
@@ -1822,7 +1822,7 @@ struct llama_server_context
                    if (slot.n_past == slot.n_prompt_tokens && slot.n_past > 0)
                    {
                        // we have to evaluate at least 1 token to generate logits.
-                        LOG_INFO("we have to evaluate at least 1 token to generate logits", {
+                        LOG_DEBUG("we have to evaluate at least 1 token to generate logits", {
                            { "slot_id", slot.id },
                            { "task_id", slot.task_id }
                        });
@@ -1834,7 +1834,7 @@ struct llama_server_context
                    }

                    int p0 = (int) system_tokens.size() + slot.n_past;
-                    LOG_INFO("kv cache rm [p0, end)", {
+                    LOG_DEBUG("kv cache rm [p0, end)", {
                        { "slot_id", slot.id },
                        { "task_id", slot.task_id },
                        { "p0",      p0 }
@@ -2491,11 +2491,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
        }
        else if (arg == "-v" || arg == "--verbose")
        {
-#if SERVER_VERBOSE != 1
-            LOG_WARNING("server.cpp is not built with verbose logging.", {});
-#else
            server_verbose = true;
-#endif
        }
        else if (arg == "--mlock")
        {
@@ -2601,7 +2597,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
        else if (arg == "--log-disable")
        {
            log_set_target(stdout);
-            LOG_INFO("logging to file is disabled.", {});
+            LOG_DEBUG("logging to file is disabled.", {});
        }
        else if (arg == "--slots-endpoint-disable")
        {
@@ -2727,12 +2723,12 @@ static json format_detokenized_response(std::string content)
 static void log_server_request(const httplib::Request &req, const httplib::Response &res)
 {
    // skip GH copilot requests when using default port
-    if (req.path == "/v1/health" || req.path == "/v1/completions")
+    if (req.path == "/health" || req.path == "/v1/health" || req.path == "/v1/completions")
    {
        return;
    }

-    LOG_INFO("request", {
+    LOG_DEBUG("request", {
        {"remote_addr", req.remote_addr},
        {"remote_port", req.remote_port},
        {"status",      res.status},
@@ -3054,6 +3050,26 @@ int main(int argc, char **argv) {
        log_data["api_key"] = "api_key: " + std::to_string(sparams.api_keys.size()) + " keys loaded";
    }

+    if (sparams.n_threads_http < 1) {
+        // +2 threads for monitoring endpoints
+        sparams.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1);
+    }
+    log_data["n_threads_http"] =  std::to_string(sparams.n_threads_http);
+    svr.new_task_queue = [&sparams] { return new httplib::ThreadPool(sparams.n_threads_http); };
+
+    LOG_INFO("HTTP server listening", log_data);
+    // run the HTTP server in a thread - see comment below
+    std::thread t([&]()
+            {
+                if (!svr.listen_after_bind())
+                {
+                    state.store(SERVER_STATE_ERROR);
+                    return 1;
+                }
+
+                return 0;
+            });
+
    // load the model
    if (!llama.load_model(params))
    {
@@ -3258,26 +3274,6 @@ int main(int argc, char **argv) {
    }*/
    //);

-    if (sparams.n_threads_http < 1) {
-        // +2 threads for monitoring endpoints
-        sparams.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1);
-    }
-    log_data["n_threads_http"] =  std::to_string(sparams.n_threads_http);
-    svr.new_task_queue = [&sparams] { return new httplib::ThreadPool(sparams.n_threads_http); };
-
-    LOG_INFO("HTTP server listening", log_data);
-    // run the HTTP server in a thread - see comment below
-    std::thread t([&]()
-            {
-                if (!svr.listen_after_bind())
-                {
-                    state.store(SERVER_STATE_ERROR);
-                    return 1;
-                }
-
-                return 0;
-            });
-
    llama.queue_tasks.on_new_task(std::bind(
        &llama_server_context::process_single_task, &llama, std::placeholders::_1));
    llama.queue_tasks.on_finish_multitask(std::bind(
--- a/llm/ext_server/utils.hpp
+++ b/llm/ext_server/utils.hpp
@@ -55,9 +55,10 @@ extern bool server_log_json;
    } while (0)
 #endif

-#define LOG_ERROR(  MSG, ...) server_log("ERR",  __func__, __LINE__, MSG, __VA_ARGS__)
+#define LOG_ERROR(  MSG, ...) server_log("ERROR",  __func__, __LINE__, MSG, __VA_ARGS__)
 #define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
 #define LOG_INFO(   MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
+#define LOG_DEBUG(  MSG, ...) server_log("DEBUG", __func__, __LINE__, MSG, __VA_ARGS__)

 enum server_state {
    SERVER_STATE_LOADING_MODEL,  // Server is starting up, model not fully loaded yet
@@ -123,6 +124,10 @@ static inline void server_log(const char *level, const char *function, int line,
        {"timestamp", time(nullptr)},
    };

+    if (strncmp("DEBUG", level, strlen(level)) == 0 && !server_verbose) {
+        return;
+    }
+
    if (server_log_json) {
        log.merge_patch(
                {
@@ -137,14 +142,12 @@ static inline void server_log(const char *level, const char *function, int line,

        std::cout << log.dump(-1, ' ', false, json::error_handler_t::replace) << "\n" << std::flush;
    } else {
-        char buf[1024];
-        snprintf(buf, 1024, "%4s [%24s] %s", level, function, message);
-
        if (!extra.empty()) {
            log.merge_patch(extra);
        }
+
        std::stringstream ss;
-        ss << buf << " |";
+        ss << level << " [" << function << "] " << message << " |";
        for (const auto& el : log.items())
        {
            const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace);
--- a/llm/filetype.go
+++ b/llm/filetype.go
@@ -0,0 +1,140 @@
+package llm
+
+import "fmt"
+
+type fileType uint32
+
+const (
+	fileTypeF32 fileType = iota
+	fileTypeF16
+	fileTypeQ4_0
+	fileTypeQ4_1
+	fileTypeQ4_1_F16
+	fileTypeQ4_2 // unused
+	fileTypeQ4_3 // unused
+	fileTypeQ8_0
+	fileTypeQ5_0
+	fileTypeQ5_1
+	fileTypeQ2_K
+	fileTypeQ3_K_S
+	fileTypeQ3_K_M
+	fileTypeQ3_K_L
+	fileTypeQ4_K_S
+	fileTypeQ4_K_M
+	fileTypeQ5_K_S
+	fileTypeQ5_K_M
+	fileTypeQ6_K
+	fileTypeIQ2_XXS
+	fileTypeIQ2_XS
+	fileTypeQ2_K_S
+	fileTypeQ3_K_XS
+	fileTypeIQ3_XXS
+
+	fileTypeUnknown
+)
+
+func ParseFileType(s string) (fileType, error) {
+	switch s {
+	case "F32":
+		return fileTypeF32, nil
+	case "F16":
+		return fileTypeF16, nil
+	case "Q4_0":
+		return fileTypeQ4_0, nil
+	case "Q4_1":
+		return fileTypeQ4_1, nil
+	case "Q4_1_F16":
+		return fileTypeQ4_1_F16, nil
+	case "Q8_0":
+		return fileTypeQ8_0, nil
+	case "Q5_0":
+		return fileTypeQ5_0, nil
+	case "Q5_1":
+		return fileTypeQ5_1, nil
+	case "Q2_K":
+		return fileTypeQ2_K, nil
+	case "Q3_K_S":
+		return fileTypeQ3_K_S, nil
+	case "Q3_K_M":
+		return fileTypeQ3_K_M, nil
+	case "Q3_K_L":
+		return fileTypeQ3_K_L, nil
+	case "Q4_K_S":
+		return fileTypeQ4_K_S, nil
+	case "Q4_K_M":
+		return fileTypeQ4_K_M, nil
+	case "Q5_K_S":
+		return fileTypeQ5_K_S, nil
+	case "Q5_K_M":
+		return fileTypeQ5_K_M, nil
+	case "Q6_K":
+		return fileTypeQ6_K, nil
+	case "IQ2_XXS":
+		return fileTypeIQ2_XXS, nil
+	case "IQ2_XS":
+		return fileTypeIQ2_XS, nil
+	case "Q2_K_S":
+		return fileTypeQ2_K_S, nil
+	case "Q3_K_XS":
+		return fileTypeQ3_K_XS, nil
+	case "IQ3_XXS":
+		return fileTypeIQ3_XXS, nil
+	default:
+		return fileTypeUnknown, fmt.Errorf("unknown fileType: %s", s)
+	}
+}
+
+func (t fileType) String() string {
+	switch t {
+	case fileTypeF32:
+		return "F32"
+	case fileTypeF16:
+		return "F16"
+	case fileTypeQ4_0:
+		return "Q4_0"
+	case fileTypeQ4_1:
+		return "Q4_1"
+	case fileTypeQ4_1_F16:
+		return "Q4_1_F16"
+	case fileTypeQ8_0:
+		return "Q8_0"
+	case fileTypeQ5_0:
+		return "Q5_0"
+	case fileTypeQ5_1:
+		return "Q5_1"
+	case fileTypeQ2_K:
+		return "Q2_K"
+	case fileTypeQ3_K_S:
+		return "Q3_K_S"
+	case fileTypeQ3_K_M:
+		return "Q3_K_M"
+	case fileTypeQ3_K_L:
+		return "Q3_K_L"
+	case fileTypeQ4_K_S:
+		return "Q4_K_S"
+	case fileTypeQ4_K_M:
+		return "Q4_K_M"
+	case fileTypeQ5_K_S:
+		return "Q5_K_S"
+	case fileTypeQ5_K_M:
+		return "Q5_K_M"
+	case fileTypeQ6_K:
+		return "Q6_K"
+	case fileTypeIQ2_XXS:
+		return "IQ2_XXS"
+	case fileTypeIQ2_XS:
+		return "IQ2_XS"
+	case fileTypeQ2_K_S:
+		return "Q2_K_S"
+	case fileTypeQ3_K_XS:
+		return "Q3_K_XS"
+	case fileTypeIQ3_XXS:
+		return "IQ3_XXS"
+	default:
+		return "unknown"
+	}
+}
+
+func (t fileType) Value() uint32 {
+	return uint32(t)
+}
--- a/llm/ggml.go
+++ b/llm/ggml.go
@@ -13,82 +13,6 @@ type GGML struct {
 	model
 }

-const (
-	fileTypeF32 uint32 = iota
-	fileTypeF16
-	fileTypeQ4_0
-	fileTypeQ4_1
-	fileTypeQ4_1_F16
-	fileTypeQ8_0 uint32 = iota + 2
-	fileTypeQ5_0
-	fileTypeQ5_1
-	fileTypeQ2_K
-	fileTypeQ3_K_S
-	fileTypeQ3_K_M
-	fileTypeQ3_K_L
-	fileTypeQ4_K_S
-	fileTypeQ4_K_M
-	fileTypeQ5_K_S
-	fileTypeQ5_K_M
-	fileTypeQ6_K
-	fileTypeIQ2_XXS
-	fileTypeIQ2_XS
-	fileTypeQ2_K_S
-	fileTypeQ3_K_XS
-	fileTypeIQ3_XXS
-)
-
-func fileType(fileType uint32) string {
-	switch fileType {
-	case fileTypeF32:
-		return "F32"
-	case fileTypeF16:
-		return "F16"
-	case fileTypeQ4_0:
-		return "Q4_0"
-	case fileTypeQ4_1:
-		return "Q4_1"
-	case fileTypeQ4_1_F16:
-		return "Q4_1_F16"
-	case fileTypeQ8_0:
-		return "Q8_0"
-	case fileTypeQ5_0:
-		return "Q5_0"
-	case fileTypeQ5_1:
-		return "Q5_1"
-	case fileTypeQ2_K:
-		return "Q2_K"
-	case fileTypeQ3_K_S:
-		return "Q3_K_S"
-	case fileTypeQ3_K_M:
-		return "Q3_K_M"
-	case fileTypeQ3_K_L:
-		return "Q3_K_L"
-	case fileTypeQ4_K_S:
-		return "Q4_K_S"
-	case fileTypeQ4_K_M:
-		return "Q4_K_M"
-	case fileTypeQ5_K_S:
-		return "Q5_K_S"
-	case fileTypeQ5_K_M:
-		return "Q5_K_M"
-	case fileTypeQ6_K:
-		return "Q6_K"
-	case fileTypeIQ2_XXS:
-		return "IQ2_XXS"
-	case fileTypeIQ2_XS:
-		return "IQ2_XS"
-	case fileTypeQ2_K_S:
-		return "Q2_K_S"
-	case fileTypeQ3_K_XS:
-		return "Q3_K_XS"
-	case fileTypeIQ3_XXS:
-		return "IQ3_XXS"
-	default:
-		return "unknown"
-	}
-}
-
 type model interface {
 	KV() KV
 	Tensors() Tensors
@@ -121,12 +45,12 @@ func (kv KV) ParameterCount() uint64 {
 	return kv.u64("general.parameter_count")
 }

-func (kv KV) FileType() string {
+func (kv KV) FileType() fileType {
 	if u64 := kv.u64("general.file_type"); u64 > 0 {
 		return fileType(uint32(u64))
 	}

-	return "unknown"
+	return fileTypeUnknown
 }

 func (kv KV) BlockCount() uint64 {
@@ -286,6 +210,23 @@ const (

 var ErrUnsupportedFormat = errors.New("unsupported model format")

+func DetectGGMLType(b []byte) string {
+	switch binary.LittleEndian.Uint32(b[:4]) {
+	case FILE_MAGIC_GGML:
+		return "ggml"
+	case FILE_MAGIC_GGMF:
+		return "ggmf"
+	case FILE_MAGIC_GGJT:
+		return "ggjt"
+	case FILE_MAGIC_GGLA:
+		return "ggla"
+	case FILE_MAGIC_GGUF_LE, FILE_MAGIC_GGUF_BE:
+		return "gguf"
+	default:
+		return ""
+	}
+}
+
 func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) {
 	var magic uint32
 	if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
--- a/llm/llm.go
+++ b/llm/llm.go
@@ -20,7 +20,7 @@ func SystemInfo() string {
 	return C.GoString(C.llama_print_system_info())
 }

-func Quantize(infile, outfile, filetype string) error {
+func Quantize(infile, outfile string, ftype fileType) error {
 	cinfile := C.CString(infile)
 	defer C.free(unsafe.Pointer(cinfile))

@@ -29,58 +29,10 @@ func Quantize(infile, outfile, filetype string) error {

 	params := C.llama_model_quantize_default_params()
 	params.nthread = -1
+	params.ftype = ftype.Value()

-	switch filetype {
-	case "F32":
-		params.ftype = fileTypeF32
-	case "F16":
-		params.ftype = fileTypeF16
-	case "Q4_0":
-		params.ftype = fileTypeQ4_0
-	case "Q4_1":
-		params.ftype = fileTypeQ4_1
-	case "Q4_1_F16":
-		params.ftype = fileTypeQ4_1_F16
-	case "Q8_0":
-		params.ftype = fileTypeQ8_0
-	case "Q5_0":
-		params.ftype = fileTypeQ5_0
-	case "Q5_1":
-		params.ftype = fileTypeQ5_1
-	case "Q2_K":
-		params.ftype = fileTypeQ2_K
-	case "Q3_K_S":
-		params.ftype = fileTypeQ3_K_S
-	case "Q3_K_M":
-		params.ftype = fileTypeQ3_K_M
-	case "Q3_K_L":
-		params.ftype = fileTypeQ3_K_L
-	case "Q4_K_S":
-		params.ftype = fileTypeQ4_K_S
-	case "Q4_K_M":
-		params.ftype = fileTypeQ4_K_M
-	case "Q5_K_S":
-		params.ftype = fileTypeQ5_K_S
-	case "Q5_K_M":
-		params.ftype = fileTypeQ5_K_M
-	case "Q6_K":
-		params.ftype = fileTypeQ6_K
-	case "IQ2_XXS":
-		params.ftype = fileTypeIQ2_XXS
-	case "IQ2_XS":
-		params.ftype = fileTypeIQ2_XS
-	case "Q2_K_S":
-		params.ftype = fileTypeQ2_K_S
-	case "Q3_K_XS":
-		params.ftype = fileTypeQ3_K_XS
-	case "IQ3_XXS":
-		params.ftype = fileTypeIQ3_XXS
-	default:
-		return fmt.Errorf("unknown filetype: %s", filetype)
-	}
-
-	if retval := C.llama_model_quantize(cinfile, coutfile, &params); retval != 0 {
-		return fmt.Errorf("llama_model_quantize: %d", retval)
+	if rc := C.llama_model_quantize(cinfile, coutfile, &params); rc != 0 {
+		return fmt.Errorf("llama_model_quantize: %d", rc)
 	}

 	return nil
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -25,7 +25,7 @@ func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors
 	// Split up the GPUs by type and try them
 	for _, gpus := range allGpus.ByLibrary() {
 		var layerCount int
-		layerCount, estimatedVRAM = EstimateGPULayers(gpus, ggml, projectors, opts)
+		layerCount, estimatedVRAM, _ = EstimateGPULayers(gpus, ggml, projectors, opts)
 		if opts.NumGPU < 0 {
 			if layerCount > 0 && layerCount >= int(ggml.KV().BlockCount()+1) {
 				return true, estimatedVRAM
@@ -39,12 +39,9 @@ func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors
 	return false, estimatedVRAM
 }

-// Given a model and one or more GPU targets, predict how many layers and bytes we can load
+// Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
 // The GPUs provided must all be the same Library
-func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) (int, uint64) {
-	if gpus[0].Library == "cpu" {
-		return 0, 0
-	}
+func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) (int, uint64, uint64) {
 	var memoryAvailable uint64
 	for _, info := range gpus {
 		memoryAvailable += info.FreeMemory
@@ -93,11 +90,6 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
 	memoryRequiredPartial := memoryMinimum + graphPartialOffload + layers["blk.0"].size()

-	if memoryRequiredPartial > memoryAvailable {
-		slog.Debug("insufficient VRAM to load any model layers")
-		return 0, 0
-	}
-
 	var memoryLayerOutput uint64
 	if layer, ok := layers["output_norm"]; ok {
 		memoryLayerOutput += layer.size()
@@ -181,5 +173,13 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 			),
 		),
 	)
-	return layerCount, uint64(memoryRequiredPartial)
+	if gpus[0].Library == "cpu" {
+		return 0, 0, memoryRequiredTotal
+	}
+	if memoryRequiredPartial > memoryAvailable {
+		slog.Debug("insufficient VRAM to load any model layers")
+		return 0, 0, memoryRequiredTotal
+	}
+
+	return layerCount, memoryRequiredPartial, memoryRequiredTotal
 }
--- a/llm/server.go
+++ b/llm/server.go
@@ -49,7 +49,11 @@ type llmServer struct {
 	options api.Options

 	// TODO - this should be broken down by GPU
-	estimatedVRAM uint64 // Estimated usage of VRAM by the loaded model
+	estimatedVRAM  uint64 // Estimated usage of VRAM by the loaded model
+	estimatedTotal uint64 // Total size of model
+	totalLayers    uint64
+	gpuCount       int
+	loadDuration   time.Duration // Record how long it took the model to load

 	sem *semaphore.Weighted
 }
@@ -83,12 +87,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr

 	cpuRunner := ""
 	var estimatedVRAM uint64
+	var estimatedTotal uint64
 	var systemMemory uint64
+	gpuCount := len(gpus)
 	if (len(gpus) == 1 && gpus[0].Library == "cpu") || opts.NumGPU == 0 {

 		// TODO evaluate system memory to see if we should block the load, or force an unload of another CPU runner

 		cpuRunner = serverForCpu()
+		gpuCount = 0
 	} else {
 		if gpus[0].Library == "metal" {
 			memInfo, err := gpu.GetCPUMem()
@@ -100,7 +107,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 			}
 		}
 		var layers int
-		layers, estimatedVRAM = EstimateGPULayers(gpus, ggml, projectors, opts)
+		layers, estimatedVRAM, estimatedTotal = EstimateGPULayers(gpus, ggml, projectors, opts)

 		if gpus[0].Library == "metal" && estimatedVRAM > systemMemory {
 			// disable partial offloading when model is greater than total system memory as this
@@ -133,6 +140,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		} else {
 			slog.Info("user override", "OLLAMA_LLM_LIBRARY", demandLib, "path", serverPath)
 			servers = []string{demandLib}
+			if strings.HasPrefix(demandLib, "cpu") {
+				// Omit the GPU flag to silence the warning
+				opts.NumGPU = -1
+			}
 		}
 	}

@@ -146,11 +157,8 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		"--batch-size", fmt.Sprintf("%d", opts.NumBatch),
 		"--embedding",
 	}
-	if envconfig.Debug {
-		params = append(params, "--log-format", "json")
-	} else {
-		params = append(params, "--log-disable")
-	}
+
+	params = append(params, "--log-disable")

 	if opts.NumGPU >= 0 {
 		params = append(params, "--n-gpu-layers", fmt.Sprintf("%d", opts.NumGPU))
@@ -210,10 +218,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		if dir == "" {
 			// Shouldn't happen
 			finalErr = fmt.Errorf("[%d] server %s not listed in available servers %v", i, servers[i], availableServers)
-			slog.Error("sever list inconsistent", "error", finalErr)
+			slog.Error("server list inconsistent", "error", finalErr)
 			continue
 		}

+		if strings.HasPrefix(servers[i], "cpu") {
+			// TODO if we tried a gpu runner first, and it failed, record the error and bubble that back up
+			gpuCount = 0
+		}
+
 		// Find an availableServers  port, retry on each iterration in case the failure was a port conflict race
 		port := 0
 		if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
@@ -267,46 +280,49 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		}

 		s := &llmServer{
-			port:          port,
-			cmd:           exec.Command(server, finalParams...),
-			status:        NewStatusWriter(os.Stderr),
-			options:       opts,
-			estimatedVRAM: estimatedVRAM,
-			sem:           semaphore.NewWeighted(int64(numParallel)),
+			port:           port,
+			cmd:            exec.Command(server, finalParams...),
+			status:         NewStatusWriter(os.Stderr),
+			options:        opts,
+			estimatedVRAM:  estimatedVRAM,
+			estimatedTotal: estimatedTotal,
+			sem:            semaphore.NewWeighted(int64(numParallel)),
+			totalLayers:    ggml.KV().BlockCount() + 1,
+			gpuCount:       gpuCount,
+			done:           make(chan error, 1),
 		}

-		s.cmd.Env = os.Environ()
 		s.cmd.Stdout = os.Stdout
 		s.cmd.Stderr = s.status

-		visibleDevicesEnv, visibleDevicesEnvVal := gpu.GpuInfoList(gpus).GetVisibleDevicesEnv()
-		pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))
+		if v := strings.Join(libraryPaths, string(filepath.ListSeparator)); v != "" {
+			s.cmd.Env = append(s.cmd.Env, pathEnv+"="+v)
+		}

-		// Update or add the path and visible devices variable with our adjusted version
-		pathNeeded := true
-		devicesNeeded := visibleDevicesEnv != ""
-		for i := range s.cmd.Env {
-			cmp := strings.SplitN(s.cmd.Env[i], "=", 2)
-			if strings.EqualFold(cmp[0], pathEnv) {
-				s.cmd.Env[i] = pathEnv + "=" + pathEnvVal
-				pathNeeded = false
-			} else if devicesNeeded && strings.EqualFold(cmp[0], visibleDevicesEnv) {
-				s.cmd.Env[i] = visibleDevicesEnv + "=" + visibleDevicesEnvVal
-				devicesNeeded = false
+		if k, v := gpu.GpuInfoList(gpus).GetVisibleDevicesEnv(); k != "" {
+			s.cmd.Env = append(s.cmd.Env, k+"="+v)
+		}
+
+		for _, ev := range os.Environ() {
+			if strings.HasPrefix(ev, "CUDA_") ||
+				strings.HasPrefix(ev, "ROCM_") ||
+				strings.HasPrefix(ev, "HIP_") ||
+				strings.HasPrefix(ev, "HSA_") ||
+				strings.HasPrefix(ev, "GGML_") {
+				s.cmd.Env = append(s.cmd.Env, ev)
 			}
 		}
-		if pathNeeded {
-			s.cmd.Env = append(s.cmd.Env, pathEnv+"="+pathEnvVal)
-		}
-		if devicesNeeded {
-			s.cmd.Env = append(s.cmd.Env, visibleDevicesEnv+"="+visibleDevicesEnvVal)
-		}

 		slog.Info("starting llama server", "cmd", s.cmd.String())
 		// Log at debug as the environment is inherited and might contain sensitive information
 		slog.Debug("subprocess", "environment", s.cmd.Env)

 		if err = s.cmd.Start(); err != nil {
+			// Detect permission denied and augment them essage about noexec
+			if errors.Is(err, os.ErrPermission) {
+				finalErr = fmt.Errorf("unable to start server %w.  %s may have noexec set.  Set OLLAMA_TMPDIR for server to a writable executable directory", err, dir)
+				continue
+			}
 			msg := ""
 			if s.status != nil && s.status.LastErrMsg != "" {
 				msg = s.status.LastErrMsg
@@ -316,6 +332,11 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 			continue
 		}

+		// reap subprocess when it exits
+		go func() {
+			s.done <- s.cmd.Wait()
+		}()
+
 		return s, nil
 	}

@@ -382,6 +403,10 @@ func (s *llmServer) getServerStatus(ctx context.Context) (ServerStatus, error) {
 		if s.status != nil && s.status.LastErrMsg != "" {
 			msg = s.status.LastErrMsg
 		}
+		if s.cmd.ProcessState.ExitCode() == -1 {
+			// Most likely a signal killed it, log some more details to try to help troubleshoot
+			slog.Warn("llama runner process no longer running", "sys", s.cmd.ProcessState.Sys(), "string", s.cmd.ProcessState.String())
+		}
 		return ServerStatusError, fmt.Errorf("llama runner process no longer running: %d %s", s.cmd.ProcessState.ExitCode(), msg)
 	}

@@ -456,13 +481,11 @@ func (s *llmServer) Ping(ctx context.Context) error {

 func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
 	start := time.Now()
-	// TODO we need to wire up a better way to detect hangs during model load and startup of the server
 	expiresAt := time.Now().Add(10 * time.Minute) // be generous with timeout, large models can take a while to load
-	ticker := time.NewTicker(50 * time.Millisecond)
-	defer ticker.Stop()

 	slog.Info("waiting for llama runner to start responding")
 	var lastStatus ServerStatus = -1
+
 	for {
 		select {
 		case <-ctx.Done():
@@ -474,41 +497,39 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
 				msg = s.status.LastErrMsg
 			}
 			return fmt.Errorf("llama runner process has terminated: %v %s", err, msg)
-		case <-ticker.C:
-			if time.Now().After(expiresAt) {
-				// timeout
-				msg := ""
-				if s.status != nil && s.status.LastErrMsg != "" {
-					msg = s.status.LastErrMsg
-				}
-				return fmt.Errorf("timed out waiting for llama runner to start: %s", msg)
+		default:
+		}
+		if time.Now().After(expiresAt) {
+			// timeout
+			msg := ""
+			if s.status != nil && s.status.LastErrMsg != "" {
+				msg = s.status.LastErrMsg
 			}
-			if s.cmd.ProcessState != nil {
-				msg := ""
-				if s.status != nil && s.status.LastErrMsg != "" {
-					msg = s.status.LastErrMsg
-				}
-				return fmt.Errorf("llama runner process no longer running: %d %s", s.cmd.ProcessState.ExitCode(), msg)
-			}
-
-			c, cancel := context.WithTimeout(ctx, 200*time.Millisecond)
-			defer cancel()
-			status, err := s.getServerStatus(c)
-			if err != nil && lastStatus != status {
-				slog.Debug("server not yet available", "error", err)
-				lastStatus = status
-				continue
-			}
-
-			switch status {
-			case ServerStatusLoadingModel:
-				// TODO - this state never seems to happen with the current server.cpp code (bug?)
-				// it doesn't respond to the health endpoint until after the model is loaded
-				slog.Debug("loading model")
-			case ServerStatusReady:
-				slog.Debug(fmt.Sprintf("llama runner started in %f seconds", time.Since(start).Seconds()))
-				return nil
+			return fmt.Errorf("timed out waiting for llama runner to start: %s", msg)
+		}
+		if s.cmd.ProcessState != nil {
+			msg := ""
+			if s.status != nil && s.status.LastErrMsg != "" {
+				msg = s.status.LastErrMsg
 			}
+			return fmt.Errorf("llama runner process no longer running: %d %s", s.cmd.ProcessState.ExitCode(), msg)
+		}
+		ctx, cancel := context.WithTimeout(ctx, 200*time.Millisecond)
+		defer cancel()
+		status, _ := s.getServerStatus(ctx)
+		if lastStatus != status && status != ServerStatusReady {
+			// Only log on status changes
+			slog.Info("waiting for server to become available", "status", status.ToString())
+		}
+		switch status {
+		case ServerStatusReady:
+			s.loadDuration = time.Since(start)
+			slog.Info(fmt.Sprintf("llama runner started in %0.2f seconds", s.loadDuration.Seconds()))
+			return nil
+		default:
+			lastStatus = status
+			time.Sleep(time.Millisecond * 250)
+			continue
 		}
 	}
 }
@@ -549,10 +570,11 @@ type ImageData struct {
 }

 type completion struct {
-	Content string `json:"content"`
-	Model   string `json:"model"`
-	Prompt  string `json:"prompt"`
-	Stop    bool   `json:"stop"`
+	Content      string `json:"content"`
+	Model        string `json:"model"`
+	Prompt       string `json:"prompt"`
+	Stop         bool   `json:"stop"`
+	StoppedLimit bool   `json:"stopped_limit"`

 	Timings struct {
 		PredictedN  int     `json:"predicted_n"`
@@ -571,6 +593,7 @@ type CompletionRequest struct {

 type CompletionResponse struct {
 	Content            string
+	DoneReason         string
 	Done               bool
 	PromptEvalCount    int
 	PromptEvalDuration time.Duration
@@ -712,8 +735,14 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
 			}

 			if c.Stop {
+				doneReason := "stop"
+				if c.StoppedLimit {
+					doneReason = "length"
+				}
+
 				fn(CompletionResponse{
 					Done:               true,
+					DoneReason:         doneReason,
 					PromptEvalCount:    c.Timings.PromptN,
 					PromptEvalDuration: parseDurationMs(c.Timings.PromptMS),
 					EvalCount:          c.Timings.PredictedN,
@@ -908,8 +937,11 @@ func (s *llmServer) Close() error {
 		if err := s.cmd.Process.Kill(); err != nil {
 			return err
 		}
-
-		_ = s.cmd.Wait()
+		// if ProcessState is already populated, Wait already completed, no need to wait again
+		if s.cmd.ProcessState == nil {
+			slog.Debug("waiting for llama server to exit")
+			<-s.done
+		}

 		slog.Debug("llama server stopped")
 	}
--- a/openai/openai.go
+++ b/openai/openai.go
@@ -107,15 +107,9 @@ func toChatCompletion(id string, r api.ChatResponse) ChatCompletion {
 		Model:             r.Model,
 		SystemFingerprint: "fp_ollama",
 		Choices: []Choice{{
-			Index:   0,
-			Message: Message{Role: r.Message.Role, Content: r.Message.Content},
-			FinishReason: func(done bool) *string {
-				if done {
-					reason := "stop"
-					return &reason
-				}
-				return nil
-			}(r.Done),
+			Index:        0,
+			Message:      Message{Role: r.Message.Role, Content: r.Message.Content},
+			FinishReason: &r.DoneReason,
 		}},
 		Usage: Usage{
 			// TODO: ollama returns 0 for prompt eval if the prompt was cached, but openai returns the actual count
@@ -135,15 +129,9 @@ func toChunk(id string, r api.ChatResponse) ChatCompletionChunk {
 		SystemFingerprint: "fp_ollama",
 		Choices: []ChunkChoice{
 			{
-				Index: 0,
-				Delta: Message{Role: "assistant", Content: r.Message.Content},
-				FinishReason: func(done bool) *string {
-					if done {
-						reason := "stop"
-						return &reason
-					}
-					return nil
-				}(r.Done),
+				Index:        0,
+				Delta:        Message{Role: "assistant", Content: r.Message.Content},
+				FinishReason: &r.DoneReason,
 			},
 		},
 	}
--- a/readline/readline.go
+++ b/readline/readline.go
@@ -218,7 +218,7 @@ func (i *Instance) Readline() (string, error) {
 		case CharCtrlZ:
 			fd := int(syscall.Stdin)
 			return handleCharCtrlZ(fd, i.Terminal.termios)
-		case CharEnter:
+		case CharEnter, CharCtrlJ:
 			output := buf.String()
 			if output != "" {
 				i.History.Add([]rune(output))
@@ -232,7 +232,7 @@ func (i *Instance) Readline() (string, error) {
 				metaDel = false
 				continue
 			}
-			if r >= CharSpace || r == CharEnter {
+			if r >= CharSpace || r == CharEnter || r == CharCtrlJ {
 				buf.Add(r)
 			}
 		}
--- a/server/images.go
+++ b/server/images.go
@@ -1,8 +1,8 @@
 package server

 import (
-	"archive/zip"
 	"bytes"
+	"cmp"
 	"context"
 	"crypto/sha256"
 	"encoding/base64"
@@ -11,7 +11,6 @@ import (
 	"errors"
 	"fmt"
 	"io"
-	"io/fs"
 	"log"
 	"log/slog"
 	"net/http"
@@ -26,7 +25,6 @@ import (

 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/auth"
-	"github.com/ollama/ollama/convert"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/server/envconfig"
@@ -54,7 +52,6 @@ type Model struct {
 	System         string
 	License        []string
 	Digest         string
-	Size           int64
 	Options        map[string]interface{}
 	Messages       []Message
 }
@@ -71,6 +68,20 @@ func (m *Model) String() string {
 		Args: m.ModelPath,
 	})

+	for _, adapter := range m.AdapterPaths {
+		modelfile.Commands = append(modelfile.Commands, model.Command{
+			Name: "adapter",
+			Args: adapter,
+		})
+	}
+
+	for _, projector := range m.ProjectorPaths {
+		modelfile.Commands = append(modelfile.Commands, model.Command{
+			Name: "model",
+			Args: projector,
+		})
+	}
+
 	if m.Template != "" {
 		modelfile.Commands = append(modelfile.Commands, model.Command{
 			Name: "template",
@@ -85,20 +96,6 @@ func (m *Model) String() string {
 		})
 	}

-	for _, adapter := range m.AdapterPaths {
-		modelfile.Commands = append(modelfile.Commands, model.Command{
-			Name: "adapter",
-			Args: adapter,
-		})
-	}
-
-	for _, projector := range m.ProjectorPaths {
-		modelfile.Commands = append(modelfile.Commands, model.Command{
-			Name: "projector",
-			Args: projector,
-		})
-	}
-
 	for k, v := range m.Options {
 		switch v := v.(type) {
 		case []any:
@@ -158,50 +155,11 @@ type ConfigV2 struct {
 	RootFS       RootFS `json:"rootfs"`
 }

-func (c *ConfigV2) SetModelFormat(format string) {
-	if c.ModelFormat == "" {
-		c.ModelFormat = format
-	}
-}
-
-func (c *ConfigV2) SetModelFamily(families ...string) {
-	for _, family := range families {
-		if c.ModelFamily == "" {
-			c.ModelFamily = family
-		}
-
-		if !slices.Contains(c.ModelFamilies, family) {
-			c.ModelFamilies = append(c.ModelFamilies, family)
-		}
-	}
-}
-
-func (c *ConfigV2) SetModelType(modelType string) {
-	if c.ModelType == "" {
-		c.ModelType = modelType
-	}
-}
-
-func (c *ConfigV2) SetFileType(fileType string) {
-	if c.FileType == "" {
-		c.FileType = fileType
-	}
-}
-
 type RootFS struct {
 	Type    string   `json:"type"`
 	DiffIDs []string `json:"diff_ids"`
 }

-func (m *ManifestV2) GetTotalSize() (total int64) {
-	for _, layer := range m.Layers {
-		total += layer.Size
-	}
-
-	total += m.Config.Size
-	return total
-}
-
 func GetManifest(mp ModelPath) (*ManifestV2, string, error) {
 	fp, err := mp.GetManifestPath()
 	if err != nil {
@@ -242,7 +200,6 @@ func GetModel(name string) (*Model, error) {
 		Digest:    digest,
 		Template:  "{{ .Prompt }}",
 		License:   []string{},
-		Size:      manifest.GetTotalSize(),
 	}

 	filename, err := GetBlobsPath(manifest.Config.Digest)
@@ -332,7 +289,7 @@ func GetModel(name string) (*Model, error) {
 	return model, nil
 }

-func realpath(mfDir, from string) string {
+func realpath(rel, from string) string {
 	abspath, err := filepath.Abs(from)
 	if err != nil {
 		return from
@@ -349,22 +306,15 @@ func realpath(mfDir, from string) string {
 		return filepath.Join(home, from[2:])
 	}

-	if _, err := os.Stat(filepath.Join(mfDir, from)); err == nil {
+	if _, err := os.Stat(filepath.Join(rel, from)); err == nil {
 		// this is a file relative to the Modelfile
-		return filepath.Join(mfDir, from)
+		return filepath.Join(rel, from)
 	}

 	return abspath
 }

-func CreateModel(ctx context.Context, name, modelFileDir, quantization string, modelfile *model.File, fn func(resp api.ProgressResponse)) error {
-	deleteMap := make(map[string]struct{})
-	if manifest, _, err := GetManifest(ParseModelPath(name)); err == nil {
-		for _, layer := range append(manifest.Layers, manifest.Config) {
-			deleteMap[layer.Digest] = struct{}{}
-		}
-	}
-
+func CreateModel(ctx context.Context, name, modelFileDir, quantization string, modelfile *model.File, fn func(resp api.ProgressResponse)) (err error) {
 	config := ConfigV2{
 		OS:           "linux",
 		Architecture: "amd64",
@@ -373,250 +323,181 @@ func CreateModel(ctx context.Context, name, modelFileDir, quantization string, m
 		},
 	}

-	var layers Layers
-	messages := []string{}
-
-	params := make(map[string][]string)
-	fromParams := make(map[string]any)
+	var messages []*api.Message
+	parameters := make(map[string]any)

+	var layers []*Layer
 	for _, c := range modelfile.Commands {
 		mediatype := fmt.Sprintf("application/vnd.ollama.image.%s", c.Name)

 		switch c.Name {
-		case "model":
-			if strings.HasPrefix(c.Args, "@") {
-				blobPath, err := GetBlobsPath(strings.TrimPrefix(c.Args, "@"))
+		case "model", "adapter":
+			var baseLayers []*layerWithGGML
+			if name := model.ParseName(c.Args); name.IsValid() {
+				baseLayers, err = parseFromModel(ctx, name, fn)
+				if err != nil {
+					return err
+				}
+			} else if strings.HasPrefix(c.Args, "@") {
+				blobpath, err := GetBlobsPath(strings.TrimPrefix(c.Args, "@"))
 				if err != nil {
 					return err
 				}

-				c.Args = blobPath
-			}
-
-			pathName := realpath(modelFileDir, c.Args)
-
-			ggufName, err := convertModel(name, pathName, fn)
-			if err != nil {
-				var pathErr *fs.PathError
-				switch {
-				case errors.Is(err, zip.ErrFormat):
-					// it's not a safetensor archive
-				case errors.As(err, &pathErr):
-					// it's not a file on disk, could be a model reference
-				default:
+				blob, err := os.Open(blobpath)
+				if err != nil {
 					return err
 				}
+				defer blob.Close()
+
+				baseLayers, err = parseFromFile(ctx, blob, fn)
+				if err != nil {
+					return err
+				}
+			} else if file, err := os.Open(realpath(modelFileDir, c.Args)); err == nil {
+				defer file.Close()
+
+				baseLayers, err = parseFromFile(ctx, file, fn)
+				if err != nil {
+					return err
+				}
+			} else {
+				return fmt.Errorf("invalid model reference: %s", c.Args)
 			}

-			if ggufName != "" {
-				pathName = ggufName
-				defer os.RemoveAll(ggufName)
-
-				if quantization != "" {
-					quantization = strings.ToUpper(quantization)
-					fn(api.ProgressResponse{Status: fmt.Sprintf("quantizing %s model to %s", "F16", quantization)})
-					tempfile, err := os.CreateTemp(filepath.Dir(ggufName), quantization)
+			for _, baseLayer := range baseLayers {
+				if quantization != "" &&
+					baseLayer.MediaType == "application/vnd.ollama.image.model" &&
+					baseLayer.GGML != nil &&
+					baseLayer.GGML.Name() == "gguf" {
+					want, err := llm.ParseFileType(quantization)
 					if err != nil {
 						return err
 					}
-					defer os.RemoveAll(tempfile.Name())

-					if err := llm.Quantize(ggufName, tempfile.Name(), quantization); err != nil {
-						return err
-					}
+					ft := baseLayer.GGML.KV().FileType()
+					if !slices.Contains([]string{"F16", "F32"}, ft.String()) {
+						return errors.New("quantization is only supported for F16 and F32 models")
+					} else if want != ft {
+						fn(api.ProgressResponse{Status: fmt.Sprintf("quantizing %s model to %s", ft, quantization)})

-					if err := tempfile.Close(); err != nil {
-						return err
-					}
-
-					pathName = tempfile.Name()
-				}
-			}
-
-			bin, err := os.Open(pathName)
-			if err != nil {
-				// not a file on disk so must be a model reference
-				modelpath := ParseModelPath(c.Args)
-				manifest, _, err := GetManifest(modelpath)
-				switch {
-				case errors.Is(err, os.ErrNotExist):
-					fn(api.ProgressResponse{Status: "pulling model"})
-					if err := PullModel(ctx, c.Args, &registryOptions{}, fn); err != nil {
-						return err
-					}
-
-					manifest, _, err = GetManifest(modelpath)
-					if err != nil {
-						return err
-					}
-				case err != nil:
-					return err
-				}
-
-				fn(api.ProgressResponse{Status: "reading model metadata"})
-				fromConfigPath, err := GetBlobsPath(manifest.Config.Digest)
-				if err != nil {
-					return err
-				}
-
-				fromConfigFile, err := os.Open(fromConfigPath)
-				if err != nil {
-					return err
-				}
-				defer fromConfigFile.Close()
-
-				var fromConfig ConfigV2
-				if err := json.NewDecoder(fromConfigFile).Decode(&fromConfig); err != nil {
-					return err
-				}
-
-				// if the model is still not in gguf format, error out
-				if fromConfig.ModelFormat != "gguf" {
-					return fmt.Errorf("%s is not in gguf format, this base model is not compatible with this version of ollama", c.Args)
-				}
-
-				config.SetModelFormat(fromConfig.ModelFormat)
-				config.SetModelFamily(append(fromConfig.ModelFamilies, fromConfig.ModelFamily)...)
-				config.SetModelType(fromConfig.ModelType)
-				config.SetFileType(fromConfig.FileType)
-
-				for _, layer := range manifest.Layers {
-					deleteMap[layer.Digest] = struct{}{}
-					if layer.MediaType == "application/vnd.ollama.image.params" {
-						fromParamsPath, err := GetBlobsPath(layer.Digest)
+						blob, err := GetBlobsPath(baseLayer.Digest)
 						if err != nil {
 							return err
 						}

-						fromParamsFile, err := os.Open(fromParamsPath)
+						temp, err := os.CreateTemp(filepath.Dir(blob), quantization)
 						if err != nil {
 							return err
 						}
-						defer fromParamsFile.Close()
+						defer temp.Close()
+						defer os.Remove(temp.Name())

-						if err := json.NewDecoder(fromParamsFile).Decode(&fromParams); err != nil {
+						if err := llm.Quantize(blob, temp.Name(), want); err != nil {
+							return err
+						}
+
+						baseLayer.Layer, err = NewLayer(temp, baseLayer.Layer.MediaType)
+						if err != nil {
 							return err
 						}
 					}
-
-					layer, err := NewLayerFromLayer(layer.Digest, layer.MediaType, modelpath.GetShortTagname())
-					if err != nil {
-						return err
-					}
-
-					layers.Add(layer)
 				}

-				deleteMap[manifest.Config.Digest] = struct{}{}
-				continue
+				if baseLayer.GGML != nil {
+					config.ModelFormat = cmp.Or(config.ModelFormat, baseLayer.GGML.Name())
+					config.ModelFamily = cmp.Or(config.ModelFamily, baseLayer.GGML.KV().Architecture())
+					config.ModelType = cmp.Or(config.ModelType, format.HumanNumber(baseLayer.GGML.KV().ParameterCount()))
+					config.FileType = cmp.Or(config.FileType, baseLayer.GGML.KV().FileType().String())
+					config.ModelFamilies = append(config.ModelFamilies, baseLayer.GGML.KV().Architecture())
+				}
+
+				layers = append(layers, baseLayer.Layer)
 			}
-			defer bin.Close()
-
-			var offset int64
-			for {
-				fn(api.ProgressResponse{Status: "creating model layer"})
-				if _, err := bin.Seek(offset, io.SeekStart); err != nil {
-					return err
-				}
-
-				ggml, size, err := llm.DecodeGGML(bin)
-				if errors.Is(err, io.EOF) {
-					break
-				} else if errors.Is(err, llm.ErrUnsupportedFormat) {
-					return fmt.Errorf("model binary specified in FROM field is not a valid gguf format model, %w", err)
-				} else if err != nil {
-					return err
-				}
-
-				config.SetModelFormat(ggml.Name())
-				config.SetModelFamily(ggml.KV().Architecture())
-				config.SetModelType(format.HumanNumber(ggml.KV().ParameterCount()))
-				config.SetFileType(ggml.KV().FileType())
-
-				mediatype := mediatype
-				if ggml.KV().Architecture() == "clip" {
-					mediatype = "application/vnd.ollama.image.projector"
-				}
-
-				sr := io.NewSectionReader(bin, offset, size)
-				layer, err := NewLayer(sr, mediatype)
-				if err != nil {
-					return err
-				}
-
-				layers.Add(layer)
-
-				offset += size
-			}
-		case "adapter":
-			if strings.HasPrefix(c.Args, "@") {
-				blobPath, err := GetBlobsPath(strings.TrimPrefix(c.Args, "@"))
-				if err != nil {
-					return err
-				}
-
-				c.Args = blobPath
-			}
-
-			fn(api.ProgressResponse{Status: "creating adapter layer"})
-			bin, err := os.Open(realpath(modelFileDir, c.Args))
-			if err != nil {
-				return err
-			}
-			defer bin.Close()
-
-			_, size, err := llm.DecodeGGML(bin)
+		case "license", "template", "system":
+			blob := strings.NewReader(c.Args)
+			layer, err := NewLayer(blob, mediatype)
 			if err != nil {
 				return err
 			}

-			sr := io.NewSectionReader(bin, 0, size)
-			layer, err := NewLayer(sr, mediatype)
-			if err != nil {
-				return err
+			if c.Name != "license" {
+				// replace
+				layers = slices.DeleteFunc(layers, func(layer *Layer) bool {
+					return layer.MediaType == mediatype
+				})
 			}

-			layers.Add(layer)
-		case "license":
-			fn(api.ProgressResponse{Status: "creating license layer"})
-
-			bin := strings.NewReader(c.Args)
-			layer, err := NewLayer(bin, mediatype)
-			if err != nil {
-				return err
-			}
-
-			layers.Add(layer)
-		case "template", "system":
-			fn(api.ProgressResponse{Status: fmt.Sprintf("creating %s layer", c.Name)})
-
-			bin := strings.NewReader(c.Args)
-			layer, err := NewLayer(bin, mediatype)
-			if err != nil {
-				return err
-			}
-
-			layers.Replace(layer)
+			layers = append(layers, layer)
 		case "message":
-			messages = append(messages, c.Args)
+			role, content, ok := strings.Cut(c.Args, ": ")
+			if !ok {
+				return fmt.Errorf("invalid message: %s", c.Args)
+			}
+
+			messages = append(messages, &api.Message{Role: role, Content: content})
 		default:
-			params[c.Name] = append(params[c.Name], c.Args)
+			ps, err := api.FormatParams(map[string][]string{c.Name: {c.Args}})
+			if err != nil {
+				return err
+			}
+
+			for k, v := range ps {
+				if ks, ok := parameters[k].([]string); ok {
+					parameters[k] = append(ks, v.([]string)...)
+				} else if vs, ok := v.([]string); ok {
+					parameters[k] = vs
+				} else {
+					parameters[k] = v
+				}
+			}
 		}
 	}

-	if len(messages) > 0 {
-		fn(api.ProgressResponse{Status: "creating parameters layer"})
+	var err2 error
+	layers = slices.DeleteFunc(layers, func(layer *Layer) bool {
+		switch layer.MediaType {
+		case "application/vnd.ollama.image.message":
+			// if there are new messages, remove the inherited ones
+			if len(messages) > 0 {
+				return true
+			}

-		msgs := make([]api.Message, 0)
+			return false
+		case "application/vnd.ollama.image.params":
+			// merge inherited parameters with new ones
+			r, err := layer.Open()
+			if err != nil {
+				err2 = err
+				return false
+			}
+			defer r.Close()

-		for _, m := range messages {
-			// todo: handle images
-			msg := strings.SplitN(m, ": ", 2)
-			msgs = append(msgs, api.Message{Role: msg[0], Content: msg[1]})
+			var ps map[string]any
+			if err := json.NewDecoder(r).Decode(&ps); err != nil {
+				err2 = err
+				return false
+			}
+
+			for k, v := range ps {
+				if _, ok := parameters[k]; !ok {
+					parameters[k] = v
+				}
+			}
+
+			return true
+		default:
+			return false
 		}
+	})

+	if err2 != nil {
+		return err2
+	}
+
+	if len(messages) > 0 {
 		var b bytes.Buffer
-		if err := json.NewEncoder(&b).Encode(msgs); err != nil {
+		if err := json.NewEncoder(&b).Encode(messages); err != nil {
 			return err
 		}

@@ -625,39 +506,25 @@ func CreateModel(ctx context.Context, name, modelFileDir, quantization string, m
 			return err
 		}

-		layers.Replace(layer)
+		layers = append(layers, layer)
 	}

-	if len(params) > 0 {
-		fn(api.ProgressResponse{Status: "creating parameters layer"})
-
-		formattedParams, err := api.FormatParams(params)
-		if err != nil {
-			return err
-		}
-
-		for k, v := range fromParams {
-			if _, ok := formattedParams[k]; !ok {
-				formattedParams[k] = v
-			}
-		}
-
+	if len(parameters) > 0 {
 		var b bytes.Buffer
-		if err := json.NewEncoder(&b).Encode(formattedParams); err != nil {
+		if err := json.NewEncoder(&b).Encode(parameters); err != nil {
 			return err
 		}

-		fn(api.ProgressResponse{Status: "creating config layer"})
 		layer, err := NewLayer(&b, "application/vnd.ollama.image.params")
 		if err != nil {
 			return err
 		}

-		layers.Replace(layer)
+		layers = append(layers, layer)
 	}

-	digests := make([]string, len(layers.items))
-	for i, layer := range layers.items {
+	digests := make([]string, len(layers))
+	for i, layer := range layers {
 		digests[i] = layer.Digest
 	}

@@ -668,36 +535,37 @@ func CreateModel(ctx context.Context, name, modelFileDir, quantization string, m
 		return err
 	}

-	configLayer, err := NewLayer(&b, "application/vnd.docker.container.image.v1+json")
+	layer, err := NewLayer(&b, "application/vnd.docker.container.image.v1+json")
 	if err != nil {
 		return err
 	}

-	delete(deleteMap, configLayer.Digest)
+	for _, layer := range append(layers, layer) {
+		if layer.status != "" {
+			fn(api.ProgressResponse{Status: layer.status})
+		}
+	}

-	for _, layer := range append(layers.items, configLayer) {
-		committed, err := layer.Commit()
-		if err != nil {
-			return err
+	unref := make(map[string]struct{})
+	if manifest, _, err := GetManifest(ParseModelPath(name)); err == nil {
+		for _, layer := range manifest.Layers {
+			if !slices.Contains(digests, layer.Digest) {
+				unref[layer.Digest] = struct{}{}
+			}
 		}

-		status := "writing layer"
-		if !committed {
-			status = "using already created layer"
+		if manifest.Config.Digest != layer.Digest {
+			unref[manifest.Config.Digest] = struct{}{}
 		}
-
-		fn(api.ProgressResponse{Status: fmt.Sprintf("%s %s", status, layer.Digest)})
-
-		delete(deleteMap, layer.Digest)
 	}

 	fn(api.ProgressResponse{Status: "writing manifest"})
-	if err := WriteManifest(name, configLayer, layers.items); err != nil {
+	if err := WriteManifest(name, layer, layers); err != nil {
 		return err
 	}

 	if !envconfig.NoPrune {
-		if err := deleteUnusedLayers(nil, deleteMap, false); err != nil {
+		if err := deleteUnusedLayers(nil, unref); err != nil {
 			return err
 		}
 	}
@@ -706,74 +574,6 @@ func CreateModel(ctx context.Context, name, modelFileDir, quantization string, m
 	return nil
 }

-func convertModel(name, path string, fn func(resp api.ProgressResponse)) (string, error) {
-	r, err := zip.OpenReader(path)
-	if err != nil {
-		return "", err
-	}
-	defer r.Close()
-
-	tempDir, err := os.MkdirTemp("", "ollama-convert")
-	if err != nil {
-		return "", err
-	}
-	defer os.RemoveAll(tempDir)
-
-	fn(api.ProgressResponse{Status: "unpacking model metadata"})
-	for _, f := range r.File {
-		fpath := filepath.Join(tempDir, f.Name)
-		outFile, err := os.OpenFile(fpath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, f.Mode())
-		if err != nil {
-			return "", err
-		}
-
-		rc, err := f.Open()
-		if err != nil {
-			return "", err
-		}
-
-		_, err = io.Copy(outFile, rc)
-		if err != nil {
-			return "", err
-		}
-
-		outFile.Close()
-		rc.Close()
-	}
-
-	mf, err := convert.GetModelFormat(tempDir)
-	if err != nil {
-		return "", err
-	}
-
-	params, err := mf.GetParams(tempDir)
-	if err != nil {
-		return "", err
-	}
-
-	mArch, err := mf.GetModelArch(name, tempDir, params)
-	if err != nil {
-		return "", err
-	}
-
-	fn(api.ProgressResponse{Status: "processing tensors"})
-	if err := mArch.GetTensors(); err != nil {
-		return "", err
-	}
-
-	if err := mArch.LoadVocab(); err != nil {
-		return "", err
-	}
-
-	fn(api.ProgressResponse{Status: "converting model"})
-	path, err = mArch.WriteGGUF()
-	if err != nil {
-		return "", err
-	}
-
-	return path, nil
-}
-
 func CopyModel(src, dst model.Name) error {
 	if !dst.IsFullyQualified() {
 		return model.Unqualified(dst)
@@ -813,7 +613,7 @@ func CopyModel(src, dst model.Name) error {
 	return err
 }

-func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]struct{}, dryRun bool) error {
+func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]struct{}) error {
 	fp, err := GetManifestPath()
 	if err != nil {
 		return err
@@ -860,13 +660,9 @@ func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]struct{},
 			slog.Info(fmt.Sprintf("couldn't get file path for '%s': %v", k, err))
 			continue
 		}
-		if !dryRun {
-			if err := os.Remove(fp); err != nil {
-				slog.Info(fmt.Sprintf("couldn't remove file '%s': %v", fp, err))
-				continue
-			}
-		} else {
-			slog.Info(fmt.Sprintf("wanted to remove: %s", fp))
+		if err := os.Remove(fp); err != nil {
+			slog.Info(fmt.Sprintf("couldn't remove file '%s': %v", fp, err))
+			continue
 		}
 	}

@@ -889,14 +685,25 @@ func PruneLayers() error {
 	for _, blob := range blobs {
 		name := blob.Name()
 		name = strings.ReplaceAll(name, "-", ":")
-		if strings.HasPrefix(name, "sha256:") {
-			deleteMap[name] = struct{}{}
+
+		_, err := GetBlobsPath(name)
+		if err != nil {
+			if errors.Is(err, ErrInvalidDigestFormat) {
+				// remove invalid blobs (e.g. partial downloads)
+				if err := os.Remove(filepath.Join(p, blob.Name())); err != nil {
+					slog.Error("couldn't remove blob", "blob", blob.Name(), "error", err)
+				}
+			}
+
+			continue
 		}
+
+		deleteMap[name] = struct{}{}
 	}

 	slog.Info(fmt.Sprintf("total blobs: %d", len(deleteMap)))

-	err = deleteUnusedLayers(nil, deleteMap, false)
+	err = deleteUnusedLayers(nil, deleteMap)
 	if err != nil {
 		return err
 	}
@@ -952,7 +759,7 @@ func DeleteModel(name string) error {
 	}
 	deleteMap[manifest.Config.Digest] = struct{}{}

-	err = deleteUnusedLayers(&mp, deleteMap, false)
+	err = deleteUnusedLayers(&mp, deleteMap)
 	if err != nil {
 		return err
 	}
@@ -1112,7 +919,7 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu

 	if noprune == "" {
 		fn(api.ProgressResponse{Status: "removing any unused layers"})
-		err = deleteUnusedLayers(nil, deleteMap, false)
+		err = deleteUnusedLayers(nil, deleteMap)
 		if err != nil {
 			return err
 		}
--- a/server/layers.go
+++ b/server/layers.go
@@ -5,39 +5,14 @@ import (
 	"fmt"
 	"io"
 	"os"
-	"strings"
-
-	"golang.org/x/exp/slices"
 )

-type Layers struct {
-	items []*Layer
-}
-
-func (ls *Layers) Add(layer *Layer) {
-	if layer.Size > 0 {
-		ls.items = append(ls.items, layer)
-	}
-}
-
-func (ls *Layers) Replace(layer *Layer) {
-	if layer.Size > 0 {
-		mediatype := layer.MediaType
-		layers := slices.DeleteFunc(ls.items, func(l *Layer) bool {
-			return l.MediaType == mediatype
-		})
-
-		ls.items = append(layers, layer)
-	}
-}
-
 type Layer struct {
 	MediaType string `json:"mediaType"`
 	Digest    string `json:"digest"`
 	Size      int64  `json:"size"`
 	From      string `json:"from,omitempty"`
-
-	tempFileName string
+	status    string
 }

 func NewLayer(r io.Reader, mediatype string) (*Layer, error) {
@@ -46,14 +21,12 @@ func NewLayer(r io.Reader, mediatype string) (*Layer, error) {
 		return nil, err
 	}

-	const delimiter = "-"
-
-	pattern := strings.Join([]string{"sha256", "*-partial"}, delimiter)
-	temp, err := os.CreateTemp(blobs, pattern)
+	temp, err := os.CreateTemp(blobs, "sha256-")
 	if err != nil {
 		return nil, err
 	}
 	defer temp.Close()
+	defer os.Remove(temp.Name())

 	sha256sum := sha256.New()
 	n, err := io.Copy(io.MultiWriter(temp, sha256sum), r)
@@ -61,11 +34,29 @@ func NewLayer(r io.Reader, mediatype string) (*Layer, error) {
 		return nil, err
 	}

+	if err := temp.Close(); err != nil {
+		return nil, err
+	}
+
+	digest := fmt.Sprintf("sha256:%x", sha256sum.Sum(nil))
+	blob, err := GetBlobsPath(digest)
+	if err != nil {
+		return nil, err
+	}
+
+	status := "using existing layer"
+	if _, err := os.Stat(blob); err != nil {
+		status = "creating new layer"
+		if err := os.Rename(temp.Name(), blob); err != nil {
+			return nil, err
+		}
+	}
+
 	return &Layer{
-		MediaType:    mediatype,
-		Digest:       fmt.Sprintf("sha256:%x", sha256sum.Sum(nil)),
-		Size:         n,
-		tempFileName: temp.Name(),
+		MediaType: mediatype,
+		Digest:    digest,
+		Size:      n,
+		status:    fmt.Sprintf("%s %s", status, digest),
 	}, nil
 }

@@ -85,21 +76,15 @@ func NewLayerFromLayer(digest, mediatype, from string) (*Layer, error) {
 		Digest:    digest,
 		Size:      fi.Size(),
 		From:      from,
+		status:    fmt.Sprintf("using existing layer %s", digest),
 	}, nil
 }

-func (l *Layer) Commit() (bool, error) {
-	// always remove temp
-	defer os.Remove(l.tempFileName)
-
+func (l *Layer) Open() (io.ReadCloser, error) {
 	blob, err := GetBlobsPath(l.Digest)
 	if err != nil {
-		return false, err
+		return nil, err
 	}

-	if _, err := os.Stat(blob); err != nil {
-		return true, os.Rename(l.tempFileName, blob)
-	}
-
-	return false, nil
+	return os.Open(blob)
 }
--- a/server/manifest.go
+++ b/server/manifest.go
@@ -0,0 +1,79 @@
+package server
+
+import (
+	"bytes"
+	"crypto/sha256"
+	"encoding/json"
+	"fmt"
+	"io"
+	"os"
+	"path/filepath"
+
+	"github.com/ollama/ollama/types/model"
+)
+
+type Manifest struct {
+	ManifestV2
+	Digest string `json:"-"`
+}
+
+func (m *Manifest) Size() (size int64) {
+	for _, layer := range append(m.Layers, m.Config) {
+		size += layer.Size
+	}
+
+	return
+}
+
+func ParseNamedManifest(name model.Name) (*Manifest, error) {
+	if !name.IsFullyQualified() {
+		return nil, model.Unqualified(name)
+	}
+
+	manifests, err := GetManifestPath()
+	if err != nil {
+		return nil, err
+	}
+
+	var manifest ManifestV2
+	manifestfile, err := os.Open(filepath.Join(manifests, name.Filepath()))
+	if err != nil {
+		return nil, err
+	}
+
+	sha256sum := sha256.New()
+	if err := json.NewDecoder(io.TeeReader(manifestfile, sha256sum)).Decode(&manifest); err != nil {
+		return nil, err
+	}
+
+	return &Manifest{
+		ManifestV2: manifest,
+		Digest:     fmt.Sprintf("%x", sha256sum.Sum(nil)),
+	}, nil
+}
+
+func WriteManifest(name string, config *Layer, layers []*Layer) error {
+	manifest := ManifestV2{
+		SchemaVersion: 2,
+		MediaType:     "application/vnd.docker.distribution.manifest.v2+json",
+		Config:        config,
+		Layers:        layers,
+	}
+
+	var b bytes.Buffer
+	if err := json.NewEncoder(&b).Encode(manifest); err != nil {
+		return err
+	}
+
+	modelpath := ParseModelPath(name)
+	manifestPath, err := modelpath.GetManifestPath()
+	if err != nil {
+		return err
+	}
+
+	if err := os.MkdirAll(filepath.Dir(manifestPath), 0o755); err != nil {
+		return err
+	}
+
+	return os.WriteFile(manifestPath, b.Bytes(), 0o644)
+}
--- a/server/manifests.go
+++ b/server/manifests.go
@@ -1,34 +0,0 @@
-package server
-
-import (
-	"bytes"
-	"encoding/json"
-	"os"
-	"path/filepath"
-)
-
-func WriteManifest(name string, config *Layer, layers []*Layer) error {
-	manifest := ManifestV2{
-		SchemaVersion: 2,
-		MediaType:     "application/vnd.docker.distribution.manifest.v2+json",
-		Config:        config,
-		Layers:        layers,
-	}
-
-	var b bytes.Buffer
-	if err := json.NewEncoder(&b).Encode(manifest); err != nil {
-		return err
-	}
-
-	modelpath := ParseModelPath(name)
-	manifestPath, err := modelpath.GetManifestPath()
-	if err != nil {
-		return err
-	}
-
-	if err := os.MkdirAll(filepath.Dir(manifestPath), 0o755); err != nil {
-		return err
-	}
-
-	return os.WriteFile(manifestPath, b.Bytes(), 0o644)
-}
--- a/server/model.go
+++ b/server/model.go
@@ -0,0 +1,261 @@
+package server
+
+import (
+	"archive/zip"
+	"bytes"
+	"context"
+	"errors"
+	"fmt"
+	"io"
+	"net/http"
+	"os"
+	"path/filepath"
+
+	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/convert"
+	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/types/model"
+)
+
+type layerWithGGML struct {
+	*Layer
+	*llm.GGML
+}
+
+func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressResponse)) (layers []*layerWithGGML, err error) {
+	modelpath := ParseModelPath(name.String())
+	manifest, _, err := GetManifest(modelpath)
+	switch {
+	case errors.Is(err, os.ErrNotExist):
+		if err := PullModel(ctx, name.String(), &registryOptions{}, fn); err != nil {
+			return nil, err
+		}
+
+		modelpath = ParseModelPath(name.String())
+		manifest, _, err = GetManifest(modelpath)
+		if err != nil {
+			return nil, err
+		}
+	case err != nil:
+		return nil, err
+	}
+
+	for _, layer := range manifest.Layers {
+		layer, err := NewLayerFromLayer(layer.Digest, layer.MediaType, modelpath.GetShortTagname())
+		if err != nil {
+			return nil, err
+		}
+
+		switch layer.MediaType {
+		case "application/vnd.ollama.image.model",
+			"application/vnd.ollama.image.projector",
+			"application/vnd.ollama.image.adapter":
+			blobpath, err := GetBlobsPath(layer.Digest)
+			if err != nil {
+				return nil, err
+			}
+
+			blob, err := os.Open(blobpath)
+			if err != nil {
+				return nil, err
+			}
+			defer blob.Close()
+
+			ggml, _, err := llm.DecodeGGML(blob)
+			if err != nil {
+				return nil, err
+			}
+
+			layers = append(layers, &layerWithGGML{layer, ggml})
+		default:
+			layers = append(layers, &layerWithGGML{layer, nil})
+		}
+
+	}
+
+	return layers, nil
+}
+
+func parseFromZipFile(_ context.Context, file *os.File, fn func(api.ProgressResponse)) (layers []*layerWithGGML, err error) {
+	stat, err := file.Stat()
+	if err != nil {
+		return nil, err
+	}
+
+	r, err := zip.NewReader(file, stat.Size())
+	if err != nil {
+		return nil, err
+	}
+
+	tempdir, err := os.MkdirTemp(filepath.Dir(file.Name()), "")
+	if err != nil {
+		return nil, err
+	}
+	defer os.RemoveAll(tempdir)
+
+	fn(api.ProgressResponse{Status: "unpacking model metadata"})
+	for _, f := range r.File {
+		// TODO(mxyng): this should not write out all files to disk
+		outfile, err := os.Create(filepath.Join(tempdir, f.Name))
+		if err != nil {
+			return nil, err
+		}
+		defer outfile.Close()
+
+		infile, err := f.Open()
+		if err != nil {
+			return nil, err
+		}
+		defer infile.Close()
+
+		if _, err = io.Copy(outfile, infile); err != nil {
+			return nil, err
+		}
+
+		if err := outfile.Close(); err != nil {
+			return nil, err
+		}
+
+		if err := infile.Close(); err != nil {
+			return nil, err
+		}
+	}
+
+	mf, err := convert.GetModelFormat(tempdir)
+	if err != nil {
+		return nil, err
+	}
+
+	params, err := mf.GetParams(tempdir)
+	if err != nil {
+		return nil, err
+	}
+
+	mArch, err := mf.GetModelArch("", tempdir, params)
+	if err != nil {
+		return nil, err
+	}
+
+	fn(api.ProgressResponse{Status: "processing tensors"})
+	if err := mArch.GetTensors(); err != nil {
+		return nil, err
+	}
+
+	if err := mArch.LoadVocab(); err != nil {
+		return nil, err
+	}
+
+	fn(api.ProgressResponse{Status: "converting model"})
+
+	// TODO(mxyng): this should write directly into a layer
+	// e.g. NewLayer(arch.Reader(), "application/vnd.ollama.image.model")
+	temp, err := os.CreateTemp(tempdir, "fp16")
+	if err != nil {
+		return nil, err
+	}
+	defer temp.Close()
+	defer os.Remove(temp.Name())
+
+	if err = mArch.WriteGGUF(temp); err != nil {
+		return nil, err
+	}
+
+	if _, err := temp.Seek(0, io.SeekStart); err != nil {
+		return nil, err
+	}
+
+	layer, err := NewLayer(temp, "application/vnd.ollama.image.model")
+	if err != nil {
+		return nil, fmt.Errorf("aaa: %w", err)
+	}
+
+	blobpath, err := GetBlobsPath(layer.Digest)
+	if err != nil {
+		return nil, err
+	}
+
+	bin, err := os.Open(blobpath)
+	if err != nil {
+		return nil, err
+	}
+	defer bin.Close()
+
+	ggml, _, err := llm.DecodeGGML(bin)
+	if err != nil {
+		return nil, err
+	}
+
+	layer, err = NewLayerFromLayer(layer.Digest, layer.MediaType, "")
+	if err != nil {
+		return nil, err
+	}
+
+	layers = append(layers, &layerWithGGML{layer, ggml})
+	return layers, nil
+}
+
+func parseFromFile(ctx context.Context, file *os.File, fn func(api.ProgressResponse)) (layers []*layerWithGGML, err error) {
+	sr := io.NewSectionReader(file, 0, 512)
+	contentType, err := detectContentType(sr)
+	if err != nil {
+		return nil, err
+	}
+
+	switch contentType {
+	case "gguf", "ggla":
+		// noop
+	case "application/zip":
+		return parseFromZipFile(ctx, file, fn)
+	default:
+		return nil, fmt.Errorf("unsupported content type: %s", contentType)
+	}
+
+	stat, err := file.Stat()
+	if err != nil {
+		return nil, err
+	}
+
+	var offset int64
+	for offset < stat.Size() {
+		ggml, n, err := llm.DecodeGGML(file)
+		if errors.Is(err, io.EOF) {
+			break
+		} else if err != nil {
+			return nil, err
+		}
+
+		mediatype := "application/vnd.ollama.image.model"
+		if ggml.Name() == "ggla" {
+			mediatype = "application/vnd.ollama.image.adapter"
+		} else if ggml.KV().Architecture() == "clip" {
+			mediatype = "application/vnd.ollama.image.projector"
+		}
+
+		layer, err := NewLayer(io.NewSectionReader(file, offset, n), mediatype)
+		if err != nil {
+			return nil, err
+		}
+
+		layers = append(layers, &layerWithGGML{layer, ggml})
+		offset = n
+	}
+
+	return layers, nil
+}
+
+func detectContentType(r io.Reader) (string, error) {
+	var b bytes.Buffer
+	if _, err := io.Copy(&b, r); err != nil {
+		return "", err
+	}
+
+	if contentType := llm.DetectGGMLType(b.Bytes()); contentType != "" {
+		return contentType, nil
+	}
+
+	if contentType := http.DetectContentType(b.Bytes()); contentType != "application/octet-stream" {
+		return contentType, nil
+	}
+
+	return "unknown", nil
+}
--- a/server/modelpath.go
+++ b/server/modelpath.go
@@ -154,9 +154,6 @@ func GetBlobsPath(digest string) (string, error) {
 	// only accept actual sha256 digests
 	pattern := "^sha256[:-][0-9a-fA-F]{64}$"
 	re := regexp.MustCompile(pattern)
-	if err != nil {
-		return "", err
-	}

 	if digest != "" && !re.MatchString(digest) {
 		return "", ErrInvalidDigestFormat
--- a/server/routes.go
+++ b/server/routes.go
@@ -127,10 +127,6 @@ func (s *Server) GenerateHandler(c *gin.Context) {

 	opts, err := modelOptions(model, req.Options)
 	if err != nil {
-		if errors.Is(err, api.ErrInvalidOpts) {
-			c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
-			return
-		}
 		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		return
 	}
@@ -156,9 +152,10 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 	// of `raw` mode so we need to check for it too
 	if req.Prompt == "" && req.Template == "" && req.System == "" {
 		c.JSON(http.StatusOK, api.GenerateResponse{
-			CreatedAt: time.Now().UTC(),
-			Model:     req.Model,
-			Done:      true,
+			CreatedAt:  time.Now().UTC(),
+			Model:      req.Model,
+			Done:       true,
+			DoneReason: "load",
 		})
 		return
 	}
@@ -226,10 +223,11 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 			}

 			resp := api.GenerateResponse{
-				Model:     req.Model,
-				CreatedAt: time.Now().UTC(),
-				Done:      r.Done,
-				Response:  r.Content,
+				Model:      req.Model,
+				CreatedAt:  time.Now().UTC(),
+				Done:       r.Done,
+				Response:   r.Content,
+				DoneReason: r.DoneReason,
 				Metrics: api.Metrics{
 					PromptEvalCount:    r.PromptEvalCount,
 					PromptEvalDuration: r.PromptEvalDuration,
@@ -370,10 +368,6 @@ func (s *Server) EmbeddingsHandler(c *gin.Context) {

 	opts, err := modelOptions(model, req.Options)
 	if err != nil {
-		if errors.Is(err, api.ErrInvalidOpts) {
-			c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
-			return
-		}
 		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		return
 	}
@@ -413,6 +407,85 @@ func (s *Server) EmbeddingsHandler(c *gin.Context) {
 	c.JSON(http.StatusOK, resp)
 }

+func (s *Server) TokenizeHandler(c *gin.Context) {
+	var req api.TokenizeRequest
+	err := c.ShouldBindJSON(&req)
+	switch {
+	case errors.Is(err, io.EOF):
+		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"})
+		return
+	case err != nil:
+		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
+		return
+	}
+
+	if req.Model == "" {
+		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "model is required"})
+		return
+	}
+
+	model, err := GetModel(req.Model)
+	if err != nil {
+		var pErr *fs.PathError
+		if errors.As(err, &pErr) {
+			c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found, try pulling it first", req.Model)})
+			return
+		}
+		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+		return
+	}
+
+	opts, err := modelOptions(model, req.Options)
+	if err != nil {
+		// TODO: handle specific errors
+		// if errors.Is(err, api.ErrInvalidOpts) {
+		// 	c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
+		// 	return
+		// }
+		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+		return
+	}
+
+	var sessionDuration time.Duration
+	if req.KeepAlive == nil {
+		sessionDuration = getDefaultSessionDuration()
+	} else {
+		sessionDuration = req.KeepAlive.Duration
+	}
+
+	rCh, eCh := s.sched.GetRunner(c.Request.Context(), model, opts, sessionDuration)
+	var runner *runnerRef
+	select {
+	case runner = <-rCh:
+	case err = <-eCh:
+		if errors.Is(err, context.Canceled) {
+			c.JSON(499, gin.H{"error": "request canceled"})
+			return
+		}
+
+		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+		return
+	}
+
+	// an empty request loads the model
+	if req.Prompt == "" {
+		c.JSON(http.StatusOK, api.TokenizeResponse{Tokens: []int{}})
+		return
+	}
+
+	tokens, err := runner.llama.Tokenize(c.Request.Context(), req.Prompt)
+	if err != nil {
+		slog.Info(fmt.Sprintf("embedding generation failed: %v", err))
+		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to generate embedding"})
+		return
+	}
+
+	resp := api.TokenizeResponse{
+		Tokens: tokens,
+	}
+	c.JSON(http.StatusOK, resp)
+}
+
 func (s *Server) PullModelHandler(c *gin.Context) {
 	var req api.PullRequest
 	err := c.ShouldBindJSON(&req)
@@ -560,7 +633,7 @@ func (s *Server) CreateModelHandler(c *gin.Context) {
 		ctx, cancel := context.WithCancel(c.Request.Context())
 		defer cancel()

-		if err := CreateModel(ctx, name.String(), filepath.Dir(req.Path), req.Quantization, modelfile, fn); err != nil {
+		if err := CreateModel(ctx, name.String(), filepath.Dir(req.Path), strings.ToUpper(req.Quantization), modelfile, fn); err != nil {
 			ch <- gin.H{"error": err.Error()}
 		}
 	}()
@@ -719,62 +792,79 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
 }

 func (s *Server) ListModelsHandler(c *gin.Context) {
-	models := make([]api.ModelResponse, 0)
-	manifestsPath, err := GetManifestPath()
+	manifests, err := GetManifestPath()
 	if err != nil {
 		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		return
 	}

-	modelResponse := func(modelName string) (api.ModelResponse, error) {
-		model, err := GetModel(modelName)
-		if err != nil {
-			return api.ModelResponse{}, err
-		}
-
-		modelDetails := api.ModelDetails{
-			Format:            model.Config.ModelFormat,
-			Family:            model.Config.ModelFamily,
-			Families:          model.Config.ModelFamilies,
-			ParameterSize:     model.Config.ModelType,
-			QuantizationLevel: model.Config.FileType,
-		}
-
-		return api.ModelResponse{
-			Model:   model.ShortName,
-			Name:    model.ShortName,
-			Size:    model.Size,
-			Digest:  model.Digest,
-			Details: modelDetails,
-		}, nil
-	}
-
-	walkFunc := func(path string, info os.FileInfo, _ error) error {
+	var models []api.ModelResponse
+	if err := filepath.Walk(manifests, func(path string, info os.FileInfo, _ error) error {
 		if !info.IsDir() {
-			path, tag := filepath.Split(path)
-			model := strings.Trim(strings.TrimPrefix(path, manifestsPath), string(os.PathSeparator))
-			modelPath := strings.Join([]string{model, tag}, ":")
-			canonicalModelPath := strings.ReplaceAll(modelPath, string(os.PathSeparator), "/")
-
-			resp, err := modelResponse(canonicalModelPath)
+			rel, err := filepath.Rel(manifests, path)
 			if err != nil {
-				slog.Info(fmt.Sprintf("skipping file: %s", canonicalModelPath))
-				// nolint: nilerr
+				return err
+			}
+
+			if hidden, err := filepath.Match(".*", filepath.Base(rel)); err != nil {
+				return err
+			} else if hidden {
 				return nil
 			}

-			resp.ModifiedAt = info.ModTime()
-			models = append(models, resp)
+			n := model.ParseNameFromFilepath(rel)
+			if !n.IsValid() {
+				slog.Warn("bad manifest filepath", "path", rel)
+				return nil
+			}
+
+			m, err := ParseNamedManifest(n)
+			if err != nil {
+				slog.Warn("bad manifest", "name", n, "error", err)
+				return nil
+			}
+
+			f, err := m.Config.Open()
+			if err != nil {
+				slog.Warn("bad manifest config filepath", "name", n, "error", err)
+				return nil
+			}
+			defer f.Close()
+
+			var c ConfigV2
+			if err := json.NewDecoder(f).Decode(&c); err != nil {
+				slog.Warn("bad manifest config", "name", n, "error", err)
+				return nil
+			}
+
+			// tag should never be masked
+			models = append(models, api.ModelResponse{
+				Model:      n.DisplayShortest(),
+				Name:       n.DisplayShortest(),
+				Size:       m.Size(),
+				Digest:     m.Digest,
+				ModifiedAt: info.ModTime(),
+				Details: api.ModelDetails{
+					Format:            c.ModelFormat,
+					Family:            c.ModelFamily,
+					Families:          c.ModelFamilies,
+					ParameterSize:     c.ModelType,
+					QuantizationLevel: c.FileType,
+				},
+			})
 		}

 		return nil
-	}
-
-	if err := filepath.Walk(manifestsPath, walkFunc); err != nil {
+	}); err != nil {
 		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		return
 	}

+	slices.SortStableFunc(models, func(i, j api.ModelResponse) int {
+		// most recently modified first
+		return cmp.Compare(j.ModifiedAt.Unix(), i.ModifiedAt.Unix())
+	})
+
 	c.JSON(http.StatusOK, api.ListResponse{Models: models})
 }

@@ -796,7 +886,7 @@ func (s *Server) CopyModelHandler(c *gin.Context) {

 	dst := model.ParseName(r.Destination)
 	if !dst.IsValid() {
-		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("destination %q is invalid", r.Source)})
+		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("destination %q is invalid", r.Destination)})
 		return
 	}

@@ -852,11 +942,6 @@ func (s *Server) CreateBlobHandler(c *gin.Context) {
 		return
 	}

-	if _, err := layer.Commit(); err != nil {
-		c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
-		return
-	}
-
 	c.Status(http.StatusCreated)
 }

@@ -931,6 +1016,11 @@ func allowedHostsMiddleware(addr net.Addr) gin.HandlerFunc {
 		}

 		if allowedHost(host) {
+			if c.Request.Method == "OPTIONS" {
+				c.AbortWithStatus(http.StatusNoContent)
+				return
+			}
+
 			c.Next()
 			return
 		}
@@ -943,6 +1033,7 @@ func (s *Server) GenerateRoutes() http.Handler {
 	config := cors.DefaultConfig()
 	config.AllowWildcard = true
 	config.AllowBrowserExtensions = true
+	config.AllowHeaders = []string{"Authorization", "Content-Type", "User-Agent", "Accept", "X-Requested-With"}
 	config.AllowOrigins = envconfig.AllowOrigins

 	r := gin.Default()
@@ -955,6 +1046,7 @@ func (s *Server) GenerateRoutes() http.Handler {
 	r.POST("/api/generate", s.GenerateHandler)
 	r.POST("/api/chat", s.ChatHandler)
 	r.POST("/api/embeddings", s.EmbeddingsHandler)
+	r.POST("/api/tokenize", s.TokenizeHandler)
 	r.POST("/api/create", s.CreateModelHandler)
 	r.POST("/api/push", s.PushModelHandler)
 	r.POST("/api/copy", s.CopyModelHandler)
@@ -1027,7 +1119,8 @@ func Serve(ln net.Listener) error {
 	}

 	ctx, done := context.WithCancel(context.Background())
-	sched := InitScheduler(ctx)
+	schedCtx, schedDone := context.WithCancel(ctx)
+	sched := InitScheduler(schedCtx)
 	s := &Server{addr: ln.Addr(), sched: sched}
 	r := s.GenerateRoutes()

@@ -1042,23 +1135,31 @@ func Serve(ln net.Listener) error {
 	go func() {
 		<-signals
 		srvr.Close()
-		done()
+		schedDone()
 		sched.unloadAllRunners()
 		gpu.Cleanup()
-		os.Exit(0)
+		done()
 	}()

 	if err := llm.Init(); err != nil {
 		return fmt.Errorf("unable to initialize llm library %w", err)
 	}

-	s.sched.Run(ctx)
+	s.sched.Run(schedCtx)

 	// At startup we retrieve GPU information so we can get log messages before loading a model
 	// This will log warnings to the log in case we have problems with detected GPUs
-	_ = gpu.GetGPUInfo()
+	gpus := gpu.GetGPUInfo()
+	gpus.LogDetails()

-	return srvr.Serve(ln)
+	err = srvr.Serve(ln)
+	// If server is closed from the signal handler, wait for the ctx to be done
+	// otherwise error out quickly
+	if !errors.Is(err, http.ErrServerClosed) {
+		return err
+	}
+	<-ctx.Done()
+	return err
 }

 func waitForStream(c *gin.Context, ch chan interface{}) {
@@ -1167,10 +1268,6 @@ func (s *Server) ChatHandler(c *gin.Context) {

 	opts, err := modelOptions(model, req.Options)
 	if err != nil {
-		if errors.Is(err, api.ErrInvalidOpts) {
-			c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
-			return
-		}
 		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		return
 	}
@@ -1212,10 +1309,11 @@ func (s *Server) ChatHandler(c *gin.Context) {
 	// an empty request loads the model
 	if len(req.Messages) == 0 || prompt == "" {
 		resp := api.ChatResponse{
-			CreatedAt: time.Now().UTC(),
-			Model:     req.Model,
-			Done:      true,
-			Message:   api.Message{Role: "assistant"},
+			CreatedAt:  time.Now().UTC(),
+			Model:      req.Model,
+			Done:       true,
+			DoneReason: "load",
+			Message:    api.Message{Role: "assistant"},
 		}
 		c.JSON(http.StatusOK, resp)
 		return
@@ -1248,10 +1346,11 @@ func (s *Server) ChatHandler(c *gin.Context) {
 		fn := func(r llm.CompletionResponse) {

 			resp := api.ChatResponse{
-				Model:     req.Model,
-				CreatedAt: time.Now().UTC(),
-				Message:   api.Message{Role: "assistant", Content: r.Content},
-				Done:      r.Done,
+				Model:      req.Model,
+				CreatedAt:  time.Now().UTC(),
+				Message:    api.Message{Role: "assistant", Content: r.Content},
+				Done:       r.Done,
+				DoneReason: r.DoneReason,
 				Metrics: api.Metrics{
 					PromptEvalCount:    r.PromptEvalCount,
 					PromptEvalDuration: r.PromptEvalDuration,
--- a/server/routes_test.go
+++ b/server/routes_test.go
@@ -124,14 +124,12 @@ func Test_Routes(t *testing.T) {
 			Method: http.MethodPost,
 			Path:   "/api/create",
 			Setup: func(t *testing.T, req *http.Request) {
-				f, err := os.CreateTemp(t.TempDir(), "ollama-model")
-				assert.Nil(t, err)
-				defer f.Close()
+				fname := createTestFile(t, "ollama-model")

 				stream := false
 				createReq := api.CreateRequest{
 					Name:      "t-bone",
-					Modelfile: fmt.Sprintf("FROM %s", f.Name()),
+					Modelfile: fmt.Sprintf("FROM %s", fname),
 					Stream:    &stream,
 				}
 				jsonData, err := json.Marshal(createReq)
@@ -216,27 +214,25 @@ func Test_Routes(t *testing.T) {
 	httpSrv := httptest.NewServer(router)
 	t.Cleanup(httpSrv.Close)

-	workDir, err := os.MkdirTemp("", "ollama-test")
-	assert.Nil(t, err)
-	defer os.RemoveAll(workDir)
-	os.Setenv("OLLAMA_MODELS", workDir)
+	t.Setenv("OLLAMA_MODELS", t.TempDir())

 	for _, tc := range testCases {
-		t.Logf("Running Test: [%s]", tc.Name)
-		u := httpSrv.URL + tc.Path
-		req, err := http.NewRequestWithContext(context.TODO(), tc.Method, u, nil)
-		assert.Nil(t, err)
+		t.Run(tc.Name, func(t *testing.T) {
+			u := httpSrv.URL + tc.Path
+			req, err := http.NewRequestWithContext(context.TODO(), tc.Method, u, nil)
+			assert.Nil(t, err)

-		if tc.Setup != nil {
-			tc.Setup(t, req)
-		}
+			if tc.Setup != nil {
+				tc.Setup(t, req)
+			}

-		resp, err := httpSrv.Client().Do(req)
-		assert.Nil(t, err)
-		defer resp.Body.Close()
+			resp, err := httpSrv.Client().Do(req)
+			assert.Nil(t, err)
+			defer resp.Body.Close()

-		if tc.Expected != nil {
-			tc.Expected(t, resp)
-		}
+			if tc.Expected != nil {
+				tc.Expected(t, resp)
+			}
+		})
 	}
 }
--- a/server/sched.go
+++ b/server/sched.go
@@ -265,11 +265,14 @@ func (s *Scheduler) processCompleted(ctx context.Context) {

 			s.loadedMu.Lock()
 			slog.Debug("got lock to unload", "model", runner.model)
+			finished := runner.waitForVRAMRecovery()
 			runner.unload()
 			delete(s.loaded, runner.model)
 			s.loadedMu.Unlock()
 			slog.Debug("runner released", "model", runner.model)
 			runner.refMu.Unlock()
+
+			<-finished
 			slog.Debug("sending an unloaded event", "model", runner.model)
 			s.unloadedCh <- struct{}{}
 		}
@@ -465,6 +468,61 @@ func (runner *runnerRef) needsReload(ctx context.Context, req *LlmRequest) bool
 	return false
 }

+// Free memory reporting on GPUs can lag for a while even after the runner
+// exits, so we have to keep checking until we see the available memory recover,
+// otherwise subsequent model loads will get far less layers loaded or worse
+// case, may completely fall back to CPU mode.
+// This routine must be called before the runner unloads so it can establish
+// a before and after GPU memory allocation.  The returned channel
+// will be notified when we're done waiting, or have timed out and should
+// proceed anyway
+func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
+	finished := make(chan interface{}, 1)
+
+	// CPU or Metal don't need checking, so no waiting required
+	if len(runner.gpus) == 1 && (runner.gpus[0].Library == "cpu" || runner.gpus[0].Library == "metal") {
+		finished <- struct{}{}
+		return finished
+	}
+	start := time.Now()
+
+	// Establish a baseline before we unload
+	gpusBefore := gpu.GetGPUInfo()
+	var totalMemoryBefore, freeMemoryBefore uint64
+	for _, gpu := range gpusBefore {
+		totalMemoryBefore += gpu.TotalMemory
+		freeMemoryBefore += gpu.FreeMemory
+	}
+	go func() {
+		expiresAt := start.Add(5 * time.Second) // typical convergence is 0.5-1.5s
+		ticker := time.NewTicker(250 * time.Millisecond)
+		defer ticker.Stop()
+		for {
+			<-ticker.C
+			if time.Now().After(expiresAt) {
+				slog.Warn("gpu VRAM usage didn't recover within timeout", "seconds", time.Since(start).Seconds())
+				finished <- struct{}{}
+			}
+
+			// Query GPUs, look for free to go back up
+			gpusNow := gpu.GetGPUInfo()
+			var totalMemoryNow, freeMemoryNow uint64
+			for _, gpu := range gpusNow {
+				totalMemoryNow += gpu.TotalMemory
+				freeMemoryNow += gpu.FreeMemory
+			}
+			// If we're within ~80% of the estimated memory usage recovered, bail out
+			if float32(freeMemoryNow-freeMemoryBefore) > float32(runner.estimatedVRAM)*0.8 {
+				slog.Debug(fmt.Sprintf("gpu VRAM free memory converged after %0.2f seconds", time.Since(start).Seconds()))
+				finished <- struct{}{}
+				return
+			}
+		}
+	}()
+	return finished
+
+}
+
 type ByDuration []*runnerRef

 func (a ByDuration) Len() int      { return len(a) }
--- a/types/model/name.go
+++ b/types/model/name.go
@@ -35,6 +35,12 @@ func Unqualified(n Name) error {
 // spot in logs.
 const MissingPart = "!MISSING!"

+const (
+	defaultHost      = "registry.ollama.ai"
+	defaultNamespace = "library"
+	defaultTag       = "latest"
+)
+
 // DefaultName returns a name with the default values for the host, namespace,
 // and tag parts. The model and digest parts are empty.
 //
@@ -43,9 +49,9 @@ const MissingPart = "!MISSING!"
 //   - The default tag is ("latest")
 func DefaultName() Name {
 	return Name{
-		Host:      "registry.ollama.ai",
-		Namespace: "library",
-		Tag:       "latest",
+		Host:      defaultHost,
+		Namespace: defaultNamespace,
+		Tag:       defaultTag,
 	}
 }

@@ -169,6 +175,27 @@ func ParseNameBare(s string) Name {
 	return n
 }

+// ParseNameFromFilepath parses a 4-part filepath as a Name. The parts are
+// expected to be in the form:
+//
+// { host } "/" { namespace } "/" { model } "/" { tag }
+func ParseNameFromFilepath(s string) (n Name) {
+	parts := strings.Split(s, string(filepath.Separator))
+	if len(parts) != 4 {
+		return Name{}
+	}
+
+	n.Host = parts[0]
+	n.Namespace = parts[1]
+	n.Model = parts[2]
+	n.Tag = parts[3]
+	if !n.IsFullyQualified() {
+		return Name{}
+	}
+
+	return n
+}
+
 // Merge merges the host, namespace, and tag parts of the two names,
 // preferring the non-empty parts of a.
 func Merge(a, b Name) Name {
@@ -203,6 +230,27 @@ func (n Name) String() string {
 	return b.String()
 }

+// DisplayShort returns a short string version of the name.
+func (n Name) DisplayShortest() string {
+	var sb strings.Builder
+
+	if n.Host != defaultHost {
+		sb.WriteString(n.Host)
+		sb.WriteByte('/')
+		sb.WriteString(n.Namespace)
+		sb.WriteByte('/')
+	} else if n.Namespace != defaultNamespace {
+		sb.WriteString(n.Namespace)
+		sb.WriteByte('/')
+	}
+
+	// always include model and tag
+	sb.WriteString(n.Model)
+	sb.WriteString(":")
+	sb.WriteString(n.Tag)
+	return sb.String()
+}
+
 // IsValid reports whether all parts of the name are present and valid. The
 // digest is a special case, and is checked for validity only if present.
 func (n Name) IsValid() bool {
@@ -242,12 +290,14 @@ func (n Name) Filepath() string {
 	if !n.IsFullyQualified() {
 		panic("illegal attempt to get filepath of invalid name")
 	}
-	return strings.ToLower(filepath.Join(
-		n.Host,
-		n.Namespace,
-		n.Model,
+	return filepath.Join(
+		strings.ToLower(filepath.Join(
+			n.Host,
+			n.Namespace,
+			n.Model,
+		)),
 		n.Tag,
-	))
+	)
 }

 // LogValue returns a slog.Value that represents the name as a string.
--- a/types/model/name_test.go
+++ b/types/model/name_test.go
@@ -19,6 +19,16 @@ func TestParseNameParts(t *testing.T) {
 		wantFilepath    string
 		wantValidDigest bool
 	}{
+		{
+			in: "registry.ollama.ai/library/dolphin-mistral:7b-v2.6-dpo-laser-q6_K",
+			want: Name{
+				Host:      "registry.ollama.ai",
+				Namespace: "library",
+				Model:     "dolphin-mistral",
+				Tag:       "7b-v2.6-dpo-laser-q6_K",
+			},
+			wantFilepath: filepath.Join("registry.ollama.ai", "library", "dolphin-mistral", "7b-v2.6-dpo-laser-q6_K"),
+		},
 		{
 			in: "scheme://host:port/namespace/model:tag",
 			want: Name{
@@ -266,9 +276,9 @@ func TestFilepathAllocs(t *testing.T) {
 	allocs := testing.AllocsPerRun(1000, func() {
 		n.Filepath()
 	})
-	allowedAllocs := 2.0
+	var allowedAllocs float64 = 3
 	if runtime.GOOS == "windows" {
-		allowedAllocs = 4
+		allowedAllocs = 5
 	}
 	if allocs > allowedAllocs {
 		t.Errorf("allocs = %v; allowed %v", allocs, allowedAllocs)
@@ -309,6 +319,49 @@ func TestParseDigest(t *testing.T) {
 	}
 }

+func TestParseNameFromFilepath(t *testing.T) {
+	cases := map[string]Name{
+		filepath.Join("host", "namespace", "model", "tag"):      {Host: "host", Namespace: "namespace", Model: "model", Tag: "tag"},
+		filepath.Join("host:port", "namespace", "model", "tag"): {Host: "host:port", Namespace: "namespace", Model: "model", Tag: "tag"},
+		filepath.Join("namespace", "model", "tag"):              {},
+		filepath.Join("model", "tag"):                           {},
+		filepath.Join("model"):                                  {},
+		filepath.Join("..", "..", "model", "tag"):               {},
+		filepath.Join("", "namespace", ".", "tag"):              {},
+		filepath.Join(".", ".", ".", "."):                       {},
+		filepath.Join("/", "path", "to", "random", "file"):      {},
+	}
+
+	for in, want := range cases {
+		t.Run(in, func(t *testing.T) {
+			got := ParseNameFromFilepath(in)
+
+			if !reflect.DeepEqual(got, want) {
+				t.Errorf("parseNameFromFilepath(%q) = %v; want %v", in, got, want)
+			}
+		})
+	}
+}
+
+func TestDisplayShortest(t *testing.T) {
+	cases := map[string]string{
+		"registry.ollama.ai/library/model:latest": "model:latest",
+		"registry.ollama.ai/library/model:tag":    "model:tag",
+		"registry.ollama.ai/namespace/model:tag":  "namespace/model:tag",
+		"host/namespace/model:tag":                "host/namespace/model:tag",
+		"host/library/model:tag":                  "host/library/model:tag",
+	}
+
+	for in, want := range cases {
+		t.Run(in, func(t *testing.T) {
+			got := ParseNameBare(in).DisplayShortest()
+			if got != want {
+				t.Errorf("parseName(%q).DisplayShortest() = %q; want %q", in, got, want)
+			}
+		})
+	}
+}
+
 func FuzzName(f *testing.F) {
 	for s := range testCases {
 		f.Add(s)
Author	SHA1	Message	Date
Bruce MacDonald	938be81c45	Add TODO	2024-05-10 10:15:22 -07:00
Bruce MacDonald	19ce10e49e	add a /tokenize endpoint	2024-05-10 10:13:50 -07:00
Daniel Hiltgen	7e2bceceee	Merge pull request #4316 from dhiltgen/more_buffer Bump VRAM buffer back up	2024-05-10 10:02:34 -07:00
Daniel Hiltgen	30a7d7096c	Bump VRAM buffer back up Under stress scenarios we're seeing OOMs so this should help stabilize the allocations under heavy concurrency stress.	2024-05-10 09:15:28 -07:00
Michael Yang	200a18820e	Merge pull request #4306 from ollama/mxyng/fix-routes	2024-05-10 08:58:16 -07:00
Michael Yang	e03637176d	fix(routes): skip bad manifests	2024-05-10 08:46:11 -07:00
Bruce MacDonald	c02db93243	omit empty done reason	2024-05-09 16:45:29 -07:00
Michael Yang	ffa4d5134a	Merge pull request #4305 from ollama/mxyng/typo fix typo	2024-05-09 16:42:09 -07:00
Jeffrey Morgan	302d7fdbf3	prune partial downloads (#4272 )	2024-05-09 16:35:20 -07:00
Michael Yang	cf442cd57e	fix typo	2024-05-09 16:23:37 -07:00
Michael Yang	0e1ba65855	Merge pull request #4302 from ollama/mxyng/forward-env only forward some env vars	2024-05-09 16:21:05 -07:00
Michael Yang	6aad333c63	Merge pull request #4298 from ollama/mxyng/log-cleanup log clean up	2024-05-09 16:20:57 -07:00
Daniel Hiltgen	4fcc84e67a	Merge pull request #4304 from dhiltgen/signals Fix race in shutdown logic	2024-05-09 15:58:44 -07:00
Daniel Hiltgen	3ae2f441e0	Fix race in shutdown logic Ensure the runners are terminated	2024-05-09 15:54:02 -07:00
Zander Lewis	2abb3f6424	Update README.md (#4300 )	2024-05-09 15:30:49 -07:00
Michael Yang	ce3b212d12	only forward some env vars	2024-05-09 15:16:09 -07:00
Daniel Hiltgen	83d6d46e29	Merge pull request #4299 from dhiltgen/handle_vram_reporting_lag Wait for GPU free memory reporting to converge	2024-05-09 15:08:56 -07:00
Daniel Hiltgen	354ad9254e	Wait for GPU free memory reporting to converge The GPU drivers take a while to update their free memory reporting, so we need to wait until the values converge with what we're expecting before proceeding to start another runner in order to get an accurate picture.	2024-05-09 14:56:01 -07:00
Michael Yang	58876091f7	log clean up	2024-05-09 14:55:36 -07:00
Daniel Hiltgen	dc18eee39d	Merge pull request #4238 from dhiltgen/gpu_info Record more GPU information	2024-05-09 14:26:58 -07:00
Daniel Hiltgen	8727a9c140	Record more GPU information This cleans up the logging for GPU discovery a bit, and can serve as a foundation to report GPU information in a future UX.	2024-05-09 14:18:14 -07:00
Daniel Hiltgen	d0425f26cf	Merge pull request #4294 from dhiltgen/harden_subprocess_reaping Harden subprocess reaping	2024-05-09 14:02:16 -07:00
Bruce MacDonald	cfa84b8470	add done_reason to the api (#4235 )	2024-05-09 13:30:14 -07:00
Michael Yang	1580ed4c06	Merge pull request #4295 from ollama/mxyng/fix-list routes: skip invalid filepaths	2024-05-09 11:37:34 -07:00
Michael Yang	a7ee84fc31	routes: skip invalid filepaths	2024-05-09 11:23:22 -07:00
Daniel Hiltgen	84ac7ce139	Refine subprocess reaping	2024-05-09 11:21:31 -07:00
tusharhero	788b092c49	docs: add Guix package manager in README. (#4040 )	2024-05-09 11:10:24 -07:00
J S	5cde17a096	Add PromptingTools.jl (#2192 )	2024-05-09 09:39:05 -07:00
Daniel Hiltgen	c3837eb08c	Merge pull request #4289 from dhiltgen/doc_container_workarounds Doc container usage and workaround for nvidia errors	2024-05-09 09:27:29 -07:00
Daniel Hiltgen	8cc0ee2efe	Doc container usage and workaround for nvidia errors	2024-05-09 09:26:45 -07:00
Jeffrey Morgan	d5eec16d23	use model defaults for `num_gqa`, `rope_frequency_base` and `rope_frequency_scale` (#1983 )	2024-05-09 09:06:13 -07:00
Carlos Gamez	daa1a032f7	Update langchainjs.md (#2027 ) Updated sample code as per warning notification from the package maintainers	2024-05-08 20:21:03 -07:00
jmorganca	6042e8bc57	remove `bash-comparemodels` example	2024-05-08 19:49:45 -07:00
Daniel Hiltgen	920a4b0794	Merge remote-tracking branch 'upstream/main' into pr3702	2024-05-08 16:44:35 -07:00
Daniel Hiltgen	ee49844d09	Merge pull request #4153 from dhiltgen/gpu_verbose_response Add GPU usage	2024-05-08 16:39:11 -07:00
Daniel Hiltgen	8a516ac862	Merge pull request #4241 from dhiltgen/fix_tmp_override Detect noexec and report a better error	2024-05-08 15:34:22 -07:00
Daniel Hiltgen	bee2f4a3b0	Record GPU usage information This records more GPU usage information for eventual UX inclusion.	2024-05-08 14:45:39 -07:00
Bruce MacDonald	cef45feaa4	Add preflight OPTIONS handling and update CORS config (#4086 ) * Add preflight OPTIONS handling and update CORS config - Implement early return with HTTP 204 (No Content) for OPTIONS requests in allowedHostsMiddleware to optimize preflight handling. - Extend CORS configuration to explicitly allow 'Authorization' headers and 'OPTIONS' method when OLLAMA_ORIGINS environment variable is set. * allow auth, content-type, and user-agent headers * Update routes.go	2024-05-08 13:14:00 -07:00
Michael Yang	2687f02c96	Merge pull request #4265 from ollama/mxyng/fix-show-llava routes: fix show llava models	2024-05-08 12:51:21 -07:00
Michael Yang	b25976aeb8	routes: fix show llava models	2024-05-08 12:43:36 -07:00
Michael Yang	001f167aad	Merge pull request #4261 from ollama/mxyng/fix-tag-case types/model: fix tag case	2024-05-08 11:09:47 -07:00
Michael Yang	486a2c1d94	types/model: fix tag case	2024-05-08 08:47:16 -07:00
Michael Yang	88cf154483	Merge pull request #4244 from ollama/mxyng/skip-if-same skip if same quantization	2024-05-07 19:03:37 -07:00
Bruce MacDonald	8cbd3e7510	skip hidden files in list models handler (#4247 )	2024-05-07 19:01:45 -07:00
Michael Yang	eeb695261f	skip if same quantization	2024-05-07 17:44:19 -07:00
Bruce MacDonald	dc9b1111e0	fix invalid destination error message	2024-05-07 17:35:52 -07:00
Tobias Gårdhus	06ac829e70	Fix help string for stop parameter (#2307 )	2024-05-07 16:48:35 -07:00
Daniel Hiltgen	72700279e2	Detect noexec and report a better error This will bubble up a much more informative error message if noexec is preventing us from running the subprocess	2024-05-07 16:46:15 -07:00
boessu	5d3f7fff26	Update langchainpy.md (#4236 ) fixing pip code.	2024-05-07 16:36:34 -07:00
Eli Bendersky	d77c1c5f9d	api: fill up API documentation (#3596 ) * api: fill up API documentation Followup for #2878 Now that the documentation is more complete, mention it in the README. Updates #2840 * fix typo/lint * Update README.md Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com> --------- Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>	2024-05-07 16:27:46 -07:00
Giuseppe Lumia	2a5302a1cf	Fix paste of text with line feed characters (#3043 ) Some terminals may send line feed characters when pasting text with newlines.	2024-05-07 15:26:07 -07:00
Michael Yang	ffbd3d173f	Merge pull request #3715 from ollama/mxyng/modelname-2 update list handler to use model.Name	2024-05-07 15:21:39 -07:00
Michael Yang	1e0a669f75	Merge pull request #3682 from ollama/mxyng/quantize-all-the-things quantize any fp16/fp32 model	2024-05-07 15:20:49 -07:00
Bruce MacDonald	527e9be058	fix: store accurate model parameter size (#4058 ) - add test for number formatting - fix bug where 1B and 1M were not stored correctly - display 2 decimal points for million param sizes - display 1 decimal point for billion param sizes	2024-05-07 14:41:53 -07:00
Renat	34bea2e272	Add macai to list of Web & Desktop integrations (#3881 )	2024-05-07 13:31:34 -07:00
Fernando Maclen	fe44ae3371	Update README.md (#3884 )	2024-05-07 13:17:35 -07:00
Michael Yang	548a7df014	update list handler to use model.Name	2024-05-07 09:38:45 -07:00
Michael Yang	b2f00aa977	close zip files	2024-05-06 15:27:19 -07:00
Michael Yang	6694be5e50	convert/llama: use WriteSeeker	2024-05-06 15:24:01 -07:00
Michael Yang	f5e8b207fb	s/DisplayLongest/String/	2024-05-06 15:24:01 -07:00
Michael Yang	d245460362	only quantize language models	2024-05-06 15:24:01 -07:00
Michael Yang	4d0d0fa383	no iterator	2024-05-06 15:24:01 -07:00
Michael Yang	7ffe45734d	rebase	2024-05-06 15:24:01 -07:00
Michael Yang	01811c176a	comments	2024-05-06 15:24:01 -07:00
Michael Yang	a7248f6ea8	update tests	2024-05-06 15:24:01 -07:00
Michael Yang	9685c34509	quantize any fp16/fp32 model - FROM /path/to/{safetensors,pytorch} - FROM /path/to/fp{16,32}.bin - FROM model:fp{16,32}	2024-05-06 15:24:01 -07:00
ManniX-ITA	c496967e56	Merge branch 'ollama:main' into mannix-server	2024-04-18 18:45:15 +02:00
ManniX-ITA	c942e4a07b	Fixed startup sequence to report model loading	2024-04-17 17:40:32 +02:00
ManniX-ITA	bd54b08261	Streamlined WaitUntilRunning	2024-04-17 17:39:52 +02:00