fix formatting when exiting ollama run

history: update pos after compact
close input channel when receiving io.EOF
2026-01-09 16:10:26 -05:00 · 2023-10-27 21:26:23 -07:00 · 2023-10-27 20:38:03 -07:00 · 2023-10-27 20:26:04 -07:00 · 2023-10-27 20:01:48 -07:00 · 2023-10-27 18:29:00 -07:00
32 changed files with 1433 additions and 246 deletions
--- a/README.md
+++ b/README.md
@@ -29,7 +29,8 @@ curl https://ollama.ai/install.sh | sh

 ### Docker

-See the official [Docker image](https://hub.docker.com/r/ollama/ollama).
+The official [Ollama Docker image `ollama/ollama`](https://hub.docker.com/r/ollama/ollama)
+is available on Docker Hub.

 ## Quickstart

@@ -88,7 +89,7 @@ See the [guide](docs/import.md) on importing models for more information.

 ### Customize a prompt

-Models from the Ollama library can be customized with a prompt. The example
+Models from the Ollama library can be customized with a prompt. For example, to customize the `llama2` model:

 ```
 ollama pull llama2
@@ -178,8 +179,7 @@ ollama list
 Install `cmake` and `go`:

 ```
-brew install cmake
-brew install go
+brew install cmake go
 ```

 Then generate dependencies and build:
@@ -203,9 +203,8 @@ Finally, in a separate shell, run a model:

 ## REST API

-See the [API documentation](docs/api.md) for all endpoints.
-
-Ollama has an API for running and managing models. For example to generate text from a model:
+Ollama has a REST API for running and managing models.
+For example, to generate text from a model:

 ```
 curl -X POST http://localhost:11434/api/generate -d '{
@@ -214,6 +213,8 @@ curl -X POST http://localhost:11434/api/generate -d '{
 }'
 ```

+See the [API documentation](./docs/api.md) for all endpoints.
+
 ## Community Integrations

 - [LangChain](https://python.langchain.com/docs/integrations/llms/ollama) and [LangChain.js](https://js.langchain.com/docs/modules/model_io/models/llms/integrations/ollama) with [example](https://js.langchain.com/docs/use_cases/question_answering/local_retrieval_qa)
@@ -233,3 +234,4 @@ curl -X POST http://localhost:11434/api/generate -d '{
 - [oterm](https://github.com/ggozad/oterm)
 - [Ellama Emacs client](https://github.com/s-kostyaev/ellama)
 - [OllamaSharp for .NET](https://github.com/awaescher/OllamaSharp)
+- [Minimalistic React UI for Ollama Models](https://github.com/richawo/minimal-llm-ui)
--- a/api/client.go
+++ b/api/client.go
@@ -18,10 +18,6 @@ import (
 	"github.com/jmorganca/ollama/version"
 )

-const DefaultHost = "127.0.0.1:11434"
-
-var envHost = os.Getenv("OLLAMA_HOST")
-
 type Client struct {
 	base *url.URL
 	http http.Client
@@ -44,14 +40,24 @@ func checkError(resp *http.Response, body []byte) error {
 }

 func ClientFromEnvironment() (*Client, error) {
+	defaultPort := "11434"
+
 	scheme, hostport, ok := strings.Cut(os.Getenv("OLLAMA_HOST"), "://")
-	if !ok {
+	switch {
+	case !ok:
 		scheme, hostport = "http", os.Getenv("OLLAMA_HOST")
+	case scheme == "http":
+		defaultPort = "80"
+	case scheme == "https":
+		defaultPort = "443"
 	}

+	// trim trailing slashes
+	hostport = strings.TrimRight(hostport, "/")
+
 	host, port, err := net.SplitHostPort(hostport)
 	if err != nil {
-		host, port = "127.0.0.1", "11434"
+		host, port = "127.0.0.1", defaultPort
 		if ip := net.ParseIP(strings.Trim(hostport, "[]")); ip != nil {
 			host = ip.String()
 		} else if hostport != "" {
--- a/api/client_test.go
+++ b/api/client_test.go
@@ -0,0 +1,43 @@
+package api
+
+import "testing"
+
+func TestClientFromEnvironment(t *testing.T) {
+	type testCase struct {
+		value  string
+		expect string
+		err    error
+	}
+
+	testCases := map[string]*testCase{
+		"empty":                      {value: "", expect: "http://127.0.0.1:11434"},
+		"only address":               {value: "1.2.3.4", expect: "http://1.2.3.4:11434"},
+		"only port":                  {value: ":1234", expect: "http://:1234"},
+		"address and port":           {value: "1.2.3.4:1234", expect: "http://1.2.3.4:1234"},
+		"scheme http and address":    {value: "http://1.2.3.4", expect: "http://1.2.3.4:80"},
+		"scheme https and address":   {value: "https://1.2.3.4", expect: "https://1.2.3.4:443"},
+		"scheme, address, and port":  {value: "https://1.2.3.4:1234", expect: "https://1.2.3.4:1234"},
+		"hostname":                   {value: "example.com", expect: "http://example.com:11434"},
+		"hostname and port":          {value: "example.com:1234", expect: "http://example.com:1234"},
+		"scheme http and hostname":   {value: "http://example.com", expect: "http://example.com:80"},
+		"scheme https and hostname":  {value: "https://example.com", expect: "https://example.com:443"},
+		"scheme, hostname, and port": {value: "https://example.com:1234", expect: "https://example.com:1234"},
+		"trailing slash":             {value: "example.com/", expect: "http://example.com:11434"},
+		"trailing slash port":        {value: "example.com:1234/", expect: "http://example.com:1234"},
+	}
+
+	for k, v := range testCases {
+		t.Run(k, func(t *testing.T) {
+			t.Setenv("OLLAMA_HOST", v.value)
+
+			client, err := ClientFromEnvironment()
+			if err != v.err {
+				t.Fatalf("expected %s, got %s", v.err, err)
+			}
+
+			if client.base.String() != v.expect {
+				t.Fatalf("expected %s, got %s", v.expect, client.base.String())
+			}
+		})
+	}
+}
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -22,7 +22,6 @@ import (

 	"github.com/dustin/go-humanize"
 	"github.com/olekukonko/tablewriter"
-	"github.com/pdevine/readline"
 	"github.com/spf13/cobra"
 	"golang.org/x/crypto/ssh"
 	"golang.org/x/term"
@@ -30,30 +29,11 @@ import (
 	"github.com/jmorganca/ollama/api"
 	"github.com/jmorganca/ollama/format"
 	"github.com/jmorganca/ollama/progressbar"
+	"github.com/jmorganca/ollama/readline"
 	"github.com/jmorganca/ollama/server"
 	"github.com/jmorganca/ollama/version"
 )

-type Painter struct {
-	IsMultiLine bool
-}
-
-func (p Painter) Paint(line []rune, _ int) []rune {
-	termType := os.Getenv("TERM")
-	if termType == "xterm-256color" && len(line) == 0 {
-		var prompt string
-		if p.IsMultiLine {
-			prompt = "Use \"\"\" to end multi-line input"
-		} else {
-			prompt = "Send a message (/? for help)"
-		}
-		return []rune(fmt.Sprintf("\033[38;5;245m%s\033[%dD\033[0m", prompt, len(prompt)))
-	}
-	// add a space and a backspace to prevent the cursor from walking up the screen
-	line = append(line, []rune(" \b")...)
-	return line
-}
-
 func CreateHandler(cmd *cobra.Command, args []string) error {
 	filename, _ := cmd.Flags().GetString("file")
 	filename, err := filepath.Abs(filename)
@@ -508,38 +488,11 @@ func generate(cmd *cobra.Command, model, prompt string, wordWrap bool) error {
 }

 func generateInteractive(cmd *cobra.Command, model string) error {
-	home, err := os.UserHomeDir()
-	if err != nil {
-		return err
-	}
-
 	// load the model
 	if err := generate(cmd, model, "", false); err != nil {
 		return err
 	}

-	completer := readline.NewPrefixCompleter(
-		readline.PcItem("/help"),
-		readline.PcItem("/list"),
-		readline.PcItem("/set",
-			readline.PcItem("history"),
-			readline.PcItem("nohistory"),
-			readline.PcItem("wordwrap"),
-			readline.PcItem("nowordwrap"),
-			readline.PcItem("verbose"),
-			readline.PcItem("quiet"),
-		),
-		readline.PcItem("/show",
-			readline.PcItem("license"),
-			readline.PcItem("modelfile"),
-			readline.PcItem("parameters"),
-			readline.PcItem("system"),
-			readline.PcItem("template"),
-		),
-		readline.PcItem("/exit"),
-		readline.PcItem("/bye"),
-	)
-
 	usage := func() {
 		fmt.Fprintln(os.Stderr, "Available Commands:")
 		fmt.Fprintln(os.Stderr, "  /set         Set session variables")
@@ -572,20 +525,17 @@ func generateInteractive(cmd *cobra.Command, model string) error {
 		fmt.Fprintln(os.Stderr, "")
 	}

-	var painter Painter
-
-	config := readline.Config{
-		Painter:      &painter,
-		Prompt:       ">>> ",
-		HistoryFile:  filepath.Join(home, ".ollama", "history"),
-		AutoComplete: completer,
+	prompt := readline.Prompt{
+		Prompt:         ">>> ",
+		AltPrompt:      "... ",
+		Placeholder:    "Send a message (/? for help)",
+		AltPlaceholder: `Use """ to end multi-line input`,
 	}

-	scanner, err := readline.NewEx(&config)
+	scanner, err := readline.New(prompt)
 	if err != nil {
 		return err
 	}
-	defer scanner.Close()

 	var wordWrap bool
 	termType := os.Getenv("TERM")
@@ -602,17 +552,20 @@ func generateInteractive(cmd *cobra.Command, model string) error {
 		wordWrap = false
 	}

+	fmt.Print(readline.StartBracketedPaste)
+	defer fmt.Printf(readline.EndBracketedPaste)
+
 	var multiLineBuffer string
-	var isMultiLine bool

 	for {
 		line, err := scanner.Readline()
 		switch {
 		case errors.Is(err, io.EOF):
+			fmt.Println()
 			return nil
 		case errors.Is(err, readline.ErrInterrupt):
 			if line == "" {
-				fmt.Println("Use Ctrl-D or /bye to exit.")
+				fmt.Println("\nUse Ctrl-D or /bye to exit.")
 			}

 			continue
@@ -623,23 +576,19 @@ func generateInteractive(cmd *cobra.Command, model string) error {
 		line = strings.TrimSpace(line)

 		switch {
-		case isMultiLine:
+		case scanner.Prompt.UseAlt:
 			if strings.HasSuffix(line, `"""`) {
-				isMultiLine = false
-				painter.IsMultiLine = isMultiLine
+				scanner.Prompt.UseAlt = false
 				multiLineBuffer += strings.TrimSuffix(line, `"""`)
 				line = multiLineBuffer
 				multiLineBuffer = ""
-				scanner.SetPrompt(">>> ")
 			} else {
 				multiLineBuffer += line + " "
 				continue
 			}
 		case strings.HasPrefix(line, `"""`):
-			isMultiLine = true
-			painter.IsMultiLine = isMultiLine
+			scanner.Prompt.UseAlt = true
 			multiLineBuffer = strings.TrimPrefix(line, `"""`) + " "
-			scanner.SetPrompt("... ")
 			continue
 		case strings.HasPrefix(line, "/list"):
 			args := strings.Fields(line)
@@ -666,19 +615,6 @@ func generateInteractive(cmd *cobra.Command, model string) error {
 				case "quiet":
 					cmd.Flags().Set("verbose", "false")
 					fmt.Println("Set 'quiet' mode.")
-				case "mode":
-					if len(args) > 2 {
-						switch args[2] {
-						case "vim":
-							scanner.SetVimMode(true)
-						case "emacs", "default":
-							scanner.SetVimMode(false)
-						default:
-							usage()
-						}
-					} else {
-						usage()
-					}
 				default:
 					fmt.Printf("Unknown command '/set %s'. Type /? for help\n", args[1])
 				}
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -16,19 +16,64 @@ journalctl -u ollama

 If you're running `ollama serve` directly, the logs will be printed to the console.

-## How can I expose the Ollama server?
+## How can I expose Ollama on my network?
+
+Ollama binds to 127.0.0.1 port 11434 by default. Change the bind address with the `OLLAMA_HOST` environment variable.
+
+On macOS:

 ```bash
 OLLAMA_HOST=0.0.0.0:11435 ollama serve
 ```

-By default, Ollama allows cross origin requests from `127.0.0.1` and `0.0.0.0`. To support more origins, you can use the `OLLAMA_ORIGINS` environment variable:
+On Linux:
+
+Create a `systemd` drop-in directory and set `Environment=OLLAMA_HOST`:
+
+```bash
+mkdir -p /etc/systemd/system/ollama.service.d
+echo "[Service]" >>/etc/systemd/system/ollama.service.d/environment.conf
+```
+
+```bash
+echo "Environment=OLLAMA_HOST=0.0.0.0:11434" >>/etc/systemd/system/ollama.service.d/environment.conf
+```
+
+Reload `systemd` and restart Ollama:
+
+```bash
+systemctl daemon-reload
+systemctl restart ollama
+```
+
+## How can I allow additional web origins to access Ollama?
+
+Ollama allows cross origin requests from `127.0.0.1` and `0.0.0.0` by default. Add additional origins with the `OLLAMA_ORIGINS` environment variable:
+
+On macOS:

 ```bash
 OLLAMA_ORIGINS=http://192.168.1.1:*,https://example.com ollama serve
 ```

+On Linux:
+
+```bash
+echo "Environment=OLLAMA_ORIGINS=http://192.168.1.1:*,https://example.com" >>/etc/systemd/system/ollama.service.d/environment.conf
+```
+
+Reload `systemd` and restart Ollama:
+
+```bash
+systemctl daemon-reload
+systemctl restart ollama
+```
+
 ## Where are models stored?

 - macOS: Raw model data is stored under `~/.ollama/models`.
 - Linux: Raw model data is stored under `/usr/share/ollama/.ollama/models`
+
+### How can I change where Ollama stores models?
+
+To modify where models are stored, you can use the `OLLAMA_MODELS` environment variable. Note that on Linux this means defining `OLLAMA_MODELS` in a drop-in `/etc/systemd/system/ollama.service.d` service file, reloading systemd, and restarting the ollama service.
--- a/docs/import.md
+++ b/docs/import.md
@@ -1,8 +1,43 @@
 # Import a model

-This guide walks through importing a PyTorch, Safetensors or GGUF model.
+This guide walks through importing a GGUF, PyTorch or Safetensors model.

-## Supported models
+## Importing (GGUF)
+
+### Step 1: Write a `Modelfile`
+
+Start by creating a `Modelfile`. This file is the blueprint for your model, specifying weights, parameters, prompt templates and more.
+
+```
+FROM ./mistral-7b-v0.1.Q4_0.gguf
+```
+
+(Optional) many chat models require a prompt template in order to answer correctly. A default prompt template can be specified with the `TEMPLATE` instruction in the `Modelfile`:
+
+```
+FROM ./mistral-7b-v0.1.Q4_0.gguf
+TEMPLATE "[INST] {{ .Prompt }} [/INST]"
+```
+
+### Step 2: Create the Ollama model
+
+Create a model from your `Modelfile`:
+
+```
+ollama create example -f Modelfile
+```
+
+### Step 3: Run your model
+
+Next, test the model with `ollama run`:
+
+```
+ollama run example "What is your favourite condiment?"
+```
+
+## Importing (PyTorch & Safetensors)
+
+### Supported models

 Ollama supports a set of model architectures, with support for more coming soon:

@@ -13,8 +48,6 @@ Ollama supports a set of model architectures, with support for more coming soon:

 To view a model's architecture, check the `config.json` file in its HuggingFace repo. You should see an entry under `architectures` (e.g. `LlamaForCausalLM`).

-## Importing
-
 ### Step 1: Clone the HuggingFace repository (optional)

 If the model is currently hosted in a HuggingFace repository, first clone that repository to download the raw model.
@@ -44,7 +77,7 @@ This will output two files into the directory:

 ### Step 3: Write a `Modelfile`

-Next, create a `Modelfile` for your model. This file is the blueprint for your model, specifying weights, parameters, prompt templates and more.
+Next, create a `Modelfile` for your model:

 ```
 FROM ./q4_0.bin
@@ -65,13 +98,15 @@ Finally, create a model from your `Modelfile`:
 ollama create example -f Modelfile
 ```

+### Step 5: Run your model
+
 Next, test the model with `ollama run`:

 ```
 ollama run example "What is your favourite condiment?"
 ```

-### Step 5: Publish your model (optional – early alpha)
+## Publishing your model (optional – early alpha)

 Publishing models is in early alpha. If you'd like to publish your model to share with others, follow these steps:

--- a/docs/linux.md
+++ b/docs/linux.md
@@ -1,12 +1,16 @@
-# Installing Ollama on Linux
+# Ollama on Linux

-> Note: A one line installer for Ollama is available by running:
+## Install
+
+Install Ollama by running this one-liner:
 >
-> ```bash
-> curl https://ollama.ai/install.sh | sh
-> ```
+```bash
+curl https://ollama.ai/install.sh | sh
+```

-## Download the `ollama` binary
+## Manual install
+
+### Download the `ollama` binary

 Ollama is distributed as a self-contained binary. Download it to a directory in your PATH:

@@ -15,31 +19,7 @@ sudo curl -L https://ollama.ai/download/ollama-linux-amd64 -o /usr/bin/ollama
 sudo chmod +x /usr/bin/ollama
 ```

-## Start Ollama
-
-Start Ollama by running `ollama serve`:
-
-```bash
-ollama serve
-```
-
-Once Ollama is running, run a model in another terminal session:
-
-```bash
-ollama run llama2
-```
-
-## Install CUDA drivers (optional – for Nvidia GPUs)
-
-[Download and install](https://developer.nvidia.com/cuda-downloads) CUDA.
-
-Verify that the drivers are installed by running the following command, which should print details about your GPU:
-
-```bash
-nvidia-smi
-```
-
-## Adding Ollama as a startup service (optional)
+### Adding Ollama as a startup service (recommended)

 Create a user for Ollama:

@@ -60,7 +40,6 @@ User=ollama
 Group=ollama
 Restart=always
 RestartSec=3
-Environment="HOME=/usr/share/ollama"

 [Install]
 WantedBy=default.target
@@ -73,10 +52,65 @@ sudo systemctl daemon-reload
 sudo systemctl enable ollama
 ```

-### Viewing logs
+### Install CUDA drivers (optional – for Nvidia GPUs)
+
+[Download and install](https://developer.nvidia.com/cuda-downloads) CUDA.
+
+Verify that the drivers are installed by running the following command, which should print details about your GPU:
+
+```bash
+nvidia-smi
+```
+
+### Start Ollama
+
+Start Ollama using `systemd`:
+
+```bash
+sudo systemctl start ollama
+```
+
+## Update
+
+Update ollama by running the install script again:
+
+```bash
+curl https://ollama.ai/install.sh | sh
+```
+
+Or by downloading the ollama binary:
+
+```bash
+sudo curl -L https://ollama.ai/download/ollama-linux-amd64 -o /usr/bin/ollama
+sudo chmod +x /usr/bin/ollama
+```
+
+## Viewing logs

 To view logs of Ollama running as a startup service, run:

 ```bash
 journalctl -u ollama
 ```
+
+## Uninstall
+
+Remove the ollama service:
+
+```bash
+sudo systemctl stop ollama
+sudo systemctl disable ollama
+sudo rm /etc/systemd/system/ollama.service
+```
+
+Remove the ollama binary from your bin directory (either `/usr/local/bin`, `/usr/bin`, or `/bin`):
+
+```bash
+sudo rm $(which ollama)
+```
+
+Remove the downloaded models and Ollama service user:
+```bash
+sudo rm -r /usr/share/ollama
+sudo userdel ollama
+```
--- a/go.mod
+++ b/go.mod
@@ -4,13 +4,14 @@ go 1.20

 require (
 	github.com/dustin/go-humanize v1.0.1
+	github.com/emirpasic/gods v1.18.1
 	github.com/gin-gonic/gin v1.9.1
 	github.com/mattn/go-runewidth v0.0.14
 	github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db
 	github.com/olekukonko/tablewriter v0.0.5
-	github.com/pdevine/readline v1.5.2
 	github.com/spf13/cobra v1.7.0
 	golang.org/x/sync v0.3.0
+	gonum.org/v1/gonum v0.14.0
 )

 require github.com/rivo/uniseg v0.2.0 // indirect
@@ -39,12 +40,12 @@ require (
 	github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
 	github.com/ugorji/go/codec v1.2.11 // indirect
 	golang.org/x/arch v0.3.0 // indirect
-	golang.org/x/crypto v0.10.0
+	golang.org/x/crypto v0.14.0
 	golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63
-	golang.org/x/net v0.10.0 // indirect
-	golang.org/x/sys v0.11.0 // indirect
-	golang.org/x/term v0.10.0
-	golang.org/x/text v0.10.0 // indirect
+	golang.org/x/net v0.17.0 // indirect
+	golang.org/x/sys v0.13.0 // indirect
+	golang.org/x/term v0.13.0
+	golang.org/x/text v0.13.0 // indirect
 	google.golang.org/protobuf v1.30.0 // indirect
 	gopkg.in/yaml.v3 v3.0.1 // indirect
 )
--- a/go.sum
+++ b/go.sum
@@ -4,10 +4,6 @@ github.com/bytedance/sonic v1.9.1/go.mod h1:i736AoUSYt75HyZLoJW9ERYxcy6eaN6h4BZX
 github.com/chenzhuoyu/base64x v0.0.0-20211019084208-fb5309c8db06/go.mod h1:DH46F32mSOjUmXrMHnKwZdA8wcEefY7UVqBKYGjpdQY=
 github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311 h1:qSGYFH7+jGhDF8vLC+iwCD4WpbV1EBDSzWkJODFLams=
 github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311/go.mod h1:b583jCggY9gE99b6G5LEC39OIiVsWj+R97kbl5odCEk=
-github.com/chzyer/logex v1.2.1 h1:XHDu3E6q+gdHgsdTPH6ImJMIp436vR6MPtH8gP05QzM=
-github.com/chzyer/logex v1.2.1/go.mod h1:JLbx6lG2kDbNRFnfkgvh4eRJRPX1QCoOIWomwysCBrQ=
-github.com/chzyer/test v1.0.0 h1:p3BQDXSxOhOG0P9z6/hGnII4LGiEPOYBhs8asl/fC04=
-github.com/chzyer/test v1.0.0/go.mod h1:2JlltgoNkt4TW/z9V/IzDdFaMTM2JPIi26O1pF38GC8=
 github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
 github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
@@ -15,6 +11,8 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c
 github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
 github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
+github.com/emirpasic/gods v1.18.1 h1:FXtiHYKDGKCW2KzwZKx0iC0PQmdlorYgdFG9jPXJ1Bc=
+github.com/emirpasic/gods v1.18.1/go.mod h1:8tpGGwCnJ5H4r6BWwaV6OrWmMoPhUl5jm/FMNAnJvWQ=
 github.com/gabriel-vasile/mimetype v1.4.2 h1:w5qFW6JKBz9Y393Y4q372O9A7cUSequkh1Q7OhCmWKU=
 github.com/gabriel-vasile/mimetype v1.4.2/go.mod h1:zApsH/mKG4w07erKIaJPFiX0Tsq9BFQgN3qGY5GnNgA=
 github.com/gin-contrib/cors v1.4.0 h1:oJ6gwtUl3lqV0WEIwM/LxPF1QZ5qe2lGWdY2+bz7y0g=
@@ -78,8 +76,6 @@ github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N
 github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY=
 github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 h1:onHthvaw9LFnH4t2DcNVpwGmV9E1BkGknEliJkfwQj0=
 github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58/go.mod h1:DXv8WO4yhMYhSNPKjeNKa5WY9YCIEBRbNzFFPJbWO6Y=
-github.com/pdevine/readline v1.5.2 h1:oz6Y5GdTmhPG+08hhxcAvtHitSANWuA2100Sppb38xI=
-github.com/pdevine/readline v1.5.2/go.mod h1:na/LbuE5PYwxI7GyopWdIs3U8HVe89lYlNTFTXH3wOw=
 github.com/pelletier/go-toml/v2 v2.0.1/go.mod h1:r9LEWfGN8R5k0VXJ+0BkIe7MYkRdwZOjgMj2KwnJFUo=
 github.com/pelletier/go-toml/v2 v2.0.8 h1:0ctb6s9mE31h0/lhu+J6OPmVeDxJn+kYnJc2jZR9tGQ=
 github.com/pelletier/go-toml/v2 v2.0.8/go.mod h1:vuYfssBdrU2XDZ9bYydBu6t+6a6PYNcZljzZR9VXg+4=
@@ -118,33 +114,34 @@ golang.org/x/arch v0.0.0-20210923205945-b76863e36670/go.mod h1:5om86z9Hs0C8fWVUu
 golang.org/x/arch v0.3.0 h1:02VY4/ZcO/gBOH6PUaoiptASxtXU10jazRCP865E97k=
 golang.org/x/arch v0.3.0/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8=
 golang.org/x/crypto v0.0.0-20210711020723-a769d52b0f97/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
-golang.org/x/crypto v0.10.0 h1:LKqV2xt9+kDzSTfOhx4FrkEBcMrAgHSYgzywV9zcGmM=
-golang.org/x/crypto v0.10.0/go.mod h1:o4eNf7Ede1fv+hwOwZsTHl9EsPFO6q6ZvYR8vYfY45I=
+golang.org/x/crypto v0.14.0 h1:wBqGXzWJW6m1XrIKlAH0Hs1JJ7+9KBwnIO8v66Q9cHc=
+golang.org/x/crypto v0.14.0/go.mod h1:MVFd36DqK4CsrnJYDkBA3VC4m2GkXAM0PvzMCn4JQf4=
 golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63 h1:m64FZMko/V45gv0bNmrNYoDEq8U5YUhetc9cBWKS1TQ=
 golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63/go.mod h1:0v4NqG35kSWCMzLaMeX+IQrlSnVE/bqGSyC2cz/9Le8=
 golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
-golang.org/x/net v0.10.0 h1:X2//UzNDwYmtCLn7To6G58Wr6f5ahEAQgKNzv9Y951M=
-golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
+golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM=
+golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE=
 golang.org/x/sync v0.3.0 h1:ftCYgMx6zT/asHUrPw8BLLscYtGznsLAnjq5RH9P66E=
 golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y=
 golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.0.0-20210806184541-e5e7981a1069/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.0.0-20220310020820-b874c991c1a5/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.0.0-20220704084225-05e143d24a9e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.11.0 h1:eG7RXZHdqOJ1i+0lgLgCpSXAp6M3LYlAo6osgSi0xOM=
-golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.13.0 h1:Af8nKPmuFypiUBjVoU9V20FiaFXOcuZI21p0ycVYYGE=
+golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
-golang.org/x/term v0.10.0 h1:3R7pNqamzBraeqj/Tj8qt1aQ2HpmlC+Cx/qL/7hn4/c=
-golang.org/x/term v0.10.0/go.mod h1:lpqdcUyK/oCiQxvxVrppt5ggO2KCZ5QblwqPnfZ6d5o=
+golang.org/x/term v0.13.0 h1:bb+I9cTfFazGW51MZqBVmZy7+JEJMouUHTUSKVQLBek=
+golang.org/x/term v0.13.0/go.mod h1:LTmsnFJwVN6bCy1rVCoS+qHT1HhALEFxKncY3WNNh4U=
 golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
 golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
-golang.org/x/text v0.10.0 h1:UpjohKhiEgNc0CSauXmwYftY1+LlaC75SJwh0SgCX58=
-golang.org/x/text v0.10.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
+golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k=
+golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
 golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
+gonum.org/v1/gonum v0.14.0 h1:2NiG67LD1tEH0D7kM+ps2V+fXmsAnpUeec7n8tcr4S0=
+gonum.org/v1/gonum v0.14.0/go.mod h1:AoWeoz0becf9QMWtE8iWXNXc27fK4fNeHNf/oMejGfU=
 google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=
 google.golang.org/protobuf v1.28.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I=
 google.golang.org/protobuf v1.30.0 h1:kPPoIgf3TsEvrm0PFe15JQ+570QVxYzEvvHqChK+cng=
--- a/llm/ggml.go
+++ b/llm/ggml.go
@@ -175,7 +175,8 @@ const (
 	// Magic constant for `ggla` files (LoRA adapter).
 	FILE_MAGIC_GGLA = 0x67676C61
 	// Magic constant for `gguf` files (versioned, gguf)
-	FILE_MAGIC_GGUF = 0x46554747
+	FILE_MAGIC_GGUF_LE = 0x46554747
+	FILE_MAGIC_GGUF_BE = 0x47475546
 )

 func DecodeGGML(r io.ReadSeeker) (*GGML, error) {
@@ -191,8 +192,10 @@ func DecodeGGML(r io.ReadSeeker) (*GGML, error) {
 		ggml.container = &containerGGJT{}
 	case FILE_MAGIC_GGLA:
 		ggml.container = &containerLORA{}
-	case FILE_MAGIC_GGUF:
-		ggml.container = &containerGGUF{}
+	case FILE_MAGIC_GGUF_LE:
+		ggml.container = &containerGGUF{bo: binary.LittleEndian}
+	case FILE_MAGIC_GGUF_BE:
+		ggml.container = &containerGGUF{bo: binary.BigEndian}
 	default:
 		return nil, errors.New("invalid file magic")
 	}
--- a/llm/gguf.go
+++ b/llm/gguf.go
@@ -3,12 +3,13 @@ package llm
 import (
 	"bytes"
 	"encoding/binary"
-	"errors"
 	"fmt"
 	"io"
 )

 type containerGGUF struct {
+	bo binary.ByteOrder
+
 	Version uint32

 	V1 struct {
@@ -27,15 +28,13 @@ func (c *containerGGUF) Name() string {
 }

 func (c *containerGGUF) Decode(r io.Reader) (model, error) {
-	binary.Read(r, binary.LittleEndian, &c.Version)
+	binary.Read(r, c.bo, &c.Version)

 	switch c.Version {
 	case 1:
-		binary.Read(r, binary.LittleEndian, &c.V1)
-	case 2:
-		binary.Read(r, binary.LittleEndian, &c.V2)
+		binary.Read(r, c.bo, &c.V1)
 	default:
-		return nil, errors.New("invalid version")
+		binary.Read(r, c.bo, &c.V2)
 	}

 	model := newGGUFModel(c)
@@ -209,75 +208,75 @@ func (llm *ggufModel) NumLayers() int64 {
 	return int64(v)
 }

-func (ggufModel) readU8(r io.Reader) uint8 {
+func (llm ggufModel) readU8(r io.Reader) uint8 {
 	var u8 uint8
-	binary.Read(r, binary.LittleEndian, &u8)
+	binary.Read(r, llm.bo, &u8)
 	return u8
 }

-func (ggufModel) readI8(r io.Reader) int8 {
+func (llm ggufModel) readI8(r io.Reader) int8 {
 	var i8 int8
-	binary.Read(r, binary.LittleEndian, &i8)
+	binary.Read(r, llm.bo, &i8)
 	return i8
 }

-func (ggufModel) readU16(r io.Reader) uint16 {
+func (llm ggufModel) readU16(r io.Reader) uint16 {
 	var u16 uint16
-	binary.Read(r, binary.LittleEndian, &u16)
+	binary.Read(r, llm.bo, &u16)
 	return u16
 }

-func (ggufModel) readI16(r io.Reader) int16 {
+func (llm ggufModel) readI16(r io.Reader) int16 {
 	var i16 int16
-	binary.Read(r, binary.LittleEndian, &i16)
+	binary.Read(r, llm.bo, &i16)
 	return i16
 }

-func (ggufModel) readU32(r io.Reader) uint32 {
+func (llm ggufModel) readU32(r io.Reader) uint32 {
 	var u32 uint32
-	binary.Read(r, binary.LittleEndian, &u32)
+	binary.Read(r, llm.bo, &u32)
 	return u32
 }

-func (ggufModel) readI32(r io.Reader) int32 {
+func (llm ggufModel) readI32(r io.Reader) int32 {
 	var i32 int32
-	binary.Read(r, binary.LittleEndian, &i32)
+	binary.Read(r, llm.bo, &i32)
 	return i32
 }

-func (ggufModel) readU64(r io.Reader) uint64 {
+func (llm ggufModel) readU64(r io.Reader) uint64 {
 	var u64 uint64
-	binary.Read(r, binary.LittleEndian, &u64)
+	binary.Read(r, llm.bo, &u64)
 	return u64
 }

-func (ggufModel) readI64(r io.Reader) int64 {
+func (llm ggufModel) readI64(r io.Reader) int64 {
 	var i64 int64
-	binary.Read(r, binary.LittleEndian, &i64)
+	binary.Read(r, llm.bo, &i64)
 	return i64
 }

-func (ggufModel) readF32(r io.Reader) float32 {
+func (llm ggufModel) readF32(r io.Reader) float32 {
 	var f32 float32
-	binary.Read(r, binary.LittleEndian, &f32)
+	binary.Read(r, llm.bo, &f32)
 	return f32
 }

-func (ggufModel) readF64(r io.Reader) float64 {
+func (llm ggufModel) readF64(r io.Reader) float64 {
 	var f64 float64
-	binary.Read(r, binary.LittleEndian, &f64)
+	binary.Read(r, llm.bo, &f64)
 	return f64
 }

-func (ggufModel) readBool(r io.Reader) bool {
+func (llm ggufModel) readBool(r io.Reader) bool {
 	var b bool
-	binary.Read(r, binary.LittleEndian, &b)
+	binary.Read(r, llm.bo, &b)
 	return b
 }

-func (ggufModel) readStringV1(r io.Reader) (string, error) {
+func (llm ggufModel) readStringV1(r io.Reader) (string, error) {
 	var nameLength uint32
-	binary.Read(r, binary.LittleEndian, &nameLength)
+	binary.Read(r, llm.bo, &nameLength)

 	var b bytes.Buffer
 	if _, err := io.CopyN(&b, r, int64(nameLength)); err != nil {
@@ -292,7 +291,7 @@ func (ggufModel) readStringV1(r io.Reader) (string, error) {

 func (llm ggufModel) readString(r io.Reader) (string, error) {
 	var nameLength uint64
-	binary.Read(r, binary.LittleEndian, &nameLength)
+	binary.Read(r, llm.bo, &nameLength)

 	var b bytes.Buffer
 	if _, err := io.CopyN(&b, r, int64(nameLength)); err != nil {
--- a/llm/llama.cpp/generate_darwin_amd64.go
+++ b/llm/llama.cpp/generate_darwin_amd64.go
@@ -12,7 +12,8 @@ package llm
 //go:generate mv ggml/build/cpu/bin/server ggml/build/cpu/bin/ollama-runner

 //go:generate git submodule update --force gguf
-//go:generate git -C gguf apply ../patches/0001-remove-warm-up-logging.patch
+//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
+//go:generate git -C gguf apply ../patches/0001-metal-handle-ggml_scale-for-n-4-0-close-3754.patch
 //go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
 //go:generate cmake --build gguf/build/cpu --target server --config Release
 //go:generate mv gguf/build/cpu/bin/server gguf/build/cpu/bin/ollama-runner
--- a/llm/llama.cpp/generate_darwin_arm64.go
+++ b/llm/llama.cpp/generate_darwin_arm64.go
@@ -12,7 +12,8 @@ package llm
 //go:generate mv ggml/build/metal/bin/server ggml/build/metal/bin/ollama-runner

 //go:generate git submodule update --force gguf
-//go:generate git -C gguf apply ../patches/0001-remove-warm-up-logging.patch
+//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
+//go:generate git -C gguf apply ../patches/0001-metal-handle-ggml_scale-for-n-4-0-close-3754.patch
 //go:generate cmake -S gguf -B gguf/build/metal -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
 //go:generate cmake --build gguf/build/metal --target server --config Release
 //go:generate mv gguf/build/metal/bin/server gguf/build/metal/bin/ollama-runner
--- a/llm/llama.cpp/generate_linux.go
+++ b/llm/llama.cpp/generate_linux.go
@@ -13,14 +13,14 @@ package llm

 //go:generate git submodule update --force gguf
 //go:generate git -C gguf apply ../patches/0001-copy-cuda-runtime-libraries.patch
-//go:generate git -C gguf apply ../patches/0001-remove-warm-up-logging.patch
-//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_K_QUANTS=on
+//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
+//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off
 //go:generate cmake --build gguf/build/cpu --target server --config Release
 //go:generate mv gguf/build/cpu/bin/server gguf/build/cpu/bin/ollama-runner

 //go:generate cmake -S ggml -B ggml/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
 //go:generate cmake --build ggml/build/cuda --target server --config Release
 //go:generate mv ggml/build/cuda/bin/server ggml/build/cuda/bin/ollama-runner
-//go:generate cmake -S gguf -B gguf/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
+//go:generate cmake -S gguf -B gguf/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off
 //go:generate cmake --build gguf/build/cuda --target server --config Release
 //go:generate mv gguf/build/cuda/bin/server gguf/build/cuda/bin/ollama-runner
--- a/llm/llama.cpp/generate_windows.go
+++ b/llm/llama.cpp/generate_windows.go
@@ -10,7 +10,7 @@ package llm
 //go:generate cmd /c move ggml\build\cpu\bin\Release\server.exe ggml\build\cpu\bin\Release\ollama-runner.exe

 //go:generate git submodule update --force gguf
-//go:generate git -C gguf apply ../patches/0001-remove-warm-up-logging.patch
-//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_K_QUANTS=on
+//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
+//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off
 //go:generate cmake --build gguf/build/cpu --target server --config Release
 //go:generate cmd /c move gguf\build\cpu\bin\Release\server.exe gguf\build\cpu\bin\Release\ollama-runner.exe
--- a/llm/llama.cpp/gguf
+++ b/llm/llama.cpp/gguf
--- a/llm/llama.cpp/patches/0001-metal-handle-ggml_scale-for-n-4-0-close-3754.patch
+++ b/llm/llama.cpp/patches/0001-metal-handle-ggml_scale-for-n-4-0-close-3754.patch
@@ -0,0 +1,91 @@
+From 469c9addef75893e6be12edda852d12e840bf064 Mon Sep 17 00:00:00 2001
+From: Georgi Gerganov <ggerganov@gmail.com>
+Date: Tue, 24 Oct 2023 09:46:50 +0300
+Subject: [PATCH 1/2] metal : handle ggml_scale for n%4 != 0 (close #3754)
+
+ggml-ci
+---
+ ggml-metal.m     | 18 +++++++++++++-----
+ ggml-metal.metal | 10 +++++++++-
+ 2 files changed, 22 insertions(+), 6 deletions(-)
+
+diff --git a/ggml-metal.m b/ggml-metal.m
+index c908106..c1901dc 100644
+--- a/ggml-metal.m
+++ b/ggml-metal.m
+@@ -62,6 +62,7 @@
+     GGML_METAL_DECL_KERNEL(mul);
+     GGML_METAL_DECL_KERNEL(mul_row); // TODO: avoid this extra kernel, instead extend the "mul" kernel to support broadcast
+     GGML_METAL_DECL_KERNEL(scale);
+    GGML_METAL_DECL_KERNEL(scale_4);
+     GGML_METAL_DECL_KERNEL(silu);
+     GGML_METAL_DECL_KERNEL(relu);
+     GGML_METAL_DECL_KERNEL(gelu);
+@@ -249,6 +250,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char* format, ...){
+         GGML_METAL_ADD_KERNEL(mul);
+         GGML_METAL_ADD_KERNEL(mul_row);
+         GGML_METAL_ADD_KERNEL(scale);
+        GGML_METAL_ADD_KERNEL(scale_4);
+         GGML_METAL_ADD_KERNEL(silu);
+         GGML_METAL_ADD_KERNEL(relu);
+         GGML_METAL_ADD_KERNEL(gelu);
+@@ -347,6 +349,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
+     GGML_METAL_DEL_KERNEL(mul);
+     GGML_METAL_DEL_KERNEL(mul_row);
+     GGML_METAL_DEL_KERNEL(scale);
+    GGML_METAL_DEL_KERNEL(scale_4);
+     GGML_METAL_DEL_KERNEL(silu);
+     GGML_METAL_DEL_KERNEL(relu);
+     GGML_METAL_DEL_KERNEL(gelu);
+@@ -923,15 +926,20 @@ void ggml_metal_graph_compute(
+ 
+                             const float scale = *(const float *) src1->data;
+ 
+-                            [encoder setComputePipelineState:ctx->pipeline_scale];
+                            int64_t n = ggml_nelements(dst);
+
+                            if (n % 4 == 0) {
+                                n /= 4;
+                                [encoder setComputePipelineState:ctx->pipeline_scale_4];
+                            } else {
+                                [encoder setComputePipelineState:ctx->pipeline_scale];
+                            }
+
+                             [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                             [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                             [encoder setBytes:&scale length:sizeof(scale) atIndex:2];
+ 
+-                            const int64_t n = ggml_nelements(dst);
+-                            GGML_ASSERT(n % 4 == 0);
+-
+-                            [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                         } break;
+                     case GGML_OP_UNARY:
+                         switch (ggml_get_unary_op(gf->nodes[i])) {
+diff --git a/ggml-metal.metal b/ggml-metal.metal
+index 69fc713..f4b4605 100644
+--- a/ggml-metal.metal
+++ b/ggml-metal.metal
+@@ -125,9 +125,17 @@ kernel void kernel_mul_row(
+ }
+ 
+ kernel void kernel_scale(
+        device const float * src0,
+        device       float * dst,
+        constant     float & scale,
+        uint tpig[[thread_position_in_grid]]) {
+    dst[tpig] = src0[tpig] * scale;
+}
+
+kernel void kernel_scale_4(
+         device const float4 * src0,
+         device       float4 * dst,
+-        constant     float & scale,
+        constant     float  & scale,
+         uint tpig[[thread_position_in_grid]]) {
+     dst[tpig] = src0[tpig] * scale;
+ }
+-- 
+2.39.3 (Apple Git-145)
+
--- a/llm/llama.cpp/patches/0001-remove-warm-up-logging.patch
+++ b/llm/llama.cpp/patches/0001-remove-warm-up-logging.patch
@@ -1,25 +0,0 @@
-From 8dbb5449db259a9c24796e7927d89bee98b6c8f5 Mon Sep 17 00:00:00 2001
-From: Bruce MacDonald <brucewmacdonald@gmail.com>
-Date: Thu, 5 Oct 2023 11:21:12 -0400
-Subject: [PATCH] remove warm up logging
-
---
- common/common.cpp | 2 --
- 1 file changed, 2 deletions(-)
-
-diff --git a/common/common.cpp b/common/common.cpp
-index 7370017..c4433fe 100644
--- a/common/common.cpp
-+++ b/common/common.cpp
-@@ -839,8 +839,6 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
-     }
- 
-     {
-        LOG("warming up the model with an empty run\n");
-
-         std::vector<llama_token> tmp = { llama_token_bos(lctx), llama_token_eos(lctx), };
-         llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
-         llama_kv_cache_tokens_rm(lctx, -1, -1);
-- 
-2.39.2 (Apple Git-143)
-
--- a/llm/llama.cpp/patches/0001-update-default-log-target.patch
+++ b/llm/llama.cpp/patches/0001-update-default-log-target.patch
@@ -0,0 +1,25 @@
+From 6465fec6290f0a7f5d4d0fbe6bcf634e4810dde6 Mon Sep 17 00:00:00 2001
+From: Michael Yang <mxyng@pm.me>
+Date: Mon, 23 Oct 2023 10:39:34 -0700
+Subject: [PATCH] default log stderr
+
+---
+ common/log.h | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/common/log.h b/common/log.h
+index b8953fd..25522cd 100644
+--- a/common/log.h
+++ b/common/log.h
+@@ -90,7 +90,7 @@
+ //  }
+ //
+ #ifndef LOG_TARGET
+-    #define LOG_TARGET log_handler()
+    #define LOG_TARGET nullptr
+ #endif
+ 
+ #ifndef LOG_TEE_TARGET
+-- 
+2.42.0
+
--- a/llm/llama.go
+++ b/llm/llama.go
@@ -212,6 +212,10 @@ func CheckVRAM() (int64, error) {
 	scanner := bufio.NewScanner(&stdout)
 	for scanner.Scan() {
 		line := scanner.Text()
+		if strings.Contains(line, "[Insufficient Permissions]") {
+			return 0, fmt.Errorf("GPU support may not enabled, check you have installed GPU drivers and have the necessary permissions to run nvidia-smi")
+		}
+
 		vram, err := strconv.ParseInt(strings.TrimSpace(line), 10, 64)
 		if err != nil {
 			return 0, fmt.Errorf("failed to parse available VRAM: %v", err)
@@ -243,12 +247,15 @@ func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
 			return 0
 		}

-		// Calculate bytes per layer
-		// TODO: this is a rough heuristic, better would be to calculate this based on number of layers and context size
+		/*
+		 Calculate bytes per layer, this will roughly be the size of the model file divided by the number of layers.
+		 We can store the model weights and the kv cache in vram,
+		 to enable kv cache vram storage, add two additional layers to the number of layers retrieved from the model file.
+		*/
 		bytesPerLayer := fileSizeBytes / numLayer

-		// max number of layers we can fit in VRAM, subtract 8% to prevent consuming all available VRAM and running out of memory
-		layers := int(freeBytes/bytesPerLayer) * 92 / 100
+		// 75% of the absolute max number of layers we can fit in available VRAM, off-loading too many layers to the GPU can cause OOM errors
+		layers := int(freeBytes/bytesPerLayer) * 3 / 4
 		log.Printf("%d MB VRAM available, loading up to %d GPU layers", freeBytes/(1024*1024), layers)

 		return layers
--- a/readline/buffer.go
+++ b/readline/buffer.go
@@ -0,0 +1,370 @@
+package readline
+
+import (
+	"fmt"
+
+	"github.com/emirpasic/gods/lists/arraylist"
+	"golang.org/x/term"
+)
+
+// Buffer holds the line currently being edited together with the terminal
+// geometry needed to redraw it.
+type Buffer struct {
+	// Pos is the cursor index into Buf.
+	Pos       int
+	// Buf stores the typed characters as runes.
+	Buf       *arraylist.List
+	// Prompt supplies the prompt strings drawn in front of the input.
+	Prompt    *Prompt
+	// LineWidth is the terminal width minus the length of the active prompt.
+	LineWidth int
+	// Width and Height are the terminal dimensions reported by term.GetSize.
+	Width     int
+	Height    int
+}
+
+// NewBuffer creates an empty edit buffer for prompt, sized to the terminal
+// attached to file descriptor 0. When the terminal size cannot be read the
+// error is printed and returned.
+func NewBuffer(prompt *Prompt) (*Buffer, error) {
+	cols, rows, err := term.GetSize(0)
+	if err != nil {
+		fmt.Println("Error getting size:", err)
+		return nil, err
+	}
+
+	// The editable width is whatever remains after the active prompt.
+	active := prompt.Prompt
+	if prompt.UseAlt {
+		active = prompt.AltPrompt
+	}
+
+	return &Buffer{
+		Pos:       0,
+		Buf:       arraylist.New(),
+		Prompt:    prompt,
+		Width:     cols,
+		Height:    rows,
+		LineWidth: cols - len(active),
+	}, nil
+}
+
+// MoveLeft moves the cursor one character to the left. When the cursor sits
+// at the start of a wrapped line it jumps to the end of the line above.
+func (b *Buffer) MoveLeft() {
+	if b.Pos <= 0 {
+		return
+	}
+
+	atLineStart := b.Pos%b.LineWidth == 0
+	if atLineStart {
+		fmt.Print(CursorUp + CursorBOL + cursorRightN(b.Width))
+	} else {
+		fmt.Print(CursorLeft)
+	}
+	b.Pos--
+}
+
+// MoveLeftWord moves the cursor left over any spaces and then over the word
+// before them, stopping just after the preceding space or at the start of
+// the buffer.
+func (b *Buffer) MoveLeftWord() {
+	seenWord := false
+	for b.Pos > 0 {
+		prev, _ := b.Buf.Get(b.Pos - 1)
+		if prev != ' ' {
+			seenWord = true
+		} else if seenWord {
+			// A space after at least one word character ends the move.
+			return
+		}
+		b.MoveLeft()
+	}
+}
+
+// MoveRight moves the cursor one character to the right. When the new
+// position starts a wrapped line the cursor drops to the next screen line,
+// just past the prompt column.
+func (b *Buffer) MoveRight() {
+	if b.Pos >= b.Size() {
+		return
+	}
+
+	b.Pos++
+	if b.Pos%b.LineWidth != 0 {
+		fmt.Print(CursorRight)
+		return
+	}
+	fmt.Print(CursorDown + CursorBOL + cursorRightN(b.PromptSize()))
+}
+
+// MoveRightWord moves the cursor right until it lands on a space or reaches
+// the end of the buffer. It always advances at least one character when the
+// cursor is not already at the end.
+func (b *Buffer) MoveRightWord() {
+	for b.Pos < b.Size() {
+		b.MoveRight()
+		if under, _ := b.Buf.Get(b.Pos); under == ' ' {
+			return
+		}
+	}
+}
+
+// MoveToStart moves the cursor to the first character of the input, walking
+// up one screen line for every wrapped line above the cursor.
+func (b *Buffer) MoveToStart() {
+	if b.Pos <= 0 {
+		return
+	}
+
+	for linesAbove := b.Pos / b.LineWidth; linesAbove > 0; linesAbove-- {
+		fmt.Print(CursorUp)
+	}
+	fmt.Print(CursorBOL + cursorRightN(b.PromptSize()))
+	b.Pos = 0
+}
+
+// MoveToEnd moves the cursor to just past the last character of the input.
+func (b *Buffer) MoveToEnd() {
+	size := b.Size()
+	if b.Pos >= size {
+		return
+	}
+
+	cursorLine := b.Pos / b.LineWidth
+	lastLine := size / b.LineWidth
+	if cursorLine >= lastLine {
+		// Already on the final screen line: only a horizontal move is needed.
+		fmt.Print(cursorRightN(size - b.Pos))
+	} else {
+		for line := cursorLine; line < lastLine; line++ {
+			fmt.Print(CursorDown)
+		}
+		fmt.Print(CursorBOL + cursorRightN(b.PromptSize()+size%b.LineWidth))
+	}
+
+	b.Pos = size
+}
+
+// Size reports how many characters the buffer currently holds.
+func (b *Buffer) Size() int {
+	n := b.Buf.Size()
+	return n
+}
+
+// min returns the smaller of n and m.
+func min(n, m int) int {
+	if m < n {
+		return m
+	}
+	return n
+}
+
+// PromptSize returns the byte length of the prompt currently in use: the
+// alternate prompt when UseAlt is set, the primary prompt otherwise.
+func (b *Buffer) PromptSize() int {
+	active := b.Prompt.Prompt
+	if b.Prompt.UseAlt {
+		active = b.Prompt.AltPrompt
+	}
+	return len(active)
+}
+
+// Add echoes r and stores it at the cursor position, then advances the
+// cursor. When the new position starts a wrapped line, a newline and the
+// alternate prompt are printed. Inserting before the end of the buffer also
+// redraws the text that follows the cursor.
+func (b *Buffer) Add(r rune) {
+	appending := b.Pos == b.Buf.Size()
+
+	fmt.Printf("%c", r)
+	if appending {
+		b.Buf.Add(r)
+	} else {
+		b.Buf.Insert(b.Pos, r)
+	}
+	b.Pos++
+
+	if b.Pos > 0 && b.Pos%b.LineWidth == 0 {
+		fmt.Printf("\n%s", b.Prompt.AltPrompt)
+	}
+
+	if !appending {
+		b.drawRemaining()
+	}
+}
+
+// drawRemaining repaints everything from the cursor to the end of the input
+// and then puts the cursor back where it started. The cursor is hidden while
+// drawing.
+//
+// The text is handled as runes so that a line is never cut in the middle of
+// a multi-byte character, and it is written with fmt.Print so that a '%'
+// typed by the user is shown literally instead of being interpreted as a
+// format verb.
+func (b *Buffer) drawRemaining() {
+	var place int
+	remaining := []rune(b.StringN(b.Pos))
+	if b.Pos > 0 {
+		place = b.Pos % b.LineWidth
+	}
+	fmt.Print(CursorHide)
+
+	// render the rest of the current line
+	currLine := remaining[:min(b.LineWidth-place, len(remaining))]
+	if len(currLine) > 0 {
+		fmt.Print(ClearToEOL + string(currLine))
+		fmt.Print(cursorLeftN(len(currLine)))
+	} else {
+		fmt.Print(ClearToEOL)
+	}
+
+	// render the other lines
+	if len(remaining) > len(currLine) {
+		var totalLines int
+		for i, c := range remaining[len(currLine):] {
+			if i%b.LineWidth == 0 {
+				fmt.Printf("\n%s", b.Prompt.AltPrompt)
+				totalLines += 1
+			}
+			fmt.Printf("%c", c)
+		}
+		fmt.Print(ClearToEOL)
+		fmt.Print(cursorUpN(totalLines))
+		fmt.Print(CursorBOL + cursorRightN(b.Width-len(currLine)))
+	}
+
+	fmt.Print(CursorShow)
+}
+
+// Remove deletes the character immediately before the cursor (backspace) and
+// moves the cursor back by one. It does nothing when the buffer is empty or
+// the cursor is at the start.
+func (b *Buffer) Remove() {
+	if b.Buf.Size() > 0 && b.Pos > 0 {
+		if b.Pos%b.LineWidth == 0 {
+			// if the user backspaces over the word boundary, do this magic to clear the line
+			// and move to the end of the previous line
+			fmt.Printf(CursorBOL + ClearToEOL)
+			fmt.Printf(CursorUp + CursorBOL + cursorRightN(b.Width) + " " + CursorLeft)
+		} else {
+			// blank out the previous cell and leave the cursor on it
+			fmt.Printf(CursorLeft + " " + CursorLeft)
+		}
+
+		// decided before the removal: true when the size after removal is a
+		// multiple of LineWidth
+		var eraseExtraLine bool
+		if (b.Size()-1)%b.LineWidth == 0 {
+			eraseExtraLine = true
+		}
+
+		b.Pos -= 1
+		b.Buf.Remove(b.Pos)
+
+		// only a removal in the middle of the text needs the tail repainted
+		if b.Pos < b.Size() {
+			b.drawRemaining()
+			// this erases a line which is left over when backspacing in the middle of a line and there
+			// are trailing characters which go over the line width boundary
+			if eraseExtraLine {
+				remainingLines := (b.Size() - b.Pos) / b.LineWidth
+				fmt.Printf(cursorDownN(remainingLines+1) + CursorBOL + ClearToEOL)
+				place := b.Pos % b.LineWidth
+				// NOTE(review): uses the primary prompt length even when UseAlt is set; confirm intended
+				fmt.Printf(cursorUpN(remainingLines+1) + cursorRightN(place+len(b.Prompt.Prompt)))
+			}
+		}
+	}
+}
+
+// Delete removes the character under the cursor and repaints the text after
+// it. The cursor position does not change.
+func (b *Buffer) Delete() {
+	if b.Size() <= 0 || b.Pos >= b.Size() {
+		return
+	}
+
+	b.Buf.Remove(b.Pos)
+	b.drawRemaining()
+
+	// A stale trailing line only needs wiping when the text now ends exactly
+	// on a line boundary and the cursor is not at the end.
+	if b.Size()%b.LineWidth != 0 || b.Pos == b.Size() {
+		return
+	}
+
+	linesBelow := (b.Size() - b.Pos) / b.LineWidth
+	fmt.Print(cursorDownN(linesBelow) + CursorBOL + ClearToEOL)
+	column := b.Pos % b.LineWidth
+	fmt.Print(cursorUpN(linesBelow) + cursorRightN(column+len(b.Prompt.Prompt)))
+}
+
+// DeleteBefore removes every character between the start of the input and
+// the cursor by backspacing once per character.
+func (b *Buffer) DeleteBefore() {
+	for pending := b.Pos; pending > 0; pending-- {
+		b.Remove()
+	}
+}
+
+// DeleteRemaining removes every character from the cursor to the end of the
+// input by deleting under the cursor once per character.
+func (b *Buffer) DeleteRemaining() {
+	for pending := b.Size() - b.Pos; pending > 0; pending-- {
+		b.Delete()
+	}
+}
+
+// DeleteWord backspaces over any spaces before the cursor and then over the
+// word preceding them, stopping at the next space or the start of the input.
+func (b *Buffer) DeleteWord() {
+	if b.Buf.Size() <= 0 {
+		return
+	}
+
+	seenWord := false
+	for b.Pos > 0 {
+		prev, _ := b.Buf.Get(b.Pos - 1)
+		if prev == ' ' && seenWord {
+			// The space in front of the word just removed ends the deletion.
+			return
+		}
+		if prev != ' ' {
+			seenWord = true
+		}
+		b.Remove()
+	}
+}
+
+// ClearScreen wipes the terminal, redraws the prompt at the top-left corner
+// and then either the placeholder (empty buffer) or the current input, with
+// the cursor restored to its previous position in the text.
+//
+// Output goes through fmt.Print so that a '%' in the prompt or placeholder
+// is printed literally rather than parsed as a format verb.
+func (b *Buffer) ClearScreen() {
+	fmt.Print(ClearScreen + CursorReset + b.Prompt.Prompt)
+	if b.IsEmpty() {
+		// Skip an empty placeholder entirely: a "move left 0" sequence is
+		// treated by terminals as "move left 1" and would push the cursor
+		// back onto the prompt.
+		if ph := b.Prompt.Placeholder; len(ph) > 0 {
+			fmt.Print(ColorGrey + ph + cursorLeftN(len(ph)) + ColorDefault)
+		}
+		return
+	}
+
+	// Repaint from the start of the text, then walk the cursor back to where
+	// it was.
+	currPos := b.Pos
+	b.Pos = 0
+	b.drawRemaining()
+	fmt.Print(CursorReset + cursorRightN(len(b.Prompt.Prompt)))
+	if currPos > 0 {
+		targetLine := currPos / b.LineWidth
+		for cnt := 0; cnt < targetLine; cnt++ {
+			fmt.Print(CursorDown)
+		}
+		remainder := currPos % b.LineWidth
+		if remainder > 0 {
+			fmt.Print(cursorRightN(remainder))
+		}
+		if currPos%b.LineWidth == 0 {
+			fmt.Print(CursorBOL + b.Prompt.AltPrompt)
+		}
+	}
+	b.Pos = currPos
+}
+
+// IsEmpty reports whether the buffer holds no characters.
+func (b *Buffer) IsEmpty() bool {
+	empty := b.Buf.Empty()
+	return empty
+}
+
+// Replace discards the current input, clears the screen line, redraws the
+// primary prompt and then adds the runes of r one at a time.
+//
+// The prompt is written with fmt.Print so that a '%' in it is printed
+// literally rather than parsed as a format verb.
+func (b *Buffer) Replace(r []rune) {
+	b.Pos = 0
+	b.Buf.Clear()
+	fmt.Print(ClearLine + CursorBOL + b.Prompt.Prompt)
+	for _, c := range r {
+		b.Add(c)
+	}
+}
+
+// String returns the entire contents of the buffer.
+func (b *Buffer) String() string {
+	const start = 0
+	return b.StringN(start)
+}
+
+// StringN returns the contents of the buffer from index n to the end.
+func (b *Buffer) StringN(n int) string {
+	const toEnd = 0 // StringNM treats an end index of 0 as "up to Size()"
+	return b.StringNM(n, toEnd)
+}
+
+// StringNM returns the characters in the half-open index range [n, m).
+// An m of 0 means "up to the end of the buffer". Indexes outside the buffer
+// are clamped, so an out-of-range request yields a shorter (possibly empty)
+// string instead of a panic.
+func (b *Buffer) StringNM(n, m int) string {
+	if m == 0 || m > b.Size() {
+		m = b.Size()
+	}
+	if n < 0 {
+		n = 0
+	}
+	if n >= m {
+		return ""
+	}
+
+	// Collect into a pre-sized rune slice; appending to a string in a loop
+	// copies the accumulated text on every iteration.
+	runes := make([]rune, 0, m-n)
+	for cnt := n; cnt < m; cnt++ {
+		c, _ := b.Buf.Get(cnt)
+		if r, ok := c.(rune); ok {
+			runes = append(runes, r)
+		}
+	}
+	return string(runes)
+}
+
+// cursorLeftN formats the CursorLeftN escape sequence with the count n.
+func cursorLeftN(n int) string {
+	seq := fmt.Sprintf(CursorLeftN, n)
+	return seq
+}
+
+// cursorRightN formats the CursorRightN escape sequence with the count n.
+func cursorRightN(n int) string {
+	seq := fmt.Sprintf(CursorRightN, n)
+	return seq
+}
+
+// cursorUpN formats the CursorUpN escape sequence with the count n.
+func cursorUpN(n int) string {
+	seq := fmt.Sprintf(CursorUpN, n)
+	return seq
+}
+
+// cursorDownN formats the CursorDownN escape sequence with the count n.
+func cursorDownN(n int) string {
+	seq := fmt.Sprintf(CursorDownN, n)
+	return seq
+}
--- a/readline/errors.go
+++ b/readline/errors.go
@@ -0,0 +1,17 @@
+package readline
+
+import (
+	"errors"
+)
+
+var (
+	// ErrInterrupt is returned by Readline when the interrupt character
+	// (CharInterrupt) is read.
+	ErrInterrupt = errors.New("Interrupt")
+)
+
+// InterruptError is an error type that carries a line of runes.
+// NOTE(review): presumably the input typed before the interrupt; nothing in
+// this package constructs it yet, so confirm against callers.
+type InterruptError struct {
+	Line []rune
+}
+
+// Error implements the error interface with a fixed message.
+func (e *InterruptError) Error() string {
+	const msg = "Interrupted"
+	return msg
+}
--- a/readline/history.go
+++ b/readline/history.go
@@ -0,0 +1,152 @@
+package readline
+
+import (
+	"bufio"
+	"errors"
+	"io"
+	"os"
+	"path/filepath"
+	"strings"
+
+	"github.com/emirpasic/gods/lists/arraylist"
+)
+
+// History is the list of previously entered lines, backed by a file.
+type History struct {
+	// Buf holds the entries, each stored as a []rune, oldest first.
+	Buf      *arraylist.List
+	// Autosave makes Add write the history file after every new entry.
+	Autosave bool
+	// Pos is the index used by Prev and Next; Add sets it to Size().
+	Pos      int
+	// Limit is the maximum number of entries Compact keeps.
+	Limit    int
+	// Filename is the path of the history file, set by Init.
+	Filename string
+	// Enabled gates Save: when false, Save does nothing.
+	Enabled  bool
+}
+
+// NewHistory builds a History with a 100-entry limit, autosave switched on
+// and saving enabled, then loads existing entries through Init.
+func NewHistory() (*History, error) {
+	h := &History{
+		Buf:      arraylist.New(),
+		Limit:    100, //resizeme
+		Autosave: true,
+		Enabled:  true,
+	}
+
+	if err := h.Init(); err != nil {
+		return nil, err
+	}
+	return h, nil
+}
+
+// Init sets Filename to ~/.ollama/history and loads the entries stored
+// there, creating the file if it does not exist. A missing parent directory
+// is not an error: the history simply starts empty.
+//
+// Lines are trimmed, blank lines are skipped, and a final line without a
+// trailing newline is still loaded. Entries are appended directly to Buf and
+// compacted once at the end, so loading never rewrites the history file.
+func (h *History) Init() error {
+	home, err := os.UserHomeDir()
+	if err != nil {
+		return err
+	}
+
+	path := filepath.Join(home, ".ollama", "history")
+	h.Filename = path
+
+	f, err := os.OpenFile(path, os.O_CREATE|os.O_RDONLY, 0600)
+	if err != nil {
+		if errors.Is(err, os.ErrNotExist) {
+			return nil
+		}
+		return err
+	}
+	defer f.Close()
+
+	r := bufio.NewReader(f)
+	for {
+		line, err := r.ReadString('\n')
+		if err != nil && err != io.EOF {
+			return err
+		}
+
+		// ReadString returns whatever it read together with io.EOF, so the
+		// text must be handled before the loop is left.
+		if line = strings.TrimSpace(line); len(line) > 0 {
+			h.Buf.Add([]rune(line))
+		}
+
+		if err == io.EOF {
+			break
+		}
+	}
+
+	h.Compact()
+	h.Pos = h.Size()
+
+	return nil
+}
+
+// Add appends l as the newest entry, trims the history to Limit, points Pos
+// just past the last entry and, when Autosave is set, writes the history.
+func (h *History) Add(l []rune) {
+	h.Buf.Add(l)
+	h.Compact()
+	h.Pos = h.Size()
+
+	if !h.Autosave {
+		return
+	}
+	// Saving is best effort: the result is deliberately discarded.
+	_ = h.Save()
+}
+
+// Compact drops the oldest entries until at most Limit remain.
+func (h *History) Compact() {
+	for excess := h.Buf.Size() - h.Limit; excess > 0; excess-- {
+		h.Buf.Remove(0)
+	}
+}
+
+// Clear removes every entry and resets Pos, so the navigation index never
+// points past the end of the now-empty list.
+func (h *History) Clear() {
+	h.Buf.Clear()
+	h.Pos = 0
+}
+
+// Prev steps Pos back by one (stopping at 0) and returns the entry there,
+// or nil when there is no entry at that index.
+func (h *History) Prev() []rune {
+	if h.Pos > 0 {
+		h.Pos--
+	}
+
+	entry, _ := h.Buf.Get(h.Pos)
+	if line, ok := entry.([]rune); ok {
+		return line
+	}
+	return nil
+}
+
+// Next steps Pos forward by one and returns the entry there. It returns nil
+// when Pos is already past the last entry or lands past it.
+func (h *History) Next() []rune {
+	if h.Pos >= h.Buf.Size() {
+		return nil
+	}
+
+	h.Pos++
+	entry, _ := h.Buf.Get(h.Pos)
+	line, _ := entry.([]rune)
+	return line
+}
+
+// Size reports the number of entries in the history.
+func (h *History) Size() int {
+	n := h.Buf.Size()
+	return n
+}
+
+// Save writes every entry, one per line, to Filename. It does nothing when
+// the history is disabled.
+//
+// The entries go to a temporary file first and that file replaces Filename
+// only after it has been written, flushed and closed without error. On any
+// failure the temporary file is removed and the existing history is left
+// untouched. The file is created with mode 0600, matching Init.
+func (h *History) Save() error {
+	if !h.Enabled {
+		return nil
+	}
+
+	tmpFile := h.Filename + ".tmp"
+
+	f, err := os.OpenFile(tmpFile, os.O_CREATE|os.O_WRONLY|os.O_TRUNC|os.O_APPEND, 0600)
+	if err != nil {
+		return err
+	}
+
+	// abandon closes and deletes the temporary file, then reports err.
+	abandon := func(err error) error {
+		f.Close()
+		os.Remove(tmpFile)
+		return err
+	}
+
+	buf := bufio.NewWriter(f)
+	for cnt := 0; cnt < h.Size(); cnt++ {
+		v, _ := h.Buf.Get(cnt)
+		line, _ := v.([]rune)
+		if _, err := buf.WriteString(string(line) + "\n"); err != nil {
+			return abandon(err)
+		}
+	}
+	if err := buf.Flush(); err != nil {
+		return abandon(err)
+	}
+
+	// Close before renaming so every byte is on its way to disk and the
+	// rename also works on platforms that refuse to move open files.
+	if err := f.Close(); err != nil {
+		os.Remove(tmpFile)
+		return err
+	}
+
+	return os.Rename(tmpFile, h.Filename)
+}
--- a/readline/readline.go
+++ b/readline/readline.go
@@ -0,0 +1,253 @@
+package readline
+
+import (
+	"bufio"
+	"fmt"
+	"io"
+	"os"
+	"syscall"
+)
+
+// Prompt groups the strings shown in front of and inside the input line.
+type Prompt struct {
+	// Prompt is printed before the input unless UseAlt is set.
+	Prompt         string
+	// AltPrompt is printed instead of Prompt when UseAlt is set, and at the
+	// start of wrapped lines.
+	AltPrompt      string
+	// Placeholder is shown in grey while the input is empty.
+	Placeholder    string
+	// AltPlaceholder replaces Placeholder when UseAlt is set.
+	AltPlaceholder string
+	// UseAlt selects the alternate prompt and placeholder.
+	UseAlt         bool
+}
+
+// Terminal delivers runes read from standard input.
+type Terminal struct {
+	// outchan carries runes from ioloop to Read; ioloop closes it when
+	// reading fails.
+	outchan chan rune
+}
+
+// Instance is a line reader: a prompt, the terminal it reads from and the
+// history of lines entered so far.
+type Instance struct {
+	Prompt   *Prompt
+	Terminal *Terminal
+	History  *History
+}
+
+// New creates a line reader for prompt. It starts the terminal reader first
+// and then loads the history; the first failure is returned.
+func New(prompt Prompt) (*Instance, error) {
+	inst := &Instance{Prompt: &prompt}
+
+	var err error
+	if inst.Terminal, err = NewTerminal(); err != nil {
+		return nil, err
+	}
+	if inst.History, err = NewHistory(); err != nil {
+		return nil, err
+	}
+
+	return inst, nil
+}
+
+// Readline prints the active prompt, switches standard input to raw mode and
+// reads keys until the user presses Enter, returning the edited line. Raw
+// mode is undone when the function returns.
+//
+// It returns io.EOF when the terminal input ends or when CharDelete is read
+// on an empty line, ErrInterrupt when CharInterrupt is read, and any error
+// from entering raw mode or from sizing the edit buffer. A non-empty line is
+// added to the history. When a bracketed-paste start or end marker was seen,
+// `"""` is prepended or appended to the returned line respectively.
+func (i *Instance) Readline() (string, error) {
+	prompt := i.Prompt.Prompt
+	if i.Prompt.UseAlt {
+		prompt = i.Prompt.AltPrompt
+	}
+	fmt.Print(prompt)
+
+	termios, err := SetRawMode(syscall.Stdin)
+	if err != nil {
+		return "", err
+	}
+	defer UnsetRawMode(syscall.Stdin, termios)
+
+	// Without a buffer nothing below can work; using a nil buffer would panic.
+	buf, err := NewBuffer(i.Prompt)
+	if err != nil {
+		return "", err
+	}
+
+	var esc bool
+	var escex bool
+	var metaDel bool
+	var pasteMode PasteMode
+
+	var currentLineBuf []rune
+
+	for {
+		if buf.IsEmpty() {
+			ph := i.Prompt.Placeholder
+			if i.Prompt.UseAlt {
+				ph = i.Prompt.AltPlaceholder
+			}
+			// Print, not Printf: the placeholder is data, not a format string.
+			// An empty placeholder is skipped because "move left 0" is treated
+			// by terminals as "move left 1".
+			if len(ph) > 0 {
+				fmt.Print(ColorGrey + ph + fmt.Sprintf(CursorLeftN, len(ph)) + ColorDefault)
+			}
+		}
+
+		r, err := i.Terminal.Read()
+
+		if buf.IsEmpty() {
+			fmt.Print(ClearToEOL)
+		}
+
+		if err != nil {
+			return "", io.EOF
+		}
+
+		if escex {
+			// r is the byte following ESC [
+			escex = false
+
+			switch r {
+			case KeyUp:
+				if i.History.Pos > 0 {
+					if i.History.Pos == i.History.Size() {
+						// remember the line being typed before browsing history
+						currentLineBuf = []rune(buf.String())
+					}
+					buf.Replace(i.History.Prev())
+				}
+			case KeyDown:
+				if i.History.Pos < i.History.Size() {
+					buf.Replace(i.History.Next())
+					if i.History.Pos == i.History.Size() {
+						// moved past the newest entry: restore the typed line
+						buf.Replace(currentLineBuf)
+					}
+				}
+			case KeyLeft:
+				buf.MoveLeft()
+			case KeyRight:
+				buf.MoveRight()
+			case CharBracketedPaste:
+				// the marker is completed by three more runes
+				var code string
+				for cnt := 0; cnt < 3; cnt++ {
+					r, err = i.Terminal.Read()
+					if err != nil {
+						return "", io.EOF
+					}
+
+					code += string(r)
+				}
+				if code == CharBracketedPasteStart {
+					pasteMode = PasteModeStart
+				} else if code == CharBracketedPasteEnd {
+					pasteMode = PasteModeEnd
+				}
+			case KeyDel:
+				if buf.Size() > 0 {
+					buf.Delete()
+				}
+				// swallow the rune that follows the delete sequence
+				metaDel = true
+			case MetaStart:
+				buf.MoveToStart()
+			case MetaEnd:
+				buf.MoveToEnd()
+			default:
+				// skip any keys we don't know about
+				continue
+			}
+			continue
+		} else if esc {
+			// r is the byte following ESC
+			esc = false
+
+			switch r {
+			case 'b':
+				buf.MoveLeftWord()
+			case 'f':
+				buf.MoveRightWord()
+			case CharEscapeEx:
+				escex = true
+			}
+			continue
+		}
+
+		switch r {
+		case CharNull:
+			continue
+		case CharEsc:
+			esc = true
+		case CharInterrupt:
+			return "", ErrInterrupt
+		case CharLineStart:
+			buf.MoveToStart()
+		case CharLineEnd:
+			buf.MoveToEnd()
+		case CharBackward:
+			buf.MoveLeft()
+		case CharForward:
+			buf.MoveRight()
+		case CharBackspace, CharCtrlH:
+			buf.Remove()
+		case CharTab:
+			// todo: convert back to real tabs
+			for cnt := 0; cnt < 8; cnt++ {
+				buf.Add(' ')
+			}
+		case CharDelete:
+			if buf.Size() > 0 {
+				buf.Delete()
+			} else {
+				return "", io.EOF
+			}
+		case CharKill:
+			buf.DeleteRemaining()
+		case CharCtrlU:
+			buf.DeleteBefore()
+		case CharCtrlL:
+			buf.ClearScreen()
+		case CharCtrlW:
+			buf.DeleteWord()
+		case CharEnter:
+			output := buf.String()
+			if output != "" {
+				i.History.Add([]rune(output))
+			}
+			buf.MoveToEnd()
+			fmt.Println()
+			switch pasteMode {
+			case PasteModeStart:
+				output = `"""` + output
+			case PasteModeEnd:
+				output = output + `"""`
+			}
+			return output, nil
+		default:
+			if metaDel {
+				metaDel = false
+				continue
+			}
+			if r >= CharSpace || r == CharEnter {
+				buf.Add(r)
+			}
+		}
+	}
+}
+
+// HistoryEnable turns saving of the history back on.
+func (i *Instance) HistoryEnable() {
+	const enabled = true
+	i.History.Enabled = enabled
+}
+
+// HistoryDisable turns saving of the history off.
+func (i *Instance) HistoryDisable() {
+	const enabled = false
+	i.History.Enabled = enabled
+}
+
+// NewTerminal creates a Terminal and starts the goroutine that feeds it with
+// runes from standard input. The returned error is always nil.
+func NewTerminal() (*Terminal, error) {
+	t := new(Terminal)
+	t.outchan = make(chan rune)
+
+	go t.ioloop()
+
+	return t, nil
+}
+
+// ioloop reads runes from standard input and sends each one on outchan. On
+// the first read error it closes the channel and stops.
+func (t *Terminal) ioloop() {
+	stdin := bufio.NewReader(os.Stdin)
+
+	for {
+		r, _, err := stdin.ReadRune()
+		if err != nil {
+			close(t.outchan)
+			return
+		}
+		t.outchan <- r
+	}
+}
+
+// Read blocks until the next rune is available and returns it. Once the
+// input channel has been closed it returns io.EOF.
+func (t *Terminal) Read() (rune, error) {
+	if r, ok := <-t.outchan; ok {
+		return r, nil
+	}
+	return 0, io.EOF
+}
--- a/readline/term.go
+++ b/readline/term.go
@@ -0,0 +1,37 @@
+//go:build aix || darwin || dragonfly || freebsd || (linux && !appengine) || netbsd || openbsd || os400 || solaris
+// +build aix darwin dragonfly freebsd linux,!appengine netbsd openbsd os400 solaris
+
+package readline
+
+import (
+	"syscall"
+)
+
+// Termios is the terminal settings structure read and written by getTermios
+// and setTermios; it has the same layout as syscall.Termios.
+type Termios syscall.Termios
+
+// SetRawMode switches the terminal on fd to raw mode: echo, canonical line
+// editing, signal keys and input translation are turned off, characters are
+// 8 bits wide, and a read returns as soon as one byte is available.
+//
+// It returns the settings that were in effect before the change so they can
+// be handed to UnsetRawMode. If the current settings cannot be read, nil and
+// the error are returned; if applying the new settings fails, the previous
+// settings are returned together with the error.
+func SetRawMode(fd int) (*Termios, error) {
+	saved, err := getTermios(fd)
+	if err != nil {
+		return nil, err
+	}
+
+	// Work on a copy so the saved settings stay untouched.
+	raw := *saved
+	raw.Cc[syscall.VMIN] = 1
+	raw.Cc[syscall.VTIME] = 0
+	raw.Cflag &^= syscall.CSIZE | syscall.PARENB
+	raw.Cflag |= syscall.CS8
+	raw.Lflag &^= syscall.ECHO | syscall.ECHONL | syscall.ICANON | syscall.ISIG | syscall.IEXTEN
+	raw.Iflag &^= syscall.IGNBRK | syscall.BRKINT | syscall.PARMRK | syscall.ISTRIP | syscall.INLCR | syscall.IGNCR | syscall.ICRNL | syscall.IXON
+
+	if err := setTermios(fd, &raw); err != nil {
+		return saved, err
+	}
+	return saved, nil
+}
+
+// UnsetRawMode applies termios (normally the value returned by SetRawMode)
+// to the terminal on fd.
+func UnsetRawMode(fd int, termios *Termios) error {
+	if err := setTermios(fd, termios); err != nil {
+		return err
+	}
+	return nil
+}
+
+// IsTerminal reports whether fd refers to a terminal, judged by whether its
+// terminal settings can be read.
+func IsTerminal(fd int) bool {
+	if _, err := getTermios(fd); err != nil {
+		return false
+	}
+	return true
+}
--- a/readline/term_bsd.go
+++ b/readline/term_bsd.go
@@ -0,0 +1,24 @@
+//go:build darwin || freebsd || netbsd || openbsd
+
+package readline
+
+import (
+	"syscall"
+	"unsafe"
+)
+
+// getTermios reads the terminal settings of fd with the TIOCGETA ioctl.
+func getTermios(fd int) (*Termios, error) {
+	var state Termios
+	_, _, errno := syscall.Syscall6(syscall.SYS_IOCTL, uintptr(fd), syscall.TIOCGETA, uintptr(unsafe.Pointer(&state)), 0, 0, 0)
+	if errno != 0 {
+		return nil, errno
+	}
+	return &state, nil
+}
+
+// setTermios applies termios to fd with the TIOCSETA ioctl.
+func setTermios(fd int, termios *Termios) error {
+	_, _, errno := syscall.Syscall6(syscall.SYS_IOCTL, uintptr(fd), syscall.TIOCSETA, uintptr(unsafe.Pointer(termios)), 0, 0, 0)
+	if errno == 0 {
+		return nil
+	}
+	return errno
+}
--- a/readline/term_linux.go
+++ b/readline/term_linux.go
@@ -0,0 +1,27 @@
+//go:build linux || solaris
+
+package readline
+
+import (
+	"syscall"
+	"unsafe"
+)
+
+// ioctl request numbers passed by getTermios (tcgets) and setTermios (tcsets).
+// NOTE(review): these are the Linux TCGETS/TCSETS values; the build tag also
+// admits solaris, so confirm they are correct there.
+const tcgets = 0x5401
+const tcsets = 0x5402
+
+// getTermios reads the terminal settings of fd with the tcgets ioctl.
+func getTermios(fd int) (*Termios, error) {
+	var state Termios
+	_, _, errno := syscall.Syscall6(syscall.SYS_IOCTL, uintptr(fd), tcgets, uintptr(unsafe.Pointer(&state)), 0, 0, 0)
+	if errno != 0 {
+		return nil, errno
+	}
+	return &state, nil
+}
+
+// setTermios applies termios to fd with the tcsets ioctl.
+func setTermios(fd int, termios *Termios) error {
+	_, _, errno := syscall.Syscall6(syscall.SYS_IOCTL, uintptr(fd), tcsets, uintptr(unsafe.Pointer(termios)), 0, 0, 0)
+	if errno == 0 {
+		return nil
+	}
+	return errno
+}
--- a/readline/types.go
+++ b/readline/types.go
@@ -0,0 +1,86 @@
+package readline
+
+// Single-rune key codes that Readline dispatches on. Values 0 through 27 are
+// ASCII control characters (for example 1 is Ctrl-A and 3 is Ctrl-C),
+// CharSpace (32) is the first printable character, CharEscapeEx (91) is '['
+// and CharBackspace (127) is DEL.
+const (
+	CharNull      = 0
+	CharLineStart = 1
+	CharBackward  = 2
+	CharInterrupt = 3
+	CharDelete    = 4
+	CharLineEnd   = 5
+	CharForward   = 6
+	CharBell      = 7
+	CharCtrlH     = 8
+	CharTab       = 9
+	CharCtrlJ     = 10
+	CharKill      = 11
+	CharCtrlL     = 12
+	CharEnter     = 13
+	CharNext      = 14
+	CharPrev      = 16
+	CharBckSearch = 18
+	CharFwdSearch = 19
+	CharTranspose = 20
+	CharCtrlU     = 21
+	CharCtrlW     = 23
+	CharCtrlY     = 25
+	CharCtrlZ     = 26
+	CharEsc       = 27
+	CharSpace     = 32
+	CharEscapeEx  = 91
+	CharBackspace = 127
+)
+
+// Codes of the rune that follows CharEsc and CharEscapeEx (ESC '[') in a key
+// escape sequence, as matched by Readline. For example 65 is 'A', so KeyUp
+// corresponds to ESC [ A.
+const (
+	KeyDel    = 51
+	KeyUp     = 65
+	KeyDown   = 66
+	KeyRight  = 67
+	KeyLeft   = 68
+	MetaEnd   = 70
+	MetaStart = 72
+)
+
+// Terminal escape sequences written to standard output.
+const (
+	// Move the cursor by one cell.
+	CursorUp    = "\033[1A"
+	CursorDown  = "\033[1B"
+	CursorRight = "\033[1C"
+	CursorLeft  = "\033[1D"
+
+	CursorSave    = "\033[s"
+	CursorRestore = "\033[u"
+
+	// Format strings taking the number of cells to move; see cursorUpN and
+	// its siblings.
+	CursorUpN    = "\033[%dA"
+	CursorDownN  = "\033[%dB"
+	CursorRightN = "\033[%dC"
+	CursorLeftN  = "\033[%dD"
+
+	// NOTE(review): "\033[E" is the "cursor next line" control rather than
+	// "end of line"; confirm the intended use before relying on the name.
+	CursorEOL  = "\033[E"
+	// CursorBOL moves the cursor to column 1 of the current line.
+	CursorBOL  = "\033[1G"
+	CursorHide = "\033[?25l"
+	CursorShow = "\033[?25h"
+
+	ClearToEOL  = "\033[K"
+	ClearLine   = "\033[2K"
+	ClearScreen = "\033[2J"
+	// CursorReset moves the cursor to the top-left corner of the screen.
+	CursorReset = "\033[0;0f"
+
+	ColorGrey    = "\033[38;5;245m"
+	ColorDefault = "\033[0m"
+
+	StartBracketedPaste = "\033[?2004h"
+	EndBracketedPaste   = "\033[?2004l"
+)
+
+// Pieces of the bracketed-paste markers recognised by Readline: after
+// ESC '[' the rune CharBracketedPaste (50, i.e. '2') is followed by three
+// more runes that spell either the start or the end code.
+const (
+	CharBracketedPaste      = 50
+	CharBracketedPasteStart = "00~"
+	CharBracketedPasteEnd   = "01~"
+)
+
+// PasteMode records which bracketed-paste marker, if any, was seen while a
+// line was being read.
+type PasteMode int
+
+const (
+	// PasteModeOff is the zero value: no paste marker has been seen.
+	PasteModeOff PasteMode = iota
+	// PasteModeStart means the start-of-paste marker was seen.
+	PasteModeStart
+	// PasteModeEnd means the end-of-paste marker was seen.
+	PasteModeEnd
+)
+
+// PastModeOff is the original, misspelled name of PasteModeOff.
+//
+// Deprecated: use PasteModeOff instead.
+const PastModeOff = PasteModeOff
--- a/scripts/install.sh
+++ b/scripts/install.sh
@@ -74,6 +74,9 @@ configure_systemd() {
        $SUDO useradd -r -s /bin/false -m -d /usr/share/ollama ollama
    fi

+    status "Adding current user to ollama group..."
+    $SUDO usermod -a -G ollama $(whoami)
+
    status "Creating ollama systemd service..."
    cat <<EOF | $SUDO tee /etc/systemd/system/ollama.service >/dev/null
 [Unit]
@@ -86,7 +89,6 @@ User=ollama
 Group=ollama
 Restart=always
 RestartSec=3
-Environment="HOME=/usr/share/ollama"
 Environment="PATH=$PATH"

 [Install]
--- a/server/download.go
+++ b/server/download.go
@@ -15,6 +15,7 @@ import (
 	"strings"
 	"sync"
 	"sync/atomic"
+	"syscall"
 	"time"

 	"golang.org/x/sync/errgroup"
@@ -158,7 +159,8 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *Regis
 				w := io.NewOffsetWriter(file, part.StartsAt())
 				err := b.downloadChunk(inner, requestURL, w, part, opts)
 				switch {
-				case errors.Is(err, context.Canceled):
+				case errors.Is(err, context.Canceled), errors.Is(err, syscall.ENOSPC):
+					// return immediately if the context is canceled or the device is out of space
 					return err
 				case err != nil:
 					log.Printf("%s part %d attempt %d failed: %v, retrying", b.Digest[7:19], i, try, err)
--- a/server/images.go
+++ b/server/images.go
@@ -131,7 +131,7 @@ func (m *ManifestV2) GetTotalSize() (total int64) {
 }

 func GetManifest(mp ModelPath) (*ManifestV2, string, error) {
-	fp, err := mp.GetManifestPath(false)
+	fp, err := mp.GetManifestPath()
 	if err != nil {
 		return nil, "", err
 	}
@@ -595,10 +595,13 @@ func CreateManifest(name string, cfg *LayerReader, layers []*Layer) error {
 		return err
 	}

-	fp, err := mp.GetManifestPath(true)
+	fp, err := mp.GetManifestPath()
 	if err != nil {
 		return err
 	}
+	if err := os.MkdirAll(filepath.Dir(fp), 0o755); err != nil {
+		return err
+	}
 	return os.WriteFile(fp, manifestJSON, 0o644)
 }

@@ -710,16 +713,19 @@ func CreateLayer(f io.ReadSeeker) (*LayerReader, error) {

 func CopyModel(src, dest string) error {
 	srcModelPath := ParseModelPath(src)
-	srcPath, err := srcModelPath.GetManifestPath(false)
+	srcPath, err := srcModelPath.GetManifestPath()
 	if err != nil {
 		return err
 	}

 	destModelPath := ParseModelPath(dest)
-	destPath, err := destModelPath.GetManifestPath(true)
+	destPath, err := destModelPath.GetManifestPath()
 	if err != nil {
 		return err
 	}
+	if err := os.MkdirAll(filepath.Dir(destPath), 0o755); err != nil {
+		return err
+	}

 	// copy the file
 	input, err := os.ReadFile(srcPath)
@@ -882,7 +888,7 @@ func DeleteModel(name string) error {
 		return err
 	}

-	fp, err := mp.GetManifestPath(false)
+	fp, err := mp.GetManifestPath()
 	if err != nil {
 		return err
 	}
@@ -1121,10 +1127,13 @@ func PullModel(ctx context.Context, name string, regOpts *RegistryOptions, fn fu
 		return err
 	}

-	fp, err := mp.GetManifestPath(true)
+	fp, err := mp.GetManifestPath()
 	if err != nil {
 		return err
 	}
+	if err := os.MkdirAll(filepath.Dir(fp), 0o755); err != nil {
+		return err
+	}

 	err = os.WriteFile(fp, manifestJSON, 0o644)
 	if err != nil {
--- a/server/modelpath.go
+++ b/server/modelpath.go
@@ -85,20 +85,27 @@ func (mp ModelPath) GetShortTagname() string {
 	return fmt.Sprintf("%s/%s/%s:%s", mp.Registry, mp.Namespace, mp.Repository, mp.Tag)
 }

-func (mp ModelPath) GetManifestPath(createDir bool) (string, error) {
+// modelsDir returns the value of the OLLAMA_MODELS environment variable if it is set; otherwise it returns the default ".ollama/models" directory under the user's home directory.
+// The models directory is where Ollama stores its model files and manifests.
+func modelsDir() (string, error) {
+	if models, exists := os.LookupEnv("OLLAMA_MODELS"); exists {
+		return models, nil
+	}
 	home, err := os.UserHomeDir()
 	if err != nil {
 		return "", err
 	}
+	return filepath.Join(home, ".ollama", "models"), nil
+}

-	path := filepath.Join(home, ".ollama", "models", "manifests", mp.Registry, mp.Namespace, mp.Repository, mp.Tag)
-	if createDir {
-		if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil {
-			return "", err
-		}
+// GetManifestPath returns the path to the manifest file for the given model path. It is up to the caller to create the parent directory if it does not exist.
+func (mp ModelPath) GetManifestPath() (string, error) {
+	dir, err := modelsDir()
+	if err != nil {
+		return "", err
 	}

-	return path, nil
+	return filepath.Join(dir, "manifests", mp.Registry, mp.Namespace, mp.Repository, mp.Tag), nil
 }

 func (mp ModelPath) BaseURL() *url.URL {
@@ -109,12 +116,12 @@ func (mp ModelPath) BaseURL() *url.URL {
 }

 func GetManifestPath() (string, error) {
-	home, err := os.UserHomeDir()
+	dir, err := modelsDir()
 	if err != nil {
 		return "", err
 	}

-	path := filepath.Join(home, ".ollama", "models", "manifests")
+	path := filepath.Join(dir, "manifests")
 	if err := os.MkdirAll(path, 0o755); err != nil {
 		return "", err
 	}
@@ -123,7 +130,7 @@ func GetManifestPath() (string, error) {
 }

 func GetBlobsPath(digest string) (string, error) {
-	home, err := os.UserHomeDir()
+	dir, err := modelsDir()
 	if err != nil {
 		return "", err
 	}
@@ -132,7 +139,7 @@ func GetBlobsPath(digest string) (string, error) {
 		digest = strings.ReplaceAll(digest, ":", "-")
 	}

-	path := filepath.Join(home, ".ollama", "models", "blobs", digest)
+	path := filepath.Join(dir, "blobs", digest)
 	dirPath := filepath.Dir(path)
 	if digest == "" {
 		dirPath = path
Author	SHA1	Message	Date
Jeffrey Morgan	9ec16f0f03	fix formatting when exiting `ollama run`	2023-10-27 21:26:23 -07:00
Jeffrey Morgan	57a58db1b0	history: update pos after compact	2023-10-27 20:38:03 -07:00
Jeffrey Morgan	2d75a4537c	close input channel when receiving `io.EOF`	2023-10-27 20:26:04 -07:00
Jeffrey Morgan	4748609611	Don't quit ioloop on `NUL` character (#940 ) * dont quit ioloop on 0 rune * check for closed channel * remove unused error on `Close()`	2023-10-27 20:01:48 -07:00
Jeffrey Morgan	c0dcea1398	Update faq.md	2023-10-27 18:29:00 -07:00
Jeffrey Morgan	3a1ed9ff70	restore building runner with `AVX` on by default (#900 )	2023-10-27 12:13:44 -07:00
Bruce MacDonald	6d283882b1	catch insufficient permissions nvidia err (#934 )	2023-10-27 12:42:40 -04:00
Bruce MacDonald	5c3491f425	allow for a configurable ollama model storage directory (#897 ) * allow for a configurable ollama models directory - set OLLAMA_MODELS in the environment that ollama is running in to change where model files are stored - update docs Co-Authored-By: Jeffrey Morgan <jmorganca@gmail.com> Co-Authored-By: Jay Nakrani <dhananjaynakrani@gmail.com> Co-Authored-By: Akhil Acharya <akhilcacharya@gmail.com> Co-Authored-By: Sasha Devol <sasha.devol@protonmail.com>	2023-10-27 10:19:59 -04:00
James Braza	e5d1ce4dde	Tweaks to `README.md` (#906 ) * Mentioned Docker Hub in docs * Consolidated brew installs to one line	2023-10-27 00:10:23 -07:00
Bruce MacDonald	2665f3c28e	offload 75% of available vram to improve stability (#921 )	2023-10-26 20:49:55 -04:00
Patrick Devine	a79f030e75	add bracketed paste mode (#922 )	2023-10-26 15:57:00 -07:00
Michael Yang	9bc5864a03	Merge pull request #918 from jmorganca/mxyng/fix-out-of-space fix(download): no retry when out of space	2023-10-26 12:24:20 -07:00
Michael Yang	b88cc0fac9	Merge pull request #916 from jmorganca/mxyng/fix-client-host fix(client): trim trailing slash	2023-10-26 12:24:12 -07:00
Patrick Devine	5b2cf16397	fix docker build annotations (#917 )	2023-10-26 12:00:33 -07:00
Michael Yang	910816a532	fix(download): no retry when out of space	2023-10-26 11:34:07 -07:00
Michael Yang	28c3f288e2	client: fix trailing slash	2023-10-26 11:09:38 -07:00
Patrick Devine	deeac961bb	new readline library (#847 )	2023-10-25 16:41:18 -07:00
Jeffrey Morgan	49443e7da5	fix typo in `README.md`	2023-10-25 16:19:27 -07:00
Ajay Kemparaj	bb8464c0d2	update golang.org/x/net fixes CVE-2023-3978,CVE-2023-39325,CVE-2023-44487 (#855 )	2023-10-25 16:17:24 -07:00
Michael Yang	daa5bb4473	Merge pull request #907 from jmorganca/mxyng/linux update linux.md	2023-10-25 15:03:34 -07:00
Michael Yang	92119de9d8	update linux.md	2023-10-25 14:57:50 -07:00
Michael Yang	53b0ba8d43	Merge pull request #893 from jmorganca/mxyng/update-faq update faq	2023-10-24 16:02:35 -07:00
Michael Yang	db342691f9	Update docs/faq.md Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>	2023-10-24 13:59:33 -07:00
Bruce MacDonald	cecf83141e	Linux uninstall instructions (#894 )	2023-10-24 14:07:05 -04:00
Michael Yang	a5a2adf1ec	update faq	2023-10-24 10:54:16 -07:00
Jeffrey Morgan	b0c9cd0f3b	fix metal assertion errors	2023-10-24 00:32:36 -07:00
Jeffrey Morgan	77f61c6301	update submodule commit	2023-10-24 00:30:27 -07:00
Jeffrey Morgan	f3604534e5	update submodule commit	2023-10-23 23:59:12 -07:00
Jeffrey Morgan	914428351a	Update import.md	2023-10-23 17:44:53 -07:00
Jeffrey Morgan	9afea9e3b9	Update import.md Separate GGUF and PyTorch guides	2023-10-23 17:42:17 -07:00
Bruce MacDonald	c039432b5c	add current user to ollama group on install (#772 )	2023-10-23 17:06:31 -04:00
Michael Yang	c345b4ca7c	Merge pull request #884 from jmorganca/mxyng/update-submodules bump submodules	2023-10-23 11:27:38 -07:00
Michael Yang	0c7a00a264	bump submodules pin to 9e70cc03229df19ca2d28ce23cc817198f897278 for now since 438c2ca83045a00ef244093d27e9ed41a8cb4ea9 is breaking	2023-10-23 11:17:59 -07:00
Michael Yang	36c160f1c3	Merge pull request #881 from jmorganca/mxyng/ggufv3 ggufv3	2023-10-23 10:50:45 -07:00
Michael Yang	b66bcaa582	Merge pull request #883 from jmorganca/mxyng/logs update default log target	2023-10-23 10:50:29 -07:00
Michael Yang	c9167494cb	update default log target	2023-10-23 10:44:50 -07:00
Michael Yang	125d0a013a	ggufv3 ggufv3 adds support for big endianness, mainly for s390x architecture. while that's not currently supported for ollama, the change is simple. loosen version check to be more forward compatible. unless specified, gguf versions other v1 will be decoded into v2.	2023-10-23 09:35:49 -07:00
Richard Awoyemi	ba2da6ceaa	Added a minimalist React UI for Ollama models to the community contributions.md (#870 )	2023-10-23 10:44:39 -04:00