wip /api/chat

2025-12-30 03:00:38 -05:00 · 2023-10-01 14:54:17 -07:00
11 changed files with 214 additions and 129 deletions
--- a/README.md
+++ b/README.md
@@ -217,7 +217,6 @@ curl -X POST http://localhost:11434/api/generate -d '{
 - [Dagger Chatbot](https://github.com/samalba/dagger-chatbot)
 - [LiteLLM](https://github.com/BerriAI/litellm)
 - [Discord AI Bot](https://github.com/mekb-turtle/discord-ai-bot)
- [Chatbot UI](https://github.com/ivanfioravanti/chatbot-ollama)
 - [HTML UI](https://github.com/rtcfirefly/ollama-ui)
 - [Typescript UI](https://github.com/ollama-interface/Ollama-Gui?tab=readme-ov-file)
 - [Dumbar](https://github.com/JerrySievert/Dumbar)
--- a/api/types.go
+++ b/api/types.go
@@ -31,6 +31,22 @@ func (e StatusError) Error() string {
 	}
 }

+// /api/chat
+
+// Message is one turn of an /api/chat conversation: Role identifies who the
+// turn belongs to and Content holds its text. The server-side prompt builder
+// acts on the roles "system", "user" and "assistant".
+type Message struct {
+	Role    string `json:"role"`
+	Content string `json:"content"`
+}
+
+// ChatRequest is the JSON body of a POST /api/chat call: the name of the
+// model to use and the messages that make up the chat history.
+type ChatRequest struct {
+	Model    string    `json:"model"`
+	Messages []Message `json:"messages"`
+}
+
+// ChatResponse is the JSON reply to a POST /api/chat call: the generated
+// message and the time the response was created.
+type ChatResponse struct {
+	CreatedAt time.Time `json:"created_at"`
+	Message   Message   `json:"message"`
+}
+
 type GenerateRequest struct {
 	Model    string `json:"model"`
 	Prompt   string `json:"prompt"`
@@ -280,38 +296,38 @@ func (opts *Options) FromMap(m map[string]interface{}) error {

+// DefaultOptions returns an Options value populated with the built-in
+// defaults listed below.
+//
+// NOTE(review): this change removes the explicit `NumPredict: -1` default, so
+// NumPredict now starts at its zero value — confirm that is intended.
 func DefaultOptions() Options {
 	return Options{
-		// options set on request to runner
-		NumPredict:       -1,
-		NumKeep:          -1,
+		Seed: -1,
+
+		UseNUMA: false,
+
+		NumCtx:             2048,
+		NumKeep:            -1,
+		NumBatch:           512,
+		NumGPU:             -1, // -1 here indicates that NumGPU should be set dynamically
+		NumGQA:             1,
+		LowVRAM:            false,
+		F16KV:              true,
+		UseMMap:            true,
+		UseMLock:           false,
+		RopeFrequencyBase:  10000.0,
+		RopeFrequencyScale: 1.0,
+		EmbeddingOnly:      true,
+
+		RepeatLastN:      64,
+		RepeatPenalty:    1.1,
+		FrequencyPenalty: 0.0,
+		PresencePenalty:  0.0,
 		Temperature:      0.8,
 		TopK:             40,
 		TopP:             0.9,
 		TFSZ:             1.0,
 		TypicalP:         1.0,
-		RepeatLastN:      64,
-		RepeatPenalty:    1.1,
-		PresencePenalty:  0.0,
-		FrequencyPenalty: 0.0,
 		Mirostat:         0,
 		MirostatTau:      5.0,
 		MirostatEta:      0.1,
 		PenalizeNewline:  true,
-		Seed:             -1,

-		// options set when the model is loaded
-		NumCtx:             2048,
-		RopeFrequencyBase:  10000.0,
-		RopeFrequencyScale: 1.0,
-		NumBatch:           512,
-		NumGPU:             -1, // -1 here indicates that NumGPU should be set dynamically
-		NumGQA:             1,
-		NumThread:          0, // let the runtime decide
-		LowVRAM:            false,
-		F16KV:              true,
-		UseMLock:           false,
-		UseMMap:            true,
-		UseNUMA:            false,
-		EmbeddingOnly:      true,
+		NumThread: 0, // let the runtime decide
 	}
 }

--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -380,20 +380,7 @@ func pull(model string, insecure bool) error {
 func RunGenerate(cmd *cobra.Command, args []string) error {
 	if len(args) > 1 {
 		// join all args into a single prompt
-		wordWrap := false
-		if term.IsTerminal(int(os.Stdout.Fd())) {
-			wordWrap = true
-		}
-
-		nowrap, err := cmd.Flags().GetBool("nowordwrap")
-		if err != nil {
-			return err
-		}
-		if nowrap {
-			wordWrap = false
-		}
-
-		return generate(cmd, args[0], strings.Join(args[1:], " "), wordWrap)
+		return generate(cmd, args[0], strings.Join(args[1:], " "))
 	}

 	if readline.IsTerminal(int(os.Stdin.Fd())) {
@@ -405,7 +392,7 @@ func RunGenerate(cmd *cobra.Command, args []string) error {

 type generateContextKey string

-func generate(cmd *cobra.Command, model, prompt string, wordWrap bool) error {
+func generate(cmd *cobra.Command, model, prompt string) error {
 	client, err := api.FromEnv()
 	if err != nil {
 		return err
@@ -421,9 +408,24 @@ func generate(cmd *cobra.Command, model, prompt string, wordWrap bool) error {
 		generateContext = []int{}
 	}

+	var wrapTerm bool
+	termType := os.Getenv("TERM")
+	if termType == "xterm-256color" {
+		wrapTerm = true
+	}
+
 	termWidth, _, err := term.GetSize(int(0))
 	if err != nil {
-		wordWrap = false
+		wrapTerm = false
+	}
+
+	// override wrapping if the user turned it off
+	nowrap, err := cmd.Flags().GetBool("nowordwrap")
+	if err != nil {
+		return err
+	}
+	if nowrap {
+		wrapTerm = false
 	}

 	cancelCtx, cancel := context.WithCancel(context.Background())
@@ -450,7 +452,7 @@ func generate(cmd *cobra.Command, model, prompt string, wordWrap bool) error {

 		latest = response

-		if wordWrap {
+		if wrapTerm {
 			for _, ch := range response.Response {
 				if currentLineLength+1 > termWidth-5 {
 					// backtrack the length of the last word and clear to the end of the line
@@ -531,7 +533,7 @@ func generateInteractive(cmd *cobra.Command, model string) error {
 	}

 	// load the model
-	if err := generate(cmd, model, "", false); err != nil {
+	if err := generate(cmd, model, ""); err != nil {
 		return err
 	}

@@ -577,21 +579,6 @@ func generateInteractive(cmd *cobra.Command, model string) error {
 	}
 	defer scanner.Close()

-	var wordWrap bool
-	termType := os.Getenv("TERM")
-	if termType == "xterm-256color" {
-		wordWrap = true
-	}
-
-	// override wrapping if the user turned it off
-	nowrap, err := cmd.Flags().GetBool("nowordwrap")
-	if err != nil {
-		return err
-	}
-	if nowrap {
-		wordWrap = false
-	}
-
 	var multiLineBuffer string
 	var isMultiLine bool

@@ -645,10 +632,10 @@ func generateInteractive(cmd *cobra.Command, model string) error {
 				case "nohistory":
 					scanner.HistoryDisable()
 				case "wordwrap":
-					wordWrap = true
+					cmd.Flags().Set("nowordwrap", "false")
 					fmt.Println("Set 'wordwrap' mode.")
 				case "nowordwrap":
-					wordWrap = false
+					cmd.Flags().Set("nowordwrap", "true")
 					fmt.Println("Set 'nowordwrap' mode.")
 				case "verbose":
 					cmd.Flags().Set("verbose", "true")
@@ -686,31 +673,15 @@ func generateInteractive(cmd *cobra.Command, model string) error {

 				switch args[1] {
 				case "license":
-					if resp.License == "" {
-						fmt.Println("No license was specified for this model.\n")
-					} else {
-						fmt.Println(resp.License)
-					}
+					fmt.Println(resp.License)
 				case "modelfile":
 					fmt.Println(resp.Modelfile)
 				case "parameters":
-					if resp.Parameters == "" {
-						fmt.Println("No parameters were specified for this model.\n")
-					} else {
-						fmt.Println(resp.Parameters)
-					}
+					fmt.Println(resp.Parameters)
 				case "system":
-					if resp.System == "" {
-						fmt.Println("No system prompt was specified for this model.\n")
-					} else {
-						fmt.Println(resp.System)
-					}
+					fmt.Println(resp.System)
 				case "template":
-					if resp.Template == "" {
-						fmt.Println("No prompt template was specified for this model.\n")
-					} else {
-						fmt.Println(resp.Template)
-					}
+					fmt.Println(resp.Template)
 				default:
 					fmt.Printf("Unknown command '/show %s'. Type /? for help\n", args[1])
 				}
@@ -727,7 +698,7 @@ func generateInteractive(cmd *cobra.Command, model string) error {
 		}

 		if len(line) > 0 && line[0] != '/' {
-			if err := generate(cmd, model, line, wordWrap); err != nil {
+			if err := generate(cmd, model, line); err != nil {
 				return err
 			}
 		}
@@ -739,7 +710,7 @@ func generateBatch(cmd *cobra.Command, model string) error {
 	for scanner.Scan() {
 		prompt := scanner.Text()
 		fmt.Printf(">>> %s\n", prompt)
-		if err := generate(cmd, model, prompt, false); err != nil {
+		if err := generate(cmd, model, prompt); err != nil {
 			return err
 		}
 	}
--- a/docs/development.md
+++ b/docs/development.md
@@ -10,25 +10,25 @@ Install required tools:
 - go version 1.20 or higher
 - gcc version 11.4.0 or higher

-```bash
+```
 brew install go cmake gcc
 ```

 Get the required libraries:

-```bash
+```
 go generate ./...
 ```

 Then build ollama:

-```bash
+```
 go build .
 ```

 Now you can run `ollama`:

-```bash
+```
 ./ollama
 ```

--- a/docs/faq.md
+++ b/docs/faq.md
@@ -2,13 +2,13 @@

 ## How can I expose the Ollama server?

-```bash
+```
 OLLAMA_HOST=0.0.0.0:11435 ollama serve
 ```

 By default, Ollama allows cross origin requests from `127.0.0.1` and `0.0.0.0`. To support more origins, you can use the `OLLAMA_ORIGINS` environment variable:

-```bash
+```
 OLLAMA_ORIGINS=http://192.168.1.1:*,https://example.com ollama serve
 ```

@@ -16,3 +16,4 @@ OLLAMA_ORIGINS=http://192.168.1.1:*,https://example.com ollama serve

 * macOS: Raw model data is stored under `~/.ollama/models`.
 * Linux: Raw model data is stored under `/usr/share/ollama/.ollama/models`
+
--- a/docs/linux.md
+++ b/docs/linux.md
@@ -2,7 +2,7 @@

 > Note: A one line installer for Ollama is available by running:
 >
-> ```bash
+> ```
 > curl https://ollama.ai/install.sh | sh
 > ```

@@ -10,7 +10,7 @@

 Ollama is distributed as a self-contained binary. Download it to a directory in your PATH:

-```bash
+```
 sudo curl -L https://ollama.ai/download/ollama-linux-amd64 -o /usr/bin/ollama
 sudo chmod +x /usr/bin/ollama
 ```
@@ -19,13 +19,13 @@ sudo chmod +x /usr/bin/ollama

 Start Ollama by running `ollama serve`:

-```bash
+```
 ollama serve
 ```

 Once Ollama is running, run a model in another terminal session:

-```bash
+```
 ollama run llama2
 ```

@@ -35,7 +35,7 @@ ollama run llama2

 Verify that the drivers are installed by running the following command, which should print details about your GPU:

-```bash
+```
 nvidia-smi
 ```

@@ -43,7 +43,7 @@ nvidia-smi

 Create a user for Ollama:

-```bash
+```
 sudo useradd -r -s /bin/false -m -d /usr/share/ollama ollama
 ```

@@ -68,7 +68,7 @@ WantedBy=default.target

 Then start the service:

-```bash
+```
 sudo systemctl daemon-reload
 sudo systemctl enable ollama
 ```
@@ -77,7 +77,7 @@ sudo systemctl enable ollama

 To view logs of Ollama running as a startup service, run:

-```bash
+```
 journalctl -u ollama
 ```

--- a/docs/modelfile.md
+++ b/docs/modelfile.md
@@ -44,7 +44,7 @@ INSTRUCTION arguments

 An example of a model file creating a mario blueprint:

-```modelfile
+```
 FROM llama2
 # sets the temperature to 1 [higher is more creative, lower is more coherent]
 PARAMETER temperature 1
@@ -70,13 +70,13 @@ More examples are available in the [examples directory](../examples).

 The FROM instruction defines the base model to use when creating a model.

-```modelfile
+```
 FROM <model name>:<tag>
 ```

 #### Build from llama2

-```modelfile
+```
 FROM llama2
 ```

@@ -85,7 +85,7 @@ A list of available base models:

 #### Build from a bin file

-```modelfile
+```
 FROM ./ollama-model.bin
 ```

@@ -95,7 +95,7 @@ This bin file location should be specified as an absolute path or relative to th

 The EMBED instruction is used to add embeddings of files to a model. This is useful for adding custom data that the model can reference when generating an answer. Note that currently only text files are supported, formatted with each line as one embedding.

-```modelfile
+```
 FROM <model name>:<tag>
 EMBED <file path>.txt
 EMBED <different file path>.txt
@@ -106,7 +106,7 @@ EMBED <path to directory>/*.txt

 The `PARAMETER` instruction defines a parameter that can be set when the model is run.

-```modelfile
+```
 PARAMETER <parameter> <parametervalue>
 ```

@@ -142,7 +142,7 @@ PARAMETER <parameter> <parametervalue>
 | `{{ .Prompt }}` | The incoming prompt, this is not specified in the model file and will be set based on input.                 |
 | `{{ .First }}`  | A boolean value used to render specific template information for the first generation of a session.          |

-```modelfile
+```
 TEMPLATE """
 {{- if .First }}
 ### System:
@@ -162,7 +162,7 @@ SYSTEM """<system message>"""

 The `SYSTEM` instruction specifies the system prompt to be used in the template, if applicable.

-```modelfile
+```
 SYSTEM """<system message>"""
 ```

@@ -170,7 +170,7 @@ SYSTEM """<system message>"""

 The `ADAPTER` instruction specifies the LoRA adapter to apply to the base model. The value of this instruction should be an absolute path or a path relative to the Modelfile and the file must be in a GGML file format. The adapter should be tuned from the base model otherwise the behaviour is undefined.

-```modelfile
+```
 ADAPTER ./ollama-lora.bin
 ```

@@ -178,7 +178,7 @@ ADAPTER ./ollama-lora.bin

 The `LICENSE` instruction allows you to specify the legal license under which the model used with this Modelfile is shared or distributed.

-```modelfile
+```
 LICENSE """
 <license text>
 """
--- a/llm/llama.go
+++ b/llm/llama.go
@@ -218,6 +218,7 @@ func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
 	if opts.NumGPU != -1 {
 		return opts.NumGPU
 	}
+	n := 1 // default to enable metal on macOS
 	if runtime.GOOS == "linux" {
 		vramMib, err := CheckVRAM()
 		if err != nil {
@@ -234,11 +235,10 @@ func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
 		// TODO: this is a rough heuristic, better would be to calculate this based on number of layers and context size
 		bytesPerLayer := fileSizeBytes / numLayer

-		// max number of layers we can fit in VRAM
-		layers := int(totalVramBytes / bytesPerLayer)
-		log.Printf("%d MiB VRAM available, loading up to %d GPU layers", vramMib, layers)
+		// set n to the max number of layers we can fit in VRAM; log before
+		// returning so the message is actually emitted
+		n = int(totalVramBytes / bytesPerLayer)
+		log.Printf("%d MiB VRAM available, loading up to %d GPU layers", vramMib, n)

-		return layers
+		return n
 	}
 	// default to enable metal on macOS
 	return 1
@@ -417,25 +417,28 @@ type Prediction struct {
 }

+// PredictRequest is the JSON payload that Predict marshals for a prediction.
+//
+// NOTE(review): every field except Stream is tagged omitempty, so zero values
+// (for example Temperature 0, TopK 0 or PenalizeNl false) are left out of the
+// encoded request instead of being sent explicitly — confirm the receiver's
+// defaults are acceptable in those cases. Predict also stops populating Seed
+// in this change, so as far as this diff shows it is never sent.
 type PredictRequest struct {
-	Prompt           string   `json:"prompt"`
-	Stream           bool     `json:"stream"`
-	NPredict         int      `json:"n_predict"`
-	NKeep            int      `json:"n_keep"`
-	Temperature      float32  `json:"temperature"`
-	TopK             int      `json:"top_k"`
-	TopP             float32  `json:"top_p"`
-	TfsZ             float32  `json:"tfs_z"`
-	TypicalP         float32  `json:"typical_p"`
-	RepeatLastN      int      `json:"repeat_last_n"`
-	RepeatPenalty    float32  `json:"repeat_penalty"`
-	PresencePenalty  float32  `json:"presence_penalty"`
-	FrequencyPenalty float32  `json:"frequency_penalty"`
-	Mirostat         int      `json:"mirostat"`
-	MirostatTau      float32  `json:"mirostat_tau"`
-	MirostatEta      float32  `json:"mirostat_eta"`
-	PenalizeNl       bool     `json:"penalize_nl"`
-	Seed             int      `json:"seed"`
-	Stop             []string `json:"stop,omitempty"`
+	Stream           bool            `json:"stream"`
+	NPredict         int             `json:"n_predict,omitempty"`
+	TopK             int             `json:"top_k,omitempty"`
+	TopP             float32         `json:"top_p,omitempty"`
+	TfsZ             float32         `json:"tfs_z,omitempty"`
+	TypicalP         float32         `json:"typical_p,omitempty"`
+	RepeatLastN      int             `json:"repeat_last_n,omitempty"`
+	Temperature      float32         `json:"temperature,omitempty"`
+	RepeatPenalty    float32         `json:"repeat_penalty,omitempty"`
+	PresencePenalty  float32         `json:"presence_penalty,omitempty"`
+	FrequencyPenalty float32         `json:"frequency_penalty,omitempty"`
+	Mirostat         int             `json:"mirostat,omitempty"`
+	MirostatTau      float32         `json:"mirostat_tau,omitempty"`
+	MirostatEta      float32         `json:"mirostat_eta,omitempty"`
+	PenalizeNl       bool            `json:"penalize_nl,omitempty"`
+	NKeep            int             `json:"n_keep,omitempty"`
+	Seed             int             `json:"seed,omitempty"`
+	Prompt           string          `json:"prompt,omitempty"`
+	NProbs           int             `json:"n_probs,omitempty"`
+	LogitBias        map[int]float32 `json:"logit_bias,omitempty"`
+	IgnoreEos        bool            `json:"ignore_eos,omitempty"`
+	Stop             []string        `json:"stop,omitempty"`
 }

 func (llm *llama) Predict(ctx context.Context, prevContext []int, prompt string, fn func(api.GenerateResponse)) error {
@@ -467,10 +470,8 @@ func (llm *llama) Predict(ctx context.Context, prevContext []int, prompt string,
 		MirostatTau:      llm.MirostatTau,
 		MirostatEta:      llm.MirostatEta,
 		PenalizeNl:       llm.PenalizeNewline,
-		Seed:             llm.Seed,
 		Stop:             llm.Stop,
 	}
-
 	data, err := json.Marshal(predReq)
 	if err != nil {
 		return fmt.Errorf("error marshaling data: %v", err)
--- a/scripts/build_docker.sh
+++ b/scripts/build_docker.sh
--- a/server/images.go
+++ b/server/images.go
@@ -54,6 +54,54 @@ type Model struct {
 	Embeddings    []vector.Embedding
 }

+// ChatPrompt renders a list of chat messages into a single prompt string by
+// executing the model's template once per buffered turn.
+//
+// System and user messages are buffered in the template variables. The
+// template is executed when a second message of the same role arrives, before
+// every assistant message, and once more after the last message. Assistant
+// content is appended verbatim after the rendered turn.
+//
+// It returns an error if the template fails to parse or to execute.
+func (m *Model) ChatPrompt(messages []api.Message) (string, error) {
+	tmpl, err := template.New("").Parse(m.Template)
+	if err != nil {
+		return "", err
+	}
+
+	var vars struct {
+		System string
+		Prompt string
+		First  bool
+	}
+
+	// NOTE(review): First is never reset, so every execution renders with
+	// First == true — confirm whether later turns should see false.
+	vars.First = true
+
+	var sb strings.Builder
+
+	// flush renders the buffered system/user text into sb and clears it,
+	// reporting template execution failures instead of dropping them.
+	flush := func() error {
+		if err := tmpl.Execute(&sb, vars); err != nil {
+			return err
+		}
+		vars.System = ""
+		vars.Prompt = ""
+		return nil
+	}
+
+	// build the chat history from messages
+	for _, msg := range messages {
+		switch msg.Role {
+		case "system":
+			if vars.System != "" {
+				if err := flush(); err != nil {
+					return "", err
+				}
+			}
+			vars.System = msg.Content
+		case "user":
+			if vars.Prompt != "" {
+				if err := flush(); err != nil {
+					return "", err
+				}
+			}
+			vars.Prompt = msg.Content
+		case "assistant":
+			if err := flush(); err != nil {
+				return "", err
+			}
+			sb.WriteString(msg.Content)
+		}
+	}
+
+	if err := flush(); err != nil {
+		return "", err
+	}
+
+	return sb.String(), nil
+}
+
 func (m *Model) Prompt(request api.GenerateRequest, embedding string) (string, error) {
 	t := m.Template
 	if request.Template != "" {
--- a/server/routes.go
+++ b/server/routes.go
@@ -156,6 +156,54 @@ func load(ctx context.Context, workDir string, model *Model, reqOpts map[string]
 	return nil
 }

+// ChatModelHandler serves POST /api/chat. It binds the JSON body to an
+// api.ChatRequest, looks up the model, renders the messages into one prompt
+// with Model.ChatPrompt, loads the model and runs a single prediction. The
+// generated pieces are concatenated into one assistant message, returned as
+// an api.ChatResponse.
+//
+// loaded.mu is held for the whole request. Bind, model-lookup and prediction
+// failures answer 400; prompt-building and load failures answer 500.
+func ChatModelHandler(c *gin.Context) {
+	loaded.mu.Lock()
+	defer loaded.mu.Unlock()
+
+	var req api.ChatRequest
+	if err := c.ShouldBindJSON(&req); err != nil {
+		c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
+		return
+	}
+
+	model, err := GetModel(req.Model)
+	if err != nil {
+		c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
+		return
+	}
+
+	prompt, err := model.ChatPrompt(req.Messages)
+	if err != nil {
+		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+		return
+	}
+
+	// accumulate the pieces handed to the callback into one reply
+	var response string
+	fn := func(r api.GenerateResponse) {
+		response += r.Response
+	}
+
+	workDir := c.GetString("workDir")
+	if err := load(c.Request.Context(), workDir, model, nil, defaultSessionDuration); err != nil {
+		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+		return
+	}
+
+	// NOTE(review): prints the full rendered prompt to stdout; looks like
+	// debugging output — confirm whether it should stay.
+	fmt.Println(prompt)
+
+	if err := loaded.llm.Predict(c.Request.Context(), []int{}, prompt, fn); err != nil {
+		c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
+		// stop here so a success body is not written after the error body
+		return
+	}
+
+	c.JSON(http.StatusOK, api.ChatResponse{
+		Message: api.Message{
+			Role:    "assistant",
+			Content: response,
+		},
+		CreatedAt: time.Now().UTC(),
+	})
+}
+
 func GenerateHandler(c *gin.Context) {
 	loaded.mu.Lock()
 	defer loaded.mu.Unlock()
@@ -552,6 +600,7 @@ func Serve(ln net.Listener, allowOrigins []string) error {
 		},
 	)

+	r.POST("/api/chat", ChatModelHandler)
 	r.POST("/api/pull", PullModelHandler)
 	r.POST("/api/generate", GenerateHandler)
 	r.POST("/api/embeddings", EmbeddingHandler)