Compare commits

..

1 Commits
v0.1.1 ... api

Author SHA1 Message Date
Jeffrey Morgan
949fc4eafa wip /api/chat 2023-10-01 14:54:17 -07:00
11 changed files with 214 additions and 129 deletions

View File

@@ -217,7 +217,6 @@ curl -X POST http://localhost:11434/api/generate -d '{
- [Dagger Chatbot](https://github.com/samalba/dagger-chatbot)
- [LiteLLM](https://github.com/BerriAI/litellm)
- [Discord AI Bot](https://github.com/mekb-turtle/discord-ai-bot)
- [Chatbot UI](https://github.com/ivanfioravanti/chatbot-ollama)
- [HTML UI](https://github.com/rtcfirefly/ollama-ui)
- [Typescript UI](https://github.com/ollama-interface/Ollama-Gui?tab=readme-ov-file)
- [Dumbar](https://github.com/JerrySievert/Dumbar)

View File

@@ -31,6 +31,22 @@ func (e StatusError) Error() string {
}
}
// Request and response payloads for the /api/chat endpoint.
type (
	// Message is a single entry in a chat conversation: the role of the
	// speaker and the text that was said.
	Message struct {
		Role    string `json:"role"`
		Content string `json:"content"`
	}

	// ChatRequest is the JSON body of a chat request. It names the model to
	// use and carries the conversation as an ordered list of messages.
	ChatRequest struct {
		Model    string    `json:"model"`
		Messages []Message `json:"messages"`
	}

	// ChatResponse is the JSON body of a chat reply: the generated message
	// together with the time the response was created.
	ChatResponse struct {
		CreatedAt time.Time `json:"created_at"`
		Message   Message   `json:"message"`
	}
)
type GenerateRequest struct {
Model string `json:"model"`
Prompt string `json:"prompt"`
@@ -280,38 +296,38 @@ func (opts *Options) FromMap(m map[string]interface{}) error {
func DefaultOptions() Options {
return Options{
// options set on request to runner
NumPredict: -1,
NumKeep: -1,
Seed: -1,
UseNUMA: false,
NumCtx: 2048,
NumKeep: -1,
NumBatch: 512,
NumGPU: -1, // -1 here indicates that NumGPU should be set dynamically
NumGQA: 1,
LowVRAM: false,
F16KV: true,
UseMMap: true,
UseMLock: false,
RopeFrequencyBase: 10000.0,
RopeFrequencyScale: 1.0,
EmbeddingOnly: true,
RepeatLastN: 64,
RepeatPenalty: 1.1,
FrequencyPenalty: 0.0,
PresencePenalty: 0.0,
Temperature: 0.8,
TopK: 40,
TopP: 0.9,
TFSZ: 1.0,
TypicalP: 1.0,
RepeatLastN: 64,
RepeatPenalty: 1.1,
PresencePenalty: 0.0,
FrequencyPenalty: 0.0,
Mirostat: 0,
MirostatTau: 5.0,
MirostatEta: 0.1,
PenalizeNewline: true,
Seed: -1,
// options set when the model is loaded
NumCtx: 2048,
RopeFrequencyBase: 10000.0,
RopeFrequencyScale: 1.0,
NumBatch: 512,
NumGPU: -1, // -1 here indicates that NumGPU should be set dynamically
NumGQA: 1,
NumThread: 0, // let the runtime decide
LowVRAM: false,
F16KV: true,
UseMLock: false,
UseMMap: true,
UseNUMA: false,
EmbeddingOnly: true,
NumThread: 0, // let the runtime decide
}
}

View File

@@ -380,20 +380,7 @@ func pull(model string, insecure bool) error {
func RunGenerate(cmd *cobra.Command, args []string) error {
if len(args) > 1 {
// join all args into a single prompt
wordWrap := false
if term.IsTerminal(int(os.Stdout.Fd())) {
wordWrap = true
}
nowrap, err := cmd.Flags().GetBool("nowordwrap")
if err != nil {
return err
}
if nowrap {
wordWrap = false
}
return generate(cmd, args[0], strings.Join(args[1:], " "), wordWrap)
return generate(cmd, args[0], strings.Join(args[1:], " "))
}
if readline.IsTerminal(int(os.Stdin.Fd())) {
@@ -405,7 +392,7 @@ func RunGenerate(cmd *cobra.Command, args []string) error {
type generateContextKey string
func generate(cmd *cobra.Command, model, prompt string, wordWrap bool) error {
func generate(cmd *cobra.Command, model, prompt string) error {
client, err := api.FromEnv()
if err != nil {
return err
@@ -421,9 +408,24 @@ func generate(cmd *cobra.Command, model, prompt string, wordWrap bool) error {
generateContext = []int{}
}
var wrapTerm bool
termType := os.Getenv("TERM")
if termType == "xterm-256color" {
wrapTerm = true
}
termWidth, _, err := term.GetSize(int(0))
if err != nil {
wordWrap = false
wrapTerm = false
}
// override wrapping if the user turned it off
nowrap, err := cmd.Flags().GetBool("nowordwrap")
if err != nil {
return err
}
if nowrap {
wrapTerm = false
}
cancelCtx, cancel := context.WithCancel(context.Background())
@@ -450,7 +452,7 @@ func generate(cmd *cobra.Command, model, prompt string, wordWrap bool) error {
latest = response
if wordWrap {
if wrapTerm {
for _, ch := range response.Response {
if currentLineLength+1 > termWidth-5 {
// backtrack the length of the last word and clear to the end of the line
@@ -531,7 +533,7 @@ func generateInteractive(cmd *cobra.Command, model string) error {
}
// load the model
if err := generate(cmd, model, "", false); err != nil {
if err := generate(cmd, model, ""); err != nil {
return err
}
@@ -577,21 +579,6 @@ func generateInteractive(cmd *cobra.Command, model string) error {
}
defer scanner.Close()
var wordWrap bool
termType := os.Getenv("TERM")
if termType == "xterm-256color" {
wordWrap = true
}
// override wrapping if the user turned it off
nowrap, err := cmd.Flags().GetBool("nowordwrap")
if err != nil {
return err
}
if nowrap {
wordWrap = false
}
var multiLineBuffer string
var isMultiLine bool
@@ -645,10 +632,10 @@ func generateInteractive(cmd *cobra.Command, model string) error {
case "nohistory":
scanner.HistoryDisable()
case "wordwrap":
wordWrap = true
cmd.Flags().Set("nowordwrap", "false")
fmt.Println("Set 'wordwrap' mode.")
case "nowordwrap":
wordWrap = false
cmd.Flags().Set("nowordwrap", "true")
fmt.Println("Set 'nowordwrap' mode.")
case "verbose":
cmd.Flags().Set("verbose", "true")
@@ -686,31 +673,15 @@ func generateInteractive(cmd *cobra.Command, model string) error {
switch args[1] {
case "license":
if resp.License == "" {
fmt.Println("No license was specified for this model.\n")
} else {
fmt.Println(resp.License)
}
fmt.Println(resp.License)
case "modelfile":
fmt.Println(resp.Modelfile)
case "parameters":
if resp.Parameters == "" {
fmt.Println("No parameters were specified for this model.\n")
} else {
fmt.Println(resp.Parameters)
}
fmt.Println(resp.Parameters)
case "system":
if resp.System == "" {
fmt.Println("No system prompt was specified for this model.\n")
} else {
fmt.Println(resp.System)
}
fmt.Println(resp.System)
case "template":
if resp.Template == "" {
fmt.Println("No prompt template was specified for this model.\n")
} else {
fmt.Println(resp.Template)
}
fmt.Println(resp.Template)
default:
fmt.Printf("Unknown command '/show %s'. Type /? for help\n", args[1])
}
@@ -727,7 +698,7 @@ func generateInteractive(cmd *cobra.Command, model string) error {
}
if len(line) > 0 && line[0] != '/' {
if err := generate(cmd, model, line, wordWrap); err != nil {
if err := generate(cmd, model, line); err != nil {
return err
}
}
@@ -739,7 +710,7 @@ func generateBatch(cmd *cobra.Command, model string) error {
for scanner.Scan() {
prompt := scanner.Text()
fmt.Printf(">>> %s\n", prompt)
if err := generate(cmd, model, prompt, false); err != nil {
if err := generate(cmd, model, prompt); err != nil {
return err
}
}

View File

@@ -10,25 +10,25 @@ Install required tools:
- go version 1.20 or higher
- gcc version 11.4.0 or higher
```bash
```
brew install go cmake gcc
```
Get the required libraries:
```bash
```
go generate ./...
```
Then build ollama:
```bash
```
go build .
```
Now you can run `ollama`:
```bash
```
./ollama
```

View File

@@ -2,13 +2,13 @@
## How can I expose the Ollama server?
```bash
```
OLLAMA_HOST=0.0.0.0:11435 ollama serve
```
By default, Ollama allows cross-origin requests from `127.0.0.1` and `0.0.0.0`. To support more origins, you can use the `OLLAMA_ORIGINS` environment variable:
```bash
```
OLLAMA_ORIGINS=http://192.168.1.1:*,https://example.com ollama serve
```
@@ -16,3 +16,4 @@ OLLAMA_ORIGINS=http://192.168.1.1:*,https://example.com ollama serve
* macOS: Raw model data is stored under `~/.ollama/models`.
* Linux: Raw model data is stored under `/usr/share/ollama/.ollama/models`.

View File

@@ -2,7 +2,7 @@
> Note: A one line installer for Ollama is available by running:
>
> ```bash
> ```
> curl https://ollama.ai/install.sh | sh
> ```
@@ -10,7 +10,7 @@
Ollama is distributed as a self-contained binary. Download it to a directory in your PATH:
```bash
```
sudo curl -L https://ollama.ai/download/ollama-linux-amd64 -o /usr/bin/ollama
sudo chmod +x /usr/bin/ollama
```
@@ -19,13 +19,13 @@ sudo chmod +x /usr/bin/ollama
Start Ollama by running `ollama serve`:
```bash
```
ollama serve
```
Once Ollama is running, run a model in another terminal session:
```bash
```
ollama run llama2
```
@@ -35,7 +35,7 @@ ollama run llama2
Verify that the drivers are installed by running the following command, which should print details about your GPU:
```bash
```
nvidia-smi
```
@@ -43,7 +43,7 @@ nvidia-smi
Create a user for Ollama:
```bash
```
sudo useradd -r -s /bin/false -m -d /usr/share/ollama ollama
```
@@ -68,7 +68,7 @@ WantedBy=default.target
Then start the service:
```bash
```
sudo systemctl daemon-reload
sudo systemctl enable ollama
```
@@ -77,7 +77,7 @@ sudo systemctl enable ollama
To view logs of Ollama running as a startup service, run:
```bash
```
journalctl -u ollama
```

View File

@@ -44,7 +44,7 @@ INSTRUCTION arguments
An example of a model file creating a mario blueprint:
```modelfile
```
FROM llama2
# sets the temperature to 1 [higher is more creative, lower is more coherent]
PARAMETER temperature 1
@@ -70,13 +70,13 @@ More examples are available in the [examples directory](../examples).
The FROM instruction defines the base model to use when creating a model.
```modelfile
```
FROM <model name>:<tag>
```
#### Build from llama2
```modelfile
```
FROM llama2
```
@@ -85,7 +85,7 @@ A list of available base models:
#### Build from a bin file
```modelfile
```
FROM ./ollama-model.bin
```
@@ -95,7 +95,7 @@ This bin file location should be specified as an absolute path or relative to th
The EMBED instruction is used to add embeddings of files to a model. This is useful for adding custom data that the model can reference when generating an answer. Note that currently only text files are supported, formatted with each line as one embedding.
```modelfile
```
FROM <model name>:<tag>
EMBED <file path>.txt
EMBED <different file path>.txt
@@ -106,7 +106,7 @@ EMBED <path to directory>/*.txt
The `PARAMETER` instruction defines a parameter that can be set when the model is run.
```modelfile
```
PARAMETER <parameter> <parametervalue>
```
@@ -142,7 +142,7 @@ PARAMETER <parameter> <parametervalue>
| `{{ .Prompt }}` | The incoming prompt, this is not specified in the model file and will be set based on input. |
| `{{ .First }}` | A boolean value used to render specific template information for the first generation of a session. |
```modelfile
```
TEMPLATE """
{{- if .First }}
### System:
@@ -162,7 +162,7 @@ SYSTEM """<system message>"""
The `SYSTEM` instruction specifies the system prompt to be used in the template, if applicable.
```modelfile
```
SYSTEM """<system message>"""
```
@@ -170,7 +170,7 @@ SYSTEM """<system message>"""
The `ADAPTER` instruction specifies the LoRA adapter to apply to the base model. The value of this instruction should be an absolute path or a path relative to the Modelfile and the file must be in a GGML file format. The adapter should be tuned from the base model otherwise the behaviour is undefined.
```modelfile
```
ADAPTER ./ollama-lora.bin
```
@@ -178,7 +178,7 @@ ADAPTER ./ollama-lora.bin
The `LICENSE` instruction allows you to specify the legal license under which the model used with this Modelfile is shared or distributed.
```modelfile
```
LICENSE """
<license text>
"""

View File

@@ -218,6 +218,7 @@ func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
if opts.NumGPU != -1 {
return opts.NumGPU
}
n := 1 // default to enable metal on macOS
if runtime.GOOS == "linux" {
vramMib, err := CheckVRAM()
if err != nil {
@@ -234,11 +235,10 @@ func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
// TODO: this is a rough heuristic, better would be to calculate this based on number of layers and context size
bytesPerLayer := fileSizeBytes / numLayer
// max number of layers we can fit in VRAM
layers := int(totalVramBytes / bytesPerLayer)
log.Printf("%d MiB VRAM available, loading up to %d GPU layers", vramMib, layers)
// set n to the max number of layers we can fit in VRAM
return int(totalVramBytes / bytesPerLayer)
return layers
log.Printf("%d MiB VRAM available, loading up to %d GPU layers", vramMib, n)
}
// default to enable metal on macOS
return 1
@@ -417,25 +417,28 @@ type Prediction struct {
}
type PredictRequest struct {
Prompt string `json:"prompt"`
Stream bool `json:"stream"`
NPredict int `json:"n_predict"`
NKeep int `json:"n_keep"`
Temperature float32 `json:"temperature"`
TopK int `json:"top_k"`
TopP float32 `json:"top_p"`
TfsZ float32 `json:"tfs_z"`
TypicalP float32 `json:"typical_p"`
RepeatLastN int `json:"repeat_last_n"`
RepeatPenalty float32 `json:"repeat_penalty"`
PresencePenalty float32 `json:"presence_penalty"`
FrequencyPenalty float32 `json:"frequency_penalty"`
Mirostat int `json:"mirostat"`
MirostatTau float32 `json:"mirostat_tau"`
MirostatEta float32 `json:"mirostat_eta"`
PenalizeNl bool `json:"penalize_nl"`
Seed int `json:"seed"`
Stop []string `json:"stop,omitempty"`
Stream bool `json:"stream"`
NPredict int `json:"n_predict,omitempty"`
TopK int `json:"top_k,omitempty"`
TopP float32 `json:"top_p,omitempty"`
TfsZ float32 `json:"tfs_z,omitempty"`
TypicalP float32 `json:"typical_p,omitempty"`
RepeatLastN int `json:"repeat_last_n,omitempty"`
Temperature float32 `json:"temperature,omitempty"`
RepeatPenalty float32 `json:"repeat_penalty,omitempty"`
PresencePenalty float32 `json:"presence_penalty,omitempty"`
FrequencyPenalty float32 `json:"frequency_penalty,omitempty"`
Mirostat int `json:"mirostat,omitempty"`
MirostatTau float32 `json:"mirostat_tau,omitempty"`
MirostatEta float32 `json:"mirostat_eta,omitempty"`
PenalizeNl bool `json:"penalize_nl,omitempty"`
NKeep int `json:"n_keep,omitempty"`
Seed int `json:"seed,omitempty"`
Prompt string `json:"prompt,omitempty"`
NProbs int `json:"n_probs,omitempty"`
LogitBias map[int]float32 `json:"logit_bias,omitempty"`
IgnoreEos bool `json:"ignore_eos,omitempty"`
Stop []string `json:"stop,omitempty"`
}
func (llm *llama) Predict(ctx context.Context, prevContext []int, prompt string, fn func(api.GenerateResponse)) error {
@@ -467,10 +470,8 @@ func (llm *llama) Predict(ctx context.Context, prevContext []int, prompt string,
MirostatTau: llm.MirostatTau,
MirostatEta: llm.MirostatEta,
PenalizeNl: llm.PenalizeNewline,
Seed: llm.Seed,
Stop: llm.Stop,
}
data, err := json.Marshal(predReq)
if err != nil {
return fmt.Errorf("error marshaling data: %v", err)

0
scripts/build_docker.sh Executable file → Normal file
View File

View File

@@ -54,6 +54,54 @@ type Model struct {
Embeddings []vector.Embedding
}
// ChatPrompt renders a chat history into a single prompt string using the
// model's template.
//
// Messages are consumed in order. A "system" message sets the pending system
// text and a "user" message sets the pending prompt text; if that slot is
// already occupied, the pending values are rendered through the template
// first. An "assistant" message renders whatever is pending and then appends
// the assistant content verbatim. Messages with any other role are ignored.
// After the last message the pending values are rendered one final time.
//
// It returns an error if the template cannot be parsed or executed.
func (m *Model) ChatPrompt(messages []api.Message) (string, error) {
	tmpl, err := template.New("").Parse(m.Template)
	if err != nil {
		return "", err
	}

	var vars struct {
		System string
		Prompt string
		First  bool
	}
	vars.First = true

	var sb strings.Builder

	// flush renders the pending system/prompt pair and resets it. Only the
	// first rendered turn is marked as First, so templates can emit
	// session-opening text once rather than on every turn.
	flush := func() error {
		if err := tmpl.Execute(&sb, vars); err != nil {
			return err
		}
		vars.System = ""
		vars.Prompt = ""
		vars.First = false
		return nil
	}

	// Build the chat history from messages. The loop variable is deliberately
	// not named m so that it does not shadow the receiver.
	for _, msg := range messages {
		switch msg.Role {
		case "system":
			if vars.System != "" {
				if err := flush(); err != nil {
					return "", err
				}
			}
			vars.System = msg.Content
		case "user":
			if vars.Prompt != "" {
				if err := flush(); err != nil {
					return "", err
				}
			}
			vars.Prompt = msg.Content
		case "assistant":
			if err := flush(); err != nil {
				return "", err
			}
			sb.WriteString(msg.Content)
		}
	}

	if err := flush(); err != nil {
		return "", err
	}

	return sb.String(), nil
}
func (m *Model) Prompt(request api.GenerateRequest, embedding string) (string, error) {
t := m.Template
if request.Template != "" {

View File

@@ -156,6 +156,54 @@ func load(ctx context.Context, workDir string, model *Model, reqOpts map[string]
return nil
}
// ChatModelHandler handles chat requests (registered for POST /api/chat).
//
// It decodes an api.ChatRequest, resolves the requested model, renders the
// message history into a prompt with Model.ChatPrompt, loads the model and
// runs a prediction, accumulating the generated text. On success it replies
// with a single api.ChatResponse whose message has the "assistant" role.
//
// loaded.mu is held for the whole request. Each failure path writes exactly
// one JSON error body and returns.
func ChatModelHandler(c *gin.Context) {
	loaded.mu.Lock()
	defer loaded.mu.Unlock()

	var req api.ChatRequest
	if err := c.ShouldBindJSON(&req); err != nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
		return
	}

	model, err := GetModel(req.Model)
	if err != nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
		return
	}

	prompt, err := model.ChatPrompt(req.Messages)
	if err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
		return
	}

	workDir := c.GetString("workDir")
	if err := load(c.Request.Context(), workDir, model, nil, defaultSessionDuration); err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
		return
	}

	// Collect the streamed fragments into one reply; the response is sent
	// as a single JSON object once prediction has finished.
	var response string
	fn := func(r api.GenerateResponse) {
		response += r.Response
	}

	if err := loaded.llm.Predict(c.Request.Context(), []int{}, prompt, fn); err != nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
		// Stop here so a success body is not written after the error body.
		return
	}

	c.JSON(http.StatusOK, api.ChatResponse{
		Message: api.Message{
			Role:    "assistant",
			Content: response,
		},
		CreatedAt: time.Now().UTC(),
	})
}
func GenerateHandler(c *gin.Context) {
loaded.mu.Lock()
defer loaded.mu.Unlock()
@@ -552,6 +600,7 @@ func Serve(ln net.Listener, allowOrigins []string) error {
},
)
r.POST("/api/chat", ChatModelHandler)
r.POST("/api/pull", PullModelHandler)
r.POST("/api/generate", GenerateHandler)
r.POST("/api/embeddings", EmbeddingHandler)