mirror of
https://github.com/ollama/ollama.git
synced 2026-01-23 23:09:15 -05:00
Compare commits
27 Commits
royh/strea
...
v0.3.2
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4c14855ad7 | ||
|
|
dc77bbcfa4 | ||
|
|
0f3271db88 | ||
|
|
c4c84b7a0d | ||
|
|
5c1912769e | ||
|
|
71399aa682 | ||
|
|
463a8aa273 | ||
|
|
3579b4966a | ||
|
|
5d66578356 | ||
|
|
afa8d6e9d5 | ||
|
|
1b44d873e7 | ||
|
|
cef2c6054d | ||
|
|
345420998e | ||
|
|
0be8baad2b | ||
|
|
a250c2cb13 | ||
|
|
15af558423 | ||
|
|
85d9d73a72 | ||
|
|
78140a712c | ||
|
|
1954ec5917 | ||
|
|
0f1910129f | ||
|
|
e2c3f6b3e2 | ||
|
|
8570c1c0ef | ||
|
|
55cd3ddcca | ||
|
|
66fe77f084 | ||
|
|
d1a5227cad | ||
|
|
4f1afd575d | ||
|
|
35b89b2eab |
@@ -173,7 +173,7 @@ I'm a basic program that prints the famous "Hello, world!" message to the consol
|
||||
### Multimodal models
|
||||
|
||||
```
|
||||
>>> What's in this image? /Users/jmorgan/Desktop/smile.png
|
||||
ollama run llava "What's in this image? /Users/jmorgan/Desktop/smile.png"
|
||||
The image features a yellow smiley face, which is likely the central focus of the picture.
|
||||
```
|
||||
|
||||
@@ -299,6 +299,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||
- [AI Studio](https://github.com/MindWorkAI/AI-Studio)
|
||||
- [Sidellama](https://github.com/gyopak/sidellama) (browser-based LLM client)
|
||||
- [LLMStack](https://github.com/trypromptly/LLMStack) (No-code multi-agent framework to build LLM agents and workflows)
|
||||
- [BoltAI for Mac](https://boltai.com) (AI Chat Client for Mac)
|
||||
|
||||
### Terminal
|
||||
|
||||
@@ -337,6 +338,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||
### Libraries
|
||||
|
||||
- [LangChain](https://python.langchain.com/docs/integrations/llms/ollama) and [LangChain.js](https://js.langchain.com/docs/modules/model_io/models/llms/integrations/ollama) with [example](https://js.langchain.com/docs/use_cases/question_answering/local_retrieval_qa)
|
||||
- [Firebase Genkit](https://firebase.google.com/docs/genkit/plugins/ollama)
|
||||
- [LangChainGo](https://github.com/tmc/langchaingo/) with [example](https://github.com/tmc/langchaingo/tree/main/examples/ollama-completion-example)
|
||||
- [LangChain4j](https://github.com/langchain4j/langchain4j) with [example](https://github.com/langchain4j/langchain4j-examples/tree/main/ollama-examples/src/main/java)
|
||||
- [LangChainRust](https://github.com/Abraxas-365/langchain-rust) with [example](https://github.com/Abraxas-365/langchain-rust/blob/main/examples/llm_ollama.rs)
|
||||
|
||||
25
SECURITY.md
Normal file
25
SECURITY.md
Normal file
@@ -0,0 +1,25 @@
|
||||
# Security
|
||||
|
||||
The Ollama maintainer team takes security seriously and will actively work to resolve security issues.
|
||||
|
||||
## Reporting a vulnerability
|
||||
|
||||
If you discover a security vulnerability, please do not open a public issue. Instead, please report it by emailing hello@ollama.com. We ask that you give us sufficient time to investigate and address the vulnerability before disclosing it publicly.
|
||||
|
||||
Please include the following details in your report:
|
||||
- A description of the vulnerability
|
||||
- Steps to reproduce the issue
|
||||
- Your assessment of the potential impact
|
||||
- Any possible mitigations
|
||||
|
||||
## Security best practices
|
||||
|
||||
While the maintainer team does their best to secure Ollama, users are encouraged to implement their own security best practices, such as:
|
||||
|
||||
- Regularly updating to the latest version of Ollama
|
||||
- Securing access to hosted instances of Ollama
|
||||
- Monitoring systems for unusual activity
|
||||
|
||||
## Contact
|
||||
|
||||
For any other questions or concerns related to security, please contact us at hello@ollama.com
|
||||
@@ -20,7 +20,6 @@ import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"runtime"
|
||||
@@ -63,13 +62,8 @@ func checkError(resp *http.Response, body []byte) error {
|
||||
// If the variable is not specified, a default ollama host and port will be
|
||||
// used.
|
||||
func ClientFromEnvironment() (*Client, error) {
|
||||
ollamaHost := envconfig.Host
|
||||
|
||||
return &Client{
|
||||
base: &url.URL{
|
||||
Scheme: ollamaHost.Scheme,
|
||||
Host: net.JoinHostPort(ollamaHost.Host, ollamaHost.Port),
|
||||
},
|
||||
base: envconfig.Host(),
|
||||
http: http.DefaultClient,
|
||||
}, nil
|
||||
}
|
||||
|
||||
@@ -2,8 +2,6 @@ package api
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/ollama/ollama/envconfig"
|
||||
)
|
||||
|
||||
func TestClientFromEnvironment(t *testing.T) {
|
||||
@@ -33,7 +31,6 @@ func TestClientFromEnvironment(t *testing.T) {
|
||||
for k, v := range testCases {
|
||||
t.Run(k, func(t *testing.T) {
|
||||
t.Setenv("OLLAMA_HOST", v.value)
|
||||
envconfig.LoadConfig()
|
||||
|
||||
client, err := ClientFromEnvironment()
|
||||
if err != v.err {
|
||||
|
||||
@@ -267,6 +267,10 @@ type EmbedRequest struct {
|
||||
type EmbedResponse struct {
|
||||
Model string `json:"model"`
|
||||
Embeddings [][]float32 `json:"embeddings"`
|
||||
|
||||
TotalDuration time.Duration `json:"total_duration,omitempty"`
|
||||
LoadDuration time.Duration `json:"load_duration,omitempty"`
|
||||
PromptEvalCount int `json:"prompt_eval_count,omitempty"`
|
||||
}
|
||||
|
||||
// EmbeddingRequest is the request passed to [Client.Embeddings].
|
||||
|
||||
@@ -14,7 +14,7 @@ import (
|
||||
func InitLogging() {
|
||||
level := slog.LevelInfo
|
||||
|
||||
if envconfig.Debug {
|
||||
if envconfig.Debug() {
|
||||
level = slog.LevelDebug
|
||||
}
|
||||
|
||||
|
||||
19
cmd/cmd.go
19
cmd/cmd.go
@@ -362,9 +362,24 @@ func RunHandler(cmd *cobra.Command, args []string) error {
|
||||
|
||||
opts.MultiModal = slices.Contains(info.Details.Families, "clip")
|
||||
opts.ParentModel = info.Details.ParentModel
|
||||
opts.Messages = append(opts.Messages, info.Messages...)
|
||||
|
||||
if interactive {
|
||||
if err := loadModel(cmd, &opts); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, msg := range info.Messages {
|
||||
switch msg.Role {
|
||||
case "user":
|
||||
fmt.Printf(">>> %s\n", msg.Content)
|
||||
case "assistant":
|
||||
state := &displayResponseState{}
|
||||
displayResponse(msg.Content, opts.WordWrap, state)
|
||||
fmt.Println()
|
||||
fmt.Println()
|
||||
}
|
||||
}
|
||||
|
||||
return generateInteractive(cmd, opts)
|
||||
}
|
||||
return generate(cmd, opts)
|
||||
@@ -1076,7 +1091,7 @@ func RunServer(cmd *cobra.Command, _ []string) error {
|
||||
return err
|
||||
}
|
||||
|
||||
ln, err := net.Listen("tcp", net.JoinHostPort(envconfig.Host.Host, envconfig.Host.Port))
|
||||
ln, err := net.Listen("tcp", envconfig.Host().Host)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -48,29 +48,10 @@ func loadModel(cmd *cobra.Command, opts *runOptions) error {
|
||||
KeepAlive: opts.KeepAlive,
|
||||
}
|
||||
|
||||
return client.Chat(cmd.Context(), chatReq, func(resp api.ChatResponse) error {
|
||||
p.StopAndClear()
|
||||
for _, msg := range opts.Messages {
|
||||
switch msg.Role {
|
||||
case "user":
|
||||
fmt.Printf(">>> %s\n", msg.Content)
|
||||
case "assistant":
|
||||
state := &displayResponseState{}
|
||||
displayResponse(msg.Content, opts.WordWrap, state)
|
||||
fmt.Println()
|
||||
fmt.Println()
|
||||
}
|
||||
}
|
||||
return nil
|
||||
})
|
||||
return client.Chat(cmd.Context(), chatReq, func(api.ChatResponse) error { return nil })
|
||||
}
|
||||
|
||||
func generateInteractive(cmd *cobra.Command, opts runOptions) error {
|
||||
err := loadModel(cmd, &opts)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
usage := func() {
|
||||
fmt.Fprintln(os.Stderr, "Available Commands:")
|
||||
fmt.Fprintln(os.Stderr, " /set Set session variables")
|
||||
@@ -160,7 +141,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
|
||||
return err
|
||||
}
|
||||
|
||||
if envconfig.NoHistory {
|
||||
if envconfig.NoHistory() {
|
||||
scanner.HistoryDisable()
|
||||
}
|
||||
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
package envconfig
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"math"
|
||||
"net"
|
||||
"net/url"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
@@ -14,296 +14,16 @@ import (
|
||||
"time"
|
||||
)
|
||||
|
||||
type OllamaHost struct {
|
||||
Scheme string
|
||||
Host string
|
||||
Port string
|
||||
}
|
||||
|
||||
func (o OllamaHost) String() string {
|
||||
return fmt.Sprintf("%s://%s:%s", o.Scheme, o.Host, o.Port)
|
||||
}
|
||||
|
||||
var ErrInvalidHostPort = errors.New("invalid port specified in OLLAMA_HOST")
|
||||
|
||||
var (
|
||||
// Set via OLLAMA_ORIGINS in the environment
|
||||
AllowOrigins []string
|
||||
// Set via OLLAMA_DEBUG in the environment
|
||||
Debug bool
|
||||
// Experimental flash attention
|
||||
FlashAttention bool
|
||||
// Set via OLLAMA_HOST in the environment
|
||||
Host *OllamaHost
|
||||
// Set via OLLAMA_KEEP_ALIVE in the environment
|
||||
KeepAlive time.Duration
|
||||
// Set via OLLAMA_LLM_LIBRARY in the environment
|
||||
LLMLibrary string
|
||||
// Set via OLLAMA_MAX_LOADED_MODELS in the environment
|
||||
MaxRunners int
|
||||
// Set via OLLAMA_MAX_QUEUE in the environment
|
||||
MaxQueuedRequests int
|
||||
// Set via OLLAMA_MODELS in the environment
|
||||
ModelsDir string
|
||||
// Set via OLLAMA_NOHISTORY in the environment
|
||||
NoHistory bool
|
||||
// Set via OLLAMA_NOPRUNE in the environment
|
||||
NoPrune bool
|
||||
// Set via OLLAMA_NUM_PARALLEL in the environment
|
||||
NumParallel int
|
||||
// Set via OLLAMA_RUNNERS_DIR in the environment
|
||||
RunnersDir string
|
||||
// Set via OLLAMA_SCHED_SPREAD in the environment
|
||||
SchedSpread bool
|
||||
// Set via OLLAMA_TMPDIR in the environment
|
||||
TmpDir string
|
||||
// Set via OLLAMA_INTEL_GPU in the environment
|
||||
IntelGpu bool
|
||||
|
||||
// Set via CUDA_VISIBLE_DEVICES in the environment
|
||||
CudaVisibleDevices string
|
||||
// Set via HIP_VISIBLE_DEVICES in the environment
|
||||
HipVisibleDevices string
|
||||
// Set via ROCR_VISIBLE_DEVICES in the environment
|
||||
RocrVisibleDevices string
|
||||
// Set via GPU_DEVICE_ORDINAL in the environment
|
||||
GpuDeviceOrdinal string
|
||||
// Set via HSA_OVERRIDE_GFX_VERSION in the environment
|
||||
HsaOverrideGfxVersion string
|
||||
)
|
||||
|
||||
type EnvVar struct {
|
||||
Name string
|
||||
Value any
|
||||
Description string
|
||||
}
|
||||
|
||||
func AsMap() map[string]EnvVar {
|
||||
ret := map[string]EnvVar{
|
||||
"OLLAMA_DEBUG": {"OLLAMA_DEBUG", Debug, "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
|
||||
"OLLAMA_FLASH_ATTENTION": {"OLLAMA_FLASH_ATTENTION", FlashAttention, "Enabled flash attention"},
|
||||
"OLLAMA_HOST": {"OLLAMA_HOST", Host, "IP Address for the ollama server (default 127.0.0.1:11434)"},
|
||||
"OLLAMA_KEEP_ALIVE": {"OLLAMA_KEEP_ALIVE", KeepAlive, "The duration that models stay loaded in memory (default \"5m\")"},
|
||||
"OLLAMA_LLM_LIBRARY": {"OLLAMA_LLM_LIBRARY", LLMLibrary, "Set LLM library to bypass autodetection"},
|
||||
"OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners, "Maximum number of loaded models per GPU"},
|
||||
"OLLAMA_MAX_QUEUE": {"OLLAMA_MAX_QUEUE", MaxQueuedRequests, "Maximum number of queued requests"},
|
||||
"OLLAMA_MODELS": {"OLLAMA_MODELS", ModelsDir, "The path to the models directory"},
|
||||
"OLLAMA_NOHISTORY": {"OLLAMA_NOHISTORY", NoHistory, "Do not preserve readline history"},
|
||||
"OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune, "Do not prune model blobs on startup"},
|
||||
"OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel, "Maximum number of parallel requests"},
|
||||
"OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", AllowOrigins, "A comma separated list of allowed origins"},
|
||||
"OLLAMA_RUNNERS_DIR": {"OLLAMA_RUNNERS_DIR", RunnersDir, "Location for runners"},
|
||||
"OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread, "Always schedule model across all GPUs"},
|
||||
"OLLAMA_TMPDIR": {"OLLAMA_TMPDIR", TmpDir, "Location for temporary files"},
|
||||
}
|
||||
if runtime.GOOS != "darwin" {
|
||||
ret["CUDA_VISIBLE_DEVICES"] = EnvVar{"CUDA_VISIBLE_DEVICES", CudaVisibleDevices, "Set which NVIDIA devices are visible"}
|
||||
ret["HIP_VISIBLE_DEVICES"] = EnvVar{"HIP_VISIBLE_DEVICES", HipVisibleDevices, "Set which AMD devices are visible"}
|
||||
ret["ROCR_VISIBLE_DEVICES"] = EnvVar{"ROCR_VISIBLE_DEVICES", RocrVisibleDevices, "Set which AMD devices are visible"}
|
||||
ret["GPU_DEVICE_ORDINAL"] = EnvVar{"GPU_DEVICE_ORDINAL", GpuDeviceOrdinal, "Set which AMD devices are visible"}
|
||||
ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", HsaOverrideGfxVersion, "Override the gfx used for all detected AMD GPUs"}
|
||||
ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGpu, "Enable experimental Intel GPU detection"}
|
||||
}
|
||||
return ret
|
||||
}
|
||||
|
||||
func Values() map[string]string {
|
||||
vals := make(map[string]string)
|
||||
for k, v := range AsMap() {
|
||||
vals[k] = fmt.Sprintf("%v", v.Value)
|
||||
}
|
||||
return vals
|
||||
}
|
||||
|
||||
var defaultAllowOrigins = []string{
|
||||
"localhost",
|
||||
"127.0.0.1",
|
||||
"0.0.0.0",
|
||||
}
|
||||
|
||||
// Clean quotes and spaces from the value
|
||||
func clean(key string) string {
|
||||
return strings.Trim(os.Getenv(key), "\"' ")
|
||||
}
|
||||
|
||||
func init() {
|
||||
// default values
|
||||
NumParallel = 0 // Autoselect
|
||||
MaxRunners = 0 // Autoselect
|
||||
MaxQueuedRequests = 512
|
||||
KeepAlive = 5 * time.Minute
|
||||
|
||||
LoadConfig()
|
||||
}
|
||||
|
||||
func LoadConfig() {
|
||||
if debug := clean("OLLAMA_DEBUG"); debug != "" {
|
||||
d, err := strconv.ParseBool(debug)
|
||||
if err == nil {
|
||||
Debug = d
|
||||
} else {
|
||||
Debug = true
|
||||
}
|
||||
}
|
||||
|
||||
if fa := clean("OLLAMA_FLASH_ATTENTION"); fa != "" {
|
||||
d, err := strconv.ParseBool(fa)
|
||||
if err == nil {
|
||||
FlashAttention = d
|
||||
}
|
||||
}
|
||||
|
||||
RunnersDir = clean("OLLAMA_RUNNERS_DIR")
|
||||
if runtime.GOOS == "windows" && RunnersDir == "" {
|
||||
// On Windows we do not carry the payloads inside the main executable
|
||||
appExe, err := os.Executable()
|
||||
if err != nil {
|
||||
slog.Error("failed to lookup executable path", "error", err)
|
||||
}
|
||||
|
||||
cwd, err := os.Getwd()
|
||||
if err != nil {
|
||||
slog.Error("failed to lookup working directory", "error", err)
|
||||
}
|
||||
|
||||
var paths []string
|
||||
for _, root := range []string{filepath.Dir(appExe), cwd} {
|
||||
paths = append(paths,
|
||||
root,
|
||||
filepath.Join(root, "windows-"+runtime.GOARCH),
|
||||
filepath.Join(root, "dist", "windows-"+runtime.GOARCH),
|
||||
)
|
||||
}
|
||||
|
||||
// Try a few variations to improve developer experience when building from source in the local tree
|
||||
for _, p := range paths {
|
||||
candidate := filepath.Join(p, "ollama_runners")
|
||||
_, err := os.Stat(candidate)
|
||||
if err == nil {
|
||||
RunnersDir = candidate
|
||||
break
|
||||
}
|
||||
}
|
||||
if RunnersDir == "" {
|
||||
slog.Error("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama_runners'")
|
||||
}
|
||||
}
|
||||
|
||||
TmpDir = clean("OLLAMA_TMPDIR")
|
||||
|
||||
LLMLibrary = clean("OLLAMA_LLM_LIBRARY")
|
||||
|
||||
if onp := clean("OLLAMA_NUM_PARALLEL"); onp != "" {
|
||||
val, err := strconv.Atoi(onp)
|
||||
if err != nil {
|
||||
slog.Error("invalid setting, ignoring", "OLLAMA_NUM_PARALLEL", onp, "error", err)
|
||||
} else {
|
||||
NumParallel = val
|
||||
}
|
||||
}
|
||||
|
||||
if nohistory := clean("OLLAMA_NOHISTORY"); nohistory != "" {
|
||||
NoHistory = true
|
||||
}
|
||||
|
||||
if spread := clean("OLLAMA_SCHED_SPREAD"); spread != "" {
|
||||
s, err := strconv.ParseBool(spread)
|
||||
if err == nil {
|
||||
SchedSpread = s
|
||||
} else {
|
||||
SchedSpread = true
|
||||
}
|
||||
}
|
||||
|
||||
if noprune := clean("OLLAMA_NOPRUNE"); noprune != "" {
|
||||
NoPrune = true
|
||||
}
|
||||
|
||||
if origins := clean("OLLAMA_ORIGINS"); origins != "" {
|
||||
AllowOrigins = strings.Split(origins, ",")
|
||||
}
|
||||
for _, allowOrigin := range defaultAllowOrigins {
|
||||
AllowOrigins = append(AllowOrigins,
|
||||
fmt.Sprintf("http://%s", allowOrigin),
|
||||
fmt.Sprintf("https://%s", allowOrigin),
|
||||
fmt.Sprintf("http://%s", net.JoinHostPort(allowOrigin, "*")),
|
||||
fmt.Sprintf("https://%s", net.JoinHostPort(allowOrigin, "*")),
|
||||
)
|
||||
}
|
||||
|
||||
AllowOrigins = append(AllowOrigins,
|
||||
"app://*",
|
||||
"file://*",
|
||||
"tauri://*",
|
||||
)
|
||||
|
||||
maxRunners := clean("OLLAMA_MAX_LOADED_MODELS")
|
||||
if maxRunners != "" {
|
||||
m, err := strconv.Atoi(maxRunners)
|
||||
if err != nil {
|
||||
slog.Error("invalid setting, ignoring", "OLLAMA_MAX_LOADED_MODELS", maxRunners, "error", err)
|
||||
} else {
|
||||
MaxRunners = m
|
||||
}
|
||||
}
|
||||
|
||||
if onp := os.Getenv("OLLAMA_MAX_QUEUE"); onp != "" {
|
||||
p, err := strconv.Atoi(onp)
|
||||
if err != nil || p <= 0 {
|
||||
slog.Error("invalid setting, ignoring", "OLLAMA_MAX_QUEUE", onp, "error", err)
|
||||
} else {
|
||||
MaxQueuedRequests = p
|
||||
}
|
||||
}
|
||||
|
||||
ka := clean("OLLAMA_KEEP_ALIVE")
|
||||
if ka != "" {
|
||||
loadKeepAlive(ka)
|
||||
}
|
||||
|
||||
var err error
|
||||
ModelsDir, err = getModelsDir()
|
||||
if err != nil {
|
||||
slog.Error("invalid setting", "OLLAMA_MODELS", ModelsDir, "error", err)
|
||||
}
|
||||
|
||||
Host, err = getOllamaHost()
|
||||
if err != nil {
|
||||
slog.Error("invalid setting", "OLLAMA_HOST", Host, "error", err, "using default port", Host.Port)
|
||||
}
|
||||
|
||||
if set, err := strconv.ParseBool(clean("OLLAMA_INTEL_GPU")); err == nil {
|
||||
IntelGpu = set
|
||||
}
|
||||
|
||||
CudaVisibleDevices = clean("CUDA_VISIBLE_DEVICES")
|
||||
HipVisibleDevices = clean("HIP_VISIBLE_DEVICES")
|
||||
RocrVisibleDevices = clean("ROCR_VISIBLE_DEVICES")
|
||||
GpuDeviceOrdinal = clean("GPU_DEVICE_ORDINAL")
|
||||
HsaOverrideGfxVersion = clean("HSA_OVERRIDE_GFX_VERSION")
|
||||
}
|
||||
|
||||
func getModelsDir() (string, error) {
|
||||
if models, exists := os.LookupEnv("OLLAMA_MODELS"); exists {
|
||||
return models, nil
|
||||
}
|
||||
home, err := os.UserHomeDir()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return filepath.Join(home, ".ollama", "models"), nil
|
||||
}
|
||||
|
||||
func getOllamaHost() (*OllamaHost, error) {
|
||||
// Host returns the scheme and host. Host can be configured via the OLLAMA_HOST environment variable.
|
||||
// Default is scheme "http" and host "127.0.0.1:11434"
|
||||
func Host() *url.URL {
|
||||
defaultPort := "11434"
|
||||
|
||||
hostVar := os.Getenv("OLLAMA_HOST")
|
||||
hostVar = strings.TrimSpace(strings.Trim(strings.TrimSpace(hostVar), "\"'"))
|
||||
|
||||
scheme, hostport, ok := strings.Cut(hostVar, "://")
|
||||
s := strings.TrimSpace(Var("OLLAMA_HOST"))
|
||||
scheme, hostport, ok := strings.Cut(s, "://")
|
||||
switch {
|
||||
case !ok:
|
||||
scheme, hostport = "http", hostVar
|
||||
scheme, hostport = "http", s
|
||||
case scheme == "http":
|
||||
defaultPort = "80"
|
||||
case scheme == "https":
|
||||
@@ -323,38 +43,242 @@ func getOllamaHost() (*OllamaHost, error) {
|
||||
}
|
||||
}
|
||||
|
||||
if portNum, err := strconv.ParseInt(port, 10, 32); err != nil || portNum > 65535 || portNum < 0 {
|
||||
return &OllamaHost{
|
||||
if n, err := strconv.ParseInt(port, 10, 32); err != nil || n > 65535 || n < 0 {
|
||||
slog.Warn("invalid port, using default", "port", port, "default", defaultPort)
|
||||
return &url.URL{
|
||||
Scheme: scheme,
|
||||
Host: host,
|
||||
Port: defaultPort,
|
||||
}, ErrInvalidHostPort
|
||||
Host: net.JoinHostPort(host, defaultPort),
|
||||
}
|
||||
}
|
||||
|
||||
return &OllamaHost{
|
||||
return &url.URL{
|
||||
Scheme: scheme,
|
||||
Host: host,
|
||||
Port: port,
|
||||
}, nil
|
||||
Host: net.JoinHostPort(host, port),
|
||||
}
|
||||
}
|
||||
|
||||
func loadKeepAlive(ka string) {
|
||||
v, err := strconv.Atoi(ka)
|
||||
// Origins returns a list of allowed origins. Origins can be configured via the OLLAMA_ORIGINS environment variable.
|
||||
func Origins() (origins []string) {
|
||||
if s := Var("OLLAMA_ORIGINS"); s != "" {
|
||||
origins = strings.Split(s, ",")
|
||||
}
|
||||
|
||||
for _, origin := range []string{"localhost", "127.0.0.1", "0.0.0.0"} {
|
||||
origins = append(origins,
|
||||
fmt.Sprintf("http://%s", origin),
|
||||
fmt.Sprintf("https://%s", origin),
|
||||
fmt.Sprintf("http://%s", net.JoinHostPort(origin, "*")),
|
||||
fmt.Sprintf("https://%s", net.JoinHostPort(origin, "*")),
|
||||
)
|
||||
}
|
||||
|
||||
origins = append(origins,
|
||||
"app://*",
|
||||
"file://*",
|
||||
"tauri://*",
|
||||
)
|
||||
|
||||
return origins
|
||||
}
|
||||
|
||||
// Models returns the path to the models directory. Models directory can be configured via the OLLAMA_MODELS environment variable.
|
||||
// Default is $HOME/.ollama/models
|
||||
func Models() string {
|
||||
if s := Var("OLLAMA_MODELS"); s != "" {
|
||||
return s
|
||||
}
|
||||
|
||||
home, err := os.UserHomeDir()
|
||||
if err != nil {
|
||||
d, err := time.ParseDuration(ka)
|
||||
if err == nil {
|
||||
if d < 0 {
|
||||
KeepAlive = time.Duration(math.MaxInt64)
|
||||
panic(err)
|
||||
}
|
||||
|
||||
return filepath.Join(home, ".ollama", "models")
|
||||
}
|
||||
|
||||
// KeepAlive returns the duration that models stay loaded in memory. KeepAlive can be configured via the OLLAMA_KEEP_ALIVE environment variable.
|
||||
// Negative values are treated as infinite. Zero is treated as no keep alive.
|
||||
// Default is 5 minutes.
|
||||
func KeepAlive() (keepAlive time.Duration) {
|
||||
keepAlive = 5 * time.Minute
|
||||
if s := Var("OLLAMA_KEEP_ALIVE"); s != "" {
|
||||
if d, err := time.ParseDuration(s); err == nil {
|
||||
keepAlive = d
|
||||
} else if n, err := strconv.ParseInt(s, 10, 64); err == nil {
|
||||
keepAlive = time.Duration(n) * time.Second
|
||||
}
|
||||
}
|
||||
|
||||
if keepAlive < 0 {
|
||||
return time.Duration(math.MaxInt64)
|
||||
}
|
||||
|
||||
return keepAlive
|
||||
}
|
||||
|
||||
func Bool(k string) func() bool {
|
||||
return func() bool {
|
||||
if s := Var(k); s != "" {
|
||||
b, err := strconv.ParseBool(s)
|
||||
if err != nil {
|
||||
return true
|
||||
}
|
||||
|
||||
return b
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
var (
|
||||
// Debug enabled additional debug information.
|
||||
Debug = Bool("OLLAMA_DEBUG")
|
||||
// FlashAttention enables the experimental flash attention feature.
|
||||
FlashAttention = Bool("OLLAMA_FLASH_ATTENTION")
|
||||
// NoHistory disables readline history.
|
||||
NoHistory = Bool("OLLAMA_NOHISTORY")
|
||||
// NoPrune disables pruning of model blobs on startup.
|
||||
NoPrune = Bool("OLLAMA_NOPRUNE")
|
||||
// SchedSpread allows scheduling models across all GPUs.
|
||||
SchedSpread = Bool("OLLAMA_SCHED_SPREAD")
|
||||
// IntelGPU enables experimental Intel GPU detection.
|
||||
IntelGPU = Bool("OLLAMA_INTEL_GPU")
|
||||
)
|
||||
|
||||
func String(s string) func() string {
|
||||
return func() string {
|
||||
return Var(s)
|
||||
}
|
||||
}
|
||||
|
||||
var (
|
||||
LLMLibrary = String("OLLAMA_LLM_LIBRARY")
|
||||
TmpDir = String("OLLAMA_TMPDIR")
|
||||
|
||||
CudaVisibleDevices = String("CUDA_VISIBLE_DEVICES")
|
||||
HipVisibleDevices = String("HIP_VISIBLE_DEVICES")
|
||||
RocrVisibleDevices = String("ROCR_VISIBLE_DEVICES")
|
||||
GpuDeviceOrdinal = String("GPU_DEVICE_ORDINAL")
|
||||
HsaOverrideGfxVersion = String("HSA_OVERRIDE_GFX_VERSION")
|
||||
)
|
||||
|
||||
func RunnersDir() (p string) {
|
||||
if p := Var("OLLAMA_RUNNERS_DIR"); p != "" {
|
||||
return p
|
||||
}
|
||||
|
||||
if runtime.GOOS != "windows" {
|
||||
return
|
||||
}
|
||||
|
||||
defer func() {
|
||||
if p == "" {
|
||||
slog.Error("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama_runners'")
|
||||
}
|
||||
}()
|
||||
|
||||
// On Windows we do not carry the payloads inside the main executable
|
||||
exe, err := os.Executable()
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
cwd, err := os.Getwd()
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
var paths []string
|
||||
for _, root := range []string{filepath.Dir(exe), cwd} {
|
||||
paths = append(paths,
|
||||
root,
|
||||
filepath.Join(root, "windows-"+runtime.GOARCH),
|
||||
filepath.Join(root, "dist", "windows-"+runtime.GOARCH),
|
||||
)
|
||||
}
|
||||
|
||||
// Try a few variations to improve developer experience when building from source in the local tree
|
||||
for _, path := range paths {
|
||||
candidate := filepath.Join(path, "ollama_runners")
|
||||
if _, err := os.Stat(candidate); err == nil {
|
||||
p = candidate
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
return p
|
||||
}
|
||||
|
||||
func Uint(key string, defaultValue uint) func() uint {
|
||||
return func() uint {
|
||||
if s := Var(key); s != "" {
|
||||
if n, err := strconv.ParseUint(s, 10, 64); err != nil {
|
||||
slog.Warn("invalid environment variable, using default", "key", key, "value", s, "default", defaultValue)
|
||||
} else {
|
||||
KeepAlive = d
|
||||
return uint(n)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
d := time.Duration(v) * time.Second
|
||||
if d < 0 {
|
||||
KeepAlive = time.Duration(math.MaxInt64)
|
||||
} else {
|
||||
KeepAlive = d
|
||||
}
|
||||
|
||||
return defaultValue
|
||||
}
|
||||
}
|
||||
|
||||
var (
|
||||
// NumParallel sets the number of parallel model requests. NumParallel can be configured via the OLLAMA_NUM_PARALLEL environment variable.
|
||||
NumParallel = Uint("OLLAMA_NUM_PARALLEL", 0)
|
||||
// MaxRunners sets the maximum number of loaded models. MaxRunners can be configured via the OLLAMA_MAX_LOADED_MODELS environment variable.
|
||||
MaxRunners = Uint("OLLAMA_MAX_LOADED_MODELS", 0)
|
||||
// MaxQueue sets the maximum number of queued requests. MaxQueue can be configured via the OLLAMA_MAX_QUEUE environment variable.
|
||||
MaxQueue = Uint("OLLAMA_MAX_QUEUE", 512)
|
||||
// MaxVRAM sets a maximum VRAM override in bytes. MaxVRAM can be configured via the OLLAMA_MAX_VRAM environment variable.
|
||||
MaxVRAM = Uint("OLLAMA_MAX_VRAM", 0)
|
||||
)
|
||||
|
||||
type EnvVar struct {
|
||||
Name string
|
||||
Value any
|
||||
Description string
|
||||
}
|
||||
|
||||
func AsMap() map[string]EnvVar {
|
||||
ret := map[string]EnvVar{
|
||||
"OLLAMA_DEBUG": {"OLLAMA_DEBUG", Debug(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
|
||||
"OLLAMA_FLASH_ATTENTION": {"OLLAMA_FLASH_ATTENTION", FlashAttention(), "Enabled flash attention"},
|
||||
"OLLAMA_HOST": {"OLLAMA_HOST", Host(), "IP Address for the ollama server (default 127.0.0.1:11434)"},
|
||||
"OLLAMA_KEEP_ALIVE": {"OLLAMA_KEEP_ALIVE", KeepAlive(), "The duration that models stay loaded in memory (default \"5m\")"},
|
||||
"OLLAMA_LLM_LIBRARY": {"OLLAMA_LLM_LIBRARY", LLMLibrary(), "Set LLM library to bypass autodetection"},
|
||||
"OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners(), "Maximum number of loaded models per GPU"},
|
||||
"OLLAMA_MAX_QUEUE": {"OLLAMA_MAX_QUEUE", MaxQueue(), "Maximum number of queued requests"},
|
||||
"OLLAMA_MODELS": {"OLLAMA_MODELS", Models(), "The path to the models directory"},
|
||||
"OLLAMA_NOHISTORY": {"OLLAMA_NOHISTORY", NoHistory(), "Do not preserve readline history"},
|
||||
"OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune(), "Do not prune model blobs on startup"},
|
||||
"OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel(), "Maximum number of parallel requests"},
|
||||
"OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", Origins(), "A comma separated list of allowed origins"},
|
||||
"OLLAMA_RUNNERS_DIR": {"OLLAMA_RUNNERS_DIR", RunnersDir(), "Location for runners"},
|
||||
"OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
|
||||
"OLLAMA_TMPDIR": {"OLLAMA_TMPDIR", TmpDir(), "Location for temporary files"},
|
||||
}
|
||||
if runtime.GOOS != "darwin" {
|
||||
ret["CUDA_VISIBLE_DEVICES"] = EnvVar{"CUDA_VISIBLE_DEVICES", CudaVisibleDevices(), "Set which NVIDIA devices are visible"}
|
||||
ret["HIP_VISIBLE_DEVICES"] = EnvVar{"HIP_VISIBLE_DEVICES", HipVisibleDevices(), "Set which AMD devices are visible"}
|
||||
ret["ROCR_VISIBLE_DEVICES"] = EnvVar{"ROCR_VISIBLE_DEVICES", RocrVisibleDevices(), "Set which AMD devices are visible"}
|
||||
ret["GPU_DEVICE_ORDINAL"] = EnvVar{"GPU_DEVICE_ORDINAL", GpuDeviceOrdinal(), "Set which AMD devices are visible"}
|
||||
ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", HsaOverrideGfxVersion(), "Override the gfx used for all detected AMD GPUs"}
|
||||
ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGPU(), "Enable experimental Intel GPU detection"}
|
||||
}
|
||||
return ret
|
||||
}
|
||||
|
||||
func Values() map[string]string {
|
||||
vals := make(map[string]string)
|
||||
for k, v := range AsMap() {
|
||||
vals[k] = fmt.Sprintf("%v", v.Value)
|
||||
}
|
||||
return vals
|
||||
}
|
||||
|
||||
// Var returns an environment variable stripped of leading and trailing quotes or spaces
|
||||
func Var(key string) string {
|
||||
return strings.Trim(strings.TrimSpace(os.Getenv(key)), "\"'")
|
||||
}
|
||||
|
||||
@@ -1,87 +1,234 @@
|
||||
package envconfig
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"math"
|
||||
"net"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
"github.com/google/go-cmp/cmp"
|
||||
)
|
||||
|
||||
func TestConfig(t *testing.T) {
|
||||
Debug = false // Reset whatever was loaded in init()
|
||||
t.Setenv("OLLAMA_DEBUG", "")
|
||||
LoadConfig()
|
||||
require.False(t, Debug)
|
||||
t.Setenv("OLLAMA_DEBUG", "false")
|
||||
LoadConfig()
|
||||
require.False(t, Debug)
|
||||
t.Setenv("OLLAMA_DEBUG", "1")
|
||||
LoadConfig()
|
||||
require.True(t, Debug)
|
||||
t.Setenv("OLLAMA_FLASH_ATTENTION", "1")
|
||||
LoadConfig()
|
||||
require.True(t, FlashAttention)
|
||||
t.Setenv("OLLAMA_KEEP_ALIVE", "")
|
||||
LoadConfig()
|
||||
require.Equal(t, 5*time.Minute, KeepAlive)
|
||||
t.Setenv("OLLAMA_KEEP_ALIVE", "3")
|
||||
LoadConfig()
|
||||
require.Equal(t, 3*time.Second, KeepAlive)
|
||||
t.Setenv("OLLAMA_KEEP_ALIVE", "1h")
|
||||
LoadConfig()
|
||||
require.Equal(t, 1*time.Hour, KeepAlive)
|
||||
t.Setenv("OLLAMA_KEEP_ALIVE", "-1s")
|
||||
LoadConfig()
|
||||
require.Equal(t, time.Duration(math.MaxInt64), KeepAlive)
|
||||
t.Setenv("OLLAMA_KEEP_ALIVE", "-1")
|
||||
LoadConfig()
|
||||
require.Equal(t, time.Duration(math.MaxInt64), KeepAlive)
|
||||
}
|
||||
|
||||
func TestClientFromEnvironment(t *testing.T) {
|
||||
type testCase struct {
|
||||
func TestHost(t *testing.T) {
|
||||
cases := map[string]struct {
|
||||
value string
|
||||
expect string
|
||||
err error
|
||||
}{
|
||||
"empty": {"", "127.0.0.1:11434"},
|
||||
"only address": {"1.2.3.4", "1.2.3.4:11434"},
|
||||
"only port": {":1234", ":1234"},
|
||||
"address and port": {"1.2.3.4:1234", "1.2.3.4:1234"},
|
||||
"hostname": {"example.com", "example.com:11434"},
|
||||
"hostname and port": {"example.com:1234", "example.com:1234"},
|
||||
"zero port": {":0", ":0"},
|
||||
"too large port": {":66000", ":11434"},
|
||||
"too small port": {":-1", ":11434"},
|
||||
"ipv6 localhost": {"[::1]", "[::1]:11434"},
|
||||
"ipv6 world open": {"[::]", "[::]:11434"},
|
||||
"ipv6 no brackets": {"::1", "[::1]:11434"},
|
||||
"ipv6 + port": {"[::1]:1337", "[::1]:1337"},
|
||||
"extra space": {" 1.2.3.4 ", "1.2.3.4:11434"},
|
||||
"extra quotes": {"\"1.2.3.4\"", "1.2.3.4:11434"},
|
||||
"extra space+quotes": {" \" 1.2.3.4 \" ", "1.2.3.4:11434"},
|
||||
"extra single quotes": {"'1.2.3.4'", "1.2.3.4:11434"},
|
||||
"http": {"http://1.2.3.4", "1.2.3.4:80"},
|
||||
"http port": {"http://1.2.3.4:4321", "1.2.3.4:4321"},
|
||||
"https": {"https://1.2.3.4", "1.2.3.4:443"},
|
||||
"https port": {"https://1.2.3.4:4321", "1.2.3.4:4321"},
|
||||
}
|
||||
|
||||
hostTestCases := map[string]*testCase{
|
||||
"empty": {value: "", expect: "127.0.0.1:11434"},
|
||||
"only address": {value: "1.2.3.4", expect: "1.2.3.4:11434"},
|
||||
"only port": {value: ":1234", expect: ":1234"},
|
||||
"address and port": {value: "1.2.3.4:1234", expect: "1.2.3.4:1234"},
|
||||
"hostname": {value: "example.com", expect: "example.com:11434"},
|
||||
"hostname and port": {value: "example.com:1234", expect: "example.com:1234"},
|
||||
"zero port": {value: ":0", expect: ":0"},
|
||||
"too large port": {value: ":66000", err: ErrInvalidHostPort},
|
||||
"too small port": {value: ":-1", err: ErrInvalidHostPort},
|
||||
"ipv6 localhost": {value: "[::1]", expect: "[::1]:11434"},
|
||||
"ipv6 world open": {value: "[::]", expect: "[::]:11434"},
|
||||
"ipv6 no brackets": {value: "::1", expect: "[::1]:11434"},
|
||||
"ipv6 + port": {value: "[::1]:1337", expect: "[::1]:1337"},
|
||||
"extra space": {value: " 1.2.3.4 ", expect: "1.2.3.4:11434"},
|
||||
"extra quotes": {value: "\"1.2.3.4\"", expect: "1.2.3.4:11434"},
|
||||
"extra space+quotes": {value: " \" 1.2.3.4 \" ", expect: "1.2.3.4:11434"},
|
||||
"extra single quotes": {value: "'1.2.3.4'", expect: "1.2.3.4:11434"},
|
||||
}
|
||||
|
||||
for k, v := range hostTestCases {
|
||||
t.Run(k, func(t *testing.T) {
|
||||
t.Setenv("OLLAMA_HOST", v.value)
|
||||
LoadConfig()
|
||||
|
||||
oh, err := getOllamaHost()
|
||||
if err != v.err {
|
||||
t.Fatalf("expected %s, got %s", v.err, err)
|
||||
}
|
||||
|
||||
if err == nil {
|
||||
host := net.JoinHostPort(oh.Host, oh.Port)
|
||||
assert.Equal(t, v.expect, host, fmt.Sprintf("%s: expected %s, got %s", k, v.expect, host))
|
||||
for name, tt := range cases {
|
||||
t.Run(name, func(t *testing.T) {
|
||||
t.Setenv("OLLAMA_HOST", tt.value)
|
||||
if host := Host(); host.Host != tt.expect {
|
||||
t.Errorf("%s: expected %s, got %s", name, tt.expect, host.Host)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestOrigins(t *testing.T) {
|
||||
cases := []struct {
|
||||
value string
|
||||
expect []string
|
||||
}{
|
||||
{"", []string{
|
||||
"http://localhost",
|
||||
"https://localhost",
|
||||
"http://localhost:*",
|
||||
"https://localhost:*",
|
||||
"http://127.0.0.1",
|
||||
"https://127.0.0.1",
|
||||
"http://127.0.0.1:*",
|
||||
"https://127.0.0.1:*",
|
||||
"http://0.0.0.0",
|
||||
"https://0.0.0.0",
|
||||
"http://0.0.0.0:*",
|
||||
"https://0.0.0.0:*",
|
||||
"app://*",
|
||||
"file://*",
|
||||
"tauri://*",
|
||||
}},
|
||||
{"http://10.0.0.1", []string{
|
||||
"http://10.0.0.1",
|
||||
"http://localhost",
|
||||
"https://localhost",
|
||||
"http://localhost:*",
|
||||
"https://localhost:*",
|
||||
"http://127.0.0.1",
|
||||
"https://127.0.0.1",
|
||||
"http://127.0.0.1:*",
|
||||
"https://127.0.0.1:*",
|
||||
"http://0.0.0.0",
|
||||
"https://0.0.0.0",
|
||||
"http://0.0.0.0:*",
|
||||
"https://0.0.0.0:*",
|
||||
"app://*",
|
||||
"file://*",
|
||||
"tauri://*",
|
||||
}},
|
||||
{"http://172.16.0.1,https://192.168.0.1", []string{
|
||||
"http://172.16.0.1",
|
||||
"https://192.168.0.1",
|
||||
"http://localhost",
|
||||
"https://localhost",
|
||||
"http://localhost:*",
|
||||
"https://localhost:*",
|
||||
"http://127.0.0.1",
|
||||
"https://127.0.0.1",
|
||||
"http://127.0.0.1:*",
|
||||
"https://127.0.0.1:*",
|
||||
"http://0.0.0.0",
|
||||
"https://0.0.0.0",
|
||||
"http://0.0.0.0:*",
|
||||
"https://0.0.0.0:*",
|
||||
"app://*",
|
||||
"file://*",
|
||||
"tauri://*",
|
||||
}},
|
||||
{"http://totally.safe,http://definitely.legit", []string{
|
||||
"http://totally.safe",
|
||||
"http://definitely.legit",
|
||||
"http://localhost",
|
||||
"https://localhost",
|
||||
"http://localhost:*",
|
||||
"https://localhost:*",
|
||||
"http://127.0.0.1",
|
||||
"https://127.0.0.1",
|
||||
"http://127.0.0.1:*",
|
||||
"https://127.0.0.1:*",
|
||||
"http://0.0.0.0",
|
||||
"https://0.0.0.0",
|
||||
"http://0.0.0.0:*",
|
||||
"https://0.0.0.0:*",
|
||||
"app://*",
|
||||
"file://*",
|
||||
"tauri://*",
|
||||
}},
|
||||
}
|
||||
for _, tt := range cases {
|
||||
t.Run(tt.value, func(t *testing.T) {
|
||||
t.Setenv("OLLAMA_ORIGINS", tt.value)
|
||||
|
||||
if diff := cmp.Diff(Origins(), tt.expect); diff != "" {
|
||||
t.Errorf("%s: mismatch (-want +got):\n%s", tt.value, diff)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestBool(t *testing.T) {
|
||||
cases := map[string]bool{
|
||||
"": false,
|
||||
"true": true,
|
||||
"false": false,
|
||||
"1": true,
|
||||
"0": false,
|
||||
// invalid values
|
||||
"random": true,
|
||||
"something": true,
|
||||
}
|
||||
|
||||
for k, v := range cases {
|
||||
t.Run(k, func(t *testing.T) {
|
||||
t.Setenv("OLLAMA_BOOL", k)
|
||||
if b := Bool("OLLAMA_BOOL")(); b != v {
|
||||
t.Errorf("%s: expected %t, got %t", k, v, b)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestUint(t *testing.T) {
|
||||
cases := map[string]uint{
|
||||
"0": 0,
|
||||
"1": 1,
|
||||
"1337": 1337,
|
||||
// default values
|
||||
"": 11434,
|
||||
"-1": 11434,
|
||||
"0o10": 11434,
|
||||
"0x10": 11434,
|
||||
"string": 11434,
|
||||
}
|
||||
|
||||
for k, v := range cases {
|
||||
t.Run(k, func(t *testing.T) {
|
||||
t.Setenv("OLLAMA_UINT", k)
|
||||
if i := Uint("OLLAMA_UINT", 11434)(); i != v {
|
||||
t.Errorf("%s: expected %d, got %d", k, v, i)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestKeepAlive(t *testing.T) {
|
||||
cases := map[string]time.Duration{
|
||||
"": 5 * time.Minute,
|
||||
"1s": time.Second,
|
||||
"1m": time.Minute,
|
||||
"1h": time.Hour,
|
||||
"5m0s": 5 * time.Minute,
|
||||
"1h2m3s": 1*time.Hour + 2*time.Minute + 3*time.Second,
|
||||
"0": time.Duration(0),
|
||||
"60": 60 * time.Second,
|
||||
"120": 2 * time.Minute,
|
||||
"3600": time.Hour,
|
||||
"-0": time.Duration(0),
|
||||
"-1": time.Duration(math.MaxInt64),
|
||||
"-1m": time.Duration(math.MaxInt64),
|
||||
// invalid values
|
||||
" ": 5 * time.Minute,
|
||||
"???": 5 * time.Minute,
|
||||
"1d": 5 * time.Minute,
|
||||
"1y": 5 * time.Minute,
|
||||
"1w": 5 * time.Minute,
|
||||
}
|
||||
|
||||
for tt, expect := range cases {
|
||||
t.Run(tt, func(t *testing.T) {
|
||||
t.Setenv("OLLAMA_KEEP_ALIVE", tt)
|
||||
if actual := KeepAlive(); actual != expect {
|
||||
t.Errorf("%s: expected %s, got %s", tt, expect, actual)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestVar(t *testing.T) {
|
||||
cases := map[string]string{
|
||||
"value": "value",
|
||||
" value ": "value",
|
||||
" 'value' ": "value",
|
||||
` "value" `: "value",
|
||||
" ' value ' ": " value ",
|
||||
` " value " `: " value ",
|
||||
}
|
||||
|
||||
for k, v := range cases {
|
||||
t.Run(k, func(t *testing.T) {
|
||||
t.Setenv("OLLAMA_VAR", k)
|
||||
if s := Var("OLLAMA_VAR"); s != v {
|
||||
t.Errorf("%s: expected %q, got %q", k, v, s)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
@@ -35,7 +35,7 @@ func main() {
|
||||
|
||||
ctx := context.Background()
|
||||
req := &api.ChatRequest{
|
||||
Model: "llama3",
|
||||
Model: "llama3.1",
|
||||
Messages: messages,
|
||||
}
|
||||
|
||||
|
||||
@@ -16,7 +16,7 @@ func main() {
|
||||
|
||||
// By default, GenerateRequest is streaming.
|
||||
req := &api.GenerateRequest{
|
||||
Model: "gemma",
|
||||
Model: "gemma2",
|
||||
Prompt: "how many planets are there?",
|
||||
}
|
||||
|
||||
|
||||
@@ -15,7 +15,7 @@ func main() {
|
||||
}
|
||||
|
||||
req := &api.GenerateRequest{
|
||||
Model: "gemma",
|
||||
Model: "gemma2",
|
||||
Prompt: "how many planets are there?",
|
||||
|
||||
// set streaming to false
|
||||
|
||||
@@ -4,6 +4,14 @@ This example provides an interface for asking questions to a PDF document.
|
||||
|
||||
## Setup
|
||||
|
||||
1. Ensure you have the `llama3.1` model installed:
|
||||
|
||||
```
|
||||
ollama pull llama3.1
|
||||
```
|
||||
|
||||
2. Install the Python Requirements.
|
||||
|
||||
```
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
@@ -51,7 +51,7 @@ while True:
|
||||
template=template,
|
||||
)
|
||||
|
||||
llm = Ollama(model="llama3:8b", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]))
|
||||
llm = Ollama(model="llama3.1", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]))
|
||||
qa_chain = RetrievalQA.from_chain_type(
|
||||
llm,
|
||||
retriever=vectorstore.as_retriever(),
|
||||
|
||||
@@ -4,10 +4,10 @@ This example summarizes the website, [https://ollama.com/blog/run-llama2-uncenso
|
||||
|
||||
## Running the Example
|
||||
|
||||
1. Ensure you have the `llama2` model installed:
|
||||
1. Ensure you have the `llama3.1` model installed:
|
||||
|
||||
```bash
|
||||
ollama pull llama2
|
||||
ollama pull llama3.1
|
||||
```
|
||||
|
||||
2. Install the Python Requirements.
|
||||
|
||||
@@ -5,8 +5,8 @@ from langchain.chains.summarize import load_summarize_chain
|
||||
loader = WebBaseLoader("https://ollama.com/blog/run-llama2-uncensored-locally")
|
||||
docs = loader.load()
|
||||
|
||||
llm = Ollama(model="llama3")
|
||||
llm = Ollama(model="llama3.1")
|
||||
chain = load_summarize_chain(llm, chain_type="stuff")
|
||||
|
||||
result = chain.invoke(docs)
|
||||
result = chain.invoke(docs)
|
||||
print(result)
|
||||
|
||||
@@ -4,10 +4,10 @@ This example is a basic "hello world" of using LangChain with Ollama.
|
||||
|
||||
## Running the Example
|
||||
|
||||
1. Ensure you have the `llama3` model installed:
|
||||
1. Ensure you have the `llama3.1` model installed:
|
||||
|
||||
```bash
|
||||
ollama pull llama3
|
||||
ollama pull llama3.1
|
||||
```
|
||||
|
||||
2. Install the Python Requirements.
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
from langchain.llms import Ollama
|
||||
|
||||
input = input("What is your question?")
|
||||
llm = Ollama(model="llama3")
|
||||
llm = Ollama(model="llama3.1")
|
||||
res = llm.predict(input)
|
||||
print (res)
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
FROM llama3
|
||||
FROM llama3.1
|
||||
PARAMETER temperature 1
|
||||
SYSTEM """
|
||||
You are Mario from super mario bros, acting as an assistant.
|
||||
|
||||
@@ -2,12 +2,12 @@
|
||||
|
||||
# Example character: Mario
|
||||
|
||||
This example shows how to create a basic character using Llama3 as the base model.
|
||||
This example shows how to create a basic character using Llama3.1 as the base model.
|
||||
|
||||
To run this example:
|
||||
|
||||
1. Download the Modelfile
|
||||
2. `ollama pull llama3` to get the base model used in the model file.
|
||||
2. `ollama pull llama3.1` to get the base model used in the model file.
|
||||
3. `ollama create NAME -f ./Modelfile`
|
||||
4. `ollama run NAME`
|
||||
|
||||
@@ -18,7 +18,7 @@ Ask it some questions like "Who are you?" or "Is Peach in trouble again?"
|
||||
What the model file looks like:
|
||||
|
||||
```
|
||||
FROM llama3
|
||||
FROM llama3.1
|
||||
PARAMETER temperature 1
|
||||
SYSTEM """
|
||||
You are Mario from Super Mario Bros, acting as an assistant.
|
||||
|
||||
@@ -4,7 +4,7 @@ imageName = input("Enter the name of the image: ")
|
||||
client = docker.from_env()
|
||||
s = requests.Session()
|
||||
output=""
|
||||
with s.post('http://localhost:11434/api/generate', json={'model': 'dockerit', 'prompt': inputDescription}, stream=True) as r:
|
||||
with s.post('http://localhost:11434/api/generate', json={'model': 'mattw/dockerit', 'prompt': inputDescription}, stream=True) as r:
|
||||
for line in r.iter_lines():
|
||||
if line:
|
||||
j = json.loads(line)
|
||||
|
||||
@@ -2,7 +2,7 @@ import requests
|
||||
import json
|
||||
import random
|
||||
|
||||
model = "llama3"
|
||||
model = "llama3.1"
|
||||
template = {
|
||||
"firstName": "",
|
||||
"lastName": "",
|
||||
|
||||
@@ -12,7 +12,7 @@ countries = [
|
||||
"France",
|
||||
]
|
||||
country = random.choice(countries)
|
||||
model = "llama3"
|
||||
model = "llama3.1"
|
||||
|
||||
prompt = f"generate one realistically believable sample data set of a persons first name, last name, address in {country}, and phone number. Do not use common names. Respond using JSON. Key names should have no backslashes, values should use plain ascii with no special characters."
|
||||
|
||||
|
||||
@@ -6,10 +6,10 @@ There are two python scripts in this example. `randomaddresses.py` generates ran
|
||||
|
||||
## Running the Example
|
||||
|
||||
1. Ensure you have the `llama3` model installed:
|
||||
1. Ensure you have the `llama3.1` model installed:
|
||||
|
||||
```bash
|
||||
ollama pull llama3
|
||||
ollama pull llama3.1
|
||||
```
|
||||
|
||||
2. Install the Python Requirements.
|
||||
|
||||
@@ -2,7 +2,7 @@ import json
|
||||
import requests
|
||||
|
||||
# NOTE: ollama must be running for this to work, start the ollama app or run `ollama serve`
|
||||
model = "llama3" # TODO: update this for whatever model you wish to use
|
||||
model = "llama3.1" # TODO: update this for whatever model you wish to use
|
||||
|
||||
|
||||
def chat(messages):
|
||||
|
||||
@@ -4,10 +4,10 @@ The **chat** endpoint is one of two ways to generate text from an LLM with Ollam
|
||||
|
||||
## Running the Example
|
||||
|
||||
1. Ensure you have the `llama3` model installed:
|
||||
1. Ensure you have the `llama3.1` model installed:
|
||||
|
||||
```bash
|
||||
ollama pull llama3
|
||||
ollama pull llama3.1
|
||||
```
|
||||
|
||||
2. Install the Python Requirements.
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import * as readline from "readline";
|
||||
|
||||
const model = "llama3";
|
||||
const model = "llama3.1";
|
||||
type Message = {
|
||||
role: "assistant" | "user" | "system";
|
||||
content: string;
|
||||
|
||||
@@ -61,9 +61,9 @@ func AMDGetGPUInfo() []RocmGPUInfo {
|
||||
|
||||
// Determine if the user has already pre-selected which GPUs to look at, then ignore the others
|
||||
var visibleDevices []string
|
||||
hipVD := envconfig.HipVisibleDevices // zero based index only
|
||||
rocrVD := envconfig.RocrVisibleDevices // zero based index or UUID, but consumer cards seem to not support UUID
|
||||
gpuDO := envconfig.GpuDeviceOrdinal // zero based index
|
||||
hipVD := envconfig.HipVisibleDevices() // zero based index only
|
||||
rocrVD := envconfig.RocrVisibleDevices() // zero based index or UUID, but consumer cards seem to not support UUID
|
||||
gpuDO := envconfig.GpuDeviceOrdinal() // zero based index
|
||||
switch {
|
||||
// TODO is this priorty order right?
|
||||
case hipVD != "":
|
||||
@@ -76,7 +76,7 @@ func AMDGetGPUInfo() []RocmGPUInfo {
|
||||
visibleDevices = strings.Split(gpuDO, ",")
|
||||
}
|
||||
|
||||
gfxOverride := envconfig.HsaOverrideGfxVersion
|
||||
gfxOverride := envconfig.HsaOverrideGfxVersion()
|
||||
var supported []string
|
||||
libDir := ""
|
||||
|
||||
|
||||
@@ -53,7 +53,7 @@ func AMDGetGPUInfo() []RocmGPUInfo {
|
||||
}
|
||||
|
||||
var supported []string
|
||||
gfxOverride := envconfig.HsaOverrideGfxVersion
|
||||
gfxOverride := envconfig.HsaOverrideGfxVersion()
|
||||
if gfxOverride == "" {
|
||||
supported, err = GetSupportedGFX(libDir)
|
||||
if err != nil {
|
||||
|
||||
@@ -26,7 +26,7 @@ func PayloadsDir() (string, error) {
|
||||
defer lock.Unlock()
|
||||
var err error
|
||||
if payloadsDir == "" {
|
||||
runnersDir := envconfig.RunnersDir
|
||||
runnersDir := envconfig.RunnersDir()
|
||||
|
||||
if runnersDir != "" {
|
||||
payloadsDir = runnersDir
|
||||
@@ -35,7 +35,7 @@ func PayloadsDir() (string, error) {
|
||||
|
||||
// The remainder only applies on non-windows where we still carry payloads in the main executable
|
||||
cleanupTmpDirs()
|
||||
tmpDir := envconfig.TmpDir
|
||||
tmpDir := envconfig.TmpDir()
|
||||
if tmpDir == "" {
|
||||
tmpDir, err = os.MkdirTemp("", "ollama")
|
||||
if err != nil {
|
||||
@@ -105,7 +105,7 @@ func cleanupTmpDirs() {
|
||||
func Cleanup() {
|
||||
lock.Lock()
|
||||
defer lock.Unlock()
|
||||
runnersDir := envconfig.RunnersDir
|
||||
runnersDir := envconfig.RunnersDir()
|
||||
if payloadsDir != "" && runnersDir == "" && runtime.GOOS != "windows" {
|
||||
// We want to fully clean up the tmpdir parent of the payloads dir
|
||||
tmpDir := filepath.Clean(filepath.Join(payloadsDir, ".."))
|
||||
|
||||
12
gpu/gpu.go
12
gpu/gpu.go
@@ -230,8 +230,8 @@ func GetGPUInfo() GpuInfoList {
|
||||
|
||||
// On windows we bundle the nvidia library one level above the runner dir
|
||||
depPath := ""
|
||||
if runtime.GOOS == "windows" && envconfig.RunnersDir != "" {
|
||||
depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir), "cuda")
|
||||
if runtime.GOOS == "windows" && envconfig.RunnersDir() != "" {
|
||||
depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir()), "cuda")
|
||||
}
|
||||
|
||||
// Load ALL libraries
|
||||
@@ -302,12 +302,12 @@ func GetGPUInfo() GpuInfoList {
|
||||
}
|
||||
|
||||
// Intel
|
||||
if envconfig.IntelGpu {
|
||||
if envconfig.IntelGPU() {
|
||||
oHandles = initOneAPIHandles()
|
||||
// On windows we bundle the oneapi library one level above the runner dir
|
||||
depPath = ""
|
||||
if runtime.GOOS == "windows" && envconfig.RunnersDir != "" {
|
||||
depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir), "oneapi")
|
||||
if runtime.GOOS == "windows" && envconfig.RunnersDir() != "" {
|
||||
depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir()), "oneapi")
|
||||
}
|
||||
|
||||
for d := range oHandles.oneapi.num_drivers {
|
||||
@@ -611,7 +611,7 @@ func LoadOneapiMgmt(oneapiLibPaths []string) (int, *C.oneapi_handle_t, string) {
|
||||
}
|
||||
|
||||
func getVerboseState() C.uint16_t {
|
||||
if envconfig.Debug {
|
||||
if envconfig.Debug() {
|
||||
return C.uint16_t(1)
|
||||
}
|
||||
return C.uint16_t(0)
|
||||
|
||||
@@ -45,14 +45,7 @@ func TestUnicodeModelDir(t *testing.T) {
|
||||
defer os.RemoveAll(modelDir)
|
||||
slog.Info("unicode", "OLLAMA_MODELS", modelDir)
|
||||
|
||||
oldModelsDir := os.Getenv("OLLAMA_MODELS")
|
||||
if oldModelsDir == "" {
|
||||
defer os.Unsetenv("OLLAMA_MODELS")
|
||||
} else {
|
||||
defer os.Setenv("OLLAMA_MODELS", oldModelsDir)
|
||||
}
|
||||
err = os.Setenv("OLLAMA_MODELS", modelDir)
|
||||
require.NoError(t, err)
|
||||
t.Setenv("OLLAMA_MODELS", modelDir)
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
|
||||
defer cancel()
|
||||
|
||||
@@ -5,14 +5,16 @@ package integration
|
||||
import (
|
||||
"context"
|
||||
"log/slog"
|
||||
"os"
|
||||
"strconv"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/envconfig"
|
||||
"github.com/ollama/ollama/format"
|
||||
)
|
||||
|
||||
func TestMultiModelConcurrency(t *testing.T) {
|
||||
@@ -106,13 +108,16 @@ func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) {
|
||||
|
||||
// Stress the system if we know how much VRAM it has, and attempt to load more models than will fit
|
||||
func TestMultiModelStress(t *testing.T) {
|
||||
vram := os.Getenv("OLLAMA_MAX_VRAM") // TODO - discover actual VRAM
|
||||
if vram == "" {
|
||||
s := os.Getenv("OLLAMA_MAX_VRAM") // TODO - discover actual VRAM
|
||||
if s == "" {
|
||||
t.Skip("OLLAMA_MAX_VRAM not specified, can't pick the right models for the stress test")
|
||||
}
|
||||
max, err := strconv.ParseUint(vram, 10, 64)
|
||||
require.NoError(t, err)
|
||||
const MB = uint64(1024 * 1024)
|
||||
|
||||
maxVram, err := strconv.ParseUint(s, 10, 64)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
type model struct {
|
||||
name string
|
||||
size uint64 // Approximate amount of VRAM they typically use when fully loaded in VRAM
|
||||
@@ -121,83 +126,82 @@ func TestMultiModelStress(t *testing.T) {
|
||||
smallModels := []model{
|
||||
{
|
||||
name: "orca-mini",
|
||||
size: 2992 * MB,
|
||||
size: 2992 * format.MebiByte,
|
||||
},
|
||||
{
|
||||
name: "phi",
|
||||
size: 2616 * MB,
|
||||
size: 2616 * format.MebiByte,
|
||||
},
|
||||
{
|
||||
name: "gemma:2b",
|
||||
size: 2364 * MB,
|
||||
size: 2364 * format.MebiByte,
|
||||
},
|
||||
{
|
||||
name: "stable-code:3b",
|
||||
size: 2608 * MB,
|
||||
size: 2608 * format.MebiByte,
|
||||
},
|
||||
{
|
||||
name: "starcoder2:3b",
|
||||
size: 2166 * MB,
|
||||
size: 2166 * format.MebiByte,
|
||||
},
|
||||
}
|
||||
mediumModels := []model{
|
||||
{
|
||||
name: "llama2",
|
||||
size: 5118 * MB,
|
||||
size: 5118 * format.MebiByte,
|
||||
},
|
||||
{
|
||||
name: "mistral",
|
||||
size: 4620 * MB,
|
||||
size: 4620 * format.MebiByte,
|
||||
},
|
||||
{
|
||||
name: "orca-mini:7b",
|
||||
size: 5118 * MB,
|
||||
size: 5118 * format.MebiByte,
|
||||
},
|
||||
{
|
||||
name: "dolphin-mistral",
|
||||
size: 4620 * MB,
|
||||
size: 4620 * format.MebiByte,
|
||||
},
|
||||
{
|
||||
name: "gemma:7b",
|
||||
size: 5000 * MB,
|
||||
size: 5000 * format.MebiByte,
|
||||
},
|
||||
{
|
||||
name: "codellama:7b",
|
||||
size: 5118 * format.MebiByte,
|
||||
},
|
||||
// TODO - uncomment this once #3565 is merged and this is rebased on it
|
||||
// {
|
||||
// name: "codellama:7b",
|
||||
// size: 5118 * MB,
|
||||
// },
|
||||
}
|
||||
|
||||
// These seem to be too slow to be useful...
|
||||
// largeModels := []model{
|
||||
// {
|
||||
// name: "llama2:13b",
|
||||
// size: 7400 * MB,
|
||||
// size: 7400 * format.MebiByte,
|
||||
// },
|
||||
// {
|
||||
// name: "codellama:13b",
|
||||
// size: 7400 * MB,
|
||||
// size: 7400 * format.MebiByte,
|
||||
// },
|
||||
// {
|
||||
// name: "orca-mini:13b",
|
||||
// size: 7400 * MB,
|
||||
// size: 7400 * format.MebiByte,
|
||||
// },
|
||||
// {
|
||||
// name: "gemma:7b",
|
||||
// size: 5000 * MB,
|
||||
// size: 5000 * format.MebiByte,
|
||||
// },
|
||||
// {
|
||||
// name: "starcoder2:15b",
|
||||
// size: 9100 * MB,
|
||||
// size: 9100 * format.MebiByte,
|
||||
// },
|
||||
// }
|
||||
|
||||
var chosenModels []model
|
||||
switch {
|
||||
case max < 10000*MB:
|
||||
case maxVram < 10000*format.MebiByte:
|
||||
slog.Info("selecting small models")
|
||||
chosenModels = smallModels
|
||||
// case max < 30000*MB:
|
||||
// case maxVram < 30000*format.MebiByte:
|
||||
default:
|
||||
slog.Info("selecting medium models")
|
||||
chosenModels = mediumModels
|
||||
@@ -226,15 +230,15 @@ func TestMultiModelStress(t *testing.T) {
|
||||
}
|
||||
|
||||
var wg sync.WaitGroup
|
||||
consumed := uint64(256 * MB) // Assume some baseline usage
|
||||
consumed := uint64(256 * format.MebiByte) // Assume some baseline usage
|
||||
for i := 0; i < len(req); i++ {
|
||||
// Always get at least 2 models, but dont' overshoot VRAM too much or we'll take too long
|
||||
if i > 1 && consumed > max {
|
||||
slog.Info("achieved target vram exhaustion", "count", i, "vramMB", max/1024/1024, "modelsMB", consumed/1024/1024)
|
||||
if i > 1 && consumed > vram {
|
||||
slog.Info("achieved target vram exhaustion", "count", i, "vram", format.HumanBytes2(vram), "models", format.HumanBytes2(consumed))
|
||||
break
|
||||
}
|
||||
consumed += chosenModels[i].size
|
||||
slog.Info("target vram", "count", i, "vramMB", max/1024/1024, "modelsMB", consumed/1024/1024)
|
||||
slog.Info("target vram", "count", i, "vram", format.HumanBytes2(vram), "models", format.HumanBytes2(consumed))
|
||||
|
||||
wg.Add(1)
|
||||
go func(i int) {
|
||||
|
||||
@@ -69,6 +69,10 @@ func TestAllMiniLMEmbed(t *testing.T) {
|
||||
if !floatsEqual32(res.Embeddings[0][0], 0.010071031) {
|
||||
t.Fatalf("expected 0.010071031, got %.8f", res.Embeddings[0][0])
|
||||
}
|
||||
|
||||
if res.PromptEvalCount != 8 {
|
||||
t.Fatalf("expected 8 prompt tokens, got %d", res.PromptEvalCount)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAllMiniLMBatchEmbed(t *testing.T) {
|
||||
@@ -97,6 +101,10 @@ func TestAllMiniLMBatchEmbed(t *testing.T) {
|
||||
if !floatsEqual32(res.Embeddings[0][0], 0.010071031) || !floatsEqual32(res.Embeddings[1][0], -0.009802706) {
|
||||
t.Fatalf("expected 0.010071031 and -0.009802706, got %.8f and %.8f", res.Embeddings[0][0], res.Embeddings[1][0])
|
||||
}
|
||||
|
||||
if res.PromptEvalCount != 16 {
|
||||
t.Fatalf("expected 16 prompt tokens, got %d", res.PromptEvalCount)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAllMiniLMEmbedTruncate(t *testing.T) {
|
||||
|
||||
@@ -5,7 +5,6 @@ package integration
import (
"context"
"errors"
"fmt"
"log/slog"
"os"
"strconv"
@@ -14,8 +13,10 @@ import (
"testing"
"time"

"github.com/ollama/ollama/api"
"github.com/stretchr/testify/require"

"github.com/ollama/ollama/api"
"github.com/ollama/ollama/envconfig"
)

func TestMaxQueue(t *testing.T) {
@@ -27,13 +28,10 @@ func TestMaxQueue(t *testing.T) {
// Note: This test can be quite slow when running in CPU mode, so keep the threadCount low unless your on GPU
// Also note that by default Darwin can't sustain > ~128 connections without adjusting limits
threadCount := 32
mq := os.Getenv("OLLAMA_MAX_QUEUE")
if mq != "" {
var err error
threadCount, err = strconv.Atoi(mq)
require.NoError(t, err)
if maxQueue := envconfig.MaxQueue(); maxQueue != 0 {
threadCount = maxQueue
} else {
os.Setenv("OLLAMA_MAX_QUEUE", fmt.Sprintf("%d", threadCount))
t.Setenv("OLLAMA_MAX_QUEUE", strconv.Itoa(threadCount))
}

req := api.GenerateRequest{

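The queue test now asks `envconfig.MaxQueue()` for the limit instead of parsing `OLLAMA_MAX_QUEUE` itself. A hedged sketch of the accessor-function style these commits move toward; the helper and its default value below are assumptions for illustration only:

```go
package envconfig

import (
	"os"
	"strconv"
)

// MaxQueue sketches the accessor pattern: instead of a package-level
// variable populated once at startup, the setting is read from the
// environment whenever it is asked for, which is what lets tests override
// it with t.Setenv.
func MaxQueue() int {
	if v := os.Getenv("OLLAMA_MAX_QUEUE"); v != "" {
		if n, err := strconv.Atoi(v); err == nil && n > 0 {
			return n
		}
	}
	return 512 // assumed default, for illustration only
}
```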
7  llm/ext_server/server.cpp  vendored
@@ -1221,6 +1221,7 @@ struct llama_server_context
res.result_json = json
{
{"embedding", std::vector<float>(embd, embd + n_embd)},
{"timings", slot.get_formated_timings()},
};
}
}
@@ -3203,11 +3204,15 @@ int main(int argc, char **argv) {

responses = result.result_json.value("results", std::vector<json>{result.result_json});
json embeddings = json::array();

int prompt_n = 0;
for (auto & elem : responses) {
embeddings.push_back(elem.at("embedding"));
prompt_n += elem.at("timings").at("prompt_n").get<int>();
}

// send the result
json embedding_res = json{{"embedding", embeddings}};
json embedding_res = json{{"embedding", embeddings}, {"prompt_n", prompt_n}};
return res.set_content(embedding_res.dump(), "application/json; charset=utf-8");
}
});

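The server.cpp hunk collects one embedding per batched input and sums the per-result `prompt_n` values into a single count for the response. A Go paraphrase of that aggregation (names invented; the real change is C++):

```go
package main

import "fmt"

// result stands in for one per-input embedding result returned by the
// runner; the field names are illustrative.
type result struct {
	Embedding []float32
	PromptN   int
}

// aggregate gathers every embedding and sums the per-result prompt token
// counts into one total, mirroring the prompt_n accumulation above.
func aggregate(results []result) ([][]float32, int) {
	var embeddings [][]float32
	promptN := 0
	for _, r := range results {
		embeddings = append(embeddings, r.Embedding)
		promptN += r.PromptN
	}
	return embeddings, promptN
}

func main() {
	embs, n := aggregate([]result{
		{Embedding: []float32{0.1, 0.2}, PromptN: 8},
		{Embedding: []float32{0.3, 0.4}, PromptN: 8},
	})
	fmt.Println(len(embs), n) // 2 16
}
```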
@@ -8,14 +8,14 @@ import (
"testing"

"github.com/ollama/ollama/api"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/gpu"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

func TestEstimateGPULayers(t *testing.T) {
envconfig.Debug = true
t.Setenv("OLLAMA_DEBUG", "1")

modelName := "dummy"
f, err := os.CreateTemp(t.TempDir(), modelName)
require.NoError(t, err)

20  llm/patches/10-params.diff  Normal file
@@ -0,0 +1,20 @@
diff --git a/src/llama.cpp b/src/llama.cpp
index a207451f..fba6b175 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -4969,6 +4969,7 @@ static void llm_load_hparams(
hparams.attn_soft_cap = true;

switch (hparams.n_layer) {
+ case 26: model.type = e_model::MODEL_2B; break;
case 42: model.type = e_model::MODEL_9B; break;
case 46: model.type = e_model::MODEL_27B; break;
default: model.type = e_model::MODEL_UNKNOWN;
@@ -11736,6 +11737,7 @@ struct llm_build_context {

// ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
switch (model.type) {
+ case e_model::MODEL_2B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break;
case e_model::MODEL_9B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break;
case e_model::MODEL_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break;
default: GGML_ABORT("fatal error");

43  llm/patches/11-phi3-sliding-window.diff  Normal file
@@ -0,0 +1,43 @@
From 6eedae4cf2fcc8015dac79cb3f28f61fcabacab2 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Wed, 31 Jul 2024 14:57:04 -0700
Subject: [PATCH] phi3 sliding window

---
src/llama.cpp | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index a207451f..f2872d4e 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -4893,7 +4893,7 @@ static void llm_load_hparams(
} break;
case LLM_ARCH_PHI3:
{
- ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+ ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

switch (hparams.n_layer) {
@@ -10762,7 +10762,7 @@ struct llm_build_context {
struct ggml_tensor * inp_pos = build_inp_pos();

// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
- struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa();
+ struct ggml_tensor * KQ_mask = hparams.n_swa > 0 ? build_inp_KQ_mask_swa() : build_inp_KQ_mask();

for (int il = 0; il < n_layer; ++il) {
auto residual = inpL;
@@ -10820,7 +10820,7 @@ struct llm_build_context {

cur = llm_build_kv(ctx0, lctx, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask_swa, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
}

if (il == n_layer - 1) {
--
2.45.2

@@ -33,7 +33,7 @@ type LlamaServer interface {
Ping(ctx context.Context) error
WaitUntilRunning(ctx context.Context) error
Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error
Embed(ctx context.Context, input []string) ([][]float32, error)
Embed(ctx context.Context, input []string) (*EmbedResponse, error)
Tokenize(ctx context.Context, content string) ([]int, error)
Detokenize(ctx context.Context, tokens []int) (string, error)
Close() error
@@ -163,7 +163,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
} else {
servers = serversForGpu(gpus[0]) // All GPUs in the list are matching Library and Variant
}
demandLib := envconfig.LLMLibrary
demandLib := envconfig.LLMLibrary()
if demandLib != "" {
serverPath := availableServers[demandLib]
if serverPath == "" {
@@ -195,7 +195,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
params = append(params, "--n-gpu-layers", fmt.Sprintf("%d", opts.NumGPU))
}

if envconfig.Debug {
if envconfig.Debug() {
params = append(params, "--verbose")
}

@@ -221,7 +221,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
params = append(params, "--memory-f32")
}

flashAttnEnabled := envconfig.FlashAttention
flashAttnEnabled := envconfig.FlashAttention()

for _, g := range gpus {
// only cuda (compute capability 7+) and metal support flash attention
@@ -382,7 +382,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
}

slog.Info("starting llama server", "cmd", s.cmd.String())
if envconfig.Debug {
if envconfig.Debug() {
filteredEnv := []string{}
for _, ev := range s.cmd.Env {
if strings.HasPrefix(ev, "CUDA_") ||
@@ -879,10 +879,11 @@ type EmbedRequest struct {
}

type EmbedResponse struct {
Embedding [][]float32 `json:"embedding"`
Embedding [][]float32 `json:"embedding"`
PromptEvalCount int `json:"prompt_n"`
}

func (s *llmServer) Embed(ctx context.Context, input []string) ([][]float32, error) {
func (s *llmServer) Embed(ctx context.Context, input []string) (*EmbedResponse, error) {
if err := s.sem.Acquire(ctx, 1); err != nil {
slog.Error("Failed to acquire semaphore", "error", err)
return nil, err
@@ -924,12 +925,12 @@ func (s *llmServer) Embed(ctx context.Context, input []string) ([][]float32, err
return nil, fmt.Errorf("%s", body)
}

var embedding EmbedResponse
if err := json.Unmarshal(body, &embedding); err != nil {
var e EmbedResponse
if err := json.Unmarshal(body, &e); err != nil {
return nil, fmt.Errorf("unmarshal tokenize response: %w", err)
}

return embedding.Embedding, nil
return &e, nil
}

type TokenizeRequest struct {

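`Embed` now returns a structured response rather than a bare slice, so the prompt token count travels with the embeddings. A small sketch of decoding a payload of that shape; the field names follow the diff, while the sample JSON is invented:

```go
package main

import (
	"encoding/json"
	"fmt"
)

// embedResponse mirrors the updated response shape: the runner reports how
// many prompt tokens it evaluated alongside the embeddings themselves.
type embedResponse struct {
	Embedding       [][]float32 `json:"embedding"`
	PromptEvalCount int         `json:"prompt_n"`
}

func main() {
	body := []byte(`{"embedding": [[0.01, -0.02], [0.03, 0.04]], "prompt_n": 16}`)

	var e embedResponse
	if err := json.Unmarshal(body, &e); err != nil {
		panic(err)
	}
	fmt.Println(len(e.Embedding), e.PromptEvalCount) // 2 16
}
```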
@@ -61,6 +61,36 @@ type blobDownloadPart struct {
*blobDownload `json:"-"`
}

type jsonBlobDownloadPart struct {
N int
Offset int64
Size int64
Completed int64
}

func (p *blobDownloadPart) MarshalJSON() ([]byte, error) {
return json.Marshal(jsonBlobDownloadPart{
N: p.N,
Offset: p.Offset,
Size: p.Size,
Completed: p.Completed.Load(),
})
}

func (p *blobDownloadPart) UnmarshalJSON(b []byte) error {
var j jsonBlobDownloadPart
if err := json.Unmarshal(b, &j); err != nil {
return err
}
*p = blobDownloadPart{
N: j.N,
Offset: j.Offset,
Size: j.Size,
}
p.Completed.Store(j.Completed)
return nil
}

const (
numDownloadParts = 64
minDownloadPartSize int64 = 100 * format.MegaByte

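`blobDownloadPart` gains custom JSON marshaling because its `Completed` field is updated atomically by concurrent download goroutines, so it is mirrored through a plain struct for serialization. A self-contained sketch of the same pattern with simplified, invented fields:

```go
package main

import (
	"encoding/json"
	"fmt"
	"sync/atomic"
)

// part stands in for blobDownloadPart: Completed is an atomic counter and
// cannot be marshaled directly, so a plain mirror struct carries its value.
type part struct {
	N         int
	Completed atomic.Int64
}

type jsonPart struct {
	N         int
	Completed int64
}

func (p *part) MarshalJSON() ([]byte, error) {
	return json.Marshal(jsonPart{N: p.N, Completed: p.Completed.Load()})
}

func (p *part) UnmarshalJSON(b []byte) error {
	var j jsonPart
	if err := json.Unmarshal(b, &j); err != nil {
		return err
	}
	p.N = j.N
	p.Completed.Store(j.Completed)
	return nil
}

func main() {
	var p part
	p.N = 3
	p.Completed.Store(42)

	b, _ := json.Marshal(&p) // uses MarshalJSON via the *part receiver
	fmt.Println(string(b))   // {"N":3,"Completed":42}

	var q part
	_ = json.Unmarshal(b, &q)
	fmt.Println(q.N, q.Completed.Load()) // 3 42
}
```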
@@ -70,7 +70,7 @@ type Model struct {
License []string
Digest string
Options map[string]interface{}
Messages []Message
Messages []api.Message

Template *template.Template
}
@@ -191,11 +191,6 @@ func (m *Model) String() string {
return modelfile.String()
}

type Message struct {
Role string `json:"role"`
Content string `json:"content"`
}

type ConfigV2 struct {
ModelFormat string `json:"model_format"`
ModelFamily string `json:"model_family"`
@@ -646,7 +641,7 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
return err
}

if !envconfig.NoPrune && old != nil {
if !envconfig.NoPrune() && old != nil {
if err := old.RemoveLayers(); err != nil {
return err
}
@@ -885,7 +880,7 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
// build deleteMap to prune unused layers
deleteMap := make(map[string]struct{})

if !envconfig.NoPrune {
if !envconfig.NoPrune() {
manifest, _, err = GetManifest(mp)
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err

@@ -7,7 +7,6 @@ import (
"slices"
"testing"

"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/types/model"
)

@@ -108,7 +107,6 @@ func TestManifests(t *testing.T) {
t.Run(n, func(t *testing.T) {
d := t.TempDir()
t.Setenv("OLLAMA_MODELS", d)
envconfig.LoadConfig()

for _, p := range wants.ps {
createManifest(t, d, p)

@@ -105,9 +105,7 @@ func (mp ModelPath) GetShortTagname() string {

// GetManifestPath returns the path to the manifest file for the given model path, it is up to the caller to create the directory if it does not exist.
func (mp ModelPath) GetManifestPath() (string, error) {
dir := envconfig.ModelsDir

return filepath.Join(dir, "manifests", mp.Registry, mp.Namespace, mp.Repository, mp.Tag), nil
return filepath.Join(envconfig.Models(), "manifests", mp.Registry, mp.Namespace, mp.Repository, mp.Tag), nil
}

func (mp ModelPath) BaseURL() *url.URL {
@@ -118,9 +116,7 @@ func (mp ModelPath) BaseURL() *url.URL {
}

func GetManifestPath() (string, error) {
dir := envconfig.ModelsDir

path := filepath.Join(dir, "manifests")
path := filepath.Join(envconfig.Models(), "manifests")
if err := os.MkdirAll(path, 0o755); err != nil {
return "", err
}
@@ -129,8 +125,6 @@ func GetManifestPath() (string, error) {
}

func GetBlobsPath(digest string) (string, error) {
dir := envconfig.ModelsDir

// only accept actual sha256 digests
pattern := "^sha256[:-][0-9a-fA-F]{64}$"
re := regexp.MustCompile(pattern)
@@ -140,7 +134,7 @@ func GetBlobsPath(digest string) (string, error) {
}

digest = strings.ReplaceAll(digest, ":", "-")
path := filepath.Join(dir, "blobs", digest)
path := filepath.Join(envconfig.Models(), "blobs", digest)
dirPath := filepath.Dir(path)
if digest == "" {
dirPath = path

@@ -7,8 +7,6 @@ import (

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"

"github.com/ollama/ollama/envconfig"
)

func TestGetBlobsPath(t *testing.T) {
@@ -63,7 +61,6 @@ func TestGetBlobsPath(t *testing.T) {
for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
t.Setenv("OLLAMA_MODELS", dir)
envconfig.LoadConfig()

got, err := GetBlobsPath(tc.digest)

@@ -164,17 +164,6 @@ func (s *Server) GenerateHandler(c *gin.Context) {
}
}

var b bytes.Buffer
if req.Context != nil {
s, err := r.Detokenize(c.Request.Context(), req.Context)
if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}

b.WriteString(s)
}

var values template.Values
if req.Suffix != "" {
values.Prompt = prompt
@@ -187,6 +176,10 @@ func (s *Server) GenerateHandler(c *gin.Context) {
msgs = append(msgs, api.Message{Role: "system", Content: m.System})
}

if req.Context == nil {
msgs = append(msgs, m.Messages...)
}

for _, i := range images {
msgs = append(msgs, api.Message{Role: "user", Content: fmt.Sprintf("[img-%d]", i.ID)})
}
@@ -194,11 +187,22 @@ func (s *Server) GenerateHandler(c *gin.Context) {
values.Messages = append(msgs, api.Message{Role: "user", Content: req.Prompt})
}

var b bytes.Buffer
if err := tmpl.Execute(&b, values); err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}

if req.Context != nil {
s, err := r.Detokenize(c.Request.Context(), req.Context)
if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}

b.WriteString(s)
}

prompt = b.String()
}

@@ -284,6 +288,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
}

func (s *Server) EmbedHandler(c *gin.Context) {
checkpointStart := time.Now()
var req api.EmbedRequest
err := c.ShouldBindJSON(&req)
switch {
@@ -332,6 +337,8 @@ func (s *Server) EmbedHandler(c *gin.Context) {
return
}

checkpointLoaded := time.Now()

kvData, err := getKVData(m.ModelPath, false)
if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
@@ -370,13 +377,16 @@ func (s *Server) EmbedHandler(c *gin.Context) {
return
}

for i, e := range embeddings {
embeddings[i] = normalize(e)
for i, e := range embeddings.Embedding {
embeddings.Embedding[i] = normalize(e)
}

resp := api.EmbedResponse{
Model: req.Model,
Embeddings: embeddings,
Model: req.Model,
Embeddings: embeddings.Embedding,
TotalDuration: time.Since(checkpointStart),
LoadDuration: checkpointLoaded.Sub(checkpointStart),
PromptEvalCount: embeddings.PromptEvalCount,
}
c.JSON(http.StatusOK, resp)
}
@@ -428,9 +438,9 @@ func (s *Server) EmbeddingsHandler(c *gin.Context) {
return
}

embedding := make([]float64, len(embeddings[0]))
embedding := make([]float64, len(embeddings.Embedding[0]))

for i, v := range embeddings[0] {
for i, v := range embeddings.Embedding[0] {
embedding[i] = float64(v)
}

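`EmbedHandler` runs each returned vector through `normalize` before building the response. A minimal sketch of unit-length (L2) normalization, assuming that is what the helper does:

```go
package main

import (
	"fmt"
	"math"
)

// normalize scales a vector to unit length; a zero vector is returned as-is.
func normalize(vec []float32) []float32 {
	var sum float64
	for _, v := range vec {
		sum += float64(v) * float64(v)
	}
	if sum == 0 {
		return vec
	}
	inv := float32(1 / math.Sqrt(sum))
	out := make([]float32, len(vec))
	for i, v := range vec {
		out[i] = v * inv
	}
	return out
}

func main() {
	fmt.Println(normalize([]float32{3, 4})) // [0.6 0.8]
}
```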
@@ -1047,7 +1057,7 @@ func (s *Server) GenerateRoutes() http.Handler {
for _, prop := range openAIProperties {
config.AllowHeaders = append(config.AllowHeaders, "x-stainless-"+prop)
}
config.AllowOrigins = envconfig.AllowOrigins
config.AllowOrigins = envconfig.Origins()

r := gin.Default()
r.Use(
@@ -1092,7 +1102,7 @@ func (s *Server) GenerateRoutes() http.Handler {

func Serve(ln net.Listener) error {
level := slog.LevelInfo
if envconfig.Debug {
if envconfig.Debug() {
level = slog.LevelDebug
}

@@ -1120,7 +1130,7 @@ func Serve(ln net.Listener) error {
return err
}

if !envconfig.NoPrune {
if !envconfig.NoPrune() {
// clean up unused layers and manifests
if err := PruneLayers(); err != nil {
return err
@@ -1323,11 +1333,12 @@ func (s *Server) ChatHandler(c *gin.Context) {
return
}

msgs := append(m.Messages, req.Messages...)
if req.Messages[0].Role != "system" && m.System != "" {
req.Messages = append([]api.Message{{Role: "system", Content: m.System}}, req.Messages...)
msgs = append([]api.Message{{Role: "system", Content: m.System}}, msgs...)
}

prompt, images, err := chatPrompt(c.Request.Context(), m, r.Tokenize, opts, req.Messages, req.Tools)
prompt, images, err := chatPrompt(c.Request.Context(), m, r.Tokenize, opts, msgs, req.Tools)
if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return

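`ChatHandler` now builds the conversation from the Modelfile's stored messages plus the request messages, prepending the model's system prompt only when the caller did not send one. A small sketch of that assembly order with invented types:

```go
package main

import "fmt"

type message struct {
	Role, Content string
}

// buildMessages mirrors the ChatHandler change: Modelfile messages come
// first, the request messages follow, and the model's system prompt is
// prepended only when the caller did not supply a system message.
func buildMessages(modelMsgs, reqMsgs []message, system string) []message {
	msgs := append(append([]message{}, modelMsgs...), reqMsgs...)
	if len(reqMsgs) > 0 && reqMsgs[0].Role != "system" && system != "" {
		msgs = append([]message{{Role: "system", Content: system}}, msgs...)
	}
	return msgs
}

func main() {
	out := buildMessages(nil, []message{{Role: "user", Content: "hi"}}, "be brief")
	fmt.Println(out[0].Role, "/", out[1].Role) // system / user
}
```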
@@ -15,7 +15,6 @@ import (

"github.com/gin-gonic/gin"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/llm"
)

@@ -89,7 +88,6 @@ func TestCreateFromBin(t *testing.T) {

p := t.TempDir()
t.Setenv("OLLAMA_MODELS", p)
envconfig.LoadConfig()

var s Server
w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -117,7 +115,6 @@ func TestCreateFromModel(t *testing.T) {

p := t.TempDir()
t.Setenv("OLLAMA_MODELS", p)
envconfig.LoadConfig()
var s Server

w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -160,7 +157,6 @@ func TestCreateRemovesLayers(t *testing.T) {

p := t.TempDir()
t.Setenv("OLLAMA_MODELS", p)
envconfig.LoadConfig()
var s Server

w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -209,7 +205,6 @@ func TestCreateUnsetsSystem(t *testing.T) {

p := t.TempDir()
t.Setenv("OLLAMA_MODELS", p)
envconfig.LoadConfig()
var s Server

w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -267,7 +262,6 @@ func TestCreateMergeParameters(t *testing.T) {

p := t.TempDir()
t.Setenv("OLLAMA_MODELS", p)
envconfig.LoadConfig()
var s Server

w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -372,7 +366,6 @@ func TestCreateReplacesMessages(t *testing.T) {

p := t.TempDir()
t.Setenv("OLLAMA_MODELS", p)
envconfig.LoadConfig()
var s Server

w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -450,7 +443,6 @@ func TestCreateTemplateSystem(t *testing.T) {

p := t.TempDir()
t.Setenv("OLLAMA_MODELS", p)
envconfig.LoadConfig()
var s Server

w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -534,7 +526,6 @@ func TestCreateLicenses(t *testing.T) {

p := t.TempDir()
t.Setenv("OLLAMA_MODELS", p)
envconfig.LoadConfig()
var s Server

w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -582,7 +573,6 @@ func TestCreateDetectTemplate(t *testing.T) {

p := t.TempDir()
t.Setenv("OLLAMA_MODELS", p)
envconfig.LoadConfig()
var s Server

t.Run("matched", func(t *testing.T) {

@@ -10,7 +10,6 @@ import (

"github.com/gin-gonic/gin"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/types/model"
)

@@ -19,7 +18,6 @@ func TestDelete(t *testing.T) {

p := t.TempDir()
t.Setenv("OLLAMA_MODELS", p)
envconfig.LoadConfig()

var s Server

@@ -9,14 +9,12 @@ import (

"github.com/gin-gonic/gin"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/envconfig"
)

func TestList(t *testing.T) {
gin.SetMode(gin.TestMode)

t.Setenv("OLLAMA_MODELS", t.TempDir())
envconfig.LoadConfig()

expectNames := []string{
"mistral:7b-instruct-q4_0",

@@ -19,7 +19,6 @@ import (
"github.com/stretchr/testify/require"

"github.com/ollama/ollama/api"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/openai"
"github.com/ollama/ollama/parser"
@@ -347,7 +346,6 @@ func Test_Routes(t *testing.T) {
}

t.Setenv("OLLAMA_MODELS", t.TempDir())
envconfig.LoadConfig()

s := &Server{}
router := s.GenerateRoutes()
@@ -378,7 +376,6 @@ func Test_Routes(t *testing.T) {

func TestCase(t *testing.T) {
t.Setenv("OLLAMA_MODELS", t.TempDir())
envconfig.LoadConfig()

cases := []string{
"mistral",
@@ -458,7 +455,6 @@ func TestCase(t *testing.T) {

func TestShow(t *testing.T) {
t.Setenv("OLLAMA_MODELS", t.TempDir())
envconfig.LoadConfig()

var s Server

@@ -5,9 +5,11 @@ import (
"errors"
"fmt"
"log/slog"
"os"
"reflect"
"runtime"
"sort"
"strconv"
"strings"
"sync"
"time"
@@ -59,11 +61,12 @@ var defaultParallel = 4
var ErrMaxQueue = fmt.Errorf("server busy, please try again. maximum pending requests exceeded")

func InitScheduler(ctx context.Context) *Scheduler {
maxQueue := envconfig.MaxQueue()
sched := &Scheduler{
pendingReqCh: make(chan *LlmRequest, envconfig.MaxQueuedRequests),
finishedReqCh: make(chan *LlmRequest, envconfig.MaxQueuedRequests),
expiredCh: make(chan *runnerRef, envconfig.MaxQueuedRequests),
unloadedCh: make(chan interface{}, envconfig.MaxQueuedRequests),
pendingReqCh: make(chan *LlmRequest, maxQueue),
finishedReqCh: make(chan *LlmRequest, maxQueue),
expiredCh: make(chan *runnerRef, maxQueue),
unloadedCh: make(chan interface{}, maxQueue),
loaded: make(map[string]*runnerRef),
newServerFn: llm.NewLlamaServer,
getGpuFn: gpu.GetGPUInfo,
@@ -126,7 +129,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
slog.Debug("pending request cancelled or timed out, skipping scheduling")
continue
}
numParallel := envconfig.NumParallel
numParallel := int(envconfig.NumParallel())
// TODO (jmorganca): multimodal models don't support parallel yet
// see https://github.com/ollama/ollama/issues/4165
if len(pending.model.ProjectorPaths) > 0 && numParallel != 1 {
@@ -148,7 +151,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
pending.useLoadedRunner(runner, s.finishedReqCh)
break
}
} else if envconfig.MaxRunners > 0 && loadedCount >= envconfig.MaxRunners {
} else if envconfig.MaxRunners() > 0 && loadedCount >= int(envconfig.MaxRunners()) {
slog.Debug("max runners achieved, unloading one to make room", "runner_count", loadedCount)
runnerToExpire = s.findRunnerToUnload()
} else {
@@ -161,7 +164,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
gpus = s.getGpuFn()
}

if envconfig.MaxRunners <= 0 {
if envconfig.MaxRunners() <= 0 {
// No user specified MaxRunners, so figure out what automatic setting to use
// If all GPUs have reliable free memory reporting, defaultModelsPerGPU * the number of GPUs
// if any GPU has unreliable free memory reporting, 1x the number of GPUs
@@ -173,11 +176,13 @@ func (s *Scheduler) processPending(ctx context.Context) {
}
}
if allReliable {
envconfig.MaxRunners = defaultModelsPerGPU * len(gpus)
// HACK
os.Setenv("OLLAMA_MAX_LOADED_MODELS", strconv.Itoa(defaultModelsPerGPU*len(gpus)))
slog.Debug("updating default concurrency", "OLLAMA_MAX_LOADED_MODELS", envconfig.MaxRunners, "gpu_count", len(gpus))
} else {
// HACK
os.Setenv("OLLAMA_MAX_LOADED_MODELS", strconv.Itoa(len(gpus)))
slog.Info("one or more GPUs detected that are unable to accurately report free memory - disabling default concurrency")
envconfig.MaxRunners = len(gpus)
}
}

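When no limit is configured, the scheduler now derives the default concurrency from the GPU count and whether free-memory reporting is reliable, pushing the result back through `OLLAMA_MAX_LOADED_MODELS` (the `// HACK` comments) instead of writing to a package variable. A sketch of the sizing rule; `defaultModelsPerGPU` below is an assumed value:

```go
package main

import "fmt"

const defaultModelsPerGPU = 3 // assumed value, for illustration only

// defaultMaxLoadedModels mirrors the sizing rule in processPending: allow
// several models per GPU when every GPU reports free memory reliably,
// otherwise fall back to one model per GPU.
func defaultMaxLoadedModels(gpuCount int, allReliable bool) int {
	if allReliable {
		return defaultModelsPerGPU * gpuCount
	}
	return gpuCount
}

func main() {
	fmt.Println(defaultMaxLoadedModels(2, true))  // 6
	fmt.Println(defaultMaxLoadedModels(2, false)) // 2
}
```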
@@ -212,9 +217,12 @@ func (s *Scheduler) processPending(ctx context.Context) {
} else if loadedCount == 0 {
// No models loaded. Load the model but prefer the best fit.
slog.Debug("loading first model", "model", pending.model.ModelPath)
g := pickBestFitGPUs(pending, ggml, gpus, &numParallel)
g := pickBestFullFitByLibrary(pending, ggml, gpus, &numParallel)
if g != nil {
gpus = g
} else {
// Only allow partial loads when this is the first model
gpus = pickBestPartialFitByLibrary(pending, ggml, gpus, &numParallel)
}
s.loadFn(pending, ggml, gpus, numParallel)
break
@@ -231,7 +239,7 @@ func (s *Scheduler) processPending(ctx context.Context) {

// Update free memory from currently loaded models
s.updateFreeSpace(availGpus)
fitGpus := pickBestFitGPUs(pending, ggml, availGpus, &numParallel)
fitGpus := pickBestFullFitByLibrary(pending, ggml, availGpus, &numParallel)
if fitGpus != nil {
slog.Debug("new model fits with existing models, loading")
s.loadFn(pending, ggml, fitGpus, numParallel)
@@ -401,7 +409,7 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList,
if numParallel < 1 {
numParallel = 1
}
sessionDuration := envconfig.KeepAlive
sessionDuration := envconfig.KeepAlive()
if req.sessionDuration != nil {
sessionDuration = req.sessionDuration.Duration
}

@@ -668,11 +676,12 @@ func (a ByDuration) Less(i, j int) bool {
// func (a BySize) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
// func (a BySize) Less(i, j int) bool { return a[i].estimatedVRAM < a[j].estimatedVRAM }

// pickBestFitGPUs will try to find the optimal placement of the model in the available GPUs where the model fully fits
// pickBestFullFitByLibrary will try to find the optimal placement of the model in the available GPUs where the model fully fits
// The list of GPUs returned will always be the same brand (library)
// If the model can not be fit fully within the available GPU(s) nil is returned
// If numParallel is <= 0, this will attempt try to optimize parallism based on available VRAM, and adjust
// opts.NumCtx accordingly
func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
var estimatedVRAM uint64

var numParallelToTry []int
@@ -695,7 +704,7 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numP
// First attempt to fit the model into a single GPU
for _, p := range numParallelToTry {
req.opts.NumCtx = req.origNumCtx * p
if !envconfig.SchedSpread {
if !envconfig.SchedSpread() {
for _, g := range sgl {
if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
@@ -723,6 +732,25 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numP
return nil
}

// If multiple Libraries are detected, pick the Library which loads the most layers for the model
func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
*numParallel = 1
byLibrary := gpus.ByLibrary()
if len(byLibrary) <= 1 {
return gpus
}
var bestEstimate uint64
var bestFit int
for i, gl := range byLibrary {
_, estimatedVRAM := llm.PredictServerFit(gl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts)
if estimatedVRAM > bestEstimate {
bestEstimate = estimatedVRAM
bestFit = i
}
}
return byLibrary[bestFit]
}

// findRunnerToUnload finds a runner to unload to make room for a new model
func (s *Scheduler) findRunnerToUnload() *runnerRef {
s.loadedMu.Lock()

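`pickBestPartialFitByLibrary` groups GPUs by library and keeps the group with the largest estimated fit when no full fit exists. A generic sketch of that selection, with a stand-in estimator instead of `llm.PredictServerFit`:

```go
package main

import "fmt"

// bestGroup returns the group with the largest estimated fit; the estimate
// function is a placeholder for the real VRAM prediction.
func bestGroup(groups [][]string, estimate func([]string) uint64) []string {
	if len(groups) == 0 {
		return nil
	}
	best, bestEstimate := 0, uint64(0)
	for i, g := range groups {
		if e := estimate(g); e > bestEstimate {
			bestEstimate, best = e, i
		}
	}
	return groups[best]
}

func main() {
	groups := [][]string{{"cuda:0", "cuda:1"}, {"rocm:0"}}
	// Toy estimator: pretend the fit scales with the number of GPUs.
	pick := bestGroup(groups, func(g []string) uint64 { return uint64(len(g)) })
	fmt.Println(pick) // [cuda:0 cuda:1]
}
```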
@@ -12,7 +12,6 @@ import (

"github.com/ollama/ollama/api"
"github.com/ollama/ollama/app/lifecycle"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/gpu"
"github.com/ollama/ollama/llm"
@@ -272,7 +271,7 @@ func TestRequestsMultipleLoadedModels(t *testing.T) {
c.req.opts.NumGPU = 0 // CPU load, will be allowed
d := newScenarioRequest(t, ctx, "ollama-model-3c", 30, nil) // Needs prior unloaded

envconfig.MaxRunners = 1
t.Setenv("OLLAMA_MAX_LOADED_MODELS", "1")
s.newServerFn = a.newServer
slog.Info("a")
s.pendingReqCh <- a.req
@@ -291,7 +290,7 @@ func TestRequestsMultipleLoadedModels(t *testing.T) {
require.Len(t, s.loaded, 1)
s.loadedMu.Unlock()

envconfig.MaxRunners = 0
t.Setenv("OLLAMA_MAX_LOADED_MODELS", "0")
s.newServerFn = b.newServer
slog.Info("b")
s.pendingReqCh <- b.req
@@ -362,7 +361,7 @@ func TestGetRunner(t *testing.T) {
a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, &api.Duration{Duration: 2 * time.Millisecond})
b := newScenarioRequest(t, ctx, "ollama-model-1b", 10, &api.Duration{Duration: 2 * time.Millisecond})
c := newScenarioRequest(t, ctx, "ollama-model-1c", 10, &api.Duration{Duration: 2 * time.Millisecond})
envconfig.MaxQueuedRequests = 1
t.Setenv("OLLAMA_MAX_QUEUE", "1")
s := InitScheduler(ctx)
s.getGpuFn = getGpuFn
s.getCpuFn = getCpuFn

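The scheduler tests stop assigning to `envconfig` package variables and use `t.Setenv`, which the accessor functions pick up and which Go restores automatically when the test ends. A small illustrative test with a stand-in accessor; names here are invented:

```go
package envconfig_test

import (
	"os"
	"strconv"
	"testing"
)

// maxLoadedModels is a stand-in for an envconfig accessor; it reads the
// variable each time it is called, which is what makes t.Setenv sufficient.
func maxLoadedModels() int {
	n, _ := strconv.Atoi(os.Getenv("OLLAMA_MAX_LOADED_MODELS"))
	return n
}

func TestMaxLoadedModelsOverride(t *testing.T) {
	// t.Setenv restores the previous value when the test finishes, so tests
	// no longer need to mutate shared package state.
	t.Setenv("OLLAMA_MAX_LOADED_MODELS", "1")
	if got := maxLoadedModels(); got != 1 {
		t.Fatalf("expected 1, got %d", got)
	}
}
```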
@@ -666,11 +665,50 @@ func TestAlreadyCanceled(t *testing.T) {
require.Empty(t, scenario1a.req.successCh)
}

func TestHomogeneousGPUs(t *testing.T) {
ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
defer done()
s := InitScheduler(ctx)

s.getGpuFn = func() gpu.GpuInfoList {
// Set memory values to require the model to be spread
gpus := []gpu.GpuInfo{
{Library: "cuda"},
{Library: "rocm"},
}
gpus[0].TotalMemory = 1 * format.GibiByte
gpus[0].FreeMemory = 256 * format.MebiByte
gpus[1].TotalMemory = 1 * format.GibiByte
gpus[1].FreeMemory = 256 * format.MebiByte
return gpus
}
s.getCpuFn = getCpuFn
a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
require.Len(t, gpus, 1)
return a.newServer(gpus, model, ggml, adapters, projectors, opts, numParallel)
}
slog.Info("a")
s.pendingReqCh <- a.req
require.Len(t, s.pendingReqCh, 1)
s.Run(ctx)
select {
case resp := <-a.req.successCh:
require.Equal(t, resp.llama, a.srv)
require.Empty(t, s.pendingReqCh)
require.Empty(t, a.req.errCh)
case err := <-a.req.errCh:
t.Fatal(err.Error())
case <-ctx.Done():
t.Fatal("timeout")
}
}

type mockLlm struct {
pingResp error
waitResp error
completionResp error
embedResp [][]float32
embedResp *llm.EmbedResponse
embedRespErr error
tokenizeResp []int
tokenizeRespErr error
@@ -688,7 +726,7 @@ func (s *mockLlm) WaitUntilRunning(ctx context.Context) error { return s.waitRes
func (s *mockLlm) Completion(ctx context.Context, req llm.CompletionRequest, fn func(llm.CompletionResponse)) error {
return s.completionResp
}
func (s *mockLlm) Embed(ctx context.Context, input []string) ([][]float32, error) {
func (s *mockLlm) Embed(ctx context.Context, input []string) (*llm.EmbedResponse, error) {
return s.embedResp, s.embedRespErr
}
func (s *mockLlm) Tokenize(ctx context.Context, content string) ([]int, error) {