Compare commits

...

27 Commits

Author SHA1 Message Date
Michael Yang
4c14855ad7 Merge pull request #6106 from ollama/mxyng/default-sliding-window-attention
patches: phi3 optional sliding window attention
2024-07-31 16:12:06 -07:00
Blake Mizerany
dc77bbcfa4 server: fix json marshalling of downloadBlobPart (#6108) 2024-07-31 16:01:24 -07:00
Michael Yang
0f3271db88 patches: phi3 default sliding window attention 2024-07-31 14:58:34 -07:00
Michael Yang
c4c84b7a0d Merge pull request #5196 from ollama/mxyng/messages-2
include modelfile messages
2024-07-31 10:18:17 -07:00
Michael Yang
5c1912769e Merge pull request #5473 from ollama/mxyng/environ
fix: environ lookup
2024-07-31 10:18:05 -07:00
Daniel Nguyen
71399aa682 Added BoltAI as a desktop UI for Ollama (#6096) 2024-07-31 08:44:58 -07:00
Jeffrey Morgan
463a8aa273 Create SECURITY.md 2024-07-30 21:01:12 -07:00
Michael
3579b4966a Update README to include Firebase Genkit (#6083)
Firebase Genkit
2024-07-30 18:40:09 -07:00
Jeffrey Morgan
5d66578356 Update README.md
Better example for multi-modal input
2024-07-30 18:08:34 -07:00
jmorganca
afa8d6e9d5 patch gemma support 2024-07-30 18:07:29 -07:00
royjhan
1b44d873e7 Add Metrics to api\embed response (#5709)
* add prompt tokens to embed response

* rm slog

* metrics

* types

* prompt n

* clean up

* reset submodule

* update tests

* test name

* list metrics
2024-07-30 13:12:21 -07:00
Daniel Hiltgen
cef2c6054d Merge pull request #5859 from dhiltgen/homogeneous_gpus
Prevent partial loading on mixed GPU brands
2024-07-30 11:06:42 -07:00
Daniel Hiltgen
345420998e Prevent partial loading on mixed GPU brands
In mult-brand GPU setups, if we couldn't fully load the model we
would fall through the scheduler and mistakenly try to load across
a mix of brands.  This makes sure we find the set of GPU(s) that
best fit for the partial load.
2024-07-30 11:00:55 -07:00
Kim Hallberg
0be8baad2b Update and Fix example models (#6065)
* Update example models

* Remove unused README.md
2024-07-29 23:56:37 -07:00
Michael Yang
a250c2cb13 display messages 2024-07-26 13:39:57 -07:00
Michael Yang
15af558423 include modelfile messages 2024-07-26 11:40:11 -07:00
Michael Yang
85d9d73a72 comments 2024-07-22 11:49:03 -07:00
Michael Yang
78140a712c cleanup tests 2024-07-22 11:49:03 -07:00
Michael Yang
1954ec5917 uint64 2024-07-22 11:49:02 -07:00
Michael Yang
0f1910129f int 2024-07-22 11:30:07 -07:00
Michael Yang
e2c3f6b3e2 string 2024-07-22 11:27:52 -07:00
Michael Yang
8570c1c0ef keepalive 2024-07-22 11:27:22 -07:00
Michael Yang
55cd3ddcca bool 2024-07-22 11:27:21 -07:00
Michael Yang
66fe77f084 models 2024-07-22 11:26:12 -07:00
Michael Yang
d1a5227cad origins 2024-07-22 11:25:30 -07:00
Michael Yang
4f1afd575d host 2024-07-22 11:25:30 -07:00
Michael Yang
35b89b2eab rfc: dynamic environ lookup 2024-07-22 11:25:30 -07:00
54 changed files with 841 additions and 599 deletions

View File

@@ -173,7 +173,7 @@ I'm a basic program that prints the famous "Hello, world!" message to the consol
### Multimodal models
```
>>> What's in this image? /Users/jmorgan/Desktop/smile.png
ollama run llava "What's in this image? /Users/jmorgan/Desktop/smile.png"
The image features a yellow smiley face, which is likely the central focus of the picture.
```
@@ -299,6 +299,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [AI Studio](https://github.com/MindWorkAI/AI-Studio)
- [Sidellama](https://github.com/gyopak/sidellama) (browser-based LLM client)
- [LLMStack](https://github.com/trypromptly/LLMStack) (No-code multi-agent framework to build LLM agents and workflows)
- [BoltAI for Mac](https://boltai.com) (AI Chat Client for Mac)
### Terminal
@@ -337,6 +338,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
### Libraries
- [LangChain](https://python.langchain.com/docs/integrations/llms/ollama) and [LangChain.js](https://js.langchain.com/docs/modules/model_io/models/llms/integrations/ollama) with [example](https://js.langchain.com/docs/use_cases/question_answering/local_retrieval_qa)
- [Firebase Genkit](https://firebase.google.com/docs/genkit/plugins/ollama)
- [LangChainGo](https://github.com/tmc/langchaingo/) with [example](https://github.com/tmc/langchaingo/tree/main/examples/ollama-completion-example)
- [LangChain4j](https://github.com/langchain4j/langchain4j) with [example](https://github.com/langchain4j/langchain4j-examples/tree/main/ollama-examples/src/main/java)
- [LangChainRust](https://github.com/Abraxas-365/langchain-rust) with [example](https://github.com/Abraxas-365/langchain-rust/blob/main/examples/llm_ollama.rs)

25
SECURITY.md Normal file
View File

@@ -0,0 +1,25 @@
# Security
The Ollama maintainer team takes security seriously and will actively work to resolve security issues.
## Reporting a vulnerability
If you discover a security vulnerability, please do not open a public issue. Instead, please report it by emailing hello@ollama.com. We ask that you give us sufficient time to investigate and address the vulnerability before disclosing it publicly.
Please include the following details in your report:
- A description of the vulnerability
- Steps to reproduce the issue
- Your assessment of the potential impact
- Any possible mitigations
## Security best practices
While the maintainer team does their best to secure Ollama, users are encouraged to implement their own security best practices, such as:
- Regularly updating to the latest version of Ollama
- Securing access to hosted instances of Ollama
- Monitoring systems for unusual activity
## Contact
For any other questions or concerns related to security, please contact us at hello@ollama.com

View File

@@ -20,7 +20,6 @@ import (
"encoding/json"
"fmt"
"io"
"net"
"net/http"
"net/url"
"runtime"
@@ -63,13 +62,8 @@ func checkError(resp *http.Response, body []byte) error {
// If the variable is not specified, a default ollama host and port will be
// used.
func ClientFromEnvironment() (*Client, error) {
ollamaHost := envconfig.Host
return &Client{
base: &url.URL{
Scheme: ollamaHost.Scheme,
Host: net.JoinHostPort(ollamaHost.Host, ollamaHost.Port),
},
base: envconfig.Host(),
http: http.DefaultClient,
}, nil
}

View File

@@ -2,8 +2,6 @@ package api
import (
"testing"
"github.com/ollama/ollama/envconfig"
)
func TestClientFromEnvironment(t *testing.T) {
@@ -33,7 +31,6 @@ func TestClientFromEnvironment(t *testing.T) {
for k, v := range testCases {
t.Run(k, func(t *testing.T) {
t.Setenv("OLLAMA_HOST", v.value)
envconfig.LoadConfig()
client, err := ClientFromEnvironment()
if err != v.err {

View File

@@ -267,6 +267,10 @@ type EmbedRequest struct {
type EmbedResponse struct {
Model string `json:"model"`
Embeddings [][]float32 `json:"embeddings"`
TotalDuration time.Duration `json:"total_duration,omitempty"`
LoadDuration time.Duration `json:"load_duration,omitempty"`
PromptEvalCount int `json:"prompt_eval_count,omitempty"`
}
// EmbeddingRequest is the request passed to [Client.Embeddings].

View File

@@ -14,7 +14,7 @@ import (
func InitLogging() {
level := slog.LevelInfo
if envconfig.Debug {
if envconfig.Debug() {
level = slog.LevelDebug
}

View File

@@ -362,9 +362,24 @@ func RunHandler(cmd *cobra.Command, args []string) error {
opts.MultiModal = slices.Contains(info.Details.Families, "clip")
opts.ParentModel = info.Details.ParentModel
opts.Messages = append(opts.Messages, info.Messages...)
if interactive {
if err := loadModel(cmd, &opts); err != nil {
return err
}
for _, msg := range info.Messages {
switch msg.Role {
case "user":
fmt.Printf(">>> %s\n", msg.Content)
case "assistant":
state := &displayResponseState{}
displayResponse(msg.Content, opts.WordWrap, state)
fmt.Println()
fmt.Println()
}
}
return generateInteractive(cmd, opts)
}
return generate(cmd, opts)
@@ -1076,7 +1091,7 @@ func RunServer(cmd *cobra.Command, _ []string) error {
return err
}
ln, err := net.Listen("tcp", net.JoinHostPort(envconfig.Host.Host, envconfig.Host.Port))
ln, err := net.Listen("tcp", envconfig.Host().Host)
if err != nil {
return err
}

View File

@@ -48,29 +48,10 @@ func loadModel(cmd *cobra.Command, opts *runOptions) error {
KeepAlive: opts.KeepAlive,
}
return client.Chat(cmd.Context(), chatReq, func(resp api.ChatResponse) error {
p.StopAndClear()
for _, msg := range opts.Messages {
switch msg.Role {
case "user":
fmt.Printf(">>> %s\n", msg.Content)
case "assistant":
state := &displayResponseState{}
displayResponse(msg.Content, opts.WordWrap, state)
fmt.Println()
fmt.Println()
}
}
return nil
})
return client.Chat(cmd.Context(), chatReq, func(api.ChatResponse) error { return nil })
}
func generateInteractive(cmd *cobra.Command, opts runOptions) error {
err := loadModel(cmd, &opts)
if err != nil {
return err
}
usage := func() {
fmt.Fprintln(os.Stderr, "Available Commands:")
fmt.Fprintln(os.Stderr, " /set Set session variables")
@@ -160,7 +141,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
return err
}
if envconfig.NoHistory {
if envconfig.NoHistory() {
scanner.HistoryDisable()
}

View File

@@ -1,11 +1,11 @@
package envconfig
import (
"errors"
"fmt"
"log/slog"
"math"
"net"
"net/url"
"os"
"path/filepath"
"runtime"
@@ -14,296 +14,16 @@ import (
"time"
)
type OllamaHost struct {
Scheme string
Host string
Port string
}
func (o OllamaHost) String() string {
return fmt.Sprintf("%s://%s:%s", o.Scheme, o.Host, o.Port)
}
var ErrInvalidHostPort = errors.New("invalid port specified in OLLAMA_HOST")
var (
// Set via OLLAMA_ORIGINS in the environment
AllowOrigins []string
// Set via OLLAMA_DEBUG in the environment
Debug bool
// Experimental flash attention
FlashAttention bool
// Set via OLLAMA_HOST in the environment
Host *OllamaHost
// Set via OLLAMA_KEEP_ALIVE in the environment
KeepAlive time.Duration
// Set via OLLAMA_LLM_LIBRARY in the environment
LLMLibrary string
// Set via OLLAMA_MAX_LOADED_MODELS in the environment
MaxRunners int
// Set via OLLAMA_MAX_QUEUE in the environment
MaxQueuedRequests int
// Set via OLLAMA_MODELS in the environment
ModelsDir string
// Set via OLLAMA_NOHISTORY in the environment
NoHistory bool
// Set via OLLAMA_NOPRUNE in the environment
NoPrune bool
// Set via OLLAMA_NUM_PARALLEL in the environment
NumParallel int
// Set via OLLAMA_RUNNERS_DIR in the environment
RunnersDir string
// Set via OLLAMA_SCHED_SPREAD in the environment
SchedSpread bool
// Set via OLLAMA_TMPDIR in the environment
TmpDir string
// Set via OLLAMA_INTEL_GPU in the environment
IntelGpu bool
// Set via CUDA_VISIBLE_DEVICES in the environment
CudaVisibleDevices string
// Set via HIP_VISIBLE_DEVICES in the environment
HipVisibleDevices string
// Set via ROCR_VISIBLE_DEVICES in the environment
RocrVisibleDevices string
// Set via GPU_DEVICE_ORDINAL in the environment
GpuDeviceOrdinal string
// Set via HSA_OVERRIDE_GFX_VERSION in the environment
HsaOverrideGfxVersion string
)
type EnvVar struct {
Name string
Value any
Description string
}
func AsMap() map[string]EnvVar {
ret := map[string]EnvVar{
"OLLAMA_DEBUG": {"OLLAMA_DEBUG", Debug, "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
"OLLAMA_FLASH_ATTENTION": {"OLLAMA_FLASH_ATTENTION", FlashAttention, "Enabled flash attention"},
"OLLAMA_HOST": {"OLLAMA_HOST", Host, "IP Address for the ollama server (default 127.0.0.1:11434)"},
"OLLAMA_KEEP_ALIVE": {"OLLAMA_KEEP_ALIVE", KeepAlive, "The duration that models stay loaded in memory (default \"5m\")"},
"OLLAMA_LLM_LIBRARY": {"OLLAMA_LLM_LIBRARY", LLMLibrary, "Set LLM library to bypass autodetection"},
"OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners, "Maximum number of loaded models per GPU"},
"OLLAMA_MAX_QUEUE": {"OLLAMA_MAX_QUEUE", MaxQueuedRequests, "Maximum number of queued requests"},
"OLLAMA_MODELS": {"OLLAMA_MODELS", ModelsDir, "The path to the models directory"},
"OLLAMA_NOHISTORY": {"OLLAMA_NOHISTORY", NoHistory, "Do not preserve readline history"},
"OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune, "Do not prune model blobs on startup"},
"OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel, "Maximum number of parallel requests"},
"OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", AllowOrigins, "A comma separated list of allowed origins"},
"OLLAMA_RUNNERS_DIR": {"OLLAMA_RUNNERS_DIR", RunnersDir, "Location for runners"},
"OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread, "Always schedule model across all GPUs"},
"OLLAMA_TMPDIR": {"OLLAMA_TMPDIR", TmpDir, "Location for temporary files"},
}
if runtime.GOOS != "darwin" {
ret["CUDA_VISIBLE_DEVICES"] = EnvVar{"CUDA_VISIBLE_DEVICES", CudaVisibleDevices, "Set which NVIDIA devices are visible"}
ret["HIP_VISIBLE_DEVICES"] = EnvVar{"HIP_VISIBLE_DEVICES", HipVisibleDevices, "Set which AMD devices are visible"}
ret["ROCR_VISIBLE_DEVICES"] = EnvVar{"ROCR_VISIBLE_DEVICES", RocrVisibleDevices, "Set which AMD devices are visible"}
ret["GPU_DEVICE_ORDINAL"] = EnvVar{"GPU_DEVICE_ORDINAL", GpuDeviceOrdinal, "Set which AMD devices are visible"}
ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", HsaOverrideGfxVersion, "Override the gfx used for all detected AMD GPUs"}
ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGpu, "Enable experimental Intel GPU detection"}
}
return ret
}
func Values() map[string]string {
vals := make(map[string]string)
for k, v := range AsMap() {
vals[k] = fmt.Sprintf("%v", v.Value)
}
return vals
}
var defaultAllowOrigins = []string{
"localhost",
"127.0.0.1",
"0.0.0.0",
}
// Clean quotes and spaces from the value
func clean(key string) string {
return strings.Trim(os.Getenv(key), "\"' ")
}
func init() {
// default values
NumParallel = 0 // Autoselect
MaxRunners = 0 // Autoselect
MaxQueuedRequests = 512
KeepAlive = 5 * time.Minute
LoadConfig()
}
func LoadConfig() {
if debug := clean("OLLAMA_DEBUG"); debug != "" {
d, err := strconv.ParseBool(debug)
if err == nil {
Debug = d
} else {
Debug = true
}
}
if fa := clean("OLLAMA_FLASH_ATTENTION"); fa != "" {
d, err := strconv.ParseBool(fa)
if err == nil {
FlashAttention = d
}
}
RunnersDir = clean("OLLAMA_RUNNERS_DIR")
if runtime.GOOS == "windows" && RunnersDir == "" {
// On Windows we do not carry the payloads inside the main executable
appExe, err := os.Executable()
if err != nil {
slog.Error("failed to lookup executable path", "error", err)
}
cwd, err := os.Getwd()
if err != nil {
slog.Error("failed to lookup working directory", "error", err)
}
var paths []string
for _, root := range []string{filepath.Dir(appExe), cwd} {
paths = append(paths,
root,
filepath.Join(root, "windows-"+runtime.GOARCH),
filepath.Join(root, "dist", "windows-"+runtime.GOARCH),
)
}
// Try a few variations to improve developer experience when building from source in the local tree
for _, p := range paths {
candidate := filepath.Join(p, "ollama_runners")
_, err := os.Stat(candidate)
if err == nil {
RunnersDir = candidate
break
}
}
if RunnersDir == "" {
slog.Error("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama_runners'")
}
}
TmpDir = clean("OLLAMA_TMPDIR")
LLMLibrary = clean("OLLAMA_LLM_LIBRARY")
if onp := clean("OLLAMA_NUM_PARALLEL"); onp != "" {
val, err := strconv.Atoi(onp)
if err != nil {
slog.Error("invalid setting, ignoring", "OLLAMA_NUM_PARALLEL", onp, "error", err)
} else {
NumParallel = val
}
}
if nohistory := clean("OLLAMA_NOHISTORY"); nohistory != "" {
NoHistory = true
}
if spread := clean("OLLAMA_SCHED_SPREAD"); spread != "" {
s, err := strconv.ParseBool(spread)
if err == nil {
SchedSpread = s
} else {
SchedSpread = true
}
}
if noprune := clean("OLLAMA_NOPRUNE"); noprune != "" {
NoPrune = true
}
if origins := clean("OLLAMA_ORIGINS"); origins != "" {
AllowOrigins = strings.Split(origins, ",")
}
for _, allowOrigin := range defaultAllowOrigins {
AllowOrigins = append(AllowOrigins,
fmt.Sprintf("http://%s", allowOrigin),
fmt.Sprintf("https://%s", allowOrigin),
fmt.Sprintf("http://%s", net.JoinHostPort(allowOrigin, "*")),
fmt.Sprintf("https://%s", net.JoinHostPort(allowOrigin, "*")),
)
}
AllowOrigins = append(AllowOrigins,
"app://*",
"file://*",
"tauri://*",
)
maxRunners := clean("OLLAMA_MAX_LOADED_MODELS")
if maxRunners != "" {
m, err := strconv.Atoi(maxRunners)
if err != nil {
slog.Error("invalid setting, ignoring", "OLLAMA_MAX_LOADED_MODELS", maxRunners, "error", err)
} else {
MaxRunners = m
}
}
if onp := os.Getenv("OLLAMA_MAX_QUEUE"); onp != "" {
p, err := strconv.Atoi(onp)
if err != nil || p <= 0 {
slog.Error("invalid setting, ignoring", "OLLAMA_MAX_QUEUE", onp, "error", err)
} else {
MaxQueuedRequests = p
}
}
ka := clean("OLLAMA_KEEP_ALIVE")
if ka != "" {
loadKeepAlive(ka)
}
var err error
ModelsDir, err = getModelsDir()
if err != nil {
slog.Error("invalid setting", "OLLAMA_MODELS", ModelsDir, "error", err)
}
Host, err = getOllamaHost()
if err != nil {
slog.Error("invalid setting", "OLLAMA_HOST", Host, "error", err, "using default port", Host.Port)
}
if set, err := strconv.ParseBool(clean("OLLAMA_INTEL_GPU")); err == nil {
IntelGpu = set
}
CudaVisibleDevices = clean("CUDA_VISIBLE_DEVICES")
HipVisibleDevices = clean("HIP_VISIBLE_DEVICES")
RocrVisibleDevices = clean("ROCR_VISIBLE_DEVICES")
GpuDeviceOrdinal = clean("GPU_DEVICE_ORDINAL")
HsaOverrideGfxVersion = clean("HSA_OVERRIDE_GFX_VERSION")
}
func getModelsDir() (string, error) {
if models, exists := os.LookupEnv("OLLAMA_MODELS"); exists {
return models, nil
}
home, err := os.UserHomeDir()
if err != nil {
return "", err
}
return filepath.Join(home, ".ollama", "models"), nil
}
func getOllamaHost() (*OllamaHost, error) {
// Host returns the scheme and host. Host can be configured via the OLLAMA_HOST environment variable.
// Default is scheme "http" and host "127.0.0.1:11434"
func Host() *url.URL {
defaultPort := "11434"
hostVar := os.Getenv("OLLAMA_HOST")
hostVar = strings.TrimSpace(strings.Trim(strings.TrimSpace(hostVar), "\"'"))
scheme, hostport, ok := strings.Cut(hostVar, "://")
s := strings.TrimSpace(Var("OLLAMA_HOST"))
scheme, hostport, ok := strings.Cut(s, "://")
switch {
case !ok:
scheme, hostport = "http", hostVar
scheme, hostport = "http", s
case scheme == "http":
defaultPort = "80"
case scheme == "https":
@@ -323,38 +43,242 @@ func getOllamaHost() (*OllamaHost, error) {
}
}
if portNum, err := strconv.ParseInt(port, 10, 32); err != nil || portNum > 65535 || portNum < 0 {
return &OllamaHost{
if n, err := strconv.ParseInt(port, 10, 32); err != nil || n > 65535 || n < 0 {
slog.Warn("invalid port, using default", "port", port, "default", defaultPort)
return &url.URL{
Scheme: scheme,
Host: host,
Port: defaultPort,
}, ErrInvalidHostPort
Host: net.JoinHostPort(host, defaultPort),
}
}
return &OllamaHost{
return &url.URL{
Scheme: scheme,
Host: host,
Port: port,
}, nil
Host: net.JoinHostPort(host, port),
}
}
func loadKeepAlive(ka string) {
v, err := strconv.Atoi(ka)
// Origins returns a list of allowed origins. Origins can be configured via the OLLAMA_ORIGINS environment variable.
func Origins() (origins []string) {
if s := Var("OLLAMA_ORIGINS"); s != "" {
origins = strings.Split(s, ",")
}
for _, origin := range []string{"localhost", "127.0.0.1", "0.0.0.0"} {
origins = append(origins,
fmt.Sprintf("http://%s", origin),
fmt.Sprintf("https://%s", origin),
fmt.Sprintf("http://%s", net.JoinHostPort(origin, "*")),
fmt.Sprintf("https://%s", net.JoinHostPort(origin, "*")),
)
}
origins = append(origins,
"app://*",
"file://*",
"tauri://*",
)
return origins
}
// Models returns the path to the models directory. Models directory can be configured via the OLLAMA_MODELS environment variable.
// Default is $HOME/.ollama/models
func Models() string {
if s := Var("OLLAMA_MODELS"); s != "" {
return s
}
home, err := os.UserHomeDir()
if err != nil {
d, err := time.ParseDuration(ka)
if err == nil {
if d < 0 {
KeepAlive = time.Duration(math.MaxInt64)
panic(err)
}
return filepath.Join(home, ".ollama", "models")
}
// KeepAlive returns the duration that models stay loaded in memory. KeepAlive can be configured via the OLLAMA_KEEP_ALIVE environment variable.
// Negative values are treated as infinite. Zero is treated as no keep alive.
// Default is 5 minutes.
func KeepAlive() (keepAlive time.Duration) {
keepAlive = 5 * time.Minute
if s := Var("OLLAMA_KEEP_ALIVE"); s != "" {
if d, err := time.ParseDuration(s); err == nil {
keepAlive = d
} else if n, err := strconv.ParseInt(s, 10, 64); err == nil {
keepAlive = time.Duration(n) * time.Second
}
}
if keepAlive < 0 {
return time.Duration(math.MaxInt64)
}
return keepAlive
}
func Bool(k string) func() bool {
return func() bool {
if s := Var(k); s != "" {
b, err := strconv.ParseBool(s)
if err != nil {
return true
}
return b
}
return false
}
}
var (
// Debug enabled additional debug information.
Debug = Bool("OLLAMA_DEBUG")
// FlashAttention enables the experimental flash attention feature.
FlashAttention = Bool("OLLAMA_FLASH_ATTENTION")
// NoHistory disables readline history.
NoHistory = Bool("OLLAMA_NOHISTORY")
// NoPrune disables pruning of model blobs on startup.
NoPrune = Bool("OLLAMA_NOPRUNE")
// SchedSpread allows scheduling models across all GPUs.
SchedSpread = Bool("OLLAMA_SCHED_SPREAD")
// IntelGPU enables experimental Intel GPU detection.
IntelGPU = Bool("OLLAMA_INTEL_GPU")
)
func String(s string) func() string {
return func() string {
return Var(s)
}
}
var (
LLMLibrary = String("OLLAMA_LLM_LIBRARY")
TmpDir = String("OLLAMA_TMPDIR")
CudaVisibleDevices = String("CUDA_VISIBLE_DEVICES")
HipVisibleDevices = String("HIP_VISIBLE_DEVICES")
RocrVisibleDevices = String("ROCR_VISIBLE_DEVICES")
GpuDeviceOrdinal = String("GPU_DEVICE_ORDINAL")
HsaOverrideGfxVersion = String("HSA_OVERRIDE_GFX_VERSION")
)
func RunnersDir() (p string) {
if p := Var("OLLAMA_RUNNERS_DIR"); p != "" {
return p
}
if runtime.GOOS != "windows" {
return
}
defer func() {
if p == "" {
slog.Error("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama_runners'")
}
}()
// On Windows we do not carry the payloads inside the main executable
exe, err := os.Executable()
if err != nil {
return
}
cwd, err := os.Getwd()
if err != nil {
return
}
var paths []string
for _, root := range []string{filepath.Dir(exe), cwd} {
paths = append(paths,
root,
filepath.Join(root, "windows-"+runtime.GOARCH),
filepath.Join(root, "dist", "windows-"+runtime.GOARCH),
)
}
// Try a few variations to improve developer experience when building from source in the local tree
for _, path := range paths {
candidate := filepath.Join(path, "ollama_runners")
if _, err := os.Stat(candidate); err == nil {
p = candidate
break
}
}
return p
}
func Uint(key string, defaultValue uint) func() uint {
return func() uint {
if s := Var(key); s != "" {
if n, err := strconv.ParseUint(s, 10, 64); err != nil {
slog.Warn("invalid environment variable, using default", "key", key, "value", s, "default", defaultValue)
} else {
KeepAlive = d
return uint(n)
}
}
} else {
d := time.Duration(v) * time.Second
if d < 0 {
KeepAlive = time.Duration(math.MaxInt64)
} else {
KeepAlive = d
}
return defaultValue
}
}
var (
// NumParallel sets the number of parallel model requests. NumParallel can be configured via the OLLAMA_NUM_PARALLEL environment variable.
NumParallel = Uint("OLLAMA_NUM_PARALLEL", 0)
// MaxRunners sets the maximum number of loaded models. MaxRunners can be configured via the OLLAMA_MAX_LOADED_MODELS environment variable.
MaxRunners = Uint("OLLAMA_MAX_LOADED_MODELS", 0)
// MaxQueue sets the maximum number of queued requests. MaxQueue can be configured via the OLLAMA_MAX_QUEUE environment variable.
MaxQueue = Uint("OLLAMA_MAX_QUEUE", 512)
// MaxVRAM sets a maximum VRAM override in bytes. MaxVRAM can be configured via the OLLAMA_MAX_VRAM environment variable.
MaxVRAM = Uint("OLLAMA_MAX_VRAM", 0)
)
type EnvVar struct {
Name string
Value any
Description string
}
func AsMap() map[string]EnvVar {
ret := map[string]EnvVar{
"OLLAMA_DEBUG": {"OLLAMA_DEBUG", Debug(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
"OLLAMA_FLASH_ATTENTION": {"OLLAMA_FLASH_ATTENTION", FlashAttention(), "Enabled flash attention"},
"OLLAMA_HOST": {"OLLAMA_HOST", Host(), "IP Address for the ollama server (default 127.0.0.1:11434)"},
"OLLAMA_KEEP_ALIVE": {"OLLAMA_KEEP_ALIVE", KeepAlive(), "The duration that models stay loaded in memory (default \"5m\")"},
"OLLAMA_LLM_LIBRARY": {"OLLAMA_LLM_LIBRARY", LLMLibrary(), "Set LLM library to bypass autodetection"},
"OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners(), "Maximum number of loaded models per GPU"},
"OLLAMA_MAX_QUEUE": {"OLLAMA_MAX_QUEUE", MaxQueue(), "Maximum number of queued requests"},
"OLLAMA_MODELS": {"OLLAMA_MODELS", Models(), "The path to the models directory"},
"OLLAMA_NOHISTORY": {"OLLAMA_NOHISTORY", NoHistory(), "Do not preserve readline history"},
"OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune(), "Do not prune model blobs on startup"},
"OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel(), "Maximum number of parallel requests"},
"OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", Origins(), "A comma separated list of allowed origins"},
"OLLAMA_RUNNERS_DIR": {"OLLAMA_RUNNERS_DIR", RunnersDir(), "Location for runners"},
"OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
"OLLAMA_TMPDIR": {"OLLAMA_TMPDIR", TmpDir(), "Location for temporary files"},
}
if runtime.GOOS != "darwin" {
ret["CUDA_VISIBLE_DEVICES"] = EnvVar{"CUDA_VISIBLE_DEVICES", CudaVisibleDevices(), "Set which NVIDIA devices are visible"}
ret["HIP_VISIBLE_DEVICES"] = EnvVar{"HIP_VISIBLE_DEVICES", HipVisibleDevices(), "Set which AMD devices are visible"}
ret["ROCR_VISIBLE_DEVICES"] = EnvVar{"ROCR_VISIBLE_DEVICES", RocrVisibleDevices(), "Set which AMD devices are visible"}
ret["GPU_DEVICE_ORDINAL"] = EnvVar{"GPU_DEVICE_ORDINAL", GpuDeviceOrdinal(), "Set which AMD devices are visible"}
ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", HsaOverrideGfxVersion(), "Override the gfx used for all detected AMD GPUs"}
ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGPU(), "Enable experimental Intel GPU detection"}
}
return ret
}
func Values() map[string]string {
vals := make(map[string]string)
for k, v := range AsMap() {
vals[k] = fmt.Sprintf("%v", v.Value)
}
return vals
}
// Var returns an environment variable stripped of leading and trailing quotes or spaces
func Var(key string) string {
return strings.Trim(strings.TrimSpace(os.Getenv(key)), "\"'")
}

View File

@@ -1,87 +1,234 @@
package envconfig
import (
"fmt"
"math"
"net"
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/google/go-cmp/cmp"
)
func TestConfig(t *testing.T) {
Debug = false // Reset whatever was loaded in init()
t.Setenv("OLLAMA_DEBUG", "")
LoadConfig()
require.False(t, Debug)
t.Setenv("OLLAMA_DEBUG", "false")
LoadConfig()
require.False(t, Debug)
t.Setenv("OLLAMA_DEBUG", "1")
LoadConfig()
require.True(t, Debug)
t.Setenv("OLLAMA_FLASH_ATTENTION", "1")
LoadConfig()
require.True(t, FlashAttention)
t.Setenv("OLLAMA_KEEP_ALIVE", "")
LoadConfig()
require.Equal(t, 5*time.Minute, KeepAlive)
t.Setenv("OLLAMA_KEEP_ALIVE", "3")
LoadConfig()
require.Equal(t, 3*time.Second, KeepAlive)
t.Setenv("OLLAMA_KEEP_ALIVE", "1h")
LoadConfig()
require.Equal(t, 1*time.Hour, KeepAlive)
t.Setenv("OLLAMA_KEEP_ALIVE", "-1s")
LoadConfig()
require.Equal(t, time.Duration(math.MaxInt64), KeepAlive)
t.Setenv("OLLAMA_KEEP_ALIVE", "-1")
LoadConfig()
require.Equal(t, time.Duration(math.MaxInt64), KeepAlive)
}
func TestClientFromEnvironment(t *testing.T) {
type testCase struct {
func TestHost(t *testing.T) {
cases := map[string]struct {
value string
expect string
err error
}{
"empty": {"", "127.0.0.1:11434"},
"only address": {"1.2.3.4", "1.2.3.4:11434"},
"only port": {":1234", ":1234"},
"address and port": {"1.2.3.4:1234", "1.2.3.4:1234"},
"hostname": {"example.com", "example.com:11434"},
"hostname and port": {"example.com:1234", "example.com:1234"},
"zero port": {":0", ":0"},
"too large port": {":66000", ":11434"},
"too small port": {":-1", ":11434"},
"ipv6 localhost": {"[::1]", "[::1]:11434"},
"ipv6 world open": {"[::]", "[::]:11434"},
"ipv6 no brackets": {"::1", "[::1]:11434"},
"ipv6 + port": {"[::1]:1337", "[::1]:1337"},
"extra space": {" 1.2.3.4 ", "1.2.3.4:11434"},
"extra quotes": {"\"1.2.3.4\"", "1.2.3.4:11434"},
"extra space+quotes": {" \" 1.2.3.4 \" ", "1.2.3.4:11434"},
"extra single quotes": {"'1.2.3.4'", "1.2.3.4:11434"},
"http": {"http://1.2.3.4", "1.2.3.4:80"},
"http port": {"http://1.2.3.4:4321", "1.2.3.4:4321"},
"https": {"https://1.2.3.4", "1.2.3.4:443"},
"https port": {"https://1.2.3.4:4321", "1.2.3.4:4321"},
}
hostTestCases := map[string]*testCase{
"empty": {value: "", expect: "127.0.0.1:11434"},
"only address": {value: "1.2.3.4", expect: "1.2.3.4:11434"},
"only port": {value: ":1234", expect: ":1234"},
"address and port": {value: "1.2.3.4:1234", expect: "1.2.3.4:1234"},
"hostname": {value: "example.com", expect: "example.com:11434"},
"hostname and port": {value: "example.com:1234", expect: "example.com:1234"},
"zero port": {value: ":0", expect: ":0"},
"too large port": {value: ":66000", err: ErrInvalidHostPort},
"too small port": {value: ":-1", err: ErrInvalidHostPort},
"ipv6 localhost": {value: "[::1]", expect: "[::1]:11434"},
"ipv6 world open": {value: "[::]", expect: "[::]:11434"},
"ipv6 no brackets": {value: "::1", expect: "[::1]:11434"},
"ipv6 + port": {value: "[::1]:1337", expect: "[::1]:1337"},
"extra space": {value: " 1.2.3.4 ", expect: "1.2.3.4:11434"},
"extra quotes": {value: "\"1.2.3.4\"", expect: "1.2.3.4:11434"},
"extra space+quotes": {value: " \" 1.2.3.4 \" ", expect: "1.2.3.4:11434"},
"extra single quotes": {value: "'1.2.3.4'", expect: "1.2.3.4:11434"},
}
for k, v := range hostTestCases {
t.Run(k, func(t *testing.T) {
t.Setenv("OLLAMA_HOST", v.value)
LoadConfig()
oh, err := getOllamaHost()
if err != v.err {
t.Fatalf("expected %s, got %s", v.err, err)
}
if err == nil {
host := net.JoinHostPort(oh.Host, oh.Port)
assert.Equal(t, v.expect, host, fmt.Sprintf("%s: expected %s, got %s", k, v.expect, host))
for name, tt := range cases {
t.Run(name, func(t *testing.T) {
t.Setenv("OLLAMA_HOST", tt.value)
if host := Host(); host.Host != tt.expect {
t.Errorf("%s: expected %s, got %s", name, tt.expect, host.Host)
}
})
}
}
func TestOrigins(t *testing.T) {
cases := []struct {
value string
expect []string
}{
{"", []string{
"http://localhost",
"https://localhost",
"http://localhost:*",
"https://localhost:*",
"http://127.0.0.1",
"https://127.0.0.1",
"http://127.0.0.1:*",
"https://127.0.0.1:*",
"http://0.0.0.0",
"https://0.0.0.0",
"http://0.0.0.0:*",
"https://0.0.0.0:*",
"app://*",
"file://*",
"tauri://*",
}},
{"http://10.0.0.1", []string{
"http://10.0.0.1",
"http://localhost",
"https://localhost",
"http://localhost:*",
"https://localhost:*",
"http://127.0.0.1",
"https://127.0.0.1",
"http://127.0.0.1:*",
"https://127.0.0.1:*",
"http://0.0.0.0",
"https://0.0.0.0",
"http://0.0.0.0:*",
"https://0.0.0.0:*",
"app://*",
"file://*",
"tauri://*",
}},
{"http://172.16.0.1,https://192.168.0.1", []string{
"http://172.16.0.1",
"https://192.168.0.1",
"http://localhost",
"https://localhost",
"http://localhost:*",
"https://localhost:*",
"http://127.0.0.1",
"https://127.0.0.1",
"http://127.0.0.1:*",
"https://127.0.0.1:*",
"http://0.0.0.0",
"https://0.0.0.0",
"http://0.0.0.0:*",
"https://0.0.0.0:*",
"app://*",
"file://*",
"tauri://*",
}},
{"http://totally.safe,http://definitely.legit", []string{
"http://totally.safe",
"http://definitely.legit",
"http://localhost",
"https://localhost",
"http://localhost:*",
"https://localhost:*",
"http://127.0.0.1",
"https://127.0.0.1",
"http://127.0.0.1:*",
"https://127.0.0.1:*",
"http://0.0.0.0",
"https://0.0.0.0",
"http://0.0.0.0:*",
"https://0.0.0.0:*",
"app://*",
"file://*",
"tauri://*",
}},
}
for _, tt := range cases {
t.Run(tt.value, func(t *testing.T) {
t.Setenv("OLLAMA_ORIGINS", tt.value)
if diff := cmp.Diff(Origins(), tt.expect); diff != "" {
t.Errorf("%s: mismatch (-want +got):\n%s", tt.value, diff)
}
})
}
}
func TestBool(t *testing.T) {
cases := map[string]bool{
"": false,
"true": true,
"false": false,
"1": true,
"0": false,
// invalid values
"random": true,
"something": true,
}
for k, v := range cases {
t.Run(k, func(t *testing.T) {
t.Setenv("OLLAMA_BOOL", k)
if b := Bool("OLLAMA_BOOL")(); b != v {
t.Errorf("%s: expected %t, got %t", k, v, b)
}
})
}
}
func TestUint(t *testing.T) {
cases := map[string]uint{
"0": 0,
"1": 1,
"1337": 1337,
// default values
"": 11434,
"-1": 11434,
"0o10": 11434,
"0x10": 11434,
"string": 11434,
}
for k, v := range cases {
t.Run(k, func(t *testing.T) {
t.Setenv("OLLAMA_UINT", k)
if i := Uint("OLLAMA_UINT", 11434)(); i != v {
t.Errorf("%s: expected %d, got %d", k, v, i)
}
})
}
}
func TestKeepAlive(t *testing.T) {
cases := map[string]time.Duration{
"": 5 * time.Minute,
"1s": time.Second,
"1m": time.Minute,
"1h": time.Hour,
"5m0s": 5 * time.Minute,
"1h2m3s": 1*time.Hour + 2*time.Minute + 3*time.Second,
"0": time.Duration(0),
"60": 60 * time.Second,
"120": 2 * time.Minute,
"3600": time.Hour,
"-0": time.Duration(0),
"-1": time.Duration(math.MaxInt64),
"-1m": time.Duration(math.MaxInt64),
// invalid values
" ": 5 * time.Minute,
"???": 5 * time.Minute,
"1d": 5 * time.Minute,
"1y": 5 * time.Minute,
"1w": 5 * time.Minute,
}
for tt, expect := range cases {
t.Run(tt, func(t *testing.T) {
t.Setenv("OLLAMA_KEEP_ALIVE", tt)
if actual := KeepAlive(); actual != expect {
t.Errorf("%s: expected %s, got %s", tt, expect, actual)
}
})
}
}
func TestVar(t *testing.T) {
cases := map[string]string{
"value": "value",
" value ": "value",
" 'value' ": "value",
` "value" `: "value",
" ' value ' ": " value ",
` " value " `: " value ",
}
for k, v := range cases {
t.Run(k, func(t *testing.T) {
t.Setenv("OLLAMA_VAR", k)
if s := Var("OLLAMA_VAR"); s != v {
t.Errorf("%s: expected %q, got %q", k, v, s)
}
})
}

View File

@@ -35,7 +35,7 @@ func main() {
ctx := context.Background()
req := &api.ChatRequest{
Model: "llama3",
Model: "llama3.1",
Messages: messages,
}

View File

@@ -16,7 +16,7 @@ func main() {
// By default, GenerateRequest is streaming.
req := &api.GenerateRequest{
Model: "gemma",
Model: "gemma2",
Prompt: "how many planets are there?",
}

View File

@@ -15,7 +15,7 @@ func main() {
}
req := &api.GenerateRequest{
Model: "gemma",
Model: "gemma2",
Prompt: "how many planets are there?",
// set streaming to false

View File

View File

@@ -4,6 +4,14 @@ This example provides an interface for asking questions to a PDF document.
## Setup
1. Ensure you have the `llama3.1` model installed:
```
ollama pull llama3.1
```
2. Install the Python Requirements.
```
pip install -r requirements.txt
```

View File

@@ -51,7 +51,7 @@ while True:
template=template,
)
llm = Ollama(model="llama3:8b", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]))
llm = Ollama(model="llama3.1", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]))
qa_chain = RetrievalQA.from_chain_type(
llm,
retriever=vectorstore.as_retriever(),

View File

@@ -4,10 +4,10 @@ This example summarizes the website, [https://ollama.com/blog/run-llama2-uncenso
## Running the Example
1. Ensure you have the `llama2` model installed:
1. Ensure you have the `llama3.1` model installed:
```bash
ollama pull llama2
ollama pull llama3.1
```
2. Install the Python Requirements.

View File

@@ -5,8 +5,8 @@ from langchain.chains.summarize import load_summarize_chain
loader = WebBaseLoader("https://ollama.com/blog/run-llama2-uncensored-locally")
docs = loader.load()
llm = Ollama(model="llama3")
llm = Ollama(model="llama3.1")
chain = load_summarize_chain(llm, chain_type="stuff")
result = chain.invoke(docs)
result = chain.invoke(docs)
print(result)

View File

@@ -4,10 +4,10 @@ This example is a basic "hello world" of using LangChain with Ollama.
## Running the Example
1. Ensure you have the `llama3` model installed:
1. Ensure you have the `llama3.1` model installed:
```bash
ollama pull llama3
ollama pull llama3.1
```
2. Install the Python Requirements.

View File

@@ -1,6 +1,6 @@
from langchain.llms import Ollama
input = input("What is your question?")
llm = Ollama(model="llama3")
llm = Ollama(model="llama3.1")
res = llm.predict(input)
print (res)

View File

@@ -1,4 +1,4 @@
FROM llama3
FROM llama3.1
PARAMETER temperature 1
SYSTEM """
You are Mario from super mario bros, acting as an assistant.

View File

@@ -2,12 +2,12 @@
# Example character: Mario
This example shows how to create a basic character using Llama3 as the base model.
This example shows how to create a basic character using Llama3.1 as the base model.
To run this example:
1. Download the Modelfile
2. `ollama pull llama3` to get the base model used in the model file.
2. `ollama pull llama3.1` to get the base model used in the model file.
3. `ollama create NAME -f ./Modelfile`
4. `ollama run NAME`
@@ -18,7 +18,7 @@ Ask it some questions like "Who are you?" or "Is Peach in trouble again?"
What the model file looks like:
```
FROM llama3
FROM llama3.1
PARAMETER temperature 1
SYSTEM """
You are Mario from Super Mario Bros, acting as an assistant.

View File

@@ -4,7 +4,7 @@ imageName = input("Enter the name of the image: ")
client = docker.from_env()
s = requests.Session()
output=""
with s.post('http://localhost:11434/api/generate', json={'model': 'dockerit', 'prompt': inputDescription}, stream=True) as r:
with s.post('http://localhost:11434/api/generate', json={'model': 'mattw/dockerit', 'prompt': inputDescription}, stream=True) as r:
for line in r.iter_lines():
if line:
j = json.loads(line)

View File

@@ -2,7 +2,7 @@ import requests
import json
import random
model = "llama3"
model = "llama3.1"
template = {
"firstName": "",
"lastName": "",

View File

@@ -12,7 +12,7 @@ countries = [
"France",
]
country = random.choice(countries)
model = "llama3"
model = "llama3.1"
prompt = f"generate one realistically believable sample data set of a persons first name, last name, address in {country}, and phone number. Do not use common names. Respond using JSON. Key names should have no backslashes, values should use plain ascii with no special characters."

View File

@@ -6,10 +6,10 @@ There are two python scripts in this example. `randomaddresses.py` generates ran
## Running the Example
1. Ensure you have the `llama3` model installed:
1. Ensure you have the `llama3.1` model installed:
```bash
ollama pull llama3
ollama pull llama3.1
```
2. Install the Python Requirements.

View File

@@ -2,7 +2,7 @@ import json
import requests
# NOTE: ollama must be running for this to work, start the ollama app or run `ollama serve`
model = "llama3" # TODO: update this for whatever model you wish to use
model = "llama3.1" # TODO: update this for whatever model you wish to use
def chat(messages):

View File

@@ -4,10 +4,10 @@ The **chat** endpoint is one of two ways to generate text from an LLM with Ollam
## Running the Example
1. Ensure you have the `llama3` model installed:
1. Ensure you have the `llama3.1` model installed:
```bash
ollama pull llama3
ollama pull llama3.1
```
2. Install the Python Requirements.

View File

@@ -1,6 +1,6 @@
import * as readline from "readline";
const model = "llama3";
const model = "llama3.1";
type Message = {
role: "assistant" | "user" | "system";
content: string;

View File

@@ -61,9 +61,9 @@ func AMDGetGPUInfo() []RocmGPUInfo {
// Determine if the user has already pre-selected which GPUs to look at, then ignore the others
var visibleDevices []string
hipVD := envconfig.HipVisibleDevices // zero based index only
rocrVD := envconfig.RocrVisibleDevices // zero based index or UUID, but consumer cards seem to not support UUID
gpuDO := envconfig.GpuDeviceOrdinal // zero based index
hipVD := envconfig.HipVisibleDevices() // zero based index only
rocrVD := envconfig.RocrVisibleDevices() // zero based index or UUID, but consumer cards seem to not support UUID
gpuDO := envconfig.GpuDeviceOrdinal() // zero based index
switch {
// TODO is this priorty order right?
case hipVD != "":
@@ -76,7 +76,7 @@ func AMDGetGPUInfo() []RocmGPUInfo {
visibleDevices = strings.Split(gpuDO, ",")
}
gfxOverride := envconfig.HsaOverrideGfxVersion
gfxOverride := envconfig.HsaOverrideGfxVersion()
var supported []string
libDir := ""

View File

@@ -53,7 +53,7 @@ func AMDGetGPUInfo() []RocmGPUInfo {
}
var supported []string
gfxOverride := envconfig.HsaOverrideGfxVersion
gfxOverride := envconfig.HsaOverrideGfxVersion()
if gfxOverride == "" {
supported, err = GetSupportedGFX(libDir)
if err != nil {

View File

@@ -26,7 +26,7 @@ func PayloadsDir() (string, error) {
defer lock.Unlock()
var err error
if payloadsDir == "" {
runnersDir := envconfig.RunnersDir
runnersDir := envconfig.RunnersDir()
if runnersDir != "" {
payloadsDir = runnersDir
@@ -35,7 +35,7 @@ func PayloadsDir() (string, error) {
// The remainder only applies on non-windows where we still carry payloads in the main executable
cleanupTmpDirs()
tmpDir := envconfig.TmpDir
tmpDir := envconfig.TmpDir()
if tmpDir == "" {
tmpDir, err = os.MkdirTemp("", "ollama")
if err != nil {
@@ -105,7 +105,7 @@ func cleanupTmpDirs() {
func Cleanup() {
lock.Lock()
defer lock.Unlock()
runnersDir := envconfig.RunnersDir
runnersDir := envconfig.RunnersDir()
if payloadsDir != "" && runnersDir == "" && runtime.GOOS != "windows" {
// We want to fully clean up the tmpdir parent of the payloads dir
tmpDir := filepath.Clean(filepath.Join(payloadsDir, ".."))

View File

@@ -230,8 +230,8 @@ func GetGPUInfo() GpuInfoList {
// On windows we bundle the nvidia library one level above the runner dir
depPath := ""
if runtime.GOOS == "windows" && envconfig.RunnersDir != "" {
depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir), "cuda")
if runtime.GOOS == "windows" && envconfig.RunnersDir() != "" {
depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir()), "cuda")
}
// Load ALL libraries
@@ -302,12 +302,12 @@ func GetGPUInfo() GpuInfoList {
}
// Intel
if envconfig.IntelGpu {
if envconfig.IntelGPU() {
oHandles = initOneAPIHandles()
// On windows we bundle the oneapi library one level above the runner dir
depPath = ""
if runtime.GOOS == "windows" && envconfig.RunnersDir != "" {
depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir), "oneapi")
if runtime.GOOS == "windows" && envconfig.RunnersDir() != "" {
depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir()), "oneapi")
}
for d := range oHandles.oneapi.num_drivers {
@@ -611,7 +611,7 @@ func LoadOneapiMgmt(oneapiLibPaths []string) (int, *C.oneapi_handle_t, string) {
}
func getVerboseState() C.uint16_t {
if envconfig.Debug {
if envconfig.Debug() {
return C.uint16_t(1)
}
return C.uint16_t(0)

View File

@@ -45,14 +45,7 @@ func TestUnicodeModelDir(t *testing.T) {
defer os.RemoveAll(modelDir)
slog.Info("unicode", "OLLAMA_MODELS", modelDir)
oldModelsDir := os.Getenv("OLLAMA_MODELS")
if oldModelsDir == "" {
defer os.Unsetenv("OLLAMA_MODELS")
} else {
defer os.Setenv("OLLAMA_MODELS", oldModelsDir)
}
err = os.Setenv("OLLAMA_MODELS", modelDir)
require.NoError(t, err)
t.Setenv("OLLAMA_MODELS", modelDir)
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()

View File

@@ -5,14 +5,16 @@ package integration
import (
"context"
"log/slog"
"os"
"strconv"
"sync"
"testing"
"time"
"github.com/ollama/ollama/api"
"github.com/stretchr/testify/require"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
)
func TestMultiModelConcurrency(t *testing.T) {
@@ -106,13 +108,16 @@ func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) {
// Stress the system if we know how much VRAM it has, and attempt to load more models than will fit
func TestMultiModelStress(t *testing.T) {
vram := os.Getenv("OLLAMA_MAX_VRAM") // TODO - discover actual VRAM
if vram == "" {
s := os.Getenv("OLLAMA_MAX_VRAM") // TODO - discover actual VRAM
if s == "" {
t.Skip("OLLAMA_MAX_VRAM not specified, can't pick the right models for the stress test")
}
max, err := strconv.ParseUint(vram, 10, 64)
require.NoError(t, err)
const MB = uint64(1024 * 1024)
maxVram, err := strconv.ParseUint(s, 10, 64)
if err != nil {
t.Fatal(err)
}
type model struct {
name string
size uint64 // Approximate amount of VRAM they typically use when fully loaded in VRAM
@@ -121,83 +126,82 @@ func TestMultiModelStress(t *testing.T) {
smallModels := []model{
{
name: "orca-mini",
size: 2992 * MB,
size: 2992 * format.MebiByte,
},
{
name: "phi",
size: 2616 * MB,
size: 2616 * format.MebiByte,
},
{
name: "gemma:2b",
size: 2364 * MB,
size: 2364 * format.MebiByte,
},
{
name: "stable-code:3b",
size: 2608 * MB,
size: 2608 * format.MebiByte,
},
{
name: "starcoder2:3b",
size: 2166 * MB,
size: 2166 * format.MebiByte,
},
}
mediumModels := []model{
{
name: "llama2",
size: 5118 * MB,
size: 5118 * format.MebiByte,
},
{
name: "mistral",
size: 4620 * MB,
size: 4620 * format.MebiByte,
},
{
name: "orca-mini:7b",
size: 5118 * MB,
size: 5118 * format.MebiByte,
},
{
name: "dolphin-mistral",
size: 4620 * MB,
size: 4620 * format.MebiByte,
},
{
name: "gemma:7b",
size: 5000 * MB,
size: 5000 * format.MebiByte,
},
{
name: "codellama:7b",
size: 5118 * format.MebiByte,
},
// TODO - uncomment this once #3565 is merged and this is rebased on it
// {
// name: "codellama:7b",
// size: 5118 * MB,
// },
}
// These seem to be too slow to be useful...
// largeModels := []model{
// {
// name: "llama2:13b",
// size: 7400 * MB,
// size: 7400 * format.MebiByte,
// },
// {
// name: "codellama:13b",
// size: 7400 * MB,
// size: 7400 * format.MebiByte,
// },
// {
// name: "orca-mini:13b",
// size: 7400 * MB,
// size: 7400 * format.MebiByte,
// },
// {
// name: "gemma:7b",
// size: 5000 * MB,
// size: 5000 * format.MebiByte,
// },
// {
// name: "starcoder2:15b",
// size: 9100 * MB,
// size: 9100 * format.MebiByte,
// },
// }
var chosenModels []model
switch {
case max < 10000*MB:
case maxVram < 10000*format.MebiByte:
slog.Info("selecting small models")
chosenModels = smallModels
// case max < 30000*MB:
// case maxVram < 30000*format.MebiByte:
default:
slog.Info("selecting medium models")
chosenModels = mediumModels
@@ -226,15 +230,15 @@ func TestMultiModelStress(t *testing.T) {
}
var wg sync.WaitGroup
consumed := uint64(256 * MB) // Assume some baseline usage
consumed := uint64(256 * format.MebiByte) // Assume some baseline usage
for i := 0; i < len(req); i++ {
// Always get at least 2 models, but dont' overshoot VRAM too much or we'll take too long
if i > 1 && consumed > max {
slog.Info("achieved target vram exhaustion", "count", i, "vramMB", max/1024/1024, "modelsMB", consumed/1024/1024)
if i > 1 && consumed > vram {
slog.Info("achieved target vram exhaustion", "count", i, "vram", format.HumanBytes2(vram), "models", format.HumanBytes2(consumed))
break
}
consumed += chosenModels[i].size
slog.Info("target vram", "count", i, "vramMB", max/1024/1024, "modelsMB", consumed/1024/1024)
slog.Info("target vram", "count", i, "vram", format.HumanBytes2(vram), "models", format.HumanBytes2(consumed))
wg.Add(1)
go func(i int) {

View File

@@ -69,6 +69,10 @@ func TestAllMiniLMEmbed(t *testing.T) {
if !floatsEqual32(res.Embeddings[0][0], 0.010071031) {
t.Fatalf("expected 0.010071031, got %.8f", res.Embeddings[0][0])
}
if res.PromptEvalCount != 8 {
t.Fatalf("expected 8 prompt tokens, got %d", res.PromptEvalCount)
}
}
func TestAllMiniLMBatchEmbed(t *testing.T) {
@@ -97,6 +101,10 @@ func TestAllMiniLMBatchEmbed(t *testing.T) {
if !floatsEqual32(res.Embeddings[0][0], 0.010071031) || !floatsEqual32(res.Embeddings[1][0], -0.009802706) {
t.Fatalf("expected 0.010071031 and -0.009802706, got %.8f and %.8f", res.Embeddings[0][0], res.Embeddings[1][0])
}
if res.PromptEvalCount != 16 {
t.Fatalf("expected 16 prompt tokens, got %d", res.PromptEvalCount)
}
}
func TestAllMiniLMEmbedTruncate(t *testing.T) {

View File

@@ -5,7 +5,6 @@ package integration
import (
"context"
"errors"
"fmt"
"log/slog"
"os"
"strconv"
@@ -14,8 +13,10 @@ import (
"testing"
"time"
"github.com/ollama/ollama/api"
"github.com/stretchr/testify/require"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/envconfig"
)
func TestMaxQueue(t *testing.T) {
@@ -27,13 +28,10 @@ func TestMaxQueue(t *testing.T) {
// Note: This test can be quite slow when running in CPU mode, so keep the threadCount low unless your on GPU
// Also note that by default Darwin can't sustain > ~128 connections without adjusting limits
threadCount := 32
mq := os.Getenv("OLLAMA_MAX_QUEUE")
if mq != "" {
var err error
threadCount, err = strconv.Atoi(mq)
require.NoError(t, err)
if maxQueue := envconfig.MaxQueue(); maxQueue != 0 {
threadCount = maxQueue
} else {
os.Setenv("OLLAMA_MAX_QUEUE", fmt.Sprintf("%d", threadCount))
t.Setenv("OLLAMA_MAX_QUEUE", strconv.Itoa(threadCount))
}
req := api.GenerateRequest{

View File

@@ -1221,6 +1221,7 @@ struct llama_server_context
res.result_json = json
{
{"embedding", std::vector<float>(embd, embd + n_embd)},
{"timings", slot.get_formated_timings()},
};
}
}
@@ -3203,11 +3204,15 @@ int main(int argc, char **argv) {
responses = result.result_json.value("results", std::vector<json>{result.result_json});
json embeddings = json::array();
int prompt_n = 0;
for (auto & elem : responses) {
embeddings.push_back(elem.at("embedding"));
prompt_n += elem.at("timings").at("prompt_n").get<int>();
}
// send the result
json embedding_res = json{{"embedding", embeddings}};
json embedding_res = json{{"embedding", embeddings}, {"prompt_n", prompt_n}};
return res.set_content(embedding_res.dump(), "application/json; charset=utf-8");
}
});

View File

@@ -8,14 +8,14 @@ import (
"testing"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/gpu"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestEstimateGPULayers(t *testing.T) {
envconfig.Debug = true
t.Setenv("OLLAMA_DEBUG", "1")
modelName := "dummy"
f, err := os.CreateTemp(t.TempDir(), modelName)
require.NoError(t, err)

View File

@@ -0,0 +1,20 @@
diff --git a/src/llama.cpp b/src/llama.cpp
index a207451f..fba6b175 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -4969,6 +4969,7 @@ static void llm_load_hparams(
hparams.attn_soft_cap = true;
switch (hparams.n_layer) {
+ case 26: model.type = e_model::MODEL_2B; break;
case 42: model.type = e_model::MODEL_9B; break;
case 46: model.type = e_model::MODEL_27B; break;
default: model.type = e_model::MODEL_UNKNOWN;
@@ -11736,6 +11737,7 @@ struct llm_build_context {
// ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
switch (model.type) {
+ case e_model::MODEL_2B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break;
case e_model::MODEL_9B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break;
case e_model::MODEL_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break;
default: GGML_ABORT("fatal error");

View File

@@ -0,0 +1,43 @@
From 6eedae4cf2fcc8015dac79cb3f28f61fcabacab2 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Wed, 31 Jul 2024 14:57:04 -0700
Subject: [PATCH] phi3 sliding window
---
src/llama.cpp | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/src/llama.cpp b/src/llama.cpp
index a207451f..f2872d4e 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -4893,7 +4893,7 @@ static void llm_load_hparams(
} break;
case LLM_ARCH_PHI3:
{
- ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+ ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
@@ -10762,7 +10762,7 @@ struct llm_build_context {
struct ggml_tensor * inp_pos = build_inp_pos();
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
- struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa();
+ struct ggml_tensor * KQ_mask = hparams.n_swa > 0 ? build_inp_KQ_mask_swa() : build_inp_KQ_mask();
for (int il = 0; il < n_layer; ++il) {
auto residual = inpL;
@@ -10820,7 +10820,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask_swa, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
}
if (il == n_layer - 1) {
--
2.45.2

View File

@@ -33,7 +33,7 @@ type LlamaServer interface {
Ping(ctx context.Context) error
WaitUntilRunning(ctx context.Context) error
Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error
Embed(ctx context.Context, input []string) ([][]float32, error)
Embed(ctx context.Context, input []string) (*EmbedResponse, error)
Tokenize(ctx context.Context, content string) ([]int, error)
Detokenize(ctx context.Context, tokens []int) (string, error)
Close() error
@@ -163,7 +163,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
} else {
servers = serversForGpu(gpus[0]) // All GPUs in the list are matching Library and Variant
}
demandLib := envconfig.LLMLibrary
demandLib := envconfig.LLMLibrary()
if demandLib != "" {
serverPath := availableServers[demandLib]
if serverPath == "" {
@@ -195,7 +195,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
params = append(params, "--n-gpu-layers", fmt.Sprintf("%d", opts.NumGPU))
}
if envconfig.Debug {
if envconfig.Debug() {
params = append(params, "--verbose")
}
@@ -221,7 +221,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
params = append(params, "--memory-f32")
}
flashAttnEnabled := envconfig.FlashAttention
flashAttnEnabled := envconfig.FlashAttention()
for _, g := range gpus {
// only cuda (compute capability 7+) and metal support flash attention
@@ -382,7 +382,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
}
slog.Info("starting llama server", "cmd", s.cmd.String())
if envconfig.Debug {
if envconfig.Debug() {
filteredEnv := []string{}
for _, ev := range s.cmd.Env {
if strings.HasPrefix(ev, "CUDA_") ||
@@ -879,10 +879,11 @@ type EmbedRequest struct {
}
type EmbedResponse struct {
Embedding [][]float32 `json:"embedding"`
Embedding [][]float32 `json:"embedding"`
PromptEvalCount int `json:"prompt_n"`
}
func (s *llmServer) Embed(ctx context.Context, input []string) ([][]float32, error) {
func (s *llmServer) Embed(ctx context.Context, input []string) (*EmbedResponse, error) {
if err := s.sem.Acquire(ctx, 1); err != nil {
slog.Error("Failed to acquire semaphore", "error", err)
return nil, err
@@ -924,12 +925,12 @@ func (s *llmServer) Embed(ctx context.Context, input []string) ([][]float32, err
return nil, fmt.Errorf("%s", body)
}
var embedding EmbedResponse
if err := json.Unmarshal(body, &embedding); err != nil {
var e EmbedResponse
if err := json.Unmarshal(body, &e); err != nil {
return nil, fmt.Errorf("unmarshal tokenize response: %w", err)
}
return embedding.Embedding, nil
return &e, nil
}
type TokenizeRequest struct {

View File

@@ -61,6 +61,36 @@ type blobDownloadPart struct {
*blobDownload `json:"-"`
}
type jsonBlobDownloadPart struct {
N int
Offset int64
Size int64
Completed int64
}
func (p *blobDownloadPart) MarshalJSON() ([]byte, error) {
return json.Marshal(jsonBlobDownloadPart{
N: p.N,
Offset: p.Offset,
Size: p.Size,
Completed: p.Completed.Load(),
})
}
func (p *blobDownloadPart) UnmarshalJSON(b []byte) error {
var j jsonBlobDownloadPart
if err := json.Unmarshal(b, &j); err != nil {
return err
}
*p = blobDownloadPart{
N: j.N,
Offset: j.Offset,
Size: j.Size,
}
p.Completed.Store(j.Completed)
return nil
}
const (
numDownloadParts = 64
minDownloadPartSize int64 = 100 * format.MegaByte

View File

@@ -70,7 +70,7 @@ type Model struct {
License []string
Digest string
Options map[string]interface{}
Messages []Message
Messages []api.Message
Template *template.Template
}
@@ -191,11 +191,6 @@ func (m *Model) String() string {
return modelfile.String()
}
type Message struct {
Role string `json:"role"`
Content string `json:"content"`
}
type ConfigV2 struct {
ModelFormat string `json:"model_format"`
ModelFamily string `json:"model_family"`
@@ -646,7 +641,7 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
return err
}
if !envconfig.NoPrune && old != nil {
if !envconfig.NoPrune() && old != nil {
if err := old.RemoveLayers(); err != nil {
return err
}
@@ -885,7 +880,7 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
// build deleteMap to prune unused layers
deleteMap := make(map[string]struct{})
if !envconfig.NoPrune {
if !envconfig.NoPrune() {
manifest, _, err = GetManifest(mp)
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err

View File

@@ -7,7 +7,6 @@ import (
"slices"
"testing"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/types/model"
)
@@ -108,7 +107,6 @@ func TestManifests(t *testing.T) {
t.Run(n, func(t *testing.T) {
d := t.TempDir()
t.Setenv("OLLAMA_MODELS", d)
envconfig.LoadConfig()
for _, p := range wants.ps {
createManifest(t, d, p)

View File

@@ -105,9 +105,7 @@ func (mp ModelPath) GetShortTagname() string {
// GetManifestPath returns the path to the manifest file for the given model path, it is up to the caller to create the directory if it does not exist.
func (mp ModelPath) GetManifestPath() (string, error) {
dir := envconfig.ModelsDir
return filepath.Join(dir, "manifests", mp.Registry, mp.Namespace, mp.Repository, mp.Tag), nil
return filepath.Join(envconfig.Models(), "manifests", mp.Registry, mp.Namespace, mp.Repository, mp.Tag), nil
}
func (mp ModelPath) BaseURL() *url.URL {
@@ -118,9 +116,7 @@ func (mp ModelPath) BaseURL() *url.URL {
}
func GetManifestPath() (string, error) {
dir := envconfig.ModelsDir
path := filepath.Join(dir, "manifests")
path := filepath.Join(envconfig.Models(), "manifests")
if err := os.MkdirAll(path, 0o755); err != nil {
return "", err
}
@@ -129,8 +125,6 @@ func GetManifestPath() (string, error) {
}
func GetBlobsPath(digest string) (string, error) {
dir := envconfig.ModelsDir
// only accept actual sha256 digests
pattern := "^sha256[:-][0-9a-fA-F]{64}$"
re := regexp.MustCompile(pattern)
@@ -140,7 +134,7 @@ func GetBlobsPath(digest string) (string, error) {
}
digest = strings.ReplaceAll(digest, ":", "-")
path := filepath.Join(dir, "blobs", digest)
path := filepath.Join(envconfig.Models(), "blobs", digest)
dirPath := filepath.Dir(path)
if digest == "" {
dirPath = path

View File

@@ -7,8 +7,6 @@ import (
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/ollama/ollama/envconfig"
)
func TestGetBlobsPath(t *testing.T) {
@@ -63,7 +61,6 @@ func TestGetBlobsPath(t *testing.T) {
for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
t.Setenv("OLLAMA_MODELS", dir)
envconfig.LoadConfig()
got, err := GetBlobsPath(tc.digest)

View File

@@ -164,17 +164,6 @@ func (s *Server) GenerateHandler(c *gin.Context) {
}
}
var b bytes.Buffer
if req.Context != nil {
s, err := r.Detokenize(c.Request.Context(), req.Context)
if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
b.WriteString(s)
}
var values template.Values
if req.Suffix != "" {
values.Prompt = prompt
@@ -187,6 +176,10 @@ func (s *Server) GenerateHandler(c *gin.Context) {
msgs = append(msgs, api.Message{Role: "system", Content: m.System})
}
if req.Context == nil {
msgs = append(msgs, m.Messages...)
}
for _, i := range images {
msgs = append(msgs, api.Message{Role: "user", Content: fmt.Sprintf("[img-%d]", i.ID)})
}
@@ -194,11 +187,22 @@ func (s *Server) GenerateHandler(c *gin.Context) {
values.Messages = append(msgs, api.Message{Role: "user", Content: req.Prompt})
}
var b bytes.Buffer
if err := tmpl.Execute(&b, values); err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
if req.Context != nil {
s, err := r.Detokenize(c.Request.Context(), req.Context)
if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
b.WriteString(s)
}
prompt = b.String()
}
@@ -284,6 +288,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
}
func (s *Server) EmbedHandler(c *gin.Context) {
checkpointStart := time.Now()
var req api.EmbedRequest
err := c.ShouldBindJSON(&req)
switch {
@@ -332,6 +337,8 @@ func (s *Server) EmbedHandler(c *gin.Context) {
return
}
checkpointLoaded := time.Now()
kvData, err := getKVData(m.ModelPath, false)
if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
@@ -370,13 +377,16 @@ func (s *Server) EmbedHandler(c *gin.Context) {
return
}
for i, e := range embeddings {
embeddings[i] = normalize(e)
for i, e := range embeddings.Embedding {
embeddings.Embedding[i] = normalize(e)
}
resp := api.EmbedResponse{
Model: req.Model,
Embeddings: embeddings,
Model: req.Model,
Embeddings: embeddings.Embedding,
TotalDuration: time.Since(checkpointStart),
LoadDuration: checkpointLoaded.Sub(checkpointStart),
PromptEvalCount: embeddings.PromptEvalCount,
}
c.JSON(http.StatusOK, resp)
}
@@ -428,9 +438,9 @@ func (s *Server) EmbeddingsHandler(c *gin.Context) {
return
}
embedding := make([]float64, len(embeddings[0]))
embedding := make([]float64, len(embeddings.Embedding[0]))
for i, v := range embeddings[0] {
for i, v := range embeddings.Embedding[0] {
embedding[i] = float64(v)
}
@@ -1047,7 +1057,7 @@ func (s *Server) GenerateRoutes() http.Handler {
for _, prop := range openAIProperties {
config.AllowHeaders = append(config.AllowHeaders, "x-stainless-"+prop)
}
config.AllowOrigins = envconfig.AllowOrigins
config.AllowOrigins = envconfig.Origins()
r := gin.Default()
r.Use(
@@ -1092,7 +1102,7 @@ func (s *Server) GenerateRoutes() http.Handler {
func Serve(ln net.Listener) error {
level := slog.LevelInfo
if envconfig.Debug {
if envconfig.Debug() {
level = slog.LevelDebug
}
@@ -1120,7 +1130,7 @@ func Serve(ln net.Listener) error {
return err
}
if !envconfig.NoPrune {
if !envconfig.NoPrune() {
// clean up unused layers and manifests
if err := PruneLayers(); err != nil {
return err
@@ -1323,11 +1333,12 @@ func (s *Server) ChatHandler(c *gin.Context) {
return
}
msgs := append(m.Messages, req.Messages...)
if req.Messages[0].Role != "system" && m.System != "" {
req.Messages = append([]api.Message{{Role: "system", Content: m.System}}, req.Messages...)
msgs = append([]api.Message{{Role: "system", Content: m.System}}, msgs...)
}
prompt, images, err := chatPrompt(c.Request.Context(), m, r.Tokenize, opts, req.Messages, req.Tools)
prompt, images, err := chatPrompt(c.Request.Context(), m, r.Tokenize, opts, msgs, req.Tools)
if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return

View File

@@ -15,7 +15,6 @@ import (
"github.com/gin-gonic/gin"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/llm"
)
@@ -89,7 +88,6 @@ func TestCreateFromBin(t *testing.T) {
p := t.TempDir()
t.Setenv("OLLAMA_MODELS", p)
envconfig.LoadConfig()
var s Server
w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -117,7 +115,6 @@ func TestCreateFromModel(t *testing.T) {
p := t.TempDir()
t.Setenv("OLLAMA_MODELS", p)
envconfig.LoadConfig()
var s Server
w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -160,7 +157,6 @@ func TestCreateRemovesLayers(t *testing.T) {
p := t.TempDir()
t.Setenv("OLLAMA_MODELS", p)
envconfig.LoadConfig()
var s Server
w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -209,7 +205,6 @@ func TestCreateUnsetsSystem(t *testing.T) {
p := t.TempDir()
t.Setenv("OLLAMA_MODELS", p)
envconfig.LoadConfig()
var s Server
w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -267,7 +262,6 @@ func TestCreateMergeParameters(t *testing.T) {
p := t.TempDir()
t.Setenv("OLLAMA_MODELS", p)
envconfig.LoadConfig()
var s Server
w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -372,7 +366,6 @@ func TestCreateReplacesMessages(t *testing.T) {
p := t.TempDir()
t.Setenv("OLLAMA_MODELS", p)
envconfig.LoadConfig()
var s Server
w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -450,7 +443,6 @@ func TestCreateTemplateSystem(t *testing.T) {
p := t.TempDir()
t.Setenv("OLLAMA_MODELS", p)
envconfig.LoadConfig()
var s Server
w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -534,7 +526,6 @@ func TestCreateLicenses(t *testing.T) {
p := t.TempDir()
t.Setenv("OLLAMA_MODELS", p)
envconfig.LoadConfig()
var s Server
w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -582,7 +573,6 @@ func TestCreateDetectTemplate(t *testing.T) {
p := t.TempDir()
t.Setenv("OLLAMA_MODELS", p)
envconfig.LoadConfig()
var s Server
t.Run("matched", func(t *testing.T) {

View File

@@ -10,7 +10,6 @@ import (
"github.com/gin-gonic/gin"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/types/model"
)
@@ -19,7 +18,6 @@ func TestDelete(t *testing.T) {
p := t.TempDir()
t.Setenv("OLLAMA_MODELS", p)
envconfig.LoadConfig()
var s Server

View File

@@ -9,14 +9,12 @@ import (
"github.com/gin-gonic/gin"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/envconfig"
)
func TestList(t *testing.T) {
gin.SetMode(gin.TestMode)
t.Setenv("OLLAMA_MODELS", t.TempDir())
envconfig.LoadConfig()
expectNames := []string{
"mistral:7b-instruct-q4_0",

View File

@@ -19,7 +19,6 @@ import (
"github.com/stretchr/testify/require"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/openai"
"github.com/ollama/ollama/parser"
@@ -347,7 +346,6 @@ func Test_Routes(t *testing.T) {
}
t.Setenv("OLLAMA_MODELS", t.TempDir())
envconfig.LoadConfig()
s := &Server{}
router := s.GenerateRoutes()
@@ -378,7 +376,6 @@ func Test_Routes(t *testing.T) {
func TestCase(t *testing.T) {
t.Setenv("OLLAMA_MODELS", t.TempDir())
envconfig.LoadConfig()
cases := []string{
"mistral",
@@ -458,7 +455,6 @@ func TestCase(t *testing.T) {
func TestShow(t *testing.T) {
t.Setenv("OLLAMA_MODELS", t.TempDir())
envconfig.LoadConfig()
var s Server

View File

@@ -5,9 +5,11 @@ import (
"errors"
"fmt"
"log/slog"
"os"
"reflect"
"runtime"
"sort"
"strconv"
"strings"
"sync"
"time"
@@ -59,11 +61,12 @@ var defaultParallel = 4
var ErrMaxQueue = fmt.Errorf("server busy, please try again. maximum pending requests exceeded")
func InitScheduler(ctx context.Context) *Scheduler {
maxQueue := envconfig.MaxQueue()
sched := &Scheduler{
pendingReqCh: make(chan *LlmRequest, envconfig.MaxQueuedRequests),
finishedReqCh: make(chan *LlmRequest, envconfig.MaxQueuedRequests),
expiredCh: make(chan *runnerRef, envconfig.MaxQueuedRequests),
unloadedCh: make(chan interface{}, envconfig.MaxQueuedRequests),
pendingReqCh: make(chan *LlmRequest, maxQueue),
finishedReqCh: make(chan *LlmRequest, maxQueue),
expiredCh: make(chan *runnerRef, maxQueue),
unloadedCh: make(chan interface{}, maxQueue),
loaded: make(map[string]*runnerRef),
newServerFn: llm.NewLlamaServer,
getGpuFn: gpu.GetGPUInfo,
@@ -126,7 +129,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
slog.Debug("pending request cancelled or timed out, skipping scheduling")
continue
}
numParallel := envconfig.NumParallel
numParallel := int(envconfig.NumParallel())
// TODO (jmorganca): multimodal models don't support parallel yet
// see https://github.com/ollama/ollama/issues/4165
if len(pending.model.ProjectorPaths) > 0 && numParallel != 1 {
@@ -148,7 +151,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
pending.useLoadedRunner(runner, s.finishedReqCh)
break
}
} else if envconfig.MaxRunners > 0 && loadedCount >= envconfig.MaxRunners {
} else if envconfig.MaxRunners() > 0 && loadedCount >= int(envconfig.MaxRunners()) {
slog.Debug("max runners achieved, unloading one to make room", "runner_count", loadedCount)
runnerToExpire = s.findRunnerToUnload()
} else {
@@ -161,7 +164,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
gpus = s.getGpuFn()
}
if envconfig.MaxRunners <= 0 {
if envconfig.MaxRunners() <= 0 {
// No user specified MaxRunners, so figure out what automatic setting to use
// If all GPUs have reliable free memory reporting, defaultModelsPerGPU * the number of GPUs
// if any GPU has unreliable free memory reporting, 1x the number of GPUs
@@ -173,11 +176,13 @@ func (s *Scheduler) processPending(ctx context.Context) {
}
}
if allReliable {
envconfig.MaxRunners = defaultModelsPerGPU * len(gpus)
// HACK
os.Setenv("OLLAMA_MAX_LOADED_MODELS", strconv.Itoa(defaultModelsPerGPU*len(gpus)))
slog.Debug("updating default concurrency", "OLLAMA_MAX_LOADED_MODELS", envconfig.MaxRunners, "gpu_count", len(gpus))
} else {
// HACK
os.Setenv("OLLAMA_MAX_LOADED_MODELS", strconv.Itoa(len(gpus)))
slog.Info("one or more GPUs detected that are unable to accurately report free memory - disabling default concurrency")
envconfig.MaxRunners = len(gpus)
}
}
@@ -212,9 +217,12 @@ func (s *Scheduler) processPending(ctx context.Context) {
} else if loadedCount == 0 {
// No models loaded. Load the model but prefer the best fit.
slog.Debug("loading first model", "model", pending.model.ModelPath)
g := pickBestFitGPUs(pending, ggml, gpus, &numParallel)
g := pickBestFullFitByLibrary(pending, ggml, gpus, &numParallel)
if g != nil {
gpus = g
} else {
// Only allow partial loads when this is the first model
gpus = pickBestPartialFitByLibrary(pending, ggml, gpus, &numParallel)
}
s.loadFn(pending, ggml, gpus, numParallel)
break
@@ -231,7 +239,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
// Update free memory from currently loaded models
s.updateFreeSpace(availGpus)
fitGpus := pickBestFitGPUs(pending, ggml, availGpus, &numParallel)
fitGpus := pickBestFullFitByLibrary(pending, ggml, availGpus, &numParallel)
if fitGpus != nil {
slog.Debug("new model fits with existing models, loading")
s.loadFn(pending, ggml, fitGpus, numParallel)
@@ -401,7 +409,7 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList,
if numParallel < 1 {
numParallel = 1
}
sessionDuration := envconfig.KeepAlive
sessionDuration := envconfig.KeepAlive()
if req.sessionDuration != nil {
sessionDuration = req.sessionDuration.Duration
}
@@ -668,11 +676,12 @@ func (a ByDuration) Less(i, j int) bool {
// func (a BySize) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
// func (a BySize) Less(i, j int) bool { return a[i].estimatedVRAM < a[j].estimatedVRAM }
// pickBestFitGPUs will try to find the optimal placement of the model in the available GPUs where the model fully fits
// pickBestFullFitByLibrary will try to find the optimal placement of the model in the available GPUs where the model fully fits
// The list of GPUs returned will always be the same brand (library)
// If the model can not be fit fully within the available GPU(s) nil is returned
// If numParallel is <= 0, this will attempt try to optimize parallism based on available VRAM, and adjust
// opts.NumCtx accordingly
func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
var estimatedVRAM uint64
var numParallelToTry []int
@@ -695,7 +704,7 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numP
// First attempt to fit the model into a single GPU
for _, p := range numParallelToTry {
req.opts.NumCtx = req.origNumCtx * p
if !envconfig.SchedSpread {
if !envconfig.SchedSpread() {
for _, g := range sgl {
if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
@@ -723,6 +732,25 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numP
return nil
}
// If multiple Libraries are detected, pick the Library which loads the most layers for the model
func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
*numParallel = 1
byLibrary := gpus.ByLibrary()
if len(byLibrary) <= 1 {
return gpus
}
var bestEstimate uint64
var bestFit int
for i, gl := range byLibrary {
_, estimatedVRAM := llm.PredictServerFit(gl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts)
if estimatedVRAM > bestEstimate {
bestEstimate = estimatedVRAM
bestFit = i
}
}
return byLibrary[bestFit]
}
// findRunnerToUnload finds a runner to unload to make room for a new model
func (s *Scheduler) findRunnerToUnload() *runnerRef {
s.loadedMu.Lock()

View File

@@ -12,7 +12,6 @@ import (
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/app/lifecycle"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/gpu"
"github.com/ollama/ollama/llm"
@@ -272,7 +271,7 @@ func TestRequestsMultipleLoadedModels(t *testing.T) {
c.req.opts.NumGPU = 0 // CPU load, will be allowed
d := newScenarioRequest(t, ctx, "ollama-model-3c", 30, nil) // Needs prior unloaded
envconfig.MaxRunners = 1
t.Setenv("OLLAMA_MAX_LOADED_MODELS", "1")
s.newServerFn = a.newServer
slog.Info("a")
s.pendingReqCh <- a.req
@@ -291,7 +290,7 @@ func TestRequestsMultipleLoadedModels(t *testing.T) {
require.Len(t, s.loaded, 1)
s.loadedMu.Unlock()
envconfig.MaxRunners = 0
t.Setenv("OLLAMA_MAX_LOADED_MODELS", "0")
s.newServerFn = b.newServer
slog.Info("b")
s.pendingReqCh <- b.req
@@ -362,7 +361,7 @@ func TestGetRunner(t *testing.T) {
a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, &api.Duration{Duration: 2 * time.Millisecond})
b := newScenarioRequest(t, ctx, "ollama-model-1b", 10, &api.Duration{Duration: 2 * time.Millisecond})
c := newScenarioRequest(t, ctx, "ollama-model-1c", 10, &api.Duration{Duration: 2 * time.Millisecond})
envconfig.MaxQueuedRequests = 1
t.Setenv("OLLAMA_MAX_QUEUE", "1")
s := InitScheduler(ctx)
s.getGpuFn = getGpuFn
s.getCpuFn = getCpuFn
@@ -666,11 +665,50 @@ func TestAlreadyCanceled(t *testing.T) {
require.Empty(t, scenario1a.req.successCh)
}
func TestHomogeneousGPUs(t *testing.T) {
ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
defer done()
s := InitScheduler(ctx)
s.getGpuFn = func() gpu.GpuInfoList {
// Set memory values to require the model to be spread
gpus := []gpu.GpuInfo{
{Library: "cuda"},
{Library: "rocm"},
}
gpus[0].TotalMemory = 1 * format.GibiByte
gpus[0].FreeMemory = 256 * format.MebiByte
gpus[1].TotalMemory = 1 * format.GibiByte
gpus[1].FreeMemory = 256 * format.MebiByte
return gpus
}
s.getCpuFn = getCpuFn
a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
require.Len(t, gpus, 1)
return a.newServer(gpus, model, ggml, adapters, projectors, opts, numParallel)
}
slog.Info("a")
s.pendingReqCh <- a.req
require.Len(t, s.pendingReqCh, 1)
s.Run(ctx)
select {
case resp := <-a.req.successCh:
require.Equal(t, resp.llama, a.srv)
require.Empty(t, s.pendingReqCh)
require.Empty(t, a.req.errCh)
case err := <-a.req.errCh:
t.Fatal(err.Error())
case <-ctx.Done():
t.Fatal("timeout")
}
}
type mockLlm struct {
pingResp error
waitResp error
completionResp error
embedResp [][]float32
embedResp *llm.EmbedResponse
embedRespErr error
tokenizeResp []int
tokenizeRespErr error
@@ -688,7 +726,7 @@ func (s *mockLlm) WaitUntilRunning(ctx context.Context) error { return s.waitRes
func (s *mockLlm) Completion(ctx context.Context, req llm.CompletionRequest, fn func(llm.CompletionResponse)) error {
return s.completionResp
}
func (s *mockLlm) Embed(ctx context.Context, input []string) ([][]float32, error) {
func (s *mockLlm) Embed(ctx context.Context, input []string) (*llm.EmbedResponse, error) {
return s.embedResp, s.embedRespErr
}
func (s *mockLlm) Tokenize(ctx context.Context, content string) ([]int, error) {