Compare commits


9 Commits

Author SHA1 Message Date
Daniel Hiltgen
7bec2724a5 integration: fix embedding tests error handling (#10478)
The cleanup routine from InitServerConnection should run in the test case's defer so that failures are properly detected and the server logs are reported.
2025-04-29 11:57:54 -07:00
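A minimal sketch of the pattern this commit describes, using the helper names that appear in the embedding test diff further down (InitServerConnection, embedTestHelper); the connection and its cleanup now live in the test case itself, so the deferred cleanup runs even when a helper fails:

```go
func TestAllMiniLMEmbed(t *testing.T) {
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
	defer cancel()

	// Connect once per test; the deferred cleanup also runs when the test
	// fails, which is what detects problems and reports the server logs.
	client, _, cleanup := InitServerConnection(ctx, t)
	defer cleanup()

	req := api.EmbedRequest{Model: "all-minilm", Input: "why is the sky blue?"}
	if _, err := embedTestHelper(ctx, client, t, req); err != nil {
		t.Fatalf("error: %v", err)
	}
}
```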
Jesse Gross
a27462b708 ollamarunner: Temporarily disable worst case graph preallocation
When we later have a large batch running purely on the CPU, this
results in the error:
GGML_ASSERT(talloc->buffer_id >= 0)

Disabling this means that we will incrementally reallocate memory
as the graph grows.

Fixes #10410
2025-04-29 11:04:58 -07:00
crStiv
6bf0b8193a readme: fix typos (#10399) 2025-04-29 10:30:44 -07:00
Devon Rifkin
db428adbb8 Merge pull request #10468 from ollama/drifkin/num-parallel-1 2025-04-29 10:21:36 -07:00
Devon Rifkin
fe5b9bb21b lower default num parallel to 2
This is in part to "pay" for #10452, which doubled the default context length. The combination isn't fully neutral, though: even though the old 4x2k limit and the new 2x4k limit are memory-equivalent, the 1x fallback is larger with 4k.
2025-04-29 02:04:14 -07:00
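In rough numbers: the old defaults budgeted 4 × 2048 = 8192 tokens of context across parallel sequences, and the new defaults budget 2 × 4096 = 8192, so the steady-state memory is equivalent; the single-sequence fallback, however, grows from 1 × 2048 to 1 × 4096 tokens.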
Devon Rifkin
6ec71d8fb6 Merge pull request #10452 from ollama/drifkin/4096-context-length
config: update default context length to 4096
2025-04-28 17:13:51 -07:00
Devon Rifkin
44b466eeb2 config: update default context length to 4096 2025-04-28 17:03:27 -07:00
Devon Rifkin
a25f3f8260 Merge pull request #10451 from ollama/revert-10364-drifkin/context-length
Revert "increase default context length to 4096"
2025-04-28 17:02:10 -07:00
Devon Rifkin
dd93e1af85 Revert "increase default context length to 4096 (#10364)"
This reverts commit 424f648632.
2025-04-28 16:54:11 -07:00
12 changed files with 50 additions and 576 deletions

View File

@@ -285,7 +285,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Bionic GPT](https://github.com/bionic-gpt/bionic-gpt)
- [HTML UI](https://github.com/rtcfirefly/ollama-ui)
- [Saddle](https://github.com/jikkuatwork/saddle)
- [TagSpaces](https://www.tagspaces.org) (A platform for file based apps, [utilizing Ollama](https://docs.tagspaces.org/ai/) for the generation of tags and descriptions)
- [TagSpaces](https://www.tagspaces.org) (A platform for file-based apps, [utilizing Ollama](https://docs.tagspaces.org/ai/) for the generation of tags and descriptions)
- [Chatbot UI](https://github.com/ivanfioravanti/chatbot-ollama)
- [Chatbot UI v2](https://github.com/mckaywrigley/chatbot-ui)
- [Typescript UI](https://github.com/ollama-interface/Ollama-Gui?tab=readme-ov-file)
@@ -325,14 +325,14 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [RWKV-Runner](https://github.com/josStorer/RWKV-Runner) (RWKV offline LLM deployment tool, also usable as a client for ChatGPT and Ollama)
- [Ollama Grid Search](https://github.com/dezoito/ollama-grid-search) (app to evaluate and compare models)
- [Olpaka](https://github.com/Otacon/olpaka) (User-friendly Flutter Web App for Ollama)
- [Casibase](https://casibase.org) (An open source AI knowledge base and dialogue system combining the latest RAG, SSO, ollama support and multiple large language models.)
- [Casibase](https://casibase.org) (An open source AI knowledge base and dialogue system combining the latest RAG, SSO, ollama support, and multiple large language models.)
- [OllamaSpring](https://github.com/CrazyNeil/OllamaSpring) (Ollama Client for macOS)
- [LLocal.in](https://github.com/kartikm7/llocal) (Easy to use Electron Desktop Client for Ollama)
- [Shinkai Desktop](https://github.com/dcSpark/shinkai-apps) (Two click install Local AI using Ollama + Files + RAG)
- [AiLama](https://github.com/zeyoyt/ailama) (A Discord User App that allows you to interact with Ollama anywhere in discord )
- [AiLama](https://github.com/zeyoyt/ailama) (A Discord User App that allows you to interact with Ollama anywhere in Discord)
- [Ollama with Google Mesop](https://github.com/rapidarchitect/ollama_mesop/) (Mesop Chat Client implementation with Ollama)
- [R2R](https://github.com/SciPhi-AI/R2R) (Open-source RAG engine)
- [Ollama-Kis](https://github.com/elearningshow/ollama-kis) (A simple easy to use GUI with sample custom LLM for Drivers Education)
- [Ollama-Kis](https://github.com/elearningshow/ollama-kis) (A simple easy-to-use GUI with sample custom LLM for Drivers Education)
- [OpenGPA](https://opengpa.org) (Open-source offline-first Enterprise Agentic Application)
- [Painting Droid](https://github.com/mateuszmigas/painting-droid) (Painting app with AI integrations)
- [Kerlig AI](https://www.kerlig.com/) (AI writing assistant for macOS)
@@ -341,16 +341,16 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [LLMStack](https://github.com/trypromptly/LLMStack) (No-code multi-agent framework to build LLM agents and workflows)
- [BoltAI for Mac](https://boltai.com) (AI Chat Client for Mac)
- [Harbor](https://github.com/av/harbor) (Containerized LLM Toolkit with Ollama as default backend)
- [PyGPT](https://github.com/szczyglis-dev/py-gpt) (AI desktop assistant for Linux, Windows and Mac)
- [Alpaca](https://github.com/Jeffser/Alpaca) (An Ollama client application for linux and macos made with GTK4 and Adwaita)
- [PyGPT](https://github.com/szczyglis-dev/py-gpt) (AI desktop assistant for Linux, Windows, and Mac)
- [Alpaca](https://github.com/Jeffser/Alpaca) (An Ollama client application for Linux and macOS made with GTK4 and Adwaita)
- [AutoGPT](https://github.com/Significant-Gravitas/AutoGPT/blob/master/docs/content/platform/ollama.md) (AutoGPT Ollama integration)
- [Go-CREW](https://www.jonathanhecl.com/go-crew/) (Powerful Offline RAG in Golang)
- [PartCAD](https://github.com/openvmp/partcad/) (CAD model generation with OpenSCAD and CadQuery)
- [Ollama4j Web UI](https://github.com/ollama4j/ollama4j-web-ui) - Java-based Web UI for Ollama built with Vaadin, Spring Boot and Ollama4j
- [Ollama4j Web UI](https://github.com/ollama4j/ollama4j-web-ui) - Java-based Web UI for Ollama built with Vaadin, Spring Boot, and Ollama4j
- [PyOllaMx](https://github.com/kspviswa/pyOllaMx) - macOS application capable of chatting with both Ollama and Apple MLX models.
- [Cline](https://github.com/cline/cline) - Formerly known as Claude Dev is a VSCode extension for multi-file/whole-repo coding
- [Cherry Studio](https://github.com/kangfenmao/cherry-studio) (Desktop client with Ollama support)
- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy focused LLM chat interface with optional encryption)
- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy-focused LLM chat interface with optional encryption)
- [Archyve](https://github.com/nickthecook/archyve) (RAG-enabling document library)
- [crewAI with Mesop](https://github.com/rapidarchitect/ollama-crew-mesop) (Mesop Web Interface to run crewAI with Ollama)
- [Tkinter-based client](https://github.com/chyok/ollama-gui) (Python tkinter-based Client for Ollama)
@@ -368,7 +368,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [DualMind](https://github.com/tcsenpai/dualmind) (Experimental app allowing two models to talk to each other in the terminal or in a web interface)
- [ollamarama-matrix](https://github.com/h1ddenpr0cess20/ollamarama-matrix) (Ollama chatbot for the Matrix chat protocol)
- [ollama-chat-app](https://github.com/anan1213095357/ollama-chat-app) (Flutter-based chat app)
- [Perfect Memory AI](https://www.perfectmemory.ai/) (Productivity AI assists personalized by what you have seen on your screen, heard and said in the meetings)
- [Perfect Memory AI](https://www.perfectmemory.ai/) (Productivity AI assists personalized by what you have seen on your screen, heard, and said in the meetings)
- [Hexabot](https://github.com/hexastack/hexabot) (A conversational AI builder)
- [Reddit Rate](https://github.com/rapidarchitect/reddit_analyzer) (Search and Rate Reddit topics with a weighted summation)
- [OpenTalkGpt](https://github.com/adarshM84/OpenTalkGpt) (Chrome Extension to manage open-source models supported by Ollama, create custom models, and chat with models from a user-friendly UI)
@@ -386,7 +386,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [ChibiChat](https://github.com/CosmicEventHorizon/ChibiChat) (Kotlin-based Android app to chat with Ollama and Koboldcpp API endpoints)
- [LocalLLM](https://github.com/qusaismael/localllm) (Minimal Web-App to run ollama models on it with a GUI)
- [Ollamazing](https://github.com/buiducnhat/ollamazing) (Web extension to run Ollama models)
- [OpenDeepResearcher-via-searxng](https://github.com/benhaotang/OpenDeepResearcher-via-searxng) (A Deep Research equivent endpoint with Ollama support for running locally)
- [OpenDeepResearcher-via-searxng](https://github.com/benhaotang/OpenDeepResearcher-via-searxng) (A Deep Research equivalent endpoint with Ollama support for running locally)
- [AntSK](https://github.com/AIDotNet/AntSK) (Out-of-the-box & Adaptable RAG Chatbot)
- [MaxKB](https://github.com/1Panel-dev/MaxKB/) (Ready-to-use & flexible RAG Chatbot)
- [yla](https://github.com/danielekp/yla) (Web interface to freely interact with your customized models)
@@ -440,7 +440,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [PowershAI](https://github.com/rrg92/powershai) PowerShell module that brings AI to terminal on Windows, including support for Ollama
- [DeepShell](https://github.com/Abyss-c0re/deepshell) Your self-hosted AI assistant. Interactive Shell, Files and Folders analysis.
- [orbiton](https://github.com/xyproto/orbiton) Configuration-free text editor and IDE with support for tab completion with Ollama.
- [orca-cli](https://github.com/molbal/orca-cli) Ollama Registry CLI Application - Browse, pull and download models from Ollama Registry in your terminal.
- [orca-cli](https://github.com/molbal/orca-cli) Ollama Registry CLI Application - Browse, pull, and download models from Ollama Registry in your terminal.
- [GGUF-to-Ollama](https://github.com/jonathanhecl/gguf-to-ollama) - Importing GGUF to Ollama made easy (multiplatform)
### Apple Vision Pro
@@ -515,7 +515,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Swollama for Swift](https://github.com/marcusziade/Swollama) with [DocC](https://marcusziade.github.io/Swollama/documentation/swollama/)
- [GoLamify](https://github.com/prasad89/golamify)
- [Ollama for Haskell](https://github.com/tusharad/ollama-haskell)
- [multi-llm-ts](https://github.com/nbonamy/multi-llm-ts) (A Typescript/JavaScript library allowing access to different LLM in unified API)
- [multi-llm-ts](https://github.com/nbonamy/multi-llm-ts) (A Typescript/JavaScript library allowing access to different LLM in a unified API)
- [LlmTornado](https://github.com/lofcz/llmtornado) (C# library providing a unified interface for major FOSS & Commercial inference APIs)
- [Ollama for Zig](https://github.com/dravenk/ollama-zig)
- [Abso](https://github.com/lunary-ai/abso) (OpenAI-compatible TypeScript SDK for any LLM provider)
@@ -524,11 +524,11 @@ See the [API documentation](./docs/api.md) for all endpoints.
### Mobile
- [SwiftChat](https://github.com/aws-samples/swift-chat) (Lightning-fast Cross-platform AI chat app with native UI for Android, iOS and iPad)
- [SwiftChat](https://github.com/aws-samples/swift-chat) (Lightning-fast Cross-platform AI chat app with native UI for Android, iOS, and iPad)
- [Enchanted](https://github.com/AugustDev/enchanted)
- [Maid](https://github.com/Mobile-Artificial-Intelligence/maid)
- [Ollama App](https://github.com/JHubi1/ollama-app) (Modern and easy-to-use multi-platform client for Ollama)
- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy focused LLM chat interface with optional encryption)
- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy-focused LLM chat interface with optional encryption)
- [Ollama Android Chat](https://github.com/sunshine0523/OllamaServer) (No need for Termux, start the Ollama service with one click on an Android device)
- [Reins](https://github.com/ibrahimcetin/reins) (Easily tweak parameters, customize system prompts per chat, and enhance your AI experiments with reasoning model support.)
@@ -552,7 +552,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Obsidian Local GPT plugin](https://github.com/pfrankov/obsidian-local-gpt)
- [Open Interpreter](https://docs.openinterpreter.com/language-model-setup/local-models/ollama)
- [Llama Coder](https://github.com/ex3ndr/llama-coder) (Copilot alternative using Ollama)
- [Ollama Copilot](https://github.com/bernardo-bruning/ollama-copilot) (Proxy that allows you to use ollama as a copilot like Github copilot)
- [Ollama Copilot](https://github.com/bernardo-bruning/ollama-copilot) (Proxy that allows you to use Ollama as a copilot like GitHub Copilot)
- [twinny](https://github.com/rjmacarthy/twinny) (Copilot and Copilot chat alternative using Ollama)
- [Wingman-AI](https://github.com/RussellCanfield/wingman-ai) (Copilot code and chat alternative using Ollama and Hugging Face)
- [Page Assist](https://github.com/n4ze3m/page-assist) (Chrome Extension)
@@ -562,8 +562,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Discord-Ollama Chat Bot](https://github.com/kevinthedang/discord-ollama) (Generalized TypeScript Discord Bot w/ Tuning Documentation)
- [ChatGPTBox: All in one browser extension](https://github.com/josStorer/chatGPTBox) with [Integrating Tutorial](https://github.com/josStorer/chatGPTBox/issues/616#issuecomment-1975186467)
- [Discord AI chat/moderation bot](https://github.com/rapmd73/Companion) Chat/moderation bot written in python. Uses Ollama to create personalities.
- [Headless Ollama](https://github.com/nischalj10/headless-ollama) (Scripts to automatically install ollama client & models on any OS for apps that depends on ollama server)
- [Terraform AWS Ollama & Open WebUI](https://github.com/xuyangbocn/terraform-aws-self-host-llm) (A Terraform module to deploy on AWS a ready-to-use Ollama service, together with its front end Open WebUI service.)
- [Headless Ollama](https://github.com/nischalj10/headless-ollama) (Scripts to automatically install ollama client & models on any OS for apps that depend on ollama server)
- [Terraform AWS Ollama & Open WebUI](https://github.com/xuyangbocn/terraform-aws-self-host-llm) (A Terraform module to deploy on AWS a ready-to-use Ollama service, together with its front-end Open WebUI service.)
- [node-red-contrib-ollama](https://github.com/jakubburkiewicz/node-red-contrib-ollama)
- [Local AI Helper](https://github.com/ivostoykov/localAI) (Chrome and Firefox extensions that enable interactions with the active tab and customisable API endpoints. Includes secure storage for user prompts.)
- [vnc-lm](https://github.com/jake83741/vnc-lm) (Discord bot for messaging with LLMs through Ollama and LiteLLM. Seamlessly move between local and flagship models.)

View File

@@ -1407,7 +1407,6 @@ func NewCLI() *cobra.Command {
envVars["OLLAMA_LLM_LIBRARY"],
envVars["OLLAMA_GPU_OVERHEAD"],
envVars["OLLAMA_LOAD_TIMEOUT"],
envVars["OLLAMA_CONTEXT_LENGTH"],
})
default:
appendEnvDocs(cmd, envs)

View File

@@ -20,7 +20,7 @@ Please refer to the [GPU docs](./gpu.md).
## How can I specify the context window size?
By default, Ollama uses a context window size of 4096 tokens, unless you have a single GPU with <= 4 GB of VRAM, in which case it will default to 2048 tokens.
By default, Ollama uses a context window size of 4096 tokens.
This can be overridden with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context window to 8K, use:
@@ -31,7 +31,7 @@ OLLAMA_CONTEXT_LENGTH=8192 ollama serve
To change this when using `ollama run`, use `/set parameter`:
```shell
/set parameter num_ctx 8192
/set parameter num_ctx 4096
```
When using the API, specify the `num_ctx` parameter:
@@ -41,7 +41,7 @@ curl http://localhost:11434/api/generate -d '{
"model": "llama3.2",
"prompt": "Why is the sky blue?",
"options": {
"num_ctx": 8192
"num_ctx": 4096
}
}'
```

View File

@@ -169,7 +169,7 @@ var (
// Enable the new Ollama engine
NewEngine = Bool("OLLAMA_NEW_ENGINE")
// ContextLength sets the default context length
ContextLength = Int64("OLLAMA_CONTEXT_LENGTH", -1)
ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 4096)
)
func String(s string) func() string {
@@ -227,20 +227,6 @@ func Uint64(key string, defaultValue uint64) func() uint64 {
}
}
func Int64(key string, defaultValue int64) func() int64 {
return func() int64 {
if s := Var(key); s != "" {
if n, err := strconv.ParseInt(s, 10, 64); err != nil {
slog.Warn("invalid environment variable, using default", "key", key, "value", s, "default", defaultValue)
} else {
return n
}
}
return defaultValue
}
}
// Set aside VRAM per GPU
var GpuOverhead = Uint64("OLLAMA_GPU_OVERHEAD", 0)
@@ -269,7 +255,7 @@ func AsMap() map[string]EnvVar {
"OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", AllowedOrigins(), "A comma separated list of allowed origins"},
"OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
"OLLAMA_MULTIUSER_CACHE": {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"},
"OLLAMA_CONTEXT_LENGTH": {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default 4096 or 2048 with low VRAM)"},
"OLLAMA_CONTEXT_LENGTH": {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default: 4096)"},
"OLLAMA_NEW_ENGINE": {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"},
// Informational
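For context, a minimal standalone sketch of the env-helper pattern this change restores — not the repository code verbatim; it mirrors the removed Int64 helper above, with os.Getenv standing in for the package's own Var lookup. The getter parses the variable on each call and falls back to the default, so OLLAMA_CONTEXT_LENGTH now defaults to 4096 rather than using a -1 sentinel:

```go
package envconfig

import (
	"log/slog"
	"os"
	"strconv"
)

// Uint returns a getter that reads key from the environment and falls back
// to defaultValue when the variable is unset or not a valid unsigned integer.
func Uint(key string, defaultValue uint) func() uint {
	return func() uint {
		if s := os.Getenv(key); s != "" {
			if n, err := strconv.ParseUint(s, 10, 64); err != nil {
				slog.Warn("invalid environment variable, using default", "key", key, "value", s, "default", defaultValue)
			} else {
				return uint(n)
			}
		}
		return defaultValue
	}
}

// ContextLength defaults to 4096 unless OLLAMA_CONTEXT_LENGTH overrides it.
var ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 4096)
```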

View File

@@ -278,9 +278,9 @@ func TestVar(t *testing.T) {
}
func TestContextLength(t *testing.T) {
cases := map[string]int64{
"": -1,
"4096": 4096,
cases := map[string]uint{
"": 4096,
"2048": 2048,
}
for k, v := range cases {

View File

@@ -34,13 +34,15 @@ func cosineSimilarity[V float32 | float64](v1, v2 []V) V {
func TestAllMiniLMEmbeddings(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
req := api.EmbeddingRequest{
Model: "all-minilm",
Prompt: "why is the sky blue?",
}
res, err := embeddingTestHelper(ctx, t, req)
res, err := embeddingTestHelper(ctx, client, t, req)
if err != nil {
t.Fatalf("error: %v", err)
@@ -62,13 +64,15 @@ func TestAllMiniLMEmbeddings(t *testing.T) {
func TestAllMiniLMEmbed(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
req := api.EmbedRequest{
Model: "all-minilm",
Input: "why is the sky blue?",
}
res, err := embedTestHelper(ctx, t, req)
res, err := embedTestHelper(ctx, client, t, req)
if err != nil {
t.Fatalf("error: %v", err)
@@ -98,13 +102,15 @@ func TestAllMiniLMEmbed(t *testing.T) {
func TestAllMiniLMBatchEmbed(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
req := api.EmbedRequest{
Model: "all-minilm",
Input: []string{"why is the sky blue?", "why is the grass green?"},
}
res, err := embedTestHelper(ctx, t, req)
res, err := embedTestHelper(ctx, client, t, req)
if err != nil {
t.Fatalf("error: %v", err)
@@ -144,6 +150,8 @@ func TestAllMiniLMBatchEmbed(t *testing.T) {
func TestAllMiniLMEmbedTruncate(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
truncTrue, truncFalse := true, false
@@ -182,7 +190,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
res := make(map[string]*api.EmbedResponse)
for _, req := range reqs {
response, err := embedTestHelper(ctx, t, req.Request)
response, err := embedTestHelper(ctx, client, t, req.Request)
if err != nil {
t.Fatalf("error: %v", err)
}
@@ -198,7 +206,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
}
// check that truncate set to false returns an error if context length is exceeded
_, err := embedTestHelper(ctx, t, api.EmbedRequest{
_, err := embedTestHelper(ctx, client, t, api.EmbedRequest{
Model: "all-minilm",
Input: "why is the sky blue?",
Truncate: &truncFalse,
@@ -210,9 +218,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
}
}
func embeddingTestHelper(ctx context.Context, t *testing.T, req api.EmbeddingRequest) (*api.EmbeddingResponse, error) {
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
func embeddingTestHelper(ctx context.Context, client *api.Client, t *testing.T, req api.EmbeddingRequest) (*api.EmbeddingResponse, error) {
if err := PullIfMissing(ctx, client, req.Model); err != nil {
t.Fatalf("failed to pull model %s: %v", req.Model, err)
}
@@ -226,9 +232,7 @@ func embeddingTestHelper(ctx context.Context, t *testing.T, req api.EmbeddingReq
return response, nil
}
func embedTestHelper(ctx context.Context, t *testing.T, req api.EmbedRequest) (*api.EmbedResponse, error) {
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
func embedTestHelper(ctx context.Context, client *api.Client, t *testing.T, req api.EmbedRequest) (*api.EmbedResponse, error) {
if err := PullIfMissing(ctx, client, req.Model); err != nil {
t.Fatalf("failed to pull model %s: %v", req.Model, err)
}

View File

@@ -723,7 +723,9 @@ func (m *multiLPath) String() string {
return strings.Join(*m, ", ")
}
func (s *Server) reserveWorstCaseGraph() error {
// TODO(jessegross): This is causing tensor allocation failures with large batches when not offloaded
// to the GPU
/*func (s *Server) reserveWorstCaseGraph() error {
ctx := s.model.Backend().NewContext()
defer ctx.Close()
@@ -766,7 +768,7 @@ func (s *Server) reserveWorstCaseGraph() error {
}
return nil
}
}*/
func (s *Server) loadModel(
ctx context.Context,
@@ -803,10 +805,10 @@ func (s *Server) loadModel(
s.seqs = make([]*Sequence, s.parallel)
s.seqsSem = semaphore.NewWeighted(int64(s.parallel))
err = s.reserveWorstCaseGraph()
/*err = s.reserveWorstCaseGraph()
if err != nil {
panic(err)
}
}*/
s.status = llm.ServerStatusReady
s.ready.Done()

View File

@@ -1,226 +0,0 @@
package server
import (
"fmt"
"regexp"
"strconv"
"strings"
"github.com/ollama/ollama/api"
)
var (
pythonFuncRegex = regexp.MustCompile(`(\w+)\((.*?)\)`)
braces = map[rune]rune{
'[': ']',
'{': '}',
'(': ')',
'"': '"',
'\'': '\'',
}
)
// parsePythonValue converts a Python value string to its appropriate Go type
func parsePythonValue(value string) (any, error) {
value = strings.TrimSpace(value)
// string
if (strings.HasPrefix(value, "\"") && strings.HasSuffix(value, "\"")) ||
(strings.HasPrefix(value, "'") && strings.HasSuffix(value, "'")) {
// Remove quotes
result := value[1 : len(value)-1]
return result, nil
}
// bool
switch strings.ToLower(value) {
case "true":
return true, nil
case "false":
return false, nil
case "none":
return nil, nil
}
// int
if i, err := strconv.Atoi(value); err == nil {
return i, nil
}
// float
if f, err := strconv.ParseFloat(value, 64); err == nil {
return f, nil
}
// list
if strings.HasPrefix(value, "[") && strings.HasSuffix(value, "]") {
listStr := value[1 : len(value)-1]
var list []any
stack := []rune{}
start := 0
for i, char := range listStr {
if len(stack) != 0 && char == braces[stack[len(stack)-1]] {
stack = stack[:len(stack)-1]
} else if _, ok := braces[char]; ok {
stack = append(stack, char)
}
if len(stack) == 0 && (char == ',' || i == len(listStr)-1) {
end := i
if i == len(listStr)-1 {
end = i + 1
}
item := strings.TrimSpace(listStr[start:end])
if val, err := parsePythonValue(item); err == nil {
list = append(list, val)
} else {
return nil, fmt.Errorf("invalid list item: %s", item)
}
start = i + 1
}
}
return list, nil
}
// dictionary
if strings.HasPrefix(value, "{") && strings.HasSuffix(value, "}") && strings.Contains(value, ":") {
dictStr := value[1 : len(value)-1]
dict := make(map[any]any)
stack := []rune{}
start := 0
for i, char := range dictStr {
if len(stack) != 0 && char == braces[stack[len(stack)-1]] {
stack = stack[:len(stack)-1]
} else if _, ok := braces[char]; ok {
stack = append(stack, char)
}
if len(stack) == 0 && (char == ',' || i == len(dictStr)-1) {
end := i
if i == len(dictStr)-1 {
end = i + 1
}
item := strings.TrimSpace(dictStr[start:end])
kv := strings.SplitN(item, ":", 2)
if len(kv) != 2 {
return nil, fmt.Errorf("invalid dictionary key-value pair: %s", item)
}
key, err := parsePythonValue(strings.TrimSpace(kv[0]))
if err != nil {
return nil, fmt.Errorf("invalid dictionary key: %s", kv[0])
}
val, err := parsePythonValue(strings.TrimSpace(kv[1]))
if err != nil {
return nil, fmt.Errorf("invalid dictionary value: %s", kv[1])
}
dict[key] = val
start = i + 1
}
}
return dict, nil
}
// sets (stored as lists)
if strings.HasPrefix(value, "{") && strings.HasSuffix(value, "}") {
setStr := value[1 : len(value)-1]
var list []any
stack := []rune{}
start := 0
for i, char := range setStr {
if len(stack) != 0 && char == braces[stack[len(stack)-1]] {
stack = stack[:len(stack)-1]
} else if _, ok := braces[char]; ok {
stack = append(stack, char)
}
if len(stack) == 0 && (char == ',' || i == len(setStr)-1) {
end := i
if i == len(setStr)-1 {
end = i + 1
}
item := strings.TrimSpace(setStr[start:end])
if val, err := parsePythonValue(item); err == nil {
list = append(list, val)
} else {
return nil, fmt.Errorf("invalid set item: %s", item)
}
start = i + 1
}
}
return list, nil
}
return nil, fmt.Errorf("invalid Python value: %s", value)
}
// parsePythonToolCall parses Python function calls from a string
// it supports keyword arguments, as well as multiple functions in a single string
func parsePythonToolCall(s string) ([]api.ToolCall, error) {
matches := pythonFuncRegex.FindAllStringSubmatchIndex(s, -1)
if len(matches) == 0 {
return nil, fmt.Errorf("no Python function calls found")
}
var toolCalls []api.ToolCall
for _, match := range matches {
name := s[match[2]:match[3]]
args := s[match[4]:match[5]]
var arguments api.ToolCallFunctionArguments
if len(args) == 0 {
toolCalls = append(toolCalls, api.ToolCall{
Function: api.ToolCallFunction{
Name: name,
},
})
continue
}
start := 0
stack := []rune{}
for i, char := range args {
if len(stack) != 0 && char == braces[stack[len(stack)-1]] {
stack = stack[:len(stack)-1]
} else if _, ok := braces[char]; ok {
stack = append(stack, char)
}
if len(stack) == 0 && (char == ',' || i == len(args)-1) {
end := i
if i == len(args)-1 {
end = i + 1
}
kv := strings.SplitN(args[start:end], "=", 2)
if len(kv) == 2 {
key := strings.TrimSpace(kv[0])
valueStr := strings.TrimSpace(kv[1])
// Parse the value into appropriate type
value, err := parsePythonValue(valueStr)
if err != nil {
return nil, fmt.Errorf("failed to parse value for key %q: %v", key, err)
}
arguments[key] = value
} else {
return nil, fmt.Errorf("invalid argument format: %q", args[start:end])
}
start = i + 1
}
}
if len(arguments) > 0 {
toolCalls = append(toolCalls, api.ToolCall{
Function: api.ToolCallFunction{
Name: name,
Arguments: arguments,
},
})
}
}
if len(toolCalls) > 0 {
return toolCalls, nil
}
return nil, fmt.Errorf("failed to parse any valid tool calls")
}

View File

@@ -1,269 +0,0 @@
package server
import (
"testing"
"github.com/google/go-cmp/cmp"
"github.com/ollama/ollama/api"
)
func TestParsePythonFunctionCall(t *testing.T) {
t1 := api.ToolCall{
Function: api.ToolCallFunction{
Name: "get_current_weather",
Arguments: api.ToolCallFunctionArguments{
"location": "San Francisco, CA",
"format": "fahrenheit",
},
},
}
t2 := api.ToolCall{
Function: api.ToolCallFunction{
Name: "get_forecast",
Arguments: api.ToolCallFunctionArguments{
"days": 5,
"location": "Seattle",
},
},
}
t3 := api.ToolCall{
Function: api.ToolCallFunction{
Name: "get_current_weather",
Arguments: api.ToolCallFunctionArguments{
"list": []any{1, 2, 3},
"int": -1,
"float": 1.23,
"string": "hello",
},
},
}
t4 := api.ToolCall{
Function: api.ToolCallFunction{
Name: "get_current_weather",
},
}
cases := []struct {
name string
input string
want []api.ToolCall
err bool
}{
{
name: "malformed function call - missing closing paren",
input: "get_current_weather(location=\"San Francisco\"",
err: true,
},
{
name: "empty function call",
input: "get_current_weather()",
want: []api.ToolCall{t4},
err: false,
},
{
name: "single valid function call",
input: "get_current_weather(location=\"San Francisco, CA\", format=\"fahrenheit\")",
want: []api.ToolCall{t1},
},
{
name: "multiple valid function calls",
input: "get_current_weather(location=\"San Francisco, CA\", format=\"fahrenheit\") get_forecast(days=5, location=\"Seattle\")",
want: []api.ToolCall{t1, t2},
},
{
name: "multiple valid function calls with list",
input: "get_current_weather(list=[1,2,3], int=-1, float=1.23, string=\"hello\")",
want: []api.ToolCall{t3},
},
{
name: "positional arguments not supported",
input: "get_current_weather(1, 2, 3)",
err: true,
},
{
name: "invalid argument format without equals",
input: "get_current_weather(\"San Francisco\")",
err: true,
},
{
name: "nested lists",
input: "get_current_weather(data=[[1,2],[3,4]])",
want: []api.ToolCall{{
Function: api.ToolCallFunction{
Name: "get_current_weather",
Arguments: api.ToolCallFunctionArguments{
"data": []any{[]any{1, 2}, []any{3, 4}},
},
},
}},
},
{
name: "boolean and none values",
input: "get_current_weather(active=true, enabled=false, value=None)",
want: []api.ToolCall{{
Function: api.ToolCallFunction{
Name: "get_current_weather",
Arguments: api.ToolCallFunctionArguments{
"active": true,
"enabled": false,
"value": nil,
},
},
}},
},
{
name: "single vs double quotes",
input: "get_current_weather(str1='single', str2=\"double\")",
want: []api.ToolCall{{
Function: api.ToolCallFunction{
Name: "get_current_weather",
Arguments: api.ToolCallFunctionArguments{
"str1": "single",
"str2": "double",
},
},
}},
},
{
name: "whitespace handling",
input: "get_current_weather( location = \"San Francisco\" , temp = 72 )",
want: []api.ToolCall{{
Function: api.ToolCallFunction{
Name: "get_current_weather",
Arguments: api.ToolCallFunctionArguments{
"location": "San Francisco",
"temp": 72,
},
},
}},
},
}
for _, tt := range cases {
t.Run(tt.name, func(t *testing.T) {
got, err := parsePythonToolCall(tt.input)
if (err != nil) != tt.err {
t.Fatalf("expected error: %v, got error: %v", tt.err, err)
}
if tt.err {
return
}
if diff := cmp.Diff(got, tt.want); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
})
}
}
func TestParsePythonValue(t *testing.T) {
cases := []struct {
name string
input string
want any
err bool
}{
{
name: "string with double quotes",
input: "\"hello\"",
want: "hello",
},
{
name: "string with single quotes",
input: "'world'",
want: "world",
},
{
name: "integer",
input: "42",
want: 42,
},
{
name: "float",
input: "3.14",
want: 3.14,
},
{
name: "boolean true",
input: "True",
want: true,
},
{
name: "boolean false",
input: "False",
want: false,
},
{
name: "none/null",
input: "None",
want: nil,
},
{
name: "simple list",
input: "[1, 2, 3]",
want: []any{1, 2, 3},
},
{
name: "nested list",
input: "[1, [2, 3], 4]",
want: []any{1, []any{2, 3}, 4},
},
{
name: "mixed type list",
input: "[1, \"two\", 3.0, true]",
want: []any{1, "two", 3.0, true},
},
{
name: "invalid list",
input: "[1, 2,",
want: nil,
err: true,
},
{
name: "dictionaries",
input: "{'a': 1, 'b': 2}",
want: map[any]any{"a": 1, "b": 2},
err: false,
},
{
name: "int dictionary",
input: "{1: 2}",
want: map[any]any{1: 2},
err: false,
},
{
name: "mixed type dictionary",
input: "{'a': 1, 'b': 2.0, 'c': True}",
want: map[any]any{"a": 1, "b": 2.0, "c": true},
err: false,
},
{
name: "invalid dictionary - missing closing brace",
input: "{'a': 1, 'b': 2",
want: nil,
err: true,
},
{
name: "sets",
input: "{1, 2, 3}",
want: []any{1, 2, 3},
err: false,
},
}
for _, tt := range cases {
t.Run(tt.name, func(t *testing.T) {
got, err := parsePythonValue(tt.input)
if (err != nil) != tt.err {
t.Fatalf("expected error: %v, got error: %v", tt.err, err)
}
if tt.err {
return
}
if diff := cmp.Diff(got, tt.want); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
})
}
}

View File

@@ -299,9 +299,6 @@ func TestGenerateChat(t *testing.T) {
{Role: "user", Content: "Hello!"},
},
Stream: &stream,
Options: map[string]any{
"num_ctx": 1024,
},
})
if w.Code != http.StatusOK {
@@ -324,9 +321,6 @@ func TestGenerateChat(t *testing.T) {
{Role: "user", Content: "Hello!"},
},
Stream: &stream,
Options: map[string]any{
"num_ctx": 1024,
},
})
if w.Code != http.StatusOK {
@@ -350,9 +344,6 @@ func TestGenerateChat(t *testing.T) {
{Role: "user", Content: "Help me write tests."},
},
Stream: &stream,
Options: map[string]any{
"num_ctx": 1024,
},
})
if w.Code != http.StatusOK {

View File

@@ -81,6 +81,10 @@ func InitScheduler(ctx context.Context) *Scheduler {
// context must be canceled to decrement ref count and release the runner
func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration *api.Duration) (chan *runnerRef, chan error) {
if opts.NumCtx < 4 {
opts.NumCtx = 4
}
req := &LlmRequest{
ctx: c,
model: model,
@@ -110,11 +114,6 @@ func (s *Scheduler) Run(ctx context.Context) {
}()
}
const (
defaultContextLength = 4096
smallGpuContextLength = 2048
)
func (s *Scheduler) processPending(ctx context.Context) {
for {
select {
@@ -167,17 +166,6 @@ func (s *Scheduler) processPending(ctx context.Context) {
gpus = s.getGpuFn()
}
if pending.origNumCtx == -1 {
if len(gpus) == 1 && gpus[0].Library != "cpu" && gpus[0].TotalMemory <= 4096*1024*1024 {
slog.Info("GPU is small, limiting default context window", "num_ctx", smallGpuContextLength)
pending.opts.NumCtx = smallGpuContextLength
pending.origNumCtx = smallGpuContextLength
} else {
pending.opts.NumCtx = defaultContextLength
pending.origNumCtx = defaultContextLength
}
}
if envconfig.MaxRunners() <= 0 {
// No user specified MaxRunners, so figure out what automatic setting to use
// If all GPUs have reliable free memory reporting, defaultModelsPerGPU * the number of GPUs

View File

@@ -148,7 +148,6 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
successCh: make(chan *runnerRef, 1),
errCh: make(chan error, 1),
}
b.req.opts.NumCtx = 4096
b.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}}
return b
}