mirror of
https://github.com/ollama/ollama.git
synced 2026-06-03 13:59:06 -04:00
* broad lint fixes to sidestep CI scope glitch * runner: Remove CGO engines, use llama-server exclusively for GGML models Remove the vendored GGML and llama.cpp backend, CGO runner, Go model implementations, and sample. llama-server (built from upstream llama.cpp via FetchContent) is now the sole inference engine for GGUF-based models. (Safetensor based models continue to run on the new MLX engine.) This allows us to more rapidly pick up new capabilities and fixes from llama.cpp as they come out. On windows this now requires recent AMD driver versions to support ROCm v7 as llama.cpp currently does not support building against v6. * llama/compat: load Ollama-format GGUFs in llama-server Squashed from upstream/jmorganca/llama-compat on 2026-04-29. Source tip:0c33775d37. Original source commits: -25223160dllama/compat: add in-memory shim so llama-server can load Ollama-format GGUFs -7449b539allm,server: route Ollama-format gemma3 blobs through llama/compat -436f2e2b1llama/compat: make patch-apply idempotent -8c2c9d4c8llama/compat: extend gemma3 handler to cover 1B and 270M blobs -021389f7bllama/compat: shrink clip.cpp injection from 18 lines to 1 -61b367ec2llama/compat: shrink patch to pure call-site hooks (34 -> 20 lines) -36049361cllama/compat: simplify shim (gemma3-tested) -8fa664865llama/compat: add qwen35moe text handler -db0c74530llama/compat: add qwen35moe vision (clip) support -2a388da77llama/compat: split shared infra into a util TU -9a69a17dcllama/compat: document non-public API dependencies -d0f38a915llama/compat: add gpt-oss and lfm2 handlers -086071822llama/compat: add mistral3 text handler (vision TODO) -63bde9ff7llama/compat: add mistral3 vision (clip) support -3a57b89d5llama/compat: apply LLaMA RoPE permute to mistral3 vision Q/K -99cb87439llama/compat: add qwen35, gemma4, deepseek-ocr handlers -2c7850dballama/compat: add nemotron_h_moe handler (latent FFN + MTP skip) -9e3b54225llama/compat: add llama4 text + clip handlers -034fee349llama/compat: add 
gemma4 clip handler (gemma4v projector) -9945c5a93server: remove dhiltgen/* compat redirect table -5d4539101llama/compat: rewrite gemma4 tokenizer model to BPE -7e0765327llama/compat: add glm-ocr text handler + text-loader load-op hook -f1bd1a25allama/compat: add glm-ocr clip handler (glm4v projector) -4b5cf3420llama/compat: collapse text-loader hook back to one new patch line -eb4ecf4fcllama/compat: extend gemma4 clip handler to gemma4a (audio) -a23a5e76fllama/compat: fix gemma4a per-block norm tensor mapping -cd2dcaff4llama/compat: add embeddinggemma handler -1ce8a6b26llama/compat: add qwen3-vl + qwen2.5-vl handlers -fd98ffa1ellama/compat: add gemma3n + glm4moelite handlers -cc7bdf0bcllama/compat: handle null buft in maybe_load_tensor -0c33775d3llama/compat: disable mmap when load_op transforms text-side tensors * refine implementation * ci: fix windows MLX build * ci: fix windows llama-server build * ci: fix windows rocm build * ci: windows mlx tuning Shorten long-tail on build, and get OllamaSetup.exe back under 2g limit * ci: fix windows dependencies * win: fix dependency gathering * disable openmp * win: arm64 cross-compile build also DRY out CI steps * scheduler improvements * ci: improvements from #15982 * win: favor ninja for faster developer builds * win: fix build * win: fix arm64 cross-compile * win: avoid spaces in compiler path * misc discovery fixes, and bos handling * lint fixes * win: fix arm cross-compile build/CI bugs * llama.cpp update * win: handle multiple CRT dirs * vulkan: add windows iGPU detection * fix creation bugs for patched models, other refactoring work * tune batch size for better performance * ci and lint fixes * fix repeat_last_n bug * build: revamp build for better developer UX * amd, sampler, qwen3next fixes * version bump * fix mlx build * revamp GPU discovery Scanning the output of llama-server is turning out to be too error prone across llama.cpp updates, so this switches to a thin dynamic library load against the bundled 
GGML libraries so more details can be gathered from the API. * version bump * missing file * ci: fix cache miss on rocm build * refine vulkan dep handling * fix ps reporting bug on full GPU load * improve cmake wiring for customized local builds * version bump * docker build arg cleanup * improve windows exit error logs * fix community gemma4 support and ci flakes * fix mlx unit test * tighten up ps logic to avoid double counting fit log lines * version bump * fix ps view for full gpu layer offload * add MTP wiring for llama-server and create with GGUFs * pick best template by capabilities * version bump * ci: harden apt repos * remove unused cpu core discovery * adjust batch default logic to reduce OOMs * support larger tool calls * fix audio support, template show * qwen35 mtp patch support * flesh out dtypes * rocm deps * version bump * lint fix * block broken gfx1150 on windows * fix qwen3.5 moe mtp tensors in patch * mmproj oom fallback and vulkan on by default * qwen MTP compat fix * version bump * ci: fix WoA cross-compile * ci: workaround ui tool in cross-compile * version bump * win: enable OpenMP for CPU builds * build: improve developer UX * ci: windows path workaround for CPU build * win: fix WoA dependencies * win: fix large offset reads for mmproj patched loads * version bump * fix vulkan dup detection * add OLLAMA_IGPU_ENABLE and largely disable iGPUs by default * opt-in MTP, win large offset, integraton fixes * fix unit test scheduler interaction hang * fix multi-gpu filtering * version bump * review comments * fix thinking level * fix linux rocm ordering and granite 3.3 template * version bump * ci fix - non-shallow MLX checkout * bypass linux sysfs unit test on windows --------- Co-authored-by: jmorganca <jmorganca@gmail.com>
493 lines
14 KiB
Go
493 lines
14 KiB
Go
// Package api implements the client-side API for code wishing to interact
|
|
// with the ollama service. The methods of the [Client] type correspond to
|
|
// the ollama REST API as described in [the API documentation].
|
|
// The ollama command-line client itself uses this package to interact with
|
|
// the backend service.
|
|
//
|
|
// # Examples
|
|
//
|
|
// Several examples of using this package are available [in the GitHub
|
|
// repository].
|
|
//
|
|
// [the API documentation]: https://github.com/ollama/ollama/blob/main/docs/api.md
|
|
// [in the GitHub repository]: https://github.com/ollama/ollama/tree/main/api/examples
|
|
package api
|
|
|
|
import (
|
|
"bufio"
|
|
"bytes"
|
|
"context"
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"net/url"
|
|
"runtime"
|
|
"strconv"
|
|
"time"
|
|
|
|
"github.com/ollama/ollama/auth"
|
|
"github.com/ollama/ollama/envconfig"
|
|
"github.com/ollama/ollama/format"
|
|
"github.com/ollama/ollama/version"
|
|
)
|
|
|
|
// Client holds the state needed to talk to the ollama service: the base URL
// that request paths are joined onto and the HTTP client that sends the
// requests. Use [ClientFromEnvironment] to create new Clients.
type Client struct {
	// base is the service URL; every request path is joined onto it.
	base *url.URL
	// http is the HTTP client used to perform each request.
	http *http.Client
}
|
|
|
|
func checkError(resp *http.Response, body []byte) error {
|
|
if resp.StatusCode < http.StatusBadRequest {
|
|
return nil
|
|
}
|
|
|
|
if resp.StatusCode == http.StatusUnauthorized {
|
|
authError := AuthorizationError{StatusCode: resp.StatusCode}
|
|
json.Unmarshal(body, &authError)
|
|
return authError
|
|
}
|
|
|
|
apiError := StatusError{StatusCode: resp.StatusCode}
|
|
|
|
err := json.Unmarshal(body, &apiError)
|
|
if err != nil {
|
|
// Use the full body as the message if we fail to decode a response.
|
|
apiError.ErrorMessage = string(body)
|
|
}
|
|
|
|
return apiError
|
|
}
|
|
|
|
// ClientFromEnvironment creates a new [Client] using configuration from the
|
|
// environment variable OLLAMA_HOST, which points to the network host and
|
|
// port on which the ollama service is listening. The format of this variable
|
|
// is:
|
|
//
|
|
// <scheme>://<host>:<port>
|
|
//
|
|
// If the variable is not specified, a default ollama host and port will be
|
|
// used.
|
|
func ClientFromEnvironment() (*Client, error) {
|
|
return &Client{
|
|
base: envconfig.Host(),
|
|
http: http.DefaultClient,
|
|
}, nil
|
|
}
|
|
|
|
func NewClient(base *url.URL, http *http.Client) *Client {
|
|
return &Client{
|
|
base: base,
|
|
http: http,
|
|
}
|
|
}
|
|
|
|
func getAuthorizationToken(ctx context.Context, challenge string) (string, error) {
|
|
token, err := auth.Sign(ctx, []byte(challenge))
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
return token, nil
|
|
}
|
|
|
|
func (c *Client) do(ctx context.Context, method, path string, reqData, respData any) error {
|
|
var reqBody io.Reader
|
|
var data []byte
|
|
var err error
|
|
|
|
switch reqData := reqData.(type) {
|
|
case io.Reader:
|
|
// reqData is already an io.Reader
|
|
reqBody = reqData
|
|
case nil:
|
|
// noop
|
|
default:
|
|
data, err = json.Marshal(reqData)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
reqBody = bytes.NewReader(data)
|
|
}
|
|
|
|
requestURL := c.base.JoinPath(path)
|
|
|
|
var token string
|
|
if envconfig.UseAuth() || c.base.Hostname() == "ollama.com" {
|
|
now := strconv.FormatInt(time.Now().Unix(), 10)
|
|
chal := fmt.Sprintf("%s,%s?ts=%s", method, path, now)
|
|
token, err = getAuthorizationToken(ctx, chal)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
q := requestURL.Query()
|
|
q.Set("ts", now)
|
|
requestURL.RawQuery = q.Encode()
|
|
}
|
|
|
|
request, err := http.NewRequestWithContext(ctx, method, requestURL.String(), reqBody)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
request.Header.Set("Content-Type", "application/json")
|
|
request.Header.Set("Accept", "application/json")
|
|
request.Header.Set("User-Agent", fmt.Sprintf("ollama/%s (%s %s) Go/%s", version.Version, runtime.GOARCH, runtime.GOOS, runtime.Version()))
|
|
|
|
if token != "" {
|
|
request.Header.Set("Authorization", token)
|
|
}
|
|
|
|
respObj, err := c.http.Do(request)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer respObj.Body.Close()
|
|
|
|
respBody, err := io.ReadAll(respObj.Body)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if err := checkError(respObj, respBody); err != nil {
|
|
return err
|
|
}
|
|
|
|
if len(respBody) > 0 && respData != nil {
|
|
if err := json.Unmarshal(respBody, respData); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// maxBufferSize is used by stream as both the initial capacity and the maximum
// size of the buffer given to its line scanner, which bounds how long a single
// streamed response line may be.
const maxBufferSize = 8 * format.MegaByte
|
|
|
|
func (c *Client) stream(ctx context.Context, method, path string, data any, fn func([]byte) error) error {
|
|
var buf io.Reader
|
|
if data != nil {
|
|
bts, err := json.Marshal(data)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
buf = bytes.NewBuffer(bts)
|
|
}
|
|
|
|
requestURL := c.base.JoinPath(path)
|
|
|
|
var token string
|
|
if envconfig.UseAuth() || c.base.Hostname() == "ollama.com" {
|
|
var err error
|
|
now := strconv.FormatInt(time.Now().Unix(), 10)
|
|
chal := fmt.Sprintf("%s,%s?ts=%s", method, path, now)
|
|
token, err = getAuthorizationToken(ctx, chal)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
q := requestURL.Query()
|
|
q.Set("ts", now)
|
|
requestURL.RawQuery = q.Encode()
|
|
}
|
|
|
|
request, err := http.NewRequestWithContext(ctx, method, requestURL.String(), buf)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
request.Header.Set("Content-Type", "application/json")
|
|
request.Header.Set("Accept", "application/x-ndjson")
|
|
request.Header.Set("User-Agent", fmt.Sprintf("ollama/%s (%s %s) Go/%s", version.Version, runtime.GOARCH, runtime.GOOS, runtime.Version()))
|
|
|
|
if token != "" {
|
|
request.Header.Set("Authorization", token)
|
|
}
|
|
|
|
response, err := c.http.Do(request)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer response.Body.Close()
|
|
|
|
scanner := bufio.NewScanner(response.Body)
|
|
// increase the buffer size to avoid running out of space
|
|
scanBuf := make([]byte, 0, maxBufferSize)
|
|
scanner.Buffer(scanBuf, maxBufferSize)
|
|
for scanner.Scan() {
|
|
var errorResponse struct {
|
|
Error string `json:"error,omitempty"`
|
|
SigninURL string `json:"signin_url,omitempty"`
|
|
}
|
|
|
|
bts := scanner.Bytes()
|
|
if err := json.Unmarshal(bts, &errorResponse); err != nil {
|
|
if response.StatusCode >= http.StatusBadRequest {
|
|
return StatusError{
|
|
StatusCode: response.StatusCode,
|
|
Status: response.Status,
|
|
ErrorMessage: string(bts),
|
|
}
|
|
}
|
|
return errors.New(string(bts))
|
|
}
|
|
|
|
if response.StatusCode == http.StatusUnauthorized {
|
|
return AuthorizationError{
|
|
StatusCode: response.StatusCode,
|
|
Status: response.Status,
|
|
SigninURL: errorResponse.SigninURL,
|
|
}
|
|
} else if response.StatusCode >= http.StatusBadRequest {
|
|
return StatusError{
|
|
StatusCode: response.StatusCode,
|
|
Status: response.Status,
|
|
ErrorMessage: errorResponse.Error,
|
|
}
|
|
}
|
|
|
|
if errorResponse.Error != "" {
|
|
return errors.New(errorResponse.Error)
|
|
}
|
|
|
|
if err := fn(bts); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
if err := scanner.Err(); err != nil {
|
|
return err
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// GenerateResponseFunc is a function that [Client.Generate] invokes every time
|
|
// a response is received from the service. If this function returns an error,
|
|
// [Client.Generate] will stop generating and return this error.
|
|
type GenerateResponseFunc func(GenerateResponse) error
|
|
|
|
// Generate generates a response for a given prompt. The req parameter should
|
|
// be populated with prompt details. fn is called for each response (there may
|
|
// be multiple responses, e.g. in case streaming is enabled).
|
|
func (c *Client) Generate(ctx context.Context, req *GenerateRequest, fn GenerateResponseFunc) error {
|
|
return c.stream(ctx, http.MethodPost, "/api/generate", req, func(bts []byte) error {
|
|
var resp GenerateResponse
|
|
if err := json.Unmarshal(bts, &resp); err != nil {
|
|
return err
|
|
}
|
|
|
|
return fn(resp)
|
|
})
|
|
}
|
|
|
|
// ChatResponseFunc is a function that [Client.Chat] invokes every time
|
|
// a response is received from the service. If this function returns an error,
|
|
// [Client.Chat] will stop generating and return this error.
|
|
type ChatResponseFunc func(ChatResponse) error
|
|
|
|
// Chat generates the next message in a chat. [ChatRequest] may contain a
|
|
// sequence of messages which can be used to maintain chat history with a model.
|
|
// fn is called for each response (there may be multiple responses, e.g. if case
|
|
// streaming is enabled).
|
|
func (c *Client) Chat(ctx context.Context, req *ChatRequest, fn ChatResponseFunc) error {
|
|
return c.stream(ctx, http.MethodPost, "/api/chat", req, func(bts []byte) error {
|
|
var resp ChatResponse
|
|
if err := json.Unmarshal(bts, &resp); err != nil {
|
|
return err
|
|
}
|
|
|
|
return fn(resp)
|
|
})
|
|
}
|
|
|
|
// PullProgressFunc is a function that [Client.Pull] invokes every time there
|
|
// is progress with a "pull" request sent to the service. If this function
|
|
// returns an error, [Client.Pull] will stop the process and return this error.
|
|
type PullProgressFunc func(ProgressResponse) error
|
|
|
|
// Pull downloads a model from the ollama library. fn is called each time
|
|
// progress is made on the request and can be used to display a progress bar,
|
|
// etc.
|
|
func (c *Client) Pull(ctx context.Context, req *PullRequest, fn PullProgressFunc) error {
|
|
return c.stream(ctx, http.MethodPost, "/api/pull", req, func(bts []byte) error {
|
|
var resp ProgressResponse
|
|
if err := json.Unmarshal(bts, &resp); err != nil {
|
|
return err
|
|
}
|
|
|
|
return fn(resp)
|
|
})
|
|
}
|
|
|
|
// PushProgressFunc is a function that [Client.Push] invokes when progress is
|
|
// made.
|
|
// It's similar to other progress function types like [PullProgressFunc].
|
|
type PushProgressFunc func(ProgressResponse) error
|
|
|
|
// Push uploads a model to the model library; requires registering for ollama.ai
|
|
// and adding a public key first. fn is called each time progress is made on
|
|
// the request and can be used to display a progress bar, etc.
|
|
func (c *Client) Push(ctx context.Context, req *PushRequest, fn PushProgressFunc) error {
|
|
return c.stream(ctx, http.MethodPost, "/api/push", req, func(bts []byte) error {
|
|
var resp ProgressResponse
|
|
if err := json.Unmarshal(bts, &resp); err != nil {
|
|
return err
|
|
}
|
|
|
|
return fn(resp)
|
|
})
|
|
}
|
|
|
|
// CreateProgressFunc is a function that [Client.Create] invokes when progress
|
|
// is made.
|
|
// It's similar to other progress function types like [PullProgressFunc].
|
|
type CreateProgressFunc func(ProgressResponse) error
|
|
|
|
// Create creates a model from a [Modelfile]. fn is a progress function that
|
|
// behaves similarly to other methods (see [Client.Pull]).
|
|
//
|
|
// [Modelfile]: https://github.com/ollama/ollama/blob/main/docs/modelfile.mdx
|
|
func (c *Client) Create(ctx context.Context, req *CreateRequest, fn CreateProgressFunc) error {
|
|
return c.stream(ctx, http.MethodPost, "/api/create", req, func(bts []byte) error {
|
|
var resp ProgressResponse
|
|
if err := json.Unmarshal(bts, &resp); err != nil {
|
|
return err
|
|
}
|
|
|
|
return fn(resp)
|
|
})
|
|
}
|
|
|
|
// List lists models that are available locally.
|
|
func (c *Client) List(ctx context.Context) (*ListResponse, error) {
|
|
var lr ListResponse
|
|
if err := c.do(ctx, http.MethodGet, "/api/tags", nil, &lr); err != nil {
|
|
return nil, err
|
|
}
|
|
return &lr, nil
|
|
}
|
|
|
|
// ModelRecommendationsExperimental lists model recommendations from the local
|
|
// server's experimental recommendations endpoint.
|
|
func (c *Client) ModelRecommendationsExperimental(ctx context.Context) (*ModelRecommendationsResponse, error) {
|
|
var resp ModelRecommendationsResponse
|
|
if err := c.do(ctx, http.MethodGet, "/api/experimental/model-recommendations", nil, &resp); err != nil {
|
|
return nil, err
|
|
}
|
|
return &resp, nil
|
|
}
|
|
|
|
// ListRunning lists running models.
|
|
func (c *Client) ListRunning(ctx context.Context) (*ProcessResponse, error) {
|
|
var lr ProcessResponse
|
|
if err := c.do(ctx, http.MethodGet, "/api/ps", nil, &lr); err != nil {
|
|
return nil, err
|
|
}
|
|
return &lr, nil
|
|
}
|
|
|
|
// Copy copies a model - creating a model with another name from an existing
|
|
// model.
|
|
func (c *Client) Copy(ctx context.Context, req *CopyRequest) error {
|
|
if err := c.do(ctx, http.MethodPost, "/api/copy", req, nil); err != nil {
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// Delete deletes a model and its data.
|
|
func (c *Client) Delete(ctx context.Context, req *DeleteRequest) error {
|
|
if err := c.do(ctx, http.MethodDelete, "/api/delete", req, nil); err != nil {
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// Show obtains model information, including details, modelfile, license etc.
|
|
func (c *Client) Show(ctx context.Context, req *ShowRequest) (*ShowResponse, error) {
|
|
var resp ShowResponse
|
|
if err := c.do(ctx, http.MethodPost, "/api/show", req, &resp); err != nil {
|
|
return nil, err
|
|
}
|
|
return &resp, nil
|
|
}
|
|
|
|
// Heartbeat checks if the server has started and is responsive; if yes, it
|
|
// returns nil, otherwise an error.
|
|
func (c *Client) Heartbeat(ctx context.Context) error {
|
|
if err := c.do(ctx, http.MethodHead, "/", nil, nil); err != nil {
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// Embed generates embeddings from a model.
|
|
func (c *Client) Embed(ctx context.Context, req *EmbedRequest) (*EmbedResponse, error) {
|
|
var resp EmbedResponse
|
|
if err := c.do(ctx, http.MethodPost, "/api/embed", req, &resp); err != nil {
|
|
return nil, err
|
|
}
|
|
return &resp, nil
|
|
}
|
|
|
|
// Embeddings generates an embedding from a model.
|
|
func (c *Client) Embeddings(ctx context.Context, req *EmbeddingRequest) (*EmbeddingResponse, error) {
|
|
var resp EmbeddingResponse
|
|
if err := c.do(ctx, http.MethodPost, "/api/embeddings", req, &resp); err != nil {
|
|
return nil, err
|
|
}
|
|
return &resp, nil
|
|
}
|
|
|
|
// CreateBlob creates a blob from a file on the server. digest is the
|
|
// expected SHA256 digest of the file, and r represents the file.
|
|
func (c *Client) CreateBlob(ctx context.Context, digest string, r io.Reader) error {
|
|
return c.do(ctx, http.MethodPost, fmt.Sprintf("/api/blobs/%s", digest), r, nil)
|
|
}
|
|
|
|
// Version returns the Ollama server version as a string.
|
|
func (c *Client) Version(ctx context.Context) (string, error) {
|
|
var version struct {
|
|
Version string `json:"version"`
|
|
}
|
|
|
|
if err := c.do(ctx, http.MethodGet, "/api/version", nil, &version); err != nil {
|
|
return "", err
|
|
}
|
|
|
|
return version.Version, nil
|
|
}
|
|
|
|
// CloudStatusExperimental returns whether cloud features are disabled on the server.
|
|
func (c *Client) CloudStatusExperimental(ctx context.Context) (*StatusResponse, error) {
|
|
var status StatusResponse
|
|
if err := c.do(ctx, http.MethodGet, "/api/status", nil, &status); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return &status, nil
|
|
}
|
|
|
|
// Signout will signout a client for a local ollama server.
|
|
func (c *Client) Signout(ctx context.Context) error {
|
|
return c.do(ctx, http.MethodPost, "/api/signout", nil, nil)
|
|
}
|
|
|
|
// Disconnect will disconnect an ollama instance from ollama.com.
|
|
func (c *Client) Disconnect(ctx context.Context, encodedKey string) error {
|
|
return c.do(ctx, http.MethodDelete, fmt.Sprintf("/api/user/keys/%s", encodedKey), nil, nil)
|
|
}
|
|
|
|
func (c *Client) Whoami(ctx context.Context) (*UserResponse, error) {
|
|
var resp UserResponse
|
|
if err := c.do(ctx, http.MethodPost, "/api/me", nil, &resp); err != nil {
|
|
return nil, err
|
|
}
|
|
return &resp, nil
|
|
}
|