Mirror of https://github.com/ollama/ollama.git (synced 2026-01-02 04:29:51 -05:00)

Compare commits: v0.1.8...remove-fir (64 commits)
1d78d96fc6
686f85d6ca
85951d25ef
01ea6002c4
423862042a
df18486c35
4e612a2e92
6e0f686afa
c1844bbee2
cb745965ce
8d29b6a2b6
724aa64bee
d91c103e74
98ec7d81e3
7c438f2c53
6e46338d44
cdddd3df65
afa61bdf45
cc54a416c6
c819d7f68a
5cba29b9d6
d17730356a
32d79a6eea
5b39503bcd
1ae84bc2a2
db8bf336fc
d77e094a90
dd3dc47ddb
c5e1bbabda
a49d6acc1e
6e9bcdb9b3
13086363bd
ec2a31e9b3
ec84c02d54
2a88b66bc9
2d0faea96c
637142181a
bcbff421c9
1359d6cf3b
6e2d0224d9
921406f721
c7047d7353
1d155caba3
866324b9a5
145e060855
146072113d
33d31d1b56
274c6cbf4c
7ebbd89bbf
9079b1bb6d
6febde7200
325cfcd9ff
639d0fd070
434a6f9d46
b13586cc72
84725ec7e3
dccac8c8fa
f31961637f
80362fedce
5757925060
4512301756
2236a93efc
96da0792e6
95d24262fc
48  README.md

@@ -29,8 +29,7 @@ curl https://ollama.ai/install.sh | sh

### Docker

The official [Ollama Docker image `ollama/ollama`](https://hub.docker.com/r/ollama/ollama)
is available on Docker Hub.
The official [Ollama Docker image](https://hub.docker.com/r/ollama/ollama) `ollama/ollama` is available on Docker Hub.

## Quickstart
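For context on the Docker paragraph in the hunk above, a minimal usage sketch follows; the container name, volume name, and model are illustrative choices, not part of this diff:

```bash
# Start the server in the background, persisting pulled models in a named volume
docker run -d -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama

# Run a model inside the container (model name is an example)
docker exec -it ollama ollama run llama2
```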
@@ -160,7 +159,7 @@ I'm a basic program that prints the famous "Hello, world!" message to the consol
### Pass in prompt as arguments

```
$ ollama run llama2 "summarize this file:" "$(cat README.md)"
$ ollama run llama2 "Summarize this file: $(cat README.md)"
Ollama is a lightweight, extensible framework for building and running language models on the local machine. It provides a simple API for creating, running, and managing models, as well as a library of pre-built models that can be easily used in a variety of applications.
```
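The `cmd/cmd.go` changes later in this comparison also prepend piped stdin to any prompt arguments, so a rough equivalent of the example above can be written as a pipe (a sketch; the exact wording of the instruction is illustrative):

```bash
# Piped stdin is prepended to the prompt arguments before generation
cat README.md | ollama run llama2 "Summarize the text above in one sentence."
```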
@@ -217,21 +216,44 @@ See the [API documentation](./docs/api.md) for all endpoints.

## Community Integrations

### Web & Desktop

- [HTML UI](https://github.com/rtcfirefly/ollama-ui)
- [Chatbot UI](https://github.com/ivanfioravanti/chatbot-ollama)
- [Typescript UI](https://github.com/ollama-interface/Ollama-Gui?tab=readme-ov-file)
- [Minimalistic React UI for Ollama Models](https://github.com/richawo/minimal-llm-ui)
- [Web UI](https://github.com/ollama-webui/ollama-webui)
- [Ollamac](https://github.com/kevinhermawan/Ollamac)
- [big-AGI](https://github.com/enricoros/big-agi/blob/main/docs/config-ollama.md)

### Terminal

- [oterm](https://github.com/ggozad/oterm)
- [Ellama Emacs client](https://github.com/s-kostyaev/ellama)
- [Emacs client](https://github.com/zweifisch/ollama)
- [gen.nvim](https://github.com/David-Kunz/gen.nvim)
- [ollama.nvim](https://github.com/nomnivore/ollama.nvim)
- [gptel Emacs client](https://github.com/karthink/gptel)

### Libraries

- [LangChain](https://python.langchain.com/docs/integrations/llms/ollama) and [LangChain.js](https://js.langchain.com/docs/modules/model_io/models/llms/integrations/ollama) with [example](https://js.langchain.com/docs/use_cases/question_answering/local_retrieval_qa)
- [LlamaIndex](https://gpt-index.readthedocs.io/en/stable/examples/llm/ollama.html)
- [LiteLLM](https://github.com/BerriAI/litellm)
- [OllamaSharp for .NET](https://github.com/awaescher/OllamaSharp)
- [Ollama-rs for Rust](https://github.com/pepperoni21/ollama-rs)
- [Ollama4j for Java](https://github.com/amithkoujalgi/ollama4j)
- [ModelFusion Typescript Library](https://modelfusion.dev/integration/model-provider/ollama)
- [OllamaKit for Swift](https://github.com/kevinhermawan/OllamaKit)
- [Ollama for Dart](https://github.com/breitburg/dart-ollama)

### Extensions & Plugins

- [Raycast extension](https://github.com/MassimilianoPasquini97/raycast_ollama)
- [Discollama](https://github.com/mxyng/discollama) (Discord bot inside the Ollama discord channel)
- [Continue](https://github.com/continuedev/continue)
- [Obsidian Ollama plugin](https://github.com/hinterdupfinger/obsidian-ollama)
- [Logseq Ollama plugin](https://github.com/omagdy7/ollama-logseq)
- [Dagger Chatbot](https://github.com/samalba/dagger-chatbot)
- [LiteLLM](https://github.com/BerriAI/litellm)
- [Discord AI Bot](https://github.com/mekb-turtle/discord-ai-bot)
- [Chatbot UI](https://github.com/ivanfioravanti/chatbot-ollama)
- [HTML UI](https://github.com/rtcfirefly/ollama-ui)
- [Typescript UI](https://github.com/ollama-interface/Ollama-Gui?tab=readme-ov-file)
- [Dumbar](https://github.com/JerrySievert/Dumbar)
- [Emacs client](https://github.com/zweifisch/ollama)
- [oterm](https://github.com/ggozad/oterm)
- [Ellama Emacs client](https://github.com/s-kostyaev/ellama)
- [OllamaSharp for .NET](https://github.com/awaescher/OllamaSharp)
- [Minimalistic React UI for Ollama Models](https://github.com/richawo/minimal-llm-ui)
- [Hass Ollama Conversation](https://github.com/ej52/hass-ollama-conversation)
@@ -7,7 +7,7 @@ BASE_URL = os.environ.get('OLLAMA_HOST', 'http://localhost:11434')
# Generate a response for a given prompt with a provided model. This is a streaming endpoint, so will be a series of responses.
# The final response object will include statistics and additional data from the request. Use the callback function to override
# the default handler.
def generate(model_name, prompt, system=None, template=None, context=None, options=None, callback=None):
def generate(model_name, prompt, system=None, template=None, format="", context=None, options=None, callback=None):
    try:
        url = f"{BASE_URL}/api/generate"
        payload = {
@@ -16,7 +16,8 @@ def generate(model_name, prompt, system=None, template=None, context=None, optio
            "system": system,
            "template": template,
            "context": context,
            "options": options
            "options": options,
            "format": format,
        }

        # Remove keys with None values
89  api/types.go
@@ -37,10 +37,56 @@ type GenerateRequest struct {
|
||||
Template string `json:"template"`
|
||||
Context []int `json:"context,omitempty"`
|
||||
Stream *bool `json:"stream,omitempty"`
|
||||
Raw bool `json:"raw,omitempty"`
|
||||
Format string `json:"format"`
|
||||
|
||||
Options map[string]interface{} `json:"options"`
|
||||
}
|
||||
|
||||
// Options specified in GenerateRequest, if you add a new option here add it to the API docs also
|
||||
type Options struct {
|
||||
Runner
|
||||
|
||||
// Predict options used at runtime
|
||||
NumKeep int `json:"num_keep,omitempty"`
|
||||
Seed int `json:"seed,omitempty"`
|
||||
NumPredict int `json:"num_predict,omitempty"`
|
||||
TopK int `json:"top_k,omitempty"`
|
||||
TopP float32 `json:"top_p,omitempty"`
|
||||
TFSZ float32 `json:"tfs_z,omitempty"`
|
||||
TypicalP float32 `json:"typical_p,omitempty"`
|
||||
RepeatLastN int `json:"repeat_last_n,omitempty"`
|
||||
Temperature float32 `json:"temperature,omitempty"`
|
||||
RepeatPenalty float32 `json:"repeat_penalty,omitempty"`
|
||||
PresencePenalty float32 `json:"presence_penalty,omitempty"`
|
||||
FrequencyPenalty float32 `json:"frequency_penalty,omitempty"`
|
||||
Mirostat int `json:"mirostat,omitempty"`
|
||||
MirostatTau float32 `json:"mirostat_tau,omitempty"`
|
||||
MirostatEta float32 `json:"mirostat_eta,omitempty"`
|
||||
PenalizeNewline bool `json:"penalize_newline,omitempty"`
|
||||
Stop []string `json:"stop,omitempty"`
|
||||
}
|
||||
|
||||
// Runner options which must be set when the model is loaded into memory
|
||||
type Runner struct {
|
||||
UseNUMA bool `json:"numa,omitempty"`
|
||||
NumCtx int `json:"num_ctx,omitempty"`
|
||||
NumBatch int `json:"num_batch,omitempty"`
|
||||
NumGQA int `json:"num_gqa,omitempty"`
|
||||
NumGPU int `json:"num_gpu,omitempty"`
|
||||
MainGPU int `json:"main_gpu,omitempty"`
|
||||
LowVRAM bool `json:"low_vram,omitempty"`
|
||||
F16KV bool `json:"f16_kv,omitempty"`
|
||||
LogitsAll bool `json:"logits_all,omitempty"`
|
||||
VocabOnly bool `json:"vocab_only,omitempty"`
|
||||
UseMMap bool `json:"use_mmap,omitempty"`
|
||||
UseMLock bool `json:"use_mlock,omitempty"`
|
||||
EmbeddingOnly bool `json:"embedding_only,omitempty"`
|
||||
RopeFrequencyBase float32 `json:"rope_frequency_base,omitempty"`
|
||||
RopeFrequencyScale float32 `json:"rope_frequency_scale,omitempty"`
|
||||
NumThread int `json:"num_thread,omitempty"`
|
||||
}
|
||||
|
||||
type EmbeddingRequest struct {
|
||||
Model string `json:"model"`
|
||||
Prompt string `json:"prompt"`
|
||||
@@ -161,49 +207,6 @@ func (r *GenerateResponse) Summary() {
|
||||
}
|
||||
}
|
||||
|
||||
// Runner options which must be set when the model is loaded into memory
|
||||
type Runner struct {
|
||||
UseNUMA bool `json:"numa,omitempty"`
|
||||
NumCtx int `json:"num_ctx,omitempty"`
|
||||
NumBatch int `json:"num_batch,omitempty"`
|
||||
NumGQA int `json:"num_gqa,omitempty"`
|
||||
NumGPU int `json:"num_gpu,omitempty"`
|
||||
MainGPU int `json:"main_gpu,omitempty"`
|
||||
LowVRAM bool `json:"low_vram,omitempty"`
|
||||
F16KV bool `json:"f16_kv,omitempty"`
|
||||
LogitsAll bool `json:"logits_all,omitempty"`
|
||||
VocabOnly bool `json:"vocab_only,omitempty"`
|
||||
UseMMap bool `json:"use_mmap,omitempty"`
|
||||
UseMLock bool `json:"use_mlock,omitempty"`
|
||||
EmbeddingOnly bool `json:"embedding_only,omitempty"`
|
||||
RopeFrequencyBase float32 `json:"rope_frequency_base,omitempty"`
|
||||
RopeFrequencyScale float32 `json:"rope_frequency_scale,omitempty"`
|
||||
NumThread int `json:"num_thread,omitempty"`
|
||||
}
|
||||
|
||||
type Options struct {
|
||||
Runner
|
||||
|
||||
// Predict options used at runtime
|
||||
NumKeep int `json:"num_keep,omitempty"`
|
||||
Seed int `json:"seed,omitempty"`
|
||||
NumPredict int `json:"num_predict,omitempty"`
|
||||
TopK int `json:"top_k,omitempty"`
|
||||
TopP float32 `json:"top_p,omitempty"`
|
||||
TFSZ float32 `json:"tfs_z,omitempty"`
|
||||
TypicalP float32 `json:"typical_p,omitempty"`
|
||||
RepeatLastN int `json:"repeat_last_n,omitempty"`
|
||||
Temperature float32 `json:"temperature,omitempty"`
|
||||
RepeatPenalty float32 `json:"repeat_penalty,omitempty"`
|
||||
PresencePenalty float32 `json:"presence_penalty,omitempty"`
|
||||
FrequencyPenalty float32 `json:"frequency_penalty,omitempty"`
|
||||
Mirostat int `json:"mirostat,omitempty"`
|
||||
MirostatTau float32 `json:"mirostat_tau,omitempty"`
|
||||
MirostatEta float32 `json:"mirostat_eta,omitempty"`
|
||||
PenalizeNewline bool `json:"penalize_newline,omitempty"`
|
||||
Stop []string `json:"stop,omitempty"`
|
||||
}
|
||||
|
||||
var ErrInvalidOpts = fmt.Errorf("invalid options")
|
||||
|
||||
func (opts *Options) FromMap(m map[string]interface{}) error {
|
||||
|
||||
100  cmd/cmd.go
@@ -1,7 +1,6 @@
|
||||
package cmd
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"context"
|
||||
"crypto/ed25519"
|
||||
"crypto/rand"
|
||||
@@ -21,7 +20,6 @@ import (
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"github.com/dustin/go-humanize"
|
||||
"github.com/olekukonko/tablewriter"
|
||||
"github.com/spf13/cobra"
|
||||
"golang.org/x/crypto/ssh"
|
||||
@@ -174,7 +172,7 @@ func ListHandler(cmd *cobra.Command, args []string) error {
|
||||
|
||||
for _, m := range models.Models {
|
||||
if len(args) == 0 || strings.HasPrefix(m.Name, args[0]) {
|
||||
data = append(data, []string{m.Name, m.Digest[:12], humanize.Bytes(uint64(m.Size)), format.HumanTime(m.ModifiedAt, "Never")})
|
||||
data = append(data, []string{m.Name, m.Digest[:12], format.HumanBytes(m.Size), format.HumanTime(m.ModifiedAt, "Never")})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -350,34 +348,49 @@ func pull(model string, insecure bool) error {
|
||||
}
|
||||
|
||||
func RunGenerate(cmd *cobra.Command, args []string) error {
|
||||
if len(args) > 1 {
|
||||
// join all args into a single prompt
|
||||
wordWrap := false
|
||||
if term.IsTerminal(int(os.Stdout.Fd())) {
|
||||
wordWrap = true
|
||||
}
|
||||
format, err := cmd.Flags().GetString("format")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
nowrap, err := cmd.Flags().GetBool("nowordwrap")
|
||||
prompts := args[1:]
|
||||
|
||||
// prepend stdin to the prompt if provided
|
||||
if !term.IsTerminal(int(os.Stdin.Fd())) {
|
||||
in, err := io.ReadAll(os.Stdin)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if nowrap {
|
||||
wordWrap = false
|
||||
}
|
||||
|
||||
return generate(cmd, args[0], strings.Join(args[1:], " "), wordWrap)
|
||||
prompts = append([]string{string(in)}, prompts...)
|
||||
}
|
||||
|
||||
if readline.IsTerminal(int(os.Stdin.Fd())) {
|
||||
return generateInteractive(cmd, args[0])
|
||||
// output is being piped
|
||||
if !term.IsTerminal(int(os.Stdout.Fd())) {
|
||||
return generate(cmd, args[0], strings.Join(prompts, " "), false, format)
|
||||
}
|
||||
|
||||
return generateBatch(cmd, args[0])
|
||||
wordWrap := os.Getenv("TERM") == "xterm-256color"
|
||||
|
||||
nowrap, err := cmd.Flags().GetBool("nowordwrap")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if nowrap {
|
||||
wordWrap = false
|
||||
}
|
||||
|
||||
// prompts are provided via stdin or args so don't enter interactive mode
|
||||
if len(prompts) > 0 {
|
||||
return generate(cmd, args[0], strings.Join(prompts, " "), wordWrap, format)
|
||||
}
|
||||
|
||||
return generateInteractive(cmd, args[0], wordWrap, format)
|
||||
}
|
||||
|
||||
type generateContextKey string
|
||||
|
||||
func generate(cmd *cobra.Command, model, prompt string, wordWrap bool) error {
|
||||
func generate(cmd *cobra.Command, model, prompt string, wordWrap bool, format string) error {
|
||||
client, err := api.ClientFromEnvironment()
|
||||
if err != nil {
|
||||
return err
|
||||
@@ -393,7 +406,7 @@ func generate(cmd *cobra.Command, model, prompt string, wordWrap bool) error {
|
||||
generateContext = []int{}
|
||||
}
|
||||
|
||||
termWidth, _, err := term.GetSize(int(0))
|
||||
termWidth, _, err := term.GetSize(int(os.Stdout.Fd()))
|
||||
if err != nil {
|
||||
wordWrap = false
|
||||
}
|
||||
@@ -414,7 +427,7 @@ func generate(cmd *cobra.Command, model, prompt string, wordWrap bool) error {
|
||||
var currentLineLength int
|
||||
var wordBuffer string
|
||||
|
||||
request := api.GenerateRequest{Model: model, Prompt: prompt, Context: generateContext}
|
||||
request := api.GenerateRequest{Model: model, Prompt: prompt, Context: generateContext, Format: format}
|
||||
fn := func(response api.GenerateResponse) error {
|
||||
if !spinner.IsFinished() {
|
||||
spinner.Finish()
|
||||
@@ -485,9 +498,9 @@ func generate(cmd *cobra.Command, model, prompt string, wordWrap bool) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func generateInteractive(cmd *cobra.Command, model string) error {
|
||||
func generateInteractive(cmd *cobra.Command, model string, wordWrap bool, format string) error {
|
||||
// load the model
|
||||
if err := generate(cmd, model, "", false); err != nil {
|
||||
if err := generate(cmd, model, "", false, ""); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -508,6 +521,8 @@ func generateInteractive(cmd *cobra.Command, model string) error {
|
||||
fmt.Fprintln(os.Stderr, " /set nohistory Disable history")
|
||||
fmt.Fprintln(os.Stderr, " /set wordwrap Enable wordwrap")
|
||||
fmt.Fprintln(os.Stderr, " /set nowordwrap Disable wordwrap")
|
||||
fmt.Fprintln(os.Stderr, " /set format json Enable JSON mode")
|
||||
fmt.Fprintln(os.Stderr, " /set noformat Disable formatting")
|
||||
fmt.Fprintln(os.Stderr, " /set verbose Show LLM stats")
|
||||
fmt.Fprintln(os.Stderr, " /set quiet Disable LLM stats")
|
||||
fmt.Fprintln(os.Stderr, "")
|
||||
@@ -535,21 +550,6 @@ func generateInteractive(cmd *cobra.Command, model string) error {
|
||||
return err
|
||||
}
|
||||
|
||||
var wordWrap bool
|
||||
termType := os.Getenv("TERM")
|
||||
if termType == "xterm-256color" {
|
||||
wordWrap = true
|
||||
}
|
||||
|
||||
// override wrapping if the user turned it off
|
||||
nowrap, err := cmd.Flags().GetBool("nowordwrap")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if nowrap {
|
||||
wordWrap = false
|
||||
}
|
||||
|
||||
fmt.Print(readline.StartBracketedPaste)
|
||||
defer fmt.Printf(readline.EndBracketedPaste)
|
||||
|
||||
@@ -613,6 +613,16 @@ func generateInteractive(cmd *cobra.Command, model string) error {
|
||||
case "quiet":
|
||||
cmd.Flags().Set("verbose", "false")
|
||||
fmt.Println("Set 'quiet' mode.")
|
||||
case "format":
|
||||
if len(args) < 3 || args[2] != "json" {
|
||||
fmt.Println("Invalid or missing format. For 'json' mode use '/set format json'")
|
||||
} else {
|
||||
format = args[2]
|
||||
fmt.Printf("Set format to '%s' mode.\n", args[2])
|
||||
}
|
||||
case "noformat":
|
||||
format = ""
|
||||
fmt.Println("Disabled format.")
|
||||
default:
|
||||
fmt.Printf("Unknown command '/set %s'. Type /? for help\n", args[1])
|
||||
}
|
||||
@@ -686,26 +696,13 @@ func generateInteractive(cmd *cobra.Command, model string) error {
|
||||
}
|
||||
|
||||
if len(line) > 0 && line[0] != '/' {
|
||||
if err := generate(cmd, model, line, wordWrap); err != nil {
|
||||
if err := generate(cmd, model, line, wordWrap, format); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func generateBatch(cmd *cobra.Command, model string) error {
|
||||
scanner := bufio.NewScanner(os.Stdin)
|
||||
for scanner.Scan() {
|
||||
prompt := scanner.Text()
|
||||
fmt.Printf(">>> %s\n", prompt)
|
||||
if err := generate(cmd, model, prompt, false); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func RunServer(cmd *cobra.Command, _ []string) error {
|
||||
host, port, err := net.SplitHostPort(os.Getenv("OLLAMA_HOST"))
|
||||
if err != nil {
|
||||
@@ -883,6 +880,7 @@ func NewCLI() *cobra.Command {
|
||||
runCmd.Flags().Bool("verbose", false, "Show timings for response")
|
||||
runCmd.Flags().Bool("insecure", false, "Use an insecure registry")
|
||||
runCmd.Flags().Bool("nowordwrap", false, "Don't wrap words to the next line automatically")
|
||||
runCmd.Flags().String("format", "", "Response format (e.g. json)")
|
||||
|
||||
serveCmd := &cobra.Command{
|
||||
Use: "serve",
|
||||
|
||||
162  docs/api.md
@@ -41,11 +41,17 @@ Generate a response for a given prompt with a provided model. This is a streamin
|
||||
|
||||
Advanced parameters (optional):
|
||||
|
||||
- `format`: the format to return a response in. Currently the only accepted value is `json`
|
||||
- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
|
||||
- `system`: system prompt (overrides what is defined in the `Modelfile`)
|
||||
- `template`: the full prompt or prompt template (overrides what is defined in the `Modelfile`)
|
||||
- `context`: the context parameter returned from a previous request to `/generate`, this can be used to keep a short conversational memory
|
||||
- `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
|
||||
- `raw`: if `true` no formatting will be applied to the prompt and no context will be returned. You may choose to use the `raw` parameter if you are specifying a full templated prompt in your request to the API, and are managing history yourself.
|
||||
|
||||
### JSON mode
|
||||
|
||||
Enable JSON mode by setting the `format` parameter to `json` and specifying the model should use JSON in the `prompt`. This will structure the response as valid JSON. See the JSON mode [example](#request-json-mode) below.
|
||||
|
||||
### Examples
|
||||
|
||||
@@ -53,7 +59,7 @@ Advanced parameters (optional):
|
||||
|
||||
```shell
|
||||
curl -X POST http://localhost:11434/api/generate -d '{
|
||||
"model": "llama2:7b",
|
||||
"model": "llama2",
|
||||
"prompt": "Why is the sky blue?"
|
||||
}'
|
||||
```
|
||||
@@ -64,7 +70,7 @@ A stream of JSON objects is returned:
|
||||
|
||||
```json
|
||||
{
|
||||
"model": "llama2:7b",
|
||||
"model": "llama2",
|
||||
"created_at": "2023-08-04T08:52:19.385406455-07:00",
|
||||
"response": "The",
|
||||
"done": false
|
||||
@@ -88,7 +94,7 @@ To calculate how fast the response is generated in tokens per second (token/s),
|
||||
|
||||
```json
|
||||
{
|
||||
"model": "llama2:7b",
|
||||
"model": "llama2",
|
||||
"created_at": "2023-08-04T19:22:45.499127Z",
|
||||
"response": "",
|
||||
"context": [1, 2, 3],
|
||||
@@ -104,7 +110,7 @@ To calculate how fast the response is generated in tokens per second (token/s),
|
||||
}
|
||||
```
|
||||
|
||||
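As a worked example of the token/s calculation referenced in the hunk headers above (a sketch; it assumes a local server, `jq`, and that the duration fields are reported in nanoseconds):

```bash
# tokens per second = eval_count / eval_duration * 1e9 (eval_duration assumed to be nanoseconds)
curl -s http://localhost:11434/api/generate -d '{
  "model": "llama2",
  "prompt": "Why is the sky blue?",
  "stream": false
}' | jq '.eval_count / .eval_duration * 1e9'
```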
#### Request
|
||||
#### Request (No streaming)
|
||||
|
||||
```shell
|
||||
curl -X POST http://localhost:11434/api/generate -d '{
|
||||
@@ -136,6 +142,150 @@ If `stream` is set to `false`, the response will be a single JSON object:
|
||||
}
|
||||
```
|
||||
|
||||
#### Request (Raw mode)
|
||||
|
||||
In some cases you may wish to bypass the templating system and provide a full prompt. In this case, you can use the `raw` parameter to disable formatting and context.
|
||||
|
||||
```shell
|
||||
curl -X POST http://localhost:11434/api/generate -d '{
|
||||
"model": "mistral",
|
||||
"prompt": "[INST] why is the sky blue? [/INST]",
|
||||
"raw": true,
|
||||
"stream": false
|
||||
}'
|
||||
```
|
||||
|
||||
#### Response
|
||||
|
||||
```json
|
||||
{
|
||||
"model": "mistral",
|
||||
"created_at": "2023-11-03T15:36:02.583064Z",
|
||||
"response": " The sky appears blue because of a phenomenon called Rayleigh scattering.",
|
||||
"done": true,
|
||||
"total_duration": 14648695333,
|
||||
"load_duration": 3302671417,
|
||||
"prompt_eval_count": 14,
|
||||
"prompt_eval_duration": 286243000,
|
||||
"eval_count": 129,
|
||||
"eval_duration": 10931424000
|
||||
}
|
||||
```
|
||||
|
||||
#### Request (JSON mode)
|
||||
|
||||
```shell
|
||||
curl -X POST http://localhost:11434/api/generate -d '{
|
||||
"model": "llama2",
|
||||
"prompt": "What color is the sky at different times of the day? Respond using JSON",
|
||||
"format": "json",
|
||||
"stream": false
|
||||
}'
|
||||
```
|
||||
|
||||
#### Response
|
||||
|
||||
```json
|
||||
{
|
||||
"model": "llama2",
|
||||
"created_at": "2023-11-09T21:07:55.186497Z",
|
||||
"response": "{\n\"morning\": {\n\"color\": \"blue\"\n},\n\"noon\": {\n\"color\": \"blue-gray\"\n},\n\"afternoon\": {\n\"color\": \"warm gray\"\n},\n\"evening\": {\n\"color\": \"orange\"\n}\n}\n",
|
||||
"done": true,
|
||||
"total_duration": 4661289125,
|
||||
"load_duration": 1714434500,
|
||||
"prompt_eval_count": 36,
|
||||
"prompt_eval_duration": 264132000,
|
||||
"eval_count": 75,
|
||||
"eval_duration": 2112149000
|
||||
}
|
||||
```
|
||||
|
||||
The value of `response` will be a string containing JSON similar to:
|
||||
|
||||
```json
|
||||
{
|
||||
"morning": {
|
||||
"color": "blue"
|
||||
},
|
||||
"noon": {
|
||||
"color": "blue-gray"
|
||||
},
|
||||
"afternoon": {
|
||||
"color": "warm gray"
|
||||
},
|
||||
"evening": {
|
||||
"color": "orange"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
#### Request (With options)
|
||||
|
||||
If you want to set custom options for the model at runtime rather than in the Modelfile, you can do so with the `options` parameter. This example sets every available option, but you can set any of them individually and omit the ones you do not want to override.
|
||||
|
||||
```shell
|
||||
curl -X POST http://localhost:11434/api/generate -d '{
|
||||
"model": "llama2:7b",
|
||||
"prompt": "Why is the sky blue?",
|
||||
"stream": false,
|
||||
"options": {
|
||||
"num_keep": 5,
|
||||
"seed": 42,
|
||||
"num_predict": 100,
|
||||
"top_k": 20,
|
||||
"top_p": 0.9,
|
||||
"tfs_z": 0.5,
|
||||
"typical_p": 0.7,
|
||||
"repeat_last_n": 33,
|
||||
"temperature": 0.8,
|
||||
"repeat_penalty": 1.2,
|
||||
"presence_penalty": 1.5,
|
||||
"frequency_penalty": 1.0,
|
||||
"mirostat": 1,
|
||||
"mirostat_tau": 0.8,
|
||||
"mirostat_eta": 0.6,
|
||||
"penalize_newline": true,
|
||||
"stop": ["\n", "user:"],
|
||||
"numa": false,
|
||||
"num_ctx": 4,
|
||||
"num_batch": 2,
|
||||
"num_gqa": 1,
|
||||
"num_gpu": 1,
|
||||
"main_gpu": 0,
|
||||
"low_vram": false,
|
||||
"f16_kv": true,
|
||||
"logits_all": false,
|
||||
"vocab_only": false,
|
||||
"use_mmap": true,
|
||||
"use_mlock": false,
|
||||
"embedding_only": false,
|
||||
"rope_frequency_base": 1.1,
|
||||
"rope_frequency_scale": 0.8,
|
||||
"num_thread": 8
|
||||
}
|
||||
}'
|
||||
```
|
||||
|
||||
#### Response
|
||||
|
||||
```json
|
||||
{
|
||||
"model": "llama2:7b",
|
||||
"created_at": "2023-08-04T19:22:45.499127Z",
|
||||
"response": "The sky is blue because it is the color of the sky.",
|
||||
"context": [1, 2, 3],
|
||||
"done": true,
|
||||
"total_duration": 5589157167,
|
||||
"load_duration": 3013701500,
|
||||
"sample_count": 114,
|
||||
"sample_duration": 81442000,
|
||||
"prompt_eval_count": 46,
|
||||
"prompt_eval_duration": 1160282000,
|
||||
"eval_count": 13,
|
||||
"eval_duration": 1325948000
|
||||
}
|
||||
```
|
||||
|
||||
## Create a Model
|
||||
|
||||
```shell
|
||||
@@ -235,9 +385,9 @@ curl http://localhost:11434/api/show -d '{
|
||||
```json
|
||||
{
|
||||
"license": "<contents of license block>",
|
||||
"modelfile": "# Modelfile generated by \"ollama show\"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llama2:latest\n\nFROM /Users/username/.ollama/models/blobs/sha256:8daa9615cce30c259a9555b1cc250d461d1bc69980a274b44d7eda0be78076d8\nTEMPLATE \"\"\"[INST] {{ if and .First .System }}<<SYS>>{{ .System }}<</SYS>>\n\n{{ end }}{{ .Prompt }} [/INST] \"\"\"\nSYSTEM \"\"\"\"\"\"\nPARAMETER stop [INST]\nPARAMETER stop [/INST]\nPARAMETER stop <<SYS>>\nPARAMETER stop <</SYS>>\n",
|
||||
"modelfile": "# Modelfile generated by \"ollama show\"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llama2:latest\n\nFROM /Users/username/.ollama/models/blobs/sha256:8daa9615cce30c259a9555b1cc250d461d1bc69980a274b44d7eda0be78076d8\nTEMPLATE \"\"\"[INST] <<SYS>>{{ .System }}<</SYS>>\n\n{{ .Prompt }} [/INST] \"\"\"\nSYSTEM \"\"\"\"\"\"\nPARAMETER stop [INST]\nPARAMETER stop [/INST]\nPARAMETER stop <<SYS>>\nPARAMETER stop <</SYS>>\n",
|
||||
"parameters": "stop [INST]\nstop [/INST]\nstop <<SYS>>\nstop <</SYS>>",
|
||||
"template": "[INST] {{ if and .First .System }}<<SYS>>{{ .System }}<</SYS>>\n\n{{ end }}{{ .Prompt }} [/INST] "
|
||||
"template": "[INST] <<SYS>>{{ .System }}<</SYS>>\n\n{{ .Prompt }} [/INST] "
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
19  docs/faq.md

@@ -74,6 +74,25 @@ systemctl restart ollama
- macOS: Raw model data is stored under `~/.ollama/models`.
- Linux: Raw model data is stored under `/usr/share/ollama/.ollama/models`.

Below the models directory you will find a structure similar to the following:

```shell
.
├── blobs
└── manifests
    └── registry.ollama.ai
        ├── f0rodo
        ├── library
        ├── mattw
        └── saikatkumardey
```

There is a `manifests/registry.ollama.ai/<namespace>` path. In the example above, the user has downloaded models from the official `library`, `f0rodo`, `mattw`, and `saikatkumardey` namespaces. Within each of those directories you will find a directory for each model downloaded, and inside it a file named for each tag. Each tag file is the manifest for the model.

The manifest lists all the layers used in this model. You will see a `media type` for each layer, along with a digest. That digest corresponds with a file in the `models/blobs` directory.
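To make that layout concrete, here is a minimal inspection sketch; the macOS path and the `library/llama2/latest` manifest are assumptions and exist only if that model has been pulled:

```bash
# Print the media type and digest of each layer in a tag's manifest (requires jq)
jq -r '.layers[] | "\(.mediaType)  \(.digest)"' \
  ~/.ollama/models/manifests/registry.ollama.ai/library/llama2/latest

# Each digest corresponds to a file under the blobs directory
ls ~/.ollama/models/blobs | head
```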
### How can I change where Ollama stores models?

To modify where models are stored, you can use the `OLLAMA_MODELS` environment variable. Note that on Linux this means defining `OLLAMA_MODELS` in a drop-in file under `/etc/systemd/system/ollama.service.d`, reloading systemd, and restarting the ollama service.
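A minimal sketch of that Linux procedure, assuming systemd; the drop-in file name `override.conf` and the target directory `/data/ollama/models` are illustrative:

```bash
# Point the ollama service at a different models directory (illustrative path)
sudo mkdir -p /etc/systemd/system/ollama.service.d
sudo tee /etc/systemd/system/ollama.service.d/override.conf >/dev/null <<'EOF'
[Service]
Environment="OLLAMA_MODELS=/data/ollama/models"
EOF

# Reload unit files and restart the service so the new location takes effect
sudo systemctl daemon-reload
sudo systemctl restart ollama
```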
@@ -112,8 +112,8 @@ PARAMETER <parameter> <parametervalue>
|
||||
| repeat_last_n | Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx) | int | repeat_last_n 64 |
|
||||
| repeat_penalty | Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1) | float | repeat_penalty 1.1 |
|
||||
| temperature | The temperature of the model. Increasing the temperature will make the model answer more creatively. (Default: 0.8) | float | temperature 0.7 |
|
||||
| seed | Sets the random number seed to use for generation. Setting this to a specific number will make the model generate the same text for the same prompt. (Default: 0) | int | seed 42 |
|
||||
| stop | Sets the stop sequences to use. | string | stop "AI assistant:" |
|
||||
| seed | Sets the random number seed to use for generation. Setting this to a specific number will make the model generate the same text for the same prompt. (Default: 0) | int | seed 42 |
|
||||
| stop | Sets the stop sequences to use. When this pattern is encountered the LLM will stop generating text and return. Multiple stop patterns may be set by specifying multiple separate `stop` parameters in a modelfile. | string | stop "AI assistant:" |
|
||||
| tfs_z | Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting. (default: 1) | float | tfs_z 1 |
|
||||
| num_predict | Maximum number of tokens to predict when generating text. (Default: 128, -1 = infinite generation, -2 = fill context) | int | num_predict 42 |
|
||||
| top_k | Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40) | int | top_k 40 |
|
||||
@@ -129,14 +129,11 @@ PARAMETER <parameter> <parametervalue>
|
||||
| --------------- | ------------------------------------------------------------------------------------------------------------ |
|
||||
| `{{ .System }}` | The system prompt used to specify custom behavior, this must also be set in the Modelfile as an instruction. |
|
||||
| `{{ .Prompt }}` | The incoming prompt, this is not specified in the model file and will be set based on input. |
|
||||
| `{{ .First }}` | A boolean value used to render specific template information for the first generation of a session. |
|
||||
|
||||
```modelfile
|
||||
TEMPLATE """
|
||||
{{- if .First }}
|
||||
### System:
|
||||
{{ .System }}
|
||||
{{- end }}
|
||||
|
||||
### User:
|
||||
{{ .Prompt }}
|
||||
|
||||
@@ -4,5 +4,6 @@ Here is a list of ways you can use Ollama with other tools to build interesting

- [Using LangChain with Ollama in JavaScript](./tutorials/langchainjs.md)
- [Using LangChain with Ollama in Python](./tutorials/langchainpy.md)
- [Running Ollama on NVIDIA Jetson Devices](./tutorials/nvidia-jetson.md)

Also be sure to check out the [examples](../examples) directory for more ways to use Ollama.
Also be sure to check out the [examples](../examples) directory for more ways to use Ollama.
@@ -23,13 +23,17 @@ const answer = await ollama.call(`why is the sky blue?`);
console.log(answer);
```

That will get us the same thing as if we ran `ollama run llama2 "why is the sky blue"` in the terminal. But we want to load a document from the web to ask a question against. **Cheerio** is a great library for ingesting a webpage, and **LangChain** uses it in their **CheerioWebBaseLoader**. So let's build that part of the app.
That will get us the same thing as if we ran `ollama run llama2 "why is the sky blue"` in the terminal. But we want to load a document from the web to ask a question against. **Cheerio** is a great library for ingesting a webpage, and **LangChain** uses it in their **CheerioWebBaseLoader**. So let's install **Cheerio** and build that part of the app.

```bash
npm install cheerio
```

```javascript
import { CheerioWebBaseLoader } from "langchain/document_loaders/web/cheerio";

const loader = new CheerioWebBaseLoader("https://en.wikipedia.org/wiki/2023_Hawaii_wildfires");
const data = loader.load();
const data = await loader.load();
```

That will load the document. Although this page is smaller than the Odyssey, it is certainly bigger than the context size for most LLMs. So we are going to need to split it into smaller pieces, and then select just the pieces relevant to our question. This is a great use for a vector datastore. In this example, we will use the **MemoryVectorStore** that is part of **LangChain**. But there is one more thing we need to get the content into the datastore. We have to run an embeddings process that converts the tokens in the text into a series of vectors. And for that, we are going to use **Tensorflow**. There is a lot of stuff going on in this one. First, install the **Tensorflow** components that we need.
38  docs/tutorials/nvidia-jetson.md  Normal file

@@ -0,0 +1,38 @@
# Running Ollama on NVIDIA Jetson Devices

With some minor configuration, Ollama runs well on [NVIDIA Jetson Devices](https://www.nvidia.com/en-us/autonomous-machines/embedded-systems/). The following has been tested on [JetPack 5.1.2](https://developer.nvidia.com/embedded/jetpack).

NVIDIA Jetson devices are Linux-based embedded AI computers that are purpose-built for AI applications.

Jetsons have an integrated GPU that is wired directly to the memory controller of the machine. For this reason, the `nvidia-smi` command is unrecognized, and Ollama proceeds to operate in "CPU only" mode. This can be verified by using a monitoring tool like jtop.

In order to address this, we simply pass the path to the Jetson's pre-installed CUDA libraries into `ollama serve` (while in a tmux session). We then hardcode the num_gpu parameter into a cloned version of our target model.

Prerequisites:

- curl
- tmux

Here are the steps:

- Install Ollama via the standard Linux command (ignore the 404 error): `curl https://ollama.ai/install.sh | sh`
- Stop the Ollama service: `sudo systemctl stop ollama`
- Start Ollama serve in a tmux session called ollama_jetson and reference the CUDA libraries path: `tmux has-session -t ollama_jetson 2>/dev/null || tmux new-session -d -s ollama_jetson 'LD_LIBRARY_PATH=/usr/local/cuda/lib64 ollama serve'`
- Pull the model you want to use (e.g. mistral): `ollama pull mistral`
- Create a new Modelfile specifically for enabling GPU support on the Jetson: `touch ModelfileMistralJetson`
- In the ModelfileMistralJetson file, specify the FROM model and the num_gpu PARAMETER as shown below:

```
FROM mistral
PARAMETER num_gpu 999
```

- Create a new model from your Modelfile: `ollama create mistral-jetson -f ./ModelfileMistralJetson`
- Run the new model: `ollama run mistral-jetson`

If you run a monitoring tool like jtop you should now see that Ollama is using the Jetson's integrated GPU.

And that's it!
10  examples/bash-comparemodels/README.md  Normal file

@@ -0,0 +1,10 @@
# Bash Shell examples

When calling `ollama`, you can pass it a file to run all the prompts in the file, one after the other:

`ollama run llama2 < sourcequestions.txt`

This concept is used in the following example.

## Compare Models
`comparemodels.sh` is a script that runs all the questions in `sourcequestions.txt` using any 4 models of your choice that you have already pulled from the Ollama library or created locally.
64  examples/bash-comparemodels/comparemodels.sh  Executable file
@@ -0,0 +1,64 @@
|
||||
#! /usr/bin/env bash
|
||||
# Compare multiple models by running them with the same questions
|
||||
|
||||
NUMBEROFCHOICES=4
|
||||
SELECTIONS=()
|
||||
declare -a SUMS=()
|
||||
|
||||
# Get the list of models
|
||||
CHOICES=$(ollama list | awk '{print $1}')
|
||||
|
||||
# Select which models to run as a comparison
|
||||
echo "Select $NUMBEROFCHOICES models to compare:"
|
||||
select ITEM in $CHOICES; do
|
||||
if [[ -n $ITEM ]]; then
|
||||
echo "You have selected $ITEM"
|
||||
SELECTIONS+=("$ITEM")
|
||||
((COUNT++))
|
||||
if [[ $COUNT -eq $NUMBEROFCHOICES ]]; then
|
||||
break
|
||||
fi
|
||||
else
|
||||
echo "Invalid selection"
|
||||
fi
|
||||
done
|
||||
|
||||
# Loop through each of the selected models
|
||||
for ITEM in "${SELECTIONS[@]}"; do
|
||||
echo "--------------------------------------------------------------"
|
||||
echo "Loading the model $ITEM into memory"
|
||||
ollama run "$ITEM" ""
|
||||
echo "--------------------------------------------------------------"
|
||||
echo "Running the questions through the model $ITEM"
|
||||
COMMAND_OUTPUT=$(ollama run "$ITEM" --verbose < sourcequestions.txt 2>&1| tee /dev/stderr)
|
||||
|
||||
# eval duration is sometimes listed in seconds and sometimes in milliseconds.
|
||||
# Add up the values for each model
|
||||
SUM=$(echo "$COMMAND_OUTPUT" | awk '
|
||||
/eval duration:/ {
|
||||
value = $3
|
||||
if (index(value, "ms") > 0) {
|
||||
gsub("ms", "", value)
|
||||
value /= 1000
|
||||
} else {
|
||||
gsub("s", "", value)
|
||||
}
|
||||
sum += value
|
||||
}
|
||||
END { print sum }')
|
||||
|
||||
|
||||
SUMS+=("All questions for $ITEM completed in $SUM seconds")
|
||||
done
|
||||
|
||||
echo ""
|
||||
echo "--------------------------------------------------------------"
|
||||
echo -e "Sums of eval durations for each run:"
|
||||
for val in "${SUMS[@]}"; do
|
||||
echo "$val"
|
||||
done
|
||||
|
||||
echo "--------------------------------------------------------------"
|
||||
echo "Comparison complete. Now you can decide"
|
||||
echo "which model is best."
|
||||
echo "--------------------------------------------------------------"
|
||||
7  examples/bash-comparemodels/sourcequestions.txt  Normal file

@@ -0,0 +1,7 @@
Why is the sky blue
What is a black hole
Explain the big bang theory like I am 5?
What is the quickest way to win a game of Monopoly with 3 others?
Why does a vacuum bottle keep my coffee hot and my milkshake cold?
What is the difference between a meteor, a meteorite, and a meteoroid?
Create an array with 5 items and print to the console. Do this in Python, C#, Typescript, and Rust.
36  examples/kubernetes/README.md  Normal file

@@ -0,0 +1,36 @@
# Deploy Ollama to Kubernetes

## Prerequisites

- Ollama: https://ollama.ai/download
- Kubernetes cluster. This example will use Google Kubernetes Engine.

## Steps

1. Create the Ollama namespace, deployment, and service

   ```bash
   kubectl apply -f cpu.yaml
   ```

1. Port forward the Ollama service to connect and use it locally

   ```bash
   kubectl -n ollama port-forward service/ollama 11434:80
   ```

1. Pull and run a model, for example `orca-mini:3b`

   ```bash
   ollama run orca-mini:3b
   ```

## (Optional) Hardware Acceleration

Hardware acceleration in Kubernetes requires NVIDIA's [`k8s-device-plugin`](https://github.com/NVIDIA/k8s-device-plugin). Follow the link for more details.

Once configured, create a GPU-enabled Ollama deployment.

```bash
kubectl apply -f gpu.yaml
```
42  examples/kubernetes/cpu.yaml  Normal file
@@ -0,0 +1,42 @@
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: ollama
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: ollama
|
||||
namespace: ollama
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
name: ollama
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
name: ollama
|
||||
spec:
|
||||
containers:
|
||||
- name: ollama
|
||||
image: ollama/ollama:latest
|
||||
ports:
|
||||
- name: http
|
||||
containerPort: 11434
|
||||
protocol: TCP
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: ollama
|
||||
namespace: ollama
|
||||
spec:
|
||||
type: ClusterIP
|
||||
selector:
|
||||
name: ollama
|
||||
ports:
|
||||
- port: 80
|
||||
name: http
|
||||
targetPort: http
|
||||
protocol: TCP
|
||||
56  examples/kubernetes/gpu.yaml  Normal file
@@ -0,0 +1,56 @@
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: ollama
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: ollama
|
||||
namespace: ollama
|
||||
spec:
|
||||
strategy:
|
||||
type: Recreate
|
||||
selector:
|
||||
matchLabels:
|
||||
name: ollama
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
name: ollama
|
||||
spec:
|
||||
containers:
|
||||
- name: ollama
|
||||
image: ollama/ollama:latest
|
||||
env:
|
||||
- name: PATH
|
||||
value: /usr/local/nvidia/bin:/usr/local/nvidia/lib64:/usr/bin:/usr/sbin:/bin:/sbin
|
||||
- name: LD_LIBRARY_PATH
|
||||
value: /usr/local/nvidia/lib64
|
||||
ports:
|
||||
- name: http
|
||||
containerPort: 11434
|
||||
protocol: TCP
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: 1
|
||||
tolerations:
|
||||
- key: nvidia.com/gpu
|
||||
operator: Exists
|
||||
effect: NoSchedule
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: ollama
|
||||
namespace: ollama
|
||||
spec:
|
||||
type: ClusterIP
|
||||
selector:
|
||||
name: ollama
|
||||
ports:
|
||||
- port: 80
|
||||
name: http
|
||||
targetPort: http
|
||||
protocol: TCP
|
||||
@@ -3,10 +3,8 @@
|
||||
|
||||
FROM orca
|
||||
TEMPLATE """
|
||||
{{- if .First }}
|
||||
### System:
|
||||
{{ .System }}
|
||||
{{- end }}
|
||||
### User:
|
||||
I hate it when my phone dies
|
||||
### Response:
|
||||
|
||||
@@ -3,10 +3,8 @@
|
||||
This is a simple sentiments analyzer using the Orca model. When you pull Orca from the registry, it has a Template already defined that looks like this:
|
||||
|
||||
```Modelfile
|
||||
{{- if .First }}
|
||||
### System:
|
||||
{{ .System }}
|
||||
{{- end }}
|
||||
|
||||
### User:
|
||||
{{ .Prompt }}
|
||||
|
||||
@@ -17,7 +17,7 @@ def generate(prompt, context):
|
||||
for line in r.iter_lines():
|
||||
body = json.loads(line)
|
||||
response_part = body.get('response', '')
|
||||
# the response streams one token at a time, print that as we recieve it
|
||||
# the response streams one token at a time, print that as we receive it
|
||||
print(response_part, end='', flush=True)
|
||||
|
||||
if 'error' in body:
|
||||
@@ -35,4 +35,4 @@ def main():
|
||||
print()
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
main()
|
||||
|
||||
@@ -12,11 +12,11 @@ const (
|
||||
func HumanBytes(b int64) string {
|
||||
switch {
|
||||
case b > GigaByte:
|
||||
return fmt.Sprintf("%d GB", b/GigaByte)
|
||||
return fmt.Sprintf("%.1f GB", float64(b)/GigaByte)
|
||||
case b > MegaByte:
|
||||
return fmt.Sprintf("%d MB", b/MegaByte)
|
||||
return fmt.Sprintf("%.1f MB", float64(b)/MegaByte)
|
||||
case b > KiloByte:
|
||||
return fmt.Sprintf("%d KB", b/KiloByte)
|
||||
return fmt.Sprintf("%.1f KB", float64(b)/KiloByte)
|
||||
default:
|
||||
return fmt.Sprintf("%d B", b)
|
||||
}
|
||||
|
||||
25  format/format.go  Normal file
@@ -0,0 +1,25 @@
|
||||
package format
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"math"
|
||||
)
|
||||
|
||||
const (
|
||||
Thousand = 1000
|
||||
Million = Thousand * 1000
|
||||
Billion = Million * 1000
|
||||
)
|
||||
|
||||
func HumanNumber(b uint64) string {
|
||||
switch {
|
||||
case b > Billion:
|
||||
return fmt.Sprintf("%.0fB", math.Round(float64(b)/Billion))
|
||||
case b > Million:
|
||||
return fmt.Sprintf("%.0fM", math.Round(float64(b)/Million))
|
||||
case b > Thousand:
|
||||
return fmt.Sprintf("%.0fK", math.Round(float64(b)/Thousand))
|
||||
default:
|
||||
return fmt.Sprintf("%d", b)
|
||||
}
|
||||
}
|
||||
1  go.mod
@@ -3,7 +3,6 @@ module github.com/jmorganca/ollama
|
||||
go 1.20
|
||||
|
||||
require (
|
||||
github.com/dustin/go-humanize v1.0.1
|
||||
github.com/emirpasic/gods v1.18.1
|
||||
github.com/gin-gonic/gin v1.9.1
|
||||
github.com/mattn/go-runewidth v0.0.14
|
||||
|
||||
2  go.sum
@@ -9,8 +9,6 @@ github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ3
|
||||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
|
||||
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
|
||||
github.com/emirpasic/gods v1.18.1 h1:FXtiHYKDGKCW2KzwZKx0iC0PQmdlorYgdFG9jPXJ1Bc=
|
||||
github.com/emirpasic/gods v1.18.1/go.mod h1:8tpGGwCnJ5H4r6BWwaV6OrWmMoPhUl5jm/FMNAnJvWQ=
|
||||
github.com/gabriel-vasile/mimetype v1.4.2 h1:w5qFW6JKBz9Y393Y4q372O9A7cUSequkh1Q7OhCmWKU=
|
||||
|
||||
65  llm/gguf.go
@@ -5,6 +5,8 @@ import (
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
"io"
|
||||
|
||||
"github.com/jmorganca/ollama/format"
|
||||
)
|
||||
|
||||
type containerGGUF struct {
|
||||
@@ -21,6 +23,8 @@ type containerGGUF struct {
|
||||
NumTensor uint64
|
||||
NumKV uint64
|
||||
}
|
||||
|
||||
parameters uint64
|
||||
}
|
||||
|
||||
func (c *containerGGUF) Name() string {
|
||||
@@ -75,6 +79,14 @@ func newGGUFModel(container *containerGGUF) *ggufModel {
|
||||
}
|
||||
}
|
||||
|
||||
func (llm *ggufModel) NumTensor() uint64 {
|
||||
if llm.Version == 1 {
|
||||
return uint64(llm.V1.NumTensor)
|
||||
}
|
||||
|
||||
return llm.V2.NumTensor
|
||||
}
|
||||
|
||||
func (llm *ggufModel) NumKV() uint64 {
|
||||
if llm.Version == 1 {
|
||||
return uint64(llm.V1.NumKV)
|
||||
@@ -93,6 +105,10 @@ func (llm *ggufModel) ModelFamily() string {
|
||||
}
|
||||
|
||||
func (llm *ggufModel) ModelType() string {
|
||||
if llm.parameters > 0 {
|
||||
return format.HumanNumber(llm.parameters)
|
||||
}
|
||||
|
||||
switch llm.ModelFamily() {
|
||||
case "llama":
|
||||
if blocks, ok := llm.kv["llama.block_count"].(uint32); ok {
|
||||
@@ -127,13 +143,9 @@ func (llm *ggufModel) FileType() string {
|
||||
}
|
||||
|
||||
func (llm *ggufModel) Decode(r io.Reader) error {
|
||||
read := llm.readString
|
||||
if llm.Version == 1 {
|
||||
read = llm.readStringV1
|
||||
}
|
||||
|
||||
// decode key-values
|
||||
for i := 0; uint64(i) < llm.NumKV(); i++ {
|
||||
k, err := read(r)
|
||||
k, err := llm.readString(r)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -165,24 +177,14 @@ func (llm *ggufModel) Decode(r io.Reader) error {
|
||||
case ggufTypeBool:
|
||||
v = llm.readBool(r)
|
||||
case ggufTypeString:
|
||||
fn := llm.readString
|
||||
if llm.Version == 1 {
|
||||
fn = llm.readStringV1
|
||||
}
|
||||
|
||||
s, err := fn(r)
|
||||
s, err := llm.readString(r)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
v = s
|
||||
case ggufTypeArray:
|
||||
fn := llm.readArray
|
||||
if llm.Version == 1 {
|
||||
fn = llm.readArrayV1
|
||||
}
|
||||
|
||||
a, err := fn(r)
|
||||
a, err := llm.readArray(r)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -195,6 +197,25 @@ func (llm *ggufModel) Decode(r io.Reader) error {
|
||||
llm.kv[k] = v
|
||||
}
|
||||
|
||||
// decode tensors
|
||||
for i := 0; uint64(i) < llm.NumTensor(); i++ {
|
||||
if _, err := llm.readString(r); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
dimensions := llm.readU32(r)
|
||||
|
||||
var elements uint64 = 1
|
||||
for i := 0; uint32(i) < dimensions; i++ {
|
||||
elements *= llm.readU64(r)
|
||||
}
|
||||
|
||||
llm.readU32(r) // type
|
||||
llm.readU64(r) // offset
|
||||
|
||||
llm.parameters += elements
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -290,6 +311,10 @@ func (llm ggufModel) readStringV1(r io.Reader) (string, error) {
|
||||
}
|
||||
|
||||
func (llm ggufModel) readString(r io.Reader) (string, error) {
|
||||
if llm.Version == 1 {
|
||||
return llm.readStringV1(r)
|
||||
}
|
||||
|
||||
var nameLength uint64
|
||||
binary.Read(r, llm.bo, &nameLength)
|
||||
|
||||
@@ -339,6 +364,10 @@ func (llm *ggufModel) readArrayV1(r io.Reader) (arr []any, err error) {
|
||||
}
|
||||
|
||||
func (llm *ggufModel) readArray(r io.Reader) (arr []any, err error) {
|
||||
if llm.Version == 1 {
|
||||
return llm.readArrayV1(r)
|
||||
}
|
||||
|
||||
atype := llm.readU32(r)
|
||||
n := llm.readU64(r)
|
||||
|
||||
|
||||
47  llm/llama.go
@@ -27,6 +27,34 @@ import (
|
||||
"github.com/jmorganca/ollama/format"
|
||||
)
|
||||
|
||||
const jsonGrammar = `
|
||||
root ::= object
|
||||
value ::= object | array | string | number | ("true" | "false" | "null") ws
|
||||
|
||||
object ::=
|
||||
"{" ws (
|
||||
string ":" ws value
|
||||
("," ws string ":" ws value)*
|
||||
)? "}" ws
|
||||
|
||||
array ::=
|
||||
"[" ws (
|
||||
value
|
||||
("," ws value)*
|
||||
)? "]" ws
|
||||
|
||||
string ::=
|
||||
"\"" (
|
||||
[^"\\] |
|
||||
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
|
||||
)* "\"" ws
|
||||
|
||||
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
|
||||
|
||||
# Optional space: by convention, applied in this grammar after literal chars when allowed
|
||||
ws ::= ([ \t\n] ws)?
|
||||
`
|
||||
|
||||
//go:embed llama.cpp/*/build/*/bin/*
|
||||
var llamaCppEmbed embed.FS
|
||||
|
||||
@@ -196,7 +224,10 @@ type llama struct {
|
||||
Running
|
||||
}
|
||||
|
||||
var errNoGPU = errors.New("nvidia-smi command failed")
|
||||
var (
|
||||
errNvidiaSMI = errors.New("nvidia-smi command failed")
|
||||
errAvailableVRAM = errors.New("not enough VRAM available, falling back to CPU only")
|
||||
)
|
||||
|
||||
// CheckVRAM returns the free VRAM in bytes on Linux machines with NVIDIA GPUs
|
||||
func CheckVRAM() (int64, error) {
|
||||
@@ -205,7 +236,7 @@ func CheckVRAM() (int64, error) {
|
||||
cmd.Stdout = &stdout
|
||||
err := cmd.Run()
|
||||
if err != nil {
|
||||
return 0, errNoGPU
|
||||
return 0, errNvidiaSMI
|
||||
}
|
||||
|
||||
var freeMiB int64
|
||||
@@ -226,8 +257,8 @@ func CheckVRAM() (int64, error) {
|
||||
|
||||
freeBytes := freeMiB * 1024 * 1024
|
||||
if freeBytes < 2*format.GigaByte {
|
||||
log.Printf("less than 2 GB VRAM available, falling back to CPU only")
|
||||
freeMiB = 0
|
||||
log.Printf("less than 2 GB VRAM available")
|
||||
return 0, errAvailableVRAM
|
||||
}
|
||||
|
||||
return freeBytes, nil
|
||||
@@ -240,7 +271,7 @@ func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
|
||||
if runtime.GOOS == "linux" {
|
||||
freeBytes, err := CheckVRAM()
|
||||
if err != nil {
|
||||
if err.Error() != "nvidia-smi command failed" {
|
||||
if !errors.Is(err, errNvidiaSMI) {
|
||||
log.Print(err.Error())
|
||||
}
|
||||
// nvidia driver not installed or no nvidia GPU found
|
||||
@@ -494,7 +525,7 @@ type prediction struct {
|
||||
|
||||
const maxBufferSize = 512 * format.KiloByte
|
||||
|
||||
func (llm *llama) Predict(ctx context.Context, prevContext []int, prompt string, fn func(api.GenerateResponse)) error {
|
||||
func (llm *llama) Predict(ctx context.Context, prevContext []int, prompt string, format string, fn func(api.GenerateResponse)) error {
|
||||
prevConvo, err := llm.Decode(ctx, prevContext)
|
||||
if err != nil {
|
||||
return err
|
||||
@@ -529,6 +560,10 @@ func (llm *llama) Predict(ctx context.Context, prevContext []int, prompt string,
|
||||
"stop": llm.Stop,
|
||||
}
|
||||
|
||||
if format == "json" {
|
||||
request["grammar"] = jsonGrammar
|
||||
}
|
||||
|
||||
// Handling JSON marshaling with special characters unescaped.
|
||||
buffer := &bytes.Buffer{}
|
||||
enc := json.NewEncoder(buffer)
|
||||
|
||||
@@ -14,7 +14,7 @@ import (
|
||||
)
|
||||
|
||||
type LLM interface {
|
||||
Predict(context.Context, []int, string, func(api.GenerateResponse)) error
|
||||
Predict(context.Context, []int, string, string, func(api.GenerateResponse)) error
|
||||
Embedding(context.Context, string) ([]float64, error)
|
||||
Encode(context.Context, string) ([]int, error)
|
||||
Decode(context.Context, []int) (string, error)
|
||||
|
||||
@@ -291,7 +291,7 @@ func OptionShowDescriptionAtLineEnd() Option {
|
||||
}
|
||||
}
|
||||
|
||||
var defaultTheme = Theme{Saucer: "█", SaucerPadding: " ", BarStart: "|", BarEnd: "|"}
|
||||
var defaultTheme = Theme{Saucer: "█", SaucerPadding: " ", BarStart: "▕", BarEnd: "▏"}
|
||||
|
||||
// NewOptions constructs a new instance of ProgressBar, with any options you specify
|
||||
func NewOptions(max int, options ...Option) *ProgressBar {
|
||||
|
||||
@@ -180,7 +180,7 @@ install_cuda_driver_apt() {
|
||||
case $1 in
|
||||
debian)
|
||||
status 'Enabling contrib sources...'
|
||||
$SUDO sed 's/main/contrib/' < /etc/apt/sources.list | sudo tee /etc/apt/sources.list.d/contrib.list > /dev/null
|
||||
$SUDO sed 's/main/contrib/' < /etc/apt/sources.list | $SUDO tee /etc/apt/sources.list.d/contrib.list > /dev/null
|
||||
;;
|
||||
esac
|
||||
|
||||
|
||||
@@ -149,9 +149,10 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *Regis
|
||||
|
||||
i := i
|
||||
g.Go(func() error {
|
||||
var err error
|
||||
for try := 0; try < maxRetries; try++ {
|
||||
w := io.NewOffsetWriter(file, part.StartsAt())
|
||||
err := b.downloadChunk(inner, requestURL, w, part, opts)
|
||||
err = b.downloadChunk(inner, requestURL, w, part, opts)
|
||||
switch {
|
||||
case errors.Is(err, context.Canceled), errors.Is(err, syscall.ENOSPC):
|
||||
// return immediately if the context is canceled or the device is out of space
|
||||
@@ -160,11 +161,14 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *Regis
|
||||
log.Printf("%s part %d attempt %d failed: %v, retrying", b.Digest[7:19], i, try, err)
|
||||
continue
|
||||
default:
|
||||
if try > 0 {
|
||||
log.Printf("%s part %d completed after %d retries", b.Digest[7:19], i, try)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
return errMaxRetriesExceeded
|
||||
return fmt.Errorf("%w: %w", errMaxRetriesExceeded, err)
|
||||
})
|
||||
}
|
||||
|
||||
@@ -201,7 +205,7 @@ func (b *blobDownload) downloadChunk(ctx context.Context, requestURL *url.URL, w
|
||||
defer resp.Body.Close()
|
||||
|
||||
n, err := io.Copy(w, io.TeeReader(resp.Body, b))
|
||||
if err != nil && !errors.Is(err, context.Canceled) {
|
||||
if err != nil && !errors.Is(err, context.Canceled) && !errors.Is(err, io.ErrUnexpectedEOF) {
|
||||
// rollback progress
|
||||
b.Completed.Add(-n)
|
||||
return err
|
||||
@@ -212,7 +216,7 @@ func (b *blobDownload) downloadChunk(ctx context.Context, requestURL *url.URL, w
|
||||
return err
|
||||
}
|
||||
|
||||
// return nil or context.Canceled
|
||||
// return nil or context.Canceled or UnexpectedEOF (resumable)
|
||||
return err
|
||||
}
|
||||
|
||||
|
||||
@@ -60,12 +60,10 @@ func (m *Model) Prompt(request api.GenerateRequest) (string, error) {
	}

	var vars struct {
		First  bool
		System string
		Prompt string
	}

	vars.First = len(request.Context) == 0
	vars.System = m.System
	vars.Prompt = request.Prompt

@@ -397,7 +395,7 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
		if err != nil {
			return err
		}
		newLayer.From = mp.GetNamespaceRepository()
		newLayer.From = mp.GetShortTagname()
		layers = append(layers, newLayer)
	}
}

@@ -158,9 +158,17 @@ func GenerateHandler(c *gin.Context) {
		return
	}

	if req.Model == "" {
	// validate the request
	switch {
	case req.Model == "":
		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "model is required"})
		return
	case len(req.Format) > 0 && req.Format != "json":
		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "format must be json"})
		return
	case req.Raw && (req.Template != "" || req.System != "" || len(req.Context) > 0):
		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "raw mode does not support template, system, or context"})
		return
	}

	model, err := GetModel(req.Model)
@@ -189,10 +197,13 @@ func GenerateHandler(c *gin.Context) {

	checkpointLoaded := time.Now()

	prompt, err := model.Prompt(req)
	if err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
		return
	prompt := req.Prompt
	if !req.Raw {
		prompt, err = model.Prompt(req)
		if err != nil {
			c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
			return
		}
	}

	ch := make(chan any)
@@ -215,10 +226,15 @@ func GenerateHandler(c *gin.Context) {
				r.LoadDuration = checkpointLoaded.Sub(checkpointStart)
			}

			if req.Raw {
				// in raw mode the client must manage history on their own
				r.Context = nil
			}

			ch <- r
		}

		if err := loaded.runner.Predict(c.Request.Context(), req.Context, prompt, fn); err != nil {
		if err := loaded.runner.Predict(c.Request.Context(), req.Context, prompt, req.Format, fn); err != nil {
			ch <- gin.H{"error": err.Error()}
		}
	}()

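Taken together, the three `GenerateHandler` hunks mean a raw request bypasses the model template, must not carry `template`, `system`, or `context`, and gets no `context` back, so the client formats the prompt and tracks history itself. A hedged sketch of calling the endpoint that way (the address, model name, and prompt formatting are assumptions, not taken from the diff):

```go
package main

import (
	"bufio"
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

func main() {
	// In raw mode the prompt is sent exactly as written; the server applies no
	// template and omits "context" from the streamed responses.
	payload, _ := json.Marshal(map[string]any{
		"model":  "llama2",                              // assumed model name
		"prompt": "[INST] Why is the sky blue? [/INST]", // client-built prompt
		"raw":    true,
	})

	resp, err := http.Post("http://localhost:11434/api/generate", "application/json", bytes.NewReader(payload))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// The stream is newline-delimited JSON; print each generated fragment.
	scanner := bufio.NewScanner(resp.Body)
	for scanner.Scan() {
		var chunk struct {
			Response string `json:"response"`
			Done     bool   `json:"done"`
		}
		if err := json.Unmarshal(scanner.Bytes(), &chunk); err != nil {
			panic(err)
		}
		fmt.Print(chunk.Response)
		if chunk.Done {
			break
		}
	}
}
```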
@@ -40,14 +40,6 @@ type blobUpload struct
	references atomic.Int32
}

type blobUploadPart struct {
	// N is the part number
	N      int
	Offset int64
	Size   int64
	hash.Hash
}

const (
	numUploadParts          = 64
	minUploadPartSize int64 = 95 * 1000 * 1000
@@ -100,7 +92,7 @@ func (b *blobUpload) Prepare(ctx context.Context, requestURL *url.URL, opts *Reg
		}

		// set part.N to the current number of parts
		b.Parts = append(b.Parts, blobUploadPart{N: len(b.Parts), Offset: offset, Size: size, Hash: md5.New()})
		b.Parts = append(b.Parts, blobUploadPart{blobUpload: b, N: len(b.Parts), Offset: offset, Size: size})
		offset += size
	}

@@ -143,9 +135,10 @@ func (b *blobUpload) Run(ctx context.Context, opts *RegistryOptions) {
		case <-inner.Done():
		case requestURL := <-b.nextURL:
			g.Go(func() error {
				var err error
				for try := 0; try < maxRetries; try++ {
					r := io.NewSectionReader(f, part.Offset, part.Size)
					err := b.uploadChunk(inner, http.MethodPatch, requestURL, r, part, opts)
					part.ReadSeeker = io.NewSectionReader(f, part.Offset, part.Size)
					err = b.uploadChunk(inner, http.MethodPatch, requestURL, part, opts)
					switch {
					case errors.Is(err, context.Canceled):
						return err
@@ -159,7 +152,7 @@ func (b *blobUpload) Run(ctx context.Context, opts *RegistryOptions) {
						return nil
					}
				}

				return errMaxRetriesExceeded
				return fmt.Errorf("%w: %w", errMaxRetriesExceeded, err)
			})
		}
	}
@@ -197,7 +190,9 @@ func (b *blobUpload) Run(ctx context.Context, opts *RegistryOptions) {
	b.done = true
}

func (b *blobUpload) uploadChunk(ctx context.Context, method string, requestURL *url.URL, rs io.ReadSeeker, part *blobUploadPart, opts *RegistryOptions) error {
func (b *blobUpload) uploadChunk(ctx context.Context, method string, requestURL *url.URL, part *blobUploadPart, opts *RegistryOptions) error {
	part.Reset()

	headers := make(http.Header)
	headers.Set("Content-Type", "application/octet-stream")
	headers.Set("Content-Length", fmt.Sprintf("%d", part.Size))
@@ -207,8 +202,7 @@ func (b *blobUpload) uploadChunk(ctx context.Context, method string, requestURL
		headers.Set("Content-Range", fmt.Sprintf("%d-%d", part.Offset, part.Offset+part.Size-1))
	}

	buw := blobUploadWriter{blobUpload: b}
	resp, err := makeRequest(ctx, method, requestURL, headers, io.TeeReader(rs, io.MultiWriter(&buw, part.Hash)), opts)
	resp, err := makeRequest(ctx, method, requestURL, headers, io.TeeReader(part.ReadSeeker, io.MultiWriter(part, part.Hash)), opts)
	if err != nil {
		return err
	}
@@ -234,11 +228,7 @@ func (b *blobUpload) uploadChunk(ctx context.Context, method string, requestURL
		}

		for try := 0; try < maxRetries; try++ {
			rs.Seek(0, io.SeekStart)
			b.Completed.Add(-buw.written)
			buw.written = 0
			part.Hash = md5.New()
			err := b.uploadChunk(ctx, http.MethodPut, redirectURL, rs, part, nil)
			err = b.uploadChunk(ctx, http.MethodPut, redirectURL, part, nil)
			switch {
			case errors.Is(err, context.Canceled):
				return err
@@ -252,7 +242,7 @@ func (b *blobUpload) uploadChunk(ctx context.Context, method string, requestURL
			return nil
		}

		return errMaxRetriesExceeded
		return fmt.Errorf("%w: %w", errMaxRetriesExceeded, err)

	case resp.StatusCode == http.StatusUnauthorized:
		auth := resp.Header.Get("www-authenticate")
@@ -270,9 +260,6 @@ func (b *blobUpload) uploadChunk(ctx context.Context, method string, requestURL
			return err
		}

		rs.Seek(0, io.SeekStart)
		b.Completed.Add(-buw.written)
		buw.written = 0
		return fmt.Errorf("http status %d %s: %s", resp.StatusCode, resp.Status, body)
	}

@@ -318,18 +305,33 @@ func (b *blobUpload) Wait(ctx context.Context, fn func(api.ProgressResponse)) er
	}
}

type blobUploadWriter struct {
type blobUploadPart struct {
	// N is the part number
	N      int
	Offset int64
	Size   int64
	hash.Hash

	written int64

	io.ReadSeeker
	*blobUpload
}

func (b *blobUploadWriter) Write(p []byte) (n int, err error) {
	n = len(p)
	b.written += int64(n)
	b.Completed.Add(int64(n))
func (p *blobUploadPart) Write(b []byte) (n int, err error) {
	n = len(b)
	p.written += int64(n)
	p.Completed.Add(int64(n))
	return n, nil
}

func (p *blobUploadPart) Reset() {
	p.Seek(0, io.SeekStart)
	p.Completed.Add(-int64(p.written))
	p.written = 0
	p.Hash = md5.New()
}

func uploadBlob(ctx context.Context, mp ModelPath, layer *Layer, opts *RegistryOptions, fn func(api.ProgressResponse)) error {
	requestURL := mp.BaseURL()
	requestURL = requestURL.JoinPath("v2", mp.GetNamespaceRepository(), "blobs", layer.Digest)

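The final hunk folds the old `blobUploadWriter` into `blobUploadPart` itself: each part now carries its own `io.ReadSeeker`, counts the bytes it has streamed into the shared `Completed` total, and `Reset()` rewinds the reader, rolls back that count, and re-creates the hash before a retry. A standalone sketch of the same pattern, with every name invented for illustration:

```go
package main

import (
	"crypto/md5"
	"fmt"
	"hash"
	"io"
	"strings"
	"sync/atomic"
)

// uploadPart mirrors the shape in the diff: the chunk's reader, its hash, the
// bytes written so far, and a pointer to shared progress state.
type uploadPart struct {
	io.ReadSeeker
	hash.Hash
	written   int64
	completed *atomic.Int64
}

// Write only counts bytes; the data itself flows on to the request body.
func (p *uploadPart) Write(b []byte) (int, error) {
	n := len(b)
	p.written += int64(n)
	p.completed.Add(int64(n))
	return n, nil
}

// reset rewinds the chunk and undoes its counted progress before another attempt.
func (p *uploadPart) reset() {
	p.Seek(0, io.SeekStart)
	p.completed.Add(-p.written)
	p.written = 0
	p.Hash = md5.New()
}

func main() {
	var completed atomic.Int64
	part := &uploadPart{
		ReadSeeker: strings.NewReader("chunk data"),
		Hash:       md5.New(),
		completed:  &completed,
	}

	// TeeReader feeds the progress counter and the hash as the body is read,
	// the same wiring as the makeRequest call in the diff.
	body := io.TeeReader(part.ReadSeeker, io.MultiWriter(part, part.Hash))
	io.Copy(io.Discard, body)
	fmt.Printf("uploaded %d bytes, md5 %x\n", completed.Load(), part.Sum(nil))

	// A failed attempt would be retried after a reset.
	part.reset()
	fmt.Println("progress after reset:", completed.Load())
}
```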