Compare commits


17 Commits

0ac5cbc00e  Roy Han         2024-06-28 13:22:37 -07:00
    separate deprecation changes

12209bd021  Roy Han         2024-06-12 14:43:17 -07:00
    Remove Latest at API Level

be2c5fd71a  Roy Han         2024-06-10 16:39:44 -07:00
    Remove :latest from Ollama List

0f3cf1d42e  Michael Yang    2024-06-10 11:41:29 -07:00
    Merge pull request #4715 from ollama/mxyng/utf16-parser
    proper utf16 support

5bc029c529  Michael Yang    2024-06-10 11:41:09 -07:00
    Merge pull request #4921 from ollama/mxyng/import-md
    update import.md

e9a9c6a8e8  Michael Yang    2024-06-10 11:40:03 -07:00
    Merge pull request #4965 from ollama/mxyng/skip-layer-remove
    fix: skip removing layers that no longer exist

515f497e6d  Michael Yang    2024-06-10 11:32:19 -07:00
    fix: skip removing layers that no longer exist

b27268aaef  Michael Yang    2024-06-10 11:32:15 -07:00
    add test

f5f245cc15  Michael Yang    2024-06-10 09:38:12 -07:00
    Merge pull request #4938 from ollama/mxyng/fix-byte-order
    fix parsing big endian gguf

94d37fdcae  Jim Scardelis   2024-06-09 10:58:09 -07:00
    fix: examples/langchain-python-rag-privategpt/requirements.txt (#3382)

b84aea1685  Craig Hughes    2024-06-09 10:57:09 -07:00
    Critical fix from llama.cpp JSON grammar to forbid un-escaped escape characters inside strings, which breaks parsing. (#3782)

896495de7b  Napuh           2024-06-09 10:49:03 -07:00
    Add instructions to easily install specific versions on faq.md (#4084)
    * Added instructions to easily install specific versions on faq.md
    * Small typo
    * Moved instructions on how to install specific version to linux.md
    * Update docs/linux.md
    * Update docs/linux.md
    Co-authored-by: Jeffrey Morgan <jmorganca@gmail.com>

5528dd9d11  dcasota         2024-06-09 10:41:07 -07:00
    Error handling load_single_document() in ingest.py (#4852)
    load_single_document() handles
    - corrupt files
    - empty (zero byte) files
    - unsupported file extensions

943172cbf4  Jeffrey Morgan  2024-06-08 23:04:32 -07:00
    Update api.md

620d5c569e  Michael Yang    2024-06-08 12:35:26 -07:00
    fix parsing big endian gguf

b9ce7bf75e  Michael Yang    2024-06-07 16:45:15 -07:00
    update import.md

66ab48772f  Michael Yang    2024-06-05 13:11:50 -07:00
    proper utf16 support
16 changed files with 240 additions and 213 deletions

View File

@@ -493,7 +493,7 @@ func ListHandler(cmd *cobra.Command, args []string) error {
 	for _, m := range models.Models {
 		if len(args) == 0 || strings.HasPrefix(m.Name, args[0]) {
-			data = append(data, []string{m.Name, m.Digest[:12], format.HumanBytes(m.Size), format.HumanTime(m.ModifiedAt, "Never")})
+			data = append(data, []string{m.Name[:len(m.Name)], m.Digest[:12], format.HumanBytes(m.Size), format.HumanTime(m.ModifiedAt, "Never")})
 		}
}

View File

@@ -1044,11 +1044,10 @@ GET /api/ps
List models that are currently loaded into memory.
\* If a model is loaded completely into system memory, `size_vram` is omitted from the response.
#### Examples
### Request
```shell
curl http://localhost:11434/api/ps
```
@@ -1080,4 +1079,4 @@ A single JSON object will be returned.
}
]
}
```
```
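
The `/api/ps` hunk above documents the endpoint, but the response example is truncated in this view. A minimal Go client sketch; the JSON field names (`name`, `size`, `size_vram`) are assumptions taken from the `ProcessModelResponse` shape that appears later in this diff, not a verified contract:

```go
package main

import (
	"encoding/json"
	"fmt"
	"net/http"
)

// psResponse mirrors the documented /api/ps response shape.
type psResponse struct {
	Models []struct {
		Name     string `json:"name"`
		Size     int64  `json:"size"`
		SizeVRAM int64  `json:"size_vram"` // zero/omitted when the model is fully in system memory
	} `json:"models"`
}

func main() {
	resp, err := http.Get("http://localhost:11434/api/ps")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	var ps psResponse
	if err := json.NewDecoder(resp.Body).Decode(&ps); err != nil {
		panic(err)
	}
	for _, m := range ps.Models {
		fmt.Printf("%s: %d bytes (%d in VRAM)\n", m.Name, m.Size, m.SizeVRAM)
	}
}
```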

View File

@@ -1,170 +1,99 @@
# Import a model
# Import
This guide walks through importing a GGUF, PyTorch or Safetensors model.
GGUF models and select Safetensors models can be imported directly into Ollama.
## Importing (GGUF)
## Import GGUF
### Step 1: Write a `Modelfile`
A binary GGUF file can be imported directly into Ollama through a Modelfile.
Start by creating a `Modelfile`. This file is the blueprint for your model, specifying weights, parameters, prompt templates and more.
```
FROM ./mistral-7b-v0.1.Q4_0.gguf
```dockerfile
FROM /path/to/file.gguf
```
(Optional) Many chat models require a prompt template in order to answer correctly. A default prompt template can be specified with the `TEMPLATE` instruction in the `Modelfile`:
## Import Safetensors
```
FROM ./mistral-7b-v0.1.Q4_0.gguf
TEMPLATE "[INST] {{ .Prompt }} [/INST]"
If the model being imported is one of these architectures, it can be imported directly into Ollama through a Modelfile:
- LlamaForCausalLM
- MistralForCausalLM
- GemmaForCausalLM
```dockerfile
FROM /path/to/safetensors/directory
```
### Step 2: Create the Ollama model
For architectures not directly convertible by Ollama, see llama.cpp's [guide](https://github.com/ggerganov/llama.cpp/blob/master/README.md#prepare-and-quantize) on conversion. After conversion, see [Import GGUF](#import-gguf).
Finally, create a model from your `Modelfile`:
## Automatic Quantization
> [!NOTE]
> Automatic quantization requires v0.1.35 or higher.
Ollama is capable of quantizing FP16 or FP32 models to any of the supported quantizations with the `-q/--quantize` flag in `ollama create`.
```dockerfile
FROM /path/to/my/gemma/f16/model
```
ollama create example -f Modelfile
```
### Step 3: Run your model
Next, test the model with `ollama run`:
```
ollama run example "What is your favourite condiment?"
```
## Importing (PyTorch & Safetensors)
> Importing from PyTorch and Safetensors is a longer process than importing from GGUF. Improvements that make it easier are a work in progress.
### Setup
First, clone the `ollama/ollama` repo:
```
git clone git@github.com:ollama/ollama.git ollama
cd ollama
```
and then fetch its `llama.cpp` submodule:
```shell
git submodule init
git submodule update llm/llama.cpp
$ ollama create -q Q4_K_M mymodel
transferring model data
quantizing F16 model to Q4_K_M
creating new layer sha256:735e246cc1abfd06e9cdcf95504d6789a6cd1ad7577108a70d9902fef503c1bd
creating new layer sha256:0853f0ad24e5865173bbf9ffcc7b0f5d56b66fd690ab1009867e45e7d2c4db0f
writing manifest
success
```
Next, install the Python dependencies:
### Supported Quantizations
```
python3 -m venv llm/llama.cpp/.venv
source llm/llama.cpp/.venv/bin/activate
pip install -r llm/llama.cpp/requirements.txt
<details>
<summary>Legacy Quantization</summary>
- `Q4_0`
- `Q4_1`
- `Q5_0`
- `Q5_1`
- `Q8_0`
</details>
<details>
<summary>K-means Quantization</summary>
- `Q3_K_S`
- `Q3_K_M`
- `Q3_K_L`
- `Q4_K_S`
- `Q4_K_M`
- `Q5_K_S`
- `Q5_K_M`
- `Q6_K`
</details>
> [!NOTE]
> Activation-aware Weight Quantization (i.e. IQ) is not currently supported for automatic quantization; however, you can still import the quantized model into Ollama. See [Import GGUF](#import-gguf).
## Template Detection
> [!NOTE]
> Template detection requires v0.1.42 or higher.
Ollama uses model metadata, specifically `tokenizer.chat_template`, to automatically create a template appropriate for the model you're importing.
```dockerfile
FROM /path/to/my/gemma/model
```
Then build the `quantize` tool:
```
make -C llm/llama.cpp quantize
```shell
$ ollama create mymodel
transferring model data
using autodetected template gemma-instruct
creating new layer sha256:baa2a0edc27d19cc6b7537578a9a7ba1a4e3214dc185ed5ae43692b319af7b84
creating new layer sha256:ba66c3309914dbef07e5149a648fd1877f030d337a4f240d444ea335008943cb
writing manifest
success
```
### Clone the HuggingFace repository (optional)
If the model is currently hosted in a HuggingFace repository, first clone that repository to download the raw model.
Install [Git LFS](https://docs.github.com/en/repositories/working-with-files/managing-large-files/installing-git-large-file-storage), verify it's installed, and then clone the model's repository:
```
git lfs install
git clone https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1 model
```
### Convert the model
> Note: some model architectures require using specific convert scripts. For example, Qwen models require running `convert-hf-to-gguf.py` instead of `convert.py`.
```
python llm/llama.cpp/convert.py ./model --outtype f16 --outfile converted.bin
```
### Quantize the model
```
llm/llama.cpp/quantize converted.bin quantized.bin q4_0
```
### Step 3: Write a `Modelfile`
Next, create a `Modelfile` for your model:
```
FROM quantized.bin
TEMPLATE "[INST] {{ .Prompt }} [/INST]"
```
### Step 4: Create the Ollama model
Finally, create a model from your `Modelfile`:
```
ollama create example -f Modelfile
```
### Step 5: Run your model
Next, test the model with `ollama run`:
```
ollama run example "What is your favourite condiment?"
```
## Publishing your model (optional early alpha)
Publishing models is in early alpha. If you'd like to publish your model to share with others, follow these steps:
1. Create [an account](https://ollama.com/signup)
2. Copy your Ollama public key:
- macOS: `cat ~/.ollama/id_ed25519.pub | pbcopy`
- Windows: `type %USERPROFILE%\.ollama\id_ed25519.pub`
- Linux: `cat /usr/share/ollama/.ollama/id_ed25519.pub`
3. Add your public key to your [Ollama account](https://ollama.com/settings/keys)
Next, copy your model to your username's namespace:
```
ollama cp example <your username>/example
```
> Note: model names may only contain lowercase letters, digits, and the characters `.`, `-`, and `_`.
Then push the model:
```
ollama push <your username>/example
```
After publishing, your model will be available at `https://ollama.com/<your username>/example`.
## Quantization reference
The quantization options are as follows (from highest to lowest level of quantization). Note: some architectures, such as Falcon, do not support K quants.
- `q2_K`
- `q3_K`
- `q3_K_S`
- `q3_K_M`
- `q3_K_L`
- `q4_0` (recommended)
- `q4_1`
- `q4_K`
- `q4_K_S`
- `q4_K_M`
- `q5_0`
- `q5_1`
- `q5_K`
- `q5_K_S`
- `q5_K_M`
- `q6_K`
- `q8_0`
- `f16`
Defining a template in the Modelfile will disable this feature, which may be useful if you want to use a different template than the autodetected one.

View File

@@ -100,6 +100,16 @@ sudo curl -L https://ollama.com/download/ollama-linux-amd64 -o /usr/bin/ollama
 sudo chmod +x /usr/bin/ollama
 ```
+
+## Installing specific versions
+
+Use the `OLLAMA_VERSION` environment variable with the install script to install a specific version of Ollama, including pre-releases. You can find version numbers on the [releases page](https://github.com/ollama/ollama/releases).
+
+For example:
+
+```
+curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION=0.1.32 sh
+```
+
 ## Viewing logs
 To view logs of Ollama running as a startup service, run:

View File

@@ -77,13 +77,21 @@ LOADER_MAPPING = {
 def load_single_document(file_path: str) -> List[Document]:
-    ext = "." + file_path.rsplit(".", 1)[-1]
-    if ext in LOADER_MAPPING:
-        loader_class, loader_args = LOADER_MAPPING[ext]
-        loader = loader_class(file_path, **loader_args)
-        return loader.load()
+    if os.path.getsize(file_path) != 0:
+        filename, ext = os.path.splitext(file_path)
+        if ext in LOADER_MAPPING:
+            loader_class, loader_args = LOADER_MAPPING[ext]
+            try:
+                loader = loader_class(file_path, **loader_args)
+                if loader:
+                    return loader.load()
+            except:
+                print(f"Corrupted file {file_path}. Ignoring it.")
+        else:
+            print(f"Unsupported file {file_path}. Ignoring it.")
+    else:
+        print(f"Empty file {file_path}. Ignoring it.")
-    raise ValueError(f"Unsupported file extension '{ext}'")
def load_documents(source_dir: str, ignored_files: List[str] = []) -> List[Document]:
"""
@@ -100,7 +108,8 @@ def load_documents(source_dir: str, ignored_files: List[str] = []) -> List[Docum
         results = []
         with tqdm(total=len(filtered_files), desc='Loading new documents', ncols=80) as pbar:
             for i, docs in enumerate(pool.imap_unordered(load_single_document, filtered_files)):
-                results.extend(docs)
+                if docs:
+                    results.extend(docs)
                 pbar.update()
     return results
return results

View File

@@ -11,4 +11,5 @@ tabulate==0.9.0
 pandoc==2.3
 pypandoc==1.11
 tqdm==4.66.1
-sentence_transformers==2.2.2
+sentence_transformers==2.2.2
+numpy>=1.22.2 # not directly required, pinned by Snyk to avoid a vulnerability

View File

@@ -231,8 +231,7 @@ const (
 	// Magic constant for `ggla` files (LoRA adapter).
 	FILE_MAGIC_GGLA = 0x67676C61
 	// Magic constant for `gguf` files (versioned, gguf)
-	FILE_MAGIC_GGUF = 0x46554747
+	FILE_MAGIC_GGUF_LE = 0x46554747
+	FILE_MAGIC_GGUF_BE = 0x47475546
 )
 var ErrUnsupportedFormat = errors.New("unsupported model format")
@@ -247,7 +246,7 @@ func DetectGGMLType(b []byte) string {
return "ggjt"
case FILE_MAGIC_GGLA:
return "ggla"
case FILE_MAGIC_GGUF_LE, FILE_MAGIC_GGUF_BE:
case FILE_MAGIC_GGUF:
return "gguf"
default:
return ""
@@ -255,21 +254,19 @@ func DetectGGMLType(b []byte) string {
 }
 func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) {
-	var magic uint32
+	var magic [4]byte
 	if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
 		return nil, 0, err
 	}
 	var c container
-	switch magic {
+	switch binary.LittleEndian.Uint32(magic[:]) {
 	case FILE_MAGIC_GGML, FILE_MAGIC_GGMF, FILE_MAGIC_GGJT:
 		return nil, 0, ErrUnsupportedFormat
 	case FILE_MAGIC_GGLA:
 		c = &containerGGLA{}
-	case FILE_MAGIC_GGUF:
+	case FILE_MAGIC_GGUF_LE:
 		c = &containerGGUF{ByteOrder: binary.LittleEndian}
+	case FILE_MAGIC_GGUF_BE:
+		c = &containerGGUF{ByteOrder: binary.BigEndian}
 	default:
 		return nil, 0, errors.New("invalid file magic")
 	}
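
Why decoding the raw magic bytes as little-endian yields two distinct constants: a GGUF file always begins with the uint32 magic 0x46554747, which a little-endian writer emits as the bytes "GGUF" and a big-endian writer as "FUGG". A self-contained sketch of the check (hypothetical names, not the committed code):

```go
package main

import (
	"encoding/binary"
	"fmt"
)

const (
	fileMagicGGUFLE = 0x46554747 // the bytes "GGUF" decoded little-endian
	fileMagicGGUFBE = 0x47475546 // the bytes "FUGG" decoded little-endian
)

// sniffByteOrder inspects the first four bytes of a GGUF file and
// reports the byte order the rest of the file was written with.
func sniffByteOrder(magic [4]byte) (binary.ByteOrder, error) {
	switch binary.LittleEndian.Uint32(magic[:]) {
	case fileMagicGGUFLE:
		return binary.LittleEndian, nil
	case fileMagicGGUFBE:
		return binary.BigEndian, nil
	default:
		return nil, fmt.Errorf("invalid file magic %x", magic)
	}
}

func main() {
	le, _ := sniffByteOrder([4]byte{'G', 'G', 'U', 'F'})
	be, _ := sniffByteOrder([4]byte{'F', 'U', 'G', 'G'})
	fmt.Println(le, be) // LittleEndian BigEndian
}
```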

View File

@@ -36,10 +36,23 @@ func (c *containerGGUF) Name() string {
 }
 func (c *containerGGUF) Decode(rs io.ReadSeeker) (model, error) {
-	if err := binary.Read(rs, c.ByteOrder, &c.Version); err != nil {
+	var version [4]byte
+	if err := binary.Read(rs, c.ByteOrder, &version); err != nil {
 		return nil, err
 	}
+	// if the lower 16 bits are 0, the byte order is probably wrong
+	if c.ByteOrder.Uint32(version[:])&1<<4 == 0 {
+		switch c.ByteOrder {
+		case binary.LittleEndian:
+			c.ByteOrder = binary.BigEndian
+		case binary.BigEndian:
+			c.ByteOrder = binary.LittleEndian
+		}
+	}
+	c.Version = c.ByteOrder.Uint32(version[:])
 	var err error
 	switch c.Version {
 	case 1:
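
One note on the hunk above: the comment describes testing the lower 16 bits, while the expression tests a single bit (`&1<<4` parses as `& 16` in Go). A standalone sketch following the comment's reading (GGUF versions are small integers, so the low 16 bits are nonzero under the correct byte order); treat this as an interpretation, not the committed logic:

```go
package main

import (
	"encoding/binary"
	"fmt"
)

// fixByteOrder flips the byte order when the decoded version looks
// implausible: if the low 16 bits are zero, the magnitude landed in the
// high bytes, so the file was almost certainly written the other way.
func fixByteOrder(order binary.ByteOrder, version [4]byte) binary.ByteOrder {
	if order.Uint32(version[:])&0xFFFF == 0 {
		if order == binary.ByteOrder(binary.LittleEndian) {
			return binary.BigEndian
		}
		return binary.LittleEndian
	}
	return order
}

func main() {
	// Version 3 written big-endian but decoded little-endian reads as
	// 0x03000000; the low 16 bits are zero, so the order is flipped.
	v := [4]byte{0x00, 0x00, 0x00, 0x03}
	fmt.Println(fixByteOrder(binary.LittleEndian, v)) // BigEndian
}
```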

View File

@@ -606,7 +606,7 @@ array ::=
 string ::=
   "\"" (
-    [^"\\] |
+    [^"\\\x7F\x00-\x1F] |
     "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
   )* "\"" ws

View File

@@ -3,12 +3,15 @@ package parser
 import (
 	"bufio"
 	"bytes"
+	"encoding/binary"
 	"errors"
 	"fmt"
 	"io"
+	"log/slog"
 	"strconv"
 	"strings"
 	"unicode"
+	"unicode/utf16"
 	"unicode/utf8"
 )
type File struct {
@@ -69,33 +72,31 @@ func ParseFile(r io.Reader) (*File, error) {
 	var b bytes.Buffer
 	var role string
-	var lineCount int
-	var linePos int
-	var utf16 bool
 	var f File
 	br := bufio.NewReader(r)
-	for {
-		r, _, err := br.ReadRune()
-		if errors.Is(err, io.EOF) {
-			break
-		} else if err != nil {
+	var sc scannerDecoder = utf8ScannerDecoder{}
+	if bom, err := br.Peek(2); err != nil {
+		slog.Warn("error reading byte-order mark", "error", err)
+	} else if bytes.Equal(bom, []byte{0xFE, 0xFF}) {
+		sc = utf16ScannerDecoder{binary.LittleEndian}
+		//nolint:errcheck
+		br.Discard(2)
+	} else if bytes.Equal(bom, []byte{0xFF, 0xFE}) {
+		sc = utf16ScannerDecoder{binary.BigEndian}
+		//nolint:errcheck
+		br.Discard(2)
+	}
+	scanner := bufio.NewScanner(br)
+	scanner.Split(sc.ScanBytes)
+	for scanner.Scan() {
+		r, err := sc.DecodeRune(scanner.Bytes())
+		if err != nil {
 			return nil, err
 		}
-		// the utf16 byte order mark will be read as "unreadable" by ReadRune()
-		if isUnreadable(r) && lineCount == 0 && linePos == 0 {
-			utf16 = true
-			continue
-		}
-		// skip the second byte if we're reading utf16
-		if utf16 && r == 0 {
-			continue
-		}
 		next, r, err := parseRuneForState(r, curr)
 		if errors.Is(err, io.ErrUnexpectedEOF) {
 			return nil, fmt.Errorf("%w: %s", err, b.String())
@@ -103,13 +104,6 @@ func ParseFile(r io.Reader) (*File, error) {
 			return nil, err
 		}
-		if isNewline(r) {
-			lineCount++
-			linePos = 0
-		} else {
-			linePos++
-		}
 		// process the state transition, some transitions need to be intercepted and redirected
 		if next != curr {
 			switch curr {
@@ -309,10 +303,6 @@ func isNewline(r rune) bool {
 	return r == '\r' || r == '\n'
 }
-func isUnreadable(r rune) bool {
-	return r == unicode.ReplacementChar
-}
 func isValidMessageRole(role string) bool {
 	return role == "system" || role == "user" || role == "assistant"
 }
@@ -325,3 +315,39 @@ func isValidCommand(cmd string) bool {
 		return false
 	}
 }
+
+type scannerDecoder interface {
+	ScanBytes(data []byte, atEOF bool) (advance int, token []byte, err error)
+	DecodeRune([]byte) (rune, error)
+}
+
+type utf8ScannerDecoder struct{}
+
+func (utf8ScannerDecoder) ScanBytes(data []byte, atEOF bool) (advance int, token []byte, err error) {
+	return scanBytesN(data, 1, atEOF)
+}
+
+func (utf8ScannerDecoder) DecodeRune(data []byte) (rune, error) {
+	r, _ := utf8.DecodeRune(data)
+	return r, nil
+}
+
+type utf16ScannerDecoder struct {
+	binary.ByteOrder
+}
+
+func (utf16ScannerDecoder) ScanBytes(data []byte, atEOF bool) (advance int, token []byte, err error) {
+	return scanBytesN(data, 2, atEOF)
+}
+
+func (e utf16ScannerDecoder) DecodeRune(data []byte) (rune, error) {
+	return utf16.Decode([]uint16{e.ByteOrder.Uint16(data)})[0], nil
+}
+
+func scanBytesN(data []byte, n int, atEOF bool) (int, []byte, error) {
+	if atEOF && len(data) == 0 {
+		return 0, nil, nil
+	}
+	return n, data[:n], nil
+}
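
For reference, a self-contained sketch of BOM-driven UTF-16 decoding (hypothetical helper, not the parser above). Note the Unicode convention: FE FF introduces big-endian UTF-16 and FF FE little-endian; the hunk above pairs them the other way around, which may be worth a second look:

```go
package main

import (
	"bytes"
	"encoding/binary"
	"fmt"
	"unicode/utf16"
)

// detectBOM reports the byte order declared by a UTF-16 byte-order mark
// and how many bytes to skip, or (nil, 0) when no BOM is present.
func detectBOM(b []byte) (binary.ByteOrder, int) {
	switch {
	case bytes.HasPrefix(b, []byte{0xFE, 0xFF}):
		return binary.BigEndian, 2
	case bytes.HasPrefix(b, []byte{0xFF, 0xFE}):
		return binary.LittleEndian, 2
	default:
		return nil, 0
	}
}

func main() {
	// "Hi" encoded as UTF-16LE with a BOM.
	data := []byte{0xFF, 0xFE, 'H', 0x00, 'i', 0x00}
	order, skip := detectBOM(data)
	var units []uint16
	for p := data[skip:]; len(p) >= 2; p = p[2:] {
		units = append(units, order.Uint16(p))
	}
	fmt.Println(string(utf16.Decode(units))) // Hi
}
```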

View File

@@ -3,6 +3,7 @@ package server
 import (
 	"crypto/sha256"
 	"encoding/json"
+	"errors"
 	"fmt"
 	"io"
 	"log/slog"
@@ -43,7 +44,9 @@ func (m *Manifest) Remove() error {
 func (m *Manifest) RemoveLayers() error {
 	for _, layer := range append(m.Layers, m.Config) {
-		if err := layer.Remove(); err != nil {
+		if err := layer.Remove(); errors.Is(err, os.ErrNotExist) {
+			slog.Debug("layer does not exist", "digest", layer.Digest)
+		} else if err != nil {
 			return err
 		}
 	}
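
The change above is the idempotent-delete idiom: a layer that is already gone is treated as success, so a manifest that lists the same layer twice (exercised by TestDeleteDuplicateLayers later in this diff) no longer fails on the second removal. A minimal sketch of the same pattern:

```go
package main

import (
	"errors"
	"fmt"
	"os"
)

// removeIfExists deletes path but treats "already gone" as success.
func removeIfExists(path string) error {
	if err := os.Remove(path); err != nil && !errors.Is(err, os.ErrNotExist) {
		return err
	}
	return nil
}

func main() {
	fmt.Println(removeIfExists("/tmp/does-not-exist")) // <nil>
}
```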

View File

@@ -747,8 +747,8 @@ func (s *Server) ListModelsHandler(c *gin.Context) {
 		// tag should never be masked
 		models = append(models, api.ListModelResponse{
-			Model:      n.DisplayShortest(),
-			Name:       n.DisplayShortest(),
+			Model:      trimLatest(n.DisplayShortest()),
+			Name:       trimLatest(n.DisplayShortest()),
 			Size:       m.Size(),
 			Digest:     m.digest,
 			ModifiedAt: m.fi.ModTime(),
@@ -1156,8 +1156,8 @@ func (s *Server) ProcessHandler(c *gin.Context) {
 	}
 	mr := api.ProcessModelResponse{
-		Model:    model.ShortName,
-		Name:     model.ShortName,
+		Model:    trimLatest(model.ShortName),
+		Name:     trimLatest(model.ShortName),
 		Size:     int64(v.estimatedTotal),
 		SizeVRAM: int64(v.estimatedVRAM),
 		Digest:   model.Digest,
@@ -1178,6 +1178,13 @@ func (s *Server) ProcessHandler(c *gin.Context) {
 	c.JSON(http.StatusOK, api.ProcessResponse{Models: models})
 }
+
+func trimLatest(s string) string {
+	if strings.HasSuffix(s, ":latest") {
+		return s[:len(s)-7]
+	}
+	return s
+}
+
 // ChatPrompt builds up a prompt from a series of messages for the currently `loaded` model
 func chatPrompt(ctx context.Context, runner *runnerRef, template string, messages []api.Message, numCtx int) (string, error) {
 	encode := func(s string) ([]int, error) {
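
A side note on `trimLatest` above: the hand-rolled suffix slice is equivalent to `strings.TrimSuffix`, which avoids the magic length 7. A sketch of the alternative:

```go
package main

import (
	"fmt"
	"strings"
)

// trimLatest drops a trailing ":latest" tag, if present; behaviorally
// equivalent to the hand-rolled version in the diff above.
func trimLatest(s string) string {
	return strings.TrimSuffix(s, ":latest")
}

func main() {
	fmt.Println(trimLatest("test-model:latest"))            // test-model
	fmt.Println(trimLatest("myhost/mynamespace/lips:code")) // unchanged
}
```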

View File

@@ -1,12 +1,15 @@
 package server

 import (
+	"bytes"
+	"encoding/json"
 	"fmt"
 	"net/http"
 	"path/filepath"
 	"testing"

 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/types/model"
 )
func TestDelete(t *testing.T) {
@@ -69,3 +72,33 @@ func TestDelete(t *testing.T) {
checkFileExists(t, filepath.Join(p, "manifests", "*", "*", "*", "*"), []string{})
checkFileExists(t, filepath.Join(p, "blobs", "*"), []string{})
}
func TestDeleteDuplicateLayers(t *testing.T) {
p := t.TempDir()
t.Setenv("OLLAMA_MODELS", p)
var s Server
n := model.ParseName("test")
var b bytes.Buffer
if err := json.NewEncoder(&b).Encode(&ConfigV2{}); err != nil {
t.Fatal(err)
}
config, err := NewLayer(&b, "application/vnd.docker.container.image.v1+json")
if err != nil {
t.Fatal(err)
}
// create a manifest with duplicate layers
if err := WriteManifest(n, config, []*Layer{config}); err != nil {
t.Fatal(err)
}
w := createRequest(t, s.DeleteModelHandler, api.DeleteRequest{Name: "test"})
if w.Code != http.StatusOK {
t.Errorf("expected status code 200, actual %d", w.Code)
}
checkFileExists(t, filepath.Join(p, "manifests", "*", "*", "*", "*"), []string{})
}

View File

@@ -16,12 +16,12 @@ func TestList(t *testing.T) {
 	expectNames := []string{
 		"mistral:7b-instruct-q4_0",
 		"zephyr:7b-beta-q5_K_M",
-		"apple/OpenELM:latest",
+		"apple/OpenELM",
 		"boreas:2b-code-v1.5-q6_K",
 		"notus:7b-v1-IQ2_S",
 		// TODO: host:port currently fails on windows (#4107)
 		// "localhost:5000/library/eurus:700b-v0.5-iq3_XXS",
-		"mynamespace/apeliotes:latest",
+		"mynamespace/apeliotes",
 		"myhost/mynamespace/lips:code",
 	}

View File

@@ -123,7 +123,7 @@ func Test_Routes(t *testing.T) {
 			require.NoError(t, err)
 			assert.Len(t, modelList.Models, 1)
-			assert.Equal(t, "test-model:latest", modelList.Models[0].Name)
+			assert.Equal(t, "test-model", modelList.Models[0].Name)
 		},
 	},
 	{
{