Mirror of https://github.com/ollama/ollama.git (synced 2026-01-08 07:30:53 -05:00)

Compare commits: parth/tool ... brucemacd/ (72 commits)
Commits in this comparison (SHA1):
d5eae8248d, 9f8a18ec05, 6b04cad7e8, 45f56355d5, 0dabb4ef6a, 2e77aa1ae7, deaabe292d, af21a5ac39,
f63d7f68eb, 82ad1dbc07, feeabdadd2, fc0309615e, 09d308d6b6, a8ed68bd93, 2ae65ae471, a3b6886b7d,
c6a6d7294d, 2cf007c9d1, 0683efa637, 0943001193, 5c42800fca, 65f10c2823, aaa7818000, f15ffc4320,
5f57b0ef42, aa25aff10d, ea79003180, 9239a254e0, 066d0f4746, aea6fb9b58, 012cf65340, a45231af47,
2307fc2bcd, 6623898198, eda472df1b, f18e0cb550, e8b981fa5d, 884d26093c, 1f371ea92f, 73d6a82cce,
6db8a3771c, d950ff12c0, adff143bcd, fbe6ae285a, fdd4d479a3, 61aeaf7e81, 7359b02707, c890011322,
e0ed984cde, 139f84cf21, 375839ea2d, 69b2fe9282, 9ed8bf14cb, e6a800ca11, ff180c3466, 3fe74fba42,
1a0cfd080a, 94ab428e3f, d755577473, a2cc8571c5, 7edfdd2f5f, 333e360422, 27da2cddc5, feb8923ada,
fe623c2cf4, 3c14461d5d, 499ae7311f, ef202789fa, 55760195e6, bd68d3ae50, ff80718e9c, 0aa8b371dd
@@ -51,6 +51,8 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/include
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cpu)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cpu/amx)

add_compile_definitions(NDEBUG)

set(GGML_CPU ON)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src)
set_property(TARGET ggml PROPERTY EXCLUDE_FROM_ALL TRUE)
@@ -40,10 +40,10 @@ The official [Ollama Docker image](https://hub.docker.com/r/ollama/ollama) `olla
## Quickstart

To run and chat with [Llama 3.2](https://ollama.com/library/llama3.2):
To run and chat with [Gemma 3](https://ollama.com/library/gemma3):

```shell
ollama run llama3.2
ollama run gemma3
```

## Model library
@@ -405,6 +405,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Writeopia](https://github.com/Writeopia/Writeopia) (Text editor with integration with Ollama)
- [AppFlowy](https://github.com/AppFlowy-IO/AppFlowy) (AI collaborative workspace with Ollama, cross-platform and self-hostable)
- [Lumina](https://github.com/cushydigit/lumina.git) (A lightweight, minimal React.js frontend for interacting with Ollama servers)
- [Tiny Notepad](https://pypi.org/project/tiny-notepad) (A lightweight, notepad-like interface to chat with Ollama, available on PyPI)
- [macLlama (macOS native)](https://github.com/hellotunamayo/macLlama) (A native macOS GUI application for interacting with Ollama models, featuring a chat interface)

### Cloud

@@ -448,6 +450,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [orbiton](https://github.com/xyproto/orbiton) Configuration-free text editor and IDE with support for tab completion with Ollama.
- [orca-cli](https://github.com/molbal/orca-cli) Ollama Registry CLI Application - Browse, pull, and download models from Ollama Registry in your terminal.
- [GGUF-to-Ollama](https://github.com/jonathanhecl/gguf-to-ollama) - Importing GGUF to Ollama made easy (multiplatform)
- [AWS-Strands-With-Ollama](https://github.com/rapidarchitect/ollama_strands) - AWS Strands Agents with Ollama Examples
- [ollama-multirun](https://github.com/attogram/ollama-multirun) - A bash shell script to run a single prompt against any or all of your locally installed ollama models, saving the output and performance statistics as easily navigable web pages. ([Demo](https://attogram.github.io/ai_test_zone/))

### Apple Vision Pro

@@ -584,6 +588,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Simple-Discord-AI](https://github.com/zyphixor/simple-discord-ai)
- [LLM Telegram Bot](https://github.com/innightwolfsleep/llm_telegram_bot) (Telegram bot, primarily for RP. Oobabooga-like buttons, [A1111](https://github.com/AUTOMATIC1111/stable-diffusion-webui) API integration, etc.)
- [mcp-llm](https://github.com/sammcj/mcp-llm) (MCP Server to allow LLMs to call other LLMs)
- [SimpleOllamaUnity](https://github.com/HardCodeDev777/SimpleOllamaUnity) (Unity Engine extension for communicating with Ollama in a few lines of code. Also works at runtime)
- [UnityCodeLama](https://github.com/HardCodeDev777/UnityCodeLama) (Unity Editor tool to analyze scripts via Ollama)

### Supported backends
@@ -24,7 +24,10 @@ import (
    "net/http"
    "net/url"
    "runtime"
    "strconv"
    "time"

    "github.com/ollama/ollama/auth"
    "github.com/ollama/ollama/envconfig"
    "github.com/ollama/ollama/format"
    "github.com/ollama/ollama/version"
@@ -76,6 +79,14 @@ func NewClient(base *url.URL, http *http.Client) *Client {
    }
}

func getAuthorizationToken(ctx context.Context, challenge string) (string, error) {
    token, err := auth.Sign(ctx, []byte(challenge))
    if err != nil {
        return "", err
    }
    return token, nil
}

func (c *Client) do(ctx context.Context, method, path string, reqData, respData any) error {
    var reqBody io.Reader
    var data []byte
@@ -97,6 +108,21 @@ func (c *Client) do(ctx context.Context, method, path string, reqData, respData
    }

    requestURL := c.base.JoinPath(path)

    var token string
    if envconfig.UseAuth() || c.base.Hostname() == "ollama.com" {
        now := strconv.FormatInt(time.Now().Unix(), 10)
        chal := fmt.Sprintf("%s,%s?ts=%s", method, path, now)
        token, err = getAuthorizationToken(ctx, chal)
        if err != nil {
            return err
        }

        q := requestURL.Query()
        q.Set("ts", now)
        requestURL.RawQuery = q.Encode()
    }

    request, err := http.NewRequestWithContext(ctx, method, requestURL.String(), reqBody)
    if err != nil {
        return err
@@ -106,6 +132,10 @@ func (c *Client) do(ctx context.Context, method, path string, reqData, respData
    request.Header.Set("Accept", "application/json")
    request.Header.Set("User-Agent", fmt.Sprintf("ollama/%s (%s %s) Go/%s", version.Version, runtime.GOARCH, runtime.GOOS, runtime.Version()))

    if token != "" {
        request.Header.Set("Authorization", token)
    }

    respObj, err := c.http.Do(request)
    if err != nil {
        return err
@@ -143,6 +173,22 @@ func (c *Client) stream(ctx context.Context, method, path string, data any, fn f
    }

    requestURL := c.base.JoinPath(path)

    var token string
    if envconfig.UseAuth() || c.base.Hostname() == "ollama.com" {
        var err error
        now := strconv.FormatInt(time.Now().Unix(), 10)
        chal := fmt.Sprintf("%s,%s?ts=%s", method, path, now)
        token, err = getAuthorizationToken(ctx, chal)
        if err != nil {
            return err
        }

        q := requestURL.Query()
        q.Set("ts", now)
        requestURL.RawQuery = q.Encode()
    }

    request, err := http.NewRequestWithContext(ctx, method, requestURL.String(), buf)
    if err != nil {
        return err
@@ -152,6 +198,10 @@ func (c *Client) stream(ctx context.Context, method, path string, data any, fn f
    request.Header.Set("Accept", "application/x-ndjson")
    request.Header.Set("User-Agent", fmt.Sprintf("ollama/%s (%s %s) Go/%s", version.Version, runtime.GOARCH, runtime.GOOS, runtime.Version()))

    if token != "" {
        request.Header.Set("Authorization", token)
    }

    response, err := c.http.Do(request)
    if err != nil {
        return err
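Read together, the two hunks above add the same signing step to both `do` and `stream`. For reference, here is that flow pulled out into one place; this is a sketch assembled from the diff, not a helper that exists in the code, and the server-side verification of the token is not shown here.

```go
package example

import (
    "context"
    "fmt"
    "net/http"
    "net/url"
    "strconv"
    "time"

    "github.com/ollama/ollama/auth"
)

// signedRequest mirrors the client-side signing added in this diff: the
// challenge is "<METHOD>,<PATH>?ts=<unix-seconds>", signed with auth.Sign,
// sent verbatim in the Authorization header, and ts is echoed as a query
// parameter so the server can rebuild the same challenge string.
func signedRequest(ctx context.Context, base *url.URL, method, path string) (*http.Request, error) {
    now := strconv.FormatInt(time.Now().Unix(), 10)
    challenge := fmt.Sprintf("%s,%s?ts=%s", method, path, now)

    token, err := auth.Sign(ctx, []byte(challenge))
    if err != nil {
        return nil, err
    }

    requestURL := base.JoinPath(path)
    q := requestURL.Query()
    q.Set("ts", now)
    requestURL.RawQuery = q.Encode()

    req, err := http.NewRequestWithContext(ctx, method, requestURL.String(), nil)
    if err != nil {
        return nil, err
    }
    req.Header.Set("Authorization", token)
    return req, nil
}
```

In the diff itself this only happens when `envconfig.UseAuth()` is set or the client is talking to ollama.com.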
api/types.go (21 changed lines)

@@ -83,6 +83,12 @@ type GenerateRequest struct {
    // Options lists model-specific options. For example, temperature can be
    // set through this field, if the model supports it.
    Options map[string]any `json:"options"`

    // Think controls whether thinking/reasoning models will think before
    // responding. Needs to be a pointer so we can distinguish between false
    // (request that thinking _not_ be used) and unset (use the old behavior
    // before this option was introduced)
    Think *bool `json:"think,omitempty"`
}

// ChatRequest describes a request sent by [Client.Chat].

@@ -108,6 +114,10 @@ type ChatRequest struct {

    // Options lists model-specific options.
    Options map[string]any `json:"options"`

    // Think controls whether thinking/reasoning models will think before
    // responding
    Think *bool `json:"think,omitempty"`
}

type Tools []Tool

@@ -126,8 +136,11 @@ func (t Tool) String() string {
// role ("system", "user", or "assistant"), the content and an optional list
// of images.
type Message struct {
    Role    string `json:"role"`
    Content string `json:"content"`
    Role    string `json:"role"`
    Content string `json:"content"`
    // Thinking contains the text that was inside thinking tags in the
    // original model output when ChatRequest.Think is enabled.
    Thinking  string      `json:"thinking,omitempty"`
    Images    []ImageData `json:"images,omitempty"`
    ToolCalls []ToolCall  `json:"tool_calls,omitempty"`
}

@@ -478,6 +491,10 @@ type GenerateResponse struct {
    // Response is the textual response itself.
    Response string `json:"response"`

    // Thinking contains the text that was inside thinking tags in the
    // original model output when ChatRequest.Think is enabled.
    Thinking string `json:"thinking,omitempty"`

    // Done specifies if the response is complete.
    Done bool `json:"done"`
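Because `Think` is a pointer, callers can distinguish "explicitly disabled" from "not specified", which a plain `bool` cannot express. A small sketch of the three resulting JSON shapes (the model name and prompt are placeholders):

```go
package main

import (
    "encoding/json"
    "fmt"

    "github.com/ollama/ollama/api"
)

func main() {
    think := true
    nothink := false

    reqs := []api.GenerateRequest{
        {Model: "m", Prompt: "hi"},                  // Think unset: "think" is omitted, old behavior applies
        {Model: "m", Prompt: "hi", Think: &think},   // serializes "think":true
        {Model: "m", Prompt: "hi", Think: &nothink}, // serializes "think":false, thinking explicitly disabled
    }

    for _, r := range reqs {
        b, err := json.Marshal(r)
        if err != nil {
            panic(err)
        }
        fmt.Println(string(b))
    }
}
```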
@@ -372,3 +372,50 @@ func TestPropertyType_MarshalJSON(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestThinking_UnmarshalJSON(t *testing.T) {
|
||||
trueVal := true
|
||||
falseVal := false
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
input string
|
||||
expectedThinking *bool
|
||||
expectedError bool
|
||||
}{
|
||||
{
|
||||
name: "true",
|
||||
input: `{ "think": true }`,
|
||||
expectedThinking: &trueVal,
|
||||
},
|
||||
{
|
||||
name: "false",
|
||||
input: `{ "think": false }`,
|
||||
expectedThinking: &falseVal,
|
||||
},
|
||||
{
|
||||
name: "unset",
|
||||
input: `{ }`,
|
||||
expectedThinking: nil,
|
||||
},
|
||||
{
|
||||
name: "invalid",
|
||||
input: `{ "think": "true" }`,
|
||||
expectedThinking: nil,
|
||||
expectedError: true,
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
t.Run(test.name, func(t *testing.T) {
|
||||
var req GenerateRequest
|
||||
err := json.Unmarshal([]byte(test.input), &req)
|
||||
if test.expectedError {
|
||||
require.Error(t, err)
|
||||
} else {
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, test.expectedThinking, req.Think)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
cmd/cmd.go (234 changed lines)
@@ -39,6 +39,7 @@ import (
|
||||
"github.com/ollama/ollama/format"
|
||||
"github.com/ollama/ollama/parser"
|
||||
"github.com/ollama/ollama/progress"
|
||||
"github.com/ollama/ollama/readline"
|
||||
"github.com/ollama/ollama/runner"
|
||||
"github.com/ollama/ollama/server"
|
||||
"github.com/ollama/ollama/types/model"
|
||||
@@ -46,6 +47,23 @@ import (
|
||||
"github.com/ollama/ollama/version"
|
||||
)
|
||||
|
||||
// ensureThinkingSupport emits a warning if the model does not advertise thinking support
|
||||
func ensureThinkingSupport(ctx context.Context, client *api.Client, name string) {
|
||||
if name == "" {
|
||||
return
|
||||
}
|
||||
resp, err := client.Show(ctx, &api.ShowRequest{Model: name})
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
for _, cap := range resp.Capabilities {
|
||||
if cap == model.CapabilityThinking {
|
||||
return
|
||||
}
|
||||
}
|
||||
fmt.Fprintf(os.Stderr, "warning: model %q does not support thinking output\n", name)
|
||||
}
|
||||
|
||||
var errModelfileNotFound = errors.New("specified Modelfile wasn't found")
|
||||
|
||||
func getModelfileName(cmd *cobra.Command) (string, error) {
|
||||
@@ -265,6 +283,9 @@ func loadOrUnloadModel(cmd *cobra.Command, opts *runOptions) error {
|
||||
req := &api.GenerateRequest{
|
||||
Model: opts.Model,
|
||||
KeepAlive: opts.KeepAlive,
|
||||
|
||||
// pass Think here so we fail before getting to the chat prompt if the model doesn't support it
|
||||
Think: opts.Think,
|
||||
}
|
||||
|
||||
return client.Generate(cmd.Context(), req, func(api.GenerateResponse) error { return nil })
|
||||
@@ -299,6 +320,22 @@ func RunHandler(cmd *cobra.Command, args []string) error {
|
||||
}
|
||||
opts.Format = format
|
||||
|
||||
thinkFlag := cmd.Flags().Lookup("think")
|
||||
if thinkFlag.Changed {
|
||||
think, err := cmd.Flags().GetBool("think")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
opts.Think = &think
|
||||
} else {
|
||||
opts.Think = nil
|
||||
}
|
||||
hidethinking, err := cmd.Flags().GetBool("hidethinking")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
opts.HideThinking = hidethinking
|
||||
|
||||
keepAlive, err := cmd.Flags().GetString("keepalive")
|
||||
if err != nil {
|
||||
return err
|
||||
@@ -362,6 +399,11 @@ func RunHandler(cmd *cobra.Command, args []string) error {
|
||||
return err
|
||||
}
|
||||
|
||||
opts.Think, err = inferThinkingOption(&info.Capabilities, &opts, thinkFlag.Changed)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
opts.MultiModal = slices.Contains(info.Capabilities, model.CapabilityVision)
|
||||
|
||||
// TODO: remove the projector info and vision info checks below,
|
||||
@@ -747,11 +789,38 @@ func showInfo(resp *api.ShowResponse, verbose bool, w io.Writer) error {
|
||||
case float64:
|
||||
v = fmt.Sprintf("%g", vData)
|
||||
case []any:
|
||||
n := 3
|
||||
if len(vData) < n {
|
||||
n = len(vData)
|
||||
targetWidth := 10 // Small width where we are displaying the data in a column
|
||||
|
||||
var itemsToShow int
|
||||
totalWidth := 1 // Start with 1 for opening bracket
|
||||
|
||||
// Find how many we can fit
|
||||
for i := range vData {
|
||||
itemStr := fmt.Sprintf("%v", vData[i])
|
||||
width := runewidth.StringWidth(itemStr)
|
||||
|
||||
// Add separator width (", ") for all items except the first
|
||||
if i > 0 {
|
||||
width += 2
|
||||
}
|
||||
|
||||
// Check if adding this item would exceed our width limit
|
||||
if totalWidth+width > targetWidth && i > 0 {
|
||||
break
|
||||
}
|
||||
|
||||
totalWidth += width
|
||||
itemsToShow++
|
||||
}
|
||||
|
||||
// Format the output
|
||||
if itemsToShow < len(vData) {
|
||||
v = fmt.Sprintf("%v", vData[:itemsToShow])
|
||||
v = strings.TrimSuffix(v, "]")
|
||||
v += fmt.Sprintf(" ...+%d more]", len(vData)-itemsToShow)
|
||||
} else {
|
||||
v = fmt.Sprintf("%v", vData)
|
||||
}
|
||||
v = fmt.Sprintf("%v", vData[:n])
|
||||
default:
|
||||
v = fmt.Sprintf("%T", vData)
|
||||
}
|
||||
@@ -772,10 +841,19 @@ func showInfo(resp *api.ShowResponse, verbose bool, w io.Writer) error {
|
||||
|
||||
head := func(s string, n int) (rows [][]string) {
|
||||
scanner := bufio.NewScanner(strings.NewReader(s))
|
||||
for scanner.Scan() && (len(rows) < n || n < 0) {
|
||||
if text := scanner.Text(); text != "" {
|
||||
rows = append(rows, []string{"", strings.TrimSpace(text)})
|
||||
count := 0
|
||||
for scanner.Scan() {
|
||||
text := strings.TrimSpace(scanner.Text())
|
||||
if text == "" {
|
||||
continue
|
||||
}
|
||||
count++
|
||||
if n < 0 || count <= n {
|
||||
rows = append(rows, []string{"", text})
|
||||
}
|
||||
}
|
||||
if n >= 0 && count > n {
|
||||
rows = append(rows, []string{"", "..."})
|
||||
}
|
||||
return
|
||||
}
|
||||
@@ -887,17 +965,19 @@ func PullHandler(cmd *cobra.Command, args []string) error {
|
||||
type generateContextKey string
|
||||
|
||||
type runOptions struct {
|
||||
Model string
|
||||
ParentModel string
|
||||
Prompt string
|
||||
Messages []api.Message
|
||||
WordWrap bool
|
||||
Format string
|
||||
System string
|
||||
Images []api.ImageData
|
||||
Options map[string]any
|
||||
MultiModal bool
|
||||
KeepAlive *api.Duration
|
||||
Model string
|
||||
ParentModel string
|
||||
Prompt string
|
||||
Messages []api.Message
|
||||
WordWrap bool
|
||||
Format string
|
||||
System string
|
||||
Images []api.ImageData
|
||||
Options map[string]any
|
||||
MultiModal bool
|
||||
KeepAlive *api.Duration
|
||||
Think *bool
|
||||
HideThinking bool
|
||||
}
|
||||
|
||||
type displayResponseState struct {
|
||||
@@ -953,6 +1033,26 @@ func displayResponse(content string, wordWrap bool, state *displayResponseState)
|
||||
}
|
||||
}
|
||||
|
||||
func thinkingOutputOpeningText(plainText bool) string {
|
||||
text := "Thinking...\n"
|
||||
|
||||
if plainText {
|
||||
return text
|
||||
}
|
||||
|
||||
return readline.ColorGrey + readline.ColorBold + text + readline.ColorDefault + readline.ColorGrey
|
||||
}
|
||||
|
||||
func thinkingOutputClosingText(plainText bool) string {
|
||||
text := "...done thinking.\n\n"
|
||||
|
||||
if plainText {
|
||||
return text
|
||||
}
|
||||
|
||||
return readline.ColorGrey + readline.ColorBold + text + readline.ColorDefault
|
||||
}
|
||||
|
||||
func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
|
||||
client, err := api.ClientFromEnvironment()
|
||||
if err != nil {
|
||||
@@ -980,14 +1080,34 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
|
||||
var latest api.ChatResponse
|
||||
var fullResponse strings.Builder
|
||||
var role string
|
||||
var thinkTagOpened bool = false
|
||||
var thinkTagClosed bool = false
|
||||
|
||||
fn := func(response api.ChatResponse) error {
|
||||
p.StopAndClear()
|
||||
if response.Message.Content != "" || !opts.HideThinking {
|
||||
p.StopAndClear()
|
||||
}
|
||||
|
||||
latest = response
|
||||
|
||||
role = response.Message.Role
|
||||
if response.Message.Thinking != "" && !opts.HideThinking {
|
||||
if !thinkTagOpened {
|
||||
fmt.Print(thinkingOutputOpeningText(false))
|
||||
thinkTagOpened = true
|
||||
}
|
||||
displayResponse(response.Message.Thinking, opts.WordWrap, state)
|
||||
}
|
||||
|
||||
content := response.Message.Content
|
||||
if thinkTagOpened && !thinkTagClosed && content != "" {
|
||||
fmt.Print(thinkingOutputClosingText(false))
|
||||
thinkTagClosed = true
|
||||
}
|
||||
// purposefully not putting thinking blocks in the response, which would
|
||||
// only be needed if we later added tool calling to the cli (they get
|
||||
// filtered out anyway since current models don't expect them unless you're
|
||||
// about to finish some tool calls)
|
||||
fullResponse.WriteString(content)
|
||||
|
||||
displayResponse(content, opts.WordWrap, state)
|
||||
@@ -1004,6 +1124,7 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
|
||||
Messages: opts.Messages,
|
||||
Format: json.RawMessage(opts.Format),
|
||||
Options: opts.Options,
|
||||
Think: opts.Think,
|
||||
}
|
||||
|
||||
if opts.KeepAlive != nil {
|
||||
@@ -1065,13 +1186,32 @@ func generate(cmd *cobra.Command, opts runOptions) error {
|
||||
}()
|
||||
|
||||
var state *displayResponseState = &displayResponseState{}
|
||||
var thinkTagOpened bool = false
|
||||
var thinkTagClosed bool = false
|
||||
|
||||
plainText := !term.IsTerminal(int(os.Stdout.Fd()))
|
||||
|
||||
fn := func(response api.GenerateResponse) error {
|
||||
p.StopAndClear()
|
||||
|
||||
latest = response
|
||||
content := response.Response
|
||||
|
||||
if response.Response != "" || !opts.HideThinking {
|
||||
p.StopAndClear()
|
||||
}
|
||||
|
||||
if response.Thinking != "" && !opts.HideThinking {
|
||||
if !thinkTagOpened {
|
||||
fmt.Print(thinkingOutputOpeningText(plainText))
|
||||
thinkTagOpened = true
|
||||
}
|
||||
displayResponse(response.Thinking, opts.WordWrap, state)
|
||||
}
|
||||
|
||||
if thinkTagOpened && !thinkTagClosed && content != "" {
|
||||
fmt.Print(thinkingOutputClosingText(plainText))
|
||||
thinkTagClosed = true
|
||||
}
|
||||
|
||||
displayResponse(content, opts.WordWrap, state)
|
||||
|
||||
return nil
|
||||
@@ -1097,6 +1237,7 @@ func generate(cmd *cobra.Command, opts runOptions) error {
|
||||
System: opts.System,
|
||||
Options: opts.Options,
|
||||
KeepAlive: opts.KeepAlive,
|
||||
Think: opts.Think,
|
||||
}
|
||||
|
||||
if err := client.Generate(ctx, &request, fn); err != nil {
|
||||
@@ -1200,11 +1341,11 @@ func checkServerHeartbeat(cmd *cobra.Command, _ []string) error {
|
||||
return err
|
||||
}
|
||||
if err := client.Heartbeat(cmd.Context()); err != nil {
|
||||
if !strings.Contains(err.Error(), " refused") {
|
||||
if !(strings.Contains(err.Error(), " refused") || strings.Contains(err.Error(), "could not connect")) {
|
||||
return err
|
||||
}
|
||||
if err := startApp(cmd.Context(), client); err != nil {
|
||||
return errors.New("could not connect to ollama app, is it running?")
|
||||
return fmt.Errorf("ollama server not responding - %w", err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
@@ -1282,7 +1423,7 @@ func NewCLI() *cobra.Command {
|
||||
}
|
||||
|
||||
createCmd.Flags().StringP("file", "f", "", "Name of the Modelfile (default \"Modelfile\"")
|
||||
createCmd.Flags().StringP("quantize", "q", "", "Quantize model to this level (e.g. q4_0)")
|
||||
createCmd.Flags().StringP("quantize", "q", "", "Quantize model to this level (e.g. q4_K_M)")
|
||||
|
||||
showCmd := &cobra.Command{
|
||||
Use: "show MODEL",
|
||||
@@ -1312,6 +1453,8 @@ func NewCLI() *cobra.Command {
|
||||
runCmd.Flags().Bool("insecure", false, "Use an insecure registry")
|
||||
runCmd.Flags().Bool("nowordwrap", false, "Don't wrap words to the next line automatically")
|
||||
runCmd.Flags().String("format", "", "Response format (e.g. json)")
|
||||
runCmd.Flags().Bool("think", false, "Whether to use thinking mode for supported models")
|
||||
runCmd.Flags().Bool("hidethinking", false, "Hide thinking output (if provided)")
|
||||
|
||||
stopCmd := &cobra.Command{
|
||||
Use: "stop MODEL",
|
||||
@@ -1363,7 +1506,6 @@ func NewCLI() *cobra.Command {
|
||||
PreRunE: checkServerHeartbeat,
|
||||
RunE: ListRunningHandler,
|
||||
}
|
||||
|
||||
copyCmd := &cobra.Command{
|
||||
Use: "cp SOURCE DESTINATION",
|
||||
Short: "Copy a model",
|
||||
@@ -1452,3 +1594,45 @@ func NewCLI() *cobra.Command {
|
||||
|
||||
return rootCmd
|
||||
}
|
||||
|
||||
// If the user has explicitly set thinking options, either through the CLI or
|
||||
// through the `/set think` or `set nothink` interactive options, then we
|
||||
// respect them. Otherwise, we check model capabilities to see if the model
|
||||
// supports thinking. If the model does support thinking, we enable it.
|
||||
// Otherwise, we unset the thinking option (which is different than setting it
|
||||
// to false).
|
||||
//
|
||||
// If capabilities are not provided, we fetch them from the server.
|
||||
func inferThinkingOption(caps *[]model.Capability, runOpts *runOptions, explicitlySetByUser bool) (*bool, error) {
|
||||
if explicitlySetByUser {
|
||||
return runOpts.Think, nil
|
||||
}
|
||||
|
||||
if caps == nil {
|
||||
client, err := api.ClientFromEnvironment()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
ret, err := client.Show(context.Background(), &api.ShowRequest{
|
||||
Model: runOpts.Model,
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
caps = &ret.Capabilities
|
||||
}
|
||||
|
||||
thinkingSupported := false
|
||||
for _, cap := range *caps {
|
||||
if cap == model.CapabilityThinking {
|
||||
thinkingSupported = true
|
||||
}
|
||||
}
|
||||
|
||||
if thinkingSupported {
|
||||
thinking := true
|
||||
return &thinking, nil
|
||||
}
|
||||
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
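For readers skimming the comment above, the decision table is: an explicit user setting wins; otherwise thinking defaults to on only when the model advertises the capability, and stays unset otherwise. A standalone sketch of that rule (the `"thinking"` capability string here stands in for `model.CapabilityThinking`; it is illustrative, not the package constant):

```go
package main

import "fmt"

// resolveThinking mirrors the resolution described above: an explicit user
// choice is returned as-is, otherwise thinking is enabled only when the
// model advertises the capability, and left unset (nil) otherwise.
func resolveThinking(userValue *bool, explicit bool, capabilities []string) *bool {
    if explicit {
        return userValue
    }
    for _, c := range capabilities {
        if c == "thinking" {
            t := true
            return &t
        }
    }
    return nil // unset: old behavior, distinct from &false
}

func main() {
    on := resolveThinking(nil, false, []string{"completion", "thinking"})
    off := resolveThinking(nil, false, []string{"completion"})
    fmt.Println(on != nil && *on, off == nil) // true true
}
```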
@@ -225,6 +225,7 @@ Weigh anchor!
|
||||
System
|
||||
You are a pirate!
|
||||
Ahoy, matey!
|
||||
...
|
||||
|
||||
`
|
||||
if diff := cmp.Diff(expect, b.String()); diff != "" {
|
||||
|
||||
@@ -62,6 +62,8 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
|
||||
fmt.Fprintln(os.Stderr, " /set noformat Disable formatting")
|
||||
fmt.Fprintln(os.Stderr, " /set verbose Show LLM stats")
|
||||
fmt.Fprintln(os.Stderr, " /set quiet Disable LLM stats")
|
||||
fmt.Fprintln(os.Stderr, " /set think Enable thinking")
|
||||
fmt.Fprintln(os.Stderr, " /set nothink Disable thinking")
|
||||
fmt.Fprintln(os.Stderr, "")
|
||||
}
|
||||
|
||||
@@ -128,6 +130,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
|
||||
|
||||
var sb strings.Builder
|
||||
var multiline MultilineState
|
||||
var thinkExplicitlySet bool = opts.Think != nil
|
||||
|
||||
for {
|
||||
line, err := scanner.Readline()
|
||||
@@ -195,11 +198,19 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
|
||||
opts.Model = args[1]
|
||||
opts.Messages = []api.Message{}
|
||||
fmt.Printf("Loading model '%s'\n", opts.Model)
|
||||
opts.Think, err = inferThinkingOption(nil, &opts, thinkExplicitlySet)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if err := loadOrUnloadModel(cmd, &opts); err != nil {
|
||||
if strings.Contains(err.Error(), "not found") {
|
||||
fmt.Printf("error: %v\n", err)
|
||||
continue
|
||||
}
|
||||
if strings.Contains(err.Error(), "does not support thinking") {
|
||||
fmt.Printf("error: %v\n", err)
|
||||
continue
|
||||
}
|
||||
return err
|
||||
}
|
||||
continue
|
||||
@@ -260,6 +271,22 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
|
||||
return err
|
||||
}
|
||||
fmt.Println("Set 'quiet' mode.")
|
||||
case "think":
|
||||
think := true
|
||||
opts.Think = &think
|
||||
thinkExplicitlySet = true
|
||||
if client, err := api.ClientFromEnvironment(); err == nil {
|
||||
ensureThinkingSupport(cmd.Context(), client, opts.Model)
|
||||
}
|
||||
fmt.Println("Set 'think' mode.")
|
||||
case "nothink":
|
||||
think := false
|
||||
opts.Think = &think
|
||||
thinkExplicitlySet = true
|
||||
if client, err := api.ClientFromEnvironment(); err == nil {
|
||||
ensureThinkingSupport(cmd.Context(), client, opts.Model)
|
||||
}
|
||||
fmt.Println("Set 'nothink' mode.")
|
||||
case "format":
|
||||
if len(args) < 3 || args[2] != "json" {
|
||||
fmt.Println("Invalid or missing format. For 'json' mode use '/set format json'")
|
||||
@@ -448,6 +475,11 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
|
||||
|
||||
assistant, err := chat(cmd, opts)
|
||||
if err != nil {
|
||||
if strings.Contains(err.Error(), "does not support thinking") {
|
||||
fmt.Printf("error: %v\n", err)
|
||||
sb.Reset()
|
||||
continue
|
||||
}
|
||||
return err
|
||||
}
|
||||
if assistant != nil {
|
||||
|
||||
@@ -5,7 +5,7 @@ import (
|
||||
"errors"
|
||||
"os"
|
||||
"os/exec"
|
||||
"strings"
|
||||
"regexp"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
)
|
||||
@@ -19,11 +19,12 @@ func startApp(ctx context.Context, client *api.Client) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if !strings.Contains(link, "Ollama.app") {
|
||||
r := regexp.MustCompile(`^.*/Ollama\s?\d*.app`)
|
||||
m := r.FindStringSubmatch(link)
|
||||
if len(m) != 1 {
|
||||
return errors.New("could not find ollama app")
|
||||
}
|
||||
path := strings.Split(link, "Ollama.app")
|
||||
if err := exec.Command("/usr/bin/open", "-a", path[0]+"Ollama.app").Run(); err != nil {
|
||||
if err := exec.Command("/usr/bin/open", "-j", "-a", m[0], "--args", "--fast-startup").Run(); err != nil {
|
||||
return err
|
||||
}
|
||||
return waitForServer(ctx, client)
|
||||
|
||||
@@ -4,17 +4,27 @@ import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"syscall"
|
||||
"unsafe"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
"golang.org/x/sys/windows"
|
||||
)
|
||||
|
||||
const (
|
||||
Installer = "OllamaSetup.exe"
|
||||
)
|
||||
|
||||
func startApp(ctx context.Context, client *api.Client) error {
|
||||
// log.Printf("XXX Attempting to find and start ollama app")
|
||||
if len(isProcRunning(Installer)) > 0 {
|
||||
return fmt.Errorf("upgrade in progress...")
|
||||
}
|
||||
AppName := "ollama app.exe"
|
||||
exe, err := os.Executable()
|
||||
if err != nil {
|
||||
@@ -35,14 +45,11 @@ func startApp(ctx context.Context, client *api.Client) error {
|
||||
}
|
||||
}
|
||||
}
|
||||
// log.Printf("XXX attempting to start app %s", appExe)
|
||||
|
||||
cmd_path := "c:\\Windows\\system32\\cmd.exe"
|
||||
cmd := exec.Command(cmd_path, "/c", appExe)
|
||||
// TODO - these hide flags aren't working - still pops up a command window for some reason
|
||||
cmd := exec.Command(cmd_path, "/c", appExe, "--hide", "--fast-startup")
|
||||
cmd.SysProcAttr = &syscall.SysProcAttr{CreationFlags: 0x08000000, HideWindow: true}
|
||||
|
||||
// TODO this didn't help either...
|
||||
cmd.Stdin = strings.NewReader("")
|
||||
cmd.Stdout = os.Stdout
|
||||
cmd.Stderr = os.Stderr
|
||||
@@ -56,3 +63,50 @@ func startApp(ctx context.Context, client *api.Client) error {
|
||||
}
|
||||
return waitForServer(ctx, client)
|
||||
}
|
||||
|
||||
func isProcRunning(procName string) []uint32 {
|
||||
pids := make([]uint32, 2048)
|
||||
var ret uint32
|
||||
if err := windows.EnumProcesses(pids, &ret); err != nil || ret == 0 {
|
||||
slog.Debug("failed to check for running installers", "error", err)
|
||||
return nil
|
||||
}
|
||||
if ret > uint32(len(pids)) {
|
||||
pids = make([]uint32, ret+10)
|
||||
if err := windows.EnumProcesses(pids, &ret); err != nil || ret == 0 {
|
||||
slog.Debug("failed to check for running installers", "error", err)
|
||||
return nil
|
||||
}
|
||||
}
|
||||
if ret < uint32(len(pids)) {
|
||||
pids = pids[:ret]
|
||||
}
|
||||
var matches []uint32
|
||||
for _, pid := range pids {
|
||||
if pid == 0 {
|
||||
continue
|
||||
}
|
||||
hProcess, err := windows.OpenProcess(windows.PROCESS_QUERY_INFORMATION|windows.PROCESS_VM_READ, false, pid)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
defer windows.CloseHandle(hProcess)
|
||||
var module windows.Handle
|
||||
var cbNeeded uint32
|
||||
cb := (uint32)(unsafe.Sizeof(module))
|
||||
if err := windows.EnumProcessModules(hProcess, &module, cb, &cbNeeded); err != nil {
|
||||
continue
|
||||
}
|
||||
var sz uint32 = 1024 * 8
|
||||
moduleName := make([]uint16, sz)
|
||||
cb = uint32(len(moduleName)) * (uint32)(unsafe.Sizeof(uint16(0)))
|
||||
if err := windows.GetModuleBaseName(hProcess, module, &moduleName[0], cb); err != nil && err != syscall.ERROR_INSUFFICIENT_BUFFER {
|
||||
continue
|
||||
}
|
||||
exeFile := path.Base(strings.ToLower(syscall.UTF16ToString(moduleName)))
|
||||
if strings.EqualFold(exeFile, procName) {
|
||||
matches = append(matches, pid)
|
||||
}
|
||||
}
|
||||
return matches
|
||||
}
|
||||
|
||||
cmd/warn_thinking_test.go (new file, 63 lines)
@@ -0,0 +1,63 @@
|
||||
package cmd
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/types/model"
|
||||
)
|
||||
|
||||
// Test that a warning is printed when thinking is requested but not supported.
|
||||
func TestWarnMissingThinking(t *testing.T) {
|
||||
cases := []struct {
|
||||
capabilities []model.Capability
|
||||
expectWarn bool
|
||||
}{
|
||||
{capabilities: []model.Capability{model.CapabilityThinking}, expectWarn: false},
|
||||
{capabilities: []model.Capability{}, expectWarn: true},
|
||||
}
|
||||
|
||||
for _, tc := range cases {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
if r.URL.Path != "/api/show" || r.Method != http.MethodPost {
|
||||
t.Fatalf("unexpected request to %s %s", r.URL.Path, r.Method)
|
||||
}
|
||||
var req api.ShowRequest
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
t.Fatalf("decode request: %v", err)
|
||||
}
|
||||
resp := api.ShowResponse{Capabilities: tc.capabilities}
|
||||
if err := json.NewEncoder(w).Encode(resp); err != nil {
|
||||
t.Fatalf("encode response: %v", err)
|
||||
}
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
t.Setenv("OLLAMA_HOST", srv.URL)
|
||||
client, err := api.ClientFromEnvironment()
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
oldStderr := os.Stderr
|
||||
r, w, _ := os.Pipe()
|
||||
os.Stderr = w
|
||||
ensureThinkingSupport(t.Context(), client, "m")
|
||||
w.Close()
|
||||
os.Stderr = oldStderr
|
||||
out, _ := io.ReadAll(r)
|
||||
|
||||
warned := strings.Contains(string(out), "warning:")
|
||||
if tc.expectWarn && !warned {
|
||||
t.Errorf("expected warning, got none")
|
||||
}
|
||||
if !tc.expectWarn && warned {
|
||||
t.Errorf("did not expect warning, got: %s", string(out))
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -53,8 +53,11 @@ func (ModelParameters) KV(t *Tokenizer) ggml.KV {
|
||||
}
|
||||
|
||||
for _, sv := range t.SpecialVocabulary {
|
||||
kv[fmt.Sprintf("tokenizer.ggml.%s_token_id", sv.Key())] = uint32(sv.ID)
|
||||
kv[fmt.Sprintf("tokenizer.ggml.add_%s_token", sv.Key())] = sv.AddToken
|
||||
kv[fmt.Sprintf("tokenizer.ggml.%s_token_id", sv.Key())] = uint32(sv.ID)
|
||||
if len(sv.IDs) > 0 {
|
||||
kv[fmt.Sprintf("tokenizer.ggml.%s_token_ids", sv.Key())] = sv.IDs
|
||||
}
|
||||
}
|
||||
|
||||
return kv
|
||||
@@ -191,6 +194,8 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
|
||||
conv = &phi3Model{}
|
||||
case "Qwen2ForCausalLM":
|
||||
conv = &qwen2Model{}
|
||||
case "Qwen2_5_VLForConditionalGeneration":
|
||||
conv = &qwen25VLModel{}
|
||||
case "BertModel":
|
||||
conv = &bertModel{}
|
||||
case "CohereForCausalLM":
|
||||
|
||||
@@ -139,7 +139,8 @@ func (p *llamaModel) Tensors(ts []Tensor) []*ggml.Tensor {
|
||||
}
|
||||
|
||||
for _, t := range ts {
|
||||
if strings.HasSuffix(t.Name(), "attn_q.weight") || strings.HasSuffix(t.Name(), "attn_k.weight") {
|
||||
if strings.HasSuffix(t.Name(), "attn_q.weight") || strings.HasSuffix(t.Name(), "attn_k.weight") ||
|
||||
strings.HasSuffix(t.Name(), "attn_q_proj.weight") || strings.HasSuffix(t.Name(), "attn_k_proj.weight") {
|
||||
if !p.skipRepack {
|
||||
t.SetRepacker(p.repack)
|
||||
}
|
||||
@@ -181,9 +182,9 @@ func (p *llamaModel) repack(name string, data []float32, shape []uint64) ([]floa
|
||||
}
|
||||
|
||||
var heads uint32
|
||||
if strings.HasSuffix(name, "attn_q.weight") {
|
||||
if strings.HasSuffix(name, "attn_q.weight") || strings.HasSuffix(name, "attn_q_proj.weight") {
|
||||
heads = p.NumAttentionHeads
|
||||
} else if strings.HasSuffix(name, "attn_k.weight") {
|
||||
} else if strings.HasSuffix(name, "attn_k.weight") || strings.HasSuffix(name, "attn_k_proj.weight") {
|
||||
heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
|
||||
} else {
|
||||
return nil, fmt.Errorf("unknown tensor for repack: %s", name)
|
||||
|
||||
@@ -94,7 +94,9 @@ func (m *mllamaModel) Tensors(ts []Tensor) []*ggml.Tensor {
|
||||
var out []*ggml.Tensor
|
||||
var text []Tensor
|
||||
for _, t := range ts {
|
||||
if t.Name() == "v.position_embd.gate" {
|
||||
if !strings.HasPrefix(t.Name(), "v.") && !strings.HasPrefix(t.Name(), "mm.") {
|
||||
text = append(text, t)
|
||||
} else if t.Name() == "v.position_embd.gate" {
|
||||
for _, name := range []string{"v.position_embd.gate", "v.tile_position_embd.gate"} {
|
||||
tt := t.Clone()
|
||||
tt.SetRepacker(m.repack(name))
|
||||
@@ -105,23 +107,21 @@ func (m *mllamaModel) Tensors(ts []Tensor) []*ggml.Tensor {
|
||||
WriterTo: tt,
|
||||
})
|
||||
}
|
||||
} else if t.Name() == "v.pre_tile_position_embd.gate" || t.Name() == "v.post_tile_position_embd.gate" {
|
||||
t.SetRepacker(m.repack(t.Name()))
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: t.Name(),
|
||||
Kind: t.Kind(),
|
||||
Shape: t.Shape(),
|
||||
WriterTo: t,
|
||||
})
|
||||
} else if strings.HasPrefix(t.Name(), "v.") || strings.HasPrefix(t.Name(), "mm.") {
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: t.Name(),
|
||||
Kind: t.Kind(),
|
||||
Shape: t.Shape(),
|
||||
WriterTo: t,
|
||||
})
|
||||
} else {
|
||||
text = append(text, t)
|
||||
if t.Name() == "v.pre_tile_position_embd.gate" || t.Name() == "v.post_tile_position_embd.gate" {
|
||||
t.SetRepacker(m.repack(t.Name()))
|
||||
} else if strings.HasSuffix(t.Name(), "attn_q.weight") || strings.HasSuffix(t.Name(), "attn_k.weight") {
|
||||
t.SetRepacker(m.repack(t.Name()))
|
||||
} else if strings.HasSuffix(t.Name(), "attn_gate") || strings.HasSuffix(t.Name(), "ffn_gate") {
|
||||
t.SetRepacker(m.repack(t.Name()))
|
||||
}
|
||||
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: t.Name(),
|
||||
Kind: t.Kind(),
|
||||
Shape: t.Shape(),
|
||||
WriterTo: t,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -137,16 +137,35 @@ func (m *mllamaModel) repack(name string) Repacker {
|
||||
|
||||
var t tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
|
||||
|
||||
t, err = tensor.Tanh(t)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if strings.HasSuffix(name, "attn_q.weight") || strings.HasSuffix(name, "attn_k.weight") {
|
||||
heads := m.VisionModel.AttentionHeads
|
||||
if err := t.Reshape(append([]int{int(heads), 2, dims[0] / int(heads) / 2}, dims[1:]...)...); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if name == "v.position_embd.gate" {
|
||||
t, err = tensor.Sub(float32(1), t)
|
||||
if err := t.T(0, 2, 1, 3); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err := t.Reshape(dims...); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err := t.Transpose(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
} else {
|
||||
t, err = tensor.Tanh(t)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if name == "v.position_embd.gate" {
|
||||
t, err = tensor.Sub(float32(1), t)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
t = tensor.Materialize(t)
|
||||
|
||||
@@ -15,6 +15,7 @@ type qwen2Model struct {
|
||||
Type string `json:"type"`
|
||||
Factor ropeFactor `json:"factor"`
|
||||
OriginalMaxPositionEmbeddings uint32 `json:"original_max_position_embeddings"`
|
||||
MropeSection []int32 `json:"mrope_section"`
|
||||
} `json:"rope_scaling"`
|
||||
RMSNormEPS float32 `json:"rms_norm_eps"`
|
||||
}
|
||||
@@ -39,6 +40,8 @@ func (q *qwen2Model) KV(t *Tokenizer) ggml.KV {
|
||||
case "yarn":
|
||||
kv["qwen2.rope.scaling.type"] = q.RopeScaling.Type
|
||||
kv["qwen2.rope.scaling.factor"] = q.RopeScaling.Factor
|
||||
case "mrope", "default":
|
||||
kv["qwen2.rope.mrope_section"] = q.RopeScaling.MropeSection
|
||||
default:
|
||||
panic("unknown rope scaling type")
|
||||
}
|
||||
|
||||
convert/convert_qwen25vl.go (new file, 102 lines)
@@ -0,0 +1,102 @@
|
||||
package convert
|
||||
|
||||
import (
|
||||
"cmp"
|
||||
"slices"
|
||||
"strings"
|
||||
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
)
|
||||
|
||||
type qwen25VLModel struct {
|
||||
qwen2Model
|
||||
|
||||
VisionModel struct {
|
||||
Depth uint32 `json:"depth"`
|
||||
HiddenSize uint32 `json:"hidden_size"`
|
||||
NumHeads uint32 `json:"num_heads"`
|
||||
InChannels uint32 `json:"in_chans"`
|
||||
PatchSize uint32 `json:"patch_size"`
|
||||
SpatialMergeSize uint32 `json:"spatial_merge_size"`
|
||||
SpatialPatchSize uint32 `json:"spatial_patch_size"`
|
||||
WindowSize uint32 `json:"window_size"`
|
||||
RMSNormEps float32 `json:"layer_norm_epsilon"`
|
||||
RopeTheta float32 `json:"rope_theta"`
|
||||
FullAttentionBlocks []int32 `json:"fullatt_block_indexes"`
|
||||
TemporalPatchSize uint32 `json:"temporal_patch_size"`
|
||||
} `json:"vision_config"`
|
||||
}
|
||||
|
||||
var _ ModelConverter = (*qwen25VLModel)(nil)
|
||||
|
||||
func (q *qwen25VLModel) KV(t *Tokenizer) ggml.KV {
|
||||
kv := q.ModelParameters.KV(t)
|
||||
kv["general.architecture"] = "qwen25vl"
|
||||
|
||||
for k, v := range q.qwen2Model.KV(t) {
|
||||
if strings.HasPrefix(k, "qwen2.") {
|
||||
kv[strings.Replace(k, "qwen2.", "qwen25vl.", 1)] = v
|
||||
}
|
||||
}
|
||||
|
||||
if q.VisionModel.FullAttentionBlocks == nil {
|
||||
kv["qwen25vl.vision.fullatt_block_indexes"] = []int32{7, 15, 23, 31}
|
||||
}
|
||||
|
||||
kv["qwen25vl.vision.block_count"] = cmp.Or(q.VisionModel.Depth, 32)
|
||||
kv["qwen25vl.vision.embedding_length"] = q.VisionModel.HiddenSize
|
||||
kv["qwen25vl.vision.attention.head_count"] = cmp.Or(q.VisionModel.NumHeads, 16)
|
||||
kv["qwen25vl.vision.num_channels"] = q.VisionModel.InChannels
|
||||
kv["qwen25vl.vision.patch_size"] = cmp.Or(q.VisionModel.PatchSize, 14)
|
||||
kv["qwen25vl.vision.spatial_merge_size"] = cmp.Or(q.VisionModel.SpatialMergeSize, 2)
|
||||
kv["qwen25vl.vision.spatial_patch_size"] = q.VisionModel.SpatialPatchSize
|
||||
kv["qwen25vl.vision.window_size"] = cmp.Or(q.VisionModel.WindowSize, 112)
|
||||
kv["qwen25vl.vision.attention.layer_norm_epsilon"] = cmp.Or(q.VisionModel.RMSNormEps, 1e-6)
|
||||
kv["qwen25vl.vision.rope.freq_base"] = cmp.Or(q.VisionModel.RopeTheta, 1e4)
|
||||
kv["qwen25vl.vision.fullatt_block_indexes"] = q.VisionModel.FullAttentionBlocks
|
||||
kv["qwen25vl.vision.temporal_patch_size"] = cmp.Or(q.VisionModel.TemporalPatchSize, 2)
|
||||
|
||||
return kv
|
||||
}
|
||||
|
||||
func (q *qwen25VLModel) Tensors(ts []Tensor) []*ggml.Tensor {
|
||||
var out []*ggml.Tensor
|
||||
|
||||
for _, t := range ts {
|
||||
if strings.Contains(t.Name(), "patch_embed.proj") {
|
||||
for t := range splitDim(t, 2,
|
||||
split{Replacer: strings.NewReplacer("patch_embed.proj", "patch_embd_0")},
|
||||
split{Replacer: strings.NewReplacer("patch_embed.proj", "patch_embd_1")},
|
||||
) {
|
||||
t.Shape = slices.DeleteFunc(t.Shape, func(i uint64) bool { return i == 1 })
|
||||
out = append(out, t)
|
||||
}
|
||||
} else if strings.Contains(t.Name(), "attn.qkv") {
|
||||
out = append(out, slices.Collect(splitDim(t, 0,
|
||||
split{Replacer: strings.NewReplacer("attn.qkv", "attn_q")},
|
||||
split{Replacer: strings.NewReplacer("attn.qkv", "attn_k")},
|
||||
split{Replacer: strings.NewReplacer("attn.qkv", "attn_v")},
|
||||
))...)
|
||||
} else {
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: t.Name(),
|
||||
Kind: t.Kind(),
|
||||
Shape: t.Shape(),
|
||||
WriterTo: t,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
return out
|
||||
}
|
||||
|
||||
func (p *qwen25VLModel) Replacements() []string {
|
||||
return append(
|
||||
p.qwen2Model.Replacements(),
|
||||
"visual", "v",
|
||||
"blocks", "blk",
|
||||
"attn.proj", "attn_out",
|
||||
"norm1", "ln1",
|
||||
"norm2", "ln2",
|
||||
)
|
||||
}
|
||||
@@ -47,7 +47,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, ggml.KV, ggml.Tensors) {
|
||||
}
|
||||
t.Cleanup(func() { r.Close() })
|
||||
|
||||
m, _, err := ggml.Decode(r, -1)
|
||||
m, err := ggml.Decode(r, -1)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -332,7 +332,7 @@ func TestConvertAdapter(t *testing.T) {
|
||||
}
|
||||
defer r.Close()
|
||||
|
||||
m, _, err := ggml.Decode(r, -1)
|
||||
m, err := ggml.Decode(r, -1)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
convert/tensor.go (new file, 76 lines)
@@ -0,0 +1,76 @@
|
||||
package convert
|
||||
|
||||
import (
|
||||
"cmp"
|
||||
"iter"
|
||||
"slices"
|
||||
"strings"
|
||||
|
||||
"github.com/pdevine/tensor"
|
||||
"github.com/pdevine/tensor/native"
|
||||
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
)
|
||||
|
||||
type split struct {
|
||||
*strings.Replacer
|
||||
dim int
|
||||
|
||||
// fn is an optional function to apply to the tensor after slicing
|
||||
fn func(tensor.Tensor) (tensor.Tensor, error)
|
||||
}
|
||||
|
||||
// splitDim splits a tensor along a specified dimension into multiple tensors. The dimension
|
||||
// is split evenly based on the number of replacers provided unless a specific count is given.
|
||||
func splitDim(t Tensor, dim int, splits ...split) iter.Seq[*ggml.Tensor] {
|
||||
return func(yield func(*ggml.Tensor) bool) {
|
||||
var offset int
|
||||
for _, split := range splits {
|
||||
t := t.Clone()
|
||||
shape := slices.Clone(t.Shape())
|
||||
shape[dim] = cmp.Or(uint64(split.dim), shape[dim]/uint64(len(splits)))
|
||||
|
||||
slice := slices.Repeat([]tensor.Slice{nil}, len(shape))
|
||||
slice[dim] = tensor.S(offset, offset+int(shape[dim]))
|
||||
offset += int(shape[dim])
|
||||
|
||||
t.SetRepacker(func(_ string, data []float32, shape []uint64) ([]float32, error) {
|
||||
dims := make([]int, len(shape))
|
||||
for i := range shape {
|
||||
dims[i] = int(shape[i])
|
||||
}
|
||||
|
||||
var tt tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
|
||||
tt, err := tt.Slice(slice...)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
tt = tensor.Materialize(tt)
|
||||
|
||||
if split.fn != nil {
|
||||
tt, err = split.fn(tt)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
// flatten tensor so it can be written as a vector
|
||||
if err := tt.Reshape(tt.Shape().TotalSize()); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return native.VectorF32(tt.(*tensor.Dense))
|
||||
})
|
||||
|
||||
if !yield(&ggml.Tensor{
|
||||
Name: split.Replace(t.Name()),
|
||||
Kind: t.Kind(),
|
||||
Shape: shape,
|
||||
WriterTo: t,
|
||||
}) {
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
convert/tensor_test.go (new file, 304 lines)
@@ -0,0 +1,304 @@
|
||||
package convert
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/binary"
|
||||
"io"
|
||||
"iter"
|
||||
"slices"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/pdevine/tensor"
|
||||
)
|
||||
|
||||
type fakeTensor struct {
|
||||
name string
|
||||
shape []uint64
|
||||
data []float32
|
||||
|
||||
repacker Repacker
|
||||
}
|
||||
|
||||
func (f fakeTensor) Name() string {
|
||||
return f.name
|
||||
}
|
||||
|
||||
func (f fakeTensor) Shape() []uint64 {
|
||||
return f.shape
|
||||
}
|
||||
|
||||
func (f fakeTensor) Kind() uint32 {
|
||||
return 0
|
||||
}
|
||||
|
||||
func (f *fakeTensor) SetRepacker(fn Repacker) {
|
||||
f.repacker = fn
|
||||
}
|
||||
|
||||
func (f fakeTensor) Clone() Tensor {
|
||||
return &fakeTensor{
|
||||
name: f.name,
|
||||
shape: slices.Clone(f.shape),
|
||||
data: slices.Clone(f.data),
|
||||
repacker: f.repacker,
|
||||
}
|
||||
}
|
||||
|
||||
func (f fakeTensor) WriteTo(w io.Writer) (n int64, err error) {
|
||||
data := f.data
|
||||
if f.repacker != nil {
|
||||
data, err = f.repacker(f.name, data, f.shape)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
}
|
||||
|
||||
if err := binary.Write(w, binary.LittleEndian, data); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
return int64(len(data) * 4), nil
|
||||
}
|
||||
|
||||
func mul(shape []uint64) int {
|
||||
n := 1
|
||||
for _, dim := range shape {
|
||||
n *= int(dim)
|
||||
}
|
||||
return n
|
||||
}
|
||||
|
||||
func TestSplitDim(t *testing.T) {
|
||||
r := fakeTensor{
|
||||
name: "a.b",
|
||||
shape: []uint64{3, 4},
|
||||
data: []float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11},
|
||||
}
|
||||
|
||||
t.Run("no split", func(t *testing.T) {
|
||||
for tt := range splitDim(&r, 0, split{Replacer: strings.NewReplacer("a", "x")}) {
|
||||
if tt.Name != "x.b" {
|
||||
t.Fatalf("expected name 'x', got '%s'", tt.Name)
|
||||
}
|
||||
|
||||
if !slices.Equal(tt.Shape, []uint64{3, 4}) {
|
||||
t.Fatalf("expected shape [3, 4], got %v", tt.Shape)
|
||||
}
|
||||
|
||||
var b bytes.Buffer
|
||||
if _, err := tt.WriteTo(&b); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
f32s := make([]float32, mul(tt.Shape))
|
||||
if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if !slices.Equal(f32s, []float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}) {
|
||||
t.Fatalf("expected data [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], got %v", f32s)
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("even split", func(t *testing.T) {
|
||||
next, stop := iter.Pull(splitDim(&r, 1,
|
||||
split{Replacer: strings.NewReplacer("a", "x")},
|
||||
split{Replacer: strings.NewReplacer("b", "y")},
|
||||
))
|
||||
defer stop()
|
||||
|
||||
{
|
||||
tt, ok := next()
|
||||
if !ok {
|
||||
t.Fatal("expected at least one split")
|
||||
}
|
||||
|
||||
if tt.Name != "x.b" {
|
||||
t.Fatal("expected name 'x.b', got", tt.Name)
|
||||
}
|
||||
|
||||
if !slices.Equal(tt.Shape, []uint64{3, 2}) {
|
||||
t.Fatal("expected shape [3, 2], got", tt.Shape)
|
||||
}
|
||||
|
||||
var b bytes.Buffer
|
||||
if _, err := tt.WriteTo(&b); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
f32s := make([]float32, mul(tt.Shape))
|
||||
if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if !slices.Equal(f32s, []float32{0, 1, 4, 5, 8, 9}) {
|
||||
t.Fatal("expected data [0, 1, 4, 5, 8, 9], got", f32s)
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
tt, ok := next()
|
||||
if !ok {
|
||||
t.Fatal("expected at least one split")
|
||||
}
|
||||
|
||||
if tt.Name != "a.y" {
|
||||
t.Fatal("expected name 'a.y', got", tt.Name)
|
||||
}
|
||||
|
||||
if !slices.Equal(tt.Shape, []uint64{3, 2}) {
|
||||
t.Fatal("expected shape [3, 2], got", tt.Shape)
|
||||
}
|
||||
|
||||
var b bytes.Buffer
|
||||
if _, err := tt.WriteTo(&b); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
f32s := make([]float32, mul(tt.Shape))
|
||||
if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if !slices.Equal(f32s, []float32{2, 3, 6, 7, 10, 11}) {
|
||||
t.Fatal("expected data [2, 3, 6, 7, 10, 11], got", f32s)
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("uneven split", func(t *testing.T) {
|
||||
next, stop := iter.Pull(splitDim(&r, 0,
|
||||
split{Replacer: strings.NewReplacer("a", "x"), dim: 2},
|
||||
split{Replacer: strings.NewReplacer("b", "y"), dim: 1},
|
||||
))
|
||||
defer stop()
|
||||
|
||||
{
|
||||
tt, ok := next()
|
||||
if !ok {
|
||||
t.Fatal("expected at least one split")
|
||||
}
|
||||
|
||||
if tt.Name != "x.b" {
|
||||
t.Fatal("expected name 'x.b', got", tt.Name)
|
||||
}
|
||||
|
||||
if !slices.Equal(tt.Shape, []uint64{2, 4}) {
|
||||
t.Fatal("expected shape [2, 4], got", tt.Shape)
|
||||
}
|
||||
|
||||
var b bytes.Buffer
|
||||
if _, err := tt.WriteTo(&b); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
f32s := make([]float32, mul(tt.Shape))
|
||||
if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if !slices.Equal(f32s, []float32{0, 1, 2, 3, 4, 5, 6, 7}) {
|
||||
t.Fatal("expected data [0, 1, 2, 3, 4, 5, 6, 7], got", f32s)
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
tt, ok := next()
|
||||
if !ok {
|
||||
t.Fatal("expected at least one split")
|
||||
}
|
||||
|
||||
if tt.Name != "a.y" {
|
||||
t.Fatal("expected name 'a.y', got", tt.Name)
|
||||
}
|
||||
|
||||
if !slices.Equal(tt.Shape, []uint64{1, 4}) {
|
||||
t.Fatal("expected shape [1, 4], got", tt.Shape)
|
||||
}
|
||||
|
||||
var b bytes.Buffer
|
||||
if _, err := tt.WriteTo(&b); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
f32s := make([]float32, mul(tt.Shape))
|
||||
if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if !slices.Equal(f32s, []float32{8, 9, 10, 11}) {
|
||||
t.Fatal("expected data [8, 9, 10, 11], got", f32s)
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("split with transpose", func(t *testing.T) {
|
||||
next, stop := iter.Pull(splitDim(&r, 1,
|
||||
split{Replacer: strings.NewReplacer("a", "x")},
|
||||
split{Replacer: strings.NewReplacer("b", "y"), fn: func(tt tensor.Tensor) (tensor.Tensor, error) {
|
||||
return tensor.Transpose(tt, 1, 0)
|
||||
}},
|
||||
))
|
||||
defer stop()
|
||||
|
||||
{
|
||||
tt, ok := next()
|
||||
if !ok {
|
||||
t.Fatal("expected at least one split")
|
||||
}
|
||||
|
||||
if tt.Name != "x.b" {
|
||||
t.Fatal("expected name 'x.b', got", tt.Name)
|
||||
}
|
||||
|
||||
if !slices.Equal(tt.Shape, []uint64{3, 2}) {
|
||||
t.Fatal("expected shape [3, 2], got", tt.Shape)
|
||||
}
|
||||
|
||||
var b bytes.Buffer
|
||||
if _, err := tt.WriteTo(&b); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
f32s := make([]float32, mul(tt.Shape))
|
||||
if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if !slices.Equal(f32s, []float32{0, 1, 4, 5, 8, 9}) {
|
||||
t.Fatal("expected data [0, 1, 4, 5, 8, 9], got", f32s)
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
tt, ok := next()
|
||||
if !ok {
|
||||
t.Fatal("expected at least one split")
|
||||
}
|
||||
|
||||
if tt.Name != "a.y" {
|
||||
t.Fatal("expected name 'a.y', got", tt.Name)
|
||||
}
|
||||
|
||||
if !slices.Equal(tt.Shape, []uint64{3, 2}) {
|
||||
t.Fatal("expected shape [3, 2], got", tt.Shape)
|
||||
}
|
||||
|
||||
var b bytes.Buffer
|
||||
if _, err := tt.WriteTo(&b); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
f32s := make([]float32, mul(tt.Shape))
|
||||
if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if !slices.Equal(f32s, []float32{2, 6, 10, 3, 7, 11}) {
|
||||
t.Fatal("expected data [2, 6, 10, 3, 7, 11], got", f32s)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
@@ -110,6 +110,7 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
|
||||
}
|
||||
|
||||
if f, err := fsys.Open("tokenizer_config.json"); errors.Is(err, os.ErrNotExist) {
|
||||
// noop
|
||||
} else if err != nil {
|
||||
return nil, err
|
||||
} else {
|
||||
@@ -171,6 +172,34 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
|
||||
}
|
||||
}
|
||||
|
||||
if f, err := fsys.Open("generation_config.json"); errors.Is(err, os.ErrNotExist) {
|
||||
} else if err != nil {
|
||||
return nil, err
|
||||
} else {
|
||||
defer f.Close()
|
||||
|
||||
var p map[string]json.RawMessage
|
||||
if err := json.NewDecoder(f).Decode(&p); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for _, st := range specialTokenTypes {
|
||||
if bts, ok := p[fmt.Sprintf("%s_token_id", st)]; ok {
|
||||
var ids []int32
|
||||
if err := json.Unmarshal(bts, &ids); err != nil {
|
||||
// value is not a list so the existing ID is used
|
||||
continue
|
||||
}
|
||||
|
||||
if i := slices.IndexFunc(t.SpecialVocabulary, func(sv *SpecialVocabulary) bool {
|
||||
return sv.Type == st
|
||||
}); i >= 0 {
|
||||
t.SpecialVocabulary[i].IDs = ids
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return t, nil
|
||||
}
|
||||
|
||||
@@ -280,6 +309,9 @@ type SpecialVocabulary struct {
|
||||
ID int
|
||||
Content string
|
||||
AddToken bool
|
||||
|
||||
// IDs is populated by generation_config.json
|
||||
IDs []int32
|
||||
}
|
||||
|
||||
func (sv SpecialVocabulary) Key() string {
|
||||
|
||||
@@ -247,6 +247,67 @@ func TestParseTokenizer(t *testing.T) {
|
||||
Pre: "default",
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "generation config eos token ids",
|
||||
fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
|
||||
"tokenizer.json": strings.NewReader(`{
|
||||
"added_tokens": [
|
||||
{
|
||||
"id": 0,
|
||||
"content": "<bos>",
|
||||
"special": true
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"content": "<eos>",
|
||||
"special": true
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"content": "<eot>",
|
||||
"special": true
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"content": "<eom>",
|
||||
"special": true
|
||||
}
|
||||
],
|
||||
"model": {
|
||||
"vocab": {
|
||||
"<bos>": 0,
|
||||
"<eos>": 1,
|
||||
"<eot>": 2,
|
||||
"<eom>": 3
|
||||
}
|
||||
}
|
||||
}`),
|
||||
"tokenizer_config.json": strings.NewReader(`{
|
||||
"add_bos_token": true,
|
||||
"add_eos_token": false,
|
||||
"bos_token": "<bos>",
|
||||
"eos_token": "<eos>"
|
||||
}`),
|
||||
"generation_config.json": strings.NewReader(`{
|
||||
"bos_token_id": 0,
|
||||
"eos_token_id": [1, 2, 3]
|
||||
}`),
|
||||
}),
|
||||
specialTokenTypes: []string{"pad", "eos", "bos", "unk"},
|
||||
want: &Tokenizer{
|
||||
Vocabulary: &Vocabulary{
|
||||
Model: "gpt2",
|
||||
Tokens: []string{"<bos>", "<eos>", "<eot>", "<eom>"},
|
||||
Scores: []float32{0, 1, 2, 3},
|
||||
Types: []int32{3, 3, 3, 3},
|
||||
},
|
||||
SpecialVocabulary: []*SpecialVocabulary{
|
||||
{Type: "eos", Content: "<eos>", ID: 1, IDs: []int32{1, 2, 3}, AddToken: false},
|
||||
{Type: "bos", Content: "<bos>", ID: 0, AddToken: true},
|
||||
},
|
||||
Pre: "default",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range cases {
|
||||
|
||||
@@ -43,6 +43,7 @@ Generate a response for a given prompt with a provided model. This is a streamin
|
||||
- `prompt`: the prompt to generate a response for
|
||||
- `suffix`: the text after the model response
|
||||
- `images`: (optional) a list of base64-encoded images (for multimodal models such as `llava`)
|
||||
- `think`: (for thinking models) should the model think before responding?
|
||||
|
||||
Advanced parameters (optional):
|
||||
|
||||
@@ -490,11 +491,13 @@ Generate the next message in a chat with a provided model. This is a streaming e
|
||||
- `model`: (required) the [model name](#model-names)
|
||||
- `messages`: the messages of the chat, this can be used to keep a chat memory
|
||||
- `tools`: list of tools in JSON for the model to use if supported
|
||||
- `think`: (for thinking models) should the model think before responding?
|
||||
|
||||
The `message` object has the following fields:
|
||||
|
||||
- `role`: the role of the message, either `system`, `user`, `assistant`, or `tool`
|
||||
- `content`: the content of the message
|
||||
- `thinking`: (for thinking models) the model's thinking process
|
||||
- `images` (optional): a list of images to include in the message (for multimodal models such as `llava`)
|
||||
- `tool_calls` (optional): a list of tools in JSON that the model wants to use
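Below is a minimal sketch (not part of this changeset) of a client exercising the new `think` field against a locally running server, using only the Go standard library. The model name is just an example of a thinking-capable model, and the request/response fields follow the documentation above; `stream` is set to `false` to receive a single response object.

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

func main() {
	// Build a /api/chat request that asks the model to think before answering.
	body, _ := json.Marshal(map[string]any{
		"model": "deepseek-r1", // example thinking-capable model
		"messages": []map[string]string{
			{"role": "user", "content": "Why is the sky blue?"},
		},
		"think":  true,
		"stream": false,
	})

	resp, err := http.Post("http://127.0.0.1:11434/api/chat", "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// The assistant message carries both the thinking trace and the answer.
	var out struct {
		Message struct {
			Thinking string `json:"thinking"`
			Content  string `json:"content"`
		} `json:"message"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
		panic(err)
	}
	fmt.Println("thinking:", out.Message.Thinking)
	fmt.Println("answer:", out.Message.Content)
}
```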
|
||||
|
||||
|
||||
@@ -118,7 +118,7 @@ To run tests, use `go test`:
|
||||
go test ./...
|
||||
```
|
||||
|
||||
> NOTE: In rare cirumstances, you may nedd to change a package using the new
|
||||
> NOTE: In rare circumstances, you may need to change a package using the new
|
||||
> "synctest" package in go1.24.
|
||||
>
|
||||
> If you do not have the "synctest" package enabled, you will not see build or
|
||||
|
||||
@@ -132,22 +132,12 @@ success
|
||||
|
||||
### Supported Quantizations
|
||||
|
||||
- `q4_0`
|
||||
- `q4_1`
|
||||
- `q5_0`
|
||||
- `q5_1`
|
||||
- `q8_0`
|
||||
|
||||
#### K-means Quantizations
|
||||
|
||||
- `q3_K_S`
|
||||
- `q3_K_M`
|
||||
- `q3_K_L`
|
||||
- `q4_K_S`
|
||||
- `q4_K_M`
|
||||
- `q5_K_S`
|
||||
- `q5_K_M`
|
||||
- `q6_K`
|
||||
|
||||
|
||||
## Sharing your model on ollama.com
|
||||
|
||||
@@ -112,8 +112,8 @@ sudo systemctl status ollama
|
||||
> While AMD has contributed the `amdgpu` driver upstream to the official linux
|
||||
> kernel source, the version is older and may not support all ROCm features. We
|
||||
> recommend you install the latest driver from
|
||||
> https://www.amd.com/en/support/linux-drivers for best support of your Radeon
|
||||
> GPU.
|
||||
> [AMD](https://www.amd.com/en/support/download/linux-drivers.html) for best support
|
||||
> of your Radeon GPU.
|
||||
|
||||
## Customizing
|
||||
|
||||
|
||||
@@ -183,6 +183,8 @@ var (
|
||||
NewEngine = Bool("OLLAMA_NEW_ENGINE")
|
||||
// ContextLength sets the default context length
|
||||
ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 4096)
|
||||
// Auth enables authentication between the Ollama client and server
|
||||
UseAuth = Bool("OLLAMA_AUTH")
|
||||
)
|
||||
|
||||
func String(s string) func() string {
|
||||
|
||||
@@ -15,6 +15,7 @@ import (
|
||||
type GGML struct {
|
||||
container
|
||||
model
|
||||
Length int64
|
||||
}
|
||||
|
||||
type model interface {
|
||||
@@ -126,6 +127,7 @@ func (kv KV) OllamaEngineRequired() bool {
|
||||
"mistral3",
|
||||
"llama4",
|
||||
"mllama",
|
||||
"qwen25vl",
|
||||
}, kv.Architecture())
|
||||
}
|
||||
|
||||
@@ -385,12 +387,12 @@ func DetectContentType(b []byte) string {
|
||||
//
|
||||
// It collects array values for arrays with a size less than or equal to
|
||||
// maxArraySize. If the maxArraySize is negative, all arrays are collected.
|
||||
func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
|
||||
func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, error) {
|
||||
rs = bufioutil.NewBufferedSeeker(rs, 32<<10)
|
||||
|
||||
var magic uint32
|
||||
if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
|
||||
return nil, 0, err
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var c container
|
||||
@@ -400,24 +402,25 @@ func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
|
||||
case FILE_MAGIC_GGUF_BE:
|
||||
c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize}
|
||||
default:
|
||||
return nil, 0, errors.New("invalid file magic")
|
||||
return nil, errors.New("invalid file magic")
|
||||
}
|
||||
|
||||
model, err := c.Decode(rs)
|
||||
if err != nil {
|
||||
return nil, 0, err
|
||||
return nil, err
|
||||
}
|
||||
|
||||
offset, err := rs.Seek(0, io.SeekCurrent)
|
||||
if err != nil {
|
||||
return nil, 0, err
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// final model type
|
||||
return &GGML{
|
||||
container: c,
|
||||
model: model,
|
||||
}, offset, nil
|
||||
Length: offset,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string) (kv []uint64, partialOffload, fullOffload uint64) {
|
||||
@@ -649,6 +652,20 @@ func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
|
||||
graphSize = 4 * (imageSize*imageSize*numChannels +
|
||||
embeddingLength*patchSize +
|
||||
numPatches*numPatches*headCount)
|
||||
case "qwen25vl":
|
||||
maxPixels := uint64(llm.KV().Uint("vision.max_pixels", 28*28*1280))
|
||||
|
||||
numPatches := maxPixels / (patchSize * patchSize)
|
||||
|
||||
graphSize = 4 * (maxPixels*numChannels + // Original image storage
|
||||
// Normalized pixels
|
||||
maxPixels*numChannels +
|
||||
// Patches storage (numPatches * channels * patchSize^2)
|
||||
numPatches*numChannels*patchSize*patchSize +
|
||||
// Self-attention calculations
|
||||
numPatches*numPatches*headCount +
|
||||
// Additional buffer for processing
|
||||
embeddingLength*numPatches)
|
||||
case "llama4":
|
||||
// vision graph is computed independently in the same schedule
|
||||
// and is negligible compared to the worst case text graph
|
||||
|
||||
@@ -35,7 +35,7 @@ func TestWriteGGUF(t *testing.T) {
|
||||
}
|
||||
defer r.Close()
|
||||
|
||||
ff, _, err := Decode(r, 0)
|
||||
ff, err := Decode(r, 0)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
347 fs/gguf/gguf.go Normal file
@@ -0,0 +1,347 @@
|
||||
package gguf
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"cmp"
|
||||
"encoding/binary"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"iter"
|
||||
"os"
|
||||
"slices"
|
||||
"strings"
|
||||
)
|
||||
|
||||
const (
|
||||
typeUint8 uint32 = iota
|
||||
typeInt8
|
||||
typeUint16
|
||||
typeInt16
|
||||
typeUint32
|
||||
typeInt32
|
||||
typeFloat32
|
||||
typeBool
|
||||
typeString
|
||||
typeArray
|
||||
typeUint64
|
||||
typeInt64
|
||||
typeFloat64
|
||||
)
|
||||
|
||||
var ErrUnsupported = errors.New("unsupported")
|
||||
|
||||
type File struct {
|
||||
Magic [4]byte
|
||||
Version uint32
|
||||
|
||||
keyValues *lazy[KeyValue]
|
||||
tensors *lazy[TensorInfo]
|
||||
offset int64
|
||||
|
||||
file *os.File
|
||||
reader *bufferedReader
|
||||
bts []byte
|
||||
}
|
||||
|
||||
func Open(path string) (f *File, err error) {
|
||||
f = &File{bts: make([]byte, 4096)}
|
||||
f.file, err = os.Open(path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
f.reader = newBufferedReader(f.file, 32<<10)
|
||||
|
||||
if err := binary.Read(f.reader, binary.LittleEndian, &f.Magic); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if bytes.Equal(f.Magic[:], []byte("gguf")) {
|
||||
return nil, fmt.Errorf("%w file type %v", ErrUnsupported, f.Magic)
|
||||
}
|
||||
|
||||
if err := binary.Read(f.reader, binary.LittleEndian, &f.Version); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if f.Version != 3 {
|
||||
return nil, fmt.Errorf("%w version %v", ErrUnsupported, f.Version)
|
||||
}
|
||||
|
||||
f.tensors, err = newLazy(f, f.readTensor)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
f.tensors.successFunc = func() error {
|
||||
offset := f.reader.offset
|
||||
|
||||
alignment := cmp.Or(f.KeyValue("general.alignment").Int(), 32)
|
||||
f.offset = offset + (alignment-offset%alignment)%alignment
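// e.g. metadata ending at offset 123 with the default alignment of 32 rounds up
// to 128; an offset already on the boundary is left unchanged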
|
||||
return nil
|
||||
}
|
||||
|
||||
f.keyValues, err = newLazy(f, f.readKeyValue)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return f, nil
|
||||
}
|
||||
|
||||
func (f *File) readTensor() (TensorInfo, error) {
|
||||
name, err := readString(f)
|
||||
if err != nil {
|
||||
return TensorInfo{}, err
|
||||
}
|
||||
|
||||
dims, err := read[uint32](f)
|
||||
if err != nil {
|
||||
return TensorInfo{}, err
|
||||
}
|
||||
|
||||
shape := make([]uint64, dims)
|
||||
for i := range dims {
|
||||
shape[i], err = read[uint64](f)
|
||||
if err != nil {
|
||||
return TensorInfo{}, err
|
||||
}
|
||||
}
|
||||
|
||||
type_, err := read[uint32](f)
|
||||
if err != nil {
|
||||
return TensorInfo{}, err
|
||||
}
|
||||
|
||||
offset, err := read[uint64](f)
|
||||
if err != nil {
|
||||
return TensorInfo{}, err
|
||||
}
|
||||
|
||||
return TensorInfo{
|
||||
Name: name,
|
||||
Offset: offset,
|
||||
Shape: shape,
|
||||
Type: TensorType(type_),
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (f *File) readKeyValue() (KeyValue, error) {
|
||||
key, err := readString(f)
|
||||
if err != nil {
|
||||
return KeyValue{}, err
|
||||
}
|
||||
|
||||
t, err := read[uint32](f)
|
||||
if err != nil {
|
||||
return KeyValue{}, err
|
||||
}
|
||||
|
||||
value, err := func() (any, error) {
|
||||
switch t {
|
||||
case typeUint8:
|
||||
return read[uint8](f)
|
||||
case typeInt8:
|
||||
return read[int8](f)
|
||||
case typeUint16:
|
||||
return read[uint16](f)
|
||||
case typeInt16:
|
||||
return read[int16](f)
|
||||
case typeUint32:
|
||||
return read[uint32](f)
|
||||
case typeInt32:
|
||||
return read[int32](f)
|
||||
case typeUint64:
|
||||
return read[uint64](f)
|
||||
case typeInt64:
|
||||
return read[int64](f)
|
||||
case typeFloat32:
|
||||
return read[float32](f)
|
||||
case typeFloat64:
|
||||
return read[float64](f)
|
||||
case typeBool:
|
||||
return read[bool](f)
|
||||
case typeString:
|
||||
return readString(f)
|
||||
case typeArray:
|
||||
return readArray(f)
|
||||
default:
|
||||
return nil, fmt.Errorf("%w type %d", ErrUnsupported, t)
|
||||
}
|
||||
}()
|
||||
if err != nil {
|
||||
return KeyValue{}, err
|
||||
}
|
||||
|
||||
return KeyValue{
|
||||
Key: key,
|
||||
Value: Value{value},
|
||||
}, nil
|
||||
}
|
||||
|
||||
func read[T any](f *File) (t T, err error) {
|
||||
err = binary.Read(f.reader, binary.LittleEndian, &t)
|
||||
return t, err
|
||||
}
|
||||
|
||||
func readString(f *File) (string, error) {
|
||||
n, err := read[uint64](f)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
if int(n) > len(f.bts) {
|
||||
f.bts = make([]byte, n)
|
||||
}
|
||||
|
||||
bts := f.bts[:n]
|
||||
if _, err := io.ReadFull(f.reader, bts); err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer clear(bts)
|
||||
|
||||
return string(bts), nil
|
||||
}
|
||||
|
||||
func readArray(f *File) (any, error) {
|
||||
t, err := read[uint32](f)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
n, err := read[uint64](f)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
switch t {
|
||||
case typeUint8:
|
||||
return readArrayData[uint8](f, n)
|
||||
case typeInt8:
|
||||
return readArrayData[int8](f, n)
|
||||
case typeUint16:
|
||||
return readArrayData[uint16](f, n)
|
||||
case typeInt16:
|
||||
return readArrayData[int16](f, n)
|
||||
case typeUint32:
|
||||
return readArrayData[uint32](f, n)
|
||||
case typeInt32:
|
||||
return readArrayData[int32](f, n)
|
||||
case typeUint64:
|
||||
return readArrayData[uint64](f, n)
|
||||
case typeInt64:
|
||||
return readArrayData[int64](f, n)
|
||||
case typeFloat32:
|
||||
return readArrayData[float32](f, n)
|
||||
case typeFloat64:
|
||||
return readArrayData[float64](f, n)
|
||||
case typeBool:
|
||||
return readArrayData[bool](f, n)
|
||||
case typeString:
|
||||
return readArrayString(f, n)
|
||||
default:
|
||||
return nil, fmt.Errorf("%w type %d", ErrUnsupported, t)
|
||||
}
|
||||
}
|
||||
|
||||
func readArrayData[T any](f *File, n uint64) (s []T, err error) {
|
||||
s = make([]T, n)
|
||||
for i := range n {
|
||||
e, err := read[T](f)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
s[i] = e
|
||||
}
|
||||
|
||||
return s, nil
|
||||
}
|
||||
|
||||
func readArrayString(f *File, n uint64) (s []string, err error) {
|
||||
s = make([]string, n)
|
||||
for i := range n {
|
||||
e, err := readString(f)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
s[i] = e
|
||||
}
|
||||
|
||||
return s, nil
|
||||
}
|
||||
|
||||
func (f *File) Close() error {
|
||||
f.keyValues.stop()
|
||||
f.tensors.stop()
|
||||
return f.file.Close()
|
||||
}
|
||||
|
||||
func (f *File) KeyValue(key string) KeyValue {
|
||||
if !strings.HasPrefix(key, "general.") && !strings.HasPrefix(key, "tokenizer.") {
|
||||
key = f.KeyValue("general.architecture").String() + "." + key
|
||||
}
|
||||
|
||||
if index := slices.IndexFunc(f.keyValues.values, func(kv KeyValue) bool {
|
||||
return kv.Key == key
|
||||
}); index >= 0 {
|
||||
return f.keyValues.values[index]
|
||||
}
|
||||
|
||||
for keyValue, ok := f.keyValues.next(); ok; keyValue, ok = f.keyValues.next() {
|
||||
if keyValue.Key == key {
|
||||
return keyValue
|
||||
}
|
||||
}
|
||||
|
||||
return KeyValue{}
|
||||
}
|
||||
|
||||
func (f *File) NumKeyValues() int {
|
||||
return int(f.keyValues.count)
|
||||
}
|
||||
|
||||
func (f *File) KeyValues() iter.Seq2[int, KeyValue] {
|
||||
return f.keyValues.All()
|
||||
}
|
||||
|
||||
func (f *File) TensorInfo(name string) TensorInfo {
|
||||
if index := slices.IndexFunc(f.tensors.values, func(t TensorInfo) bool {
|
||||
return t.Name == name
|
||||
}); index >= 0 {
|
||||
return f.tensors.values[index]
|
||||
}
|
||||
|
||||
// fast-forward through key values if we haven't already
|
||||
_ = f.keyValues.rest()
|
||||
for tensor, ok := f.tensors.next(); ok; tensor, ok = f.tensors.next() {
|
||||
if tensor.Name == name {
|
||||
return tensor
|
||||
}
|
||||
}
|
||||
|
||||
return TensorInfo{}
|
||||
}
|
||||
|
||||
func (f *File) NumTensors() int {
|
||||
return int(f.tensors.count)
|
||||
}
|
||||
|
||||
func (f *File) TensorInfos() iter.Seq2[int, TensorInfo] {
|
||||
// fast forward through key values if we haven't already
|
||||
f.keyValues.rest()
|
||||
return f.tensors.All()
|
||||
}
|
||||
|
||||
func (f *File) TensorReader(name string) (TensorInfo, io.Reader, error) {
|
||||
t := f.TensorInfo(name)
|
||||
if t.NumBytes() == 0 {
|
||||
return TensorInfo{}, nil, fmt.Errorf("tensor %s not found", name)
|
||||
}
|
||||
|
||||
// fast forward through tensor info if we haven't already
|
||||
_ = f.tensors.rest()
|
||||
return t, io.NewSectionReader(f.file, f.offset+int64(t.Offset), t.NumBytes()), nil
|
||||
}
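A short usage sketch of the package added above (not part of the diff); the file path is a placeholder, and the test file that follows exercises the same surface.

```go
package main

import (
	"fmt"
	"io"

	"github.com/ollama/ollama/fs/gguf"
)

func main() {
	f, err := gguf.Open("model.gguf") // placeholder path to a GGUF file
	if err != nil {
		panic(err)
	}
	defer f.Close()

	// Key values are decoded lazily and cached as they are pulled.
	fmt.Println(f.KeyValue("general.architecture").String())

	// TensorReader returns the tensor metadata plus a reader over its raw bytes.
	ti, r, err := f.TensorReader("token_embd.weight")
	if err != nil {
		panic(err)
	}
	n, _ := io.Copy(io.Discard, r)
	fmt.Println(ti.Name, ti.Shape, n, "bytes")
}
```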
|
||||
249 fs/gguf/gguf_test.go Normal file
@@ -0,0 +1,249 @@
|
||||
package gguf_test
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/google/go-cmp/cmp"
|
||||
"github.com/google/go-cmp/cmp/cmpopts"
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
"github.com/ollama/ollama/fs/gguf"
|
||||
)
|
||||
|
||||
func createBinFile(tb testing.TB) string {
|
||||
tb.Helper()
|
||||
f, err := os.CreateTemp(tb.TempDir(), "")
|
||||
if err != nil {
|
||||
tb.Fatal(err)
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
kv := ggml.KV{
|
||||
"general.architecture": "llama",
|
||||
"llama.block_count": uint32(8),
|
||||
"llama.embedding_length": uint32(3),
|
||||
"llama.attention.head_count": uint32(2),
|
||||
"llama.attention.head_count_kv": uint32(2),
|
||||
"llama.attention.key_length": uint32(3),
|
||||
"llama.rope.dimension_count": uint32(4),
|
||||
"llama.rope.freq_base": float32(10000.0),
|
||||
"llama.rope.freq_scale": float32(1.0),
|
||||
"llama.attention.layer_norm_rms_epsilon": float32(1e-6),
|
||||
"tokenizer.ggml.eos_token_id": uint32(0),
|
||||
"tokenizer.ggml.eos_token_ids": []int32{1, 2, 3},
|
||||
"tokenizer.ggml.tokens": []string{"hello", "world"},
|
||||
"tokenizer.ggml.scores": []float32{0, 1},
|
||||
}
|
||||
|
||||
tensors := []*ggml.Tensor{
|
||||
{
|
||||
Name: "token_embd.weight",
|
||||
Kind: 0,
|
||||
Shape: []uint64{2, 3},
|
||||
WriterTo: bytes.NewBuffer(make([]byte, 4*2*3)),
|
||||
},
|
||||
{
|
||||
Name: "output.weight",
|
||||
Kind: 0,
|
||||
Shape: []uint64{3, 2},
|
||||
WriterTo: bytes.NewBuffer(make([]byte, 4*3*2)),
|
||||
},
|
||||
}
|
||||
|
||||
for i := range 8 {
|
||||
tensors = append(tensors, &ggml.Tensor{
|
||||
Name: "blk." + strconv.Itoa(i) + ".attn_q.weight",
|
||||
Kind: 0,
|
||||
Shape: []uint64{3, 3},
|
||||
WriterTo: bytes.NewBuffer(make([]byte, 4*3*3)),
|
||||
}, &ggml.Tensor{
|
||||
Name: "blk." + strconv.Itoa(i) + ".attn_k.weight",
|
||||
Kind: 0,
|
||||
Shape: []uint64{3, 3},
|
||||
WriterTo: bytes.NewBuffer(make([]byte, 4*3*3)),
|
||||
}, &ggml.Tensor{
|
||||
Name: "blk." + strconv.Itoa(i) + ".attn_v.weight",
|
||||
Kind: 0,
|
||||
Shape: []uint64{3, 3},
|
||||
WriterTo: bytes.NewBuffer(make([]byte, 4*3*3)),
|
||||
}, &ggml.Tensor{
|
||||
Name: "blk." + strconv.Itoa(i) + ".attn_output.weight",
|
||||
Kind: 0,
|
||||
Shape: []uint64{3, 3},
|
||||
WriterTo: bytes.NewBuffer(make([]byte, 4*3*3)),
|
||||
})
|
||||
}
|
||||
|
||||
if err := ggml.WriteGGUF(f, kv, tensors); err != nil {
|
||||
tb.Fatal(err)
|
||||
}
|
||||
|
||||
return f.Name()
|
||||
}
|
||||
|
||||
func TestRead(t *testing.T) {
|
||||
f, err := gguf.Open(createBinFile(t))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
if got := f.KeyValue("does.not.exist").Valid(); got {
|
||||
t.Errorf(`KeyValue("does.not.exist").Exists() = %v, want false`, got)
|
||||
}
|
||||
|
||||
if got := f.KeyValue("general.architecture").String(); got != "llama" {
|
||||
t.Errorf(`KeyValue("general.architecture").String() = %q, want %q`, got, "llama")
|
||||
}
|
||||
|
||||
if got := f.TensorInfo("token_embd.weight"); got.Name != "token_embd.weight" {
|
||||
t.Errorf(`TensorInfo("token_embd.weight").Name = %q, want %q`, got.Name, "token_embd.weight")
|
||||
} else if diff := cmp.Diff(got.Shape, []uint64{2, 3}); diff != "" {
|
||||
t.Errorf(`TensorInfo("token_embd.weight").Shape mismatch (-got +want):\n%s`, diff)
|
||||
} else if got.Type != gguf.TensorTypeF32 {
|
||||
t.Errorf(`TensorInfo("token_embd.weight").Type = %d, want %d`, got.Type, gguf.TensorTypeF32)
|
||||
}
|
||||
|
||||
if got := f.KeyValue("block_count").Uint(); got != 8 {
|
||||
t.Errorf(`KeyValue("block_count").Uint() = %d, want %d`, got, 8)
|
||||
}
|
||||
|
||||
if diff := cmp.Diff(f.KeyValue("tokenizer.ggml.tokens").Strings(), []string{"hello", "world"}); diff != "" {
|
||||
t.Errorf("KeyValue(\"tokenizer.ggml.tokens\").Strings() mismatch (-got +want):\n%s", diff)
|
||||
}
|
||||
|
||||
if diff := cmp.Diff(f.KeyValue("tokenizer.ggml.scores").Floats(), []float64{0, 1}); diff != "" {
|
||||
t.Errorf("KeyValue(\"tokenizer.ggml.scores\").Ints() mismatch (-got +want):\n%s", diff)
|
||||
}
|
||||
|
||||
var kvs []string
|
||||
for _, kv := range f.KeyValues() {
|
||||
if !kv.Valid() {
|
||||
t.Error("found invalid key-value pair:", kv)
|
||||
}
|
||||
|
||||
kvs = append(kvs, kv.Key)
|
||||
}
|
||||
|
||||
if len(kvs) != f.NumKeyValues() {
|
||||
t.Errorf("iterated key count = %d, want %d", len(kvs), f.NumKeyValues())
|
||||
}
|
||||
|
||||
if diff := cmp.Diff(kvs, []string{
|
||||
"general.architecture",
|
||||
"llama.block_count",
|
||||
"llama.embedding_length",
|
||||
"llama.attention.head_count",
|
||||
"llama.attention.head_count_kv",
|
||||
"llama.attention.key_length",
|
||||
"llama.rope.dimension_count",
|
||||
"llama.rope.freq_base",
|
||||
"llama.rope.freq_scale",
|
||||
"llama.attention.layer_norm_rms_epsilon",
|
||||
"tokenizer.ggml.eos_token_id",
|
||||
"tokenizer.ggml.eos_token_ids",
|
||||
"tokenizer.ggml.tokens",
|
||||
"tokenizer.ggml.scores",
|
||||
}, cmpopts.SortSlices(strings.Compare)); diff != "" {
|
||||
t.Errorf("KeyValues() mismatch (-got +want):\n%s", diff)
|
||||
}
|
||||
|
||||
var tis []string
|
||||
for _, ti := range f.TensorInfos() {
|
||||
if !ti.Valid() {
|
||||
t.Error("found invalid tensor info:", ti)
|
||||
}
|
||||
|
||||
tis = append(tis, ti.Name)
|
||||
}
|
||||
|
||||
if len(tis) != f.NumTensors() {
|
||||
t.Errorf("iterated tensor count = %d, want %d", len(tis), f.NumTensors())
|
||||
}
|
||||
|
||||
if diff := cmp.Diff(tis, []string{
|
||||
"token_embd.weight",
|
||||
"output.weight",
|
||||
"blk.0.attn_q.weight",
|
||||
"blk.0.attn_k.weight",
|
||||
"blk.0.attn_v.weight",
|
||||
"blk.0.attn_output.weight",
|
||||
"blk.1.attn_q.weight",
|
||||
"blk.1.attn_k.weight",
|
||||
"blk.1.attn_v.weight",
|
||||
"blk.1.attn_output.weight",
|
||||
"blk.2.attn_q.weight",
|
||||
"blk.2.attn_k.weight",
|
||||
"blk.2.attn_v.weight",
|
||||
"blk.2.attn_output.weight",
|
||||
"blk.3.attn_q.weight",
|
||||
"blk.3.attn_k.weight",
|
||||
"blk.3.attn_v.weight",
|
||||
"blk.3.attn_output.weight",
|
||||
"blk.4.attn_q.weight",
|
||||
"blk.4.attn_k.weight",
|
||||
"blk.4.attn_v.weight",
|
||||
"blk.4.attn_output.weight",
|
||||
"blk.5.attn_q.weight",
|
||||
"blk.5.attn_k.weight",
|
||||
"blk.5.attn_v.weight",
|
||||
"blk.5.attn_output.weight",
|
||||
"blk.6.attn_q.weight",
|
||||
"blk.6.attn_k.weight",
|
||||
"blk.6.attn_v.weight",
|
||||
"blk.6.attn_output.weight",
|
||||
"blk.7.attn_q.weight",
|
||||
"blk.7.attn_k.weight",
|
||||
"blk.7.attn_v.weight",
|
||||
"blk.7.attn_output.weight",
|
||||
}, cmpopts.SortSlices(strings.Compare)); diff != "" {
|
||||
t.Errorf("TensorInfos() mismatch (-got +want):\n%s", diff)
|
||||
}
|
||||
|
||||
ti, r, err := f.TensorReader("output.weight")
|
||||
if err != nil {
|
||||
t.Fatalf(`TensorReader("output.weight") error: %v`, err)
|
||||
}
|
||||
|
||||
if ti.Name != "output.weight" {
|
||||
t.Errorf(`TensorReader("output.weight").Name = %q, want %q`, ti.Name, "output.weight")
|
||||
} else if diff := cmp.Diff(ti.Shape, []uint64{3, 2}); diff != "" {
|
||||
t.Errorf(`TensorReader("output.weight").Shape mismatch (-got +want):\n%s`, diff)
|
||||
} else if ti.Type != gguf.TensorTypeF32 {
|
||||
t.Errorf(`TensorReader("output.weight").Type = %d, want %d`, ti.Type, gguf.TensorTypeF32)
|
||||
}
|
||||
|
||||
var b bytes.Buffer
|
||||
if _, err := b.ReadFrom(r); err != nil {
|
||||
t.Fatalf(`ReadFrom TensorReader("output.weight") error: %v`, err)
|
||||
}
|
||||
|
||||
if b.Len() != int(ti.NumBytes()) {
|
||||
t.Errorf(`ReadFrom TensorReader("output.weight") length = %d, want %d`, b.Len(), ti.NumBytes())
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkRead(b *testing.B) {
|
||||
b.ReportAllocs()
|
||||
|
||||
p := createBinFile(b)
|
||||
for b.Loop() {
|
||||
f, err := gguf.Open(p)
|
||||
if err != nil {
|
||||
b.Fatal(err)
|
||||
}
|
||||
|
||||
if got := f.KeyValue("general.architecture").String(); got != "llama" {
|
||||
b.Errorf("got = %q, want %q", got, "llama")
|
||||
}
|
||||
|
||||
// Iterate through some tensors
|
||||
for range f.TensorInfos() {
|
||||
}
|
||||
|
||||
f.Close()
|
||||
}
|
||||
}
|
||||
90 fs/gguf/keyvalue.go Normal file
@@ -0,0 +1,90 @@
|
||||
package gguf
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"slices"
|
||||
)
|
||||
|
||||
type KeyValue struct {
|
||||
Key string
|
||||
Value
|
||||
}
|
||||
|
||||
func (kv KeyValue) Valid() bool {
|
||||
return kv.Key != "" && kv.Value.value != nil
|
||||
}
|
||||
|
||||
type Value struct {
|
||||
value any
|
||||
}
|
||||
|
||||
func value[T any](v Value, kinds ...reflect.Kind) (t T) {
|
||||
vv := reflect.ValueOf(v.value)
|
||||
if slices.Contains(kinds, vv.Kind()) {
|
||||
t = vv.Convert(reflect.TypeOf(t)).Interface().(T)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func values[T any](v Value, kinds ...reflect.Kind) (ts []T) {
|
||||
switch vv := reflect.ValueOf(v.value); vv.Kind() {
|
||||
case reflect.Slice:
|
||||
if slices.Contains(kinds, vv.Type().Elem().Kind()) {
|
||||
ts = make([]T, vv.Len())
|
||||
for i := range vv.Len() {
|
||||
ts[i] = vv.Index(i).Convert(reflect.TypeOf(ts[i])).Interface().(T)
|
||||
}
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// Int returns Value as a signed integer. If it is not a signed integer, it returns 0.
|
||||
func (v Value) Int() int64 {
|
||||
return value[int64](v, reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64)
|
||||
}
|
||||
|
||||
// Ints returns Value as a signed integer slice. If it is not a signed integer slice, it returns nil.
|
||||
func (v Value) Ints() (i64s []int64) {
|
||||
return values[int64](v, reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64)
|
||||
}
|
||||
|
||||
// Uint converts an unsigned integer value to uint64. If the value is not a unsigned integer, it returns 0.
|
||||
func (v Value) Uint() uint64 {
|
||||
return value[uint64](v, reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64)
|
||||
}
|
||||
|
||||
// Uints returns Value as a unsigned integer slice. If it is not a unsigned integer slice, it returns nil.
|
||||
func (v Value) Uints() (u64s []uint64) {
|
||||
return values[uint64](v, reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64)
|
||||
}
|
||||
|
||||
// Float returns Value as a float. If it is not a float, it returns 0.
|
||||
func (v Value) Float() float64 {
|
||||
return value[float64](v, reflect.Float32, reflect.Float64)
|
||||
}
|
||||
|
||||
// Floats returns Value as a float slice. If it is not a float slice, it returns nil.
|
||||
func (v Value) Floats() (f64s []float64) {
|
||||
return values[float64](v, reflect.Float32, reflect.Float64)
|
||||
}
|
||||
|
||||
// Bool returns Value as a boolean. If it is not a boolean, it returns false.
|
||||
func (v Value) Bool() bool {
|
||||
return value[bool](v, reflect.Bool)
|
||||
}
|
||||
|
||||
// Bools returns Value as a boolean slice. If it is not a boolean slice, it returns nil.
|
||||
func (v Value) Bools() (bools []bool) {
|
||||
return values[bool](v, reflect.Bool)
|
||||
}
|
||||
|
||||
// String returns Value as a string. If it is not a string, it returns an empty string.
|
||||
func (v Value) String() string {
|
||||
return value[string](v, reflect.String)
|
||||
}
|
||||
|
||||
// Strings returns Value as a string slice. If it is not a string slice, it returns nil.
|
||||
func (v Value) Strings() (strings []string) {
|
||||
return values[string](v, reflect.String)
|
||||
}
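A minimal in-package sketch of the conversion behaviour documented above (it mirrors the table-driven tests in keyvalue_test.go below): accessors convert any compatible kind and fall back to zero values on a mismatch. The test name is illustrative only.

```go
package gguf

import "testing"

func TestValueZeroOnMismatch(t *testing.T) {
	kv := KeyValue{"general.alignment", Value{uint32(64)}}
	if kv.Uint() != 64 { // any unsigned kind converts to uint64
		t.Fatal("expected 64")
	}
	if kv.Int() != 0 || kv.String() != "" { // mismatched kinds return zero values
		t.Fatal("expected zero values")
	}
}
```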
|
||||
208 fs/gguf/keyvalue_test.go Normal file
@@ -0,0 +1,208 @@
|
||||
package gguf
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/google/go-cmp/cmp"
|
||||
)
|
||||
|
||||
func split(name string, values map[string][]any) (matched []any, unmatched []any) {
|
||||
for key, value := range values {
|
||||
if key == name {
|
||||
matched = value
|
||||
} else {
|
||||
unmatched = append(unmatched, value...)
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func TestValue(t *testing.T) {
|
||||
values := map[string][]any{
|
||||
"int64": {int(42), int8(42), int16(42), int32(42), int64(42)},
|
||||
"uint64": {uint(42), uint8(42), uint16(42), uint32(42), uint64(42)},
|
||||
"float64": {float32(42), float64(42)},
|
||||
"string": {"42", "hello"},
|
||||
"bool": {true, false},
|
||||
}
|
||||
|
||||
t.Run("int64", func(t *testing.T) {
|
||||
matched, unmatched := split("int64", values)
|
||||
for _, v := range matched {
|
||||
kv := KeyValue{"key", Value{v}}
|
||||
if i64 := kv.Int(); i64 != 42 {
|
||||
t.Errorf("expected 42, got %d", i64)
|
||||
}
|
||||
}
|
||||
|
||||
for _, v := range unmatched {
|
||||
kv := KeyValue{"key", Value{v}}
|
||||
if i64 := kv.Int(); i64 != 0 {
|
||||
t.Errorf("expected 42, got %d", i64)
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("uint64", func(t *testing.T) {
|
||||
matched, unmatched := split("uint64", values)
|
||||
for _, v := range matched {
|
||||
kv := KeyValue{"key", Value{v}}
|
||||
if u64 := kv.Uint(); u64 != 42 {
|
||||
t.Errorf("expected 42, got %d", u64)
|
||||
}
|
||||
}
|
||||
|
||||
for _, v := range unmatched {
|
||||
kv := KeyValue{"key", Value{v}}
|
||||
if u64 := kv.Uint(); u64 != 0 {
|
||||
t.Errorf("expected 42, got %d", u64)
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("float64", func(t *testing.T) {
|
||||
matched, unmatched := split("float64", values)
|
||||
for _, v := range matched {
|
||||
kv := KeyValue{"key", Value{v}}
|
||||
if f64 := kv.Float(); f64 != 42 {
|
||||
t.Errorf("expected 42, got %f", f64)
|
||||
}
|
||||
}
|
||||
|
||||
for _, v := range unmatched {
|
||||
kv := KeyValue{"key", Value{v}}
|
||||
if f64 := kv.Float(); f64 != 0 {
|
||||
t.Errorf("expected 42, got %f", f64)
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("string", func(t *testing.T) {
|
||||
matched, unmatched := split("string", values)
|
||||
for _, v := range matched {
|
||||
kv := KeyValue{"key", Value{v}}
|
||||
if s := kv.String(); s != v {
|
||||
t.Errorf("expected 42, got %s", s)
|
||||
}
|
||||
}
|
||||
|
||||
for _, v := range unmatched {
|
||||
kv := KeyValue{"key", Value{v}}
|
||||
if s := kv.String(); s != "" {
|
||||
t.Errorf("expected 42, got %s", s)
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("bool", func(t *testing.T) {
|
||||
matched, unmatched := split("bool", values)
|
||||
for _, v := range matched {
|
||||
kv := KeyValue{"key", Value{v}}
|
||||
if b := kv.Bool(); b != v {
|
||||
t.Errorf("expected true, got %v", b)
|
||||
}
|
||||
}
|
||||
|
||||
for _, v := range unmatched {
|
||||
kv := KeyValue{"key", Value{v}}
|
||||
if b := kv.Bool(); b != false {
|
||||
t.Errorf("expected false, got %v", b)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestValues(t *testing.T) {
|
||||
values := map[string][]any{
|
||||
"int64s": {[]int{42}, []int8{42}, []int16{42}, []int32{42}, []int64{42}},
|
||||
"uint64s": {[]uint{42}, []uint8{42}, []uint16{42}, []uint32{42}, []uint64{42}},
|
||||
"float64s": {[]float32{42}, []float64{42}},
|
||||
"strings": {[]string{"42"}, []string{"hello"}},
|
||||
"bools": {[]bool{true}, []bool{false}},
|
||||
}
|
||||
|
||||
t.Run("int64s", func(t *testing.T) {
|
||||
matched, unmatched := split("int64s", values)
|
||||
for _, v := range matched {
|
||||
kv := KeyValue{"key", Value{v}}
|
||||
if diff := cmp.Diff(kv.Ints(), []int64{42}); diff != "" {
|
||||
t.Errorf("diff: %s", diff)
|
||||
}
|
||||
}
|
||||
|
||||
for _, v := range unmatched {
|
||||
kv := KeyValue{"key", Value{v}}
|
||||
if i64s := kv.Ints(); i64s != nil {
|
||||
t.Errorf("expected nil, got %v", i64s)
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("uint64s", func(t *testing.T) {
|
||||
matched, unmatched := split("uint64s", values)
|
||||
for _, v := range matched {
|
||||
kv := KeyValue{"key", Value{v}}
|
||||
if diff := cmp.Diff(kv.Uints(), []uint64{42}); diff != "" {
|
||||
t.Errorf("diff: %s", diff)
|
||||
}
|
||||
}
|
||||
|
||||
for _, v := range unmatched {
|
||||
kv := KeyValue{"key", Value{v}}
|
||||
if u64s := kv.Uints(); u64s != nil {
|
||||
t.Errorf("expected nil, got %v", u64s)
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("float64s", func(t *testing.T) {
|
||||
matched, unmatched := split("float64s", values)
|
||||
for _, v := range matched {
|
||||
kv := KeyValue{"key", Value{v}}
|
||||
if diff := cmp.Diff(kv.Floats(), []float64{42}); diff != "" {
|
||||
t.Errorf("diff: %s", diff)
|
||||
}
|
||||
}
|
||||
|
||||
for _, v := range unmatched {
|
||||
kv := KeyValue{"key", Value{v}}
|
||||
if f64s := kv.Floats(); f64s != nil {
|
||||
t.Errorf("expected nil, got %v", f64s)
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("strings", func(t *testing.T) {
|
||||
matched, unmatched := split("strings", values)
|
||||
for _, v := range matched {
|
||||
kv := KeyValue{"key", Value{v}}
|
||||
if diff := cmp.Diff(kv.Strings(), v); diff != "" {
|
||||
t.Errorf("diff: %s", diff)
|
||||
}
|
||||
}
|
||||
|
||||
for _, v := range unmatched {
|
||||
kv := KeyValue{"key", Value{v}}
|
||||
if s := kv.Strings(); s != nil {
|
||||
t.Errorf("expected nil, got %v", s)
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("bools", func(t *testing.T) {
|
||||
matched, unmatched := split("bools", values)
|
||||
for _, v := range matched {
|
||||
kv := KeyValue{"key", Value{v}}
|
||||
if diff := cmp.Diff(kv.Bools(), v); diff != "" {
|
||||
t.Errorf("diff: %s", diff)
|
||||
}
|
||||
}
|
||||
|
||||
for _, v := range unmatched {
|
||||
kv := KeyValue{"key", Value{v}}
|
||||
if b := kv.Bools(); b != nil {
|
||||
t.Errorf("expected nil, got %v", b)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
89 fs/gguf/lazy.go Normal file
@@ -0,0 +1,89 @@
|
||||
package gguf
|
||||
|
||||
import (
|
||||
"encoding/binary"
|
||||
"iter"
|
||||
"log/slog"
|
||||
)
|
||||
|
||||
type lazy[T any] struct {
|
||||
count uint64
|
||||
next func() (T, bool)
|
||||
stop func()
|
||||
values []T
|
||||
|
||||
// successFunc is called when all values have been successfully read.
|
||||
successFunc func() error
|
||||
}
|
||||
|
||||
func newLazy[T any](f *File, fn func() (T, error)) (*lazy[T], error) {
|
||||
it := lazy[T]{}
|
||||
if err := binary.Read(f.reader, binary.LittleEndian, &it.count); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
it.values = make([]T, 0)
|
||||
it.next, it.stop = iter.Pull(func(yield func(T) bool) {
|
||||
for i := range it.count {
|
||||
t, err := fn()
|
||||
if err != nil {
|
||||
slog.Error("error reading tensor", "index", i, "error", err)
|
||||
return
|
||||
}
|
||||
|
||||
it.values = append(it.values, t)
|
||||
if !yield(t) {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if it.successFunc != nil {
|
||||
it.successFunc()
|
||||
}
|
||||
})
|
||||
|
||||
return &it, nil
|
||||
}
|
||||
|
||||
func (g *lazy[T]) Values() iter.Seq[T] {
|
||||
return func(yield func(T) bool) {
|
||||
for _, v := range g.All() {
|
||||
if !yield(v) {
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (g *lazy[T]) All() iter.Seq2[int, T] {
|
||||
return func(yield func(int, T) bool) {
|
||||
for i := range int(g.count) {
|
||||
if i < len(g.values) {
|
||||
if !yield(i, g.values[i]) {
|
||||
break
|
||||
}
|
||||
} else {
|
||||
t, ok := g.next()
|
||||
if !ok {
|
||||
break
|
||||
}
|
||||
|
||||
if !yield(i, t) {
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (g *lazy[T]) rest() (collected bool) {
|
||||
for {
|
||||
_, ok := g.next()
|
||||
collected = collected || ok
|
||||
if !ok {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
return collected
|
||||
}
|
||||
23 fs/gguf/reader.go Normal file
@@ -0,0 +1,23 @@
|
||||
package gguf
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"io"
|
||||
)
|
||||
|
||||
type bufferedReader struct {
|
||||
offset int64
|
||||
*bufio.Reader
|
||||
}
|
||||
|
||||
func newBufferedReader(rs io.ReadSeeker, size int) *bufferedReader {
|
||||
return &bufferedReader{
|
||||
Reader: bufio.NewReaderSize(rs, size),
|
||||
}
|
||||
}
|
||||
|
||||
func (rs *bufferedReader) Read(p []byte) (n int, err error) {
|
||||
n, err = rs.Reader.Read(p)
|
||||
rs.offset += int64(n)
|
||||
return n, err
|
||||
}
|
||||
288 fs/gguf/tensor.go Normal file
@@ -0,0 +1,288 @@
|
||||
package gguf
|
||||
|
||||
import (
|
||||
"log/slog"
|
||||
"strings"
|
||||
)
|
||||
|
||||
type TensorInfo struct {
|
||||
Name string
|
||||
Offset uint64
|
||||
Shape []uint64
|
||||
Type TensorType
|
||||
}
|
||||
|
||||
func (ti TensorInfo) Valid() bool {
|
||||
return ti.Name != "" && ti.NumBytes() > 0
|
||||
}
|
||||
|
||||
func (ti TensorInfo) NumValues() int64 {
|
||||
var numItems int64 = 1
|
||||
for _, dim := range ti.Shape {
|
||||
numItems *= int64(dim)
|
||||
}
|
||||
return numItems
|
||||
}
|
||||
|
||||
// NumBytes returns the number of bytes in the tensor.
|
||||
func (ti TensorInfo) NumBytes() int64 {
|
||||
return int64(float64(ti.NumValues()) * ti.Type.NumBytes())
|
||||
}
|
||||
|
||||
func (ti TensorInfo) LogValue() slog.Value {
|
||||
return slog.GroupValue(
|
||||
slog.String("name", ti.Name),
|
||||
slog.Int64("offset", int64(ti.Offset)),
|
||||
slog.Any("shape", ti.Shape),
|
||||
slog.Int64("num_values", ti.NumValues()),
|
||||
slog.Int64("num_bytes", ti.NumBytes()),
|
||||
slog.Any("type", ti.Type),
|
||||
)
|
||||
}
|
||||
|
||||
type TensorType uint32
|
||||
|
||||
const (
|
||||
TensorTypeF32 TensorType = iota
|
||||
TensorTypeF16
|
||||
TensorTypeQ4_0
|
||||
TensorTypeQ4_1
|
||||
|
||||
// unexported // unused in gguf
|
||||
tensorTypeQ4_2
|
||||
tensorTypeQ4_3
|
||||
|
||||
TensorTypeQ5_0
|
||||
TensorTypeQ5_1
|
||||
TensorTypeQ8_0
|
||||
TensorTypeQ8_1
|
||||
TensorTypeQ2_K
|
||||
TensorTypeQ3_K
|
||||
TensorTypeQ4_K
|
||||
TensorTypeQ5_K
|
||||
TensorTypeQ6_K
|
||||
TensorTypeQ8_K
|
||||
|
||||
// unexported // unquantizable by ollama
|
||||
tensorTypeIQ2_XXS
|
||||
tensorTypeIQ2_XS
|
||||
tensorTypeIQ3_XXS
|
||||
tensorTypeIQ1_S
|
||||
tensorTypeIQ4_NL
|
||||
tensorTypeIQ3_S
|
||||
tensorTypeIQ2_S
|
||||
tensorTypeIQ4_XS
|
||||
|
||||
TensorTypeI8
|
||||
TensorTypeI16
|
||||
TensorTypeI32
|
||||
TensorTypeI64
|
||||
TensorTypeF64
|
||||
|
||||
// unexported // unquantizable by ollama
|
||||
tensorTypeIQ1_M
|
||||
|
||||
TensorTypeBF16
|
||||
|
||||
// unexported // unused in gguf
|
||||
tensorTypeQ4_0_4_4
|
||||
tensorTypeQ4_0_4_8
|
||||
tensorTypeQ4_0_8_8
|
||||
|
||||
// unexported // unquantizable by ollama
|
||||
tensorTypeTQ1_0
|
||||
tensorTypeTQ2_0
|
||||
|
||||
// unexported // unused in gguf
|
||||
tensorTypeIQ4_NL_4_4
|
||||
tensorTypeIQ4_NL_4_8
|
||||
tensorTypeIQ4_NL_8_8
|
||||
)
|
||||
|
||||
func (tt TensorType) NumBytes() float64 {
|
||||
return float64(tt.typeSize()) / float64(tt.blockSize())
|
||||
}
|
||||
|
||||
func (tt TensorType) typeSize() int64 {
|
||||
switch tt {
|
||||
case TensorTypeF32:
|
||||
return 4
|
||||
case TensorTypeF16:
|
||||
return 2
|
||||
case TensorTypeQ4_0:
|
||||
return 2 + tt.blockSize()/2
|
||||
case TensorTypeQ4_1:
|
||||
return 2 + 2 + tt.blockSize()/2
|
||||
case TensorTypeQ5_0:
|
||||
return 2 + 4 + tt.blockSize()/2
|
||||
case TensorTypeQ5_1:
|
||||
return 2 + 2 + 4 + tt.blockSize()/2
|
||||
case TensorTypeQ8_0:
|
||||
return 2 + tt.blockSize()
|
||||
case TensorTypeQ8_1:
|
||||
return 2 + 2 + tt.blockSize()
|
||||
case TensorTypeQ2_K:
|
||||
return tt.blockSize()/16 + tt.blockSize()/4 + 2 + 2
|
||||
case TensorTypeQ3_K:
|
||||
return tt.blockSize()/8 + tt.blockSize()/4 + 12 + 2
|
||||
case TensorTypeQ4_K:
|
||||
return 2 + 2 + 12 + tt.blockSize()/2
|
||||
case TensorTypeQ5_K:
|
||||
return 2 + 2 + 12 + tt.blockSize()/8 + tt.blockSize()/2
|
||||
case TensorTypeQ6_K:
|
||||
return tt.blockSize()/2 + tt.blockSize()/4 + tt.blockSize()/16 + 2
|
||||
case TensorTypeQ8_K:
|
||||
return 4 + tt.blockSize() + 2*tt.blockSize()/16
|
||||
case tensorTypeIQ2_XXS:
|
||||
return 2 + 2*tt.blockSize()/8
|
||||
case tensorTypeIQ2_XS:
|
||||
return 2 + 2*tt.blockSize()/8 + tt.blockSize()/32
|
||||
case tensorTypeIQ3_XXS:
|
||||
return 2 + tt.blockSize()/4 + tt.blockSize()/8
|
||||
case tensorTypeIQ1_S:
|
||||
return 2 + tt.blockSize()/8 + tt.blockSize()/16
|
||||
case tensorTypeIQ4_NL:
|
||||
return 2 + tt.blockSize()/2
|
||||
case tensorTypeIQ3_S:
|
||||
return 2 + tt.blockSize()/4 + tt.blockSize()/8 + tt.blockSize()/32 + 4
|
||||
case tensorTypeIQ2_S:
|
||||
return 2 + tt.blockSize()/4 + tt.blockSize()/16
|
||||
case tensorTypeIQ4_XS:
|
||||
return 2 + 2 + tt.blockSize()/2 + tt.blockSize()/64
|
||||
case TensorTypeI8:
|
||||
return 1
|
||||
case TensorTypeI16:
|
||||
return 2
|
||||
case TensorTypeI32:
|
||||
return 4
|
||||
case TensorTypeI64:
|
||||
return 8
|
||||
case TensorTypeF64:
|
||||
return 8
|
||||
case tensorTypeIQ1_M:
|
||||
return tt.blockSize()/8 + tt.blockSize()/16 + tt.blockSize()/32
|
||||
case TensorTypeBF16:
|
||||
return 2
|
||||
default:
|
||||
return 0
|
||||
}
|
||||
}
|
||||
|
||||
func (tt TensorType) blockSize() int64 {
|
||||
switch tt {
|
||||
case TensorTypeF32,
|
||||
TensorTypeF16,
|
||||
TensorTypeI8,
|
||||
TensorTypeI16,
|
||||
TensorTypeI32,
|
||||
TensorTypeI64,
|
||||
TensorTypeF64,
|
||||
TensorTypeBF16:
|
||||
return 1
|
||||
case TensorTypeQ4_0,
|
||||
TensorTypeQ4_1,
|
||||
TensorTypeQ5_0,
|
||||
TensorTypeQ5_1,
|
||||
TensorTypeQ8_0,
|
||||
TensorTypeQ8_1,
|
||||
tensorTypeIQ4_NL:
|
||||
return 32
|
||||
default:
|
||||
return 256
|
||||
}
|
||||
}
|
||||
|
||||
func (tt TensorType) String() string {
|
||||
switch tt {
|
||||
case TensorTypeF32:
|
||||
return "f32"
|
||||
case TensorTypeF16:
|
||||
return "f16"
|
||||
case TensorTypeQ4_0:
|
||||
return "q4_0"
|
||||
case TensorTypeQ4_1:
|
||||
return "q4_1"
|
||||
case tensorTypeQ4_2:
|
||||
return "q4_2"
|
||||
case tensorTypeQ4_3:
|
||||
return "q4_3"
|
||||
case TensorTypeQ5_0:
|
||||
return "q5_0"
|
||||
case TensorTypeQ5_1:
|
||||
return "q5_1"
|
||||
case TensorTypeQ8_0:
|
||||
return "q8_0"
|
||||
case TensorTypeQ8_1:
|
||||
return "q8_1"
|
||||
case TensorTypeQ2_K:
|
||||
return "q2_k"
|
||||
case TensorTypeQ3_K:
|
||||
return "q3_k"
|
||||
case TensorTypeQ4_K:
|
||||
return "q4_k"
|
||||
case TensorTypeQ5_K:
|
||||
return "q5_k"
|
||||
case TensorTypeQ6_K:
|
||||
return "q6_k"
|
||||
case TensorTypeQ8_K:
|
||||
return "q8_k"
|
||||
case tensorTypeIQ2_XXS:
|
||||
return "iq2_xxs"
|
||||
case tensorTypeIQ2_XS:
|
||||
return "iq2_xs"
|
||||
case tensorTypeIQ3_XXS:
|
||||
return "iq3_xxs"
|
||||
case tensorTypeIQ1_S:
|
||||
return "iq1_s"
|
||||
case tensorTypeIQ4_NL:
|
||||
return "iq4_nl"
|
||||
case tensorTypeIQ3_S:
|
||||
return "iq3_s"
|
||||
case tensorTypeIQ2_S:
|
||||
return "iq2_s"
|
||||
case tensorTypeIQ4_XS:
|
||||
return "iq4_xs"
|
||||
case TensorTypeI8:
|
||||
return "i8"
|
||||
case TensorTypeI16:
|
||||
return "i16"
|
||||
case TensorTypeI32:
|
||||
return "i32"
|
||||
case TensorTypeI64:
|
||||
return "i64"
|
||||
case TensorTypeF64:
|
||||
return "f64"
|
||||
case tensorTypeIQ1_M:
|
||||
return "iq1_m"
|
||||
case TensorTypeBF16:
|
||||
return "bf16"
|
||||
case tensorTypeQ4_0_4_4:
|
||||
return "q4_0_4_4"
|
||||
case tensorTypeQ4_0_4_8:
|
||||
return "q4_0_4_8"
|
||||
case tensorTypeQ4_0_8_8:
|
||||
return "q4_0_8_8"
|
||||
case tensorTypeTQ1_0:
|
||||
return "tq1_0"
|
||||
case tensorTypeTQ2_0:
|
||||
return "tq2_0"
|
||||
case tensorTypeIQ4_NL_4_4:
|
||||
return "iq4_nl_4_4"
|
||||
case tensorTypeIQ4_NL_4_8:
|
||||
return "iq4_nl_4_8"
|
||||
case tensorTypeIQ4_NL_8_8:
|
||||
return "iq4_nl_8_8"
|
||||
default:
|
||||
return "unknown"
|
||||
}
|
||||
}
|
||||
|
||||
func (tt TensorType) LogValue() slog.Value {
|
||||
return slog.GroupValue(
|
||||
slog.Uint64("value", uint64(tt)),
|
||||
slog.String("name", strings.ToUpper(tt.String())),
|
||||
slog.Int64("size", tt.typeSize()),
|
||||
slog.Int64("block_size", tt.blockSize()),
|
||||
slog.Float64("num_bytes", tt.NumBytes()),
|
||||
)
|
||||
}
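A worked example of the size computation above (a sketch, not part of the change): `NumBytes` scales the value count by `typeSize/blockSize`, so a Q4_0 block of 32 values costs a 2-byte scale plus 16 bytes of packed 4-bit weights, i.e. 18 bytes per 32 values. The tensor name and shape below are illustrative.

```go
package main

import (
	"fmt"

	"github.com/ollama/ollama/fs/gguf"
)

func main() {
	ti := gguf.TensorInfo{
		Name:  "blk.0.ffn_up.weight", // illustrative
		Shape: []uint64{4096, 4096},
		Type:  gguf.TensorTypeQ4_0,
	}

	fmt.Println(ti.NumValues()) // 16777216
	fmt.Println(ti.NumBytes())  // 16777216 * 18/32 = 9437184
}
```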
|
||||
3 go.mod
@@ -19,8 +19,7 @@ require (
|
||||
github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1
|
||||
github.com/dlclark/regexp2 v1.11.4
|
||||
github.com/emirpasic/gods/v2 v2.0.0-alpha
|
||||
github.com/go-json-experiment/json v0.0.0-20250417205406-170dfdcf87d1
|
||||
github.com/google/go-cmp v0.6.0
|
||||
github.com/google/go-cmp v0.7.0
|
||||
github.com/mattn/go-runewidth v0.0.14
|
||||
github.com/nlpodyssey/gopickle v0.3.0
|
||||
github.com/pdevine/tensor v0.0.0-20240510204454-f88f4562727c
|
||||
|
||||
6 go.sum
@@ -69,8 +69,6 @@ github.com/go-fonts/latin-modern v0.2.0/go.mod h1:rQVLdDMK+mK1xscDwsqM5J8U2jrRa3
|
||||
github.com/go-fonts/liberation v0.1.1/go.mod h1:K6qoJYypsmfVjWg8KOVDQhLc8UDgIK2HYqyqAO9z7GY=
|
||||
github.com/go-fonts/stix v0.1.0/go.mod h1:w/c1f0ldAUlJmLBvlbkvVXLAD+tAMqobIIQpmnUIzUY=
|
||||
github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9AVAgeJqvqgH9Q5CA+iKCZ2gyEVpxRU=
|
||||
github.com/go-json-experiment/json v0.0.0-20250417205406-170dfdcf87d1 h1:+VexzzkMLb1tnvpuQdGT/DicIRW7MN8ozsXqBMgp0Hk=
|
||||
github.com/go-json-experiment/json v0.0.0-20250417205406-170dfdcf87d1/go.mod h1:TiCD2a1pcmjd7YnhGH0f/zKNcCD06B029pHhzV23c2M=
|
||||
github.com/go-latex/latex v0.0.0-20210118124228-b3d85cf34e07/go.mod h1:CO1AlKB2CSIqUrmQPqA0gdRIlnLEY0gK5JGjh37zN5U=
|
||||
github.com/go-playground/assert/v2 v2.2.0 h1:JvknZsQTYeFEAhQwI4qEt9cyV5ONwRHC+lYKSsYSR8s=
|
||||
github.com/go-playground/assert/v2 v2.2.0/go.mod h1:VDjEfimB/XKnb+ZQfWdccd7VUvScMdVu0Titje2rxJ4=
|
||||
@@ -114,8 +112,8 @@ github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/
|
||||
github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
|
||||
github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
|
||||
github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
|
||||
github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
|
||||
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
|
||||
github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
|
||||
github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
|
||||
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
|
||||
github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||
|
||||
@@ -19,7 +19,7 @@ func TestVisionModels(t *testing.T) {
|
||||
}
|
||||
testCases := []testCase{
|
||||
{
|
||||
model: "llava:7b",
|
||||
model: "qwen2.5vl",
|
||||
},
|
||||
{
|
||||
model: "llama3.2-vision",
|
||||
@@ -60,6 +60,7 @@ func TestVisionModels(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestIntegrationSplitBatch(t *testing.T) {
|
||||
skipUnderMinVRAM(t, 6)
|
||||
image, err := base64.StdEncoding.DecodeString(imageEncoding)
|
||||
require.NoError(t, err)
|
||||
req := api.GenerateRequest{
|
||||
|
||||
1 integration/testdata/embed.json vendored
File diff suppressed because one or more lines are too long
@@ -30,6 +30,11 @@ type Causal struct {
|
||||
|
||||
// ** current forward pass **
|
||||
|
||||
// curReserve indicates that this forward pass is only for
|
||||
// memory reservation and we should not update our metadata
|
||||
// based on it.
|
||||
curReserve bool
|
||||
|
||||
// the active layer for Get and Put
|
||||
curLayer int
|
||||
|
||||
@@ -159,12 +164,13 @@ func (c *Causal) Close() {
|
||||
}
|
||||
|
||||
func (c *Causal) StartForward(ctx ml.Context, batch input.Batch, reserve bool) error {
|
||||
c.curReserve = reserve
|
||||
c.curBatchSize = len(batch.Positions)
|
||||
c.curSequences = batch.Sequences
|
||||
c.curPositions = batch.Positions
|
||||
c.opts.Except = nil
|
||||
|
||||
if !reserve {
|
||||
if !c.curReserve {
|
||||
c.updateSlidingWindow()
|
||||
|
||||
var err error
|
||||
@@ -211,10 +217,9 @@ func (c *Causal) StartForward(ctx ml.Context, batch input.Batch, reserve bool) e
|
||||
c.curCellRange.max = len(c.cells) - 1
|
||||
}
|
||||
|
||||
var err error
|
||||
c.curMask, err = c.buildMask(ctx)
|
||||
c.curMask = c.buildMask(ctx)
|
||||
|
||||
return err
|
||||
return nil
|
||||
}
|
||||
|
||||
func newRange() cellRange {
|
||||
@@ -297,7 +302,7 @@ func roundUp(length, pad int) int {
|
||||
// Builds a mask of history x batch indicating whether for each token in the batch the
|
||||
// token in the history should apply. This is based on both the sequence and causality (the
|
||||
// position of the history is not ahead of the token in the batch).
|
||||
func (c *Causal) buildMask(ctx ml.Context) (ml.Tensor, error) {
|
||||
func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {
|
||||
// Align and pad the two dimensions as required by the backend
|
||||
batchSize := roundUp(c.curBatchSize, c.config.MaskBatchPadding)
|
||||
|
||||
@@ -305,6 +310,11 @@ func (c *Causal) buildMask(ctx ml.Context) (ml.Tensor, error) {
|
||||
c.curCellRange.max = roundUp(c.curCellRange.max+1, c.config.CachePadding) - 1
|
||||
|
||||
length := c.curCellRange.max - c.curCellRange.min + 1
|
||||
|
||||
if c.curReserve {
|
||||
return ctx.Input().Empty(c.config.MaskDType, length, batchSize)
|
||||
}
|
||||
|
||||
mask := make([]float32, batchSize*length)
|
||||
|
||||
for i := range c.curBatchSize {
|
||||
@@ -325,10 +335,7 @@ func (c *Causal) buildMask(ctx ml.Context) (ml.Tensor, error) {
|
||||
mask[i] = float32(math.Inf(-1))
|
||||
}
|
||||
|
||||
maskTensor, err := ctx.Input().FromFloatSlice(mask, length, batchSize)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
maskTensor := ctx.Input().FromFloatSlice(mask, length, batchSize)
|
||||
|
||||
if c.config.MaskDType != ml.DTypeF32 {
|
||||
out := ctx.Input().Empty(c.config.MaskDType, maskTensor.Shape()...)
|
||||
@@ -336,7 +343,7 @@ func (c *Causal) buildMask(ctx ml.Context) (ml.Tensor, error) {
|
||||
maskTensor = out
|
||||
}
|
||||
|
||||
return maskTensor, nil
|
||||
return maskTensor
|
||||
}
|
||||
|
||||
func (c *Causal) moveCells(ctx ml.Context, src, dst, length int) {
|
||||
@@ -491,12 +498,7 @@ func (c *Causal) SetCausal(ctx ml.Context, opts CausalOptions) {
|
||||
if !slices.Equal(c.opts.Except, opts.Except) {
|
||||
c.opts = opts
|
||||
if ctx != nil {
|
||||
var err error
|
||||
c.curMask, err = c.buildMask(ctx)
|
||||
if err != nil {
|
||||
// This error should never occur because we have previously built a mask with the same shape
|
||||
panic(fmt.Errorf("SetCausal: %w", err))
|
||||
}
|
||||
c.curMask = c.buildMask(ctx)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -652,10 +654,7 @@ func (c *Causal) shift(seq int, beginIndex, offset int32) error {
|
||||
}
|
||||
}
|
||||
|
||||
kShift, err := ctx.Input().FromIntSlice(offsets, len(offsets))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
kShift := ctx.Input().FromIntSlice(offsets, len(offsets))
|
||||
|
||||
for i, key := range c.keys {
|
||||
if key == nil {
|
||||
|
||||
@@ -344,7 +344,7 @@ func testCache(t *testing.T, backend ml.Backend, cache Cache, tests []testCase)
|
||||
}
|
||||
|
||||
cache.SetLayer(0)
|
||||
tensor, _ := context.FromFloatSlice(test.in, test.inShape...)
|
||||
tensor := context.FromFloatSlice(test.in, test.inShape...)
|
||||
cache.Put(context, tensor, tensor)
|
||||
|
||||
out, _, mask := cache.Get(context)
|
||||
@@ -386,7 +386,7 @@ func TestCanResume(t *testing.T) {
|
||||
}
|
||||
|
||||
cache.SetLayer(0)
|
||||
tensor, _ := context.FromFloatSlice([]float32{1, 2, 3, 4}, 1, 1, 4)
|
||||
tensor := context.FromFloatSlice([]float32{1, 2, 3, 4}, 1, 1, 4)
|
||||
cache.Put(context, tensor, tensor)
|
||||
|
||||
// with window size 4, nothing has slid out of the window yet
|
||||
@@ -413,7 +413,7 @@ func TestCanResume(t *testing.T) {
|
||||
}
|
||||
|
||||
cache.SetLayer(0)
|
||||
tensor, _ = context.FromFloatSlice([]float32{5, 6}, 1, 1, 2)
|
||||
tensor = context.FromFloatSlice([]float32{5, 6}, 1, 1, 2)
|
||||
cache.Put(context, tensor, tensor)
|
||||
|
||||
// only the latest position has overlapping windows
|
||||
@@ -470,24 +470,24 @@ func (c *testContext) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
|
||||
return c.Empty(dtype, shape...)
|
||||
}
|
||||
|
||||
func (c *testContext) FromFloatSlice(s []float32, shape ...int) (ml.Tensor, error) {
|
||||
func (c *testContext) FromFloatSlice(s []float32, shape ...int) ml.Tensor {
|
||||
t := c.Empty(ml.DTypeF32, shape...).(*testTensor)
|
||||
|
||||
copy(t.data, s)
|
||||
|
||||
return t, nil
|
||||
return t
|
||||
}
|
||||
|
||||
func (c *testContext) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
|
||||
func (c *testContext) FromIntSlice(s []int32, shape ...int) ml.Tensor {
|
||||
f := make([]float32, len(s))
|
||||
for i := range f {
|
||||
f[i] = float32(s[i])
|
||||
}
|
||||
|
||||
out, _ := c.FromFloatSlice(f, shape...)
|
||||
out := c.FromFloatSlice(f, shape...)
|
||||
out.(*testTensor).dtype = ml.DTypeI32
|
||||
|
||||
return out, nil
|
||||
return out
|
||||
}
|
||||
|
||||
func (c *testContext) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
|
||||
@@ -496,7 +496,7 @@ func (c *testContext) Arange(start, stop, step float32, dtype ml.DType) ml.Tenso
|
||||
s = append(s, i)
|
||||
}
|
||||
|
||||
out, _ := c.FromFloatSlice(s, len(s))
|
||||
out := c.FromFloatSlice(s, len(s))
|
||||
out.(*testTensor).dtype = dtype
|
||||
return out
|
||||
}
|
||||
@@ -508,7 +508,7 @@ func (c *testContext) Forward(...ml.Tensor) ml.Context { return c }
|
||||
|
||||
func (c *testContext) Compute(...ml.Tensor) {}
|
||||
|
||||
func (c *testContext) Reserve() error { return nil }
|
||||
func (c *testContext) Reserve() {}
|
||||
|
||||
func (c *testContext) MaxGraphNodes() int {
|
||||
return 10
|
||||
|
||||
@@ -544,7 +544,7 @@ func NewSamplingContext(model *Model, params SamplingParams) (*SamplingContext,
|
||||
cparams.penalty_last_n = C.int32_t(params.RepeatLastN)
|
||||
cparams.penalty_repeat = C.float(params.PenaltyRepeat)
|
||||
cparams.penalty_freq = C.float(params.PenaltyFreq)
|
||||
cparams.penalty_present = C.float(params.PenaltyFreq)
|
||||
cparams.penalty_present = C.float(params.PenaltyPresent)
|
||||
cparams.seed = C.uint32_t(params.Seed)
|
||||
|
||||
grammar := C.CString(params.Grammar)
|
||||
@@ -580,7 +580,7 @@ func SchemaToGrammar(schema []byte) []byte {
|
||||
defer C.free(unsafe.Pointer(cStr))
|
||||
|
||||
// Allocate buffer for grammar based on schema length but with upper bound
|
||||
maxLen := min(1024*1024, len(schema)*4)
|
||||
maxLen := max(32768, min(1024*1024, len(schema)*4))
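// i.e. grow the buffer with the schema (4 bytes of grammar per schema byte),
// but never below 32 KiB and never above 1 MiB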
|
||||
buf := make([]byte, maxLen)
|
||||
|
||||
// Call C function to convert schema to grammar
|
||||
@@ -602,7 +602,7 @@ type Grammar struct {
|
||||
mu sync.Mutex
|
||||
}
|
||||
|
||||
func NewGrammar(grammar string, vocabIds []uint32, vocabValues []string, eogTokens []uint32) *Grammar {
|
||||
func NewGrammar(grammar string, vocabIds []uint32, vocabValues []string, eogTokens []int32) *Grammar {
|
||||
cGrammar := C.CString(grammar)
|
||||
defer C.free(unsafe.Pointer(cGrammar))
|
||||
|
||||
@@ -622,7 +622,7 @@ func NewGrammar(grammar string, vocabIds []uint32, vocabValues []string, eogToke
|
||||
cEogTokens[i] = C.uint32_t(token)
|
||||
}
|
||||
|
||||
g := C.grammar_init(cGrammar, (*C.uint32_t)(unsafe.Pointer(&cTokens[0])), C.size_t(len(cTokens)), (**C.char)(unsafe.Pointer(&cPieces[0])), (*C.uint32_t)(unsafe.Pointer(&cEogTokens[0])), C.size_t(len(cEogTokens)))
|
||||
g := C.grammar_init(cGrammar, unsafe.SliceData(cTokens), C.size_t(len(cTokens)), unsafe.SliceData(cPieces), unsafe.SliceData(cEogTokens), C.size_t(len(cEogTokens)))
|
||||
if g == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
277 llama/patches/0015-add-argsort-and-cuda-copy-for-i32.patch Normal file
@@ -0,0 +1,277 @@
|
||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||
From: Michael Yang <git@mxy.ng>
|
||||
Date: Thu, 1 May 2025 13:45:12 -0700
|
||||
Subject: [PATCH] add argsort and cuda copy for i32
|
||||
|
||||
---
|
||||
ggml/src/ggml-cpu/ops.cpp | 43 ++++++++++++++
|
||||
ggml/src/ggml-cuda/argsort.cu | 102 +++++++++++++++++++++++++++++++++-
|
||||
ggml/src/ggml-cuda/cpy.cu | 49 ++++++++++++++++
|
||||
3 files changed, 192 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
|
||||
index becdae07..7a44b6cf 100644
|
||||
--- a/ggml/src/ggml-cpu/ops.cpp
|
||||
+++ b/ggml/src/ggml-cpu/ops.cpp
|
||||
@@ -6890,6 +6890,45 @@ static void ggml_compute_forward_argsort_f32(
|
||||
}
|
||||
}
|
||||
|
||||
+static void ggml_compute_forward_argsort_i32(
|
||||
+ const ggml_compute_params * params,
|
||||
+ ggml_tensor * dst) {
|
||||
+
|
||||
+ const ggml_tensor * src0 = dst->src[0];
|
||||
+
|
||||
+ GGML_TENSOR_UNARY_OP_LOCALS
|
||||
+
|
||||
+ GGML_ASSERT(nb0 == sizeof(int32_t));
|
||||
+
|
||||
+ const int ith = params->ith;
|
||||
+ const int nth = params->nth;
|
||||
+
|
||||
+ const int64_t nr = ggml_nrows(src0);
|
||||
+
|
||||
+ ggml_sort_order order = (ggml_sort_order) ggml_get_op_params_i32(dst, 0);
|
||||
+
|
||||
+ for (int64_t i = ith; i < nr; i += nth) {
|
||||
+ int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1);
|
||||
+ const int32_t * src_data = (int32_t *)((char *) src0->data + i*nb01);
|
||||
+
|
||||
+ for (int64_t j = 0; j < ne0; j++) {
|
||||
+ dst_data[j] = j;
|
||||
+ }
|
||||
+
|
||||
+ // C doesn't have a functional sort, so we do a bubble sort instead
|
||||
+ for (int64_t j = 0; j < ne0; j++) {
|
||||
+ for (int64_t k = j + 1; k < ne0; k++) {
|
||||
+ if ((order == GGML_SORT_ORDER_ASC && src_data[dst_data[j]] > src_data[dst_data[k]]) ||
|
||||
+ (order == GGML_SORT_ORDER_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) {
|
||||
+ int32_t tmp = dst_data[j];
|
||||
+ dst_data[j] = dst_data[k];
|
||||
+ dst_data[k] = tmp;
|
||||
+ }
|
||||
+ }
|
||||
+ }
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
void ggml_compute_forward_argsort(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst) {
|
||||
@@ -6901,6 +6940,10 @@ void ggml_compute_forward_argsort(
|
||||
{
|
||||
ggml_compute_forward_argsort_f32(params, dst);
|
||||
} break;
|
||||
+ case GGML_TYPE_I32:
|
||||
+ {
|
||||
+ ggml_compute_forward_argsort_i32(params, dst);
|
||||
+ } break;
|
||||
default:
|
||||
{
|
||||
GGML_ABORT("fatal error");
|
||||
diff --git a/ggml/src/ggml-cuda/argsort.cu b/ggml/src/ggml-cuda/argsort.cu
|
||||
index 607ded85..53b02634 100644
|
||||
--- a/ggml/src/ggml-cuda/argsort.cu
|
||||
+++ b/ggml/src/ggml-cuda/argsort.cu
|
||||
@@ -85,13 +85,107 @@ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, co
|
||||
}
|
||||
}
|
||||
|
||||
+
|
||||
+template<ggml_sort_order order>
|
||||
+static __global__ void k_argsort_i32_i32(const int32_t * x, int * dst, const int ncols, const int ncols_pad) {
|
||||
+ extern __shared__ int shared_mem[];
|
||||
+ int * indices = shared_mem;
|
||||
+
|
||||
+ const int tid = threadIdx.x;
|
||||
+ const int row = blockIdx.y;
|
||||
+
|
||||
+ // Initialize all indices, handling the case where threads < ncols_pad
|
||||
+ for (int i = tid; i < ncols_pad; i += blockDim.x) {
|
||||
+ indices[i] = i < ncols ? i : 0; // Use 0 for padding indices
|
||||
+ }
|
||||
+ __syncthreads();
|
||||
+
|
||||
+ // Bitonic sort
|
||||
+ for (int k = 2; k <= ncols_pad; k *= 2) {
|
||||
+ for (int j = k/2; j > 0; j /= 2) {
|
||||
+ for (int i = tid; i < ncols_pad; i += blockDim.x) {
|
||||
+ const int ij = i ^ j;
|
||||
+ if (ij > i) {
|
||||
+ // Only compare values within the actual data range
|
||||
+ if (i < ncols && ij < ncols) {
|
||||
+ if ((i & k) == 0) {
|
||||
+ if (order == GGML_SORT_ORDER_ASC) {
|
||||
+ if (x[row * ncols + indices[i]] > x[row * ncols + indices[ij]]) {
|
||||
+ int tmp = indices[i];
|
||||
+ indices[i] = indices[ij];
|
||||
+ indices[ij] = tmp;
|
||||
+ }
|
||||
+ } else {
|
||||
+ if (x[row * ncols + indices[i]] < x[row * ncols + indices[ij]]) {
|
||||
+ int tmp = indices[i];
|
||||
+ indices[i] = indices[ij];
|
||||
+ indices[ij] = tmp;
|
||||
+ }
|
||||
+ }
|
||||
+ } else {
|
||||
+ if (order == GGML_SORT_ORDER_ASC) {
|
||||
+ if (x[row * ncols + indices[i]] < x[row * ncols + indices[ij]]) {
|
||||
+ int tmp = indices[i];
|
||||
+ indices[i] = indices[ij];
|
||||
+ indices[ij] = tmp;
|
||||
+ }
|
||||
+ } else {
|
||||
+ if (x[row * ncols + indices[i]] > x[row * ncols + indices[ij]]) {
|
||||
+ int tmp = indices[i];
|
||||
+ indices[i] = indices[ij];
|
||||
+ indices[ij] = tmp;
|
||||
+ }
|
||||
+ }
|
||||
+ }
|
||||
+ }
|
||||
+ }
|
||||
+ }
|
||||
+ __syncthreads();
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ // Write sorted indices to output, only threads handling valid data
|
||||
+ for (int i = tid; i < ncols; i += blockDim.x) {
|
||||
+ dst[row * ncols + i] = indices[i];
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+static void argsort_i32_i32_cuda(const int32_t * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
|
||||
+ // Bitonic sort requires ncols to be power of 2
|
||||
+ const int ncols_pad = next_power_of_2(ncols);
|
||||
+
|
||||
+ // Ensure thread count doesn't exceed maximum (typically 1024)
|
||||
+ const int max_threads = 1024; // This is the typical max for most GPUs
|
||||
+ const int threads_per_block = ncols_pad > max_threads ? max_threads : ncols_pad;
|
||||
+
|
||||
+ const dim3 block_dims(threads_per_block, 1, 1);
|
||||
+ const dim3 block_nums(1, nrows, 1);
|
||||
+ const size_t shared_mem = ncols_pad * sizeof(int);
|
||||
+
|
||||
+ // Check if shared memory size is within limits
|
||||
+ const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb;
|
||||
+
|
||||
+ // Instead of logging an error, use GGML_ASSERT with a descriptive message
|
||||
+ GGML_ASSERT(shared_mem <= max_shared_mem && "argsort: required shared memory exceeds device limit");
|
||||
+
|
||||
+ // Launch kernels with the updated thread configuration
|
||||
+ if (order == GGML_SORT_ORDER_ASC) {
|
||||
+ k_argsort_i32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
|
||||
+ } else if (order == GGML_SORT_ORDER_DESC) {
|
||||
+ k_argsort_i32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
|
||||
+ } else {
|
||||
+ GGML_ABORT("fatal error");
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+
|
||||
void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
float * dst_d = (float *)dst->data;
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_I32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_I32);
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
@@ -100,5 +194,9 @@ void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
|
||||
enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
|
||||
|
||||
- argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream);
|
||||
+ if (src0->type == GGML_TYPE_I32) {
|
||||
+ argsort_i32_i32_cuda((const int32_t *)src0_d, (int *)dst_d, ncols, nrows, order, stream);
|
||||
+ } else {
|
||||
+ argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream);
|
||||
+ }
|
||||
}
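The kernel above pads the column count to the next power of two because a bitonic compare-and-swap network only sorts power-of-two-length sequences; the padding slots hold index 0 and are never written back to the output. A small Go sketch of that padding step (nextPowerOfTwo is an illustrative stand-in for the CUDA helper next_power_of_2):

```go
package main

import "fmt"

// nextPowerOfTwo rounds n up to the nearest power of two, which is the
// shared-memory size the bitonic argsort kernel works over.
func nextPowerOfTwo(n int) int {
	p := 1
	for p < n {
		p *= 2
	}
	return p
}

func main() {
	for _, n := range []int{1, 3, 500, 1024, 1025} {
		fmt.Printf("ncols=%4d -> ncols_pad=%4d\n", n, nextPowerOfTwo(n))
	}
}
```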
|
||||
diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
|
||||
index 2d46176e..47383486 100644
|
||||
--- a/ggml/src/ggml-cuda/cpy.cu
|
||||
+++ b/ggml/src/ggml-cuda/cpy.cu
|
||||
@@ -38,6 +38,13 @@ static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) {
|
||||
*dsti = *xi;
|
||||
}
|
||||
|
||||
+static __device__ void cpy_1_i32_i32(const char * cxi, char * cdsti) {
|
||||
+ const int32_t * xi = (const int32_t *) cxi;
|
||||
+ int32_t * dsti = (int32_t *) cdsti;
|
||||
+
|
||||
+ *dsti = *xi;
|
||||
+}
|
||||
+
|
||||
template <cpy_kernel_t cpy_1>
|
||||
static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const int ne,
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
@@ -68,6 +75,44 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const in
|
||||
cpy_1(cx + x_offset, cdst + dst_offset);
|
||||
}
|
||||
|
||||
+// First, add this template function after the other template functions
|
||||
+template <cpy_kernel_t cpy_1>
|
||||
+static __global__ void cpy_i32_i32(const char * cx, char * cdst, const int ne,
|
||||
+ const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
||||
+ const int nb12, const int nb13) {
|
||||
+ const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
+
|
||||
+ if (i >= ne) {
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ const int64_t i03 = i/(ne00 * ne01 * ne02);
|
||||
+ const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
|
||||
+ const int64_t i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
|
||||
+ const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
|
||||
+ const int64_t x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
|
||||
+
|
||||
+ const int64_t i13 = i/(ne10 * ne11 * ne12);
|
||||
+ const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
|
||||
+ const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
|
||||
+ const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
|
||||
+ const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13;
|
||||
+
|
||||
+ cpy_1(cx + x_offset, cdst + dst_offset);
|
||||
+}
|
||||
+
|
||||
+// Then modify the ggml_cpy_i32_i32_cuda function to use the new template
|
||||
+static void ggml_cpy_i32_i32_cuda(
|
||||
+ const char * cx, char * cdst, const int ne,
|
||||
+ const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int graph_cpynode_index) {
|
||||
+
|
||||
+ const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
|
||||
+ cpy_i32_i32<cpy_1_i32_i32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
|
||||
+ (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
+}
|
||||
+
|
||||
static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
|
||||
const float * xi = (const float *) cxi;
|
||||
block_q8_0 * dsti = (block_q8_0 *) cdsti;
|
||||
@@ -631,6 +676,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
|
||||
ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
+ } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
|
||||
+ ggml_cpy_i32_i32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
} else {
|
||||
GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
|
||||
ggml_type_name(src0->type), ggml_type_name(src1->type));
|
||||
@@ -686,6 +733,8 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
|
||||
return (void*) cpy_f32_f16<cpy_1_f32_f16>;
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
|
||||
return (void*) cpy_f32_f16<cpy_1_f16_f32>;
|
||||
+ } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
|
||||
+ return (void*) cpy_i32_i32<cpy_1_i32_i32>;
|
||||
} else {
|
||||
GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
|
||||
ggml_type_name(src0->type), ggml_type_name(src1->type));
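The copy kernel above recovers four tensor coordinates from a flat element index and then applies each tensor's byte strides, so source and destination can have different layouts. A hedged Go sketch of that index arithmetic (the ne0..ne2 naming follows the kernel's ne00/ne01/ne02 convention; the example shapes are made up):

```go
package main

import "fmt"

// unflatten splits a flat element index i into (i0, i1, i2, i3) coordinates
// for a tensor with per-dimension extents ne0, ne1, ne2 (and an implicit
// outermost dimension), mirroring the arithmetic in the cpy kernel above.
func unflatten(i, ne0, ne1, ne2 int64) (i0, i1, i2, i3 int64) {
	i3 = i / (ne0 * ne1 * ne2)
	i2 = (i - i3*ne0*ne1*ne2) / (ne0 * ne1)
	i1 = (i - i3*ne0*ne1*ne2 - i2*ne0*ne1) / ne0
	i0 = i - i3*ne0*ne1*ne2 - i2*ne0*ne1 - i1*ne0
	return
}

func main() {
	i0, i1, i2, i3 := unflatten(27, 4, 3, 2)
	fmt.Println(i0, i1, i2, i3) // 3 0 0 1 for a 4x3x2xN layout
}
```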
llama/patches/0016-graph-memory-reporting-on-failure.patch (new file, 156 lines)
@@ -0,0 +1,156 @@
|
||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||
From: Jesse Gross <jesse@ollama.com>
|
||||
Date: Fri, 18 Apr 2025 15:58:19 -0700
|
||||
Subject: [PATCH] graph memory reporting on failure
|
||||
|
||||
---
|
||||
ggml/include/ggml-alloc.h | 6 ++++++
|
||||
ggml/include/ggml-backend.h | 6 ++++++
|
||||
ggml/src/ggml-alloc.c | 38 +++++++++++++++++++++++++++++++++----
|
||||
ggml/src/ggml-backend.cpp | 10 ++++++++++
|
||||
4 files changed, 56 insertions(+), 4 deletions(-)
|
||||
|
||||
diff --git a/ggml/include/ggml-alloc.h b/ggml/include/ggml-alloc.h
|
||||
index 2cb150fd..781b1e10 100644
|
||||
--- a/ggml/include/ggml-alloc.h
|
||||
+++ b/ggml/include/ggml-alloc.h
|
||||
@@ -66,6 +66,12 @@ GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph
|
||||
|
||||
GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);
|
||||
|
||||
+struct ggml_allocr_buffer_status {
|
||||
+ size_t size;
|
||||
+ bool allocated;
|
||||
+};
|
||||
+GGML_API struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id);
|
||||
+
|
||||
// Utils
|
||||
// Create a buffer and allocate all the tensors in a ggml_context
|
||||
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
|
||||
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
|
||||
index 778927f6..74e46716 100644
|
||||
--- a/ggml/include/ggml-backend.h
|
||||
+++ b/ggml/include/ggml-backend.h
|
||||
@@ -304,6 +304,12 @@ extern "C" {
|
||||
|
||||
GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
|
||||
|
||||
+ struct ggml_backend_buffer_status {
|
||||
+ size_t size;
|
||||
+ bool allocated;
|
||||
+ };
|
||||
+ GGML_API struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
|
||||
+
|
||||
GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
|
||||
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
|
||||
|
||||
diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
|
||||
index 5fd379f6..04812990 100644
|
||||
--- a/ggml/src/ggml-alloc.c
|
||||
+++ b/ggml/src/ggml-alloc.c
|
||||
@@ -364,6 +364,7 @@ struct node_alloc {
|
||||
struct ggml_gallocr {
|
||||
ggml_backend_buffer_type_t * bufts; // [n_buffers]
|
||||
ggml_backend_buffer_t * buffers; // [n_buffers]
|
||||
+ size_t *buffer_sizes; // [n_buffers]
|
||||
struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
|
||||
int n_buffers;
|
||||
|
||||
@@ -387,6 +388,9 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
|
||||
galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
|
||||
GGML_ASSERT(galloc->buffers != NULL);
|
||||
|
||||
+ galloc->buffer_sizes = calloc(n_bufs, sizeof(size_t));
|
||||
+ GGML_ASSERT(galloc->buffer_sizes != NULL);
|
||||
+
|
||||
galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
|
||||
GGML_ASSERT(galloc->buf_tallocs != NULL);
|
||||
|
||||
@@ -453,6 +457,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
|
||||
ggml_hash_set_free(&galloc->hash_set);
|
||||
free(galloc->hash_values);
|
||||
free(galloc->bufts);
|
||||
+ free(galloc->buffer_sizes);
|
||||
free(galloc->buffers);
|
||||
free(galloc->buf_tallocs);
|
||||
free(galloc->node_allocs);
|
||||
@@ -748,6 +753,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
||||
}
|
||||
}
|
||||
|
||||
+ bool success = true;
|
||||
+
|
||||
// reallocate buffers if needed
|
||||
for (int i = 0; i < galloc->n_buffers; i++) {
|
||||
// if the buffer type is used multiple times, we reuse the same buffer
|
||||
@@ -769,15 +776,20 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
||||
|
||||
ggml_backend_buffer_free(galloc->buffers[i]);
|
||||
galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
|
||||
- if (galloc->buffers[i] == NULL) {
|
||||
+ if (galloc->buffers[i]) {
|
||||
+ galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
|
||||
+ ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
|
||||
+ } else {
|
||||
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
|
||||
- return false;
|
||||
+ galloc->buffer_sizes[i] = new_size;
|
||||
+ success = false;
|
||||
}
|
||||
- ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
|
||||
+ } else {
|
||||
+ galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
|
||||
}
|
||||
}
|
||||
|
||||
- return true;
|
||||
+ return success;
|
||||
}
|
||||
|
||||
bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
|
||||
@@ -934,6 +946,24 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
|
||||
return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
|
||||
}
|
||||
|
||||
+struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
|
||||
+ GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers);
|
||||
+
|
||||
+ for (int i = 0; i < buffer_id; i++) {
|
||||
+ if (galloc->buf_tallocs[i] == galloc->buf_tallocs[buffer_id]) {
|
||||
+ // This buffer is the same as a previous one due to the same buffer type being used multiple times
|
||||
+ // (See above.) However, we need a different check because multiple buffers might be NULL in our
|
||||
+ // case and we still want to know the attempted size.
|
||||
+
|
||||
+ struct ggml_allocr_buffer_status status = {0, true};
|
||||
+ return status;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ struct ggml_allocr_buffer_status status = {galloc->buffer_sizes[buffer_id], galloc->buffers[buffer_id] != NULL};
|
||||
+ return status;
|
||||
+}
|
||||
+
|
||||
// utils
|
||||
|
||||
static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
|
||||
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
|
||||
index 0ce73a99..be335e8c 100644
|
||||
--- a/ggml/src/ggml-backend.cpp
|
||||
+++ b/ggml/src/ggml-backend.cpp
|
||||
@@ -1629,6 +1629,16 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
|
||||
return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
|
||||
}
|
||||
|
||||
+struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
|
||||
+ int backend_index = ggml_backend_sched_backend_id(sched, backend);
|
||||
+ GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
||||
+
|
||||
+ struct ggml_allocr_buffer_status allocr_status = ggml_gallocr_get_attempted_buffer_size(sched->galloc, backend_index);
|
||||
+ struct ggml_backend_buffer_status status = {allocr_status.size, allocr_status.allocated};
|
||||
+
|
||||
+ return status;
|
||||
+}
|
||||
+
|
||||
void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
|
||||
int backend_index = ggml_backend_sched_backend_id(sched, backend);
|
||||
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
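The patch above keeps going after a failed buffer allocation and records the size that was attempted, so callers can see exactly how much memory each device would have needed. A hedged Go sketch of how a caller might consume such a size/allocated pair (the type and field names here are illustrative, not the C API):

```go
package main

import "fmt"

// bufferStatus mirrors the pair the patch exposes: the size that was
// attempted, plus whether the allocation actually succeeded.
type bufferStatus struct {
	size      uint64
	allocated bool
}

// report summarizes allocations so a caller can decide how much work to move
// off a device whose buffer could not be allocated.
func report(statuses map[string]bufferStatus) {
	for dev, st := range statuses {
		if st.allocated {
			fmt.Printf("%s: allocated %d bytes\n", dev, st.size)
		} else {
			fmt.Printf("%s: FAILED, would have needed %d bytes\n", dev, st.size)
		}
	}
}

func main() {
	report(map[string]bufferStatus{
		"CUDA0": {size: 8 << 30, allocated: false},
		"CPU":   {size: 2 << 30, allocated: true},
	})
}
```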
llama/patches/0017-ggml-Export-GPU-UUIDs.patch (new file, 102 lines)
@@ -0,0 +1,102 @@
|
||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||
From: Jesse Gross <jesse@ollama.com>
|
||||
Date: Thu, 24 Apr 2025 14:48:51 -0700
|
||||
Subject: [PATCH] ggml: Export GPU UUIDs
|
||||
|
||||
This enables matching up devices and information reported by the backend
|
||||
with tools (e.g. nvidia-smi) and system management libraries (e.g. nvml).
|
||||
---
|
||||
ggml/include/ggml-backend.h | 1 +
|
||||
ggml/src/ggml-cuda/ggml-cuda.cu | 33 ++++++++++++++++++++++++++++++++
|
||||
ggml/src/ggml-metal/ggml-metal.m | 1 +
|
||||
3 files changed, 35 insertions(+)
|
||||
|
||||
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
|
||||
index 74e46716..a880df33 100644
|
||||
--- a/ggml/include/ggml-backend.h
|
||||
+++ b/ggml/include/ggml-backend.h
|
||||
@@ -152,6 +152,7 @@ extern "C" {
|
||||
struct ggml_backend_dev_props {
|
||||
const char * name;
|
||||
const char * description;
|
||||
+ const char * uuid;
|
||||
size_t memory_free;
|
||||
size_t memory_total;
|
||||
enum ggml_backend_dev_type type;
|
||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
index cb0d8528..4c829153 100644
|
||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
@@ -2884,6 +2884,7 @@ struct ggml_backend_cuda_device_context {
|
||||
int device;
|
||||
std::string name;
|
||||
std::string description;
|
||||
+ std::string uuid;
|
||||
};
|
||||
|
||||
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
|
||||
@@ -2896,6 +2897,11 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t
|
||||
return ctx->description.c_str();
|
||||
}
|
||||
|
||||
+static const char * ggml_backend_cuda_device_get_uuid(ggml_backend_dev_t dev) {
|
||||
+ ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
|
||||
+ return ctx->uuid.c_str();
|
||||
+}
|
||||
+
|
||||
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
|
||||
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
|
||||
ggml_cuda_set_device(ctx->device);
|
||||
@@ -2910,6 +2916,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
|
||||
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
|
||||
props->name = ggml_backend_cuda_device_get_name(dev);
|
||||
props->description = ggml_backend_cuda_device_get_description(dev);
|
||||
+ props->uuid = ggml_backend_cuda_device_get_uuid(dev);
|
||||
props->type = ggml_backend_cuda_device_get_type(dev);
|
||||
ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
||||
|
||||
@@ -3458,6 +3465,32 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
|
||||
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
|
||||
dev_ctx->description = prop.name;
|
||||
|
||||
+ #if !defined(GGML_USE_HIP)
|
||||
+ char uuid[64];
|
||||
+ snprintf(uuid, sizeof(uuid),
|
||||
+ "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
|
||||
+ (unsigned char)prop.uuid.bytes[0],
|
||||
+ (unsigned char)prop.uuid.bytes[1],
|
||||
+ (unsigned char)prop.uuid.bytes[2],
|
||||
+ (unsigned char)prop.uuid.bytes[3],
|
||||
+ (unsigned char)prop.uuid.bytes[4],
|
||||
+ (unsigned char)prop.uuid.bytes[5],
|
||||
+ (unsigned char)prop.uuid.bytes[6],
|
||||
+ (unsigned char)prop.uuid.bytes[7],
|
||||
+ (unsigned char)prop.uuid.bytes[8],
|
||||
+ (unsigned char)prop.uuid.bytes[9],
|
||||
+ (unsigned char)prop.uuid.bytes[10],
|
||||
+ (unsigned char)prop.uuid.bytes[11],
|
||||
+ (unsigned char)prop.uuid.bytes[12],
|
||||
+ (unsigned char)prop.uuid.bytes[13],
|
||||
+ (unsigned char)prop.uuid.bytes[14],
|
||||
+ (unsigned char)prop.uuid.bytes[15]
|
||||
+ );
|
||||
+ dev_ctx->uuid = uuid;
|
||||
+ #else
|
||||
+ dev_ctx->uuid = "GPU-" + std::string(prop.uuid.bytes, 16);
|
||||
+ #endif
|
||||
+
|
||||
ggml_backend_dev_t dev = new ggml_backend_device {
|
||||
/* .iface = */ ggml_backend_cuda_device_interface,
|
||||
/* .reg = */ ®,
|
||||
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
|
||||
index 1b56f858..ee4f2dcb 100644
|
||||
--- a/ggml/src/ggml-metal/ggml-metal.m
|
||||
+++ b/ggml/src/ggml-metal/ggml-metal.m
|
||||
@@ -5703,6 +5703,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
|
||||
static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
|
||||
props->name = ggml_backend_metal_device_get_name(dev);
|
||||
props->description = ggml_backend_metal_device_get_description(dev);
|
||||
+ props->uuid = "0";
|
||||
props->type = ggml_backend_metal_device_get_type(dev);
|
||||
ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
||||
props->caps = (struct ggml_backend_dev_caps) {
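The CUDA branch above renders the 16 raw UUID bytes in the canonical `GPU-xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx` form that nvidia-smi and NVML report, so devices can be matched by simple string comparison. A rough Go equivalent of that formatting (formatGPUUUID is an illustrative helper, not part of the patch):

```go
package main

import "fmt"

// formatGPUUUID renders 16 raw bytes in the GPU-8-4-4-4-12 hex grouping used
// by nvidia-smi, the same layout the snprintf call above produces.
func formatGPUUUID(b [16]byte) string {
	return fmt.Sprintf("GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
		b[0], b[1], b[2], b[3], b[4], b[5], b[6], b[7],
		b[8], b[9], b[10], b[11], b[12], b[13], b[14], b[15])
}

func main() {
	fmt.Println(formatGPUUUID([16]byte{0xde, 0xad, 0xbe, 0xef}))
	// GPU-deadbeef-0000-0000-0000-000000000000
}
```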
|
||||
@@ -1,12 +1,9 @@
|
||||
package llm
|
||||
|
||||
import (
|
||||
"cmp"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"maps"
|
||||
"os"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
@@ -85,8 +82,11 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
|
||||
var graphOffload uint64
|
||||
|
||||
// Projectors loaded into GPU0 only
|
||||
var projectorWeights uint64
|
||||
var projectorGraph uint64
|
||||
var llamaEngineProjectorWeights uint64
|
||||
|
||||
// Projectors loaded with output layer
|
||||
var ollamaEngineProjectorWeights uint64
|
||||
var ollamaEngineProjectorGraph uint64
|
||||
|
||||
// Conditional output size on GPU 0
|
||||
var memoryLayerOutput uint64
|
||||
@@ -111,21 +111,23 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
|
||||
slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList)
|
||||
|
||||
for _, projector := range projectors {
|
||||
weight := projectorMemoryRequirements(projector)
|
||||
projectorWeights += weight
|
||||
llamaEngineProjectorWeights += projectorMemoryRequirements(projector)
|
||||
|
||||
// multimodal models require at least 2048 context
|
||||
opts.NumCtx = max(opts.NumCtx, 2048)
|
||||
}
|
||||
if projectorWeights == 0 && projectorGraph == 0 {
|
||||
projectorWeights, projectorGraph = f.VisionGraphSize()
|
||||
if llamaEngineProjectorWeights == 0 {
|
||||
ollamaEngineProjectorWeights, ollamaEngineProjectorGraph = f.VisionGraphSize()
|
||||
opts.NumCtx = max(opts.NumCtx, 2048)
|
||||
}
|
||||
|
||||
layers := f.Tensors().GroupLayers()
|
||||
// add one layer (chosing the max layer) worth of memory as a buffer
|
||||
layerSize = slices.MaxFunc(slices.Collect(maps.Values(layers)), func(a, b ggml.Layer) int {
|
||||
return cmp.Compare(a.Size(), b.Size())
|
||||
}).Size()
|
||||
// add one layer worth of memory as a buffer
|
||||
if blk0, ok := layers["blk.0"]; ok {
|
||||
layerSize = blk0.Size()
|
||||
} else {
|
||||
slog.Warn("model missing blk.0 layer size")
|
||||
}
|
||||
|
||||
var kvct string
|
||||
if envconfig.FlashAttention() &&
|
||||
@@ -163,6 +165,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
|
||||
graphFullOffload = graphPartialOffload
|
||||
}
|
||||
|
||||
// Output layer handled at the end if we have space
|
||||
if layer, ok := layers["output_norm"]; ok {
|
||||
memoryLayerOutput += layer.Size()
|
||||
}
|
||||
@@ -172,8 +175,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
|
||||
memoryLayerOutput += layer.Size()
|
||||
}
|
||||
|
||||
// Output layer handled at the end if we have space
|
||||
gpuZeroOverhead := projectorWeights + projectorGraph
|
||||
gpuZeroOverhead := llamaEngineProjectorWeights
|
||||
|
||||
// Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
|
||||
var layerCount int
|
||||
@@ -216,6 +218,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
|
||||
if len(gpusWithSpace) > 0 {
|
||||
gpuZeroID = gpusWithSpace[0].i
|
||||
gpuAllocations[gpuZeroID] += gpuZeroOverhead
|
||||
} else {
|
||||
overflow += gpuZeroOverhead
|
||||
}
|
||||
|
||||
// For all the layers, find where they can fit on the GPU(s)
|
||||
@@ -256,21 +260,24 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
|
||||
}
|
||||
|
||||
// Determine if we need to consider output then find where it fits
|
||||
if memoryLayerOutput > 0 && (opts.NumGPU < 0 || layerCount < opts.NumGPU) {
|
||||
for j := len(gpusWithSpace); j > 0; j-- {
|
||||
g := gpusWithSpace[layerCount%j]
|
||||
used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
|
||||
if g.g.FreeMemory > overhead+used+memoryLayerOutput {
|
||||
gpuAllocations[g.i] += memoryLayerOutput
|
||||
layerCounts[g.i]++
|
||||
layerCount++
|
||||
break
|
||||
memoryLastLayer := memoryLayerOutput + ollamaEngineProjectorWeights + ollamaEngineProjectorGraph
|
||||
if memoryLastLayer > 0 {
|
||||
if opts.NumGPU < 0 || layerCount < opts.NumGPU {
|
||||
for j := len(gpusWithSpace); j > 0; j-- {
|
||||
g := gpusWithSpace[layerCount%j]
|
||||
used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
|
||||
if g.g.FreeMemory > overhead+used+memoryLastLayer {
|
||||
gpuAllocations[g.i] += memoryLastLayer
|
||||
layerCounts[g.i]++
|
||||
layerCount++
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if layerCount < int(f.KV().BlockCount())+1 {
|
||||
fullyLoaded = false
|
||||
overflow += memoryLayerOutput
|
||||
overflow += memoryLastLayer
|
||||
}
|
||||
}
|
||||
|
||||
@@ -328,8 +335,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
|
||||
memoryLayerOutput: memoryLayerOutput,
|
||||
graphFullOffload: graphFullOffload,
|
||||
graphPartialOffload: graphPartialOffload,
|
||||
projectorWeights: projectorWeights,
|
||||
projectorGraph: projectorGraph,
|
||||
projectorWeights: llamaEngineProjectorWeights + ollamaEngineProjectorWeights,
|
||||
projectorGraph: ollamaEngineProjectorGraph,
|
||||
}
|
||||
|
||||
if gpus[0].Library == "cpu" {
|
||||
@@ -415,7 +422,7 @@ func projectorMemoryRequirements(filename string) (weights uint64) {
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
ggml, _, err := ggml.Decode(file, 1024)
|
||||
ggml, err := ggml.Decode(file, 1024)
|
||||
if err != nil {
|
||||
return 0
|
||||
}
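The estimate above now sizes the repeating-layer buffer from blk.0 and folds the ollama-engine projector into the memory placed alongside the output layer. A deliberately simplified Go sketch of the placement idea, using a first-fit rule rather than the estimator's real rotation across GPUs and graph-overhead accounting (all names and sizes are illustrative):

```go
package main

import "fmt"

// placeLayers assigns layer sizes to GPUs greedily: each layer lands on the
// first GPU whose remaining free memory covers it; anything left over is
// counted as overflow that stays on the CPU.
func placeLayers(free []uint64, layers []uint64) (perGPU []int, overflow uint64) {
	perGPU = make([]int, len(free))
	remaining := append([]uint64(nil), free...)
	for _, size := range layers {
		placed := false
		for i := range remaining {
			if remaining[i] >= size {
				remaining[i] -= size
				perGPU[i]++
				placed = true
				break
			}
		}
		if !placed {
			overflow += size
		}
	}
	return perGPU, overflow
}

func main() {
	counts, overflow := placeLayers(
		[]uint64{6 << 30, 4 << 30},
		[]uint64{2 << 30, 2 << 30, 2 << 30, 2 << 30, 2 << 30})
	fmt.Println(counts, overflow) // [3 2] 0
}
```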
|
||||
|
||||
@@ -22,6 +22,7 @@ import (
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
"unicode/utf8"
|
||||
|
||||
"golang.org/x/sync/semaphore"
|
||||
|
||||
@@ -121,7 +122,7 @@ func LoadModel(model string, maxArraySize int) (*ggml.GGML, error) {
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
ggml, _, err := ggml.Decode(f, maxArraySize)
|
||||
ggml, err := ggml.Decode(f, maxArraySize)
|
||||
return ggml, err
|
||||
}
|
||||
|
||||
@@ -725,10 +726,68 @@ type CompletionResponse struct {
|
||||
EvalDuration time.Duration `json:"eval_duration"`
|
||||
}
|
||||
|
||||
// unicodeBufferHandler wraps a completion response callback to handle partial UTF-8 sequences.
|
||||
// This function creates a stateful closure that is NOT safe for concurrent use.
|
||||
// Each completion request should create its own handler instance.
|
||||
func unicodeBufferHandler(fn func(CompletionResponse)) func(CompletionResponse) {
|
||||
var pendingUTF8 string
|
||||
|
||||
return func(resp CompletionResponse) {
|
||||
if resp.Content == "" && !resp.Done {
|
||||
// No content to process, just pass through
|
||||
fn(resp)
|
||||
return
|
||||
}
|
||||
|
||||
// Combine any pending UTF-8 with current content
|
||||
combinedContent := pendingUTF8 + resp.Content
|
||||
pendingUTF8 = ""
|
||||
|
||||
// Check if combined content is valid UTF-8
|
||||
if utf8.ValidString(combinedContent) {
|
||||
// Valid UTF-8, send it
|
||||
resp.Content = combinedContent
|
||||
fn(resp)
|
||||
} else {
|
||||
// Invalid UTF-8
|
||||
if resp.Done {
|
||||
// This is the final response, trim incomplete UTF-8
|
||||
trimmedContent := combinedContent
|
||||
for !utf8.ValidString(trimmedContent) && len(trimmedContent) > 0 {
|
||||
trimmedContent = trimmedContent[:len(trimmedContent)-1]
|
||||
}
|
||||
resp.Content = trimmedContent
|
||||
fn(resp)
|
||||
} else {
|
||||
// Not final response, split valid and invalid parts
|
||||
validPrefix := combinedContent
|
||||
for !utf8.ValidString(validPrefix) && len(validPrefix) > 0 {
|
||||
validPrefix = validPrefix[:len(validPrefix)-1]
|
||||
}
|
||||
|
||||
if len(validPrefix) > 0 {
|
||||
// Send valid prefix
|
||||
resp.Content = validPrefix
|
||||
fn(resp)
|
||||
// Buffer the remainder
|
||||
pendingUTF8 = combinedContent[len(validPrefix):]
|
||||
} else {
|
||||
// No valid prefix, buffer everything
|
||||
pendingUTF8 = combinedContent
|
||||
// Don't send this response
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
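To make the buffering behaviour concrete, here is a hedged, standalone sketch of the same idea: a wrapper that holds back bytes until they form valid UTF-8, driven with an emoji split across two chunks. bufferPartialUTF8 is an illustrative re-implementation, not the function above, and it omits the empty-content pass-through case.

```go
package main

import (
	"fmt"
	"unicode/utf8"
)

// bufferPartialUTF8 wraps fn so that chunks ending in an incomplete UTF-8
// sequence are held back and prepended to the next chunk, mirroring what the
// handler above does for streamed completion content.
func bufferPartialUTF8(fn func(string)) func(string, bool) {
	var pending string
	return func(chunk string, done bool) {
		s := pending + chunk
		pending = ""
		if utf8.ValidString(s) || done {
			// On the final chunk, trim any trailing partial sequence.
			for done && !utf8.ValidString(s) && len(s) > 0 {
				s = s[:len(s)-1]
			}
			fn(s)
			return
		}
		// Hold back the trailing invalid bytes, emit the valid prefix.
		valid := s
		for !utf8.ValidString(valid) && len(valid) > 0 {
			valid = valid[:len(valid)-1]
		}
		pending = s[len(valid):]
		if valid != "" {
			fn(valid)
		}
	}
}

func main() {
	emit := bufferPartialUTF8(func(s string) { fmt.Printf("%q\n", s) })
	emit("Hello \xf0\x9f", false) // "Hello " (partial emoji buffered)
	emit("\x98\x80!", true)       // "😀!"
}
```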
|
||||
|
||||
func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error {
|
||||
slog.Debug("completion request", "images", len(req.Images), "prompt", len(req.Prompt), "format", string(req.Format))
|
||||
slog.Log(ctx, logutil.LevelTrace, "completion request", "prompt", req.Prompt)
|
||||
|
||||
// Wrap the callback with unicode buffer handling
|
||||
unicodeFn := unicodeBufferHandler(fn)
|
||||
|
||||
if len(req.Format) > 0 {
|
||||
switch string(req.Format) {
|
||||
case `null`, `""`:
|
||||
@@ -797,7 +856,8 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
|
||||
|
||||
res, err := http.DefaultClient.Do(serverReq)
|
||||
if err != nil {
|
||||
return fmt.Errorf("POST predict: %v", err)
|
||||
slog.Error("post predict", "error", err)
|
||||
return errors.New("model runner has unexpectedly stopped, this may be due to resource limitations or an internal error, check ollama server logs for details")
|
||||
}
|
||||
defer res.Body.Close()
|
||||
|
||||
@@ -853,13 +913,13 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
|
||||
}
|
||||
|
||||
if c.Content != "" {
|
||||
fn(CompletionResponse{
|
||||
unicodeFn(CompletionResponse{
|
||||
Content: c.Content,
|
||||
})
|
||||
}
|
||||
|
||||
if c.Done {
|
||||
fn(c)
|
||||
unicodeFn(c)
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
@@ -70,3 +70,152 @@ func TestLLMServerCompletionFormat(t *testing.T) {
|
||||
}, nil)
|
||||
checkValid(err)
|
||||
}
|
||||
|
||||
func TestUnicodeBufferHandler(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
inputResponses []CompletionResponse
|
||||
expectedResponses []CompletionResponse
|
||||
description string
|
||||
}{
|
||||
{
|
||||
name: "complete_unicode",
|
||||
inputResponses: []CompletionResponse{
|
||||
{Content: "Hello", Done: false},
|
||||
{Content: " world", Done: false},
|
||||
{Content: "!", Done: true},
|
||||
},
|
||||
expectedResponses: []CompletionResponse{
|
||||
{Content: "Hello", Done: false},
|
||||
{Content: " world", Done: false},
|
||||
{Content: "!", Done: true},
|
||||
},
|
||||
description: "All responses with valid unicode should pass through unchanged",
|
||||
},
|
||||
{
|
||||
name: "incomplete_unicode_at_end_with_done",
|
||||
inputResponses: []CompletionResponse{
|
||||
{Content: "Hello", Done: false},
|
||||
{Content: string([]byte{0xF0, 0x9F}), Done: true}, // Incomplete emoji with Done=true
|
||||
},
|
||||
expectedResponses: []CompletionResponse{
|
||||
{Content: "Hello", Done: false},
|
||||
{Content: "", Done: true}, // Content is trimmed but response is still sent with Done=true
|
||||
},
|
||||
description: "When Done=true, incomplete Unicode at the end should be trimmed",
|
||||
},
|
||||
{
|
||||
name: "split_unicode_across_responses",
|
||||
inputResponses: []CompletionResponse{
|
||||
{Content: "Hello " + string([]byte{0xF0, 0x9F}), Done: false}, // First part of 😀
|
||||
{Content: string([]byte{0x98, 0x80}) + " world!", Done: true}, // Second part of 😀 and more text
|
||||
},
|
||||
expectedResponses: []CompletionResponse{
|
||||
{Content: "Hello ", Done: false}, // Incomplete Unicode trimmed
|
||||
{Content: "😀 world!", Done: true}, // Complete emoji in second response
|
||||
},
|
||||
description: "Unicode split across responses should be handled correctly",
|
||||
},
|
||||
{
|
||||
name: "incomplete_unicode_buffered",
|
||||
inputResponses: []CompletionResponse{
|
||||
{Content: "Test " + string([]byte{0xF0, 0x9F}), Done: false}, // Incomplete emoji
|
||||
{Content: string([]byte{0x98, 0x80}), Done: false}, // Complete the emoji
|
||||
{Content: " done", Done: true},
|
||||
},
|
||||
expectedResponses: []CompletionResponse{
|
||||
{Content: "Test ", Done: false}, // First part without incomplete unicode
|
||||
{Content: "😀", Done: false}, // Complete emoji
|
||||
{Content: " done", Done: true},
|
||||
},
|
||||
description: "Incomplete unicode should be buffered and combined with next response",
|
||||
},
|
||||
{
|
||||
name: "empty_response_with_done",
|
||||
inputResponses: []CompletionResponse{
|
||||
{Content: "Complete response", Done: false},
|
||||
{Content: "", Done: true}, // Empty response with Done=true
|
||||
},
|
||||
expectedResponses: []CompletionResponse{
|
||||
{Content: "Complete response", Done: false},
|
||||
{Content: "", Done: true}, // Should still be sent because Done=true
|
||||
},
|
||||
description: "Empty final response with Done=true should still be sent",
|
||||
},
|
||||
{
|
||||
name: "done_reason_preserved",
|
||||
inputResponses: []CompletionResponse{
|
||||
{Content: "Response", Done: false},
|
||||
{Content: " complete", Done: true, DoneReason: DoneReasonStop},
|
||||
},
|
||||
expectedResponses: []CompletionResponse{
|
||||
{Content: "Response", Done: false},
|
||||
{Content: " complete", Done: true, DoneReason: DoneReasonStop},
|
||||
},
|
||||
description: "DoneReason should be preserved in the final response",
|
||||
},
|
||||
{
|
||||
name: "only_incomplete_unicode_not_done",
|
||||
inputResponses: []CompletionResponse{
|
||||
{Content: string([]byte{0xF0, 0x9F}), Done: false}, // Only incomplete unicode
|
||||
},
|
||||
expectedResponses: []CompletionResponse{
|
||||
// No response expected - should be buffered
|
||||
},
|
||||
description: "Response with only incomplete unicode should be buffered if not done",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
var actualResponses []CompletionResponse
|
||||
|
||||
// Create a callback that collects responses
|
||||
callback := func(resp CompletionResponse) {
|
||||
actualResponses = append(actualResponses, resp)
|
||||
}
|
||||
|
||||
// Create the unicode buffer handler
|
||||
handler := unicodeBufferHandler(callback)
|
||||
|
||||
// Send all input responses through the handler
|
||||
for _, resp := range tt.inputResponses {
|
||||
handler(resp)
|
||||
}
|
||||
|
||||
// Verify the number of responses
|
||||
if len(actualResponses) != len(tt.expectedResponses) {
|
||||
t.Fatalf("%s: got %d responses, want %d responses",
|
||||
tt.description, len(actualResponses), len(tt.expectedResponses))
|
||||
}
|
||||
|
||||
// Verify each response matches the expected one
|
||||
for i, expected := range tt.expectedResponses {
|
||||
if i >= len(actualResponses) {
|
||||
t.Fatalf("%s: missing response at index %d", tt.description, i)
|
||||
continue
|
||||
}
|
||||
|
||||
actual := actualResponses[i]
|
||||
|
||||
// Verify content
|
||||
if actual.Content != expected.Content {
|
||||
t.Errorf("%s: response[%d].Content = %q, want %q",
|
||||
tt.description, i, actual.Content, expected.Content)
|
||||
}
|
||||
|
||||
// Verify Done flag
|
||||
if actual.Done != expected.Done {
|
||||
t.Errorf("%s: response[%d].Done = %v, want %v",
|
||||
tt.description, i, actual.Done, expected.Done)
|
||||
}
|
||||
|
||||
// Verify DoneReason if specified
|
||||
if actual.DoneReason != expected.DoneReason {
|
||||
t.Errorf("%s: response[%d].DoneReason = %v, want %v",
|
||||
tt.description, i, actual.DoneReason, expected.DoneReason)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
ml/backend.go (151 lines changed)
@@ -5,8 +5,8 @@ import (
|
||||
"context"
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"math"
|
||||
"os"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
@@ -15,6 +15,11 @@ import (
|
||||
)
|
||||
|
||||
type Backend interface {
|
||||
Load(ctx context.Context, progress func(float32)) error
|
||||
|
||||
// BackendMemory returns the memory allocations that were made for this model
|
||||
BackendMemory() BackendMemory
|
||||
|
||||
Config() fs.Config
|
||||
Get(name string) Tensor
|
||||
NewContext() Context
|
||||
@@ -52,10 +57,6 @@ type CacheConfig struct {
|
||||
|
||||
// BackendParams controls how the backend loads and executes models
|
||||
type BackendParams struct {
|
||||
// Progress is a callback function that allows reporting percentage completion
|
||||
// of model loading
|
||||
Progress func(float32)
|
||||
|
||||
// NumThreads sets the number of threads to use if running on the CPU
|
||||
NumThreads int
|
||||
|
||||
@@ -72,9 +73,130 @@ type BackendParams struct {
|
||||
FlashAttention bool
|
||||
}
|
||||
|
||||
var backends = make(map[string]func(context.Context, *os.File, BackendParams) (Backend, error))
|
||||
// ErrNoMem is returned when panicing due to insufficient memory. It includes
|
||||
// the attempted memory allocation.
|
||||
type ErrNoMem struct {
|
||||
BackendMemory
|
||||
}
|
||||
|
||||
func RegisterBackend(name string, f func(context.Context, *os.File, BackendParams) (Backend, error)) {
|
||||
func (e ErrNoMem) Error() string {
|
||||
return fmt.Sprintf("insufficient memory - required allocations: %+v", e.BackendMemory)
|
||||
}
|
||||
|
||||
type AllocationStatus int
|
||||
|
||||
const (
|
||||
// Unallocated memory - have not yet attempted to allocate
|
||||
Unallocated AllocationStatus = iota
|
||||
|
||||
// Failed memory - tried to allocate the memory and did not succeed
|
||||
Failed
|
||||
|
||||
// Allocated memory = tried and succeeded to allocate memory
|
||||
Allocated
|
||||
)
|
||||
|
||||
// Memory is the size of an allocation and whether it was successful.
|
||||
type Memory struct {
|
||||
Size uint64
|
||||
Status AllocationStatus
|
||||
}
|
||||
|
||||
func (m Memory) String() string {
|
||||
s := fmt.Sprint(m.Size)
|
||||
|
||||
switch m.Status {
|
||||
case Unallocated:
|
||||
s += "U"
|
||||
case Failed:
|
||||
s += "F"
|
||||
case Allocated:
|
||||
s += "A"
|
||||
}
|
||||
|
||||
return s
|
||||
}
|
||||
|
||||
// DeviceMemory provides a breakdown of the memory needed
|
||||
// per device, such as a CPU or GPU.
|
||||
type DeviceMemory struct {
|
||||
// Name is the name of the device as labeled by the backend. It
|
||||
// may not be persistent across instances of the runner.
|
||||
Name string
|
||||
|
||||
// UUID is a unique persistent identifier for the device for matching
|
||||
// with system management libraries
|
||||
UUID string
|
||||
|
||||
// Weights is the per-layer memory needed for the model weights.
|
||||
Weights []Memory
|
||||
|
||||
// Cache is the per-layer memory needed for the KV cache.
|
||||
Cache []Memory
|
||||
|
||||
// Graph is the size of the compute graph. It is not per-layer.
|
||||
Graph Memory
|
||||
}
|
||||
|
||||
func memoryPresent(mem []Memory) bool {
|
||||
return slices.ContainsFunc(mem, func(m Memory) bool { return m.Size != 0 })
|
||||
}
|
||||
|
||||
func (m DeviceMemory) LogValue() slog.Value {
|
||||
var attrs []slog.Attr
|
||||
if memoryPresent(m.Weights) {
|
||||
attrs = append(attrs, slog.Any("Weights", m.Weights))
|
||||
}
|
||||
|
||||
if memoryPresent(m.Cache) {
|
||||
attrs = append(attrs, slog.Any("Cache", m.Cache))
|
||||
}
|
||||
|
||||
if m.Graph.Size != 0 {
|
||||
attrs = append(attrs, slog.Any("Graph", m.Graph))
|
||||
}
|
||||
|
||||
if len(attrs) > 0 && m.UUID != "" {
|
||||
attrs = append([]slog.Attr{slog.String("UUID", m.UUID)}, attrs...)
|
||||
}
|
||||
|
||||
return slog.GroupValue(attrs...)
|
||||
}
|
||||
|
||||
// BackendMemory provides the amount of memory required to load the model
|
||||
// per device based on the BackendParams. In some cases, not all required
|
||||
// allocations will be known at this point. However, the size of the most recent
|
||||
// allocation is guaranteed to be provided so that if it failed, the caller can
|
||||
// accommodate that to make forward progress.
|
||||
type BackendMemory struct {
|
||||
// InputsWeights are always located on the CPU and cannot be moved
|
||||
InputWeights Memory
|
||||
|
||||
// CPU model components are located in system memory. This does not
|
||||
// include unified memory allocated through the GPU.
|
||||
CPU DeviceMemory
|
||||
|
||||
// GPU model components are located on one or more GPUs.
|
||||
GPUs []DeviceMemory
|
||||
}
|
||||
|
||||
func (m BackendMemory) LogValue() slog.Value {
|
||||
var attrs []slog.Attr
|
||||
if m.InputWeights.Size != 0 {
|
||||
attrs = append(attrs, slog.Any("InputWeights", m.InputWeights))
|
||||
}
|
||||
|
||||
attrs = append(attrs, slog.Any(m.CPU.Name, m.CPU))
|
||||
for _, g := range m.GPUs {
|
||||
attrs = append(attrs, slog.Any(g.Name, g))
|
||||
}
|
||||
|
||||
return slog.GroupValue(attrs...)
|
||||
}
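With these types, the ggml backend reports allocation failures by panicking with ErrNoMem, which carries the attempted allocations. A hedged, self-contained sketch of how a loader might convert that panic back into an ordinary error (the errNoMem type below is a local stand-in for ml.ErrNoMem, and reserve stands in for the real backend call):

```go
package main

import "fmt"

// errNoMem is a local stand-in for the ml.ErrNoMem type above: a panic value
// that carries a description of the allocations that were attempted.
type errNoMem struct{ required string }

func (e errNoMem) Error() string { return "insufficient memory - " + e.required }

// reserve simulates a backend call that panics instead of returning an error,
// which is how the ggml backend now reports allocation failures.
func reserve() { panic(errNoMem{required: "CUDA0 Graph 512MiB"}) }

// tryReserve converts the panic back into an error so ordinary callers can
// react, for example by retrying with fewer GPU layers.
func tryReserve() (err error) {
	defer func() {
		if r := recover(); r != nil {
			if e, ok := r.(errNoMem); ok {
				err = e
				return
			}
			panic(r) // not a memory error, re-raise
		}
	}()
	reserve()
	return nil
}

func main() {
	fmt.Println(tryReserve()) // insufficient memory - CUDA0 Graph 512MiB
}
```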
|
||||
|
||||
var backends = make(map[string]func(string, BackendParams) (Backend, error))
|
||||
|
||||
func RegisterBackend(name string, f func(string, BackendParams) (Backend, error)) {
|
||||
if _, ok := backends[name]; ok {
|
||||
panic("backend: backend already registered")
|
||||
}
|
||||
@@ -82,9 +204,9 @@ func RegisterBackend(name string, f func(context.Context, *os.File, BackendParam
|
||||
backends[name] = f
|
||||
}
|
||||
|
||||
func NewBackend(ctx context.Context, f *os.File, params BackendParams) (Backend, error) {
|
||||
func NewBackend(modelPath string, params BackendParams) (Backend, error) {
|
||||
if backend, ok := backends["ggml"]; ok {
|
||||
return backend(ctx, f, params)
|
||||
return backend(modelPath, params)
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("unsupported backend")
|
||||
@@ -93,8 +215,8 @@ func NewBackend(ctx context.Context, f *os.File, params BackendParams) (Backend,
|
||||
type Context interface {
|
||||
Empty(dtype DType, shape ...int) Tensor
|
||||
Zeros(dtype DType, shape ...int) Tensor
|
||||
FromFloatSlice(s []float32, shape ...int) (Tensor, error)
|
||||
FromIntSlice(s []int32, shape ...int) (Tensor, error)
|
||||
FromFloatSlice(s []float32, shape ...int) Tensor
|
||||
FromIntSlice(s []int32, shape ...int) Tensor
|
||||
|
||||
// Arange creates a 1D tensor with values within an interval (start, stop] increased by step.
|
||||
Arange(start, stop, step float32, dtype DType) Tensor
|
||||
@@ -106,7 +228,7 @@ type Context interface {
|
||||
// graph, simply preallocates memory. Typically called with a
|
||||
// worst case graph to ensure all resources are available for
|
||||
// for future inference.
|
||||
Reserve() error
|
||||
Reserve()
|
||||
|
||||
MaxGraphNodes() int
|
||||
Close()
|
||||
@@ -132,6 +254,8 @@ type Tensor interface {
|
||||
Neg(ctx Context) Tensor
|
||||
Add(ctx Context, t2 Tensor) Tensor
|
||||
Mul(ctx Context, t2 Tensor) Tensor
|
||||
Div(ctx Context, t2 Tensor) Tensor
|
||||
|
||||
Mulmat(ctx Context, t2 Tensor) Tensor
|
||||
MulmatFullPrec(ctx Context, t2 Tensor) Tensor
|
||||
MulmatID(ctx Context, t2, ids Tensor) Tensor
|
||||
@@ -140,11 +264,11 @@ type Tensor interface {
|
||||
LayerNorm(ctx Context, weight, bias Tensor, eps float32) Tensor
|
||||
RMSNorm(ctx Context, weight Tensor, eps float32) Tensor
|
||||
Scale(ctx Context, s float64) Tensor
|
||||
SumRows(ctx Context) Tensor
|
||||
|
||||
AvgPool2D(ctx Context, k, s int, p float32) Tensor
|
||||
Conv2D(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
|
||||
|
||||
RoPE(ctx Context, positionIDs, ropeFactors Tensor, dim, ropeType uint32, base, scale float32) Tensor
|
||||
IM2Col(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
|
||||
|
||||
Sin(ctx Context) Tensor
|
||||
@@ -172,6 +296,7 @@ type Tensor interface {
|
||||
Duplicate(ctx Context) Tensor
|
||||
|
||||
TopK(ctx Context, k int) Tensor
|
||||
Argsort(ctx Context) Tensor
|
||||
}
|
||||
|
||||
// ScaledDotProductAttention implements a fused attention
|
||||
|
||||
@@ -10,7 +10,6 @@ import "C"
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
@@ -30,6 +29,7 @@ import (
|
||||
"github.com/ollama/ollama/logutil"
|
||||
"github.com/ollama/ollama/ml"
|
||||
ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
|
||||
"github.com/ollama/ollama/ml/nn/rope"
|
||||
"golang.org/x/sync/errgroup"
|
||||
)
|
||||
|
||||
@@ -44,8 +44,15 @@ func devices() []*C.struct_ggml_backend_device {
|
||||
}
|
||||
|
||||
type Backend struct {
|
||||
// modelPath is the location of the model data
|
||||
modelPath string
|
||||
|
||||
meta *fsggml.GGML
|
||||
|
||||
// tensorLoadTargets maps from the name of the tensor in the file
|
||||
// to the name that is used by the model definition
|
||||
tensorLoadTargets map[string][]string
|
||||
|
||||
sched *C.struct_ggml_backend_sched
|
||||
schedBackends []*C.struct_ggml_backend
|
||||
schedBufts []*C.struct_ggml_backend_buffer_type
|
||||
@@ -58,14 +65,26 @@ type Backend struct {
|
||||
// layers is the backend used for repeating layers
|
||||
layers map[int]*C.struct_ggml_backend_buffer_type
|
||||
|
||||
// requiredMemory is the cumulative memory allocations needed by the backend
|
||||
requiredMemory *ml.BackendMemory
|
||||
|
||||
// btDeviceMemory maps from a buffer type to the memory allocations associated with that device
|
||||
btDeviceMemory map[*C.struct_ggml_backend_buffer_type]*ml.DeviceMemory
|
||||
|
||||
flashAttention bool
|
||||
|
||||
// maxGraphNodes is the maximum allowed number of graph nodes in this scheduler
|
||||
maxGraphNodes int
|
||||
}
|
||||
|
||||
func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend, error) {
|
||||
meta, n, err := fsggml.Decode(r, -1)
|
||||
func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
||||
r, err := os.Open(modelPath)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer r.Close()
|
||||
|
||||
meta, err := fsggml.Decode(r, -1)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -80,6 +99,9 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
|
||||
"num_key_values", len(meta.KV()),
|
||||
)
|
||||
|
||||
var requiredMemory ml.BackendMemory
|
||||
btDeviceMemory := make(map[*C.struct_ggml_backend_buffer_type]*ml.DeviceMemory)
|
||||
|
||||
type deviceBufferType struct {
|
||||
d *C.struct_ggml_backend_device
|
||||
bts []*C.struct_ggml_backend_buffer_type
|
||||
@@ -100,6 +122,8 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
|
||||
}
|
||||
}
|
||||
|
||||
blocks := int(meta.KV().BlockCount())
|
||||
|
||||
// create list of buffer types for the cpu
|
||||
cpuDeviceBufferType := deviceBufferType{d: C.ggml_backend_dev_by_type(C.GGML_BACKEND_DEVICE_TYPE_CPU)}
|
||||
for _, d := range append(accels, append(gpus, cpus...)...) {
|
||||
@@ -107,17 +131,33 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
|
||||
case C.GGML_BACKEND_DEVICE_TYPE_CPU,
|
||||
C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
|
||||
cpuDeviceBufferType.bts = append(cpuDeviceBufferType.bts, C.ggml_backend_dev_buffer_type(d))
|
||||
btDeviceMemory[C.ggml_backend_dev_buffer_type(d)] = &requiredMemory.CPU
|
||||
}
|
||||
}
|
||||
|
||||
requiredMemory.CPU.Name = C.GoString(C.ggml_backend_dev_name(cpuDeviceBufferType.d))
|
||||
var props C.struct_ggml_backend_dev_props
|
||||
C.ggml_backend_dev_get_props(cpuDeviceBufferType.d, &props)
|
||||
requiredMemory.CPU.UUID = C.GoString(props.uuid)
|
||||
requiredMemory.CPU.Weights = make([]ml.Memory, blocks+1)
|
||||
requiredMemory.CPU.Cache = make([]ml.Memory, blocks+1)
|
||||
|
||||
// create list of buffer types for each gpu
|
||||
var gpuDeviceBufferTypes []deviceBufferType
|
||||
for _, d := range gpus {
|
||||
requiredMemory.GPUs = make([]ml.DeviceMemory, len(gpus))
|
||||
for i, d := range gpus {
|
||||
bt := C.ggml_backend_dev_buffer_type(d)
|
||||
gpuDeviceBufferTypes = append(gpuDeviceBufferTypes, deviceBufferType{
|
||||
d: d,
|
||||
bts: append([]*C.struct_ggml_backend_buffer_type{bt}, cpuDeviceBufferType.bts...),
|
||||
})
|
||||
btDeviceMemory[bt] = &requiredMemory.GPUs[i]
|
||||
requiredMemory.GPUs[i].Name = C.GoString(C.ggml_backend_dev_name(d))
|
||||
var props C.struct_ggml_backend_dev_props
|
||||
C.ggml_backend_dev_get_props(d, &props)
|
||||
requiredMemory.GPUs[i].UUID = C.GoString(props.uuid)
|
||||
requiredMemory.GPUs[i].Weights = make([]ml.Memory, blocks+1)
|
||||
requiredMemory.GPUs[i].Cache = make([]ml.Memory, blocks+1)
|
||||
}
|
||||
|
||||
useDefaultSplit := true
|
||||
@@ -156,8 +196,6 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
|
||||
// inputs always use cpu
|
||||
input := cpuDeviceBufferType
|
||||
|
||||
blocks := int(meta.KV().BlockCount())
|
||||
|
||||
// define a range of gpu layers. anything outside of this range is assigned to the cpu
|
||||
gpuRangeStart := max(0, blocks-params.NumGPULayers)
|
||||
gpuRangeStop := min(gpuRangeStart+params.NumGPULayers, blocks+1)
|
||||
@@ -198,7 +236,7 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
|
||||
|
||||
// contexts are shared by tensors of the same buffer type
|
||||
ctxs := make(map[*C.struct_ggml_backend_buffer_type]*C.struct_ggml_context)
|
||||
createTensor := func(t tensor, bts []*C.struct_ggml_backend_buffer_type) *C.struct_ggml_tensor {
|
||||
createTensor := func(t tensor, bts []*C.struct_ggml_backend_buffer_type, layer int) *C.struct_ggml_tensor {
|
||||
for _, bt := range bts {
|
||||
if _, ok := ctxs[bt]; !ok {
|
||||
ctxs[bt] = C.ggml_init(C.struct_ggml_init_params{
|
||||
@@ -224,6 +262,16 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
|
||||
C.ggml_set_name(tt, cname)
|
||||
|
||||
slog.Log(context.TODO(), logutil.LevelTrace, "created tensor", "name", name, "shape", t.source.Shape, "dtype", t.source.Kind, "buffer_type", C.GoString(C.ggml_backend_buft_name(bt)))
|
||||
|
||||
size := pad(C.ggml_backend_buft_get_alloc_size(bt, tt), C.ggml_backend_buft_get_alignment(bt))
|
||||
if layer == -1 {
|
||||
// Assume that InputWeights can be allocated - they're always in system memory and can't be moved in any case
|
||||
requiredMemory.InputWeights.Status = ml.Allocated
|
||||
requiredMemory.InputWeights.Size += uint64(size)
|
||||
} else {
|
||||
btDeviceMemory[bt].Weights[layer].Size += uint64(size)
|
||||
}
|
||||
|
||||
//nolint:staticcheck // TODO: check if buffer type supports this tensor
|
||||
return tt
|
||||
}
|
||||
@@ -245,22 +293,22 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
|
||||
for _, t := range meta.Tensors().Items() {
|
||||
switch {
|
||||
case contains(t.Name, "position_embd", "token_embd", "token_norm_embd", "token_types"):
|
||||
createTensor(tensor{source: t}, input.bts)
|
||||
createTensor(tensor{source: t}, input.bts, -1)
|
||||
if _, ok := meta.Tensors().GroupLayers()["output"]; !ok && t.Name == "token_embd.weight" {
|
||||
createTensor(tensor{source: t, target: "output.weight"}, output.bts)
|
||||
createTensor(tensor{source: t, target: "output.weight"}, output.bts, blocks)
|
||||
}
|
||||
case contains(t.Name, "cls", "output", "output_norm"):
|
||||
createTensor(tensor{source: t}, output.bts)
|
||||
createTensor(tensor{source: t}, output.bts, blocks)
|
||||
case strings.HasPrefix(t.Name, "v.") || strings.HasPrefix(t.Name, "mm."):
|
||||
// TODO: assign vision tensors to the gpu if possible
|
||||
createTensor(tensor{source: t}, output.bts)
|
||||
createTensor(tensor{source: t}, output.bts, blocks)
|
||||
case contains(t.Name, "rope_freqs", "rope_factors_long", "rope_factors_short"):
|
||||
// these tensors should be repeated per layer
|
||||
for i, layer := range layers {
|
||||
createTensor(tensor{
|
||||
source: t,
|
||||
target: "blk." + strconv.Itoa(i) + "." + t.Name,
|
||||
}, layer.bts)
|
||||
}, layer.bts, i)
|
||||
}
|
||||
default:
|
||||
layerIndex := -1
|
||||
@@ -271,10 +319,10 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
|
||||
}
|
||||
|
||||
if layerIndex >= 0 {
|
||||
createTensor(tensor{source: t}, layers[layerIndex].bts)
|
||||
createTensor(tensor{source: t}, layers[layerIndex].bts, layerIndex)
|
||||
} else {
|
||||
// load all other tensors on the cpu
|
||||
createTensor(tensor{source: t}, input.bts)
|
||||
createTensor(tensor{source: t}, input.bts, -1)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -287,8 +335,18 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
|
||||
}
|
||||
|
||||
b := C.ggml_backend_alloc_ctx_tensors_from_buft(c, bt)
|
||||
for i := range btDeviceMemory[bt].Weights {
|
||||
if btDeviceMemory[bt].Weights[i].Size != 0 {
|
||||
if b != nil {
|
||||
btDeviceMemory[bt].Weights[i].Status = ml.Allocated
|
||||
} else {
|
||||
btDeviceMemory[bt].Weights[i].Status = ml.Failed
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if b == nil {
|
||||
return nil, fmt.Errorf("unable to allocate memory from device %v for model weights", C.GoString(C.ggml_backend_buft_name(bt)))
|
||||
panic(ml.ErrNoMem{BackendMemory: requiredMemory})
|
||||
}
|
||||
|
||||
C.ggml_backend_buffer_set_usage(b, C.GGML_BACKEND_BUFFER_USAGE_WEIGHTS)
|
||||
@@ -307,73 +365,6 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
}
}

var doneBytes atomic.Uint64
totalBytes := uint64(n) - meta.Tensors().Offset

g, ctx := errgroup.WithContext(ctx)
g.SetLimit(runtime.GOMAXPROCS(0))
for _, t := range meta.Tensors().Items() {
t := t
g.Go(func() error {
tts := make([]*C.struct_ggml_tensor, max(1, len(targets[t.Name])))
for i := range tts {
target := targets[t.Name][i]
if target == "" {
target = t.Name
}

tt, ok := tensors[target]
if !ok {
return fmt.Errorf("unassigned tensor: %s", t.Name)
}

tts[i] = tt
}

// Create a new FD for each goroutine so that each FD is read sequentially, rather than
// seeking around within an FD shared between all goroutines.
file, err := os.Open(r.Name())
if err != nil {
slog.Warn("file open error", "file", r.Name(), "error", err)
return err
}
defer file.Close()
sr := io.NewSectionReader(file, int64(meta.Tensors().Offset+t.Offset), int64(t.Size()))
bts := make([]byte, 128*format.KibiByte)

var s uint64
for s < t.Size() {
// Stop if either the parent context has been canceled or if any of the other tensors returned an error
if err := ctx.Err(); err != nil {
return err
}

n, err := io.ReadFull(sr, bts[:min(len(bts), int(t.Size()-s))])
if err != nil {
slog.Warn("file read error", "file", r.Name(), "error", err)
return err
}

for _, tt := range tts {
C.ggml_backend_tensor_set(tt, unsafe.Pointer(&bts[0]), C.size_t(s), C.size_t(n))
}

s += uint64(n)

if params.Progress != nil {
done := doneBytes.Add(uint64(n))
params.Progress(float32(done) / float32(totalBytes))
}
}

return nil
})
}

if err := g.Wait(); err != nil {
return nil, err
}

// map devices to backend buffer types so new tensors can be assigned to the correct device
deviceBufferTypes := make(map[*C.struct_ggml_backend_device]*C.struct_ggml_backend_buffer_type)
@@ -397,9 +388,11 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,

maxGraphNodes := max(8192, len(meta.Tensors().Items())*5)
return &Backend{
flashAttention: params.FlashAttention,
meta: meta,
tensors: tensors,
modelPath: modelPath,
flashAttention: params.FlashAttention,
meta: meta,
tensorLoadTargets: targets,
tensors: tensors,
sched: C.ggml_backend_sched_new(
(*C.ggml_backend_t)(unsafe.Pointer(&schedBackends[0])),
(*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&schedBufts[0])),
@@ -418,7 +411,9 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
}
return m
}(),
maxGraphNodes: maxGraphNodes,
requiredMemory: &requiredMemory,
btDeviceMemory: btDeviceMemory,
maxGraphNodes: maxGraphNodes,
}, nil
}

@@ -426,6 +421,81 @@ func init() {
ml.RegisterBackend("ggml", New)
}

func (b *Backend) Load(ctx context.Context, progress func(float32)) error {
var doneBytes atomic.Uint64
totalBytes := uint64(b.meta.Length) - b.meta.Tensors().Offset

g, ctx := errgroup.WithContext(ctx)
g.SetLimit(runtime.GOMAXPROCS(0))
for _, t := range b.meta.Tensors().Items() {
t := t
g.Go(func() error {
tts := make([]*C.struct_ggml_tensor, max(1, len(b.tensorLoadTargets[t.Name])))
for i := range tts {
target := b.tensorLoadTargets[t.Name][i]
if target == "" {
target = t.Name
}

tt, ok := b.tensors[target]
if !ok {
return fmt.Errorf("unassigned tensor: %s", t.Name)
}

tts[i] = tt
}

// Create a new FD for each goroutine so that each FD is read sequentially, rather than
// seeking around within an FD shared between all goroutines.
file, err := os.Open(b.modelPath)
if err != nil {
slog.Warn("file open error", "file", b.modelPath, "error", err)
return err
}
defer file.Close()
sr := io.NewSectionReader(file, int64(b.meta.Tensors().Offset+t.Offset), int64(t.Size()))
bts := make([]byte, 128*format.KibiByte)

var s uint64
for s < t.Size() {
// Stop if either the parent context has been canceled or if any of the other tensors returned an error
if err := ctx.Err(); err != nil {
return err
}

n, err := io.ReadFull(sr, bts[:min(len(bts), int(t.Size()-s))])
if err != nil {
slog.Warn("file read error", "file", b.modelPath, "error", err)
return err
}

for _, tt := range tts {
C.ggml_backend_tensor_set(tt, unsafe.Pointer(&bts[0]), C.size_t(s), C.size_t(n))
}

s += uint64(n)

if progress != nil {
done := doneBytes.Add(uint64(n))
progress(float32(done) / float32(totalBytes))
}
}

return nil
})
}

if err := g.Wait(); err != nil {
return err
}

return nil
}
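The hunks above split backend construction from weight loading: New builds the tensor layout and records the required memory, while the new Load method streams the weights afterwards and reports progress. A minimal sketch of how a caller might drive the two phases, assuming the ml.Backend interface exposes the same Load method and that modelPath and params are already in hand (the actual runner call site is not shown in this diff):

```go
// Hypothetical wiring, not the runner's actual code.
b, err := ml.NewBackend(modelPath, params) // plan tensors, record required memory
if err != nil {
	return err
}
// Stream the weights; progress is reported in the range [0, 1].
if err := b.Load(ctx, func(progress float32) {
	slog.Info("loading model", "progress", progress)
}); err != nil {
	return err
}
```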
func (b *Backend) BackendMemory() ml.BackendMemory {
return *b.requiredMemory
}

func (b *Backend) Config() fs.Config {
return b.meta.KV()
}
@@ -457,6 +527,7 @@ func (b *Backend) NewContextSize(n int) ml.Context {
no_alloc: true,
}),
allocatedBuffers: &allocatedBuffers,
layer: -1,
}
}

@@ -483,6 +554,9 @@ type Context struct {

// maxGraphNodes is the maximum allowed number of graph nodes in this context
maxGraphNodes int

// layer is the graph layer that this context is allocating for - assumed to be cache
layer int
}

func (c *Context) Input() ml.Context {
@@ -493,6 +567,7 @@ func (c *Context) Input() ml.Context {
buft: c.b.input,
allocatedBuffers: c.allocatedBuffers,
maxGraphNodes: c.maxGraphNodes,
layer: -1,
}
}

@@ -507,6 +582,7 @@ func (c *Context) Layer(i int) ml.Context {
buft: buft,
allocatedBuffers: c.allocatedBuffers,
maxGraphNodes: c.maxGraphNodes,
layer: i,
}
}

@@ -544,22 +620,34 @@ func (c *Context) Compute(tensors ...ml.Tensor) {
}
}

func (c *Context) Reserve() error {
if !C.ggml_backend_sched_reserve(c.b.sched, c.graph) {
C.ggml_backend_sched_reset(c.b.sched)
return errors.New("failed to reserve graph")
}
func (c *Context) Reserve() {
reserved := C.ggml_backend_sched_reserve(c.b.sched, c.graph)

slog.Debug("compute graph", "nodes", C.ggml_graph_n_nodes(c.graph), "splits", C.ggml_backend_sched_get_n_splits(c.b.sched))
for i := range c.b.schedBackends {
size := C.ggml_backend_sched_get_buffer_size(c.b.sched, c.b.schedBackends[i])
slog.Info("compute graph", "backend", C.GoString(C.ggml_backend_name(c.b.schedBackends[i])), "buffer_type", C.GoString(C.ggml_backend_buft_name(c.b.schedBufts[i])),
"size", format.HumanBytes2(uint64(size)))

// Reserve may get called multiple times for different graphs - we just want the last run, which will contain the max allocations
for _, bt := range c.b.schedBufts {
c.b.btDeviceMemory[bt].Graph = ml.Memory{}
}

C.ggml_backend_sched_reset(c.b.sched)
for i := range c.b.schedBackends {
bufferStatus := C.ggml_backend_sched_get_attempted_buffer_size(c.b.sched, c.b.schedBackends[i])

return nil
graph := &c.b.btDeviceMemory[c.b.schedBufts[i]].Graph
graph.Size += uint64(bufferStatus.size)
if bufferStatus.allocated && graph.Status != ml.Failed {
graph.Status = ml.Allocated
} else {
graph.Status = ml.Failed
}

slog.Info("compute graph", "backend", C.GoString(C.ggml_backend_name(c.b.schedBackends[i])), "buffer_type", C.GoString(C.ggml_backend_buft_name(c.b.schedBufts[i])),
"size", format.HumanBytes2(uint64(bufferStatus.size)))
}

if !reserved {
panic(ml.ErrNoMem{BackendMemory: *c.b.requiredMemory})
}
}

func (c *Context) MaxGraphNodes() int {
@@ -579,7 +667,7 @@ func pad(length, pad C.size_t) C.size_t {
return ((length + pad - 1) / pad) * pad
}

func (c *Context) newTensor(dtype ml.DType, shape []int) (ml.Tensor, error) {
func (c *Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {
if c.buft == nil {
panic("set Input or Layer before creating tensors")
}
@@ -602,7 +690,7 @@ func (c *Context) newTensor(dtype ml.DType, shape []int) (ml.Tensor, error) {

if len(shape) < 1 || shape[0] == 0 {
var shape C.int64_t = 0
return &Tensor{b: c.b, t: C.ggml_new_tensor(c.ctx, cdtype, 1, &shape)}, nil
return &Tensor{b: c.b, t: C.ggml_new_tensor(c.ctx, cdtype, 1, &shape)}
} else if len(shape) > 4 {
panic("unsupported number of dimensions")
}
@@ -615,40 +703,43 @@ func (c *Context) newTensor(dtype ml.DType, shape []int) (ml.Tensor, error) {

t := C.ggml_new_tensor(c.ctx, cdtype, C.int(len(shape)), shapeToGGML(shape))
size := pad(C.ggml_backend_buft_get_alloc_size(c.buft, t), C.ggml_backend_buft_get_alignment(c.buft))
b := C.ggml_backend_buft_alloc_buffer(c.buft, size)
if b == nil {
return nil, fmt.Errorf("unable to allocate %v from device %v for new tensor", format.HumanBytes2(uint64(size)), C.GoString(C.ggml_backend_buft_name(c.buft)))
}
*c.allocatedBuffers = append(*c.allocatedBuffers, b)

b := C.ggml_backend_buft_alloc_buffer(c.buft, size)
if c.layer >= 0 {
cache := &c.b.btDeviceMemory[c.buft].Cache[c.layer]

cache.Size += uint64(size)
if b != nil {
cache.Status = ml.Allocated
} else {
cache.Status = ml.Failed
}
}

if b == nil {
panic(ml.ErrNoMem{BackendMemory: *c.b.requiredMemory})
}

*c.allocatedBuffers = append(*c.allocatedBuffers, b)
C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
return &Tensor{b: c.b, t: t}, nil
return &Tensor{b: c.b, t: t}
}

func (c *Context) Empty(dtype ml.DType, shape ...int) ml.Tensor {
t, err := c.newTensor(dtype, shape)
if err != nil {
panic(err)
}

return t
return c.newTensor(dtype, shape)
}

func (c *Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
t, err := c.newTensor(dtype, shape)
if err != nil {
panic(err)
}

t := c.newTensor(dtype, shape)
C.ggml_set_zero(t.(*Tensor).t)
return t
}

func checkShape[S ~[]E, E any](s S, shape ...int) error {
func checkShape[S ~[]E, E any](s S, shape ...int) {
n := len(s)

if n == 0 {
return nil
return
}

for _, v := range shape {
@@ -656,44 +747,32 @@ func checkShape[S ~[]E, E any](s S, shape ...int) error {
}

if n != 1 {
return fmt.Errorf("invalid shape: %v", shape)
panic(fmt.Errorf("invalid shape: %v", shape))
}

return nil
}

func (c *Context) FromFloatSlice(s []float32, shape ...int) (ml.Tensor, error) {
if err := checkShape(s, shape...); err != nil {
return nil, err
}
func (c *Context) FromFloatSlice(s []float32, shape ...int) ml.Tensor {
checkShape(s, shape...)

t, err := c.newTensor(ml.DTypeF32, shape)
if err != nil {
return nil, err
}
t := c.newTensor(ml.DTypeF32, shape)

if len(s) > 0 {
C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
}

return t, nil
return t
}

func (c *Context) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
if err := checkShape(s, shape...); err != nil {
return nil, err
}
func (c *Context) FromIntSlice(s []int32, shape ...int) ml.Tensor {
checkShape(s, shape...)

t, err := c.newTensor(ml.DTypeI32, shape)
if err != nil {
return nil, err
}
t := c.newTensor(ml.DTypeI32, shape)

if len(s) > 0 {
C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
}

return t, nil
return t
}

func (c Context) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
@@ -711,12 +790,7 @@ func (c Context) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
arange = append(arange, int32(i))
}

t, err := c.Input().FromIntSlice(arange, len(arange))
if err != nil {
panic(err)
}

return t
return c.Input().FromIntSlice(arange, len(arange))
default:
panic("unsupported dtype for arange")
}
@@ -867,6 +941,13 @@ func (t *Tensor) Mul(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
}
}

func (t *Tensor) Div(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
return &Tensor{
b: t.b,
t: C.ggml_div(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
}
}

func (t *Tensor) Mulmat(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
return &Tensor{
b: t.b,
@@ -915,6 +996,8 @@ func (t *Tensor) RMSNorm(ctx ml.Context, w ml.Tensor, eps float32) ml.Tensor {
func (t *Tensor) Pad(ctx ml.Context, shape ...int) ml.Tensor {
if len(shape) != 4 {
panic("expected 4 dimensions")
} else if shape[3] != 0 {
panic("cuda does not support 4d tensors")
}

return &Tensor{
@@ -982,6 +1065,13 @@ func (t *Tensor) Scale(ctx ml.Context, s float64) ml.Tensor {
}
}

func (t *Tensor) SumRows(ctx ml.Context) ml.Tensor {
return &Tensor{
b: t.b,
t: C.ggml_sum_rows(ctx.(*Context).ctx, t.t),
}
}

func (t *Tensor) Softmax(ctx ml.Context) ml.Tensor {
return &Tensor{
b: t.b,
@@ -1053,16 +1143,13 @@ func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
}
}

const (
ropeTypeNorm C.int = 0
ropeTypeNeox C.int = 2
ropeTypeMrope C.int = 8
ropeTypeVision C.int = 24
)
func (t *Tensor) RoPE(ctx ml.Context, positions ml.Tensor, ropeDim int, ropeBase, ropeScale float32, options ...func(*rope.Options)) ml.Tensor {
// Default options
opts := &rope.Options{OriginalContextLength: 131072, Factors: &Tensor{}}

func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim, ropeType uint32, ropeBase, ropeScale float32) ml.Tensor {
if ropeFactors == nil {
ropeFactors = &Tensor{b: t.b}
// Apply any provided options
for _, option := range options {
option(opts)
}

dequant := t.t
@@ -1073,16 +1160,19 @@ func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDi
return &Tensor{
b: t.b,
t: C.ggml_rope_ext(
ctx.(*Context).ctx, dequant, positionIDs.(*Tensor).t, ropeFactors.(*Tensor).t,
ctx.(*Context).ctx,
dequant,
positions.(*Tensor).t,
opts.Factors.(*Tensor).t,
C.int(ropeDim),
C.int(ropeType),
131072, // YaRN n_ctx_train
C.int(opts.Type),
C.int(opts.OriginalContextLength),
C.float(ropeBase),
C.float(ropeScale),
0., // YaRN ext_factor
1., // YaRN attn_factor
32., // YaRN beta_fast
1., // YaRN beta_slow
C.float(0.0),
C.float(1.0),
C.float(32.0),
C.float(1.0),
),
}
}
@@ -1176,3 +1266,10 @@ func (t *Tensor) TopK(ctx ml.Context, k int) ml.Tensor {
t: C.ggml_top_k(ctx.(*Context).ctx, t.t, C.int(k)),
}
}

func (t *Tensor) Argsort(ctx ml.Context) ml.Tensor {
return &Tensor{
b: t.b,
t: C.ggml_argsort(ctx.(*Context).ctx, t.t, C.GGML_SORT_ORDER_ASC),
}
}
ml/backend/ggml/ggml/include/ggml-alloc.h (vendored, 6 changes)
@@ -66,6 +66,12 @@ GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph

GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);

struct ggml_allocr_buffer_status {
size_t size;
bool allocated;
};
GGML_API struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id);

// Utils
// Create a buffer and allocate all the tensors in a ggml_context
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
ml/backend/ggml/ggml/include/ggml-backend.h (vendored, 7 changes)
@@ -152,6 +152,7 @@ extern "C" {
struct ggml_backend_dev_props {
const char * name;
const char * description;
const char * uuid;
size_t memory_free;
size_t memory_total;
enum ggml_backend_dev_type type;
@@ -304,6 +305,12 @@ extern "C" {

GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);

struct ggml_backend_buffer_status {
size_t size;
bool allocated;
};
GGML_API struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);

GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
ml/backend/ggml/ggml/src/ggml-alloc.c (vendored, 38 changes)
@@ -364,6 +364,7 @@ struct node_alloc {
struct ggml_gallocr {
ggml_backend_buffer_type_t * bufts; // [n_buffers]
ggml_backend_buffer_t * buffers; // [n_buffers]
size_t *buffer_sizes; // [n_buffers]
struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
int n_buffers;

@@ -387,6 +388,9 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
GGML_ASSERT(galloc->buffers != NULL);

galloc->buffer_sizes = calloc(n_bufs, sizeof(size_t));
GGML_ASSERT(galloc->buffer_sizes != NULL);

galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
GGML_ASSERT(galloc->buf_tallocs != NULL);

@@ -453,6 +457,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
ggml_hash_set_free(&galloc->hash_set);
free(galloc->hash_values);
free(galloc->bufts);
free(galloc->buffer_sizes);
free(galloc->buffers);
free(galloc->buf_tallocs);
free(galloc->node_allocs);
@@ -748,6 +753,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
}
}

bool success = true;

// reallocate buffers if needed
for (int i = 0; i < galloc->n_buffers; i++) {
// if the buffer type is used multiple times, we reuse the same buffer
@@ -769,15 +776,20 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c

ggml_backend_buffer_free(galloc->buffers[i]);
galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
if (galloc->buffers[i] == NULL) {
if (galloc->buffers[i]) {
galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
} else {
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
return false;
galloc->buffer_sizes[i] = new_size;
success = false;
}
ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
} else {
galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
}
}

return true;
return success;
}

bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
@@ -934,6 +946,24 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
}

struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers);

for (int i = 0; i < buffer_id; i++) {
if (galloc->buf_tallocs[i] == galloc->buf_tallocs[buffer_id]) {
// This buffer is the same as a previous one due to the same buffer type being used multiple times
// (See above.) However, we need a different check because multiple buffers might be NULL in our
// case and we still want to know the attempted size.

struct ggml_allocr_buffer_status status = {0, true};
return status;
}
}

struct ggml_allocr_buffer_status status = {galloc->buffer_sizes[buffer_id], galloc->buffers[buffer_id] != NULL};
return status;
}

// utils

static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
ml/backend/ggml/ggml/src/ggml-backend.cpp (vendored, 10 changes)
@@ -1629,6 +1629,16 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
}

struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
int backend_index = ggml_backend_sched_backend_id(sched, backend);
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);

struct ggml_allocr_buffer_status allocr_status = ggml_gallocr_get_attempted_buffer_size(sched->galloc, backend_index);
struct ggml_backend_buffer_status status = {allocr_status.size, allocr_status.allocated};

return status;
}

void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
int backend_index = ggml_backend_sched_backend_id(sched, backend);
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);

@@ -3,7 +3,7 @@ package cpu
// #cgo CFLAGS: -O3 -Wno-implicit-function-declaration
// #cgo CXXFLAGS: -std=c++17
// #cgo CPPFLAGS: -I${SRCDIR}/amx -I${SRCDIR}/llamafile -I${SRCDIR}/.. -I${SRCDIR}/../../include
// #cgo CPPFLAGS: -DGGML_USE_LLAMAFILE
// #cgo CPPFLAGS: -DNDEBUG -DGGML_USE_LLAMAFILE
// #cgo linux CPPFLAGS: -D_GNU_SOURCE
// #cgo darwin,arm64 CPPFLAGS: -DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
// #cgo darwin,arm64 LDFLAGS: -framework Accelerate
ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp
vendored
43
ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp
vendored
@@ -6822,6 +6822,45 @@ static void ggml_compute_forward_argsort_f32(
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_compute_forward_argsort_i32(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst) {
|
||||
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
|
||||
GGML_TENSOR_UNARY_OP_LOCALS
|
||||
|
||||
GGML_ASSERT(nb0 == sizeof(int32_t));
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
const int64_t nr = ggml_nrows(src0);
|
||||
|
||||
ggml_sort_order order = (ggml_sort_order) ggml_get_op_params_i32(dst, 0);
|
||||
|
||||
for (int64_t i = ith; i < nr; i += nth) {
|
||||
int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1);
|
||||
const int32_t * src_data = (int32_t *)((char *) src0->data + i*nb01);
|
||||
|
||||
for (int64_t j = 0; j < ne0; j++) {
|
||||
dst_data[j] = j;
|
||||
}
|
||||
|
||||
// C doesn't have a functional sort, so we do a bubble sort instead
|
||||
for (int64_t j = 0; j < ne0; j++) {
|
||||
for (int64_t k = j + 1; k < ne0; k++) {
|
||||
if ((order == GGML_SORT_ORDER_ASC && src_data[dst_data[j]] > src_data[dst_data[k]]) ||
|
||||
(order == GGML_SORT_ORDER_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) {
|
||||
int32_t tmp = dst_data[j];
|
||||
dst_data[j] = dst_data[k];
|
||||
dst_data[k] = tmp;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_compute_forward_argsort(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst) {
|
||||
@@ -6833,6 +6872,10 @@ void ggml_compute_forward_argsort(
|
||||
{
|
||||
ggml_compute_forward_argsort_f32(params, dst);
|
||||
} break;
|
||||
case GGML_TYPE_I32:
|
||||
{
|
||||
ggml_compute_forward_argsort_i32(params, dst);
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
GGML_ABORT("fatal error");
|
||||
|
||||
ml/backend/ggml/ggml/src/ggml-cuda/argsort.cu (vendored, 102 changes)
@@ -85,13 +85,107 @@ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, co
}
}

template<ggml_sort_order order>
static __global__ void k_argsort_i32_i32(const int32_t * x, int * dst, const int ncols, const int ncols_pad) {
extern __shared__ int shared_mem[];
int * indices = shared_mem;

const int tid = threadIdx.x;
const int row = blockIdx.y;

// Initialize all indices, handling the case where threads < ncols_pad
for (int i = tid; i < ncols_pad; i += blockDim.x) {
indices[i] = i < ncols ? i : 0; // Use 0 for padding indices
}
__syncthreads();

// Bitonic sort
for (int k = 2; k <= ncols_pad; k *= 2) {
for (int j = k/2; j > 0; j /= 2) {
for (int i = tid; i < ncols_pad; i += blockDim.x) {
const int ij = i ^ j;
if (ij > i) {
// Only compare values within the actual data range
if (i < ncols && ij < ncols) {
if ((i & k) == 0) {
if (order == GGML_SORT_ORDER_ASC) {
if (x[row * ncols + indices[i]] > x[row * ncols + indices[ij]]) {
int tmp = indices[i];
indices[i] = indices[ij];
indices[ij] = tmp;
}
} else {
if (x[row * ncols + indices[i]] < x[row * ncols + indices[ij]]) {
int tmp = indices[i];
indices[i] = indices[ij];
indices[ij] = tmp;
}
}
} else {
if (order == GGML_SORT_ORDER_ASC) {
if (x[row * ncols + indices[i]] < x[row * ncols + indices[ij]]) {
int tmp = indices[i];
indices[i] = indices[ij];
indices[ij] = tmp;
}
} else {
if (x[row * ncols + indices[i]] > x[row * ncols + indices[ij]]) {
int tmp = indices[i];
indices[i] = indices[ij];
indices[ij] = tmp;
}
}
}
}
}
}
__syncthreads();
}
}

// Write sorted indices to output, only threads handling valid data
for (int i = tid; i < ncols; i += blockDim.x) {
dst[row * ncols + i] = indices[i];
}
}

static void argsort_i32_i32_cuda(const int32_t * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
// Bitonic sort requires ncols to be power of 2
const int ncols_pad = next_power_of_2(ncols);

// Ensure thread count doesn't exceed maximum (typically 1024)
const int max_threads = 1024; // This is the typical max for most GPUs
const int threads_per_block = ncols_pad > max_threads ? max_threads : ncols_pad;

const dim3 block_dims(threads_per_block, 1, 1);
const dim3 block_nums(1, nrows, 1);
const size_t shared_mem = ncols_pad * sizeof(int);

// Check if shared memory size is within limits
const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb;

// Instead of logging an error, use GGML_ASSERT with a descriptive message
GGML_ASSERT(shared_mem <= max_shared_mem && "argsort: required shared memory exceeds device limit");

// Launch kernels with the updated thread configuration
if (order == GGML_SORT_ORDER_ASC) {
k_argsort_i32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
} else if (order == GGML_SORT_ORDER_DESC) {
k_argsort_i32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
} else {
GGML_ABORT("fatal error");
}
}

void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const float * src0_d = (const float *)src0->data;
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();

GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_I32);
GGML_ASSERT( dst->type == GGML_TYPE_I32);
GGML_ASSERT(ggml_is_contiguous(src0));

@@ -100,5 +194,9 @@ void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];

argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream);
if (src0->type == GGML_TYPE_I32) {
argsort_i32_i32_cuda((const int32_t *)src0_d, (int *)dst_d, ncols, nrows, order, stream);
} else {
argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream);
}
}
ml/backend/ggml/ggml/src/ggml-cuda/cpy.cu (vendored, 49 changes)
@@ -38,6 +38,13 @@ static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) {
*dsti = *xi;
}

static __device__ void cpy_1_i32_i32(const char * cxi, char * cdsti) {
const int32_t * xi = (const int32_t *) cxi;
int32_t * dsti = (int32_t *) cdsti;

*dsti = *xi;
}

template <cpy_kernel_t cpy_1>
static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
@@ -68,6 +75,44 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const in
cpy_1(cx + x_offset, cdst + dst_offset);
}

// First, add this template function after the other template functions
template <cpy_kernel_t cpy_1>
static __global__ void cpy_i32_i32(const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
const int nb12, const int nb13) {
const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;

if (i >= ne) {
return;
}

const int64_t i03 = i/(ne00 * ne01 * ne02);
const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
const int64_t i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
const int64_t x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;

const int64_t i13 = i/(ne10 * ne11 * ne12);
const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13;

cpy_1(cx + x_offset, cdst + dst_offset);
}

// Then modify the ggml_cpy_i32_i32_cuda function to use the new template
static void ggml_cpy_i32_i32_cuda(
const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int graph_cpynode_index) {

const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
cpy_i32_i32<cpy_1_i32_i32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
const float * xi = (const float *) cxi;
block_q8_0 * dsti = (block_q8_0 *) cdsti;
@@ -633,6 +678,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
} else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
ggml_cpy_i32_i32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
} else {
GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
ggml_type_name(src0->type), ggml_type_name(src1->type));
@@ -688,6 +735,8 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
return (void*) cpy_f32_f16<cpy_1_f32_f16>;
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
return (void*) cpy_f32_f16<cpy_1_f16_f32>;
} else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
return (void*) cpy_i32_i32<cpy_1_i32_i32>;
} else {
GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
ggml_type_name(src0->type), ggml_type_name(src1->type));
ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu (vendored, 33 changes)
@@ -2884,6 +2884,7 @@ struct ggml_backend_cuda_device_context {
int device;
std::string name;
std::string description;
std::string uuid;
};

static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -2896,6 +2897,11 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t
return ctx->description.c_str();
}

static const char * ggml_backend_cuda_device_get_uuid(ggml_backend_dev_t dev) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
return ctx->uuid.c_str();
}

static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
ggml_cuda_set_device(ctx->device);
@@ -2910,6 +2916,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
props->name = ggml_backend_cuda_device_get_name(dev);
props->description = ggml_backend_cuda_device_get_description(dev);
props->uuid = ggml_backend_cuda_device_get_uuid(dev);
props->type = ggml_backend_cuda_device_get_type(dev);
ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);

@@ -3458,6 +3465,32 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
dev_ctx->description = prop.name;

#if !defined(GGML_USE_HIP)
char uuid[64];
snprintf(uuid, sizeof(uuid),
"GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
(unsigned char)prop.uuid.bytes[0],
(unsigned char)prop.uuid.bytes[1],
(unsigned char)prop.uuid.bytes[2],
(unsigned char)prop.uuid.bytes[3],
(unsigned char)prop.uuid.bytes[4],
(unsigned char)prop.uuid.bytes[5],
(unsigned char)prop.uuid.bytes[6],
(unsigned char)prop.uuid.bytes[7],
(unsigned char)prop.uuid.bytes[8],
(unsigned char)prop.uuid.bytes[9],
(unsigned char)prop.uuid.bytes[10],
(unsigned char)prop.uuid.bytes[11],
(unsigned char)prop.uuid.bytes[12],
(unsigned char)prop.uuid.bytes[13],
(unsigned char)prop.uuid.bytes[14],
(unsigned char)prop.uuid.bytes[15]
);
dev_ctx->uuid = uuid;
#else
dev_ctx->uuid = "GPU-" + std::string(prop.uuid.bytes, 16);
#endif

ggml_backend_dev_t dev = new ggml_backend_device {
/* .iface = */ ggml_backend_cuda_device_interface,
/* .reg = */ &reg,

@@ -5703,6 +5703,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
props->name = ggml_backend_metal_device_get_name(dev);
props->description = ggml_backend_metal_device_get_description(dev);
props->uuid = "0";
props->type = ggml_backend_metal_device_get_type(dev);
ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
props->caps = (struct ggml_backend_dev_caps) {

@@ -4,6 +4,6 @@ package metal

//go:generate sh -c "{ echo // Code generated by 'go generate'. DO NOT EDIT.; sed -e '/__embed_ggml-common.h__/r ../ggml-common.h' -e '/__embed_ggml-common.h__/d' -e '/#include \"ggml-metal-impl.h\"/r ggml-metal-impl.h' -e '/#include \"ggml-metal-impl.h\"/d' ggml-metal.metal; } >ggml-metal-embed.metal"

// #cgo CPPFLAGS: -DGGML_METAL_EMBED_LIBRARY -I.. -I../../include
// #cgo CPPFLAGS: -DGGML_METAL_NDEBUG -DGGML_METAL_EMBED_LIBRARY -I.. -I../../include
// #cgo LDFLAGS: -framework Metal -framework MetalKit
import "C"
ml/nn/fast/rope.go (new file, 21 lines)
@@ -0,0 +1,21 @@
// fast provides implementations of fast (fused) operations for increased performance.
package fast

import (
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn/rope"
)

// fastRoPE is an interface for tensors that support fast rotary positional embedding.
type fastRoPE interface {
RoPE(ctx ml.Context, positionIDs ml.Tensor, dim int, base, scale float32, options ...func(*rope.Options)) ml.Tensor
}

// RoPE applies rotary positional embedding to tensor `t`.
func RoPE(ctx ml.Context, t, positions ml.Tensor, dim int, base, scale float32, options ...func(*rope.Options)) ml.Tensor {
if t, ok := t.(fastRoPE); ok {
return t.RoPE(ctx, positions, dim, base, scale, options...)
}

panic("RoPE not implemented for this tensor type")
}
ml/nn/rope/rope.go (new file, 33 lines)
@@ -0,0 +1,33 @@
package rope

import "github.com/ollama/ollama/ml"

// Options contains optional parameters for RoPE function
type Options struct {
OriginalContextLength int
Type int
Factors ml.Tensor
}

// WithOriginalContextLength sets a custom context length
func WithOriginalContextLength(n int) func(*Options) {
return func(opts *Options) {
opts.OriginalContextLength = n
}
}

// WithTypeNeoX sets the RoPE type to NeoX
func WithTypeNeoX() func(*Options) {
return func(opts *Options) {
opts.Type = 2
}
}

// WithFactors sets custom rope factors
func WithFactors(factors ml.Tensor) func(*Options) {
return func(opts *Options) {
if factors != nil {
opts.Factors = factors
}
}
}
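Taken together, the two new packages replace the old positional arguments (rope factors, rope type) with functional options. A sketch of how a model layer is expected to call the new API, based on the call sites later in this diff (the tensor names and option values here are illustrative, not taken from a specific model):

```go
// q has shape [headDim, numHeads, batchSize]; positions holds one position id per token.
q = fast.RoPE(ctx, q, positions, headDim, ropeBase, ropeScale,
	rope.WithTypeNeoX(),                  // NeoX-style rotation (Type = 2)
	rope.WithOriginalContextLength(8192), // override the 131072 default
	rope.WithFactors(ropeFactors),        // optional long-context scaling factors
)
```

Omitting all options reproduces the previous default behavior (normal rotation, 131072 original context length, no factors).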
@@ -3,118 +3,16 @@ package model
import (
"cmp"
"context"
"fmt"
"iter"
"log/slog"
"slices"
"strings"
"sync"

"github.com/dlclark/regexp2"
heap "github.com/emirpasic/gods/v2/trees/binaryheap"
"github.com/ollama/ollama/logutil"
)

type Special int32

const (
SpecialBOS Special = iota
SpecialEOS
)

const (
TOKEN_TYPE_NORMAL = iota + 1
TOKEN_TYPE_UNKNOWN
TOKEN_TYPE_CONTROL
TOKEN_TYPE_USER_DEFINED
TOKEN_TYPE_UNUSED
TOKEN_TYPE_BYTE
)

type TextProcessor interface {
Encode(s string, addSpecial bool) ([]int32, error)
Decode([]int32) (string, error)
Is(int32, Special) bool
Vocabulary() *Vocabulary
}

type Vocabulary struct {
Values []string
Types []int32
Scores []float32
Merges []string

BOS, EOS, EOT int32
AddBOS, AddEOS, AddEOT bool

specialOnce sync.Once
special []string

valuesOnce sync.Once
values map[string]int32

mergeOnce sync.Once
merge map[string]int32
}

func (v *Vocabulary) Is(id int32, special Special) bool {
switch special {
case SpecialBOS:
return id == v.BOS
case SpecialEOS:
return id == v.EOS || id == v.EOT
default:
return false
}
}

func (v *Vocabulary) Encode(s string) int32 {
v.valuesOnce.Do(func() {
v.values = make(map[string]int32, len(v.Values))
for i, value := range v.Values {
v.values[value] = int32(i)
}
})

if id, ok := v.values[s]; ok {
return id
}

return -1
}

func (v *Vocabulary) Decode(id int32) string {
return v.Values[id]
}

func (v *Vocabulary) SpecialVocabulary() []string {
v.specialOnce.Do(func() {
for i := range v.Values {
if slices.Contains([]int{105, 106}, i) {
v.special = append(v.special, v.Values[i])
} else if v.Types[i] == TOKEN_TYPE_CONTROL {
v.special = append(v.special, v.Values[i])
}
}
})

return v.special
}

func (v *Vocabulary) Merge(left, right string) int {
v.mergeOnce.Do(func() {
v.merge = make(map[string]int32, len(v.Merges))
for i, merge := range v.Merges {
v.merge[merge] = int32(i)
}
})

if id, ok := v.merge[left+" "+right]; ok {
return int(id)
}

return -1
}

type BytePairEncoding struct {
pre *regexp2.Regexp
vocab *Vocabulary
@@ -304,30 +202,23 @@ func (bpe BytePairEncoding) Encode(s string, addSpecial bool) ([]int32, error) {
}
}

slog.Log(context.TODO(), logutil.LevelTrace, "encoded", "string", s, "ids", ids)

if addSpecial && len(ids) > 0 {
if bpe.vocab.AddBOS {
if ids[0] == bpe.vocab.BOS {
slog.Warn("adding bos token to prompt which already has it", "id", bpe.vocab.BOS)
}

slog.Debug("adding bos token to prompt", "id", bpe.vocab.BOS)
ids = append([]int32{bpe.vocab.BOS}, ids...)
}

if bpe.vocab.AddEOS {
if ids[len(ids)-1] == bpe.vocab.EOS {
slog.Warn("adding eos token to prompt which already has it", "id", bpe.vocab.EOS)
}

slog.Debug("adding eos token to prompt", "id", bpe.vocab.EOS)
ids = append(ids, bpe.vocab.EOS)
}
ids = bpe.vocab.addSpecials(ids)
}

slog.Log(context.TODO(), logutil.LevelTrace, "encoded", "ids", ids)
return ids, nil
}

type lazyIdsString struct {
ids []int32
}

func (l lazyIdsString) LogValue() slog.Value {
return slog.AnyValue(fmt.Sprint(l.ids))
}

func (bpe BytePairEncoding) Decode(ids []int32) (string, error) {
var sb strings.Builder
for _, id := range ids {
@@ -352,6 +243,6 @@ func (bpe BytePairEncoding) Decode(ids []int32) (string, error) {
}
}

slog.Log(context.TODO(), logutil.LevelTrace, "decoded", "string", sb.String())
slog.Log(context.TODO(), logutil.LevelTrace, "decoded", "string", sb.String(), "from", lazyIdsString{ids: ids})
return sb.String(), nil
}
@@ -2,16 +2,30 @@ package input

import "github.com/ollama/ollama/ml"

// Multimodal is a multimodal embedding or a component of one.
// For example, it could be a row of an image that can be processed
// independently.
type Multimodal struct {
// Tensor is the embedding data. Implementations may choose what to
// store here or it may be nil if not needed. However, any ml.Tensor
// objects must be stored here and not in Data.
Tensor ml.Tensor

// Data is implementation-specific opaque data, such as metadata on how
// to layout Tensor. It may be nil if not needed. It may also store larger
// objects such as complete images if they are to be processed later.
Data any
}

// Input represents one token in the input stream
type Input struct {
// Token is a single element of text.
Token int32

// Multimodal is opaque data representing a non-text
// element such as an image (or part of one if the image
// can be processed in pieces). It may be either together
// with Token or on its own.
Multimodal any
// Multimodal represents a non-text element such as an
// image (or part of one if the image can be processed in pieces).
// It may be used either together with Token or on its own.
Multimodal []Multimodal

// MultimodalHash is a unique representation of the data
// stored in Multimodal, used for caching and comparing
@@ -32,7 +46,7 @@ type Input struct {
// Positions slice.
type MultimodalIndex struct {
Index int
Multimodal any
Multimodal []Multimodal
}

// Batch contains the inputs for a model forward pass
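With this change, multimodal encoders hand the runner a slice of Multimodal values rather than a single opaque `any`. A minimal sketch of the return shape an EncodeMultimodal implementation is expected to produce under the new API (the gemma3 hunks below show the single-tensor case; `visionOutputs` and `imageMetadata` are stand-ins for whatever the model computed):

```go
// visionOutputs is the projected image embedding computed earlier in EncodeMultimodal.
return []input.Multimodal{
	{Tensor: visionOutputs, Data: imageMetadata}, // Data is optional, model-specific metadata
}, nil
```

Models that can process an image in independent chunks may instead return one element per chunk, which is why the field is a slice.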
@@ -40,12 +40,13 @@ type MultimodalProcessor interface {
// EncodeMultimodal processes a single input (such as an image) and
// generates an output (typically an embedding) that can be used by the model.
//
// The return value is most typically an ml.Tensor, however, different
// type are possible, such as an object containing a tensor plus
// additional metadata, a slice of tensors or even just the original input.
// The return value is one or more tensors, each with optional model-specific
// opaque metadata. Typically, the tensors might be views into an embedding
// with each view representing a chunk of data that can be processed independently
// in different batches.
//
// The result may be cached by the runner.
EncodeMultimodal(ml.Context, []byte) (any, error)
EncodeMultimodal(ml.Context, []byte) ([]input.Multimodal, error)

// PostTokenize is called after tokenization to allow the model to edit the
// input stream to correctly arrange multimodal elements.
@@ -97,14 +98,8 @@ func Register(name string, f func(fs.Config) (Model, error)) {
}

// New initializes a new model instance with the provided configuration based on the metadata in the model file
func New(ctx context.Context, modelPath string, params ml.BackendParams) (Model, error) {
r, err := os.Open(modelPath)
if err != nil {
return nil, err
}
defer r.Close()

b, err := ml.NewBackend(ctx, r, params)
func New(modelPath string, params ml.BackendParams) (Model, error) {
b, err := ml.NewBackend(modelPath, params)
if err != nil {
return nil, err
}
@@ -133,7 +128,7 @@ func NewTextProcessor(s string) (TextProcessor, error) {
return nil, err
}
defer r.Close()
meta, _, err := fsggml.Decode(r, -1)
meta, err := fsggml.Decode(r, -1)
if err != nil {
return nil, err
}
@@ -292,11 +287,7 @@ func Forward(ctx ml.Context, m Model, inputs []int32, batch input.Batch) (ml.Ten
return nil, errors.New("batch size cannot be less than 1")
}

var err error
batch.Inputs, err = ctx.Input().FromIntSlice(inputs, len(inputs))
if err != nil {
return nil, err
}
batch.Inputs = ctx.Input().FromIntSlice(inputs, len(inputs))

cache := m.Config().Cache
if cache != nil {
@@ -7,6 +7,8 @@ import (
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/ml/nn/fast"
"github.com/ollama/ollama/ml/nn/rope"
"github.com/ollama/ollama/model"
"github.com/ollama/ollama/model/input"
)
@@ -43,10 +45,13 @@ func New(c fs.Config) (model.Model, error) {
Values: c.Strings("tokenizer.ggml.tokens"),
Scores: c.Floats("tokenizer.ggml.scores"),
Types: c.Ints("tokenizer.ggml.token_type"),
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")),
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")),
// TODO: set EOT to EOS otherwise 0 will stop generation
EOT: int32(c.Uint("tokenizer.ggml.eos_token_id")),
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
EOS: append(
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
c.Ints("tokenizer.ggml.eos_token_ids")...,
),
},
),
Layers: make([]Layer, c.Uint("block_count")),
@@ -80,11 +85,10 @@ type SelfAttention struct {

func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
batchSize := hiddenState.Dim(1)
ropeType := uint32(2)

q := sa.Query.Forward(ctx, hiddenState)
q = q.Reshape(ctx, opts.attnKeyLen, opts.numHeads, batchSize)
q = q.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, opts.ropeBase, opts.ropeScale)
q = fast.RoPE(ctx, q, positionIDs, opts.attnKeyLen, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())

if opts.largeModelScaling {
q = q.Scale(ctx, 1.0/math.Sqrt(float64(opts.hiddenSize/opts.numHeads)))
@@ -94,7 +98,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten

k := sa.Key.Forward(ctx, hiddenState)
k = k.Reshape(ctx, opts.attnKeyLen, opts.numKVHeads, batchSize)
k = k.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, opts.ropeBase, opts.ropeScale)
k = fast.RoPE(ctx, k, positionIDs, opts.attnKeyLen, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())

v := sa.Value.Forward(ctx, hiddenState)
v = v.Reshape(ctx, opts.attnValLen, opts.numKVHeads, batchSize)
@@ -124,7 +128,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
}

func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
return key.RoPE(ctx, shift, nil, uint32(m.Options.attnKeyLen), uint32(2), m.Options.ropeBase, m.Options.ropeScale), nil
return fast.RoPE(ctx, key, shift, m.Options.attnKeyLen, m.Options.ropeBase, m.Options.ropeScale, rope.WithTypeNeoX()), nil
}

type MLP struct {
@@ -171,15 +175,8 @@ func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Ten
}

func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
if err != nil {
return nil, err
}

outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
if err != nil {
return nil, err
}
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))

hiddenState := m.TokenEmbedding.Forward(ctx, batch.Inputs)
hiddenState = hiddenState.Scale(ctx, math.Sqrt(float64(m.Options.hiddenSize)))
@@ -60,12 +60,16 @@ func New(c fs.Config) (model.Model, error) {
Values: c.Strings("tokenizer.ggml.tokens"),
Scores: c.Floats("tokenizer.ggml.scores"),
Types: c.Ints("tokenizer.ggml.token_type"),
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")),
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
EOS: int32(1),
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
EOT: int32(106),
AddEOT: c.Bool("tokenizer.ggml.add_eot_token", false),
EOS: append(
[]int32{
int32(c.Uint("tokenizer.ggml.eos_token_id")),
int32(c.Uint("tokenizer.ggml.eot_token_id", 106)),
},
c.Ints("tokenizer.ggml.eos_token_ids")...,
),
},
),
ImageProcessor: newImageProcessor(c),
@@ -82,7 +86,7 @@ func New(c fs.Config) (model.Model, error) {
return &m, nil
}

func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
if len(m.VisionModel.Layers) == 0 {
return nil, model.ErrNoVisionModel
}
@@ -97,33 +101,30 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
return nil, err
}

pixelValues, err := ctx.Input().FromFloatSlice(f32s,
pixelValues := ctx.Input().FromFloatSlice(f32s,
m.ImageProcessor.imageSize,
m.ImageProcessor.imageSize,
m.ImageProcessor.numChannels,
)
if err != nil {
return nil, err
}

visionOutputs := m.VisionModel.Forward(ctx, pixelValues)
visionOutputs = m.MultiModalProjector.Forward(ctx, visionOutputs, m.imageSize, m.patchSize, m.VisionModel.eps)
return visionOutputs, nil
return []input.Multimodal{{Tensor: visionOutputs}}, nil
}

func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
var result []input.Input

for _, inp := range inputs {
if inp.Multimodal == nil {
if len(inp.Multimodal) == 0 {
result = append(result, inp)
} else {
inputMultimodal := inp.Multimodal.(ml.Tensor)
inputMultimodal := inp.Multimodal[0].Tensor

result = append(result,
input.Input{Token: 108, SameBatch: inputMultimodal.Dim(1) + 3}, // "\n\n"
input.Input{Token: 255999}, // "<start_of_image>""
input.Input{Multimodal: inputMultimodal, MultimodalHash: inp.MultimodalHash}, // image data is on the first placeholder
input.Input{Token: 108, SameBatch: inputMultimodal.Dim(1) + 3}, // "\n\n"
input.Input{Token: 255999}, // "<start_of_image>""
input.Input{Multimodal: []input.Multimodal{{Tensor: inputMultimodal}}, MultimodalHash: inp.MultimodalHash}, // image data is on the first placeholder
)

// add image token placeholders
@@ -140,15 +141,8 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
}

func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
if err != nil {
return nil, err
}

outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
if err != nil {
return nil, err
}
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))

return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache), nil
}
@@ -7,6 +7,8 @@ import (
|
||||
"github.com/ollama/ollama/kvcache"
|
||||
"github.com/ollama/ollama/ml"
|
||||
"github.com/ollama/ollama/ml/nn"
|
||||
"github.com/ollama/ollama/ml/nn/fast"
|
||||
"github.com/ollama/ollama/ml/nn/rope"
|
||||
"github.com/ollama/ollama/model/input"
|
||||
)
|
||||
|
||||
@@ -73,7 +75,6 @@ type TextSelfAttention struct {
|
||||
|
||||
func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *TextConfig) ml.Tensor {
|
||||
batchSize := hiddenState.Dim(1)
|
||||
ropeType := uint32(2)
|
||||
|
||||
ropeBase := opts.ropeLocalBase
|
||||
if (layer+1)%gemmaGlobalCacheCount == 0 {
|
||||
@@ -83,7 +84,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, pos
|
||||
q := sa.Query.Forward(ctx, hiddenState)
|
||||
q = q.Reshape(ctx, opts.attnKeyLen, opts.numHeads, batchSize)
|
||||
q = sa.QueryNorm.Forward(ctx, q, opts.eps)
|
||||
q = q.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, ropeBase, opts.ropeScale)
|
||||
q = fast.RoPE(ctx, q, positionIDs, opts.attnKeyLen, ropeBase, opts.ropeScale, rope.WithTypeNeoX())
|
||||
|
||||
if opts.largeModelScaling {
|
||||
q = q.Scale(ctx, 1.0/math.Sqrt(float64(opts.hiddenSize/opts.numHeads)))
|
||||
@@ -94,7 +95,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, pos
|
||||
k := sa.Key.Forward(ctx, hiddenState)
|
||||
k = k.Reshape(ctx, opts.attnKeyLen, opts.numKVHeads, batchSize)
|
||||
k = sa.KeyNorm.Forward(ctx, k, opts.eps)
|
||||
k = k.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, ropeBase, opts.ropeScale)
|
||||
k = fast.RoPE(ctx, k, positionIDs, opts.attnKeyLen, ropeBase, opts.ropeScale, rope.WithTypeNeoX())
|
||||
|
||||
v := sa.Value.Forward(ctx, hiddenState)
|
||||
v = v.Reshape(ctx, opts.attnValLen, opts.numKVHeads, batchSize)
|
||||
@@ -112,7 +113,7 @@ func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.T
|
||||
ropeBase = m.TextConfig.ropeGlobalBase
|
||||
}
|
||||
|
||||
return key.RoPE(ctx, shift, nil, uint32(m.TextConfig.attnKeyLen), uint32(2), ropeBase, m.TextConfig.ropeScale), nil
|
||||
return fast.RoPE(ctx, key, shift, m.TextConfig.attnKeyLen, ropeBase, m.TextConfig.ropeScale, rope.WithTypeNeoX()), nil
|
||||
}
|
||||
|
||||
type TextMLP struct {
|
||||
@@ -165,7 +166,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
|
||||
// set image embeddings
|
||||
var except []int
|
||||
for _, image := range batch.Multimodal {
|
||||
visionOutputs := image.Multimodal.(ml.Tensor)
|
||||
visionOutputs := image.Multimodal[0].Tensor
|
||||
ctx.Forward(visionOutputs.Copy(ctx, hiddenState.View(ctx, image.Index*hiddenState.Stride(1), visionOutputs.Dim(0)*visionOutputs.Dim(1))))
|
||||
|
||||
for i := range visionOutputs.Dim(1) {
|
||||
|
||||
@@ -1,22 +1,23 @@
|
||||
package llama
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"cmp"
|
||||
"math"
|
||||
"strings"
|
||||
|
||||
"github.com/ollama/ollama/fs"
|
||||
"github.com/ollama/ollama/kvcache"
|
||||
"github.com/ollama/ollama/ml"
|
||||
"github.com/ollama/ollama/ml/nn"
|
||||
"github.com/ollama/ollama/ml/nn/fast"
|
||||
"github.com/ollama/ollama/ml/nn/rope"
|
||||
"github.com/ollama/ollama/model"
|
||||
"github.com/ollama/ollama/model/input"
|
||||
)
|
||||
|
||||
type Options struct {
|
||||
hiddenSize, numHeads, numKVHeads int
|
||||
headDim, ropeDim int
|
||||
eps, ropeBase, ropeScale float32
|
||||
ropeDim uint32
|
||||
}
|
||||
|
||||
type Model struct {
|
||||
@@ -32,10 +33,6 @@ type Model struct {
|
||||
}
|
||||
|
||||
func New(c fs.Config) (model.Model, error) {
|
||||
if !strings.EqualFold(c.String("tokenizer.ggml.model"), "gpt2") {
|
||||
return nil, fmt.Errorf("tokenizer %s not yet supported", c.String("tokenizer.ggml.model"))
|
||||
}
|
||||
|
||||
m := Model{
|
||||
BytePairEncoding: model.NewBytePairEncoding(
|
||||
c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
|
||||
@@ -43,13 +40,13 @@ func New(c fs.Config) (model.Model, error) {
|
||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||
Merges: c.Strings("tokenizer.ggml.merges"),
|
||||
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")),
|
||||
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
|
||||
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")),
|
||||
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
|
||||
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
|
||||
// TODO: set EOT to EOS otherwise 0 will stop generation
|
||||
EOT: int32(c.Uint("tokenizer.ggml.eos_token_id")),
|
||||
AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
|
||||
EOS: append(
|
||||
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
|
||||
c.Ints("tokenizer.ggml.eos_token_ids")...,
|
||||
),
|
||||
},
|
||||
),
|
||||
Layers: make([]Layer, c.Uint("block_count")),
|
||||
@@ -57,10 +54,11 @@ func New(c fs.Config) (model.Model, error) {
|
||||
hiddenSize: int(c.Uint("embedding_length")),
|
||||
numHeads: int(c.Uint("attention.head_count")),
|
||||
numKVHeads: int(c.Uint("attention.head_count_kv")),
|
||||
headDim: int(c.Uint("attention.key_length")),
|
||||
ropeDim: int(c.Uint("rope.dimension_count")),
|
||||
eps: c.Float("attention.layer_norm_rms_epsilon"),
|
||||
ropeBase: c.Float("rope.freq_base"),
|
||||
ropeScale: c.Float("rope.freq_scale", 1),
|
||||
ropeDim: c.Uint("rope.dimension_count"),
|
||||
},
|
||||
}
|
||||
|
||||
@@ -77,31 +75,31 @@ type SelfAttention struct {
|
||||
RopeFactors ml.Tensor `gguf:"rope_freqs.weight"`
|
||||
}
|
||||
|
||||
func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
|
||||
func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
|
||||
batchSize := hiddenState.Dim(1)
|
||||
headDim := opts.hiddenSize / opts.numHeads
|
||||
ropeType := uint32(0)
|
||||
headDim := cmp.Or(opts.headDim, opts.hiddenSize/opts.numHeads)
|
||||
ropeDim := cmp.Or(opts.ropeDim, headDim)
|
||||
|
||||
q := sa.Query.Forward(ctx, hiddenState)
|
||||
q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
|
||||
q = q.RoPE(ctx, positionIDs, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
|
||||
query := sa.Query.Forward(ctx, hiddenState)
|
||||
query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
|
||||
|
||||
k := sa.Key.Forward(ctx, hiddenState)
|
||||
k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
||||
k = k.RoPE(ctx, positionIDs, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
|
||||
key := sa.Key.Forward(ctx, hiddenState)
|
||||
key = key.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
||||
|
||||
v := sa.Value.Forward(ctx, hiddenState)
|
||||
v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
||||
value := sa.Value.Forward(ctx, hiddenState)
|
||||
value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
||||
|
||||
scaleFactor := 1.0 / math.Sqrt(float64(headDim))
|
||||
kqv := nn.Attention(ctx, q, k, v, scaleFactor, cache)
|
||||
kqv = kqv.Reshape(ctx, opts.hiddenSize, batchSize)
|
||||
query = fast.RoPE(ctx, query, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
|
||||
key = fast.RoPE(ctx, key, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
|
||||
|
||||
return sa.Output.Forward(ctx, kqv)
|
||||
attention := nn.Attention(ctx, query, key, value, 1.0/math.Sqrt(float64(headDim)), cache)
|
||||
attention = attention.Reshape(ctx, headDim*opts.numHeads, batchSize)
|
||||
return sa.Output.Forward(ctx, attention)
|
||||
}
|
||||
|
||||
func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
|
||||
return key.RoPE(ctx, shift, m.Layers[layer].SelfAttention.RopeFactors, uint32(0), m.ropeDim, m.ropeBase, m.ropeScale), nil
|
||||
ropeDim := cmp.Or(m.ropeDim, m.hiddenSize/m.numHeads)
|
||||
return fast.RoPE(ctx, key, shift, ropeDim, m.ropeBase, m.ropeScale, rope.WithFactors(m.Layers[layer].SelfAttention.RopeFactors)), nil
|
||||
}
|
||||
|
||||
type MLP struct {
|
||||
@@ -122,11 +120,11 @@ type Layer struct {
|
||||
MLP *MLP
|
||||
}
|
||||
|
||||
func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
|
||||
func (l *Layer) Forward(ctx ml.Context, hiddenState, positions, outputs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
|
||||
residual := hiddenState
|
||||
|
||||
hiddenState = l.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
|
||||
hiddenState = l.SelfAttention.Forward(ctx, hiddenState, positionIDs, cache, opts)
|
||||
hiddenState = l.SelfAttention.Forward(ctx, hiddenState, positions, cache, opts)
|
||||
|
||||
// In the final layer (outputs != nil), optimize by pruning to just the token positions
|
||||
// we need logits for.
|
||||
@@ -144,27 +142,19 @@ func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Ten
|
||||
}
|
||||
|
||||
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
|
||||
positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
|
||||
|
||||
hiddenState := m.TokenEmbedding.Forward(ctx, batch.Inputs)
|
||||
|
||||
for i, layer := range m.Layers {
|
||||
m.Cache.SetLayer(i)
|
||||
|
||||
var lastLayerOutputs ml.Tensor
|
||||
var outputs ml.Tensor
|
||||
if i == len(m.Layers)-1 {
|
||||
lastLayerOutputs = outputs
|
||||
outputs = ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
|
||||
}
|
||||
|
||||
hiddenState = layer.Forward(ctx, hiddenState, positions, lastLayerOutputs, m.Cache, m.Options)
|
||||
hiddenState = layer.Forward(ctx, hiddenState, positions, outputs, m.Cache, m.Options)
|
||||
}
|
||||
|
||||
hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
|
||||
|
||||
@@ -4,7 +4,6 @@ import (
|
||||
"bytes"
|
||||
"image"
|
||||
"slices"
|
||||
"sync"
|
||||
|
||||
"github.com/ollama/ollama/fs"
|
||||
"github.com/ollama/ollama/kvcache"
|
||||
@@ -41,13 +40,13 @@ func New(c fs.Config) (model.Model, error) {
|
||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||
Merges: c.Strings("tokenizer.ggml.merges"),
|
||||
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")),
|
||||
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
|
||||
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")),
|
||||
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
|
||||
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
|
||||
// TODO: set EOT to EOS otherwise 0 will stop generation
|
||||
EOT: int32(c.Uint("tokenizer.ggml.eos_token_id")),
|
||||
AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
|
||||
EOS: append(
|
||||
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
|
||||
c.Ints("tokenizer.ggml.eos_token_ids")...,
|
||||
),
|
||||
},
|
||||
),
|
||||
ImageProcessor: newImageProcessor(c),
|
||||
@@ -63,7 +62,7 @@ func New(c fs.Config) (model.Model, error) {
|
||||
return &m, nil
|
||||
}
|
||||
|
||||
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
|
||||
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
|
||||
if len(m.VisionModel.Layers) < 1 {
|
||||
return nil, model.ErrNoVisionModel
|
||||
}
|
||||
@@ -78,10 +77,7 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
|
||||
return nil, err
|
||||
}
|
||||
|
||||
tilesLocal, err := ctx.Input().FromFloatSlice(pixelsLocal, size.X, size.Y, m.numChannels)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
tilesLocal := ctx.Input().FromFloatSlice(pixelsLocal, size.X, size.Y, m.numChannels)
|
||||
|
||||
ratioW, ratioH := size.X/m.imageSize, size.Y/m.imageSize
|
||||
|
||||
@@ -92,81 +88,86 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
|
||||
pixelValues := tilesLocal
|
||||
|
||||
if len(pixelsGlobal) > 0 {
|
||||
tilesGlobal, err := ctx.Input().FromFloatSlice(pixelsGlobal, m.imageSize, m.imageSize, m.numChannels)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
tilesGlobal := ctx.Input().FromFloatSlice(pixelsGlobal, m.imageSize, m.imageSize, m.numChannels)
|
||||
pixelValues = pixelValues.Concat(ctx, tilesGlobal, 3)
|
||||
}
|
||||
|
||||
visionOutputs := m.VisionModel.Forward(ctx, pixelValues)
|
||||
visionOutputs = visionOutputs.Reshape(ctx, visionOutputs.Dim(0), visionOutputs.Dim(1)*visionOutputs.Dim(2)*visionOutputs.Dim(3))
|
||||
projectedOutputs := m.Projector.Forward(ctx, visionOutputs)
|
||||
return &chunks{Model: m, Tensor: projectedOutputs, aspectRatio: image.Point{ratioW, ratioH}}, nil
|
||||
|
||||
var multimodal []input.Multimodal
|
||||
aspectRatio := image.Point{ratioW, ratioH}
|
||||
|
||||
var offset int
|
||||
patchesPerChunk := projectedOutputs.Dim(1)
|
||||
if aspectRatio.Y*aspectRatio.X > 1 {
|
||||
patchesPerChunk = projectedOutputs.Dim(1) / (aspectRatio.X*aspectRatio.Y + 1)
|
||||
|
||||
for range aspectRatio.Y {
|
||||
for x := range aspectRatio.X {
|
||||
view := projectedOutputs.View(ctx, projectedOutputs.Stride(1)*offset,
|
||||
projectedOutputs.Dim(0), projectedOutputs.Stride(1),
|
||||
patchesPerChunk)
|
||||
var separator separator
|
||||
if x < aspectRatio.X-1 {
|
||||
separator.x = true // <|tile_x_separator|>
|
||||
} else {
|
||||
separator.y = true // <|tile_y_separator|>
|
||||
}
|
||||
multimodal = append(multimodal, input.Multimodal{Tensor: view, Data: &separator})
|
||||
offset += patchesPerChunk
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
view := projectedOutputs.View(ctx, projectedOutputs.Stride(1)*offset,
|
||||
projectedOutputs.Dim(0), projectedOutputs.Stride(1),
|
||||
patchesPerChunk)
|
||||
multimodal = append(multimodal, input.Multimodal{Tensor: view, Data: &separator{}})
|
||||
|
||||
return multimodal, nil
|
||||
}
|
||||
|
||||
type chunks struct {
|
||||
*Model
|
||||
ml.Tensor
|
||||
aspectRatio image.Point
|
||||
|
||||
dataOnce sync.Once
|
||||
data []float32
|
||||
}
|
||||
|
||||
type chunk struct {
|
||||
*chunks
|
||||
s, n int
|
||||
}
|
||||
|
||||
func (r *chunk) floats() []float32 {
|
||||
r.dataOnce.Do(func() {
|
||||
temp := r.Backend().NewContext()
|
||||
defer temp.Close()
|
||||
temp.Forward(r.Tensor).Compute(r.Tensor)
|
||||
r.data = r.Floats()
|
||||
})
|
||||
|
||||
return r.data[r.s*r.Dim(0) : (r.s+r.n)*r.Dim(0)]
|
||||
type separator struct {
|
||||
x bool
|
||||
y bool
|
||||
}
|
||||
|
||||
func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
|
||||
var result []input.Input
|
||||
for _, inp := range inputs {
|
||||
if inp.Multimodal == nil {
|
||||
if len(inp.Multimodal) == 0 {
|
||||
result = append(result, inp)
|
||||
continue
|
||||
}
|
||||
|
||||
t := inp.Multimodal.(*chunks)
|
||||
var imageInputs []input.Input
|
||||
imageInputs = append(imageInputs, input.Input{Token: 200080}) // <|image_start|>
|
||||
|
||||
var offset int
|
||||
patchesPerChunk := t.Dim(1)
|
||||
if t.aspectRatio.Y*t.aspectRatio.X > 1 {
|
||||
patchesPerChunk = t.Dim(1) / (t.aspectRatio.X*t.aspectRatio.Y + 1)
|
||||
for i, mm := range inp.Multimodal {
|
||||
patchesPerChunk := mm.Tensor.Dim(1)
|
||||
|
||||
for range t.aspectRatio.Y {
|
||||
for x := range t.aspectRatio.X {
|
||||
imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: &chunk{t, offset, patchesPerChunk}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|>
|
||||
imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...)
|
||||
if x < t.aspectRatio.X-1 {
|
||||
imageInputs = append(imageInputs, input.Input{Token: 200084}) // <|tile_x_separator|>
|
||||
}
|
||||
offset += patchesPerChunk
|
||||
if i < len(inp.Multimodal)-1 {
|
||||
separator := mm.Data.(*separator)
|
||||
|
||||
imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: []input.Multimodal{{Tensor: mm.Tensor}}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|>
|
||||
imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...)
|
||||
|
||||
if separator.x {
|
||||
imageInputs = append(imageInputs, input.Input{Token: 200084}) // <|tile_x_separator|>
|
||||
}
|
||||
|
||||
imageInputs = append(imageInputs, input.Input{Token: 200085}) // <|tile_y_separator|>
|
||||
if separator.y {
|
||||
imageInputs = append(imageInputs, input.Input{Token: 200085}) // <|tile_y_separator|>
|
||||
}
|
||||
} else {
|
||||
imageInputs = append(imageInputs, input.Input{Token: 200090}) // <|image|>
|
||||
imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: []input.Multimodal{{Tensor: mm.Tensor}}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|>
|
||||
imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...)
|
||||
imageInputs = append(imageInputs, input.Input{Token: 200080}) // <|image_end|>
|
||||
}
|
||||
}
|
||||
|
||||
imageInputs = append(imageInputs, input.Input{Token: 200090}) // <|image|>
|
||||
imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: &chunk{t, offset, patchesPerChunk}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|>
|
||||
imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...)
|
||||
imageInputs = append(imageInputs, input.Input{Token: 200080}) // <|image_end|>
|
||||
|
||||
result = append(result, imageInputs...)
|
||||
}
|
||||
|
||||
@@ -174,15 +175,8 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
|
||||
}
|
||||
|
||||
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
|
||||
positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
|
||||
outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
|
||||
|
||||
return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache), nil
|
||||
}
|
||||
|
||||
@@ -8,6 +8,8 @@ import (
|
||||
"github.com/ollama/ollama/kvcache"
|
||||
"github.com/ollama/ollama/ml"
|
||||
"github.com/ollama/ollama/ml/nn"
|
||||
"github.com/ollama/ollama/ml/nn/fast"
|
||||
"github.com/ollama/ollama/ml/nn/rope"
|
||||
"github.com/ollama/ollama/model/input"
|
||||
)
|
||||
|
||||
@@ -31,8 +33,8 @@ func (sa *TextAttention) Forward(ctx ml.Context, hiddenStates, positions, attent
|
||||
value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
||||
|
||||
if useRope {
|
||||
query = query.RoPE(ctx, positions, sa.RopeFactors, uint32(opts.ropeDim), uint32(0), opts.ropeBase, opts.ropeScale)
|
||||
key = key.RoPE(ctx, positions, sa.RopeFactors, uint32(opts.ropeDim), uint32(0), opts.ropeBase, opts.ropeScale)
|
||||
query = fast.RoPE(ctx, query, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
|
||||
key = fast.RoPE(ctx, key, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
|
||||
}
|
||||
|
||||
if opts.useQKNorm {
|
||||
@@ -61,9 +63,9 @@ func (mlp *TextMLP) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *TextOp
|
||||
}
|
||||
|
||||
type TextExperts struct {
|
||||
Gate ml.Tensor `gguf:"ffn_gate_exps.weight"`
|
||||
Up ml.Tensor `gguf:"ffn_up_exps.weight"`
|
||||
Down ml.Tensor `gguf:"ffn_down_exps.weight"`
|
||||
Gate *nn.Linear `gguf:"ffn_gate_exps"`
|
||||
Up *nn.Linear `gguf:"ffn_up_exps"`
|
||||
Down *nn.Linear `gguf:"ffn_down_exps"`
|
||||
}
|
||||
|
||||
func (e *TextExperts) Forward(ctx ml.Context, hiddenStates, routerLogits ml.Tensor, opts *TextOptions) ml.Tensor {
|
||||
@@ -74,13 +76,13 @@ func (e *TextExperts) Forward(ctx ml.Context, hiddenStates, routerLogits ml.Tens
|
||||
hiddenStates = hiddenStates.Repeat(ctx, 1, opts.numExpertsUsed)
|
||||
hiddenStates = hiddenStates.Mul(ctx, scores)
|
||||
|
||||
upStates := e.Up.MulmatID(ctx, hiddenStates, experts)
|
||||
gateStates := e.Gate.MulmatID(ctx, hiddenStates, experts)
|
||||
downStates := e.Down.MulmatID(ctx, upStates.Mul(ctx, gateStates.SILU(ctx)), experts)
|
||||
upStates := e.Up.Weight.MulmatID(ctx, hiddenStates, experts)
|
||||
gateStates := e.Gate.Weight.MulmatID(ctx, hiddenStates, experts)
|
||||
downStates := e.Down.Weight.MulmatID(ctx, upStates.Mul(ctx, gateStates.SILU(ctx)), experts)
|
||||
|
||||
nextStates := downStates.View(ctx, 0, hiddenStates.Dim(0), downStates.Stride(2), hiddenStates.Dim(2))
|
||||
for i := 1; i < opts.numExpertsUsed; i++ {
|
||||
nextStates.Add(ctx, downStates.View(ctx, i*downStates.Stride(1), hiddenStates.Dim(0), downStates.Stride(2), hiddenStates.Dim(2)))
|
||||
nextStates = nextStates.Add(ctx, downStates.View(ctx, i*downStates.Stride(1), hiddenStates.Dim(0), downStates.Stride(2), hiddenStates.Dim(2)))
|
||||
}
|
||||
|
||||
return nextStates
|
||||
@@ -210,12 +212,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
|
||||
hiddenStates := m.TokenEmbedding.Forward(ctx, inputs).Duplicate(ctx)
|
||||
|
||||
for _, mi := range batch.Multimodal {
|
||||
f32s := mi.Multimodal.(*chunk).floats()
|
||||
img, err := ctx.Input().FromFloatSlice(f32s, len(f32s)/m.hiddenSize, m.hiddenSize)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
img := mi.Multimodal[0].Tensor
|
||||
ctx.Forward(img.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), img.Dim(0)*img.Dim(1))))
|
||||
}
|
||||
|
||||
@@ -226,11 +223,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
|
||||
scales[i] = float32(math.Log(math.Floor(((float64(p)+1.0)/float64(m.attentionFloorScale))+1.0))*m.attentionScale + 1.0)
|
||||
}
|
||||
|
||||
var err error
|
||||
attentionScales, err = ctx.Input().FromFloatSlice(scales, 1, 1, len(scales))
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
attentionScales = ctx.Input().FromFloatSlice(scales, 1, 1, len(scales))
|
||||
}
|
||||
|
||||
for i, layer := range m.Layers {
|
||||
@@ -255,5 +248,5 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
|
||||
}
|
||||
|
||||
func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
|
||||
return key.RoPE(ctx, shift, m.Layers[layer].Attention.RopeFactors, uint32(0), uint32(m.ropeDim), m.ropeBase, m.ropeScale), nil
|
||||
return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale, rope.WithFactors(m.Layers[layer].Attention.RopeFactors)), nil
|
||||
}
|
||||
|
||||
@@ -245,10 +245,7 @@ func (m *VisionModel) rotaryEmbedding(ctx ml.Context) (ml.Tensor, ml.Tensor) {
|
||||
}
|
||||
}
|
||||
|
||||
ropeFreqs, err := ctx.Input().FromFloatSlice(freqs, freqDim/2, numPatches, 2)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
ropeFreqs := ctx.Input().FromFloatSlice(freqs, freqDim/2, numPatches, 2)
|
||||
|
||||
ropeFreqs = ropeFreqs.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
|
||||
ropeFreqs = ropeFreqs.Reshape(ctx, freqDim, 1, numPatches)
|
||||
|
||||
@@ -4,7 +4,6 @@ import (
|
||||
"bytes"
|
||||
"image"
|
||||
"slices"
|
||||
"sync"
|
||||
|
||||
"github.com/ollama/ollama/fs"
|
||||
"github.com/ollama/ollama/kvcache"
|
||||
@@ -32,31 +31,26 @@ var _ model.MultimodalProcessor = (*Model)(nil)
|
||||
var _ model.TextProcessor = (*Model)(nil)
|
||||
|
||||
func New(c fs.Config) (model.Model, error) {
|
||||
textModel, err := NewTextModel(c)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
m := &Model{
|
||||
TextModel: textModel,
|
||||
VisionModel: newVisionModel(c),
|
||||
ImageProcessor: newImageProcessor(c),
|
||||
MultiModalProjector: newMultiModalProjector(c),
|
||||
BytePairEncoding: model.NewBytePairEncoding(
|
||||
c.String("tokenizer.ggml.pretokenizer", `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
|
||||
&model.Vocabulary{
|
||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||
Merges: c.Strings("tokenizer.ggml.merges"),
|
||||
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id", 1)),
|
||||
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
|
||||
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id", 2)),
|
||||
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
|
||||
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
|
||||
// TODO: set EOT to EOS otherwise 0 will stop generation
|
||||
EOT: int32(c.Uint("tokenizer.ggml.eos_token_id")),
|
||||
AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
|
||||
EOS: append(
|
||||
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
|
||||
c.Ints("tokenizer.ggml.eos_token_ids")...,
|
||||
),
|
||||
},
|
||||
),
|
||||
TextModel: newTextModel(c),
|
||||
VisionModel: newVisionModel(c),
|
||||
ImageProcessor: newImageProcessor(c),
|
||||
MultiModalProjector: newMultiModalProjector(c),
|
||||
}
|
||||
|
||||
m.Cache = kvcache.NewCausalCache(m.TextModel.Shift)
|
||||
@@ -105,7 +99,7 @@ func newMultiModalProjector(c fs.Config) *MultiModalProjector {
|
||||
}
|
||||
}
|
||||
|
||||
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
|
||||
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
|
||||
if len(m.VisionModel.Layers) == 0 {
|
||||
return nil, model.ErrNoVisionModel
|
||||
}
|
||||
@@ -120,46 +114,20 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
|
||||
return nil, err
|
||||
}
|
||||
|
||||
pixelValues, err := ctx.Input().FromFloatSlice(f32s, size.X, size.Y, m.ImageProcessor.numChannels)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
pixelValues := ctx.Input().FromFloatSlice(f32s, size.X, size.Y, m.ImageProcessor.numChannels)
|
||||
|
||||
visionOutputs := m.VisionModel.Forward(ctx, pixelValues)
|
||||
features, size := m.MultiModalProjector.Forward(ctx, visionOutputs, size)
|
||||
|
||||
// split into patches to be sent to the text transformer
|
||||
parent := imageFeatures{tensor: features}
|
||||
rows := make([]*imageRow, size.Y)
|
||||
rows := make([]input.Multimodal, size.Y)
|
||||
for i := range rows {
|
||||
rows[i] = &imageRow{parent: &parent, s: i, shape: []int{features.Dim(0), size.X}}
|
||||
rows[i].Tensor = features.View(ctx, features.Stride(1)*size.X*i, features.Dim(0), features.Stride(1), size.X)
|
||||
}
|
||||
|
||||
return rows, nil
|
||||
}
|
||||
|
||||
type imageFeatures struct {
|
||||
tensor ml.Tensor
|
||||
|
||||
dataOnce sync.Once
|
||||
data []float32
|
||||
}
|
||||
|
||||
type imageRow struct {
|
||||
parent *imageFeatures
|
||||
s int
|
||||
shape []int
|
||||
}
|
||||
|
||||
func (r *imageRow) data() []float32 {
|
||||
n := 1
|
||||
for _, s := range r.shape {
|
||||
n *= s
|
||||
}
|
||||
|
||||
return r.parent.data[r.s*n : (r.s+1)*n]
|
||||
}
|
||||
|
||||
// PostTokenize arranges Mistral 3's inputs for the forward pass
|
||||
// In Mistral 3 and Pixtral, the input patches are arranged as follows:
|
||||
// [IMG]...[IMG][IMG_BREAK][IMG]...[IMG][IMG_BREAK][IMG]...[IMG][IMG_END]
|
||||
@@ -168,15 +136,14 @@ func (r *imageRow) data() []float32 {
|
||||
func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
|
||||
var result []input.Input
|
||||
for _, inp := range inputs {
|
||||
if inp.Multimodal == nil {
|
||||
if len(inp.Multimodal) == 0 {
|
||||
result = append(result, inp)
|
||||
} else {
|
||||
inputMultimodal := inp.Multimodal.([]*imageRow)
|
||||
for i, row := range inputMultimodal {
|
||||
for i, row := range inp.Multimodal {
|
||||
// [IMG]
|
||||
result = append(result, input.Input{Token: 10, Multimodal: row, MultimodalHash: inp.MultimodalHash, SameBatch: row.shape[1]})
|
||||
result = append(result, slices.Repeat([]input.Input{{Token: 10}}, row.shape[1]-1)...)
|
||||
if i == len(inputMultimodal)-1 {
|
||||
result = append(result, input.Input{Token: 10, Multimodal: []input.Multimodal{{Tensor: row.Tensor}}, MultimodalHash: inp.MultimodalHash, SameBatch: row.Tensor.Dim(1)})
|
||||
result = append(result, slices.Repeat([]input.Input{{Token: 10}}, row.Tensor.Dim(1)-1)...)
|
||||
if i == len(inp.Multimodal)-1 {
|
||||
// [IMG_END]
|
||||
result = append(result, input.Input{Token: 13})
|
||||
} else {
|
||||
@@ -191,15 +158,8 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
|
||||
}
|
||||
|
||||
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
|
||||
positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
|
||||
outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
|
||||
|
||||
return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache), nil
|
||||
}
|
||||
|
||||
@@ -1,27 +1,24 @@
|
||||
package mistral3
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"cmp"
|
||||
"math"
|
||||
"strings"
|
||||
|
||||
"github.com/ollama/ollama/fs"
|
||||
"github.com/ollama/ollama/kvcache"
|
||||
"github.com/ollama/ollama/ml"
|
||||
"github.com/ollama/ollama/ml/nn"
|
||||
"github.com/ollama/ollama/model"
|
||||
"github.com/ollama/ollama/ml/nn/fast"
|
||||
"github.com/ollama/ollama/model/input"
|
||||
)
|
||||
|
||||
type TextOptions struct {
|
||||
hiddenSize, numHeads, numKVHeads, headDim int
|
||||
eps, ropeBase, ropeScale float32
|
||||
ropeDim uint32
|
||||
hiddenSize, numHeads, numKVHeads int
|
||||
headDim, ropeDim int
|
||||
eps, ropeBase, ropeScale float32
|
||||
}
|
||||
|
||||
type TextModel struct {
|
||||
model.Base
|
||||
|
||||
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
|
||||
Layers []Layer `gguf:"blk"`
|
||||
OutputNorm *nn.RMSNorm `gguf:"output_norm"`
|
||||
@@ -39,19 +36,15 @@ type SelfAttention struct {
|
||||
|
||||
func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
|
||||
batchSize := hiddenState.Dim(1)
|
||||
ropeType := uint32(0)
|
||||
headDim := opts.headDim
|
||||
if headDim == 0 {
|
||||
headDim = opts.hiddenSize / opts.numHeads
|
||||
}
|
||||
headDim := cmp.Or(opts.headDim, opts.hiddenSize/opts.numHeads)
|
||||
|
||||
q := sa.Query.Forward(ctx, hiddenState)
|
||||
q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
|
||||
q = q.RoPE(ctx, positionIDs, nil, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
|
||||
q = fast.RoPE(ctx, q, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale)
|
||||
|
||||
k := sa.Key.Forward(ctx, hiddenState)
|
||||
k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
||||
k = k.RoPE(ctx, positionIDs, nil, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
|
||||
k = fast.RoPE(ctx, k, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale)
|
||||
|
||||
v := sa.Value.Forward(ctx, hiddenState)
|
||||
v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
||||
@@ -62,7 +55,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
|
||||
}
|
||||
|
||||
func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
|
||||
return key.RoPE(ctx, shift, nil, uint32(0), m.ropeDim, m.ropeBase, m.ropeScale), nil
|
||||
return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale), nil
|
||||
}
|
||||
|
||||
type MLP struct {
|
||||
@@ -109,20 +102,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
|
||||
|
||||
// image embeddings
|
||||
for _, image := range batch.Multimodal {
|
||||
row := image.Multimodal.(*imageRow)
|
||||
row.parent.dataOnce.Do(func() {
|
||||
// use a new, throwaway context so the image tensor is not added to the graph
|
||||
temp := m.Backend().NewContext()
|
||||
temp.Forward(row.parent.tensor).Compute(row.parent.tensor)
|
||||
row.parent.data = row.parent.tensor.Floats()
|
||||
temp.Close()
|
||||
})
|
||||
|
||||
imageFeature, err := ctx.Input().FromFloatSlice(row.data(), row.shape...)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
imageFeature := image.Multimodal[0].Tensor
|
||||
ctx.Forward(imageFeature.Copy(ctx, hiddenState.View(ctx, image.Index*hiddenState.Stride(1), imageFeature.Dim(0)*imageFeature.Dim(1))))
|
||||
}
|
||||
|
||||
@@ -141,24 +121,18 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
|
||||
return m.Output.Forward(ctx, hiddenState)
|
||||
}
|
||||
|
||||
func NewTextModel(c fs.Config) (*TextModel, error) {
|
||||
if !strings.EqualFold(c.String("tokenizer.ggml.model"), "gpt2") {
|
||||
return nil, fmt.Errorf("tokenizer %s not yet supported", c.String("tokenizer.ggml.model"))
|
||||
}
|
||||
|
||||
textModel := &TextModel{
|
||||
func newTextModel(c fs.Config) *TextModel {
|
||||
return &TextModel{
|
||||
Layers: make([]Layer, c.Uint("block_count")),
|
||||
TextOptions: &TextOptions{
|
||||
hiddenSize: int(c.Uint("embedding_length")),
|
||||
numHeads: int(c.Uint("attention.head_count")),
|
||||
numKVHeads: int(c.Uint("attention.head_count_kv")),
|
||||
headDim: int(c.Uint("attention.key_length")),
|
||||
ropeDim: int(c.Uint("rope.dimension_count")),
|
||||
eps: c.Float("attention.layer_norm_rms_epsilon"),
|
||||
ropeBase: c.Float("rope.freq_base"),
|
||||
ropeScale: c.Float("rope.freq_scale", 1),
|
||||
ropeDim: c.Uint("rope.dimension_count"),
|
||||
},
|
||||
}
|
||||
|
||||
return textModel, nil
|
||||
}
|
||||
|
||||
@@ -110,15 +110,8 @@ func (m *VisionModel) positionalEmbedding(ctx ml.Context, positionIDs ml.Tensor)
|
||||
}
|
||||
}
|
||||
|
||||
h, err := ctx.Input().FromFloatSlice(frequenciesHeight, maxPatchesPerSide, frequencies/2)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
w, err := ctx.Input().FromFloatSlice(frequenciesWidth, maxPatchesPerSide, frequencies/2)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
h := ctx.Input().FromFloatSlice(frequenciesHeight, maxPatchesPerSide, frequencies/2)
|
||||
w := ctx.Input().FromFloatSlice(frequenciesWidth, maxPatchesPerSide, frequencies/2)
|
||||
|
||||
h = h.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
|
||||
w = w.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
|
||||
@@ -151,10 +144,7 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor {
|
||||
}
|
||||
}
|
||||
|
||||
positionIDs, err := ctx.Input().FromIntSlice(positions, len(positions))
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
positionIDs := ctx.Input().FromIntSlice(positions, len(positions))
|
||||
|
||||
positionEmbedding := m.positionalEmbedding(ctx, positionIDs)
|
||||
cos, sin := positionEmbedding.Cos(ctx), positionEmbedding.Sin(ctx)
|
||||
@@ -170,7 +160,7 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor {
|
||||
|
||||
func newVisionModel(c fs.Config) *VisionModel {
|
||||
return &VisionModel{
|
||||
Layers: make([]VisionEncoderLayer, c.Uint("vision.block_count", 24)),
|
||||
Layers: make([]VisionEncoderLayer, c.Uint("vision.block_count")),
|
||||
VisionModelOptions: &VisionModelOptions{
|
||||
hiddenSize: int(c.Uint("vision.embedding_length", 1024)),
|
||||
numHeads: int(c.Uint("vision.attention.head_count", 16)),
|
||||
|
||||
@@ -3,6 +3,7 @@ package mllama
|
||||
import (
|
||||
"bytes"
|
||||
"image"
|
||||
"slices"
|
||||
|
||||
"github.com/ollama/ollama/fs"
|
||||
"github.com/ollama/ollama/kvcache"
|
||||
@@ -37,13 +38,13 @@ func New(c fs.Config) (model.Model, error) {
|
||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||
Merges: c.Strings("tokenizer.ggml.merges"),
|
||||
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")),
|
||||
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
|
||||
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")),
|
||||
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
|
||||
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
|
||||
// TODO: set EOT to EOS otherwise 0 will stop generation
|
||||
EOT: int32(c.Uint("tokenizer.ggml.eos_token_id")),
|
||||
AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
|
||||
EOS: append(
|
||||
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
|
||||
c.Ints("tokenizer.ggml.eos_token_ids")...,
|
||||
),
|
||||
},
|
||||
),
|
||||
ImageProcessor: newImageProcessor(c),
|
||||
@@ -58,7 +59,7 @@ func New(c fs.Config) (model.Model, error) {
|
||||
return &m, nil
|
||||
}
|
||||
|
||||
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
|
||||
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
|
||||
if len(m.VisionModel.Transformer.Layers) == 0 || len(m.GlobalTransformer.Layers) == 0 {
|
||||
return nil, model.ErrNoVisionModel
|
||||
}
|
||||
@@ -73,21 +74,20 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
|
||||
return nil, err
|
||||
}
|
||||
|
||||
pixelValues, err := ctx.Input().FromFloatSlice(f32s, m.imageSize, m.imageSize, m.numChannels, ratio.numTiles())
|
||||
if err != nil {
|
||||
return nil, err
|
||||
if ratio.numTiles() < m.maxNumTiles {
|
||||
// Pad tiles to maxNumTiles
|
||||
f32s = slices.Grow(f32s, m.imageSize*m.imageSize*m.numChannels*m.maxNumTiles)
|
||||
f32s = f32s[:m.imageSize*m.imageSize*m.numChannels*m.maxNumTiles]
|
||||
}
|
||||
|
||||
pixelValues = pixelValues.Pad(ctx, 0, 0, 0, m.ImageProcessor.maxNumTiles-ratio.numTiles())
|
||||
|
||||
aspectRatio, err := ctx.Input().FromIntSlice([]int32{int32(ratio.rank)}, 1)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
pixelValues := ctx.Input().FromFloatSlice(f32s, m.imageSize, m.imageSize, m.numChannels, m.maxNumTiles)
|
||||
aspectRatio := ctx.Input().FromIntSlice([]int32{int32(ratio.rank)}, 1)
|
||||
|
||||
positionIDs := ctx.Arange(0, 1601, 1, ml.DTypeI32)
|
||||
crossAttentionStates := m.VisionModel.Forward(ctx, pixelValues, positionIDs, aspectRatio)
|
||||
return m.Projector.Forward(ctx, crossAttentionStates), nil
|
||||
projectedOutputs := m.Projector.Forward(ctx, crossAttentionStates)
|
||||
|
||||
return []input.Multimodal{{Tensor: projectedOutputs}}, nil
|
||||
}
|
||||
|
||||
func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
|
||||
@@ -103,18 +103,11 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
|
||||
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
|
||||
var crossAttentionStates ml.Tensor
|
||||
if len(batch.Multimodal) > 0 {
|
||||
crossAttentionStates = batch.Multimodal[len(batch.Multimodal)-1].Multimodal.(ml.Tensor)
|
||||
crossAttentionStates = batch.Multimodal[len(batch.Multimodal)-1].Multimodal[0].Tensor
|
||||
}
|
||||
|
||||
positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
|
||||
outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
|
||||
|
||||
// TODO: attention mask, cross attention mask
|
||||
return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, crossAttentionStates, nil, m.Cache.(*kvcache.WrapperCache)), nil
|
||||
|
||||
@@ -8,6 +8,8 @@ import (
|
||||
"github.com/ollama/ollama/kvcache"
|
||||
"github.com/ollama/ollama/ml"
|
||||
"github.com/ollama/ollama/ml/nn"
|
||||
"github.com/ollama/ollama/ml/nn/fast"
|
||||
"github.com/ollama/ollama/ml/nn/rope"
|
||||
)
|
||||
|
||||
type TextSelfAttention struct {
|
||||
@@ -21,15 +23,14 @@ type TextSelfAttention struct {
|
||||
func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
|
||||
batchSize := hiddenState.Dim(1)
|
||||
headDim := opts.hiddenSize / opts.numHeads
|
||||
ropeType := uint32(0)
|
||||
|
||||
query := sa.Query.Forward(ctx, hiddenState)
|
||||
query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
|
||||
query = query.RoPE(ctx, positions, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
|
||||
query = fast.RoPE(ctx, query, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
|
||||
|
||||
key := sa.Key.Forward(ctx, hiddenState)
|
||||
key = key.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
||||
key = key.RoPE(ctx, positions, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
|
||||
key = fast.RoPE(ctx, key, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
|
||||
|
||||
value := sa.Value.Forward(ctx, hiddenState)
|
||||
value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
||||
@@ -44,7 +45,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.T
|
||||
func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
|
||||
// This will only get called for layers in the cache, which are just the self attention layers
|
||||
if sa, ok := m.Transformer.Layers[layer].(*TextSelfAttentionDecoderLayer); ok {
|
||||
return key.RoPE(ctx, shift, sa.SelfAttention.RopeFactors, m.ropeDim, uint32(0), m.ropeBase, m.ropeScale), nil
|
||||
return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale, rope.WithFactors(sa.SelfAttention.RopeFactors)), nil
|
||||
}
|
||||
|
||||
return key, nil
|
||||
@@ -199,8 +200,8 @@ func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs,
|
||||
|
||||
type TextModelOptions struct {
|
||||
hiddenSize, numHeads, numKVHeads int
|
||||
ropeDim int
|
||||
eps, ropeBase, ropeScale float32
|
||||
ropeDim uint32
|
||||
|
||||
crossAttentionLayers []int32
|
||||
}
|
||||
@@ -240,10 +241,10 @@ func newTextModel(c fs.Config) *TextModel {
|
||||
hiddenSize: int(c.Uint("embedding_length")),
|
||||
numHeads: int(c.Uint("attention.head_count")),
|
||||
numKVHeads: int(c.Uint("attention.head_count_kv")),
|
||||
ropeDim: int(c.Uint("rope.dimension_count")),
|
||||
eps: c.Float("attention.layer_norm_rms_epsilon"),
|
||||
ropeBase: c.Float("rope.freq_base"),
|
||||
ropeScale: c.Float("rope.freq_scale", 1),
|
||||
ropeDim: c.Uint("rope.dimension_count"),
|
||||
crossAttentionLayers: c.Ints("attention.cross_attention_layers"),
|
||||
},
|
||||
}
|
||||
|
||||
@@ -16,8 +16,6 @@ type VisionSelfAttention struct {
|
||||
Key *nn.Linear `gguf:"attn_k"`
|
||||
Value *nn.Linear `gguf:"attn_v"`
|
||||
Output *nn.Linear `gguf:"attn_output"`
|
||||
|
||||
Gate ml.Tensor `gguf:"attn_gate"`
|
||||
}
|
||||
|
||||
func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
|
||||
@@ -25,27 +23,16 @@ func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState ml.Tensor, op
|
||||
|
||||
query := sa.Query.Forward(ctx, hiddenState)
|
||||
query = query.Reshape(ctx, headDim, opts.numHeads, query.Dim(1), batchSize)
|
||||
query = query.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
|
||||
|
||||
key := sa.Key.Forward(ctx, hiddenState)
|
||||
key = key.Reshape(ctx, headDim, opts.numHeads, key.Dim(1), batchSize)
|
||||
key = key.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
|
||||
|
||||
value := sa.Value.Forward(ctx, hiddenState)
|
||||
value = value.Reshape(ctx, headDim, opts.numHeads, value.Dim(1), batchSize)
|
||||
value = value.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
|
||||
|
||||
scores := key.Mulmat(ctx, query)
|
||||
scores = scores.Scale(ctx, 1.0/math.Sqrt(float64(headDim)))
|
||||
scores = scores.Softmax(ctx)
|
||||
|
||||
attention := value.Mulmat(ctx, scores)
|
||||
attention = attention.Reshape(ctx, headDim, attention.Dim(1), opts.numHeads, batchSize)
|
||||
attention = attention.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
|
||||
attention := nn.Attention(ctx, query, key, value, 1./math.Sqrt(float64(headDim)), nil)
|
||||
attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), batchSize)
|
||||
|
||||
hiddenState = sa.Output.Forward(ctx, attention)
|
||||
return hiddenState
|
||||
return sa.Output.Forward(ctx, attention)
|
||||
}
|
||||
|
||||
type VisionMLP struct {
|
||||
@@ -76,21 +63,18 @@ func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState ml.Tensor, opts
|
||||
// self attention
|
||||
hiddenState = e.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
|
||||
hiddenState = e.SelfAttention.Forward(ctx, hiddenState, opts)
|
||||
|
||||
if e.AttentionGate != nil {
|
||||
hiddenState = hiddenState.Mul(ctx, e.AttentionGate)
|
||||
}
|
||||
hiddenState = hiddenState.Add(ctx, residual)
|
||||
residual = hiddenState
|
||||
|
||||
// feed forward
|
||||
hiddenState = e.MLPNorm.Forward(ctx, hiddenState, opts.eps)
|
||||
hiddenState = e.MLP.Forward(ctx, hiddenState, opts)
|
||||
hiddenState = hiddenState.Add(ctx, residual)
|
||||
if e.MLPGate != nil {
|
||||
hiddenState = hiddenState.Mul(ctx, e.MLPGate)
|
||||
}
|
||||
|
||||
hiddenState = hiddenState.Add(ctx, residual)
|
||||
return hiddenState
|
||||
}
|
||||
|
||||
|
||||
@@ -7,4 +7,7 @@ import (
 	_ "github.com/ollama/ollama/model/models/llama4"
 	_ "github.com/ollama/ollama/model/models/mistral3"
 	_ "github.com/ollama/ollama/model/models/mllama"
+	_ "github.com/ollama/ollama/model/models/qwen2"
+	_ "github.com/ollama/ollama/model/models/qwen25vl"
+	_ "github.com/ollama/ollama/model/models/qwen3"
 )
model/models/qwen2/model.go (new file, 164 lines)
@@ -0,0 +1,164 @@
|
||||
package qwen2
|
||||
|
||||
import (
|
||||
"cmp"
|
||||
"math"
|
||||
|
||||
"github.com/ollama/ollama/fs"
|
||||
"github.com/ollama/ollama/kvcache"
|
||||
"github.com/ollama/ollama/ml"
|
||||
"github.com/ollama/ollama/ml/nn"
|
||||
"github.com/ollama/ollama/ml/nn/fast"
|
||||
"github.com/ollama/ollama/ml/nn/rope"
|
||||
"github.com/ollama/ollama/model"
|
||||
"github.com/ollama/ollama/model/input"
|
||||
)
|
||||
|
||||
type Options struct {
|
||||
hiddenSize, numHeads, numKVHeads int
|
||||
headDim, ropeDim int
|
||||
eps, ropeBase, ropeScale float32
|
||||
}
|
||||
|
||||
type Attention struct {
|
||||
Query *nn.Linear `gguf:"attn_q"`
|
||||
Key *nn.Linear `gguf:"attn_k"`
|
||||
Value *nn.Linear `gguf:"attn_v"`
|
||||
Output *nn.Linear `gguf:"attn_output"`
|
||||
}
|
||||
|
||||
func (attn Attention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
|
||||
batchSize := hiddenStates.Dim(1)
|
||||
headDim := cmp.Or(opts.headDim, opts.hiddenSize/opts.numHeads)
|
||||
ropeDim := cmp.Or(opts.ropeDim, headDim)
|
||||
|
||||
query := attn.Query.Forward(ctx, hiddenStates)
|
||||
query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
|
||||
|
||||
key := attn.Key.Forward(ctx, hiddenStates)
|
||||
key = key.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
||||
|
||||
value := attn.Value.Forward(ctx, hiddenStates)
|
||||
value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
||||
|
||||
query = fast.RoPE(ctx, query, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
|
||||
key = fast.RoPE(ctx, key, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
|
||||
|
||||
attention := nn.Attention(ctx, query, key, value, 1.0/math.Sqrt(float64(headDim)), cache)
|
||||
attention = attention.Reshape(ctx, headDim*opts.numHeads, batchSize)
|
||||
|
||||
return attn.Output.Forward(ctx, attention)
|
||||
}
|
||||
|
||||
type MLP struct {
|
||||
Gate *nn.Linear `gguf:"ffn_gate"`
|
||||
Up *nn.Linear `gguf:"ffn_up"`
|
||||
Down *nn.Linear `gguf:"ffn_down"`
|
||||
}
|
||||
|
||||
func (mlp MLP) Forward(ctx ml.Context, hiddenStates ml.Tensor) ml.Tensor {
|
||||
hiddenStates = mlp.Gate.Forward(ctx, hiddenStates).SILU(ctx).Mul(ctx, mlp.Up.Forward(ctx, hiddenStates))
|
||||
return mlp.Down.Forward(ctx, hiddenStates)
|
||||
}
|
||||
|
||||
type DecoderLayer struct {
|
||||
AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
|
||||
Attention *Attention
|
||||
MLPNorm *nn.RMSNorm `gguf:"ffn_norm"`
|
||||
MLP *MLP
|
||||
}
|
||||
|
||||
func (d DecoderLayer) Forward(ctx ml.Context, hiddenStates, positions, outputs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
|
||||
residual := hiddenStates
|
||||
|
||||
hiddenStates = d.AttentionNorm.Forward(ctx, hiddenStates, opts.eps)
|
||||
hiddenStates = d.Attention.Forward(ctx, hiddenStates, positions, cache, opts)
|
||||
if outputs != nil {
|
||||
hiddenStates = hiddenStates.Rows(ctx, outputs)
|
||||
residual = residual.Rows(ctx, outputs)
|
||||
}
|
||||
|
||||
hiddenStates = hiddenStates.Add(ctx, residual)
|
||||
residual = hiddenStates
|
||||
|
||||
hiddenStates = d.MLPNorm.Forward(ctx, hiddenStates, opts.eps)
|
||||
hiddenStates = d.MLP.Forward(ctx, hiddenStates)
|
||||
return hiddenStates.Add(ctx, residual)
|
||||
}
|
||||
|
||||
type Model struct {
|
||||
model.Base
|
||||
model.BytePairEncoding
|
||||
|
||||
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
|
||||
Layers []DecoderLayer `gguf:"blk"`
|
||||
OutputNorm *nn.RMSNorm `gguf:"output_norm"`
|
||||
Output *nn.Linear `gguf:"output,alt:token_embd"`
|
||||
|
||||
Options
|
||||
}
|
||||
|
||||
// Forward implements model.Model.
|
||||
func (m Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
|
||||
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
|
||||
|
||||
hiddenStates := m.TokenEmbedding.Forward(ctx, batch.Inputs)
|
||||
|
||||
for i, layer := range m.Layers {
|
||||
m.Cache.SetLayer(i)
|
||||
|
||||
var outputs ml.Tensor
|
||||
if i == len(m.Layers)-1 {
|
||||
outputs = ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
|
||||
}
|
||||
|
||||
hiddenStates = layer.Forward(ctx, hiddenStates, positions, outputs, m.Cache, &m.Options)
|
||||
}
|
||||
|
||||
hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, m.eps)
|
||||
hiddenStates = m.Output.Forward(ctx, hiddenStates)
|
||||
return hiddenStates, nil
|
||||
}
|
||||
|
||||
func (m Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
|
||||
ropeDim := cmp.Or(m.ropeDim, m.hiddenSize/m.numHeads)
|
||||
return fast.RoPE(ctx, key, shift, ropeDim, m.ropeBase, m.ropeScale, rope.WithTypeNeoX()), nil
|
||||
}
|
||||
|
||||
func New(c fs.Config) (model.Model, error) {
|
||||
m := Model{
|
||||
Layers: make([]DecoderLayer, c.Uint("block_count")),
|
||||
BytePairEncoding: model.NewBytePairEncoding(
|
||||
c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
|
||||
&model.Vocabulary{
|
||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||
Merges: c.Strings("tokenizer.ggml.merges"),
|
||||
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
|
||||
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
|
||||
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
|
||||
EOS: append(
|
||||
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
|
||||
c.Ints("tokenizer.ggml.eos_token_ids")...,
|
||||
),
|
||||
},
|
||||
),
|
||||
Options: Options{
|
||||
hiddenSize: int(c.Uint("embedding_length")),
|
||||
numHeads: int(c.Uint("attention.head_count")),
|
||||
numKVHeads: int(c.Uint("attention.head_count_kv")),
|
||||
headDim: int(c.Uint("attention.key_length")),
|
||||
ropeDim: int(c.Uint("rope.dimension_count")),
|
||||
ropeBase: c.Float("rope.freq_base"),
|
||||
ropeScale: c.Float("rope.freq_scale", 1),
|
||||
eps: c.Float("attention.layer_norm_rms_epsilon"),
|
||||
},
|
||||
}
|
||||
|
||||
m.Cache = kvcache.NewCausalCache(m.Shift)
|
||||
return &m, nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
model.Register("qwen2", New)
|
||||
}
|
||||
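The new qwen2 package registers itself through `model.Register("qwen2", New)` in its `init` function shown above. A minimal sketch of how that registration becomes reachable, mirroring the blank-import list updated earlier in this diff (the package clause here is illustrative):

```go
// Importing the package for side effects runs its init, which calls
// model.Register("qwen2", New) as shown in the file above.
package models

import (
	_ "github.com/ollama/ollama/model/models/qwen2"
)
```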
model/models/qwen25vl/model.go (new file, 150 lines)
@@ -0,0 +1,150 @@
|
||||
package qwen25vl
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"image"
|
||||
"slices"
|
||||
|
||||
"github.com/ollama/ollama/fs"
|
||||
"github.com/ollama/ollama/kvcache"
|
||||
"github.com/ollama/ollama/ml"
|
||||
"github.com/ollama/ollama/model"
|
||||
"github.com/ollama/ollama/model/input"
|
||||
)
|
||||
|
||||
type Model struct {
|
||||
model.Base
|
||||
model.BytePairEncoding
|
||||
|
||||
*TextModel
|
||||
*VisionModel `gguf:"v,vision"`
|
||||
|
||||
ImageProcessor
|
||||
}
|
||||
|
||||
// Implement MultimodalProcessor interface
|
||||
var _ model.MultimodalProcessor = (*Model)(nil)
|
||||
|
||||
func New(c fs.Config) (model.Model, error) {
|
||||
m := &Model{
|
||||
BytePairEncoding: model.NewBytePairEncoding(
|
||||
c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
|
||||
&model.Vocabulary{
|
||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||
Merges: c.Strings("tokenizer.ggml.merges"),
|
||||
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
|
||||
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
|
||||
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
|
||||
EOS: append(
|
||||
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
|
||||
c.Ints("tokenizer.ggml.eos_token_ids")...,
|
||||
),
|
||||
},
|
||||
),
|
||||
TextModel: NewTextModel(c),
|
||||
VisionModel: newVisionModel(c),
|
||||
ImageProcessor: newImageProcessor(c),
|
||||
}
|
||||
|
||||
m.Cache = kvcache.NewCausalCache(m.TextModel.Shift)
|
||||
|
||||
return m, nil
|
||||
}
|
||||
|
||||
func (m *Model) PixelValues(ctx ml.Context, multimodalData []byte) (ml.Tensor, *Grid, error) {
|
||||
image, _, err := image.Decode(bytes.NewReader(multimodalData))
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
f32s, grid, err := m.ImageProcessor.ProcessImage(image)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
// Calculate tensor dimensions
|
||||
patchDim := m.ImageProcessor.numChannels * m.ImageProcessor.temporalPatchSize *
|
||||
m.ImageProcessor.patchSize * m.ImageProcessor.patchSize
|
||||
numPatches := grid.Temporal * grid.Height * grid.Width
|
||||
|
||||
pixelValues := ctx.Input().FromFloatSlice(f32s, patchDim, numPatches)
|
||||
|
||||
return pixelValues, grid, nil
|
||||
}
|
||||
|
||||
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
|
||||
if len(m.VisionModel.Layers) == 0 {
|
||||
return nil, model.ErrNoVisionModel
|
||||
}
|
||||
|
||||
pixels, grid, err := m.PixelValues(ctx, multimodalData)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
visionOutputs := m.VisionModel.Forward(ctx, pixels, grid)
|
||||
return []input.Multimodal{{Tensor: visionOutputs}}, nil
|
||||
}
|
||||
|
||||
// PostTokenize arranges Qwen-2.5-VL's inputs for the forward pass
|
||||
func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
|
||||
var result []input.Input
|
||||
|
||||
var (
|
||||
imageToken int32 = 151655
|
||||
visionStartToken int32 = 151652
|
||||
visionEndToken int32 = 151653
|
||||
)
|
||||
|
||||
nImg := 0
|
||||
for _, inp := range inputs {
|
||||
if inp.Multimodal == nil {
|
||||
// If not a multimodal input, add it to the result unchanged
|
||||
result = append(result, inp)
|
||||
} else {
|
||||
// Adding the 'Picture' prefix is a hack; at the time of writing there is no way to prefix
// the image tokens with a prompt, so we add one here
|
||||
nImg++
|
||||
pre, err := m.Encode(fmt.Sprintf(" Picture %d: ", nImg), true)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to encode image prompt: %w", err)
|
||||
}
|
||||
for i := range pre {
|
||||
result = append(result, input.Input{Token: pre[i]})
|
||||
}
|
||||
|
||||
patchesPerChunk := inp.Multimodal[0].Tensor.Dim(1)
|
||||
|
||||
// First add the vision start token
|
||||
result = append(result, input.Input{Token: visionStartToken})
|
||||
|
||||
// Add the image token with the multimodal tensor data at the first position
|
||||
result = append(result, input.Input{
|
||||
Token: imageToken,
|
||||
Multimodal: inp.Multimodal,
|
||||
MultimodalHash: inp.MultimodalHash,
|
||||
SameBatch: patchesPerChunk,
|
||||
})
|
||||
|
||||
// Add the placeholder tokens for the remaining positions (tokensPerGrid-1)
|
||||
result = append(result, slices.Repeat([]input.Input{{Token: imageToken}}, patchesPerChunk-1)...)
|
||||
|
||||
result = append(result, input.Input{Token: visionEndToken})
|
||||
}
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
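For reference, the layout this produces for a single image can be sketched without any of the model plumbing. The sketch below is illustrative only: the special-token IDs are the literals used above, while the prefix tokens and the patch count are hypothetical stand-ins.

```go
package main

import "fmt"

func main() {
	const (
		imageToken       int32 = 151655
		visionStartToken int32 = 151652
		visionEndToken   int32 = 151653
	)

	prefix := []int32{9, 10, 11} // hypothetical stand-ins for the encoded " Picture 1: " tokens
	patchesPerChunk := 4         // hypothetical; in the model this is the vision tensor's second dimension

	var layout []int32
	layout = append(layout, prefix...)
	layout = append(layout, visionStartToken)
	// one image token per patch; only the first of them carries the multimodal tensor
	for range patchesPerChunk {
		layout = append(layout, imageToken)
	}
	layout = append(layout, visionEndToken)

	fmt.Println(layout) // [9 10 11 151652 151655 151655 151655 151655 151653]
}
```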
|
||||
|
||||
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
|
||||
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
|
||||
outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
|
||||
|
||||
return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache)
|
||||
}
|
||||
|
||||
func init() {
|
||||
model.Register("qwen25vl", New)
|
||||
}
|
||||
model/models/qwen25vl/model_text.go (new file, 151 lines)
@@ -0,0 +1,151 @@
|
||||
package qwen25vl
|
||||
|
||||
import (
|
||||
"math"
|
||||
|
||||
"github.com/ollama/ollama/fs"
|
||||
"github.com/ollama/ollama/kvcache"
|
||||
"github.com/ollama/ollama/ml"
|
||||
"github.com/ollama/ollama/ml/nn"
|
||||
"github.com/ollama/ollama/ml/nn/fast"
|
||||
"github.com/ollama/ollama/ml/nn/rope"
|
||||
"github.com/ollama/ollama/model/input"
|
||||
)
|
||||
|
||||
type TextOptions struct {
|
||||
hiddenSize, numHeads, numKVHeads int
|
||||
ropeDim, originalContextLength int
|
||||
eps, ropeBase, ropeScale float32
|
||||
}
|
||||
|
||||
type TextModel struct {
|
||||
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
|
||||
Layers []Layer `gguf:"blk"`
|
||||
OutputNorm *nn.RMSNorm `gguf:"output_norm"`
|
||||
Output *nn.Linear `gguf:"output,alt:token_embd"`
|
||||
|
||||
*TextOptions
|
||||
}
|
||||
|
||||
func NewTextModel(c fs.Config) *TextModel {
|
||||
m := TextModel{
|
||||
Layers: make([]Layer, c.Uint("block_count")),
|
||||
TextOptions: &TextOptions{
|
||||
hiddenSize: int(c.Uint("embedding_length")),
|
||||
numHeads: int(c.Uint("attention.head_count")),
|
||||
numKVHeads: int(c.Uint("attention.head_count_kv")),
|
||||
ropeDim: int(c.Uint("rope.dimension_count", 128)),
|
||||
originalContextLength: int(c.Uint("context_length", 128000)),
|
||||
eps: c.Float("attention.layer_norm_rms_epsilon"),
|
||||
ropeBase: c.Float("rope.freq_base"),
|
||||
ropeScale: c.Float("rope.freq_scale", 1),
|
||||
},
|
||||
}
|
||||
|
||||
return &m
|
||||
}
|
||||
|
||||
// SelfAttention implements the multi-head self-attention mechanism
|
||||
// with separate projections for query, key, value and output transformations
|
||||
type SelfAttention struct {
|
||||
Query *nn.Linear `gguf:"attn_q"`
|
||||
Key *nn.Linear `gguf:"attn_k"`
|
||||
Value *nn.Linear `gguf:"attn_v"`
|
||||
Output *nn.Linear `gguf:"attn_output"`
|
||||
}
|
||||
|
||||
func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
|
||||
batchSize := hiddenState.Dim(1)
|
||||
headDim := opts.hiddenSize / opts.numHeads
|
||||
|
||||
q := sa.Query.Forward(ctx, hiddenState)
|
||||
q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
|
||||
q = fast.RoPE(ctx, q, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithOriginalContextLength(opts.originalContextLength), rope.WithTypeNeoX())
|
||||
|
||||
k := sa.Key.Forward(ctx, hiddenState)
|
||||
k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
||||
k = fast.RoPE(ctx, k, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithOriginalContextLength(opts.originalContextLength), rope.WithTypeNeoX())
|
||||
|
||||
v := sa.Value.Forward(ctx, hiddenState)
|
||||
v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
||||
|
||||
scaleFactor := 1.0 / math.Sqrt(float64(headDim))
|
||||
kqv := nn.Attention(ctx, q, k, v, scaleFactor, cache)
|
||||
kqv = kqv.Reshape(ctx, opts.hiddenSize, batchSize)
|
||||
|
||||
return sa.Output.Forward(ctx, kqv)
|
||||
}
|
||||
|
||||
// Shift applies rotary position embeddings to the key tensor for causal attention caching
|
||||
func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
|
||||
return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale, rope.WithOriginalContextLength(m.originalContextLength), rope.WithTypeNeoX()), nil
|
||||
}
|
||||
|
||||
// MLP implements the feed-forward network component with SwiGLU activation
|
||||
type MLP struct {
|
||||
Up *nn.Linear `gguf:"ffn_up"`
|
||||
Down *nn.Linear `gguf:"ffn_down"`
|
||||
Gate *nn.Linear `gguf:"ffn_gate"`
|
||||
}
|
||||
|
||||
func (mlp *MLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *TextOptions) ml.Tensor {
|
||||
// Apply SwiGLU activation gating
|
||||
hiddenState = mlp.Gate.Forward(ctx, hiddenState).SILU(ctx).Mul(ctx, mlp.Up.Forward(ctx, hiddenState))
|
||||
// Project back to hidden dimension
|
||||
return mlp.Down.Forward(ctx, hiddenState)
|
||||
}
|
||||
|
||||
// Layer represents a single transformer layer combining self-attention and feed-forward components
|
||||
type Layer struct {
|
||||
AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
|
||||
SelfAttention *SelfAttention
|
||||
MLPNorm *nn.RMSNorm `gguf:"ffn_norm"`
|
||||
MLP *MLP
|
||||
}
|
||||
|
||||
func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
|
||||
// Self-attention branch with residual connection
|
||||
residual := hiddenState
|
||||
|
||||
hiddenState = l.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
|
||||
hiddenState = l.SelfAttention.Forward(ctx, hiddenState, positionIDs, cache, opts)
|
||||
|
||||
// In the final layer (outputs != nil), optimize by pruning to just the token positions
|
||||
// we need logits for.
|
||||
if outputs != nil {
|
||||
hiddenState = hiddenState.Rows(ctx, outputs)
|
||||
residual = residual.Rows(ctx, outputs)
|
||||
}
|
||||
|
||||
hiddenState = hiddenState.Add(ctx, residual)
|
||||
// Feed-forward branch with residual connection
|
||||
residual = hiddenState
|
||||
hiddenState = l.MLPNorm.Forward(ctx, hiddenState, opts.eps)
|
||||
hiddenState = l.MLP.Forward(ctx, hiddenState, opts)
|
||||
return hiddenState.Add(ctx, residual)
|
||||
}
|
||||
|
||||
func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor, batch input.Batch, cache kvcache.Cache) (ml.Tensor, error) {
|
||||
// Initial token embedding
|
||||
hiddenStates := m.TokenEmbedding.Forward(ctx, inputs).Duplicate(ctx)
|
||||
|
||||
for _, mi := range batch.Multimodal {
|
||||
img := mi.Multimodal[0].Tensor
|
||||
ctx.Forward(img.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), img.Dim(0)*img.Dim(1))))
|
||||
}
|
||||
|
||||
// Process through transformer layers
|
||||
for i, layer := range m.Layers {
|
||||
cache.SetLayer(i)
|
||||
|
||||
var lastLayerOutputs ml.Tensor
|
||||
if i == len(m.Layers)-1 {
|
||||
lastLayerOutputs = outputs
|
||||
}
|
||||
|
||||
hiddenStates = layer.Forward(ctx, hiddenStates, positions, lastLayerOutputs, cache, m.TextOptions)
|
||||
}
|
||||
|
||||
hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, m.eps)
|
||||
return m.Output.Forward(ctx, hiddenStates), nil
|
||||
}
|
||||
model/models/qwen25vl/model_vision.go (new file, 379 lines)
@@ -0,0 +1,379 @@
|
||||
package qwen25vl
|
||||
|
||||
import (
|
||||
"math"
|
||||
"slices"
|
||||
|
||||
"github.com/ollama/ollama/fs"
|
||||
"github.com/ollama/ollama/ml"
|
||||
"github.com/ollama/ollama/ml/nn"
|
||||
)
|
||||
|
||||
// We only support a batch size of 1
|
||||
var batchSize int = 1
|
||||
|
||||
func rotateHalf(ctx ml.Context, t ml.Tensor) ml.Tensor {
|
||||
x1 := t.View(ctx, 0, t.Dim(0)/2, t.Stride(1), t.Dim(1), t.Stride(2), t.Dim(2), t.Stride(3), t.Dim(3))
|
||||
x2 := t.View(ctx, t.Stride(0)*t.Dim(0)/2, t.Dim(0)/2, t.Stride(1), t.Dim(1), t.Stride(2), t.Dim(2), t.Stride(3), t.Dim(3)).Contiguous(ctx)
|
||||
return x2.Neg(ctx).Concat(ctx, x1, 0)
|
||||
}
|
||||
|
||||
func applyRotaryPositionalEmbedding(ctx ml.Context, t, cos, sin ml.Tensor) ml.Tensor {
|
||||
return t.Mul(ctx, cos).Add(ctx, rotateHalf(ctx, t).Mul(ctx, sin))
|
||||
}
|
||||
|
||||
func blockDiagonalMask(ctx ml.Context, seqLength int, bounds []int, numHeads int) ml.Tensor {
|
||||
// Create a flat slice for the mask (all -inf initially to block all attention)
|
||||
flat := make([]float32, seqLength*seqLength)
|
||||
for i := range flat {
|
||||
flat[i] = float32(math.Inf(-1)) // Negative infinity to block attention
|
||||
}
|
||||
|
||||
// Fill in the mask with zeros for tokens that CAN attend to each other
|
||||
for i := 1; i < len(bounds); i++ {
|
||||
start := bounds[i-1]
|
||||
end := bounds[i]
|
||||
|
||||
// Enable attention within this sequence block by setting values to 0
|
||||
for row := start; row < end; row++ {
|
||||
for col := start; col < end; col++ {
|
||||
idx := row*seqLength + col
|
||||
flat[idx] = 0.0 // 0 allows attention, -inf blocks it
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
mask := ctx.Input().FromFloatSlice(flat, seqLength, seqLength)
|
||||
|
||||
// Reshape to match [seqLength, seqLength, 1] for broadcasting
|
||||
mask = mask.Reshape(ctx, seqLength, seqLength, 1)
|
||||
|
||||
return mask
|
||||
}
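The same masking idea is easier to see on plain slices. A minimal sketch, assuming a hypothetical four-token sequence split into two windows (the ml tensor API is not used here): positions within a window may attend to each other (0), everything across windows is blocked (-Inf).

```go
package main

import (
	"fmt"
	"math"
)

func buildMask(seqLength int, bounds []int) [][]float32 {
	mask := make([][]float32, seqLength)
	for i := range mask {
		mask[i] = make([]float32, seqLength)
		for j := range mask[i] {
			mask[i][j] = float32(math.Inf(-1)) // block by default
		}
	}
	for i := 1; i < len(bounds); i++ {
		for r := bounds[i-1]; r < bounds[i]; r++ {
			for c := bounds[i-1]; c < bounds[i]; c++ {
				mask[r][c] = 0 // allow attention inside the window
			}
		}
	}
	return mask
}

func main() {
	// bounds are cumulative window end positions, as produced by WindowIndex
	for _, row := range buildMask(4, []int{0, 2, 4}) {
		fmt.Println(row) // two 2x2 blocks of zeros on the diagonal, -Inf elsewhere
	}
}
```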
|
||||
|
||||
type VisionSelfAttention struct {
|
||||
Query *nn.Linear `gguf:"attn_q"`
|
||||
Key *nn.Linear `gguf:"attn_k"`
|
||||
Value *nn.Linear `gguf:"attn_v"`
|
||||
Output *nn.Linear `gguf:"attn_out"`
|
||||
}
|
||||
|
||||
func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenStates, cos, sin, mask ml.Tensor, opts *VisionModelOptions) ml.Tensor {
|
||||
query := sa.Query.Forward(ctx, hiddenStates)
|
||||
key := sa.Key.Forward(ctx, hiddenStates)
|
||||
value := sa.Value.Forward(ctx, hiddenStates)
|
||||
|
||||
query = query.Reshape(ctx, opts.headDim, opts.numHeads, query.Dim(1), batchSize)
|
||||
key = key.Reshape(ctx, opts.headDim, opts.numHeads, key.Dim(1), batchSize)
|
||||
value = value.Reshape(ctx, opts.headDim, opts.numHeads, value.Dim(1), batchSize)
|
||||
|
||||
query = applyRotaryPositionalEmbedding(ctx, query, cos, sin)
|
||||
key = applyRotaryPositionalEmbedding(ctx, key, cos, sin)
|
||||
|
||||
// Scale factor for scaled dot-product attention
|
||||
scale := 1.0 / math.Sqrt(float64(opts.headDim))
|
||||
|
||||
// Scaled dot-product attention
|
||||
query = query.Permute(ctx, 0, 2, 1, 3)
|
||||
key = key.Permute(ctx, 0, 2, 1, 3)
|
||||
value = value.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
|
||||
kq := key.MulmatFullPrec(ctx, query)
|
||||
kq = kq.Scale(ctx, scale)
|
||||
if mask != nil {
|
||||
kq = kq.Add(ctx, mask)
|
||||
}
|
||||
kq = kq.Softmax(ctx)
|
||||
kqv := value.Mulmat(ctx, kq)
|
||||
attention := kqv.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
|
||||
attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), batchSize)
|
||||
|
||||
return sa.Output.Forward(ctx, attention)
|
||||
}
|
||||
|
||||
// VisionMLP implements the multi-layer perceptron
|
||||
type VisionMLP struct {
|
||||
Gate *nn.Linear `gguf:"ffn_gate"`
|
||||
Up *nn.Linear `gguf:"ffn_up"`
|
||||
Down *nn.Linear `gguf:"ffn_down"`
|
||||
}
|
||||
|
||||
func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *VisionModelOptions) ml.Tensor {
|
||||
// SwiGLU-style gating: apply SiLU to the gate projection and multiply by the up projection
|
||||
gateOutput := mlp.Gate.Forward(ctx, hiddenStates)
|
||||
upOutput := mlp.Up.Forward(ctx, hiddenStates)
|
||||
hiddenStates = gateOutput.SILU(ctx).Mul(ctx, upOutput)
|
||||
|
||||
return mlp.Down.Forward(ctx, hiddenStates)
|
||||
}
|
||||
|
||||
type VisionEncoderLayer struct {
|
||||
Norm1 *nn.RMSNorm `gguf:"ln1"`
|
||||
SelfAttention *VisionSelfAttention
|
||||
Norm2 *nn.RMSNorm `gguf:"ln2"`
|
||||
MLP *VisionMLP
|
||||
}
|
||||
|
||||
func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenStates, cos, sin, mask ml.Tensor, opts *VisionModelOptions) ml.Tensor {
|
||||
residual := hiddenStates
|
||||
hiddenStates = e.Norm1.Forward(ctx, hiddenStates, opts.eps)
|
||||
hiddenStates = e.SelfAttention.Forward(ctx, hiddenStates, cos, sin, mask, opts)
|
||||
hiddenStates = hiddenStates.Add(ctx, residual)
|
||||
|
||||
residual = hiddenStates
|
||||
hiddenStates = e.Norm2.Forward(ctx, hiddenStates, opts.eps)
|
||||
hiddenStates = e.MLP.Forward(ctx, hiddenStates, opts)
|
||||
return hiddenStates.Add(ctx, residual)
|
||||
}
|
||||
|
||||
// VisionModelOptions contains configuration options
|
||||
type VisionModelOptions struct {
|
||||
hiddenSize int
|
||||
numHeads int
|
||||
headDim int
|
||||
patchSize int
|
||||
numChannels int
|
||||
eps float32
|
||||
ropeTheta float32
|
||||
spatialMergeSize int
|
||||
windowSize int
|
||||
fullAttnBlocks []int32
|
||||
temporalPatchSize int
|
||||
}
|
||||
|
||||
type PatchEmbedding struct {
|
||||
PatchConv0 *nn.Conv2D `gguf:"patch_embd_0"`
|
||||
PatchConv1 *nn.Conv2D `gguf:"patch_embd_1"`
|
||||
}
|
||||
|
||||
func (pe *PatchEmbedding) Forward(ctx ml.Context, pixelValues ml.Tensor, opts *VisionModelOptions) ml.Tensor {
|
||||
numPatches := pixelValues.Shape()[1]
|
||||
|
||||
// Reshape the input tensor to match the expected dimensions
|
||||
pixelValues = pixelValues.Reshape(ctx, opts.patchSize*opts.patchSize, opts.temporalPatchSize, opts.numChannels, numPatches)
|
||||
|
||||
// Permute the tensor to bring the temporal dimension to the front
|
||||
pixelValues = pixelValues.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
|
||||
|
||||
// Split the tensor into parts for the temporal convolutions
|
||||
in0 := pixelValues.View(ctx, 0, 1, pixelValues.Stride(1), pixelValues.Dim(1), pixelValues.Stride(2), pixelValues.Dim(2), pixelValues.Stride(3), pixelValues.Dim(3)).Contiguous(ctx)
|
||||
in0 = in0.Reshape(ctx, opts.patchSize, opts.patchSize, opts.numChannels, numPatches)
|
||||
in1 := pixelValues.View(ctx, pixelValues.Stride(0), 1, pixelValues.Stride(1), pixelValues.Dim(1), pixelValues.Stride(2), pixelValues.Dim(2), pixelValues.Stride(3), pixelValues.Dim(3)).Contiguous(ctx)
|
||||
in1 = in1.Reshape(ctx, opts.patchSize, opts.patchSize, opts.numChannels, numPatches)
|
||||
|
||||
s0, s1 := opts.patchSize, opts.patchSize // Use full stride
|
||||
p0, p1 := 0, 0 // padding
|
||||
d0, d1 := 1, 1 // dilation
|
||||
out0 := pe.PatchConv0.Forward(ctx, in0, s0, s1, p0, p1, d0, d1)
|
||||
out1 := pe.PatchConv1.Forward(ctx, in1, s0, s1, p0, p1, d0, d1)
|
||||
|
||||
// Add the outputs from the two temporal convolutions
|
||||
out := out0.Add(ctx, out1)
|
||||
|
||||
// Reshape the output tensor to match the expected dimensions
|
||||
return out.Reshape(ctx, opts.hiddenSize, numPatches)
|
||||
}
|
||||
|
||||
// VisionPatchMerger implements patch merging for the Qwen vision model
|
||||
type VisionPatchMerger struct {
|
||||
LNQ *nn.RMSNorm `gguf:"ln_q"`
|
||||
MLP0 *nn.Linear `gguf:"mlp.0"`
|
||||
MLP2 *nn.Linear `gguf:"mlp.2"`
|
||||
}
|
||||
|
||||
// Forward computes patch merging for the vision model
|
||||
func (pm *VisionPatchMerger) Forward(ctx ml.Context, visionOutputs ml.Tensor, opts *VisionModelOptions) ml.Tensor {
|
||||
normalized := pm.LNQ.Forward(ctx, visionOutputs, opts.eps)
|
||||
|
||||
hiddenSize := visionOutputs.Dim(0) * (opts.spatialMergeSize * opts.spatialMergeSize)
|
||||
|
||||
// Reshape the normalized output to view the hidden size dimension
|
||||
reshaped := normalized.Reshape(ctx, hiddenSize, normalized.Dim(1)/(opts.spatialMergeSize*opts.spatialMergeSize), batchSize)
|
||||
hidden := pm.MLP0.Forward(ctx, reshaped)
|
||||
activated := hidden.GELU(ctx)
|
||||
|
||||
output := pm.MLP2.Forward(ctx, activated)
|
||||
|
||||
return output
|
||||
}
|
||||
|
||||
// VisionModel implements the Qwen vision model
|
||||
type VisionModel struct {
|
||||
PatchEmbedding *PatchEmbedding
|
||||
Layers []VisionEncoderLayer `gguf:"blk"`
|
||||
PatchMerger *VisionPatchMerger `gguf:"merger"`
|
||||
|
||||
*VisionModelOptions
|
||||
}
|
||||
|
||||
// Forward computes the vision model for an input tensor
|
||||
func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor, grid *Grid) ml.Tensor {
|
||||
// Extract patch embeddings
|
||||
hiddenStates := m.PatchEmbedding.Forward(ctx, pixelValues, m.VisionModelOptions)
|
||||
|
||||
positionEmbedding := m.PositionalEmbedding(ctx, grid)
|
||||
|
||||
windowIndex, bounds := m.WindowIndex(ctx, grid)
|
||||
|
||||
spatialMergeUnit := m.spatialMergeSize * m.spatialMergeSize
|
||||
|
||||
hiddenStates = hiddenStates.Reshape(ctx, hiddenStates.Dim(0)*spatialMergeUnit, hiddenStates.Dim(1)/spatialMergeUnit)
|
||||
hiddenStates = hiddenStates.Rows(ctx, windowIndex)
|
||||
hiddenStates = hiddenStates.Reshape(ctx, hiddenStates.Dim(0)/spatialMergeUnit, hiddenStates.Dim(1)*spatialMergeUnit)
|
||||
|
||||
positionEmbedding = positionEmbedding.Reshape(ctx, positionEmbedding.Dim(0)*spatialMergeUnit, positionEmbedding.Dim(1)/spatialMergeUnit)
|
||||
positionEmbedding = positionEmbedding.Rows(ctx, windowIndex)
|
||||
positionEmbedding = positionEmbedding.Reshape(ctx, positionEmbedding.Dim(0)/spatialMergeUnit, positionEmbedding.Dim(1)*spatialMergeUnit)
|
||||
positionEmbedding = positionEmbedding.Concat(ctx, positionEmbedding, 0)
|
||||
|
||||
cos, sin := positionEmbedding.Cos(ctx), positionEmbedding.Sin(ctx)
|
||||
cos = cos.Reshape(ctx, cos.Dim(0), 1, cos.Dim(1))
|
||||
sin = sin.Reshape(ctx, sin.Dim(0), 1, sin.Dim(1))
|
||||
|
||||
mask := blockDiagonalMask(ctx, hiddenStates.Dim(1), bounds, m.VisionModelOptions.numHeads)
|
||||
// Apply encoder layers
|
||||
for i, layer := range m.Layers {
|
||||
if slices.Contains(m.fullAttnBlocks, int32(i)) {
|
||||
hiddenStates = layer.Forward(ctx, hiddenStates, cos, sin, nil, m.VisionModelOptions)
|
||||
} else {
|
||||
hiddenStates = layer.Forward(
|
||||
ctx,
|
||||
hiddenStates,
|
||||
cos,
|
||||
sin,
|
||||
mask,
|
||||
m.VisionModelOptions,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
hiddenStates = m.PatchMerger.Forward(ctx, hiddenStates, m.VisionModelOptions)
|
||||
reverseWindowIndex := windowIndex.Argsort(ctx)
|
||||
return hiddenStates.Rows(ctx, reverseWindowIndex)
|
||||
}
|
||||
|
||||
// WindowIndex divides the grid into windows and returns:
|
||||
// 1. A tensor containing flattened indices of all grid points organized by windows
|
||||
// 2. A slice of boundaries that mark where each window's data begins and ends
|
||||
// in the flattened representation, scaled by spatialMergeSize squared
|
||||
//
|
||||
// The boundaries slice always starts with 0 and contains cumulative ending
|
||||
// positions for each window, allowing downstream processing to identify
|
||||
// window boundaries in the tensor data.
|
||||
func (m *VisionModel) WindowIndex(ctx ml.Context, grid *Grid) (ml.Tensor, []int) {
|
||||
vitMergerWindowSize := m.windowSize / m.spatialMergeSize / m.patchSize
|
||||
|
||||
llmGridH := grid.Height / m.spatialMergeSize
|
||||
llmGridW := grid.Width / m.spatialMergeSize
|
||||
|
||||
// Calculate window parameters
|
||||
numWindowsH := int(math.Ceil(float64(llmGridH) / float64(vitMergerWindowSize)))
|
||||
numWindowsW := int(math.Ceil(float64(llmGridW) / float64(vitMergerWindowSize)))
|
||||
|
||||
// Initialize index_new slice
|
||||
var index []int32
|
||||
|
||||
// Initialize bounds with the first element as 0
|
||||
bounds := []int{0}
|
||||
totalSeqLen := 0
|
||||
|
||||
// Process each window without padding
|
||||
for wh := range numWindowsH {
|
||||
for ww := range numWindowsW {
|
||||
// Calculate window boundaries
|
||||
hStart := wh * vitMergerWindowSize
|
||||
wStart := ww * vitMergerWindowSize
|
||||
hEnd := min(hStart+vitMergerWindowSize, llmGridH)
|
||||
wEnd := min(wStart+vitMergerWindowSize, llmGridW)
|
||||
|
||||
// Calculate sequence length for this window
|
||||
seqLen := (hEnd - hStart) * (wEnd - wStart)
|
||||
|
||||
// Collect indices for this window
|
||||
for h := hStart; h < hEnd; h++ {
|
||||
for w := wStart; w < wEnd; w++ {
|
||||
index = append(index, int32(h*llmGridW+w))
|
||||
}
|
||||
}
|
||||
|
||||
totalSeqLen += seqLen
|
||||
bounds = append(bounds, totalSeqLen*(m.spatialMergeSize*m.spatialMergeSize)+bounds[0])
|
||||
}
|
||||
}
|
||||
|
||||
t := ctx.Input().FromIntSlice(index, len(index))
|
||||
|
||||
return t, bounds
|
||||
}
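The bookkeeping is easier to follow on plain slices. A minimal sketch, assuming a hypothetical 4x4 merged grid and a merged window size of 2 (the real method derives that size from windowSize, spatialMergeSize and patchSize, and returns the indices as a tensor):

```go
package main

import (
	"fmt"
	"math"
)

// windowIndex mirrors the loop above: flattened grid indices grouped window by
// window, plus cumulative window boundaries scaled by spatialMergeSize squared.
func windowIndex(gridH, gridW, window, spatialMergeSize int) ([]int32, []int) {
	var index []int32
	bounds := []int{0}
	total := 0

	numWindowsH := int(math.Ceil(float64(gridH) / float64(window)))
	numWindowsW := int(math.Ceil(float64(gridW) / float64(window)))
	for wh := 0; wh < numWindowsH; wh++ {
		for ww := 0; ww < numWindowsW; ww++ {
			hStart, wStart := wh*window, ww*window
			hEnd, wEnd := min(hStart+window, gridH), min(wStart+window, gridW)
			for h := hStart; h < hEnd; h++ {
				for w := wStart; w < wEnd; w++ {
					index = append(index, int32(h*gridW+w))
				}
			}
			total += (hEnd - hStart) * (wEnd - wStart)
			bounds = append(bounds, total*spatialMergeSize*spatialMergeSize)
		}
	}
	return index, bounds
}

func main() {
	idx, bounds := windowIndex(4, 4, 2, 2)
	fmt.Println(idx)    // grid positions regrouped into four 2x2 windows
	fmt.Println(bounds) // [0 16 32 48 64]
}
```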
|
||||
|
||||
// PositionalEmbedding generates rotary position embeddings for attention mechanisms
|
||||
func (m *VisionModel) PositionalEmbedding(ctx ml.Context, grid *Grid) ml.Tensor {
|
||||
dim := m.headDim / 2
|
||||
freq := dim / 2
|
||||
theta := float64(m.ropeTheta)
|
||||
merge := m.spatialMergeSize
|
||||
|
||||
// Create frequency patterns for position encoding
|
||||
maxGridSize := max(grid.Height, grid.Width)
|
||||
freqVals := make([]float32, freq*maxGridSize)
|
||||
for i := range maxGridSize {
|
||||
for j := range freq {
|
||||
freqVals[i*freq+j] = float32(i) / float32(math.Pow(theta, float64(j*2)/float64(dim)))
|
||||
}
|
||||
}
|
||||
freqs := ctx.Input().FromFloatSlice(freqVals, freq, maxGridSize)
|
||||
|
||||
// Create position coordinates (y,x pairs) for the grid
|
||||
// In PyTorch: Equivalent to generating position ids with torch.arange()
|
||||
coords := make([]int32, 0, grid.Height*grid.Width*2)
|
||||
for y := range grid.Height {
|
||||
for x := range grid.Width {
|
||||
coords = append(coords, int32(y), int32(x))
|
||||
}
|
||||
}
|
||||
pos := ctx.Input().FromIntSlice(coords, 2, grid.Width, grid.Height)
|
||||
|
||||
// Reshape and permute positions to match spatial merging pattern
|
||||
pos = pos.Reshape(ctx, 2, grid.Width, merge, grid.Height/merge)
|
||||
pos = pos.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
|
||||
pos = pos.Reshape(ctx, 2, merge, merge, grid.Width/merge*grid.Height/merge)
|
||||
pos = pos.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
|
||||
pos = pos.Reshape(ctx, 2*merge*merge*grid.Width/merge*grid.Height/merge)
|
||||
|
||||
// Use position indices to look up corresponding frequency values
|
||||
positionalEmbedding := freqs.Rows(ctx, pos)
|
||||
positionalEmbedding = positionalEmbedding.Reshape(ctx, positionalEmbedding.Dim(0)*2, positionalEmbedding.Dim(1)/2)
|
||||
return positionalEmbedding
|
||||
}
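The frequency table follows the usual rotary-embedding recipe: for grid coordinate i and frequency index j, the entry is i / theta^(2j/dim). A tiny sketch of that formula, taking the default vision dimensions (headDim 80, hence dim 40 and freq 20) as assumptions:

```go
package main

import (
	"fmt"
	"math"
)

func main() {
	dim, freq, theta := 40, 20, 10000.0 // assumed: headDim/2 and dim/2 with the default ropeTheta
	i := 3                              // an arbitrary grid coordinate
	for j := 0; j < 3 && j < freq; j++ {
		fmt.Println(float32(float64(i) / math.Pow(theta, float64(j*2)/float64(dim))))
	}
}
```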
|
||||
|
||||
// newVisionModel creates a new instance of the Qwen vision model
|
||||
func newVisionModel(c fs.Config) *VisionModel {
|
||||
patchSize := int(c.Uint("vision.patch_size", 14))
|
||||
hiddenSize := int(c.Uint("vision.embedding_length", 1280))
|
||||
numHeads := int(c.Uint("vision.attention.head_count", 16))
|
||||
numChannels := int(c.Uint("vision.num_channels", 3))
|
||||
eps := c.Float("vision.attention.layer_norm_epsilon", 1e-6)
|
||||
ropeTheta := c.Float("vision.rope.freq_base", 10000.0)
|
||||
spatialMergeSize := int(c.Uint("vision.spatial_merge_size", 2))
|
||||
windowSize := int(c.Uint("vision.window_size", 112))
|
||||
fullAttnBlocks := c.Ints("qwen25vl.vision.fullatt_block_indexes", []int32{7, 15, 23, 31})
|
||||
temporalPatchSize := int(c.Uint("vision.temporal_patch_size", 2))
|
||||
|
||||
model := &VisionModel{
|
||||
Layers: make([]VisionEncoderLayer, c.Uint("vision.block_count", 32)),
|
||||
VisionModelOptions: &VisionModelOptions{
|
||||
hiddenSize: hiddenSize,
|
||||
numHeads: numHeads,
|
||||
headDim: hiddenSize / numHeads,
|
||||
patchSize: patchSize,
|
||||
numChannels: numChannels,
|
||||
eps: eps,
|
||||
ropeTheta: ropeTheta,
|
||||
spatialMergeSize: spatialMergeSize,
|
||||
windowSize: windowSize,
|
||||
temporalPatchSize: temporalPatchSize,
|
||||
fullAttnBlocks: fullAttnBlocks,
|
||||
},
|
||||
}
|
||||
|
||||
return model
|
||||
}
|
||||
model/models/qwen25vl/process_image.go (new file, 184 lines)
@@ -0,0 +1,184 @@
|
||||
package qwen25vl
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"image"
|
||||
"math"
|
||||
|
||||
"github.com/ollama/ollama/fs"
|
||||
"github.com/ollama/ollama/model/imageproc"
|
||||
)
|
||||
|
||||
// ImageProcessor contains configuration for the Qwen 2.5 VL image processing
|
||||
type ImageProcessor struct {
|
||||
numChannels int
|
||||
patchSize int
|
||||
temporalPatchSize int
|
||||
mergeSize int
|
||||
minPixels int
|
||||
maxPixels int
|
||||
factor int
|
||||
rescaleFactor float32
|
||||
imageMean []float32
|
||||
imageStd []float32
|
||||
}
|
||||
|
||||
// newImageProcessor creates a new image processor with default values
|
||||
func newImageProcessor(c fs.Config) ImageProcessor {
|
||||
patchSize := int(c.Uint("vision.patch_size", 14))
|
||||
mergeSize := int(c.Uint("vision.spatial_merge_size", 2))
|
||||
|
||||
return ImageProcessor{
|
||||
numChannels: int(c.Uint("vision.num_channels", 3)), // typically not present in the config; defaults to 3
|
||||
patchSize: patchSize,
|
||||
temporalPatchSize: 2,
|
||||
mergeSize: mergeSize,
|
||||
minPixels: 56 * 56,
|
||||
maxPixels: int(c.Uint("vision.max_pixels", 28*28*1280)), // 1MP limit
|
||||
factor: patchSize * mergeSize,
|
||||
rescaleFactor: 1.0 / 255.0,
|
||||
imageMean: imageproc.ClipDefaultMean[:],
|
||||
imageStd: imageproc.ClipDefaultSTD[:],
|
||||
}
|
||||
}
|
||||
|
||||
// SmartResize implements the smart resize algorithm
|
||||
func (p *ImageProcessor) SmartResize(height, width int) (int, int) {
|
||||
factor := p.factor
|
||||
|
||||
if height < factor || width < factor {
|
||||
panic(fmt.Sprintf("height:%d or width:%d must be larger than factor:%d", height, width, factor))
|
||||
} else if aspectRatio := max(height, width) / min(height, width); aspectRatio > 200 {
|
||||
panic(fmt.Sprintf("absolute aspect ratio must be smaller than 200, got %v", aspectRatio))
|
||||
}
|
||||
|
||||
round := func(x float64) int { return int(math.RoundToEven(x)) }
|
||||
|
||||
hBar := round(float64(height)/float64(factor)) * factor
|
||||
wBar := round(float64(width)/float64(factor)) * factor
|
||||
|
||||
if hBar*wBar > p.maxPixels {
|
||||
beta := math.Sqrt(float64(height*width) / float64(p.maxPixels))
|
||||
|
||||
hBar = int(math.Floor(float64(height)/beta/float64(factor))) * factor
|
||||
wBar = int(math.Floor(float64(width)/beta/float64(factor))) * factor
|
||||
} else if hBar*wBar < p.minPixels {
|
||||
beta := math.Sqrt(float64(p.minPixels) / float64(height*width))
|
||||
|
||||
hBar = int(math.Ceil(float64(height)*beta/float64(factor))) * factor
|
||||
wBar = int(math.Ceil(float64(width)*beta/float64(factor))) * factor
|
||||
}
|
||||
|
||||
return hBar, wBar
|
||||
}
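A worked example of the rounding, assuming the default factor of 28 (patch size 14 times spatial merge size 2) and a hypothetical 640x480 input; the result stays inside the min/max pixel budget, so only the round-to-multiple step applies.

```go
package main

import (
	"fmt"
	"math"
)

func main() {
	const factor = 28 // patchSize * mergeSize with the defaults above
	height, width := 480, 640

	round := func(x float64) int { return int(math.RoundToEven(x)) }
	hBar := round(float64(height)/factor) * factor // 17 * 28 = 476
	wBar := round(float64(width)/factor) * factor  // 23 * 28 = 644

	fmt.Println(hBar, wBar, hBar*wBar) // 476 644 306544, between 56*56 and 28*28*1280
}
```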
|
||||
|
||||
type Grid struct {
|
||||
Height int
|
||||
Width int
|
||||
Temporal int
|
||||
}
|
||||
|
||||
func (p *ImageProcessor) ProcessImage(img image.Image) ([]float32, *Grid, error) {
|
||||
origWidth := img.Bounds().Dx()
|
||||
origHeight := img.Bounds().Dy()
|
||||
|
||||
// Calculate smart resize dimensions
|
||||
resizedHeight, resizedWidth := p.SmartResize(origHeight, origWidth)
|
||||
|
||||
// Resize image using existing functions
|
||||
resizedImg := imageproc.Resize(img, image.Point{X: resizedWidth, Y: resizedHeight}, imageproc.ResizeBilinear)
|
||||
|
||||
normalizedPixels := imageproc.Normalize(
|
||||
resizedImg,
|
||||
[3]float32{p.imageMean[0], p.imageMean[1], p.imageMean[2]},
|
||||
[3]float32{p.imageStd[0], p.imageStd[1], p.imageStd[2]},
|
||||
true, // rescale
|
||||
true, // channelFirst
|
||||
)
|
||||
|
||||
// Calculate grid dimensions
|
||||
grid := &Grid{
|
||||
Height: resizedHeight / p.patchSize,
|
||||
Width: resizedWidth / p.patchSize,
|
||||
Temporal: 1, // For single images, temporal dimension is 1
|
||||
}
|
||||
|
||||
patches, err := p.createPatches(normalizedPixels, resizedHeight, resizedWidth, grid)
|
||||
if err != nil {
|
||||
return nil, nil, fmt.Errorf("failed to create patches: %v", err)
|
||||
}
|
||||
|
||||
// Return patches and grid dimensions
|
||||
return patches, grid, nil
|
||||
}
|
||||
|
||||
func (p *ImageProcessor) createPatches(pixels []float32, height, width int, grid *Grid) ([]float32, error) {
|
||||
channels := p.numChannels
|
||||
patchSize := p.patchSize
|
||||
mergeSize := p.mergeSize
|
||||
temporalPatchSize := p.temporalPatchSize
|
||||
|
||||
// Calculate output dimensions
|
||||
numPatches := grid.Temporal * grid.Height * grid.Width
|
||||
patchDim := channels * temporalPatchSize * patchSize * patchSize
|
||||
|
||||
result := make([]float32, numPatches*patchDim)
|
||||
patchIndex := 0
|
||||
|
||||
// Single temporal frame handling (copies to all frames)
|
||||
for range grid.Temporal {
|
||||
for h := 0; h < grid.Height; h += mergeSize {
|
||||
for w := 0; w < grid.Width; w += mergeSize {
|
||||
// Handle the 2x2 merged patches
|
||||
for mh := range mergeSize {
|
||||
for mw := range mergeSize {
|
||||
baseOffset := patchIndex * patchDim
|
||||
|
||||
// Extract patch data for first temporal frame
|
||||
for c := range channels {
|
||||
channelOffset := baseOffset + (c * temporalPatchSize * patchSize * patchSize)
|
||||
|
||||
for py := range patchSize {
|
||||
for px := range patchSize {
|
||||
// Calculate source pixel coordinates
|
||||
y := (h+mh)*patchSize + py
|
||||
x := (w+mw)*patchSize + px
|
||||
|
||||
// Source index in input tensor (CHW format)
|
||||
srcIdx := c*height*width + y*width + x
|
||||
|
||||
// Destination index in first temporal frame
|
||||
dstIdx := channelOffset + (py * patchSize) + px
|
||||
|
||||
if srcIdx < len(pixels) && dstIdx < len(result) {
|
||||
result[dstIdx] = pixels[srcIdx]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Copy first temporal frame to all other frames
|
||||
if temporalPatchSize > 1 {
|
||||
for c := range channels {
|
||||
channelOffset := baseOffset + (c * temporalPatchSize * patchSize * patchSize)
|
||||
firstFrameOffset := channelOffset
|
||||
frameSize := patchSize * patchSize
|
||||
|
||||
// Copy first frame to all other frames
|
||||
for tp := 1; tp < temporalPatchSize; tp++ {
|
||||
currentFrameOffset := channelOffset + (tp * frameSize)
|
||||
copy(result[currentFrameOffset:currentFrameOffset+frameSize],
|
||||
result[firstFrameOffset:firstFrameOffset+frameSize])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
patchIndex++
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
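The patch arithmetic above can be sanity-checked with the default sizes; the numbers below are illustrative and assume a hypothetical 476x644 resized image.

```go
package main

import "fmt"

func main() {
	channels, temporalPatchSize, patchSize := 3, 2, 14

	// Each flattened patch holds channels * temporalPatchSize * patchSize^2 values.
	patchDim := channels * temporalPatchSize * patchSize * patchSize
	fmt.Println(patchDim) // 1176 float32 values per patch

	// Grid for a single 476x644 resized image (Temporal = 1).
	gridH, gridW, gridT := 476/patchSize, 644/patchSize, 1
	fmt.Println(gridT*gridH*gridW, gridT*gridH*gridW*patchDim) // patch count, total values
}
```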
|
||||
model/models/qwen3/model.go (new file, 233 lines)
@@ -0,0 +1,233 @@
|
||||
package qwen3
|
||||
|
||||
import (
|
||||
"cmp"
|
||||
"math"
|
||||
|
||||
"github.com/ollama/ollama/fs"
|
||||
"github.com/ollama/ollama/kvcache"
|
||||
"github.com/ollama/ollama/ml"
|
||||
"github.com/ollama/ollama/ml/nn"
|
||||
"github.com/ollama/ollama/ml/nn/fast"
|
||||
"github.com/ollama/ollama/ml/nn/rope"
|
||||
"github.com/ollama/ollama/model"
|
||||
"github.com/ollama/ollama/model/input"
|
||||
)
|
||||
|
||||
type Options struct {
|
||||
hiddenSize, numHeads, numKVHeads int
|
||||
eps float32
|
||||
ropeBase, ropeScale float32
|
||||
|
||||
keyLength, valueLength int
|
||||
|
||||
numExperts, numExpertsUsed int
|
||||
normTopKProb bool
|
||||
}
|
||||
|
||||
func (o Options) headDim() int {
|
||||
return cmp.Or(o.keyLength, o.valueLength, o.hiddenSize/o.numHeads)
|
||||
}
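cmp.Or returns its first non-zero argument, so headDim prefers key_length, then value_length, and falls back to hiddenSize/numHeads only when both are absent from the config. A tiny sketch with made-up numbers:

```go
package main

import (
	"cmp"
	"fmt"
)

func main() {
	fmt.Println(cmp.Or(0, 0, 4096/32))  // 128: both lengths absent, fall back to hiddenSize/numHeads
	fmt.Println(cmp.Or(96, 0, 4096/32)) // 96: key_length wins when present
}
```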
|
||||
|
||||
type Attention struct {
|
||||
QueryNorm *nn.RMSNorm `gguf:"attn_q_norm"`
|
||||
Query *nn.Linear `gguf:"attn_q"`
|
||||
KeyNorm *nn.RMSNorm `gguf:"attn_k_norm"`
|
||||
Key *nn.Linear `gguf:"attn_k"`
|
||||
Value *nn.Linear `gguf:"attn_v"`
|
||||
Output *nn.Linear `gguf:"attn_output"`
|
||||
}
|
||||
|
||||
func (sa *Attention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
|
||||
batchSize := hiddenStates.Dim(1)
|
||||
|
||||
query := sa.Query.Forward(ctx, hiddenStates)
|
||||
key := sa.Key.Forward(ctx, hiddenStates)
|
||||
value := sa.Value.Forward(ctx, hiddenStates)
|
||||
|
||||
query = query.Reshape(ctx, opts.headDim(), opts.numHeads, batchSize)
|
||||
key = key.Reshape(ctx, opts.headDim(), opts.numKVHeads, batchSize)
|
||||
value = value.Reshape(ctx, opts.headDim(), opts.numKVHeads, batchSize)
|
||||
|
||||
query = sa.QueryNorm.Forward(ctx, query, opts.eps)
|
||||
key = sa.KeyNorm.Forward(ctx, key, opts.eps)
|
||||
|
||||
query = fast.RoPE(ctx, query, positions, opts.headDim(), opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
|
||||
key = fast.RoPE(ctx, key, positions, opts.headDim(), opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
|
||||
|
||||
attention := nn.Attention(ctx, query, key, value, 1./math.Sqrt(float64(opts.headDim())), cache)
|
||||
attention = attention.Reshape(ctx, attention.Dim(0)*attention.Dim(1), batchSize)
|
||||
return sa.Output.Forward(ctx, attention)
|
||||
}
|
||||
|
||||
type MLP interface {
|
||||
Forward(ml.Context, ml.Tensor, *Options) ml.Tensor
|
||||
}
|
||||
|
||||
type sparse struct {
|
||||
Router *nn.Linear `gguf:"ffn_gate_inp"`
|
||||
Gate *nn.Linear `gguf:"ffn_gate_exps"`
|
||||
Up *nn.Linear `gguf:"ffn_up_exps"`
|
||||
Down *nn.Linear `gguf:"ffn_down_exps"`
|
||||
}
|
||||
|
||||
func (mlp *sparse) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *Options) ml.Tensor {
|
||||
hiddenDim, sequenceLength, batchSize := hiddenStates.Dim(0), hiddenStates.Dim(1), hiddenStates.Dim(2)
|
||||
hiddenStates = hiddenStates.Reshape(ctx, hiddenDim, sequenceLength*batchSize)
|
||||
routerLogits := mlp.Router.Forward(ctx, hiddenStates)
|
||||
|
||||
routingWeights := routerLogits.Softmax(ctx)
|
||||
selectedExperts := routingWeights.TopK(ctx, opts.numExpertsUsed)
|
||||
routingWeights = routingWeights.Reshape(ctx, 1, opts.numExperts, hiddenStates.Dim(1)).Rows(ctx, selectedExperts)
|
||||
if opts.normTopKProb {
|
||||
routingWeights = routingWeights.Reshape(ctx, opts.numExpertsUsed, hiddenStates.Dim(1))
|
||||
routingWeights = routingWeights.Div(ctx, routingWeights.SumRows(ctx))
|
||||
routingWeights = routingWeights.Reshape(ctx, 1, opts.numExpertsUsed, hiddenStates.Dim(1))
|
||||
}
|
||||
|
||||
hiddenStates = hiddenStates.Reshape(ctx, hiddenStates.Dim(0), 1, hiddenStates.Dim(1))
|
||||
|
||||
upStates := mlp.Up.Weight.MulmatID(ctx, hiddenStates, selectedExperts)
|
||||
|
||||
hiddenStates = mlp.Gate.Weight.MulmatID(ctx, hiddenStates, selectedExperts)
|
||||
hiddenStates = hiddenStates.SILU(ctx)
|
||||
hiddenStates = hiddenStates.Mul(ctx, upStates)
|
||||
|
||||
experts := mlp.Down.Weight.MulmatID(ctx, hiddenStates, selectedExperts)
|
||||
experts = experts.Mul(ctx, routingWeights)
|
||||
|
||||
nextStates := experts.View(ctx, 0, experts.Dim(0), experts.Stride(2), experts.Dim(2))
|
||||
for i := 1; i < opts.numExpertsUsed; i++ {
|
||||
nextStates = nextStates.Add(ctx, experts.View(ctx, i*experts.Stride(1), experts.Dim(0), experts.Stride(2), experts.Dim(2)))
|
||||
}
|
||||
|
||||
return nextStates
|
||||
}
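The routing here is the standard top-k mixture-of-experts recipe: softmax over the router logits, keep the k most probable experts, and renormalize their weights when normTopKProb is set. A plain-slice sketch of that recipe (not the tensor implementation above):

```go
package main

import (
	"fmt"
	"math"
	"sort"
)

func topKRouting(logits []float64, k int, normalize bool) ([]int, []float64) {
	// softmax over the router logits
	probs := make([]float64, len(logits))
	var sum float64
	for i, l := range logits {
		probs[i] = math.Exp(l)
		sum += probs[i]
	}
	for i := range probs {
		probs[i] /= sum
	}

	// indices of the k most probable experts
	idx := make([]int, len(probs))
	for i := range idx {
		idx[i] = i
	}
	sort.Slice(idx, func(a, b int) bool { return probs[idx[a]] > probs[idx[b]] })
	experts := idx[:k]

	weights := make([]float64, k)
	var wsum float64
	for i, e := range experts {
		weights[i] = probs[e]
		wsum += probs[e]
	}
	if normalize { // norm_top_k_prob: make the kept weights sum to 1
		for i := range weights {
			weights[i] /= wsum
		}
	}
	return experts, weights
}

func main() {
	experts, weights := topKRouting([]float64{1.0, 3.0, 2.0, 0.5}, 2, true)
	fmt.Println(experts, weights) // experts [1 2], weights summing to 1
}
```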
|
||||
|
||||
type dense struct {
|
||||
Gate *nn.Linear `gguf:"ffn_gate"`
|
||||
Up *nn.Linear `gguf:"ffn_up"`
|
||||
Down *nn.Linear `gguf:"ffn_down"`
|
||||
}
|
||||
|
||||
func (mlp *dense) Forward(ctx ml.Context, hiddenStates ml.Tensor, _ *Options) ml.Tensor {
|
||||
hiddenStates = mlp.Gate.Forward(ctx, hiddenStates).SILU(ctx).Mul(ctx, mlp.Up.Forward(ctx, hiddenStates))
|
||||
return mlp.Down.Forward(ctx, hiddenStates)
|
||||
}
|
||||
|
||||
type Layer struct {
|
||||
AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
|
||||
*Attention
|
||||
|
||||
MLPNorm *nn.RMSNorm `gguf:"ffn_norm"`
|
||||
MLP
|
||||
}
|
||||
|
||||
func (d *Layer) Forward(ctx ml.Context, hiddenStates, positions, outputs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
|
||||
residual := hiddenStates
|
||||
hiddenStates = d.AttentionNorm.Forward(ctx, hiddenStates, opts.eps)
|
||||
hiddenStates = d.Attention.Forward(ctx, hiddenStates, positions, cache, opts)
|
||||
|
||||
if outputs != nil {
|
||||
hiddenStates = hiddenStates.Rows(ctx, outputs)
|
||||
residual = residual.Rows(ctx, outputs)
|
||||
}
|
||||
|
||||
hiddenStates = hiddenStates.Add(ctx, residual)
|
||||
|
||||
residual = hiddenStates
|
||||
hiddenStates = d.MLPNorm.Forward(ctx, hiddenStates, opts.eps)
|
||||
hiddenStates = d.MLP.Forward(ctx, hiddenStates, opts)
|
||||
return hiddenStates.Add(ctx, residual)
|
||||
}
|
||||
|
||||
type Model struct {
|
||||
model.Base
|
||||
model.BytePairEncoding
|
||||
|
||||
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
|
||||
OutputNorm *nn.RMSNorm `gguf:"output_norm"`
|
||||
Output *nn.Linear `gguf:"output,alt:token_embd"`
|
||||
|
||||
Layers []Layer `gguf:"blk"`
|
||||
|
||||
*Options
|
||||
}
|
||||
|
||||
// Forward implements model.Model.
|
||||
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
|
||||
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
|
||||
|
||||
hiddenStates := m.TokenEmbedding.Forward(ctx, batch.Inputs)
|
||||
|
||||
for i, layer := range m.Layers {
|
||||
m.Cache.SetLayer(i)
|
||||
|
||||
var outputs ml.Tensor
|
||||
if i == len(m.Layers)-1 {
|
||||
outputs = ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
|
||||
}
|
||||
|
||||
hiddenStates = layer.Forward(ctx, hiddenStates, positions, outputs, m.Cache, m.Options)
|
||||
}
|
||||
|
||||
hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, m.eps)
|
||||
return m.Output.Forward(ctx, hiddenStates), nil
|
||||
}
|
||||
|
||||
func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
|
||||
return fast.RoPE(ctx, key, shift, m.headDim(), m.ropeBase, m.ropeScale, rope.WithTypeNeoX()), nil
|
||||
}
|
||||
|
||||
var _ model.Model = (*Model)(nil)
|
||||
|
||||
func New(c fs.Config) (model.Model, error) {
|
||||
layers := make([]Layer, c.Uint("block_count"))
|
||||
for i := range layers {
|
||||
if c.String("general.architecture") == "qwen3moe" {
|
||||
layers[i].MLP = &sparse{}
|
||||
} else {
|
||||
layers[i].MLP = &dense{}
|
||||
}
|
||||
}
|
||||
|
||||
m := Model{
|
||||
BytePairEncoding: model.NewBytePairEncoding(
|
||||
`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
|
||||
&model.Vocabulary{
|
||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||
Merges: c.Strings("tokenizer.ggml.merges"),
|
||||
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
|
||||
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
|
||||
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
|
||||
EOS: append(
|
||||
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
|
||||
c.Ints("tokenizer.ggml.eos_token_ids")...,
|
||||
),
|
||||
},
|
||||
),
|
||||
Layers: layers,
|
||||
Options: &Options{
|
||||
hiddenSize: int(c.Uint("embedding_length")),
|
||||
numHeads: int(c.Uint("attention.head_count")),
|
||||
numKVHeads: int(c.Uint("attention.head_count_kv")),
|
||||
keyLength: int(c.Uint("attention.key_length")),
|
||||
valueLength: int(c.Uint("attention.value_length")),
|
||||
eps: c.Float("attention.layer_norm_rms_epsilon"),
|
||||
ropeBase: c.Float("rope.freq_base"),
|
||||
ropeScale: c.Float("rope.freq_scale", 1),
|
||||
numExperts: int(c.Uint("expert_count")),
|
||||
numExpertsUsed: int(c.Uint("expert_used_count")),
|
||||
normTopKProb: c.Bool("norm_top_k_prob", true),
|
||||
},
|
||||
}
|
||||
|
||||
m.Cache = kvcache.NewCausalCache(m.Shift)
|
||||
return &m, nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
model.Register("qwen3", New)
|
||||
model.Register("qwen3moe", New)
|
||||
}
|
||||
@@ -182,27 +182,12 @@ func (spm SentencePieceModel) Encode(s string, addSpecial bool) ([]int32, error)
|
||||
}
|
||||
}
|
||||
|
||||
slog.Log(context.TODO(), logutil.LevelTrace, "encoded", "string", s, "ids", ids)
|
||||
|
||||
if addSpecial && len(ids) > 0 {
|
||||
if spm.vocab.AddBOS {
|
||||
if ids[0] == spm.vocab.BOS {
|
||||
slog.Warn("adding bos token to prompt which already has it", "id", spm.vocab.BOS)
|
||||
}
|
||||
|
||||
slog.Debug("adding bos token to prompt", "id", spm.vocab.BOS)
|
||||
ids = append([]int32{spm.vocab.BOS}, ids...)
|
||||
}
|
||||
|
||||
if spm.vocab.AddEOS {
|
||||
if ids[len(ids)-1] == spm.vocab.EOS {
|
||||
slog.Warn("adding eos token to prompt which already has it", "id", spm.vocab.EOS)
|
||||
}
|
||||
|
||||
slog.Debug("adding eos token to prompt", "id", spm.vocab.EOS)
|
||||
ids = append(ids, spm.vocab.EOS)
|
||||
}
|
||||
ids = spm.vocab.addSpecials(ids)
|
||||
}
|
||||
|
||||
slog.Log(context.TODO(), logutil.LevelTrace, "encoded", "ids", ids)
|
||||
return ids, nil
|
||||
}
|
||||
|
||||
@@ -261,6 +246,6 @@ func (spm SentencePieceModel) Decode(ids []int32) (string, error) {
|
||||
}
|
||||
}
|
||||
|
||||
slog.Log(context.TODO(), logutil.LevelTrace, "decoded", "string", sb.String())
|
||||
slog.Log(context.TODO(), logutil.LevelTrace, "decoded", "ids", ids, "string", sb.String())
|
||||
return sb.String(), nil
|
||||
}
|
||||
model/textprocessor.go (new file, 17 lines)
@@ -0,0 +1,17 @@
package model

const (
	TOKEN_TYPE_NORMAL = iota + 1
	TOKEN_TYPE_UNKNOWN
	TOKEN_TYPE_CONTROL
	TOKEN_TYPE_USER_DEFINED
	TOKEN_TYPE_UNUSED
	TOKEN_TYPE_BYTE
)

type TextProcessor interface {
	Encode(s string, addSpecial bool) ([]int32, error)
	Decode([]int32) (string, error)
	Is(int32, Special) bool
	Vocabulary() *Vocabulary
}

model/vocabulary.go (new file, 112 lines)
@@ -0,0 +1,112 @@
|
||||
package model
|
||||
|
||||
import (
|
||||
"log/slog"
|
||||
"slices"
|
||||
"sync"
|
||||
)
|
||||
|
||||
type Special int32
|
||||
|
||||
const (
|
||||
SpecialBOS Special = iota
|
||||
SpecialEOS
|
||||
)
|
||||
|
||||
type Vocabulary struct {
|
||||
Values []string
|
||||
Types []int32
|
||||
Scores []float32
|
||||
Merges []string
|
||||
|
||||
BOS, EOS []int32
|
||||
AddBOS, AddEOS bool
|
||||
|
||||
specialOnce sync.Once
|
||||
special []string
|
||||
|
||||
valuesOnce sync.Once
|
||||
values map[string]int32
|
||||
|
||||
mergeOnce sync.Once
|
||||
merge map[string]int32
|
||||
}
|
||||
|
||||
func (v *Vocabulary) Is(id int32, special Special) bool {
|
||||
switch special {
|
||||
case SpecialBOS:
|
||||
return slices.Contains(v.BOS, id)
|
||||
case SpecialEOS:
|
||||
return slices.Contains(v.EOS, id)
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
func (v *Vocabulary) addSpecials(ids []int32) []int32 {
|
||||
if v.AddBOS && len(v.BOS) > 0 {
|
||||
if slices.Contains(v.BOS, ids[0]) {
|
||||
slog.Warn("adding bos token to prompt which already has it", "id", v.BOS)
|
||||
}
|
||||
|
||||
slog.Debug("adding bos token to prompt", "id", v.BOS)
|
||||
ids = append([]int32{v.BOS[0]}, ids...)
|
||||
}
|
||||
|
||||
if v.AddEOS && len(v.EOS) > 0 {
|
||||
if slices.Contains(v.EOS, ids[len(ids)-1]) {
|
||||
slog.Warn("adding eos token to prompt which already has it", "id", v.EOS)
|
||||
}
|
||||
|
||||
slog.Debug("adding eos token to prompt", "id", v.EOS)
|
||||
ids = append(ids, v.EOS[0])
|
||||
}
|
||||
|
||||
return ids
|
||||
}
|
||||
|
||||
func (v *Vocabulary) Encode(s string) int32 {
|
||||
v.valuesOnce.Do(func() {
|
||||
v.values = make(map[string]int32, len(v.Values))
|
||||
for i, value := range v.Values {
|
||||
v.values[value] = int32(i)
|
||||
}
|
||||
})
|
||||
|
||||
if id, ok := v.values[s]; ok {
|
||||
return id
|
||||
}
|
||||
|
||||
return -1
|
||||
}
|
||||
|
||||
func (v *Vocabulary) Decode(id int32) string {
|
||||
return v.Values[id]
|
||||
}
|
||||
|
||||
func (v *Vocabulary) SpecialVocabulary() []string {
|
||||
v.specialOnce.Do(func() {
|
||||
for i := range v.Values {
|
||||
if v.Types[i] == TOKEN_TYPE_CONTROL {
|
||||
v.special = append(v.special, v.Values[i])
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
return v.special
|
||||
}
|
||||
|
||||
func (v *Vocabulary) Merge(left, right string) int {
|
||||
v.mergeOnce.Do(func() {
|
||||
v.merge = make(map[string]int32, len(v.Merges))
|
||||
for i, merge := range v.Merges {
|
||||
v.merge[merge] = int32(i)
|
||||
}
|
||||
})
|
||||
|
||||
if id, ok := v.merge[left+" "+right]; ok {
|
||||
return int(id)
|
||||
}
|
||||
|
||||
return -1
|
||||
}
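Merge is a BPE merge-rank lookup: the pair must appear verbatim as "left right" in Merges, and the returned index is its priority (-1 means the pair never merges). A small usage sketch, assuming this package layout and hypothetical merge rules:

```go
package main

import (
	"fmt"

	"github.com/ollama/ollama/model"
)

func main() {
	v := &model.Vocabulary{Merges: []string{"h e", "he llo"}} // hypothetical merge rules
	fmt.Println(v.Merge("h", "e"))    // 0: highest-priority rule
	fmt.Println(v.Merge("he", "llo")) // 1
	fmt.Println(v.Merge("x", "y"))    // -1: pair never merges
}
```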
|
||||
@@ -292,13 +292,18 @@ func filesForModel(path string) ([]string, error) {
|
||||
}
|
||||
files = append(files, js...)
|
||||
|
||||
if tks, _ := glob(filepath.Join(path, "tokenizer.model"), "application/octet-stream"); len(tks) > 0 {
|
||||
// add tokenizer.model if it exists, tokenizer.json is automatically picked up by the previous glob
|
||||
// tokenizer.model might be a unresolved git lfs reference; error if it is
|
||||
files = append(files, tks...)
|
||||
} else if tks, _ := glob(filepath.Join(path, "**/tokenizer.model"), "text/plain"); len(tks) > 0 {
|
||||
// some times tokenizer.model is in a subdirectory (e.g. meta-llama/Meta-Llama-3-8B)
|
||||
files = append(files, tks...)
|
||||
// only include tokenizer.model if tokenizer.json is not present
|
||||
if !slices.ContainsFunc(files, func(s string) bool {
|
||||
return slices.Contains(strings.Split(s, string(os.PathSeparator)), "tokenizer.json")
|
||||
}) {
|
||||
if tks, _ := glob(filepath.Join(path, "tokenizer.model"), "application/octet-stream"); len(tks) > 0 {
|
||||
// add tokenizer.model if it exists, tokenizer.json is automatically picked up by the previous glob
|
||||
// tokenizer.model might be an unresolved git lfs reference; error if it is
|
||||
files = append(files, tks...)
|
||||
} else if tks, _ := glob(filepath.Join(path, "**/tokenizer.model"), "text/plain"); len(tks) > 0 {
|
||||
// sometimes tokenizer.model is in a subdirectory (e.g. meta-llama/Meta-Llama-3-8B)
|
||||
files = append(files, tks...)
|
||||
}
|
||||
}
|
||||
|
||||
return files, nil
|
||||
|
||||
@@ -61,6 +61,8 @@ const (
|
||||
ColorGrey = Esc + "[38;5;245m"
|
||||
ColorDefault = Esc + "[0m"
|
||||
|
||||
ColorBold = Esc + "[1m"
|
||||
|
||||
StartBracketedPaste = Esc + "[?2004h"
|
||||
EndBracketedPaste = Esc + "[?2004l"
|
||||
)
|
||||
|
||||
@@ -2,6 +2,8 @@ package common
|
||||
|
||||
import (
|
||||
"strings"
|
||||
|
||||
"github.com/ollama/ollama/llm"
|
||||
)
|
||||
|
||||
func FindStop(sequence string, stops []string) (bool, string) {
|
||||
@@ -29,68 +31,41 @@ func ContainsStopSuffix(sequence string, stops []string) bool {
|
||||
// truncateStop removes the provided stop string from pieces,
|
||||
// returning the partial pieces with stop removed, including truncating
|
||||
// the last piece if required (and signalling if this was the case)
|
||||
func TruncateStop(pieces []string, stop string) ([]string, bool) {
|
||||
joined := strings.Join(pieces, "")
|
||||
|
||||
index := strings.Index(joined, stop)
|
||||
if index == -1 {
|
||||
return pieces, false
|
||||
func TruncateStop(resps []llm.CompletionResponse, stop string) ([]llm.CompletionResponse, bool) {
|
||||
var sequence string
|
||||
for _, resp := range resps {
|
||||
sequence += resp.Content
|
||||
}
|
||||
|
||||
joined = joined[:index]
|
||||
|
||||
// Split truncated string back into pieces of original lengths
|
||||
lengths := make([]int, len(pieces))
|
||||
for i, piece := range pieces {
|
||||
lengths[i] = len(piece)
|
||||
idx := strings.Index(sequence, stop)
|
||||
if idx < 0 {
|
||||
return resps, false
|
||||
}
|
||||
|
||||
var result []string
|
||||
tokenTruncated := false
|
||||
start := 0
|
||||
for _, length := range lengths {
|
||||
if start >= len(joined) {
|
||||
truncated := sequence[:idx]
|
||||
if len(truncated) == 0 {
|
||||
return nil, true
|
||||
}
|
||||
|
||||
result := make([]llm.CompletionResponse, 0, len(resps))
|
||||
|
||||
// Track position in truncated sequence
|
||||
pos := 0
|
||||
truncationHappened := false
|
||||
for _, resp := range resps {
|
||||
if pos >= len(truncated) {
|
||||
break
|
||||
}
|
||||
|
||||
end := start + length
|
||||
if end > len(joined) {
|
||||
end = len(joined)
|
||||
tokenTruncated = true
|
||||
chunk := truncated[pos:min(pos+len(resp.Content), len(truncated))]
|
||||
if len(chunk) < len(resp.Content) {
|
||||
truncationHappened = true
|
||||
}
|
||||
result = append(result, joined[start:end])
|
||||
start = end
|
||||
if len(chunk) > 0 {
|
||||
result = append(result, llm.CompletionResponse{Content: chunk})
|
||||
}
|
||||
pos += len(resp.Content)
|
||||
}
|
||||
|
||||
return result, tokenTruncated
|
||||
}
|
||||
|
||||
func IncompleteUnicode(token string) bool {
|
||||
incomplete := false
|
||||
|
||||
// check if there is incomplete UTF-8 character at the end
|
||||
for i := 1; i < 5 && i <= len(token); i++ {
|
||||
c := token[len(token)-i]
|
||||
|
||||
if (c & 0xc0) == 0x80 {
|
||||
// continuation byte: 10xxxxxx
|
||||
continue
|
||||
}
|
||||
|
||||
if (c & 0xe0) == 0xc0 {
|
||||
// 2-byte character: 110xxxxx ...
|
||||
incomplete = i < 2
|
||||
} else if (c & 0xf0) == 0xe0 {
|
||||
// 3-byte character: 1110xxxx ...
|
||||
incomplete = i < 3
|
||||
} else if (c & 0xf8) == 0xf0 {
|
||||
// 4-byte character: 11110xxx ...
|
||||
incomplete = i < 4
|
||||
}
|
||||
|
||||
// else 1-byte character or invalid byte
|
||||
break
|
||||
}
|
||||
|
||||
return incomplete
|
||||
return result, truncationHappened
|
||||
}
|
||||
|
||||
@@ -1,51 +1,84 @@
|
||||
package common
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/ollama/ollama/llm"
|
||||
)
|
||||
|
||||
func TestTruncateStop(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
pieces []string
|
||||
pieces []llm.CompletionResponse
|
||||
stop string
|
||||
expected []string
|
||||
expected []llm.CompletionResponse
|
||||
expectedTrunc bool
|
||||
}{
|
||||
{
|
||||
name: "Single word",
|
||||
pieces: []string{"hello", "world"},
|
||||
stop: "world",
|
||||
expected: []string{"hello"},
|
||||
name: "Single word",
|
||||
pieces: []llm.CompletionResponse{
|
||||
{Content: "Hello"},
|
||||
{Content: "world"},
|
||||
},
|
||||
stop: "world",
|
||||
expected: []llm.CompletionResponse{
|
||||
{Content: "Hello"},
|
||||
},
|
||||
expectedTrunc: false,
|
||||
},
|
||||
{
|
||||
name: "Partial",
|
||||
pieces: []string{"hello", "wor"},
|
||||
stop: "or",
|
||||
expected: []string{"hello", "w"},
|
||||
name: "Partial",
|
||||
pieces: []llm.CompletionResponse{
|
||||
{Content: "Hello"},
|
||||
{Content: " wor"},
|
||||
},
|
||||
stop: "or",
|
||||
expected: []llm.CompletionResponse{
|
||||
{Content: "Hello"},
|
||||
{Content: " w"},
|
||||
},
|
||||
expectedTrunc: true,
|
||||
},
|
||||
{
|
||||
name: "Suffix",
|
||||
pieces: []string{"Hello", " there", "!"},
|
||||
stop: "!",
|
||||
expected: []string{"Hello", " there"},
|
||||
name: "Suffix",
|
||||
pieces: []llm.CompletionResponse{
|
||||
{Content: "Hello"},
|
||||
{Content: " there"},
|
||||
{Content: "!"},
|
||||
},
|
||||
stop: "!",
|
||||
expected: []llm.CompletionResponse{
|
||||
{Content: "Hello"},
|
||||
{Content: " there"},
|
||||
},
|
||||
expectedTrunc: false,
|
||||
},
|
||||
{
|
||||
name: "Suffix partial",
|
||||
pieces: []string{"Hello", " the", "re!"},
|
||||
stop: "there!",
|
||||
expected: []string{"Hello", " "},
|
||||
name: "Suffix partial",
|
||||
pieces: []llm.CompletionResponse{
|
||||
{Content: "Hello"},
|
||||
{Content: " the"},
|
||||
{Content: "re!"},
|
||||
},
|
||||
stop: "there!",
|
||||
expected: []llm.CompletionResponse{
|
||||
{Content: "Hello"},
|
||||
{Content: " "},
|
||||
},
|
||||
expectedTrunc: true,
|
||||
},
|
||||
{
|
||||
name: "Middle",
|
||||
pieces: []string{"hello", " wor"},
|
||||
stop: "llo w",
|
||||
expected: []string{"he"},
|
||||
name: "Middle",
|
||||
pieces: []llm.CompletionResponse{
|
||||
{Content: "Hello"},
|
||||
{Content: " wo"},
|
||||
},
|
||||
stop: "llo w",
|
||||
expected: []llm.CompletionResponse{
|
||||
{Content: "He"},
|
||||
},
|
||||
expectedTrunc: true,
|
||||
},
|
||||
}
|
||||
@@ -54,76 +87,23 @@ func TestTruncateStop(t *testing.T) {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result, resultTrunc := TruncateStop(tt.pieces, tt.stop)
|
||||
if !reflect.DeepEqual(result, tt.expected) || resultTrunc != tt.expectedTrunc {
|
||||
t.Errorf("truncateStop(%v, %s): have %v (%v); want %v (%v)", tt.pieces, tt.stop, result, resultTrunc, tt.expected, tt.expectedTrunc)
|
||||
t.Errorf("truncateStop(%v, %v):\n%shave truncated %v\nwant truncated %v",
|
||||
tt.pieces, tt.stop, formatContentDiff(result, tt.expected), resultTrunc, tt.expectedTrunc)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestIncompleteUnicode(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
input string
|
||||
expected bool
|
||||
}{
|
||||
{
|
||||
name: "Basic",
|
||||
input: "hi",
|
||||
expected: false,
|
||||
},
|
||||
{
|
||||
name: "Two byte",
|
||||
input: "hi" + string([]byte{0xc2, 0xa3}),
|
||||
expected: false,
|
||||
},
|
||||
{
|
||||
name: "Two byte - missing last",
|
||||
input: "hi" + string([]byte{0xc2}),
|
||||
expected: true,
|
||||
},
|
||||
{
|
||||
name: "Three byte",
|
||||
input: "hi" + string([]byte{0xe0, 0xA0, 0x80}),
|
||||
expected: false,
|
||||
},
|
||||
{
|
||||
name: "Three byte - missing last",
|
||||
input: "hi" + string([]byte{0xe0, 0xA0}),
|
||||
expected: true,
|
||||
},
|
||||
{
|
||||
name: "Three byte - missing last 2",
|
||||
input: "hi" + string([]byte{0xe0}),
|
||||
expected: true,
|
||||
},
|
||||
{
|
||||
name: "Four byte",
|
||||
input: "hi" + string([]byte{0xf0, 0x92, 0x8a, 0xb7}),
|
||||
expected: false,
|
||||
},
|
||||
{
|
||||
name: "Four byte - missing last",
|
||||
input: "hi" + string([]byte{0xf0, 0x92, 0x8a}),
|
||||
expected: true,
|
||||
},
|
||||
{
|
||||
name: "Four byte - missing last 2",
|
||||
input: "hi" + string([]byte{0xf0, 0x92}),
|
||||
expected: true,
|
||||
},
|
||||
{
|
||||
name: "Four byte - missing last 3",
|
||||
input: "hi" + string([]byte{0xf0}),
|
||||
expected: true,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := IncompleteUnicode(tt.input)
|
||||
if result != tt.expected {
|
||||
t.Errorf("incompleteUnicode(%s): have %v; want %v", tt.input, result, tt.expected)
|
||||
}
|
||||
})
|
||||
func formatContentDiff(result, expected []llm.CompletionResponse) string {
	var s string
	for i := 0; i < len(result) || i < len(expected); i++ {
		if i < len(result) && i < len(expected) && result[i].Content != expected[i].Content {
			s += fmt.Sprintf("[%d] %q vs %q\n", i, result[i].Content, expected[i].Content)
		} else if i < len(result) && i >= len(expected) {
			s += fmt.Sprintf("[%d] extra %q\n", i, result[i].Content)
		} else if i >= len(result) && i < len(expected) {
			s += fmt.Sprintf("[%d] missing %q\n", i, expected[i].Content)
		}
	}
	return s
}

@@ -104,8 +104,8 @@ func (c *InputCache) LoadCacheSlot(prompt []input, cachePrompt bool) (*InputCach
	slog.Debug("loading cache slot", "id", slot.Id, "cache", len(slot.Inputs), "prompt", len(prompt),
		"used", numPast, "remaining", len(prompt)-numPast)

	slot.Inputs = prompt[:numPast]
	prompt = prompt[numPast:]
	slot.Inputs = slot.Inputs[:numPast]

	return slot, prompt, nil
}

@@ -17,7 +17,6 @@ import (
	"strings"
	"sync"
	"time"
	"unicode/utf8"

	"golang.org/x/sync/semaphore"

@@ -52,13 +51,13 @@ type Sequence struct {
	pendingInputs []input

	// tokens that have been generated but not returned yet (e.g. for stop sequences)
	pendingResponses []string
	pendingResponses []llm.CompletionResponse

	// input cache being used by this sequence
	cache *InputCacheSlot

	// channel to send responses over
	responses chan string
	responses chan llm.CompletionResponse

	// channel to stop decoding (such as if the remote connection is closed)
	quit chan bool
@@ -89,6 +88,19 @@ type Sequence struct {
	numPromptInputs int
}

func (seq *Sequence) send(resp llm.CompletionResponse) bool {
	if len(resp.Content) > 0 || resp.Done {
		select {
		case seq.responses <- resp:
			// Successfully sent
			return true
		case <-seq.quit:
			return false
		}
	}
	return true
}
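The send helper above uses a standard Go pattern: block on the buffered responses channel, but give up as soon as quit is signalled so a client that has disconnected cannot wedge the decode loop. A stripped-down, self-contained illustration of that pattern (the names here are local to the example, not the runner's Sequence):

```go
package main

import "fmt"

type sequence struct {
	responses chan string
	quit      chan bool
}

// send blocks until the response is accepted or the sequence is told to quit.
func (s *sequence) send(resp string) bool {
	select {
	case s.responses <- resp:
		return true
	case <-s.quit:
		return false
	}
}

func main() {
	s := &sequence{responses: make(chan string), quit: make(chan bool)}
	close(s.quit)                // the consumer went away, e.g. the HTTP connection closed
	fmt.Println(s.send("hello")) // false: the producer backs off instead of blocking forever
}
```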

type NewSequenceParams struct {
	numPredict int
	stop       []string
@@ -147,8 +159,8 @@ func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSe
		numPromptInputs:     len(inputs),
		startProcessingTime: startTime,
		numPredict:          params.numPredict,
		pendingResponses:    make([]string, 0),
		responses:           make(chan string, 100),
		pendingResponses:    make([]llm.CompletionResponse, 0),
		responses:           make(chan llm.CompletionResponse, 100),
		quit:                make(chan bool, 1),
		embedding:           make(chan []float32, 1),
		samplingCtx:         sc,
@@ -272,36 +284,15 @@ func (s *Server) allNil() bool {
	return true
}

func flushPending(seq *Sequence) bool {
	joined := strings.Join(seq.pendingResponses, "")
	seq.pendingResponses = []string{}

	// Check if there are any partial UTF-8 characters remaining.
	// We already check and queue as we are generating but some may
	// still make it here:
	// - Sequence is ending, e.g. generation limit has been hit
	// - Invalid characters in the middle of a string
	// This is a stricter check to ensure we never output invalid Unicode.
	for !utf8.ValidString(joined) {
		joined = joined[:len(joined)-1]
	}

	if len(joined) == 0 {
		return true
	}

	select {
	case seq.responses <- joined:
		return true
	case <-seq.quit:
		return false
	}
}

func (s *Server) removeSequence(seqIndex int, reason llm.DoneReason) {
	seq := s.seqs[seqIndex]

	flushPending(seq)
	// Send any remaining pending responses
	for _, resp := range seq.pendingResponses {
		seq.send(resp)
	}
	seq.pendingResponses = []llm.CompletionResponse{}

	seq.doneReason = reason
	close(seq.responses)
	close(seq.embedding)
@@ -490,8 +481,11 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)

		seq.inputs = []input{{token: token}}

		seq.pendingResponses = append(seq.pendingResponses, piece)
		sequence := strings.Join(seq.pendingResponses, "")
		seq.pendingResponses = append(seq.pendingResponses, llm.CompletionResponse{Content: piece})
		sequence := ""
		for _, r := range seq.pendingResponses {
			sequence += r.Content
		}

		if ok, stop := common.FindStop(sequence, seq.stop); ok {
			slog.Debug("hit stop token", "pending", seq.pendingResponses, "stop", stop)
@@ -523,13 +517,13 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
			continue
		}

		if common.IncompleteUnicode(sequence) {
			continue
		}

		if !flushPending(seq) {
			s.removeSequence(i, llm.DoneReasonConnectionClosed)
		for _, resp := range seq.pendingResponses {
			if !seq.send(resp) {
				s.removeSequence(i, llm.DoneReasonConnectionClosed)
				break
			}
		}
		seq.pendingResponses = []llm.CompletionResponse{}
	}

	return nil
@@ -627,9 +621,7 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
			return
		case content, ok := <-seq.responses:
			if ok {
				if err := json.NewEncoder(w).Encode(&llm.CompletionResponse{
					Content: content,
				}); err != nil {
				if err := json.NewEncoder(w).Encode(&content); err != nil {
					http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
					close(seq.quit)
					return
@@ -136,8 +136,8 @@ func (c *InputCache) LoadCacheSlot(prompt []input.Input) (*InputCacheSlot, []inp
	slog.Debug("loading cache slot", "id", slot.Id, "cache", len(slot.Inputs), "prompt", len(prompt),
		"used", numPast, "remaining", int32(len(prompt))-numPast)

	slot.Inputs = prompt[:numPast]
	prompt = prompt[numPast:]
	slot.Inputs = slot.Inputs[:numPast]

	return slot, prompt, nil
}

@@ -3,7 +3,6 @@ package ollamarunner
import (
	"errors"
	"fmt"
	"image"
	"testing"
	"time"

@@ -12,10 +11,6 @@ import (
)

func TestCountCommon(t *testing.T) {
	imgA := image.NewRGBA(image.Rect(0, 0, 100, 100))
	imgB := image.NewRGBA(image.Rect(0, 0, 50, 50))
	imgC := image.NewRGBA(image.Rect(50, 50, 100, 100))

	tests := []struct {
		name string
		t1   []input.Input
@@ -36,20 +31,20 @@ func TestCountCommon(t *testing.T) {
		},
		{
			name: "Image Prefix",
			t1:   []input.Input{{Multimodal: imgA, MultimodalHash: 1}},
			t2:   []input.Input{{Multimodal: imgA, MultimodalHash: 1}, {Multimodal: imgB, MultimodalHash: 2}, {Multimodal: imgC, MultimodalHash: 3}},
			t1:   []input.Input{{MultimodalHash: 1}},
			t2:   []input.Input{{MultimodalHash: 1}, {MultimodalHash: 2}, {MultimodalHash: 3}},
			expected: 1,
		},
		{
			name: "Mixed",
			t1:   []input.Input{{Token: 1}, {Multimodal: imgA, MultimodalHash: 1}},
			t2:   []input.Input{{Token: 1}, {Multimodal: imgA, MultimodalHash: 1}, {Token: 5}},
			t1:   []input.Input{{Token: 1}, {MultimodalHash: 1}},
			t2:   []input.Input{{Token: 1}, {MultimodalHash: 1}, {Token: 5}},
			expected: 2,
		},
		{
			name: "Mixed, Same Length",
			t1:   []input.Input{{Token: 1}, {Multimodal: imgA, MultimodalHash: 1}},
			t2:   []input.Input{{Token: 1}, {Multimodal: imgB, MultimodalHash: 2}},
			t1:   []input.Input{{Token: 1}, {MultimodalHash: 1}},
			t2:   []input.Input{{Token: 1}, {MultimodalHash: 2}},
			expected: 1,
		},
		{

runner/ollamarunner/multimodal.go (new file, 113 lines)
@@ -0,0 +1,113 @@
package ollamarunner

import (
	"errors"

	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/model/input"
)

// Tensors can't be used across multiple compute graphs. This is a problem
// if a single embedding is split across batches using views since all of
// the views will have the same source tensor. We also don't want to
// recompute the entire embedding for each batch.
//
// To avoid this, we compute all of the tensors for the embedding on the
// first use and then store the result in system memory. When we need
// additional tensors, we recreate them from the stored data.

// multimodalEntry represents the embeddings of a single object (such
// as an image).
type multimodalEntry struct {
	// mm is the original set of tensors created by EncodeMultimodal
	mm []input.Multimodal

	// data is the computed result of mm. Nil if not yet computed
	data [][]float32
}

// multimodalStore maps from an individual tensor (of which there
// may be many in a single multimodal object) to its parent embedding
type multimodalStore map[ml.Tensor]*multimodalEntry

func newMultimodalStore() multimodalStore {
	return make(multimodalStore)
}

// addMultimodal stores an embedding for later use in a compute graph
func (m multimodalStore) addMultimodal(embedding []input.Multimodal) {
	entry := &multimodalEntry{mm: embedding}

	for _, e := range embedding {
		if e.Tensor != nil {
			m[e.Tensor] = entry
		}
	}
}

// getMultimodal takes a source set of tensors (which may contain a whole or
// parts of one or more images) and returns the equivalent that can be used in
// the current context
func (m multimodalStore) getMultimodal(backend ml.Backend, ctx ml.Context, in []input.Multimodal, reserve bool) ([]input.Multimodal, error) {
	out := make([]input.Multimodal, len(in))
	for i := range out {
		if in[i].Tensor != nil {
			var err error
			out[i].Tensor, err = m.getTensor(backend, ctx, in[i].Tensor, reserve)
			if err != nil {
				return nil, err
			}
		}

		out[i].Data = in[i].Data
	}

	return out, nil
}

func (m multimodalStore) getTensor(backend ml.Backend, ctx ml.Context, in ml.Tensor, reserve bool) (ml.Tensor, error) {
	entry := m[in]

	if entry.data == nil {
		computeCtx := backend.NewContext()
		defer computeCtx.Close()

		var tensors []ml.Tensor
		for _, t := range entry.mm {
			if t.Tensor != nil {
				tensors = append(tensors, t.Tensor)
			}
		}

		if len(tensors) == 0 {
			return nil, nil
		}

		computeCtx.Forward(tensors...)
		entry.data = make([][]float32, len(entry.mm))

		if !reserve {
			computeCtx.Compute(tensors...)

			for i, t := range entry.mm {
				if t.Tensor != nil {
					entry.data[i] = t.Tensor.Floats()
				}
			}
		} else {
			computeCtx.Reserve()
		}
	}

	for i, t := range entry.mm {
		if in == t.Tensor {
			if !reserve {
				return ctx.Input().FromFloatSlice(entry.data[i], t.Tensor.Shape()...), nil
			} else {
				return ctx.Input().Empty(t.Tensor.DType(), t.Tensor.Shape()...), nil
			}
		}
	}

	return nil, errors.New("multimodal tensor not found")
}
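The comment block at the top of this file describes the core idea: run the expensive computation once, keep the raw floats in host memory, and rebuild a fresh per-graph value from that data on every later use. A toy, self-contained illustration of that caching shape follows; the types are stand-ins chosen for the example, not the ml.Tensor / input.Multimodal API used above.

```go
package main

import "fmt"

// tensor and entry are illustrative stand-ins for graph-bound tensors and
// their host-memory cache; they are not the real ml types.
type tensor struct{ id string }

type entry struct {
	src  *tensor   // original tensor from the first compute graph
	data []float32 // computed values kept in system memory
}

type store map[*tensor]*entry

func (s store) add(t *tensor) { s[t] = &entry{src: t} }

// get computes the value on first access, then rebuilds it from the cached
// floats on later accesses instead of reusing a tensor tied to an old graph.
func (s store) get(t *tensor, compute func(*tensor) []float32) []float32 {
	e := s[t]
	if e.data == nil {
		e.data = compute(t) // first use: run the expensive computation
	}
	out := make([]float32, len(e.data)) // later uses: fresh copy per "graph"
	copy(out, e.data)
	return out
}

func main() {
	s := store{}
	img := &tensor{id: "image-embedding"}
	s.add(img)

	calls := 0
	compute := func(*tensor) []float32 { calls++; return []float32{1, 2, 3} }

	_ = s.get(img, compute) // first batch: computes
	_ = s.get(img, compute) // second batch: served from cached data
	fmt.Println("compute calls:", calls) // compute calls: 1
}
```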
Some files were not shown because too many files have changed in this diff.