mirror of https://github.com/ollama/ollama.git
synced 2026-01-03 21:19:20 -05:00

Compare commits: brucemacd/... → pdevine/ge (9 commits)

| Author | SHA1 | Date |
|---|---|---|
|  | b7349a4efd |  |
|  | 4cda3e3622 |  |
|  | 95fbf1da12 |  |
|  | 83d1a1ab55 |  |
|  | 035e69799e |  |
|  | 10e06d0a45 |  |
|  | 8cf1ea4fd8 |  |
|  | d231229122 |  |
|  | fad98fabab |  |
@@ -24,7 +24,7 @@ set(GGML_LLAMAFILE ON)
 set(GGML_CUDA_PEER_MAX_BATCH_SIZE 128)
 set(GGML_CUDA_GRAPHS ON)
 
-if((CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
+if((NOT CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
     OR (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm|aarch64|ARM64|ARMv[0-9]+"))
     set(GGML_CPU_ALL_VARIANTS ON)
 endif()
@@ -1,338 +0,0 @@ (deleted file; its content follows)

# Guide: Implementing Models in Ollama's Go Inference Engine

> **Note**: This guide and the Go inference engine are in early development and will be updated as implementation details evolve.

This guide outlines the process of implementing a new model in Ollama's inference engine. It covers everything from initial setup to publishing your model to ollama.com.

## Architecture Overview

Below is a diagram showing Ollama's inference engine architecture layers and how they interact:

```mermaid
graph TB
    subgraph Models["Model Layer: LLM Implementations"]
        direction TB
        llama["model/models/llama"]
        mllama["model/models/mllama"]
        qwen["model/models/qwen2"]
        etc["...etc"]

        note1[" Each model implements a<br>specific architecture:<br>- Defines model parameters<br>- Implements forward pass"]
    end

    subgraph ML_Ops["Neural Network Operations"]
        direction TB
        nn_ops[" nn/<br>linear.go: Matrix multiplication<br>embedding.go: Token embedding lookups<br>normalization.go: Layer norm operations<br>convolution.go: Convolutional operations "]

        backend[" ml/backend.go<br>Hardware Abstraction Layer:<br>- Defines tensor operations<br>- Manages computation graphs<br>- Handles memory allocation "]

        note2[" Common neural net operations:<br>- Abstracts hardware details<br>- Provides unified API<br>- Manages computation flow "]
    end

    subgraph Hardware["Backend Execution Layer"]
        direction TB
        backend_impl[" The backend package provides:<br>- Unified computation interface<br>- Automatic hardware selection<br>- Optimized kernels<br>- Efficient memory management "]

        subgraph Backends["Backend Implementations"]
            direction LR
            cpu["backend/cpu<br>- Pure Go implementation<br>- Fallback for all platforms"]

            metal["backend/metal<br>- Apple Silicon (M1/M2/M3)<br>- MLX integration<br>- Leverages Apple Neural Engine"]

            onnx["backend/onnx<br>- Cross-platform compatibility<br>- ONNX Runtime integration<br>- Pre-compiled graph execution"]

            ggml["backend/ggml<br>- CPU/GPU quantized compute<br>- Low-precision operations<br>- Memory-efficient inferencing"]
        end
    end

    Models --> |" Makes high-level calls<br>(e.g., self-attention) "| ML_Ops
    ML_Ops --> |" Translates to tensor operations<br>(e.g., matmul, softmax) "| Hardware
    backend_impl --> Backends
```

When implementing a new model, you'll primarily work in the model layer, interfacing with the neural network operations layer.

## Implementation Process Overview

Here's the high-level process for implementing a new model in Ollama:

1. **Environment Setup**: Clone the repository and set up your development environment
2. **Research Implementation**: Understand the original model architecture
3. **Project Structure Setup**: Set up the necessary file structure
4. **Create a Basic Modelfile**: Create a simple Modelfile for testing
5. **Implement Weight Conversion**: Map from the original format to GGUF
6. **Open a Draft PR**: Create a draft pull request to establish communication with maintainers
7. **Implement Model Logic**: Create the model architecture and forward pass
8. **Quality Check and Final Steps**: Create a final Modelfile, add tests, and ensure functionality
9. **Finalize PR and Publish**: Complete the PR and publish to ollama.com

## Implementation Steps in Detail

### 1. Environment Setup

First, clone the Ollama repository and get it running locally. Follow the development setup guide at:
https://github.com/ollama/ollama/blob/main/docs/development.md
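As a rough sketch of that setup (the development guide above is the authoritative reference; this assumes a working Go toolchain):

```bash
git clone https://github.com/ollama/ollama.git
cd ollama
go build .
./ollama serve
```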
### 2. Research Implementation

Get the original model implementation running. This typically involves:
- Cloning the research code repository (usually Python-based)
- Setting up the required environment
- Running inference with sample inputs
- Understanding the model architecture and forward pass

### 3. Project Structure Setup

Create the necessary file structure by referencing previous model implementations. You'll need:

```
convert/
└── convert_your-model.go   # Weight conversion logic (PyTorch/SafeTensors to GGML)
model/
└── your-model/
    └── model.go            # Architecture and forward pass implementation
```

Add your model to the main paths in [model/models/models.go](https://github.com/ollama/ollama/blob/main/model/models/models.go):

```go
package models

import (
	_ "github.com/ollama/ollama/model/models/llama"
	_ "github.com/ollama/ollama/model/models/mllama"
	_ "github.com/ollama/ollama/model/models/your-model" // Add your model here
)
```
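The blank import works because each model package registers its constructor when it is initialized. A minimal sketch of that registration in your `model.go`, following the same pattern the gemma2 implementation uses further down this page (`New` is your model's constructor):

```go
package yourmodel

import "github.com/ollama/ollama/model"

func init() {
	// Register this architecture under the name stored in the GGUF
	// "general.architecture" field so the engine can construct it.
	model.Register("your-model", New)
}
```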
### 4. Create a Basic Modelfile

Create a simple Modelfile early in the process to facilitate testing:

```
FROM /path/to/model
TEMPLATE "{{.Prompt}}" # Use a static prompt format for initial testing
```

This allows you to test your implementation with consistent inputs before finalizing the proper prompt template.
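With that Modelfile you can already exercise the end-to-end path straight from your working tree, for example (a sketch; `my-model` is a placeholder name):

```bash
go run . create my-model -f /path/to/Modelfile
go run . run my-model "a consistent test prompt"
```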
### 5. Implement Weight Conversion

- Work on `convert/convert_your-model.go`
- Reference existing conversion implementations
- Conversion involves mapping from PyTorch/SafeTensors naming to GGUF naming as you see fit
- Understand typical GGUF layout and structure:

**Typical GGUF Layout:**
```
GGUF
├── Metadata Section
│   ├── Model Parameters
│   │   ├── General architecture parameters
│   │   │   ├── "{arch}.vocab_size" (e.g., "llama.vocab_size")
│   │   │   ├── "{arch}.context_length" (e.g., "llama.context_length")
│   │   │   ├── "{arch}.embedding_length" (e.g., "llama.embedding_length")
│   │   │   └── "{arch}.block_count" (e.g., "llama.block_count")
│   │   │
│   │   └── Architecture-specific parameters
│   │       ├── "{arch}.attention.head_count" (e.g., "llama.attention.head_count")
│   │       ├── "{arch}.attention.head_count_kv" (e.g., "llama.attention.head_count_kv")
│   │       ├── "{arch}.rope.dimension_count" (e.g., "llama.rope.dimension_count")
│   │       └── "{arch}.attention.layer_norm_rms_epsilon" (e.g., "llama.attention.layer_norm_rms_epsilon")
│   │
│   ├── Tokenizer parameters
│   │   ├── "tokenizer.ggml.model" (e.g., "llama")
│   │   ├── "tokenizer.ggml.tokens" (vocabulary tokens)
│   │   ├── "tokenizer.ggml.bos_id" (beginning of sequence token ID)
│   │   └── "tokenizer.ggml.eos_id" (end of sequence token ID)
│   │
│   └── General metadata
│       └── "general.architecture" (e.g., "llama", "qwen2", "phi")
│
└── Tensor Data Section
    ├── Common tensors:
    │   ├── "token_embd.weight" (token embedding matrix)
    │   ├── "rope_freqs.weight" (RoPE frequency weights)
    │   ├── "output_norm.weight" (final layer normalization)
    │   └── "output.weight" (output projection)
    │
    └── Layer-specific tensors:
        ├── "blk.{i}.attn_q.weight" (query projection)
        ├── "blk.{i}.attn_k.weight" (key projection)
        ├── "blk.{i}.attn_v.weight" (value projection)
        ├── "blk.{i}.attn_output.weight" (attention output)
        ├── "blk.{i}.attn_norm.weight" (attention normalization)
        ├── "blk.{i}.ffn_norm.weight" (feed-forward normalization)
        ├── "blk.{i}.ffn_up.weight" (FFN up projection)
        ├── "blk.{i}.ffn_down.weight" (FFN down projection)
        └── "blk.{i}.ffn_gate.weight" (FFN gate projection)
```
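On the Go side these metadata keys are read back through the `ml.Config` accessors (the same `Uint`/`Float`/`Strings` calls the gemma2 implementation later on this page uses). Note that keys without a `tokenizer.` or `general.` prefix are resolved relative to the architecture, as the `keyValue` helper in the diff further below shows. A sketch of typical lookups inside `New(c ml.Config)`:

```go
// "block_count" resolves to "{arch}.block_count", e.g. "llama.block_count",
// while tokenizer.* and general.* keys are looked up verbatim.
blockCount := c.Uint("block_count")
eps := c.Float("attention.layer_norm_rms_epsilon")
tokens := c.Strings("tokenizer.ggml.tokens")
```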
- Key conversion details include:
  - Linear weight matrices (sometimes need transposition)
  - Layer normalization weights (might need reshaping)
  - **Note: In GGML, FFN values are for the MLP (Multi-Layer Perceptron) part of the architecture**
  - A sketch of a typical name-mapping step follows this list

- Test conversion:
```bash
go run . create <my-model> -f /path/to/Modelfile
```
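A minimal, hypothetical sketch of that naming step, mapping common PyTorch/SafeTensors names onto the GGUF names from the layout above (the source-side names and the helper are illustrative, not Ollama's converter API):

```go
package convert

import "strings"

// replacements pairs source tensor name fragments with their GGUF names.
// The left-hand names vary by model family; these are illustrative.
var replacements = []string{
	"model.embed_tokens", "token_embd",
	"model.norm", "output_norm",
	"lm_head", "output",
	"self_attn.q_proj", "attn_q",
	"self_attn.k_proj", "attn_k",
	"self_attn.v_proj", "attn_v",
	"self_attn.o_proj", "attn_output",
	"mlp.gate_proj", "ffn_gate",
	"mlp.up_proj", "ffn_up",
	"mlp.down_proj", "ffn_down",
	"model.layers.", "blk.",
}

// ggufName rewrites one source tensor name using the table above.
func ggufName(name string) string {
	return strings.NewReplacer(replacements...).Replace(name)
}
```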
### 6. Open a Draft PR

After implementing the initial weight conversion, creating a draft pull request is recommended as it:
- Establishes a communication channel with Ollama maintainers
- Allows for early feedback on your approach
- Makes it easier to track progress and changes

To open a draft PR:
1. Fork the repository
2. Create a new branch for your model implementation
3. Make initial commits with your weight conversion implementation
4. Open a PR in the `ollama/ollama` repository and mark it as draft
5. Include a clear description of the model you're implementing

### 7. Implement Model Logic

- Reference existing model implementations
- Implement `New()` and `Forward()` functions in `model.go`:

**The `New()` function:**
- Creates and initializes your model structure
- Loads configuration parameters (embedding size, attention heads, etc.)
- Sets up the tokenizer with vocabulary and special tokens
- Initializes all model layers and weights
- **Important**: Sets up the KV cache for efficient inference
- Example:
```go
func New(c ml.Config) (model.Model, error) {
    m := &Model{
        // Initialize tokenizer
        BytePairEncoding: model.NewBytePairEncoding(...),
        // Create layer arrays
        Layers: make([]Layer, c.Uint("block_count")),
        // Set model parameters
        Options: &Options{...},
    }
    // Initialize KV cache for efficient inference
    m.Cache = kvcache.NewCausalCache(m.Shift)
    return m, nil
}
```

**The `Forward()` function:**
- **What it does**: Defines the computational graph of your model
- **Important**: The graph is NOT executed immediately - it's built first, then executed later when predictions are needed
- Takes input tokens and converts them to embeddings
- Processes inputs through transformer layers (attention and feed-forward networks)
- Creates the path for data flow through your model's components
- Example:
```go
func (m *Model) Forward(ctx ml.Context, opts model.Options) (ml.Tensor, error) {
    // Convert inputs to tensors
    inputTensor, _ := ctx.FromIntSlice(opts.Inputs, len(opts.Inputs))
    positionsTensor, _ := ctx.FromIntSlice(opts.Positions, len(opts.Positions))

    // Initial token embedding
    hiddenStates := m.TokenEmbedding.Forward(ctx, inputTensor)

    // Process through transformer layers
    for i, layer := range m.Layers {
        m.Cache.SetLayer(i)
        hiddenStates = layer.Forward(ctx, hiddenStates, positionsTensor, m.Cache, m.Options)
    }

    // Final processing and output
    normalizedOutput := m.OutputNorm.Forward(ctx, hiddenStates, m.modelEpsilon)
    logits := m.Output.Forward(ctx, normalizedOutput)

    // Return logits for requested positions
    outputsTensor, _ := ctx.FromIntSlice(opts.Outputs, len(opts.Outputs))
    return logits.Rows(ctx, outputsTensor), nil
}
```

**Key Components to Implement:**

1. **KV Cache**:
   - Improves inference performance for text generation
   - How it works: Stores previously computed key and value tensors from self-attention, avoiding redundant computations
   - Implementation: Use `kvcache.NewCausalCache()` for autoregressive models
   - Important: Must implement the `Shift()` function to handle rotary position embeddings with the cache (a sketch follows this list)

2. **Self-Attention**:
   - Core component that learns contextual relationships between tokens
   - Implements query, key, value projections and their interactions
   - Must handle positional encoding (usually Rotary Position Embeddings)
   - Uses the KV cache to make generation efficient

3. **Normalization Layers**:
   - Purpose: Stabilizes training and maintains consistent activation distributions
   - Types: RMSNorm, LayerNorm, etc., depending on model architecture
   - Implementation: Apply before attention and feed-forward networks
   - Example: `normalizedOutput := m.OutputNorm.Forward(ctx, hiddenStates, m.modelEpsilon)`

4. **Activation Functions**:
   - Purpose: Introduces non-linearity into the model
   - Common types: SILU (Sigmoid Linear Unit), GELU, ReLU
   - Found in feed-forward/MLP blocks
   - Example:
   ```go
   // SwiGLU activation in MLP
   gateActivation := mlp.Gate.Forward(ctx, hiddenState).SILU(ctx)
   upProjection := mlp.Up.Forward(ctx, hiddenState)
   intermediateStates := gateActivation.Mul(ctx, upProjection)
   ```
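For reference, a sketch of the `Shift()` hook mentioned in item 1: it re-applies RoPE to cached keys so previously cached entries stay valid as the context window moves. This mirrors the llama implementation shown later in this diff; the `Options` fields are assumed to match your model's RoPE parameters:

```go
// Shift re-rotates cached keys by the position delta in `shift` so the
// KV cache remains consistent when cached positions change.
func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
	return key.RoPE(ctx, shift, m.Options.RopeFactors, m.Options.ropeDim, uint32(0), m.Options.ropeBase, m.Options.ropeScale), nil
}
```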
- Run your forward pass:
```bash
# in the root of the ollama directory
go build .
OLLAMA_DEBUG=1 ./ollama serve
OLLAMA_DEBUG=1 ./ollama run <my-model>
```
- Compare the output with the research implementation

### 8. Quality Check and Final Steps

1. Add comprehensive tests to:
   - `model_test.go`
   - `convert_test.go`

2. Ensure tests cover (a minimal test skeleton follows at the end of this section):
   - Weight conversion
   - Model initialization
   - Text generation
3. **Create Final Modelfile**
   - Replace the static prompt with the proper Go template for your model:

   ```
   FROM <converted-gguf>
   TEMPLATE <prompt-template> # Add the proper Go template for your model, including tools if needed
   LICENSE <license-info>     # Add appropriate license information
   # Add additional parameters if needed
   ```

4. **End-to-end Testing**
   - Run your model with your local Ollama build to ensure it functions as expected

5. **Benchmark**
   - Run performance benchmarks on your model implementation:

   ```bash
   # from the root of the Ollama directory
   go build .
   OLLAMA_DEBUG=1 ./ollama serve
   # then, in a separate terminal while the server is running:
   go test -bench=. -m <your-model-name> ./...
   ```
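A minimal, hypothetical `model_test.go` skeleton for the coverage listed above; `loadTestModel` stands in for whatever fixture loading your tests actually do:

```go
package yourmodel

import "testing"

// TestEncodeDecode round-trips text through the tokenizer.
// loadTestModel is a hypothetical helper that builds the model
// from a small GGUF fixture checked into testdata/.
func TestEncodeDecode(t *testing.T) {
	m := loadTestModel(t)

	ids, err := m.Encode("hello world")
	if err != nil {
		t.Fatal(err)
	}

	text, err := m.Decode(ids)
	if err != nil {
		t.Fatal(err)
	}

	if text != "hello world" {
		t.Fatalf("round trip mismatch: got %q", text)
	}
}
```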
### 9. Finalize PR and Publish to ollama.com

1. **Finalize Pull Request**
   - Move the PR out of draft state
   - Address reviewer feedback

2. **Publish to ollama.com**
   - Push to ollama.com:

   ```bash
   ollama create <your-namespace>/<your-model> -f /path/to/Modelfile
   ollama push <your-namespace>/<your-model>
   ```
@@ -120,6 +120,15 @@ func (kv KV) Uints(key string, defaultValue ...[]uint32) []uint32 {
 	return s
 }
 
+func (kv KV) Floats(key string, defaultValue ...[]float32) []float32 {
+	r := keyValue(kv, key, &array{})
+	s := make([]float32, r.size)
+	for i := range r.size {
+		s[i] = float32(r.values[i].(float32))
+	}
+	return s
+}
+
 func keyValue[T string | uint32 | uint64 | float32 | *array](kv KV, key string, defaultValue ...T) T {
 	if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
 		key = kv.Architecture() + "." + key
go.mod (1 change)

@@ -18,6 +18,7 @@ require (
 	github.com/agnivade/levenshtein v1.1.1
 	github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1
 	github.com/dlclark/regexp2 v1.11.4
 	github.com/emirpasic/gods v1.18.1
+	github.com/emirpasic/gods/v2 v2.0.0-alpha
 	github.com/google/go-cmp v0.6.0
 	github.com/mattn/go-runewidth v0.0.14
go.sum (2 changes)

@@ -44,6 +44,8 @@ github.com/dgryski/trifles v0.0.0-20200323201526-dd97f9abfb48 h1:fRzb/w+pyskVMQ+
 github.com/dgryski/trifles v0.0.0-20200323201526-dd97f9abfb48/go.mod h1:if7Fbed8SFyPtHLHbg49SI7NAdJiC5WIA09pe59rfAA=
 github.com/dlclark/regexp2 v1.11.4 h1:rPYF9/LECdNymJufQKmri9gV604RvvABwgOA8un7yAo=
 github.com/dlclark/regexp2 v1.11.4/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8=
 github.com/emirpasic/gods v1.18.1 h1:FXtiHYKDGKCW2KzwZKx0iC0PQmdlorYgdFG9jPXJ1Bc=
 github.com/emirpasic/gods v1.18.1/go.mod h1:8tpGGwCnJ5H4r6BWwaV6OrWmMoPhUl5jm/FMNAnJvWQ=
+github.com/emirpasic/gods/v2 v2.0.0-alpha h1:dwFlh8pBg1VMOXWGipNMRt8v96dKAIvBehtCt6OtunU=
+github.com/emirpasic/gods/v2 v2.0.0-alpha/go.mod h1:W0y4M2dtBB9U5z3YlghmpuUhiaZT2h6yoeE+C1sCp6A=
 github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
@@ -434,7 +434,7 @@ func (t *testTensor) Conv2D(ctx ml.Context, weight ml.Tensor, s0, s1, p0, p1, d0
 	panic("not implemented")
 }
 
-func (t *testTensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, dim uint32, base, scale float32) ml.Tensor {
+func (t *testTensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, dim, ropeType uint32, base, scale float32) ml.Tensor {
 	panic("not implemented")
 }
@@ -1,285 +0,0 @@ (deleted patch file; its content follows)

From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Sun, 16 Feb 2025 20:00:22 -0500
Subject: [PATCH] use std::filesystem::path instead of wstring

---
 ggml/src/ggml-backend-reg.cpp | 116 ++++++++++++----------------------
 1 file changed, 40 insertions(+), 76 deletions(-)

diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 84b21dd8..de78feae 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -72,16 +72,6 @@
 # pragma clang diagnostic ignored "-Wdeprecated-declarations"
 #endif
 
-static std::wstring utf8_to_utf16(const std::string & str) {
-    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
-    return converter.from_bytes(str);
-}
-
-static std::string utf16_to_utf8(const std::wstring & str) {
-    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
-    return converter.to_bytes(str);
-}
-
 #if defined(__clang__)
 # pragma clang diagnostic pop
 #endif
@@ -96,12 +86,12 @@ struct dl_handle_deleter {
     }
 };
 
-static dl_handle * dl_load_library(const std::wstring & path) {
+static dl_handle * dl_load_library(const std::filesystem::path & path) {
     // suppress error dialogs for missing DLLs
     DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
     SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
 
-    HMODULE handle = LoadLibraryW(path.c_str());
+    HMODULE handle = LoadLibraryW(path.wstring().c_str());
 
     SetErrorMode(old_mode);
 
@@ -129,8 +119,8 @@ struct dl_handle_deleter {
     }
 };
 
-static void * dl_load_library(const std::wstring & path) {
-    dl_handle * handle = dlopen(utf16_to_utf8(path).c_str(), RTLD_NOW | RTLD_LOCAL);
+static void * dl_load_library(const std::filesystem::path & path) {
+    dl_handle * handle = dlopen(path.string().c_str(), RTLD_NOW | RTLD_LOCAL);
 
     return handle;
 }
@@ -222,11 +212,11 @@ struct ggml_backend_registry {
         );
     }
 
-    ggml_backend_reg_t load_backend(const std::wstring & path, bool silent) {
+    ggml_backend_reg_t load_backend(const std::filesystem::path & path, bool silent) {
         dl_handle_ptr handle { dl_load_library(path) };
         if (!handle) {
             if (!silent) {
-                GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(path).c_str());
+                GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path.string().c_str());
             }
             return nullptr;
         }
@@ -234,7 +224,7 @@ struct ggml_backend_registry {
         auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
         if (score_fn && score_fn() == 0) {
             if (!silent) {
-                GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, utf16_to_utf8(path).c_str());
+                GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, path.string().c_str());
             }
             return nullptr;
         }
@@ -242,7 +232,7 @@ struct ggml_backend_registry {
         auto backend_init_fn = (ggml_backend_init_t) dl_get_sym(handle.get(), "ggml_backend_init");
         if (!backend_init_fn) {
             if (!silent) {
-                GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, utf16_to_utf8(path).c_str());
+                GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, path.string().c_str());
             }
             return nullptr;
         }
@@ -251,16 +241,16 @@ struct ggml_backend_registry {
         if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) {
             if (!silent) {
                 if (!reg) {
-                    GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, utf16_to_utf8(path).c_str());
+                    GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, path.string().c_str());
                 } else {
                     GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n",
-                        __func__, utf16_to_utf8(path).c_str(), reg->api_version, GGML_BACKEND_API_VERSION);
+                        __func__, path.string().c_str(), reg->api_version, GGML_BACKEND_API_VERSION);
                 }
             }
             return nullptr;
         }
 
-        GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), utf16_to_utf8(path).c_str());
+        GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path.string().c_str());
 
         register_backend(reg, score_fn ? score_fn() : -1, std::move(handle));
 
@@ -396,14 +386,14 @@ ggml_backend_t ggml_backend_init_best(void) {
 
 // Dynamic loading
 ggml_backend_reg_t ggml_backend_load(const char * path) {
-    return get_reg().load_backend(utf8_to_utf16(path), false);
+    return get_reg().load_backend(path, false);
 }
 
 void ggml_backend_unload(ggml_backend_reg_t reg) {
     get_reg().unload_backend(reg, true);
 }
 
-static std::wstring get_executable_path() {
+static std::filesystem::path get_executable_path() {
 #if defined(__APPLE__)
     // get executable path
     std::vector<char> path;
@@ -415,15 +405,9 @@ static std::wstring get_executable_path() {
         }
         path.resize(size);
     }
-    std::string base_path(path.data(), size);
-    // remove executable name
-    auto last_slash = base_path.find_last_of('/');
-    if (last_slash != std::string::npos) {
-        base_path = base_path.substr(0, last_slash);
-    }
-    return utf8_to_utf16(base_path + "/");
+
+    return std::filesystem::path(path.data()).parent_path();
 #elif defined(__linux__) || defined(__FreeBSD__)
-    std::string base_path = ".";
     std::vector<char> path(1024);
     while (true) {
         // get executable path
@@ -436,76 +420,56 @@ static std::wstring get_executable_path() {
             break;
         }
         if (len < (ssize_t) path.size()) {
-            base_path = std::string(path.data(), len);
-            // remove executable name
-            auto last_slash = base_path.find_last_of('/');
-            if (last_slash != std::string::npos) {
-                base_path = base_path.substr(0, last_slash);
-            }
-            break;
+            return std::filesystem::path(path.data()).parent_path();
         }
         path.resize(path.size() * 2);
     }
-
-    return utf8_to_utf16(base_path + "/");
 #elif defined(_WIN32)
     std::vector<wchar_t> path(MAX_PATH);
     DWORD len = GetModuleFileNameW(NULL, path.data(), path.size());
     if (len == 0) {
         return {};
     }
-    std::wstring base_path(path.data(), len);
-    // remove executable name
-    auto last_slash = base_path.find_last_of('\\');
-    if (last_slash != std::string::npos) {
-        base_path = base_path.substr(0, last_slash);
-    }
-    return base_path + L"\\";
-#else
-    return {};
-#endif
-}
 
-static std::wstring backend_filename_prefix() {
-#ifdef _WIN32
-    return L"ggml-";
+    return std::filesystem::path(path.data()).parent_path();
 #else
-    return L"libggml-";
+    return {};
 #endif
 }
 
-static std::wstring backend_filename_suffix() {
+static std::string backend_filename_prefix() {
 #ifdef _WIN32
-    return L".dll";
+    return "ggml-";
 #else
-    return L".so";
+    return "libggml-";
 #endif
 }
 
-static std::wstring path_separator() {
+static std::string backend_filename_suffix() {
 #ifdef _WIN32
-    return L"\\";
+    return ".dll";
 #else
-    return L"/";
+    return ".so";
 #endif
 }
 
 static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, const char * user_search_path) {
     // enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
     // TODO: search system paths
-    std::wstring file_prefix = backend_filename_prefix() + utf8_to_utf16(name) + L"-";
-    std::vector<std::wstring> search_paths;
+    namespace fs = std::filesystem;
+    std::string file_prefix = backend_filename_prefix() + name + "-";
+    std::vector<fs::path> search_paths;
+
     if (user_search_path == nullptr) {
-        search_paths.push_back(L"." + path_separator());
+        search_paths.push_back(fs::current_path());
         search_paths.push_back(get_executable_path());
     } else {
-        search_paths.push_back(utf8_to_utf16(user_search_path) + path_separator());
+        search_paths.push_back(fs::u8path(user_search_path));
     }
 
     int best_score = 0;
-    std::wstring best_path;
+    fs::path best_path;
 
-    namespace fs = std::filesystem;
     for (const auto & search_path : search_paths) {
         if (!fs::exists(search_path)) {
             continue;
@@ -514,31 +478,31 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
         for (const auto & entry : dir_it) {
             try {
                 if (entry.is_regular_file()) {
-                    std::wstring filename = entry.path().filename().wstring();
-                    std::wstring ext = entry.path().extension().wstring();
+                    std::string filename = entry.path().filename().string();
+                    std::string ext = entry.path().extension().string();
                     if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) {
-                        dl_handle_ptr handle { dl_load_library(entry.path().wstring()) };
+                        dl_handle_ptr handle { dl_load_library(entry.path()) };
                         if (!handle) {
-                            GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
+                            GGML_LOG_ERROR("%s: failed to load %s\n", __func__, entry.path().string().c_str());
                             continue;
                         }
 
                         auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
                         if (!score_fn) {
-                            GGML_LOG_DEBUG("%s: failed to find ggml_backend_score in %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
+                            GGML_LOG_DEBUG("%s: failed to find ggml_backend_score in %s\n", __func__, entry.path().string().c_str());
                             continue;
                         }
 
                         int s = score_fn();
-                        GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str(), s);
+                        GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, entry.path().string().c_str(), s);
                         if (s > best_score) {
                             best_score = s;
-                            best_path = entry.path().wstring();
+                            best_path = entry.path();
                         }
                     }
                 }
             } catch (const std::exception & e) {
-                GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str(), e.what());
+                GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, entry.path().string().c_str(), e.what());
             }
         }
     }
@@ -546,7 +510,7 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
     if (best_score == 0) {
         // try to load the base backend
         for (const auto & search_path : search_paths) {
-            std::wstring path = search_path + backend_filename_prefix() + utf8_to_utf16(name) + backend_filename_suffix();
+            fs::path path = fs::path(search_path) / (backend_filename_prefix() + name + backend_filename_suffix());
             if (fs::exists(path)) {
                 return get_reg().load_backend(path, silent);
             }
@@ -17,6 +17,7 @@ type Config interface {
 
 	Strings(string, ...[]string) []string
 	Uints(string, ...[]uint32) []uint32
+	Floats(string, ...[]float32) []float32
 }
 
 type Backend interface {
@@ -76,7 +77,7 @@ type Tensor interface {
 	Scale(ctx Context, s float64) Tensor
 
 	Conv2D(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
-	RoPE(ctx Context, positionIDs, ropeFactors Tensor, dim uint32, base, scale float32) Tensor
+	RoPE(ctx Context, positionIDs, ropeFactors Tensor, dim, ropeType uint32, base, scale float32) Tensor
 
 	Tanh(ctx Context) Tensor
 	GELU(ctx Context) Tensor

@@ -596,10 +596,13 @@ func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
 }
 
 const (
-	ropeTypeNorm C.int = iota
+	ropeTypeNorm   C.int = 0
+	ropeTypeNeox   C.int = 2
+	ropeTypeMrope  C.int = 8
+	ropeTypeVision C.int = 24
 )
 
-func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim uint32, ropeBase, ropeScale float32) ml.Tensor {
+func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim, ropeType uint32, ropeBase, ropeScale float32) ml.Tensor {
 	if ropeFactors == nil {
 		ropeFactors = &Tensor{}
 	}
@@ -613,8 +616,8 @@ func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDi
 		t: C.ggml_rope_ext(
 			ctx.(*Context).ctx, dequant, positionIDs.(*Tensor).t, ropeFactors.(*Tensor).t,
 			C.int(ropeDim),
-			131072,       // YaRN n_ctx_train
-			ropeTypeNorm, // ROPE_TYPE_NORM
+			C.int(ropeType),
+			131072, // YaRN n_ctx_train
 			C.float(ropeBase),
 			C.float(ropeScale),
 			0., // YaRN ext_factor
ml/backend/ggml/ggml/src/ggml-backend-reg.cpp (vendored, 116 changes — this reverts the patch above)

@@ -72,6 +72,16 @@
 # pragma clang diagnostic ignored "-Wdeprecated-declarations"
 #endif
 
+static std::wstring utf8_to_utf16(const std::string & str) {
+    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
+    return converter.from_bytes(str);
+}
+
+static std::string utf16_to_utf8(const std::wstring & str) {
+    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
+    return converter.to_bytes(str);
+}
+
 #if defined(__clang__)
 # pragma clang diagnostic pop
 #endif
@@ -86,12 +96,12 @@ struct dl_handle_deleter {
     }
 };
 
-static dl_handle * dl_load_library(const std::filesystem::path & path) {
+static dl_handle * dl_load_library(const std::wstring & path) {
     // suppress error dialogs for missing DLLs
     DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
     SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
 
-    HMODULE handle = LoadLibraryW(path.wstring().c_str());
+    HMODULE handle = LoadLibraryW(path.c_str());
 
     SetErrorMode(old_mode);
 
@@ -119,8 +129,8 @@ struct dl_handle_deleter {
     }
 };
 
-static void * dl_load_library(const std::filesystem::path & path) {
-    dl_handle * handle = dlopen(path.string().c_str(), RTLD_NOW | RTLD_LOCAL);
+static void * dl_load_library(const std::wstring & path) {
+    dl_handle * handle = dlopen(utf16_to_utf8(path).c_str(), RTLD_NOW | RTLD_LOCAL);
 
     return handle;
 }
@@ -212,11 +222,11 @@ struct ggml_backend_registry {
         );
     }
 
-    ggml_backend_reg_t load_backend(const std::filesystem::path & path, bool silent) {
+    ggml_backend_reg_t load_backend(const std::wstring & path, bool silent) {
         dl_handle_ptr handle { dl_load_library(path) };
         if (!handle) {
             if (!silent) {
-                GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path.string().c_str());
+                GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(path).c_str());
             }
             return nullptr;
         }
@@ -224,7 +234,7 @@ struct ggml_backend_registry {
         auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
         if (score_fn && score_fn() == 0) {
             if (!silent) {
-                GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, path.string().c_str());
+                GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, utf16_to_utf8(path).c_str());
             }
             return nullptr;
         }
@@ -232,7 +242,7 @@ struct ggml_backend_registry {
         auto backend_init_fn = (ggml_backend_init_t) dl_get_sym(handle.get(), "ggml_backend_init");
         if (!backend_init_fn) {
             if (!silent) {
-                GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, path.string().c_str());
+                GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, utf16_to_utf8(path).c_str());
             }
             return nullptr;
         }
@@ -241,16 +251,16 @@ struct ggml_backend_registry {
         if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) {
             if (!silent) {
                 if (!reg) {
-                    GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, path.string().c_str());
+                    GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, utf16_to_utf8(path).c_str());
                 } else {
                     GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n",
-                        __func__, path.string().c_str(), reg->api_version, GGML_BACKEND_API_VERSION);
+                        __func__, utf16_to_utf8(path).c_str(), reg->api_version, GGML_BACKEND_API_VERSION);
                 }
             }
             return nullptr;
         }
 
-        GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path.string().c_str());
+        GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), utf16_to_utf8(path).c_str());
 
         register_backend(reg, score_fn ? score_fn() : -1, std::move(handle));
 
@@ -386,14 +396,14 @@ ggml_backend_t ggml_backend_init_best(void) {
 
 // Dynamic loading
 ggml_backend_reg_t ggml_backend_load(const char * path) {
-    return get_reg().load_backend(path, false);
+    return get_reg().load_backend(utf8_to_utf16(path), false);
 }
 
 void ggml_backend_unload(ggml_backend_reg_t reg) {
     get_reg().unload_backend(reg, true);
 }
 
-static std::filesystem::path get_executable_path() {
+static std::wstring get_executable_path() {
 #if defined(__APPLE__)
     // get executable path
     std::vector<char> path;
@@ -405,9 +415,15 @@ static std::filesystem::path get_executable_path() {
         }
         path.resize(size);
     }
-
-    return std::filesystem::path(path.data()).parent_path();
+    std::string base_path(path.data(), size);
+    // remove executable name
+    auto last_slash = base_path.find_last_of('/');
+    if (last_slash != std::string::npos) {
+        base_path = base_path.substr(0, last_slash);
+    }
+    return utf8_to_utf16(base_path + "/");
 #elif defined(__linux__) || defined(__FreeBSD__)
+    std::string base_path = ".";
     std::vector<char> path(1024);
     while (true) {
         // get executable path
@@ -420,56 +436,76 @@ static std::filesystem::path get_executable_path() {
             break;
         }
         if (len < (ssize_t) path.size()) {
-            return std::filesystem::path(path.data()).parent_path();
+            base_path = std::string(path.data(), len);
+            // remove executable name
+            auto last_slash = base_path.find_last_of('/');
+            if (last_slash != std::string::npos) {
+                base_path = base_path.substr(0, last_slash);
+            }
+            break;
         }
         path.resize(path.size() * 2);
     }
+
+    return utf8_to_utf16(base_path + "/");
 #elif defined(_WIN32)
     std::vector<wchar_t> path(MAX_PATH);
     DWORD len = GetModuleFileNameW(NULL, path.data(), path.size());
     if (len == 0) {
         return {};
     }
-
-    return std::filesystem::path(path.data()).parent_path();
+    std::wstring base_path(path.data(), len);
+    // remove executable name
+    auto last_slash = base_path.find_last_of('\\');
+    if (last_slash != std::string::npos) {
+        base_path = base_path.substr(0, last_slash);
+    }
+    return base_path + L"\\";
 #else
     return {};
 #endif
 }
 
-static std::string backend_filename_prefix() {
+static std::wstring backend_filename_prefix() {
 #ifdef _WIN32
-    return "ggml-";
+    return L"ggml-";
 #else
-    return "libggml-";
+    return L"libggml-";
 #endif
 }
 
-static std::string backend_filename_suffix() {
+static std::wstring backend_filename_suffix() {
 #ifdef _WIN32
-    return ".dll";
+    return L".dll";
 #else
-    return ".so";
+    return L".so";
 #endif
 }
 
+static std::wstring path_separator() {
+#ifdef _WIN32
+    return L"\\";
+#else
+    return L"/";
+#endif
+}
+
 static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, const char * user_search_path) {
     // enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
     // TODO: search system paths
-    namespace fs = std::filesystem;
-    std::string file_prefix = backend_filename_prefix() + name + "-";
-    std::vector<fs::path> search_paths;
-
+    std::wstring file_prefix = backend_filename_prefix() + utf8_to_utf16(name) + L"-";
+    std::vector<std::wstring> search_paths;
     if (user_search_path == nullptr) {
-        search_paths.push_back(fs::current_path());
+        search_paths.push_back(L"." + path_separator());
         search_paths.push_back(get_executable_path());
     } else {
-        search_paths.push_back(fs::u8path(user_search_path));
+        search_paths.push_back(utf8_to_utf16(user_search_path) + path_separator());
     }
 
     int best_score = 0;
-    fs::path best_path;
+    std::wstring best_path;
 
+    namespace fs = std::filesystem;
     for (const auto & search_path : search_paths) {
         if (!fs::exists(search_path)) {
             continue;
@@ -478,31 +514,31 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
         for (const auto & entry : dir_it) {
             try {
                 if (entry.is_regular_file()) {
-                    std::string filename = entry.path().filename().string();
-                    std::string ext = entry.path().extension().string();
+                    std::wstring filename = entry.path().filename().wstring();
+                    std::wstring ext = entry.path().extension().wstring();
                     if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) {
-                        dl_handle_ptr handle { dl_load_library(entry.path()) };
+                        dl_handle_ptr handle { dl_load_library(entry.path().wstring()) };
                         if (!handle) {
-                            GGML_LOG_ERROR("%s: failed to load %s\n", __func__, entry.path().string().c_str());
+                            GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
                             continue;
                         }
 
                         auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
                         if (!score_fn) {
-                            GGML_LOG_DEBUG("%s: failed to find ggml_backend_score in %s\n", __func__, entry.path().string().c_str());
+                            GGML_LOG_DEBUG("%s: failed to find ggml_backend_score in %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
                             continue;
                         }
 
                         int s = score_fn();
-                        GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, entry.path().string().c_str(), s);
+                        GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str(), s);
                         if (s > best_score) {
                             best_score = s;
-                            best_path = entry.path();
+                            best_path = entry.path().wstring();
                         }
                     }
                 }
             } catch (const std::exception & e) {
-                GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, entry.path().string().c_str(), e.what());
+                GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str(), e.what());
             }
         }
     }
@@ -510,7 +546,7 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
     if (best_score == 0) {
         // try to load the base backend
         for (const auto & search_path : search_paths) {
-            fs::path path = fs::path(search_path) / (backend_filename_prefix() + name + backend_filename_suffix());
+            std::wstring path = search_path + backend_filename_prefix() + utf8_to_utf16(name) + backend_filename_suffix();
             if (fs::exists(path)) {
                 return get_reg().load_backend(path, silent);
             }
model/models/gemma2/model.go (new file, 193 lines)

@@ -0,0 +1,193 @@
package gemma2

import (
	"math"

	"github.com/ollama/ollama/kvcache"
	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/ml/nn"
	"github.com/ollama/ollama/model"
)

type Options struct {
	hiddenSize, numHeads, numKVHeads int
	attnKeyLen, attnValLen           int
	eps, ropeBase, ropeScale         float32
	attnLogitSoftcap                 float32
	finalLogitSoftcap                float32
}

type Model struct {
	model.Base
	model.SentencePieceModel

	TokenEmbedding *nn.Embedding `gguf:"token_embd"`
	Layers         []Layer       `gguf:"blk"`
	OutputNorm     *nn.RMSNorm   `gguf:"output_norm"`           // is this supposed to be root means square?
	Output         *nn.Linear    `gguf:"output,alt:token_embd"` // just set to token_embd?

	*Options
}

func New(c ml.Config) (model.Model, error) {
	m := Model{
		SentencePieceModel: model.NewSentencePieceModel(
			c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
			&model.Vocabulary{
				Values: c.Strings("tokenizer.ggml.tokens"),
				Scores: c.Floats("tokenizer.ggml.scores"),
				Types:  c.Uints("tokenizer.ggml.token_type"),
				BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id")),
				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
			},
		),
		Layers: make([]Layer, c.Uint("block_count")),
		Options: &Options{
			hiddenSize:        int(c.Uint("embedding_length")),
			numHeads:          int(c.Uint("attention.head_count")),
			numKVHeads:        int(c.Uint("attention.head_count_kv")),
			attnKeyLen:        int(c.Uint("attention.key_length")),
			attnValLen:        int(c.Uint("attention.value_length")),
			eps:               c.Float("attention.layer_norm_rms_epsilon"),
			ropeBase:          c.Float("rope.freq_base", 10000.0),
			ropeScale:         c.Float("rope.freq_scale", 1.0),
			attnLogitSoftcap:  c.Float("attn_logit_softcapping"),
			finalLogitSoftcap: c.Float("final_logit_softcapping"),
		},
	}

	slidingWindowLen := int32(c.Uint("attention.sliding_window"))
	m.Cache = kvcache.NewWrapperCache(kvcache.NewSWACache(slidingWindowLen, m.Shift), kvcache.NewCausalCache(m.Shift))

	return &m, nil
}

type SelfAttention struct {
	Query  *nn.Linear `gguf:"attn_q"`
	Key    *nn.Linear `gguf:"attn_k"`
	Value  *nn.Linear `gguf:"attn_v"`
	Output *nn.Linear `gguf:"attn_output"`
}

func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
	batchSize := hiddenState.Dim(1)
	ropeType := uint32(2)

	q := sa.Query.Forward(ctx, hiddenState)
	q = q.Reshape(ctx, opts.attnKeyLen, opts.numHeads, batchSize)
	q = q.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, opts.ropeBase, opts.ropeScale)

	// todo: this should be 1.0/math.Sqrt(float64(headDim)) for 27B models
	q = q.Scale(ctx, 1.0/math.Sqrt(float64(opts.attnKeyLen)))

	k := sa.Key.Forward(ctx, hiddenState)
	k = k.Reshape(ctx, opts.attnKeyLen, opts.numKVHeads, batchSize)
	k = k.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, opts.ropeBase, opts.ropeScale)

	v := sa.Value.Forward(ctx, hiddenState)
	v = v.Reshape(ctx, opts.attnValLen, opts.numKVHeads, batchSize)

	cache.Put(ctx, k, v)
	k, v, mask := cache.Get(ctx)

	q = q.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
	k = k.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
	v = v.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)

	kq := k.Mulmat(ctx, q)

	// logit softcap
	kq = kq.Scale(ctx, 1.0/float64(opts.attnLogitSoftcap))
	kq = kq.Tanh(ctx)
	kq = kq.Scale(ctx, float64(opts.attnLogitSoftcap))

	kq = kq.Add(ctx, mask)
	kq = kq.Softmax(ctx)

	kqv := v.Mulmat(ctx, kq)
	kqv = kqv.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
	kqv = kqv.Reshape(ctx, opts.attnValLen*opts.numHeads, batchSize)

	return sa.Output.Forward(ctx, kqv)
}

func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
	return key.RoPE(ctx, shift, nil, uint32(m.Options.attnKeyLen), uint32(2), m.Options.ropeBase, m.Options.ropeScale), nil
}

type MLP struct {
	Up   *nn.Linear `gguf:"ffn_up"`
	Down *nn.Linear `gguf:"ffn_down"`
	Gate *nn.Linear `gguf:"ffn_gate"`
}

func (mlp *MLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *Options) ml.Tensor {
	hiddenState = mlp.Gate.Forward(ctx, hiddenState).GELU(ctx).Mul(ctx, mlp.Up.Forward(ctx, hiddenState))
	return mlp.Down.Forward(ctx, hiddenState)
}

type Layer struct {
	AttentionNorm     *nn.RMSNorm `gguf:"attn_norm"`
	SelfAttention     *SelfAttention
	PostAttentionNorm *nn.RMSNorm `gguf:"post_attention_norm"`
	MLPNorm           *nn.RMSNorm `gguf:"ffn_norm"`
	MLP               *MLP
	PostMLPNorm       *nn.RMSNorm `gguf:"post_ffw_norm"`
}

func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
	residual := hiddenState

	hiddenState = l.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
	hiddenState = l.SelfAttention.Forward(ctx, hiddenState, positionIDs, cache, opts)
	hiddenState = l.PostAttentionNorm.Forward(ctx, hiddenState, opts.eps)
	hiddenState = hiddenState.Add(ctx, residual)
	residual = hiddenState

	hiddenState = l.MLPNorm.Forward(ctx, hiddenState, opts.eps)
	hiddenState = l.MLP.Forward(ctx, hiddenState, opts)
	hiddenState = l.PostMLPNorm.Forward(ctx, hiddenState, opts.eps)
	return hiddenState.Add(ctx, residual)
}

func (m *Model) Forward(ctx ml.Context, opts model.Options) (ml.Tensor, error) {
	inputs, err := ctx.FromIntSlice(opts.Inputs, len(opts.Inputs))
	if err != nil {
		return nil, err
	}

	positions, err := ctx.FromIntSlice(opts.Positions, len(opts.Positions))
	if err != nil {
		return nil, err
	}

	hiddenState := m.TokenEmbedding.Forward(ctx, inputs)
	hiddenState = hiddenState.Scale(ctx, math.Sqrt(float64(m.Options.hiddenSize)))

	for i, layer := range m.Layers {
		cacheType := i % 2
		m.Cache.SetLayer(i)
		wc := m.Cache.(*kvcache.WrapperCache)
		wc.SetLayerType(cacheType)
		hiddenState = layer.Forward(ctx, hiddenState, positions, m.Cache, m.Options)
	}

	hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
	hiddenState = m.Output.Forward(ctx, hiddenState)

	// final logit softcap
	hiddenState = hiddenState.Scale(ctx, 1.0/float64(m.Options.finalLogitSoftcap))
	hiddenState = hiddenState.Tanh(ctx)
	hiddenState = hiddenState.Scale(ctx, float64(m.Options.finalLogitSoftcap))

	outputs, err := ctx.FromIntSlice(opts.Outputs, len(opts.Outputs))
	if err != nil {
		return nil, err
	}

	return hiddenState.Rows(ctx, outputs), nil
}

func init() {
	model.Register("gemma2", New)
}
@@ -67,14 +67,15 @@ type SelfAttention struct {
 func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
 	batchSize := hiddenState.Dim(1)
 	headDim := opts.hiddenSize / opts.numHeads
+	ropeType := uint32(0)
 
 	q := sa.Query.Forward(ctx, hiddenState)
 	q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
-	q = q.RoPE(ctx, positionIDs, opts.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)
+	q = q.RoPE(ctx, positionIDs, opts.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
 
 	k := sa.Key.Forward(ctx, hiddenState)
 	k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
-	k = k.RoPE(ctx, positionIDs, opts.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)
+	k = k.RoPE(ctx, positionIDs, opts.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
 
 	v := sa.Value.Forward(ctx, hiddenState)
 	v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
@@ -99,7 +100,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
 }
 
 func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
-	return key.RoPE(ctx, shift, m.Options.RopeFactors, m.Options.ropeDim, m.Options.ropeBase, m.Options.ropeScale), nil
+	return key.RoPE(ctx, shift, m.Options.RopeFactors, m.Options.ropeDim, uint32(0), m.Options.ropeBase, m.Options.ropeScale), nil
 }
 
 type MLP struct {
@@ -19,14 +19,15 @@ type TextSelfAttention struct {
 func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions, _ ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
 	batchSize := hiddenState.Dim(1)
 	headDim := opts.hiddenSize / opts.numHeads
+	ropeType := uint32(0)
 
 	query := sa.Query.Forward(ctx, hiddenState)
 	query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
-	query = query.RoPE(ctx, positions, opts.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)
+	query = query.RoPE(ctx, positions, opts.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
 
 	key := sa.Key.Forward(ctx, hiddenState)
 	key = key.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
-	key = key.RoPE(ctx, positions, opts.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)
+	key = key.RoPE(ctx, positions, opts.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
 
 	value := sa.Value.Forward(ctx, hiddenState)
 	value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
@@ -52,7 +53,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions, _ m
 
 func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
 	// This will only get called for layers in the cache, which are just the self attention layers
-	return key.RoPE(ctx, shift, m.RopeFactors, m.ropeDim, m.ropeBase, m.ropeScale), nil
+	return key.RoPE(ctx, shift, m.RopeFactors, m.ropeDim, uint32(0), m.ropeBase, m.ropeScale), nil
 }
 
 type TextMLP struct {
@@ -1,6 +1,7 @@
 package models
 
 import (
+	_ "github.com/ollama/ollama/model/models/gemma2"
 	_ "github.com/ollama/ollama/model/models/llama"
 	_ "github.com/ollama/ollama/model/models/mllama"
 )
@@ -18,6 +18,15 @@ const (
 	SpecialEOS
 )
 
+const (
+	TOKEN_TYPE_NORMAL = iota + 1
+	TOKEN_TYPE_UNKNOWN
+	TOKEN_TYPE_CONTROL
+	TOKEN_TYPE_USER_DEFINED
+	TOKEN_TYPE_UNUSED
+	TOKEN_TYPE_BYTE
+)
+
 type TextProcessor interface {
 	Encode(string) ([]int32, error)
 	Decode([]int32) (string, error)
@@ -27,7 +36,7 @@ type TextProcessor interface {
 type Vocabulary struct {
 	Values []string
 	Types  []uint32
-	Scores []uint32
+	Scores []float32
 	Merges []string
 
 	BOS, EOS int32
@@ -75,7 +84,7 @@ func (v *Vocabulary) Decode(id int32) string {
 func (v *Vocabulary) SpecialVocabulary() []string {
 	v.specialOnce.Do(func() {
 		for i := range v.Values {
-			if v.Types[i] == 3 {
+			if v.Types[i] == TOKEN_TYPE_CONTROL {
 				v.special = append(v.special, v.Values[i])
 			}
 		}
model/process_text_spm.go (new file, 220 lines)

@@ -0,0 +1,220 @@
package model

import (
	"iter"
	"log/slog"
	"strings"

	"github.com/dlclark/regexp2"
	queue "github.com/emirpasic/gods/queues/priorityqueue"
)

const spmWhitespaceSep = "▁"

func replaceWhitespaceBySeperator(s string) string {
	return strings.ReplaceAll(s, " ", spmWhitespaceSep)
}

type SentencePieceModel struct {
	maxTokenLen int
	pre         *regexp2.Regexp
	vocab       *Vocabulary
}

func NewSentencePieceModel(pre string, vocab *Vocabulary) SentencePieceModel {
	slog.Debug("Tokens", "num tokens", len(vocab.Values), "vals", vocab.Values[:3], "scores", vocab.Scores[:3], "types", vocab.Types[:3])

	counter := map[int]int{}
	var maxTokenLen int
	for cnt := range vocab.Types {
		switch vocab.Types[cnt] {
		case TOKEN_TYPE_NORMAL, TOKEN_TYPE_USER_DEFINED, TOKEN_TYPE_UNUSED:
			maxTokenLen = max(maxTokenLen, len(vocab.Values[cnt]))
			fallthrough
		default:
			counter[int(vocab.Types[cnt])] += 1
		}
	}

	slog.Debug("Token counts", "normal", counter[TOKEN_TYPE_NORMAL], "unknown", counter[TOKEN_TYPE_UNKNOWN], "control", counter[TOKEN_TYPE_CONTROL],
		"user defined", counter[TOKEN_TYPE_USER_DEFINED], "unused", counter[TOKEN_TYPE_UNUSED], "byte", counter[TOKEN_TYPE_BYTE],
		"max token len", maxTokenLen)

	return SentencePieceModel{
		maxTokenLen: maxTokenLen,
		pre:         regexp2.MustCompile(pre, regexp2.Unicode|regexp2.RE2),
		vocab:       vocab,
	}
}

func (spm SentencePieceModel) Is(id int32, special Special) bool {
	return spm.vocab.Is(id, special)
}

func (spm *SentencePieceModel) split(s string) iter.Seq[string] {
	return func(yield func(string) bool) {
		for m, _ := spm.pre.FindStringMatch(s); m != nil; m, _ = spm.pre.FindNextMatch(m) {
			if !yield(m.String()) {
				break
			}
		}
	}
}

func (spm SentencePieceModel) Encode(s string) ([]int32, error) {
	fragments := []fragment{{value: s}}
	for _, special := range spm.vocab.SpecialVocabulary() {
		// TODO: process special tokens concurrently
		id := spm.vocab.Encode(special)
		for i := 0; i < len(fragments); i++ {
			frag := fragments[i]
			if len(frag.ids) > 0 {
				continue
			}

			var middle []fragment
			switch i := strings.Index(frag.value, special); {
			case i < 0:
				middle = append(middle, frag)
			case i > 0:
				middle = append(middle, fragment{value: frag.value[:i]})
				fallthrough
			default:
				middle = append(middle, fragment{value: special, ids: []int32{id}})
				if rest := frag.value[i+len(special):]; rest != "" {
					middle = append(middle, fragment{value: rest})
				}
			}

			fragments = append(fragments[:i], append(middle, fragments[i+1:]...)...)
		}
	}
	slog.Debug("fragments", "frags", fragments)

	var ids []int32
	for _, frag := range fragments {
		if len(frag.ids) > 0 {
			ids = append(ids, frag.ids...)
			continue
		}

		for split := range spm.split(frag.value) {
			split = replaceWhitespaceBySeperator(split)

			var sb strings.Builder
			sb.Write([]byte(split))
			if id := spm.vocab.Encode(sb.String()); id >= 0 {
				ids = append(ids, id)
				continue
			}

			runes := []rune(sb.String())
			pq := queue.NewWith(func(a, b any) int {
				priA := a.(*candidate)
				priB := b.(*candidate)
				if priA.score > priB.score || (priA.score == priB.score && priA.a < priB.a) {
					return 1
				}
				return -1
			})

			merges := make([]merge, len(runes))
			for r := range runes {
				merges[r] = merge{
					p:     r - 1,
					n:     r + 1,
					runes: []rune{runes[r]},
				}
			}

			pairwise := func(a, b int) *candidate {
				if a < 0 || b >= len(runes) {
					return nil
				}

				left, right := string(merges[a].runes), string(merges[b].runes)
				if id := spm.vocab.Encode(left + right); id >= 0 {
					return &candidate{
						a:      a,
						b:      b,
						length: len(left + " " + right),
						score:  spm.vocab.Scores[id],
					}
				}
				return nil
			}

			for i := range len(runes) - 1 {
				if pair := pairwise(i, i+1); pair != nil {
					pq.Enqueue(pair)
				}
			}

			pqv := pq.Values()
			for _, v := range pqv {
				e := v.(*candidate)
				slog.Debug("candidate", "candidate", e)
			}

			for !pq.Empty() {
				v, _ := pq.Dequeue()
				pair := v.(*candidate)
				left, right := merges[pair.a], merges[pair.b]

				if len(left.runes) == 0 || len(right.runes) == 0 {
					continue
				}

				merges[pair.a].runes = append(left.runes, right.runes...)
				merges[pair.b].runes = nil
				merges[pair.a].n = right.n
				if right.n < len(merges) {
					merges[right.n].p = pair.a
				}

				if pair := pairwise(merges[pair.a].p, pair.a); pair != nil {
					pq.Enqueue(pair)
				}

				if pair := pairwise(pair.a, merges[pair.a].n); pair != nil {
					pq.Enqueue(pair)
				}
			}

			slog.Debug("merges", "merges", merges)

			for _, merge := range merges {
				if len(merge.runes) > 0 {
					if id := spm.vocab.Encode(string(merge.runes)); id >= 0 {
						ids = append(ids, id)
					} else {
						slog.Debug("missing token", "token", string(merge.runes))
					}
				}
			}
		}
	}
	slog.Debug("encoded", "ids", ids)

	return ids, nil
}

type candidate struct {
	a, b   int
	score  float32
	length int
}

func (spm SentencePieceModel) Decode(ids []int32) (string, error) {
	var sb strings.Builder
	for _, id := range ids {
		data := spm.vocab.Decode(id)
		data = strings.ReplaceAll(data, spmWhitespaceSep, " ")
		if _, err := sb.WriteString(data); err != nil {
			return "", err
		}
	}

	slog.Debug("decoded", "ids", ids, "text", sb.String())
	return sb.String(), nil
}
@@ -1,7 +1,6 @@
 package progress
 
 import (
-	"bufio"
 	"fmt"
 	"io"
 	"sync"
@@ -14,8 +13,7 @@ type State interface {
 
 type Progress struct {
 	mu sync.Mutex
-	// buffer output to minimize flickering on all terminals
-	w *bufio.Writer
+	w io.Writer
 
 	pos int
 
@@ -24,7 +22,7 @@ type Progress struct {
 }
 
 func NewProgress(w io.Writer) *Progress {
-	p := &Progress{w: bufio.NewWriter(w)}
+	p := &Progress{w: w}
 	go p.start()
 	return p
 }
@@ -50,14 +48,11 @@ func (p *Progress) Stop() bool {
 	stopped := p.stop()
 	if stopped {
 		fmt.Fprint(p.w, "\n")
-		p.w.Flush()
 	}
 	return stopped
 }
 
 func (p *Progress) StopAndClear() bool {
-	defer p.w.Flush()
-
 	fmt.Fprint(p.w, "\033[?25l")
 	defer fmt.Fprint(p.w, "\033[?25h")
 
@@ -86,24 +81,20 @@ func (p *Progress) render() {
 	p.mu.Lock()
 	defer p.mu.Unlock()
 
-	defer p.w.Flush()
-
-	// eliminate flickering on terminals that support synchronized output
-	fmt.Fprint(p.w, "\033[?2026h")
-	defer fmt.Fprint(p.w, "\033[?2026l")
-
 	fmt.Fprint(p.w, "\033[?25l")
 	defer fmt.Fprint(p.w, "\033[?25h")
 
-	// move the cursor back to the beginning
-	for range p.pos - 1 {
-		fmt.Fprint(p.w, "\033[A")
+	// clear already rendered progress lines
+	for i := range p.pos {
+		if i > 0 {
+			fmt.Fprint(p.w, "\033[A")
+		}
+		fmt.Fprint(p.w, "\033[2K\033[1G")
 	}
-	fmt.Fprint(p.w, "\033[1G")
 
 	// render progress lines
 	for i, state := range p.states {
-		fmt.Fprint(p.w, state.String(), "\033[K")
+		fmt.Fprint(p.w, state.String())
 		if i < len(p.states)-1 {
 			fmt.Fprint(p.w, "\n")
 		}