Mirror of https://github.com/ollama/ollama.git (synced 2026-01-06 06:31:14 -05:00)

Compare commits: v0.5.3...parth/temp (9 commits)
| SHA1 |
|---|
| 6556540655 |
| 3f60fd57e3 |
| 38cd80d52c |
| c9a46140e6 |
| 1d529d8b7b |
| a72f2dce45 |
| 08a832b482 |
| 2ddc32d5c5 |
| 2cde4b8817 |
@@ -407,8 +407,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
 
 ### Database
 
-- [PostgreSQL extension pgai](https://github.com/timescale/pgai) (Create and search embeddings from Ollama models using pgvector)
-  - [Get started guide](https://github.com/timescale/pgai/blob/main/docs/ollama.md)
+- [pgai](https://github.com/timescale/pgai) - PostgreSQL as a vector database (Create and search embeddings from Ollama models using pgvector)
+  - [Get started guide](https://github.com/timescale/pgai/blob/main/docs/vectorizer-quick-start.md)
 - [MindsDB](https://github.com/mindsdb/mindsdb/blob/staging/mindsdb/integrations/handlers/ollama_handler/README.md) (Connects Ollama models with nearly 200 data platforms and apps)
 - [chromem-go](https://github.com/philippgille/chromem-go/blob/v0.5.0/embed_ollama.go) with [example](https://github.com/philippgille/chromem-go/tree/v0.5.0/examples/rag-wikipedia-ollama)
 - [Kangaroo](https://github.com/dbkangaroo/kangaroo) (AI-powered SQL client and admin tool for popular databases)
api/types.go (10 changes)

@@ -103,10 +103,18 @@ type ChatRequest struct {
 	// Tools is an optional list of tools the model has access to.
 	Tools `json:"tools,omitempty"`
 
+	Debug *Debug `json:"debug,omitempty"`
+
+	Dry bool `json:"dry,omitempty"`
+
 	// Options lists model-specific options.
 	Options map[string]interface{} `json:"options"`
 }
 
+type Debug struct {
+	Include []string `json:"include,omitempty"`
+}
+
 type Tools []Tool
 
 func (t Tools) String() string {

@@ -190,6 +198,8 @@ type ChatResponse struct {
 	Message    Message `json:"message"`
 	DoneReason string  `json:"done_reason,omitempty"`
 
+	Debug map[string]any `json:"debug,omitempty"`
+
 	Done bool `json:"done"`
 
 	Metrics
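A minimal sketch of how the new request fields look on the wire, assuming the types above (the model name is illustrative and not part of this change):

```go
package main

import (
	"encoding/json"
	"fmt"

	"github.com/ollama/ollama/api"
)

func main() {
	// Illustrative only: a request using the fields added in this change.
	req := api.ChatRequest{
		Model: "llama3.2", // example model name
		Dry:   true,
		Debug: &api.Debug{Include: []string{"prompt"}},
	}
	b, _ := json.Marshal(req)
	fmt.Println(string(b))
	// The encoded request carries "dry":true and "debug":{"include":["prompt"]}
	// alongside the usual chat fields.
}
```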
llama/llama.cpp (vendored, 99 changes)

@@ -3051,6 +3051,13 @@ struct llama_kv_cache {
     }
 };
 
+// block of KV slots to move when defragging
+struct llama_kv_defrag_move {
+    uint32_t src;
+    uint32_t dst;
+    uint32_t len;
+};
+
 struct llama_control_vector {
     std::vector<struct ggml_tensor *> tensors; // per layer
     std::vector<ggml_context_ptr> ctxs;

@@ -10828,35 +10835,23 @@ struct llm_build_context {
         return gf;
     }
 
-    struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
+    struct ggml_cgraph * build_defrag(const std::vector<struct llama_kv_defrag_move> & moves) {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
-        for (uint32_t i = 0; i < ids.size(); ++i) {
-            const uint32_t id = ids[i];
-
-            if (i == id || id == ids.size()) {
-                continue;
-            }
-
-            uint32_t nm = 1;
-
-            while (i + nm < ids.size() && ids[i + nm] == id + nm) {
-                nm++;
-            }
-
+        for (const auto & move : moves) {
             for (int il = 0; il < n_layer; ++il) {
                 const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
                 const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
 
                 ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il],
-                        n_embd_k_gqa, nm,
+                        n_embd_k_gqa, move.len,
                         ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
-                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i));
+                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*move.src));
 
                 ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il],
-                        n_embd_k_gqa, nm,
+                        n_embd_k_gqa, move.len,
                         ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
-                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));
+                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*move.dst));
 
                 ggml_tensor * view_v_src;
                 ggml_tensor * view_v_dst;

@@ -10864,31 +10859,29 @@ struct llm_build_context {
                 if (flash_attn) {
                     // NOTE: the V cache is not transposed when using flash attention
                     view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
-                            n_embd_v_gqa, nm,
+                            n_embd_v_gqa, move.len,
                             ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
-                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i));
+                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*move.src));
 
                     view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
-                            n_embd_v_gqa, nm,
+                            n_embd_v_gqa, move.len,
                             ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
-                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id));
+                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*move.dst));
                 } else {
                     view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
-                            nm, n_embd_v_gqa,
+                            move.len, n_embd_v_gqa,
                             ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
-                            ggml_row_size(kv_self.v_l[il]->type, i));
+                            ggml_row_size(kv_self.v_l[il]->type, move.src));
 
                     view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
-                            nm, n_embd_v_gqa,
+                            move.len, n_embd_v_gqa,
                             ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
-                            ggml_row_size(kv_self.v_l[il]->type, id));
+                            ggml_row_size(kv_self.v_l[il]->type, move.dst));
                 }
 
                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
             }
-
-            i += nm - 1;
         }
 
         //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);

@@ -17351,7 +17344,7 @@ struct llm_build_context {
     }
 };
 
-static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
+static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<struct llama_kv_defrag_move> & moves) {
     llama_ubatch dummy = {};
     dummy.equal_seqs = true;
 

@@ -17361,7 +17354,7 @@ static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const
 
     llm.init();
 
-    struct ggml_cgraph * result = llm.build_defrag(ids);
+    struct ggml_cgraph * result = llm.build_defrag(moves);
 
     llm.free();
 

@@ -18377,7 +18370,12 @@ static int llama_decode_internal(
            kv_self.head = 0;
        }
 
-        const auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
+        auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
+        if (!slot) {
+            llama_kv_cache_defrag(kv_self);
+            llama_kv_cache_update(&lctx);
+            slot = llama_kv_cache_find_slot(kv_self, ubatch);
+        }
        if (!slot) {
            return 1;
        }

@@ -18782,8 +18780,8 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
 
    //const int64_t t_start = ggml_time_us();
 
-    // number of cells moved
-    uint32_t n_moves = 0;
+    // groups of cells moved
+    std::vector<struct llama_kv_defrag_move> moves;
 
    // each move requires 6*n_layer tensors (see build_defrag)
    //   - source view, destination view, copy operation

@@ -18847,19 +18845,11 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
        // are we moving a continuous block of memory?
        bool cont = false;
 
-        // should we stop searching for the next move?
-        bool stop = false;
-
        // go back and move the nf cells to the hole
        for (; i1 < n_kv; ++i1) {
            auto & cell1 = kv_self.cells[i1];
 
            if (cell1.is_empty() || ids[i1] != n_kv) {
-                if (n_moves == max_moves) {
-                    stop = true;
-                    break;
-                }
-
                cont = false;
                continue;
            }

@@ -18875,8 +18865,10 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
            kv_self.head = n_used;
 
            if (!cont) {
-                n_moves++;
+                moves.push_back({i1, i0 + nf, 1});
                cont = true;
+            } else {
+                moves.back().len++;
            }
 
            nf++;

@@ -18886,22 +18878,16 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
            }
        }
 
-        if (stop || n_moves == max_moves) {
-            break;
-        }
-
        //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
 
        i0 += nh - 1;
    }
 
-    if (n_moves == 0) {
+    if (moves.size() == 0) {
        return;
    }
 
-    //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves);
-
-    //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer);
+    //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", moves.size());
 
 #if 0
    // CPU defrag

@@ -18976,11 +18962,18 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
 #else
    // ggml_graph defrag
 
-    ggml_backend_sched_reset(lctx.sched.get());
+    for (std::size_t i = 0; i < moves.size(); i += max_moves) {
+        std::vector<struct llama_kv_defrag_move> chunk;
+        auto end = std::min(i + max_moves, moves.size());
+        chunk.assign(moves.begin() + i, moves.begin() + end);
 
-    ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
+        ggml_backend_sched_reset(lctx.sched.get());
+
+        //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*chunk.size()*n_layer);
+        ggml_cgraph * gf = llama_build_graph_defrag(lctx, chunk);
 
-    llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
+        llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
+    }
 #endif
 
    //const int64_t t_end = ggml_time_us();
@@ -0,0 +1,242 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@ollama.com>
Date: Fri, 13 Dec 2024 16:11:59 -0800
Subject: [PATCH] llama: Ensure KV cache is fully defragmented.

Sometimes the KV cache requires defragmentation even without
triggering the threshold heuristic. In this case, decoding
will not be able to find a KV cache slot. This is particularly
difficult for the caller to handle if it happens in between
ubatches. To avoid this, we should immediately trigger a defrag.

In addition, a heavily fragmented cache can require more than
max_moves to defragment. Currently, we stop when we hit the limit
but this can leave a cache that still does not have adequate space
even after defragmentation is triggered. Instead, we should do
multiple batches of processing until everything is complete.
---
 src/llama.cpp | 99 ++++++++++++++++++++++++---------------------------
 1 file changed, 46 insertions(+), 53 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 4778a9ed..654e32bc 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3025,6 +3025,13 @@ struct llama_kv_cache {
     }
 };
 
+// block of KV slots to move when defragging
+struct llama_kv_defrag_move {
+    uint32_t src;
+    uint32_t dst;
+    uint32_t len;
+};
+
 struct llama_control_vector {
     std::vector<struct ggml_tensor *> tensors; // per layer
     std::vector<ggml_context_ptr> ctxs;

@@ -10802,35 +10809,23 @@ struct llm_build_context {
         return gf;
     }
 
-    struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
+    struct ggml_cgraph * build_defrag(const std::vector<struct llama_kv_defrag_move> & moves) {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
-        for (uint32_t i = 0; i < ids.size(); ++i) {
-            const uint32_t id = ids[i];
-
-            if (i == id || id == ids.size()) {
-                continue;
-            }
-
-            uint32_t nm = 1;
-
-            while (i + nm < ids.size() && ids[i + nm] == id + nm) {
-                nm++;
-            }
-
+        for (const auto & move : moves) {
             for (int il = 0; il < n_layer; ++il) {
                 const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
                 const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
 
                 ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il],
-                        n_embd_k_gqa, nm,
+                        n_embd_k_gqa, move.len,
                         ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
-                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i));
+                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*move.src));
 
                 ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il],
-                        n_embd_k_gqa, nm,
+                        n_embd_k_gqa, move.len,
                         ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
-                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));
+                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*move.dst));
 
                 ggml_tensor * view_v_src;
                 ggml_tensor * view_v_dst;

@@ -10838,31 +10833,29 @@ struct llm_build_context {
                 if (flash_attn) {
                     // NOTE: the V cache is not transposed when using flash attention
                     view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
-                            n_embd_v_gqa, nm,
+                            n_embd_v_gqa, move.len,
                             ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
-                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i));
+                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*move.src));
 
                     view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
-                            n_embd_v_gqa, nm,
+                            n_embd_v_gqa, move.len,
                             ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
-                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id));
+                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*move.dst));
                 } else {
                     view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
-                            nm, n_embd_v_gqa,
+                            move.len, n_embd_v_gqa,
                             ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
-                            ggml_row_size(kv_self.v_l[il]->type, i));
+                            ggml_row_size(kv_self.v_l[il]->type, move.src));
 
                     view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
-                            nm, n_embd_v_gqa,
+                            move.len, n_embd_v_gqa,
                             ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
-                            ggml_row_size(kv_self.v_l[il]->type, id));
+                            ggml_row_size(kv_self.v_l[il]->type, move.dst));
                 }
 
                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
             }
-
-            i += nm - 1;
         }
 
         //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);

@@ -17325,7 +17318,7 @@ struct llm_build_context {
     }
 };
 
-static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
+static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<struct llama_kv_defrag_move> & moves) {
     llama_ubatch dummy = {};
     dummy.equal_seqs = true;
 

@@ -17335,7 +17328,7 @@ static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const
 
     llm.init();
 
-    struct ggml_cgraph * result = llm.build_defrag(ids);
+    struct ggml_cgraph * result = llm.build_defrag(moves);
 
     llm.free();
 

@@ -18351,7 +18344,12 @@ static int llama_decode_internal(
            kv_self.head = 0;
        }
 
-        const auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
+        auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
+        if (!slot) {
+            llama_kv_cache_defrag(kv_self);
+            llama_kv_cache_update(&lctx);
+            slot = llama_kv_cache_find_slot(kv_self, ubatch);
+        }
        if (!slot) {
            return 1;
        }

@@ -18756,8 +18754,8 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
 
    //const int64_t t_start = ggml_time_us();
 
-    // number of cells moved
-    uint32_t n_moves = 0;
+    // groups of cells moved
+    std::vector<struct llama_kv_defrag_move> moves;
 
    // each move requires 6*n_layer tensors (see build_defrag)
    //   - source view, destination view, copy operation

@@ -18821,19 +18819,11 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
        // are we moving a continuous block of memory?
        bool cont = false;
 
-        // should we stop searching for the next move?
-        bool stop = false;
-
        // go back and move the nf cells to the hole
        for (; i1 < n_kv; ++i1) {
            auto & cell1 = kv_self.cells[i1];
 
            if (cell1.is_empty() || ids[i1] != n_kv) {
-                if (n_moves == max_moves) {
-                    stop = true;
-                    break;
-                }
-
                cont = false;
                continue;
            }

@@ -18849,8 +18839,10 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
            kv_self.head = n_used;
 
            if (!cont) {
-                n_moves++;
+                moves.push_back({i1, i0 + nf, 1});
                cont = true;
+            } else {
+                moves.back().len++;
            }
 
            nf++;

@@ -18860,22 +18852,16 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
            }
        }
 
-        if (stop || n_moves == max_moves) {
-            break;
-        }
-
        //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
 
        i0 += nh - 1;
    }
 
-    if (n_moves == 0) {
+    if (moves.size() == 0) {
        return;
    }
 
-    //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves);
-
-    //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer);
+    //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", moves.size());
 
 #if 0
    // CPU defrag

@@ -18950,11 +18936,18 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
 #else
    // ggml_graph defrag
 
-    ggml_backend_sched_reset(lctx.sched.get());
+    for (std::size_t i = 0; i < moves.size(); i += max_moves) {
+        std::vector<struct llama_kv_defrag_move> chunk;
+        auto end = std::min(i + max_moves, moves.size());
+        chunk.assign(moves.begin() + i, moves.begin() + end);
 
-    ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
+        ggml_backend_sched_reset(lctx.sched.get());
+
+        //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*chunk.size()*n_layer);
+        ggml_cgraph * gf = llama_build_graph_defrag(lctx, chunk);
 
-    llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
+        llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
+    }
 #endif
 
    //const int64_t t_end = ggml_time_us();
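For orientation, a small Go sketch of the idea behind this patch: per-cell destinations are collapsed into {src, dst, len} runs, which are then processed in fixed-size chunks instead of stopping once max_moves is reached. The names kvDefragMove, groupMoves, and processInChunks are made up for illustration; the real implementation is the C++ shown above.

```go
package main

import "fmt"

// kvDefragMove mirrors the llama_kv_defrag_move struct added by the patch:
// a contiguous run of KV cells copied from src to dst.
type kvDefragMove struct {
	src, dst, len uint32
}

// groupMoves collapses per-cell destinations into contiguous runs.
// ids[i] is the destination of cell i; notMoved marks cells that stay put.
// (Hypothetical helper; the real logic lives in llama_kv_cache_defrag_internal.)
func groupMoves(ids []uint32, notMoved uint32) []kvDefragMove {
	var moves []kvDefragMove
	for i, dst := range ids {
		src := uint32(i)
		if dst == notMoved || dst == src {
			continue
		}
		if n := len(moves); n > 0 {
			last := &moves[n-1]
			if last.src+last.len == src && last.dst+last.len == dst {
				last.len++ // extend the current contiguous block
				continue
			}
		}
		moves = append(moves, kvDefragMove{src: src, dst: dst, len: 1})
	}
	return moves
}

// processInChunks runs the moves in batches of at most maxMoves, the way the
// patched code builds and computes one defrag graph per chunk rather than
// giving up when the limit is hit.
func processInChunks(moves []kvDefragMove, maxMoves int, run func([]kvDefragMove)) {
	for i := 0; i < len(moves); i += maxMoves {
		end := i + maxMoves
		if end > len(moves) {
			end = len(moves)
		}
		run(moves[i:end])
	}
}

func main() {
	const notMoved = 16
	// Cells 4..7 move to 0..3; cell 9 moves to 4.
	ids := []uint32{notMoved, notMoved, notMoved, notMoved, 0, 1, 2, 3,
		notMoved, 4, notMoved, notMoved, notMoved, notMoved, notMoved, notMoved}
	moves := groupMoves(ids, notMoved)
	processInChunks(moves, 1, func(chunk []kvDefragMove) {
		fmt.Printf("defrag chunk: %+v\n", chunk)
	})
}
```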
@@ -433,14 +433,7 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 
 	err := s.lc.Decode(batch)
 	if err != nil {
-		if errors.Is(err, llama.ErrKvCacheFull) {
-			slog.Debug("defragmenting kv cache")
-			s.cache.lc.KvCacheDefrag()
-			err = s.lc.Decode(batch)
-		}
-		if err != nil {
-			return fmt.Errorf("failed to decode batch: %w", err)
-		}
+		return fmt.Errorf("failed to decode batch: %w", err)
 	}
 
 	if crossAttention {
@@ -700,20 +700,24 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
 	}
 
 	if len(req.Format) > 0 {
-		switch {
-		case bytes.Equal(req.Format, []byte(`""`)):
-			// fallthrough
-		case bytes.Equal(req.Format, []byte(`"json"`)):
+		switch string(req.Format) {
+		case `null`, `""`:
+			// Field was set, but "missing" a value. We accept
+			// these as "not set".
+			break
+		case `"json"`:
 			request["grammar"] = grammarJSON
-		case bytes.HasPrefix(req.Format, []byte("{")):
+		default:
+			if req.Format[0] != '{' {
+				return fmt.Errorf("invalid format: %q; expected \"json\" or a valid JSON Schema object", req.Format)
+			}
+
 			// User provided a JSON schema
 			g := llama.SchemaToGrammar(req.Format)
 			if g == nil {
 				return fmt.Errorf("invalid JSON schema in format")
 			}
 			request["grammar"] = string(g)
-		default:
-			return fmt.Errorf("invalid format: %q; expected \"json\" or a valid JSON Schema", req.Format)
 		}
 	}
 
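A hedged summary, as Go, of what the updated Format handling accepts (illustrative only; it restates the switch above rather than calling the real server code):

```go
package main

import "fmt"

// describeFormat restates the accepted Format values and the behavior each
// selects, per the handling above. It is a standalone sketch, not server code.
func describeFormat(format string) string {
	switch format {
	case "", `""`, "null":
		return "no constraint (treated as not set)"
	case `"json"`:
		return "built-in JSON grammar"
	default:
		if len(format) > 0 && format[0] == '{' {
			return "grammar generated from the JSON Schema"
		}
		return "rejected: expected \"json\" or a JSON Schema object"
	}
}

func main() {
	for _, f := range []string{``, `""`, `null`, `"json"`, `{"type":"object"}`, `"yaml"`} {
		fmt.Printf("%-20q -> %s\n", f, describeFormat(f))
	}
}
```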
@@ -39,25 +39,34 @@ func TestLLMServerCompletionFormat(t *testing.T) {
 
 	cancel() // prevent further processing if request makes it past the format check
 
-	checkCanceled := func(err error) {
+	checkValid := func(err error) {
 		t.Helper()
 		if !errors.Is(err, context.Canceled) {
 			t.Fatalf("Completion: err = %v; expected context.Canceled", err)
 		}
 	}
 
-	valids := []string{`"json"`, `{"type":"object"}`, ``, `""`}
+	valids := []string{
+		// "missing"
+		``,
+		`""`,
+		`null`,
+
+		// JSON
+		`"json"`,
+		`{"type":"object"}`,
+	}
 	for _, valid := range valids {
 		err := s.Completion(ctx, CompletionRequest{
 			Options: new(api.Options),
 			Format:  []byte(valid),
 		}, nil)
-		checkCanceled(err)
+		checkValid(err)
 	}
 
 	err := s.Completion(ctx, CompletionRequest{
 		Options: new(api.Options),
 		Format:  nil, // missing format
 	}, nil)
-	checkCanceled(err)
+	checkValid(err)
 }
@@ -15,28 +15,36 @@ export CGO_CXXFLAGS=-mmacosx-version-min=11.3
export CGO_LDFLAGS=-mmacosx-version-min=11.3

rm -rf llama/build dist/darwin-*

# Generate the universal ollama binary for stand-alone usage: metal + avx
echo "Building binary"
echo "Building darwin arm64"
GOOS=darwin ARCH=arm64 GOARCH=arm64 make -j 8 dist
echo "Building darwin amd64 with AVX enabled"
GOOS=darwin ARCH=amd64 GOARCH=amd64 CUSTOM_CPU_FLAGS="avx" make -j 8 dist_exe

# Generate the universal ollama binary for stand-alone usage: metal + avx
lipo -create -output dist/ollama-darwin dist/darwin-arm64/bin/ollama dist/darwin-amd64/bin/ollama

# sign the binary and rename it
if [ -n "$APPLE_IDENTITY" ]; then
    codesign -f --timestamp -s "$APPLE_IDENTITY" --identifier ai.ollama.ollama --options=runtime dist/ollama-darwin
else
    echo "WARNING: Skipping code signing - set APPLE_IDENTITY"
fi
ditto -c -k --keepParent dist/ollama-darwin dist/temp.zip
if [ -n "$APPLE_IDENTITY" ]; then
    xcrun notarytool submit dist/temp.zip --wait --timeout 10m --apple-id $APPLE_ID --password $APPLE_PASSWORD --team-id $APPLE_TEAM_ID
fi
rm -f dist/temp.zip

# Build the app bundle
echo "Building app"
echo "Building darwin amd64 with runners"
rm dist/darwin-amd64/bin/ollama
GOOS=darwin ARCH=amd64 GOARCH=amd64 make -j 8 dist

# Generate the universal ollama binary for the app bundle: metal + no-avx
lipo -create -output dist/ollama dist/darwin-arm64/bin/ollama dist/darwin-amd64/bin/ollama


if [ -n "$APPLE_IDENTITY" ]; then
    codesign --deep --force --options=runtime --sign "$APPLE_IDENTITY" --timestamp dist/ollama
else
    echo "Skipping code signing - set APPLE_IDENTITY"
fi
chmod +x dist/ollama

# build and optionally sign the mac app
npm install --prefix macapp
if [ -n "$APPLE_IDENTITY" ]; then

@@ -46,14 +54,3 @@ else
fi
cp macapp/out/make/zip/darwin/universal/Ollama-darwin-universal-$VERSION.zip dist/Ollama-darwin.zip

# sign the binary and rename it
if [ -n "$APPLE_IDENTITY" ]; then
    codesign -f --timestamp -s "$APPLE_IDENTITY" --identifier ai.ollama.ollama --options=runtime dist/ollama
else
    echo "WARNING: Skipping code signing - set APPLE_IDENTITY"
fi
ditto -c -k --keepParent dist/ollama dist/temp.zip
if [ -n "$APPLE_IDENTITY" ]; then
    xcrun notarytool submit dist/temp.zip --wait --timeout 10m --apple-id $APPLE_ID --password $APPLE_PASSWORD --team-id $APPLE_TEAM_ID
fi
rm -f dist/temp.zip
@@ -82,6 +82,10 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
 	}
 
 	currMsgIdx := n
+	// Warn user if messages are truncated from the input
+	if numTruncatedMessages := len(msgs[0:currMsgIdx]); numTruncatedMessages > 0 {
+		slog.Warn("truncated first messages from input", "num_truncated", numTruncatedMessages)
+	}
 
 	for cnt, msg := range msgs[currMsgIdx:] {
 		prefix := ""
@@ -1539,6 +1539,34 @@ func (s *Server) ChatHandler(c *gin.Context) {
 		return
 	}
 
+	if req.Dry {
+		var debug map[string]any
+		if req.Debug != nil && req.Debug.Include != nil && slices.Contains(req.Debug.Include, "prompt") {
+			debug = map[string]any{"prompt": prompt}
+		}
+		tokens, err := r.Tokenize(c.Request.Context(), prompt)
+		if err != nil {
+			slog.Error("tokenize error", "error", err)
+			c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+			return
+		}
+		c.JSON(http.StatusOK, api.ChatResponse{
+			Model:      req.Model,
+			CreatedAt:  time.Now().UTC(),
+			Message:    api.Message{Role: "assistant", Content: ""},
+			Done:       true,
+			DoneReason: "dry_run",
+			Debug:      debug,
+			Metrics: api.Metrics{
+				PromptEvalCount:    len(tokens),
+				PromptEvalDuration: 0,
+				EvalCount:          0,
+				EvalDuration:       0,
+			},
+		})
+		return
+	}
+
 	slog.Debug("chat request", "images", len(images), "prompt", prompt)
 
 	ch := make(chan any)

@@ -1571,6 +1599,16 @@ func (s *Server) ChatHandler(c *gin.Context) {
 			res.LoadDuration = checkpointLoaded.Sub(checkpointStart)
 		}
 
+		if req.Debug != nil && req.Debug.Include != nil && slices.Contains(req.Debug.Include, "prompt") {
+			res.Debug = map[string]any{"prompt": prompt}
+			if req.Stream != nil && !*req.Stream {
+				tempMsg := res.Message
+				res.Message = api.Message{Role: "assistant", Content: ""}
+				ch <- res
+				res.Message = tempMsg
+			}
+		}
+
 		// TODO: tool call checking and filtering should be moved outside of this callback once streaming
 		// however this was a simple change for now without reworking streaming logic of this (and other)
 		// handlers
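Putting the api/types.go and routes.go changes together, a client could exercise the dry run roughly like this. This is a sketch: the model name is an example, and the field semantics are read from the handler above (a dry run replies once with DoneReason "dry_run", the rendered prompt under Debug["prompt"] when requested, and the prompt token count in Metrics).

```go
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	req := &api.ChatRequest{
		Model:    "llama3.2", // example model name
		Messages: []api.Message{{Role: "user", Content: "hello"}},
		Dry:      true,
		Debug:    &api.Debug{Include: []string{"prompt"}},
	}

	// The callback fires once for the dry-run response.
	err = client.Chat(context.Background(), req, func(resp api.ChatResponse) error {
		if resp.DoneReason == "dry_run" {
			fmt.Println("prompt tokens:", resp.PromptEvalCount)
			fmt.Println("rendered prompt:", resp.Debug["prompt"])
		}
		return nil
	})
	if err != nil {
		log.Fatal(err)
	}
}
```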