Compare commits

..

9 Commits

Author SHA1 Message Date
ParthSareen
6556540655 User interface prototype 2024-12-19 16:43:36 -08:00
ParthSareen
3f60fd57e3 Remove /template API 2024-12-19 14:47:51 -08:00
ParthSareen
38cd80d52c Add dry run option for chat request 2024-12-19 14:17:29 -08:00
ParthSareen
c9a46140e6 Warn user on truncation - ollama logs 2024-12-19 13:48:25 -08:00
ParthSareen
1d529d8b7b Add /template endpoint 2024-12-18 15:23:27 -08:00
Jeffrey Morgan
a72f2dce45 scripts: sign renamed macOS binary (#8131) 2024-12-17 18:03:49 -08:00
Jesse Gross
08a832b482 llama: Ensure KV cache is fully defragmented.
Sometimes the KV cache requires defragmentation even without
triggering the threshold heuristic. In this case, decoding
will not be able to find a KV cache slot. This is particularly
difficult for the caller to handle if it happens in between
ubatches. To avoid this, we should immediately trigger a defrag.

In addition, a heavily fragmented cache can require more than
max_moves to defragment. Currently, we stop when we hit the limit
but this can leave a cache that still does not have adequate space
even after defragmentation is triggered. Instead, we should do
multiple batches of processing until everything is complete.

Fixes #7949
2024-12-17 14:01:19 -08:00
Blake Mizerany
2ddc32d5c5 llm: do not error on "null" format (#8139)
This fixes another regression in the previous commit that fixed other
known bugs.
2024-12-17 09:49:37 -08:00
Jascha Beste
2cde4b8817 readme: change getting started guide link for pgai (#8119) 2024-12-16 22:13:23 -08:00
10 changed files with 385 additions and 95 deletions

View File

@@ -407,8 +407,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
### Database
- [PostgreSQL extension pgai](https://github.com/timescale/pgai) (Create and search embeddings from Ollama models using pgvector)
- [Get started guide](https://github.com/timescale/pgai/blob/main/docs/ollama.md)
- [pgai](https://github.com/timescale/pgai) - PostgreSQL as a vector database (Create and search embeddings from Ollama models using pgvector)
- [Get started guide](https://github.com/timescale/pgai/blob/main/docs/vectorizer-quick-start.md)
- [MindsDB](https://github.com/mindsdb/mindsdb/blob/staging/mindsdb/integrations/handlers/ollama_handler/README.md) (Connects Ollama models with nearly 200 data platforms and apps)
- [chromem-go](https://github.com/philippgille/chromem-go/blob/v0.5.0/embed_ollama.go) with [example](https://github.com/philippgille/chromem-go/tree/v0.5.0/examples/rag-wikipedia-ollama)
- [Kangaroo](https://github.com/dbkangaroo/kangaroo) (AI-powered SQL client and admin tool for popular databases)

View File

@@ -103,10 +103,18 @@ type ChatRequest struct {
// Tools is an optional list of tools the model has access to.
Tools `json:"tools,omitempty"`
Debug *Debug `json:"debug,omitempty"`
Dry bool `json:"dry,omitempty"`
// Options lists model-specific options.
Options map[string]interface{} `json:"options"`
}
// Debug holds optional debugging settings attached to a ChatRequest.
type Debug struct {
// Include lists the names of debug items the caller wants echoed back in
// the response's debug map. The chat handler shown in this change only
// checks for the entry "prompt".
Include []string `json:"include,omitempty"`
}
type Tools []Tool
func (t Tools) String() string {
@@ -190,6 +198,8 @@ type ChatResponse struct {
Message Message `json:"message"`
DoneReason string `json:"done_reason,omitempty"`
Debug map[string]any `json:"debug,omitempty"`
Done bool `json:"done"`
Metrics

99
llama/llama.cpp vendored
View File

@@ -3051,6 +3051,13 @@ struct llama_kv_cache {
}
};
// block of KV slots to move when defragging
// One entry describes a run of consecutive cells that is copied as a unit;
// build_defrag turns each entry into per-layer K and V view/copy operations.
struct llama_kv_defrag_move {
uint32_t src; // index of the first cell to copy from
uint32_t dst; // index of the first cell to copy to
uint32_t len; // number of consecutive cells in the run (starts at 1, grows while cells stay contiguous)
};
struct llama_control_vector {
std::vector<struct ggml_tensor *> tensors; // per layer
std::vector<ggml_context_ptr> ctxs;
@@ -10828,35 +10835,23 @@ struct llm_build_context {
return gf;
}
struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
struct ggml_cgraph * build_defrag(const std::vector<struct llama_kv_defrag_move> & moves) {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
for (uint32_t i = 0; i < ids.size(); ++i) {
const uint32_t id = ids[i];
if (i == id || id == ids.size()) {
continue;
}
uint32_t nm = 1;
while (i + nm < ids.size() && ids[i + nm] == id + nm) {
nm++;
}
for (const auto & move : moves) {
for (int il = 0; il < n_layer; ++il) {
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il],
n_embd_k_gqa, nm,
n_embd_k_gqa, move.len,
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i));
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*move.src));
ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il],
n_embd_k_gqa, nm,
n_embd_k_gqa, move.len,
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*move.dst));
ggml_tensor * view_v_src;
ggml_tensor * view_v_dst;
@@ -10864,31 +10859,29 @@ struct llm_build_context {
if (flash_attn) {
// NOTE: the V cache is not transposed when using flash attention
view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
n_embd_v_gqa, nm,
n_embd_v_gqa, move.len,
ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i));
ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*move.src));
view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
n_embd_v_gqa, nm,
n_embd_v_gqa, move.len,
ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id));
ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*move.dst));
} else {
view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
nm, n_embd_v_gqa,
move.len, n_embd_v_gqa,
ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
ggml_row_size(kv_self.v_l[il]->type, i));
ggml_row_size(kv_self.v_l[il]->type, move.src));
view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
nm, n_embd_v_gqa,
move.len, n_embd_v_gqa,
ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
ggml_row_size(kv_self.v_l[il]->type, id));
ggml_row_size(kv_self.v_l[il]->type, move.dst));
}
ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
}
i += nm - 1;
}
//LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
@@ -17351,7 +17344,7 @@ struct llm_build_context {
}
};
static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<struct llama_kv_defrag_move> & moves) {
llama_ubatch dummy = {};
dummy.equal_seqs = true;
@@ -17361,7 +17354,7 @@ static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const
llm.init();
struct ggml_cgraph * result = llm.build_defrag(ids);
struct ggml_cgraph * result = llm.build_defrag(moves);
llm.free();
@@ -18377,7 +18370,12 @@ static int llama_decode_internal(
kv_self.head = 0;
}
const auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
if (!slot) {
llama_kv_cache_defrag(kv_self);
llama_kv_cache_update(&lctx);
slot = llama_kv_cache_find_slot(kv_self, ubatch);
}
if (!slot) {
return 1;
}
@@ -18782,8 +18780,8 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
//const int64_t t_start = ggml_time_us();
// number of cells moved
uint32_t n_moves = 0;
// groups of cells moved
std::vector<struct llama_kv_defrag_move> moves;
// each move requires 6*n_layer tensors (see build_defrag)
// - source view, destination view, copy operation
@@ -18847,19 +18845,11 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
// are we moving a continuous block of memory?
bool cont = false;
// should we stop searching for the next move?
bool stop = false;
// go back and move the nf cells to the hole
for (; i1 < n_kv; ++i1) {
auto & cell1 = kv_self.cells[i1];
if (cell1.is_empty() || ids[i1] != n_kv) {
if (n_moves == max_moves) {
stop = true;
break;
}
cont = false;
continue;
}
@@ -18875,8 +18865,10 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
kv_self.head = n_used;
if (!cont) {
n_moves++;
moves.push_back({i1, i0 + nf, 1});
cont = true;
} else {
moves.back().len++;
}
nf++;
@@ -18886,22 +18878,16 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
}
}
if (stop || n_moves == max_moves) {
break;
}
//LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
i0 += nh - 1;
}
if (n_moves == 0) {
if (moves.size() == 0) {
return;
}
//LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves);
//LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer);
//LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", moves.size());
#if 0
// CPU defrag
@@ -18976,11 +18962,18 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
#else
// ggml_graph defrag
ggml_backend_sched_reset(lctx.sched.get());
for (std::size_t i = 0; i < moves.size(); i += max_moves) {
std::vector<struct llama_kv_defrag_move> chunk;
auto end = std::min(i + max_moves, moves.size());
chunk.assign(moves.begin() + i, moves.begin() + end);
ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
ggml_backend_sched_reset(lctx.sched.get());
llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
//LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*chunk.size()*n_layer);
ggml_cgraph * gf = llama_build_graph_defrag(lctx, chunk);
llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
}
#endif
//const int64_t t_end = ggml_time_us();

View File

@@ -0,0 +1,242 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@ollama.com>
Date: Fri, 13 Dec 2024 16:11:59 -0800
Subject: [PATCH] llama: Ensure KV cache is fully defragmented.
Sometimes the KV cache requires defragmentation even without
triggering the threshold heuristic. In this case, decoding
will not be able to find a KV cache slot. This is particularly
difficult for the caller to handle if it happens in between
ubatches. To avoid this, we should immediately trigger a defrag.
In addition, a heavily fragmented cache can require more than
max_moves to defragment. Currently, we stop when we hit the limit
but this can leave a cache that still does not have adequate space
even after defragmentation is triggered. Instead, we should do
multiple batches of processing until everything is complete.
---
src/llama.cpp | 99 ++++++++++++++++++++++++---------------------------
1 file changed, 46 insertions(+), 53 deletions(-)
diff --git a/src/llama.cpp b/src/llama.cpp
index 4778a9ed..654e32bc 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3025,6 +3025,13 @@ struct llama_kv_cache {
}
};
+// block of KV slots to move when defragging
+struct llama_kv_defrag_move {
+ uint32_t src;
+ uint32_t dst;
+ uint32_t len;
+};
+
struct llama_control_vector {
std::vector<struct ggml_tensor *> tensors; // per layer
std::vector<ggml_context_ptr> ctxs;
@@ -10802,35 +10809,23 @@ struct llm_build_context {
return gf;
}
- struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
+ struct ggml_cgraph * build_defrag(const std::vector<struct llama_kv_defrag_move> & moves) {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
- for (uint32_t i = 0; i < ids.size(); ++i) {
- const uint32_t id = ids[i];
-
- if (i == id || id == ids.size()) {
- continue;
- }
-
- uint32_t nm = 1;
-
- while (i + nm < ids.size() && ids[i + nm] == id + nm) {
- nm++;
- }
-
+ for (const auto & move : moves) {
for (int il = 0; il < n_layer; ++il) {
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il],
- n_embd_k_gqa, nm,
+ n_embd_k_gqa, move.len,
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
- ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i));
+ ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*move.src));
ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il],
- n_embd_k_gqa, nm,
+ n_embd_k_gqa, move.len,
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
- ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));
+ ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*move.dst));
ggml_tensor * view_v_src;
ggml_tensor * view_v_dst;
@@ -10838,31 +10833,29 @@ struct llm_build_context {
if (flash_attn) {
// NOTE: the V cache is not transposed when using flash attention
view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
- n_embd_v_gqa, nm,
+ n_embd_v_gqa, move.len,
ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
- ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i));
+ ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*move.src));
view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
- n_embd_v_gqa, nm,
+ n_embd_v_gqa, move.len,
ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
- ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id));
+ ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*move.dst));
} else {
view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
- nm, n_embd_v_gqa,
+ move.len, n_embd_v_gqa,
ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
- ggml_row_size(kv_self.v_l[il]->type, i));
+ ggml_row_size(kv_self.v_l[il]->type, move.src));
view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
- nm, n_embd_v_gqa,
+ move.len, n_embd_v_gqa,
ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
- ggml_row_size(kv_self.v_l[il]->type, id));
+ ggml_row_size(kv_self.v_l[il]->type, move.dst));
}
ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
}
-
- i += nm - 1;
}
//LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
@@ -17325,7 +17318,7 @@ struct llm_build_context {
}
};
-static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
+static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<struct llama_kv_defrag_move> & moves) {
llama_ubatch dummy = {};
dummy.equal_seqs = true;
@@ -17335,7 +17328,7 @@ static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const
llm.init();
- struct ggml_cgraph * result = llm.build_defrag(ids);
+ struct ggml_cgraph * result = llm.build_defrag(moves);
llm.free();
@@ -18351,7 +18344,12 @@ static int llama_decode_internal(
kv_self.head = 0;
}
- const auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
+ auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
+ if (!slot) {
+ llama_kv_cache_defrag(kv_self);
+ llama_kv_cache_update(&lctx);
+ slot = llama_kv_cache_find_slot(kv_self, ubatch);
+ }
if (!slot) {
return 1;
}
@@ -18756,8 +18754,8 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
//const int64_t t_start = ggml_time_us();
- // number of cells moved
- uint32_t n_moves = 0;
+ // groups of cells moved
+ std::vector<struct llama_kv_defrag_move> moves;
// each move requires 6*n_layer tensors (see build_defrag)
// - source view, destination view, copy operation
@@ -18821,19 +18819,11 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
// are we moving a continuous block of memory?
bool cont = false;
- // should we stop searching for the next move?
- bool stop = false;
-
// go back and move the nf cells to the hole
for (; i1 < n_kv; ++i1) {
auto & cell1 = kv_self.cells[i1];
if (cell1.is_empty() || ids[i1] != n_kv) {
- if (n_moves == max_moves) {
- stop = true;
- break;
- }
-
cont = false;
continue;
}
@@ -18849,8 +18839,10 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
kv_self.head = n_used;
if (!cont) {
- n_moves++;
+ moves.push_back({i1, i0 + nf, 1});
cont = true;
+ } else {
+ moves.back().len++;
}
nf++;
@@ -18860,22 +18852,16 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
}
}
- if (stop || n_moves == max_moves) {
- break;
- }
-
//LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
i0 += nh - 1;
}
- if (n_moves == 0) {
+ if (moves.size() == 0) {
return;
}
- //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves);
-
- //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer);
+ //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", moves.size());
#if 0
// CPU defrag
@@ -18950,11 +18936,18 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
#else
// ggml_graph defrag
- ggml_backend_sched_reset(lctx.sched.get());
+ for (std::size_t i = 0; i < moves.size(); i += max_moves) {
+ std::vector<struct llama_kv_defrag_move> chunk;
+ auto end = std::min(i + max_moves, moves.size());
+ chunk.assign(moves.begin() + i, moves.begin() + end);
- ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
+ ggml_backend_sched_reset(lctx.sched.get());
+
+ //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*chunk.size()*n_layer);
+ ggml_cgraph * gf = llama_build_graph_defrag(lctx, chunk);
- llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
+ llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
+ }
#endif
//const int64_t t_end = ggml_time_us();

View File

@@ -433,14 +433,7 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
err := s.lc.Decode(batch)
if err != nil {
if errors.Is(err, llama.ErrKvCacheFull) {
slog.Debug("defragmenting kv cache")
s.cache.lc.KvCacheDefrag()
err = s.lc.Decode(batch)
}
if err != nil {
return fmt.Errorf("failed to decode batch: %w", err)
}
return fmt.Errorf("failed to decode batch: %w", err)
}
if crossAttention {

View File

@@ -700,20 +700,24 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
}
if len(req.Format) > 0 {
switch {
case bytes.Equal(req.Format, []byte(`""`)):
// fallthrough
case bytes.Equal(req.Format, []byte(`"json"`)):
switch string(req.Format) {
case `null`, `""`:
// Field was set, but "missing" a value. We accept
// these as "not set".
break
case `"json"`:
request["grammar"] = grammarJSON
case bytes.HasPrefix(req.Format, []byte("{")):
default:
if req.Format[0] != '{' {
return fmt.Errorf("invalid format: %q; expected \"json\" or a valid JSON Schema object", req.Format)
}
// User provided a JSON schema
g := llama.SchemaToGrammar(req.Format)
if g == nil {
return fmt.Errorf("invalid JSON schema in format")
}
request["grammar"] = string(g)
default:
return fmt.Errorf("invalid format: %q; expected \"json\" or a valid JSON Schema", req.Format)
}
}

View File

@@ -39,25 +39,34 @@ func TestLLMServerCompletionFormat(t *testing.T) {
cancel() // prevent further processing if request makes it past the format check
checkCanceled := func(err error) {
checkValid := func(err error) {
t.Helper()
if !errors.Is(err, context.Canceled) {
t.Fatalf("Completion: err = %v; expected context.Canceled", err)
}
}
valids := []string{`"json"`, `{"type":"object"}`, ``, `""`}
valids := []string{
// "missing"
``,
`""`,
`null`,
// JSON
`"json"`,
`{"type":"object"}`,
}
for _, valid := range valids {
err := s.Completion(ctx, CompletionRequest{
Options: new(api.Options),
Format: []byte(valid),
}, nil)
checkCanceled(err)
checkValid(err)
}
err := s.Completion(ctx, CompletionRequest{
Options: new(api.Options),
Format: nil, // missing format
}, nil)
checkCanceled(err)
checkValid(err)
}

View File

@@ -15,28 +15,36 @@ export CGO_CXXFLAGS=-mmacosx-version-min=11.3
export CGO_LDFLAGS=-mmacosx-version-min=11.3
rm -rf llama/build dist/darwin-*
# Generate the universal ollama binary for stand-alone usage: metal + avx
echo "Building binary"
echo "Building darwin arm64"
GOOS=darwin ARCH=arm64 GOARCH=arm64 make -j 8 dist
echo "Building darwin amd64 with AVX enabled"
GOOS=darwin ARCH=amd64 GOARCH=amd64 CUSTOM_CPU_FLAGS="avx" make -j 8 dist_exe
# Generate the universal ollama binary for stand-alone usage: metal + avx
lipo -create -output dist/ollama-darwin dist/darwin-arm64/bin/ollama dist/darwin-amd64/bin/ollama
# sign the binary and rename it
if [ -n "$APPLE_IDENTITY" ]; then
codesign -f --timestamp -s "$APPLE_IDENTITY" --identifier ai.ollama.ollama --options=runtime dist/ollama-darwin
else
echo "WARNING: Skipping code signing - set APPLE_IDENTITY"
fi
ditto -c -k --keepParent dist/ollama-darwin dist/temp.zip
if [ -n "$APPLE_IDENTITY" ]; then
xcrun notarytool submit dist/temp.zip --wait --timeout 10m --apple-id $APPLE_ID --password $APPLE_PASSWORD --team-id $APPLE_TEAM_ID
fi
rm -f dist/temp.zip
# Build the app bundle
echo "Building app"
echo "Building darwin amd64 with runners"
rm dist/darwin-amd64/bin/ollama
GOOS=darwin ARCH=amd64 GOARCH=amd64 make -j 8 dist
# Generate the universal ollama binary for the app bundle: metal + no-avx
lipo -create -output dist/ollama dist/darwin-arm64/bin/ollama dist/darwin-amd64/bin/ollama
if [ -n "$APPLE_IDENTITY" ]; then
codesign --deep --force --options=runtime --sign "$APPLE_IDENTITY" --timestamp dist/ollama
else
echo "Skipping code signing - set APPLE_IDENTITY"
fi
chmod +x dist/ollama
# build and optionally sign the mac app
npm install --prefix macapp
if [ -n "$APPLE_IDENTITY" ]; then
@@ -46,14 +54,3 @@ else
fi
cp macapp/out/make/zip/darwin/universal/Ollama-darwin-universal-$VERSION.zip dist/Ollama-darwin.zip
# sign the binary and rename it
if [ -n "$APPLE_IDENTITY" ]; then
codesign -f --timestamp -s "$APPLE_IDENTITY" --identifier ai.ollama.ollama --options=runtime dist/ollama
else
echo "WARNING: Skipping code signing - set APPLE_IDENTITY"
fi
ditto -c -k --keepParent dist/ollama dist/temp.zip
if [ -n "$APPLE_IDENTITY" ]; then
xcrun notarytool submit dist/temp.zip --wait --timeout 10m --apple-id $APPLE_ID --password $APPLE_PASSWORD --team-id $APPLE_TEAM_ID
fi
rm -f dist/temp.zip

View File

@@ -82,6 +82,10 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
}
currMsgIdx := n
// Warn user if messages are truncated from the input
if numTruncatedMessages := len(msgs[0:currMsgIdx]); numTruncatedMessages > 0 {
slog.Warn("truncated first messages from input", "num_truncated", numTruncatedMessages)
}
for cnt, msg := range msgs[currMsgIdx:] {
prefix := ""

View File

@@ -1539,6 +1539,34 @@ func (s *Server) ChatHandler(c *gin.Context) {
return
}
if req.Dry {
var debug map[string]any
if req.Debug != nil && req.Debug.Include != nil && slices.Contains(req.Debug.Include, "prompt") {
debug = map[string]any{"prompt": prompt}
}
tokens, err := r.Tokenize(c.Request.Context(), prompt)
if err != nil {
slog.Error("tokenize error", "error", err)
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
c.JSON(http.StatusOK, api.ChatResponse{
Model: req.Model,
CreatedAt: time.Now().UTC(),
Message: api.Message{Role: "assistant", Content: ""},
Done: true,
DoneReason: "dry_run",
Debug: debug,
Metrics: api.Metrics{
PromptEvalCount: len(tokens),
PromptEvalDuration: 0,
EvalCount: 0,
EvalDuration: 0,
},
})
return
}
slog.Debug("chat request", "images", len(images), "prompt", prompt)
ch := make(chan any)
@@ -1571,6 +1599,16 @@ func (s *Server) ChatHandler(c *gin.Context) {
res.LoadDuration = checkpointLoaded.Sub(checkpointStart)
}
if req.Debug != nil && req.Debug.Include != nil && slices.Contains(req.Debug.Include, "prompt") {
res.Debug = map[string]any{"prompt": prompt}
if req.Stream != nil && !*req.Stream {
tempMsg := res.Message
res.Message = api.Message{Role: "assistant", Content: ""}
ch <- res
res.Message = tempMsg
}
}
// TODO: tool call checking and filtering should be moved outside of this callback once streaming
// however this was a simple change for now without reworking streaming logic of this (and other)
// handlers