fix(llama.cpp): fix eos without cache

fix(whisper.cpp): Add stubs and -lcuda
deps(whisper.cpp): update, fix cublas build
2026-07-05 22:09:02 -04:00 · 2024-03-18 12:14:16 +01:00 · 2024-03-18 12:13:39 +01:00 · 2024-03-16 10:38:57 +01:00
2 changed files with 6 additions and 3 deletions
--- a/7
+++ b/7
@@ -19,7 +19,7 @@ RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
 RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6

 # whisper.cpp version
-WHISPER_CPP_VERSION?=37a709f6558c6d9783199e2b8cbb136e1c41d346
+WHISPER_CPP_VERSION?=a56f435fd475afd7edf02bfbf9f8c77f527198c2

 # bert.cpp version
 BERT_VERSION?=6abe312cded14042f6b7c3cd8edf082713334a4d
@@ -91,10 +91,13 @@ ifeq ($(BUILD_TYPE),openblas)
 	export WHISPER_OPENBLAS=1
 endif

+
 ifeq ($(BUILD_TYPE),cublas)
-	CGO_LDFLAGS+=-lcublas -lcudart -L$(CUDA_LIBPATH)
+	CGO_LDFLAGS+=-lcublas -lcudart -lculibos -lcublasLt -L$(CUDA_LIBPATH)
 	export LLAMA_CUBLAS=1
+# required by whisper.cpp: enable its cuBLAS build and link the CUDA driver stub library (-lcuda)
 	export WHISPER_CUBLAS=1
+	CGO_LDFLAGS+=-L$(CUDA_PATH)/stubs -lcuda
 endif

 ifeq ($(BUILD_TYPE),hipblas)
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
@@ -1084,7 +1084,7 @@ struct llama_server_context
            slot.has_next_token = false;
        }

-        if (!slot.cache_tokens.empty() && result.tok == llama_token_eos(model))
+        if (result.tok == llama_token_eos(model))
        {
            slot.stopped_eos = true;
            slot.has_next_token = false;
Author	SHA1	Message	Date
Ettore Di Giacinto	495191a54a	fix(llama.cpp): fix eos without cache	2024-03-18 12:14:16 +01:00
Ettore Di Giacinto	b790fca180	fix(whisper.cpp): Add stubs and -lcuda	2024-03-18 12:13:39 +01:00
Ettore Di Giacinto	0663f66205	deps(whisper.cpp): update, fix cublas build	2024-03-16 10:38:57 +01:00