Compare commits

...

14 Commits

Author SHA1 Message Date
renovate[bot]
137ac545c0 fix(deps): update module github.com/onsi/gomega to v1.29.0 2023-10-25 18:54:03 +00:00
Dave
b839eb80a1 Fix backend/cpp/llama CMakeList.txt on OSX (#1212)
* Fix backend/cpp/llama CMakeList.txt on OSX - detect OSX and use homebrew libraries

* sneak a logging fix in too for gallery debugging

* additional logging
2023-10-25 20:53:26 +02:00
renovate[bot]
23b03a7f03 fix(deps): update module github.com/onsi/gomega to v1.28.1 (#1205)
Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>
2023-10-24 09:16:02 +02:00
LocalAI [bot]
9196583651 ⬆️ Update ggerganov/llama.cpp (#1204)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2023-10-23 19:06:39 +02:00
Ettore Di Giacinto
fd28252e55 fix(Dockerfile): try to save some space
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2023-10-22 17:13:39 +02:00
renovate[bot]
94f20e2eb7 fix(deps): update github.com/nomic-ai/gpt4all/gpt4all-bindings/golang digest to c25dc51 (#1191)
Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>
2023-10-22 16:58:45 +02:00
Ettore Di Giacinto
5ced99a8e7 ci: more cleanup for workers
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2023-10-22 12:27:04 +02:00
LocalAI [bot]
c377e61ff0 ⬆️ Update go-skynet/go-llama.cpp (#1156)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2023-10-22 08:55:44 +02:00
Ettore Di Giacinto
a6fe0a020a feat(llama.cpp): update (#1200)
**Description**

This PR updates llama.cpp to
465219b914

Supersedes #1195
2023-10-21 18:44:37 +02:00
Ettore Di Giacinto
bf2ed3d752 fix(Dockerfile): piper phonemize is required during build
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2023-10-21 16:40:41 +02:00
Ettore Di Giacinto
d17a92eef3 example(bruno): add image generation
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2023-10-21 11:38:23 +02:00
Ettore Di Giacinto
1a7be035d3 fix(Makefile): build all backends if none is specified
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2023-10-21 11:34:59 +02:00
Ettore Di Giacinto
004baaa30f feat(llama.cpp): update
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2023-10-21 11:04:03 +02:00
renovate[bot]
ef19268418 chore(deps): update actions/checkout action to v4 (#1006)
Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>
2023-10-21 08:55:44 +02:00
15 changed files with 197 additions and 179 deletions

View File

@@ -44,7 +44,7 @@ jobs:
branch: "master"
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- name: Bump dependencies 🔧
run: |
bash .github/bump_deps.sh ${{ matrix.repository }} ${{ matrix.branch }} ${{ matrix.variable }}

View File

@@ -83,6 +83,10 @@ jobs:
sudo apt-get remove -y azure-cli || true
sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
sudo apt-get remove -y '^gfortran-.*' || true
sudo apt-get remove -y microsoft-edge-stable || true
sudo apt-get remove -y firefox || true
sudo apt-get remove -y powershell || true
sudo apt-get remove -y r-base-core || true
sudo apt-get autoremove -y
sudo apt-get clean
echo
@@ -93,7 +97,7 @@ jobs:
sudo rm -rfv build || true
df -h
- name: Checkout
uses: actions/checkout@v3
uses: actions/checkout@v4
- name: Docker meta
id: meta

View File

@@ -19,7 +19,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v3
uses: actions/checkout@v4
with:
submodules: true
- uses: actions/setup-go@v4
@@ -66,7 +66,7 @@ jobs:
runs-on: macOS-latest
steps:
- name: Clone
uses: actions/checkout@v3
uses: actions/checkout@v4
with:
submodules: true
- uses: actions/setup-go@v4

View File

@@ -21,7 +21,7 @@ jobs:
go-version: ['1.21.x']
steps:
- name: Clone
uses: actions/checkout@v3
uses: actions/checkout@v4
with:
submodules: true
- name: Setup Go ${{ matrix.go-version }}

View File

@@ -53,7 +53,7 @@ jobs:
sudo rm -rfv build || true
df -h
- name: Clone
uses: actions/checkout@v3
uses: actions/checkout@v4
with:
submodules: true
- name: Setup Go ${{ matrix.go-version }}
@@ -108,7 +108,7 @@ jobs:
go-version: ['1.21.x']
steps:
- name: Clone
uses: actions/checkout@v3
uses: actions/checkout@v4
with:
submodules: true
- name: Setup Go ${{ matrix.go-version }}

View File

@@ -19,7 +19,7 @@ ENV GALLERIES='[{"name":"model-gallery", "url":"github:go-skynet/model-gallery/i
ARG GO_TAGS="stablediffusion tts"
RUN apt-get update && \
apt-get install -y ca-certificates curl patch pip cmake
apt-get install -y ca-certificates curl patch pip cmake && apt-get clean
# Use the variables in subsequent instructions
@@ -34,17 +34,18 @@ RUN if [ "${BUILD_TYPE}" = "cublas" ]; then \
dpkg -i cuda-keyring_1.0-1_all.deb && \
rm -f cuda-keyring_1.0-1_all.deb && \
apt-get update && \
apt-get install -y cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
apt-get install -y cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} && apt-get clean \
; fi
ENV PATH /usr/local/cuda/bin:${PATH}
# OpenBLAS requirements
RUN apt-get install -y libopenblas-dev
# Stable Diffusion requirements
RUN apt-get install -y libopencv-dev && \
ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
# OpenBLAS requirements and stable diffusion
RUN apt-get install -y \
libopenblas-dev \
libopencv-dev \
&& apt-get clean
# Set up OpenCV
RUN ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
WORKDIR /build
@@ -68,8 +69,7 @@ RUN curl -L "https://github.com/gabime/spdlog/archive/refs/tags/v${SPDLOG_VERSIO
cp -rfv /build/lib/Linux-$(uname -m)/piper_phonemize/lib/. /usr/lib/ && \
ln -s /usr/lib/libpiper_phonemize.so /usr/lib/libpiper_phonemize.so.1 && \
cp -rfv /build/lib/Linux-$(uname -m)/piper_phonemize/include/. /usr/include/ && \
rm spdlog-${SPDLOG_VERSION} -rf && \
rm /build/lib/Linux-$(uname -m)/piper_phonemize -rf
rm spdlog-${SPDLOG_VERSION} -rf
# Extras requirements
FROM requirements-core as requirements-extras

View File

@@ -4,11 +4,11 @@ GOVET=$(GOCMD) vet
BINARY_NAME=local-ai
# llama.cpp versions
GOLLAMA_VERSION?=1676dcd7a139b6cdfbaea5fd67f46dc25d9d8bcf
GOLLAMA_VERSION?=aeba71ee842819da681ea537e78846dc75949ac0
GOLLAMA_STABLE_VERSION?=50cee7712066d9e38306eccadcfbb44ea87df4b7
CPPLLAMA_VERSION?=24ba3d829e31a6eda3fa1723f692608c2fa3adda
CPPLLAMA_VERSION?=96981f37b1e3f450d9e63e571514217bf60f0a7f
# gpt4all version
GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
@@ -129,7 +129,13 @@ ifeq ($(findstring tts,$(GO_TAGS)),tts)
OPTIONAL_GRPC+=backend-assets/grpc/piper
endif
GRPC_BACKENDS?=backend-assets/grpc/langchain-huggingface backend-assets/grpc/falcon-ggml backend-assets/grpc/bert-embeddings backend-assets/grpc/falcon backend-assets/grpc/bloomz backend-assets/grpc/llama backend-assets/grpc/llama-cpp backend-assets/grpc/llama-stable backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gpt2 backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/starcoder backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC)
ALL_GRPC_BACKENDS=backend-assets/grpc/langchain-huggingface backend-assets/grpc/falcon-ggml backend-assets/grpc/bert-embeddings backend-assets/grpc/falcon backend-assets/grpc/bloomz backend-assets/grpc/llama backend-assets/grpc/llama-cpp backend-assets/grpc/llama-stable backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gpt2 backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/starcoder backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC)
GRPC_BACKENDS?=$(ALL_GRPC_BACKENDS) $(OPTIONAL_GRPC)
# If empty, then we build all
ifeq ($(GRPC_BACKENDS),)
GRPC_BACKENDS=$(ALL_GRPC_BACKENDS)
endif
.PHONY: all test build vendor

View File

@@ -4,6 +4,11 @@ set(TARGET grpc-server)
set(_PROTOBUF_LIBPROTOBUF libprotobuf)
set(_REFLECTION grpc++_reflection)
if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
link_directories("/opt/homebrew/lib")
include_directories("/opt/homebrew/include")
endif()
find_package(absl CONFIG REQUIRED)
find_package(Protobuf CONFIG REQUIRED)
find_package(gRPC CONFIG REQUIRED)
@@ -15,8 +20,7 @@ find_program(_GRPC_CPP_PLUGIN_EXECUTABLE grpc_cpp_plugin)
include_directories(${CMAKE_CURRENT_BINARY_DIR})
include_directories(${Protobuf_INCLUDE_DIRS})
message(STATUS "Using protobuf ${Protobuf_VERSION} ${Protobuf_INCLUDE_DIRS} ${CMAKE_CURRENT_BINARY_DIR}")
message(STATUS "Using protobuf version ${Protobuf_VERSION} | Protobuf_INCLUDE_DIRS: ${Protobuf_INCLUDE_DIRS} | CMAKE_CURRENT_BINARY_DIR: ${CMAKE_CURRENT_BINARY_DIR}")
# Proto file
get_filename_component(hw_proto "../../../../../../pkg/grpc/proto/backend.proto" ABSOLUTE)

View File

@@ -1,5 +1,5 @@
LLAMA_VERSION?=24ba3d829e31a6eda3fa1723f692608c2fa3adda
LLAMA_VERSION?=
CMAKE_ARGS?=
BUILD_TYPE?=

View File

@@ -88,6 +88,7 @@ static size_t find_partial_stop_string(const std::string &stop,
return std::string::npos;
}
template <class Iter>
static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
{
@@ -128,17 +129,15 @@ struct llama_server_context
size_t n_past = 0;
size_t n_remain = 0;
// json prompt;
std::vector<llama_token> embd;
std::vector<llama_token> last_n_tokens;
gpt_params params;
llama_model *model = nullptr;
llama_context *ctx = nullptr;
gpt_params params;
int n_ctx;
llama_sampling_context *ctx_sampling = nullptr;
grammar_parser::parse_state parsed_grammar;
llama_grammar *grammar = nullptr;
int n_ctx;
bool truncated = false;
bool stopped_eos = false;
@@ -171,7 +170,7 @@ struct llama_server_context
void rewind()
{
params.antiprompt.clear();
params.grammar.clear();
params.sparams.grammar.clear();
num_prompt_tokens = 0;
num_tokens_predicted = 0;
generated_text = "";
@@ -185,100 +184,87 @@ struct llama_server_context
multibyte_pending = 0;
n_remain = 0;
n_past = 0;
params.sparams.n_prev = n_ctx;
}
if (grammar != nullptr) {
llama_grammar_free(grammar);
grammar = nullptr;
void initSampling() {
if (ctx_sampling != nullptr) {
llama_sampling_free(ctx_sampling);
}
ctx_sampling = llama_sampling_init(params.sparams);
}
bool loadModel(const gpt_params &params_)
{
printf("load model %s\n", params_.model.c_str());
params = params_;
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == nullptr)
{
printf("unable to load model %s\n", params_.model.c_str());
return false;
}
n_ctx = llama_n_ctx(ctx);
last_n_tokens.resize(n_ctx);
std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
return true;
}
std::vector<llama_token> tokenize_array(const char **prompts, bool add_bos) const
{
// If `add_bos` is true, we only add BOS, when json_prompt is a string,
std::vector<llama_token> tokenize_string(const char *prompt, bool add_bos) const {
// If `add_bos` is true, we only add BOS, when json_prompt is a string,
// or the first element of the json_prompt array is a string.
std::vector<llama_token> prompt_tokens;
bool first = true;
// Iterate over prompts
for (const char **p = prompts; *p != nullptr; ++p)
{
auto s = std::string(*p);
std::vector<llama_token> pp;
if (first)
{
pp = ::llama_tokenize(ctx, s, add_bos);
first = false;
}
else
{
pp = ::llama_tokenize(ctx, s, false);
}
prompt_tokens.insert(prompt_tokens.end(), pp.begin(), pp.end());
}
return prompt_tokens;
}
std::vector<llama_token> tokenize_string(const char *prompt, bool add_bos) const
{
// If `add_bos` is true, we only add BOS, when json_prompt is a string,
// or the first element of the json_prompt array is a string.
std::vector<llama_token> prompt_tokens;
std::vector<llama_token> prompt_tokens;
auto s = std::string(prompt);
prompt_tokens = ::llama_tokenize(ctx, s, add_bos);
return prompt_tokens;
}
std::vector<llama_token> tokenize_array(const char **prompts, bool add_bos) const {
std::vector<llama_token> prompt_tokens;
bool loadGrammar()
{
if (!params.grammar.empty()) {
parsed_grammar = grammar_parser::parse(params.grammar.c_str());
// will be empty (default) if there are parse errors
if (parsed_grammar.rules.empty()) {
printf("grammar parse error");
return false;
}
grammar_parser::print_grammar(stderr, parsed_grammar);
{
auto it = params.logit_bias.find(llama_token_eos(ctx));
if (it != params.logit_bias.end() && it->second == -INFINITY) {
printf("EOS token is disabled, which will cause most grammars to fail");
bool first = true;
bool is_string = true;
for (const char **p = prompts; *p != nullptr; ++p)
{
if (is_string)
{
auto s = std::string(*p);
std::vector<llama_token> p;
if (first)
{
p = ::llama_tokenize(ctx, s, add_bos);
first = false;
}
else
{
p = ::llama_tokenize(ctx, s, false);
}
prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
}
else
{
if (first)
{
first = false;
}
//prompt_tokens.push_back(p.template get<llama_token>());
}
}
return prompt_tokens;
}
std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
grammar = llama_grammar_init(
grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
}
return true;
void truncatePrompt(std::vector<llama_token> &prompt_tokens) {
const int n_left = n_ctx - params.n_keep;
const int n_block_size = n_left / 2;
const int erased_blocks = (prompt_tokens.size() - params.n_keep - n_block_size) / n_block_size;
// Keep n_keep tokens at start of prompt (at most n_ctx - 4)
std::vector<llama_token> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep);
new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_block_size, prompt_tokens.end());
truncated = true;
prompt_tokens = new_tokens;
}
void loadInfill()
{
bool suff_rm_leading_spc = true;
if (params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) {
if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
params.input_suffix.erase(0, 1);
suff_rm_leading_spc = false;
}
@@ -294,6 +280,7 @@ struct llama_server_context
prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(ctx));
prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
prefix_tokens.push_back(llama_token_middle(ctx));
auto prompt_tokens = prefix_tokens;
num_prompt_tokens = prompt_tokens.size();
@@ -305,29 +292,24 @@ struct llama_server_context
params.n_keep = std::min(params.n_ctx - 4, params.n_keep);
// if input prompt is too big, truncate like normal
if (num_prompt_tokens >= (size_t)params.n_ctx)
if (num_prompt_tokens >= (size_t) n_ctx)
{
printf("Input prompt is too big, truncating. Can only take %d tokens but got %zu\n", params.n_ctx, num_prompt_tokens);
// todo we probably want to cut from both sides
const int n_left = (params.n_ctx - params.n_keep) / 2;
std::vector<llama_token> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep);
const int erased_blocks = (num_prompt_tokens - params.n_keep - n_left - 1) / n_left;
new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_left, prompt_tokens.end());
std::copy(prompt_tokens.end() - params.n_ctx, prompt_tokens.end(), last_n_tokens.begin());
truncatePrompt(prompt_tokens);
num_prompt_tokens = prompt_tokens.size();
truncated = true;
prompt_tokens = new_tokens;
GGML_ASSERT(num_prompt_tokens < (size_t)n_ctx);
}
else
// push the prompt into the sampling context (do not apply grammar)
for (auto & token : prompt_tokens)
{
const size_t ps = num_prompt_tokens;
std::fill(last_n_tokens.begin(), last_n_tokens.end() - ps, 0);
std::copy(prompt_tokens.begin(), prompt_tokens.end(), last_n_tokens.end() - ps);
llama_sampling_accept(ctx_sampling, ctx, token, false);
}
// compare the evaluated prompt with the new prompt
n_past = common_part(embd, prompt_tokens);
embd = prompt_tokens;
if (n_past == num_prompt_tokens)
{
// we have to evaluate at least 1 token to generate logits.
@@ -335,6 +317,7 @@ struct llama_server_context
n_past--;
}
// since #3228 we now have to manually manage the KV cache
llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
has_next_token = true;
@@ -352,38 +335,33 @@ struct llama_server_context
params.n_keep = std::min(n_ctx - 4, params.n_keep);
// if input prompt is too big, truncate like normal
if (num_prompt_tokens >= (size_t)n_ctx)
if (num_prompt_tokens >= (size_t) n_ctx)
{
const int n_left = (n_ctx - params.n_keep) / 2;
std::vector<llama_token> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep);
const int erased_blocks = (num_prompt_tokens - params.n_keep - n_left - 1) / n_left;
new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_left, prompt_tokens.end());
std::copy(prompt_tokens.end() - n_ctx, prompt_tokens.end(), last_n_tokens.begin());
truncatePrompt(prompt_tokens);
num_prompt_tokens = prompt_tokens.size();
truncated = true;
prompt_tokens = new_tokens;
GGML_ASSERT(num_prompt_tokens < (size_t)n_ctx);
}
else
// push the prompt into the sampling context (do not apply grammar)
for (auto & token : prompt_tokens)
{
const size_t ps = num_prompt_tokens;
std::fill(last_n_tokens.begin(), last_n_tokens.end() - ps, 0);
std::copy(prompt_tokens.begin(), prompt_tokens.end(), last_n_tokens.end() - ps);
llama_sampling_accept(ctx_sampling, ctx, token, false);
}
// compare the evaluated prompt with the new prompt
n_past = common_part(embd, prompt_tokens);
embd = prompt_tokens;
if (n_past == num_prompt_tokens)
{
// we have to evaluate at least 1 token to generate logits.
n_past--;
}
// since #3228 we now have to manually manage the KV cache
// since #3228 we now have to manually manage the KV cache
llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
has_next_token = true;
}
@@ -418,7 +396,6 @@ struct llama_server_context
n_past -= n_discard;
truncated = true;
}
bool tg = true;
@@ -433,7 +410,6 @@ struct llama_server_context
if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval, n_past, 0)))
{
has_next_token = false;
return result;
}
@@ -449,27 +425,24 @@ struct llama_server_context
{
// out of user input, sample next token
std::vector<llama_token_data> candidates;
candidates.reserve(llama_n_vocab(model));
result.tok = llama_sampling_sample(ctx_sampling, ctx, NULL);
result.tok = llama_sample_token(ctx, NULL, grammar, params, last_n_tokens, candidates);
llama_token_data_array cur_p = { ctx_sampling->cur.data(), ctx_sampling->cur.size(), false };
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
const int32_t n_probs = params.n_probs;
if (params.temp <= 0 && n_probs > 0)
const int32_t n_probs = params.sparams.n_probs;
if (params.sparams.temp <= 0 && n_probs > 0)
{
// For llama_sample_token_greedy we need to sort candidates
llama_sample_softmax(ctx, &candidates_p);
llama_sample_softmax(ctx, &cur_p);
}
for (size_t i = 0; i < std::min(candidates_p.size, (size_t)n_probs); ++i)
for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i)
{
result.probs.push_back({candidates_p.data[i].id, candidates_p.data[i].p});
result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p});
}
last_n_tokens.erase(last_n_tokens.begin());
last_n_tokens.push_back(result.tok);
llama_sampling_accept(ctx_sampling, ctx, result.tok, true);
if (tg) {
num_tokens_predicted++;
}
@@ -531,7 +504,7 @@ struct llama_server_context
const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(ctx, token_with_probs.tok);
generated_text += token_text;
if (params.n_probs > 0)
if (params.sparams.n_probs > 0)
{
generated_token_probs.push_back(token_with_probs);
}
@@ -583,7 +556,6 @@ struct llama_server_context
static const int n_embd = llama_n_embd(model);
if (!params.embedding)
{
printf("embedding disabled");
return std::vector<float>(n_embd, 0.0f);
}
const float *data = llama_get_embeddings(ctx);
@@ -599,30 +571,30 @@ static void parse_options_completion(bool streaming,const backend::PredictOption
llama.stream = streaming;
llama.params.n_predict = predict->tokens() == 0 ? -1 : predict->tokens();
llama.params.top_k = predict->topk();
llama.params.top_p = predict->topp();
llama.params.tfs_z = predict->tailfreesamplingz();
llama.params.typical_p = predict->typicalp();
llama.params.repeat_last_n = predict->repeat();
llama.params.temp = predict->temperature();
llama.params.repeat_penalty = predict->penalty();
llama.params.presence_penalty = predict->presencepenalty();
llama.params.frequency_penalty = predict->frequencypenalty();
llama.params.mirostat = predict->mirostat();
llama.params.mirostat_tau = predict->mirostattau();
llama.params.mirostat_eta = predict->mirostateta();
llama.params.penalize_nl = predict->penalizenl();
llama.params.sparams.top_k = predict->topk();
llama.params.sparams.top_p = predict->topp();
llama.params.sparams.tfs_z = predict->tailfreesamplingz();
llama.params.sparams.typical_p = predict->typicalp();
llama.params.sparams.penalty_last_n = predict->repeat();
llama.params.sparams.temp = predict->temperature();
llama.params.sparams.penalty_repeat = predict->penalty();
llama.params.sparams.penalty_present = predict->presencepenalty();
llama.params.sparams.penalty_freq = predict->frequencypenalty();
llama.params.sparams.mirostat = predict->mirostat();
llama.params.sparams.mirostat_tau = predict->mirostattau();
llama.params.sparams.mirostat_eta = predict->mirostateta();
llama.params.sparams.penalize_nl = predict->penalizenl();
llama.params.n_keep = predict->nkeep();
llama.params.seed = predict->seed();
llama.params.grammar = predict->grammar();
llama.params.sparams.grammar = predict->grammar();
// llama.params.n_probs = predict->
llama.params.prompt = predict->prompt();
llama.params.logit_bias.clear();
llama.params.sparams.logit_bias.clear();
if (predict->ignoreeos())
{
llama.params.logit_bias[llama_token_eos(llama.ctx)] = -INFINITY;
llama.params.sparams.logit_bias[llama_token_eos(llama.ctx)] = -INFINITY;
}
// const auto &logit_bias = body.find("logit_bias");
@@ -801,12 +773,7 @@ public:
parse_options_completion(false, request, llama);
if (!llama.loadGrammar())
{
//res.status = 400;
return Status::CANCELLED;
}
llama.initSampling();
llama.loadPrompt(request->prompt());
llama.beginCompletion();
size_t sent_count = 0;
@@ -848,7 +815,7 @@ public:
std::vector<completion_token_output> probs_output = {};
if (llama.params.n_probs > 0) {
if (llama.params.sparams.n_probs > 0) {
const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
@@ -879,12 +846,7 @@ public:
llama_reset_timings(llama.ctx);
parse_options_completion(false, request, llama);
if (!llama.loadGrammar())
{
//res.status = 400;
return Status::CANCELLED;
}
llama.initSampling();
llama.loadPrompt(request->prompt());
llama.beginCompletion();
@@ -915,7 +877,7 @@ public:
}
auto probs = llama.generated_token_probs;
if (llama.params.n_probs > 0 && llama.stopped_word) {
if (llama.params.sparams.n_probs > 0 && llama.stopped_word) {
const std::vector<llama_token> stop_word_toks = llama_tokenize(llama.ctx, llama.stopping_word, false);
probs = std::vector<completion_token_output>(llama.generated_token_probs.begin(), llama.generated_token_probs.end() - stop_word_toks.size());
}

View File

@@ -0,0 +1,25 @@
meta {
name: Generate image
type: http
seq: 1
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/v1/images/generations
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"prompt": "<positive prompt>|<negative prompt>",
"model": "model-name",
"step": 51,
"size": "1024x1024",
"image": ""
}
}

View File

@@ -15,10 +15,16 @@ headers {
}
body:json {
{
"model": "{{DEFAULT_MODEL}}",
"messages": [{"role": "user", "content": "How could one use friction to cook an egg?"}],
"max_tokens": 256,
"temperature": 0.2
{
"model": "{{DEFAULT_MODEL}}",
"messages": [
{
"role": "user",
"content": "How could one use friction to cook an egg?"
}
],
"max_tokens": 256,
"temperature": 0.2,
"grammar": ""
}
}

6
go.mod
View File

@@ -20,9 +20,9 @@ require (
github.com/mudler/go-ggllm.cpp v0.0.0-20230709223052-862477d16eef
github.com/mudler/go-processmanager v0.0.0-20230818213616-f204007f963c
github.com/mudler/go-stable-diffusion v0.0.0-20230605122230-d89260f598af
github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20231016205817-9a19c740ee84
github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20231022042237-c25dc5193530
github.com/onsi/ginkgo/v2 v2.13.0
github.com/onsi/gomega v1.28.0
github.com/onsi/gomega v1.29.0
github.com/otiai10/openaigo v1.6.0
github.com/phayes/freeport v0.0.0-20220201140144-74d24b5ae9f5
github.com/prometheus/client_golang v1.17.0
@@ -89,7 +89,7 @@ require (
github.com/go-audio/riff v1.0.0 // indirect
github.com/go-logr/logr v1.2.4 // indirect
github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 // indirect
github.com/google/go-cmp v0.5.9 // indirect
github.com/google/go-cmp v0.6.0 // indirect
github.com/google/pprof v0.0.0-20210407192527-94a9f03dee38 // indirect
github.com/hashicorp/errwrap v1.0.0 // indirect
github.com/klauspost/compress v1.16.7 // indirect

8
go.sum
View File

@@ -73,6 +73,8 @@ github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/
github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38=
github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/google/pprof v0.0.0-20210407192527-94a9f03dee38 h1:yAJXTCF9TqKcTiHJAE8dj7HMvPfh66eeA2JYW7eFpSE=
github.com/google/pprof v0.0.0-20210407192527-94a9f03dee38/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE=
@@ -135,6 +137,8 @@ github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20231013181651-22de3c
github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20231013181651-22de3c56bdd4/go.mod h1:4T3CHXyrt+7FQHXaxULZfPjHbD8/99WuDDJa0YVZARI=
github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20231016205817-9a19c740ee84 h1:AiFzd+M2Uxz67fdn4nCnKR70me5yf88rXhoqhvfRDak=
github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20231016205817-9a19c740ee84/go.mod h1:4T3CHXyrt+7FQHXaxULZfPjHbD8/99WuDDJa0YVZARI=
github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20231022042237-c25dc5193530 h1:YXMxHwHMB9jCBo2Yu5gz3mTB3T1TnZs/HmPLv15LUSA=
github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20231022042237-c25dc5193530/go.mod h1:4T3CHXyrt+7FQHXaxULZfPjHbD8/99WuDDJa0YVZARI=
github.com/nwaples/rardecode v1.1.0 h1:vSxaY8vQhOcVr4mm5e8XllHWTiM4JF507A0Katqw7MQ=
github.com/nwaples/rardecode v1.1.0/go.mod h1:5DzqNKiOdpKKBH87u8VlvAnPZMXcGRhxWkRpHbbfGS0=
github.com/nxadm/tail v1.4.4/go.mod h1:kenIhsEOeOJmVchQTgglprH7qJGnHDVpk1VPCcaMI8A=
@@ -151,6 +155,10 @@ github.com/onsi/gomega v1.10.1/go.mod h1:iN09h71vgCQne3DLsj+A5owkum+a2tYe+TOCB1y
github.com/onsi/gomega v1.16.0/go.mod h1:HnhC7FXeEQY45zxNK3PPoIUhzk/80Xly9PcubAlGdZY=
github.com/onsi/gomega v1.28.0 h1:i2rg/p9n/UqIDAMFUJ6qIUUMcsqOuUHgbpbu235Vr1c=
github.com/onsi/gomega v1.28.0/go.mod h1:A1H2JE76sI14WIP57LMKj7FVfCHx3g3BcZVjJG8bjX8=
github.com/onsi/gomega v1.28.1 h1:MijcGUbfYuznzK/5R4CPNoUP/9Xvuo20sXfEm6XxoTA=
github.com/onsi/gomega v1.28.1/go.mod h1:9sxs+SwGrKI0+PWe4Fxa9tFQQBG5xSsSbMXOI8PPpoQ=
github.com/onsi/gomega v1.29.0 h1:KIA/t2t5UBzoirT4H9tsML45GEbo3ouUnBHsCfD2tVg=
github.com/onsi/gomega v1.29.0/go.mod h1:9sxs+SwGrKI0+PWe4Fxa9tFQQBG5xSsSbMXOI8PPpoQ=
github.com/otiai10/mint v1.6.1 h1:kgbTJmOpp/0ce7hk3H8jiSuR0MXmpwWRfqUdKww17qg=
github.com/otiai10/mint v1.6.1/go.mod h1:MJm72SBthJjz8qhefc4z1PYEieWmy8Bku7CjcAqyUSM=
github.com/otiai10/openaigo v1.6.0 h1:YTQEbtDSvawETOB/Kmb/6JvuHdHH/eIpSQfHVufiwY8=

View File

@@ -8,6 +8,7 @@ import (
"github.com/go-skynet/LocalAI/pkg/utils"
"github.com/imdario/mergo"
"github.com/rs/zerolog/log"
"gopkg.in/yaml.v2"
)
@@ -166,7 +167,9 @@ func getGalleryModels(gallery Gallery, basePath string) ([]*GalleryModel, error)
return yaml.Unmarshal(d, &models)
})
if err != nil {
if yamlErr, ok := err.(*yaml.TypeError); ok {
log.Debug().Msgf("YAML errors: %s\n\nwreckage of models: %+v", strings.Join(yamlErr.Errors, "\n"), models)
}
return models, err
}