Mirror of https://github.com/mudler/LocalAI.git (synced 2026-02-03 11:13:31 -05:00)

Compare commits: v2.22.0...fix/closed (1 commit)

| Author | SHA1 | Date |
|---|---|---|
| | 83110891fd | |
.github/ci/modelslist.go (vendored, 7 changes)

@@ -6,7 +6,6 @@ import (
 	"io/ioutil"
 	"os"
 
-	"github.com/microcosm-cc/bluemonday"
 	"gopkg.in/yaml.v3"
 )
 

@@ -280,12 +279,6 @@ func main() {
 		return
 	}
 
-	// Ensure that all arbitrary text content is sanitized before display
-	for i, m := range models {
-		models[i].Name = bluemonday.StrictPolicy().Sanitize(m.Name)
-		models[i].Description = bluemonday.StrictPolicy().Sanitize(m.Description)
-	}
-
 	// render the template
 	data := struct {
 		Models []*GalleryModel
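The sanitization pass dropped in the hunk above is easy to reproduce in isolation. A minimal, self-contained sketch of what it does, using the real bluemonday API (the GalleryModel struct here is a simplified stand-in for the generator's type):

```go
package main

import (
	"fmt"

	"github.com/microcosm-cc/bluemonday"
)

// Simplified stand-in for the generator's GalleryModel type.
type GalleryModel struct {
	Name        string
	Description string
}

func main() {
	models := []GalleryModel{
		{Name: "demo<script>alert(1)</script>", Description: "<b>desc</b>"},
	}
	// StrictPolicy strips all HTML, so model names and descriptions
	// render as inert text in the generated page.
	p := bluemonday.StrictPolicy()
	for i, m := range models {
		models[i].Name = p.Sanitize(m.Name)
		models[i].Description = p.Sanitize(m.Description)
	}
	fmt.Println(models[0].Name) // "demo": the script markup is stripped
}
```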
.github/workflows/deploy-explorer.yaml (vendored, 4 changes)

@@ -33,7 +33,7 @@ jobs:
         run: |
           CGO_ENABLED=0 make build-api
       - name: rm
-        uses: appleboy/ssh-action@v1.1.0
+        uses: appleboy/ssh-action@v1.0.3
         with:
           host: ${{ secrets.EXPLORER_SSH_HOST }}
           username: ${{ secrets.EXPLORER_SSH_USERNAME }}

@@ -53,7 +53,7 @@ jobs:
           rm: true
           target: ./local-ai
       - name: restarting
-        uses: appleboy/ssh-action@v1.1.0
+        uses: appleboy/ssh-action@v1.0.3
        with:
          host: ${{ secrets.EXPLORER_SSH_HOST }}
          username: ${{ secrets.EXPLORER_SSH_USERNAME }}
Dockerfile (36 changes)

@@ -9,8 +9,6 @@ FROM ${BASE_IMAGE} AS requirements-core
 USER root
 
 ARG GO_VERSION=1.22.6
-ARG CMAKE_VERSION=3.26.4
-ARG CMAKE_FROM_SOURCE=false
 ARG TARGETARCH
 ARG TARGETVARIANT
 

@@ -23,25 +21,13 @@ RUN apt-get update && \
         build-essential \
         ccache \
         ca-certificates \
-        curl libssl-dev \
+        cmake \
+        curl \
         git \
         unzip upx-ucl && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
-# Install CMake (the version in 22.04 is too old)
-RUN <<EOT bash
-    if [ "${CMAKE_FROM_SOURCE}}" = "true" ]; then
-        curl -L -s https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz -o cmake.tar.gz && tar xvf cmake.tar.gz && cd cmake-${CMAKE_VERSION} && ./configure && make && make install
-    else
-        apt-get update && \
-        apt-get install -y \
-            cmake && \
-        apt-get clean && \
-        rm -rf /var/lib/apt/lists/*
-    fi
-EOT
-
 # Install Go
 RUN curl -L -s https://go.dev/dl/go${GO_VERSION}.linux-${TARGETARCH}.tar.gz | tar -C /usr/local -xz
 ENV PATH=$PATH:/root/go/bin:/usr/local/go/bin

@@ -202,8 +188,6 @@ FROM ${GRPC_BASE_IMAGE} AS grpc
 # This is a bit of a hack, but it's required in order to be able to effectively cache this layer in CI
 ARG GRPC_MAKEFLAGS="-j4 -Otarget"
 ARG GRPC_VERSION=v1.65.0
-ARG CMAKE_FROM_SOURCE=false
-ARG CMAKE_VERSION=3.26.4
 
 ENV MAKEFLAGS=${GRPC_MAKEFLAGS}
 

@@ -212,24 +196,12 @@ WORKDIR /build
 RUN apt-get update && \
     apt-get install -y --no-install-recommends \
         ca-certificates \
-        build-essential curl libssl-dev \
+        build-essential \
+        cmake \
         git && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
-# Install CMake (the version in 22.04 is too old)
-RUN <<EOT bash
-    if [ "${CMAKE_FROM_SOURCE}}" = "true" ]; then
-        curl -L -s https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz -o cmake.tar.gz && tar xvf cmake.tar.gz && cd cmake-${CMAKE_VERSION} && ./configure && make && make install
-    else
-        apt-get update && \
-        apt-get install -y \
-            cmake && \
-        apt-get clean && \
-        rm -rf /var/lib/apt/lists/*
-    fi
-EOT
-
 # We install GRPC to a different prefix here so that we can copy in only the build artifacts later
 # saves several hundred MB on the final docker image size vs copying in the entire GRPC source tree
 # and running make install in the target container
Makefile (18 changes)

@@ -8,7 +8,7 @@ DETECT_LIBS?=true
 # llama.cpp versions
 GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
 GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
-CPPLLAMA_VERSION?=96776405a17034dcfd53d3ddf5d142d34bdbb657
+CPPLLAMA_VERSION?=d5ed2b929d85bbd7dbeecb690880f07d9d7a6077
 
 # go-rwkv version
 RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp

@@ -16,7 +16,7 @@ RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6
 
 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
-WHISPER_CPP_VERSION?=fdbfb460ed546452a5d53611bba66d10d842e719
+WHISPER_CPP_VERSION?=ccc2547210e09e3a1785817383ab770389bb442b
 
 # bert.cpp version
 BERT_REPO?=https://github.com/go-skynet/go-bert.cpp

@@ -470,13 +470,13 @@ run-e2e-image:
 
 run-e2e-aio: protogen-go
 	@echo 'Running e2e AIO tests'
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts $(TEST_FLAKES) -v -r ./tests/e2e-aio
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts 5 -v -r ./tests/e2e-aio
 
 test-e2e:
 	@echo 'Running e2e tests'
 	BUILD_TYPE=$(BUILD_TYPE) \
 	LOCALAI_API=http://$(E2E_BRIDGE_IP):5390/v1 \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts $(TEST_FLAKES) -v -r ./tests/e2e
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts 5 -v -r ./tests/e2e
 
 teardown-e2e:
 	rm -rf $(TEST_DIR) || true

@@ -484,24 +484,24 @@ teardown-e2e:
 
 test-llama: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts 5 -v -r $(TEST_PATHS)
 
 test-llama-gguf: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts 5 -v -r $(TEST_PATHS)
 
 test-tts: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="tts" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="tts" --flake-attempts 1 -v -r $(TEST_PATHS)
 
 test-stablediffusion: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stablediffusion" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stablediffusion" --flake-attempts 1 -v -r $(TEST_PATHS)
 
 test-stores: backend-assets/grpc/local-store
 	mkdir -p tests/integration/backend-assets/grpc
 	cp -f backend-assets/grpc/local-store tests/integration/backend-assets/grpc/
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stores" --flake-attempts $(TEST_FLAKES) -v -r tests/integration
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stores" --flake-attempts 1 -v -r tests/integration
 
 test-container:
 	docker build --target requirements -t local-ai-test-container .
(C++ source; the file header row was lost in extraction. From the content, these hunks appear to be LocalAI's llama.cpp gRPC server backend.)

@@ -113,7 +113,7 @@ static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
     std::string ret;
     for (; begin != end; ++begin)
     {
-        ret += common_token_to_piece(ctx, *begin);
+        ret += llama_token_to_piece(ctx, *begin);
     }
     return ret;
 }

@@ -121,7 +121,7 @@ static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
 // format incomplete utf-8 multibyte character for output
 static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
 {
-    std::string out = token == -1 ? "" : common_token_to_piece(ctx, token);
+    std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
     // if the size is 1 and first bit is 1, meaning it's a partial character
     // (size > 1 meaning it's already a known token)
     if (out.size() == 1 && (out[0] & 0x80) == 0x80)

@@ -203,8 +203,8 @@ struct llama_client_slot
     std::string stopping_word;
 
     // sampling
-    struct common_sampler_params sparams;
-    common_sampler *ctx_sampling = nullptr;
+    struct gpt_sampler_params sparams;
+    gpt_sampler *ctx_sampling = nullptr;
 
     int32_t ga_i = 0;   // group-attention state
     int32_t ga_n = 1;   // group-attention factor

@@ -257,7 +257,7 @@ struct llama_client_slot
         images.clear();
     }
 
-    bool has_budget(common_params &global_params) {
+    bool has_budget(gpt_params &global_params) {
         if (params.n_predict == -1 && global_params.n_predict == -1)
         {
             return true; // limitless

@@ -398,7 +398,7 @@ struct llama_server_context
 
     clip_ctx *clp_ctx = nullptr;
 
-    common_params params;
+    gpt_params params;
 
     llama_batch batch;
 

@@ -441,7 +441,7 @@ struct llama_server_context
         }
     }
 
-    bool load_model(const common_params &params_)
+    bool load_model(const gpt_params &params_)
     {
         params = params_;
         if (!params.mmproj.empty()) {

@@ -458,9 +458,9 @@ struct llama_server_context
             }
         }
 
-        common_init_result common_init = common_init_from_params(params);
-        model = common_init.model;
-        ctx = common_init.context;
+        llama_init_result llama_init = llama_init_from_gpt_params(params);
+        model = llama_init.model;
+        ctx = llama_init.context;
         if (model == nullptr)
         {
             LOG_ERR("unable to load model: %s", params.model.c_str());

@@ -578,12 +578,12 @@ struct llama_server_context
                 std::vector<llama_token> p;
                 if (first)
                 {
-                    p = common_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
+                    p = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
                     first = false;
                 }
                 else
                 {
-                    p = common_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
+                    p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
                 }
                 prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
             }

@@ -600,7 +600,7 @@ struct llama_server_context
         else
         {
             auto s = json_prompt.template get<std::string>();
-            prompt_tokens = common_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
+            prompt_tokens = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
         }
 
         return prompt_tokens;

@@ -629,7 +629,7 @@ struct llama_server_context
 
     bool launch_slot_with_data(llama_client_slot* &slot, json data) {
         slot_params default_params;
-        common_sampler_params default_sparams;
+        gpt_sampler_params default_sparams;
 
         slot->params.stream = json_value(data, "stream", false);
         slot->params.cache_prompt = json_value(data, "cache_prompt", false);

@@ -769,7 +769,7 @@ struct llama_server_context
                     }
                     else if (el[0].is_string())
                     {
-                        auto toks = common_tokenize(model, el[0].get<std::string>(), false);
+                        auto toks = llama_tokenize(model, el[0].get<std::string>(), false);
                         for (auto tok : toks)
                         {
                             slot->sparams.logit_bias.push_back({tok, bias});

@@ -801,7 +801,7 @@ struct llama_server_context
                     sampler_names.emplace_back(name);
                 }
             }
-            slot->sparams.samplers = common_sampler_types_from_names(sampler_names, false);
+            slot->sparams.samplers = gpt_sampler_types_from_names(sampler_names, false);
         }
         else
         {

@@ -885,9 +885,9 @@ struct llama_server_context
 
         if (slot->ctx_sampling != nullptr)
         {
-            common_sampler_free(slot->ctx_sampling);
+            gpt_sampler_free(slot->ctx_sampling);
         }
-        slot->ctx_sampling = common_sampler_init(model, slot->sparams);
+        slot->ctx_sampling = gpt_sampler_init(model, slot->sparams);
         //llama_set_rng_seed(ctx, slot->params.seed);
         slot->command = LOAD_PROMPT;

@@ -914,13 +914,13 @@ struct llama_server_context
         system_tokens.clear();
 
         if (!system_prompt.empty()) {
-            system_tokens = common_tokenize(ctx, system_prompt, add_bos_token);
+            system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);
 
-            common_batch_clear(batch);
+            llama_batch_clear(batch);
 
             for (int i = 0; i < (int)system_tokens.size(); ++i)
             {
-                common_batch_add(batch, system_tokens[i], i, { 0 }, false);
+                llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
             }
 
             for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += params.n_batch)

@@ -1009,7 +1009,7 @@ struct llama_server_context
 
    bool process_token(completion_token_output &result, llama_client_slot &slot) {
        // remember which tokens were sampled - used for repetition penalties during sampling
-        const std::string token_str = common_token_to_piece(ctx, result.tok);
+        const std::string token_str = llama_token_to_piece(ctx, result.tok);
        slot.sampled = result.tok;
 
        // search stop word and delete it

@@ -1160,7 +1160,7 @@ struct llama_server_context
         samplers.reserve(slot.sparams.samplers.size());
         for (const auto & sampler : slot.sparams.samplers)
         {
-            samplers.emplace_back(common_sampler_type_to_str(sampler));
+            samplers.emplace_back(gpt_sampler_type_to_str(sampler));
         }
 
         return json {

@@ -1216,7 +1216,7 @@ struct llama_server_context
         if (slot.sparams.n_probs > 0)
         {
             std::vector<completion_token_output> probs_output = {};
-            const std::vector<llama_token> to_send_toks = common_tokenize(ctx, tkn.text_to_send, false);
+            const std::vector<llama_token> to_send_toks = llama_tokenize(ctx, tkn.text_to_send, false);
             size_t probs_pos      = std::min(slot.sent_token_probs_index,                       slot.generated_token_probs.size());
             size_t probs_stop_pos = std::min(slot.sent_token_probs_index + to_send_toks.size(), slot.generated_token_probs.size());
             if (probs_pos < probs_stop_pos)

@@ -1268,7 +1268,7 @@ struct llama_server_context
         std::vector<completion_token_output> probs = {};
         if (!slot.params.stream && slot.stopped_word)
         {
-            const std::vector<llama_token> stop_word_toks = common_tokenize(ctx, slot.stopping_word, false);
+            const std::vector<llama_token> stop_word_toks = llama_tokenize(ctx, slot.stopping_word, false);
             probs = std::vector<completion_token_output>(slot.generated_token_probs.begin(), slot.generated_token_probs.end() - stop_word_toks.size());
         }
         else

@@ -1408,7 +1408,7 @@ struct llama_server_context
                 }
                 image_idx++;
 
-                common_batch_clear(batch);
+                llama_batch_clear(batch);
 
                 // append prefix of next image
                 const auto json_prompt = (image_idx >= (int) slot.images.size()) ?

@@ -1418,7 +1418,7 @@ struct llama_server_context
                 std::vector<llama_token> append_tokens = tokenize(json_prompt, false); // has next image
                 for (int i = 0; i < (int) append_tokens.size(); ++i)
                 {
-                    common_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true);
+                    llama_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true);
                     slot.n_past += 1;
                 }
             }

@@ -1550,7 +1550,7 @@ struct llama_server_context
             update_system_prompt();
         }
 
-        common_batch_clear(batch);
+        llama_batch_clear(batch);
 
         if (all_slots_are_idle)
         {

@@ -1628,7 +1628,7 @@ struct llama_server_context
 
             // TODO: we always have to take into account the "system_tokens"
            //       this is not great and needs to be improved somehow
-            common_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true);
+            llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true);
             slot.n_past += 1;
         }
 

@@ -1722,7 +1722,7 @@ struct llama_server_context
 
                     if (!slot.params.cache_prompt)
                     {
-                        common_sampler_reset(slot.ctx_sampling);
+                        gpt_sampler_reset(slot.ctx_sampling);
 
                         slot.n_past    = 0;
                         slot.n_past_se = 0;

@@ -1734,7 +1734,7 @@ struct llama_server_context
                         // push the prompt into the sampling context (do not apply grammar)
                         for (auto &token : prompt_tokens)
                         {
-                            common_sampler_accept(slot.ctx_sampling, token, false);
+                            gpt_sampler_accept(slot.ctx_sampling, token, false);
                         }
 
                         slot.n_past = common_part(slot.cache_tokens, prompt_tokens);

@@ -1826,7 +1826,7 @@ struct llama_server_context
                             ga_i += ga_w/ga_n;
                         }
                     }
-                    common_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, {slot.id }, false);
+                    llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, {slot.id }, false);
                     slot_npast++;
                 }
 

@@ -1943,9 +1943,9 @@ struct llama_server_context
             }
 
             completion_token_output result;
-            const llama_token id = common_sampler_sample(slot.ctx_sampling, ctx, slot.i_batch - i);
+            const llama_token id = gpt_sampler_sample(slot.ctx_sampling, ctx, slot.i_batch - i);
 
-            common_sampler_accept(slot.ctx_sampling, id, true);
+            gpt_sampler_accept(slot.ctx_sampling, id, true);
 
             slot.n_decoded += 1;
             if (slot.n_decoded == 1)

@@ -1956,7 +1956,7 @@ struct llama_server_context
             }
 
             result.tok = id;
-            const auto * cur_p = common_sampler_get_candidates(slot.ctx_sampling);
+            const auto * cur_p = gpt_sampler_get_candidates(slot.ctx_sampling);
 
             for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) {
                 result.probs.push_back({

@@ -2009,7 +2009,7 @@ static json format_partial_response(
 struct token_translator
 {
     llama_context * ctx;
-    std::string operator()(llama_token tok)                    const { return common_token_to_piece(ctx, tok); }
+    std::string operator()(llama_token tok)                    const { return llama_token_to_piece(ctx, tok); }
     std::string operator()(const completion_token_output &cto) const { return (*this)(cto.tok); }
 };
 

@@ -2203,7 +2203,7 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
 // }
 
 static void params_parse(const backend::ModelOptions* request,
-                                common_params & params) {
+                                gpt_params & params) {
    
     // this is comparable to: https://github.com/ggerganov/llama.cpp/blob/d9b33fe95bd257b36c84ee5769cc048230067d6f/examples/server/server.cpp#L1809
 

@@ -2311,7 +2311,7 @@ public:
 
     grpc::Status LoadModel(ServerContext* context, const backend::ModelOptions* request, backend::Result* result) {
         // Implement LoadModel RPC
-        common_params params;
+        gpt_params params;
         params_parse(request, params);
 
         llama_backend_init();
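Taken together, the C++ hunks above are one mechanical API rename that tracks the pinned llama.cpp revision (see the CPPLLAMA_VERSION change in the Makefile). Every pair maps one-to-one; nothing else in the server logic changes:

| Removed (newer llama.cpp) | Added (older llama.cpp) |
|---|---|
| `common_params` | `gpt_params` |
| `common_sampler`, `common_sampler_params` | `gpt_sampler`, `gpt_sampler_params` |
| `common_sampler_init/free/reset/accept/sample` | `gpt_sampler_init/free/reset/accept/sample` |
| `common_sampler_get_candidates`, `common_sampler_type_to_str`, `common_sampler_types_from_names` | `gpt_sampler_get_candidates`, `gpt_sampler_type_to_str`, `gpt_sampler_types_from_names` |
| `common_tokenize`, `common_token_to_piece` | `llama_tokenize`, `llama_token_to_piece` |
| `common_batch_clear`, `common_batch_add` | `llama_batch_clear`, `llama_batch_add` |
| `common_init_result` / `common_init_from_params` | `llama_init_result` / `llama_init_from_gpt_params` |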
(The same change appears in two backend test files; their file headers were lost in extraction.)

@@ -19,7 +19,7 @@ class TestBackendServicer(unittest.TestCase):
         This method sets up the gRPC service by starting the server
         """
         self.service = subprocess.Popen(["python3", "backend.py", "--addr", "localhost:50051"])
-        time.sleep(30)
+        time.sleep(10)
 
     def tearDown(self) -> None:
         """

@@ -19,7 +19,7 @@ class TestBackendServicer(unittest.TestCase):
         This method sets up the gRPC service by starting the server
         """
         self.service = subprocess.Popen(["python3", "backend.py", "--addr", "localhost:50051"])
-        time.sleep(30)
+        time.sleep(10)
 
     def tearDown(self) -> None:
         """
(Python backend source; file header lost in extraction.)

@@ -72,12 +72,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
         Returns:
             A Result object that contains the result of the LoadModel operation.
         """
-        model_name = request.Model
-
-        # Check to see if the Model exists in the filesystem already.
-        if os.path.exists(request.ModelFile):
-            model_name = request.ModelFile
 
         compute = torch.float16
         if request.F16Memory == True:
(Shell install script, apparently the vLLM backend's; file header lost in extraction.)

@@ -13,9 +13,7 @@ if [ "x${BUILD_PROFILE}" == "xintel" ]; then
 	EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
 fi
 
-# We don't embed this into the images as it is a large dependency and not always needed.
-# Besides, the speed inference are not actually usable in the current state for production use-cases.
-if [ "x${BUILD_TYPE}" == "x" ] && [ "x${FROM_SOURCE}" == "xtrue" ]; then
+if [ "x${BUILD_TYPE}" == "x" ]; then
 	ensureVenv
 	# https://docs.vllm.ai/en/v0.6.1/getting_started/cpu-installation.html
 	if [ ! -d vllm ]; then
(Go test file for the core HTTP API; file header lost in extraction.)

@@ -12,7 +12,6 @@ import (
 	"os"
 	"path/filepath"
 	"runtime"
-	"strings"
 
 	"github.com/mudler/LocalAI/core/config"
 	. "github.com/mudler/LocalAI/core/http"

@@ -951,7 +950,7 @@ var _ = Describe("API test", func() {
 				openai.ChatCompletionRequest{Model: "rwkv_test", Messages: []openai.ChatCompletionMessage{{Content: "Can you count up to five?", Role: "user"}}})
 			Expect(err).ToNot(HaveOccurred())
 			Expect(len(resp.Choices) > 0).To(BeTrue())
-			Expect(strings.ToLower(resp.Choices[0].Message.Content)).To(Or(ContainSubstring("sure"), ContainSubstring("five")))
+			Expect(resp.Choices[0].Message.Content).To(Or(ContainSubstring("Sure"), ContainSubstring("five")))
 
 			stream, err := client.CreateChatCompletionStream(context.TODO(), openai.ChatCompletionRequest{Model: "rwkv_test", Messages: []openai.ChatCompletionMessage{{Content: "Can you count up to five?", Role: "user"}}})
 			Expect(err).ToNot(HaveOccurred())

@@ -970,7 +969,7 @@ var _ = Describe("API test", func() {
 				tokens++
 			}
 			Expect(text).ToNot(BeEmpty())
-			Expect(strings.ToLower(text)).To(Or(ContainSubstring("sure"), ContainSubstring("five")))
+			Expect(text).To(Or(ContainSubstring("Sure"), ContainSubstring("five")))
 
 			Expect(tokens).ToNot(Or(Equal(1), Equal(0)))
 		})
(Go source, HTML element builders for the model gallery UI; file header lost in extraction.)

@@ -6,7 +6,6 @@ import (
 
 	"github.com/chasefleming/elem-go"
 	"github.com/chasefleming/elem-go/attrs"
-	"github.com/microcosm-cc/bluemonday"
 	"github.com/mudler/LocalAI/core/gallery"
 	"github.com/mudler/LocalAI/core/p2p"
 	"github.com/mudler/LocalAI/core/services"

@@ -42,7 +41,7 @@ func DoneProgress(galleryID, text string, showDelete bool) string {
 				"tabindex":  "-1",
 				"autofocus": "",
 			},
-			elem.Text(bluemonday.StrictPolicy().Sanitize(text)),
+			elem.Text(text),
 		),
 		elem.If(showDelete, deleteButton(galleryID, modelName), reInstallButton(galleryID)),
 	).Render()

@@ -58,7 +57,7 @@ func ErrorProgress(err, galleryName string) string {
 				"tabindex":  "-1",
 				"autofocus": "",
 			},
-			elem.Text("Error "+bluemonday.StrictPolicy().Sanitize(err)),
+			elem.Text("Error "+err),
 		),
 		installButton(galleryName),
 	).Render()

@@ -171,7 +170,7 @@ func P2PNodeBoxes(nodes []p2p.NodeData) string {
 				attrs.Props{
 					"class": "text-gray-200 font-semibold ml-2 mr-1",
 				},
-				elem.Text(bluemonday.StrictPolicy().Sanitize(n.ID)),
+				elem.Text(n.ID),
 			),
 			elem.Text("Status: "),
 			elem.If(

@@ -228,7 +227,7 @@ func StartProgressBar(uid, progress, text string) string {
 				"tabindex":  "-1",
 				"autofocus": "",
 			},
-			elem.Text(bluemonday.StrictPolicy().Sanitize(text)), //Perhaps overly defensive
+			elem.Text(text),
 			elem.Div(attrs.Props{
 				"hx-get":     "/browse/job/progress/" + uid,
 				"hx-trigger": "every 600ms",

@@ -250,7 +249,9 @@ func cardSpan(text, icon string) elem.Node {
 			"class": icon + " pr-2",
 		}),
-		elem.Text(bluemonday.StrictPolicy().Sanitize(text)),
+		elem.Text(text),
+
+		//elem.Text(text),
 	)
 }

@@ -284,9 +285,11 @@ func searchableElement(text, icon string) elem.Node {
 				elem.I(attrs.Props{
 					"class": icon + " pr-2",
 				}),
-				elem.Text(bluemonday.StrictPolicy().Sanitize(text)),
+				elem.Text(text),
 			),
 		),
+
+		//elem.Text(text),
 	)
 }

@@ -300,7 +303,7 @@ func link(text, url string) elem.Node {
 		elem.I(attrs.Props{
 			"class": "fas fa-link pr-2",
 		}),
-		elem.Text(bluemonday.StrictPolicy().Sanitize(text)),
+		elem.Text(text),
 	)
 }
 func installButton(galleryName string) elem.Node {

@@ -384,13 +387,13 @@ func ListModels(models []*gallery.GalleryModel, processTracker ProcessTracker, g
 			attrs.Props{
 				"class": "mb-2 text-xl font-bold leading-tight",
 			},
-			elem.Text(bluemonday.StrictPolicy().Sanitize(m.Name)),
+			elem.Text(m.Name),
 		),
 		elem.P(
 			attrs.Props{
 				"class": "mb-4 text-sm [&:not(:hover)]:truncate text-base",
 			},
-			elem.Text(bluemonday.StrictPolicy().Sanitize(m.Description)),
+			elem.Text(m.Description),
 		),
 	)
 }
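The removed lines in each hunk above follow one pattern: sanitize a user-influenced string at the moment it becomes an HTML node. A small self-contained sketch of that sanitize-at-render idea, using the real elem-go and bluemonday APIs (this cardSpan is a simplified stand-in for the helper in the diff, not the project's actual code):

```go
package main

import (
	"fmt"

	"github.com/chasefleming/elem-go"
	"github.com/chasefleming/elem-go/attrs"
	"github.com/microcosm-cc/bluemonday"
)

// cardSpan mirrors the diff's pattern: text that may contain
// user-supplied content is sanitized right where it is turned
// into markup, not earlier in the pipeline.
func cardSpan(text, icon string) *elem.Element {
	return elem.Span(
		attrs.Props{"class": "inline-block rounded-full px-3 py-1"},
		elem.I(attrs.Props{"class": icon + " pr-2"}),
		elem.Text(bluemonday.StrictPolicy().Sanitize(text)),
	)
}

func main() {
	// The <img onerror=...> payload is stripped before rendering.
	fmt.Println(cardSpan(`model<img src=x onerror=alert(1)>`, "fas fa-brain").Render())
}
```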
(Go source, core HTTP welcome endpoint; file header lost in extraction.)

@@ -13,10 +13,15 @@ import (
 func WelcomeEndpoint(appConfig *config.ApplicationConfig,
 	cl *config.BackendConfigLoader, ml *model.ModelLoader, modelStatus func() (map[string]string, map[string]string)) func(*fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
+		models, _ := services.ListModels(cl, ml, config.NoFilterFn, services.SKIP_IF_CONFIGURED)
 		backendConfigs := cl.GetAllBackendConfigs()
 
 		galleryConfigs := map[string]*gallery.Config{}
+		modelsWithBackendConfig := map[string]interface{}{}
 
 		for _, m := range backendConfigs {
+			modelsWithBackendConfig[m.Name] = nil
+
 			cfg, err := gallery.GetLocalModelConfiguration(ml.ModelPath, m.Name)
 			if err != nil {
 				continue

@@ -24,15 +29,13 @@ func WelcomeEndpoint(appConfig *config.ApplicationConfig,
 			galleryConfigs[m.Name] = cfg
 		}
 
-		modelsWithoutConfig, _ := services.ListModels(cl, ml, config.NoFilterFn, services.LOOSE_ONLY)
-
 		// Get model statuses to display in the UI the operation in progress
 		processingModels, taskTypes := modelStatus()
 
 		summary := fiber.Map{
 			"Title":         "LocalAI API - " + internal.PrintableVersion(),
 			"Version":       internal.PrintableVersion(),
-			"Models":        modelsWithoutConfig,
+			"Models":        models,
 			"ModelsConfig":  backendConfigs,
 			"GalleryConfig": galleryConfigs,
 			"IsP2PEnabled":  p2p.IsP2PEnabled(),
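The added modelsWithBackendConfig lines use a common Go idiom: a map with nil values acting as a set, where only key presence matters. A tiny runnable sketch of the idiom:

```go
package main

import "fmt"

func main() {
	// A map with nil interface values works as a cheap set:
	// presence of the key is the only information stored.
	modelsWithBackendConfig := map[string]interface{}{}
	for _, name := range []string{"gpt-4", "phi-2"} {
		modelsWithBackendConfig[name] = nil
	}

	// Membership test: the second return value reports presence,
	// so a nil value is still a hit.
	_, configured := modelsWithBackendConfig["phi-2"]
	fmt.Println(configured) // true
}
```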
(Go source, OpenAI assistants endpoints; file header lost in extraction.)

@@ -10,7 +10,6 @@ import (
 	"time"
 
 	"github.com/gofiber/fiber/v2"
-	"github.com/microcosm-cc/bluemonday"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/core/services"

@@ -84,7 +83,7 @@ func CreateAssistantEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoad
 
 		if !modelExists(cl, ml, request.Model) {
 			log.Warn().Msgf("Model: %s was not found in list of models.", request.Model)
-			return c.Status(fiber.StatusBadRequest).SendString(bluemonday.StrictPolicy().Sanitize(fmt.Sprintf("Model %q not found", request.Model)))
+			return c.Status(fiber.StatusBadRequest).SendString("Model " + request.Model + " not found")
 		}
 
 		if request.Tools == nil {

@@ -148,7 +147,7 @@ func ListAssistantsEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoade
 		// Convert string limit to integer
 		limit, err := strconv.Atoi(limitQuery)
 		if err != nil {
-			return c.Status(http.StatusBadRequest).SendString(bluemonday.StrictPolicy().Sanitize(fmt.Sprintf("Invalid limit query value: %s", limitQuery)))
+			return c.Status(http.StatusBadRequest).SendString(fmt.Sprintf("Invalid limit query value: %s", limitQuery))
 		}
 
 		// Sort assistants

@@ -289,7 +288,7 @@ func GetAssistantEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader,
 			}
 		}
 
-		return c.Status(fiber.StatusNotFound).SendString(bluemonday.StrictPolicy().Sanitize(fmt.Sprintf("Unable to find assistant with id: %s", assistantID)))
+		return c.Status(fiber.StatusNotFound).SendString(fmt.Sprintf("Unable to find assistant with id: %s", assistantID))
 	}
 }

@@ -338,11 +337,11 @@ func CreateAssistantFileEndpoint(cl *config.BackendConfigLoader, ml *model.Model
 				}
 			}
 
-				return c.Status(fiber.StatusNotFound).SendString(bluemonday.StrictPolicy().Sanitize(fmt.Sprintf("Unable to find file_id: %s", request.FileID)))
+				return c.Status(fiber.StatusNotFound).SendString(fmt.Sprintf("Unable to find file_id: %s", request.FileID))
 			}
 		}
 
-		return c.Status(fiber.StatusNotFound).SendString(bluemonday.StrictPolicy().Sanitize(fmt.Sprintf("Unable to find %q", assistantID)))
+		return c.Status(fiber.StatusNotFound).SendString(fmt.Sprintf("Unable to find %q", assistantID))
 	}
 }

@@ -443,7 +442,7 @@ func ModifyAssistantEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoad
 				return c.Status(fiber.StatusOK).JSON(newAssistant)
 			}
 		}
-		return c.Status(fiber.StatusNotFound).SendString(bluemonday.StrictPolicy().Sanitize(fmt.Sprintf("Unable to find assistant with id: %s", assistantID)))
+		return c.Status(fiber.StatusNotFound).SendString(fmt.Sprintf("Unable to find assistant with id: %s", assistantID))
 	}
 }

@@ -514,9 +513,9 @@ func GetAssistantFileEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoa
 				if assistantFile.ID == fileId {
 					return c.Status(fiber.StatusOK).JSON(assistantFile)
 				}
-				return c.Status(fiber.StatusNotFound).SendString(bluemonday.StrictPolicy().Sanitize(fmt.Sprintf("Unable to find assistant file with file_id: %s", fileId)))
+				return c.Status(fiber.StatusNotFound).SendString(fmt.Sprintf("Unable to find assistant file with file_id: %s", fileId))
 			}
 		}
-		return c.Status(fiber.StatusNotFound).SendString(bluemonday.StrictPolicy().Sanitize(fmt.Sprintf("Unable to find assistant file with assistant_id: %s", assistantID)))
+		return c.Status(fiber.StatusNotFound).SendString(fmt.Sprintf("Unable to find assistant file with assistant_id: %s", assistantID))
 	}
 }
(Go source, OpenAI files endpoints; file header lost in extraction.)

@@ -8,7 +8,6 @@ import (
 	"sync/atomic"
 	"time"
 
-	"github.com/microcosm-cc/bluemonday"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/schema"

@@ -50,7 +49,7 @@ func UploadFilesEndpoint(cm *config.BackendConfigLoader, appConfig *config.Appli
 
 		err = c.SaveFile(file, savePath)
 		if err != nil {
-			return c.Status(fiber.StatusInternalServerError).SendString("Failed to save file: " + bluemonday.StrictPolicy().Sanitize(err.Error()))
+			return c.Status(fiber.StatusInternalServerError).SendString("Failed to save file: " + err.Error())
 		}
 
 		f := schema.File{

@@ -122,7 +121,7 @@ func GetFilesEndpoint(cm *config.BackendConfigLoader, appConfig *config.Applicat
 	return func(c *fiber.Ctx) error {
 		file, err := getFileFromRequest(c)
 		if err != nil {
-			return c.Status(fiber.StatusInternalServerError).SendString(bluemonday.StrictPolicy().Sanitize(err.Error()))
+			return c.Status(fiber.StatusInternalServerError).SendString(err.Error())
 		}
 
 		return c.JSON(file)

@@ -144,14 +143,14 @@ func DeleteFilesEndpoint(cm *config.BackendConfigLoader, appConfig *config.Appli
 	return func(c *fiber.Ctx) error {
 		file, err := getFileFromRequest(c)
 		if err != nil {
-			return c.Status(fiber.StatusInternalServerError).SendString(bluemonday.StrictPolicy().Sanitize(err.Error()))
+			return c.Status(fiber.StatusInternalServerError).SendString(err.Error())
 		}
 
 		err = os.Remove(filepath.Join(appConfig.UploadDir, file.Filename))
 		if err != nil {
 			// If the file doesn't exist then we should just continue to remove it
 			if !errors.Is(err, os.ErrNotExist) {
-				return c.Status(fiber.StatusInternalServerError).SendString(bluemonday.StrictPolicy().Sanitize(fmt.Sprintf("Unable to delete file: %s, %v", file.Filename, err)))
+				return c.Status(fiber.StatusInternalServerError).SendString(fmt.Sprintf("Unable to delete file: %s, %v", file.Filename, err))
 			}
 		}

@@ -181,12 +180,12 @@ func GetFilesContentsEndpoint(cm *config.BackendConfigLoader, appConfig *config.
 	return func(c *fiber.Ctx) error {
 		file, err := getFileFromRequest(c)
 		if err != nil {
-			return c.Status(fiber.StatusInternalServerError).SendString(bluemonday.StrictPolicy().Sanitize(err.Error()))
+			return c.Status(fiber.StatusInternalServerError).SendString(err.Error())
 		}
 
 		fileContents, err := os.ReadFile(filepath.Join(appConfig.UploadDir, file.Filename))
 		if err != nil {
-			return c.Status(fiber.StatusInternalServerError).SendString(bluemonday.StrictPolicy().Sanitize(err.Error()))
+			return c.Status(fiber.StatusInternalServerError).SendString(err.Error())
 		}
 
 		return c.Send(fileContents)
(Go source, API-key auth middleware; file header lost in extraction.)

@@ -7,7 +7,6 @@ import (
 	"github.com/dave-gray101/v2keyauth"
 	"github.com/gofiber/fiber/v2"
 	"github.com/gofiber/fiber/v2/middleware/keyauth"
-	"github.com/microcosm-cc/bluemonday"
 	"github.com/mudler/LocalAI/core/config"
 )

@@ -39,7 +38,7 @@ func getApiKeyErrorHandler(applicationConfig *config.ApplicationConfig) fiber.Er
 		if applicationConfig.OpaqueErrors {
 			return ctx.SendStatus(403)
 		}
-		return ctx.Status(403).SendString(bluemonday.StrictPolicy().Sanitize(err.Error()))
+		return ctx.Status(403).SendString(err.Error())
 	}
 	if applicationConfig.OpaqueErrors {
 		return ctx.SendStatus(500)
(Go source, UI routes; file header lost in extraction.)

@@ -6,7 +6,6 @@ import (
 	"sort"
 	"strings"
 
-	"github.com/microcosm-cc/bluemonday"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/gallery"
 	"github.com/mudler/LocalAI/core/http/elements"

@@ -172,7 +171,7 @@ func RegisterUIRoutes(app *fiber.App,
 			Search string `form:"search"`
 		}{}
 		if err := c.BodyParser(&form); err != nil {
-			return c.Status(fiber.StatusBadRequest).SendString(bluemonday.StrictPolicy().Sanitize(err.Error()))
+			return c.Status(fiber.StatusBadRequest).SendString(err.Error())
 		}
 
 		models, _ := gallery.AvailableGalleryModels(appConfig.Galleries, appConfig.ModelPath)
(Go source, core/services model listing; file header lost in extraction.)

@@ -8,10 +8,10 @@ import (
 type LooseFilePolicy int
 
 const (
-	LOOSE_ONLY LooseFilePolicy = iota
-	SKIP_IF_CONFIGURED
+	SKIP_IF_CONFIGURED LooseFilePolicy = iota
 	SKIP_ALWAYS
 	ALWAYS_INCLUDE
+	LOOSE_ONLY
 )
 
 func ListModels(bcl *config.BackendConfigLoader, ml *model.ModelLoader, filter config.BackendConfigFilterFn, looseFilePolicy LooseFilePolicy) ([]string, error) {

@@ -21,13 +21,11 @@ func ListModels(bcl *config.BackendConfigLoader, ml *model.ModelLoader, filter c
 	dataModels := []string{}
 
 	// Start with known configurations
-	for _, c := range bcl.GetBackendConfigsByFilter(filter) {
-		// Is this better than looseFilePolicy <= SKIP_IF_CONFIGURED ? less performant but more readable?
-		if (looseFilePolicy == SKIP_IF_CONFIGURED) || (looseFilePolicy == LOOSE_ONLY) {
-			skipMap[c.Model] = nil
-		}
-		if looseFilePolicy != LOOSE_ONLY {
+	if looseFilePolicy != LOOSE_ONLY {
+		for _, c := range bcl.GetBackendConfigsByFilter(filter) {
+			if looseFilePolicy == SKIP_IF_CONFIGURED {
+				skipMap[c.Model] = nil
+			}
 			dataModels = append(dataModels, c.Name)
 		}
 	}
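For readers following the policy change, here is a self-contained sketch of the LooseFilePolicy semantics implied by the hunks above. The constants and the configured-model loop mirror the diff; the loose-file half, and the exact meanings of SKIP_ALWAYS and ALWAYS_INCLUDE, are assumptions inferred from the names rather than code shown here:

```go
package sketch

type LooseFilePolicy int

const (
	SKIP_IF_CONFIGURED LooseFilePolicy = iota
	SKIP_ALWAYS
	ALWAYS_INCLUDE
	LOOSE_ONLY
)

// backendConfig is a stand-in for the loader's config type.
type backendConfig struct{ Name, Model string }

// listModels walks configured models first, then loose model files.
func listModels(configured []backendConfig, looseFiles []string, policy LooseFilePolicy) []string {
	skipMap := map[string]interface{}{}
	dataModels := []string{}

	// LOOSE_ONLY skips configured entries entirely; SKIP_IF_CONFIGURED
	// records them so matching loose files are suppressed below.
	if policy != LOOSE_ONLY {
		for _, c := range configured {
			if policy == SKIP_IF_CONFIGURED {
				skipMap[c.Model] = nil
			}
			dataModels = append(dataModels, c.Name)
		}
	}

	// Assumed behavior: SKIP_ALWAYS drops all loose files,
	// ALWAYS_INCLUDE keeps them even when configured.
	for _, f := range looseFiles {
		if _, skip := skipMap[f]; skip || policy == SKIP_ALWAYS {
			continue
		}
		dataModels = append(dataModels, f)
	}
	return dataModels
}
```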
(Several example requirements files and a Kubernetes manifest follow; file headers lost in extraction.)

@@ -1,4 +1,4 @@
-llama_index==0.11.16
+llama_index==0.11.14
 requests==2.32.3
 weaviate_client==4.8.1
 transformers

@@ -1,2 +1,2 @@
-langchain==0.3.2
-openai==1.51.1
+langchain==0.3.1
+openai==1.50.2

@@ -5,7 +5,7 @@ metadata:
 spec:
   containers:
   - name: broken-pod
-    image: nginx:1.27.2
+    image: nginx:1.27.0
     livenessProbe:
       httpGet:
         path: /

@@ -1,4 +1,4 @@
 langchain==0.3.1
-openai==1.51.1
+openai==1.50.2
 chromadb==0.5.11
-llama-index==0.11.16
+llama-index==0.11.14

@@ -1,4 +1,4 @@
-aiohttp==3.10.9
+aiohttp==3.10.8
 aiosignal==1.3.1
 async-timeout==4.0.3
 attrs==24.2.0

@@ -6,19 +6,19 @@ certifi==2024.8.30
 charset-normalizer==3.3.2
 colorama==0.4.6
 dataclasses-json==0.6.7
-debugpy==1.8.6
+debugpy==1.8.2
 frozenlist==1.4.1
 greenlet==3.1.1
 idna==3.10
-langchain==0.3.2
+langchain==0.3.1
 langchain-community==0.3.1
 marshmallow==3.22.0
 marshmallow-enum==1.5.1
-multidict==6.1.0
+multidict==6.0.5
 mypy-extensions==1.0.0
 numexpr==2.10.1
 numpy==2.1.1
-openai==1.51.1
+openai==1.45.1
 openapi-schema-pydantic==1.2.4
 packaging>=23.2
 pydantic==2.9.2

@@ -1,2 +1,2 @@
-streamlit==1.39.0
+streamlit==1.38.0
 requests
(Gallery chatml template file, deleted in its entirety; file header lost in extraction.)

@@ -1,66 +0,0 @@
----
-name: "chatml"
-
-config_file: |
-  mmap: true
-  function:
-    disable_no_action: true
-    grammar:
-      mixed_mode: false
-      disable: true
-      parallel_calls: true
-      expect_strings_after_json: true
-    json_regex_match:
-      - "(?s)<tool_call>(.*?)</tool_call>"
-      - "(?s)<tool_call>(.*)"
-    capture_llm_results:
-      - (?s)<scratchpad>(.*?)</scratchpad>
-    replace_llm_results:
-      - key: (?s)<scratchpad>(.*?)</scratchpad>
-        value: ""
-  template:
-    chat_message: |
-      <|im_start|>{{ .RoleName }}
-      {{ if .FunctionCall -}}
-      Function call:
-      {{ else if eq .RoleName "tool" -}}
-      Function response:
-      {{ end -}}
-      {{ if .Content -}}
-      {{.Content }}
-      {{ end -}}
-      {{ if .FunctionCall -}}
-      {{toJson .FunctionCall}}
-      {{ end -}}<|im_end|>
-    function: |
-      <|im_start|>system
-      # Tools
-
-      You may call one or more functions to assist with the user query.
-
-      You are provided with function signatures within <tools></tools> XML tags:
-      <tools>
-      {{range .Functions}}
-      {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
-      {{end}}
-      </tools>
-      For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
-      <tool_call>
-      {"name": <function-name>, "arguments": <args-json-object>}
-      </tool_call>
-      <|im_end|>
-      {{.Input -}}
-      <|im_start|>assistant
-    chat: |
-      {{.Input -}}
-      <|im_start|>assistant
-    completion: |
-      {{.Input}}
-  context_size: 4096
-  f16: true
-  stopwords:
-    - '<|im_end|>'
-    - '<dummy32000>'
-    - '</s>'
-    - "<|eot_id|>"
-    - "<|end_of_text|>"
@@ -1,21 +1,4 @@
|
||||
---
|
||||
- name: "moe-girl-1ba-7bt-i1"
|
||||
icon: https://cdn-uploads.huggingface.co/production/uploads/634262af8d8089ebaefd410e/kTXXSSSqpb21rfyOX7FUa.jpeg
|
||||
# chatml
|
||||
url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
|
||||
urls:
|
||||
- https://huggingface.co/allura-org/MoE-Girl-1BA-7BT
|
||||
- https://huggingface.co/mradermacher/MoE-Girl-1BA-7BT-i1-GGUF
|
||||
description: |
|
||||
A finetune of OLMoE by AllenAI designed for roleplaying (and maybe general usecases if you try hard enough).
|
||||
PLEASE do not expect godliness out of this, it's a model with 1 billion active parameters. Expect something more akin to Gemma 2 2B, not Llama 3 8B.
|
||||
overrides:
|
||||
parameters:
|
||||
model: MoE-Girl-1BA-7BT.i1-Q4_K_M.gguf
|
||||
files:
|
||||
- filename: MoE-Girl-1BA-7BT.i1-Q4_K_M.gguf
|
||||
sha256: e6ef9c311c73573b243de6ff7538b386f430af30b2be0a96a5745c17137ad432
|
||||
uri: huggingface://mradermacher/MoE-Girl-1BA-7BT-i1-GGUF/MoE-Girl-1BA-7BT.i1-Q4_K_M.gguf
|
||||
- name: "salamandra-7b-instruct"
|
||||
icon: https://huggingface.co/BSC-LT/salamandra-7b-instruct/resolve/main/images/salamandra_header.png
|
||||
# Uses chatml
|
||||
@@ -100,88 +83,6 @@
|
||||
- filename: llama-3.2-1b-instruct-q8_0.gguf
|
||||
sha256: ba345c83bf5cc679c653b853c46517eea5a34f03ed2205449db77184d9ae62a9
|
||||
uri: huggingface://hugging-quants/Llama-3.2-1B-Instruct-Q8_0-GGUF/llama-3.2-1b-instruct-q8_0.gguf
|
||||
## Uncensored
|
||||
- !!merge <<: *llama32
|
||||
icon: https://cdn-uploads.huggingface.co/production/uploads/66c9d7a26f2335ba288810a4/4YDg-rcEXCK0fdTS1fBzE.webp
|
||||
name: "versatillama-llama-3.2-3b-instruct-abliterated"
|
||||
urls:
|
||||
- https://huggingface.co/QuantFactory/VersatiLlama-Llama-3.2-3B-Instruct-Abliterated-GGUF
|
||||
description: |
|
||||
Small but Smart Fine-Tuned on Vast dataset of Conversations. Able to Generate Human like text with high performance within its size. It is Very Versatile when compared for it's size and Parameters and offers capability almost as good as Llama 3.1 8B Instruct.
|
||||
overrides:
|
||||
parameters:
|
||||
model: VersatiLlama-Llama-3.2-3B-Instruct-Abliterated.Q4_K_M.gguf
|
||||
files:
|
||||
- filename: VersatiLlama-Llama-3.2-3B-Instruct-Abliterated.Q4_K_M.gguf
|
||||
sha256: 15b9e4a987f50d7594d030815c7166a996e20db46fe1e20da03e96955020312c
|
||||
uri: huggingface://QuantFactory/VersatiLlama-Llama-3.2-3B-Instruct-Abliterated-GGUF/VersatiLlama-Llama-3.2-3B-Instruct-Abliterated.Q4_K_M.gguf
|
||||
- !!merge <<: *llama32
|
||||
name: "llama3.2-3b-enigma"
|
||||
icon: https://cdn-uploads.huggingface.co/production/uploads/64f267a8a4f79a118e0fcc89/it7MY5MyLCLpFQev5dUis.jpeg
|
||||
urls:
|
||||
- https://huggingface.co/QuantFactory/Llama3.2-3B-Enigma-GGUF
|
||||
description: |
|
||||
Enigma is a code-instruct model built on Llama 3.2 3b. It is a high quality code instruct model with the Llama 3.2 Instruct chat format. The model is finetuned on synthetic code-instruct data generated with Llama 3.1 405b and supplemented with generalist synthetic data. It uses the Llama 3.2 Instruct prompt format.
|
||||
overrides:
|
||||
parameters:
|
||||
model: Llama3.2-3B-Enigma.Q4_K_M.gguf
|
||||
files:
|
||||
- filename: Llama3.2-3B-Enigma.Q4_K_M.gguf
|
||||
sha256: 4304e6ee1e348b228470700ec1e9423f5972333d376295195ce6cd5c70cae5e4
|
||||
uri: huggingface://QuantFactory/Llama3.2-3B-Enigma-GGUF/Llama3.2-3B-Enigma.Q4_K_M.gguf
|
||||
- !!merge <<: *llama32
|
||||
name: "llama3.2-3b-esper2"
|
||||
icon: https://cdn-uploads.huggingface.co/production/uploads/64f267a8a4f79a118e0fcc89/4I6oK8DG0so4VD8GroFsd.jpeg
|
||||
urls:
|
||||
- https://huggingface.co/QuantFactory/Llama3.2-3B-Esper2-GGUF
|
||||
description: |
|
||||
Esper 2 is a DevOps and cloud architecture code specialist built on Llama 3.2 3b. It is an AI assistant focused on AWS, Azure, GCP, Terraform, Dockerfiles, pipelines, shell scripts and more, with real world problem solving and high quality code instruct performance within the Llama 3.2 Instruct chat format. Finetuned on synthetic DevOps-instruct and code-instruct data generated with Llama 3.1 405b and supplemented with generalist chat data.
|
||||
overrides:
|
||||
parameters:
|
||||
model: Llama3.2-3B-Esper2.Q4_K_M.gguf
|
||||
files:
|
||||
- filename: Llama3.2-3B-Esper2.Q4_K_M.gguf
|
||||
sha256: 11d2bd674aa22a71a59ec49ad29b695000d14bc275b0195b8d7089bfc7582fc7
|
||||
uri: huggingface://QuantFactory/Llama3.2-3B-Esper2-GGUF/Llama3.2-3B-Esper2.Q4_K_M.gguf
|
||||
- !!merge <<: *llama32
|
||||
name: "llama-3.2-3b-agent007"
|
||||
urls:
|
||||
- https://huggingface.co/QuantFactory/Llama-3.2-3B-Agent007-GGUF
|
||||
description: |
|
||||
The model is a quantized version of EpistemeAI/Llama-3.2-3B-Agent007, developed by EpistemeAI and fine-tuned from unsloth/llama-3.2-3b-instruct-bnb-4bit. It was trained 2x faster with Unsloth and Huggingface's TRL library. Fine tuned with Agent datasets.
|
||||
overrides:
|
||||
parameters:
|
||||
model: Llama-3.2-3B-Agent007.Q4_K_M.gguf
|
||||
files:
|
||||
- filename: Llama-3.2-3B-Agent007.Q4_K_M.gguf
|
||||
sha256: 7a2543a69b116f2a059e2e445e5d362bb7df4a51b97e83d8785c1803dc9d687f
|
||||
uri: huggingface://QuantFactory/Llama-3.2-3B-Agent007-GGUF/Llama-3.2-3B-Agent007.Q4_K_M.gguf
|
||||
- !!merge <<: *llama32
|
||||
name: "llama-3.2-3b-agent007-coder"
|
||||
urls:
|
||||
- https://huggingface.co/QuantFactory/Llama-3.2-3B-Agent007-Coder-GGUF
|
||||
description: |
|
||||
The Llama-3.2-3B-Agent007-Coder-GGUF is a quantized version of the EpistemeAI/Llama-3.2-3B-Agent007-Coder model, which is a fine-tuned version of the unsloth/llama-3.2-3b-instruct-bnb-4bit model. It is created using llama.cpp and trained with additional datasets such as the Agent dataset, Code Alpaca 20K, and magpie ultra 0.1. This model is optimized for multilingual dialogue use cases and agentic retrieval and summarization tasks. The model is available for commercial and research use in multiple languages and is best used with the transformers library.
|
||||
overrides:
|
||||
parameters:
|
||||
model: Llama-3.2-3B-Agent007-Coder.Q4_K_M.gguf
|
||||
files:
|
||||
- filename: Llama-3.2-3B-Agent007-Coder.Q4_K_M.gguf
|
||||
sha256: 49a4861c094d94ef5faa33f69b02cd132bb0167f1c3ca59059404f85f61e1d12
|
||||
uri: huggingface://QuantFactory/Llama-3.2-3B-Agent007-Coder-GGUF/Llama-3.2-3B-Agent007-Coder.Q4_K_M.gguf
|
||||
- !!merge <<: *llama32
|
||||
name: "fireball-meta-llama-3.2-8b-instruct-agent-003-128k-code-dpo"
|
||||
urls:
|
||||
- https://huggingface.co/QuantFactory/Fireball-Meta-Llama-3.2-8B-Instruct-agent-003-128k-code-DPO-GGUF
|
||||
description: |
|
||||
The LLM model is a quantized version of EpistemeAI/Fireball-Meta-Llama-3.2-8B-Instruct-agent-003-128k-code-DPO, which is an experimental and revolutionary fine-tune with DPO dataset to allow LLama 3.1 8B to be an agentic coder. It has some built-in agent features such as search, calculator, and ReAct. Other noticeable features include self-learning using unsloth, RAG applications, and memory. The context window of the model is 128K. It can be integrated into projects using popular libraries like Transformers and vLLM. The model is suitable for use with Langchain or LLamaIndex. The model is developed by EpistemeAI and licensed under the Apache 2.0 license.
|
||||
overrides:
|
||||
parameters:
|
||||
model: Fireball-Meta-Llama-3.2-8B-Instruct-agent-003-128k-code-DPO.Q4_K_M.gguf
|
||||
files:
|
||||
- filename: Fireball-Meta-Llama-3.2-8B-Instruct-agent-003-128k-code-DPO.Q4_K_M.gguf
|
||||
sha256: 7f45fa79bc6c9847ef9fbad08c3bb5a0f2dbb56d2e2200a5d37b260a57274e55
|
||||
uri: huggingface://QuantFactory/Fireball-Meta-Llama-3.2-8B-Instruct-agent-003-128k-code-DPO-GGUF/Fireball-Meta-Llama-3.2-8B-Instruct-agent-003-128k-code-DPO.Q4_K_M.gguf
|
||||
- &qwen25
|
||||
## Qwen2.5
|
||||
name: "qwen2.5-14b-instruct"
|
||||
@@ -418,113 +319,6 @@
|
||||
- filename: calme-2.2-qwen2.5-72b.i1-Q4_K_M.gguf
|
||||
sha256: 5fdfa599724d7c78502c477ced1d294e92781b91d3265bd0748fbf15a6fefde6
|
||||
uri: huggingface://mradermacher/calme-2.2-qwen2.5-72b-i1-GGUF/calme-2.2-qwen2.5-72b.i1-Q4_K_M.gguf
|
||||
- !!merge <<: *qwen25
|
||||
name: "t.e-8.1-iq-imatrix-request"
|
||||
# chatml
|
||||
url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
|
||||
icon: https://cdn-uploads.huggingface.co/production/uploads/65d4cf2693a0a3744a27536c/K1aNPf32z-6tYZdcSQBzF.png
|
||||
urls:
|
||||
- https://huggingface.co/Cran-May/T.E-8.1
|
||||
- https://huggingface.co/Lewdiculous/T.E-8.1-GGUF-IQ-Imatrix-Request
|
||||
description: |
|
||||
Trained for roleplay uses.
|
||||
overrides:
|
||||
parameters:
|
||||
model: T.E-8.1-Q4_K_M-imat.gguf
|
||||
files:
|
||||
- filename: T.E-8.1-Q4_K_M-imat.gguf
|
||||
sha256: 1b7892b82c01ea4cbebe34cd00f9836cbbc369fc3247c1f44a92842201e7ec0b
|
||||
uri: huggingface://Lewdiculous/T.E-8.1-GGUF-IQ-Imatrix-Request/T.E-8.1-Q4_K_M-imat.gguf
|
||||
- !!merge <<: *qwen25
|
||||
name: "rombos-llm-v2.5.1-qwen-3b"
|
||||
icon: https://cdn-uploads.huggingface.co/production/uploads/642cc1c253e76b4c2286c58e/pNDtgE5FDkxxvbG4qiZ1A.jpeg
|
||||
urls:
|
||||
- https://huggingface.co/QuantFactory/Rombos-LLM-V2.5.1-Qwen-3b-GGUF
|
||||
description: |
|
||||
Rombos-LLM-V2.5.1-Qwen-3b is a little experiment that merges a high-quality LLM, arcee-ai/raspberry-3B, using the last step of the Continuous Finetuning method outlined in a Google document. The merge is done using the mergekit with the following parameters:
|
||||
|
||||
- Models: Qwen2.5-3B-Instruct, raspberry-3B
|
||||
- Merge method: ties
|
||||
- Base model: Qwen2.5-3B
|
||||
- Parameters: weight=1, density=1, normalize=true, int8_mask=true
|
||||
- Dtype: bfloat16
|
||||
|
||||
The model has been evaluated on various tasks and datasets, and the results are available on the Open LLM Leaderboard. The model has shown promising performance across different benchmarks.
|
||||
overrides:
|
||||
parameters:
|
||||
model: Rombos-LLM-V2.5.1-Qwen-3b.Q4_K_M.gguf
|
||||
files:
|
||||
- filename: Rombos-LLM-V2.5.1-Qwen-3b.Q4_K_M.gguf
|
||||
sha256: 656c342a2921cac8912e0123fc295c3bb3d631a85c671c12a3843a957e46d30d
|
||||
uri: huggingface://QuantFactory/Rombos-LLM-V2.5.1-Qwen-3b-GGUF/Rombos-LLM-V2.5.1-Qwen-3b.Q4_K_M.gguf
|
||||
- !!merge <<: *qwen25
|
||||
name: "qwen2.5-7b-ins-v3"
|
||||
urls:
|
||||
- https://huggingface.co/happzy2633/qwen2.5-7b-ins-v3
|
||||
- https://huggingface.co/bartowski/qwen2.5-7b-ins-v3-GGUF
|
||||
description: |
|
||||
Qwen 2.5 fine-tuned on CoT to match o1 performance. An attempt to build an Open o1 mathcing OpenAI o1 model
|
||||
Demo: https://huggingface.co/spaces/happzy2633/open-o1
|
||||
overrides:
|
||||
parameters:
|
||||
model: qwen2.5-7b-ins-v3-Q4_K_M.gguf
|
||||
files:
|
||||
- filename: qwen2.5-7b-ins-v3-Q4_K_M.gguf
|
||||
sha256: 9c23734072714a4886c0386ae0ff07a5e940d67ad52278e2ed689fec44e1e0c8
|
||||
uri: huggingface://bartowski/qwen2.5-7b-ins-v3-GGUF/qwen2.5-7b-ins-v3-Q4_K_M.gguf
|
||||
- &archfunct
|
||||
license: apache-2.0
|
||||
tags:
|
||||
- llm
|
||||
- gguf
|
||||
- gpu
|
||||
- qwen
|
||||
- qwen2.5
|
||||
- cpu
|
||||
- function-calling
|
||||
name: "arch-function-1.5b"
|
||||
uri: "github:mudler/LocalAI/gallery/arch-function.yaml@master"
|
||||
urls:
|
||||
- https://huggingface.co/katanemolabs/Arch-Function-1.5B
|
||||
- https://huggingface.co/mradermacher/Arch-Function-1.5B-GGUF
|
||||
description: |
|
||||
The Katanemo Arch-Function collection of large language models (LLMs) is a collection state-of-the-art (SOTA) LLMs specifically designed for function calling tasks. The models are designed to understand complex function signatures, identify required parameters, and produce accurate function call outputs based on natural language prompts. Achieving performance on par with GPT-4, these models set a new benchmark in the domain of function-oriented tasks, making them suitable for scenarios where automated API interaction and function execution is crucial.
|
||||
In summary, the Katanemo Arch-Function collection demonstrates:
|
||||
State-of-the-art performance in function calling
|
||||
Accurate parameter identification and suggestion, even in ambiguous or incomplete inputs
|
||||
High generalization across multiple function calling use cases, from API interactions to automated backend tasks.
|
||||
Optimized low-latency, high-throughput performance, making it suitable for real-time, production environments.
|
||||
overrides:
|
||||
parameters:
|
||||
model: Arch-Function-1.5B.Q4_K_M.gguf
|
||||
files:
|
||||
- filename: Arch-Function-1.5B.Q4_K_M.gguf
|
||||
sha256: 5ac54d2d50cca0ee0335ca2c9b688204c0829cd3a73de3ee3fda108281ad9691
|
||||
uri: huggingface://mradermacher/Arch-Function-1.5B-GGUF/Arch-Function-1.5B.Q4_K_M.gguf
|
||||
- !!merge <<: *archfunct
|
||||
name: "arch-function-7b"
|
||||
urls:
|
||||
- https://huggingface.co/katanemolabs/Arch-Function-7B
|
||||
- https://huggingface.co/mradermacher/Arch-Function-7B-GGUF
|
||||
overrides:
|
||||
parameters:
|
||||
model: Arch-Function-7B.Q4_K_M.gguf
|
||||
files:
|
||||
- filename: Arch-Function-7B.Q4_K_M.gguf
|
||||
sha256: 6e38661321d79d02b8cf57c79d97c6c0e19adb9ffa66083cc440c24e257234b6
|
||||
uri: huggingface://mradermacher/Arch-Function-7B-GGUF/Arch-Function-7B.Q4_K_M.gguf
|
||||
- !!merge <<: *archfunct
|
||||
name: "arch-function-3b"
|
||||
urls:
|
||||
- https://huggingface.co/katanemolabs/Arch-Function-3B
|
||||
- https://huggingface.co/mradermacher/Arch-Function-3B-GGUF
|
||||
overrides:
|
||||
parameters:
|
||||
model: Arch-Function-3B.Q4_K_M.gguf
|
||||
files:
|
||||
- filename: Arch-Function-3B.Q4_K_M.gguf
|
||||
sha256: 9945cb8d070498d163e5df90c1987f591d35e4fd2222a6c51bcfff848c4b573b
|
||||
uri: huggingface://mradermacher/Arch-Function-3B-GGUF/Arch-Function-3B.Q4_K_M.gguf
|
||||
- &smollm
|
||||
## SmolLM
|
||||
url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
|
||||
@@ -1034,21 +828,6 @@
|
||||
- filename: NightyGurps-14b-v1.1-Q4_K_M.gguf
|
||||
sha256: d09d53259ad2c0298150fa8c2db98fe42f11731af89fdc80ad0e255a19adc4b0
|
||||
uri: huggingface://bartowski/NightyGurps-14b-v1.1-GGUF/NightyGurps-14b-v1.1-Q4_K_M.gguf
|
||||
- !!merge <<: *llama31
|
||||
name: "llama-3.1-swallow-70b-v0.1-i1"
|
||||
icon: https://huggingface.co/tokyotech-llm/Llama-3.1-Swallow-70B-v0.1/resolve/main/logo.png
|
||||
urls:
|
||||
- https://huggingface.co/tokyotech-llm/Llama-3.1-Swallow-70B-v0.1
|
||||
- https://huggingface.co/mradermacher/Llama-3.1-Swallow-70B-v0.1-i1-GGUF
|
||||
description: |
|
||||
Llama 3.1 Swallow is a series of large language models (8B, 70B) that were built by continual pre-training on the Meta Llama 3.1 models. Llama 3.1 Swallow enhanced the Japanese language capabilities of the original Llama 3.1 while retaining the English language capabilities. We use approximately 200 billion tokens that were sampled from a large Japanese web corpus (Swallow Corpus Version 2), Japanese and English Wikipedia articles, and mathematical and coding contents, etc (see the Training Datasets section) for continual pre-training. The instruction-tuned models (Instruct) were built by supervised fine-tuning (SFT) on the synthetic data specially built for Japanese. See the Swallow Model Index section to find other model variants.
|
||||
overrides:
|
||||
parameters:
|
||||
model: Llama-3.1-Swallow-70B-v0.1.i1-Q4_K_M.gguf
|
||||
files:
|
||||
- filename: Llama-3.1-Swallow-70B-v0.1.i1-Q4_K_M.gguf
|
||||
sha256: 9eaa08a4872a26f56fe34b27a99f7bd0d22ee2b2d1c84cfcde2091b5f61af5fa
|
||||
uri: huggingface://mradermacher/Llama-3.1-Swallow-70B-v0.1-i1-GGUF/Llama-3.1-Swallow-70B-v0.1.i1-Q4_K_M.gguf
|
||||
## Uncensored models
|
||||
- !!merge <<: *llama31
|
||||
name: "humanish-roleplay-llama-3.1-8b-i1"
|
||||
@@ -1365,53 +1144,6 @@
      - filename: Llama-3.1-8B-ArliAI-RPMax-v1.1-Q4_K_M.gguf
        sha256: 0a601c7341228d9160332965298d799369a1dc2b7080771fb8051bdeb556b30c
        uri: huggingface://bartowski/Llama-3.1-8B-ArliAI-RPMax-v1.1-GGUF/Llama-3.1-8B-ArliAI-RPMax-v1.1-Q4_K_M.gguf
- !!merge <<: *llama31
  name: "violet_twilight-v0.2-iq-imatrix"
  url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
  icon: https://cdn-uploads.huggingface.co/production/uploads/64adfd277b5ff762771e4571/P962FQhRG4I8nbU_DJolY.png
  urls:
    - https://huggingface.co/Epiculous/Violet_Twilight-v0.2
    - https://huggingface.co/Lewdiculous/Violet_Twilight-v0.2-GGUF-IQ-Imatrix
  description: |
    Now for something a bit different: Violet_Twilight-v0.2! This model is a SLERP merge of Azure_Dusk-v0.2 and Crimson_Dawn-v0.2.
  overrides:
    parameters:
      model: Violet_Twilight-v0.2-Q4_K_M-imat.gguf
    files:
      - filename: Violet_Twilight-v0.2-Q4_K_M-imat.gguf
        sha256: 0793d196a00cd6fd4e67b8c585b27a94d397e33d427e4ad4aa9a16b7abc339cd
        uri: huggingface://Lewdiculous/Violet_Twilight-v0.2-GGUF-IQ-Imatrix/Violet_Twilight-v0.2-Q4_K_M-imat.gguf
- !!merge <<: *llama31
  url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
  name: "dans-personalityengine-v1.0.0-8b"
  urls:
    - https://huggingface.co/PocketDoc/Dans-PersonalityEngine-v1.0.0-8b
    - https://huggingface.co/bartowski/Dans-PersonalityEngine-v1.0.0-8b-GGUF
  description: |
    This model is intended to be multifarious in its capabilities: it should be quite capable at both co-writing and roleplay, and should find itself quite at home performing sentiment analysis or summarization as part of a pipeline. It has been trained on a wide array of one-shot instructions, multi-turn instructions, role-playing scenarios, text-adventure games, co-writing, and much more. The full dataset is publicly available and can be found in the datasets section of the model page.

    No harmfulness alignment has been done on this model; please take the appropriate precautions when using it in a production environment.
  overrides:
    parameters:
      model: Dans-PersonalityEngine-v1.0.0-8b-Q4_K_M.gguf
    files:
      - filename: Dans-PersonalityEngine-v1.0.0-8b-Q4_K_M.gguf
        sha256: 193b66434c9962e278bb171a21e652f0d3f299f04e86c95f9f75ec5aa8ff006e
        uri: huggingface://bartowski/Dans-PersonalityEngine-v1.0.0-8b-GGUF/Dans-PersonalityEngine-v1.0.0-8b-Q4_K_M.gguf
- !!merge <<: *llama31
  name: "nihappy-l3.1-8b-v0.09"
  urls:
    - https://huggingface.co/Arkana08/NIHAPPY-L3.1-8B-v0.09
    - https://huggingface.co/QuantFactory/NIHAPPY-L3.1-8B-v0.09-GGUF
  description: |
    A quantized version of Arkana08/NIHAPPY-L3.1-8B-v0.09, created using llama.cpp. It is a role-playing model that integrates the finest qualities of various pre-trained language models, with a focus on dynamic storytelling.
  overrides:
    parameters:
      model: NIHAPPY-L3.1-8B-v0.09.Q4_K_M.gguf
    files:
      - filename: NIHAPPY-L3.1-8B-v0.09.Q4_K_M.gguf
        sha256: 9bd46a06093448b143bd2775f0fb1b1b172c851fafdce31289e13b7dfc23a0d7
        uri: huggingface://QuantFactory/NIHAPPY-L3.1-8B-v0.09-GGUF/NIHAPPY-L3.1-8B-v0.09.Q4_K_M.gguf
- &deepseek
  ## Deepseek
  url: "github:mudler/LocalAI/gallery/deepseek.yaml@master"
@@ -2002,26 +1734,6 @@
      - filename: MagnusIntellectus-12B-v1.i1-Q4_K_M.gguf
        sha256: c97107983b4edc5b6f2a592d227ca2dd4196e2af3d3bc0fe6b7a8954a1fb5870
        uri: huggingface://mradermacher/MagnusIntellectus-12B-v1-i1-GGUF/MagnusIntellectus-12B-v1.i1-Q4_K_M.gguf
- !!merge <<: *mistral03
  url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
  name: "mn-backyardai-party-12b-v1-iq-arm-imatrix"
  icon: https://huggingface.co/Sao10K/MN-BackyardAI-Party-12B-v1/resolve/main/party1.png
  urls:
    - https://huggingface.co/Sao10K/MN-BackyardAI-Party-12B-v1
    - https://huggingface.co/Lewdiculous/MN-BackyardAI-Party-12B-v1-GGUF-IQ-ARM-Imatrix
  description: |
    This is a group-chat-based roleplaying model, based on 12B-Lyra-v4a2, a variant of Lyra-v4 that is currently private.

    It is trained on an entirely human-based dataset drawn from forum and internet group roleplaying styles. The only LLM augmentation was applied to the character sheets, to fit them to the system prompt and to fit multiple character sheets within context.

    This model is still capable of 1-on-1 roleplay, though I recommend using ChatML when doing that instead.
  overrides:
    parameters:
      model: MN-BackyardAI-Party-12B-v1-Q4_K_M-imat.gguf
    files:
      - filename: MN-BackyardAI-Party-12B-v1-Q4_K_M-imat.gguf
        sha256: cea68768dff58b553974b755bb40ef790ab8b86866d9b5c46bc2e6c3311b876a
        uri: huggingface://Lewdiculous/MN-BackyardAI-Party-12B-v1-GGUF-IQ-ARM-Imatrix/MN-BackyardAI-Party-12B-v1-Q4_K_M-imat.gguf
- &mudler
  ### START mudler's LocalAI specific-models
  url: "github:mudler/LocalAI/gallery/mudler.yaml@master"
@@ -2618,37 +2330,6 @@
      - filename: Gemma-2-2B-ArliAI-RPMax-v1.1-Q4_K_M.gguf
        sha256: 89fe35345754d7e9de8d0c0d5bf35b2be9b12a09811b365b712b8b27112f7712
        uri: huggingface://bartowski/Gemma-2-2B-ArliAI-RPMax-v1.1-GGUF/Gemma-2-2B-ArliAI-RPMax-v1.1-Q4_K_M.gguf
- !!merge <<: *gemma
  name: "gemma-2-9b-it-abliterated"
  urls:
    - https://huggingface.co/IlyaGusev/gemma-2-9b-it-abliterated
    - https://huggingface.co/bartowski/gemma-2-9b-it-abliterated-GGUF
  description: |
    Abliterated version of google/gemma-2-9b-it.

    The abliteration script (link) is based on code from the blog post and heavily uses TransformerLens. The only major difference from the code used for Llama is scaling the embedding layer back.

    Orthogonalization did not produce the same results as regular interventions, since there are RMSNorm layers before activations are merged into the residual stream. However, the final model still seems to be uncensored.
  overrides:
    parameters:
      model: gemma-2-9b-it-abliterated-Q4_K_M.gguf
    files:
      - filename: gemma-2-9b-it-abliterated-Q4_K_M.gguf
        sha256: 88d84ac9796732c10f6c58e0feb4db8e04c05d74bdb7047a5e37906a589896e1
        uri: huggingface://bartowski/gemma-2-9b-it-abliterated-GGUF/gemma-2-9b-it-abliterated-Q4_K_M.gguf
- !!merge <<: *gemma
  name: "gemma-2-ataraxy-v3i-9b"
  urls:
    - https://huggingface.co/QuantFactory/Gemma-2-Ataraxy-v3i-9B-GGUF
  description: |
    Gemma-2-Ataraxy-v3i-9B is an experimental model that replaces the SimPO model in the original recipe with a different SimPO model and a writing model trained on Gutenberg, using a higher density. It is a merge of pre-trained language models created with mergekit, using the della merge method with unsloth/gemma-2-9b-it as the base. The models included in the merge are nbeerbower/Gemma2-Gutenberg-Doppel-9B, ifable/gemma-2-Ifable-9B, and wzhouad/gemma-2-9b-it-WPO-HB. It has been quantized using llama.cpp.
  overrides:
    parameters:
      model: Gemma-2-Ataraxy-v3i-9B.Q4_K_M.gguf
    files:
      - filename: Gemma-2-Ataraxy-v3i-9B.Q4_K_M.gguf
        sha256: f14c5b9373d4058f0f812c6c34184addeb4aeeecb02a7bbcf9844d9afc8d0066
        uri: huggingface://QuantFactory/Gemma-2-Ataraxy-v3i-9B-GGUF/Gemma-2-Ataraxy-v3i-9B.Q4_K_M.gguf
- &llama3
  url: "github:mudler/LocalAI/gallery/llama3-instruct.yaml@master"
  icon: https://cdn-uploads.huggingface.co/production/uploads/642cc1c253e76b4c2286c58e/aJJxKus1wP5N-euvHEUq7.png
@@ -3,6 +3,7 @@ name: "moondream2"
config_file: |
  backend: llama-cpp
  context_size: 2046
  roles:
    user: "\nQuestion: "
@@ -41,7 +41,6 @@ func (llm *Base) Predict(opts *pb.PredictOptions) (string, error) {
}

func (llm *Base) PredictStream(opts *pb.PredictOptions, results chan string) error {
	close(results)
	return fmt.Errorf("unimplemented")
}
@@ -144,6 +144,8 @@ func (s *server) PredictStream(in *pb.PredictOptions, stream pb.Backend_PredictS
	}()

	err := s.llm.PredictStream(in, resultChan)
	// close the channel, so that if resultChan is not closed by the LLM (for example because it does not implement PredictStream), the client will not hang
	close(resultChan)
	<-done

	return err
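The two added lines above are the heart of this fix: the gRPC server, not the backend, now owns closing resultChan, so a backend that never closes the channel (for example, one that only inherits the base implementation) can no longer leave the client blocked forever. A minimal, runnable sketch of that ownership pattern, with illustrative names rather than the actual LocalAI types:

package main

import "fmt"

// predictStream plays the role of a backend: it produces tokens but,
// like a backend that does not implement streaming, never closes the
// results channel itself.
func predictStream(results chan string) error {
	results <- "token-1"
	results <- "token-2"
	return nil
}

func main() {
	results := make(chan string)
	done := make(chan struct{})

	// Consumer: drains tokens until the channel is closed.
	go func() {
		for token := range results {
			fmt.Println(token)
		}
		close(done)
	}()

	err := predictStream(results)
	// The caller closes the channel exactly once, so the consumer's
	// range loop always terminates, even when the producer forgot to.
	close(results)
	<-done

	if err != nil {
		fmt.Println("stream error:", err)
	}
}

Moving the close to the caller also avoids a double close: if both the base stub and the server closed the channel, the second close would panic, which is why the same commit drops the close from the base implementation.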
@@ -28,7 +28,7 @@ var Aliases map[string]string = map[string]string{
	"langchain-huggingface": LCHuggingFaceBackend,
}

var AutoDetect = os.Getenv("DISABLE_AUTODETECT") != "true"
var autoDetect = os.Getenv("DISABLE_AUTODETECT") != "true"

const (
	LlamaGGML = "llama-ggml"
@@ -62,7 +62,7 @@ func backendPath(assetDir, backend string) string {

// backendsInAssetDir returns the list of backends in the asset directory
// that should be loaded
func backendsInAssetDir(assetDir string) (map[string][]string, error) {
func backendsInAssetDir(assetDir string) ([]string, error) {
	// Exclude backends from automatic loading
	excludeBackends := []string{LocalStoreBackend}
	entry, err := os.ReadDir(backendPath(assetDir, ""))
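backendsInAssetDir boils down to a filtered directory scan. A self-contained sketch of that shape, with an illustrative exclusion list and directory layout rather than the real LocalAI constants:

package main

import (
	"fmt"
	"os"
	"slices"
)

// listBackends mirrors the scan above: read the asset directory and
// drop entries on the exclusion list. Names here are illustrative.
func listBackends(dir string) ([]string, error) {
	excludeBackends := []string{"local-store"}
	entries, err := os.ReadDir(dir)
	if err != nil {
		return nil, err
	}
	var backends []string
	for _, e := range entries {
		if slices.Contains(excludeBackends, e.Name()) {
			continue
		}
		backends = append(backends, e.Name())
	}
	return backends, nil
}

func main() {
	backends, err := listBackends("backend-assets/grpc")
	fmt.Println(backends, err)
}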
@@ -86,7 +86,7 @@ ENTRY:

		// Skip the llama.cpp variants if we are autoDetecting
		// But we always load the fallback variant if it exists
		if strings.Contains(e.Name(), LLamaCPP) && !strings.Contains(e.Name(), LLamaCPPFallback) && AutoDetect {
		if strings.Contains(e.Name(), LLamaCPP) && !strings.Contains(e.Name(), LLamaCPPFallback) && autoDetect {
			continue
		}

@@ -94,7 +94,7 @@ ENTRY:
	}

	// if we are autoDetecting, we want to show the llama.cpp variants as a single backend
	if AutoDetect {
	if autoDetect {
		// if we find the llama.cpp variants, we show them as a single backend (llama-cpp), as later we are going to pick that up
		// when starting the service
		foundLCPPAVX, foundLCPPAVX2, foundLCPPFallback, foundLCPPGRPC, foundLCPPCuda, foundLCPPHipblas, foundSycl16, foundSycl32 := false, false, false, false, false, false, false, false
@@ -136,10 +136,6 @@ ENTRY:
		}
	}

	return backends, nil
}

func orderBackends(backends map[string][]string) ([]string, error) {
	// order backends from the asset directory.
	// as we scan for backends, we want to keep a specific order in which backends are tried.
	// for example, llama.cpp should be tried first, and the huggingface backend should be kept last.
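The ordering comment above translates to a simple rank-then-sort. A sketch of one way to implement that policy; the rank table is illustrative, and the real orderBackends works on the map produced by backendsInAssetDir rather than a plain slice:

package main

import (
	"fmt"
	"sort"
)

// rank encodes the policy from the comment above: llama-cpp first,
// huggingface last, everything else in between. Illustrative only.
func rank(backend string) int {
	switch backend {
	case "llama-cpp":
		return 0
	case "huggingface":
		return 2
	default:
		return 1
	}
}

func orderBackends(backends []string) []string {
	sort.SliceStable(backends, func(i, j int) bool {
		ri, rj := rank(backends[i]), rank(backends[j])
		if ri != rj {
			return ri < rj
		}
		return backends[i] < backends[j]
	})
	return backends
}

func main() {
	fmt.Println(orderBackends([]string{"whisper", "huggingface", "llama-cpp", "bark"}))
	// Output: [llama-cpp bark whisper huggingface]
}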
@@ -185,9 +181,8 @@ func orderBackends(backends map[string][]string) ([]string, error) {
	return orderedBackends.Keys(), nil
}

// selectGRPCProcessByHostCapabilities selects the GRPC process to start based on system capabilities
// Note: this is now relevant only for llama.cpp
func selectGRPCProcessByHostCapabilities(backend, assetDir string, f16 bool) string {
// selectGRPCProcess selects the GRPC process to start based on system capabilities
func selectGRPCProcess(backend, assetDir string, f16 bool) string {
	foundCUDA := false
	foundAMDGPU := false
	foundIntelGPU := false
@@ -204,7 +199,6 @@ func selectGRPCProcessByHostCapabilities(backend, assetDir string, f16 bool) str
			return backendPath(assetDir, LLamaCPPGRPC)
		}

	// Check for GPU-binaries that are shipped with single binary releases
	gpus, err := xsysinfo.GPUs()
	if err == nil {
		for _, gpu := range gpus {
@@ -249,37 +243,32 @@ func selectGRPCProcessByHostCapabilities(backend, assetDir string, f16 bool) str
				return grpcProcess
			}

	// No GPU found or no specific binaries found, try to load the CPU variant(s)

	// Select the Fallback by default
	selectedProcess := backendPath(assetDir, LLamaCPPFallback)

	// If we find any optimized binary, we use that instead
	if xsysinfo.HasCPUCaps(cpuid.AVX2) {
		p := backendPath(assetDir, LLamaCPPAVX2)
		if _, err := os.Stat(p); err == nil {
			log.Info().Msgf("[%s] attempting to load with AVX2 variant", backend)
			selectedProcess = p
			grpcProcess = p
		}
	} else if xsysinfo.HasCPUCaps(cpuid.AVX) {
		p := backendPath(assetDir, LLamaCPPAVX)
		if _, err := os.Stat(p); err == nil {
			log.Info().Msgf("[%s] attempting to load with AVX variant", backend)
			selectedProcess = p
			grpcProcess = p
		}
	} else {
		p := backendPath(assetDir, LLamaCPPFallback)
		if _, err := os.Stat(p); err == nil {
			log.Info().Msgf("[%s] attempting to load with fallback variant", backend)
			grpcProcess = p
		}
	}

	// Check that the selected binary actually exists
	if _, err := os.Stat(selectedProcess); err == nil {
		return selectedProcess
	}

	return ""
	return grpcProcess
}

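The variant selection above follows a fixed preference: AVX2 if the CPU supports it, then AVX, then the portable fallback, and a candidate only wins if its binary is actually present on disk. A simplified, runnable sketch of that cascade using github.com/klauspost/cpuid/v2 directly (the real code goes through LocalAI's xsysinfo wrapper, and the asset paths here are illustrative):

package main

import (
	"fmt"
	"os"
	"path/filepath"

	"github.com/klauspost/cpuid/v2"
)

// pickVariant walks the preference list and returns the first variant
// whose CPU requirement is met and whose binary exists on disk.
func pickVariant(assetDir string) string {
	candidates := []struct {
		name      string
		supported bool
	}{
		{"llama-cpp-avx2", cpuid.CPU.Supports(cpuid.AVX2)},
		{"llama-cpp-avx", cpuid.CPU.Supports(cpuid.AVX)},
		{"llama-cpp-fallback", true},
	}
	for _, c := range candidates {
		if !c.supported {
			continue
		}
		p := filepath.Join(assetDir, c.name)
		if _, err := os.Stat(p); err == nil {
			return p
		}
	}
	return ""
}

func main() {
	fmt.Println(pickVariant("backend-assets/grpc"))
}

Unlike the if/else chain above, this cascade also tries the AVX build when an AVX2-capable machine is missing the AVX2 binary; the original falls back to the default fallback binary in that case.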
// starts the grpcModelProcess for the backend, and returns a grpc client
// It also loads the model
func (ml *ModelLoader) grpcModel(backend string, autodetect bool, o *Options) func(string, string, string) (*Model, error) {
func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string, string) (*Model, error) {
	return func(modelID, modelName, modelFile string) (*Model, error) {

		log.Debug().Msgf("Loading Model %s with gRPC (file: %s) (backend: %s): %+v", modelID, modelFile, backend, *o)
@@ -335,9 +324,9 @@ func (ml *ModelLoader) grpcModel(backend string, autodetect bool, o *Options) fu
			return nil, fmt.Errorf("referring to a backend not in asset dir: %s", err.Error())
		}

		if autodetect {
		if autoDetect {
			// autoDetect GRPC process to start based on system capabilities
			if selectedProcess := selectGRPCProcessByHostCapabilities(backend, o.assetDir, o.gRPCOptions.F16Memory); selectedProcess != "" {
			if selectedProcess := selectGRPCProcess(backend, o.assetDir, o.gRPCOptions.F16Memory); selectedProcess != "" {
				grpcProcess = selectedProcess
			}
		}
@@ -418,11 +407,7 @@ func (ml *ModelLoader) grpcModel(backend string, autodetect bool, o *Options) fu
}

func (ml *ModelLoader) ListAvailableBackends(assetdir string) ([]string, error) {
	backends, err := backendsInAssetDir(assetdir)
	if err != nil {
		return nil, err
	}
	return orderBackends(backends)
	return backendsInAssetDir(assetdir)
}

func (ml *ModelLoader) BackendLoader(opts ...Option) (client grpc.Backend, err error) {
@@ -436,7 +421,13 @@ func (ml *ModelLoader) BackendLoader(opts ...Option) (client grpc.Backend, err e
		log.Debug().Msgf("%s is an alias of %s", backend, realBackend)
	}

	ml.stopActiveBackends(o.modelID, o.singleActiveBackend)
	if o.singleActiveBackend {
		log.Debug().Msgf("Stopping all backends except '%s'", o.modelID)
		err := ml.StopGRPC(allExcept(o.modelID))
		if err != nil {
			log.Error().Err(err).Str("keptModel", o.modelID).Msg("error while shutting down all backends except for the keptModel")
		}
	}

	var backendToConsume string

@@ -448,40 +439,14 @@ func (ml *ModelLoader) BackendLoader(opts ...Option) (client grpc.Backend, err e
		backendToConsume = backend
	}

	model, err := ml.LoadModel(o.modelID, o.model, ml.grpcModel(backendToConsume, AutoDetect, o))
	model, err := ml.LoadModel(o.modelID, o.model, ml.grpcModel(backendToConsume, o))
	if err != nil {
		// XXX: This is too backend-specific (llama-cpp); remove this bit or generalize it further
		// We failed somehow starting the binary. For instance, it could be that we are missing
		// some libraries if running in binary-only mode.
		// In this case, we attempt to load the model with the fallback variant.

		// If not the llama-cpp backend, return the error immediately
		if backend != LLamaCPP {
			return nil, err
		}

		// Otherwise attempt with the fallback
		log.Error().Msgf("[%s] Failed loading model, trying with fallback '%s'", backend, LLamaCPPFallback)
		model, err = ml.LoadModel(o.modelID, o.model, ml.grpcModel(LLamaCPPFallback, false, o))
		if err != nil {
			return nil, err
		}
		return nil, err
	}

	return model.GRPC(o.parallelRequests, ml.wd), nil
}

func (ml *ModelLoader) stopActiveBackends(modelID string, singleActiveBackend bool) {
	// If we can have only one backend active, kill all the others (except external backends)
	if singleActiveBackend {
		log.Debug().Msgf("Stopping all backends except '%s'", modelID)
		err := ml.StopGRPC(allExcept(modelID))
		if err != nil {
			log.Error().Err(err).Str("keptModel", modelID).Msg("error while shutting down all backends except for the keptModel - greedyloader continuing")
		}
	}
}

func (ml *ModelLoader) GreedyLoader(opts ...Option) (grpc.Backend, error) {
	o := NewOptions(opts...)

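stopActiveBackends relies on allExcept to turn the kept model ID into a predicate over running processes. A tiny sketch of that filter pattern; the real helper's signature in LocalAI may differ:

package main

import "fmt"

// allExcept returns a predicate that selects every model except the
// one to keep alive - the shape of the filter handed to StopGRPC.
func allExcept(keep string) func(name string) bool {
	return func(name string) bool { return name != keep }
}

func main() {
	stop := allExcept("phi-2")
	for _, m := range []string{"phi-2", "llava", "bert"} {
		if stop(m) {
			fmt.Println("stopping", m)
		}
	}
}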
@@ -493,12 +458,19 @@ func (ml *ModelLoader) GreedyLoader(opts ...Option) (grpc.Backend, error) {
		return m.GRPC(o.parallelRequests, ml.wd), nil
	}

	ml.stopActiveBackends(o.modelID, o.singleActiveBackend)
	// If we can have only one backend active, kill all the others (except external backends)
	if o.singleActiveBackend {
		log.Debug().Msgf("Stopping all backends except '%s'", o.modelID)
		err := ml.StopGRPC(allExcept(o.modelID))
		if err != nil {
			log.Error().Err(err).Str("keptModel", o.modelID).Msg("error while shutting down all backends except for the keptModel - greedyloader continuing")
		}
	}

	var err error

	// get backends embedded in the binary
	autoLoadBackends, err := ml.ListAvailableBackends(o.assetDir)
	autoLoadBackends, err := backendsInAssetDir(o.assetDir)
	if err != nil {
		return nil, err
	}
@@ -529,6 +501,39 @@ func (ml *ModelLoader) GreedyLoader(opts ...Option) (grpc.Backend, error) {
			err = errors.Join(err, fmt.Errorf("backend %s returned no usable model", key))
			log.Info().Msgf("[%s] Fails: %s", key, "backend returned no usable model")
		}

		if autoDetect && key == LLamaCPP && err != nil {
			// try as hard as possible to run the llama.cpp variants
			backendToUse := ""
			if xsysinfo.HasCPUCaps(cpuid.AVX2) {
				if _, err := os.Stat(backendPath(o.assetDir, LLamaCPPAVX2)); err == nil {
					backendToUse = LLamaCPPAVX2
				}
			} else if xsysinfo.HasCPUCaps(cpuid.AVX) {
				if _, err := os.Stat(backendPath(o.assetDir, LLamaCPPAVX)); err == nil {
					backendToUse = LLamaCPPAVX
				}
			} else {
				if _, err := os.Stat(backendPath(o.assetDir, LLamaCPPFallback)); err == nil {
					backendToUse = LLamaCPPFallback
				} else {
					// If we don't have a fallback variant either, skip this backend
					continue
				}
			}

			// Autodetection failed, try the fallback
			log.Info().Msgf("[%s] Autodetection failed, trying the fallback", key)
			options = append(options, WithBackendString(backendToUse))
			model, modelerr = ml.BackendLoader(options...)
			if modelerr == nil && model != nil {
				log.Info().Msgf("[%s] Loads OK", key)
				return model, nil
			} else {
				err = errors.Join(err, fmt.Errorf("[%s]: %w", key, modelerr))
				log.Info().Msgf("[%s] Fails: %s", key, modelerr.Error())
			}
		}
	}

	return nil, fmt.Errorf("could not load model - all backends returned error: %s", err.Error())
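The greedy loader's contract is: try each candidate backend in order, return the first that produces a usable model, and fold every failure into one error so the final message names all attempts. A compact, runnable sketch of that strategy (the load function here is a stand-in, not the real BackendLoader):

package main

import (
	"errors"
	"fmt"
)

// tryBackends returns the first backend whose loader succeeds and
// accumulates every failure with errors.Join, so the final error
// names each attempted backend.
func tryBackends(backends []string, load func(string) error) (string, error) {
	var errs error
	for _, b := range backends {
		if err := load(b); err != nil {
			errs = errors.Join(errs, fmt.Errorf("[%s]: %w", b, err))
			continue
		}
		return b, nil
	}
	return "", fmt.Errorf("could not load model - all backends returned error: %w", errs)
}

func main() {
	loaded, err := tryBackends([]string{"llama-cpp", "whisper"}, func(b string) error {
		if b != "whisper" {
			return errors.New("no usable model")
		}
		return nil
	})
	fmt.Println(loaded, err)
}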
@@ -157,11 +157,6 @@ func (ml *ModelLoader) ShutdownModel(modelName string) error {
		}
		time.Sleep(dur)
		retries++

		if retries > 10 && os.Getenv("LOCALAI_FORCE_BACKEND_SHUTDOWN") == "true" {
			log.Warn().Msgf("Model %s is still busy after %d retries. Forcing shutdown.", modelName, retries)
			break
		}
	}

	return ml.deleteProcess(modelName)
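The shutdown path above polls until the model is idle, sleeping between attempts, with LOCALAI_FORCE_BACKEND_SHUTDOWN acting as an escape hatch after ten retries. A minimal sketch of that guard; the busy-check and backoff duration are illustrative:

package main

import (
	"fmt"
	"os"
	"time"
)

// waitUntilIdle polls a busy-check with a sleep between attempts, and
// bails out after ten retries when the force-shutdown flag is set.
func waitUntilIdle(busy func() bool) {
	retries := 0
	for busy() {
		time.Sleep(100 * time.Millisecond) // illustrative backoff
		retries++
		if retries > 10 && os.Getenv("LOCALAI_FORCE_BACKEND_SHUTDOWN") == "true" {
			fmt.Println("model still busy, forcing shutdown")
			break
		}
	}
}

func main() {
	calls := 0
	waitUntilIdle(func() bool { calls++; return calls < 3 })
	fmt.Println("shutdown can proceed")
}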