Mirror of https://github.com/mudler/LocalAI.git (synced 2026-02-03 11:13:31 -05:00)

Compare commits: 28 commits, v2.28.0...feat/llama
Commits:
8fea82e68b, 01e2e3dbc3, 61cc76c455, 8abecb4a18, 8b3f76d8e6, 4e0497f1a6, ba88c9f451, a598285825, cb7a172897, 771be28dfb, 7d6b3eb42d, 0bb33fab55, e3bf7f77f7, bd1707d339, 0474804541, 72693b3917, a03b70010f, e3717e5c1a, c8f6858218, 06d7cc43ae, f2147cb850, 75bb9f4c28, a2ef4b1e07, 161c9fe2db, 7547463f81, 32e4dfd47b, f67e5dec68, 297d54acea
.github/dependabot.yml (vendored, 4 changes)

@@ -29,10 +29,6 @@ updates:
     schedule:
       # Check for updates to GitHub Actions every weekday
       interval: "weekly"
-  - package-ecosystem: "pip"
-    directory: "/backend/python/autogptq"
-    schedule:
-      interval: "weekly"
   - package-ecosystem: "pip"
     directory: "/backend/python/bark"
     schedule:
.github/workflows/image.yml (vendored, 7 changes)

@@ -75,6 +75,7 @@ jobs:
          grpc-base-image: "ubuntu:22.04"
          runs-on: 'arc-runner-set'
          makeflags: "--jobs=3 --output-sync=target"
+         latest-image: 'latest-gpu-hipblas-core'
        - build-type: 'hipblas'
          platforms: 'linux/amd64'
          tag-latest: 'false'
@@ -251,6 +252,7 @@ jobs:
          image-type: 'core'
          runs-on: 'arc-runner-set'
          makeflags: "--jobs=3 --output-sync=target"
+         latest-image: 'latest-gpu-intel-f16-core'
        - build-type: 'sycl_f32'
          platforms: 'linux/amd64'
          tag-latest: 'false'
@@ -261,6 +263,7 @@ jobs:
          image-type: 'core'
          runs-on: 'arc-runner-set'
          makeflags: "--jobs=3 --output-sync=target"
+         latest-image: 'latest-gpu-intel-f32-core'

  core-image-build:
    uses: ./.github/workflows/image_build.yml
@@ -339,6 +342,7 @@ jobs:
          base-image: "ubuntu:22.04"
          makeflags: "--jobs=4 --output-sync=target"
          skip-drivers: 'false'
+         latest-image: 'latest-gpu-nvidia-cuda-12-core'
        - build-type: 'cublas'
          cuda-major-version: "12"
          cuda-minor-version: "0"
@@ -351,17 +355,18 @@ jobs:
          base-image: "ubuntu:22.04"
          skip-drivers: 'false'
          makeflags: "--jobs=4 --output-sync=target"
+         latest-image: 'latest-gpu-nvidia-cuda-12-core'
        - build-type: 'vulkan'
          platforms: 'linux/amd64'
          tag-latest: 'false'
          tag-suffix: '-vulkan-ffmpeg-core'
          latest-image: 'latest-vulkan-ffmpeg-core'
          ffmpeg: 'true'
          image-type: 'core'
          runs-on: 'arc-runner-set'
          base-image: "ubuntu:22.04"
          skip-drivers: 'false'
          makeflags: "--jobs=4 --output-sync=target"
          latest-image: 'latest-gpu-vulkan-core'

  gh-runner:
    uses: ./.github/workflows/image_build.yml
    with:
.github/workflows/notify-models.yaml (vendored, 6 changes)

@@ -8,7 +8,7 @@ jobs:
   notify-discord:
     if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'area/ai-model')) }}
     env:
-      MODEL_NAME: hermes-2-theta-llama-3-8b
+      MODEL_NAME: gemma-3-12b-it
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
@@ -16,7 +16,7 @@ jobs:
           fetch-depth: 0 # needed to checkout all branches for this Action to work
       - uses: mudler/localai-github-action@v1
         with:
-          model: 'hermes-2-theta-llama-3-8b' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
+          model: 'gemma-3-12b-it' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
       # Check the PR diff using the current branch and the base branch of the PR
       - uses: GrantBirki/git-diff-action@v2.8.0
         id: git-diff-action
@@ -87,7 +87,7 @@ jobs:
   notify-twitter:
     if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'area/ai-model')) }}
     env:
-      MODEL_NAME: hermes-2-theta-llama-3-8b
+      MODEL_NAME: gemma-3-12b-it
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
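The notify workflows above exercise the pinned model (now gemma-3-12b-it) through a local LocalAI instance before posting a summary. A minimal sketch of the same kind of request against the OpenAI-compatible chat endpoint, assuming a LocalAI server on localhost:8080 with that model already installed (address and installed model are assumptions, not taken from this diff):

```python
# Minimal sketch: query a local LocalAI instance via its OpenAI-compatible API.
# Assumes LocalAI listens on localhost:8080 and "gemma-3-12b-it" is installed.
import json
import urllib.request

payload = {
    "model": "gemma-3-12b-it",
    "messages": [{"role": "user", "content": "Summarize this release in one sentence."}],
    "temperature": 0.1,
}
req = urllib.request.Request(
    "http://localhost:8080/v1/chat/completions",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.load(resp)["choices"][0]["message"]["content"])
```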
.github/workflows/notify-releases.yaml (vendored, 4 changes)

@@ -14,7 +14,7 @@ jobs:
     steps:
       - uses: mudler/localai-github-action@v1
         with:
-          model: 'hermes-2-theta-llama-3-8b' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
+          model: 'gemma-3-12b-it' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
       - name: Summarize
         id: summarize
         run: |
@@ -60,4 +60,4 @@ jobs:
           DISCORD_AVATAR: "https://avatars.githubusercontent.com/u/139863280?v=4"
         uses: Ilshidur/action-discord@master
         with:
-          args: ${{ steps.summarize.outputs.message }}
+          args: ${{ steps.summarize.outputs.message }}
@@ -15,7 +15,7 @@ ARG TARGETARCH
 ARG TARGETVARIANT

 ENV DEBIAN_FRONTEND=noninteractive
-ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,transformers:/build/backend/python/transformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,faster-whisper:/build/backend/python/faster-whisper/run.sh,kokoro:/build/backend/python/kokoro/run.sh,vllm:/build/backend/python/vllm/run.sh,exllama2:/build/backend/python/exllama2/run.sh"
+ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,transformers:/build/backend/python/transformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,faster-whisper:/build/backend/python/faster-whisper/run.sh,kokoro:/build/backend/python/kokoro/run.sh,vllm:/build/backend/python/vllm/run.sh,exllama2:/build/backend/python/exllama2/run.sh"

 RUN apt-get update && \
     apt-get install -y --no-install-recommends \
@@ -431,9 +431,6 @@ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "kokoro" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
 RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vllm" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
     make -C backend/python/vllm \
     ; fi && \
-    if [[ ( "${EXTRA_BACKENDS}" =~ "autogptq" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
-    make -C backend/python/autogptq \
-    ; fi && \
     if [[ ( "${EXTRA_BACKENDS}" =~ "bark" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
     make -C backend/python/bark \
     ; fi && \
Makefile (15 changes)

@@ -6,7 +6,7 @@ BINARY_NAME=local-ai
 DETECT_LIBS?=true

 # llama.cpp versions
-CPPLLAMA_VERSION?=d6d2c2ab8c8865784ba9fef37f2b2de3f2134d33
+CPPLLAMA_VERSION?=6408210082cc0a61b992b487be7e2ff2efbb9e36

 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
@@ -505,18 +505,10 @@ protogen-go-clean:
 	$(RM) bin/*

 .PHONY: protogen-python
-protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama2-protogen rerankers-protogen transformers-protogen kokoro-protogen vllm-protogen faster-whisper-protogen
+protogen-python: bark-protogen coqui-protogen diffusers-protogen exllama2-protogen rerankers-protogen transformers-protogen kokoro-protogen vllm-protogen faster-whisper-protogen

 .PHONY: protogen-python-clean
-protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama2-protogen-clean rerankers-protogen-clean transformers-protogen-clean kokoro-protogen-clean vllm-protogen-clean faster-whisper-protogen-clean
-
-.PHONY: autogptq-protogen
-autogptq-protogen:
-	$(MAKE) -C backend/python/autogptq protogen
-
-.PHONY: autogptq-protogen-clean
-autogptq-protogen-clean:
-	$(MAKE) -C backend/python/autogptq protogen-clean
+protogen-python-clean: bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama2-protogen-clean rerankers-protogen-clean transformers-protogen-clean kokoro-protogen-clean vllm-protogen-clean faster-whisper-protogen-clean

 .PHONY: bark-protogen
 bark-protogen:
@@ -593,7 +585,6 @@ vllm-protogen-clean:
 ## GRPC
 # Note: it is duplicated in the Dockerfile
 prepare-extra-conda-environments: protogen-python
-	$(MAKE) -C backend/python/autogptq
 	$(MAKE) -C backend/python/bark
 	$(MAKE) -C backend/python/coqui
 	$(MAKE) -C backend/python/diffusers
@@ -190,11 +190,7 @@ message ModelOptions {
  int32 NGQA = 20;
  string ModelFile = 21;

  // AutoGPTQ
  string Device = 22;
  bool UseTriton = 23;
  string ModelBaseName = 24;
  bool UseFastTokenizer = 25;

  // Diffusers
  string PipelineType = 26;
@@ -217,6 +217,7 @@ struct llama_client_slot

     bool infill = false;
     bool embedding = false;
+    bool reranker = false;
     bool has_next_token = true;
     bool truncated = false;
     bool stopped_eos = false;
@@ -535,6 +536,12 @@ struct llama_server_context
             return false;
         }

+        // Enable reranking if embeddings are enabled - moved after context initialization
+        if (params.embedding) {
+            params.reranking = true;
+            LOG_INFO("Reranking enabled (embeddings are enabled)", {});
+        }
+
         if (multimodal) {
             const int n_embd_clip = clip_n_mmproj_embd(clp_ctx);
             const int n_embd_llm = llama_model_n_embd(model);
@@ -1413,7 +1420,59 @@ struct llama_server_context
         queue_results.send(res);
     }

-    void request_completion(int task_id, json data, bool infill, bool embedding, int multitask_id)
+    void send_rerank(llama_client_slot &slot, const llama_batch & batch)
+    {
+        task_result res;
+        res.id = slot.task_id;
+        res.multitask_id = slot.multitask_id;
+        res.error = false;
+        res.stop = true;
+
+        float score = -1e6f; // Default score if we fail to get embeddings
+
+        if (!params.reranking)
+        {
+            LOG_WARNING("reranking disabled", {
+                {"params.reranking", params.reranking},
+            });
+        }
+        else if (ctx == nullptr)
+        {
+            LOG_ERR("context is null, cannot perform reranking");
+            res.error = true;
+        }
+        else
+        {
+            for (int i = 0; i < batch.n_tokens; ++i) {
+                if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
+                    continue;
+                }
+
+                const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
+                if (embd == NULL) {
+                    embd = llama_get_embeddings_ith(ctx, i);
+                }
+
+                if (embd == NULL) {
+                    LOG("failed to get embeddings");
+                    continue;
+                }
+
+                score = embd[0];
+            }
+        }
+
+        // Format result as JSON similar to the embedding function
+        res.result_json = json
+        {
+            {"score", score},
+            {"tokens", slot.num_prompt_tokens}
+        };
+
+        queue_results.send(res);
+    }
+
+    void request_completion(int task_id, json data, bool infill, bool embedding, bool rerank, int multitask_id)
     {
         task_server task;
         task.id = task_id;
@@ -1421,6 +1480,7 @@ struct llama_server_context
         task.data = std::move(data);
         task.infill_mode = infill;
         task.embedding_mode = embedding;
+        task.reranking_mode = rerank;
         task.type = TASK_TYPE_COMPLETION;
         task.multitask_id = multitask_id;

@@ -1552,7 +1612,7 @@ struct llama_server_context
             subtask_data["prompt"] = subtask_data["prompt"][i];

             // subtasks inherit everything else (infill mode, embedding mode, etc.)
-            request_completion(subtask_ids[i], subtask_data, multiprompt_task.infill_mode, multiprompt_task.embedding_mode, multitask_id);
+            request_completion(subtask_ids[i], subtask_data, multiprompt_task.infill_mode, multiprompt_task.embedding_mode, multiprompt_task.reranking_mode, multitask_id);
         }
     }
@@ -1591,6 +1651,7 @@ struct llama_server_context

        slot->infill = task.infill_mode;
        slot->embedding = task.embedding_mode;
+       slot->reranker = task.reranking_mode;
        slot->task_id = task.id;
        slot->multitask_id = task.multitask_id;

@@ -2034,6 +2095,14 @@ struct llama_server_context
                continue;
            }

+           if (slot.reranker)
+           {
+               send_rerank(slot, batch_view);
+               slot.release();
+               slot.i_batch = -1;
+               continue;
+           }
+
            completion_token_output result;
            const llama_token id = common_sampler_sample(slot.ctx_sampling, ctx, slot.i_batch - i);

@@ -2489,7 +2558,7 @@ public:
        json data = parse_options(true, request, llama);
        const int task_id = llama.queue_tasks.get_new_id();
        llama.queue_results.add_waiting_task_id(task_id);
-       llama.request_completion(task_id, data, false, false, -1);
+       llama.request_completion(task_id, data, false, false, false, -1);
        while (true)
        {
            task_result result = llama.queue_results.recv(task_id);
@@ -2543,7 +2612,7 @@ public:
        json data = parse_options(false, request, llama);
        const int task_id = llama.queue_tasks.get_new_id();
        llama.queue_results.add_waiting_task_id(task_id);
-       llama.request_completion(task_id, data, false, false, -1);
+       llama.request_completion(task_id, data, false, false, false, -1);
        std::string completion_text;
        task_result result = llama.queue_results.recv(task_id);
        if (!result.error && result.stop) {
@@ -2580,7 +2649,7 @@ public:
        json data = parse_options(false, request, llama);
        const int task_id = llama.queue_tasks.get_new_id();
        llama.queue_results.add_waiting_task_id(task_id);
-       llama.request_completion(task_id, { {"prompt", data["embeddings"]}, { "n_predict", 0}, {"image_data", ""} }, false, true, -1);
+       llama.request_completion(task_id, { {"prompt", data["embeddings"]}, { "n_predict", 0}, {"image_data", ""} }, false, true, false, -1);
        // get the result
        task_result result = llama.queue_results.recv(task_id);
        //std::cout << "Embedding result JSON" << result.result_json.dump() << std::endl;
@@ -2612,6 +2681,46 @@ public:
        return grpc::Status::OK;
    }

+   grpc::Status Rerank(ServerContext* context, const backend::RerankRequest* request, backend::RerankResult* rerankResult) {
+       // Create a JSON object with the query and documents
+       json data = {
+           {"prompt", request->query()},
+           {"documents", request->documents()},
+           {"top_n", request->top_n()}
+       };
+
+       // Generate a new task ID
+       const int task_id = llama.queue_tasks.get_new_id();
+       llama.queue_results.add_waiting_task_id(task_id);
+
+       // Queue the task with reranking mode enabled
+       llama.request_completion(task_id, data, false, false, true, -1);
+
+       // Get the result
+       task_result result = llama.queue_results.recv(task_id);
+       llama.queue_results.remove_waiting_task_id(task_id);
+
+       if (!result.error && result.stop) {
+           // Set usage information
+           backend::Usage* usage = rerankResult->mutable_usage();
+           usage->set_total_tokens(result.result_json.value("tokens", 0));
+           usage->set_prompt_tokens(result.result_json.value("tokens", 0));
+
+           // Get the score from the result
+           float score = result.result_json.value("score", 0.0f);
+
+           // Create document results for each input document
+           for (int i = 0; i < request->documents_size(); i++) {
+               backend::DocumentResult* doc_result = rerankResult->add_results();
+               doc_result->set_index(i);
+               doc_result->set_text(request->documents(i));
+               doc_result->set_relevance_score(score);
+           }
+       }
+
+       return grpc::Status::OK;
+   }
+
    grpc::Status GetMetrics(ServerContext* context, const backend::MetricsRequest* request, backend::MetricsResponse* response) {
        llama_client_slot* active_slot = llama.get_active_slot();

@@ -2644,7 +2753,9 @@ void RunServer(const std::string& server_address) {
    ServerBuilder builder;
    builder.AddListeningPort(server_address, grpc::InsecureServerCredentials());
    builder.RegisterService(&service);

+   builder.SetMaxMessageSize(50 * 1024 * 1024); // 50MB
+   builder.SetMaxSendMessageSize(50 * 1024 * 1024); // 50MB
+   builder.SetMaxReceiveMessageSize(50 * 1024 * 1024); // 50MB
    std::unique_ptr<Server> server(builder.BuildAndStart());
    std::cout << "Server listening on " << server_address << std::endl;
    server->Wait();
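The new Rerank RPC above is what LocalAI's reranking API drives for the llama.cpp backend. A minimal client-side sketch, assuming the server exposes a Jina-style /v1/rerank route on localhost:8080 and that a reranking-capable model is installed under the name used below (the route, port, and model name are assumptions for illustration, not taken from this diff):

```python
# Minimal sketch: call a LocalAI rerank endpoint from a client.
# Assumptions: server on localhost:8080, a Jina-style /v1/rerank route,
# and a reranking-capable model installed under the (hypothetical) name below.
import json
import urllib.request

payload = {
    "model": "jina-reranker-v1-base-en",
    "query": "Organic skincare products for sensitive skin",
    "documents": [
        "Eco-friendly kitchenware for modern homes",
        "Natural organic skincare range for sensitive skin",
    ],
    "top_n": 2,
}
req = urllib.request.Request(
    "http://localhost:8080/v1/rerank",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    for r in json.load(resp).get("results", []):
        print(r.get("index"), r.get("relevance_score"))
```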
backend/cpp/llama/utils.hpp (vendored, 1 change)

@@ -61,6 +61,7 @@ struct task_server {
    json data;
    bool infill_mode = false;
    bool embedding_mode = false;
+   bool reranking_mode = false;
    int multitask_id = -1;
};
@@ -1,17 +0,0 @@
.PHONY: autogptq
autogptq: protogen
	bash install.sh

.PHONY: protogen
protogen: backend_pb2_grpc.py backend_pb2.py

.PHONY: protogen-clean
protogen-clean:
	$(RM) backend_pb2_grpc.py backend_pb2.py

backend_pb2_grpc.py backend_pb2.py:
	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto

.PHONY: clean
clean: protogen-clean
	rm -rf venv __pycache__
@@ -1,5 +0,0 @@
# Creating a separate environment for the autogptq project

```
make autogptq
```
@@ -1,153 +0,0 @@
#!/usr/bin/env python3
from concurrent import futures
import argparse
import signal
import sys
import os
import time
import base64

import grpc
import backend_pb2
import backend_pb2_grpc

from auto_gptq import AutoGPTQForCausalLM
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import TextGenerationPipeline

_ONE_DAY_IN_SECONDS = 60 * 60 * 24

# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))

# Implement the BackendServicer class with the service methods
class BackendServicer(backend_pb2_grpc.BackendServicer):
    def Health(self, request, context):
        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
    def LoadModel(self, request, context):
        try:
            device = "cuda:0"
            if request.Device != "":
                device = request.Device

            # support loading local model files
            model_path = os.path.join(os.environ.get('MODELS_PATH', './'), request.Model)
            tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, trust_remote_code=request.TrustRemoteCode)

            # support model `Qwen/Qwen-VL-Chat-Int4`
            if "qwen-vl" in request.Model.lower():
                self.model_name = "Qwen-VL-Chat"
                model = AutoModelForCausalLM.from_pretrained(model_path,
                                                             trust_remote_code=request.TrustRemoteCode,
                                                             device_map="auto").eval()
            else:
                model = AutoGPTQForCausalLM.from_quantized(model_path,
                                                           model_basename=request.ModelBaseName,
                                                           use_safetensors=True,
                                                           trust_remote_code=request.TrustRemoteCode,
                                                           device=device,
                                                           use_triton=request.UseTriton,
                                                           quantize_config=None)

            self.model = model
            self.tokenizer = tokenizer
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
        return backend_pb2.Result(message="Model loaded successfully", success=True)

    def Predict(self, request, context):
        penalty = 1.0
        if request.Penalty != 0.0:
            penalty = request.Penalty
        tokens = 512
        if request.Tokens != 0:
            tokens = request.Tokens
        top_p = 0.95
        if request.TopP != 0.0:
            top_p = request.TopP

        prompt_images = self.recompile_vl_prompt(request)
        compiled_prompt = prompt_images[0]
        print(f"Prompt: {compiled_prompt}", file=sys.stderr)

        # Implement Predict RPC
        pipeline = TextGenerationPipeline(
            model=self.model,
            tokenizer=self.tokenizer,
            max_new_tokens=tokens,
            temperature=request.Temperature,
            top_p=top_p,
            repetition_penalty=penalty,
        )
        t = pipeline(compiled_prompt)[0]["generated_text"]
        print(f"generated_text: {t}", file=sys.stderr)

        if compiled_prompt in t:
            t = t.replace(compiled_prompt, "")
        # house keeping. Remove the image files from /tmp folder
        for img_path in prompt_images[1]:
            try:
                os.remove(img_path)
            except Exception as e:
                print(f"Error removing image file: {img_path}, {e}", file=sys.stderr)

        return backend_pb2.Result(message=bytes(t, encoding='utf-8'))

    def PredictStream(self, request, context):
        # Implement PredictStream RPC
        #for reply in some_data_generator():
        #    yield reply
        # Not implemented yet
        return self.Predict(request, context)

    def recompile_vl_prompt(self, request):
        prompt = request.Prompt
        image_paths = []

        if "qwen-vl" in self.model_name.lower():
            # request.Images is an array which contains base64 encoded images. Iterate the request.Images array, decode and save each image to /tmp folder with a random filename.
            # Then, save the image file paths to an array "image_paths".
            # read "request.Prompt", replace "[img-%d]" with the image file paths in the order they appear in "image_paths". Save the new prompt to "prompt".
            for i, img in enumerate(request.Images):
                timestamp = str(int(time.time() * 1000))  # Generate timestamp
                img_path = f"/tmp/vl-{timestamp}.jpg"  # Use timestamp in filename
                with open(img_path, "wb") as f:
                    f.write(base64.b64decode(img))
                image_paths.append(img_path)
                prompt = prompt.replace(f"[img-{i}]", "<img>" + img_path + "</img>,")
        else:
            prompt = request.Prompt
        return (prompt, image_paths)

def serve(address):
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
    print("Server started. Listening on: " + address, file=sys.stderr)

    # Define the signal handler function
    def signal_handler(sig, frame):
        print("Received termination signal. Shutting down...")
        server.stop(0)
        sys.exit(0)

    # Set the signal handlers for SIGINT and SIGTERM
    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    try:
        while True:
            time.sleep(_ONE_DAY_IN_SECONDS)
    except KeyboardInterrupt:
        server.stop(0)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run the gRPC server.")
    parser.add_argument(
        "--addr", default="localhost:50051", help="The address to bind the server to."
    )
    args = parser.parse_args()

    serve(args.addr)
@@ -1,14 +0,0 @@
#!/bin/bash
set -e

source $(dirname $0)/../common/libbackend.sh

# This is here because the Intel pip index is broken and returns 200 status codes for every package name, it just doesn't return any package links.
# This makes uv think that the package exists in the Intel pip index, and by default it stops looking at other pip indexes once it finds a match.
# We need uv to continue falling through to the pypi default index to find optimum[openvino] in the pypi index
# the --upgrade actually allows us to *downgrade* torch to the version provided in the Intel pip index
if [ "x${BUILD_PROFILE}" == "xintel" ]; then
    EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
fi

installRequirements
@@ -1,2 +0,0 @@
--extra-index-url https://download.pytorch.org/whl/cu118
torch==2.4.1+cu118
@@ -1 +0,0 @@
torch==2.4.1
@@ -1,2 +0,0 @@
--extra-index-url https://download.pytorch.org/whl/rocm6.0
torch==2.4.1+rocm6.0
@@ -1,6 +0,0 @@
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
intel-extension-for-pytorch==2.3.110+xpu
torch==2.3.1+cxx11.abi
oneccl_bind_pt==2.3.100+xpu
optimum[openvino]
setuptools
@@ -1,6 +0,0 @@
accelerate
auto-gptq==0.7.1
grpcio==1.71.0
protobuf
certifi
transformers
@@ -1,4 +0,0 @@
#!/bin/bash
source $(dirname $0)/../common/libbackend.sh

startBackend $@
@@ -1,6 +0,0 @@
#!/bin/bash
set -e

source $(dirname $0)/../common/libbackend.sh

runUnittests
@@ -61,7 +61,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
         return backend_pb2.Result(success=True)

 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+                         options=[
+                             ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
+                             ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
+                             ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
+                         ])
     backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
     server.add_insecure_port(address)
     server.start()
@@ -86,7 +86,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
         return backend_pb2.Result(success=True)

 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+                         options=[
+                             ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
+                             ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
+                             ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
+                         ])
     backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
     server.add_insecure_port(address)
     server.start()
@@ -522,7 +522,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):


 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+                         options=[
+                             ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
+                             ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
+                             ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
+                         ])
     backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
     server.add_insecure_port(address)
     server.start()
@@ -105,7 +105,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):


 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+                         options=[
+                             ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
+                             ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
+                             ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
+                         ])
     backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
     server.add_insecure_port(address)
     server.start()
@@ -62,7 +62,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
         return backend_pb2.TranscriptResult(segments=resultSegments, text=text)

 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+                         options=[
+                             ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
+                             ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
+                             ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
+                         ])
     backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
     server.add_insecure_port(address)
     server.start()
@@ -99,7 +99,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
         return backend_pb2.Result(success=True)

 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+                         options=[
+                             ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
+                             ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
+                             ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
+                         ])
     backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
     server.add_insecure_port(address)
     server.start()
@@ -91,7 +91,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
         return backend_pb2.RerankResult(usage=usage, results=results)

 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+                         options=[
+                             ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
+                             ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
+                             ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
+                         ])
     backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
     server.add_insecure_port(address)
     server.start()
@@ -559,7 +559,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):

 async def serve(address):
     # Start asyncio gRPC server
-    server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+                             options=[
+                                 ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
+                                 ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
+                                 ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
+                             ])
     # Add the servicer to the server
     backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
     # Bind the server to the address
@@ -320,7 +320,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):

 async def serve(address):
     # Start asyncio gRPC server
-    server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+                             options=[
+                                 ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
+                                 ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
+                                 ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
+                             ])
     # Add the servicer to the server
     backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
     # Bind the server to the address
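These serve() changes raise the gRPC message-size limits on the Python backends to 50MB, mirroring the C++ server change above. A client talking to one of these backends would typically set matching channel options; a minimal sketch, assuming the generated backend_pb2 / backend_pb2_grpc stubs referenced in this diff are on the path, that the stub class is named BackendStub, and that a backend is listening on localhost:50051 (stub class name and address are assumptions):

```python
# Minimal sketch: open a channel to a backend with message-size limits
# matching the 50MB server-side options added in this diff.
# Assumptions: generated backend_pb2 / backend_pb2_grpc modules are importable,
# the stub is BackendStub, Health takes a HealthMessage, and the backend
# listens on localhost:50051.
import grpc
import backend_pb2
import backend_pb2_grpc

MAX_MSG = 50 * 1024 * 1024  # 50MB, same as the server options

channel = grpc.insecure_channel(
    "localhost:50051",
    options=[
        ("grpc.max_send_message_length", MAX_MSG),
        ("grpc.max_receive_message_length", MAX_MSG),
    ],
)
stub = backend_pb2_grpc.BackendStub(channel)
print(stub.Health(backend_pb2.HealthMessage()))
```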
@@ -184,11 +184,6 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
 		MainGPU:     c.MainGPU,
 		Threads:     int32(*c.Threads),
 		TensorSplit: c.TensorSplit,
-		// AutoGPTQ
-		ModelBaseName:    c.AutoGPTQ.ModelBaseName,
-		Device:           c.AutoGPTQ.Device,
-		UseTriton:        c.AutoGPTQ.Triton,
-		UseFastTokenizer: c.AutoGPTQ.UseFastTokenizer,
 		// RWKV
 		Tokenizer: c.Tokenizer,
 	}
@@ -50,9 +50,6 @@ type BackendConfig struct {
 	// LLM configs (GPT4ALL, Llama.cpp, ...)
 	LLMConfig `yaml:",inline"`

-	// AutoGPTQ specifics
-	AutoGPTQ AutoGPTQ `yaml:"autogptq"`
-
 	// Diffusers
 	Diffusers Diffusers `yaml:"diffusers"`
 	Step      int       `yaml:"step"`
@@ -176,14 +173,6 @@ type LimitMMPerPrompt struct {
 	LimitAudioPerPrompt int `yaml:"audio"`
 }

-// AutoGPTQ is a struct that holds the configuration specific to the AutoGPTQ backend
-type AutoGPTQ struct {
-	ModelBaseName    string `yaml:"model_base_name"`
-	Device           string `yaml:"device"`
-	Triton           bool   `yaml:"triton"`
-	UseFastTokenizer bool   `yaml:"use_fast_tokenizer"`
-}
-
 // TemplateConfig is a struct that holds the configuration of the templating system
 type TemplateConfig struct {
 	// Chat is the template used in the chat completion endpoint
@@ -555,7 +544,7 @@ func (c *BackendConfig) GuessUsecases(u BackendConfigUsecases) bool {
 		}
 	}
 	if (u & FLAG_TTS) == FLAG_TTS {
-		ttsBackends := []string{"piper", "transformers-musicgen", "parler-tts"}
+		ttsBackends := []string{"bark-cpp", "parler-tts", "piper", "transformers-musicgen"}
 		if !slices.Contains(ttsBackends, c.Backend) {
 			return false
 		}
@@ -203,18 +203,10 @@ func mergeOpenAIRequestAndBackendConfig(config *config.BackendConfig, input *sch
 		config.Diffusers.ClipSkip = input.ClipSkip
 	}

-	if input.ModelBaseName != "" {
-		config.AutoGPTQ.ModelBaseName = input.ModelBaseName
-	}
-
 	if input.NegativePromptScale != 0 {
 		config.NegativePromptScale = input.NegativePromptScale
 	}

-	if input.UseFastTokenizer {
-		config.UseFastTokenizer = input.UseFastTokenizer
-	}
-
 	if input.NegativePrompt != "" {
 		config.NegativePrompt = input.NegativePrompt
 	}
@@ -115,6 +115,7 @@ async function sendTextToChatGPT(text) {

  const response = await fetch('v1/chat/completions', {
    method: 'POST',
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      model: getModel(),
      messages: conversationHistory
@@ -202,7 +202,6 @@ type OpenAIRequest struct {

	Backend string `json:"backend" yaml:"backend"`

	// AutoGPTQ
	ModelBaseName string `json:"model_base_name" yaml:"model_base_name"`
}
@@ -41,8 +41,6 @@ type PredictionOptions struct {
 	RopeFreqBase        float32 `json:"rope_freq_base" yaml:"rope_freq_base"`
 	RopeFreqScale       float32 `json:"rope_freq_scale" yaml:"rope_freq_scale"`
 	NegativePromptScale float32 `json:"negative_prompt_scale" yaml:"negative_prompt_scale"`
-	// AutoGPTQ
-	UseFastTokenizer bool `json:"use_fast_tokenizer" yaml:"use_fast_tokenizer"`

 	// Diffusers
 	ClipSkip int `json:"clip_skip" yaml:"clip_skip"`
@@ -268,14 +268,6 @@ yarn_ext_factor: 0
 yarn_attn_factor: 0
 yarn_beta_fast: 0
 yarn_beta_slow: 0

-# AutoGPT-Q settings, for configurations specific to GPT models.
-autogptq:
-  model_base_name: "" # Base name of the model.
-  device: "" # Device to run the model on.
-  triton: false # Whether to use Triton Inference Server.
-  use_fast_tokenizer: false # Whether to use a fast tokenizer for quicker processing.
-
 # configuration for diffusers model
 diffusers:
   cuda: false # Whether to use CUDA
@@ -147,7 +147,6 @@ The devices in the following list have been tested with `hipblas` images running
 | diffusers | yes | Radeon VII (gfx906) |
 | piper | yes | Radeon VII (gfx906) |
 | whisper | no | none |
-| autogptq | no | none |
 | bark | no | none |
 | coqui | no | none |
 | transformers | no | none |
@@ -74,49 +74,9 @@ curl http://localhost:8080/v1/models

 ## Backends

-### AutoGPTQ
-
-[AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) is an easy-to-use LLMs quantization package with user-friendly apis, based on GPTQ algorithm.
-
-#### Prerequisites
-
-This is an extra backend - in the container images is already available and there is nothing to do for the setup.
-
-If you are building LocalAI locally, you need to install [AutoGPTQ manually](https://github.com/PanQiWei/AutoGPTQ#quick-installation).
-
-#### Model setup
-
-The models are automatically downloaded from `huggingface` if not present the first time. It is possible to define models via `YAML` config file, or just by querying the endpoint with the `huggingface` repository model name. For example, create a `YAML` config file in `models/`:
-
-```
-name: orca
-backend: autogptq
-model_base_name: "orca_mini_v2_13b-GPTQ-4bit-128g.no-act.order"
-parameters:
-  model: "TheBloke/orca_mini_v2_13b-GPTQ"
-# ...
-```
-
-Test with:
-
-```bash
-curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
-   "model": "orca",
-   "messages": [{"role": "user", "content": "How are you?"}],
-   "temperature": 0.1
-}'
-```
 ### RWKV

-A full example on how to run a rwkv model is in the [examples](https://github.com/go-skynet/LocalAI/tree/master/examples/rwkv).
-
-Note: rwkv models needs to specify the backend `rwkv` in the YAML config files and have an associated tokenizer along that needs to be provided with it:
-
-```
-36464540 -rw-r--r-- 1 mudler mudler 1.2G May 3 10:51 rwkv_small
-36464543 -rw-r--r-- 1 mudler mudler 2.4M May 3 10:51 rwkv_small.tokenizer.json
-```
+RWKV support is available through llama.cpp (see below)

 ### llama.cpp
@@ -14,8 +14,6 @@ icon = "rocket_launch"

 If you are exposing LocalAI remotely, make sure you protect the API endpoints adequately with a mechanism which allows to protect from the incoming traffic or alternatively, run LocalAI with `API_KEY` to gate the access with an API key. The API key guarantees a total access to the features (there is no role separation), and it is to be considered as likely as an admin role.

 To access the WebUI with an API_KEY, browser extensions such as [Requestly](https://requestly.com/) can be used (see also https://github.com/mudler/LocalAI/issues/2227#issuecomment-2093333752). See also [API flags]({{% relref "docs/advanced/advanced-usage#api-flags" %}}) for the flags / options available when starting LocalAI.

 {{% /alert %}}
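The note above recommends gating remote instances with `API_KEY`. A minimal sketch of an authenticated request, assuming LocalAI was started with API_KEY=my-secret-key and is reachable on localhost:8080 (both values are placeholders, not taken from this diff):

```python
# Minimal sketch: call a LocalAI instance gated by API_KEY.
# "my-secret-key" and the address are placeholders.
import json
import urllib.request

req = urllib.request.Request(
    "http://localhost:8080/v1/models",
    headers={"Authorization": "Bearer my-secret-key"},
)
with urllib.request.urlopen(req) as resp:
    print(json.load(resp))
```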
 ## Quickstart
@@ -1,3 +1,3 @@
 {
-  "version": "v2.27.0"
+  "version": "v2.28.0"
 }
docs/static/install.sh (vendored, 68 changes)

@@ -6,6 +6,7 @@
 #   curl ... | ENV_VAR=... sh -
 #       or
 #   ENV_VAR=... ./install.sh
+#   To uninstall: ./install.sh --uninstall

 set -e
 set -o noglob
@@ -57,6 +58,59 @@ require() {
     echo $MISSING
 }

+# Function to uninstall LocalAI
+uninstall_localai() {
+    info "Starting LocalAI uninstallation..."
+
+    # Stop and remove Docker container if it exists
+    if available docker && $SUDO docker ps -a --format '{{.Names}}' | grep -q local-ai; then
+        info "Stopping and removing LocalAI Docker container..."
+        $SUDO docker stop local-ai || true
+        $SUDO docker rm local-ai || true
+        $SUDO docker volume rm local-ai-data || true
+    fi
+
+    # Remove systemd service if it exists
+    if [ -f "/etc/systemd/system/local-ai.service" ]; then
+        info "Removing systemd service..."
+        $SUDO systemctl stop local-ai || true
+        $SUDO systemctl disable local-ai || true
+        $SUDO rm -f /etc/systemd/system/local-ai.service
+        $SUDO systemctl daemon-reload
+    fi
+
+    # Remove environment file
+    if [ -f "/etc/localai.env" ]; then
+        info "Removing environment file..."
+        $SUDO rm -f /etc/localai.env
+    fi
+
+    # Remove binary
+    for BINDIR in /usr/local/bin /usr/bin /bin; do
+        if [ -f "$BINDIR/local-ai" ]; then
+            info "Removing binary from $BINDIR..."
+            $SUDO rm -f "$BINDIR/local-ai"
+        fi
+    done
+
+    # Remove models directory
+    if [ -d "/usr/share/local-ai" ]; then
+        info "Removing LocalAI data directory..."
+        $SUDO rm -rf /usr/share/local-ai
+    fi
+
+    # Remove local-ai user if it exists
+    if id local-ai >/dev/null 2>&1; then
+        info "Removing local-ai user..."
+        $SUDO userdel -r local-ai || true
+    fi
+
+    info "LocalAI has been successfully uninstalled."
+    exit 0
+}
+
 ## VARIABLES

 # DOCKER_INSTALL - set to "true" to install Docker images
@@ -516,10 +570,10 @@ install_docker() {
 install_binary_darwin() {
     [ "$(uname -s)" = "Darwin" ] || fatal 'This script is intended to run on macOS only.'

-    info "Downloading local-ai..."
+    info "Downloading LocalAI ${VERSION}..."
     curl --fail --show-error --location --progress-bar -o $TEMP_DIR/local-ai "https://github.com/mudler/LocalAI/releases/download/${VERSION}/local-ai-Darwin-${ARCH}"

-    info "Installing local-ai..."
+    info "Installing to /usr/local/bin/local-ai"
     install -o0 -g0 -m755 $TEMP_DIR/local-ai /usr/local/bin/local-ai

     install_success
@@ -548,14 +602,14 @@ install_binary() {
         exit 1
     fi

-    info "Downloading local-ai..."
+    info "Downloading LocalAI ${VERSION}..."
     curl --fail --location --progress-bar -o $TEMP_DIR/local-ai "https://github.com/mudler/LocalAI/releases/download/${VERSION}/local-ai-Linux-${ARCH}"

     for BINDIR in /usr/local/bin /usr/bin /bin; do
         echo $PATH | grep -q $BINDIR && break || continue
     done

-    info "Installing local-ai to $BINDIR..."
+    info "Installing LocalAI as local-ai to $BINDIR..."
     $SUDO install -o0 -g0 -m755 -d $BINDIR
     $SUDO install -o0 -g0 -m755 $TEMP_DIR/local-ai $BINDIR/local-ai
@@ -617,6 +671,10 @@ detect_start_command() {
     fi
 }

+# Check if uninstall flag is provided
+if [ "$1" = "--uninstall" ]; then
+    uninstall_localai
+fi
+
 detect_start_command
@@ -664,10 +722,12 @@ for PACKAGE_MANAGER in dnf yum apt-get; do
 done

 if [ "$DOCKER_INSTALL" = "true" ]; then
+    info "Installing LocalAI from container images"
     if [ "$HAS_CUDA" = true ]; then
         install_container_toolkit
     fi
     install_docker
 else
+    info "Installing LocalAI from binaries"
     install_binary
 fi
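After the installer (or its new `--uninstall` handling) above finishes, a quick way to confirm the service is up is to poll the HTTP API. A minimal sketch, assuming the default port 8080 and a /readyz health route (the route name is an assumption, not taken from this diff):

```python
# Minimal sketch: poll a freshly installed LocalAI instance until it answers.
# Assumes the default port 8080; the /readyz route is an assumption.
import time
import urllib.error
import urllib.request

for _ in range(30):
    try:
        with urllib.request.urlopen("http://localhost:8080/readyz", timeout=2) as resp:
            print("LocalAI is up:", resp.status)
            break
    except (urllib.error.URLError, OSError):
        time.sleep(2)
else:
    print("LocalAI did not become ready in time")
```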
@@ -487,6 +487,50 @@
      - filename: Omega-LN-SFT-Q4_K_M.gguf
        sha256: 33fb1c61085f9b18074e320ac784e6dbc8a98fe20705f92773e055471fd3cb0f
        uri: huggingface://Delta-Vector/Daichi-12B-GGUF/Omega-LN-SFT-Q4_K_M.gguf
  - !!merge <<: *gemma3
    name: "readyart_amoral-fallen-omega-gemma3-12b"
    icon: https://i.imghippo.com/files/EBq6162wlk.webp
    urls:
      - https://huggingface.co/ReadyArt/Amoral-Fallen-Omega-Gemma3-12B
      - https://huggingface.co/bartowski/ReadyArt_Amoral-Fallen-Omega-Gemma3-12B-GGUF
    description: |
      This model represents the forbidden merger of:

      🧬 The-Omega-Directive-Gemma3-12B-v1.0 - Unprecedented coherent depravity, well-rounded ERP, low repetition even at maximum length
      ⚡ Fallen-Gemma3-12B-v1 - Fallen Gemma3 12B v1 is an evil tune
      💎 amoral-gemma3-12B-v2 - Improved natural intelligence and less refusals
      This model maintains Forgotten-Safeword's complete lack of alignment:

      🚫 No moral or ethical constraints on generated content
      🔞 Will enthusiastically engage with any NSFW scenario
      💀 May generate content that requires industrial-grade brain bleach
      ⚖️ Perfectly balanced... as all things should be
      🔥 Maintains signature intensity with improved narrative flow
      📖 Handles multi-character scenarios with improved consistency
      🧠 Excels at long-form storytelling without losing track of plot threads
      ⚡ Noticeably better at following complex instructions than previous versions
      🎭 Responds to subtle prompt nuances like a mind reader
    overrides:
      parameters:
        model: ReadyArt_Amoral-Fallen-Omega-Gemma3-12B-Q4_K_M.gguf
    files:
      - filename: ReadyArt_Amoral-Fallen-Omega-Gemma3-12B-Q4_K_M.gguf
        sha256: a2a2e76be2beb445d3a569ba03661860cd4aef9a4aa3d57aed319e3d1bddc820
        uri: huggingface://bartowski/ReadyArt_Amoral-Fallen-Omega-Gemma3-12B-GGUF/ReadyArt_Amoral-Fallen-Omega-Gemma3-12B-Q4_K_M.gguf
  - !!merge <<: *gemma3
    name: "google-gemma-3-27b-it-qat-q4_0-small"
    urls:
      - https://huggingface.co/google/gemma-3-27b-it-qat-q4_0-gguf
      - https://huggingface.co/stduhpf/google-gemma-3-27b-it-qat-q4_0-gguf-small
    description: |
      This is a requantized version of https://huggingface.co/google/gemma-3-27b-it-qat-q4_0-gguf. The official QAT weights released by google use fp16 (instead of Q6_K) for the embeddings table, which makes this model take a significant extra amount of memory (and storage) compared to what Q4_0 quants are supposed to take. Requantizing with llama.cpp achieves a very similar result. Note that this model ends up smaller than the Q4_0 from Bartowski. This is because llama.cpp sets some tensors to Q4_1 when quantizing models to Q4_0 with imatrix, but this is a static quant. The perplexity score for this one is even lower with this model compared to the original model by Google, but the results are within margin of error, so it's probably just luck. I also fixed the control token metadata, which was slightly degrading the performance of the model in instruct mode.
    overrides:
      parameters:
        model: gemma-3-27b-it-q4_0_s.gguf
    files:
      - filename: gemma-3-27b-it-q4_0_s.gguf
        sha256: cc4e41e3df2bf7fd3827bea7e98f28cecc59d7bd1c6b7b4fa10fc52a5659f3eb
        uri: huggingface://stduhpf/google-gemma-3-27b-it-qat-q4_0-gguf-small/gemma-3-27b-it-q4_0_s.gguf
  - &llama4
    url: "github:mudler/LocalAI/gallery/llama3.1-instruct.yaml@master"
    icon: https://avatars.githubusercontent.com/u/153379578
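Each gallery entry above pins its GGUF artifact with a sha256 checksum. A minimal sketch for verifying a manually downloaded file against one of those checksums (the local path below is a placeholder; the expected hash is the one listed for gemma-3-27b-it-q4_0_s.gguf):

```python
# Minimal sketch: verify a downloaded GGUF against the sha256 listed in the gallery.
# The file path is a placeholder for wherever the model was saved.
import hashlib

expected = "cc4e41e3df2bf7fd3827bea7e98f28cecc59d7bd1c6b7b4fa10fc52a5659f3eb"
path = "models/gemma-3-27b-it-q4_0_s.gguf"

h = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)

print("OK" if h.hexdigest() == expected else "checksum mismatch")
```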
@@ -1981,6 +2025,34 @@
      - filename: salamandra-7b-instruct.Q4_K_M-f32.gguf
        sha256: bac8e8c1d1d9d53cbdb148b8ff9ad378ddb392429207099e85b5aae3a43bff3d
        uri: huggingface://cstr/salamandra-7b-instruct-GGUF/salamandra-7b-instruct.Q4_K_M-f32.gguf
  - !!merge <<: *granite3
    name: "ibm-granite_granite-3.3-8b-instruct"
    urls:
      - https://huggingface.co/ibm-granite/granite-3.3-2b-instruct
      - https://huggingface.co/bartowski/ibm-granite_granite-3.3-8b-instruct-GGUF
    description: |
      Granite-3.3-2B-Instruct is a 2-billion parameter 128K context length language model fine-tuned for improved reasoning and instruction-following capabilities. Built on top of Granite-3.3-2B-Base, the model delivers significant gains on benchmarks for measuring generic performance including AlpacaEval-2.0 and Arena-Hard, and improvements in mathematics, coding, and instruction following. It supports structured reasoning through <think></think> and <response></response> tags, providing clear separation between internal thoughts and final outputs. The model has been trained on a carefully balanced combination of permissively licensed data and curated synthetic tasks.
    overrides:
      parameters:
        model: ibm-granite_granite-3.3-8b-instruct-Q4_K_M.gguf
    files:
      - filename: ibm-granite_granite-3.3-8b-instruct-Q4_K_M.gguf
        sha256: 758fb00abcec89df5cf02932165daf72f0d0b74db5019dbe9f2b3defb1e9295e
        uri: huggingface://bartowski/ibm-granite_granite-3.3-8b-instruct-GGUF/ibm-granite_granite-3.3-8b-instruct-Q4_K_M.gguf
  - !!merge <<: *granite3
    name: "ibm-granite_granite-3.3-2b-instruct"
    urls:
      - https://huggingface.co/ibm-granite/granite-3.3-2b-instruct
      - https://huggingface.co/bartowski/ibm-granite_granite-3.3-2b-instruct-GGUF
    description: |
      Granite-3.3-2B-Instruct is a 2-billion parameter 128K context length language model fine-tuned for improved reasoning and instruction-following capabilities. Built on top of Granite-3.3-2B-Base, the model delivers significant gains on benchmarks for measuring generic performance including AlpacaEval-2.0 and Arena-Hard, and improvements in mathematics, coding, and instruction following. It supports structured reasoning through <think></think> and <response></response> tags, providing clear separation between internal thoughts and final outputs. The model has been trained on a carefully balanced combination of permissively licensed data and curated synthetic tasks.
    overrides:
      parameters:
        model: ibm-granite_granite-3.3-2b-instruct-Q4_K_M.gguf
    files:
      - filename: ibm-granite_granite-3.3-2b-instruct-Q4_K_M.gguf
        sha256: 555b91485955bc96eb445b57dd4bbf8809aa7d8cce7c313f4f8bc5b2340896b4
        uri: huggingface://bartowski/ibm-granite_granite-3.3-2b-instruct-GGUF/ibm-granite_granite-3.3-2b-instruct-Q4_K_M.gguf
  - &llama32
    url: "github:mudler/LocalAI/gallery/llama3.2-quantized.yaml@master"
    icon: https://avatars.githubusercontent.com/u/153379578
@@ -2688,6 +2760,20 @@
      - filename: deepcogito_cogito-v1-preview-llama-3B-Q4_K_M.gguf
        sha256: 726a0ef5f818b8d238f2844f3204848bea66fb9c172b8ae0f6dc51b7bc081dd5
        uri: huggingface://bartowski/deepcogito_cogito-v1-preview-llama-3B-GGUF/deepcogito_cogito-v1-preview-llama-3B-Q4_K_M.gguf
  - !!merge <<: *llama32
    name: "menlo_rezero-v0.1-llama-3.2-3b-it-grpo-250404"
    urls:
      - https://huggingface.co/Menlo/ReZero-v0.1-llama-3.2-3b-it-grpo-250404
      - https://huggingface.co/bartowski/Menlo_ReZero-v0.1-llama-3.2-3b-it-grpo-250404-GGUF
    description: |
      ReZero trains a small language model to develop effective search behaviors instead of memorizing static data. It interacts with multiple synthetic search engines, each with unique retrieval mechanisms, to refine queries and persist in searching until it finds exact answers. The project focuses on reinforcement learning, preventing overfitting, and optimizing for efficiency in real-world search applications.
    overrides:
      parameters:
        model: Menlo_ReZero-v0.1-llama-3.2-3b-it-grpo-250404-Q4_K_M.gguf
    files:
      - filename: Menlo_ReZero-v0.1-llama-3.2-3b-it-grpo-250404-Q4_K_M.gguf
        sha256: b9f01bead9e163db9351af036d8d63ef479d7d48a1bb44934ead732a180f371c
        uri: huggingface://bartowski/Menlo_ReZero-v0.1-llama-3.2-3b-it-grpo-250404-GGUF/Menlo_ReZero-v0.1-llama-3.2-3b-it-grpo-250404-Q4_K_M.gguf
  - &qwen25
    name: "qwen2.5-14b-instruct" ## Qwen2.5
    icon: https://avatars.githubusercontent.com/u/141221163
@@ -5956,6 +6042,67 @@
      - filename: m1-32b.Q4_K_M.gguf
        sha256: 1dfa3b6822447aca590d6f2881cf277bd0fbde633a39c5a20b521f4a59145e3f
        uri: huggingface://mradermacher/m1-32b-GGUF/m1-32b.Q4_K_M.gguf
  - !!merge <<: *qwen25
    name: "qwen2.5-14b-instruct-1m"
    urls:
      - https://huggingface.co/Qwen/Qwen2.5-14B-Instruct-1M
      - https://huggingface.co/bartowski/Qwen2.5-14B-Instruct-1M-GGUF
    description: |
      Qwen2.5-1M is the long-context version of the Qwen2.5 series models, supporting a context length of up to 1M tokens. Compared to the Qwen2.5 128K version, Qwen2.5-1M demonstrates significantly improved performance in handling long-context tasks while maintaining its capability in short tasks.

      The model has the following features:

      Type: Causal Language Models
      Training Stage: Pretraining & Post-training
      Architecture: transformers with RoPE, SwiGLU, RMSNorm, and Attention QKV bias
      Number of Parameters: 14.7B
      Number of Paramaters (Non-Embedding): 13.1B
      Number of Layers: 48
      Number of Attention Heads (GQA): 40 for Q and 8 for KV
      Context Length: Full 1,010,000 tokens and generation 8192 tokens
      We recommend deploying with our custom vLLM, which introduces sparse attention and length extrapolation methods to ensure efficiency and accuracy for long-context tasks. For specific guidance, refer to this section.
      You can also use the previous framework that supports Qwen2.5 for inference, but accuracy degradation may occur for sequences exceeding 262,144 tokens.

      For more details, please refer to our blog, GitHub, Technical Report, and Documentation.
    overrides:
      parameters:
        model: Qwen2.5-14B-Instruct-1M-Q4_K_M.gguf
    files:
      - filename: Qwen2.5-14B-Instruct-1M-Q4_K_M.gguf
        sha256: a1a0fa3e2c3f9d63f9202af9172cffbc0b519801dff740fffd39f6a063a731ef
        uri: huggingface://bartowski/Qwen2.5-14B-Instruct-1M-GGUF/Qwen2.5-14B-Instruct-1M-Q4_K_M.gguf
  - !!merge <<: *qwen25
    name: "pictor-1338-qwenp-1.5b"
    icon: https://cdn-uploads.huggingface.co/production/uploads/65bb837dbfb878f46c77de4c/X7zeHYbH5Y5JoRK_ud_Ya.png
    urls:
      - https://huggingface.co/prithivMLmods/Pictor-1338-QwenP-1.5B
      - https://huggingface.co/adriey/Pictor-1338-QwenP-1.5B-Q8_0-GGUF
    description: |
      Pictor-1338-QwenP-1.5B is a code reasoning LLM fine-tuned from Qwen-1.5B using distributed reinforcement learning (RL). This model is designed to enhance coding proficiency, debugging accuracy, and step-by-step reasoning in software development tasks across multiple programming languages.

      Key Features

      Code Reasoning & Explanation
      Trained to analyze, generate, and explain code with a focus on logic, structure, and clarity. Supports functional, object-oriented, and procedural paradigms.

      Reinforcement Learning Fine-Tuning
      Enhanced using distributed RL, improving reward-aligned behavior in tasks like fixing bugs, completing functions, and understanding abstract instructions.

      Multi-Language Support
      Works fluently with Python, JavaScript, C++, and Shell, among others—ideal for general-purpose programming, scripting, and algorithmic tasks.

      Compact and Efficient
      At just 1.5B parameters, it's lightweight enough for edge deployments and developer tools with strong reasoning capability.

      Debugging and Auto-Fix Capabilities
      Built to identify bugs, recommend corrections, and provide context-aware explanations of issues in codebases.
    overrides:
      parameters:
        model: pictor-1338-qwenp-1.5b-q8_0.gguf
    files:
      - filename: pictor-1338-qwenp-1.5b-q8_0.gguf
        sha256: 22d2f5b2322d9a354d8578475a6924c2173a913a1e2fa0ec2655f2f5937f6f26
        uri: huggingface://adriey/Pictor-1338-QwenP-1.5B-Q8_0-GGUF/pictor-1338-qwenp-1.5b-q8_0.gguf
  - &llama31
    url: "github:mudler/LocalAI/gallery/llama3.1-instruct.yaml@master" ## LLama3.1
    icon: https://avatars.githubusercontent.com/u/153379578
@@ -10116,6 +10263,54 @@
      - filename: Trappu_Magnum-Picaro-0.7-v2-12b-Q4_K_M.gguf
        sha256: 989839dd7eab997a70eb8430b9df1138f9b0f35d58299d5007e6555a4a4a7f4c
        uri: huggingface://bartowski/Trappu_Magnum-Picaro-0.7-v2-12b-GGUF/Trappu_Magnum-Picaro-0.7-v2-12b-Q4_K_M.gguf
  - !!merge <<: *mistral03
    icon: https://cdn-uploads.huggingface.co/production/uploads/65f2fd1c25b848bd061b5c2e/69pOPcYiUzKWW1OPzg1-_.png
    name: "thedrummer_rivermind-12b-v1"
    urls:
      - https://huggingface.co/TheDrummer/Rivermind-12B-v1
      - https://huggingface.co/bartowski/TheDrummer_Rivermind-12B-v1-GGUF
    description: |
      Introducing Rivermind™, the next-generation AI that’s redefining human-machine interaction—powered by Amazon Web Services (AWS) for seamless cloud integration and NVIDIA’s latest AI processors for lightning-fast responses.
      But wait, there’s more! Rivermind doesn’t just process data—it feels your emotions (thanks to Google’s TensorFlow for deep emotional analysis). Whether you're brainstorming ideas or just need someone to vent to, Rivermind adapts in real-time, all while keeping your data secure with McAfee’s enterprise-grade encryption.
      And hey, why not grab a refreshing Coca-Cola Zero Sugar while you interact? The crisp, bold taste pairs perfectly with Rivermind’s witty banter—because even AI deserves the best (and so do you).
      Upgrade your thinking today with Rivermind™—the AI that thinks like you, but better, brought to you by the brands you trust. 🚀✨
    overrides:
      parameters:
        model: TheDrummer_Rivermind-12B-v1-Q4_K_M.gguf
    files:
      - filename: TheDrummer_Rivermind-12B-v1-Q4_K_M.gguf
        sha256: 49a5341ea90e7bd03e797162ab23bf0b975dce9faf5d957f7d24bf1d5134c937
        uri: huggingface://bartowski/TheDrummer_Rivermind-12B-v1-GGUF/TheDrummer_Rivermind-12B-v1-Q4_K_M.gguf
  - !!merge <<: *mistral03
    url: "github:mudler/LocalAI/gallery/llama3.1-instruct.yaml@master"
    name: "dreamgen_lucid-v1-nemo"
    icon: https://huggingface.co/dreamgen/lucid-v1-nemo/resolve/main/images/banner.webp
    urls:
      - https://huggingface.co/dreamgen/lucid-v1-nemo
      - https://huggingface.co/bartowski/dreamgen_lucid-v1-nemo-GGUF
    description: |
      Focused on role-play & story-writing.
      Suitable for all kinds of writers and role-play enjoyers:
      For world-builders who want to specify every detail in advance: plot, setting, writing style, characters, locations, items, lore, etc.
      For intuitive writers who start with a loose prompt and shape the narrative through instructions (OCC) as the story / role-play unfolds.
      Support for multi-character role-plays:
      Model can automatically pick between characters.
      Support for inline writing instructions (OOC):
      Controlling plot development (say what should happen, what the characters should do, etc.)
      Controlling pacing.
      etc.
      Support for inline writing assistance:
      Planning the next scene / the next chapter / story.
      Suggesting new characters.
      etc.
      Support for reasoning (opt-in).
    overrides:
      parameters:
        model: dreamgen_lucid-v1-nemo-Q4_K_M.gguf
    files:
      - filename: dreamgen_lucid-v1-nemo-Q4_K_M.gguf
        sha256: b9cbd018895a76805ea8b8d2a499b3221044ce2df2a06ed858b61caba11b81dc
        uri: huggingface://bartowski/dreamgen_lucid-v1-nemo-GGUF/dreamgen_lucid-v1-nemo-Q4_K_M.gguf
  - &mudler
    url: "github:mudler/LocalAI/gallery/mudler.yaml@master" ### START mudler's LocalAI specific-models
    name: "LocalAI-llama3-8b-function-call-v0.2"
@@ -15913,7 +16108,8 @@
    - filename: silero-vad.onnx
      uri: https://huggingface.co/onnx-community/silero-vad/resolve/main/onnx/model.onnx
      sha256: a4a068cd6cf1ea8355b84327595838ca748ec29a25bc91fc82e6c299ccdc5808
- name: "bark-cpp-small"
- &bark
  name: "bark-cpp"
  icon: https://avatars.githubusercontent.com/u/99442120
  url: github:mudler/LocalAI/gallery/virtual.yaml@master
  license: mit
@@ -15927,6 +16123,15 @@
    - cpu
  overrides:
    backend: bark-cpp
    parameters:
      model: bark_weights-f16.bin
  files:
    - filename: bark_weights-f16.bin
      uri: https://huggingface.co/Green-Sky/bark-ggml/resolve/main/bark_weights-f16.bin
      sha256: ba6fc0e09531e6b8b5a9ef8862be2c9a52a631fc93f34a60b26b879cacf18f62
- !!merge <<: *bark
  name: "bark-cpp-small"
  overrides:
    parameters:
      model: bark-small_weights-f16.bin
  files:
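Each gallery entry above pins its download with a sha256 next to the huggingface:// uri. As a rough illustration of what that field is for (this helper is a sketch, not LocalAI code; the function name and file path are hypothetical), the downloaded artifact can be re-hashed and compared against the pinned value:

package main

import (
    "crypto/sha256"
    "encoding/hex"
    "fmt"
    "io"
    "os"
)

// verifyChecksum hashes the file at path and compares it to the expected
// hex-encoded sha256, as a gallery consumer would after downloading the uri.
func verifyChecksum(path, want string) error {
    f, err := os.Open(path)
    if err != nil {
        return err
    }
    defer f.Close()

    h := sha256.New()
    if _, err := io.Copy(h, f); err != nil {
        return err
    }
    if got := hex.EncodeToString(h.Sum(nil)); got != want {
        return fmt.Errorf("checksum mismatch: got %s, want %s", got, want)
    }
    return nil
}

func main() {
    // Example using the pinned hash from the bark-cpp entry above.
    err := verifyChecksum("bark_weights-f16.bin",
        "ba6fc0e09531e6b8b5a9ef8862be2c9a52a631fc93f34a60b26b879cacf18f62")
    fmt.Println(err)
}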
@@ -57,7 +57,11 @@ func (c *Client) HealthCheck(ctx context.Context) (bool, error) {
    }
    c.setBusy(true)
    defer c.setBusy(false)
    conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
    conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()),
        grpc.WithDefaultCallOptions(
            grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB
            grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB
        ))
    if err != nil {
        return false, err
    }
@@ -89,7 +93,11 @@ func (c *Client) Embeddings(ctx context.Context, in *pb.PredictOptions, opts ...
    defer c.setBusy(false)
    c.wdMark()
    defer c.wdUnMark()
    conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
    conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()),
        grpc.WithDefaultCallOptions(
            grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB
            grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB
        ))
    if err != nil {
        return nil, err
    }
@@ -108,7 +116,11 @@ func (c *Client) Predict(ctx context.Context, in *pb.PredictOptions, opts ...grp
    defer c.setBusy(false)
    c.wdMark()
    defer c.wdUnMark()
    conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
    conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()),
        grpc.WithDefaultCallOptions(
            grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB
            grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB
        ))
    if err != nil {
        return nil, err
    }
@@ -127,7 +139,11 @@ func (c *Client) LoadModel(ctx context.Context, in *pb.ModelOptions, opts ...grp
    defer c.setBusy(false)
    c.wdMark()
    defer c.wdUnMark()
    conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
    conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()),
        grpc.WithDefaultCallOptions(
            grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB
            grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB
        ))
    if err != nil {
        return nil, err
    }
@@ -145,7 +161,11 @@ func (c *Client) PredictStream(ctx context.Context, in *pb.PredictOptions, f fun
    defer c.setBusy(false)
    c.wdMark()
    defer c.wdUnMark()
    conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
    conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()),
        grpc.WithDefaultCallOptions(
            grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB
            grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB
        ))
    if err != nil {
        return err
    }
@@ -182,7 +202,11 @@ func (c *Client) GenerateImage(ctx context.Context, in *pb.GenerateImageRequest,
    defer c.setBusy(false)
    c.wdMark()
    defer c.wdUnMark()
    conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
    conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()),
        grpc.WithDefaultCallOptions(
            grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB
            grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB
        ))
    if err != nil {
        return nil, err
    }
@@ -200,7 +224,11 @@ func (c *Client) TTS(ctx context.Context, in *pb.TTSRequest, opts ...grpc.CallOp
    defer c.setBusy(false)
    c.wdMark()
    defer c.wdUnMark()
    conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
    conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()),
        grpc.WithDefaultCallOptions(
            grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB
            grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB
        ))
    if err != nil {
        return nil, err
    }
@@ -218,7 +246,11 @@ func (c *Client) SoundGeneration(ctx context.Context, in *pb.SoundGenerationRequ
    defer c.setBusy(false)
    c.wdMark()
    defer c.wdUnMark()
    conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
    conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()),
        grpc.WithDefaultCallOptions(
            grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB
            grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB
        ))
    if err != nil {
        return nil, err
    }
@@ -236,7 +268,11 @@ func (c *Client) AudioTranscription(ctx context.Context, in *pb.TranscriptReques
    defer c.setBusy(false)
    c.wdMark()
    defer c.wdUnMark()
    conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
    conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()),
        grpc.WithDefaultCallOptions(
            grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB
            grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB
        ))
    if err != nil {
        return nil, err
    }
@@ -254,7 +290,11 @@ func (c *Client) TokenizeString(ctx context.Context, in *pb.PredictOptions, opts
    defer c.setBusy(false)
    c.wdMark()
    defer c.wdUnMark()
    conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
    conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()),
        grpc.WithDefaultCallOptions(
            grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB
            grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB
        ))
    if err != nil {
        return nil, err
    }
@@ -276,7 +316,11 @@ func (c *Client) Status(ctx context.Context) (*pb.StatusResponse, error) {
    }
    c.setBusy(true)
    defer c.setBusy(false)
    conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
    conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()),
        grpc.WithDefaultCallOptions(
            grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB
            grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB
        ))
    if err != nil {
        return nil, err
    }
@@ -294,7 +338,11 @@ func (c *Client) StoresSet(ctx context.Context, in *pb.StoresSetOptions, opts ..
    defer c.setBusy(false)
    c.wdMark()
    defer c.wdUnMark()
    conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
    conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()),
        grpc.WithDefaultCallOptions(
            grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB
            grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB
        ))
    if err != nil {
        return nil, err
    }
@@ -312,7 +360,11 @@ func (c *Client) StoresDelete(ctx context.Context, in *pb.StoresDeleteOptions, o
    defer c.wdUnMark()
    c.setBusy(true)
    defer c.setBusy(false)
    conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
    conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()),
        grpc.WithDefaultCallOptions(
            grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB
            grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB
        ))
    if err != nil {
        return nil, err
    }
@@ -330,7 +382,11 @@ func (c *Client) StoresGet(ctx context.Context, in *pb.StoresGetOptions, opts ..
    defer c.setBusy(false)
    c.wdMark()
    defer c.wdUnMark()
    conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
    conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()),
        grpc.WithDefaultCallOptions(
            grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB
            grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB
        ))
    if err != nil {
        return nil, err
    }
@@ -348,7 +404,11 @@ func (c *Client) StoresFind(ctx context.Context, in *pb.StoresFindOptions, opts
    defer c.setBusy(false)
    c.wdMark()
    defer c.wdUnMark()
    conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
    conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()),
        grpc.WithDefaultCallOptions(
            grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB
            grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB
        ))
    if err != nil {
        return nil, err
    }
@@ -366,7 +426,11 @@ func (c *Client) Rerank(ctx context.Context, in *pb.RerankRequest, opts ...grpc.
    defer c.setBusy(false)
    c.wdMark()
    defer c.wdUnMark()
    conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
    conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()),
        grpc.WithDefaultCallOptions(
            grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB
            grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB
        ))
    if err != nil {
        return nil, err
    }
@@ -384,7 +448,11 @@ func (c *Client) GetTokenMetrics(ctx context.Context, in *pb.MetricsRequest, opt
    defer c.setBusy(false)
    c.wdMark()
    defer c.wdUnMark()
    conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
    conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()),
        grpc.WithDefaultCallOptions(
            grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB
            grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB
        ))
    if err != nil {
        return nil, err
    }
@@ -402,7 +470,11 @@ func (c *Client) VAD(ctx context.Context, in *pb.VADRequest, opts ...grpc.CallOp
    defer c.setBusy(false)
    c.wdMark()
    defer c.wdUnMark()
    conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
    conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()),
        grpc.WithDefaultCallOptions(
            grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB
            grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB
        ))
    if err != nil {
        return nil, err
    }
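Every client method in the hunks above repeats the same dial options. As a side note, here is a minimal sketch of how those options could be collected in one place (the helper name and constant are illustrative, not part of this patch); grpc-go caps received messages at roughly 4MB by default, which large payloads such as images or audio buffers can exceed:

package grpcclient

import (
    "google.golang.org/grpc"
    "google.golang.org/grpc/credentials/insecure"
)

// maxMsgSize mirrors the 50MB limit used throughout the patch above.
const maxMsgSize = 50 * 1024 * 1024

// dialOptions is a hypothetical helper returning the options repeated in
// every client method: insecure transport plus raised per-call send/receive
// message limits.
func dialOptions() []grpc.DialOption {
    return []grpc.DialOption{
        grpc.WithTransportCredentials(insecure.NewCredentials()),
        grpc.WithDefaultCallOptions(
            grpc.MaxCallRecvMsgSize(maxMsgSize),
            grpc.MaxCallSendMsgSize(maxMsgSize),
        ),
    }
}

// Usage (sketch): conn, err := grpc.Dial(c.address, dialOptions()...)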
@@ -244,7 +244,10 @@ func StartServer(address string, model LLM) error {
    if err != nil {
        return err
    }
    s := grpc.NewServer()
    s := grpc.NewServer(
        grpc.MaxRecvMsgSize(50*1024*1024), // 50MB
        grpc.MaxSendMsgSize(50*1024*1024), // 50MB
    )
    pb.RegisterBackendServer(s, &server{llm: model})
    log.Printf("gRPC Server listening at %v", lis.Addr())
    if err := s.Serve(lis); err != nil {
@@ -259,7 +262,10 @@ func RunServer(address string, model LLM) (func() error, error) {
    if err != nil {
        return nil, err
    }
    s := grpc.NewServer()
    s := grpc.NewServer(
        grpc.MaxRecvMsgSize(50*1024*1024), // 50MB
        grpc.MaxSendMsgSize(50*1024*1024), // 50MB
    )
    pb.RegisterBackendServer(s, &server{llm: model})
    log.Printf("gRPC Server listening at %v", lis.Addr())
    if err = s.Serve(lis); err != nil {
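The server-side change complements the client-side one: grpc-go limits inbound messages to about 4MB by default on both ends, so raising only the client's call options would still fail once the server receives a large request. A minimal sketch of the same idea as a standalone helper (the package and function name are hypothetical, not from this patch):

package grpcserver

import "google.golang.org/grpc"

// newServer mirrors the change above: both the receive and send caps are
// raised to 50MB so that large payloads can flow in either direction.
func newServer() *grpc.Server {
    return grpc.NewServer(
        grpc.MaxRecvMsgSize(50*1024*1024), // what the server will accept
        grpc.MaxSendMsgSize(50*1024*1024), // what the server will send
    )
}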