Compare commits


20 Commits

Author SHA1 Message Date
Ettore Di Giacinto
bcf02449b3 ci(dockerhub): push images also to dockerhub (#1542)
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-01-04 08:32:29 +01:00
LocalAI [bot]
d48faf35ab ⬆️ Update ggerganov/llama.cpp (#1544)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-01-04 00:08:03 +01:00
Ettore Di Giacinto
583bd28a5c fix(diffusers): add omegaconf dependency (#1540)
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-01-04 00:06:41 +01:00
LocalAI [bot]
7e1d8c489b ⬆️ Update ggerganov/llama.cpp (#1533)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-01-03 08:43:35 +01:00
LocalAI [bot]
de28867374 ⬆️ Update ggerganov/llama.cpp (#1531)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-01-02 00:28:22 +00:00
Ettore Di Giacinto
a1aa6cb7c2 fix(entrypoint): cd to backend dir before start (#1530)
Certain backends, such as vall-e-x, are not meant to be used as a library, so
we want to start the process in the same folder where the backend and
all of its assets live. Fixes #1394
2024-01-01 22:02:48 +01:00
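The actual change adds `cd $DIR` to the Python backends' `run.sh` wrappers (visible in the diffs further down). As a hedged illustration of the same idea in Go, not LocalAI's actual launcher code, one could set the child process's working directory to the backend folder so relative asset paths resolve; the backend path and script name below are hypothetical.

```go
package main

import (
	"log"
	"os"
	"os/exec"
	"path/filepath"
)

// runBackend launches a backend's run.sh with the backend's own folder as the
// working directory, so relative asset paths (weights, voices, configs) resolve.
func runBackend(backendDir string) error {
	cmd := exec.Command("/bin/sh", filepath.Join(backendDir, "run.sh"))
	cmd.Dir = backendDir // same effect as the `cd $DIR` added to run.sh
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr
	return cmd.Run()
}

func main() {
	// Hypothetical backend location, for illustration only.
	if err := runBackend("./backend/python/vall-e-x"); err != nil {
		log.Fatal(err)
	}
}
```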
Ettore Di Giacinto
85e2767dca feat: add trimsuffix (#1528) 2024-01-01 14:39:42 +01:00
Ettore Di Giacinto
fd48cb6506 deps(llama.cpp): update and sync grpc server (#1527)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-01-01 14:39:31 +01:00
Ettore Di Giacinto
522659eb59 feat(prepare): allow to specify additional files to download (#1526) 2024-01-01 14:39:13 +01:00
Ettore Di Giacinto
f068efe509 docs(phi-2): add example (#1525) 2024-01-01 10:51:47 +01:00
Ettore Di Giacinto
726fe416bb docs: update hot topics
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-01-01 10:41:39 +01:00
Ettore Di Giacinto
66fa4f1767 feat: share models by url (#1522)
* feat: allow passing models via args

* expose it also as an env/arg

* docs: enhancements to build/requirements

* do not display status always

* print download status

* not all messages are debug
2024-01-01 10:31:03 +01:00
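As a hedged, self-contained sketch of the startup flow this commit introduces (it mirrors the `api/api.go` hunk further down, but uses simplified stand-in helpers rather than LocalAI's exact API): each URL passed via the new `--models` flag or `MODELS` environment variable is hashed to a stable filename, and its YAML configuration is fetched only if it is not already present in the models path.

```go
package main

import (
	"crypto/md5"
	"encoding/hex"
	"errors"
	"fmt"
	"io"
	"net/http"
	"os"
	"path/filepath"
)

// md5Name returns a stable on-disk name for a model URL, as the startup code does.
func md5Name(url string) string {
	sum := md5.Sum([]byte(url))
	return hex.EncodeToString(sum[:])
}

// fetchModelConfig downloads the YAML config for a model URL into modelPath,
// skipping the download when the file already exists.
func fetchModelConfig(modelPath, url string) error {
	dst := filepath.Join(modelPath, md5Name(url)) + ".yaml"
	if _, err := os.Stat(dst); !errors.Is(err, os.ErrNotExist) {
		return err // already present, or a stat error other than "not found"
	}
	resp, err := http.Get(url)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	out, err := os.Create(dst)
	if err != nil {
		return err
	}
	defer out.Close()
	_, err = io.Copy(out, resp.Body)
	return err
}

func main() {
	for _, u := range os.Args[1:] { // e.g. config URLs passed on the command line
		if err := fetchModelConfig("./models", u); err != nil {
			fmt.Fprintln(os.Stderr, "error loading model:", err)
		}
	}
}
```

In the real implementation the download goes through `utils.DownloadFile` with progress reporting, as shown in the diff below.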
Ettore Di Giacinto
d6565f3b99 Update _index.en.md
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2023-12-31 10:58:22 +01:00
LocalAI [bot]
27686ff20b ⬆️ Update ggerganov/llama.cpp (#1518)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2023-12-31 00:19:08 +00:00
LocalAI [bot]
a8b865022f ⬆️ Update docs version mudler/LocalAI (#1517)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2023-12-30 23:50:24 +00:00
Ettore Di Giacinto
c1888a8062 feat(preload): prepare models in galleries (#1515)
Previously, when applying models from the gallery API, we didn't actually
allow remote URLs as models, because nothing was downloading the
models referenced in the configuration file. Now we call Preload after
all the models are loaded in memory.
2023-12-30 18:55:18 +01:00
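The `ConfigLoader.Preload` hunk further down downloads any `download_files` entries (verifying their SHA256) and resolves remote model URIs before fetching them. As one concrete piece of that, here is a hedged Go sketch of how a `huggingface://` URI can be turned into a direct download URL, in the spirit of the `ConvertURL` helper near the end of this diff; the names are simplified stand-ins, and defaulting the branch to `main` is an assumption taken from the surrounding code.

```go
package main

import (
	"fmt"
	"strings"
)

// convertHuggingFaceURL resolves a huggingface:// URI to a direct download URL,
// e.g. huggingface://TheBloke/phi-2-GGUF/phi-2.Q8_0.gguf ->
//      https://huggingface.co/TheBloke/phi-2-GGUF/resolve/main/phi-2.Q8_0.gguf
func convertHuggingFaceURL(s string) string {
	if !strings.HasPrefix(s, "huggingface://") {
		return s
	}
	repo := strings.TrimPrefix(s, "huggingface://")
	branch := "main" // assumed default when no @branch suffix is given
	if parts := strings.SplitN(repo, "@", 2); len(parts) == 2 {
		repo, branch = parts[0], parts[1]
	}
	segs := strings.SplitN(repo, "/", 3) // owner / repository / path-to-file
	if len(segs) < 3 {
		return s
	}
	return fmt.Sprintf("https://huggingface.co/%s/%s/resolve/%s/%s", segs[0], segs[1], branch, segs[2])
}

func main() {
	fmt.Println(convertHuggingFaceURL("huggingface://TheBloke/phi-2-GGUF/phi-2.Q8_0.gguf"))
}
```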
Ettore Di Giacinto
a95bb0521d fix(download): correctly check for not found error (#1514) 2023-12-30 15:36:46 +01:00
Chris Natale
e2311a145c Fix: Set proper Homebrew install location for x86 Macs (#1510)
* set proper Homebrew install location for x86 Macs

* fix: remove prior conditional that my logic replaces
2023-12-30 12:37:26 +01:00
lunamidori5
d4e0bab6be Update version.json (2.3.0) (#1511)
Update version.json

Signed-off-by: lunamidori5 <118759930+lunamidori5@users.noreply.github.com>
2023-12-30 10:19:46 +01:00
LocalAI [bot]
5b0dc20e4c ⬆️ Update ggerganov/llama.cpp (#1509)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2023-12-30 09:19:07 +00:00
25 changed files with 407 additions and 139 deletions

View File

@@ -27,8 +27,10 @@ jobs:
platforms: ${{ matrix.platforms }}
runs-on: ${{ matrix.runs-on }}
secrets:
dockerUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
dockerPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
strategy:
# Pushing with all jobs in parallel
# eats the bandwidth of all the nodes
@@ -107,8 +109,10 @@ jobs:
platforms: ${{ matrix.platforms }}
runs-on: ${{ matrix.runs-on }}
secrets:
dockerUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
dockerPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
strategy:
matrix:
include:

View File

@@ -46,6 +46,10 @@ on:
required: true
dockerPassword:
required: true
quayUsername:
required: true
quayPassword:
required: true
jobs:
reusable_image-build:
runs-on: ${{ inputs.runs-on }}
@@ -100,7 +104,9 @@ jobs:
id: meta
uses: docker/metadata-action@v5
with:
images: quay.io/go-skynet/local-ai
images: |
quay.io/go-skynet/local-ai
localai/localai
tags: |
type=ref,event=branch
type=semver,pattern={{raw}}
@@ -122,10 +128,17 @@ jobs:
if: github.event_name != 'pull_request'
uses: docker/login-action@v3
with:
registry: quay.io
username: ${{ secrets.dockerUsername }}
password: ${{ secrets.dockerPassword }}
- name: Login to DockerHub
if: github.event_name != 'pull_request'
uses: docker/login-action@v3
with:
registry: quay.io
username: ${{ secrets.quayUsername }}
password: ${{ secrets.quayPassword }}
- name: Build and push
uses: docker/build-push-action@v5
with:

View File

@@ -8,7 +8,7 @@ GOLLAMA_VERSION?=aeba71ee842819da681ea537e78846dc75949ac0
GOLLAMA_STABLE_VERSION?=50cee7712066d9e38306eccadcfbb44ea87df4b7
CPPLLAMA_VERSION?=65e5f6dadbba4b496bba27f573e473c66b446496
CPPLLAMA_VERSION?=cb1e2818e0e12ec99f7236ec5d4f3ffd8bcc2f4a
# gpt4all version
GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all

View File

@@ -40,6 +40,7 @@
[Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
- Start and share models with config file: https://github.com/mudler/LocalAI/pull/1522
- 🐸 Coqui: https://github.com/mudler/LocalAI/pull/1489
- Inline templates: https://github.com/mudler/LocalAI/pull/1452
- Mixtral: https://github.com/mudler/LocalAI/pull/1449

View File

@@ -5,6 +5,7 @@ import (
"errors"
"fmt"
"os"
"path/filepath"
"strings"
config "github.com/go-skynet/LocalAI/api/config"
@@ -16,6 +17,7 @@ import (
"github.com/go-skynet/LocalAI/metrics"
"github.com/go-skynet/LocalAI/pkg/assets"
"github.com/go-skynet/LocalAI/pkg/model"
"github.com/go-skynet/LocalAI/pkg/utils"
"github.com/gofiber/fiber/v2"
"github.com/gofiber/fiber/v2/middleware/cors"
@@ -36,6 +38,26 @@ func Startup(opts ...options.AppOption) (*options.Option, *config.ConfigLoader,
log.Info().Msgf("Starting LocalAI using %d threads, with models path: %s", options.Threads, options.Loader.ModelPath)
log.Info().Msgf("LocalAI version: %s", internal.PrintableVersion())
modelPath := options.Loader.ModelPath
if len(options.ModelsURL) > 0 {
for _, url := range options.ModelsURL {
if utils.LooksLikeURL(url) {
// md5 of model name
md5Name := utils.MD5(url)
// check if file exists
if _, err := os.Stat(filepath.Join(modelPath, md5Name)); errors.Is(err, os.ErrNotExist) {
err := utils.DownloadFile(url, filepath.Join(modelPath, md5Name)+".yaml", "", func(fileName, current, total string, percent float64) {
utils.DisplayDownloadFunction(fileName, current, total, percent)
})
if err != nil {
log.Error().Msgf("error loading model: %s", err.Error())
}
}
}
}
}
cl := config.NewConfigLoader()
if err := cl.LoadConfigs(options.Loader.ModelPath); err != nil {
log.Error().Msgf("error loading config files: %s", err.Error())
@@ -51,6 +73,18 @@ func Startup(opts ...options.AppOption) (*options.Option, *config.ConfigLoader,
log.Error().Msgf("error downloading models: %s", err.Error())
}
if options.PreloadJSONModels != "" {
if err := localai.ApplyGalleryFromString(options.Loader.ModelPath, options.PreloadJSONModels, cl, options.Galleries); err != nil {
return nil, nil, err
}
}
if options.PreloadModelsFromPath != "" {
if err := localai.ApplyGalleryFromFile(options.Loader.ModelPath, options.PreloadModelsFromPath, cl, options.Galleries); err != nil {
return nil, nil, err
}
}
if options.Debug {
for _, v := range cl.ListConfigs() {
cfg, _ := cl.GetConfig(v)
@@ -67,18 +101,6 @@ func Startup(opts ...options.AppOption) (*options.Option, *config.ConfigLoader,
}
}
if options.PreloadJSONModels != "" {
if err := localai.ApplyGalleryFromString(options.Loader.ModelPath, options.PreloadJSONModels, cl, options.Galleries); err != nil {
return nil, nil, err
}
}
if options.PreloadModelsFromPath != "" {
if err := localai.ApplyGalleryFromFile(options.Loader.ModelPath, options.PreloadModelsFromPath, cl, options.Galleries); err != nil {
return nil, nil, err
}
}
// turn off any process that was started by GRPC if the context is canceled
go func() {
<-options.Context.Done()

View File

@@ -159,6 +159,9 @@ func Finetune(config config.Config, input, prediction string) string {
for _, c := range config.TrimSpace {
prediction = strings.TrimSpace(strings.TrimPrefix(prediction, c))
}
return prediction
for _, c := range config.TrimSuffix {
prediction = strings.TrimSpace(strings.TrimSuffix(prediction, c))
}
return prediction
}

View File

@@ -1,6 +1,7 @@
package api_config
import (
"errors"
"fmt"
"io/fs"
"os"
@@ -51,6 +52,14 @@ type Config struct {
// CUDA
// Explicitly enable CUDA or not (some backends might need it)
CUDA bool `yaml:"cuda"`
DownloadFiles []File `yaml:"download_files"`
}
type File struct {
Filename string `yaml:"filename" json:"filename"`
SHA256 string `yaml:"sha256" json:"sha256"`
URI string `yaml:"uri" json:"uri"`
}
type VallE struct {
@@ -102,16 +111,18 @@ type LLMConfig struct {
StopWords []string `yaml:"stopwords"`
Cutstrings []string `yaml:"cutstrings"`
TrimSpace []string `yaml:"trimspace"`
ContextSize int `yaml:"context_size"`
NUMA bool `yaml:"numa"`
LoraAdapter string `yaml:"lora_adapter"`
LoraBase string `yaml:"lora_base"`
LoraScale float32 `yaml:"lora_scale"`
NoMulMatQ bool `yaml:"no_mulmatq"`
DraftModel string `yaml:"draft_model"`
NDraft int32 `yaml:"n_draft"`
Quantization string `yaml:"quantization"`
MMProj string `yaml:"mmproj"`
TrimSuffix []string `yaml:"trimsuffix"`
ContextSize int `yaml:"context_size"`
NUMA bool `yaml:"numa"`
LoraAdapter string `yaml:"lora_adapter"`
LoraBase string `yaml:"lora_base"`
LoraScale float32 `yaml:"lora_scale"`
NoMulMatQ bool `yaml:"no_mulmatq"`
DraftModel string `yaml:"draft_model"`
NDraft int32 `yaml:"n_draft"`
Quantization string `yaml:"quantization"`
MMProj string `yaml:"mmproj"`
RopeScaling string `yaml:"rope_scaling"`
YarnExtFactor float32 `yaml:"yarn_ext_factor"`
@@ -266,22 +277,44 @@ func (cm *ConfigLoader) ListConfigs() []string {
return res
}
// Preload prepares models that are not local, i.e. referenced by URL or Hugging Face repository
func (cm *ConfigLoader) Preload(modelPath string) error {
cm.Lock()
defer cm.Unlock()
status := func(fileName, current, total string, percent float64) {
utils.DisplayDownloadFunction(fileName, current, total, percent)
}
log.Info().Msgf("Preloading models from %s", modelPath)
for i, config := range cm.configs {
// Download files and verify their SHA
for _, file := range config.DownloadFiles {
log.Debug().Msgf("Checking %q exists and matches SHA", file.Filename)
if err := utils.VerifyPath(file.Filename, modelPath); err != nil {
return err
}
// Create file path
filePath := filepath.Join(modelPath, file.Filename)
if err := utils.DownloadFile(file.URI, filePath, file.SHA256, status); err != nil {
return err
}
}
modelURL := config.PredictionOptions.Model
modelURL = utils.ConvertURL(modelURL)
if strings.HasPrefix(modelURL, "http://") || strings.HasPrefix(modelURL, "https://") {
if utils.LooksLikeURL(modelURL) {
// md5 of model name
md5Name := utils.MD5(modelURL)
// check if file exists
if _, err := os.Stat(filepath.Join(modelPath, md5Name)); err == os.ErrNotExist {
err := utils.DownloadFile(modelURL, filepath.Join(modelPath, md5Name), "", func(fileName, current, total string, percent float64) {
log.Info().Msgf("Downloading %s: %s/%s (%.2f%%)", fileName, current, total, percent)
})
if _, err := os.Stat(filepath.Join(modelPath, md5Name)); errors.Is(err, os.ErrNotExist) {
err := utils.DownloadFile(modelURL, filepath.Join(modelPath, md5Name), "", status)
if err != nil {
return err
}

View File

@@ -130,6 +130,12 @@ func (g *galleryApplier) Start(c context.Context, cm *config.ConfigLoader) {
continue
}
err = cm.Preload(g.modelPath)
if err != nil {
updateError(err)
continue
}
g.updateStatus(op.id, &galleryOpStatus{Processed: true, Message: "completed", Progress: 100})
}
}

View File

@@ -40,9 +40,12 @@ type Option struct {
SingleBackend bool
ParallelBackendRequests bool
WatchDogIdle bool
WatchDogBusy bool
WatchDog bool
WatchDogIdle bool
WatchDogBusy bool
WatchDog bool
ModelsURL []string
WatchDogBusyTimeout, WatchDogIdleTimeout time.Duration
}
@@ -63,6 +66,12 @@ func NewOptions(o ...AppOption) *Option {
return opt
}
func WithModelsURL(urls ...string) AppOption {
return func(o *Option) {
o.ModelsURL = urls
}
}
func WithCors(b bool) AppOption {
return func(o *Option) {
o.CORS = b

View File

@@ -17,9 +17,17 @@ cmake_minimum_required(VERSION 3.15)
set(TARGET grpc-server)
set(_PROTOBUF_LIBPROTOBUF libprotobuf)
set(_REFLECTION grpc++_reflection)
if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
link_directories("/opt/homebrew/lib")
include_directories("/opt/homebrew/include")
# Set correct Homebrew install folder for Apple Silicon and Intel Macs
if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "arm64")
set(HOMEBREW_DEFAULT_PREFIX "/opt/homebrew")
else()
set(HOMEBREW_DEFAULT_PREFIX "/usr/local")
endif()
link_directories("${HOMEBREW_DEFAULT_PREFIX}/lib")
include_directories("${HOMEBREW_DEFAULT_PREFIX}/include")
endif()
find_package(absl CONFIG REQUIRED)

View File

@@ -26,6 +26,7 @@
#include <mutex>
#include <chrono>
#include <regex>
#include <condition_variable>
#include <grpcpp/ext/proto_server_reflection_plugin.h>
#include <grpcpp/grpcpp.h>
#include <grpcpp/health_check_service_interface.h>
@@ -40,12 +41,15 @@ using backend::HealthMessage;
///// LLAMA.CPP server code below
#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
using json = nlohmann::json;
struct server_params
{
std::string hostname = "127.0.0.1";
std::string api_key;
std::string public_path = "examples/server/public";
int32_t port = 8080;
int32_t read_timeout = 600;
@@ -89,7 +93,7 @@ static inline bool is_base64(uint8_t c)
return (isalnum(c) || (c == '+') || (c == '/'));
}
static std::vector<uint8_t> base64_decode(std::string const &encoded_string)
static std::vector<uint8_t> base64_decode(const std::string & encoded_string)
{
int i = 0;
int j = 0;
@@ -216,10 +220,10 @@ struct slot_image
int32_t id;
bool request_encode_image = false;
float* image_embedding = nullptr;
float * image_embedding = nullptr;
int32_t image_tokens = 0;
clip_image_u8 img_data;
clip_image_u8 * img_data;
std::string prefix_prompt; // before of this image
};
@@ -441,15 +445,16 @@ struct llama_client_slot
generated_token_probs.clear();
for (slot_image &img : images)
for (slot_image & img : images)
{
free(img.image_embedding);
delete[] img.img_data.data;
if (img.img_data) {
clip_image_u8_free(img.img_data);
}
img.prefix_prompt = "";
}
images.clear();
// llama_set_rng_seed(ctx, params.seed); in batched the seed matter???????
}
bool has_budget(gpt_params &global_params) {
@@ -550,7 +555,9 @@ struct llama_server_context
std::vector<task_result> queue_results;
std::vector<task_multi> queue_multitasks;
std::mutex mutex_tasks; // also guards id_gen, and queue_multitasks
std::condition_variable condition_tasks;
std::mutex mutex_results;
std::condition_variable condition_results;
~llama_server_context()
{
@@ -769,6 +776,42 @@ struct llama_server_context
slot->prompt = "";
}
slot->sparams.penalty_prompt_tokens.clear();
slot->sparams.use_penalty_prompt_tokens = false;
const auto &penalty_prompt = data.find("penalty_prompt");
if (penalty_prompt != data.end())
{
if (penalty_prompt->is_string())
{
const auto penalty_prompt_string = penalty_prompt->get<std::string>();
auto penalty_tokens = llama_tokenize(model, penalty_prompt_string, false);
slot->sparams.penalty_prompt_tokens.swap(penalty_tokens);
if (slot->params.n_predict > 0)
{
slot->sparams.penalty_prompt_tokens.reserve(slot->sparams.penalty_prompt_tokens.size() + slot->params.n_predict);
}
slot->sparams.use_penalty_prompt_tokens = true;
}
else if (penalty_prompt->is_array())
{
const auto n_tokens = penalty_prompt->size();
slot->sparams.penalty_prompt_tokens.reserve(n_tokens + std::max(0, slot->params.n_predict));
const int n_vocab = llama_n_vocab(model);
for (const auto &penalty_token : *penalty_prompt)
{
if (penalty_token.is_number_integer())
{
const auto tok = penalty_token.get<llama_token>();
if (tok >= 0 && tok < n_vocab)
{
slot->sparams.penalty_prompt_tokens.push_back(tok);
}
}
}
slot->sparams.use_penalty_prompt_tokens = true;
}
}
slot->sparams.logit_bias.clear();
if (json_value(data, "ignore_eos", false))
@@ -821,24 +864,17 @@ struct llama_server_context
{
for (const auto &img : *images_data)
{
std::string data_b64 = img["data"].get<std::string>();
const std::vector<uint8_t> image_buffer = base64_decode(img["data"].get<std::string>());
slot_image img_sl;
img_sl.id = img.count("id") != 0 ? img["id"].get<int>() : slot->images.size();
int width, height, channels;
std::vector<uint8_t> image_buffer = base64_decode(data_b64);
data_b64.clear();
auto data = stbi_load_from_memory(image_buffer.data(), image_buffer.size(), &width, &height, &channels, 3);
if (!data) {
img_sl.img_data = clip_image_u8_init();
if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
{
LOG_TEE("slot %i - failed to load image [id: %i]\n", slot->id, img_sl.id);
return false;
}
LOG_TEE("slot %i - image loaded [id: %i] resolution (%i x %i)\n", slot->id, img_sl.id, width, height);
img_sl.img_data.nx = width;
img_sl.img_data.ny = height;
img_sl.img_data.size = width * height * 3;
img_sl.img_data.data = new uint8_t[width * height * 3]();
memcpy(img_sl.img_data.data, data, width * height * 3);
stbi_image_free(data);
LOG_TEE("slot %i - loaded image\n", slot->id);
img_sl.request_encode_image = true;
slot->images.push_back(img_sl);
}
@@ -893,6 +929,7 @@ struct llama_server_context
llama_sampling_free(slot->ctx_sampling);
}
slot->ctx_sampling = llama_sampling_init(slot->sparams);
llama_set_rng_seed(ctx, slot->params.seed);
slot->command = LOAD_PROMPT;
all_slots_are_idle = false;
@@ -1000,6 +1037,12 @@ struct llama_server_context
slot.generated_text += token_str;
slot.has_next_token = true;
if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1)
{
// we can change penalty_prompt_tokens because it is always created from scratch each request
slot.ctx_sampling->params.penalty_prompt_tokens.push_back(result.tok);
}
// check if there is incomplete UTF-8 character at the end
bool incomplete = false;
for (unsigned i = 1; i < 5 && i <= slot.generated_text.size(); ++i)
@@ -1106,8 +1149,8 @@ struct llama_server_context
{
continue;
}
clip_image_f32 img_res;
if (!clip_image_preprocess(clp_ctx, &img.img_data, &img_res, /*pad2square =*/ true))
clip_image_f32 * img_res = clip_image_f32_init();
if (!clip_image_preprocess(clp_ctx, img.img_data, img_res, /*pad2square =*/ true))
{
LOG_TEE("Error processing the given image");
clip_free(clp_ctx);
@@ -1122,11 +1165,12 @@ struct llama_server_context
return false;
}
LOG_TEE("slot %i - encoding image [id: %i]\n", slot.id, img.id);
if (!clip_image_encode(clp_ctx, params.n_threads, &img_res, img.image_embedding))
if (!clip_image_encode(clp_ctx, params.n_threads, img_res, img.image_embedding))
{
LOG_TEE("Unable to encode image\n");
return false;
}
clip_image_f32_free(img_res);
img.request_encode_image = false;
}
@@ -1135,7 +1179,7 @@ struct llama_server_context
void send_error(task_server& task, std::string error)
{
std::lock_guard<std::mutex> lock(mutex_results);
std::unique_lock<std::mutex> lock(mutex_results);
task_result res;
res.id = task.id;
res.multitask_id = task.multitask_id;
@@ -1143,6 +1187,7 @@ struct llama_server_context
res.error = true;
res.result_json = { { "content", error } };
queue_results.push_back(res);
condition_results.notify_all();
}
void add_multi_task(int id, std::vector<int>& sub_ids)
@@ -1152,6 +1197,7 @@ struct llama_server_context
multi.id = id;
std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end()));
queue_multitasks.push_back(multi);
condition_tasks.notify_one();
}
void update_multi_task(int multitask_id, int subtask_id, task_result& result)
@@ -1163,6 +1209,7 @@ struct llama_server_context
{
multitask.subtasks_remaining.erase(subtask_id);
multitask.results.push_back(result);
condition_tasks.notify_one();
}
}
}
@@ -1181,7 +1228,7 @@ struct llama_server_context
{"n_ctx", slot.n_ctx},
{"model", params.model_alias},
{"seed", slot.params.seed},
{"temp", slot.sparams.temp},
{"temperature", slot.sparams.temp},
{"top_k", slot.sparams.top_k},
{"top_p", slot.sparams.top_p},
{"min_p", slot.sparams.min_p},
@@ -1191,6 +1238,8 @@ struct llama_server_context
{"repeat_penalty", slot.sparams.penalty_repeat},
{"presence_penalty", slot.sparams.penalty_present},
{"frequency_penalty", slot.sparams.penalty_freq},
{"penalty_prompt_tokens", slot.sparams.penalty_prompt_tokens},
{"use_penalty_prompt_tokens", slot.sparams.use_penalty_prompt_tokens},
{"mirostat", slot.sparams.mirostat},
{"mirostat_tau", slot.sparams.mirostat_tau},
{"mirostat_eta", slot.sparams.mirostat_eta},
@@ -1208,7 +1257,7 @@ struct llama_server_context
void send_partial_response(llama_client_slot &slot, completion_token_output tkn)
{
std::lock_guard<std::mutex> lock(mutex_results);
std::unique_lock<std::mutex> lock(mutex_results);
task_result res;
res.id = slot.task_id;
res.multitask_id = slot.multitask_id;
@@ -1244,11 +1293,12 @@ struct llama_server_context
}
queue_results.push_back(res);
condition_results.notify_all();
}
void send_final_response(llama_client_slot &slot)
{
std::lock_guard<std::mutex> lock(mutex_results);
std::unique_lock<std::mutex> lock(mutex_results);
task_result res;
res.id = slot.task_id;
res.multitask_id = slot.multitask_id;
@@ -1304,11 +1354,12 @@ struct llama_server_context
}
queue_results.push_back(res);
condition_results.notify_all();
}
void send_embedding(llama_client_slot &slot)
{
std::lock_guard<std::mutex> lock(mutex_results);
std::unique_lock<std::mutex> lock(mutex_results);
task_result res;
res.id = slot.task_id;
res.multitask_id = slot.multitask_id;
@@ -1336,6 +1387,7 @@ struct llama_server_context
};
}
queue_results.push_back(res);
condition_results.notify_all();
}
int request_completion(json data, bool infill, bool embedding, int multitask_id)
@@ -1359,6 +1411,7 @@ struct llama_server_context
// otherwise, it's a single-prompt task, we actually queue it
queue_tasks.push_back(task);
condition_tasks.notify_one();
return task.id;
}
@@ -1366,13 +1419,10 @@ struct llama_server_context
{
while (true)
{
std::this_thread::sleep_for(std::chrono::microseconds(5));
std::lock_guard<std::mutex> lock(mutex_results);
if (queue_results.empty())
{
continue;
}
std::unique_lock<std::mutex> lock(mutex_results);
condition_results.wait(lock, [&]{
return !queue_results.empty();
});
for (int i = 0; i < (int) queue_results.size(); i++)
{
@@ -1468,12 +1518,13 @@ struct llama_server_context
void request_cancel(int task_id)
{
std::lock_guard<std::mutex> lock(mutex_tasks);
std::unique_lock<std::mutex> lock(mutex_tasks);
task_server task;
task.id = id_gen++;
task.type = CANCEL_TASK;
task.target_id = task_id;
queue_tasks.push_back(task);
condition_tasks.notify_one();
}
int split_multiprompt_task(task_server& multiprompt_task)
@@ -1499,7 +1550,7 @@ struct llama_server_context
void process_tasks()
{
std::lock_guard<std::mutex> lock(mutex_tasks);
std::unique_lock<std::mutex> lock(mutex_tasks);
while (!queue_tasks.empty())
{
task_server task = queue_tasks.front();
@@ -1571,6 +1622,7 @@ struct llama_server_context
std::lock_guard<std::mutex> lock(mutex_results);
queue_results.push_back(aggregate_result);
condition_results.notify_all();
queue_iterator = queue_multitasks.erase(queue_iterator);
}
@@ -1601,8 +1653,10 @@ struct llama_server_context
LOG_TEE("all slots are idle and system prompt is empty, clear the KV cache\n");
kv_cache_clear();
}
// avoid 100% usage of cpu all time
std::this_thread::sleep_for(std::chrono::milliseconds(5));
std::unique_lock<std::mutex> lock(mutex_tasks);
condition_tasks.wait(lock, [&]{
return !queue_tasks.empty();
});
}
for (llama_client_slot &slot : slots)
@@ -1962,28 +2016,35 @@ json oaicompat_completion_params_parse(
llama_params["__oaicompat"] = true;
// Map OpenAI parameters to llama.cpp parameters
//
// For parameters that are defined by the OpenAI documentation (e.g.
// temperature), we explicitly specify OpenAI's intended default; we
// need to do that because sometimes OpenAI disagrees with llama.cpp
//
// https://platform.openai.com/docs/api-reference/chat/create
llama_sampling_params default_sparams;
llama_params["model"] = json_value(body, "model", std::string("uknown"));
llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt'
llama_params["cache_prompt"] = json_value(body, "cache_prompt", false);
llama_params["temperature"] = json_value(body, "temperature", 0.8);
llama_params["top_k"] = json_value(body, "top_k", 40);
llama_params["top_p"] = json_value(body, "top_p", 0.95);
llama_params["temperature"] = json_value(body, "temperature", 0.0);
llama_params["top_k"] = json_value(body, "top_k", default_sparams.top_k);
llama_params["top_p"] = json_value(body, "top_p", 1.0);
llama_params["n_predict"] = json_value(body, "max_tokens", -1);
llama_params["logit_bias"] = json_value(body, "logit_bias",json::object());
llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0);
llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0);
llama_params["seed"] = json_value(body, "seed", 0);
llama_params["seed"] = json_value(body, "seed", LLAMA_DEFAULT_SEED);
llama_params["stream"] = json_value(body, "stream", false);
llama_params["mirostat"] = json_value(body, "mirostat", false);
llama_params["mirostat_tau"] = json_value(body, "mirostat_tau", 0.0);
llama_params["mirostat_eta"] = json_value(body, "mirostat_eta", 0.0);
llama_params["penalize_nl"] = json_value(body, "penalize_nl", false);
llama_params["typical_p"] = json_value(body, "typical_p", 0.0);
llama_params["repeat_last_n"] = json_value(body, "repeat_last_n", 0);
llama_params["mirostat"] = json_value(body, "mirostat", default_sparams.mirostat);
llama_params["mirostat_tau"] = json_value(body, "mirostat_tau", default_sparams.mirostat_tau);
llama_params["mirostat_eta"] = json_value(body, "mirostat_eta", default_sparams.mirostat_eta);
llama_params["penalize_nl"] = json_value(body, "penalize_nl", default_sparams.penalize_nl);
llama_params["typical_p"] = json_value(body, "typical_p", default_sparams.typical_p);
llama_params["repeat_last_n"] = json_value(body, "repeat_last_n", default_sparams.penalty_last_n);
llama_params["ignore_eos"] = json_value(body, "ignore_eos", false);
llama_params["tfs_z"] = json_value(body, "tfs_z", 0.0);
llama_params["tfs_z"] = json_value(body, "tfs_z", default_sparams.tfs_z);
if (llama_params.count("grammar") != 0) {
if (body.count("grammar") != 0) {
llama_params["grammar"] = json_value(body, "grammar", json::object());
}

View File

@@ -53,6 +53,7 @@ dependencies:
- nvidia-nccl-cu12==2.18.1
- nvidia-nvjitlink-cu12==12.2.140
- nvidia-nvtx-cu12==12.1.105
- omegaconf
- packaging==23.2
- pillow==10.0.1
- protobuf==4.24.4

View File

@@ -11,4 +11,6 @@ source activate exllama
# get the directory where the bash script is located
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
cd $DIR
python $DIR/exllama.py $@

View File

@@ -11,4 +11,6 @@ source activate exllama2
# get the directory where the bash script is located
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
cd $DIR
python $DIR/exllama2_backend.py $@

View File

@@ -10,4 +10,6 @@ source activate transformers
# get the directory where the bash script is located
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
cd $DIR
python $DIR/ttsvalle.py $@

View File

@@ -36,10 +36,10 @@ In a nutshell:
- Local, OpenAI drop-in alternative REST API. You own your data.
- NO GPU required. NO Internet access is required either
- Optional, GPU Acceleration is available in `llama.cpp`-compatible LLMs. See also the [build section](https://localai.io/basics/build/index.html).
- Optional, GPU Acceleration is available. See also the [build section](https://localai.io/basics/build/index.html).
- Supports multiple models
- 🏃 Once loaded the first time, it keeps models loaded in memory for faster inference
- ⚡ Doesn't shell-out, but uses C++ bindings for a faster inference and better performance.
- ⚡ Doesn't shell-out, but uses bindings for a faster inference and better performance.
LocalAI is focused on making the AI accessible to anyone. Any contribution, feedback and PR is welcome!

View File

@@ -359,15 +359,7 @@ docker run --env REBUILD=true localai
docker run --env-file .env localai
```
### Build only a single backend
You can control the backends that are built by setting the `GRPC_BACKENDS` environment variable. For instance, to build only the `llama-cpp` backend:
```bash
make GRPC_BACKENDS=backend-assets/grpc/llama-cpp build
```
By default, all the backends are built.
### Extra backends

View File

@@ -7,16 +7,15 @@ url = '/basics/build/'
+++
### Build locally
### Build
#### Container image
Requirements:
Either Docker/podman, or
- Golang >= 1.21
- Cmake/make
- GCC
- Docker or podman, or a container engine
In order to build the `LocalAI` container image locally you can use `docker`:
In order to build the `LocalAI` container image locally you can use `docker`, for example:
```
# build the image
@@ -24,7 +23,45 @@ docker build -t localai .
docker run localai
```
Or you can build the binary manually with `make`:
#### Locally
In order to build LocalAI locally, you need the following requirements:
- Golang >= 1.21
- Cmake/make
- GCC
- GRPC
To install the dependencies follow the instructions below:
{{< tabs >}}
{{% tab name="Apple" %}}
```bash
brew install abseil cmake go grpc protobuf wget
```
{{% /tab %}}
{{% tab name="Debian" %}}
```bash
apt install protobuf-compiler-grpc libgrpc-dev make cmake
```
{{% /tab %}}
{{% tab name="From source" %}}
Specify `BUILD_GRPC_FOR_BACKEND_LLAMA=true` to automatically build the gRPC dependencies
```bash
make ... BUILD_GRPC_FOR_BACKEND_LLAMA=true build
```
{{% /tab %}}
{{< /tabs >}}
To build LocalAI with `make`:
```
git clone https://github.com/go-skynet/LocalAI
@@ -32,7 +69,7 @@ cd LocalAI
make build
```
To run: `./local-ai`
This should produce the binary `local-ai`
{{% notice note %}}
@@ -54,7 +91,7 @@ docker run --rm -ti -p 8080:8080 -e DEBUG=true -e MODELS_PATH=/models -e THREADS
{{% /notice %}}
### Build on mac
### Example: Build on mac
Building on Mac (M1 or M2) works, but you may need to install some prerequisites using `brew`.
@@ -188,6 +225,16 @@ make BUILD_TYPE=metal build
# Note: only models quantized with q4_0 are supported!
```
### Build only a single backend
You can control the backends that are built by setting the `GRPC_BACKENDS` environment variable. For instance, to build only the `llama-cpp` backend:
```bash
make GRPC_BACKENDS=backend-assets/grpc/llama-cpp build
```
By default, all the backends are built.
### Windows compatibility
Make sure to give enough resources to the running container. See https://github.com/go-skynet/LocalAI/issues/2

View File

@@ -1,3 +1,3 @@
{
"version": "v2.2.0"
"version": "v2.3.1"
}

View File

@@ -67,6 +67,17 @@ curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/jso
```
### Phi-2
```
cp -r examples/configurations/phi-2.yaml models/
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "phi-2",
"messages": [{"role": "user", "content": "How are you doing?", "temperature": 0.1}]
}'
```
### Mixtral
```

View File

@@ -0,0 +1,17 @@
name: phi-2
context_size: 2048
f16: true
gpu_layers: 90
mmap: true
trimsuffix:
- "\n"
parameters:
model: huggingface://TheBloke/phi-2-GGUF/phi-2.Q8_0.gguf
temperature: 0.2
top_k: 40
top_p: 0.95
template:
chat: &template |
Instruct: {{.Input}}
Output:
completion: *template

View File

@@ -99,6 +99,11 @@ func main() {
Usage: "A List of models to apply in JSON at start",
EnvVars: []string{"PRELOAD_MODELS"},
},
&cli.StringFlag{
Name: "models",
Usage: "A List of models URLs configurations.",
EnvVars: []string{"MODELS"},
},
&cli.StringFlag{
Name: "preload-models-config",
Usage: "A List of models to apply at startup. Path to a YAML config file",
@@ -222,6 +227,7 @@ For a list of compatible model, check out: https://localai.io/model-compatibilit
options.WithBackendAssetsOutput(ctx.String("backend-assets-path")),
options.WithUploadLimitMB(ctx.Int("upload-limit")),
options.WithApiKeys(ctx.StringSlice("api-keys")),
options.WithModelsURL(append(ctx.StringSlice("models"), ctx.Args().Slice()...)...),
}
idleWatchDog := ctx.Bool("enable-watchdog-idle")

View File

@@ -239,10 +239,10 @@ func (ml *ModelLoader) GreedyLoader(opts ...Option) (*grpc.Client, error) {
for _, b := range o.externalBackends {
allBackendsToAutoLoad = append(allBackendsToAutoLoad, b)
}
log.Debug().Msgf("Loading model '%s' greedly from all the available backends: %s", o.model, strings.Join(allBackendsToAutoLoad, ", "))
log.Info().Msgf("Loading model '%s' greedly from all the available backends: %s", o.model, strings.Join(allBackendsToAutoLoad, ", "))
for _, b := range allBackendsToAutoLoad {
log.Debug().Msgf("[%s] Attempting to load", b)
log.Info().Msgf("[%s] Attempting to load", b)
options := []Option{
WithBackendString(b),
WithModel(o.model),
@@ -257,14 +257,14 @@ func (ml *ModelLoader) GreedyLoader(opts ...Option) (*grpc.Client, error) {
model, modelerr := ml.BackendLoader(options...)
if modelerr == nil && model != nil {
log.Debug().Msgf("[%s] Loads OK", b)
log.Info().Msgf("[%s] Loads OK", b)
return model, nil
} else if modelerr != nil {
err = multierror.Append(err, modelerr)
log.Debug().Msgf("[%s] Fails: %s", b, modelerr.Error())
log.Info().Msgf("[%s] Fails: %s", b, modelerr.Error())
} else if model == nil {
err = multierror.Append(err, fmt.Errorf("backend returned no usable model"))
log.Debug().Msgf("[%s] Fails: %s", b, "backend returned no usable model")
log.Info().Msgf("[%s] Fails: %s", b, "backend returned no usable model")
}
}

View File

@@ -29,9 +29,9 @@ func DisplayDownloadFunction(fileName string, current string, total string, perc
}
if total != "" {
log.Debug().Msgf("Downloading %s: %s/%s (%.2f%%) ETA: %s", fileName, current, total, percentage, eta)
log.Info().Msgf("Downloading %s: %s/%s (%.2f%%) ETA: %s", fileName, current, total, percentage, eta)
} else {
log.Debug().Msgf("Downloading: %s", current)
log.Info().Msgf("Downloading: %s", current)
}
}
}

View File

@@ -15,27 +15,8 @@ import (
"github.com/rs/zerolog/log"
)
const (
githubURI = "github:"
)
func GetURI(url string, f func(url string, i []byte) error) error {
if strings.HasPrefix(url, githubURI) {
parts := strings.Split(url, ":")
repoParts := strings.Split(parts[1], "@")
branch := "main"
if len(repoParts) > 1 {
branch = repoParts[1]
}
repoPath := strings.Split(repoParts[0], "/")
org := repoPath[0]
project := repoPath[1]
projectPath := strings.Join(repoPath[2:], "/")
url = fmt.Sprintf("https://raw.githubusercontent.com/%s/%s/%s/%s", org, project, branch, projectPath)
}
url = ConvertURL(url)
if strings.HasPrefix(url, "file://") {
rawURL := strings.TrimPrefix(url, "file://")
@@ -71,10 +52,57 @@ func GetURI(url string, f func(url string, i []byte) error) error {
return f(url, body)
}
const (
HuggingFacePrefix = "huggingface://"
HTTPPrefix = "http://"
HTTPSPrefix = "https://"
GithubURI = "github:"
GithubURI2 = "github://"
)
func LooksLikeURL(s string) bool {
return strings.HasPrefix(s, HTTPPrefix) ||
strings.HasPrefix(s, HTTPSPrefix) ||
strings.HasPrefix(s, HuggingFacePrefix) ||
strings.HasPrefix(s, GithubURI) ||
strings.HasPrefix(s, GithubURI2)
}
func ConvertURL(s string) string {
switch {
case strings.HasPrefix(s, "huggingface://"):
repository := strings.Replace(s, "huggingface://", "", 1)
case strings.HasPrefix(s, GithubURI2):
repository := strings.Replace(s, GithubURI2, "", 1)
repoParts := strings.Split(repository, "@")
branch := "main"
if len(repoParts) > 1 {
branch = repoParts[1]
}
repoPath := strings.Split(repoParts[0], "/")
org := repoPath[0]
project := repoPath[1]
projectPath := strings.Join(repoPath[2:], "/")
return fmt.Sprintf("https://raw.githubusercontent.com/%s/%s/%s/%s", org, project, branch, projectPath)
case strings.HasPrefix(s, GithubURI):
parts := strings.Split(s, ":")
repoParts := strings.Split(parts[1], "@")
branch := "main"
if len(repoParts) > 1 {
branch = repoParts[1]
}
repoPath := strings.Split(repoParts[0], "/")
org := repoPath[0]
project := repoPath[1]
projectPath := strings.Join(repoPath[2:], "/")
return fmt.Sprintf("https://raw.githubusercontent.com/%s/%s/%s/%s", org, project, branch, projectPath)
case strings.HasPrefix(s, HuggingFacePrefix):
repository := strings.Replace(s, HuggingFacePrefix, "", 1)
// convert repository to a full URL.
// e.g. TheBloke/Mixtral-8x7B-v0.1-GGUF/mixtral-8x7b-v0.1.Q2_K.gguf@main -> https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q2_K.gguf
owner := strings.Split(repository, "/")[0]