Mirror of https://github.com/mudler/LocalAI.git (synced 2026-02-03 11:13:31 -05:00)
Compare commits
37 Commits: timeout_te ... v2.18.1

Commit SHA1s:
- b941732f54
- e591ff2e74
- bd2f95c130
- ad85c5a1e7
- 421eb8a727
- b7ff441cc0
- 83d867ad46
- 6acba2bcbe
- 6a2a10603c
- 356907a5cf
- 7ab7a188d0
- ff1a5bfc62
- 522f185baf
- f7b5a4ca7d
- 1d30955677
- d3307e93d3
- 8d9a452e4b
- 466eb82845
- 7e562d10a3
- 7b1e792732
- 30b883affe
- 20ec4d0342
- a9f8460086
- 98b3b2b1ab
- e8bc0a789b
- 2b6a2c7dde
- c8c8238f9d
- 3eaf59021c
- a8bfb6f9c2
- b783c811db
- 59af0e77af
- 5d83c8d3a2
- 8f968d0341
- f93fe30350
- 784ccf97ba
- a0163dafce
- f072cb3cd0
.github/workflows/image.yml (8 changes, vendored)
@@ -39,7 +39,7 @@ jobs:
strategy:
# Pushing with all jobs in parallel
# eats the bandwidth of all the nodes
max-parallel: ${{ github.event_name != 'pull_request' && 6 || 12 }}
max-parallel: ${{ github.event_name != 'pull_request' && 6 || 10 }}
matrix:
include:
# Extra images
@@ -257,6 +257,7 @@ jobs:
quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
strategy:
max-parallel: ${{ github.event_name != 'pull_request' && 2 || 4 }}
matrix:
include:
- build-type: ''
@@ -316,11 +317,12 @@ jobs:
base-image: "ubuntu:22.04"
makeflags: "--jobs=4 --output-sync=target"
- build-type: 'vulkan'
platforms: 'linux/amd64,linux/arm64'
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-vulkan-ffmpeg-core'
latest-image: 'latest-vulkan-ffmpeg-core'
ffmpeg: 'true'
image-type: 'core'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
makeflags: "--jobs=4 --output-sync=target"
makeflags: "--jobs=4 --output-sync=target"
.github/workflows/test.yml (2 changes, vendored)
@@ -220,7 +220,7 @@ jobs:
export CPLUS_INCLUDE_PATH=/usr/local/include
# Used to run the newer GNUMake version from brew that supports --output-sync
export PATH="/opt/homebrew/opt/make/libexec/gnubin:$PATH"
BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF" make --jobs 4 --output-sync=target test
BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" make --jobs 4 --output-sync=target test
- name: Setup tmate session if tests fail
if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.18
@@ -282,6 +282,8 @@ COPY --from=grpc /opt/grpc /usr/local

# Rebuild with defaults backends
WORKDIR /build

## Build the binary
RUN make build

RUN if [ ! -d "/build/sources/go-piper/piper-phonemize/pi/lib/" ]; then \
Makefile (28 changes)
@@ -5,7 +5,7 @@ BINARY_NAME=local-ai

# llama.cpp versions
GOLLAMA_STABLE_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
CPPLLAMA_VERSION?=e112b610a1a75cb7fa8351e1a933e2e7a755a5ce
CPPLLAMA_VERSION?=cb5fad4c6c2cbef92e9b8b63449e1cb7664e4846

# gpt4all version
GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
@@ -54,7 +54,7 @@ override LD_FLAGS += -X "github.com/go-skynet/LocalAI/internal.Commit=$(shell gi

OPTIONAL_TARGETS?=

OS := $(shell uname -s)
export OS := $(shell uname -s)
ARCH := $(shell uname -m)
GREEN := $(shell tput -Txterm setaf 2)
YELLOW := $(shell tput -Txterm setaf 3)
@@ -80,8 +80,8 @@ ifeq ($(OS),Darwin)
BUILD_TYPE=metal
# disable metal if on Darwin and any other value is explicitly passed.
else ifneq ($(BUILD_TYPE),metal)
CMAKE_ARGS+=-DLLAMA_METAL=OFF
export LLAMA_NO_ACCELERATE=1
CMAKE_ARGS+=-DGGML_METAL=OFF
export GGML_NO_ACCELERATE=1
endif

ifeq ($(BUILD_TYPE),metal)
@@ -98,13 +98,13 @@ endif

ifeq ($(BUILD_TYPE),cublas)
CGO_LDFLAGS+=-lcublas -lcudart -L$(CUDA_LIBPATH)
export LLAMA_CUBLAS=1
export GGML_CUDA=1
export WHISPER_CUDA=1
CGO_LDFLAGS_WHISPER+=-L$(CUDA_LIBPATH)/stubs/ -lcuda -lcufft
endif

ifeq ($(BUILD_TYPE),vulkan)
CMAKE_ARGS+=-DLLAMA_VULKAN=1
CMAKE_ARGS+=-DGGML_VULKAN=1
endif

ifeq ($(BUILD_TYPE),hipblas)
@@ -118,13 +118,13 @@ ifeq ($(BUILD_TYPE),hipblas)
export WHISPER_HIPBLAS=1
GPU_TARGETS ?= gfx900,gfx906,gfx908,gfx940,gfx941,gfx942,gfx90a,gfx1030,gfx1031,gfx1100,gfx1101
AMDGPU_TARGETS ?= "$(GPU_TARGETS)"
CMAKE_ARGS+=-DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)"
CMAKE_ARGS+=-DGGML_HIPBLAS=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)"
CGO_LDFLAGS += -O3 --rtlib=compiler-rt -unwindlib=libgcc -lhipblas -lrocblas --hip-link -L${ROCM_HOME}/lib/llvm/lib
endif

ifeq ($(BUILD_TYPE),metal)
CGO_LDFLAGS+=-framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
export LLAMA_METAL=1
export GGML_METAL=1
export WHISPER_METAL=1
endif

@@ -354,7 +354,7 @@ else
endif

dist-cross-linux-arm64:
CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_NATIVE=off" GRPC_BACKENDS="backend-assets/grpc/llama-cpp-fallback backend-assets/grpc/llama-cpp-grpc backend-assets/util/llama-cpp-rpc-server" \
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_NATIVE=off" GRPC_BACKENDS="backend-assets/grpc/llama-cpp-fallback backend-assets/grpc/llama-cpp-grpc backend-assets/util/llama-cpp-rpc-server" \
STATIC=true $(MAKE) build
mkdir -p release
# if BUILD_ID is empty, then we don't append it to the binary name
@@ -711,21 +711,21 @@ backend-assets/grpc/llama-cpp-avx2: backend-assets/grpc
cp -rf backend/cpp/llama backend/cpp/llama-avx2
$(MAKE) -C backend/cpp/llama-avx2 purge
$(info ${GREEN}I llama-cpp build info:avx2${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on" $(MAKE) VARIANT="llama-avx2" build-llama-cpp-grpc-server
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-avx2" build-llama-cpp-grpc-server
cp -rfv backend/cpp/llama-avx2/grpc-server backend-assets/grpc/llama-cpp-avx2

backend-assets/grpc/llama-cpp-avx: backend-assets/grpc
cp -rf backend/cpp/llama backend/cpp/llama-avx
$(MAKE) -C backend/cpp/llama-avx purge
$(info ${GREEN}I llama-cpp build info:avx${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off" $(MAKE) VARIANT="llama-avx" build-llama-cpp-grpc-server
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" $(MAKE) VARIANT="llama-avx" build-llama-cpp-grpc-server
cp -rfv backend/cpp/llama-avx/grpc-server backend-assets/grpc/llama-cpp-avx

backend-assets/grpc/llama-cpp-fallback: backend-assets/grpc
cp -rf backend/cpp/llama backend/cpp/llama-fallback
$(MAKE) -C backend/cpp/llama-fallback purge
$(info ${GREEN}I llama-cpp build info:fallback${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off" $(MAKE) VARIANT="llama-fallback" build-llama-cpp-grpc-server
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" $(MAKE) VARIANT="llama-fallback" build-llama-cpp-grpc-server
cp -rfv backend/cpp/llama-fallback/grpc-server backend-assets/grpc/llama-cpp-fallback
# TODO: every binary should have its own folder instead, so can have different metal implementations
ifeq ($(BUILD_TYPE),metal)
@@ -736,7 +736,7 @@ backend-assets/grpc/llama-cpp-cuda: backend-assets/grpc
cp -rf backend/cpp/llama backend/cpp/llama-cuda
$(MAKE) -C backend/cpp/llama-cuda purge
$(info ${GREEN}I llama-cpp build info:cuda${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_CUDA=ON" $(MAKE) VARIANT="llama-cuda" build-llama-cpp-grpc-server
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_CUDA=ON" $(MAKE) VARIANT="llama-cuda" build-llama-cpp-grpc-server
cp -rfv backend/cpp/llama-cuda/grpc-server backend-assets/grpc/llama-cpp-cuda

backend-assets/grpc/llama-cpp-hipblas: backend-assets/grpc
@@ -764,7 +764,7 @@ backend-assets/grpc/llama-cpp-grpc: backend-assets/grpc
cp -rf backend/cpp/llama backend/cpp/llama-grpc
$(MAKE) -C backend/cpp/llama-grpc purge
$(info ${GREEN}I llama-cpp build info:grpc${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_RPC=ON -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off" $(MAKE) VARIANT="llama-grpc" build-llama-cpp-grpc-server
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_RPC=ON -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" TARGET="--target grpc-server --target rpc-server" $(MAKE) VARIANT="llama-grpc" build-llama-cpp-grpc-server
cp -rfv backend/cpp/llama-grpc/grpc-server backend-assets/grpc/llama-cpp-grpc

backend-assets/util/llama-cpp-rpc-server: backend-assets/grpc/llama-cpp-grpc
@@ -4,34 +4,44 @@ LLAMA_VERSION?=
CMAKE_ARGS?=
BUILD_TYPE?=
ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
TARGET?=--target grpc-server

# If build type is cublas, then we set -DLLAMA_CUBLAS=ON to CMAKE_ARGS automatically
# Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF

# If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
ifeq ($(BUILD_TYPE),cublas)
CMAKE_ARGS+=-DLLAMA_CUBLAS=ON
# If build type is openblas then we set -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
CMAKE_ARGS+=-DGGML_CUDA=ON
# If build type is openblas then we set -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
# to CMAKE_ARGS automatically
else ifeq ($(BUILD_TYPE),openblas)
CMAKE_ARGS+=-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
# If build type is clblas (openCL) we set -DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
# If build type is clblas (openCL) we set -DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
else ifeq ($(BUILD_TYPE),clblas)
CMAKE_ARGS+=-DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
# If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++
else ifeq ($(BUILD_TYPE),hipblas)
CMAKE_ARGS+=-DLLAMA_HIPBLAS=ON
# If it's OSX, DO NOT embed the metal library - -DLLAMA_METAL_EMBED_LIBRARY=ON requires further investigation
CMAKE_ARGS+=-DGGML_HIPBLAS=ON
# If it's OSX, DO NOT embed the metal library - -DGGML_METAL_EMBED_LIBRARY=ON requires further investigation
# But if it's OSX without metal, disable it here
else ifeq ($(OS),darwin)
else ifeq ($(OS),Darwin)
ifneq ($(BUILD_TYPE),metal)
CMAKE_ARGS+=-DLLAMA_METAL=OFF
CMAKE_ARGS+=-DGGML_METAL=OFF
else
CMAKE_ARGS+=-DGGML_METAL=ON
# Until this is tested properly, we disable embedded metal file
# as we already embed it as part of the LocalAI assets
CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=OFF
TARGET+=--target ggml-metal
endif
endif

ifeq ($(BUILD_TYPE),sycl_f16)
CMAKE_ARGS+=-DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
endif

ifeq ($(BUILD_TYPE),sycl_f32)
CMAKE_ARGS+=-DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
endif

llama.cpp:
@@ -62,8 +72,8 @@ grpc-server: llama.cpp llama.cpp/examples/grpc-server
@echo "Building grpc-server with $(BUILD_TYPE) build type and $(CMAKE_ARGS)"
ifneq (,$(findstring sycl,$(BUILD_TYPE)))
bash -c "source $(ONEAPI_VARS); \
cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && $(MAKE)"
cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release $(TARGET)"
else
cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && $(MAKE)
cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release $(TARGET)
endif
cp llama.cpp/build/bin/grpc-server .
@@ -886,6 +886,8 @@ struct llama_server_context
{"task_id", slot->task_id},
});

LOG_TEE("sampling: \n%s\n", llama_sampling_print(slot->sparams).c_str());

return true;
}
@@ -142,12 +142,14 @@ func gRPCPredictOpts(c config.BackendConfig, modelPath string) *pb.PredictOption
MirostatTAU: float32(*c.LLMConfig.MirostatTAU),
Debug: *c.Debug,
StopPrompts: c.StopWords,
Repeat: int32(c.RepeatPenalty),
Repeat: int32(c.RepeatLastN),
FrequencyPenalty: float32(c.FrequencyPenalty),
PresencePenalty: float32(c.PresencePenalty),
Penalty: float32(c.RepeatPenalty),
NKeep: int32(c.Keep),
Batch: int32(c.Batch),
IgnoreEOS: c.IgnoreEOS,
Seed: getSeed(c),
FrequencyPenalty: float32(c.FrequencyPenalty),
MLock: *c.MMlock,
MMap: *c.MMap,
MainGPU: c.MainGPU,
@@ -18,7 +18,7 @@ type TranscriptCMD struct {
Backend string `short:"b" default:"whisper" help:"Backend to run the transcription model"`
Model string `short:"m" required:"" help:"Model name to run the TTS"`
Language string `short:"l" help:"Language of the audio file"`
Translate bool `short:"t" help:"Translate the transcription to english"`
Translate bool `short:"c" help:"Translate the transcription to english"`
Threads int `short:"t" default:"1" help:"Number of threads used for parallel computation"`
ModelsPath string `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
BackendAssetsPath string `env:"LOCALAI_BACKEND_ASSETS_PATH,BACKEND_ASSETS_PATH" type:"path" default:"/tmp/localai/backend_data" help:"Path used to extract libraries that are required by some of the backends in runtime" group:"storage"`
@@ -26,6 +26,7 @@ func RegisterUIRoutes(app *fiber.App,
appConfig *config.ApplicationConfig,
galleryService *services.GalleryService,
auth func(*fiber.Ctx) error) {
tmpLMS := services.NewListModelsService(ml, cl, appConfig) // TODO: once createApplication() is fully in use, reference the central instance.

// keeps the state of models that are being installed from the UI
var processingModels = xsync.NewSyncedMap[string, string]()
@@ -235,7 +236,7 @@ func RegisterUIRoutes(app *fiber.App,

// Show the Chat page
app.Get("/chat/:model", auth, func(c *fiber.Ctx) error {
backendConfigs := cl.GetAllBackendConfigs()
backendConfigs, _ := tmpLMS.ListModels("", true)

summary := fiber.Map{
"Title": "LocalAI - Chat with " + c.Params("model"),
@@ -249,7 +250,7 @@ func RegisterUIRoutes(app *fiber.App,
})

app.Get("/talk/", auth, func(c *fiber.Ctx) error {
backendConfigs := cl.GetAllBackendConfigs()
backendConfigs, _ := tmpLMS.ListModels("", true)

if len(backendConfigs) == 0 {
// If no model is available redirect to the index which suggests how to install models
@@ -259,7 +260,7 @@ func RegisterUIRoutes(app *fiber.App,
summary := fiber.Map{
"Title": "LocalAI - Talk",
"ModelsConfig": backendConfigs,
"Model": backendConfigs[0].Name,
"Model": backendConfigs[0].ID,
"Version": internal.PrintableVersion(),
}

@@ -269,7 +270,7 @@ func RegisterUIRoutes(app *fiber.App,

app.Get("/chat/", auth, func(c *fiber.Ctx) error {

backendConfigs := cl.GetAllBackendConfigs()
backendConfigs, _ := tmpLMS.ListModels("", true)

if len(backendConfigs) == 0 {
// If no model is available redirect to the index which suggests how to install models
@@ -277,9 +278,9 @@ func RegisterUIRoutes(app *fiber.App,
}

summary := fiber.Map{
"Title": "LocalAI - Chat with " + backendConfigs[0].Name,
"Title": "LocalAI - Chat with " + backendConfigs[0].ID,
"ModelsConfig": backendConfigs,
"Model": backendConfigs[0].Name,
"Model": backendConfigs[0].ID,
"Version": internal.PrintableVersion(),
}
@@ -100,10 +100,10 @@ SOFTWARE.
<option value="" disabled class="text-gray-400" >Select a model</option>
{{ $model:=.Model}}
{{ range .ModelsConfig }}
{{ if eq .Name $model }}
<option value="/chat/{{.Name}}" selected class="bg-gray-700 text-white">{{.Name}}</option>
{{ if eq .ID $model }}
<option value="/chat/{{.ID}}" selected class="bg-gray-700 text-white">{{.ID}}</option>
{{ else }}
<option value="/chat/{{.Name}}" class="bg-gray-700 text-white">{{.Name}}</option>
<option value="/chat/{{.ID}}" class="bg-gray-700 text-white">{{.ID}}</option>
{{ end }}
{{ end }}
</select>
@@ -62,7 +62,7 @@
<option value="" disabled class="text-gray-400" >Select a model</option>

{{ range .ModelsConfig }}
<option value="{{.Name}}" class="bg-gray-700 text-white">{{.Name}}</option>
<option value="{{.ID}}" class="bg-gray-700 text-white">{{.ID}}</option>
{{ end }}
</select>
</div>
@@ -76,7 +76,7 @@
<option value="" disabled class="text-gray-400" >Select a model</option>

{{ range .ModelsConfig }}
<option value="{{.Name}}" class="bg-gray-700 text-white">{{.Name}}</option>
<option value="{{.ID}}" class="bg-gray-700 text-white">{{.ID}}</option>
{{ end }}
</select>
</div>
@@ -89,7 +89,7 @@
>
<option value="" disabled class="text-gray-400" >Select a model</option>
{{ range .ModelsConfig }}
<option value="{{.Name}}" class="bg-gray-700 text-white">{{.Name}}</option>
<option value="{{.ID}}" class="bg-gray-700 text-white">{{.ID}}</option>
{{ end }}
</select>
</div>
@@ -25,7 +25,10 @@ type PredictionOptions struct {
Batch int `json:"batch" yaml:"batch"`
IgnoreEOS bool `json:"ignore_eos" yaml:"ignore_eos"`
RepeatPenalty float64 `json:"repeat_penalty" yaml:"repeat_penalty"`
Keep int `json:"n_keep" yaml:"n_keep"`

RepeatLastN int `json:"repeat_last_n" yaml:"repeat_last_n"`

Keep int `json:"n_keep" yaml:"n_keep"`

FrequencyPenalty float64 `json:"frequency_penalty" yaml:"frequency_penalty"`
PresencePenalty float64 `json:"presence_penalty" yaml:"presence_penalty"`
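With the dedicated `repeat_last_n` field added above (and matching entries in the swagger definitions further below), the penalty strength and the penalized window can now be set independently per request. A minimal sketch against LocalAI's OpenAI-compatible endpoint; the model name is a placeholder, not something defined in this change:

```bash
# Hypothetical request: "my-model" stands for whatever model is configured
# locally; repeat_penalty and repeat_last_n map to the fields shown above.
curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "my-model",
        "messages": [{"role": "user", "content": "Hello"}],
        "repeat_penalty": 1.1,
        "repeat_last_n": 64
      }'
```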
@@ -118,7 +118,7 @@ And we convert it to the gguf format that LocalAI can consume:

# Convert to gguf
git clone https://github.com/ggerganov/llama.cpp.git
pushd llama.cpp && make LLAMA_CUBLAS=1 && popd
pushd llama.cpp && make GGML_CUDA=1 && popd

# We need to convert the pytorch model into ggml for quantization
# It creates 'ggml-model-f16.bin' in the 'merged' directory.
@@ -55,4 +55,4 @@ This typically happens when your prompt exceeds the context size. Try to reduce

### I'm getting a 'SIGILL' error, what's wrong?

Your CPU probably does not have support for certain instructions that are compiled by default in the pre-built binaries. If you are running in a container, try setting `REBUILD=true` and disable the CPU instructions that are not compatible with your CPU. For instance: `CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF" make build`
Your CPU probably does not have support for certain instructions that are compiled by default in the pre-built binaries. If you are running in a container, try setting `REBUILD=true` and disable the CPU instructions that are not compatible with your CPU. For instance: `CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" make build`
@@ -101,14 +101,14 @@ Here is the list of the variables available that can be used to customize the bu
LocalAI uses different backends based on ggml and llama.cpp to run models. If your CPU doesn't support common instruction sets, you can disable them during build:

```
CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_AVX=OFF -DLLAMA_FMA=OFF" make build
CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_AVX=OFF -DGGML_FMA=OFF" make build
```

To have effect on the container image, you need to set `REBUILD=true`:

```
docker run quay.io/go-skynet/localai
docker run --rm -ti -p 8080:8080 -e DEBUG=true -e MODELS_PATH=/models -e THREADS=1 -e REBUILD=true -e CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_AVX=OFF -DLLAMA_FMA=OFF" -v $PWD/models:/models quay.io/go-skynet/local-ai:latest
docker run --rm -ti -p 8080:8080 -e DEBUG=true -e MODELS_PATH=/models -e THREADS=1 -e REBUILD=true -e CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_AVX=OFF -DGGML_FMA=OFF" -v $PWD/models:/models quay.io/go-skynet/local-ai:latest
```

{{% /alert %}}
@@ -8,6 +8,16 @@ icon = "rocket_launch"

**LocalAI** is a free, open-source alternative to OpenAI (Anthropic, etc.), functioning as a drop-in replacement REST API for local inferencing. It allows you to run [LLMs]({{% relref "docs/features/text-generation" %}}), generate images, and produce audio, all locally or on-premises with consumer-grade hardware, supporting multiple model families and architectures.

{{% alert icon="💡" %}}

**Security considerations**

If you are exposing LocalAI remotely, make sure you protect the API endpoints adequately with a mechanism that shields them from unwanted incoming traffic, or alternatively run LocalAI with `API_KEY` to gate access with an API key. The API key grants full access to all features (there is no role separation), so it should be treated like an admin credential.

To access the WebUI with an API_KEY, browser extensions such as [Requestly](https://requestly.com/) can be used (see also https://github.com/mudler/LocalAI/issues/2227#issuecomment-2093333752). See also [API flags]({{% relref "docs/advanced/advanced-usage#api-flags" %}}) for the flags / options available when starting LocalAI. A minimal gating example follows after this note.

{{% /alert %}}
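A rough illustration of the API-key gating described above; the key value and image tag are placeholders, not values taken from this change:

```bash
# Start LocalAI with an API key (placeholder value).
docker run --rm -ti -p 8080:8080 -e API_KEY=my-secret-key quay.io/go-skynet/local-ai:latest

# Clients must then present the key as a Bearer token.
curl http://localhost:8080/v1/models -H "Authorization: Bearer my-secret-key"
```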

## Using the Bash Installer

Install LocalAI easily using the bash installer with the following command:
@@ -1,3 +1,3 @@
{
"version": "v2.17.1"
"version": "v2.18.0"
}
@@ -22,7 +22,7 @@ else
echo "@@@@@"
echo "If you are experiencing issues with the pre-compiled builds, try setting REBUILD=true"
echo "If you are still experiencing issues with the build, try setting CMAKE_ARGS and disable the instructions set as needed:"
echo 'CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF"'
echo 'CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF"'
echo "see the documentation at: https://localai.io/basics/build/index.html"
echo "Note: See also https://github.com/go-skynet/LocalAI/issues/288"
echo "@@@@@"
@@ -65,7 +65,7 @@ And we convert it to the gguf format that LocalAI can consume:

# Convert to gguf
git clone https://github.com/ggerganov/llama.cpp.git
pushd llama.cpp && make LLAMA_CUBLAS=1 && popd
pushd llama.cpp && make GGML_CUDA=1 && popd

# We need to convert the pytorch model into ggml for quantization
# It creates 'ggml-model-f16.bin' in the 'merged' directory.
@@ -1600,7 +1600,7 @@
"source": [
"\n",
"!git clone https://github.com/ggerganov/llama.cpp.git\n",
"!cd llama.cpp && make LLAMA_CUBLAS=1\n",
"!cd llama.cpp && make GGML_CUDA=1\n",
"\n"
]
},
@@ -2,7 +2,7 @@ version: "3"

services:
api:
image: quay.io/go-skynet/local-ai:v1.18.0-ffmpeg
image: quay.io/go-skynet/local-ai:latest
# As initially LocalAI will download the models defined in PRELOAD_MODELS
# you might need to tweak the healthcheck values here according to your network connection.
# Here we give a timespan of 20m to download all the required files.
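The comments above refer to the compose healthcheck for this service, which is not shown in the hunk. For reference, a sketch of what such a healthcheck could look like; the endpoint and timing values are assumptions to tune for your connection, keeping the overall window around the 20 minutes mentioned:

```yaml
# Hypothetical healthcheck for the api service above: poll an HTTP endpoint
# and allow roughly 20 minutes of retries while models are downloaded.
healthcheck:
  test: ["CMD", "curl", "-f", "http://localhost:8080/readyz"]
  interval: 1m
  timeout: 10s
  retries: 20
```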
@@ -92,6 +92,41 @@
- filename: qwen2-1.5b-instruct-q8_0.gguf
sha256: c9d33989d77f4bd6966084332087921b9613eda01d5f44dc0b4e9a7382a2bfbb
uri: huggingface://DeepMount00/Qwen2-1.5B-Ita-GGUF/qwen2-1.5b-instruct-q8_0.gguf
- !!merge <<: *qwen2
name: "einstein-v7-qwen2-7b"
icon: https://cdn-uploads.huggingface.co/production/uploads/6468ce47e134d050a58aa89c/KLQP1jK-DIzpwHzYRIH-Q.png
description: |
This model is a full fine-tuned version of Qwen/Qwen2-7B on diverse datasets.
urls:
- https://huggingface.co/Weyaxi/Einstein-v7-Qwen2-7B
- https://huggingface.co/bartowski/Einstein-v7-Qwen2-7B-GGUF
overrides:
parameters:
model: Einstein-v7-Qwen2-7B-Q4_K_M.gguf
files:
- filename: Einstein-v7-Qwen2-7B-Q4_K_M.gguf
sha256: 277b212ea65894723d2b86fb0f689fa5ecb54c9794f0fd2fb643655dc62812ce
uri: huggingface://bartowski/Einstein-v7-Qwen2-7B-GGUF/Einstein-v7-Qwen2-7B-Q4_K_M.gguf
- !!merge <<: *qwen2
name: "arcee-spark"
icon: https://i.ibb.co/80ssNWS/o-Vdk-Qx-ARNmzr-Pi1h-Efj-SA.webp
description: |
Arcee Spark is a powerful 7B parameter language model that punches well above its weight class. Initialized from Qwen2, this model underwent a sophisticated training process:

Fine-tuned on 1.8 million samples
Merged with Qwen2-7B-Instruct using Arcee's mergekit
Further refined using Direct Preference Optimization (DPO)

This meticulous process results in exceptional performance, with Arcee Spark achieving the highest score on MT-Bench for models of its size, outperforming even GPT-3.5 on many tasks.
urls:
- https://huggingface.co/arcee-ai/Arcee-Spark-GGUF
overrides:
parameters:
model: Arcee-Spark-Q4_K_M.gguf
files:
- filename: Arcee-Spark-Q4_K_M.gguf
sha256: 44123276d7845dc13f73ca4aa431dc4c931104eb7d2186f2a73d076fa0ee2330
uri: huggingface://arcee-ai/Arcee-Spark-GGUF/Arcee-Spark-Q4_K_M.gguf
- &mistral03
## START Mistral
url: "github:mudler/LocalAI/gallery/mistral-0.3.yaml@master"
@@ -277,6 +312,34 @@
- filename: gemma-1.1-7b-it-Q4_K_M.gguf
sha256: 47821da72ee9e80b6fd43c6190ad751b485fb61fa5664590f7a73246bcd8332e
uri: huggingface://bartowski/gemma-1.1-7b-it-GGUF/gemma-1.1-7b-it-Q4_K_M.gguf
- !!merge <<: *gemma
name: "gemma-2-27b-it"
urls:
- https://huggingface.co/google/gemma-2-27b-it
- https://huggingface.co/bartowski/gemma-2-27b-it-GGUF
description: |
Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models. They are text-to-text, decoder-only large language models, available in English, with open weights for both pre-trained variants and instruction-tuned variants. Gemma models are well-suited for a variety of text generation tasks, including question answering, summarization, and reasoning. Their relatively small size makes it possible to deploy them in environments with limited resources such as a laptop, desktop or your own cloud infrastructure, democratizing access to state of the art AI models and helping foster innovation for everyone.
overrides:
parameters:
model: gemma-2-27b-it-Q4_K_M.gguf
files:
- filename: gemma-2-27b-it-Q4_K_M.gguf
uri: huggingface://bartowski/gemma-2-27b-it-GGUF/gemma-2-27b-it-Q4_K_M.gguf
sha256: ca86fbdb791842cf2e5eb276a6916e326b3b5d58d9ab60ee3e18b1c6f01fc181
- !!merge <<: *gemma
name: "gemma-2-9b-it"
urls:
- https://huggingface.co/google/gemma-2-9b-it
- https://huggingface.co/bartowski/gemma-2-9b-it-GGUF
description: |
Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models. They are text-to-text, decoder-only large language models, available in English, with open weights for both pre-trained variants and instruction-tuned variants. Gemma models are well-suited for a variety of text generation tasks, including question answering, summarization, and reasoning. Their relatively small size makes it possible to deploy them in environments with limited resources such as a laptop, desktop or your own cloud infrastructure, democratizing access to state of the art AI models and helping foster innovation for everyone.
overrides:
parameters:
model: gemma-2-9b-it-Q4_K_M.gguf
files:
- filename: gemma-2-9b-it-Q4_K_M.gguf
uri: huggingface://bartowski/gemma-2-9b-it-GGUF/gemma-2-9b-it-Q4_K_M.gguf
sha256: c70fd20caec79fb953b83031c46ddea4e99905835a66af7b8a856aa1b2534614
- &llama3
url: "github:mudler/LocalAI/gallery/llama3-instruct.yaml@master"
icon: https://cdn-uploads.huggingface.co/production/uploads/642cc1c253e76b4c2286c58e/aJJxKus1wP5N-euvHEUq7.png
@@ -1205,6 +1268,83 @@
- filename: LLaMA3-iterative-DPO-final-Q4_K_M.gguf
sha256: 480703ff85af337e1db2a9d9a678a3ac8ca0802e366b14d9c59b81d3fc689da8
uri: huggingface://bartowski/LLaMA3-iterative-DPO-final-GGUF/LLaMA3-iterative-DPO-final-Q4_K_M.gguf
- !!merge <<: *llama3
name: "new-dawn-llama-3-70b-32K-v1.0"
urls:
- https://huggingface.co/bartowski/New-Dawn-Llama-3-70B-32K-v1.0-GGUF
- https://huggingface.co/sophosympatheia/New-Dawn-Llama-3-70B-32K-v1.0
icon: https://imgur.com/tKzncGo.png
description: |
This model is a multi-level SLERP merge of several Llama 3 70B variants. See the merge recipe below for details. I extended the context window for this model out to 32K by snagging some layers from abacusai/Smaug-Llama-3-70B-Instruct-32K using a technique similar to what I used for Midnight Miqu, which was further honed by jukofyork.
This model is uncensored. You are responsible for whatever you do with it.

This model was designed for roleplaying and storytelling and I think it does well at both. It may also perform well at other tasks but I have not tested its performance in other areas.
overrides:
parameters:
model: New-Dawn-Llama-3-70B-32K-v1.0-Q4_K_M.gguf
files:
- filename: New-Dawn-Llama-3-70B-32K-v1.0-Q4_K_M.gguf
sha256: 30561ae5decac4ad46775c76a9a40fb43436ade96bc132b4b9cc6749b9e2f448
uri: huggingface://bartowski/New-Dawn-Llama-3-70B-32K-v1.0-GGUF/New-Dawn-Llama-3-70B-32K-v1.0-Q4_K_M.gguf
- !!merge <<: *llama3
name: "l3-aethora-15b-v2"
urls:
- https://huggingface.co/bartowski/L3-Aethora-15B-V2-GGUF
- https://huggingface.co/ZeusLabs/L3-Aethora-15B-V2
icon: https://cdn-uploads.huggingface.co/production/uploads/64545af5ec40bbbd01242ca6/yJpwVd5UTnAVDoEPVVCS1.png
description: |
L3-Aethora-15B v2 is an advanced language model built upon the Llama 3 architecture. It employs state-of-the-art training techniques and a curated dataset to deliver enhanced performance across a wide range of tasks.
overrides:
parameters:
model: L3-Aethora-15B-V2-Q4_K_M.gguf
files:
- filename: L3-Aethora-15B-V2-Q4_K_M.gguf
sha256: 014a215739e1574e354780f218776e54807548d0c32555274c4d96d7628f29b6
uri: huggingface://bartowski/L3-Aethora-15B-V2-GGUF/L3-Aethora-15B-V2-Q4_K_M.gguf
- !!merge <<: *llama3
name: "bungo-l3-8b-iq-imatrix"
urls:
- https://huggingface.co/Lewdiculous/Bungo-L3-8B-GGUF-IQ-Imatrix-Request
icon: https://cdn-uploads.huggingface.co/production/uploads/65d4cf2693a0a3744a27536c/ezaxE50ef-7RsFi3gUbNp.webp
description: |
An experimental model that turned really well. Scores high on Chai leaderboard (slerp8bv2 there). Feel smarter than average L3 merges for RP.
overrides:
parameters:
model: Bungo-L3-8B-Q4_K_M-imat.gguf
files:
- filename: Bungo-L3-8B-Q4_K_M-imat.gguf
sha256: 88d0139954e8f9525b80636a6269df885008c4837a1332f84f9a5dc6f37c9b8f
uri: huggingface://Lewdiculous/Bungo-L3-8B-GGUF-IQ-Imatrix-Request/Bungo-L3-8B-Q4_K_M-imat.gguf
- !!merge <<: *llama3
name: "llama3-8b-darkidol-2.1-uncensored-1048k-iq-imatrix"
urls:
- https://huggingface.co/LWDCLS/llama3-8B-DarkIdol-2.1-Uncensored-1048K-GGUF-IQ-Imatrix-Request
icon: https://cdn-uploads.huggingface.co/production/uploads/65d4cf2693a0a3744a27536c/tKL5W1G5WCHm4609LEmiM.png
description: |
The module combination has been readjusted to better fulfill various roles and has been adapted for mobile phones.
Uncensored 1048K
overrides:
parameters:
model: llama3-8B-DarkIdol-2.1-Uncensored-1048K-Q4_K_M-imat.gguf
files:
- filename: llama3-8B-DarkIdol-2.1-Uncensored-1048K-Q4_K_M-imat.gguf
sha256: 86f0f1e10fc315689e09314aebb7354bb40d8fe95de008d21a75dc8fff1cd2fe
uri: huggingface://LWDCLS/llama3-8B-DarkIdol-2.1-Uncensored-1048K-GGUF-IQ-Imatrix-Request/llama3-8B-DarkIdol-2.1-Uncensored-1048K-Q4_K_M-imat.gguf
- !!merge <<: *llama3
name: "llama3-turbcat-instruct-8b"
urls:
- https://huggingface.co/turboderp/llama3-turbcat-instruct-8b
- https://huggingface.co/bartowski/llama3-turbcat-instruct-8b-GGUF
icon: https://huggingface.co/turboderp/llama3-turbcat-instruct-8b/resolve/main/8.png
description: |
This is a direct upgrade over cat 70B, with 2x the dataset size(2GB-> 5GB), added Chinese support with quality on par with the original English dataset. The medical COT portion of the dataset has been sponsored by steelskull, and the action packed character play portion was donated by Gryphe's(aesir dataset). Note that 8b is based on llama3 with limited Chinese support due to base model choice. The chat format in 8b is llama3. The 72b has more comprehensive Chinese support and the format will be chatml.
overrides:
parameters:
model: llama3-turbcat-instruct-8b-Q4_K_M.gguf
files:
- filename: llama3-turbcat-instruct-8b-Q4_K_M.gguf
sha256: a9a36e3220d901a8ad80c75608a81aaeed3a9cdf111247462bf5e3443aad5461
uri: huggingface://bartowski/llama3-turbcat-instruct-8b-GGUF/llama3-turbcat-instruct-8b-Q4_K_M.gguf
- &dolphin
name: "dolphin-2.9-llama3-8b"
url: "github:mudler/LocalAI/gallery/hermes-2-pro-mistral.yaml@master"
@@ -1982,6 +2122,25 @@
- filename: Llama-3-Update-3.0-mmproj-model-f16.gguf
sha256: 3d2f36dff61d6157cadf102df86a808eb9f8a230be1bc0bc99039d81a895468a
uri: huggingface://Nitral-AI/Llama-3-Update-3.0-mmproj-model-f16/Llama-3-Update-3.0-mmproj-model-f16.gguf
- !!merge <<: *llama3
name: "llama3-8b-darkidol-1.2-iq-imatrix"
urls:
- https://huggingface.co/LWDCLS/llama3-8B-DarkIdol-1.2-GGUF-IQ-Imatrix-Request
- https://huggingface.co/aifeifei798/llama3-8B-DarkIdol-1.2
description: |
The module combination has been readjusted to better fulfill various roles and has been adapted for mobile phones.
icon: https://huggingface.co/aifeifei798/llama3-8B-DarkIdol-1.2/resolve/main/llama3-8B-DarkIdol-1.2.png
overrides:
mmproj: Llama-3-Update-3.0-mmproj-model-f16.gguf
parameters:
model: llama3-8B-DarkIdol-1.2-Q4_K_M-imat.gguf
files:
- filename: llama3-8B-DarkIdol-1.2-Q4_K_M-imat.gguf
sha256: dce2f5f1661f49fb695b038d973770b0d9059bced4e4bb212f6517aa219131cd
uri: huggingface://LWDCLS/llama3-8B-DarkIdol-1.2-GGUF-IQ-Imatrix-Request/llama3-8B-DarkIdol-1.2-Q4_K_M-imat.gguf
- filename: Llama-3-Update-3.0-mmproj-model-f16.gguf
sha256: 3d2f36dff61d6157cadf102df86a808eb9f8a230be1bc0bc99039d81a895468a
uri: huggingface://Nitral-AI/Llama-3-Update-3.0-mmproj-model-f16/Llama-3-Update-3.0-mmproj-model-f16.gguf
- &chatml
### ChatML
url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
@@ -2545,6 +2704,69 @@
- filename: "Codestral-22B-v0.1-Q4_K_M.gguf"
uri: "huggingface://bartowski/Codestral-22B-v0.1-GGUF/Codestral-22B-v0.1-Q4_K_M.gguf"
sha256: 003e48ed892850b80994fcddca2bd6b833b092a4ef2db2853c33a3144245e06c
- &llm-compiler
url: "github:mudler/LocalAI/gallery/codellama.yaml@master"
name: "llm-compiler-13b-imat"
license: other
description: |
LLM Compiler is a state-of-the-art LLM that builds upon Code Llama with improved performance for code optimization and compiler reasoning.
LLM Compiler is free for both research and commercial use.
LLM Compiler is available in two flavors:

LLM Compiler, the foundational models, pretrained on over 500B tokens of LLVM-IR, x86_84, ARM, and CUDA assembly codes and trained to predict the effect of LLVM optimizations;
and LLM Compiler FTD, which is further fine-tuned to predict the best optimizations for code in LLVM assembly to reduce code size, and to disassemble assembly code to LLVM-IR.
urls:
- https://huggingface.co/legraphista/llm-compiler-13b-IMat-GGUF
- https://huggingface.co/facebook/llm-compiler-13b
tags:
- llm
- gguf
- gpu
- code
- cpu
overrides:
parameters:
model: llm-compiler-13b.Q4_K.gguf
files:
- filename: "llm-compiler-13b.Q4_K.gguf"
uri: "huggingface://legraphista/llm-compiler-13b-IMat-GGUF/llm-compiler-13b.Q4_K.gguf"
sha256: dad41a121d0d67432c289aba8ffffc93159e2b24ca3d1c62e118c9f4cbf0c890
- !!merge <<: *llm-compiler
name: "llm-compiler-13b-ftd"
urls:
- https://huggingface.co/QuantFactory/llm-compiler-13b-ftd-GGUF
- https://huggingface.co/facebook/llm-compiler-13b-ftd
overrides:
parameters:
model: llm-compiler-13b-ftd.Q4_K_M.gguf
files:
- filename: "llm-compiler-13b-ftd.Q4_K_M.gguf"
uri: "huggingface://QuantFactory/llm-compiler-13b-ftd-GGUF/llm-compiler-13b-ftd.Q4_K_M.gguf"
sha256: a5d19ae6b3fbe6724784363161b66cd2c8d8a3905761c0fb08245b3c03697db1
- !!merge <<: *llm-compiler
name: "llm-compiler-7b-imat-GGUF"
urls:
- https://huggingface.co/legraphista/llm-compiler-7b-IMat-GGUF
- https://huggingface.co/facebook/llm-compiler-7b
overrides:
parameters:
model: llm-compiler-7b.Q4_K.gguf
files:
- filename: "llm-compiler-7b.Q4_K.gguf"
uri: "huggingface://legraphista/llm-compiler-7b-IMat-GGUF/llm-compiler-7b.Q4_K.gguf"
sha256: 84926979701fa4591ff5ede94a6c5829a62efa620590e5815af984707d446926
- !!merge <<: *llm-compiler
name: "llm-compiler-7b-ftd-imat"
urls:
- https://huggingface.co/legraphista/llm-compiler-7b-ftd-IMat-GGUF
- https://huggingface.co/facebook/llm-compiler-7b-ftd
overrides:
parameters:
model: llm-compiler-7b-ftd.Q4_K.gguf
files:
- filename: "llm-compiler-7b-ftd.Q4_K.gguf"
uri: "huggingface://legraphista/llm-compiler-7b-ftd-IMat-GGUF/llm-compiler-7b-ftd.Q4_K.gguf"
sha256: d862dd18ed335413787d0ad196522a9902a3c10a6456afdab8721822cb0ddde8
- &openvino
### START OpenVINO
url: "github:mudler/LocalAI/gallery/openvino.yaml@master"
@@ -42,3 +42,9 @@ func SetPrefix(suffix string) func(*GrammarOption) {
o.Prefix = suffix
}
}

func SetPropOrder(order string) func(*GrammarOption) {
return func(o *GrammarOption) {
o.PropOrder = order
}
}
@@ -32,6 +32,11 @@ type GrammarConfig struct {

// ExpectStringsAfterJSON enables mixed string suffix
ExpectStringsAfterJSON bool `yaml:"expect_strings_after_json"`

// PropOrder selects what order to print properties
// for instance name,arguments will make print { "name": "foo", "arguments": { "bar": "baz" } }
// instead of { "arguments": { "bar": "baz" }, "name": "foo" }
PropOrder string `yaml:"properties_order"`
}

// FunctionsConfig is the configuration for the tool/function call.
@@ -104,6 +109,8 @@ func (g GrammarConfig) Options() []func(o *GrammarOption) {
if g.ExpectStringsAfterJSON {
opts = append(opts, ExpectStringsAfterJSON)
}

opts = append(opts, SetPropOrder(g.PropOrder))
return opts
}
@@ -247,14 +247,23 @@ func selectGRPCProcess(backend, assetDir string, f16 bool) string {
}

if xsysinfo.HasCPUCaps(cpuid.AVX2) {
log.Info().Msgf("[%s] attempting to load with AVX2 variant", backend)
grpcProcess = backendPath(assetDir, LLamaCPPAVX2)
p := backendPath(assetDir, LLamaCPPAVX2)
if _, err := os.Stat(p); err == nil {
log.Info().Msgf("[%s] attempting to load with AVX2 variant", backend)
grpcProcess = p
}
} else if xsysinfo.HasCPUCaps(cpuid.AVX) {
log.Info().Msgf("[%s] attempting to load with AVX variant", backend)
grpcProcess = backendPath(assetDir, LLamaCPPAVX)
p := backendPath(assetDir, LLamaCPPAVX)
if _, err := os.Stat(p); err == nil {
log.Info().Msgf("[%s] attempting to load with AVX variant", backend)
grpcProcess = p
}
} else {
log.Info().Msgf("[%s] attempting to load with fallback variant", backend)
grpcProcess = backendPath(assetDir, LLamaCPPFallback)
p := backendPath(assetDir, LLamaCPPFallback)
if _, err := os.Stat(p); err == nil {
log.Info().Msgf("[%s] attempting to load with fallback variant", backend)
grpcProcess = p
}
}

return grpcProcess
@@ -509,6 +518,39 @@ func (ml *ModelLoader) GreedyLoader(opts ...Option) (grpc.Backend, error) {
err = errors.Join(err, fmt.Errorf("backend %s returned no usable model", key))
log.Info().Msgf("[%s] Fails: %s", key, "backend returned no usable model")
}

if autoDetect && key == LLamaCPP && err != nil {
// try as hard as possible to run the llama.cpp variants
backendToUse := ""
if xsysinfo.HasCPUCaps(cpuid.AVX2) {
if _, err := os.Stat(backendPath(o.assetDir, LLamaCPPAVX2)); err == nil {
backendToUse = LLamaCPPAVX2
}
} else if xsysinfo.HasCPUCaps(cpuid.AVX) {
if _, err := os.Stat(backendPath(o.assetDir, LLamaCPPAVX2)); err == nil {
backendToUse = LLamaCPPAVX
}
} else {
if _, err := os.Stat(backendPath(o.assetDir, LLamaCPPFallback)); err == nil {
backendToUse = LLamaCPPFallback
} else {
// If we don't have a fallback, just skip fallback
continue
}
}

// Autodetection failed, try the fallback
log.Info().Msgf("[%s] Autodetection failed, trying the fallback", key)
options = append(options, WithBackendString(backendToUse))
model, modelerr = ml.BackendLoader(options...)
if modelerr == nil && model != nil {
log.Info().Msgf("[%s] Loads OK", key)
return model, nil
} else {
err = errors.Join(err, fmt.Errorf("[%s]: %w", key, modelerr))
log.Info().Msgf("[%s] Fails: %s", key, modelerr.Error())
}
}
}

return nil, fmt.Errorf("could not load model - all backends returned error: %s", err.Error())
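Both hunks above pick a llama.cpp variant from the detected CPU capabilities (AVX2, then AVX, then the plain fallback build) and only use a variant whose binary actually exists in the asset directory. A quick, generic way to see which branch a Linux host will take (not part of this change) is to inspect the CPU flags:

```bash
# Prints "avx2" and/or "avx" if the CPU advertises them: avx2 selects the
# llama-cpp-avx2 variant, avx alone the avx variant, neither the fallback.
grep -o -w -E 'avx2?' /proc/cpuinfo | sort -u
```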
@@ -701,6 +701,9 @@ const docTemplate = `{
"prompt": {
"description": "Prompt is read only by completion/image API calls"
},
"repeat_last_n": {
"type": "integer"
},
"repeat_penalty": {
"type": "number"
},
@@ -751,6 +754,10 @@ const docTemplate = `{
"description": "Common options between all the API calls, part of the OpenAI spec",
"type": "number"
},
"translate": {
"description": "Only for audio transcription",
"type": "boolean"
},
"typical_p": {
"type": "number"
},
@@ -694,6 +694,9 @@
"prompt": {
"description": "Prompt is read only by completion/image API calls"
},
"repeat_last_n": {
"type": "integer"
},
"repeat_penalty": {
"type": "number"
},
@@ -744,6 +747,10 @@
"description": "Common options between all the API calls, part of the OpenAI spec",
"type": "number"
},
"translate": {
"description": "Only for audio transcription",
"type": "boolean"
},
"typical_p": {
"type": "number"
},
@@ -292,6 +292,8 @@ definitions:
type: number
prompt:
description: Prompt is read only by completion/image API calls
repeat_last_n:
type: integer
repeat_penalty:
type: number
response_format:
@@ -328,6 +330,9 @@ definitions:
description: Common options between all the API calls, part of the OpenAI
spec
type: number
translate:
description: Only for audio transcription
type: boolean
typical_p:
type: number
use_fast_tokenizer:
@@ -1,7 +1,6 @@
package e2e_test

import (
"bytes"
"context"
"fmt"
"os"
@@ -39,7 +38,7 @@ var _ = BeforeSuite(func() {

var defaultConfig openai.ClientConfig
if apiEndpoint == "" {
startDockerImage("")
startDockerImage()
defaultConfig = openai.DefaultConfig(apiKey)
apiEndpoint = "http://localhost:" + apiPort + "/v1" // So that other tests can reference this value safely.
defaultConfig.BaseURL = apiEndpoint
@@ -59,41 +58,9 @@ var _ = BeforeSuite(func() {
})

var _ = AfterSuite(func() {

// if the suite failed, logs will be printed
// to the console
if CurrentGinkgoTestDescription().Failed {
if resource != nil {
logs := bytes.NewBufferString("")
err := pool.Client.Logs(docker.LogsOptions{
Container: resource.Container.ID,
OutputStream: logs,
ErrorStream: logs,
Stdout: true,
Stderr: true,
Timestamps: true,
})
if err != nil {
fmt.Println("Could not take logs for failed suite", err.Error())
}
fmt.Println("Suite failed, printing logs")
fmt.Println(logs.String())

c, err := pool.Client.InspectContainer(resource.Container.ID)
if err != nil {
fmt.Println("Could not inspect container", err.Error())
}
fmt.Println("Container state")
fmt.Println("Running:", c.State.Running)
fmt.Println("ExitCode:", c.State.ExitCode)
fmt.Println("Error:", c.State.Error)
}
}

if resource != nil {
Expect(pool.Purge(resource)).To(Succeed())
}

//dat, err := os.ReadFile(resource.Container.LogPath)
//Expect(err).To(Not(HaveOccurred()))
//Expect(string(dat)).To(ContainSubstring("GRPC Service Ready"))
@@ -104,8 +71,8 @@ var _ = AfterEach(func() {
//Expect(dbClient.Clear()).To(Succeed())
})

func startDockerImage(endpoint string) {
p, err := dockertest.NewPool(endpoint)
func startDockerImage() {
p, err := dockertest.NewPool("")
Expect(err).To(Not(HaveOccurred()))
Expect(p.Client.Ping()).To(Succeed())