Compare commits

..

163 Commits

Author SHA1 Message Date
Ettore Di Giacinto
2bbb221fb1 tests(petals): temp disable 2024-04-08 21:28:59 +00:00
LocalAI [bot]
195be10050 ⬆️ Update ggerganov/llama.cpp (#1973)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-04-08 23:26:52 +02:00
fakezeta
a38618db02 fix regression #1971 (#1972)
fixes regression #1971 introduced by intel_extension_for_transformers==1.4
2024-04-08 22:33:51 +02:00
LocalAI [bot]
efcca15d3f ⬆️ Update ggerganov/llama.cpp (#1970)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-04-08 08:38:47 +02:00
LocalAI [bot]
a153b628c2 ⬆️ Update ggerganov/whisper.cpp (#1969)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-04-08 08:38:17 +02:00
Ettore Di Giacinto
f36d86ba6d fix(hermes-2-pro-mistral): correct dashes in template to suppress newlines (#1966)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-04-07 18:23:47 +02:00
Ettore Di Giacinto
74492a81c7 doc(quickstart): fix typo
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-04-07 11:06:35 +02:00
LocalAI [bot]
ed13782986 ⬆️ Update ggerganov/llama.cpp (#1964)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-04-07 10:32:10 +02:00
Ettore Di Giacinto
8342553214 fix(llama.cpp): set better defaults for llama.cpp (#1961)
fix(defaults): set better defaults for llama.cpp

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-04-06 22:56:45 +02:00
LocalAI [bot]
8aa5f5a660 ⬆️ Update ggerganov/llama.cpp (#1960)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-04-06 19:15:25 +00:00
LocalAI [bot]
b2d9e3f704 ⬆️ Update ggerganov/llama.cpp (#1959)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-04-05 08:41:55 +02:00
LocalAI [bot]
f744e1f931 ⬆️ Update ggerganov/whisper.cpp (#1958)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-04-05 08:41:35 +02:00
cryptk
b85dad0286 feat: first pass at improving logging (#1956)
Signed-off-by: Chris Jowett <421501+cryptk@users.noreply.github.com>
2024-04-04 09:24:22 +02:00
LocalAI [bot]
3851b51d98 ⬆️ Update ggerganov/llama.cpp (#1953)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-04-04 00:27:57 +02:00
Ettore Di Giacinto
ff77d3bc22 fix(seed): generate random seed per-request if -1 is set (#1952)
* fix(seed): generate random seed per-request if -1 is set

Also update ci with new workflows and allow the aio tests to run with an
api key

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* docs(openvino): Add OpenVINO example

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-04-03 22:25:47 +02:00
Ettore Di Giacinto
93cfec3c32 ci: correctly tag latest and aio images 2024-04-03 11:30:23 +02:00
Ettore Di Giacinto
89560ef87f fix(ci): manually tag latest images (#1948)
fix(ci): manually tag images

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-04-02 19:25:46 +02:00
Ettore Di Giacinto
9bc209ba73 fix(welcome): stable model list (#1949) 2024-04-02 19:25:32 +02:00
Ettore Di Giacinto
84e0dc3246 fix(hermes-2-pro-mistral): correct stopwords (#1947)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-04-02 15:38:00 +02:00
LocalAI [bot]
4d4d76114d ⬆️ Update ggerganov/llama.cpp (#1941)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-04-02 09:16:04 +02:00
cryptk
86bc5f1350 fix: use exec in entrypoint scripts to fix signal handling (#1943) 2024-04-02 09:15:44 +02:00
Ettore Di Giacinto
e8f02c083f fix(functions): respect when selected from string (#1940)
* fix(functions): respect when selected from string

* fix(toolschoice): decode both string and objects
2024-04-01 19:39:54 +02:00
Ettore Di Giacinto
ebb1fcedea fix(hermes-2-pro-mistral): add stopword for toolcall (#1939)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-04-01 11:48:35 +02:00
LocalAI [bot]
66f90f8dc1 ⬆️ Update ggerganov/llama.cpp (#1937)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-04-01 08:59:23 +02:00
Ettore Di Giacinto
3c778b538a Update phi-2-orange.yaml
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-03-31 13:06:41 +02:00
Ettore Di Giacinto
35290e146b fix(grammar): respect JSONmode and grammar from user input (#1935)
* fix(grammar): Fix JSON mode and custom grammar

* tests(aio): add jsonmode test

* tests(aio): add functioncall test

* fix(aio): use hermes-2-pro-mistral as llm for CPU profile

* add phi-2-orange
2024-03-31 13:04:09 +02:00
LocalAI [bot]
784657a652 ⬆️ Update ggerganov/llama.cpp (#1934)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-31 00:27:38 +01:00
LocalAI [bot]
831efa8893 ⬆️ Update ggerganov/whisper.cpp (#1933)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-31 00:27:16 +01:00
Ettore Di Giacinto
957f428fd5 fix(tools): correctly render tools response in templates (#1932)
* fix(tools): allow to correctly display both Functions and Tools

* models(hermes-2-pro): correctly display function results
2024-03-30 19:02:07 +01:00
Ettore Di Giacinto
61e5e6bc36 fix(swagger): do not specify a host (#1930)
In this way the requests are redirected to the host used by the client
to perform the request.
2024-03-30 12:04:41 +01:00
Ettore Di Giacinto
eab4a91a9b fix(aio): correctly detect intel systems (#1931)
Also rename SIZE to PROFILE
2024-03-30 12:04:32 +01:00
LocalAI [bot]
2bba62ca4d ⬆️ Update ggerganov/llama.cpp (#1928)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-29 22:52:01 +00:00
Ettore Di Giacinto
bcdc83b46d Update quickstart.md
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-03-29 23:00:06 +01:00
Ettore Di Giacinto
92fbdfd06f feat(swagger): update (#1929) 2024-03-29 22:48:58 +01:00
cryptk
93702e39d4 feat(build): adjust number of parallel make jobs (#1915)
* feat(build): adjust number of parallel make jobs

* fix: update make on MacOS from brew to support --output-sync argument

* fix: cache grpc with version as part of key to improve validity of cache hits

* fix: use gmake for tests-apple to use the updated GNU make version

* fix: actually use the new make version for tests-apple

* feat: parallelize tests-extra

* feat: attempt to cache grpc build for docker images

* fix: don't quote GRPC version

* fix: don't cache go modules, we have limited cache space, better used elsewhere

* fix: release with the same version of go that we test with

* fix: don't fail on exporting cache layers

* fix: remove deprecated BUILD_GRPC docker arg from Makefile
2024-03-29 22:32:40 +01:00
LocalAI [bot]
a7fc89c207 ⬆️ Update ggerganov/whisper.cpp (#1927)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-29 22:29:50 +01:00
Ettore Di Giacinto
123a5a2e16 feat(swagger): Add swagger API doc (#1926)
* makefile(build): add minimal and api build target

* feat(swagger): Add swagger
2024-03-29 22:29:33 +01:00
LocalAI [bot]
ab2f403dd0 ⬆️ Update ggerganov/whisper.cpp (#1924)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-29 00:13:59 +01:00
LocalAI [bot]
b9c5e14e2c ⬆️ Update ggerganov/llama.cpp (#1923)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-29 00:13:38 +01:00
Ettore Di Giacinto
bf65ed6eb8 feat(webui): add partials, show backends associated to models (#1922)
* feat(webui): add partials, show backends associated to models

* fix(auth): put assistant and backend under auth
2024-03-28 21:52:52 +01:00
Ettore Di Giacinto
4e79294f97 Update README.md
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-03-28 19:52:40 +01:00
Ettore Di Giacinto
8477e8fac3 Update quickstart.md
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-03-28 18:28:30 +01:00
Ettore Di Giacinto
13ccd2afef docs(aio-usage): update docs to show examples (#1921)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-03-28 18:16:58 +01:00
Ettore Di Giacinto
23b833d171 Update run-other-models.md
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-03-28 12:42:37 +01:00
LocalAI [bot]
07c49ee4b8 ⬆️ Update ggerganov/whisper.cpp (#1914)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-27 22:53:13 +00:00
LocalAI [bot]
07c4bdda7c ⬆️ Update ggerganov/llama.cpp (#1913)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-27 21:57:59 +00:00
Ettore Di Giacinto
2266d8263c Update README.md 2024-03-27 22:48:46 +01:00
Ettore Di Giacinto
160eb48b2b Update quickstart.md 2024-03-27 22:47:59 +01:00
cryptk
0c0efc871c fix(build): better CI logging and correct some build failure modes in Makefile (#1899)
* feat: group make output by target when running parallelized builds in CI

* fix: quote GO_TAGS in makefile to fix handling of whitespace in value

* fix: set CPATH to find opencv2 in it's commonly installed location

* fix: add missing go mod dropreplace for go-llama.cpp

* chore: remove opencv symlink from github workflows
2024-03-27 21:12:19 +01:00
Gianluca Boiano
7ef5f3b473 ⬆️ Update M0Rf30/go-tiny-dream (#1911) 2024-03-27 21:12:04 +01:00
Ettore Di Giacinto
66ee4afb95 feat(welcome): add simple welcome page (#1912)
* feat(welcome): add simple welcome page

* feat(api): add 404 handling
2024-03-27 21:10:58 +01:00
Ettore Di Giacinto
93f0b7ae03 update hot topics
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-03-27 18:17:12 +01:00
fakezeta
8210ffcb6c feat: Token Stream support for Transformer, fix: missing package for OpenVINO (#1908)
* Streaming working

* Small fix for regression on CUDA and XPU

* use pip version of optimum[openvino]

* Update backend/python/transformers/transformers_server.py

Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>

* Token streaming support

fix optimum[openvino] package in install.sh

* Token Streaming support

---------

Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-03-27 17:50:35 +01:00
fakezeta
e7cbe32601 feat: Openvino runtime for transformer backend and streaming support for Openvino and CUDA (#1892)
* fixes #1775 and #1774

Add BitsAndBytes Quantization and fixes embedding on CUDA devices

* Manage 4bit and 8 bit quantization

Manage different BitsAndBytes options with the quantization: parameter in yaml

* fix compilation errors on non CUDA environment

* OpenVINO draft

First draft of OpenVINO integration in transformer backend

* first working implementation

* Streaming working

* Small fix for regression on CUDA and XPU

* use pip version of optimum[openvino]

* Update backend/python/transformers/transformers_server.py

Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>

---------

Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-03-26 23:31:43 +00:00
LocalAI [bot]
b500ceaf73 ⬆️ Update ggerganov/llama.cpp (#1904)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-26 23:21:54 +00:00
LocalAI [bot]
d3c283ac19 ⬆️ Update docs version mudler/LocalAI (#1903)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-26 22:56:42 +01:00
Ettore Di Giacinto
607586e0b7 fix: downgrade torch (#1902)
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-03-26 22:56:02 +01:00
Steven Christou
2d7913b3be feat(assistant): Assistant and AssistantFiles api (#1803)
* Initial implementation of assistants api

* Move load/save configs to utils

* Save assistant and assistantfiles config to disk.

* Add tsets for assistant api

* Fix models path spelling mistake.

* Remove personal go.mod information

---------

Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-03-26 18:54:35 +01:00
Sebastian.W
b7ffe66219 Enhance autogptq backend to support VL models (#1860)
* Enhance autogptq backend to support VL models

* update dependencies for autogptq

* remove redundant auto-gptq dependency

* Convert base64 to image_url for Qwen-VL model

* implemented model inference for qwen-vl

* remove user prompt from generated answer

* fixed write image error

---------

Co-authored-by: Binghua Wu <bingwu@estee.com>
2024-03-26 18:48:14 +01:00
Ettore Di Giacinto
e58410fa99 feat(aio): add intel profile (#1901)
* feat(aio): add intel profile

* docs: clarify AIO images features
2024-03-26 18:45:25 +01:00
LocalAI [bot]
1395e505cd ⬆️ Update ggerganov/llama.cpp (#1897)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-26 00:34:10 +01:00
LocalAI [bot]
42a4c86dca ⬆️ Update ggerganov/whisper.cpp (#1896)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-26 00:33:46 +01:00
Ettore Di Giacinto
c9adc5680c fix(aio): make image-gen for GPU functional, update docs (#1895)
* readme: update quickstart

* aio(gpu): fix dreamshaper

* tests(aio): allow to run tests also against an endpoint

* docs: split content

* tests: less verbosity

---------

Co-authored-by: Dave <dave@gray101.com>
2024-03-25 21:04:32 +00:00
Enrico Ros
08c7b17298 Fix NVIDIA VRAM detection on WSL2 environments (#1894)
* NVIDIA VRAM detection on WSL2 environments

More robust single NVIDIA GPU memory detection, following the
improved NVIDIA WSL2 detection patch yesterday #1891.

Tested and working on WSL2, Linux.

Signed-off-by: Enrico Ros <enrico.ros@gmail.com>

* Update aio/entrypoint.sh

Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>

---------

Signed-off-by: Enrico Ros <enrico.ros@gmail.com>
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-03-25 18:36:18 +01:00
Enrico Ros
5e12382524 NVIDIA GPU detection support for WSL2 environments (#1891)
This change makes the assumption that "Microsoft Corporation Device 008e"
is an NVIDIA CUDA device. If this is not the case, please update the
hardware detection script here.

Signed-off-by: Enrico Ros <enrico.ros@gmail.com>
Co-authored-by: Dave <dave@gray101.com>
2024-03-25 08:32:40 +01:00
Ettore Di Giacinto
6cf99527f8 docs(aio): Add All-in-One images docs (#1887)
* docs(aio): Add AIO images docs

* add image generation link to quickstart

* while reviewing I noticed this one link was missing, so quickly adding it.

Signed-off-by: Dave <dave@gray101.com>
Co-authored-by: Dave <dave@gray101.com>
2024-03-25 02:01:30 +00:00
LocalAI [bot]
3e293f1465 ⬆️ Update ggerganov/llama.cpp (#1889)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-24 21:12:18 +00:00
LocalAI [bot]
0106c58181 ⬆️ Update ggerganov/llama.cpp (#1885)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-24 14:54:01 +01:00
Ettore Di Giacinto
bd25d8049c fix(watchdog): use ShutdownModel instead of StopModel (#1882)
Fixes #1760
2024-03-23 16:19:57 +01:00
Ettore Di Giacinto
49cec7fd61 ci(aio): add latest tag images (#1884)
Tangentially also fixes #1868
2024-03-23 16:08:32 +01:00
Ettore Di Giacinto
d9456f2a23 ci(aio): publish hipblas and Intel GPU images (#1883)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-03-23 15:54:14 +01:00
Ettore Di Giacinto
8495750cb8 Update release.yml
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-03-23 15:22:26 +01:00
Ettore Di Giacinto
1f501cc1ef Update README.md
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-03-23 10:42:14 +01:00
LocalAI [bot]
a922119c41 ⬆️ Update ggerganov/llama.cpp (#1881)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-23 09:23:28 +01:00
Richard Palethorpe
643d85d2cc feat(stores): Vector store backend (#1795)
Add simple vector store backend

Signed-off-by: Richard Palethorpe <io@richiejp.com>
2024-03-22 21:14:04 +01:00
Ettore Di Giacinto
4b1ee0c170 feat(aio): add tests, update model definitions (#1880) 2024-03-22 21:13:11 +01:00
Ettore Di Giacinto
3bec467a91 feat(models): add phi-2-chat, llava-1.6, bakllava, cerbero (#1879) 2024-03-22 21:12:48 +01:00
Ettore Di Giacinto
600152df23 fix(config): pass by config options, respect defaults (#1878)
This bug had the unpleasant effect that it ignored defaults passed by
the CLI. For instance threads could be changed only via model config
file.
2024-03-22 20:55:11 +01:00
LocalAI [bot]
dd84c29a3d ⬆️ Update ggerganov/whisper.cpp (#1875)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-22 09:14:56 +01:00
LocalAI [bot]
07468c8786 ⬆️ Update ggerganov/llama.cpp (#1874)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-22 09:14:42 +01:00
Ettore Di Giacinto
418ba02025 ci: fix typo
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-03-22 09:14:17 +01:00
Ettore Di Giacinto
abc9360dc6 feat(aio): entrypoint, update workflows (#1872) 2024-03-21 22:09:04 +01:00
Sebastian
743095b7d8 docs(mac): improve documentation for mac build (#1873)
* docs(mac): Improve documentation for mac build

- added documentation to build from current master
- added troubleshooting information

Signed-off-by: Sebastian <tauven@gmail.com>

* docs(max): fix typo

Signed-off-by: Sebastian <tauven@gmail.com>

---------

Signed-off-by: Sebastian <tauven@gmail.com>
2024-03-21 22:08:33 +01:00
Ettore Di Giacinto
3cf64d1e7e Update README.md
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-03-21 08:57:41 +01:00
Ettore Di Giacinto
e533dcf506 feat(functions/aio): all-in-one images, function template enhancements (#1862)
* feat(startup): allow to specify models from local files

* feat(aio): add Dockerfile, make targets, aio profiles

* feat(template): add Function and LastMessage

* add hermes2-pro-mistral

* update hermes2 definition

* feat(template): add sprig

* feat(template): expose FunctionCall

* feat(aio): switch llm for text
2024-03-21 01:12:20 +01:00
LocalAI [bot]
eeaf8c7ccd ⬆️ Update ggerganov/whisper.cpp (#1867)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-20 22:26:29 +00:00
LocalAI [bot]
7e34dfdae7 ⬆️ Update ggerganov/llama.cpp (#1866)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-20 22:13:29 +00:00
LocalAI [bot]
e4bf51d5bd ⬆️ Update ggerganov/llama.cpp (#1864)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-20 09:05:53 +01:00
LocalAI [bot]
ead61bf9d5 ⬆️ Update ggerganov/llama.cpp (#1857)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-19 00:03:17 +00:00
LocalAI [bot]
b12a205320 ⬆️ Update docs version mudler/LocalAI (#1856)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-19 00:44:45 +01:00
LocalAI [bot]
621541a92f ⬆️ Update ggerganov/whisper.cpp (#1508)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-19 00:44:23 +01:00
Dave
ed5734ae25 test/fix: OSX Test Repair (#1843)
* test with gguf instead of ggml. Updates testPrompt to match? Adds debugging line to Dockerfile that I've found helpful recently.

* fix testPrompt slightly

* Sad Experiment: Test GH runner without metal?

* break apart CGO_LDFLAGS

* switch runner

* upstream llama.cpp disables Metal on Github CI!

* missed a dir from clean-tests

* CGO_LDFLAGS

* tmate failure + NO_ACCELERATE

* whisper.cpp has a metal fix

* do the exact opposite of the name of this branch, but keep it around for unrelated fixes?

* add back newlines

* add tmate to linux for testing

* update fixtures

* timeout for tmate
2024-03-18 19:19:43 +01:00
Ettore Di Giacinto
a046dcac5e fix(config-watcher): start only if config-directory exists (#1854)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-03-18 19:14:48 +01:00
Ettore Di Giacinto
843f93e1ab fix(config): default to debug=false if not set (#1853) 2024-03-18 18:59:39 +01:00
Ettore Di Giacinto
fa9e330fc6 fix(llama.cpp): fix eos without cache (#1852) 2024-03-18 18:59:24 +01:00
Ettore Di Giacinto
b202bfaaa0 deps(whisper.cpp): update, fix cublas build (#1846)
fix(whisper.cpp): Add stubs and -lcuda
2024-03-18 15:56:53 +01:00
LocalAI [bot]
0eb0ac7dd0 ⬆️ Update ggerganov/llama.cpp (#1848)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-18 08:57:58 +01:00
LocalAI [bot]
d2b83d8357 ⬆️ Update docs version mudler/LocalAI (#1847)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-17 23:08:32 +01:00
Ettore Di Giacinto
88b65f63d0 fix(go-llama): use llama-cpp as default (#1849)
* fix(go-llama): use llama-cpp as default

Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>

* fix(backends): drop obsoleted lines

---------

Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-03-17 23:08:22 +01:00
cryptk
020ce29cd8 fix(make): allow to parallelize jobs (#1845)
* fix: clean up Makefile dependencies to allow for parallel builds

* refactor: remove old unused backend from Makefile

* fix: finish removing legacy backend, update piper

* fix: I broke llama... I fixed llama

* feat: give the tests and builds a few threads

* fix: ensure libraries are replaced before build, add dropreplace target

* Fix image build workflows
2024-03-17 15:39:20 +01:00
Chakib Benziane
801b481beb fixes #1051: handle openai presence and request penalty parameters (#1817)
* fix request debugging, disable marshalling of context fields

Signed-off-by: blob42 <contact@blob42.xyz>

* merge frequency_penalty request parm with config

Signed-off-by: blob42 <contact@blob42.xyz>

* openai: add presence_penalty parameter

Signed-off-by: blob42 <contact@blob42.xyz>

---------

Signed-off-by: blob42 <contact@blob42.xyz>
2024-03-17 09:43:20 +01:00
LocalAI [bot]
8967ed1601 ⬆️ Update ggerganov/llama.cpp (#1840)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-16 11:25:41 +00:00
LocalAI [bot]
5826fb8e6d ⬆️ Update mudler/go-piper (#1844)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-15 23:51:03 +00:00
Ettore Di Giacinto
89351f1a7d feat(embeddings): do not require to be configured (#1842)
Certain engines requires to know during model loading
if the embedding feature has to be enabled, however, it is impractical
to have to set it to ALL the backends that supports embeddings.

There are transformers and sentencentransformers that seamelessly handle
both cases, without having this settings to be explicitly enabled.

The case sussist only for ggml-based models that needs to enable
featuresets during model loading (and thus settings `embedding` is
required), however most of the other engines does not require this.

This change disables the check done at code side, making easier to use
embeddings by not having to specify explicitly `embeddings: true`.

Part of: https://github.com/mudler/LocalAI/issues/1373
2024-03-15 18:14:23 +01:00
Ettore Di Giacinto
ae2e4fc2fe docs(transformers): add docs section about transformers (#1841) 2024-03-15 18:13:30 +01:00
Dave
db199f61da fix: osx build default.metallib (#1837)
fix: osx build default.metallib (#1837)
* port osx fix from refactor pr to slim pr
* manually bump llama.cpp version to unstick CI?
2024-03-15 08:18:58 +00:00
LocalAI [bot]
44adbd2c75 ⬆️ Update go-skynet/go-llama.cpp (#1835)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-14 23:06:42 +00:00
Ettore Di Giacinto
20136ca8b7 feat(tts): add Elevenlabs and OpenAI TTS compatibility layer (#1834)
* feat(elevenlabs): map elevenlabs API support to TTS

This allows elevenlabs Clients to work automatically with LocalAI by
supporting the elevenlabs API.

The elevenlabs server endpoint is implemented such as it is wired to the
TTS endpoints.

Fixes: https://github.com/mudler/LocalAI/issues/1809

* feat(openai/tts): compat layer with openai tts

Fixes: #1276

* fix: adapt tts CLI
2024-03-14 23:08:34 +01:00
Dave
45d520f913 fix: OSX Build Files for llama.cpp (#1836)
bot ate my changes, seperate branch
2024-03-14 23:07:47 +01:00
fakezeta
3882130911 feat: Add Bitsandbytes quantization for transformer backend enhancement #1775 and fix: Transformer backend error on CUDA #1774 (#1823)
* fixes #1775 and #1774

Add BitsAndBytes Quantization and fixes embedding on CUDA devices

* Manage 4bit and 8 bit quantization

Manage different BitsAndBytes options with the quantization: parameter in yaml

* fix compilation errors on non CUDA environment
2024-03-14 23:06:30 +01:00
cryptk
a6b540737f fix: missing OpenCL libraries from docker containers during clblas docker build (#1830) 2024-03-14 08:40:37 +01:00
LocalAI [bot]
f82065703d ⬆️ Update ggerganov/llama.cpp (#1827)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-14 08:39:39 +01:00
cryptk
b423af001d fix: the correct BUILD_TYPE for OpenCL is clblas (with no t) (#1828) 2024-03-14 08:39:21 +01:00
Ettore Di Giacinto
b9e77d394b feat(model-help): display help text in markdown (#1825)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-03-13 21:50:46 +01:00
Ettore Di Giacinto
57222497ec fix(docker-compose): update docker compose file (#1824)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-03-13 17:57:45 +01:00
LocalAI [bot]
5c5f07c1e7 ⬆️ Update ggerganov/llama.cpp (#1821)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-13 10:05:46 +01:00
Ettore Di Giacinto
f895d06605 fix(config): set better defaults for inferencing (#1822)
* fix(defaults): set better defaults for inferencing

This changeset aim to have better defaults and to properly detect when
no inference settings are provided with the model.

If not specified, we defaults to mirostat sampling, and offload all the
GPU layers (if a GPU is detected).

Related to https://github.com/mudler/LocalAI/issues/1373 and https://github.com/mudler/LocalAI/issues/1723

* Adapt tests

* Also pre-initialize default seed
2024-03-13 10:05:30 +01:00
Ettore Di Giacinto
bc8f648a91 fix(doc/examples): set defaults to mirostat (#1820)
The default sampler on some models don't return enough candidates which
leads to a false sense of randomness. Tracing back the code it looks
that with the temperature sampler there might not be enough
candidates to pick from, and since the seed and "randomness" take effect
while picking a good candidate this yields to the same results over and
over.

Fixes https://github.com/mudler/LocalAI/issues/1723 by updating the
examples and documentation to use mirostat instead.
2024-03-11 19:49:03 +01:00
LocalAI [bot]
8e57f4df31 ⬆️ Update ggerganov/llama.cpp (#1818)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-11 00:02:37 +01:00
LocalAI [bot]
a08cc5adbb ⬆️ Update ggerganov/llama.cpp (#1816)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-10 09:32:09 +01:00
LocalAI [bot]
595a73fce4 ⬆️ Update ggerganov/llama.cpp (#1813)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-09 09:27:06 +01:00
LocalAI [bot]
dc919e08e8 ⬆️ Update ggerganov/llama.cpp (#1811)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-08 08:21:25 +01:00
Ettore Di Giacinto
5d1018495f feat(intel): add diffusers/transformers support (#1746)
* feat(intel): add diffusers support

* try to consume upstream container image

* Debug

* Manually install deps

* Map transformers/hf cache dir to modelpath if not specified

* fix(compel): update initialization, pass by all gRPC options

* fix: add dependencies, implement transformers for xpu

* base it from the oneapi image

* Add pillow

* set threads if specified when launching the API

* Skip conda install if intel

* defaults to non-intel

* ci: add to pipelines

* prepare compel only if enabled

* Skip conda install if intel

* fix cleanup

* Disable compel by default

* Install torch 2.1.0 with Intel

* Skip conda on some setups

* Detect python

* Quiet output

* Do not override system python with conda

* Prefer python3

* Fixups

* exllama2: do not install without conda (overrides pytorch version)

* exllama/exllama2: do not install if not using cuda

* Add missing dataset dependency

* Small fixups, symlink to python, add requirements

* Add neural_speed to the deps

* correctly handle model offloading

* fix: device_map == xpu

* go back at calling python, fixed at dockerfile level

* Exllama2 restricted to only nvidia gpus

* Tokenizer to xpu
2024-03-07 14:37:45 +01:00
LocalAI [bot]
ad6fd7a991 ⬆️ Update ggerganov/llama.cpp (#1805)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-06 23:28:31 +01:00
LocalAI [bot]
e022b5959e ⬆️ Update mudler/go-stable-diffusion (#1802)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-05 23:39:57 +00:00
LocalAI [bot]
db7f4955a1 ⬆️ Update ggerganov/llama.cpp (#1801)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-05 21:50:27 +00:00
Dave
5c69dd155f feat(autogpt/transformers): consume trust_remote_code (#1799)
trusting remote code by default is a danger to our users
2024-03-05 19:47:15 +01:00
TwinFin
504f2e8bf4 Update Backend Dependancies (#1797)
* Update transformers.yml

Signed-off-by: TwinFin <57421631+TwinFinz@users.noreply.github.com>

* Update transformers-rocm.yml

Signed-off-by: TwinFin <57421631+TwinFinz@users.noreply.github.com>

* Update transformers-nvidia.yml

Signed-off-by: TwinFin <57421631+TwinFinz@users.noreply.github.com>

---------

Signed-off-by: TwinFin <57421631+TwinFinz@users.noreply.github.com>
2024-03-05 10:10:00 +00:00
Luna Midori
e586dc2924 Edit links in readme and integrations page (#1796)
* Update integrations.md

Signed-off-by: Luna Midori <118759930+lunamidori5@users.noreply.github.com>

* Update README.md

Signed-off-by: Luna Midori <118759930+lunamidori5@users.noreply.github.com>

* Update README.md

Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
Signed-off-by: Luna Midori <118759930+lunamidori5@users.noreply.github.com>

* Update README.md

Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
Signed-off-by: Luna Midori <118759930+lunamidori5@users.noreply.github.com>

---------

Signed-off-by: Luna Midori <118759930+lunamidori5@users.noreply.github.com>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-03-05 10:14:30 +01:00
Ettore Di Giacinto
333f918005 Update integrations.md
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-03-05 09:45:54 +01:00
LocalAI [bot]
c8e29033c2 ⬆️ Update ggerganov/llama.cpp (#1794)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-05 08:59:09 +01:00
LocalAI [bot]
d0bd961bde ⬆️ Update ggerganov/llama.cpp (#1791)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-04 09:44:21 +01:00
Ettore Di Giacinto
006511ee25 Revert "feat(assistant): Initial implementation of assistants api" (#1790)
Revert "feat(assistant): Initial implementation of assistants api (#1761)"

This reverts commit 4ab72146cd.
2024-03-03 10:31:06 +01:00
Steven Christou
4ab72146cd feat(assistant): Initial implementation of assistants api (#1761)
Initial implementation of assistants api
2024-03-03 08:50:43 +01:00
LocalAI [bot]
b60a3fc879 ⬆️ Update ggerganov/llama.cpp (#1789)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-03 08:49:23 +01:00
Ettore Di Giacinto
a0eeb74957 Update hot topics/roadmap
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-03-02 09:35:40 +01:00
LocalAI [bot]
daa0b8741c ⬆️ Update ggerganov/llama.cpp (#1785)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-01 22:38:24 +00:00
Ludovic Leroux
939411300a Bump vLLM version + more options when loading models in vLLM (#1782)
* Bump vLLM version to 0.3.2

* Add vLLM model loading options

* Remove transformers-exllama

* Fix install exllama
2024-03-01 22:48:53 +01:00
Dave
1c312685aa refactor: move remaining api packages to core (#1731)
* core 1

* api/openai/files fix

* core 2 - core/config

* move over core api.go and tests to the start of core/http

* move over localai specific endpoints to core/http, begin the service/endpoint split there

* refactor big chunk on the plane

* refactor chunk 2 on plane, next step: port and modify changes to request.go

* easy fixes for request.go, major changes not done yet

* lintfix

* json tag lintfix?

* gitignore and .keep files

* strange fix attempt: rename the config dir?
2024-03-01 16:19:53 +01:00
LocalAI [bot]
316de82f51 ⬆️ Update ggerganov/llama.cpp (#1779)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-02-29 22:33:30 +00:00
Ettore Di Giacinto
9068bc5271 Create SECURITY.md
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-02-29 19:53:04 +01:00
Oussama
31a4c9c9d3 Fix Command Injection Vulnerability (#1778)
* Added fix for command injection

* changed function name from sh to runCommand
2024-02-29 18:32:29 +00:00
Ettore Di Giacinto
c1966af2cf ci: reduce stress on self-hosted runners (#1776)
Split jobs by self-hosted and free public runner provided by Github

Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-02-29 11:40:08 +01:00
LocalAI [bot]
c665898652 ⬆️ Update donomii/go-rwkv.cpp (#1771)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-02-28 23:50:27 +00:00
LocalAI [bot]
f651a660aa ⬆️ Update ggerganov/llama.cpp (#1772)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-02-28 23:02:30 +01:00
Ettore Di Giacinto
ba672b51da Update README.md
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-02-28 16:03:38 +01:00
Ettore Di Giacinto
be498c5dd9 Update openai-functions.md
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-02-28 15:58:31 +01:00
Ettore Di Giacinto
6e95beccb9 Update overview.md
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-02-28 15:24:08 +01:00
Ettore Di Giacinto
c8be839481 Update openai-functions.md
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-02-27 23:24:46 +01:00
LocalAI [bot]
c7e08813a5 ⬆️ Update ggerganov/llama.cpp (#1767)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-02-27 23:12:51 +01:00
LocalAI [bot]
d21a6b33ab ⬆️ Update ggerganov/llama.cpp (#1756)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-02-27 18:07:51 +00:00
Joshua Waring
9112cf153e Update integrations.md (#1765)
Added Jetbrains compatible plugin for LocalAI

Signed-off-by: Joshua Waring <Joshhua5@users.noreply.github.com>
2024-02-27 17:35:59 +01:00
Ettore Di Giacinto
3868ac8402 Update README.md
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-02-27 15:44:15 +01:00
Ettore Di Giacinto
3f09010227 Update README.md
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-02-27 15:43:15 +01:00
Ettore Di Giacinto
d6cf82aba3 fix(tests): re-enable tests after code move (#1764)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-02-27 15:04:19 +01:00
Ettore Di Giacinto
dfe54639b1 Update README.md
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-02-27 10:37:56 +01:00
Ettore Di Giacinto
bc5f5aa538 deps(llama.cpp): update (#1759)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-02-26 13:18:44 +01:00
Ettore Di Giacinto
05818e0425 fix(functions): handle correctly when there are no results (#1758) 2024-02-26 08:38:23 +01:00
Sertaç Özercan
7f72a61104 ci: add stablediffusion to release (#1757)
Signed-off-by: Sertac Ozercan <sozercan@gmail.com>
2024-02-25 23:06:18 +00:00
LocalAI [bot]
8e45d47740 ⬆️ Update ggerganov/llama.cpp (#1753)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-02-25 10:03:19 +01:00
LocalAI [bot]
71771d1e9b ⬆️ Update docs version mudler/LocalAI (#1752)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-02-25 10:02:52 +01:00
Ettore Di Giacinto
aa098e4d0b fix(sse): do not omit empty finish_reason (#1745)
Fixes https://github.com/mudler/LocalAI/issues/1744
2024-02-24 11:51:59 +01:00
Ludovic Leroux
0135e1e3b9 fix: vllm - use AsyncLLMEngine to allow true streaming mode (#1749)
* fix: use vllm AsyncLLMEngine to bring true stream

Current vLLM implementation uses the LLMEngine, which was designed for offline batch inference, which results in the streaming mode outputing all blobs at once at the end of the inference.

This PR reworks the gRPC server to use asyncio and gRPC.aio, in combination with vLLM's AsyncLLMEngine to bring true stream mode.

This PR also passes more parameters to vLLM during inference (presence_penalty, frequency_penalty, stop, ignore_eos, seed, ...).

* Remove unused import
2024-02-24 11:48:45 +01:00
206 changed files with 13509 additions and 3408 deletions

View File

@@ -3,3 +3,4 @@ models
examples/chatbot-ui/models
examples/rwkv/models
examples/**/models
Dockerfile*

31
.editorconfig Normal file
View File

@@ -0,0 +1,31 @@
root = true
[*]
indent_style = space
indent_size = 2
end_of_line = lf
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
[*.go]
indent_style = tab
[Makefile]
indent_style = tab
[*.proto]
indent_size = 2
[*.py]
indent_size = 4
[*.js]
indent_size = 2
[*.yaml]
indent_size = 2
[*.md]
trim_trailing_whitespace = false

2
.env
View File

@@ -18,7 +18,7 @@
## Default path for models
#
MODELS_PATH=/models
# MODELS_PATH=/models
## Enable debug mode
# DEBUG=true

19
.github/labeler.yml vendored Normal file
View File

@@ -0,0 +1,19 @@
enhancements:
- head-branch: ['^feature', 'feature']
kind/documentation:
- any:
- changed-files:
- any-glob-to-any-file: 'docs/*'
- changed-files:
- any-glob-to-any-file: '*.md'
examples:
- any:
- changed-files:
- any-glob-to-any-file: 'examples/*'
ci:
- any:
- changed-files:
- any-glob-to-any-file: '.github/*'

12
.github/release.yml vendored
View File

@@ -12,13 +12,23 @@ changelog:
- title: "Bug fixes :bug:"
labels:
- bug
- regression
- title: Exciting New Features 🎉
labels:
- Semver-Minor
- enhancement
- ux
- roadmap
- title: 🧠 Models
labels:
- area/ai-model
- title: 📖 Documentation and examples
labels:
- kind/documentation
- examples
- title: 👒 Dependencies
labels:
- dependencies
- title: Other Changes
labels:
- "*"
- "*"

View File

@@ -22,6 +22,7 @@ jobs:
platforms: ${{ matrix.platforms }}
runs-on: ${{ matrix.runs-on }}
base-image: ${{ matrix.base-image }}
makeflags: ${{ matrix.makeflags }}
secrets:
dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
@@ -41,6 +42,7 @@ jobs:
image-type: 'extras'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "1"
@@ -51,6 +53,7 @@ jobs:
image-type: 'extras'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'hipblas'
platforms: 'linux/amd64'
tag-latest: 'false'
@@ -59,6 +62,16 @@ jobs:
image-type: 'extras'
base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'sycl_f16'
platforms: 'linux/amd64'
tag-latest: 'false'
base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
tag-suffix: 'sycl-f16-ffmpeg'
ffmpeg: 'true'
image-type: 'extras'
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
core-image-build:
uses: ./.github/workflows/image_build.yml
with:
@@ -72,6 +85,7 @@ jobs:
platforms: ${{ matrix.platforms }}
runs-on: ${{ matrix.runs-on }}
base-image: ${{ matrix.base-image }}
makeflags: ${{ matrix.makeflags }}
secrets:
dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
@@ -88,6 +102,7 @@ jobs:
image-type: 'core'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
makeflags: "--jobs=5 --output-sync=target"
- build-type: 'sycl_f16'
platforms: 'linux/amd64'
tag-latest: 'false'
@@ -96,6 +111,7 @@ jobs:
ffmpeg: 'true'
image-type: 'core'
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "1"
@@ -106,3 +122,4 @@ jobs:
image-type: 'core'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
makeflags: "--jobs=5 --output-sync=target"

View File

@@ -13,7 +13,7 @@ concurrency:
cancel-in-progress: true
jobs:
extras-image-build:
self-hosted-jobs:
uses: ./.github/workflows/image_build.yml
with:
tag-latest: ${{ matrix.tag-latest }}
@@ -26,6 +26,10 @@ jobs:
platforms: ${{ matrix.platforms }}
runs-on: ${{ matrix.runs-on }}
base-image: ${{ matrix.base-image }}
aio: ${{ matrix.aio }}
makeflags: ${{ matrix.makeflags }}
latest-image: ${{ matrix.latest-image }}
latest-image-aio: ${{ matrix.latest-image-aio }}
secrets:
dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
@@ -37,6 +41,7 @@ jobs:
max-parallel: ${{ github.event_name != 'pull_request' && 2 || 4 }}
matrix:
include:
# Extra images
- build-type: ''
#platforms: 'linux/amd64,linux/arm64'
platforms: 'linux/amd64'
@@ -46,14 +51,16 @@ jobs:
image-type: 'extras'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
makeflags: "--jobs=3 --output-sync=target"
- build-type: ''
platforms: 'linux/amd64'
tag-latest: 'false'
tag-latest: 'auto'
tag-suffix: '-ffmpeg'
ffmpeg: 'true'
image-type: 'extras'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "11"
cuda-minor-version: "7"
@@ -64,6 +71,7 @@ jobs:
image-type: 'extras'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "1"
@@ -74,26 +82,35 @@ jobs:
image-type: 'extras'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "11"
cuda-minor-version: "7"
platforms: 'linux/amd64'
tag-latest: 'false'
tag-latest: 'auto'
tag-suffix: '-cublas-cuda11-ffmpeg'
ffmpeg: 'true'
image-type: 'extras'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
aio: "-aio-gpu-nvidia-cuda-11"
latest-image: 'latest-gpu-nvidia-cuda-11'
latest-image-aio: 'latest-aio-gpu-nvidia-cuda-11'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "1"
platforms: 'linux/amd64'
tag-latest: 'false'
tag-latest: 'auto'
tag-suffix: '-cublas-cuda12-ffmpeg'
ffmpeg: 'true'
image-type: 'extras'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
aio: "-aio-gpu-nvidia-cuda-12"
latest-image: 'latest-gpu-nvidia-cuda-12'
latest-image-aio: 'latest-aio-gpu-nvidia-cuda-12'
makeflags: "--jobs=3 --output-sync=target"
- build-type: ''
#platforms: 'linux/amd64,linux/arm64'
platforms: 'linux/amd64'
@@ -103,14 +120,19 @@ jobs:
image-type: 'extras'
base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'hipblas'
platforms: 'linux/amd64'
tag-latest: 'false'
tag-latest: 'auto'
tag-suffix: '-hipblas-ffmpeg'
ffmpeg: 'true'
image-type: 'extras'
aio: "-aio-gpu-hipblas"
base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
latest-image: 'latest-gpu-hipblas'
latest-image-aio: 'latest-aio-gpu-hipblas'
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'hipblas'
platforms: 'linux/amd64'
tag-latest: 'false'
@@ -119,6 +141,87 @@ jobs:
image-type: 'extras'
base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'sycl_f16'
platforms: 'linux/amd64'
tag-latest: 'auto'
base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
tag-suffix: '-sycl-f16-ffmpeg'
ffmpeg: 'true'
image-type: 'extras'
runs-on: 'arc-runner-set'
aio: "-aio-gpu-intel-f16"
latest-image: 'latest-gpu-intel-f16'
latest-image-aio: 'latest-aio-gpu-intel-f16'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'sycl_f32'
platforms: 'linux/amd64'
tag-latest: 'auto'
base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
tag-suffix: '-sycl-f32-ffmpeg'
ffmpeg: 'true'
image-type: 'extras'
runs-on: 'arc-runner-set'
aio: "-aio-gpu-intel-f32"
latest-image: 'latest-gpu-intel-f32'
latest-image-aio: 'latest-aio-gpu-intel-f32'
makeflags: "--jobs=3 --output-sync=target"
# Core images
- build-type: 'sycl_f16'
platforms: 'linux/amd64'
tag-latest: 'false'
base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
tag-suffix: '-sycl-f16-core'
ffmpeg: 'false'
image-type: 'core'
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'sycl_f32'
platforms: 'linux/amd64'
tag-latest: 'false'
base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
tag-suffix: '-sycl-f32-core'
ffmpeg: 'false'
image-type: 'core'
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'sycl_f16'
platforms: 'linux/amd64'
tag-latest: 'false'
base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
tag-suffix: '-sycl-f16-ffmpeg-core'
ffmpeg: 'true'
image-type: 'core'
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'sycl_f32'
platforms: 'linux/amd64'
tag-latest: 'false'
base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
tag-suffix: '-sycl-f32-ffmpeg-core'
ffmpeg: 'true'
image-type: 'core'
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'hipblas'
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-hipblas-ffmpeg-core'
ffmpeg: 'true'
image-type: 'core'
base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'hipblas'
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-hipblas-core'
ffmpeg: 'false'
image-type: 'core'
base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
core-image-build:
uses: ./.github/workflows/image_build.yml
with:
@@ -131,7 +234,11 @@ jobs:
cuda-minor-version: ${{ matrix.cuda-minor-version }}
platforms: ${{ matrix.platforms }}
runs-on: ${{ matrix.runs-on }}
aio: ${{ matrix.aio }}
base-image: ${{ matrix.base-image }}
makeflags: ${{ matrix.makeflags }}
latest-image: ${{ matrix.latest-image }}
latest-image-aio: ${{ matrix.latest-image-aio }}
secrets:
dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
@@ -140,62 +247,18 @@ jobs:
strategy:
matrix:
include:
- build-type: 'hipblas'
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-hipblas-ffmpeg-core'
ffmpeg: 'true'
image-type: 'core'
base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
runs-on: 'arc-runner-set'
- build-type: 'hipblas'
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-hipblas-core'
ffmpeg: 'false'
image-type: 'core'
base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
runs-on: 'arc-runner-set'
- build-type: ''
platforms: 'linux/amd64'
tag-latest: 'false'
tag-latest: 'auto'
tag-suffix: '-ffmpeg-core'
ffmpeg: 'true'
image-type: 'core'
base-image: "ubuntu:22.04"
runs-on: 'ubuntu-latest'
- build-type: 'sycl_f16'
platforms: 'linux/amd64'
tag-latest: 'false'
base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
tag-suffix: '-sycl-f16-core'
ffmpeg: 'false'
image-type: 'core'
runs-on: 'arc-runner-set'
- build-type: 'sycl_f32'
platforms: 'linux/amd64'
tag-latest: 'false'
base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
tag-suffix: '-sycl-f32-core'
ffmpeg: 'false'
image-type: 'core'
runs-on: 'arc-runner-set'
- build-type: 'sycl_f16'
platforms: 'linux/amd64'
tag-latest: 'false'
base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
tag-suffix: '-sycl-f16-ffmpeg-core'
ffmpeg: 'true'
image-type: 'core'
runs-on: 'arc-runner-set'
- build-type: 'sycl_f32'
platforms: 'linux/amd64'
tag-latest: 'false'
base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
tag-suffix: '-sycl-f32-ffmpeg-core'
ffmpeg: 'true'
image-type: 'core'
runs-on: 'arc-runner-set'
aio: "-aio-cpu"
latest-image: 'latest-cpu'
latest-image-aio: 'latest-aio-cpu'
makeflags: "--jobs=5 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "11"
cuda-minor-version: "7"
@@ -206,6 +269,7 @@ jobs:
image-type: 'core'
base-image: "ubuntu:22.04"
runs-on: 'ubuntu-latest'
makeflags: "--jobs=5 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "1"
@@ -216,6 +280,7 @@ jobs:
image-type: 'core'
base-image: "ubuntu:22.04"
runs-on: 'ubuntu-latest'
makeflags: "--jobs=5 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "11"
cuda-minor-version: "7"
@@ -226,6 +291,7 @@ jobs:
image-type: 'core'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
makeflags: "--jobs=5 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "1"
@@ -236,3 +302,4 @@ jobs:
image-type: 'core'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
makeflags: "--jobs=5 --output-sync=target"

View File

@@ -29,6 +29,14 @@ on:
description: 'Tag latest'
default: ''
type: string
latest-image:
description: 'Tag latest'
default: ''
type: string
latest-image-aio:
description: 'Tag latest'
default: ''
type: string
tag-suffix:
description: 'Tag suffix'
default: ''
@@ -46,6 +54,16 @@ on:
required: true
default: ''
type: string
makeflags:
description: 'Make Flags'
required: false
default: '--jobs=3 --output-sync=target'
type: string
aio:
description: 'AIO Image Name'
required: false
default: ''
type: string
secrets:
dockerUsername:
required: true
@@ -69,6 +87,7 @@ jobs:
&& sudo apt-get install -y git
- name: Checkout
uses: actions/checkout@v4
- name: Release space from worker
if: inputs.runs-on == 'ubuntu-latest'
run: |
@@ -110,6 +129,7 @@ jobs:
sudo rm -rf "/usr/local/share/boost" || true
sudo rm -rf "$AGENT_TOOLSDIRECTORY" || true
df -h
- name: Docker meta
id: meta
uses: docker/metadata-action@v5
@@ -125,6 +145,34 @@ jobs:
latest=${{ inputs.tag-latest }}
suffix=${{ inputs.tag-suffix }}
- name: Docker meta AIO (quay.io)
if: inputs.aio != ''
id: meta_aio
uses: docker/metadata-action@v5
with:
images: |
quay.io/go-skynet/local-ai
tags: |
type=ref,event=branch
type=semver,pattern={{raw}}
flavor: |
latest=${{ inputs.tag-latest }}
suffix=${{ inputs.aio }}
- name: Docker meta AIO (dockerhub)
if: inputs.aio != ''
id: meta_aio_dockerhub
uses: docker/metadata-action@v5
with:
images: |
localai/localai
tags: |
type=ref,event=branch
type=semver,pattern={{raw}}
flavor: |
latest=${{ inputs.tag-latest }}
suffix=${{ inputs.aio }}
- name: Set up QEMU
uses: docker/setup-qemu-action@master
with:
@@ -149,6 +197,25 @@ jobs:
username: ${{ secrets.quayUsername }}
password: ${{ secrets.quayPassword }}
- name: Cache GRPC
uses: docker/build-push-action@v5
with:
builder: ${{ steps.buildx.outputs.name }}
build-args: |
IMAGE_TYPE=${{ inputs.image-type }}
BASE_IMAGE=${{ inputs.base-image }}
MAKEFLAGS=${{ inputs.makeflags }}
GRPC_VERSION=v1.58.0
context: .
file: ./Dockerfile
cache-from: type=gha
cache-to: type=gha,ignore-error=true
target: grpc
platforms: ${{ inputs.platforms }}
push: false
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
- name: Build and push
uses: docker/build-push-action@v5
with:
@@ -160,12 +227,75 @@ jobs:
FFMPEG=${{ inputs.ffmpeg }}
IMAGE_TYPE=${{ inputs.image-type }}
BASE_IMAGE=${{ inputs.base-image }}
MAKEFLAGS=${{ inputs.makeflags }}
context: .
file: ./Dockerfile
cache-from: type=gha
platforms: ${{ inputs.platforms }}
push: ${{ github.event_name != 'pull_request' }}
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
- name: Inspect image
if: github.event_name != 'pull_request'
run: |
docker pull localai/localai:${{ steps.meta.outputs.version }}
docker image inspect localai/localai:${{ steps.meta.outputs.version }}
docker pull quay.io/go-skynet/local-ai:${{ steps.meta.outputs.version }}
docker image inspect quay.io/go-skynet/local-ai:${{ steps.meta.outputs.version }}
- name: Build and push AIO image
if: inputs.aio != ''
uses: docker/build-push-action@v5
with:
builder: ${{ steps.buildx.outputs.name }}
build-args: |
BASE_IMAGE=quay.io/go-skynet/local-ai:${{ steps.meta.outputs.version }}
MAKEFLAGS=${{ inputs.makeflags }}
context: .
file: ./Dockerfile.aio
platforms: ${{ inputs.platforms }}
push: ${{ github.event_name != 'pull_request' }}
tags: ${{ steps.meta_aio.outputs.tags }}
labels: ${{ steps.meta_aio.outputs.labels }}
- name: Build and push AIO image (dockerhub)
if: inputs.aio != ''
uses: docker/build-push-action@v5
with:
builder: ${{ steps.buildx.outputs.name }}
build-args: |
BASE_IMAGE=localai/localai:${{ steps.meta.outputs.version }}
MAKEFLAGS=${{ inputs.makeflags }}
context: .
file: ./Dockerfile.aio
platforms: ${{ inputs.platforms }}
push: ${{ github.event_name != 'pull_request' }}
tags: ${{ steps.meta_aio_dockerhub.outputs.tags }}
labels: ${{ steps.meta_aio_dockerhub.outputs.labels }}
- name: Latest tag
# run this on branches, when it is a tag and there is a latest-image defined
if: github.event_name != 'pull_request' && inputs.latest-image != '' && github.ref_type == 'tag'
run: |
docker pull localai/localai:${{ steps.meta.outputs.version }}
docker tag localai/localai:${{ steps.meta.outputs.version }} localai/localai:${{ inputs.latest-image }}
docker pull quay.io/go-skynet/local-ai:${{ steps.meta.outputs.version }}
docker tag quay.io/go-skynet/local-ai:${{ steps.meta.outputs.version }} quay.io/go-skynet/local-ai:${{ inputs.latest-image }}
- name: Latest AIO tag
# run this on branches, when it is a tag and there is a latest-image defined
if: github.event_name != 'pull_request' && inputs.latest-image-aio != '' && github.ref_type == 'tag'
run: |
docker pull localai/localai:${{ steps.meta_aio_dockerhub.outputs.version }}
docker tag localai/localai:${{ steps.meta_aio_dockerhub.outputs.version }} localai/localai:${{ inputs.latest-image-aio }}
docker pull quay.io/go-skynet/local-ai:${{ steps.meta_aio.outputs.version }}
docker tag quay.io/go-skynet/local-ai:${{ steps.meta_aio.outputs.version }} quay.io/go-skynet/local-ai:${{ inputs.latest-image-aio }}
- name: job summary
run: |
echo "Built image: ${{ steps.meta.outputs.labels }}" >> $GITHUB_STEP_SUMMARY
- name: job summary(AIO)
if: inputs.aio != ''
run: |
echo "Built image: ${{ steps.meta_aio.outputs.labels }}" >> $GITHUB_STEP_SUMMARY

12
.github/workflows/labeler.yml vendored Normal file
View File

@@ -0,0 +1,12 @@
name: "Pull Request Labeler"
on:
- pull_request_target
jobs:
labeler:
permissions:
contents: read
pull-requests: write
runs-on: ubuntu-latest
steps:
- uses: actions/labeler@v5

View File

@@ -2,6 +2,9 @@ name: Build and Release
on: push
env:
GRPC_VERSION: v1.58.0
permissions:
contents: write
@@ -32,7 +35,8 @@ jobs:
submodules: true
- uses: actions/setup-go@v4
with:
go-version: '>=1.21.0'
go-version: '1.21.x'
cache: false
- name: Dependencies
run: |
sudo apt-get update
@@ -54,17 +58,17 @@ jobs:
uses: actions/cache@v3
with:
path: grpc
key: ${{ runner.os }}-grpc
key: ${{ runner.os }}-grpc-${{ env.GRPC_VERSION }}
- name: Build grpc
if: steps.cache-grpc.outputs.cache-hit != 'true'
run: |
git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
git clone --recurse-submodules -b ${{ env.GRPC_VERSION }} --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
-DgRPC_BUILD_TESTS=OFF \
../.. && sudo make -j12
../.. && sudo make --jobs 5 --output-sync=target
- name: Install gRPC
run: |
cd grpc && cd cmake/build && sudo make -j12 install
cd grpc && cd cmake/build && sudo make --jobs 5 --output-sync=target install
- name: Build
id: build
env:
@@ -89,6 +93,35 @@ jobs:
files: |
release/*
build-stablediffusion:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- uses: actions/setup-go@v4
with:
go-version: '1.21.x'
cache: false
- name: Dependencies
run: |
sudo apt-get install -y --no-install-recommends libopencv-dev
- name: Build stablediffusion
run: |
make backend-assets/grpc/stablediffusion
mkdir -p release && cp backend-assets/grpc/stablediffusion release
- uses: actions/upload-artifact@v3
with:
name: stablediffusion
path: release/
- name: Release
uses: softprops/action-gh-release@v1
if: startsWith(github.ref, 'refs/tags/')
with:
files: |
release/*
build-macOS:
strategy:
matrix:
@@ -107,7 +140,8 @@ jobs:
submodules: true
- uses: actions/setup-go@v4
with:
go-version: '>=1.21.0'
go-version: '1.21.x'
cache: false
- name: Dependencies
run: |
brew install protobuf grpc

27
.github/workflows/secscan.yaml vendored Normal file
View File

@@ -0,0 +1,27 @@
name: "Security Scan"
# Run workflow each time code is pushed to your repository and on a schedule.
# The scheduled workflow runs every at 00:00 on Sunday UTC time.
on:
push:
schedule:
- cron: '0 0 * * 0'
jobs:
tests:
runs-on: ubuntu-latest
env:
GO111MODULE: on
steps:
- name: Checkout Source
uses: actions/checkout@v3
- name: Run Gosec Security Scanner
uses: securego/gosec@master
with:
# we let the report trigger content trigger a failure using the GitHub Security features.
args: '-no-fail -fmt sarif -out results.sarif ./...'
- name: Upload SARIF file
uses: github/codeql-action/upload-sarif@v2
with:
# Path to SARIF file relative to the root of the repository
sarif_file: results.sarif

View File

@@ -33,15 +33,15 @@ jobs:
sudo apt-get update && \
sudo apt-get install -y conda
sudo apt-get install -y ca-certificates cmake curl patch
sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
sudo apt-get install -y libopencv-dev
sudo rm -rfv /usr/bin/conda || true
- name: Test transformers
run: |
export PATH=$PATH:/opt/conda/bin
make -C backend/python/transformers
make -C backend/python/transformers test
make --jobs=5 --output-sync=target -C backend/python/transformers
make --jobs=5 --output-sync=target -C backend/python/transformers test
tests-sentencetransformers:
runs-on: ubuntu-latest
@@ -62,15 +62,15 @@ jobs:
sudo apt-get update && \
sudo apt-get install -y conda
sudo apt-get install -y ca-certificates cmake curl patch
sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
sudo apt-get install -y libopencv-dev
sudo rm -rfv /usr/bin/conda || true
- name: Test sentencetransformers
run: |
export PATH=$PATH:/opt/conda/bin
make -C backend/python/sentencetransformers
make -C backend/python/sentencetransformers test
make --jobs=5 --output-sync=target -C backend/python/sentencetransformers
make --jobs=5 --output-sync=target -C backend/python/sentencetransformers test
tests-diffusers:
runs-on: ubuntu-latest
@@ -91,15 +91,15 @@ jobs:
sudo apt-get update && \
sudo apt-get install -y conda
sudo apt-get install -y ca-certificates cmake curl patch
sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
sudo apt-get install -y libopencv-dev
sudo rm -rfv /usr/bin/conda || true
- name: Test diffusers
run: |
export PATH=$PATH:/opt/conda/bin
make -C backend/python/diffusers
make -C backend/python/diffusers test
make --jobs=5 --output-sync=target -C backend/python/diffusers
make --jobs=5 --output-sync=target -C backend/python/diffusers test
tests-transformers-musicgen:
@@ -121,46 +121,46 @@ jobs:
sudo apt-get update && \
sudo apt-get install -y conda
sudo apt-get install -y ca-certificates cmake curl patch
sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
sudo apt-get install -y libopencv-dev
sudo rm -rfv /usr/bin/conda || true
- name: Test transformers-musicgen
run: |
export PATH=$PATH:/opt/conda/bin
make -C backend/python/transformers-musicgen
make -C backend/python/transformers-musicgen test
make --jobs=5 --output-sync=target -C backend/python/transformers-musicgen
make --jobs=5 --output-sync=target -C backend/python/transformers-musicgen test
tests-petals:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential ffmpeg
curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
sudo apt-get update && \
sudo apt-get install -y conda
sudo apt-get install -y ca-certificates cmake curl patch
sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
# tests-petals:
# runs-on: ubuntu-latest
# steps:
# - name: Clone
# uses: actions/checkout@v4
# with:
# submodules: true
# - name: Dependencies
# run: |
# sudo apt-get update
# sudo apt-get install build-essential ffmpeg
# curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
# sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
# gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
# sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
# sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
# sudo apt-get update && \
# sudo apt-get install -y conda
# sudo apt-get install -y ca-certificates cmake curl patch
# sudo apt-get install -y libopencv-dev
sudo rm -rfv /usr/bin/conda || true
# sudo rm -rfv /usr/bin/conda || true
- name: Test petals
run: |
export PATH=$PATH:/opt/conda/bin
make -C backend/python/petals
make -C backend/python/petals test
# - name: Test petals
# run: |
# export PATH=$PATH:/opt/conda/bin
# make --jobs=5 --output-sync=target -C backend/python/petals
# make --jobs=5 --output-sync=target -C backend/python/petals test
@@ -223,15 +223,15 @@ jobs:
# sudo apt-get update && \
# sudo apt-get install -y conda
# sudo apt-get install -y ca-certificates cmake curl patch
# sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
# sudo apt-get install -y libopencv-dev
# sudo rm -rfv /usr/bin/conda || true
# - name: Test bark
# run: |
# export PATH=$PATH:/opt/conda/bin
# make -C backend/python/bark
# make -C backend/python/bark test
# make --jobs=5 --output-sync=target -C backend/python/bark
# make --jobs=5 --output-sync=target -C backend/python/bark test
# Below tests needs GPU. Commented out for now
@@ -255,13 +255,13 @@ jobs:
# sudo apt-get update && \
# sudo apt-get install -y conda
# sudo apt-get install -y ca-certificates cmake curl patch
# sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
# sudo apt-get install -y libopencv-dev
# sudo rm -rfv /usr/bin/conda || true
# - name: Test vllm
# run: |
# export PATH=$PATH:/opt/conda/bin
# make -C backend/python/vllm
# make -C backend/python/vllm test
# make --jobs=5 --output-sync=target -C backend/python/vllm
# make --jobs=5 --output-sync=target -C backend/python/vllm test
tests-vallex:
runs-on: ubuntu-latest
steps:
@@ -281,13 +281,13 @@ jobs:
sudo apt-get update && \
sudo apt-get install -y conda
sudo apt-get install -y ca-certificates cmake curl patch
sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
sudo apt-get install -y libopencv-dev
sudo rm -rfv /usr/bin/conda || true
- name: Test vall-e-x
run: |
export PATH=$PATH:/opt/conda/bin
make -C backend/python/vall-e-x
make -C backend/python/vall-e-x test
make --jobs=5 --output-sync=target -C backend/python/vall-e-x
make --jobs=5 --output-sync=target -C backend/python/vall-e-x test
tests-coqui:
runs-on: ubuntu-latest
@@ -313,5 +313,5 @@ jobs:
- name: Test coqui
run: |
export PATH=$PATH:/opt/conda/bin
make -C backend/python/coqui
make -C backend/python/coqui test
make --jobs=5 --output-sync=target -C backend/python/coqui
make --jobs=5 --output-sync=target -C backend/python/coqui test

View File

@@ -9,6 +9,9 @@ on:
tags:
- '*'
env:
GRPC_VERSION: v1.58.0
concurrency:
group: ci-tests-${{ github.head_ref || github.ref }}-${{ github.repository }}
cancel-in-progress: true
@@ -60,6 +63,7 @@ jobs:
uses: actions/setup-go@v4
with:
go-version: ${{ matrix.go-version }}
cache: false
# You can test your matrix by printing the current Go version
- name: Display Go version
run: go version
@@ -75,7 +79,7 @@ jobs:
sudo apt-get update && \
sudo apt-get install -y conda
sudo apt-get install -y ca-certificates cmake curl patch
sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
sudo apt-get install -y libopencv-dev
sudo rm -rfv /usr/bin/conda || true
PATH=$PATH:/opt/conda/bin make -C backend/python/sentencetransformers
@@ -91,23 +95,79 @@ jobs:
uses: actions/cache@v3
with:
path: grpc
key: ${{ runner.os }}-grpc
key: ${{ runner.os }}-grpc-${{ env.GRPC_VERSION }}
- name: Build grpc
if: steps.cache-grpc.outputs.cache-hit != 'true'
run: |
git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
git clone --recurse-submodules -b ${{ env.GRPC_VERSION }} --depth 1 --jobs 5 --shallow-submodules https://github.com/grpc/grpc && \
cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
-DgRPC_BUILD_TESTS=OFF \
../.. && sudo make -j12
../.. && sudo make --jobs 5
- name: Install gRPC
run: |
cd grpc && cd cmake/build && sudo make -j12 install
cd grpc && cd cmake/build && sudo make --jobs 5 install
- name: Test
run: |
GO_TAGS="stablediffusion tts" make test
GO_TAGS="stablediffusion tts" make --jobs 5 --output-sync=target test
- name: Setup tmate session if tests fail
if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3
timeout-minutes: 5
tests-aio-container:
runs-on: ubuntu-latest
steps:
- name: Release space from worker
run: |
echo "Listing top largest packages"
pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
head -n 30 <<< "${pkgs}"
echo
df -h
echo
sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
sudo apt-get remove --auto-remove android-sdk-platform-tools || true
sudo apt-get purge --auto-remove android-sdk-platform-tools || true
sudo rm -rf /usr/local/lib/android
sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
sudo rm -rf /usr/share/dotnet
sudo apt-get remove -y '^mono-.*' || true
sudo apt-get remove -y '^ghc-.*' || true
sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
sudo apt-get remove -y 'php.*' || true
sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
sudo apt-get remove -y '^google-.*' || true
sudo apt-get remove -y azure-cli || true
sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
sudo apt-get remove -y '^gfortran-.*' || true
sudo apt-get autoremove -y
sudo apt-get clean
echo
echo "Listing top largest packages"
pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
head -n 30 <<< "${pkgs}"
echo
sudo rm -rfv build || true
df -h
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- name: Build images
run: |
docker build --build-arg FFMPEG=true --build-arg IMAGE_TYPE=core --build-arg MAKEFLAGS="--jobs=5 --output-sync=target" -t local-ai:tests -f Dockerfile .
BASE_IMAGE=local-ai:tests DOCKER_AIO_IMAGE=local-ai-aio:test make docker-aio
- name: Test
run: |
LOCALAI_MODELS_DIR=$PWD/models LOCALAI_IMAGE_TAG=test LOCALAI_IMAGE=local-ai-aio \
make run-e2e-aio
- name: Setup tmate session if tests fail
if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3
timeout-minutes: 5
tests-apple:
runs-on: macOS-latest
runs-on: macOS-14
strategy:
matrix:
go-version: ['1.21.x']
@@ -120,14 +180,21 @@ jobs:
uses: actions/setup-go@v4
with:
go-version: ${{ matrix.go-version }}
cache: false
# You can test your matrix by printing the current Go version
- name: Display Go version
run: go version
- name: Dependencies
run: |
brew install protobuf grpc
brew install protobuf grpc make
- name: Test
run: |
export C_INCLUDE_PATH=/usr/local/include
export CPLUS_INCLUDE_PATH=/usr/local/include
CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF" make test
# Used to run the newer GNUMake version from brew that supports --output-sync
export PATH="/opt/homebrew/opt/make/libexec/gnubin:$PATH"
BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF" make --jobs 4 --output-sync=target test
- name: Setup tmate session if tests fail
if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3
timeout-minutes: 5

4
.gitignore vendored
View File

@@ -21,6 +21,7 @@ local-ai
!charts/*
# prevent above rules from omitting the api/localai folder
!api/localai
!core/**/localai
# Ignore models
models/*
@@ -34,6 +35,7 @@ release/
.idea
# Generated during build
backend-assets/
backend-assets/*
!backend-assets/.keep
prepare
/ggml-metal.metal

5
.vscode/extensions.json vendored Normal file
View File

@@ -0,0 +1,5 @@
{
"recommendations": [
"golang.go"
]
}

View File

@@ -4,6 +4,8 @@ ARG BASE_IMAGE=ubuntu:22.04
# extras or core
FROM ${BASE_IMAGE} as requirements-core
USER root
ARG GO_VERSION=1.21.7
ARG BUILD_TYPE
ARG CUDA_MAJOR_VERSION=11
@@ -21,7 +23,7 @@ RUN apt-get update && \
apt-get install -y ca-certificates curl patch pip cmake git && apt-get clean
# Install Go
RUN curl -L -s https://go.dev/dl/go$GO_VERSION.linux-$TARGETARCH.tar.gz | tar -v -C /usr/local -xz
RUN curl -L -s https://go.dev/dl/go$GO_VERSION.linux-$TARGETARCH.tar.gz | tar -C /usr/local -xz
ENV PATH $PATH:/usr/local/go/bin
COPY --chmod=644 custom-ca-certs/* /usr/local/share/ca-certificates/
@@ -61,7 +63,9 @@ WORKDIR /build
RUN test -n "$TARGETARCH" \
|| (echo 'warn: missing $TARGETARCH, either set this `ARG` manually, or run using `docker buildkit`')
# Extras requirements
###################################
###################################
FROM requirements-core as requirements-extras
RUN curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
@@ -79,6 +83,35 @@ RUN pip install --upgrade pip
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
RUN apt-get install -y espeak-ng espeak && apt-get clean
RUN if [ ! -e /usr/bin/python ]; then \
ln -s /usr/bin/python3 /usr/bin/python \
; fi
###################################
###################################
FROM ${BASE_IMAGE} as grpc
ARG MAKEFLAGS
ARG GRPC_VERSION=v1.58.0
ENV MAKEFLAGS=${MAKEFLAGS}
WORKDIR /build
RUN apt-get update && \
apt-get install -y g++ cmake git && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
RUN git clone --recurse-submodules --jobs 4 -b ${GRPC_VERSION} --depth 1 --shallow-submodules https://github.com/grpc/grpc
RUN cd grpc && \
mkdir -p cmake/build && \
cd cmake/build && \
cmake -DgRPC_INSTALL=ON -DgRPC_BUILD_TESTS=OFF ../.. && \
make
###################################
###################################
@@ -86,9 +119,11 @@ FROM requirements-${IMAGE_TYPE} as builder
ARG GO_TAGS="stablediffusion tts"
ARG GRPC_BACKENDS
ARG BUILD_GRPC=true
ARG MAKEFLAGS
ENV GRPC_BACKENDS=${GRPC_BACKENDS}
ENV GO_TAGS=${GO_TAGS}
ENV MAKEFLAGS=${MAKEFLAGS}
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
ENV NVIDIA_REQUIRE_CUDA="cuda>=${CUDA_MAJOR_VERSION}.0"
ENV NVIDIA_VISIBLE_DEVICES=all
@@ -97,17 +132,22 @@ WORKDIR /build
COPY . .
COPY .git .
RUN echo "GO_TAGS: $GO_TAGS"
RUN make prepare
# If we are building with clblas support, we need the libraries for the builds
RUN if [ "${BUILD_TYPE}" = "clblas" ]; then \
apt-get update && \
apt-get install -y libclblast-dev && \
apt-get clean \
; fi
# stablediffusion does not tolerate a newer version of abseil, build it first
RUN GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
RUN if [ "${BUILD_GRPC}" = "true" ]; then \
git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
-DgRPC_BUILD_TESTS=OFF \
../.. && make -j12 install \
; fi
COPY --from=grpc /build/grpc ./grpc/
RUN cd /build/grpc/cmake/build && make install
# Rebuild with defaults backends
RUN make build
@@ -126,10 +166,12 @@ ARG FFMPEG
ARG BUILD_TYPE
ARG TARGETARCH
ARG IMAGE_TYPE=extras
ARG MAKEFLAGS
ENV BUILD_TYPE=${BUILD_TYPE}
ENV REBUILD=false
ENV HEALTHCHECK_ENDPOINT=http://localhost:8080/readyz
ENV MAKEFLAGS=${MAKEFLAGS}
ARG CUDA_MAJOR_VERSION=11
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
@@ -142,6 +184,13 @@ RUN if [ "${FFMPEG}" = "true" ]; then \
apt-get install -y ffmpeg && apt-get clean \
; fi
# Add OpenCL
RUN if [ "${BUILD_TYPE}" = "clblas" ]; then \
apt-get update && \
apt-get install -y libclblast1 && \
apt-get clean \
; fi
WORKDIR /build
# we start fresh & re-copy all assets because `make build` does not clean up nicely after itself
@@ -151,7 +200,7 @@ WORKDIR /build
COPY . .
COPY --from=builder /build/sources ./sources/
COPY --from=builder /build/grpc ./grpc/
COPY --from=grpc /build/grpc ./grpc/
RUN make prepare-sources && cd /build/grpc/cmake/build && make install && rm -rf grpc
@@ -166,43 +215,43 @@ COPY --from=builder /build/backend-assets/grpc/stablediffusion ./backend-assets/
## Duplicated from Makefile to avoid having a big layer that's hard to push
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
PATH=$PATH:/opt/conda/bin make -C backend/python/autogptq \
make -C backend/python/autogptq \
; fi
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
PATH=$PATH:/opt/conda/bin make -C backend/python/bark \
make -C backend/python/bark \
; fi
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
PATH=$PATH:/opt/conda/bin make -C backend/python/diffusers \
make -C backend/python/diffusers \
; fi
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
PATH=$PATH:/opt/conda/bin make -C backend/python/vllm \
make -C backend/python/vllm \
; fi
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
PATH=$PATH:/opt/conda/bin make -C backend/python/mamba \
make -C backend/python/mamba \
; fi
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
PATH=$PATH:/opt/conda/bin make -C backend/python/sentencetransformers \
make -C backend/python/sentencetransformers \
; fi
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
PATH=$PATH:/opt/conda/bin make -C backend/python/transformers \
make -C backend/python/transformers \
; fi
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
PATH=$PATH:/opt/conda/bin make -C backend/python/vall-e-x \
make -C backend/python/vall-e-x \
; fi
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
PATH=$PATH:/opt/conda/bin make -C backend/python/exllama \
make -C backend/python/exllama \
; fi
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
PATH=$PATH:/opt/conda/bin make -C backend/python/exllama2 \
make -C backend/python/exllama2 \
; fi
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
PATH=$PATH:/opt/conda/bin make -C backend/python/petals \
make -C backend/python/petals \
; fi
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
PATH=$PATH:/opt/conda/bin make -C backend/python/transformers-musicgen \
make -C backend/python/transformers-musicgen \
; fi
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
PATH=$PATH:/opt/conda/bin make -C backend/python/coqui \
make -C backend/python/coqui \
; fi
# Make sure the models directory exists
@@ -211,6 +260,7 @@ RUN mkdir -p /build/models
# Define the health check command
HEALTHCHECK --interval=1m --timeout=10m --retries=10 \
CMD curl -f $HEALTHCHECK_ENDPOINT || exit 1
VOLUME /build/models
EXPOSE 8080
ENTRYPOINT [ "/build/entrypoint.sh" ]

8
Dockerfile.aio Normal file
View File

@@ -0,0 +1,8 @@
ARG BASE_IMAGE=ubuntu:22.04
FROM ${BASE_IMAGE}
RUN apt-get update && apt-get install -y pciutils && apt-get clean
COPY aio/ /aio
ENTRYPOINT [ "/aio/entrypoint.sh" ]

359
Makefile
View File

@@ -4,11 +4,8 @@ GOVET=$(GOCMD) vet
BINARY_NAME=local-ai
# llama.cpp versions
GOLLAMA_VERSION?=aeba71ee842819da681ea537e78846dc75949ac0
GOLLAMA_STABLE_VERSION?=50cee7712066d9e38306eccadcfbb44ea87df4b7
CPPLLAMA_VERSION?=fd43d66f46ee3b5345fb8a74a252d86ccd34a409
GOLLAMA_STABLE_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
CPPLLAMA_VERSION?=cc4a95426d17417d3c83f12bdb514fbe8abe2a88
# gpt4all version
GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
@@ -16,34 +13,37 @@ GPT4ALL_VERSION?=27a8b020c36b0df8f8b82a252d261cda47cf44b8
# go-rwkv version
RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
RWKV_VERSION?=633c5a3485c403cb2520693dc0991a25dace9f0f
RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6
# whisper.cpp version
WHISPER_CPP_VERSION?=37a709f6558c6d9783199e2b8cbb136e1c41d346
WHISPER_CPP_VERSION?=13c22321d1ac758ce68a429c23104e234b440769
# bert.cpp version
BERT_VERSION?=6abe312cded14042f6b7c3cd8edf082713334a4d
# go-piper version
PIPER_VERSION?=d6b6275ba037dabdba4a8b65dfdf6b2a73a67f07
PIPER_VERSION?=9d0100873a7dbb0824dfea40e8cec70a1b110759
# stablediffusion version
STABLEDIFFUSION_VERSION?=d5d2be8e7e395c2d73ceef61e6fe8d240f2cd831
STABLEDIFFUSION_VERSION?=362df9da29f882dbf09ade61972d16a1f53c3485
# tinydream version
TINYDREAM_VERSION?=772a9c0d9aaf768290e63cca3c904fe69faf677a
TINYDREAM_VERSION?=22a12a4bc0ac5455856f28f3b771331a551a4293
export BUILD_TYPE?=
export STABLE_BUILD_TYPE?=$(BUILD_TYPE)
export CMAKE_ARGS?=
CGO_LDFLAGS?=
CGO_LDFLAGS_WHISPER?=
CUDA_LIBPATH?=/usr/local/cuda/lib64/
GO_TAGS?=
BUILD_ID?=git
TEST_DIR=/tmp/test
TEST_FLAKES?=5
RANDOM := $(shell bash -c 'echo $$RANDOM')
VERSION?=$(shell git describe --always --tags || echo "dev" )
@@ -70,7 +70,7 @@ UNAME_S := $(shell uname -s)
endif
ifeq ($(OS),Darwin)
CGO_LDFLAGS += -lcblas -framework Accelerate
ifeq ($(OSX_SIGNING_IDENTITY),)
OSX_SIGNING_IDENTITY := $(shell security find-identity -v -p codesigning | grep '"' | head -n 1 | sed -E 's/.*"(.*)"/\1/')
endif
@@ -81,6 +81,12 @@ ifeq ($(OS),Darwin)
# disable metal if on Darwin and any other value is explicitly passed.
else ifneq ($(BUILD_TYPE),metal)
CMAKE_ARGS+=-DLLAMA_METAL=OFF
export LLAMA_NO_ACCELERATE=1
endif
ifeq ($(BUILD_TYPE),metal)
# -lcblas removed: it seems to always be listed as a duplicate flag.
CGO_LDFLAGS += -framework Accelerate
endif
endif
@@ -89,10 +95,12 @@ ifeq ($(BUILD_TYPE),openblas)
export WHISPER_OPENBLAS=1
endif
ifeq ($(BUILD_TYPE),cublas)
CGO_LDFLAGS+=-lcublas -lcudart -L$(CUDA_LIBPATH)
export LLAMA_CUBLAS=1
export WHISPER_CUBLAS=1
CGO_LDFLAGS_WHISPER+=-L$(CUDA_LIBPATH)/stubs/ -lcuda
endif
ifeq ($(BUILD_TYPE),hipblas)
@@ -146,15 +154,16 @@ endif
ALL_GRPC_BACKENDS=backend-assets/grpc/langchain-huggingface
ALL_GRPC_BACKENDS+=backend-assets/grpc/bert-embeddings
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml
ALL_GRPC_BACKENDS+=backend-assets/grpc/gpt4all
ALL_GRPC_BACKENDS+=backend-assets/grpc/rwkv
ALL_GRPC_BACKENDS+=backend-assets/grpc/whisper
ALL_GRPC_BACKENDS+=backend-assets/grpc/local-store
ALL_GRPC_BACKENDS+=$(OPTIONAL_GRPC)
GRPC_BACKENDS?=$(ALL_GRPC_BACKENDS) $(OPTIONAL_GRPC)
TEST_PATHS?=./api/... ./pkg/... ./core/...
# If empty, then we build all
ifeq ($(GRPC_BACKENDS),)
@@ -165,40 +174,41 @@ ifeq ($(BUILD_API_ONLY),true)
GRPC_BACKENDS=
endif
.PHONY: all test build vendor
.PHONY: all test build vendor get-sources prepare-sources prepare
all: help
## GPT4ALL
sources/gpt4all:
git clone --recurse-submodules $(GPT4ALL_REPO) sources/gpt4all
cd sources/gpt4all && git checkout -b build $(GPT4ALL_VERSION) && git submodule update --init --recursive --depth 1
## go-piper
sources/go-piper:
git clone --recurse-submodules https://github.com/mudler/go-piper sources/go-piper
cd sources/go-piper && git checkout -b build $(PIPER_VERSION) && git submodule update --init --recursive --depth 1
## BERT embeddings
sources/go-bert:
git clone --recurse-submodules https://github.com/go-skynet/go-bert.cpp sources/go-bert
cd sources/go-bert && git checkout -b build $(BERT_VERSION) && git submodule update --init --recursive --depth 1
## stable diffusion
sources/go-stable-diffusion:
git clone --recurse-submodules https://github.com/mudler/go-stable-diffusion sources/go-stable-diffusion
cd sources/go-stable-diffusion && git checkout -b build $(STABLEDIFFUSION_VERSION) && git submodule update --init --recursive --depth 1
sources/go-bert/libgobert.a: sources/go-bert
$(MAKE) -C sources/go-bert libgobert.a
sources/go-stable-diffusion/libstablediffusion.a:
$(MAKE) -C sources/go-stable-diffusion libstablediffusion.a
## go-llama-ggml
sources/go-llama-ggml:
git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp sources/go-llama-ggml
cd sources/go-llama-ggml && git checkout -b build $(GOLLAMA_STABLE_VERSION) && git submodule update --init --recursive --depth 1
## tiny-dream
sources/go-tiny-dream:
git clone --recurse-submodules https://github.com/M0Rf30/go-tiny-dream sources/go-tiny-dream
cd sources/go-tiny-dream && git checkout -b build $(TINYDREAM_VERSION) && git submodule update --init --recursive --depth 1
sources/go-llama-ggml/libbinding.a: sources/go-llama-ggml
$(MAKE) -C sources/go-llama-ggml BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a
sources/go-tiny-dream/libtinydream.a:
$(MAKE) -C sources/go-tiny-dream libtinydream.a
## go-piper
sources/go-piper:
git clone --recurse-submodules https://github.com/mudler/go-piper sources/go-piper
cd sources/go-piper && git checkout -b build $(PIPER_VERSION) && git submodule update --init --recursive --depth 1
sources/go-piper/libpiper_binding.a: sources/go-piper
$(MAKE) -C sources/go-piper libpiper_binding.a example/main piper.o
## GPT4ALL
sources/gpt4all:
git clone --recurse-submodules $(GPT4ALL_REPO) sources/gpt4all
cd sources/gpt4all && git checkout -b build $(GPT4ALL_VERSION) && git submodule update --init --recursive --depth 1
sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a: sources/gpt4all
$(MAKE) -C sources/gpt4all/gpt4all-bindings/golang/ libgpt4all.a
## RWKV
sources/go-rwkv:
@@ -208,23 +218,23 @@ sources/go-rwkv:
sources/go-rwkv/librwkv.a: sources/go-rwkv
cd sources/go-rwkv && cd rwkv.cpp && cmake . -DRWKV_BUILD_SHARED_LIBRARY=OFF && cmake --build . && cp librwkv.a ..
sources/go-bert/libgobert.a: sources/go-bert
$(MAKE) -C sources/go-bert libgobert.a
## stable diffusion
sources/go-stable-diffusion:
git clone --recurse-submodules https://github.com/mudler/go-stable-diffusion sources/go-stable-diffusion
cd sources/go-stable-diffusion && git checkout -b build $(STABLEDIFFUSION_VERSION) && git submodule update --init --recursive --depth 1
backend-assets/gpt4all: sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a
mkdir -p backend-assets/gpt4all
@cp sources/gpt4all/gpt4all-bindings/golang/buildllm/*.so backend-assets/gpt4all/ || true
@cp sources/gpt4all/gpt4all-bindings/golang/buildllm/*.dylib backend-assets/gpt4all/ || true
@cp sources/gpt4all/gpt4all-bindings/golang/buildllm/*.dll backend-assets/gpt4all/ || true
sources/go-stable-diffusion/libstablediffusion.a: sources/go-stable-diffusion
CPATH="$(CPATH):/usr/include/opencv4" $(MAKE) -C sources/go-stable-diffusion libstablediffusion.a
backend-assets/espeak-ng-data: sources/go-piper
mkdir -p backend-assets/espeak-ng-data
$(MAKE) -C sources/go-piper piper.o
@cp -rf sources/go-piper/piper-phonemize/pi/share/espeak-ng-data/. backend-assets/espeak-ng-data
## tiny-dream
sources/go-tiny-dream:
git clone --recurse-submodules https://github.com/M0Rf30/go-tiny-dream sources/go-tiny-dream
cd sources/go-tiny-dream && git checkout -b build $(TINYDREAM_VERSION) && git submodule update --init --recursive --depth 1
sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a: sources/gpt4all
$(MAKE) -C sources/gpt4all/gpt4all-bindings/golang/ libgpt4all.a
sources/go-tiny-dream/libtinydream.a: sources/go-tiny-dream
$(MAKE) -C sources/go-tiny-dream libtinydream.a
## whisper
sources/whisper.cpp:
git clone https://github.com/ggerganov/whisper.cpp.git sources/whisper.cpp
cd sources/whisper.cpp && git checkout -b build $(WHISPER_CPP_VERSION) && git submodule update --init --recursive --depth 1
@@ -232,47 +242,35 @@ sources/whisper.cpp:
sources/whisper.cpp/libwhisper.a: sources/whisper.cpp
cd sources/whisper.cpp && make libwhisper.a
sources/go-llama:
git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp sources/go-llama
cd sources/go-llama && git checkout -b build $(GOLLAMA_VERSION) && git submodule update --init --recursive --depth 1
sources/go-llama-ggml:
git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp sources/go-llama-ggml
cd sources/go-llama-ggml && git checkout -b build $(GOLLAMA_STABLE_VERSION) && git submodule update --init --recursive --depth 1
sources/go-llama/libbinding.a: sources/go-llama
$(MAKE) -C sources/go-llama BUILD_TYPE=$(BUILD_TYPE) libbinding.a
sources/go-llama-ggml/libbinding.a: sources/go-llama-ggml
$(MAKE) -C sources/go-llama-ggml BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a
sources/go-piper/libpiper_binding.a: sources/go-piper
$(MAKE) -C sources/go-piper libpiper_binding.a example/main
backend/cpp/llama/llama.cpp:
LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama llama.cpp
get-sources: backend/cpp/llama/llama.cpp sources/go-llama sources/go-llama-ggml sources/gpt4all sources/go-piper sources/go-rwkv sources/whisper.cpp sources/go-bert sources/go-stable-diffusion sources/go-tiny-dream
touch $@
get-sources: sources/go-llama-ggml sources/gpt4all sources/go-piper sources/go-rwkv sources/whisper.cpp sources/go-bert sources/go-stable-diffusion sources/go-tiny-dream
replace:
$(GOCMD) mod edit -replace github.com/nomic-ai/gpt4all/gpt4all-bindings/golang=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang
$(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(CURDIR)/sources/go-rwkv
$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(CURDIR)/sources/whisper.cpp
$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp/bindings/go=$(CURDIR)/sources/whisper.cpp/bindings/go
$(GOCMD) mod edit -replace github.com/go-skynet/go-bert.cpp=$(CURDIR)/sources/go-bert
$(GOCMD) mod edit -replace github.com/mudler/go-stable-diffusion=$(CURDIR)/sources/go-stable-diffusion
$(GOCMD) mod edit -replace github.com/M0Rf30/go-tiny-dream=$(CURDIR)/sources/go-tiny-dream
$(GOCMD) mod edit -replace github.com/mudler/go-piper=$(CURDIR)/sources/go-piper
$(GOCMD) mod edit -replace github.com/mudler/go-stable-diffusion=$(CURDIR)/sources/go-stable-diffusion
$(GOCMD) mod edit -replace github.com/nomic-ai/gpt4all/gpt4all-bindings/golang=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang
dropreplace:
$(GOCMD) mod edit -dropreplace github.com/donomii/go-rwkv.cpp
$(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp
$(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp/bindings/go
$(GOCMD) mod edit -dropreplace github.com/go-skynet/go-bert.cpp
$(GOCMD) mod edit -dropreplace github.com/M0Rf30/go-tiny-dream
$(GOCMD) mod edit -dropreplace github.com/mudler/go-piper
$(GOCMD) mod edit -dropreplace github.com/mudler/go-stable-diffusion
$(GOCMD) mod edit -dropreplace github.com/nomic-ai/gpt4all/gpt4all-bindings/golang
$(GOCMD) mod edit -dropreplace github.com/go-skynet/go-llama.cpp
prepare-sources: get-sources replace
$(GOCMD) mod download
touch $@
## GENERIC
rebuild: ## Rebuilds the project
$(GOCMD) clean -cache
$(MAKE) -C sources/go-llama clean
$(MAKE) -C sources/go-llama-ggml clean
$(MAKE) -C sources/gpt4all/gpt4all-bindings/golang/ clean
$(MAKE) -C sources/go-rwkv clean
@@ -284,7 +282,6 @@ rebuild: ## Rebuilds the project
$(MAKE) build
prepare: prepare-sources $(OPTIONAL_TARGETS)
touch $@
clean: ## Remove build related file
$(GOCMD) clean -cache
@@ -295,16 +292,27 @@ clean: ## Remove build related file
rm -rf backend-assets
$(MAKE) -C backend/cpp/grpc clean
$(MAKE) -C backend/cpp/llama clean
$(MAKE) dropreplace
clean-tests:
rm -rf test-models
rm -rf test-dir
rm -rf core/http/backend-assets
## Build:
build: backend-assets grpcs prepare ## Build the project
build: prepare backend-assets grpcs ## Build the project
$(info ${GREEN}I local-ai build info:${RESET})
$(info ${GREEN}I BUILD_TYPE: ${YELLOW}$(BUILD_TYPE)${RESET})
$(info ${GREEN}I GO_TAGS: ${YELLOW}$(GO_TAGS)${RESET})
$(info ${GREEN}I LD_FLAGS: ${YELLOW}$(LD_FLAGS)${RESET})
CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o $(BINARY_NAME) ./
build-minimal:
BUILD_GRPC_FOR_BACKEND_LLAMA=true GRPC_BACKENDS=backend-assets/grpc/llama-cpp GO_TAGS=none $(MAKE) build
build-api:
BUILD_GRPC_FOR_BACKEND_LLAMA=true BUILD_API_ONLY=true GO_TAGS=none $(MAKE) build
dist: build
mkdir -p release
cp $(BINARY_NAME) release/$(BINARY_NAME)-$(BUILD_ID)-$(OS)-$(ARCH)
@@ -316,10 +324,10 @@ osx-signed: build
run: prepare ## run local-ai
CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) run ./
test-models/testmodel:
test-models/testmodel.ggml:
mkdir test-models
mkdir test-dir
wget -q https://huggingface.co/TheBloke/orca_mini_3B-GGML/resolve/main/orca-mini-3b.ggmlv3.q4_0.bin -O test-models/testmodel
wget -q https://huggingface.co/TheBloke/orca_mini_3B-GGML/resolve/main/orca-mini-3b.ggmlv3.q4_0.bin -O test-models/testmodel.ggml
wget -q https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin -O test-models/whisper-en
wget -q https://huggingface.co/mudler/all-MiniLM-L6-v2/resolve/main/ggml-model-q4_0.bin -O test-models/bert
wget -q https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav -O test-dir/audio.wav
@@ -328,15 +336,15 @@ test-models/testmodel:
cp tests/models_fixtures/* test-models
prepare-test: grpcs
cp -rf backend-assets api
cp -rf backend-assets core/http
cp tests/models_fixtures/* test-models
test: prepare test-models/testmodel grpcs
test: prepare test-models/testmodel.ggml grpcs
@echo 'Running tests'
export GO_TAGS="tts stablediffusion"
export GO_TAGS="tts stablediffusion debug"
$(MAKE) prepare-test
HUGGINGFACE_GRPC=$(abspath ./)/backend/python/sentencetransformers/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!gpt4all && !llama && !llama-gguf" --flake-attempts 5 --fail-fast -v -r ./api ./pkg
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!gpt4all && !llama && !llama-gguf" --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
$(MAKE) test-gpt4all
$(MAKE) test-llama
$(MAKE) test-llama-gguf
@@ -347,12 +355,16 @@ prepare-e2e:
mkdir -p $(TEST_DIR)
cp -rfv $(abspath ./tests/e2e-fixtures)/gpu.yaml $(TEST_DIR)/gpu.yaml
test -e $(TEST_DIR)/ggllm-test-model.bin || wget -q https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GGUF/resolve/main/codellama-7b-instruct.Q2_K.gguf -O $(TEST_DIR)/ggllm-test-model.bin
docker build --build-arg BUILD_GRPC=true --build-arg GRPC_BACKENDS="$(GRPC_BACKENDS)" --build-arg IMAGE_TYPE=core --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg CUDA_MAJOR_VERSION=11 --build-arg CUDA_MINOR_VERSION=7 --build-arg FFMPEG=true -t localai-tests .
docker build --build-arg GRPC_BACKENDS="$(GRPC_BACKENDS)" --build-arg IMAGE_TYPE=core --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg CUDA_MAJOR_VERSION=11 --build-arg CUDA_MINOR_VERSION=7 --build-arg FFMPEG=true -t localai-tests .
run-e2e-image:
ls -liah $(abspath ./tests/e2e-fixtures)
docker run -p 5390:8080 -e MODELS_PATH=/models -e THREADS=1 -e DEBUG=true -d --rm -v $(TEST_DIR):/models --gpus all --name e2e-tests-$(RANDOM) localai-tests
run-e2e-aio:
@echo 'Running e2e AIO tests'
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts 5 -v -r ./tests/e2e-aio
test-e2e:
@echo 'Running e2e tests'
BUILD_TYPE=$(BUILD_TYPE) \
@@ -365,23 +377,28 @@ teardown-e2e:
test-gpt4all: prepare-test
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="gpt4all" --flake-attempts 5 -v -r ./api ./pkg
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="gpt4all" --flake-attempts 5 -v -r $(TEST_PATHS)
test-llama: prepare-test
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts 5 -v -r ./api ./pkg
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts 5 -v -r $(TEST_PATHS)
test-llama-gguf: prepare-test
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts 5 -v -r ./api ./pkg
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts 5 -v -r $(TEST_PATHS)
test-tts: prepare-test
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="tts" --flake-attempts 1 -v -r ./api ./pkg
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="tts" --flake-attempts 1 -v -r $(TEST_PATHS)
test-stablediffusion: prepare-test
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stablediffusion" --flake-attempts 1 -v -r ./api ./pkg
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stablediffusion" --flake-attempts 1 -v -r $(TEST_PATHS)
test-stores: backend-assets/grpc/local-store
mkdir -p tests/integration/backend-assets/grpc
cp -f backend-assets/grpc/local-store tests/integration/backend-assets/grpc/
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stores" --flake-attempts 1 -v -r tests/integration
test-container:
docker build --target requirements -t local-ai-test-container .
@@ -451,91 +468,94 @@ ifeq ($(BUILD_API_ONLY),true)
touch backend-assets/keep
endif
backend-assets/grpc:
backend-assets/espeak-ng-data: sources/go-piper sources/go-piper/libpiper_binding.a
mkdir -p backend-assets/espeak-ng-data
@cp -rf sources/go-piper/piper-phonemize/pi/share/espeak-ng-data/. backend-assets/espeak-ng-data
backend-assets/gpt4all: sources/gpt4all sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a
mkdir -p backend-assets/gpt4all
@cp sources/gpt4all/gpt4all-bindings/golang/buildllm/*.so backend-assets/gpt4all/ || true
@cp sources/gpt4all/gpt4all-bindings/golang/buildllm/*.dylib backend-assets/gpt4all/ || true
@cp sources/gpt4all/gpt4all-bindings/golang/buildllm/*.dll backend-assets/gpt4all/ || true
backend-assets/grpc: replace
mkdir -p backend-assets/grpc
backend-assets/grpc/llama: backend-assets/grpc sources/go-llama/libbinding.a
$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama LIBRARY_PATH=$(CURDIR)/sources/go-llama \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama ./backend/go/llm/llama/
# TODO: every binary should have its own folder instead, so can have different implementations
ifeq ($(BUILD_TYPE),metal)
cp backend/cpp/llama/llama.cpp/ggml-metal.metal backend-assets/grpc/
endif
## BACKEND CPP LLAMA START
# Sets the variables in case it has to build the gRPC locally.
INSTALLED_PACKAGES=$(CURDIR)/backend/cpp/grpc/installed_packages
INSTALLED_LIB_CMAKE=$(INSTALLED_PACKAGES)/lib/cmake
ADDED_CMAKE_ARGS=-Dabsl_DIR=${INSTALLED_LIB_CMAKE}/absl \
-DProtobuf_DIR=${INSTALLED_LIB_CMAKE}/protobuf \
-Dutf8_range_DIR=${INSTALLED_LIB_CMAKE}/utf8_range \
-DgRPC_DIR=${INSTALLED_LIB_CMAKE}/grpc \
-DCMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES=${INSTALLED_PACKAGES}/include
backend/cpp/llama/grpc-server:
ifdef BUILD_GRPC_FOR_BACKEND_LLAMA
$(MAKE) -C backend/cpp/grpc build
export _PROTOBUF_PROTOC=${INSTALLED_PACKAGES}/bin/proto && \
export _GRPC_CPP_PLUGIN_EXECUTABLE=${INSTALLED_PACKAGES}/bin/grpc_cpp_plugin && \
export PATH="${INSTALLED_PACKAGES}/bin:${PATH}" && \
CMAKE_ARGS="${CMAKE_ARGS} ${ADDED_CMAKE_ARGS}" LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama grpc-server
else
echo "BUILD_GRPC_FOR_BACKEND_LLAMA is not defined."
LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama grpc-server
endif
## BACKEND CPP LLAMA END
##
backend-assets/grpc/llama-cpp: backend-assets/grpc backend/cpp/llama/grpc-server
cp -rfv backend/cpp/llama/grpc-server backend-assets/grpc/llama-cpp
# TODO: every binary should have its own folder instead, so can have different metal implementations
ifeq ($(BUILD_TYPE),metal)
cp backend/cpp/llama/llama.cpp/build/bin/ggml-metal.metal backend-assets/grpc/
endif
backend-assets/grpc/llama-ggml: backend-assets/grpc sources/go-llama-ggml/libbinding.a
$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama-ggml
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama-ggml LIBRARY_PATH=$(CURDIR)/sources/go-llama-ggml \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/
backend-assets/grpc/gpt4all: backend-assets/grpc backend-assets/gpt4all sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang/ LIBRARY_PATH=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang/ \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gpt4all ./backend/go/llm/gpt4all/
backend-assets/grpc/rwkv: backend-assets/grpc sources/go-rwkv/librwkv.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-rwkv LIBRARY_PATH=$(CURDIR)/sources/go-rwkv \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/rwkv ./backend/go/llm/rwkv
backend-assets/grpc/bert-embeddings: backend-assets/grpc sources/go-bert/libgobert.a
backend-assets/grpc/bert-embeddings: sources/go-bert sources/go-bert/libgobert.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-bert LIBRARY_PATH=$(CURDIR)/sources/go-bert \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bert-embeddings ./backend/go/llm/bert/
backend-assets/grpc/gpt4all: sources/gpt4all sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a backend-assets/gpt4all backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang/ LIBRARY_PATH=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang/ \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gpt4all ./backend/go/llm/gpt4all/
backend-assets/grpc/langchain-huggingface: backend-assets/grpc
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/langchain-huggingface ./backend/go/llm/langchain/
backend-assets/grpc/stablediffusion: backend-assets/grpc
if [ ! -f backend-assets/grpc/stablediffusion ]; then \
$(MAKE) sources/go-stable-diffusion/libstablediffusion.a; \
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-stable-diffusion/ LIBRARY_PATH=$(CURDIR)/sources/go-stable-diffusion/ \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion ./backend/go/image/stablediffusion; \
fi
backend/cpp/llama/llama.cpp:
LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama llama.cpp
backend-assets/grpc/tinydream: backend-assets/grpc sources/go-tiny-dream/libtinydream.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" LIBRARY_PATH=$(CURDIR)/go-tiny-dream \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/tinydream ./backend/go/image/tinydream
INSTALLED_PACKAGES=$(CURDIR)/backend/cpp/grpc/installed_packages
INSTALLED_LIB_CMAKE=$(INSTALLED_PACKAGES)/lib/cmake
ADDED_CMAKE_ARGS=-Dabsl_DIR=${INSTALLED_LIB_CMAKE}/absl \
-DProtobuf_DIR=${INSTALLED_LIB_CMAKE}/protobuf \
-Dutf8_range_DIR=${INSTALLED_LIB_CMAKE}/utf8_range \
-DgRPC_DIR=${INSTALLED_LIB_CMAKE}/grpc \
-DCMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES=${INSTALLED_PACKAGES}/include
backend/cpp/llama/grpc-server:
# Conditionally build grpc for the llama backend to use if needed
ifdef BUILD_GRPC_FOR_BACKEND_LLAMA
$(MAKE) -C backend/cpp/grpc build
_PROTOBUF_PROTOC=${INSTALLED_PACKAGES}/bin/proto \
_GRPC_CPP_PLUGIN_EXECUTABLE=${INSTALLED_PACKAGES}/bin/grpc_cpp_plugin \
PATH="${INSTALLED_PACKAGES}/bin:${PATH}" \
CMAKE_ARGS="${CMAKE_ARGS} ${ADDED_CMAKE_ARGS}" \
LLAMA_VERSION=$(CPPLLAMA_VERSION) \
$(MAKE) -C backend/cpp/llama grpc-server
else
echo "BUILD_GRPC_FOR_BACKEND_LLAMA is not defined."
LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama grpc-server
endif
backend-assets/grpc/piper: backend-assets/grpc backend-assets/espeak-ng-data sources/go-piper/libpiper_binding.a
backend-assets/grpc/llama-cpp: backend-assets/grpc backend/cpp/llama/grpc-server
cp -rfv backend/cpp/llama/grpc-server backend-assets/grpc/llama-cpp
# TODO: every binary should have its own folder instead, so can have different metal implementations
ifeq ($(BUILD_TYPE),metal)
cp backend/cpp/llama/llama.cpp/build/bin/default.metallib backend-assets/grpc/
endif
backend-assets/grpc/llama-ggml: sources/go-llama-ggml sources/go-llama-ggml/libbinding.a backend-assets/grpc
$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama-ggml
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama-ggml LIBRARY_PATH=$(CURDIR)/sources/go-llama-ggml \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/
backend-assets/grpc/piper: sources/go-piper sources/go-piper/libpiper_binding.a backend-assets/grpc backend-assets/espeak-ng-data
CGO_CXXFLAGS="$(PIPER_CGO_CXXFLAGS)" CGO_LDFLAGS="$(PIPER_CGO_LDFLAGS)" LIBRARY_PATH=$(CURDIR)/sources/go-piper \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/piper ./backend/go/tts/
backend-assets/grpc/whisper: backend-assets/grpc sources/whisper.cpp/libwhisper.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/whisper.cpp LIBRARY_PATH=$(CURDIR)/sources/whisper.cpp \
backend-assets/grpc/rwkv: sources/go-rwkv sources/go-rwkv/librwkv.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-rwkv LIBRARY_PATH=$(CURDIR)/sources/go-rwkv \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/rwkv ./backend/go/llm/rwkv
backend-assets/grpc/stablediffusion: sources/go-stable-diffusion sources/go-stable-diffusion/libstablediffusion.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS)" CPATH="$(CPATH):$(CURDIR)/sources/go-stable-diffusion/:/usr/include/opencv4" LIBRARY_PATH=$(CURDIR)/sources/go-stable-diffusion/ \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion ./backend/go/image/stablediffusion
backend-assets/grpc/tinydream: sources/go-tiny-dream sources/go-tiny-dream/libtinydream.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS)" LIBRARY_PATH=$(CURDIR)/go-tiny-dream \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/tinydream ./backend/go/image/tinydream
backend-assets/grpc/whisper: sources/whisper.cpp sources/whisper.cpp/libwhisper.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_WHISPER)" C_INCLUDE_PATH=$(CURDIR)/sources/whisper.cpp LIBRARY_PATH=$(CURDIR)/sources/whisper.cpp \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./backend/go/transcribe/
backend-assets/grpc/local-store: backend-assets/grpc
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/local-store ./backend/go/stores/
grpcs: prepare $(GRPC_BACKENDS)
DOCKER_IMAGE?=local-ai
DOCKER_AIO_IMAGE?=local-ai-aio
IMAGE_TYPE?=core
BASE_IMAGE?=ubuntu:22.04
@@ -543,13 +563,38 @@ docker:
docker build \
--build-arg BASE_IMAGE=$(BASE_IMAGE) \
--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
--build-arg GO_TAGS=$(GO_TAGS) \
--build-arg GO_TAGS="$(GO_TAGS)" \
--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
--build-arg BUILD_TYPE=$(BUILD_TYPE) \
-t $(DOCKER_IMAGE) .
docker-aio:
@echo "Building AIO image with base $(BASE_IMAGE) as $(DOCKER_AIO_IMAGE)"
docker build \
--build-arg BASE_IMAGE=$(BASE_IMAGE) \
--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
-t $(DOCKER_AIO_IMAGE) -f Dockerfile.aio .
docker-aio-all:
$(MAKE) docker-aio DOCKER_AIO_SIZE=cpu
$(MAKE) docker-aio DOCKER_AIO_SIZE=cpu
docker-image-intel:
docker build \
--build-arg BASE_IMAGE=intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04 \
--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
--build-arg GO_TAGS="none" \
--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
--build-arg BUILD_TYPE=sycl_f32 -t $(DOCKER_IMAGE) .
docker-image-intel-xpu:
docker build \
--build-arg BASE_IMAGE=intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04 \
--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
--build-arg GO_TAGS="none" \
--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
--build-arg BUILD_TYPE=sycl_f32 -t $(DOCKER_IMAGE) .
.PHONY: swagger
swagger:
swag init -g core/http/api.go --output swagger

View File

@@ -20,14 +20,14 @@
</a>
</p>
[<img src="https://img.shields.io/badge/dockerhub-images-important.svg?logo=Docker">](https://hub.docker.com/r/localai/localai)
[<img src="https://img.shields.io/badge/quay.io-images-important.svg?">](https://quay.io/repository/go-skynet/local-ai?tab=tags&tag=latest)
> :bulb: Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) [:book: Documentation website](https://localai.io/)
>
> [💻 Quickstart](https://localai.io/basics/getting_started/) [📣 News](https://localai.io/basics/news/) [ 🛫 Examples ](https://github.com/go-skynet/LocalAI/tree/master/examples/) [ 🖼️ Models ](https://localai.io/models/) [ 🚀 Roadmap ](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
[![tests](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[![Build and Release](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[![build container images](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[![Bump dependencies](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/localai)](https://artifacthub.io/packages/search?repo=localai)
<p align="center">
<a href="https://hub.docker.com/r/localai/localai" target="blank">
<img src="https://img.shields.io/badge/dockerhub-images-important.svg?logo=Docker" alt="LocalAI Docker hub"/>
</a>
<a href="https://quay.io/repository/go-skynet/local-ai?tab=tags&tag=latest" target="blank">
<img src="https://img.shields.io/badge/quay.io-images-important.svg?" alt="LocalAI Quay.io"/>
</a>
</p>
<p align="center">
<a href="https://twitter.com/LocalAI_API" target="blank">
@@ -36,20 +36,27 @@
<a href="https://discord.gg/uJAeKSAGDy" target="blank">
<img src="https://dcbadge.vercel.app/api/server/uJAeKSAGDy?style=flat-square&theme=default-inverted" alt="Join LocalAI Discord Community"/>
</a>
</p>
**LocalAI** is the free, Open Source OpenAI alternative. LocalAI act as a drop-in replacement REST API thats compatible with OpenAI API specifications for local inferencing. It allows you to run LLMs, generate images, audio (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families. Does not require GPU.
> :bulb: Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) [:book: Documentation website](https://localai.io/)
>
> [💻 Quickstart](https://localai.io/basics/getting_started/) [📣 News](https://localai.io/basics/news/) [ 🛫 Examples ](https://github.com/go-skynet/LocalAI/tree/master/examples/) [ 🖼️ Models ](https://localai.io/models/) [ 🚀 Roadmap ](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
[![tests](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[![Build and Release](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[![build container images](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[![Bump dependencies](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/localai)](https://artifacthub.io/packages/search?repo=localai)
**LocalAI** is the free, Open Source OpenAI alternative. LocalAI act as a drop-in replacement REST API thats compatible with OpenAI (Elevenlabs, Anthropic... ) API specifications for local AI inferencing. It allows you to run LLMs, generate images, audio (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families. Does not require GPU.
## 🔥🔥 Hot topics / Roadmap
[Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
- Parallel function calling: https://github.com/mudler/LocalAI/pull/1726
- Landing page: https://github.com/mudler/LocalAI/pull/1922
- Openvino support: https://github.com/mudler/LocalAI/pull/1892
- Vector store: https://github.com/mudler/LocalAI/pull/1795
- All-in-one container image: https://github.com/mudler/LocalAI/issues/1855
- Parallel function calling: https://github.com/mudler/LocalAI/pull/1726 / Tools API support: https://github.com/mudler/LocalAI/pull/1715
- Upload file API: https://github.com/mudler/LocalAI/pull/1703
- Tools API support: https://github.com/mudler/LocalAI/pull/1715
- LLaVa 1.6: https://github.com/mudler/LocalAI/pull/1714
- ROCm container images: https://github.com/mudler/LocalAI/pull/1595
- Intel GPU support (sycl): https://github.com/mudler/LocalAI/issues/1653
- Deprecation of old backends: https://github.com/mudler/LocalAI/issues/1651
- ROCm container images: https://github.com/mudler/LocalAI/pull/1595 / Intel GPU support (sycl, transformers, diffusers): https://github.com/mudler/LocalAI/issues/1653
- Mamba support: https://github.com/mudler/LocalAI/pull/1589
- Start and share models with config file: https://github.com/mudler/LocalAI/pull/1522
- 🐸 Coqui: https://github.com/mudler/LocalAI/pull/1489
@@ -59,15 +66,21 @@ Hot topics (looking for contributors):
- Backends v2: https://github.com/mudler/LocalAI/issues/1126
- Improving UX v2: https://github.com/mudler/LocalAI/issues/1373
- Assistant API: https://github.com/mudler/LocalAI/issues/1273
- Moderation endpoint: https://github.com/mudler/LocalAI/issues/999
- Vulkan: https://github.com/mudler/LocalAI/issues/1647
If you want to help and contribute, issues up for grabs: https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3A%22up+for+grabs%22
## 💻 [Getting started](https://localai.io/basics/getting_started/index.html)
For a detailed step-by-step introduction, refer to the [Getting Started](https://localai.io/basics/getting_started/index.html) guide. For those in a hurry, here's a straightforward one-liner to launch a LocalAI instance with [phi-2](https://huggingface.co/microsoft/phi-2) using `docker`:
For a detailed step-by-step introduction, refer to the [Getting Started](https://localai.io/basics/getting_started/index.html) guide.
```
docker run -ti -p 8080:8080 localai/localai:v2.7.0-ffmpeg-core phi-2
For those in a hurry, here's a straightforward one-liner to launch a LocalAI AIO(All-in-one) Image using `docker`:
```bash
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
# or, if you have an Nvidia GPU:
# docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-aio-gpu-nvidia-cuda-12
```
## 🚀 [Features](https://localai.io/features/)
@@ -97,9 +110,6 @@ WebUIs:
Model galleries
- https://github.com/go-skynet/model-gallery
UI / Management Programs
- [LocalAI Manager](https://io.midori-ai.xyz/howtos/easy-model-installer/)
Other:
- Helm chart https://github.com/go-skynet/helm-charts
@@ -110,6 +120,7 @@ Other:
- Slack bot https://github.com/mudler/LocalAGI/tree/main/examples/slack
- Telegram bot https://github.com/mudler/LocalAI/tree/master/examples/telegram-bot
- Examples: https://github.com/mudler/LocalAI/tree/master/examples/
### 🔗 Resources
@@ -121,6 +132,8 @@ Other:
## :book: 🎥 [Media, Blogs, Social](https://localai.io/basics/news/#media-blogs-social)
- [Run LocalAI on AWS EKS with Pulumi](https://www.pulumi.com/ai/answers/tiZMDoZzZV6TLxgDXNBnFE/deploying-helm-charts-on-aws-eks)
- [Run LocalAI on AWS](https://staleks.hashnode.dev/installing-localai-on-aws-ec2-instance)
- [Create a slackbot for teams and OSS projects that answer to documentation](https://mudler.pm/posts/smart-slackbot-for-teams/)
- [LocalAI meets k8sgpt](https://www.youtube.com/watch?v=PKrDNuJ_dfE)
- [Question Answering on Documents locally with LangChain, LocalAI, Chroma, and GPT4All](https://mudler.pm/posts/localai-question-answering/)

42
SECURITY.md Normal file
View File

@@ -0,0 +1,42 @@
# Security Policy
## Introduction
At LocalAI, we take the security of our software seriously. We understand the importance of protecting our community from vulnerabilities and are committed to ensuring the safety and security of our users.
## Supported Versions
We provide support and updates for certain versions of our software. The following table outlines which versions are currently supported with security updates:
| Version | Supported |
| ------- | ------------------ |
| > 2.0 | :white_check_mark: |
| < 2.0 | :x: |
Please ensure that you are using a supported version to receive the latest security updates.
## Reporting a Vulnerability
We encourage the responsible disclosure of any security vulnerabilities. If you believe you've found a security issue in our software, we kindly ask you to follow the steps below to report it to us:
1. **Email Us:** Send an email to [security@localai.io](mailto:security@localai.io) with a detailed report. Please do not disclose the vulnerability publicly or to any third parties before it has been addressed by us.
2. **Expect a Response:** We aim to acknowledge receipt of vulnerability reports within 48 hours. Our security team will review your report and work closely with you to understand the impact and ensure a thorough investigation.
3. **Collaboration:** If the vulnerability is accepted, we will work with you and our community to address the issue promptly. We'll keep you informed throughout the resolution process and may request additional information or collaboration.
4. **Disclosure:** Once the vulnerability has been resolved, we encourage a coordinated disclosure. We believe in transparency and will work with you to ensure that our community is informed in a responsible manner.
## Use of Third-Party Platforms
As a Free and Open Source Software (FOSS) organization, we do not offer monetary bounties. However, researchers who wish to report vulnerabilities can also do so via [Huntr](https://huntr.dev/bounties), a platform that recognizes contributions to open source security.
## Contact
For any security-related inquiries beyond vulnerability reporting, please contact us at [security@localai.io](mailto:security@localai.io).
## Acknowledgments
We appreciate the efforts of those who contribute to the security of our project. Your responsible disclosure is invaluable to the safety and integrity of LocalAI.
Thank you for helping us keep LocalAI secure.

5
aio/cpu/README.md Normal file
View File

@@ -0,0 +1,5 @@
## AIO CPU size
Use this image with CPU-only.
Please keep using only C++ backends so the base image is as small as possible (without CUDA, cuDNN, python, etc).

12
aio/cpu/embeddings.yaml Normal file
View File

@@ -0,0 +1,12 @@
name: text-embedding-ada-002
backend: bert-embeddings
parameters:
model: huggingface://mudler/all-MiniLM-L6-v2/ggml-model-q4_0.bin
usage: |
You can test this model with curl like this:
curl http://localhost:8080/embeddings -X POST -H "Content-Type: application/json" -d '{
"input": "Your text string goes here",
"model": "text-embedding-ada-002"
}'

62
aio/cpu/image-gen.yaml Normal file
View File

@@ -0,0 +1,62 @@
name: stablediffusion
backend: stablediffusion
parameters:
model: stablediffusion_assets
license: "BSD-3"
urls:
- https://github.com/EdVince/Stable-Diffusion-NCNN
- https://github.com/EdVince/Stable-Diffusion-NCNN/blob/main/LICENSE
description: |
Stable Diffusion in NCNN with c++, supported txt2img and img2img
download_files:
- filename: "stablediffusion_assets/AutoencoderKL-256-256-fp16-opt.param"
sha256: "18ca4b66685e21406bcf64c484b3b680b4949900415536d599cc876579c85c82"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/AutoencoderKL-256-256-fp16-opt.param"
- filename: "stablediffusion_assets/AutoencoderKL-512-512-fp16-opt.param"
sha256: "cf45f63aacf3dbbab0f59ed92a6f2c14d9a1801314631cd3abe91e3c85639a20"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/AutoencoderKL-512-512-fp16-opt.param"
- filename: "stablediffusion_assets/AutoencoderKL-base-fp16.param"
sha256: "0254a056dce61b0c27dc9ec1b78b53bcf55315c540f55f051eb841aa992701ba"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/AutoencoderKL-base-fp16.param"
- filename: "stablediffusion_assets/AutoencoderKL-encoder-512-512-fp16.bin"
sha256: "ddcb79a9951b9f91e05e087739ed69da2c1c4ae30ba4168cce350b49d617c9fa"
uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/AutoencoderKL-encoder-512-512-fp16.bin"
- filename: "stablediffusion_assets/AutoencoderKL-fp16.bin"
sha256: "f02e71f80e70252734724bbfaed5c4ddd3a8ed7e61bb2175ff5f53099f0e35dd"
uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/AutoencoderKL-fp16.bin"
- filename: "stablediffusion_assets/FrozenCLIPEmbedder-fp16.bin"
sha256: "1c9a12f4e1dd1b295a388045f7f28a2352a4d70c3dc96a542189a3dd7051fdd6"
uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/FrozenCLIPEmbedder-fp16.bin"
- filename: "stablediffusion_assets/FrozenCLIPEmbedder-fp16.param"
sha256: "471afbe678dd1fd3fe764ef9c6eccaccb0a7d7e601f27b462aa926b20eb368c9"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/FrozenCLIPEmbedder-fp16.param"
- filename: "stablediffusion_assets/log_sigmas.bin"
sha256: "a2089f8aa4c61f9c200feaec541ab3f5c94233b28deb6d5e8bcd974fa79b68ac"
uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/raw/main/x86/linux/assets/log_sigmas.bin"
- filename: "stablediffusion_assets/UNetModel-256-256-MHA-fp16-opt.param"
sha256: "a58c380229f09491776df837b7aa7adffc0a87821dc4708b34535da2e36e3da1"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/UNetModel-256-256-MHA-fp16-opt.param"
- filename: "stablediffusion_assets/UNetModel-512-512-MHA-fp16-opt.param"
sha256: "f12034067062827bd7f43d1d21888d1f03905401acf6c6eea22be23c259636fa"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/UNetModel-512-512-MHA-fp16-opt.param"
- filename: "stablediffusion_assets/UNetModel-base-MHA-fp16.param"
sha256: "696f6975de49f4325b53ce32aff81861a6d6c07cd9ce3f0aae2cc405350af38d"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/UNetModel-base-MHA-fp16.param"
- filename: "stablediffusion_assets/UNetModel-MHA-fp16.bin"
sha256: "d618918d011bfc1f644c0f2a33bf84931bd53b28a98492b0a8ed6f3a818852c3"
uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/UNetModel-MHA-fp16.bin"
- filename: "stablediffusion_assets/vocab.txt"
sha256: "e30e57b6f1e47616982ef898d8922be24e535b4fa3d0110477b3a6f02ebbae7d"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/vocab.txt"
usage: |
curl http://localhost:8080/v1/images/generations \
-H "Content-Type: application/json" \
-d '{
"prompt": "<positive prompt>|<negative prompt>",
"step": 25,
"size": "512x512"
}'

View File

@@ -0,0 +1,18 @@
name: whisper-1
backend: whisper
parameters:
model: ggml-whisper-base.bin
usage: |
## example audio file
wget --quiet --show-progress -O gb1.ogg https://upload.wikimedia.org/wikipedia/commons/1/1f/George_W_Bush_Columbia_FINAL.ogg
## Send the example audio file to the transcriptions endpoint
curl http://localhost:8080/v1/audio/transcriptions \
-H "Content-Type: multipart/form-data" \
-F file="@$PWD/gb1.ogg" -F model="whisper-1"
download_files:
- filename: "ggml-whisper-base.bin"
sha256: "60ed5bc3dd14eea856493d334349b405782ddcaf0028d4b5df4088345fba2efe"
uri: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin"

View File

@@ -0,0 +1,15 @@
name: tts-1
download_files:
- filename: voice-en-us-amy-low.tar.gz
uri: https://github.com/rhasspy/piper/releases/download/v0.0.2/voice-en-us-amy-low.tar.gz
parameters:
model: en-us-amy-low.onnx
usage: |
To test if this model works as expected, you can use the following curl command:
curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
"model":"voice-en-us-amy-low",
"input": "Hi, this is a test."
}'

53
aio/cpu/text-to-text.yaml Normal file
View File

@@ -0,0 +1,53 @@
name: gpt-4
mmap: true
parameters:
model: huggingface://NousResearch/Hermes-2-Pro-Mistral-7B-GGUF/Hermes-2-Pro-Mistral-7B.Q2_K.gguf
template:
chat_message: |
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
{{- if .FunctionCall }}<tool_call>{{end}}
{{- if eq .RoleName "tool" }}<tool_result>{{end }}
{{- if .Content}}
{{.Content}}
{{- end }}
{{- if .FunctionCall}}{{toJson .FunctionCall}}{{end }}
{{- if .FunctionCall }}</tool_call>{{end }}
{{- if eq .RoleName "tool" }}</tool_result>{{end }}
<|im_end|>
# https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF#prompt-format-for-function-calling
function: |
<|im_start|>system
You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
<tools>
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
</tools>
Use the following pydantic model json schema for each tool call you will make:
{'title': 'FunctionCall', 'type': 'object', 'properties': {'arguments': {'title': 'Arguments', 'type': 'object'}, 'name': {'title': 'Name', 'type': 'string'}}, 'required': ['arguments', 'name']}
For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
<tool_call>
{'arguments': <args-dict>, 'name': <function-name>}
</tool_call>
<|im_end|>
{{.Input -}}
<|im_start|>assistant
<tool_call>
chat: |
{{.Input -}}
<|im_start|>assistant
completion: |
{{.Input}}
context_size: 4096
f16: true
stopwords:
- <|im_end|>
- <dummy32000>
- "\n</tool_call>"
- "\n\n\n"
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "gpt-4",
"messages": [{"role": "user", "content": "How are you doing?", "temperature": 0.1}]
}'

31
aio/cpu/vision.yaml Normal file
View File

@@ -0,0 +1,31 @@
backend: llama-cpp
context_size: 4096
f16: true
mmap: true
name: gpt-4-vision-preview
roles:
user: "USER:"
assistant: "ASSISTANT:"
system: "SYSTEM:"
mmproj: bakllava-mmproj.gguf
parameters:
model: bakllava.gguf
template:
chat: |
A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
{{.Input}}
ASSISTANT:
download_files:
- filename: bakllava.gguf
uri: huggingface://mys/ggml_bakllava-1/ggml-model-q4_k.gguf
- filename: bakllava-mmproj.gguf
uri: huggingface://mys/ggml_bakllava-1/mmproj-model-f16.gguf
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "gpt-4-vision-preview",
"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'

138
aio/entrypoint.sh Executable file
View File

@@ -0,0 +1,138 @@
#!/bin/bash
echo "===> LocalAI All-in-One (AIO) container starting..."
GPU_ACCELERATION=false
GPU_VENDOR=""
function check_intel() {
if lspci | grep -E 'VGA|3D' | grep -iq intel; then
echo "Intel GPU detected"
if [ -d /opt/intel ]; then
GPU_ACCELERATION=true
GPU_VENDOR=intel
else
echo "Intel GPU detected, but Intel GPU drivers are not installed. GPU acceleration will not be available."
fi
fi
}
function check_nvidia_wsl() {
if lspci | grep -E 'VGA|3D' | grep -iq "Microsoft Corporation Device 008e"; then
# We make the assumption this WSL2 cars is NVIDIA, then check for nvidia-smi
# Make sure the container was run with `--gpus all` as the only required parameter
echo "NVIDIA GPU detected via WSL2"
# nvidia-smi should be installed in the container
if nvidia-smi; then
GPU_ACCELERATION=true
GPU_VENDOR=nvidia
else
echo "NVIDIA GPU detected via WSL2, but nvidia-smi is not installed. GPU acceleration will not be available."
fi
fi
}
function check_amd() {
if lspci | grep -E 'VGA|3D' | grep -iq amd; then
echo "AMD GPU detected"
# Check if ROCm is installed
if [ -d /opt/rocm ]; then
GPU_ACCELERATION=true
GPU_VENDOR=amd
else
echo "AMD GPU detected, but ROCm is not installed. GPU acceleration will not be available."
fi
fi
}
function check_nvidia() {
if lspci | grep -E 'VGA|3D' | grep -iq nvidia; then
echo "NVIDIA GPU detected"
# nvidia-smi should be installed in the container
if nvidia-smi; then
GPU_ACCELERATION=true
GPU_VENDOR=nvidia
else
echo "NVIDIA GPU detected, but nvidia-smi is not installed. GPU acceleration will not be available."
fi
fi
}
function check_metal() {
if system_profiler SPDisplaysDataType | grep -iq 'Metal'; then
echo "Apple Metal supported GPU detected"
GPU_ACCELERATION=true
GPU_VENDOR=apple
fi
}
function detect_gpu() {
case "$(uname -s)" in
Linux)
check_nvidia
check_amd
check_intel
check_nvidia_wsl
;;
Darwin)
check_metal
;;
esac
}
function detect_gpu_size() {
# Attempting to find GPU memory size for NVIDIA GPUs
if [ "$GPU_ACCELERATION" = true ] && [ "$GPU_VENDOR" = "nvidia" ]; then
echo "NVIDIA GPU detected. Attempting to find memory size..."
# Using head -n 1 to get the total memory of the 1st NVIDIA GPU detected.
# If handling multiple GPUs is required in the future, this is the place to do it
nvidia_sm=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits | head -n 1)
if [ ! -z "$nvidia_sm" ]; then
echo "Total GPU Memory: $nvidia_sm MiB"
# if bigger than 8GB, use 16GB
#if [ "$nvidia_sm" -gt 8192 ]; then
# GPU_SIZE=gpu-16g
#else
GPU_SIZE=gpu-8g
#fi
else
echo "Unable to determine NVIDIA GPU memory size. Falling back to CPU."
GPU_SIZE=gpu-8g
fi
elif [ "$GPU_ACCELERATION" = true ] && [ "$GPU_VENDOR" = "intel" ]; then
GPU_SIZE=intel
# Default to a generic GPU size until we implement GPU size detection for non NVIDIA GPUs
elif [ "$GPU_ACCELERATION" = true ]; then
echo "Non-NVIDIA GPU detected. Specific GPU memory size detection is not implemented."
GPU_SIZE=gpu-8g
# default to cpu if GPU_SIZE is not set
else
echo "GPU acceleration is not enabled or supported. Defaulting to CPU."
GPU_SIZE=cpu
fi
}
function check_vars() {
if [ -z "$MODELS" ]; then
echo "MODELS environment variable is not set. Please set it to a comma-separated list of model YAML files to load."
exit 1
fi
if [ -z "$PROFILE" ]; then
echo "PROFILE environment variable is not set. Please set it to one of the following: cpu, gpu-8g, gpu-16g, apple"
exit 1
fi
}
detect_gpu
detect_gpu_size
PROFILE="${PROFILE:-$GPU_SIZE}" # default to cpu
export MODELS="${MODELS:-/aio/${PROFILE}/embeddings.yaml,/aio/${PROFILE}/text-to-speech.yaml,/aio/${PROFILE}/image-gen.yaml,/aio/${PROFILE}/text-to-text.yaml,/aio/${PROFILE}/speech-to-text.yaml,/aio/${PROFILE}/vision.yaml}"
check_vars
echo "===> Starting LocalAI[$PROFILE] with the following models: $MODELS"
exec /build/entrypoint.sh "$@"

View File

@@ -0,0 +1,12 @@
name: text-embedding-ada-002
backend: sentencetransformers
parameters:
model: all-MiniLM-L6-v2
usage: |
You can test this model with curl like this:
curl http://localhost:8080/embeddings -X POST -H "Content-Type: application/json" -d '{
"input": "Your text string goes here",
"model": "text-embedding-ada-002"
}'

25
aio/gpu-8g/image-gen.yaml Normal file
View File

@@ -0,0 +1,25 @@
name: stablediffusion
parameters:
model: DreamShaper_8_pruned.safetensors
backend: diffusers
step: 25
f16: true
diffusers:
pipeline_type: StableDiffusionPipeline
cuda: true
enable_parameters: "negative_prompt,num_inference_steps"
scheduler_type: "k_dpmpp_2m"
download_files:
- filename: DreamShaper_8_pruned.safetensors
uri: huggingface://Lykon/DreamShaper/DreamShaper_8_pruned.safetensors
usage: |
curl http://localhost:8080/v1/images/generations \
-H "Content-Type: application/json" \
-d '{
"prompt": "<positive prompt>|<negative prompt>",
"step": 25,
"size": "512x512"
}'

View File

@@ -0,0 +1,18 @@
name: whisper-1
backend: whisper
parameters:
model: ggml-whisper-base.bin
usage: |
## example audio file
wget --quiet --show-progress -O gb1.ogg https://upload.wikimedia.org/wikipedia/commons/1/1f/George_W_Bush_Columbia_FINAL.ogg
## Send the example audio file to the transcriptions endpoint
curl http://localhost:8080/v1/audio/transcriptions \
-H "Content-Type: multipart/form-data" \
-F file="@$PWD/gb1.ogg" -F model="whisper-1"
download_files:
- filename: "ggml-whisper-base.bin"
sha256: "60ed5bc3dd14eea856493d334349b405782ddcaf0028d4b5df4088345fba2efe"
uri: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin"

View File

@@ -0,0 +1,15 @@
name: tts-1
download_files:
- filename: voice-en-us-amy-low.tar.gz
uri: https://github.com/rhasspy/piper/releases/download/v0.0.2/voice-en-us-amy-low.tar.gz
parameters:
model: en-us-amy-low.onnx
usage: |
To test if this model works as expected, you can use the following curl command:
curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
"model":"tts-1",
"input": "Hi, this is a test."
}'

View File

@@ -0,0 +1,53 @@
name: gpt-4
mmap: true
parameters:
model: huggingface://NousResearch/Hermes-2-Pro-Mistral-7B-GGUF/Hermes-2-Pro-Mistral-7B.Q6_K.gguf
template:
chat_message: |
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
{{- if .FunctionCall }}<tool_call>{{end}}
{{- if eq .RoleName "tool" }}<tool_result>{{end }}
{{- if .Content}}
{{.Content}}
{{- end }}
{{- if .FunctionCall}}{{toJson .FunctionCall}}{{end }}
{{- if .FunctionCall }}</tool_call>{{end }}
{{- if eq .RoleName "tool" }}</tool_result>{{end }}
<|im_end|>
# https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF#prompt-format-for-function-calling
function: |
<|im_start|>system
You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
<tools>
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
</tools>
Use the following pydantic model json schema for each tool call you will make:
{'title': 'FunctionCall', 'type': 'object', 'properties': {'arguments': {'title': 'Arguments', 'type': 'object'}, 'name': {'title': 'Name', 'type': 'string'}}, 'required': ['arguments', 'name']}
For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
<tool_call>
{'arguments': <args-dict>, 'name': <function-name>}
</tool_call>
<|im_end|>
{{.Input -}}
<|im_start|>assistant
<tool_call>
chat: |
{{.Input -}}
<|im_start|>assistant
completion: |
{{.Input}}
context_size: 4096
f16: true
stopwords:
- <|im_end|>
- <dummy32000>
- "\n</tool_call>"
- "\n\n\n"
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "gpt-4",
"messages": [{"role": "user", "content": "How are you doing?", "temperature": 0.1}]
}'

35
aio/gpu-8g/vision.yaml Normal file
View File

@@ -0,0 +1,35 @@
backend: llama-cpp
context_size: 4096
f16: true
mmap: true
name: gpt-4-vision-preview
roles:
user: "USER:"
assistant: "ASSISTANT:"
system: "SYSTEM:"
mmproj: llava-v1.6-7b-mmproj-f16.gguf
parameters:
model: llava-v1.6-mistral-7b.Q5_K_M.gguf
temperature: 0.2
top_k: 40
top_p: 0.95
seed: -1
template:
chat: |
A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
{{.Input}}
ASSISTANT:
download_files:
- filename: llava-v1.6-mistral-7b.Q5_K_M.gguf
uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/llava-v1.6-mistral-7b.Q5_K_M.gguf
- filename: llava-v1.6-7b-mmproj-f16.gguf
uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/mmproj-model-f16.gguf
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "gpt-4-vision-preview",
"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'

12
aio/intel/embeddings.yaml Normal file
View File

@@ -0,0 +1,12 @@
name: text-embedding-ada-002
backend: sentencetransformers
parameters:
model: all-MiniLM-L6-v2
usage: |
You can test this model with curl like this:
curl http://localhost:8080/embeddings -X POST -H "Content-Type: application/json" -d '{
"input": "Your text string goes here",
"model": "text-embedding-ada-002"
}'

20
aio/intel/image-gen.yaml Normal file
View File

@@ -0,0 +1,20 @@
name: stablediffusion
parameters:
model: runwayml/stable-diffusion-v1-5
backend: diffusers
step: 25
f16: true
diffusers:
pipeline_type: StableDiffusionPipeline
cuda: true
enable_parameters: "negative_prompt,num_inference_steps"
scheduler_type: "k_dpmpp_2m"
usage: |
curl http://localhost:8080/v1/images/generations \
-H "Content-Type: application/json" \
-d '{
"prompt": "<positive prompt>|<negative prompt>",
"step": 25,
"size": "512x512"
}'

View File

@@ -0,0 +1,18 @@
name: whisper-1
backend: whisper
parameters:
model: ggml-whisper-base.bin
usage: |
## example audio file
wget --quiet --show-progress -O gb1.ogg https://upload.wikimedia.org/wikipedia/commons/1/1f/George_W_Bush_Columbia_FINAL.ogg
## Send the example audio file to the transcriptions endpoint
curl http://localhost:8080/v1/audio/transcriptions \
-H "Content-Type: multipart/form-data" \
-F file="@$PWD/gb1.ogg" -F model="whisper-1"
download_files:
- filename: "ggml-whisper-base.bin"
sha256: "60ed5bc3dd14eea856493d334349b405782ddcaf0028d4b5df4088345fba2efe"
uri: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin"

View File

@@ -0,0 +1,15 @@
name: tts-1
download_files:
- filename: voice-en-us-amy-low.tar.gz
uri: https://github.com/rhasspy/piper/releases/download/v0.0.2/voice-en-us-amy-low.tar.gz
parameters:
model: en-us-amy-low.onnx
usage: |
To test if this model works as expected, you can use the following curl command:
curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
"model":"tts-1",
"input": "Hi, this is a test."
}'

View File

@@ -0,0 +1,53 @@
name: gpt-4
mmap: false
f16: false
parameters:
model: huggingface://NousResearch/Hermes-2-Pro-Mistral-7B-GGUF/Hermes-2-Pro-Mistral-7B.Q6_K.gguf
template:
chat_message: |
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
{{- if .FunctionCall }}<tool_call>{{end}}
{{- if eq .RoleName "tool" }}<tool_result>{{end }}
{{- if .Content}}
{{.Content}}
{{- end }}
{{- if .FunctionCall}}{{toJson .FunctionCall}}{{end }}
{{- if .FunctionCall }}</tool_call>{{end }}
{{- if eq .RoleName "tool" }}</tool_result>{{end }}
<|im_end|>
# https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF#prompt-format-for-function-calling
function: |
<|im_start|>system
You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
<tools>
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
</tools>
Use the following pydantic model json schema for each tool call you will make:
{'title': 'FunctionCall', 'type': 'object', 'properties': {'arguments': {'title': 'Arguments', 'type': 'object'}, 'name': {'title': 'Name', 'type': 'string'}}, 'required': ['arguments', 'name']}
For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
<tool_call>
{'arguments': <args-dict>, 'name': <function-name>}
</tool_call>
<|im_end|>
{{.Input -}}
<|im_start|>assistant
<tool_call>
chat: |
{{.Input -}}
<|im_start|>assistant
completion: |
{{.Input}}
context_size: 4096
stopwords:
- <|im_end|>
- "\n</tool_call>"
- <dummy32000>
- "\n\n\n"
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "gpt-4",
"messages": [{"role": "user", "content": "How are you doing?", "temperature": 0.1}]
}'

35
aio/intel/vision.yaml Normal file
View File

@@ -0,0 +1,35 @@
backend: llama-cpp
context_size: 4096
mmap: false
f16: false
name: gpt-4-vision-preview
roles:
user: "USER:"
assistant: "ASSISTANT:"
system: "SYSTEM:"
mmproj: llava-v1.6-7b-mmproj-f16.gguf
parameters:
model: llava-v1.6-mistral-7b.Q5_K_M.gguf
temperature: 0.2
top_k: 40
top_p: 0.95
seed: -1
template:
chat: |
A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
{{.Input}}
ASSISTANT:
download_files:
- filename: llava-v1.6-mistral-7b.Q5_K_M.gguf
uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/llava-v1.6-mistral-7b.Q5_K_M.gguf
- filename: llava-v1.6-7b-mmproj-f16.gguf
uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/mmproj-model-f16.gguf
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "gpt-4-vision-preview",
"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'

View File

@@ -1,162 +0,0 @@
package localai
import (
"context"
"fmt"
"strings"
config "github.com/go-skynet/LocalAI/core/config"
"github.com/go-skynet/LocalAI/pkg/grpc/proto"
"github.com/go-skynet/LocalAI/core/options"
"github.com/gofiber/fiber/v2"
"github.com/rs/zerolog/log"
gopsutil "github.com/shirou/gopsutil/v3/process"
)
type BackendMonitorRequest struct {
Model string `json:"model" yaml:"model"`
}
type BackendMonitorResponse struct {
MemoryInfo *gopsutil.MemoryInfoStat
MemoryPercent float32
CPUPercent float64
}
type BackendMonitor struct {
configLoader *config.ConfigLoader
options *options.Option // Taking options in case we need to inspect ExternalGRPCBackends, though that's out of scope for now, hence the name.
}
func NewBackendMonitor(configLoader *config.ConfigLoader, options *options.Option) BackendMonitor {
return BackendMonitor{
configLoader: configLoader,
options: options,
}
}
func (bm *BackendMonitor) SampleLocalBackendProcess(model string) (*BackendMonitorResponse, error) {
config, exists := bm.configLoader.GetConfig(model)
var backend string
if exists {
backend = config.Model
} else {
// Last ditch effort: use it raw, see if a backend happens to match.
backend = model
}
if !strings.HasSuffix(backend, ".bin") {
backend = fmt.Sprintf("%s.bin", backend)
}
pid, err := bm.options.Loader.GetGRPCPID(backend)
if err != nil {
log.Error().Msgf("model %s : failed to find pid %+v", model, err)
return nil, err
}
// Name is slightly frightening but this does _not_ create a new process, rather it looks up an existing process by PID.
backendProcess, err := gopsutil.NewProcess(int32(pid))
if err != nil {
log.Error().Msgf("model %s [PID %d] : error getting process info %+v", model, pid, err)
return nil, err
}
memInfo, err := backendProcess.MemoryInfo()
if err != nil {
log.Error().Msgf("model %s [PID %d] : error getting memory info %+v", model, pid, err)
return nil, err
}
memPercent, err := backendProcess.MemoryPercent()
if err != nil {
log.Error().Msgf("model %s [PID %d] : error getting memory percent %+v", model, pid, err)
return nil, err
}
cpuPercent, err := backendProcess.CPUPercent()
if err != nil {
log.Error().Msgf("model %s [PID %d] : error getting cpu percent %+v", model, pid, err)
return nil, err
}
return &BackendMonitorResponse{
MemoryInfo: memInfo,
MemoryPercent: memPercent,
CPUPercent: cpuPercent,
}, nil
}
func (bm BackendMonitor) getModelLoaderIDFromCtx(c *fiber.Ctx) (string, error) {
input := new(BackendMonitorRequest)
// Get input data from the request body
if err := c.BodyParser(input); err != nil {
return "", err
}
config, exists := bm.configLoader.GetConfig(input.Model)
var backendId string
if exists {
backendId = config.Model
} else {
// Last ditch effort: use it raw, see if a backend happens to match.
backendId = input.Model
}
if !strings.HasSuffix(backendId, ".bin") {
backendId = fmt.Sprintf("%s.bin", backendId)
}
return backendId, nil
}
func BackendMonitorEndpoint(bm BackendMonitor) func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
backendId, err := bm.getModelLoaderIDFromCtx(c)
if err != nil {
return err
}
model := bm.options.Loader.CheckIsLoaded(backendId)
if model == "" {
return fmt.Errorf("backend %s is not currently loaded", backendId)
}
status, rpcErr := model.GRPC(false, nil).Status(context.TODO())
if rpcErr != nil {
log.Warn().Msgf("backend %s experienced an error retrieving status info: %s", backendId, rpcErr.Error())
val, slbErr := bm.SampleLocalBackendProcess(backendId)
if slbErr != nil {
return fmt.Errorf("backend %s experienced an error retrieving status info via rpc: %s, then failed local node process sample: %s", backendId, rpcErr.Error(), slbErr.Error())
}
return c.JSON(proto.StatusResponse{
State: proto.StatusResponse_ERROR,
Memory: &proto.MemoryUsageData{
Total: val.MemoryInfo.VMS,
Breakdown: map[string]uint64{
"gopsutil-RSS": val.MemoryInfo.RSS,
},
},
})
}
return c.JSON(status)
}
}
func BackendShutdownEndpoint(bm BackendMonitor) func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
backendId, err := bm.getModelLoaderIDFromCtx(c)
if err != nil {
return err
}
return bm.options.Loader.ShutdownModel(backendId)
}
}

View File

@@ -1,326 +0,0 @@
package localai
import (
"context"
"fmt"
"os"
"slices"
"strings"
"sync"
json "github.com/json-iterator/go"
"gopkg.in/yaml.v3"
config "github.com/go-skynet/LocalAI/core/config"
"github.com/go-skynet/LocalAI/pkg/gallery"
"github.com/go-skynet/LocalAI/pkg/utils"
"github.com/gofiber/fiber/v2"
"github.com/google/uuid"
"github.com/rs/zerolog/log"
)
type galleryOp struct {
req gallery.GalleryModel
id string
galleries []gallery.Gallery
galleryName string
}
type galleryOpStatus struct {
FileName string `json:"file_name"`
Error error `json:"error"`
Processed bool `json:"processed"`
Message string `json:"message"`
Progress float64 `json:"progress"`
TotalFileSize string `json:"file_size"`
DownloadedFileSize string `json:"downloaded_size"`
}
type galleryApplier struct {
modelPath string
sync.Mutex
C chan galleryOp
statuses map[string]*galleryOpStatus
}
func NewGalleryService(modelPath string) *galleryApplier {
return &galleryApplier{
modelPath: modelPath,
C: make(chan galleryOp),
statuses: make(map[string]*galleryOpStatus),
}
}
func prepareModel(modelPath string, req gallery.GalleryModel, cm *config.ConfigLoader, downloadStatus func(string, string, string, float64)) error {
config, err := gallery.GetGalleryConfigFromURL(req.URL)
if err != nil {
return err
}
config.Files = append(config.Files, req.AdditionalFiles...)
return gallery.InstallModel(modelPath, req.Name, &config, req.Overrides, downloadStatus)
}
func (g *galleryApplier) updateStatus(s string, op *galleryOpStatus) {
g.Lock()
defer g.Unlock()
g.statuses[s] = op
}
func (g *galleryApplier) getStatus(s string) *galleryOpStatus {
g.Lock()
defer g.Unlock()
return g.statuses[s]
}
func (g *galleryApplier) getAllStatus() map[string]*galleryOpStatus {
g.Lock()
defer g.Unlock()
return g.statuses
}
func (g *galleryApplier) Start(c context.Context, cm *config.ConfigLoader) {
go func() {
for {
select {
case <-c.Done():
return
case op := <-g.C:
utils.ResetDownloadTimers()
g.updateStatus(op.id, &galleryOpStatus{Message: "processing", Progress: 0})
// updates the status with an error
updateError := func(e error) {
g.updateStatus(op.id, &galleryOpStatus{Error: e, Processed: true, Message: "error: " + e.Error()})
}
// displayDownload displays the download progress
progressCallback := func(fileName string, current string, total string, percentage float64) {
g.updateStatus(op.id, &galleryOpStatus{Message: "processing", FileName: fileName, Progress: percentage, TotalFileSize: total, DownloadedFileSize: current})
utils.DisplayDownloadFunction(fileName, current, total, percentage)
}
var err error
// if the request contains a gallery name, we apply the gallery from the gallery list
if op.galleryName != "" {
if strings.Contains(op.galleryName, "@") {
err = gallery.InstallModelFromGallery(op.galleries, op.galleryName, g.modelPath, op.req, progressCallback)
} else {
err = gallery.InstallModelFromGalleryByName(op.galleries, op.galleryName, g.modelPath, op.req, progressCallback)
}
} else {
err = prepareModel(g.modelPath, op.req, cm, progressCallback)
}
if err != nil {
updateError(err)
continue
}
// Reload models
err = cm.LoadConfigs(g.modelPath)
if err != nil {
updateError(err)
continue
}
err = cm.Preload(g.modelPath)
if err != nil {
updateError(err)
continue
}
g.updateStatus(op.id, &galleryOpStatus{Processed: true, Message: "completed", Progress: 100})
}
}
}()
}
type galleryModel struct {
gallery.GalleryModel `yaml:",inline"` // https://github.com/go-yaml/yaml/issues/63
ID string `json:"id"`
}
func processRequests(modelPath, s string, cm *config.ConfigLoader, galleries []gallery.Gallery, requests []galleryModel) error {
var err error
for _, r := range requests {
utils.ResetDownloadTimers()
if r.ID == "" {
err = prepareModel(modelPath, r.GalleryModel, cm, utils.DisplayDownloadFunction)
} else {
if strings.Contains(r.ID, "@") {
err = gallery.InstallModelFromGallery(
galleries, r.ID, modelPath, r.GalleryModel, utils.DisplayDownloadFunction)
} else {
err = gallery.InstallModelFromGalleryByName(
galleries, r.ID, modelPath, r.GalleryModel, utils.DisplayDownloadFunction)
}
}
}
return err
}
func ApplyGalleryFromFile(modelPath, s string, cm *config.ConfigLoader, galleries []gallery.Gallery) error {
dat, err := os.ReadFile(s)
if err != nil {
return err
}
var requests []galleryModel
if err := yaml.Unmarshal(dat, &requests); err != nil {
return err
}
return processRequests(modelPath, s, cm, galleries, requests)
}
func ApplyGalleryFromString(modelPath, s string, cm *config.ConfigLoader, galleries []gallery.Gallery) error {
var requests []galleryModel
err := json.Unmarshal([]byte(s), &requests)
if err != nil {
return err
}
return processRequests(modelPath, s, cm, galleries, requests)
}
/// Endpoint Service
type ModelGalleryService struct {
galleries []gallery.Gallery
modelPath string
galleryApplier *galleryApplier
}
type GalleryModel struct {
ID string `json:"id"`
gallery.GalleryModel
}
func CreateModelGalleryService(galleries []gallery.Gallery, modelPath string, galleryApplier *galleryApplier) ModelGalleryService {
return ModelGalleryService{
galleries: galleries,
modelPath: modelPath,
galleryApplier: galleryApplier,
}
}
func (mgs *ModelGalleryService) GetOpStatusEndpoint() func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
status := mgs.galleryApplier.getStatus(c.Params("uuid"))
if status == nil {
return fmt.Errorf("could not find any status for ID")
}
return c.JSON(status)
}
}
func (mgs *ModelGalleryService) GetAllStatusEndpoint() func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
return c.JSON(mgs.galleryApplier.getAllStatus())
}
}
func (mgs *ModelGalleryService) ApplyModelGalleryEndpoint() func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
input := new(GalleryModel)
// Get input data from the request body
if err := c.BodyParser(input); err != nil {
return err
}
uuid, err := uuid.NewUUID()
if err != nil {
return err
}
mgs.galleryApplier.C <- galleryOp{
req: input.GalleryModel,
id: uuid.String(),
galleryName: input.ID,
galleries: mgs.galleries,
}
return c.JSON(struct {
ID string `json:"uuid"`
StatusURL string `json:"status"`
}{ID: uuid.String(), StatusURL: c.BaseURL() + "/models/jobs/" + uuid.String()})
}
}
func (mgs *ModelGalleryService) ListModelFromGalleryEndpoint() func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
log.Debug().Msgf("Listing models from galleries: %+v", mgs.galleries)
models, err := gallery.AvailableGalleryModels(mgs.galleries, mgs.modelPath)
if err != nil {
return err
}
log.Debug().Msgf("Models found from galleries: %+v", models)
for _, m := range models {
log.Debug().Msgf("Model found from galleries: %+v", m)
}
dat, err := json.Marshal(models)
if err != nil {
return err
}
return c.Send(dat)
}
}
// NOTE: This is different (and much simpler!) than above! This JUST lists the model galleries that have been loaded, not their contents!
func (mgs *ModelGalleryService) ListModelGalleriesEndpoint() func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
log.Debug().Msgf("Listing model galleries %+v", mgs.galleries)
dat, err := json.Marshal(mgs.galleries)
if err != nil {
return err
}
return c.Send(dat)
}
}
func (mgs *ModelGalleryService) AddModelGalleryEndpoint() func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
input := new(gallery.Gallery)
// Get input data from the request body
if err := c.BodyParser(input); err != nil {
return err
}
if slices.ContainsFunc(mgs.galleries, func(gallery gallery.Gallery) bool {
return gallery.Name == input.Name
}) {
return fmt.Errorf("%s already exists", input.Name)
}
dat, err := json.Marshal(mgs.galleries)
if err != nil {
return err
}
log.Debug().Msgf("Adding %+v to gallery list", *input)
mgs.galleries = append(mgs.galleries, *input)
return c.Send(dat)
}
}
func (mgs *ModelGalleryService) RemoveModelGalleryEndpoint() func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
input := new(gallery.Gallery)
// Get input data from the request body
if err := c.BodyParser(input); err != nil {
return err
}
if !slices.ContainsFunc(mgs.galleries, func(gallery gallery.Gallery) bool {
return gallery.Name == input.Name
}) {
return fmt.Errorf("%s is not currently registered", input.Name)
}
mgs.galleries = slices.DeleteFunc(mgs.galleries, func(gallery gallery.Gallery) bool {
return gallery.Name == input.Name
})
return c.Send(nil)
}
}

View File

@@ -1,53 +0,0 @@
package localai
import (
fiberContext "github.com/go-skynet/LocalAI/api/ctx"
"github.com/go-skynet/LocalAI/core/backend"
config "github.com/go-skynet/LocalAI/core/config"
"github.com/rs/zerolog/log"
"github.com/go-skynet/LocalAI/core/options"
"github.com/gofiber/fiber/v2"
)
type TTSRequest struct {
Model string `json:"model" yaml:"model"`
Input string `json:"input" yaml:"input"`
Backend string `json:"backend" yaml:"backend"`
}
func TTSEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
input := new(TTSRequest)
// Get input data from the request body
if err := c.BodyParser(input); err != nil {
return err
}
modelFile, err := fiberContext.ModelFromContext(c, o.Loader, input.Model, false)
if err != nil {
modelFile = input.Model
log.Warn().Msgf("Model not found in context: %s", input.Model)
}
cfg, err := config.Load(modelFile, o.Loader.ModelPath, cm, false, 0, 0, false)
if err != nil {
modelFile = input.Model
log.Warn().Msgf("Model not found in context: %s", input.Model)
} else {
modelFile = cfg.Model
}
log.Debug().Msgf("Request for model: %s", modelFile)
if input.Backend != "" {
cfg.Backend = input.Backend
}
filePath, _, err := backend.ModelTTS(cfg.Backend, input.Input, modelFile, o.Loader, o, *cfg)
if err != nil {
return err
}
return c.Download(filePath)
}
}

View File

@@ -18,6 +18,48 @@ service Backend {
rpc TTS(TTSRequest) returns (Result) {}
rpc TokenizeString(PredictOptions) returns (TokenizationResponse) {}
rpc Status(HealthMessage) returns (StatusResponse) {}
rpc StoresSet(StoresSetOptions) returns (Result) {}
rpc StoresDelete(StoresDeleteOptions) returns (Result) {}
rpc StoresGet(StoresGetOptions) returns (StoresGetResult) {}
rpc StoresFind(StoresFindOptions) returns (StoresFindResult) {}
}
message StoresKey {
repeated float Floats = 1;
}
message StoresValue {
bytes Bytes = 1;
}
message StoresSetOptions {
repeated StoresKey Keys = 1;
repeated StoresValue Values = 2;
}
message StoresDeleteOptions {
repeated StoresKey Keys = 1;
}
message StoresGetOptions {
repeated StoresKey Keys = 1;
}
message StoresGetResult {
repeated StoresKey Keys = 1;
repeated StoresValue Values = 2;
}
message StoresFindOptions {
StoresKey Key = 1;
int32 TopK = 2;
}
message StoresFindResult {
repeated StoresKey Keys = 1;
repeated StoresValue Values = 2;
repeated float Similarities = 3;
}
message HealthMessage {}
@@ -121,11 +163,16 @@ message ModelOptions {
bool NoMulMatQ = 37;
string DraftModel = 39;
string AudioPath = 38;
// vllm
string Quantization = 40;
float GPUMemoryUtilization = 50;
bool TrustRemoteCode = 51;
bool EnforceEager = 52;
int32 SwapSpace = 53;
int32 MaxModelLen = 54;
string MMProj = 41;
@@ -186,6 +233,7 @@ message TTSRequest {
string text = 1;
string model = 2;
string dst = 3;
string voice = 4;
}
message TokenizationResponse {
@@ -207,4 +255,4 @@ message StatusResponse {
}
State state = 1;
MemoryUsageData memory = 2;
}
}

View File

@@ -48,7 +48,7 @@ $(INSTALLED_PACKAGES): grpc_build
$(GRPC_REPO):
git clone --depth $(GIT_CLONE_DEPTH) -b $(TAG_LIB_GRPC) $(GIT_REPO_LIB_GRPC) $(GRPC_REPO)/grpc
cd $(GRPC_REPO)/grpc && git submodule update --init --recursive --depth $(GIT_CLONE_DEPTH)
cd $(GRPC_REPO)/grpc && git submodule update --jobs 2 --init --recursive --depth $(GIT_CLONE_DEPTH)
$(GRPC_BUILD): $(GRPC_REPO)
mkdir -p $(GRPC_BUILD)

View File

@@ -12,12 +12,18 @@ ifeq ($(BUILD_TYPE),cublas)
# to CMAKE_ARGS automatically
else ifeq ($(BUILD_TYPE),openblas)
CMAKE_ARGS+=-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
# If build type is clblast (openCL) we set -DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
else ifeq ($(BUILD_TYPE),clblast)
# If build type is clblas (openCL) we set -DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
else ifeq ($(BUILD_TYPE),clblas)
CMAKE_ARGS+=-DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
# If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++
else ifeq ($(BUILD_TYPE),hipblas)
CMAKE_ARGS+=-DLLAMA_HIPBLAS=ON
# If it's OSX, DO NOT embed the metal library - -DLLAMA_METAL_EMBED_LIBRARY=ON requires further investigation
# But if it's OSX without metal, disable it here
else ifeq ($(OS),darwin)
ifneq ($(BUILD_TYPE),metal)
CMAKE_ARGS+=-DLLAMA_METAL=OFF
endif
endif
ifeq ($(BUILD_TYPE),sycl_f16)
@@ -35,7 +41,7 @@ llama.cpp:
fi
cd llama.cpp && git checkout -b build $(LLAMA_VERSION) && git submodule update --init --recursive --depth 1
llama.cpp/examples/grpc-server:
llama.cpp/examples/grpc-server: llama.cpp
mkdir -p llama.cpp/examples/grpc-server
cp -r $(abspath ./)/CMakeLists.txt llama.cpp/examples/grpc-server/
cp -r $(abspath ./)/grpc-server.cpp llama.cpp/examples/grpc-server/

View File

@@ -58,9 +58,11 @@ struct server_params
int32_t read_timeout = 600;
int32_t write_timeout = 600;
bool slots_endpoint = true;
bool metrics_endpoint = false;
};
bool server_verbose = false;
bool server_log_json = true;
static size_t common_part(const std::vector<llama_token> &a, const std::vector<llama_token> &b)
{
@@ -316,12 +318,76 @@ struct llama_client_slot
}
void print_timings() const {
LOG_TEE("\n");
LOG_TEE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
__func__, t_prompt_processing, num_prompt_tokens_processed, t_prompt_processing / num_prompt_tokens_processed, 1e3 / t_prompt_processing * num_prompt_tokens_processed);
LOG_TEE("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
__func__, t_token_generation, n_decoded,t_token_generation / n_decoded, 1e3 / t_token_generation * n_decoded);
LOG_TEE("%s: total time = %10.2f ms\n", __func__, t_prompt_processing + t_token_generation);
char buffer[512];
double t_token = t_prompt_processing / num_prompt_tokens_processed;
double n_tokens_second = 1e3 / t_prompt_processing * num_prompt_tokens_processed;
sprintf(buffer, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
t_prompt_processing, num_prompt_tokens_processed,
t_token, n_tokens_second);
LOG_INFO(buffer, {
{"slot_id", id},
{"task_id", task_id},
{"t_prompt_processing", t_prompt_processing},
{"num_prompt_tokens_processed", num_prompt_tokens_processed},
{"t_token", t_token},
{"n_tokens_second", n_tokens_second},
});
t_token = t_token_generation / n_decoded;
n_tokens_second = 1e3 / t_token_generation * n_decoded;
sprintf(buffer, "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)",
t_token_generation, n_decoded,
t_token, n_tokens_second);
LOG_INFO(buffer, {
{"slot_id", id},
{"task_id", task_id},
{"t_token_generation", t_token_generation},
{"n_decoded", n_decoded},
{"t_token", t_token},
{"n_tokens_second", n_tokens_second},
});
sprintf(buffer, " total time = %10.2f ms", t_prompt_processing + t_token_generation);
LOG_INFO(buffer, {
{"slot_id", id},
{"task_id", task_id},
{"t_prompt_processing", t_prompt_processing},
{"t_token_generation", t_token_generation},
{"t_total", t_prompt_processing + t_token_generation},
});
}
};
struct llama_metrics {
uint64_t n_prompt_tokens_processed_total = 0;
uint64_t n_tokens_predicted_total = 0;
uint64_t n_prompt_tokens_processed = 0;
uint64_t t_prompt_processing = 0;
uint64_t n_tokens_predicted = 0;
uint64_t t_tokens_generation = 0;
void on_prompt_eval(const llama_client_slot &slot) {
n_prompt_tokens_processed_total += slot.num_prompt_tokens_processed;
n_prompt_tokens_processed += slot.num_prompt_tokens_processed;
t_prompt_processing += slot.t_prompt_processing;
}
void on_prediction(const llama_client_slot &slot) {
n_tokens_predicted_total += slot.n_decoded;
n_tokens_predicted += slot.n_decoded;
t_tokens_generation += slot.t_token_generation;
}
void reset_bucket() {
n_prompt_tokens_processed = 0;
t_prompt_processing = 0;
n_tokens_predicted = 0;
t_tokens_generation = 0;
}
};
@@ -359,6 +425,8 @@ struct llama_server_context
llama_server_queue queue_tasks;
llama_server_response queue_results;
llama_metrics metrics;
~llama_server_context()
{
if (ctx)
@@ -378,7 +446,7 @@ struct llama_server_context
params = params_;
if (!params.mmproj.empty()) {
multimodal = true;
LOG_TEE("Multi Modal Mode Enabled");
LOG_INFO("Multi Modal Mode Enabled", {});
clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
if(clp_ctx == nullptr) {
LOG_ERROR("unable to load clip model", {{"model", params.mmproj}});
@@ -415,13 +483,23 @@ struct llama_server_context
return true;
}
void validate_model_chat_template(server_params & sparams) {
llama_chat_message chat[] = {{"user", "test"}};
std::vector<char> buf(1);
int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size());
if (res < 0) {
LOG_ERROR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
sparams.chat_template = "<|im_start|>"; // llama_chat_apply_template only checks if <|im_start|> exist in the template
}
}
void initialize() {
// create slots
all_slots_are_idle = true;
const int32_t n_ctx_slot = n_ctx / params.n_parallel;
LOG_TEE("Available slots:\n");
LOG_INFO("initializing slots", {{"n_slots", params.n_parallel}});
for (int i = 0; i < params.n_parallel; i++)
{
llama_client_slot slot;
@@ -430,7 +508,10 @@ struct llama_server_context
slot.n_ctx = n_ctx_slot;
slot.n_predict = params.n_predict;
LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot);
LOG_INFO("new slot", {
{"slot_id", slot.id},
{"n_ctx_slot", slot.n_ctx}
});
const int ga_n = params.grp_attn_n;
const int ga_w = params.grp_attn_w;
@@ -440,7 +521,12 @@ struct llama_server_context
GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT
//GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT
//GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT
LOG_TEE(" -> Slot %i - self-extend: ga_n = %d, ga_w = %d\n", slot.id, ga_n, ga_w);
LOG_INFO("slot self-extend", {
{"slot_id", slot.id},
{"ga_n", ga_n},
{"ga_w", ga_w}
});
}
slot.ga_i = 0;
@@ -726,10 +812,16 @@ struct llama_server_context
img_sl.img_data = clip_image_u8_init();
if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
{
LOG_TEE("slot %i - failed to load image [id: %i]\n", slot->id, img_sl.id);
LOG_ERROR("failed to load image", {
{"slot_id", slot->id},
{"img_sl_id", img_sl.id}
});
return false;
}
LOG_TEE("slot %i - loaded image\n", slot->id);
LOG_VERBOSE("image loaded", {
{"slot_id", slot->id},
{"img_sl_id", img_sl.id}
});
img_sl.request_encode_image = true;
slot->images.push_back(img_sl);
}
@@ -789,7 +881,10 @@ struct llama_server_context
all_slots_are_idle = false;
LOG_TEE("slot %i is processing [task id: %i]\n", slot->id, slot->task_id);
LOG_INFO("slot is processing task", {
{"slot_id", slot->id},
{"task_id", slot->task_id},
});
return true;
}
@@ -814,10 +909,24 @@ struct llama_server_context
llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
}
if (llama_decode(ctx, batch) != 0)
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += params.n_batch)
{
LOG_TEE("%s: llama_decode() failed\n", __func__);
return;
const int32_t n_tokens = std::min(params.n_batch, (int32_t) (batch.n_tokens - i));
llama_batch batch_view = {
n_tokens,
batch.token + i,
nullptr,
batch.pos + i,
batch.n_seq_id + i,
batch.seq_id + i,
batch.logits + i,
0, 0, 0, // unused
};
if (llama_decode(ctx, batch_view) != 0)
{
LOG_TEE("%s: llama_decode() failed\n", __func__);
return;
}
}
// assign the system KV cache to all parallel sequences
@@ -975,7 +1084,7 @@ struct llama_server_context
slot.has_next_token = false;
}
if (!slot.cache_tokens.empty() && result.tok == llama_token_eos(model))
if (result.tok == llama_token_eos(model))
{
slot.stopped_eos = true;
slot.has_next_token = false;
@@ -1351,7 +1460,7 @@ struct llama_server_context
if (slot == nullptr)
{
// if no slot is available, we defer this task for processing later
LOG_VERBOSE("no slot is available", {});
LOG_VERBOSE("no slot is available", {{"task_id", task.id}});
queue_tasks.defer(task);
break;
}
@@ -1425,7 +1534,7 @@ struct llama_server_context
bool update_slots() {
if (system_need_update)
{
LOG_TEE("updating system prompt\n");
LOG_INFO("updating system prompt", {});
update_system_prompt();
}
@@ -1435,12 +1544,13 @@ struct llama_server_context
{
if (system_prompt.empty() && clean_kv_cache)
{
LOG_TEE("all slots are idle and system prompt is empty, clear the KV cache\n");
LOG_INFO("all slots are idle and system prompt is empty, clear the KV cache", {});
kv_cache_clear();
}
return true;
}
LOG_VERBOSE("posting NEXT_RESPONSE", {});
task_server task;
task.type = TASK_TYPE_NEXT_RESPONSE;
task.target_id = -1;
@@ -1471,6 +1581,7 @@ struct llama_server_context
}
// decode any currently ongoing sequences
LOG_VERBOSE("decoding ongoing sequences", {});
for (auto & slot : slots)
{
// release the slot
@@ -1480,7 +1591,15 @@ struct llama_server_context
slot.command = NONE;
slot.t_last_used = ggml_time_us();
LOG_TEE("slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
LOG_INFO("slot released", {
{"slot_id", slot.id},
{"task_id", slot.task_id},
{"n_ctx", n_ctx},
{"n_past", slot.n_past},
{"n_system_tokens", system_tokens.size()},
{"n_cache_tokens", slot.cache_tokens.size()},
{"truncated", slot.truncated}
});
queue_tasks.notify_slot_changed();
continue;
@@ -1607,6 +1726,14 @@ struct llama_server_context
}
slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
// the last token of the cache is not in the KV cache until the next call to llama_decode
// (it was sampled, pushed into the "cache_tokens", but not yet put in the context)
if (slot.n_past > 0 && slot.n_past == (int32_t) slot.cache_tokens.size())
{
slot.n_past -= 1;
}
slot.num_prompt_tokens_processed = slot.num_prompt_tokens - slot.n_past;
if (slot.ga_n != 1)
@@ -1628,7 +1755,12 @@ struct llama_server_context
slot.ga_i = ga_i;
}
LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed);
LOG_INFO("slot progression", {
{ "slot_id", slot.id },
{ "task_id", slot.task_id },
{ "n_past", slot.n_past },
{ "num_prompt_tokens_processed", slot.num_prompt_tokens_processed }
});
}
slot.cache_tokens = prompt_tokens;
@@ -1636,7 +1768,10 @@ struct llama_server_context
if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0)
{
// we have to evaluate at least 1 token to generate logits.
LOG_TEE("slot %d : we have to evaluate at least 1 token to generate logits\n", slot.id);
LOG_INFO("we have to evaluate at least 1 token to generate logits", {
{ "slot_id", slot.id },
{ "task_id", slot.task_id }
});
slot.n_past--;
if (slot.ga_i > 0)
{
@@ -1644,9 +1779,13 @@ struct llama_server_context
}
}
LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past);
llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1);
int p0 = (int) system_tokens.size() + slot.n_past;
LOG_INFO("kv cache rm [p0, end)", {
{ "slot_id", slot.id },
{ "task_id", slot.task_id },
{ "p0", p0 }
});
llama_kv_cache_seq_rm(ctx, slot.id, p0, -1);
LOG_VERBOSE("prompt ingested", {
{"n_past", slot.n_past},
@@ -1681,7 +1820,13 @@ struct llama_server_context
if (has_images && !ingest_images(slot, n_batch))
{
LOG_TEE("failed processing images\n");
LOG_ERROR("failed processing images", {
"slot_id", slot.id,
"task_id", slot.task_id,
});
// FIXME @phymbert: to be properly tested
// early returning without changing the slot state will block the slot for ever
// no one at the moment is checking the return value
return false;
}
@@ -1723,9 +1868,9 @@ struct llama_server_context
LOG_TEE("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
llama_kv_cache_seq_shift(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd);
llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd);
llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w,slot.ga_n);
llama_kv_cache_seq_shift(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w,slot.n_past_se + ib * bd, dd);
llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w,slot.n_past_se + ib * bd, dd);
slot.n_past_se -= bd;
@@ -1781,7 +1926,7 @@ struct llama_server_context
send_embedding(slot);
slot.release();
slot.i_batch = -1;
return true;
continue;
}
completion_token_output result;
@@ -1794,6 +1939,7 @@ struct llama_server_context
{
slot.t_start_genereration = ggml_time_us();
slot.t_prompt_processing = (slot.t_start_genereration - slot.t_start_process_prompt) / 1e3;
metrics.on_prompt_eval(slot);
}
llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false };
@@ -1816,11 +1962,14 @@ struct llama_server_context
slot.release();
slot.print_timings();
send_final_response(slot);
metrics.on_prediction(slot);
}
slot.i_batch = -1;
}
}
LOG_VERBOSE("slots updated", {});
return true;
}
@@ -1849,18 +1998,6 @@ static json format_partial_response(
return res;
}
static json format_tokenizer_response(const std::vector<llama_token> &tokens)
{
return json{
{"tokens", tokens}};
}
static json format_detokenized_response(std::string content)
{
return json{
{"content", content}};
}
struct token_translator
{
llama_context * ctx;
@@ -2119,9 +2256,9 @@ static void params_parse(const backend::ModelOptions* request,
params.use_mmap = request->mmap();
params.embedding = request->embeddings();
if (request->ropescaling() == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; }
else if (request->ropescaling() == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; }
else { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; }
if (request->ropescaling() == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
else if (request->ropescaling() == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
else { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
if ( request->yarnextfactor() != 0.0f ) {
params.yarn_ext_factor = request->yarnextfactor();
}

View File

@@ -0,0 +1,14 @@
//go:build debug
// +build debug
package main
import (
"github.com/rs/zerolog/log"
)
func assert(cond bool, msg string) {
if !cond {
log.Fatal().Stack().Msg(msg)
}
}

26
backend/go/stores/main.go Normal file
View File

@@ -0,0 +1,26 @@
package main
// Note: this is started internally by LocalAI and a server is allocated for each store
import (
"flag"
"os"
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
"github.com/rs/zerolog"
"github.com/rs/zerolog/log"
)
var (
addr = flag.String("addr", "localhost:50051", "the address to connect to")
)
func main() {
log.Logger = log.Output(zerolog.ConsoleWriter{Out: os.Stderr})
flag.Parse()
if err := grpc.StartServer(*addr, NewStore()); err != nil {
panic(err)
}
}

View File

@@ -0,0 +1,7 @@
//go:build !debug
// +build !debug
package main
func assert(cond bool, msg string) {
}

507
backend/go/stores/store.go Normal file
View File

@@ -0,0 +1,507 @@
package main
// This is a wrapper to statisfy the GRPC service interface
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
import (
"container/heap"
"fmt"
"math"
"slices"
"github.com/go-skynet/LocalAI/pkg/grpc/base"
pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
"github.com/rs/zerolog/log"
)
type Store struct {
base.SingleThread
// The sorted keys
keys [][]float32
// The sorted values
values [][]byte
// If for every K it holds that ||k||^2 = 1, then we can use the normalized distance functions
// TODO: Should we normalize incoming keys if they are not instead?
keysAreNormalized bool
// The first key decides the length of the keys
keyLen int
}
// TODO: Only used for sorting using Go's builtin implementation. The interfaces are columnar because
// that's theoretically best for memory layout and cache locality, but this isn't optimized yet.
type Pair struct {
Key []float32
Value []byte
}
func NewStore() *Store {
return &Store{
keys: make([][]float32, 0),
values: make([][]byte, 0),
keysAreNormalized: true,
keyLen: -1,
}
}
func compareSlices(k1, k2 []float32) int {
assert(len(k1) == len(k2), fmt.Sprintf("compareSlices: len(k1) = %d, len(k2) = %d", len(k1), len(k2)))
return slices.Compare(k1, k2)
}
func hasKey(unsortedSlice [][]float32, target []float32) bool {
return slices.ContainsFunc(unsortedSlice, func(k []float32) bool {
return compareSlices(k, target) == 0
})
}
func findInSortedSlice(sortedSlice [][]float32, target []float32) (int, bool) {
return slices.BinarySearchFunc(sortedSlice, target, func(k, t []float32) int {
return compareSlices(k, t)
})
}
func isSortedPairs(kvs []Pair) bool {
for i := 1; i < len(kvs); i++ {
if compareSlices(kvs[i-1].Key, kvs[i].Key) > 0 {
return false
}
}
return true
}
func isSortedKeys(keys [][]float32) bool {
for i := 1; i < len(keys); i++ {
if compareSlices(keys[i-1], keys[i]) > 0 {
return false
}
}
return true
}
func sortIntoKeySlicese(keys []*pb.StoresKey) [][]float32 {
ks := make([][]float32, len(keys))
for i, k := range keys {
ks[i] = k.Floats
}
slices.SortFunc(ks, compareSlices)
assert(len(ks) == len(keys), fmt.Sprintf("len(ks) = %d, len(keys) = %d", len(ks), len(keys)))
assert(isSortedKeys(ks), "keys are not sorted")
return ks
}
func (s *Store) Load(opts *pb.ModelOptions) error {
return nil
}
// Sort the incoming kvs and merge them with the existing sorted kvs
func (s *Store) StoresSet(opts *pb.StoresSetOptions) error {
if len(opts.Keys) == 0 {
return fmt.Errorf("no keys to add")
}
if len(opts.Keys) != len(opts.Values) {
return fmt.Errorf("len(keys) = %d, len(values) = %d", len(opts.Keys), len(opts.Values))
}
if s.keyLen == -1 {
s.keyLen = len(opts.Keys[0].Floats)
} else {
if len(opts.Keys[0].Floats) != s.keyLen {
return fmt.Errorf("Try to add key with length %d when existing length is %d", len(opts.Keys[0].Floats), s.keyLen)
}
}
kvs := make([]Pair, len(opts.Keys))
for i, k := range opts.Keys {
if s.keysAreNormalized && !isNormalized(k.Floats) {
s.keysAreNormalized = false
var sample []float32
if len(s.keys) > 5 {
sample = k.Floats[:5]
} else {
sample = k.Floats
}
log.Debug().Msgf("Key is not normalized: %v", sample)
}
kvs[i] = Pair{
Key: k.Floats,
Value: opts.Values[i].Bytes,
}
}
slices.SortFunc(kvs, func(a, b Pair) int {
return compareSlices(a.Key, b.Key)
})
assert(len(kvs) == len(opts.Keys), fmt.Sprintf("len(kvs) = %d, len(opts.Keys) = %d", len(kvs), len(opts.Keys)))
assert(isSortedPairs(kvs), "keys are not sorted")
l := len(kvs) + len(s.keys)
merge_ks := make([][]float32, 0, l)
merge_vs := make([][]byte, 0, l)
i, j := 0, 0
for {
if i+j >= l {
break
}
if i >= len(kvs) {
merge_ks = append(merge_ks, s.keys[j])
merge_vs = append(merge_vs, s.values[j])
j++
continue
}
if j >= len(s.keys) {
merge_ks = append(merge_ks, kvs[i].Key)
merge_vs = append(merge_vs, kvs[i].Value)
i++
continue
}
c := compareSlices(kvs[i].Key, s.keys[j])
if c < 0 {
merge_ks = append(merge_ks, kvs[i].Key)
merge_vs = append(merge_vs, kvs[i].Value)
i++
} else if c > 0 {
merge_ks = append(merge_ks, s.keys[j])
merge_vs = append(merge_vs, s.values[j])
j++
} else {
merge_ks = append(merge_ks, kvs[i].Key)
merge_vs = append(merge_vs, kvs[i].Value)
i++
j++
}
}
assert(len(merge_ks) == l, fmt.Sprintf("len(merge_ks) = %d, l = %d", len(merge_ks), l))
assert(isSortedKeys(merge_ks), "merge keys are not sorted")
s.keys = merge_ks
s.values = merge_vs
return nil
}
func (s *Store) StoresDelete(opts *pb.StoresDeleteOptions) error {
if len(opts.Keys) == 0 {
return fmt.Errorf("no keys to delete")
}
if len(opts.Keys) == 0 {
return fmt.Errorf("no keys to add")
}
if s.keyLen == -1 {
s.keyLen = len(opts.Keys[0].Floats)
} else {
if len(opts.Keys[0].Floats) != s.keyLen {
return fmt.Errorf("Trying to delete key with length %d when existing length is %d", len(opts.Keys[0].Floats), s.keyLen)
}
}
ks := sortIntoKeySlicese(opts.Keys)
l := len(s.keys) - len(ks)
merge_ks := make([][]float32, 0, l)
merge_vs := make([][]byte, 0, l)
tail_ks := s.keys
tail_vs := s.values
for _, k := range ks {
j, found := findInSortedSlice(tail_ks, k)
if found {
merge_ks = append(merge_ks, tail_ks[:j]...)
merge_vs = append(merge_vs, tail_vs[:j]...)
tail_ks = tail_ks[j+1:]
tail_vs = tail_vs[j+1:]
} else {
assert(!hasKey(s.keys, k), fmt.Sprintf("Key exists, but was not found: t=%d, %v", len(tail_ks), k))
}
log.Debug().Msgf("Delete: found = %v, t = %d, j = %d, len(merge_ks) = %d, len(merge_vs) = %d", found, len(tail_ks), j, len(merge_ks), len(merge_vs))
}
merge_ks = append(merge_ks, tail_ks...)
merge_vs = append(merge_vs, tail_vs...)
assert(len(merge_ks) <= len(s.keys), fmt.Sprintf("len(merge_ks) = %d, len(s.keys) = %d", len(merge_ks), len(s.keys)))
s.keys = merge_ks
s.values = merge_vs
assert(len(s.keys) >= l, fmt.Sprintf("len(s.keys) = %d, l = %d", len(s.keys), l))
assert(isSortedKeys(s.keys), "keys are not sorted")
assert(func() bool {
for _, k := range ks {
if _, found := findInSortedSlice(s.keys, k); found {
return false
}
}
return true
}(), "Keys to delete still present")
if len(s.keys) != l {
log.Debug().Msgf("Delete: Some keys not found: len(s.keys) = %d, l = %d", len(s.keys), l)
}
return nil
}
func (s *Store) StoresGet(opts *pb.StoresGetOptions) (pb.StoresGetResult, error) {
pbKeys := make([]*pb.StoresKey, 0, len(opts.Keys))
pbValues := make([]*pb.StoresValue, 0, len(opts.Keys))
ks := sortIntoKeySlicese(opts.Keys)
if len(s.keys) == 0 {
log.Debug().Msgf("Get: No keys in store")
}
if s.keyLen == -1 {
s.keyLen = len(opts.Keys[0].Floats)
} else {
if len(opts.Keys[0].Floats) != s.keyLen {
return pb.StoresGetResult{}, fmt.Errorf("Try to get a key with length %d when existing length is %d", len(opts.Keys[0].Floats), s.keyLen)
}
}
tail_k := s.keys
tail_v := s.values
for i, k := range ks {
j, found := findInSortedSlice(tail_k, k)
if found {
pbKeys = append(pbKeys, &pb.StoresKey{
Floats: k,
})
pbValues = append(pbValues, &pb.StoresValue{
Bytes: tail_v[j],
})
tail_k = tail_k[j+1:]
tail_v = tail_v[j+1:]
} else {
assert(!hasKey(s.keys, k), fmt.Sprintf("Key exists, but was not found: i=%d, %v", i, k))
}
}
if len(pbKeys) != len(opts.Keys) {
log.Debug().Msgf("Get: Some keys not found: len(pbKeys) = %d, len(opts.Keys) = %d, len(s.Keys) = %d", len(pbKeys), len(opts.Keys), len(s.keys))
}
return pb.StoresGetResult{
Keys: pbKeys,
Values: pbValues,
}, nil
}
func isNormalized(k []float32) bool {
var sum float32
for _, v := range k {
sum += v
}
return sum == 1.0
}
// TODO: This we could replace with handwritten SIMD code
func normalizedCosineSimilarity(k1, k2 []float32) float32 {
assert(len(k1) == len(k2), fmt.Sprintf("normalizedCosineSimilarity: len(k1) = %d, len(k2) = %d", len(k1), len(k2)))
var dot float32
for i := 0; i < len(k1); i++ {
dot += k1[i] * k2[i]
}
assert(dot >= -1 && dot <= 1, fmt.Sprintf("dot = %f", dot))
// 2.0 * (1.0 - dot) would be the Euclidean distance
return dot
}
type PriorityItem struct {
Similarity float32
Key []float32
Value []byte
}
type PriorityQueue []*PriorityItem
func (pq PriorityQueue) Len() int { return len(pq) }
func (pq PriorityQueue) Less(i, j int) bool {
// Inverted because the most similar should be at the top
return pq[i].Similarity < pq[j].Similarity
}
func (pq PriorityQueue) Swap(i, j int) {
pq[i], pq[j] = pq[j], pq[i]
}
func (pq *PriorityQueue) Push(x any) {
item := x.(*PriorityItem)
*pq = append(*pq, item)
}
func (pq *PriorityQueue) Pop() any {
old := *pq
n := len(old)
item := old[n-1]
*pq = old[0 : n-1]
return item
}
func (s *Store) StoresFindNormalized(opts *pb.StoresFindOptions) (pb.StoresFindResult, error) {
tk := opts.Key.Floats
top_ks := make(PriorityQueue, 0, int(opts.TopK))
heap.Init(&top_ks)
for i, k := range s.keys {
sim := normalizedCosineSimilarity(tk, k)
heap.Push(&top_ks, &PriorityItem{
Similarity: sim,
Key: k,
Value: s.values[i],
})
if top_ks.Len() > int(opts.TopK) {
heap.Pop(&top_ks)
}
}
similarities := make([]float32, top_ks.Len())
pbKeys := make([]*pb.StoresKey, top_ks.Len())
pbValues := make([]*pb.StoresValue, top_ks.Len())
for i := top_ks.Len() - 1; i >= 0; i-- {
item := heap.Pop(&top_ks).(*PriorityItem)
similarities[i] = item.Similarity
pbKeys[i] = &pb.StoresKey{
Floats: item.Key,
}
pbValues[i] = &pb.StoresValue{
Bytes: item.Value,
}
}
return pb.StoresFindResult{
Keys: pbKeys,
Values: pbValues,
Similarities: similarities,
}, nil
}
func cosineSimilarity(k1, k2 []float32, mag1 float64) float32 {
assert(len(k1) == len(k2), fmt.Sprintf("cosineSimilarity: len(k1) = %d, len(k2) = %d", len(k1), len(k2)))
var dot, mag2 float64
for i := 0; i < len(k1); i++ {
dot += float64(k1[i] * k2[i])
mag2 += float64(k2[i] * k2[i])
}
sim := float32(dot / (mag1 * math.Sqrt(mag2)))
assert(sim >= -1 && sim <= 1, fmt.Sprintf("sim = %f", sim))
return sim
}
func (s *Store) StoresFindFallback(opts *pb.StoresFindOptions) (pb.StoresFindResult, error) {
tk := opts.Key.Floats
top_ks := make(PriorityQueue, 0, int(opts.TopK))
heap.Init(&top_ks)
var mag1 float64
for _, v := range tk {
mag1 += float64(v * v)
}
mag1 = math.Sqrt(mag1)
for i, k := range s.keys {
dist := cosineSimilarity(tk, k, mag1)
heap.Push(&top_ks, &PriorityItem{
Similarity: dist,
Key: k,
Value: s.values[i],
})
if top_ks.Len() > int(opts.TopK) {
heap.Pop(&top_ks)
}
}
similarities := make([]float32, top_ks.Len())
pbKeys := make([]*pb.StoresKey, top_ks.Len())
pbValues := make([]*pb.StoresValue, top_ks.Len())
for i := top_ks.Len() - 1; i >= 0; i-- {
item := heap.Pop(&top_ks).(*PriorityItem)
similarities[i] = item.Similarity
pbKeys[i] = &pb.StoresKey{
Floats: item.Key,
}
pbValues[i] = &pb.StoresValue{
Bytes: item.Value,
}
}
return pb.StoresFindResult{
Keys: pbKeys,
Values: pbValues,
Similarities: similarities,
}, nil
}
func (s *Store) StoresFind(opts *pb.StoresFindOptions) (pb.StoresFindResult, error) {
tk := opts.Key.Floats
if len(tk) != s.keyLen {
return pb.StoresFindResult{}, fmt.Errorf("Try to find key with length %d when existing length is %d", len(tk), s.keyLen)
}
if opts.TopK < 1 {
return pb.StoresFindResult{}, fmt.Errorf("opts.TopK = %d, must be >= 1", opts.TopK)
}
if s.keyLen == -1 {
s.keyLen = len(opts.Key.Floats)
} else {
if len(opts.Key.Floats) != s.keyLen {
return pb.StoresFindResult{}, fmt.Errorf("Try to add key with length %d when existing length is %d", len(opts.Key.Floats), s.keyLen)
}
}
if s.keysAreNormalized && isNormalized(tk) {
return s.StoresFindNormalized(opts)
} else {
if s.keysAreNormalized {
var sample []float32
if len(s.keys) > 5 {
sample = tk[:5]
} else {
sample = tk
}
log.Debug().Msgf("Trying to compare non-normalized key with normalized keys: %v", sample)
}
return s.StoresFindFallback(opts)
}
}

View File

@@ -11,21 +11,21 @@ import (
"github.com/go-skynet/LocalAI/core/schema"
)
func sh(c string) (string, error) {
cmd := exec.Command("/bin/sh", "-c", c)
func runCommand(command []string) (string, error) {
cmd := exec.Command(command[0], command[1:]...)
cmd.Env = os.Environ()
o, err := cmd.CombinedOutput()
return string(o), err
out, err := cmd.CombinedOutput()
return string(out), err
}
// AudioToWav converts audio to wav for transcribe. It bashes out to ffmpeg
// AudioToWav converts audio to wav for transcribe.
// TODO: use https://github.com/mccoyst/ogg?
func audioToWav(src, dst string) error {
out, err := sh(fmt.Sprintf("ffmpeg -i %s -format s16le -ar 16000 -ac 1 -acodec pcm_s16le %s", src, dst))
command := []string{"ffmpeg", "-i", src, "-format", "s16le", "-ar", "16000", "-ac", "1", "-acodec", "pcm_s16le", dst}
out, err := runCommand(command)
if err != nil {
return fmt.Errorf("error: %w out: %s", err, out)
}
return nil
}

View File

@@ -5,12 +5,14 @@ import signal
import sys
import os
import time
import base64
import grpc
import backend_pb2
import backend_pb2_grpc
from auto_gptq import AutoGPTQForCausalLM
from transformers import AutoTokenizer
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import TextGenerationPipeline
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
@@ -28,12 +30,22 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
if request.Device != "":
device = request.Device
tokenizer = AutoTokenizer.from_pretrained(request.Model, use_fast=request.UseFastTokenizer)
# support loading local model files
model_path = os.path.join(os.environ.get('MODELS_PATH', './'), request.Model)
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, trust_remote_code=request.TrustRemoteCode)
model = AutoGPTQForCausalLM.from_quantized(request.Model,
# support model `Qwen/Qwen-VL-Chat-Int4`
if "qwen-vl" in request.Model.lower():
self.model_name = "Qwen-VL-Chat"
model = AutoModelForCausalLM.from_pretrained(model_path,
trust_remote_code=request.TrustRemoteCode,
use_triton=request.UseTriton,
device_map="auto").eval()
else:
model = AutoGPTQForCausalLM.from_quantized(model_path,
model_basename=request.ModelBaseName,
use_safetensors=True,
trust_remote_code=True,
trust_remote_code=request.TrustRemoteCode,
device=device,
use_triton=request.UseTriton,
quantize_config=None)
@@ -55,6 +67,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
if request.TopP != 0.0:
top_p = request.TopP
prompt_images = self.recompile_vl_prompt(request)
compiled_prompt = prompt_images[0]
print(f"Prompt: {compiled_prompt}", file=sys.stderr)
# Implement Predict RPC
pipeline = TextGenerationPipeline(
model=self.model,
@@ -64,10 +81,17 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
top_p=top_p,
repetition_penalty=penalty,
)
t = pipeline(request.Prompt)[0]["generated_text"]
# Remove prompt from response if present
if request.Prompt in t:
t = t.replace(request.Prompt, "")
t = pipeline(compiled_prompt)[0]["generated_text"]
print(f"generated_text: {t}", file=sys.stderr)
if compiled_prompt in t:
t = t.replace(compiled_prompt, "")
# house keeping. Remove the image files from /tmp folder
for img_path in prompt_images[1]:
try:
os.remove(img_path)
except Exception as e:
print(f"Error removing image file: {img_path}, {e}", file=sys.stderr)
return backend_pb2.Result(message=bytes(t, encoding='utf-8'))
@@ -78,6 +102,24 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
# Not implemented yet
return self.Predict(request, context)
def recompile_vl_prompt(self, request):
prompt = request.Prompt
image_paths = []
if "qwen-vl" in self.model_name.lower():
# request.Images is an array which contains base64 encoded images. Iterate the request.Images array, decode and save each image to /tmp folder with a random filename.
# Then, save the image file paths to an array "image_paths".
# read "request.Prompt", replace "[img-%d]" with the image file paths in the order they appear in "image_paths". Save the new prompt to "prompt".
for i, img in enumerate(request.Images):
timestamp = str(int(time.time() * 1000)) # Generate timestamp
img_path = f"/tmp/vl-{timestamp}.jpg" # Use timestamp in filename
with open(img_path, "wb") as f:
f.write(base64.b64decode(img))
image_paths.append(img_path)
prompt = prompt.replace(f"[img-{i}]", "<img>" + img_path + "</img>,")
else:
prompt = request.Prompt
return (prompt, image_paths)
def serve(address):
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))

View File

@@ -1,3 +1,7 @@
####
# Attention! This file is abandoned.
# Please use the ../common-env/transformers/transformers.yml file to manage dependencies.
###
name: autogptq
channels:
- defaults
@@ -24,12 +28,12 @@ dependencies:
- xz=5.4.2=h5eee18b_0
- zlib=1.2.13=h5eee18b_0
- pip:
- accelerate==0.23.0
- accelerate==0.27.0
- aiohttp==3.8.5
- aiosignal==1.3.1
- async-timeout==4.0.3
- attrs==23.1.0
- auto-gptq==0.4.2
- auto-gptq==0.7.1
- certifi==2023.7.22
- charset-normalizer==3.3.0
- datasets==2.14.5
@@ -59,6 +63,7 @@ dependencies:
- nvidia-nccl-cu12==2.18.1
- nvidia-nvjitlink-cu12==12.2.140
- nvidia-nvtx-cu12==12.1.105
- optimum==1.17.1
- packaging==23.2
- pandas==2.1.1
- peft==0.5.0
@@ -71,13 +76,15 @@ dependencies:
- regex==2023.10.3
- requests==2.31.0
- rouge==1.0.1
- safetensors==0.3.3
- safetensors>=0.3.3
- six==1.16.0
- sympy==1.12
- tokenizers==0.14.0
- torch==2.1.0
- tqdm==4.66.1
- torch==2.2.1
- torchvision==0.17.1
- transformers==4.34.0
- transformers_stream_generator==0.0.5
- triton==2.1.0
- typing-extensions==4.8.0
- tzdata==2023.3

View File

File diff suppressed because one or more lines are too long

View File

File diff suppressed because one or more lines are too long

View File

@@ -8,6 +8,13 @@ ifeq ($(BUILD_TYPE), hipblas)
CONDA_ENV_PATH = "transformers-rocm.yml"
endif
# Intel GPU are supposed to have dependencies installed in the main python
# environment, so we skip conda installation for SYCL builds.
# https://github.com/intel/intel-extension-for-pytorch/issues/538
ifneq (,$(findstring sycl,$(BUILD_TYPE)))
export SKIP_CONDA=1
endif
.PHONY: transformers
transformers:
@echo "Installing $(CONDA_ENV_PATH)..."

View File

@@ -1,24 +1,38 @@
#!/bin/bash
set -ex
SKIP_CONDA=${SKIP_CONDA:-0}
# Check if environment exist
conda_env_exists(){
! conda list --name "${@}" >/dev/null 2>/dev/null
}
if conda_env_exists "transformers" ; then
echo "Creating virtual environment..."
conda env create --name transformers --file $1
echo "Virtual environment created."
else
echo "Virtual environment already exists."
if [ $SKIP_CONDA -eq 1 ]; then
echo "Skipping conda environment installation"
else
export PATH=$PATH:/opt/conda/bin
if conda_env_exists "transformers" ; then
echo "Creating virtual environment..."
conda env create --name transformers --file $1
echo "Virtual environment created."
else
echo "Virtual environment already exists."
fi
fi
if [ -d "/opt/intel" ]; then
# Intel GPU: If the directory exists, we assume we are using the intel image
# (no conda env)
# https://github.com/intel/intel-extension-for-pytorch/issues/538
pip install intel-extension-for-transformers datasets sentencepiece tiktoken neural_speed optimum[openvino]
fi
if [ "$PIP_CACHE_PURGE" = true ] ; then
export PATH=$PATH:/opt/conda/bin
# Activate conda environment
source activate transformers
if [ $SKIP_CONDA -eq 0 ]; then
# Activate conda environment
source activate transformers
fi
pip cache purge
fi

View File

@@ -24,19 +24,21 @@ dependencies:
- xz=5.4.2=h5eee18b_0
- zlib=1.2.13=h5eee18b_0
- pip:
- accelerate==0.23.0
- accelerate==0.27.0
- aiohttp==3.8.5
- aiosignal==1.3.1
- async-timeout==4.0.3
- auto-gptq==0.7.1
- attrs==23.1.0
- bark==0.1.5
- bitsandbytes==0.43.0
- boto3==1.28.61
- botocore==1.31.61
- certifi==2023.7.22
- TTS==0.22.0
- charset-normalizer==3.3.0
- datasets==2.14.5
- sentence-transformers==2.2.2
- sentence-transformers==2.5.1 # Updated Version
- sentencepiece==0.1.99
- dill==0.3.7
- einops==0.7.0
@@ -68,6 +70,7 @@ dependencies:
- nvidia-nccl-cu12==2.18.1
- nvidia-nvjitlink-cu12==12.2.140
- nvidia-nvtx-cu12==12.1.105
- optimum==1.17.1
- packaging==23.2
- pandas
- peft==0.5.0
@@ -81,12 +84,13 @@ dependencies:
- requests==2.31.0
- rouge==1.0.1
- s3transfer==0.7.0
- safetensors==0.3.3
- scipy==1.11.3
- safetensors>=0.4.1
- scipy==1.12.0 # Updated Version
- six==1.16.0
- sympy==1.12
- tokenizers
- torch==2.1.2
- torchvision==0.16.2
- torchaudio==2.1.2
- tqdm==4.66.1
- triton==2.1.0
@@ -94,7 +98,6 @@ dependencies:
- tzdata==2023.3
- urllib3==1.26.17
- xxhash==3.4.1
- auto-gptq==0.6.0
- yarl==1.9.2
- soundfile
- langid
@@ -113,7 +116,8 @@ dependencies:
- sudachipy
- sudachidict_core
- vocos
- vllm==0.2.7
- transformers>=4.36.0 # Required for Mixtral.
- vllm==0.3.2
- transformers>=4.38.2 # Updated Version
- transformers_stream_generator==0.0.5
- xformers==0.0.23.post1
prefix: /opt/conda/envs/transformers

View File

@@ -26,7 +26,8 @@ dependencies:
- pip:
- --pre
- --extra-index-url https://download.pytorch.org/whl/nightly/
- accelerate==0.23.0
- accelerate==0.27.0
- auto-gptq==0.7.1
- aiohttp==3.8.5
- aiosignal==1.3.1
- async-timeout==4.0.3
@@ -38,7 +39,7 @@ dependencies:
- TTS==0.22.0
- charset-normalizer==3.3.0
- datasets==2.14.5
- sentence-transformers==2.2.2
- sentence-transformers==2.5.1 # Updated Version
- sentencepiece==0.1.99
- dill==0.3.7
- einops==0.7.0
@@ -71,8 +72,8 @@ dependencies:
- requests==2.31.0
- rouge==1.0.1
- s3transfer==0.7.0
- safetensors==0.3.3
- scipy==1.11.3
- safetensors>=0.4.1
- scipy==1.12.0 # Updated Version
- six==1.16.0
- sympy==1.12
- tokenizers
@@ -82,7 +83,6 @@ dependencies:
- triton==2.1.0
- typing-extensions==4.8.0
- tzdata==2023.3
- auto-gptq==0.6.0
- urllib3==1.26.17
- xxhash==3.4.1
- yarl==1.9.2
@@ -90,6 +90,7 @@ dependencies:
- langid
- wget
- unidecode
- optimum==1.17.1
- pyopenjtalk-prebuilt
- pypinyin
- inflect
@@ -103,7 +104,8 @@ dependencies:
- sudachipy
- sudachidict_core
- vocos
- vllm==0.2.7
- transformers>=4.36.0 # Required for Mixtral.
- vllm==0.3.2
- transformers>=4.38.2 # Updated Version
- transformers_stream_generator==0.0.5
- xformers==0.0.23.post1
prefix: /opt/conda/envs/transformers

View File

@@ -24,19 +24,21 @@ dependencies:
- xz=5.4.2=h5eee18b_0
- zlib=1.2.13=h5eee18b_0
- pip:
- accelerate==0.23.0
- accelerate==0.27.0
- aiohttp==3.8.5
- aiosignal==1.3.1
- auto-gptq==0.7.1
- async-timeout==4.0.3
- attrs==23.1.0
- bark==0.1.5
- boto3==1.28.61
- botocore==1.31.61
- certifi==2023.7.22
- coloredlogs==15.0.1
- TTS==0.22.0
- charset-normalizer==3.3.0
- datasets==2.14.5
- sentence-transformers==2.2.2
- sentence-transformers==2.5.1 # Updated Version
- sentencepiece==0.1.99
- dill==0.3.7
- einops==0.7.0
@@ -47,6 +49,7 @@ dependencies:
- funcy==2.0
- grpcio==1.59.0
- huggingface-hub
- humanfriendly==10.0
- idna==3.4
- jinja2==3.1.2
- jmespath==1.0.1
@@ -56,6 +59,10 @@ dependencies:
- multiprocess==0.70.15
- networkx
- numpy==1.26.0
- onnx==1.15.0
- openvino==2024.0.0
- openvino-telemetry==2023.2.1
- optimum[openvino]==1.17.1
- packaging==23.2
- pandas
- peft==0.5.0
@@ -69,18 +76,18 @@ dependencies:
- requests==2.31.0
- rouge==1.0.1
- s3transfer==0.7.0
- safetensors==0.3.3
- scipy==1.11.3
- safetensors>=0.4.1
- scipy==1.12.0 # Updated Version
- six==1.16.0
- sympy==1.12
- tokenizers
- torch==2.1.2
- torchvision==0.16.2
- torchaudio==2.1.2
- tqdm==4.66.1
- triton==2.1.0
- typing-extensions==4.8.0
- tzdata==2023.3
- auto-gptq==0.6.0
- urllib3==1.26.17
- xxhash==3.4.1
- yarl==1.9.2
@@ -101,7 +108,8 @@ dependencies:
- sudachipy
- sudachidict_core
- vocos
- vllm==0.2.7
- transformers>=4.36.0 # Required for Mixtral.
- vllm==0.3.2
- transformers>=4.38.2 # Updated Version
- transformers_stream_generator==0.0.5
- xformers==0.0.23.post1
prefix: /opt/conda/envs/transformers
prefix: /opt/conda/envs/transformers

View File

File diff suppressed because one or more lines are too long

View File

@@ -4,6 +4,13 @@ ifeq ($(BUILD_TYPE), hipblas)
export CONDA_ENV_PATH = "diffusers-rocm.yml"
endif
# Intel GPU are supposed to have dependencies installed in the main python
# environment, so we skip conda installation for SYCL builds.
# https://github.com/intel/intel-extension-for-pytorch/issues/538
ifneq (,$(findstring sycl,$(BUILD_TYPE)))
export SKIP_CONDA=1
endif
.PHONY: diffusers
diffusers:
@echo "Installing $(CONDA_ENV_PATH)..."

View File

@@ -21,14 +21,15 @@ from diffusers import StableDiffusionXLPipeline, StableDiffusionDepth2ImgPipelin
from diffusers import StableDiffusionImg2ImgPipeline, AutoPipelineForText2Image, ControlNetModel, StableVideoDiffusionPipeline
from diffusers.pipelines.stable_diffusion import safety_checker
from diffusers.utils import load_image,export_to_video
from compel import Compel
from compel import Compel, ReturnedEmbeddingsType
from transformers import CLIPTextModel
from safetensors.torch import load_file
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
COMPEL=os.environ.get("COMPEL", "1") == "1"
COMPEL=os.environ.get("COMPEL", "0") == "1"
XPU=os.environ.get("XPU", "0") == "1"
CLIPSKIP=os.environ.get("CLIPSKIP", "1") == "1"
SAFETENSORS=os.environ.get("SAFETENSORS", "1") == "1"
CHUNK_SIZE=os.environ.get("CHUNK_SIZE", "8")
@@ -36,6 +37,10 @@ FPS=os.environ.get("FPS", "7")
DISABLE_CPU_OFFLOAD=os.environ.get("DISABLE_CPU_OFFLOAD", "0") == "1"
FRAMES=os.environ.get("FRAMES", "64")
if XPU:
import intel_extension_for_pytorch as ipex
print(ipex.xpu.get_device_name(0))
# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
@@ -231,8 +236,13 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
if request.SchedulerType != "":
self.pipe.scheduler = get_scheduler(request.SchedulerType, self.pipe.scheduler.config)
if not self.img2vid:
self.compel = Compel(tokenizer=self.pipe.tokenizer, text_encoder=self.pipe.text_encoder)
if COMPEL:
self.compel = Compel(
tokenizer=[self.pipe.tokenizer, self.pipe.tokenizer_2 ],
text_encoder=[self.pipe.text_encoder, self.pipe.text_encoder_2],
returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED,
requires_pooled=[False, True]
)
if request.ControlNet:
@@ -247,6 +257,8 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
self.pipe.to('cuda')
if self.controlnet:
self.controlnet.to('cuda')
if XPU:
self.pipe = self.pipe.to("xpu")
# Assume directory from request.ModelFile.
# Only if request.LoraAdapter it's not an absolute path
if request.LoraAdapter and request.ModelFile != "" and not os.path.isabs(request.LoraAdapter) and request.LoraAdapter:
@@ -386,8 +398,9 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
image = {}
if COMPEL:
conditioning = self.compel.build_conditioning_tensor(prompt)
kwargs["prompt_embeds"]= conditioning
conditioning, pooled = self.compel.build_conditioning_tensor(prompt)
kwargs["prompt_embeds"] = conditioning
kwargs["pooled_prompt_embeds"] = pooled
# pass the kwargs dictionary to the self.pipe method
image = self.pipe(
guidance_scale=self.cfg_scale,

View File

File diff suppressed because one or more lines are too long

View File

@@ -1,24 +1,50 @@
#!/bin/bash
set -ex
SKIP_CONDA=${SKIP_CONDA:-0}
# Check if environment exist
conda_env_exists(){
! conda list --name "${@}" >/dev/null 2>/dev/null
}
if conda_env_exists "diffusers" ; then
echo "Creating virtual environment..."
conda env create --name diffusers --file $1
echo "Virtual environment created."
else
echo "Virtual environment already exists."
if [ $SKIP_CONDA -eq 1 ]; then
echo "Skipping conda environment installation"
else
export PATH=$PATH:/opt/conda/bin
if conda_env_exists "diffusers" ; then
echo "Creating virtual environment..."
conda env create --name diffusers --file $1
echo "Virtual environment created."
else
echo "Virtual environment already exists."
fi
fi
if [ -d "/opt/intel" ]; then
# Intel GPU: If the directory exists, we assume we are using the Intel image
# https://github.com/intel/intel-extension-for-pytorch/issues/538
pip install torch==2.1.0a0 \
torchvision==0.16.0a0 \
torchaudio==2.1.0a0 \
intel-extension-for-pytorch==2.1.10+xpu \
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
pip install google-api-python-client \
grpcio \
grpcio-tools \
diffusers==0.24.0 \
transformers>=4.25.1 \
accelerate \
compel==2.0.2 \
Pillow
fi
if [ "$PIP_CACHE_PURGE" = true ] ; then
export PATH=$PATH:/opt/conda/bin
# Activate conda environment
source activate diffusers
if [ $SKIP_CONDA -ne 1 ]; then
# Activate conda environment
source activate diffusers
fi
pip cache purge
fi

View File

@@ -3,10 +3,15 @@
##
## A bash script wrapper that runs the diffusers server with conda
export PATH=$PATH:/opt/conda/bin
# Activate conda environment
source activate diffusers
if [ -d "/opt/intel" ]; then
# Assumes we are using the Intel oneAPI container image
# https://github.com/intel/intel-extension-for-pytorch/issues/538
export XPU=1
else
export PATH=$PATH:/opt/conda/bin
# Activate conda environment
source activate diffusers
fi
# get the directory where the bash script is located
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

View File

@@ -1,7 +1,8 @@
export CONDA_ENV_PATH = "exllama.yml"
.PHONY: exllama
exllama:
$(MAKE) -C ../common-env/transformers
bash install.sh
bash install.sh ${CONDA_ENV_PATH}
.PHONY: run
run:

View File

File diff suppressed because one or more lines are too long

View File

@@ -1,14 +1,27 @@
#!/bin/bash
set -ex
##
## A bash script installs the required dependencies of VALL-E-X and prepares the environment
export PATH=$PATH:/opt/conda/bin
# Activate conda environment
source activate transformers
if [ "$BUILD_TYPE" != "cublas" ]; then
echo "[exllama] Attention!!! Nvidia GPU is required - skipping installation"
exit 0
fi
echo $CONDA_PREFIX
# Check if environment exist
conda_env_exists(){
! conda list --name "${@}" >/dev/null 2>/dev/null
}
if conda_env_exists "exllama" ; then
echo "Creating virtual environment..."
conda env create --name exllama --file $1
echo "Virtual environment created."
else
echo "Virtual environment already exists."
fi
source activate exllama
git clone https://github.com/turboderp/exllama $CONDA_PREFIX/exllama && pushd $CONDA_PREFIX/exllama && pip install -r requirements.txt && popd

View File

@@ -2,11 +2,10 @@
##
## A bash script wrapper that runs the exllama server with conda
export PATH=$PATH:/opt/conda/bin
# Activate conda environment
source activate transformers
source activate exllama
# get the directory where the bash script is located
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

View File

File diff suppressed because one or more lines are too long

View File

@@ -2,10 +2,14 @@
set -e
##
## A bash script installs the required dependencies of VALL-E-X and prepares the environment
export PATH=$PATH:/opt/conda/bin
export SHA=c0ddebaaaf8ffd1b3529c2bb654e650bce2f790f
# Activate conda environment
if [ "$BUILD_TYPE" != "cublas" ]; then
echo "[exllamav2] Attention!!! Nvidia GPU is required - skipping installation"
exit 0
fi
export PATH=$PATH:/opt/conda/bin
source activate transformers
echo $CONDA_PREFIX

View File

File diff suppressed because one or more lines are too long

View File

@@ -2,13 +2,14 @@
set -e
##
## A bash script installs the required dependencies of VALL-E-X and prepares the environment
export PATH=$PATH:/opt/conda/bin
if [ "$BUILD_TYPE" != "cublas" ]; then
echo "[mamba] Attention!!! nvcc is required - skipping installation"
exit 0
fi
export PATH=$PATH:/opt/conda/bin
# Activate conda environment
source activate transformers

View File

@@ -1,7 +1,7 @@
.PHONY: petals
petals:
@echo "Creating virtual environment..."
@conda env create --name petals --file petals.yml
bash install.sh "petals.yml"
@echo "Virtual environment created."
.PHONY: run

View File

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,5 @@
#!/bin/bash
export PATH=$PATH:/opt/conda/bin
conda env create --name petals --file $1

View File

File diff suppressed because one or more lines are too long

View File

File diff suppressed because one or more lines are too long

View File

File diff suppressed because one or more lines are too long

View File

@@ -3,10 +3,16 @@
##
## A bash script wrapper that runs the transformers server with conda
export PATH=$PATH:/opt/conda/bin
# Activate conda environment
source activate transformers
if [ -d "/opt/intel" ]; then
# Assumes we are using the Intel oneAPI container image
# https://github.com/intel/intel-extension-for-pytorch/issues/538
export XPU=1
else
export PATH=$PATH:/opt/conda/bin
# Activate conda environment
source activate transformers
fi
# get the directory where the bash script is located
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

View File

@@ -8,6 +8,8 @@ import argparse
import signal
import sys
import os
from threading import Thread
import asyncio
import time
import backend_pb2
@@ -16,7 +18,14 @@ import backend_pb2_grpc
import grpc
import torch
import torch.cuda
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, set_seed
XPU=os.environ.get("XPU", "0") == "1"
if XPU:
from transformers import AutoTokenizer, AutoModel, set_seed, TextIteratorStreamer
else:
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, set_seed, BitsAndBytesConfig, TextIteratorStreamer
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
@@ -67,23 +76,101 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
A Result object that contains the result of the LoadModel operation.
"""
model_name = request.Model
compute = "auto"
if request.F16Memory == True:
compute=torch.bfloat16
self.CUDA = request.CUDA
self.OV=False
device_map="cpu"
quantization = None
if self.CUDA:
if request.Device:
device_map=request.Device
else:
device_map="cuda:0"
if request.Quantization == "bnb_4bit":
quantization = BitsAndBytesConfig(
load_in_4bit = True,
bnb_4bit_compute_dtype = compute,
bnb_4bit_quant_type = "nf4",
bnb_4bit_use_double_quant = True,
load_in_8bit = False,
)
elif request.Quantization == "bnb_8bit":
quantization = BitsAndBytesConfig(
load_in_4bit=False,
bnb_4bit_compute_dtype = None,
load_in_8bit=True,
)
try:
if request.Type == "AutoModelForCausalLM":
self.model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
if XPU:
import intel_extension_for_pytorch as ipex
from intel_extension_for_transformers.transformers.modeling import AutoModelForCausalLM
device_map="xpu"
compute=torch.float16
if request.Quantization == "xpu_4bit":
xpu_4bit = True
xpu_8bit = False
elif request.Quantization == "xpu_8bit":
xpu_4bit = False
xpu_8bit = True
else:
xpu_4bit = False
xpu_8bit = False
self.model = AutoModelForCausalLM.from_pretrained(model_name,
trust_remote_code=request.TrustRemoteCode,
use_safetensors=True,
device_map=device_map,
load_in_4bit=xpu_4bit,
load_in_8bit=xpu_8bit,
torch_dtype=compute)
else:
self.model = AutoModelForCausalLM.from_pretrained(model_name,
trust_remote_code=request.TrustRemoteCode,
use_safetensors=True,
quantization_config=quantization,
device_map=device_map,
torch_dtype=compute)
elif request.Type == "OVModelForCausalLM":
from optimum.intel.openvino import OVModelForCausalLM
from openvino.runtime import Core
if "GPU" in Core().available_devices:
device_map="GPU"
else:
device_map="CPU"
self.model = OVModelForCausalLM.from_pretrained(model_name,
compile=True,
device=device_map)
self.OV = True
else:
self.model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
self.model = AutoModel.from_pretrained(model_name,
trust_remote_code=request.TrustRemoteCode,
use_safetensors=True,
quantization_config=quantization,
device_map=device_map,
torch_dtype=compute)
self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_safetensors=True)
self.XPU = False
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
self.CUDA = False
if request.CUDA or torch.cuda.is_available():
if XPU and self.OV == False:
self.XPU = True
try:
print("Loading model", model_name, "to CUDA.", file=sys.stderr)
self.model = self.model.to("cuda")
self.CUDA = True
print("Optimizing model", model_name, "to XPU.", file=sys.stderr)
self.model = ipex.optimize_transformers(self.model, inplace=True, dtype=torch.float16, device="xpu")
except Exception as err:
print("Not using CUDA:", err, file=sys.stderr)
print("Not using XPU:", err, file=sys.stderr)
except Exception as err:
print("Error:", err, file=sys.stderr)
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
# Implement your logic here for the LoadModel service
# Replace this with your desired response
@@ -109,15 +196,84 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
encoded_input = self.tokenizer(request.Embeddings, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
# Create word embeddings
model_output = self.model(**encoded_input)
if self.CUDA:
encoded_input = encoded_input.to("cuda")
with torch.no_grad():
model_output = self.model(**encoded_input)
# Pool to get sentence embeddings; i.e. generate one 1024 vector for the entire sentence
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask']).detach().numpy()
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
print("Calculated embeddings for: " + request.Embeddings, file=sys.stderr)
print("Embeddings:", sentence_embeddings, file=sys.stderr)
return backend_pb2.EmbeddingResult(embeddings=sentence_embeddings)
return backend_pb2.EmbeddingResult(embeddings=sentence_embeddings[0])
def Predict(self, request, context):
async def _predict(self, request, context, streaming=False):
set_seed(request.Seed)
if request.TopP == 0:
request.TopP = 0.9
max_tokens = 200
if request.Tokens > 0:
max_tokens = request.Tokens
inputs = self.tokenizer(request.Prompt, return_tensors="pt")
if self.CUDA:
inputs = inputs.to("cuda")
if XPU and self.OV == False:
inputs = inputs.to("xpu")
streaming = False
if streaming:
streamer=TextIteratorStreamer(self.tokenizer,
skip_prompt=True,
skip_special_tokens=True)
config=dict(inputs,
max_new_tokens=max_tokens,
temperature=request.Temperature,
top_p=request.TopP,
top_k=request.TopK,
do_sample=True,
attention_mask=inputs["attention_mask"],
eos_token_id=self.tokenizer.eos_token_id,
pad_token_id=self.tokenizer.eos_token_id,
streamer=streamer)
thread=Thread(target=self.model.generate, kwargs=config)
thread.start()
generated_text = ""
try:
for new_text in streamer:
generated_text += new_text
yield backend_pb2.Reply(message=bytes(new_text, encoding='utf-8'))
finally:
thread.join()
else:
if XPU and self.OV == False:
outputs = self.model.generate(inputs["input_ids"],
max_new_tokens=max_tokens,
temperature=request.Temperature,
top_p=request.TopP,
top_k=request.TopK,
do_sample=True,
pad_token=self.tokenizer.eos_token_id)
else:
outputs = self.model.generate(inputs["input_ids"],
max_new_tokens=max_tokens,
temperature=request.Temperature,
top_p=request.TopP,
top_k=request.TopK,
do_sample=True,
attention_mask=inputs["attention_mask"],
eos_token_id=self.tokenizer.eos_token_id,
pad_token_id=self.tokenizer.eos_token_id)
generated_text = self.tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)[0]
if streaming:
return
yield backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8'))
async def Predict(self, request, context):
"""
Generates text based on the given prompt and sampling parameters.
@@ -128,28 +284,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
Returns:
backend_pb2.Reply: The predict result.
"""
set_seed(request.Seed)
if request.TopP == 0:
request.TopP = 0.9
gen = self._predict(request, context, streaming=False)
res = await gen.__anext__()
return res
max_tokens = 200
if request.Tokens > 0:
max_tokens = request.Tokens
inputs = self.tokenizer(request.Prompt, return_tensors="pt").input_ids
if self.CUDA:
inputs = inputs.to("cuda")
outputs = self.model.generate(inputs,max_new_tokens=max_tokens, temperature=request.Temperature, top_p=request.TopP)
generated_text = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
# Remove prompt from response if present
if request.Prompt in generated_text:
generated_text = generated_text.replace(request.Prompt, "")
return backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8'))
def PredictStream(self, request, context):
async def PredictStream(self, request, context):
"""
Generates text based on the given prompt and sampling parameters, and streams the results.
@@ -160,31 +299,33 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
Returns:
backend_pb2.Result: The predict stream result.
"""
yield self.Predict(request, context)
iterations = self._predict(request, context, streaming=True)
try:
async for iteration in iterations:
yield iteration
finally:
await iterations.aclose()
def serve(address):
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
async def serve(address):
# Start asyncio gRPC server
server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
# Add the servicer to the server
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
# Bind the server to the address
server.add_insecure_port(address)
server.start()
# Gracefully shutdown the server on SIGTERM or SIGINT
loop = asyncio.get_event_loop()
for sig in (signal.SIGINT, signal.SIGTERM):
loop.add_signal_handler(
sig, lambda: asyncio.ensure_future(server.stop(5))
)
# Start the server
await server.start()
print("Server started. Listening on: " + address, file=sys.stderr)
# Define the signal handler function
def signal_handler(sig, frame):
print("Received termination signal. Shutting down...")
server.stop(0)
sys.exit(0)
# Set the signal handlers for SIGINT and SIGTERM
signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)
try:
while True:
time.sleep(_ONE_DAY_IN_SECONDS)
except KeyboardInterrupt:
server.stop(0)
# Wait for the server to be terminated
await server.wait_for_termination()
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Run the gRPC server.")
@@ -193,4 +334,4 @@ if __name__ == "__main__":
)
args = parser.parse_args()
serve(args.addr)
asyncio.run(serve(args.addr))

View File

@@ -1,3 +1,7 @@
ifneq (,$(findstring sycl,$(BUILD_TYPE)))
export SKIP_CONDA=1
endif
.PHONY: ttsvalle
ttsvalle:
$(MAKE) -C ../common-env/transformers

View File

File diff suppressed because one or more lines are too long

View File

@@ -2,13 +2,16 @@
##
## A bash script installs the required dependencies of VALL-E-X and prepares the environment
export PATH=$PATH:/opt/conda/bin
export SHA=3faaf8ccadb154d63b38070caf518ce9309ea0f4
# Activate conda environment
source activate transformers
SKIP_CONDA=${SKIP_CONDA:-0}
echo $CONDA_PREFIX
if [ $SKIP_CONDA -ne 1 ]; then
source activate transformers
else
export PATH=$PATH:/opt/conda/bin
CONDA_PREFIX=$PWD
fi
git clone https://github.com/Plachtaa/VALL-E-X.git $CONDA_PREFIX/vall-e-x && pushd $CONDA_PREFIX/vall-e-x && git checkout -b build $SHA && popd

View File

@@ -79,7 +79,7 @@ dependencies:
- pypinyin==0.49.0
- python-multipart==0.0.6
- regex==2023.10.3
- safetensors==0.4.0
- safetensors>=0.4.0
- semantic-version==2.10.0
- soundfile==0.12.1
- starlette==0.27.0

View File

File diff suppressed because one or more lines are too long

View File

@@ -1,6 +1,6 @@
#!/usr/bin/env python3
import asyncio
from concurrent import futures
import time
import argparse
import signal
import sys
@@ -10,7 +10,10 @@ import backend_pb2
import backend_pb2_grpc
import grpc
from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.sampling_params import SamplingParams
from vllm.utils import random_uuid
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
@@ -79,16 +82,30 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
Returns:
backend_pb2.Result: The load model result.
"""
engine_args = AsyncEngineArgs(
model=request.Model,
)
if request.Quantization != "":
engine_args.quantization = request.Quantization
if request.GPUMemoryUtilization != 0:
engine_args.gpu_memory_utilization = request.GPUMemoryUtilization
if request.TrustRemoteCode:
engine_args.trust_remote_code = request.TrustRemoteCode
if request.EnforceEager:
engine_args.enforce_eager = request.EnforceEager
if request.SwapSpace != 0:
engine_args.swap_space = request.SwapSpace
if request.MaxModelLen != 0:
engine_args.max_model_len = request.MaxModelLen
try:
if request.Quantization != "":
self.llm = LLM(model=request.Model, quantization=request.Quantization)
else:
self.llm = LLM(model=request.Model)
self.llm = AsyncLLMEngine.from_engine_args(engine_args)
except Exception as err:
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
return backend_pb2.Result(message="Model loaded successfully", success=True)
def Predict(self, request, context):
async def Predict(self, request, context):
"""
Generates text based on the given prompt and sampling parameters.
@@ -99,24 +116,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
Returns:
backend_pb2.Reply: The predict result.
"""
if request.TopP == 0:
request.TopP = 0.9
gen = self._predict(request, context, streaming=False)
res = await gen.__anext__()
return res
max_tokens = 200
if request.Tokens > 0:
max_tokens = request.Tokens
sampling_params = SamplingParams(max_tokens=max_tokens, temperature=request.Temperature, top_p=request.TopP)
outputs = self.llm.generate([request.Prompt], sampling_params)
generated_text = outputs[0].outputs[0].text
# Remove prompt from response if present
if request.Prompt in generated_text:
generated_text = generated_text.replace(request.Prompt, "")
return backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8'))
def PredictStream(self, request, context):
async def PredictStream(self, request, context):
"""
Generates text based on the given prompt and sampling parameters, and streams the results.
@@ -127,30 +131,84 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
Returns:
backend_pb2.Result: The predict stream result.
"""
yield self.Predict(request, context)
iterations = self._predict(request, context, streaming=True)
try:
async for iteration in iterations:
yield iteration
finally:
await iterations.aclose()
def serve(address):
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
async def _predict(self, request, context, streaming=False):
# Build sampling parameters
sampling_params = SamplingParams(top_p=0.9, max_tokens=200)
if request.TopP != 0:
sampling_params.top_p = request.TopP
if request.Tokens > 0:
sampling_params.max_tokens = request.Tokens
if request.Temperature != 0:
sampling_params.temperature = request.Temperature
if request.TopK != 0:
sampling_params.top_k = request.TopK
if request.PresencePenalty != 0:
sampling_params.presence_penalty = request.PresencePenalty
if request.FrequencyPenalty != 0:
sampling_params.frequency_penalty = request.FrequencyPenalty
if request.StopPrompts:
sampling_params.stop = request.StopPrompts
if request.IgnoreEOS:
sampling_params.ignore_eos = request.IgnoreEOS
if request.Seed != 0:
sampling_params.seed = request.Seed
# Generate text
request_id = random_uuid()
outputs = self.llm.generate(request.Prompt, sampling_params, request_id)
# Stream the results
generated_text = ""
try:
async for request_output in outputs:
iteration_text = request_output.outputs[0].text
if streaming:
# Remove text already sent as vllm concatenates the text from previous yields
delta_iteration_text = iteration_text.removeprefix(generated_text)
# Send the partial result
yield backend_pb2.Reply(message=bytes(delta_iteration_text, encoding='utf-8'))
# Keep track of text generated
generated_text = iteration_text
finally:
await outputs.aclose()
# If streaming, we already sent everything
if streaming:
return
# Sending the final generated text
yield backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8'))
async def serve(address):
# Start asyncio gRPC server
server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
# Add the servicer to the server
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
# Bind the server to the address
server.add_insecure_port(address)
server.start()
# Gracefully shutdown the server on SIGTERM or SIGINT
loop = asyncio.get_event_loop()
for sig in (signal.SIGINT, signal.SIGTERM):
loop.add_signal_handler(
sig, lambda: asyncio.ensure_future(server.stop(5))
)
# Start the server
await server.start()
print("Server started. Listening on: " + address, file=sys.stderr)
# Define the signal handler function
def signal_handler(sig, frame):
print("Received termination signal. Shutting down...")
server.stop(0)
sys.exit(0)
# Set the signal handlers for SIGINT and SIGTERM
signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)
try:
while True:
time.sleep(_ONE_DAY_IN_SECONDS)
except KeyboardInterrupt:
server.stop(0)
# Wait for the server to be terminated
await server.wait_for_termination()
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Run the gRPC server.")
@@ -159,4 +217,4 @@ if __name__ == "__main__":
)
args = parser.parse_args()
serve(args.addr)
asyncio.run(serve(args.addr))

0
configuration/.keep Normal file
View File

View File

@@ -3,36 +3,32 @@ package backend
import (
"fmt"
config "github.com/go-skynet/LocalAI/core/config"
"github.com/go-skynet/LocalAI/core/options"
"github.com/go-skynet/LocalAI/core/config"
"github.com/go-skynet/LocalAI/pkg/grpc"
model "github.com/go-skynet/LocalAI/pkg/model"
)
func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, c config.Config, o *options.Option) (func() ([]float32, error), error) {
if !c.Embeddings {
return nil, fmt.Errorf("endpoint disabled for this model by API configuration")
}
func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (func() ([]float32, error), error) {
modelFile := backendConfig.Model
modelFile := c.Model
grpcOpts := gRPCModelOpts(c)
grpcOpts := gRPCModelOpts(backendConfig)
var inferenceModel interface{}
var err error
opts := modelOpts(c, o, []model.Option{
opts := modelOpts(backendConfig, appConfig, []model.Option{
model.WithLoadGRPCLoadModelOpts(grpcOpts),
model.WithThreads(uint32(c.Threads)),
model.WithAssetDir(o.AssetsDestination),
model.WithThreads(uint32(*backendConfig.Threads)),
model.WithAssetDir(appConfig.AssetsDestination),
model.WithModel(modelFile),
model.WithContext(o.Context),
model.WithContext(appConfig.Context),
})
if c.Backend == "" {
if backendConfig.Backend == "" {
inferenceModel, err = loader.GreedyLoader(opts...)
} else {
opts = append(opts, model.WithBackendString(c.Backend))
opts = append(opts, model.WithBackendString(backendConfig.Backend))
inferenceModel, err = loader.BackendLoader(opts...)
}
if err != nil {
@@ -43,7 +39,7 @@ func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, c config.
switch model := inferenceModel.(type) {
case grpc.Backend:
fn = func() ([]float32, error) {
predictOptions := gRPCPredictOpts(c, loader.ModelPath)
predictOptions := gRPCPredictOpts(backendConfig, loader.ModelPath)
if len(tokens) > 0 {
embeds := []int32{}
@@ -52,7 +48,7 @@ func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, c config.
}
predictOptions.EmbeddingTokens = embeds
res, err := model.Embeddings(o.Context, predictOptions)
res, err := model.Embeddings(appConfig.Context, predictOptions)
if err != nil {
return nil, err
}
@@ -61,7 +57,7 @@ func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, c config.
}
predictOptions.Embeddings = s
res, err := model.Embeddings(o.Context, predictOptions)
res, err := model.Embeddings(appConfig.Context, predictOptions)
if err != nil {
return nil, err
}

View File

@@ -1,34 +1,25 @@
package backend
import (
config "github.com/go-skynet/LocalAI/core/config"
"github.com/go-skynet/LocalAI/core/options"
"github.com/go-skynet/LocalAI/core/config"
"github.com/go-skynet/LocalAI/pkg/grpc/proto"
model "github.com/go-skynet/LocalAI/pkg/model"
)
func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negative_prompt, src, dst string, loader *model.ModelLoader, c config.Config, o *options.Option) (func() error, error) {
opts := modelOpts(c, o, []model.Option{
model.WithBackendString(c.Backend),
model.WithAssetDir(o.AssetsDestination),
model.WithThreads(uint32(c.Threads)),
model.WithContext(o.Context),
model.WithModel(c.Model),
model.WithLoadGRPCLoadModelOpts(&proto.ModelOptions{
CUDA: c.CUDA || c.Diffusers.CUDA,
SchedulerType: c.Diffusers.SchedulerType,
PipelineType: c.Diffusers.PipelineType,
CFGScale: c.Diffusers.CFGScale,
LoraAdapter: c.LoraAdapter,
LoraScale: c.LoraScale,
LoraBase: c.LoraBase,
IMG2IMG: c.Diffusers.IMG2IMG,
CLIPModel: c.Diffusers.ClipModel,
CLIPSubfolder: c.Diffusers.ClipSubFolder,
CLIPSkip: int32(c.Diffusers.ClipSkip),
ControlNet: c.Diffusers.ControlNet,
}),
func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negative_prompt, src, dst string, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (func() error, error) {
threads := backendConfig.Threads
if *threads == 0 && appConfig.Threads != 0 {
threads = &appConfig.Threads
}
gRPCOpts := gRPCModelOpts(backendConfig)
opts := modelOpts(backendConfig, appConfig, []model.Option{
model.WithBackendString(backendConfig.Backend),
model.WithAssetDir(appConfig.AssetsDestination),
model.WithThreads(uint32(*threads)),
model.WithContext(appConfig.Context),
model.WithModel(backendConfig.Model),
model.WithLoadGRPCLoadModelOpts(gRPCOpts),
})
inferenceModel, err := loader.BackendLoader(
@@ -40,19 +31,19 @@ func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negat
fn := func() error {
_, err := inferenceModel.GenerateImage(
o.Context,
appConfig.Context,
&proto.GenerateImageRequest{
Height: int32(height),
Width: int32(width),
Mode: int32(mode),
Step: int32(step),
Seed: int32(seed),
CLIPSkip: int32(c.Diffusers.ClipSkip),
CLIPSkip: int32(backendConfig.Diffusers.ClipSkip),
PositivePrompt: positive_prompt,
NegativePrompt: negative_prompt,
Dst: dst,
Src: src,
EnableParameters: c.Diffusers.EnableParameters,
EnableParameters: backendConfig.Diffusers.EnableParameters,
})
return err
}

View File

@@ -8,8 +8,8 @@ import (
"sync"
"unicode/utf8"
config "github.com/go-skynet/LocalAI/core/config"
"github.com/go-skynet/LocalAI/core/options"
"github.com/go-skynet/LocalAI/core/config"
"github.com/go-skynet/LocalAI/pkg/gallery"
"github.com/go-skynet/LocalAI/pkg/grpc"
model "github.com/go-skynet/LocalAI/pkg/model"
@@ -26,9 +26,12 @@ type TokenUsage struct {
Completion int
}
func ModelInference(ctx context.Context, s string, images []string, loader *model.ModelLoader, c config.Config, o *options.Option, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
func ModelInference(ctx context.Context, s string, images []string, loader *model.ModelLoader, c config.BackendConfig, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
modelFile := c.Model
threads := c.Threads
if *threads == 0 && o.Threads != 0 {
threads = &o.Threads
}
grpcOpts := gRPCModelOpts(c)
var inferenceModel grpc.Backend
@@ -36,7 +39,7 @@ func ModelInference(ctx context.Context, s string, images []string, loader *mode
opts := modelOpts(c, o, []model.Option{
model.WithLoadGRPCLoadModelOpts(grpcOpts),
model.WithThreads(uint32(c.Threads)), // some models uses this to allocate threads during startup
model.WithThreads(uint32(*threads)), // some models uses this to allocate threads during startup
model.WithAssetDir(o.AssetsDestination),
model.WithModel(modelFile),
model.WithContext(o.Context),
@@ -140,7 +143,7 @@ func ModelInference(ctx context.Context, s string, images []string, loader *mode
var cutstrings map[string]*regexp.Regexp = make(map[string]*regexp.Regexp)
var mu sync.Mutex = sync.Mutex{}
func Finetune(config config.Config, input, prediction string) string {
func Finetune(config config.BackendConfig, input, prediction string) string {
if config.Echo {
prediction = input + prediction
}

View File

@@ -1,22 +1,21 @@
package backend
import (
"math/rand"
"os"
"path/filepath"
"github.com/go-skynet/LocalAI/core/config"
pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
model "github.com/go-skynet/LocalAI/pkg/model"
config "github.com/go-skynet/LocalAI/core/config"
"github.com/go-skynet/LocalAI/core/options"
)
func modelOpts(c config.Config, o *options.Option, opts []model.Option) []model.Option {
if o.SingleBackend {
func modelOpts(c config.BackendConfig, so *config.ApplicationConfig, opts []model.Option) []model.Option {
if so.SingleBackend {
opts = append(opts, model.WithSingleActiveBackend())
}
if o.ParallelBackendRequests {
if so.ParallelBackendRequests {
opts = append(opts, model.EnableParallelRequests)
}
@@ -28,52 +27,73 @@ func modelOpts(c config.Config, o *options.Option, opts []model.Option) []model.
opts = append(opts, model.WithGRPCAttemptsDelay(c.GRPC.AttemptsSleepTime))
}
for k, v := range o.ExternalGRPCBackends {
for k, v := range so.ExternalGRPCBackends {
opts = append(opts, model.WithExternalBackend(k, v))
}
return opts
}
func gRPCModelOpts(c config.Config) *pb.ModelOptions {
func getSeed(c config.BackendConfig) int32 {
seed := int32(*c.Seed)
if seed == config.RAND_SEED {
seed = rand.Int31()
}
return seed
}
func gRPCModelOpts(c config.BackendConfig) *pb.ModelOptions {
b := 512
if c.Batch != 0 {
b = c.Batch
}
return &pb.ModelOptions{
ContextSize: int32(c.ContextSize),
Seed: int32(c.Seed),
NBatch: int32(b),
NoMulMatQ: c.NoMulMatQ,
CUDA: c.CUDA, // diffusers, transformers
DraftModel: c.DraftModel,
AudioPath: c.VallE.AudioPath,
Quantization: c.Quantization,
MMProj: c.MMProj,
YarnExtFactor: c.YarnExtFactor,
YarnAttnFactor: c.YarnAttnFactor,
YarnBetaFast: c.YarnBetaFast,
YarnBetaSlow: c.YarnBetaSlow,
LoraAdapter: c.LoraAdapter,
LoraBase: c.LoraBase,
LoraScale: c.LoraScale,
NGQA: c.NGQA,
RMSNormEps: c.RMSNormEps,
F16Memory: c.F16,
MLock: c.MMlock,
RopeFreqBase: c.RopeFreqBase,
RopeScaling: c.RopeScaling,
Type: c.ModelType,
RopeFreqScale: c.RopeFreqScale,
NUMA: c.NUMA,
Embeddings: c.Embeddings,
LowVRAM: c.LowVRAM,
NGPULayers: int32(c.NGPULayers),
MMap: c.MMap,
MainGPU: c.MainGPU,
Threads: int32(c.Threads),
TensorSplit: c.TensorSplit,
CUDA: c.CUDA || c.Diffusers.CUDA,
SchedulerType: c.Diffusers.SchedulerType,
PipelineType: c.Diffusers.PipelineType,
CFGScale: c.Diffusers.CFGScale,
LoraAdapter: c.LoraAdapter,
LoraScale: c.LoraScale,
F16Memory: *c.F16,
LoraBase: c.LoraBase,
IMG2IMG: c.Diffusers.IMG2IMG,
CLIPModel: c.Diffusers.ClipModel,
CLIPSubfolder: c.Diffusers.ClipSubFolder,
CLIPSkip: int32(c.Diffusers.ClipSkip),
ControlNet: c.Diffusers.ControlNet,
ContextSize: int32(*c.ContextSize),
Seed: getSeed(c),
NBatch: int32(b),
NoMulMatQ: c.NoMulMatQ,
DraftModel: c.DraftModel,
AudioPath: c.VallE.AudioPath,
Quantization: c.Quantization,
GPUMemoryUtilization: c.GPUMemoryUtilization,
TrustRemoteCode: c.TrustRemoteCode,
EnforceEager: c.EnforceEager,
SwapSpace: int32(c.SwapSpace),
MaxModelLen: int32(c.MaxModelLen),
MMProj: c.MMProj,
YarnExtFactor: c.YarnExtFactor,
YarnAttnFactor: c.YarnAttnFactor,
YarnBetaFast: c.YarnBetaFast,
YarnBetaSlow: c.YarnBetaSlow,
NGQA: c.NGQA,
RMSNormEps: c.RMSNormEps,
MLock: *c.MMlock,
RopeFreqBase: c.RopeFreqBase,
RopeScaling: c.RopeScaling,
Type: c.ModelType,
RopeFreqScale: c.RopeFreqScale,
NUMA: c.NUMA,
Embeddings: c.Embeddings,
LowVRAM: *c.LowVRAM,
NGPULayers: int32(*c.NGPULayers),
MMap: *c.MMap,
MainGPU: c.MainGPU,
Threads: int32(*c.Threads),
TensorSplit: c.TensorSplit,
// AutoGPTQ
ModelBaseName: c.AutoGPTQ.ModelBaseName,
Device: c.AutoGPTQ.Device,
@@ -84,46 +104,47 @@ func gRPCModelOpts(c config.Config) *pb.ModelOptions {
}
}
func gRPCPredictOpts(c config.Config, modelPath string) *pb.PredictOptions {
func gRPCPredictOpts(c config.BackendConfig, modelPath string) *pb.PredictOptions {
promptCachePath := ""
if c.PromptCachePath != "" {
p := filepath.Join(modelPath, c.PromptCachePath)
os.MkdirAll(filepath.Dir(p), 0755)
promptCachePath = p
}
return &pb.PredictOptions{
Temperature: float32(c.Temperature),
TopP: float32(c.TopP),
Temperature: float32(*c.Temperature),
TopP: float32(*c.TopP),
NDraft: c.NDraft,
TopK: int32(c.TopK),
Tokens: int32(c.Maxtokens),
Threads: int32(c.Threads),
TopK: int32(*c.TopK),
Tokens: int32(*c.Maxtokens),
Threads: int32(*c.Threads),
PromptCacheAll: c.PromptCacheAll,
PromptCacheRO: c.PromptCacheRO,
PromptCachePath: promptCachePath,
F16KV: c.F16,
DebugMode: c.Debug,
F16KV: *c.F16,
DebugMode: *c.Debug,
Grammar: c.Grammar,
NegativePromptScale: c.NegativePromptScale,
RopeFreqBase: c.RopeFreqBase,
RopeFreqScale: c.RopeFreqScale,
NegativePrompt: c.NegativePrompt,
Mirostat: int32(c.LLMConfig.Mirostat),
MirostatETA: float32(c.LLMConfig.MirostatETA),
MirostatTAU: float32(c.LLMConfig.MirostatTAU),
Debug: c.Debug,
Mirostat: int32(*c.LLMConfig.Mirostat),
MirostatETA: float32(*c.LLMConfig.MirostatETA),
MirostatTAU: float32(*c.LLMConfig.MirostatTAU),
Debug: *c.Debug,
StopPrompts: c.StopWords,
Repeat: int32(c.RepeatPenalty),
NKeep: int32(c.Keep),
Batch: int32(c.Batch),
IgnoreEOS: c.IgnoreEOS,
Seed: int32(c.Seed),
Seed: getSeed(c),
FrequencyPenalty: float32(c.FrequencyPenalty),
MLock: c.MMlock,
MMap: c.MMap,
MLock: *c.MMlock,
MMap: *c.MMap,
MainGPU: c.MainGPU,
TensorSplit: c.TensorSplit,
TailFreeSamplingZ: float32(c.TFZ),
TypicalP: float32(c.TypicalP),
TailFreeSamplingZ: float32(*c.TFZ),
TypicalP: float32(*c.TypicalP),
}
}

23
core/backend/stores.go Normal file
View File

@@ -0,0 +1,23 @@
package backend
import (
"github.com/go-skynet/LocalAI/core/config"
"github.com/go-skynet/LocalAI/pkg/grpc"
"github.com/go-skynet/LocalAI/pkg/model"
)
func StoreBackend(sl *model.ModelLoader, appConfig *config.ApplicationConfig, storeName string) (grpc.Backend, error) {
if storeName == "" {
storeName = "default"
}
sc := []model.Option{
model.WithBackendString(model.LocalStoreBackend),
model.WithAssetDir(appConfig.AssetsDestination),
model.WithModel(storeName),
}
return sl.BackendLoader(sc...)
}

View File

@@ -4,25 +4,24 @@ import (
"context"
"fmt"
config "github.com/go-skynet/LocalAI/core/config"
"github.com/go-skynet/LocalAI/core/config"
"github.com/go-skynet/LocalAI/core/schema"
"github.com/go-skynet/LocalAI/core/options"
"github.com/go-skynet/LocalAI/pkg/grpc/proto"
model "github.com/go-skynet/LocalAI/pkg/model"
)
func ModelTranscription(audio, language string, loader *model.ModelLoader, c config.Config, o *options.Option) (*schema.Result, error) {
func ModelTranscription(audio, language string, ml *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (*schema.Result, error) {
opts := modelOpts(c, o, []model.Option{
opts := modelOpts(backendConfig, appConfig, []model.Option{
model.WithBackendString(model.WhisperBackend),
model.WithModel(c.Model),
model.WithContext(o.Context),
model.WithThreads(uint32(c.Threads)),
model.WithAssetDir(o.AssetsDestination),
model.WithModel(backendConfig.Model),
model.WithContext(appConfig.Context),
model.WithThreads(uint32(*backendConfig.Threads)),
model.WithAssetDir(appConfig.AssetsDestination),
})
whisperModel, err := o.Loader.BackendLoader(opts...)
whisperModel, err := ml.BackendLoader(opts...)
if err != nil {
return nil, err
}
@@ -34,6 +33,6 @@ func ModelTranscription(audio, language string, loader *model.ModelLoader, c con
return whisperModel.AudioTranscription(context.Background(), &proto.TranscriptRequest{
Dst: audio,
Language: language,
Threads: uint32(c.Threads),
Threads: uint32(*backendConfig.Threads),
})
}

View File

@@ -6,8 +6,8 @@ import (
"os"
"path/filepath"
config "github.com/go-skynet/LocalAI/core/config"
"github.com/go-skynet/LocalAI/core/options"
"github.com/go-skynet/LocalAI/core/config"
"github.com/go-skynet/LocalAI/pkg/grpc/proto"
model "github.com/go-skynet/LocalAI/pkg/model"
"github.com/go-skynet/LocalAI/pkg/utils"
@@ -29,53 +29,59 @@ func generateUniqueFileName(dir, baseName, ext string) string {
}
}
func ModelTTS(backend, text, modelFile string, loader *model.ModelLoader, o *options.Option, c config.Config) (string, *proto.Result, error) {
func ModelTTS(backend, text, modelFile, voice string, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (string, *proto.Result, error) {
bb := backend
if bb == "" {
bb = model.PiperBackend
}
grpcOpts := gRPCModelOpts(c)
grpcOpts := gRPCModelOpts(backendConfig)
opts := modelOpts(config.Config{}, o, []model.Option{
opts := modelOpts(config.BackendConfig{}, appConfig, []model.Option{
model.WithBackendString(bb),
model.WithModel(modelFile),
model.WithContext(o.Context),
model.WithAssetDir(o.AssetsDestination),
model.WithContext(appConfig.Context),
model.WithAssetDir(appConfig.AssetsDestination),
model.WithLoadGRPCLoadModelOpts(grpcOpts),
})
piperModel, err := o.Loader.BackendLoader(opts...)
ttsModel, err := loader.BackendLoader(opts...)
if err != nil {
return "", nil, err
}
if piperModel == nil {
if ttsModel == nil {
return "", nil, fmt.Errorf("could not load piper model")
}
if err := os.MkdirAll(o.AudioDir, 0755); err != nil {
if err := os.MkdirAll(appConfig.AudioDir, 0755); err != nil {
return "", nil, fmt.Errorf("failed creating audio directory: %s", err)
}
fileName := generateUniqueFileName(o.AudioDir, "piper", ".wav")
filePath := filepath.Join(o.AudioDir, fileName)
fileName := generateUniqueFileName(appConfig.AudioDir, "tts", ".wav")
filePath := filepath.Join(appConfig.AudioDir, fileName)
// If the model file is not empty, we pass it joined with the model path
modelPath := ""
if modelFile != "" {
if bb != model.TransformersMusicGen {
modelPath = filepath.Join(o.Loader.ModelPath, modelFile)
if err := utils.VerifyPath(modelPath, o.Loader.ModelPath); err != nil {
// If the model file is not empty, we pass it joined with the model path
// Checking first that it exists and is not outside ModelPath
// TODO: we should actually first check if the modelFile is looking like
// a FS path
mp := filepath.Join(loader.ModelPath, modelFile)
if _, err := os.Stat(mp); err == nil {
if err := utils.VerifyPath(mp, appConfig.ModelPath); err != nil {
return "", nil, err
}
modelPath = mp
} else {
modelPath = modelFile
}
}
res, err := piperModel.TTS(context.Background(), &proto.TTSRequest{
res, err := ttsModel.TTS(context.Background(), &proto.TTSRequest{
Text: text,
Model: modelPath,
Voice: voice,
Dst: filePath,
})

View File

@@ -1,4 +1,4 @@
package options
package config
import (
"context"
@@ -6,28 +6,27 @@ import (
"encoding/json"
"time"
"github.com/go-skynet/LocalAI/metrics"
"github.com/go-skynet/LocalAI/pkg/gallery"
model "github.com/go-skynet/LocalAI/pkg/model"
"github.com/rs/zerolog/log"
)
type Option struct {
type ApplicationConfig struct {
Context context.Context
ConfigFile string
Loader *model.ModelLoader
ModelPath string
UploadLimitMB, Threads, ContextSize int
DisableWelcomePage bool
F16 bool
Debug, DisableMessage bool
ImageDir string
AudioDir string
UploadDir string
ConfigsDir string
CORS bool
PreloadJSONModels string
PreloadModelsFromPath string
CORSAllowOrigins string
ApiKeys []string
Metrics *metrics.Metrics
ModelLibraryURL string
@@ -52,10 +51,10 @@ type Option struct {
WatchDogBusyTimeout, WatchDogIdleTimeout time.Duration
}
type AppOption func(*Option)
type AppOption func(*ApplicationConfig)
func NewOptions(o ...AppOption) *Option {
opt := &Option{
func NewApplicationConfig(o ...AppOption) *ApplicationConfig {
opt := &ApplicationConfig{
Context: context.Background(),
UploadLimitMB: 15,
Threads: 1,
@@ -70,63 +69,73 @@ func NewOptions(o ...AppOption) *Option {
}
func WithModelsURL(urls ...string) AppOption {
return func(o *Option) {
return func(o *ApplicationConfig) {
o.ModelsURL = urls
}
}
func WithModelPath(path string) AppOption {
return func(o *ApplicationConfig) {
o.ModelPath = path
}
}
func WithCors(b bool) AppOption {
return func(o *Option) {
return func(o *ApplicationConfig) {
o.CORS = b
}
}
func WithModelLibraryURL(url string) AppOption {
return func(o *Option) {
return func(o *ApplicationConfig) {
o.ModelLibraryURL = url
}
}
var EnableWatchDog = func(o *Option) {
var EnableWatchDog = func(o *ApplicationConfig) {
o.WatchDog = true
}
var EnableWatchDogIdleCheck = func(o *Option) {
var EnableWatchDogIdleCheck = func(o *ApplicationConfig) {
o.WatchDog = true
o.WatchDogIdle = true
}
var EnableWatchDogBusyCheck = func(o *Option) {
var EnableWatchDogBusyCheck = func(o *ApplicationConfig) {
o.WatchDog = true
o.WatchDogBusy = true
}
var DisableWelcomePage = func(o *ApplicationConfig) {
o.DisableWelcomePage = true
}
func SetWatchDogBusyTimeout(t time.Duration) AppOption {
return func(o *Option) {
return func(o *ApplicationConfig) {
o.WatchDogBusyTimeout = t
}
}
func SetWatchDogIdleTimeout(t time.Duration) AppOption {
return func(o *Option) {
return func(o *ApplicationConfig) {
o.WatchDogIdleTimeout = t
}
}
var EnableSingleBackend = func(o *Option) {
var EnableSingleBackend = func(o *ApplicationConfig) {
o.SingleBackend = true
}
var EnableParallelBackendRequests = func(o *Option) {
var EnableParallelBackendRequests = func(o *ApplicationConfig) {
o.ParallelBackendRequests = true
}
var EnableGalleriesAutoload = func(o *Option) {
var EnableGalleriesAutoload = func(o *ApplicationConfig) {
o.AutoloadGalleries = true
}
func WithExternalBackend(name string, uri string) AppOption {
return func(o *Option) {
return func(o *ApplicationConfig) {
if o.ExternalGRPCBackends == nil {
o.ExternalGRPCBackends = make(map[string]string)
}
@@ -135,135 +144,149 @@ func WithExternalBackend(name string, uri string) AppOption {
}
func WithCorsAllowOrigins(b string) AppOption {
return func(o *Option) {
return func(o *ApplicationConfig) {
o.CORSAllowOrigins = b
}
}
func WithBackendAssetsOutput(out string) AppOption {
return func(o *Option) {
return func(o *ApplicationConfig) {
o.AssetsDestination = out
}
}
func WithBackendAssets(f embed.FS) AppOption {
return func(o *Option) {
return func(o *ApplicationConfig) {
o.BackendAssets = f
}
}
func WithStringGalleries(galls string) AppOption {
return func(o *Option) {
return func(o *ApplicationConfig) {
if galls == "" {
log.Debug().Msgf("no galleries to load")
o.Galleries = []gallery.Gallery{}
return
}
var galleries []gallery.Gallery
if err := json.Unmarshal([]byte(galls), &galleries); err != nil {
log.Error().Msgf("failed loading galleries: %s", err.Error())
log.Error().Err(err).Msg("failed loading galleries")
}
o.Galleries = append(o.Galleries, galleries...)
}
}
func WithGalleries(galleries []gallery.Gallery) AppOption {
return func(o *Option) {
return func(o *ApplicationConfig) {
o.Galleries = append(o.Galleries, galleries...)
}
}
func WithContext(ctx context.Context) AppOption {
return func(o *Option) {
return func(o *ApplicationConfig) {
o.Context = ctx
}
}
func WithYAMLConfigPreload(configFile string) AppOption {
return func(o *Option) {
return func(o *ApplicationConfig) {
o.PreloadModelsFromPath = configFile
}
}
func WithJSONStringPreload(configFile string) AppOption {
return func(o *Option) {
return func(o *ApplicationConfig) {
o.PreloadJSONModels = configFile
}
}
func WithConfigFile(configFile string) AppOption {
return func(o *Option) {
return func(o *ApplicationConfig) {
o.ConfigFile = configFile
}
}
func WithModelLoader(loader *model.ModelLoader) AppOption {
return func(o *Option) {
o.Loader = loader
}
}
func WithUploadLimitMB(limit int) AppOption {
return func(o *Option) {
return func(o *ApplicationConfig) {
o.UploadLimitMB = limit
}
}
func WithThreads(threads int) AppOption {
return func(o *Option) {
return func(o *ApplicationConfig) {
o.Threads = threads
}
}
func WithContextSize(ctxSize int) AppOption {
return func(o *Option) {
return func(o *ApplicationConfig) {
o.ContextSize = ctxSize
}
}
func WithF16(f16 bool) AppOption {
return func(o *Option) {
return func(o *ApplicationConfig) {
o.F16 = f16
}
}
func WithDebug(debug bool) AppOption {
return func(o *Option) {
return func(o *ApplicationConfig) {
o.Debug = debug
}
}
func WithDisableMessage(disableMessage bool) AppOption {
return func(o *Option) {
return func(o *ApplicationConfig) {
o.DisableMessage = disableMessage
}
}
func WithAudioDir(audioDir string) AppOption {
return func(o *Option) {
return func(o *ApplicationConfig) {
o.AudioDir = audioDir
}
}
func WithImageDir(imageDir string) AppOption {
return func(o *Option) {
return func(o *ApplicationConfig) {
o.ImageDir = imageDir
}
}
func WithUploadDir(uploadDir string) AppOption {
return func(o *Option) {
return func(o *ApplicationConfig) {
o.UploadDir = uploadDir
}
}
func WithConfigsDir(configsDir string) AppOption {
return func(o *ApplicationConfig) {
o.ConfigsDir = configsDir
}
}
func WithApiKeys(apiKeys []string) AppOption {
return func(o *Option) {
return func(o *ApplicationConfig) {
o.ApiKeys = apiKeys
}
}
func WithMetrics(meter *metrics.Metrics) AppOption {
return func(o *Option) {
o.Metrics = meter
// ToConfigLoaderOptions returns a slice of ConfigLoader Option.
// Some options defined at the application level are going to be passed as defaults for
// all the configuration for the models.
// This includes for instance the context size or the number of threads.
// If a model doesn't set configs directly to the config model file
// it will use the defaults defined here.
func (o *ApplicationConfig) ToConfigLoaderOptions() []ConfigLoaderOption {
return []ConfigLoaderOption{
LoadOptionContextSize(o.ContextSize),
LoadOptionDebug(o.Debug),
LoadOptionF16(o.F16),
LoadOptionThreads(o.Threads),
}
}
// func WithMetrics(meter *metrics.Metrics) AppOption {
// return func(o *StartupOptions) {
// o.Metrics = meter
// }
// }

View File

@@ -0,0 +1,597 @@
package config
import (
"errors"
"fmt"
"io/fs"
"os"
"path/filepath"
"sort"
"strings"
"sync"
"github.com/go-skynet/LocalAI/core/schema"
"github.com/go-skynet/LocalAI/pkg/downloader"
"github.com/go-skynet/LocalAI/pkg/utils"
"github.com/rs/zerolog/log"
"gopkg.in/yaml.v3"
"github.com/charmbracelet/glamour"
)
const (
RAND_SEED = -1
)
type BackendConfig struct {
schema.PredictionOptions `yaml:"parameters"`
Name string `yaml:"name"`
F16 *bool `yaml:"f16"`
Threads *int `yaml:"threads"`
Debug *bool `yaml:"debug"`
Roles map[string]string `yaml:"roles"`
Embeddings bool `yaml:"embeddings"`
Backend string `yaml:"backend"`
TemplateConfig TemplateConfig `yaml:"template"`
PromptStrings, InputStrings []string `yaml:"-"`
InputToken [][]int `yaml:"-"`
functionCallString, functionCallNameString string `yaml:"-"`
FunctionsConfig Functions `yaml:"function"`
FeatureFlag FeatureFlag `yaml:"feature_flags"` // Feature Flag registry. We move fast, and features may break on a per model/backend basis. Registry for (usually temporary) flags that indicate aborting something early.
// LLM configs (GPT4ALL, Llama.cpp, ...)
LLMConfig `yaml:",inline"`
// AutoGPTQ specifics
AutoGPTQ AutoGPTQ `yaml:"autogptq"`
// Diffusers
Diffusers Diffusers `yaml:"diffusers"`
Step int `yaml:"step"`
// GRPC Options
GRPC GRPC `yaml:"grpc"`
// Vall-e-x
VallE VallE `yaml:"vall-e"`
// CUDA
// Explicitly enable CUDA or not (some backends might need it)
CUDA bool `yaml:"cuda"`
DownloadFiles []File `yaml:"download_files"`
Description string `yaml:"description"`
Usage string `yaml:"usage"`
}
type File struct {
Filename string `yaml:"filename" json:"filename"`
SHA256 string `yaml:"sha256" json:"sha256"`
URI string `yaml:"uri" json:"uri"`
}
type VallE struct {
AudioPath string `yaml:"audio_path"`
}
type FeatureFlag map[string]*bool
func (ff FeatureFlag) Enabled(s string) bool {
v, exist := ff[s]
return exist && v != nil && *v
}
type GRPC struct {
Attempts int `yaml:"attempts"`
AttemptsSleepTime int `yaml:"attempts_sleep_time"`
}
type Diffusers struct {
CUDA bool `yaml:"cuda"`
PipelineType string `yaml:"pipeline_type"`
SchedulerType string `yaml:"scheduler_type"`
EnableParameters string `yaml:"enable_parameters"` // A list of comma separated parameters to specify
CFGScale float32 `yaml:"cfg_scale"` // Classifier-Free Guidance Scale
IMG2IMG bool `yaml:"img2img"` // Image to Image Diffuser
ClipSkip int `yaml:"clip_skip"` // Skip every N frames
ClipModel string `yaml:"clip_model"` // Clip model to use
ClipSubFolder string `yaml:"clip_subfolder"` // Subfolder to use for clip model
ControlNet string `yaml:"control_net"`
}
type LLMConfig struct {
SystemPrompt string `yaml:"system_prompt"`
TensorSplit string `yaml:"tensor_split"`
MainGPU string `yaml:"main_gpu"`
RMSNormEps float32 `yaml:"rms_norm_eps"`
NGQA int32 `yaml:"ngqa"`
PromptCachePath string `yaml:"prompt_cache_path"`
PromptCacheAll bool `yaml:"prompt_cache_all"`
PromptCacheRO bool `yaml:"prompt_cache_ro"`
MirostatETA *float64 `yaml:"mirostat_eta"`
MirostatTAU *float64 `yaml:"mirostat_tau"`
Mirostat *int `yaml:"mirostat"`
NGPULayers *int `yaml:"gpu_layers"`
MMap *bool `yaml:"mmap"`
MMlock *bool `yaml:"mmlock"`
LowVRAM *bool `yaml:"low_vram"`
Grammar string `yaml:"grammar"`
StopWords []string `yaml:"stopwords"`
Cutstrings []string `yaml:"cutstrings"`
TrimSpace []string `yaml:"trimspace"`
TrimSuffix []string `yaml:"trimsuffix"`
ContextSize *int `yaml:"context_size"`
NUMA bool `yaml:"numa"`
LoraAdapter string `yaml:"lora_adapter"`
LoraBase string `yaml:"lora_base"`
LoraScale float32 `yaml:"lora_scale"`
NoMulMatQ bool `yaml:"no_mulmatq"`
DraftModel string `yaml:"draft_model"`
NDraft int32 `yaml:"n_draft"`
Quantization string `yaml:"quantization"`
GPUMemoryUtilization float32 `yaml:"gpu_memory_utilization"` // vLLM
TrustRemoteCode bool `yaml:"trust_remote_code"` // vLLM
EnforceEager bool `yaml:"enforce_eager"` // vLLM
SwapSpace int `yaml:"swap_space"` // vLLM
MaxModelLen int `yaml:"max_model_len"` // vLLM
MMProj string `yaml:"mmproj"`
RopeScaling string `yaml:"rope_scaling"`
ModelType string `yaml:"type"`
YarnExtFactor float32 `yaml:"yarn_ext_factor"`
YarnAttnFactor float32 `yaml:"yarn_attn_factor"`
YarnBetaFast float32 `yaml:"yarn_beta_fast"`
YarnBetaSlow float32 `yaml:"yarn_beta_slow"`
}
type AutoGPTQ struct {
ModelBaseName string `yaml:"model_base_name"`
Device string `yaml:"device"`
Triton bool `yaml:"triton"`
UseFastTokenizer bool `yaml:"use_fast_tokenizer"`
}
type Functions struct {
DisableNoAction bool `yaml:"disable_no_action"`
NoActionFunctionName string `yaml:"no_action_function_name"`
NoActionDescriptionName string `yaml:"no_action_description_name"`
ParallelCalls bool `yaml:"parallel_calls"`
}
type TemplateConfig struct {
Chat string `yaml:"chat"`
ChatMessage string `yaml:"chat_message"`
Completion string `yaml:"completion"`
Edit string `yaml:"edit"`
Functions string `yaml:"function"`
}
func (c *BackendConfig) SetFunctionCallString(s string) {
c.functionCallString = s
}
func (c *BackendConfig) SetFunctionCallNameString(s string) {
c.functionCallNameString = s
}
func (c *BackendConfig) ShouldUseFunctions() bool {
return ((c.functionCallString != "none" || c.functionCallString == "") || c.ShouldCallSpecificFunction())
}
func (c *BackendConfig) ShouldCallSpecificFunction() bool {
return len(c.functionCallNameString) > 0
}
func (c *BackendConfig) FunctionToCall() string {
if c.functionCallNameString != "" &&
c.functionCallNameString != "none" && c.functionCallNameString != "auto" {
return c.functionCallNameString
}
return c.functionCallString
}
func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
lo := &LoadOptions{}
lo.Apply(opts...)
ctx := lo.ctxSize
threads := lo.threads
f16 := lo.f16
debug := lo.debug
// https://github.com/ggerganov/llama.cpp/blob/75cd4c77292034ecec587ecb401366f57338f7c0/common/sampling.h#L22
defaultTopP := 0.95
defaultTopK := 40
defaultTemp := 0.9
defaultMaxTokens := 2048
defaultMirostat := 2
defaultMirostatTAU := 5.0
defaultMirostatETA := 0.1
defaultTypicalP := 1.0
defaultTFZ := 1.0
// Try to offload all GPU layers (if GPU is found)
defaultNGPULayers := 99999999
trueV := true
falseV := false
if cfg.Seed == nil {
// random number generator seed
defaultSeed := RAND_SEED
cfg.Seed = &defaultSeed
}
if cfg.TopK == nil {
cfg.TopK = &defaultTopK
}
if cfg.TypicalP == nil {
cfg.TypicalP = &defaultTypicalP
}
if cfg.TFZ == nil {
cfg.TFZ = &defaultTFZ
}
if cfg.MMap == nil {
// MMap is enabled by default
cfg.MMap = &trueV
}
if cfg.MMlock == nil {
// MMlock is disabled by default
cfg.MMlock = &falseV
}
if cfg.TopP == nil {
cfg.TopP = &defaultTopP
}
if cfg.Temperature == nil {
cfg.Temperature = &defaultTemp
}
if cfg.Maxtokens == nil {
cfg.Maxtokens = &defaultMaxTokens
}
if cfg.Mirostat == nil {
cfg.Mirostat = &defaultMirostat
}
if cfg.MirostatETA == nil {
cfg.MirostatETA = &defaultMirostatETA
}
if cfg.MirostatTAU == nil {
cfg.MirostatTAU = &defaultMirostatTAU
}
if cfg.NGPULayers == nil {
cfg.NGPULayers = &defaultNGPULayers
}
if cfg.LowVRAM == nil {
cfg.LowVRAM = &falseV
}
// Value passed by the top level are treated as default (no implicit defaults)
// defaults are set by the user
if ctx == 0 {
ctx = 1024
}
if cfg.ContextSize == nil {
cfg.ContextSize = &ctx
}
if threads == 0 {
// Threads can't be 0
threads = 4
}
if cfg.Threads == nil {
cfg.Threads = &threads
}
if cfg.F16 == nil {
cfg.F16 = &f16
}
if cfg.Debug == nil {
cfg.Debug = &falseV
}
if debug {
cfg.Debug = &trueV
}
}
////// Config Loader ////////
type BackendConfigLoader struct {
configs map[string]BackendConfig
sync.Mutex
}
type LoadOptions struct {
debug bool
threads, ctxSize int
f16 bool
}
func LoadOptionDebug(debug bool) ConfigLoaderOption {
return func(o *LoadOptions) {
o.debug = debug
}
}
func LoadOptionThreads(threads int) ConfigLoaderOption {
return func(o *LoadOptions) {
o.threads = threads
}
}
func LoadOptionContextSize(ctxSize int) ConfigLoaderOption {
return func(o *LoadOptions) {
o.ctxSize = ctxSize
}
}
func LoadOptionF16(f16 bool) ConfigLoaderOption {
return func(o *LoadOptions) {
o.f16 = f16
}
}
type ConfigLoaderOption func(*LoadOptions)
func (lo *LoadOptions) Apply(options ...ConfigLoaderOption) {
for _, l := range options {
l(lo)
}
}
// Load a config file for a model
func (cl *BackendConfigLoader) LoadBackendConfigFileByName(modelName, modelPath string, opts ...ConfigLoaderOption) (*BackendConfig, error) {
// Load a config file if present after the model name
cfg := &BackendConfig{
PredictionOptions: schema.PredictionOptions{
Model: modelName,
},
}
cfgExisting, exists := cl.GetBackendConfig(modelName)
if exists {
cfg = &cfgExisting
} else {
// Try loading a model config file
modelConfig := filepath.Join(modelPath, modelName+".yaml")
if _, err := os.Stat(modelConfig); err == nil {
if err := cl.LoadBackendConfig(
modelConfig, opts...,
); err != nil {
return nil, fmt.Errorf("failed loading model config (%s) %s", modelConfig, err.Error())
}
cfgExisting, exists = cl.GetBackendConfig(modelName)
if exists {
cfg = &cfgExisting
}
}
}
cfg.SetDefaults(opts...)
return cfg, nil
}
func NewBackendConfigLoader() *BackendConfigLoader {
return &BackendConfigLoader{
configs: make(map[string]BackendConfig),
}
}
func ReadBackendConfigFile(file string, opts ...ConfigLoaderOption) ([]*BackendConfig, error) {
c := &[]*BackendConfig{}
f, err := os.ReadFile(file)
if err != nil {
return nil, fmt.Errorf("cannot read config file: %w", err)
}
if err := yaml.Unmarshal(f, c); err != nil {
return nil, fmt.Errorf("cannot unmarshal config file: %w", err)
}
for _, cc := range *c {
cc.SetDefaults(opts...)
}
return *c, nil
}
func ReadBackendConfig(file string, opts ...ConfigLoaderOption) (*BackendConfig, error) {
lo := &LoadOptions{}
lo.Apply(opts...)
c := &BackendConfig{}
f, err := os.ReadFile(file)
if err != nil {
return nil, fmt.Errorf("cannot read config file: %w", err)
}
if err := yaml.Unmarshal(f, c); err != nil {
return nil, fmt.Errorf("cannot unmarshal config file: %w", err)
}
c.SetDefaults(opts...)
return c, nil
}
func (cm *BackendConfigLoader) LoadBackendConfigFile(file string, opts ...ConfigLoaderOption) error {
cm.Lock()
defer cm.Unlock()
c, err := ReadBackendConfigFile(file, opts...)
if err != nil {
return fmt.Errorf("cannot load config file: %w", err)
}
for _, cc := range c {
cm.configs[cc.Name] = *cc
}
return nil
}
func (cl *BackendConfigLoader) LoadBackendConfig(file string, opts ...ConfigLoaderOption) error {
cl.Lock()
defer cl.Unlock()
c, err := ReadBackendConfig(file, opts...)
if err != nil {
return fmt.Errorf("cannot read config file: %w", err)
}
cl.configs[c.Name] = *c
return nil
}
func (cl *BackendConfigLoader) GetBackendConfig(m string) (BackendConfig, bool) {
cl.Lock()
defer cl.Unlock()
v, exists := cl.configs[m]
return v, exists
}
func (cl *BackendConfigLoader) GetAllBackendConfigs() []BackendConfig {
cl.Lock()
defer cl.Unlock()
var res []BackendConfig
for _, v := range cl.configs {
res = append(res, v)
}
sort.SliceStable(res, func(i, j int) bool {
return res[i].Name < res[j].Name
})
return res
}
func (cl *BackendConfigLoader) ListBackendConfigs() []string {
cl.Lock()
defer cl.Unlock()
var res []string
for k := range cl.configs {
res = append(res, k)
}
return res
}
// Preload prepare models if they are not local but url or huggingface repositories
func (cl *BackendConfigLoader) Preload(modelPath string) error {
cl.Lock()
defer cl.Unlock()
status := func(fileName, current, total string, percent float64) {
utils.DisplayDownloadFunction(fileName, current, total, percent)
}
log.Info().Msgf("Preloading models from %s", modelPath)
renderMode := "dark"
if os.Getenv("COLOR") != "" {
renderMode = os.Getenv("COLOR")
}
glamText := func(t string) {
out, err := glamour.Render(t, renderMode)
if err == nil && os.Getenv("NO_COLOR") == "" {
fmt.Println(out)
} else {
fmt.Println(t)
}
}
for i, config := range cl.configs {
// Download files and verify their SHA
for _, file := range config.DownloadFiles {
log.Debug().Msgf("Checking %q exists and matches SHA", file.Filename)
if err := utils.VerifyPath(file.Filename, modelPath); err != nil {
return err
}
// Create file path
filePath := filepath.Join(modelPath, file.Filename)
if err := downloader.DownloadFile(file.URI, filePath, file.SHA256, status); err != nil {
return err
}
}
modelURL := config.PredictionOptions.Model
modelURL = downloader.ConvertURL(modelURL)
if downloader.LooksLikeURL(modelURL) {
// md5 of model name
md5Name := utils.MD5(modelURL)
// check if file exists
if _, err := os.Stat(filepath.Join(modelPath, md5Name)); errors.Is(err, os.ErrNotExist) {
err := downloader.DownloadFile(modelURL, filepath.Join(modelPath, md5Name), "", status)
if err != nil {
return err
}
}
cc := cl.configs[i]
c := &cc
c.PredictionOptions.Model = md5Name
cl.configs[i] = *c
}
if cl.configs[i].Name != "" {
glamText(fmt.Sprintf("**Model name**: _%s_", cl.configs[i].Name))
}
if cl.configs[i].Description != "" {
//glamText("**Description**")
glamText(cl.configs[i].Description)
}
if cl.configs[i].Usage != "" {
//glamText("**Usage**")
glamText(cl.configs[i].Usage)
}
}
return nil
}
// LoadBackendConfigsFromPath reads all the configurations of the models from a path
// (non-recursive)
func (cm *BackendConfigLoader) LoadBackendConfigsFromPath(path string, opts ...ConfigLoaderOption) error {
cm.Lock()
defer cm.Unlock()
entries, err := os.ReadDir(path)
if err != nil {
return err
}
files := make([]fs.FileInfo, 0, len(entries))
for _, entry := range entries {
info, err := entry.Info()
if err != nil {
return err
}
files = append(files, info)
}
for _, file := range files {
// Skip templates, YAML and .keep files
if !strings.Contains(file.Name(), ".yaml") && !strings.Contains(file.Name(), ".yml") {
continue
}
c, err := ReadBackendConfig(filepath.Join(path, file.Name()), opts...)
if err == nil {
cm.configs[c.Name] = *c
}
}
return nil
}

View File

@@ -1,429 +0,0 @@
package config
import (
"errors"
"fmt"
"io/fs"
"os"
"path/filepath"
"strings"
"sync"
"github.com/go-skynet/LocalAI/pkg/downloader"
"github.com/go-skynet/LocalAI/pkg/utils"
"github.com/rs/zerolog/log"
"gopkg.in/yaml.v3"
)
type Config struct {
PredictionOptions `yaml:"parameters"`
Name string `yaml:"name"`
F16 bool `yaml:"f16"`
Threads int `yaml:"threads"`
Debug bool `yaml:"debug"`
Roles map[string]string `yaml:"roles"`
Embeddings bool `yaml:"embeddings"`
Backend string `yaml:"backend"`
TemplateConfig TemplateConfig `yaml:"template"`
PromptStrings, InputStrings []string `yaml:"-"`
InputToken [][]int `yaml:"-"`
functionCallString, functionCallNameString string `yaml:"-"`
FunctionsConfig Functions `yaml:"function"`
FeatureFlag FeatureFlag `yaml:"feature_flags"` // Feature Flag registry. We move fast, and features may break on a per model/backend basis. Registry for (usually temporary) flags that indicate aborting something early.
// LLM configs (GPT4ALL, Llama.cpp, ...)
LLMConfig `yaml:",inline"`
// AutoGPTQ specifics
AutoGPTQ AutoGPTQ `yaml:"autogptq"`
// Diffusers
Diffusers Diffusers `yaml:"diffusers"`
Step int `yaml:"step"`
// GRPC Options
GRPC GRPC `yaml:"grpc"`
// Vall-e-x
VallE VallE `yaml:"vall-e"`
// CUDA
// Explicitly enable CUDA or not (some backends might need it)
CUDA bool `yaml:"cuda"`
DownloadFiles []File `yaml:"download_files"`
Description string `yaml:"description"`
Usage string `yaml:"usage"`
}
type File struct {
Filename string `yaml:"filename" json:"filename"`
SHA256 string `yaml:"sha256" json:"sha256"`
URI string `yaml:"uri" json:"uri"`
}
type VallE struct {
AudioPath string `yaml:"audio_path"`
}
type FeatureFlag map[string]*bool
func (ff FeatureFlag) Enabled(s string) bool {
v, exist := ff[s]
return exist && v != nil && *v
}
type GRPC struct {
Attempts int `yaml:"attempts"`
AttemptsSleepTime int `yaml:"attempts_sleep_time"`
}
type Diffusers struct {
CUDA bool `yaml:"cuda"`
PipelineType string `yaml:"pipeline_type"`
SchedulerType string `yaml:"scheduler_type"`
EnableParameters string `yaml:"enable_parameters"` // A list of comma separated parameters to specify
CFGScale float32 `yaml:"cfg_scale"` // Classifier-Free Guidance Scale
IMG2IMG bool `yaml:"img2img"` // Image to Image Diffuser
ClipSkip int `yaml:"clip_skip"` // Skip every N frames
ClipModel string `yaml:"clip_model"` // Clip model to use
ClipSubFolder string `yaml:"clip_subfolder"` // Subfolder to use for clip model
ControlNet string `yaml:"control_net"`
}
type LLMConfig struct {
SystemPrompt string `yaml:"system_prompt"`
TensorSplit string `yaml:"tensor_split"`
MainGPU string `yaml:"main_gpu"`
RMSNormEps float32 `yaml:"rms_norm_eps"`
NGQA int32 `yaml:"ngqa"`
PromptCachePath string `yaml:"prompt_cache_path"`
PromptCacheAll bool `yaml:"prompt_cache_all"`
PromptCacheRO bool `yaml:"prompt_cache_ro"`
MirostatETA float64 `yaml:"mirostat_eta"`
MirostatTAU float64 `yaml:"mirostat_tau"`
Mirostat int `yaml:"mirostat"`
NGPULayers int `yaml:"gpu_layers"`
MMap bool `yaml:"mmap"`
MMlock bool `yaml:"mmlock"`
LowVRAM bool `yaml:"low_vram"`
Grammar string `yaml:"grammar"`
StopWords []string `yaml:"stopwords"`
Cutstrings []string `yaml:"cutstrings"`
TrimSpace []string `yaml:"trimspace"`
TrimSuffix []string `yaml:"trimsuffix"`
ContextSize int `yaml:"context_size"`
NUMA bool `yaml:"numa"`
LoraAdapter string `yaml:"lora_adapter"`
LoraBase string `yaml:"lora_base"`
LoraScale float32 `yaml:"lora_scale"`
NoMulMatQ bool `yaml:"no_mulmatq"`
DraftModel string `yaml:"draft_model"`
NDraft int32 `yaml:"n_draft"`
Quantization string `yaml:"quantization"`
MMProj string `yaml:"mmproj"`
RopeScaling string `yaml:"rope_scaling"`
ModelType string `yaml:"type"`
YarnExtFactor float32 `yaml:"yarn_ext_factor"`
YarnAttnFactor float32 `yaml:"yarn_attn_factor"`
YarnBetaFast float32 `yaml:"yarn_beta_fast"`
YarnBetaSlow float32 `yaml:"yarn_beta_slow"`
}
type AutoGPTQ struct {
ModelBaseName string `yaml:"model_base_name"`
Device string `yaml:"device"`
Triton bool `yaml:"triton"`
UseFastTokenizer bool `yaml:"use_fast_tokenizer"`
}
type Functions struct {
DisableNoAction bool `yaml:"disable_no_action"`
NoActionFunctionName string `yaml:"no_action_function_name"`
NoActionDescriptionName string `yaml:"no_action_description_name"`
ParallelCalls bool `yaml:"parallel_calls"`
}
type TemplateConfig struct {
Chat string `yaml:"chat"`
ChatMessage string `yaml:"chat_message"`
Completion string `yaml:"completion"`
Edit string `yaml:"edit"`
Functions string `yaml:"function"`
}
type ConfigLoader struct {
configs map[string]Config
sync.Mutex
}
func (c *Config) SetFunctionCallString(s string) {
c.functionCallString = s
}
func (c *Config) SetFunctionCallNameString(s string) {
c.functionCallNameString = s
}
func (c *Config) ShouldUseFunctions() bool {
return ((c.functionCallString != "none" || c.functionCallString == "") || c.ShouldCallSpecificFunction())
}
func (c *Config) ShouldCallSpecificFunction() bool {
return len(c.functionCallNameString) > 0
}
func (c *Config) FunctionToCall() string {
return c.functionCallNameString
}
// Load a config file for a model
func Load(modelName, modelPath string, cm *ConfigLoader, debug bool, threads, ctx int, f16 bool) (*Config, error) {
// Load a config file if present after the model name
modelConfig := filepath.Join(modelPath, modelName+".yaml")
var cfg *Config
defaults := func() {
cfg = DefaultConfig(modelName)
cfg.ContextSize = ctx
cfg.Threads = threads
cfg.F16 = f16
cfg.Debug = debug
}
cfgExisting, exists := cm.GetConfig(modelName)
if !exists {
if _, err := os.Stat(modelConfig); err == nil {
if err := cm.LoadConfig(modelConfig); err != nil {
return nil, fmt.Errorf("failed loading model config (%s) %s", modelConfig, err.Error())
}
cfgExisting, exists = cm.GetConfig(modelName)
if exists {
cfg = &cfgExisting
} else {
defaults()
}
} else {
defaults()
}
} else {
cfg = &cfgExisting
}
// Set the parameters for the language model prediction
//updateConfig(cfg, input)
// Don't allow 0 as setting
if cfg.Threads == 0 {
if threads != 0 {
cfg.Threads = threads
} else {
cfg.Threads = 4
}
}
// Enforce debug flag if passed from CLI
if debug {
cfg.Debug = true
}
return cfg, nil
}
func defaultPredictOptions(modelFile string) PredictionOptions {
return PredictionOptions{
TopP: 0.7,
TopK: 80,
Maxtokens: 512,
Temperature: 0.9,
Model: modelFile,
}
}
func DefaultConfig(modelFile string) *Config {
return &Config{
PredictionOptions: defaultPredictOptions(modelFile),
}
}
func NewConfigLoader() *ConfigLoader {
return &ConfigLoader{
configs: make(map[string]Config),
}
}
func ReadConfigFile(file string) ([]*Config, error) {
c := &[]*Config{}
f, err := os.ReadFile(file)
if err != nil {
return nil, fmt.Errorf("cannot read config file: %w", err)
}
if err := yaml.Unmarshal(f, c); err != nil {
return nil, fmt.Errorf("cannot unmarshal config file: %w", err)
}
return *c, nil
}
func ReadConfig(file string) (*Config, error) {
c := &Config{}
f, err := os.ReadFile(file)
if err != nil {
return nil, fmt.Errorf("cannot read config file: %w", err)
}
if err := yaml.Unmarshal(f, c); err != nil {
return nil, fmt.Errorf("cannot unmarshal config file: %w", err)
}
return c, nil
}
func (cm *ConfigLoader) LoadConfigFile(file string) error {
cm.Lock()
defer cm.Unlock()
c, err := ReadConfigFile(file)
if err != nil {
return fmt.Errorf("cannot load config file: %w", err)
}
for _, cc := range c {
cm.configs[cc.Name] = *cc
}
return nil
}
func (cm *ConfigLoader) LoadConfig(file string) error {
cm.Lock()
defer cm.Unlock()
c, err := ReadConfig(file)
if err != nil {
return fmt.Errorf("cannot read config file: %w", err)
}
cm.configs[c.Name] = *c
return nil
}
func (cm *ConfigLoader) GetConfig(m string) (Config, bool) {
cm.Lock()
defer cm.Unlock()
v, exists := cm.configs[m]
return v, exists
}
func (cm *ConfigLoader) GetAllConfigs() []Config {
cm.Lock()
defer cm.Unlock()
var res []Config
for _, v := range cm.configs {
res = append(res, v)
}
return res
}
func (cm *ConfigLoader) ListConfigs() []string {
cm.Lock()
defer cm.Unlock()
var res []string
for k := range cm.configs {
res = append(res, k)
}
return res
}
// Preload prepare models if they are not local but url or huggingface repositories
func (cm *ConfigLoader) Preload(modelPath string) error {
cm.Lock()
defer cm.Unlock()
status := func(fileName, current, total string, percent float64) {
utils.DisplayDownloadFunction(fileName, current, total, percent)
}
log.Info().Msgf("Preloading models from %s", modelPath)
for i, config := range cm.configs {
// Download files and verify their SHA
for _, file := range config.DownloadFiles {
log.Debug().Msgf("Checking %q exists and matches SHA", file.Filename)
if err := utils.VerifyPath(file.Filename, modelPath); err != nil {
return err
}
// Create file path
filePath := filepath.Join(modelPath, file.Filename)
if err := downloader.DownloadFile(file.URI, filePath, file.SHA256, status); err != nil {
return err
}
}
modelURL := config.PredictionOptions.Model
modelURL = downloader.ConvertURL(modelURL)
if downloader.LooksLikeURL(modelURL) {
// md5 of model name
md5Name := utils.MD5(modelURL)
// check if file exists
if _, err := os.Stat(filepath.Join(modelPath, md5Name)); errors.Is(err, os.ErrNotExist) {
err := downloader.DownloadFile(modelURL, filepath.Join(modelPath, md5Name), "", status)
if err != nil {
return err
}
}
cc := cm.configs[i]
c := &cc
c.PredictionOptions.Model = md5Name
cm.configs[i] = *c
}
if cm.configs[i].Name != "" {
log.Info().Msgf("Model name: %s", cm.configs[i].Name)
}
if cm.configs[i].Description != "" {
log.Info().Msgf("Model description: %s", cm.configs[i].Description)
}
if cm.configs[i].Usage != "" {
log.Info().Msgf("Model usage: \n%s", cm.configs[i].Usage)
}
}
return nil
}
func (cm *ConfigLoader) LoadConfigs(path string) error {
cm.Lock()
defer cm.Unlock()
entries, err := os.ReadDir(path)
if err != nil {
return err
}
files := make([]fs.FileInfo, 0, len(entries))
for _, entry := range entries {
info, err := entry.Info()
if err != nil {
return err
}
files = append(files, info)
}
for _, file := range files {
// Skip templates, YAML and .keep files
if !strings.Contains(file.Name(), ".yaml") && !strings.Contains(file.Name(), ".yml") {
continue
}
c, err := ReadConfig(filepath.Join(path, file.Name()))
if err == nil {
cm.configs[c.Name] = *c
}
}
return nil
}

Some files were not shown because too many files have changed in this diff Show More