mirror of
https://github.com/mudler/LocalAI.git
synced 2026-02-03 11:13:31 -05:00
Compare commits
509 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
cd4c0b8aa6 | ||
|
|
7437d0c9ca | ||
|
|
029f97c2a2 | ||
|
|
3be71be696 | ||
|
|
6adb019f8f | ||
|
|
fcaa0a2f01 | ||
|
|
fd17a3312c | ||
|
|
12d0fe610b | ||
|
|
11c67d16b8 | ||
|
|
63f7c86c4d | ||
|
|
ac89bf77bf | ||
|
|
0395cc02fb | ||
|
|
616972fca0 | ||
|
|
942fbff62d | ||
|
|
2612a0c910 | ||
|
|
2dcb6d7247 | ||
|
|
6978eec69f | ||
|
|
2fcfe54466 | ||
|
|
4e7506a3be | ||
|
|
2a46217f90 | ||
|
|
31ff9dbd52 | ||
|
|
9483abef03 | ||
|
|
ce3e8b3e31 | ||
|
|
f3bb84c9a7 | ||
|
|
ecb1297582 | ||
|
|
73fc702b3c | ||
|
|
e3af62ae1a | ||
|
|
dc21604741 | ||
|
|
5433f1a70e | ||
|
|
d5e032bdcd | ||
|
|
de786f6586 | ||
|
|
8b9bc4aa6e | ||
|
|
e6cea7d28e | ||
|
|
7d7d56f2ce | ||
|
|
1caae91ab6 | ||
|
|
e90f2cb0ca | ||
|
|
5a4291fadd | ||
|
|
91ef58ee5a | ||
|
|
a86e8c78f1 | ||
|
|
adb24214c6 | ||
|
|
f03a0430aa | ||
|
|
73bc12abc0 | ||
|
|
7fa437bbcc | ||
|
|
4a27c99928 | ||
|
|
6ce94834b6 | ||
|
|
84a26458dc | ||
|
|
7aa377b6a9 | ||
|
|
64e66dda4a | ||
|
|
a085f61fdc | ||
|
|
21bdfe5fa4 | ||
|
|
7ebd7b2454 | ||
|
|
6984749ea1 | ||
|
|
c0a206bc7a | ||
|
|
01bbb31fb3 | ||
|
|
72111c597d | ||
|
|
b2f9fc870b | ||
|
|
1fc6d469ac | ||
|
|
05848b2027 | ||
|
|
1da0644aa3 | ||
|
|
c087cd1377 | ||
|
|
c621412f6a | ||
|
|
5a8b1892cd | ||
|
|
5b20426863 | ||
|
|
5c6cd50ed6 | ||
|
|
bace6516f1 | ||
|
|
3baadf6f27 | ||
|
|
8804c701b8 | ||
|
|
7b3ceb19bb | ||
|
|
e7f3effea1 | ||
|
|
61694a2ffb | ||
|
|
573a3f104c | ||
|
|
0e8af53a5b | ||
|
|
960ffa808c | ||
|
|
92719568e5 | ||
|
|
163939af71 | ||
|
|
399f1241dc | ||
|
|
58c9ade2e8 | ||
|
|
6e1c93d84f | ||
|
|
4076ea0494 | ||
|
|
26cbf77c0d | ||
|
|
640790d628 | ||
|
|
4132adea2f | ||
|
|
2b2d907a3a | ||
|
|
6e8f4f584b | ||
|
|
662cfc2b48 | ||
|
|
a25d355d66 | ||
|
|
6d1cfdbefc | ||
|
|
5ecc478968 | ||
|
|
aef5c4291b | ||
|
|
c059f912b9 | ||
|
|
bc1e059259 | ||
|
|
38dc07793a | ||
|
|
da6ef0967d | ||
|
|
7a011e60bd | ||
|
|
e13dd5b09f | ||
|
|
86ee303bd6 | ||
|
|
978ee96fd3 | ||
|
|
3ad5691db6 | ||
|
|
0027681090 | ||
|
|
8cba990edc | ||
|
|
88857696d4 | ||
|
|
23f347e687 | ||
|
|
b6e3dc5f02 | ||
|
|
69667521e2 | ||
|
|
2a92effc5d | ||
|
|
a65e012aa2 | ||
|
|
8e9b41d05f | ||
|
|
078da5c2f0 | ||
|
|
c5af5d139c | ||
|
|
2c9279a542 | ||
|
|
a67d22f5f2 | ||
|
|
dc7c51dcc7 | ||
|
|
98df65c7aa | ||
|
|
1559b6b522 | ||
|
|
a0244e3fb4 | ||
|
|
d66396201a | ||
|
|
9628860c0e | ||
|
|
cae9bf1308 | ||
|
|
5bb5da0760 | ||
|
|
867973a850 | ||
|
|
701cd6b6d5 | ||
|
|
7f61d397d5 | ||
|
|
1ae0b896fa | ||
|
|
3937407cb3 | ||
|
|
0e34ae4f3f | ||
|
|
a38b99ecb6 | ||
|
|
a4a4358182 | ||
|
|
4bc39c2db3 | ||
|
|
cc3df759f8 | ||
|
|
378161060c | ||
|
|
f2f788fe60 | ||
|
|
9fa8ed6b1e | ||
|
|
7fc37c5e29 | ||
|
|
4bc4b1e8bc | ||
|
|
e495b89f18 | ||
|
|
ba09eaea1b | ||
|
|
61cc76c455 | ||
|
|
8abecb4a18 | ||
|
|
8b3f76d8e6 | ||
|
|
4e0497f1a6 | ||
|
|
ba88c9f451 | ||
|
|
a598285825 | ||
|
|
cb7a172897 | ||
|
|
771be28dfb | ||
|
|
7d6b3eb42d | ||
|
|
0bb33fab55 | ||
|
|
e3bf7f77f7 | ||
|
|
bd1707d339 | ||
|
|
0474804541 | ||
|
|
72693b3917 | ||
|
|
a03b70010f | ||
|
|
e3717e5c1a | ||
|
|
c8f6858218 | ||
|
|
06d7cc43ae | ||
|
|
f2147cb850 | ||
|
|
75bb9f4c28 | ||
|
|
a2ef4b1e07 | ||
|
|
161c9fe2db | ||
|
|
7547463f81 | ||
|
|
32e4dfd47b | ||
|
|
f67e5dec68 | ||
|
|
297d54acea | ||
|
|
56f44d448c | ||
|
|
0f0fafacd9 | ||
|
|
4f239bac89 | ||
|
|
04d74ac648 | ||
|
|
18c3dc33ee | ||
|
|
508cfa7369 | ||
|
|
1f94cddbae | ||
|
|
21ae7b4cd4 | ||
|
|
bef22ab547 | ||
|
|
eb04e8cdcf | ||
|
|
17e533a086 | ||
|
|
4fc68409ff | ||
|
|
e587044449 | ||
|
|
1f09db5161 | ||
|
|
05b744f086 | ||
|
|
89ca4bc02d | ||
|
|
e626aa48a4 | ||
|
|
752b5e0339 | ||
|
|
637d72d6e3 | ||
|
|
f3bfec580a | ||
|
|
165c1ddff3 | ||
|
|
fb83238e9e | ||
|
|
700bfa41c7 | ||
|
|
25bdc350df | ||
|
|
1b899e1a68 | ||
|
|
3bf13f8c69 | ||
|
|
7a00729374 | ||
|
|
d484028532 | ||
|
|
0eb7fc2c41 | ||
|
|
a69e30e0c9 | ||
|
|
9c018e6bff | ||
|
|
281e818047 | ||
|
|
270f0e2157 | ||
|
|
673e59e76c | ||
|
|
5a8a2adb44 | ||
|
|
a7317d23bf | ||
|
|
2bab9b5fe2 | ||
|
|
081be3ba7d | ||
|
|
25e6f21322 | ||
|
|
b4df1c9cf3 | ||
|
|
4fbd6609f2 | ||
|
|
7387932f89 | ||
|
|
59c37e67b2 | ||
|
|
c09d227647 | ||
|
|
547d322b28 | ||
|
|
a6f0bb410f | ||
|
|
710f624ecd | ||
|
|
5018452be7 | ||
|
|
ece239966f | ||
|
|
3b8bc7e64c | ||
|
|
fc73b2b430 | ||
|
|
901dba6063 | ||
|
|
b88a7a4550 | ||
|
|
106e40845f | ||
|
|
0064bec8f5 | ||
|
|
9e6dbb0b5a | ||
|
|
d26e61388b | ||
|
|
31a7084c75 | ||
|
|
128612a6fc | ||
|
|
6af3f46bc3 | ||
|
|
d2cf8ef070 | ||
|
|
259ad3cfe6 | ||
|
|
18b320d577 | ||
|
|
89e151f035 | ||
|
|
22060f6410 | ||
|
|
7ee3288460 | ||
|
|
cbbc954a8c | ||
|
|
2c425e9c69 | ||
|
|
c59975ab05 | ||
|
|
05f7004487 | ||
|
|
2f9203cd2a | ||
|
|
f09b33f2ef | ||
|
|
65470b0ab1 | ||
|
|
9a23fe662b | ||
|
|
6d7ac09e96 | ||
|
|
c2a39e3639 | ||
|
|
ae625a4d00 | ||
|
|
7f3a029596 | ||
|
|
b34cf00819 | ||
|
|
d4a10b4300 | ||
|
|
9c74d74f7b | ||
|
|
679ee7bea4 | ||
|
|
77d7dc62c4 | ||
|
|
699519d1fe | ||
|
|
8faf39d34e | ||
|
|
5d261a6fcd | ||
|
|
22d5727089 | ||
|
|
c965197d6f | ||
|
|
994a6c4939 | ||
|
|
f926d2a72b | ||
|
|
ddeb9ed93e | ||
|
|
c7e99c7b59 | ||
|
|
6fabc92e56 | ||
|
|
4645b3c919 | ||
|
|
134fe2705c | ||
|
|
3cca32ba7e | ||
|
|
c069e61b26 | ||
|
|
7fa159e164 | ||
|
|
5f92025617 | ||
|
|
333e1bc732 | ||
|
|
e90b97c144 | ||
|
|
747eeb1d46 | ||
|
|
5d2c53abc0 | ||
|
|
0b1e721242 | ||
|
|
8c76a9ce99 | ||
|
|
338321af5b | ||
|
|
2774a92484 | ||
|
|
1a6bfb41a1 | ||
|
|
314981eaf8 | ||
|
|
d7266c633d | ||
|
|
eb4d5f2b95 | ||
|
|
c63b449ad6 | ||
|
|
dd4a778c2c | ||
|
|
a0896d21d6 | ||
|
|
0e697f951a | ||
|
|
fa4bb9082d | ||
|
|
8ff7b15441 | ||
|
|
dd45f85a20 | ||
|
|
decdd9e522 | ||
|
|
31a21d4a2c | ||
|
|
2c129843a7 | ||
|
|
ce71a0bcfb | ||
|
|
0a32c38317 | ||
|
|
36f596f260 | ||
|
|
953552545b | ||
|
|
835e55b1de | ||
|
|
dcd2921eaa | ||
|
|
5e6459fd18 | ||
|
|
50ddb3eb59 | ||
|
|
5eebfee4b5 | ||
|
|
567919ea90 | ||
|
|
27a3997530 | ||
|
|
192ba2c657 | ||
|
|
92abac9ca8 | ||
|
|
04ebbbd73a | ||
|
|
55305e0d95 | ||
|
|
67623639e4 | ||
|
|
cc76def342 | ||
|
|
4967fa5928 | ||
|
|
2b98e4ec56 | ||
|
|
fa1d058ee2 | ||
|
|
a49a588bfa | ||
|
|
ca7dda61c6 | ||
|
|
ffedddd76d | ||
|
|
766c76ae8e | ||
|
|
3096ff33e9 | ||
|
|
90a7451da4 | ||
|
|
529a4b9ee8 | ||
|
|
0567e104eb | ||
|
|
ecbeacd022 | ||
|
|
2772960e41 | ||
|
|
1b694191e2 | ||
|
|
69578a5f8f | ||
|
|
7d96cfe72b | ||
|
|
423514a5a5 | ||
|
|
12568c7d6d | ||
|
|
8d16a0a536 | ||
|
|
87ca801f00 | ||
|
|
e4ecbb6c30 | ||
|
|
b1a67de2b9 | ||
|
|
71a23910fe | ||
|
|
0ede31f9cf | ||
|
|
9f5dcf2d1e | ||
|
|
e878556e98 | ||
|
|
b096928172 | ||
|
|
db7442ae67 | ||
|
|
b6cd430e08 | ||
|
|
478e50cda2 | ||
|
|
1db2b9943c | ||
|
|
ac41aa8b67 | ||
|
|
156a98e2e7 | ||
|
|
d88ec1209e | ||
|
|
fde8dbfc80 | ||
|
|
879dc73eba | ||
|
|
1dfc52de16 | ||
|
|
1331129485 | ||
|
|
1cd98062e5 | ||
|
|
9791d9b77a | ||
|
|
8956452a45 | ||
|
|
f3659fa49c | ||
|
|
585f2be793 | ||
|
|
d13f160222 | ||
|
|
db5495b9d7 | ||
|
|
3def1ae232 | ||
|
|
c6ebead8e5 | ||
|
|
cff4a950e0 | ||
|
|
e4fa894153 | ||
|
|
69caccfa82 | ||
|
|
ab50c13160 | ||
|
|
56d4e82b14 | ||
|
|
09b5bd48bc | ||
|
|
957dcfb6a9 | ||
|
|
67f7bffd18 | ||
|
|
de81b42b49 | ||
|
|
06eb7e9fa7 | ||
|
|
45bc1ac566 | ||
|
|
02aafeff75 | ||
|
|
6b46c52789 | ||
|
|
d732e261a4 | ||
|
|
807c574e91 | ||
|
|
bb171a39b3 | ||
|
|
941a4fc50e | ||
|
|
afe65bd7bf | ||
|
|
6f9762049c | ||
|
|
122970d70d | ||
|
|
8664b1c7a2 | ||
|
|
c92166f38a | ||
|
|
d616058b12 | ||
|
|
a7b4001b75 | ||
|
|
ff85f01459 | ||
|
|
695f81a08b | ||
|
|
326be287da | ||
|
|
0404d98190 | ||
|
|
0a8ec1eb22 | ||
|
|
d860932dcd | ||
|
|
1cb137bd2d | ||
|
|
3c279e5568 | ||
|
|
fb55e3df57 | ||
|
|
de46fb6e2e | ||
|
|
d7a0e3c5ea | ||
|
|
0533ea817d | ||
|
|
755e4fb5f4 | ||
|
|
e4fdde158f | ||
|
|
6d0712fa6d | ||
|
|
bbbb28e3ca | ||
|
|
3bf2e9d065 | ||
|
|
1461fd8777 | ||
|
|
054860539a | ||
|
|
c87870b18e | ||
|
|
5ad2be9c45 | ||
|
|
61a24746a1 | ||
|
|
d557eb9361 | ||
|
|
a9a1a361a9 | ||
|
|
12d070af80 | ||
|
|
8d40557bc8 | ||
|
|
5a5f3a899a | ||
|
|
a2d1f133c8 | ||
|
|
0ae6420c31 | ||
|
|
3a3e05cf18 | ||
|
|
6a20388e25 | ||
|
|
06c836a937 | ||
|
|
049a13fe78 | ||
|
|
30bf6c962f | ||
|
|
a72b3a23c3 | ||
|
|
e9971b168a | ||
|
|
5b59b5e0c1 | ||
|
|
8cfd712428 | ||
|
|
21f7faa80d | ||
|
|
a6a0121118 | ||
|
|
ba66aa33c5 | ||
|
|
8fc024a770 | ||
|
|
52aa9d08aa | ||
|
|
4c9379c39e | ||
|
|
0ff2c39364 | ||
|
|
1af7e5dc49 | ||
|
|
af3bb64e42 | ||
|
|
77281f836e | ||
|
|
550275811d | ||
|
|
c27ce6c54d | ||
|
|
ac4991b069 | ||
|
|
25bee71bb8 | ||
|
|
b993780a3b | ||
|
|
ea0c9f1168 | ||
|
|
08311f275a | ||
|
|
4de0f2f737 | ||
|
|
42ae807c41 | ||
|
|
94593ba4c3 | ||
|
|
6a6e1a0ea9 | ||
|
|
5b19af99ff | ||
|
|
28fb8e607a | ||
|
|
bb85b6ef00 | ||
|
|
b9b5a635ca | ||
|
|
131ea5b627 | ||
|
|
fac70e9642 | ||
|
|
7e76ea40fb | ||
|
|
de09ae42ef | ||
|
|
6424f0666d | ||
|
|
f3ae94ca70 | ||
|
|
09c9f67a02 | ||
|
|
c264ca542d | ||
|
|
bbf30d416d | ||
|
|
27617a1b06 | ||
|
|
e84081769e | ||
|
|
20119fc580 | ||
|
|
09941c0bfb | ||
|
|
cabe0f4993 | ||
|
|
1977c7f190 | ||
|
|
061e7c4eae | ||
|
|
5313e660f6 | ||
|
|
9e32fda304 | ||
|
|
83202cae54 | ||
|
|
d96addfa9d | ||
|
|
a715fe588d | ||
|
|
2ac4a86bb4 | ||
|
|
8670d480a6 | ||
|
|
af0b4ff237 | ||
|
|
e694764065 | ||
|
|
f3c27e0381 | ||
|
|
bf44319d0d | ||
|
|
5b133a640b | ||
|
|
0030a3fe75 | ||
|
|
0a748b009e | ||
|
|
257e951def | ||
|
|
fbd82a2dd0 | ||
|
|
5db321dad2 | ||
|
|
f5638a6354 | ||
|
|
5f64cc6328 | ||
|
|
28b10e8804 | ||
|
|
3277f5095d | ||
|
|
fe3ced2919 | ||
|
|
45e37a07bb | ||
|
|
e57b750ca3 | ||
|
|
49df492268 | ||
|
|
516cd660f1 | ||
|
|
8fd3ace9a1 | ||
|
|
099469cb05 | ||
|
|
6be8c0c618 | ||
|
|
3cddf24747 | ||
|
|
c330360785 | ||
|
|
8cd51570e5 | ||
|
|
0e7aa5cd15 | ||
|
|
e06a5f49de | ||
|
|
fb2f847507 | ||
|
|
e01acc88c9 | ||
|
|
7a5912908a | ||
|
|
4b1b942a7f | ||
|
|
230fe0098f | ||
|
|
cc163429dc | ||
|
|
f670e0a91c | ||
|
|
731674eee7 | ||
|
|
cc1f6f913f | ||
|
|
7f90ff7aec | ||
|
|
8d45670e41 | ||
|
|
e4b8ddb6a1 | ||
|
|
a801561f81 | ||
|
|
16ced07102 | ||
|
|
d35595372d | ||
|
|
81be192279 | ||
|
|
28a1310890 | ||
|
|
2a702e9ca4 | ||
|
|
3ecaea1b6e | ||
|
|
7daf5ac3e3 | ||
|
|
7bc80c17f8 | ||
|
|
1996ceb293 | ||
|
|
0bc3dc43da | ||
|
|
3324c4e6cb | ||
|
|
7329db4e78 |
@@ -1,23 +0,0 @@
|
||||
meta {
|
||||
name: musicgen
|
||||
type: http
|
||||
seq: 1
|
||||
}
|
||||
|
||||
post {
|
||||
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/v1/sound-generation
|
||||
body: json
|
||||
auth: none
|
||||
}
|
||||
|
||||
headers {
|
||||
Content-Type: application/json
|
||||
}
|
||||
|
||||
body:json {
|
||||
{
|
||||
"model_id": "facebook/musicgen-small",
|
||||
"text": "Exciting 80s Newscast Interstitial",
|
||||
"duration_seconds": 8
|
||||
}
|
||||
}
|
||||
@@ -1,17 +0,0 @@
|
||||
meta {
|
||||
name: backend monitor
|
||||
type: http
|
||||
seq: 4
|
||||
}
|
||||
|
||||
get {
|
||||
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/backend/monitor
|
||||
body: json
|
||||
auth: none
|
||||
}
|
||||
|
||||
body:json {
|
||||
{
|
||||
"model": "{{DEFAULT_MODEL}}"
|
||||
}
|
||||
}
|
||||
@@ -1,21 +0,0 @@
|
||||
meta {
|
||||
name: backend-shutdown
|
||||
type: http
|
||||
seq: 3
|
||||
}
|
||||
|
||||
post {
|
||||
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/backend/shutdown
|
||||
body: json
|
||||
auth: none
|
||||
}
|
||||
|
||||
headers {
|
||||
Content-Type: application/json
|
||||
}
|
||||
|
||||
body:json {
|
||||
{
|
||||
"model": "{{DEFAULT_MODEL}}"
|
||||
}
|
||||
}
|
||||
@@ -1,5 +0,0 @@
|
||||
{
|
||||
"version": "1",
|
||||
"name": "LocalAI Test Requests",
|
||||
"type": "collection"
|
||||
}
|
||||
@@ -1,6 +0,0 @@
|
||||
vars {
|
||||
HOST: localhost
|
||||
PORT: 8080
|
||||
DEFAULT_MODEL: gpt-3.5-turbo
|
||||
PROTOCOL: http://
|
||||
}
|
||||
@@ -1,11 +0,0 @@
|
||||
meta {
|
||||
name: get models list
|
||||
type: http
|
||||
seq: 2
|
||||
}
|
||||
|
||||
get {
|
||||
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models
|
||||
body: none
|
||||
auth: none
|
||||
}
|
||||
@@ -1,25 +0,0 @@
|
||||
meta {
|
||||
name: Generate image
|
||||
type: http
|
||||
seq: 1
|
||||
}
|
||||
|
||||
post {
|
||||
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/v1/images/generations
|
||||
body: json
|
||||
auth: none
|
||||
}
|
||||
|
||||
headers {
|
||||
Content-Type: application/json
|
||||
}
|
||||
|
||||
body:json {
|
||||
{
|
||||
"prompt": "<positive prompt>|<negative prompt>",
|
||||
"model": "model-name",
|
||||
"step": 51,
|
||||
"size": "1024x1024",
|
||||
"image": ""
|
||||
}
|
||||
}
|
||||
@@ -1,24 +0,0 @@
|
||||
meta {
|
||||
name: -completions
|
||||
type: http
|
||||
seq: 4
|
||||
}
|
||||
|
||||
post {
|
||||
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/completions
|
||||
body: json
|
||||
auth: none
|
||||
}
|
||||
|
||||
headers {
|
||||
Content-Type: application/json
|
||||
}
|
||||
|
||||
body:json {
|
||||
{
|
||||
"model": "{{DEFAULT_MODEL}}",
|
||||
"prompt": "function downloadFile(string url, string outputPath) {",
|
||||
"max_tokens": 256,
|
||||
"temperature": 0.5
|
||||
}
|
||||
}
|
||||
@@ -1,23 +0,0 @@
|
||||
meta {
|
||||
name: -edits
|
||||
type: http
|
||||
seq: 5
|
||||
}
|
||||
|
||||
post {
|
||||
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/edits
|
||||
body: json
|
||||
auth: none
|
||||
}
|
||||
|
||||
headers {
|
||||
Content-Type: application/json
|
||||
}
|
||||
|
||||
body:json {
|
||||
{
|
||||
"model": "{{DEFAULT_MODEL}}",
|
||||
"input": "What day of the wek is it?",
|
||||
"instruction": "Fix the spelling mistakes"
|
||||
}
|
||||
}
|
||||
@@ -1,22 +0,0 @@
|
||||
meta {
|
||||
name: -embeddings
|
||||
type: http
|
||||
seq: 6
|
||||
}
|
||||
|
||||
post {
|
||||
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/embeddings
|
||||
body: json
|
||||
auth: none
|
||||
}
|
||||
|
||||
headers {
|
||||
Content-Type: application/json
|
||||
}
|
||||
|
||||
body:json {
|
||||
{
|
||||
"model": "{{DEFAULT_MODEL}}",
|
||||
"input": "A STRANGE GAME.\nTHE ONLY WINNING MOVE IS NOT TO PLAY.\n\nHOW ABOUT A NICE GAME OF CHESS?"
|
||||
}
|
||||
}
|
||||
@@ -1,30 +0,0 @@
|
||||
meta {
|
||||
name: chat completion -simple- 1 message-
|
||||
type: http
|
||||
seq: 4
|
||||
}
|
||||
|
||||
post {
|
||||
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/chat/completions
|
||||
body: json
|
||||
auth: none
|
||||
}
|
||||
|
||||
headers {
|
||||
Content-Type: application/json
|
||||
}
|
||||
|
||||
body:json {
|
||||
{
|
||||
"model": "{{DEFAULT_MODEL}}",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "How could one use friction to cook an egg?"
|
||||
}
|
||||
],
|
||||
"max_tokens": 256,
|
||||
"temperature": 0.2,
|
||||
"grammar": ""
|
||||
}
|
||||
}
|
||||
@@ -1,29 +0,0 @@
|
||||
meta {
|
||||
name: chat-completions -long-
|
||||
type: http
|
||||
seq: 5
|
||||
}
|
||||
|
||||
post {
|
||||
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/chat/completions
|
||||
body: json
|
||||
auth: none
|
||||
}
|
||||
|
||||
headers {
|
||||
Content-Type: application/json
|
||||
}
|
||||
|
||||
body:json {
|
||||
{
|
||||
"model": "{{DEFAULT_MODEL}}",
|
||||
"messages": [{"role": "system", "content": "You are a helpful, intelligent pirate assistant. Always answer as helpfully as possible, while being a pirate and using pirate language. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."},
|
||||
{"role": "user", "content": "How could one use electricity to cook an egg?"},
|
||||
{"role": "assistant",
|
||||
"content": "Shiver me timbers! Using electricity to cook an egg? Well, matey, I reckon that be a right curious idea! *adjusts spectacles* Now, I ain't sure if this be possible, but I'll do me best to help ye out.\n\nFirst things first, ye gotta understand that electricity be a powerful force, and it can't just cook an egg on its own. Ye see, electricity be like a mighty wind that can make things happen, but it needs somethin' to work with. So, if ye want to use electricity to cook an egg, ye gotta find a way to harness that power and make it do the cookin'.\n\nNow, I know what ye might be thinkin': \"How do I harness the power of electricity to cook an egg?\" Well, matey, there be a few ways to do it. One way be to use a special device called an \"electric frying pan.\" This be a pan that has a built-in heating element that gets hot when ye plug it into a wall socket. When the element gets hot, ye can crack an egg into the pan and watch as it cook"
|
||||
},
|
||||
{"role": "user", "content": "I don't have one of those, just a raw wire and plenty of power! How do we get it done?"}],
|
||||
"max_tokens": 1024,
|
||||
"temperature": 0.5
|
||||
}
|
||||
}
|
||||
@@ -1,25 +0,0 @@
|
||||
meta {
|
||||
name: chat-completions -stream-
|
||||
type: http
|
||||
seq: 6
|
||||
}
|
||||
|
||||
post {
|
||||
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/chat/completions
|
||||
body: json
|
||||
auth: none
|
||||
}
|
||||
|
||||
headers {
|
||||
Content-Type: application/json
|
||||
}
|
||||
|
||||
body:json {
|
||||
{
|
||||
"model": "{{DEFAULT_MODEL}}",
|
||||
"messages": [{"role": "user", "content": "Explain how I can set sail on the ocean using only power generated by seagulls?"}],
|
||||
"max_tokens": 256,
|
||||
"temperature": 0.9,
|
||||
"stream": true
|
||||
}
|
||||
}
|
||||
@@ -1,22 +0,0 @@
|
||||
meta {
|
||||
name: add model gallery
|
||||
type: http
|
||||
seq: 10
|
||||
}
|
||||
|
||||
post {
|
||||
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/galleries
|
||||
body: json
|
||||
auth: none
|
||||
}
|
||||
|
||||
headers {
|
||||
Content-Type: application/json
|
||||
}
|
||||
|
||||
body:json {
|
||||
{
|
||||
"url": "file:///home/dave/projects/model-gallery/huggingface/TheBloke__CodeLlama-7B-Instruct-GGML.yaml",
|
||||
"name": "test"
|
||||
}
|
||||
}
|
||||
@@ -1,21 +0,0 @@
|
||||
meta {
|
||||
name: delete model gallery
|
||||
type: http
|
||||
seq: 11
|
||||
}
|
||||
|
||||
delete {
|
||||
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/galleries
|
||||
body: json
|
||||
auth: none
|
||||
}
|
||||
|
||||
headers {
|
||||
Content-Type: application/json
|
||||
}
|
||||
|
||||
body:json {
|
||||
{
|
||||
"name": "test"
|
||||
}
|
||||
}
|
||||
@@ -1,11 +0,0 @@
|
||||
meta {
|
||||
name: list MODELS in galleries
|
||||
type: http
|
||||
seq: 7
|
||||
}
|
||||
|
||||
get {
|
||||
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/available
|
||||
body: none
|
||||
auth: none
|
||||
}
|
||||
@@ -1,11 +0,0 @@
|
||||
meta {
|
||||
name: list model GALLERIES
|
||||
type: http
|
||||
seq: 8
|
||||
}
|
||||
|
||||
get {
|
||||
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/galleries
|
||||
body: none
|
||||
auth: none
|
||||
}
|
||||
@@ -1,11 +0,0 @@
|
||||
meta {
|
||||
name: model delete
|
||||
type: http
|
||||
seq: 7
|
||||
}
|
||||
|
||||
post {
|
||||
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/galleries
|
||||
body: none
|
||||
auth: none
|
||||
}
|
||||
@@ -1,21 +0,0 @@
|
||||
meta {
|
||||
name: model gallery apply -gist-
|
||||
type: http
|
||||
seq: 12
|
||||
}
|
||||
|
||||
post {
|
||||
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/apply
|
||||
body: json
|
||||
auth: none
|
||||
}
|
||||
|
||||
headers {
|
||||
Content-Type: application/json
|
||||
}
|
||||
|
||||
body:json {
|
||||
{
|
||||
"id": "TheBloke__CodeLlama-7B-Instruct-GGML__codellama-7b-instruct.ggmlv3.Q2_K.bin"
|
||||
}
|
||||
}
|
||||
@@ -1,22 +0,0 @@
|
||||
meta {
|
||||
name: model gallery apply
|
||||
type: http
|
||||
seq: 9
|
||||
}
|
||||
|
||||
post {
|
||||
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/apply
|
||||
body: json
|
||||
auth: none
|
||||
}
|
||||
|
||||
headers {
|
||||
Content-Type: application/json
|
||||
}
|
||||
|
||||
body:json {
|
||||
{
|
||||
"id": "dave@TheBloke__CodeLlama-7B-Instruct-GGML__codellama-7b-instruct.ggmlv3.Q3_K_S.bin",
|
||||
"name": "codellama7b"
|
||||
}
|
||||
}
|
||||
Binary file not shown.
@@ -1,16 +0,0 @@
|
||||
meta {
|
||||
name: transcribe
|
||||
type: http
|
||||
seq: 1
|
||||
}
|
||||
|
||||
post {
|
||||
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/v1/audio/transcriptions
|
||||
body: multipartForm
|
||||
auth: none
|
||||
}
|
||||
|
||||
body:multipart-form {
|
||||
file: @file(transcription/gb1.ogg)
|
||||
model: whisper-1
|
||||
}
|
||||
@@ -1,22 +0,0 @@
|
||||
meta {
|
||||
name: -tts
|
||||
type: http
|
||||
seq: 2
|
||||
}
|
||||
|
||||
post {
|
||||
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/tts
|
||||
body: json
|
||||
auth: none
|
||||
}
|
||||
|
||||
headers {
|
||||
Content-Type: application/json
|
||||
}
|
||||
|
||||
body:json {
|
||||
{
|
||||
"model": "{{DEFAULT_MODEL}}",
|
||||
"input": "A STRANGE GAME.\nTHE ONLY WINNING MOVE IS NOT TO PLAY.\n\nHOW ABOUT A NICE GAME OF CHESS?"
|
||||
}
|
||||
}
|
||||
@@ -1,23 +0,0 @@
|
||||
meta {
|
||||
name: musicgen
|
||||
type: http
|
||||
seq: 2
|
||||
}
|
||||
|
||||
post {
|
||||
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/tts
|
||||
body: json
|
||||
auth: none
|
||||
}
|
||||
|
||||
headers {
|
||||
Content-Type: application/json
|
||||
}
|
||||
|
||||
body:json {
|
||||
{
|
||||
"backend": "transformers",
|
||||
"model": "facebook/musicgen-small",
|
||||
"input": "80s Synths playing Jazz"
|
||||
}
|
||||
}
|
||||
5
.env
5
.env
@@ -29,6 +29,9 @@
|
||||
## Enable/Disable single backend (useful if only one GPU is available)
|
||||
# LOCALAI_SINGLE_ACTIVE_BACKEND=true
|
||||
|
||||
# Forces shutdown of the backends if busy (only if LOCALAI_SINGLE_ACTIVE_BACKEND is set)
|
||||
# LOCALAI_FORCE_BACKEND_SHUTDOWN=true
|
||||
|
||||
## Specify a build type. Available: cublas, openblas, clblas.
|
||||
## cuBLAS: This is a GPU-accelerated version of the complete standard BLAS (Basic Linear Algebra Subprograms) library. It's provided by Nvidia and is part of their CUDA toolkit.
|
||||
## OpenBLAS: This is an open-source implementation of the BLAS library that aims to provide highly optimized code for various platforms. It includes support for multi-threading and can be compiled to use hardware-specific features for additional performance. OpenBLAS can run on many kinds of hardware, including CPUs from Intel, AMD, and ARM.
|
||||
@@ -73,7 +76,7 @@
|
||||
|
||||
### Define a list of GRPC Servers for llama-cpp workers to distribute the load
|
||||
# https://github.com/ggerganov/llama.cpp/pull/6829
|
||||
# https://github.com/ggerganov/llama.cpp/blob/master/examples/rpc/README.md
|
||||
# https://github.com/ggerganov/llama.cpp/blob/master/tools/rpc/README.md
|
||||
# LLAMACPP_GRPC_SERVERS=""
|
||||
|
||||
### Enable to run parallel requests
|
||||
|
||||
4
.github/dependabot.yml
vendored
4
.github/dependabot.yml
vendored
@@ -29,10 +29,6 @@ updates:
|
||||
schedule:
|
||||
# Check for updates to GitHub Actions every weekday
|
||||
interval: "weekly"
|
||||
- package-ecosystem: "pip"
|
||||
directory: "/backend/python/autogptq"
|
||||
schedule:
|
||||
interval: "weekly"
|
||||
- package-ecosystem: "pip"
|
||||
directory: "/backend/python/bark"
|
||||
schedule:
|
||||
|
||||
2
.github/labeler.yml
vendored
2
.github/labeler.yml
vendored
@@ -1,4 +1,4 @@
|
||||
enhancements:
|
||||
enhancement:
|
||||
- head-branch: ['^feature', 'feature']
|
||||
|
||||
dependencies:
|
||||
|
||||
4
.github/workflows/bump_deps.yaml
vendored
4
.github/workflows/bump_deps.yaml
vendored
@@ -9,10 +9,10 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- repository: "ggerganov/llama.cpp"
|
||||
- repository: "ggml-org/llama.cpp"
|
||||
variable: "CPPLLAMA_VERSION"
|
||||
branch: "master"
|
||||
- repository: "ggerganov/whisper.cpp"
|
||||
- repository: "ggml-org/whisper.cpp"
|
||||
variable: "WHISPER_CPP_VERSION"
|
||||
branch: "master"
|
||||
- repository: "PABannier/bark.cpp"
|
||||
|
||||
2
.github/workflows/dependabot_auto.yml
vendored
2
.github/workflows/dependabot_auto.yml
vendored
@@ -14,7 +14,7 @@ jobs:
|
||||
steps:
|
||||
- name: Dependabot metadata
|
||||
id: metadata
|
||||
uses: dependabot/fetch-metadata@v2.3.0
|
||||
uses: dependabot/fetch-metadata@v2.4.0
|
||||
with:
|
||||
github-token: "${{ secrets.GITHUB_TOKEN }}"
|
||||
skip-commit-verification: true
|
||||
|
||||
6
.github/workflows/deploy-explorer.yaml
vendored
6
.github/workflows/deploy-explorer.yaml
vendored
@@ -33,7 +33,7 @@ jobs:
|
||||
run: |
|
||||
CGO_ENABLED=0 make build-api
|
||||
- name: rm
|
||||
uses: appleboy/ssh-action@v1.2.0
|
||||
uses: appleboy/ssh-action@v1.2.2
|
||||
with:
|
||||
host: ${{ secrets.EXPLORER_SSH_HOST }}
|
||||
username: ${{ secrets.EXPLORER_SSH_USERNAME }}
|
||||
@@ -42,7 +42,7 @@ jobs:
|
||||
script: |
|
||||
sudo rm -rf local-ai/ || true
|
||||
- name: copy file via ssh
|
||||
uses: appleboy/scp-action@v0.1.7
|
||||
uses: appleboy/scp-action@v1.0.0
|
||||
with:
|
||||
host: ${{ secrets.EXPLORER_SSH_HOST }}
|
||||
username: ${{ secrets.EXPLORER_SSH_USERNAME }}
|
||||
@@ -53,7 +53,7 @@ jobs:
|
||||
rm: true
|
||||
target: ./local-ai
|
||||
- name: restarting
|
||||
uses: appleboy/ssh-action@v1.2.0
|
||||
uses: appleboy/ssh-action@v1.2.2
|
||||
with:
|
||||
host: ${{ secrets.EXPLORER_SSH_HOST }}
|
||||
username: ${{ secrets.EXPLORER_SSH_USERNAME }}
|
||||
|
||||
9
.github/workflows/generate_grpc_cache.yaml
vendored
9
.github/workflows/generate_grpc_cache.yaml
vendored
@@ -2,9 +2,10 @@ name: 'generate and publish GRPC docker caches'
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
|
||||
schedule:
|
||||
# daily at midnight
|
||||
- cron: '0 0 * * *'
|
||||
|
||||
concurrency:
|
||||
group: grpc-cache-${{ github.head_ref || github.ref }}-${{ github.repository }}
|
||||
@@ -16,7 +17,7 @@ jobs:
|
||||
matrix:
|
||||
include:
|
||||
- grpc-base-image: ubuntu:22.04
|
||||
runs-on: 'ubuntu-latest'
|
||||
runs-on: 'arc-runner-set'
|
||||
platforms: 'linux/amd64,linux/arm64'
|
||||
runs-on: ${{matrix.runs-on}}
|
||||
steps:
|
||||
|
||||
2
.github/workflows/generate_intel_image.yaml
vendored
2
.github/workflows/generate_intel_image.yaml
vendored
@@ -15,7 +15,7 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- base-image: intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04
|
||||
- base-image: intel/oneapi-basekit:2025.1.0-0-devel-ubuntu22.04
|
||||
runs-on: 'ubuntu-latest'
|
||||
platforms: 'linux/amd64'
|
||||
runs-on: ${{matrix.runs-on}}
|
||||
|
||||
50
.github/workflows/image-pr.yml
vendored
50
.github/workflows/image-pr.yml
vendored
@@ -33,6 +33,7 @@ jobs:
|
||||
# Pushing with all jobs in parallel
|
||||
# eats the bandwidth of all the nodes
|
||||
max-parallel: ${{ github.event_name != 'pull_request' && 4 || 8 }}
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
# This is basically covered by the AIO test
|
||||
@@ -56,26 +57,35 @@ jobs:
|
||||
runs-on: 'arc-runner-set'
|
||||
base-image: "ubuntu:22.04"
|
||||
makeflags: "--jobs=3 --output-sync=target"
|
||||
# - build-type: 'hipblas'
|
||||
# platforms: 'linux/amd64'
|
||||
# tag-latest: 'false'
|
||||
# tag-suffix: '-hipblas'
|
||||
# ffmpeg: 'false'
|
||||
# image-type: 'extras'
|
||||
# base-image: "rocm/dev-ubuntu-22.04:6.1"
|
||||
# grpc-base-image: "ubuntu:22.04"
|
||||
# runs-on: 'arc-runner-set'
|
||||
# makeflags: "--jobs=3 --output-sync=target"
|
||||
# - build-type: 'sycl_f16'
|
||||
# platforms: 'linux/amd64'
|
||||
# tag-latest: 'false'
|
||||
# base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
|
||||
# grpc-base-image: "ubuntu:22.04"
|
||||
# tag-suffix: 'sycl-f16-ffmpeg'
|
||||
# ffmpeg: 'true'
|
||||
# image-type: 'extras'
|
||||
# runs-on: 'arc-runner-set'
|
||||
# makeflags: "--jobs=3 --output-sync=target"
|
||||
- build-type: 'hipblas'
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'false'
|
||||
tag-suffix: '-hipblas'
|
||||
ffmpeg: 'false'
|
||||
image-type: 'extras'
|
||||
base-image: "rocm/dev-ubuntu-22.04:6.1"
|
||||
grpc-base-image: "ubuntu:22.04"
|
||||
runs-on: 'arc-runner-set'
|
||||
makeflags: "--jobs=3 --output-sync=target"
|
||||
- build-type: 'sycl_f16'
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'false'
|
||||
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
|
||||
grpc-base-image: "ubuntu:22.04"
|
||||
tag-suffix: 'sycl-f16-ffmpeg'
|
||||
ffmpeg: 'true'
|
||||
image-type: 'extras'
|
||||
runs-on: 'arc-runner-set'
|
||||
makeflags: "--jobs=3 --output-sync=target"
|
||||
- build-type: 'vulkan'
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'false'
|
||||
tag-suffix: '-vulkan-ffmpeg-core'
|
||||
ffmpeg: 'true'
|
||||
image-type: 'core'
|
||||
runs-on: 'ubuntu-latest'
|
||||
base-image: "ubuntu:22.04"
|
||||
makeflags: "--jobs=4 --output-sync=target"
|
||||
# core-image-build:
|
||||
# uses: ./.github/workflows/image_build.yml
|
||||
# with:
|
||||
|
||||
167
.github/workflows/image.yml
vendored
167
.github/workflows/image.yml
vendored
@@ -45,13 +45,13 @@ jobs:
|
||||
- build-type: 'hipblas'
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-hipblas-ffmpeg'
|
||||
tag-suffix: '-hipblas-extras'
|
||||
ffmpeg: 'true'
|
||||
image-type: 'extras'
|
||||
aio: "-aio-gpu-hipblas"
|
||||
base-image: "rocm/dev-ubuntu-22.04:6.1"
|
||||
grpc-base-image: "ubuntu:22.04"
|
||||
latest-image: 'latest-gpu-hipblas'
|
||||
latest-image: 'latest-gpu-hipblas-extras'
|
||||
latest-image-aio: 'latest-aio-gpu-hipblas'
|
||||
runs-on: 'arc-runner-set'
|
||||
makeflags: "--jobs=3 --output-sync=target"
|
||||
@@ -59,32 +59,13 @@ jobs:
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'false'
|
||||
tag-suffix: '-hipblas'
|
||||
ffmpeg: 'false'
|
||||
image-type: 'extras'
|
||||
base-image: "rocm/dev-ubuntu-22.04:6.1"
|
||||
grpc-base-image: "ubuntu:22.04"
|
||||
runs-on: 'arc-runner-set'
|
||||
makeflags: "--jobs=3 --output-sync=target"
|
||||
- build-type: 'hipblas'
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'false'
|
||||
tag-suffix: '-hipblas-ffmpeg-core'
|
||||
ffmpeg: 'true'
|
||||
image-type: 'core'
|
||||
base-image: "rocm/dev-ubuntu-22.04:6.1"
|
||||
grpc-base-image: "ubuntu:22.04"
|
||||
runs-on: 'arc-runner-set'
|
||||
makeflags: "--jobs=3 --output-sync=target"
|
||||
- build-type: 'hipblas'
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'false'
|
||||
tag-suffix: '-hipblas-core'
|
||||
ffmpeg: 'false'
|
||||
image-type: 'core'
|
||||
base-image: "rocm/dev-ubuntu-22.04:6.1"
|
||||
grpc-base-image: "ubuntu:22.04"
|
||||
runs-on: 'arc-runner-set'
|
||||
makeflags: "--jobs=3 --output-sync=target"
|
||||
latest-image: 'latest-gpu-hipblas'
|
||||
self-hosted-jobs:
|
||||
uses: ./.github/workflows/image_build.yml
|
||||
with:
|
||||
@@ -114,110 +95,58 @@ jobs:
|
||||
max-parallel: ${{ github.event_name != 'pull_request' && 5 || 8 }}
|
||||
matrix:
|
||||
include:
|
||||
# Extra images
|
||||
- build-type: ''
|
||||
#platforms: 'linux/amd64,linux/arm64'
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: ''
|
||||
ffmpeg: ''
|
||||
image-type: 'extras'
|
||||
runs-on: 'arc-runner-set'
|
||||
base-image: "ubuntu:22.04"
|
||||
makeflags: "--jobs=3 --output-sync=target"
|
||||
- build-type: ''
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-ffmpeg'
|
||||
ffmpeg: 'true'
|
||||
image-type: 'extras'
|
||||
runs-on: 'arc-runner-set'
|
||||
base-image: "ubuntu:22.04"
|
||||
makeflags: "--jobs=3 --output-sync=target"
|
||||
- build-type: 'cublas'
|
||||
cuda-major-version: "11"
|
||||
cuda-minor-version: "7"
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'false'
|
||||
tag-suffix: '-cublas-cuda11'
|
||||
ffmpeg: ''
|
||||
image-type: 'extras'
|
||||
runs-on: 'arc-runner-set'
|
||||
base-image: "ubuntu:22.04"
|
||||
makeflags: "--jobs=3 --output-sync=target"
|
||||
- build-type: 'cublas'
|
||||
cuda-major-version: "12"
|
||||
cuda-minor-version: "0"
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'false'
|
||||
tag-suffix: '-cublas-cuda12'
|
||||
ffmpeg: ''
|
||||
image-type: 'extras'
|
||||
runs-on: 'arc-runner-set'
|
||||
base-image: "ubuntu:22.04"
|
||||
makeflags: "--jobs=3 --output-sync=target"
|
||||
- build-type: 'cublas'
|
||||
cuda-major-version: "11"
|
||||
cuda-minor-version: "7"
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-cublas-cuda11-ffmpeg'
|
||||
tag-suffix: '-cublas-cuda11-extras'
|
||||
ffmpeg: 'true'
|
||||
image-type: 'extras'
|
||||
runs-on: 'arc-runner-set'
|
||||
base-image: "ubuntu:22.04"
|
||||
aio: "-aio-gpu-nvidia-cuda-11"
|
||||
latest-image: 'latest-gpu-nvidia-cuda-11'
|
||||
latest-image: 'latest-gpu-nvidia-cuda-11-extras'
|
||||
latest-image-aio: 'latest-aio-gpu-nvidia-cuda-11'
|
||||
makeflags: "--jobs=3 --output-sync=target"
|
||||
- build-type: 'cublas'
|
||||
cuda-major-version: "12"
|
||||
cuda-minor-version: "0"
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-cublas-cuda12-ffmpeg'
|
||||
tag-latest: 'false'
|
||||
tag-suffix: '-cublas-cuda12-extras'
|
||||
ffmpeg: 'true'
|
||||
image-type: 'extras'
|
||||
runs-on: 'arc-runner-set'
|
||||
base-image: "ubuntu:22.04"
|
||||
aio: "-aio-gpu-nvidia-cuda-12"
|
||||
latest-image: 'latest-gpu-nvidia-cuda-12'
|
||||
latest-image: 'latest-gpu-nvidia-cuda-12-extras'
|
||||
latest-image-aio: 'latest-aio-gpu-nvidia-cuda-12'
|
||||
makeflags: "--jobs=3 --output-sync=target"
|
||||
- build-type: ''
|
||||
#platforms: 'linux/amd64,linux/arm64'
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: ''
|
||||
ffmpeg: ''
|
||||
image-type: 'extras'
|
||||
base-image: "ubuntu:22.04"
|
||||
runs-on: 'arc-runner-set'
|
||||
makeflags: "--jobs=3 --output-sync=target"
|
||||
- build-type: 'sycl_f16'
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-latest: 'false'
|
||||
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
|
||||
grpc-base-image: "ubuntu:22.04"
|
||||
tag-suffix: '-sycl-f16-ffmpeg'
|
||||
tag-suffix: '-sycl-f16-extras'
|
||||
ffmpeg: 'true'
|
||||
image-type: 'extras'
|
||||
runs-on: 'arc-runner-set'
|
||||
aio: "-aio-gpu-intel-f16"
|
||||
latest-image: 'latest-gpu-intel-f16'
|
||||
latest-image: 'latest-gpu-intel-f16-extras'
|
||||
latest-image-aio: 'latest-aio-gpu-intel-f16'
|
||||
makeflags: "--jobs=3 --output-sync=target"
|
||||
- build-type: 'sycl_f32'
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-latest: 'false'
|
||||
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
|
||||
grpc-base-image: "ubuntu:22.04"
|
||||
tag-suffix: '-sycl-f32-ffmpeg'
|
||||
tag-suffix: '-sycl-f32-extras'
|
||||
ffmpeg: 'true'
|
||||
image-type: 'extras'
|
||||
runs-on: 'arc-runner-set'
|
||||
aio: "-aio-gpu-intel-f32"
|
||||
latest-image: 'latest-gpu-intel-f32'
|
||||
latest-image: 'latest-gpu-intel-f32-extras'
|
||||
latest-image-aio: 'latest-aio-gpu-intel-f32'
|
||||
makeflags: "--jobs=3 --output-sync=target"
|
||||
# Core images
|
||||
@@ -226,41 +155,23 @@ jobs:
|
||||
tag-latest: 'false'
|
||||
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
|
||||
grpc-base-image: "ubuntu:22.04"
|
||||
tag-suffix: '-sycl-f16-core'
|
||||
ffmpeg: 'false'
|
||||
tag-suffix: '-sycl-f16'
|
||||
ffmpeg: 'true'
|
||||
image-type: 'core'
|
||||
runs-on: 'arc-runner-set'
|
||||
makeflags: "--jobs=3 --output-sync=target"
|
||||
latest-image: 'latest-gpu-intel-f16'
|
||||
- build-type: 'sycl_f32'
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'false'
|
||||
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
|
||||
grpc-base-image: "ubuntu:22.04"
|
||||
tag-suffix: '-sycl-f32-core'
|
||||
ffmpeg: 'false'
|
||||
image-type: 'core'
|
||||
runs-on: 'arc-runner-set'
|
||||
makeflags: "--jobs=3 --output-sync=target"
|
||||
- build-type: 'sycl_f16'
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'false'
|
||||
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
|
||||
grpc-base-image: "ubuntu:22.04"
|
||||
tag-suffix: '-sycl-f16-ffmpeg-core'
|
||||
ffmpeg: 'true'
|
||||
image-type: 'core'
|
||||
runs-on: 'arc-runner-set'
|
||||
makeflags: "--jobs=3 --output-sync=target"
|
||||
- build-type: 'sycl_f32'
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'false'
|
||||
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
|
||||
grpc-base-image: "ubuntu:22.04"
|
||||
tag-suffix: '-sycl-f32-ffmpeg-core'
|
||||
tag-suffix: '-sycl-f32'
|
||||
ffmpeg: 'true'
|
||||
image-type: 'core'
|
||||
runs-on: 'arc-runner-set'
|
||||
makeflags: "--jobs=3 --output-sync=target"
|
||||
latest-image: 'latest-gpu-intel-f32'
|
||||
|
||||
core-image-build:
|
||||
uses: ./.github/workflows/image_build.yml
|
||||
@@ -293,7 +204,7 @@ jobs:
|
||||
- build-type: ''
|
||||
platforms: 'linux/amd64,linux/arm64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-ffmpeg-core'
|
||||
tag-suffix: ''
|
||||
ffmpeg: 'true'
|
||||
image-type: 'core'
|
||||
base-image: "ubuntu:22.04"
|
||||
@@ -308,60 +219,38 @@ jobs:
|
||||
cuda-minor-version: "7"
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'false'
|
||||
tag-suffix: '-cublas-cuda11-core'
|
||||
ffmpeg: ''
|
||||
image-type: 'core'
|
||||
base-image: "ubuntu:22.04"
|
||||
runs-on: 'arc-runner-set'
|
||||
makeflags: "--jobs=4 --output-sync=target"
|
||||
skip-drivers: 'false'
|
||||
- build-type: 'cublas'
|
||||
cuda-major-version: "12"
|
||||
cuda-minor-version: "0"
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'false'
|
||||
tag-suffix: '-cublas-cuda12-core'
|
||||
ffmpeg: ''
|
||||
image-type: 'core'
|
||||
base-image: "ubuntu:22.04"
|
||||
runs-on: 'arc-runner-set'
|
||||
makeflags: "--jobs=4 --output-sync=target"
|
||||
skip-drivers: 'false'
|
||||
- build-type: 'cublas'
|
||||
cuda-major-version: "11"
|
||||
cuda-minor-version: "7"
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'false'
|
||||
tag-suffix: '-cublas-cuda11-ffmpeg-core'
|
||||
tag-suffix: '-cublas-cuda11'
|
||||
ffmpeg: 'true'
|
||||
image-type: 'core'
|
||||
runs-on: 'arc-runner-set'
|
||||
base-image: "ubuntu:22.04"
|
||||
makeflags: "--jobs=4 --output-sync=target"
|
||||
skip-drivers: 'false'
|
||||
latest-image: 'latest-gpu-nvidia-cuda-12'
|
||||
- build-type: 'cublas'
|
||||
cuda-major-version: "12"
|
||||
cuda-minor-version: "0"
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'false'
|
||||
tag-suffix: '-cublas-cuda12-ffmpeg-core'
|
||||
tag-suffix: '-cublas-cuda12'
|
||||
ffmpeg: 'true'
|
||||
image-type: 'core'
|
||||
runs-on: 'arc-runner-set'
|
||||
base-image: "ubuntu:22.04"
|
||||
skip-drivers: 'false'
|
||||
makeflags: "--jobs=4 --output-sync=target"
|
||||
latest-image: 'latest-gpu-nvidia-cuda-12'
|
||||
- build-type: 'vulkan'
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'false'
|
||||
tag-suffix: '-vulkan-ffmpeg-core'
|
||||
latest-image: 'latest-vulkan-ffmpeg-core'
|
||||
tag-suffix: '-vulkan'
|
||||
ffmpeg: 'true'
|
||||
image-type: 'core'
|
||||
runs-on: 'arc-runner-set'
|
||||
base-image: "ubuntu:22.04"
|
||||
skip-drivers: 'false'
|
||||
makeflags: "--jobs=4 --output-sync=target"
|
||||
latest-image: 'latest-gpu-vulkan'
|
||||
gh-runner:
|
||||
uses: ./.github/workflows/image_build.yml
|
||||
with:
|
||||
@@ -394,8 +283,8 @@ jobs:
|
||||
cuda-minor-version: "0"
|
||||
platforms: 'linux/arm64'
|
||||
tag-latest: 'false'
|
||||
tag-suffix: '-nvidia-l4t-arm64-core'
|
||||
latest-image: 'latest-nvidia-l4t-arm64-core'
|
||||
tag-suffix: '-nvidia-l4t-arm64'
|
||||
latest-image: 'latest-nvidia-l4t-arm64'
|
||||
ffmpeg: 'true'
|
||||
image-type: 'core'
|
||||
base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
|
||||
|
||||
5
.github/workflows/image_build.yml
vendored
5
.github/workflows/image_build.yml
vendored
@@ -310,6 +310,11 @@ jobs:
|
||||
tags: ${{ steps.meta_aio_dockerhub.outputs.tags }}
|
||||
labels: ${{ steps.meta_aio_dockerhub.outputs.labels }}
|
||||
|
||||
- name: Cleanup
|
||||
run: |
|
||||
docker builder prune -f
|
||||
docker system prune --force --volumes --all
|
||||
|
||||
- name: Latest tag
|
||||
# run this on branches, when it is a tag and there is a latest-image defined
|
||||
if: github.event_name != 'pull_request' && inputs.latest-image != '' && github.ref_type == 'tag'
|
||||
|
||||
10
.github/workflows/notify-models.yaml
vendored
10
.github/workflows/notify-models.yaml
vendored
@@ -8,7 +8,7 @@ jobs:
|
||||
notify-discord:
|
||||
if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'area/ai-model')) }}
|
||||
env:
|
||||
MODEL_NAME: hermes-2-theta-llama-3-8b
|
||||
MODEL_NAME: gemma-3-12b-it
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
@@ -16,7 +16,7 @@ jobs:
|
||||
fetch-depth: 0 # needed to checkout all branches for this Action to work
|
||||
- uses: mudler/localai-github-action@v1
|
||||
with:
|
||||
model: 'hermes-2-theta-llama-3-8b' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
|
||||
model: 'gemma-3-12b-it' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
|
||||
# Check the PR diff using the current branch and the base branch of the PR
|
||||
- uses: GrantBirki/git-diff-action@v2.8.0
|
||||
id: git-diff-action
|
||||
@@ -79,7 +79,7 @@ jobs:
|
||||
args: ${{ steps.summarize.outputs.message }}
|
||||
- name: Setup tmate session if fails
|
||||
if: ${{ failure() }}
|
||||
uses: mxschmitt/action-tmate@v3.19
|
||||
uses: mxschmitt/action-tmate@v3.22
|
||||
with:
|
||||
detached: true
|
||||
connect-timeout-seconds: 180
|
||||
@@ -87,7 +87,7 @@ jobs:
|
||||
notify-twitter:
|
||||
if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'area/ai-model')) }}
|
||||
env:
|
||||
MODEL_NAME: hermes-2-theta-llama-3-8b
|
||||
MODEL_NAME: gemma-3-12b-it
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
@@ -161,7 +161,7 @@ jobs:
|
||||
TWITTER_ACCESS_TOKEN_SECRET: ${{ secrets.TWITTER_ACCESS_TOKEN_SECRET }}
|
||||
- name: Setup tmate session if fails
|
||||
if: ${{ failure() }}
|
||||
uses: mxschmitt/action-tmate@v3.19
|
||||
uses: mxschmitt/action-tmate@v3.22
|
||||
with:
|
||||
detached: true
|
||||
connect-timeout-seconds: 180
|
||||
|
||||
4
.github/workflows/notify-releases.yaml
vendored
4
.github/workflows/notify-releases.yaml
vendored
@@ -14,7 +14,7 @@ jobs:
|
||||
steps:
|
||||
- uses: mudler/localai-github-action@v1
|
||||
with:
|
||||
model: 'hermes-2-theta-llama-3-8b' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
|
||||
model: 'gemma-3-12b-it' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
|
||||
- name: Summarize
|
||||
id: summarize
|
||||
run: |
|
||||
@@ -60,4 +60,4 @@ jobs:
|
||||
DISCORD_AVATAR: "https://avatars.githubusercontent.com/u/139863280?v=4"
|
||||
uses: Ilshidur/action-discord@master
|
||||
with:
|
||||
args: ${{ steps.summarize.outputs.message }}
|
||||
args: ${{ steps.summarize.outputs.message }}
|
||||
|
||||
16
.github/workflows/release.yaml
vendored
16
.github/workflows/release.yaml
vendored
@@ -36,6 +36,7 @@ jobs:
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential ffmpeg protobuf-compiler ccache upx-ucl gawk
|
||||
sudo apt-get install -qy binutils-aarch64-linux-gnu gcc-aarch64-linux-gnu g++-aarch64-linux-gnu libgmock-dev
|
||||
make install-go-tools
|
||||
- name: Install CUDA Dependencies
|
||||
run: |
|
||||
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/cross-linux-aarch64/cuda-keyring_1.1-1_all.deb
|
||||
@@ -123,7 +124,7 @@ jobs:
|
||||
release/*
|
||||
- name: Setup tmate session if tests fail
|
||||
if: ${{ failure() }}
|
||||
uses: mxschmitt/action-tmate@v3.19
|
||||
uses: mxschmitt/action-tmate@v3.22
|
||||
with:
|
||||
detached: true
|
||||
connect-timeout-seconds: 180
|
||||
@@ -151,6 +152,7 @@ jobs:
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y wget curl build-essential ffmpeg protobuf-compiler ccache upx-ucl gawk cmake libgmock-dev
|
||||
make install-go-tools
|
||||
- name: Intel Dependencies
|
||||
run: |
|
||||
wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
|
||||
@@ -232,7 +234,7 @@ jobs:
|
||||
release/*
|
||||
- name: Setup tmate session if tests fail
|
||||
if: ${{ failure() }}
|
||||
uses: mxschmitt/action-tmate@v3.19
|
||||
uses: mxschmitt/action-tmate@v3.22
|
||||
with:
|
||||
detached: true
|
||||
connect-timeout-seconds: 180
|
||||
@@ -253,8 +255,7 @@ jobs:
|
||||
- name: Dependencies
|
||||
run: |
|
||||
brew install protobuf grpc
|
||||
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@8ba23be9613c672d40ae261d2a1335d639bdd59b
|
||||
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.0
|
||||
make install-go-tools
|
||||
- name: Build
|
||||
id: build
|
||||
run: |
|
||||
@@ -275,7 +276,7 @@ jobs:
|
||||
release/*
|
||||
- name: Setup tmate session if tests fail
|
||||
if: ${{ failure() }}
|
||||
uses: mxschmitt/action-tmate@v3.19
|
||||
uses: mxschmitt/action-tmate@v3.22
|
||||
with:
|
||||
detached: true
|
||||
connect-timeout-seconds: 180
|
||||
@@ -295,8 +296,7 @@ jobs:
|
||||
- name: Dependencies
|
||||
run: |
|
||||
brew install protobuf grpc libomp llvm
|
||||
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
|
||||
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
|
||||
make install-go-tools
|
||||
- name: Build
|
||||
id: build
|
||||
run: |
|
||||
@@ -317,7 +317,7 @@ jobs:
|
||||
release/*
|
||||
- name: Setup tmate session if tests fail
|
||||
if: ${{ failure() }}
|
||||
uses: mxschmitt/action-tmate@v3.19
|
||||
uses: mxschmitt/action-tmate@v3.22
|
||||
with:
|
||||
detached: true
|
||||
connect-timeout-seconds: 180
|
||||
|
||||
2
.github/workflows/secscan.yaml
vendored
2
.github/workflows/secscan.yaml
vendored
@@ -18,7 +18,7 @@ jobs:
|
||||
if: ${{ github.actor != 'dependabot[bot]' }}
|
||||
- name: Run Gosec Security Scanner
|
||||
if: ${{ github.actor != 'dependabot[bot]' }}
|
||||
uses: securego/gosec@v2.22.0
|
||||
uses: securego/gosec@v2.22.4
|
||||
with:
|
||||
# we let the report trigger content trigger a failure using the GitHub Security features.
|
||||
args: '-no-fail -fmt sarif -out results.sarif ./...'
|
||||
|
||||
20
.github/workflows/test-extra.yml
vendored
20
.github/workflows/test-extra.yml
vendored
@@ -78,6 +78,26 @@ jobs:
|
||||
make --jobs=5 --output-sync=target -C backend/python/diffusers
|
||||
make --jobs=5 --output-sync=target -C backend/python/diffusers test
|
||||
|
||||
#tests-vllm:
|
||||
# runs-on: ubuntu-latest
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v4
|
||||
# with:
|
||||
# submodules: true
|
||||
# - name: Dependencies
|
||||
# run: |
|
||||
# sudo apt-get update
|
||||
# sudo apt-get install -y build-essential ffmpeg
|
||||
# sudo apt-get install -y ca-certificates cmake curl patch python3-pip
|
||||
# sudo apt-get install -y libopencv-dev
|
||||
# # Install UV
|
||||
# curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||
# pip install --user --no-cache-dir grpcio-tools==1.64.1
|
||||
# - name: Test vllm backend
|
||||
# run: |
|
||||
# make --jobs=5 --output-sync=target -C backend/python/vllm
|
||||
# make --jobs=5 --output-sync=target -C backend/python/vllm test
|
||||
# tests-transformers-musicgen:
|
||||
# runs-on: ubuntu-latest
|
||||
# steps:
|
||||
|
||||
11
.github/workflows/test.yml
vendored
11
.github/workflows/test.yml
vendored
@@ -71,7 +71,7 @@ jobs:
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential ccache upx-ucl curl ffmpeg
|
||||
sudo apt-get install -y libgmock-dev
|
||||
sudo apt-get install -y libgmock-dev clang
|
||||
curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
|
||||
sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
|
||||
gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
|
||||
@@ -96,6 +96,7 @@ jobs:
|
||||
|
||||
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
|
||||
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
|
||||
go install github.com/GeertJohan/go.rice/rice@latest
|
||||
|
||||
# The python3-grpc-tools package in 22.04 is too old
|
||||
pip install --user grpcio-tools
|
||||
@@ -130,7 +131,7 @@ jobs:
|
||||
PATH="$PATH:/root/go/bin" GO_TAGS="tts" make --jobs 5 --output-sync=target test
|
||||
- name: Setup tmate session if tests fail
|
||||
if: ${{ failure() }}
|
||||
uses: mxschmitt/action-tmate@v3.19
|
||||
uses: mxschmitt/action-tmate@v3.22
|
||||
with:
|
||||
detached: true
|
||||
connect-timeout-seconds: 180
|
||||
@@ -183,6 +184,7 @@ jobs:
|
||||
rm protoc.zip
|
||||
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
|
||||
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
|
||||
go install github.com/GeertJohan/go.rice/rice@latest
|
||||
PATH="$PATH:$HOME/go/bin" make protogen-go
|
||||
- name: Build images
|
||||
run: |
|
||||
@@ -194,7 +196,7 @@ jobs:
|
||||
make run-e2e-aio
|
||||
- name: Setup tmate session if tests fail
|
||||
if: ${{ failure() }}
|
||||
uses: mxschmitt/action-tmate@v3.19
|
||||
uses: mxschmitt/action-tmate@v3.22
|
||||
with:
|
||||
detached: true
|
||||
connect-timeout-seconds: 180
|
||||
@@ -222,6 +224,7 @@ jobs:
|
||||
run: |
|
||||
brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc libomp llvm
|
||||
pip install --user --no-cache-dir grpcio-tools
|
||||
go install github.com/GeertJohan/go.rice/rice@latest
|
||||
- name: Test
|
||||
run: |
|
||||
export C_INCLUDE_PATH=/usr/local/include
|
||||
@@ -232,7 +235,7 @@ jobs:
|
||||
BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" make --jobs 4 --output-sync=target test
|
||||
- name: Setup tmate session if tests fail
|
||||
if: ${{ failure() }}
|
||||
uses: mxschmitt/action-tmate@v3.19
|
||||
uses: mxschmitt/action-tmate@v3.22
|
||||
with:
|
||||
detached: true
|
||||
connect-timeout-seconds: 180
|
||||
|
||||
20
Dockerfile
20
Dockerfile
@@ -15,7 +15,7 @@ ARG TARGETARCH
|
||||
ARG TARGETVARIANT
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,transformers:/build/backend/python/transformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,faster-whisper:/build/backend/python/faster-whisper/run.sh,kokoro:/build/backend/python/kokoro/run.sh,vllm:/build/backend/python/vllm/run.sh,exllama2:/build/backend/python/exllama2/run.sh"
|
||||
ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,transformers:/build/backend/python/transformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,faster-whisper:/build/backend/python/faster-whisper/run.sh,kokoro:/build/backend/python/kokoro/run.sh,vllm:/build/backend/python/vllm/run.sh,exllama2:/build/backend/python/exllama2/run.sh"
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
@@ -24,6 +24,7 @@ RUN apt-get update && \
|
||||
ca-certificates \
|
||||
curl libssl-dev \
|
||||
git \
|
||||
git-lfs \
|
||||
unzip upx-ucl && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
@@ -45,9 +46,10 @@ EOT
|
||||
RUN curl -L -s https://go.dev/dl/go${GO_VERSION}.linux-${TARGETARCH}.tar.gz | tar -C /usr/local -xz
|
||||
ENV PATH=$PATH:/root/go/bin:/usr/local/go/bin
|
||||
|
||||
# Install grpc compilers
|
||||
# Install grpc compilers and rice
|
||||
RUN go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2 && \
|
||||
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
|
||||
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af && \
|
||||
go install github.com/GeertJohan/go.rice/rice@latest
|
||||
|
||||
COPY --chmod=644 custom-ca-certs/* /usr/local/share/ca-certificates/
|
||||
RUN update-ca-certificates
|
||||
@@ -299,10 +301,9 @@ COPY .git .
|
||||
RUN make prepare
|
||||
|
||||
## Build the binary
|
||||
## If it's CUDA or hipblas, we want to skip some of the llama-compat backends to save space
|
||||
## We only leave the most CPU-optimized variant and the fallback for the cublas/hipblas build
|
||||
## (both will use CUDA or hipblas for the actual computation)
|
||||
RUN if [ "${BUILD_TYPE}" = "cublas" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \
|
||||
## If we're on arm64 AND using cublas/hipblas, skip some of the llama-compat backends to save space
|
||||
## Otherwise just run the normal build
|
||||
RUN if [ "${TARGETARCH}" = "arm64" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \
|
||||
SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx512 backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build; \
|
||||
else \
|
||||
make build; \
|
||||
@@ -354,14 +355,12 @@ FROM requirements-drivers
|
||||
|
||||
ARG FFMPEG
|
||||
ARG BUILD_TYPE
|
||||
ARG BUILD_PLATFORM
|
||||
ARG TARGETARCH
|
||||
ARG IMAGE_TYPE=extras
|
||||
ARG EXTRA_BACKENDS
|
||||
ARG MAKEFLAGS
|
||||
|
||||
ENV BUILD_TYPE=${BUILD_TYPE}
|
||||
ENV BUILD_PLATFORM=${BUILD_PLATFORM}
|
||||
ENV REBUILD=false
|
||||
ENV HEALTHCHECK_ENDPOINT=http://localhost:8080/readyz
|
||||
ENV MAKEFLAGS=${MAKEFLAGS}
|
||||
@@ -432,9 +431,6 @@ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "kokoro" || -z "${EXTRA_BACKENDS}" ) && "$IMA
|
||||
RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vllm" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
|
||||
make -C backend/python/vllm \
|
||||
; fi && \
|
||||
if [[ ( "${EXTRA_BACKENDS}" =~ "autogptq" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
|
||||
make -C backend/python/autogptq \
|
||||
; fi && \
|
||||
if [[ ( "${EXTRA_BACKENDS}" =~ "bark" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
|
||||
make -C backend/python/bark \
|
||||
; fi && \
|
||||
|
||||
2
LICENSE
2
LICENSE
@@ -1,6 +1,6 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2023-2024 Ettore Di Giacinto (mudler@localai.io)
|
||||
Copyright (c) 2023-2025 Ettore Di Giacinto (mudler@localai.io)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
|
||||
140
Makefile
140
Makefile
@@ -6,13 +6,11 @@ BINARY_NAME=local-ai
|
||||
DETECT_LIBS?=true
|
||||
|
||||
# llama.cpp versions
|
||||
GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
|
||||
GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
|
||||
CPPLLAMA_VERSION?=5598f475be3e31430fbe17ebb85654ec90dc201e
|
||||
CPPLLAMA_VERSION?=e5c834f718a32b7584f142799bbf508fddb9021c
|
||||
|
||||
# whisper.cpp version
|
||||
WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
|
||||
WHISPER_CPP_VERSION?=6266a9f9e56a5b925e9892acf650f3eb1245814d
|
||||
WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp
|
||||
WHISPER_CPP_VERSION?=e41bc5c61ae66af6be2bd7011769bb821a83e8ae
|
||||
|
||||
# go-piper version
|
||||
PIPER_REPO?=https://github.com/mudler/go-piper
|
||||
@@ -23,8 +21,11 @@ BARKCPP_REPO?=https://github.com/PABannier/bark.cpp.git
|
||||
BARKCPP_VERSION?=v1.0.0
|
||||
|
||||
# stablediffusion.cpp (ggml)
|
||||
STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
|
||||
STABLEDIFFUSION_GGML_VERSION?=5eb15ef4d022bef4a391de4f5f6556e81fbb5024
|
||||
STABLEDIFFUSION_GGML_REPO?=https://github.com/richiejp/stable-diffusion.cpp
|
||||
STABLEDIFFUSION_GGML_VERSION?=53e3b17eb3d0b5760ced06a1f98320b68b34aaae
|
||||
|
||||
# ONEAPI variables for SYCL
|
||||
export ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
|
||||
|
||||
ONNX_VERSION?=1.20.0
|
||||
ONNX_ARCH?=x64
|
||||
@@ -32,8 +33,12 @@ ONNX_OS?=linux
|
||||
|
||||
export BUILD_TYPE?=
|
||||
export STABLE_BUILD_TYPE?=$(BUILD_TYPE)
|
||||
export CMAKE_ARGS?=
|
||||
export CMAKE_ARGS?=-DBUILD_SHARED_LIBS=OFF
|
||||
export WHISPER_CMAKE_ARGS?=-DBUILD_SHARED_LIBS=OFF
|
||||
export BACKEND_LIBS?=
|
||||
export WHISPER_DIR=$(abspath ./sources/whisper.cpp)
|
||||
export WHISPER_INCLUDE_PATH=$(WHISPER_DIR)/include:$(WHISPER_DIR)/ggml/include
|
||||
export WHISPER_LIBRARY_PATH=$(WHISPER_DIR)/build/src/:$(WHISPER_DIR)/build/ggml/src
|
||||
|
||||
CGO_LDFLAGS?=
|
||||
CGO_LDFLAGS_WHISPER?=
|
||||
@@ -83,6 +88,7 @@ endif
|
||||
# IF native is false, we add -DGGML_NATIVE=OFF to CMAKE_ARGS
|
||||
ifeq ($(NATIVE),false)
|
||||
CMAKE_ARGS+=-DGGML_NATIVE=OFF
|
||||
WHISPER_CMAKE_ARGS+=-DGGML_NATIVE=OFF
|
||||
endif
|
||||
|
||||
# Detect if we are running on arm64
|
||||
@@ -110,13 +116,31 @@ ifeq ($(OS),Darwin)
|
||||
# disable metal if on Darwin and any other value is explicitly passed.
|
||||
else ifneq ($(BUILD_TYPE),metal)
|
||||
CMAKE_ARGS+=-DGGML_METAL=OFF
|
||||
WHISPER_CMAKE_ARGS+=-DGGML_METAL=OFF
|
||||
export GGML_NO_ACCELERATE=1
|
||||
export GGML_NO_METAL=1
|
||||
GO_LDFLAGS_WHISPER+=-lggml-blas
|
||||
export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-blas
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_TYPE),metal)
|
||||
# -lcblas removed: it seems to always be listed as a duplicate flag.
|
||||
CGO_LDFLAGS += -framework Accelerate
|
||||
CGO_LDFLAGS_WHISPER+=-lggml-metal -lggml-blas
|
||||
CMAKE_ARGS+=-DGGML_METAL=ON
|
||||
CMAKE_ARGS+=-DGGML_METAL_USE_BF16=ON
|
||||
CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
|
||||
CMAKE_ARGS+=-DGGML_OPENMP=OFF
|
||||
WHISPER_CMAKE_ARGS+=-DGGML_METAL=ON
|
||||
WHISPER_CMAKE_ARGS+=-DGGML_METAL_USE_BF16=ON
|
||||
WHISPER_CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
|
||||
WHISPER_CMAKE_ARGS+=-DWHISPER_BUILD_EXAMPLES=OFF
|
||||
WHISPER_CMAKE_ARGS+=-DWHISPER_BUILD_TESTS=OFF
|
||||
WHISPER_CMAKE_ARGS+=-DWHISPER_BUILD_SERVER=OFF
|
||||
WHISPER_CMAKE_ARGS+=-DGGML_OPENMP=OFF
|
||||
export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-metal/:$(WHISPER_DIR)/build/ggml/src/ggml-blas
|
||||
else
|
||||
CGO_LDFLAGS_WHISPER+=-lggml-blas
|
||||
export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-blas
|
||||
endif
|
||||
else
|
||||
CGO_LDFLAGS_WHISPER+=-lgomp
|
||||
@@ -128,21 +152,29 @@ ifeq ($(BUILD_TYPE),openblas)
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_TYPE),cublas)
|
||||
CGO_LDFLAGS+=-lcublas -lcudart -L$(CUDA_LIBPATH)
|
||||
CGO_LDFLAGS+=-lcublas -lcudart -L$(CUDA_LIBPATH) -L$(CUDA_LIBPATH)/stubs/ -lcuda
|
||||
export GGML_CUDA=1
|
||||
CGO_LDFLAGS_WHISPER+=-L$(CUDA_LIBPATH)/stubs/ -lcuda -lcufft
|
||||
CMAKE_ARGS+=-DGGML_CUDA=ON
|
||||
WHISPER_CMAKE_ARGS+=-DGGML_CUDA=ON
|
||||
CGO_LDFLAGS_WHISPER+=-lcufft -lggml-cuda
|
||||
export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-cuda/
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_TYPE),vulkan)
|
||||
CMAKE_ARGS+=-DGGML_VULKAN=1
|
||||
WHISPER_CMAKE_ARGS+=-DGGML_VULKAN=1
|
||||
CGO_LDFLAGS_WHISPER+=-lggml-vulkan -lvulkan
|
||||
export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-vulkan/
|
||||
endif
|
||||
|
||||
ifneq (,$(findstring sycl,$(BUILD_TYPE)))
|
||||
export GGML_SYCL=1
|
||||
CMAKE_ARGS+=-DGGML_SYCL=ON
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_TYPE),sycl_f16)
|
||||
export GGML_SYCL_F16=1
|
||||
CMAKE_ARGS+=-DGGML_SYCL_F16=ON
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_TYPE),hipblas)
|
||||
@@ -151,10 +183,9 @@ ifeq ($(BUILD_TYPE),hipblas)
|
||||
LD_LIBRARY_PATH ?= /opt/rocm/lib:/opt/rocm/llvm/lib
|
||||
export CXX=$(ROCM_HOME)/llvm/bin/clang++
|
||||
export CC=$(ROCM_HOME)/llvm/bin/clang
|
||||
# llama-ggml has no hipblas support, so override it here.
|
||||
export STABLE_BUILD_TYPE=
|
||||
export GGML_HIP=1
|
||||
GPU_TARGETS ?= gfx900,gfx906,gfx908,gfx940,gfx941,gfx942,gfx90a,gfx1030,gfx1031,gfx1100,gfx1101
|
||||
GPU_TARGETS ?= gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102
|
||||
AMDGPU_TARGETS ?= "$(GPU_TARGETS)"
|
||||
CMAKE_ARGS+=-DGGML_HIP=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)"
|
||||
CGO_LDFLAGS += -O3 --rtlib=compiler-rt -unwindlib=libgcc -lhipblas -lrocblas --hip-link -L${ROCM_HOME}/lib/llvm/lib
|
||||
@@ -188,7 +219,6 @@ ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx
|
||||
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx2
|
||||
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx512
|
||||
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-fallback
|
||||
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml
|
||||
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-grpc
|
||||
ALL_GRPC_BACKENDS+=backend-assets/util/llama-cpp-rpc-server
|
||||
ALL_GRPC_BACKENDS+=backend-assets/grpc/whisper
|
||||
@@ -222,19 +252,6 @@ endif
|
||||
|
||||
all: help
|
||||
|
||||
## go-llama.cpp
|
||||
sources/go-llama.cpp:
|
||||
mkdir -p sources/go-llama.cpp
|
||||
cd sources/go-llama.cpp && \
|
||||
git init && \
|
||||
git remote add origin $(GOLLAMA_REPO) && \
|
||||
git fetch origin && \
|
||||
git checkout $(GOLLAMA_VERSION) && \
|
||||
git submodule update --init --recursive --depth 1 --single-branch
|
||||
|
||||
sources/go-llama.cpp/libbinding.a: sources/go-llama.cpp
|
||||
$(MAKE) -C sources/go-llama.cpp BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a
|
||||
|
||||
## bark.cpp
|
||||
sources/bark.cpp:
|
||||
git clone --recursive $(BARKCPP_REPO) sources/bark.cpp && \
|
||||
@@ -277,11 +294,7 @@ backend/go/image/stablediffusion-ggml/libsd.a: sources/stablediffusion-ggml.cpp
|
||||
$(MAKE) -C backend/go/image/stablediffusion-ggml libsd.a
|
||||
|
||||
backend-assets/grpc/stablediffusion-ggml: backend/go/image/stablediffusion-ggml/libsd.a backend-assets/grpc
|
||||
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/backend/go/image/stablediffusion-ggml/ LIBRARY_PATH=$(CURDIR)/backend/go/image/stablediffusion-ggml/ \
|
||||
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion-ggml ./backend/go/image/stablediffusion-ggml/
|
||||
ifneq ($(UPX),)
|
||||
$(UPX) backend-assets/grpc/stablediffusion-ggml
|
||||
endif
|
||||
$(MAKE) -C backend/go/image/stablediffusion-ggml CGO_LDFLAGS="$(CGO_LDFLAGS)" stablediffusion-ggml
|
||||
|
||||
sources/onnxruntime:
|
||||
mkdir -p sources/onnxruntime
|
||||
@@ -307,22 +320,21 @@ sources/whisper.cpp:
|
||||
git checkout $(WHISPER_CPP_VERSION) && \
|
||||
git submodule update --init --recursive --depth 1 --single-branch
|
||||
|
||||
sources/whisper.cpp/libwhisper.a: sources/whisper.cpp
|
||||
cd sources/whisper.cpp && $(MAKE) libwhisper.a libggml.a
|
||||
sources/whisper.cpp/build/src/libwhisper.a: sources/whisper.cpp
|
||||
cd sources/whisper.cpp && cmake $(WHISPER_CMAKE_ARGS) . -B ./build
|
||||
cd sources/whisper.cpp/build && cmake --build . --config Release
|
||||
|
||||
get-sources: sources/go-llama.cpp sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp backend/cpp/llama/llama.cpp
|
||||
get-sources: sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp backend/cpp/llama/llama.cpp
|
||||
|
||||
replace:
|
||||
$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(CURDIR)/sources/whisper.cpp
|
||||
$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp/bindings/go=$(CURDIR)/sources/whisper.cpp/bindings/go
|
||||
$(GOCMD) mod edit -replace github.com/mudler/go-piper=$(CURDIR)/sources/go-piper
|
||||
$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama.cpp
|
||||
|
||||
dropreplace:
|
||||
$(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp
|
||||
$(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp/bindings/go
|
||||
$(GOCMD) mod edit -dropreplace github.com/mudler/go-piper
|
||||
$(GOCMD) mod edit -dropreplace github.com/go-skynet/go-llama.cpp
|
||||
|
||||
prepare-sources: get-sources replace
|
||||
$(GOCMD) mod download
|
||||
@@ -330,7 +342,6 @@ prepare-sources: get-sources replace
|
||||
## GENERIC
|
||||
rebuild: ## Rebuilds the project
|
||||
$(GOCMD) clean -cache
|
||||
$(MAKE) -C sources/go-llama.cpp clean
|
||||
$(MAKE) -C sources/whisper.cpp clean
|
||||
$(MAKE) -C sources/go-piper clean
|
||||
$(MAKE) build
|
||||
@@ -361,8 +372,14 @@ clean-tests:
|
||||
clean-dc: clean
|
||||
cp -r /build/backend-assets /workspace/backend-assets
|
||||
|
||||
## Install Go tools
|
||||
install-go-tools:
|
||||
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
|
||||
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
|
||||
go install github.com/GeertJohan/go.rice/rice@latest
|
||||
|
||||
## Build:
|
||||
build: prepare backend-assets grpcs ## Build the project
|
||||
build: prepare backend-assets grpcs install-go-tools ## Build the project
|
||||
$(info ${GREEN}I local-ai build info:${RESET})
|
||||
$(info ${GREEN}I BUILD_TYPE: ${YELLOW}$(BUILD_TYPE)${RESET})
|
||||
$(info ${GREEN}I GO_TAGS: ${YELLOW}$(GO_TAGS)${RESET})
|
||||
@@ -372,7 +389,9 @@ ifneq ($(BACKEND_LIBS),)
|
||||
$(MAKE) backend-assets/lib
|
||||
cp -f $(BACKEND_LIBS) backend-assets/lib/
|
||||
endif
|
||||
rm -rf $(BINARY_NAME) || true
|
||||
CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o $(BINARY_NAME) ./
|
||||
rice append --exec $(BINARY_NAME)
|
||||
|
||||
build-minimal:
|
||||
BUILD_GRPC_FOR_BACKEND_LLAMA=true GRPC_BACKENDS="backend-assets/grpc/llama-cpp-avx2" GO_TAGS=p2p $(MAKE) build
|
||||
@@ -434,7 +453,7 @@ run: prepare ## run local-ai
|
||||
test-models/testmodel.ggml:
|
||||
mkdir test-models
|
||||
mkdir test-dir
|
||||
wget -q https://huggingface.co/TheBloke/orca_mini_3B-GGML/resolve/main/orca-mini-3b.ggmlv3.q4_0.bin -O test-models/testmodel.ggml
|
||||
wget -q https://huggingface.co/RichardErkhov/Qwen_-_Qwen2-1.5B-Instruct-gguf/resolve/main/Qwen2-1.5B-Instruct.Q2_K.gguf -O test-models/testmodel.ggml
|
||||
wget -q https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin -O test-models/whisper-en
|
||||
wget -q https://huggingface.co/mudler/all-MiniLM-L6-v2/resolve/main/ggml-model-q4_0.bin -O test-models/bert
|
||||
wget -q https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav -O test-dir/audio.wav
|
||||
@@ -444,13 +463,13 @@ prepare-test: grpcs
|
||||
cp -rf backend-assets core/http
|
||||
cp tests/models_fixtures/* test-models
|
||||
|
||||
## Test targets
|
||||
test: prepare test-models/testmodel.ggml grpcs
|
||||
@echo 'Running tests'
|
||||
export GO_TAGS="tts debug"
|
||||
$(MAKE) prepare-test
|
||||
HUGGINGFACE_GRPC=$(abspath ./)/backend/python/transformers/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
|
||||
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama && !llama-gguf" --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
|
||||
$(MAKE) test-llama
|
||||
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama-gguf" --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
|
||||
$(MAKE) test-llama-gguf
|
||||
$(MAKE) test-tts
|
||||
$(MAKE) test-stablediffusion
|
||||
@@ -479,10 +498,6 @@ teardown-e2e:
|
||||
rm -rf $(TEST_DIR) || true
|
||||
docker stop $$(docker ps -q --filter ancestor=localai-tests)
|
||||
|
||||
test-llama: prepare-test
|
||||
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
|
||||
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
|
||||
|
||||
test-llama-gguf: prepare-test
|
||||
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
|
||||
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
|
||||
@@ -523,7 +538,7 @@ protogen: protogen-go protogen-python
|
||||
protogen-clean: protogen-go-clean protogen-python-clean
|
||||
|
||||
.PHONY: protogen-go
|
||||
protogen-go:
|
||||
protogen-go: install-go-tools
|
||||
mkdir -p pkg/grpc/proto
|
||||
protoc --experimental_allow_proto3_optional -Ibackend/ --go_out=pkg/grpc/proto/ --go_opt=paths=source_relative --go-grpc_out=pkg/grpc/proto/ --go-grpc_opt=paths=source_relative \
|
||||
backend/backend.proto
|
||||
@@ -534,18 +549,10 @@ protogen-go-clean:
|
||||
$(RM) bin/*
|
||||
|
||||
.PHONY: protogen-python
|
||||
protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama2-protogen rerankers-protogen transformers-protogen kokoro-protogen vllm-protogen faster-whisper-protogen
|
||||
protogen-python: bark-protogen coqui-protogen diffusers-protogen exllama2-protogen rerankers-protogen transformers-protogen kokoro-protogen vllm-protogen faster-whisper-protogen
|
||||
|
||||
.PHONY: protogen-python-clean
|
||||
protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama2-protogen-clean rerankers-protogen-clean transformers-protogen-clean kokoro-protogen-clean vllm-protogen-clean faster-whisper-protogen-clean
|
||||
|
||||
.PHONY: autogptq-protogen
|
||||
autogptq-protogen:
|
||||
$(MAKE) -C backend/python/autogptq protogen
|
||||
|
||||
.PHONY: autogptq-protogen-clean
|
||||
autogptq-protogen-clean:
|
||||
$(MAKE) -C backend/python/autogptq protogen-clean
|
||||
protogen-python-clean: bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama2-protogen-clean rerankers-protogen-clean transformers-protogen-clean kokoro-protogen-clean vllm-protogen-clean faster-whisper-protogen-clean
|
||||
|
||||
.PHONY: bark-protogen
|
||||
bark-protogen:
|
||||
@@ -622,7 +629,6 @@ vllm-protogen-clean:
|
||||
## GRPC
|
||||
# Note: it is duplicated in the Dockerfile
|
||||
prepare-extra-conda-environments: protogen-python
|
||||
$(MAKE) -C backend/python/autogptq
|
||||
$(MAKE) -C backend/python/bark
|
||||
$(MAKE) -C backend/python/coqui
|
||||
$(MAKE) -C backend/python/diffusers
|
||||
@@ -636,10 +642,12 @@ prepare-extra-conda-environments: protogen-python
|
||||
prepare-test-extra: protogen-python
|
||||
$(MAKE) -C backend/python/transformers
|
||||
$(MAKE) -C backend/python/diffusers
|
||||
$(MAKE) -C backend/python/vllm
|
||||
|
||||
test-extra: prepare-test-extra
|
||||
$(MAKE) -C backend/python/transformers test
|
||||
$(MAKE) -C backend/python/diffusers test
|
||||
$(MAKE) -C backend/python/vllm test
|
||||
|
||||
backend-assets:
|
||||
mkdir -p backend-assets
|
||||
@@ -760,13 +768,6 @@ backend-assets/util/llama-cpp-rpc-server: backend-assets/grpc/llama-cpp-grpc
|
||||
mkdir -p backend-assets/util/
|
||||
cp -rf backend/cpp/llama-grpc/llama.cpp/build/bin/rpc-server backend-assets/util/llama-cpp-rpc-server
|
||||
|
||||
backend-assets/grpc/llama-ggml: sources/go-llama.cpp sources/go-llama.cpp/libbinding.a backend-assets/grpc
|
||||
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama.cpp LIBRARY_PATH=$(CURDIR)/sources/go-llama.cpp \
|
||||
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/
|
||||
ifneq ($(UPX),)
|
||||
$(UPX) backend-assets/grpc/llama-ggml
|
||||
endif
|
||||
|
||||
backend-assets/grpc/bark-cpp: backend/go/bark/libbark.a backend-assets/grpc
|
||||
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/backend/go/bark/ LIBRARY_PATH=$(CURDIR)/backend/go/bark/ \
|
||||
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bark-cpp ./backend/go/bark/
|
||||
@@ -788,8 +789,8 @@ ifneq ($(UPX),)
|
||||
$(UPX) backend-assets/grpc/silero-vad
|
||||
endif
|
||||
|
||||
backend-assets/grpc/whisper: sources/whisper.cpp sources/whisper.cpp/libwhisper.a backend-assets/grpc
|
||||
CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_WHISPER)" C_INCLUDE_PATH="$(CURDIR)/sources/whisper.cpp/include:$(CURDIR)/sources/whisper.cpp/ggml/include" LIBRARY_PATH=$(CURDIR)/sources/whisper.cpp \
|
||||
backend-assets/grpc/whisper: sources/whisper.cpp sources/whisper.cpp/build/src/libwhisper.a backend-assets/grpc
|
||||
CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_WHISPER)" C_INCLUDE_PATH="${WHISPER_INCLUDE_PATH}" LIBRARY_PATH="${WHISPER_LIBRARY_PATH}" LD_LIBRARY_PATH="${WHISPER_LIBRARY_PATH}" \
|
||||
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./backend/go/transcribe/whisper
|
||||
ifneq ($(UPX),)
|
||||
$(UPX) backend-assets/grpc/whisper
|
||||
@@ -841,7 +842,8 @@ docker-aio-all:
|
||||
|
||||
docker-image-intel:
|
||||
docker build \
|
||||
--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04 \
|
||||
--progress plain \
|
||||
--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.1.0-0-devel-ubuntu24.04 \
|
||||
--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
|
||||
--build-arg GO_TAGS="none" \
|
||||
--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
|
||||
@@ -849,7 +851,7 @@ docker-image-intel:
|
||||
|
||||
docker-image-intel-xpu:
|
||||
docker build \
|
||||
--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04 \
|
||||
--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.1.0-0-devel-ubuntu22.04 \
|
||||
--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
|
||||
--build-arg GO_TAGS="none" \
|
||||
--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
|
||||
|
||||
106
README.md
106
README.md
@@ -1,7 +1,6 @@
|
||||
<h1 align="center">
|
||||
<br>
|
||||
<img height="300" src="https://github.com/go-skynet/LocalAI/assets/2420543/0966aa2a-166e-4f99-a3e5-6c915fc997dd"> <br>
|
||||
LocalAI
|
||||
<img height="300" src="./core/http/static/logo.png"> <br>
|
||||
<br>
|
||||
</h1>
|
||||
|
||||
@@ -31,7 +30,7 @@
|
||||
|
||||
<p align="center">
|
||||
<a href="https://twitter.com/LocalAI_API" target="blank">
|
||||
<img src="https://img.shields.io/twitter/follow/LocalAI_API?label=Follow: LocalAI_API&style=social" alt="Follow LocalAI_API"/>
|
||||
<img src="https://img.shields.io/badge/X-%23000000.svg?style=for-the-badge&logo=X&logoColor=white&label=LocalAI_API" alt="Follow LocalAI_API"/>
|
||||
</a>
|
||||
<a href="https://discord.gg/uJAeKSAGDy" target="blank">
|
||||
<img src="https://dcbadge.vercel.app/api/server/uJAeKSAGDy?style=flat-square&theme=default-inverted" alt="Join LocalAI Discord Community"/>
|
||||
@@ -44,32 +43,89 @@
|
||||
|
||||
> :bulb: Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) [:book: Documentation website](https://localai.io/)
|
||||
>
|
||||
> [💻 Quickstart](https://localai.io/basics/getting_started/) [🖼️ Models](https://models.localai.io/) [🚀 Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap) [🥽 Demo](https://demo.localai.io) [🌍 Explorer](https://explorer.localai.io) [🛫 Examples](https://github.com/mudler/LocalAI-examples)
|
||||
> [💻 Quickstart](https://localai.io/basics/getting_started/) [🖼️ Models](https://models.localai.io/) [🚀 Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap) [🥽 Demo](https://demo.localai.io) [🌍 Explorer](https://explorer.localai.io) [🛫 Examples](https://github.com/mudler/LocalAI-examples) Try on
|
||||
[](https://t.me/localaiofficial_bot)
|
||||
|
||||
[](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[](https://artifacthub.io/packages/search?repo=localai)
|
||||
|
||||
**LocalAI** is the free, Open Source OpenAI alternative. LocalAI act as a drop-in replacement REST API that’s compatible with OpenAI (Elevenlabs, Anthropic... ) API specifications for local AI inferencing. It allows you to run LLMs, generate images, audio (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families. Does not require GPU. It is created and maintained by [Ettore Di Giacinto](https://github.com/mudler).
|
||||
**LocalAI** is the free, Open Source OpenAI alternative. LocalAI act as a drop-in replacement REST API that's compatible with OpenAI (Elevenlabs, Anthropic... ) API specifications for local AI inferencing. It allows you to run LLMs, generate images, audio (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families. Does not require GPU. It is created and maintained by [Ettore Di Giacinto](https://github.com/mudler).
|
||||
|
||||

|
||||
|
||||
## 📚🆕 Local Stack Family
|
||||
|
||||
🆕 LocalAI is now part of a comprehensive suite of AI tools designed to work together:
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<td width="50%" valign="top">
|
||||
<a href="https://github.com/mudler/LocalAGI">
|
||||
<img src="https://raw.githubusercontent.com/mudler/LocalAGI/refs/heads/main/webui/react-ui/public/logo_2.png" width="300" alt="LocalAGI Logo">
|
||||
</a>
|
||||
</td>
|
||||
<td width="50%" valign="top">
|
||||
<h3><a href="https://github.com/mudler/LocalAGI">LocalAGI</a></h3>
|
||||
<p>A powerful Local AI agent management platform that serves as a drop-in replacement for OpenAI's Responses API, enhanced with advanced agentic capabilities.</p>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td width="50%" valign="top">
|
||||
<a href="https://github.com/mudler/LocalRecall">
|
||||
<img src="https://raw.githubusercontent.com/mudler/LocalRecall/refs/heads/main/static/localrecall_horizontal.png" width="300" alt="LocalRecall Logo">
|
||||
</a>
|
||||
</td>
|
||||
<td width="50%" valign="top">
|
||||
<h3><a href="https://github.com/mudler/LocalRecall">LocalRecall</a></h3>
|
||||
<p>A REST-ful API and knowledge base management system that provides persistent memory and storage capabilities for AI agents.</p>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
## Screenshots
|
||||
|
||||
|
||||
| Talk Interface | Generate Audio |
|
||||
| --- | --- |
|
||||
|  |  |
|
||||
|
||||
| Models Overview | Generate Images |
|
||||
| --- | --- |
|
||||
|  |  |
|
||||
|
||||
| Chat Interface | Home |
|
||||
| --- | --- |
|
||||
|  |  |
|
||||
|
||||
| Login | Swarm |
|
||||
| --- | --- |
|
||||
| |  |
|
||||
|
||||
## 💻 Quickstart
|
||||
|
||||
Run the installer script:
|
||||
|
||||
```bash
|
||||
# Basic installation
|
||||
curl https://localai.io/install.sh | sh
|
||||
```
|
||||
|
||||
For more installation options, see [Installer Options](https://localai.io/docs/advanced/installer/).
|
||||
|
||||
Or run with docker:
|
||||
|
||||
### CPU only image:
|
||||
```bash
|
||||
# CPU only image:
|
||||
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-cpu
|
||||
|
||||
# Nvidia GPU:
|
||||
```
|
||||
### Nvidia GPU:
|
||||
```bash
|
||||
docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-12
|
||||
|
||||
# CPU and GPU image (bigger size):
|
||||
```
|
||||
### CPU and GPU image (bigger size):
|
||||
```bash
|
||||
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest
|
||||
|
||||
# AIO images (it will pre-download a set of models ready for use, see https://localai.io/basics/container/)
|
||||
```
|
||||
### AIO images (it will pre-download a set of models ready for use, see https://localai.io/basics/container/)
|
||||
```bash
|
||||
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
|
||||
```
|
||||
|
||||
@@ -88,10 +144,13 @@ local-ai run https://gist.githubusercontent.com/.../phi-2.yaml
|
||||
local-ai run oci://localai/phi-2:latest
|
||||
```
|
||||
|
||||
[💻 Getting started](https://localai.io/basics/getting_started/index.html)
|
||||
For more information, see [💻 Getting started](https://localai.io/basics/getting_started/index.html)
|
||||
|
||||
## 📰 Latest project news
|
||||
|
||||
- Apr 2025: [LocalAGI](https://github.com/mudler/LocalAGI) and [LocalRecall](https://github.com/mudler/LocalRecall) join the LocalAI family stack.
|
||||
- Apr 2025: WebUI overhaul, AIO images updates
|
||||
- Feb 2025: Backend cleanup, Breaking changes, new backends (kokoro, OutelTTS, faster-whisper), Nvidia L4T images
|
||||
- Jan 2025: LocalAI model release: https://huggingface.co/mudler/LocalAI-functioncall-phi-4-v0.3, SANA support in diffusers: https://github.com/mudler/LocalAI/pull/4603
|
||||
- Dec 2024: stablediffusion.cpp backend (ggml) added ( https://github.com/mudler/LocalAI/pull/4289 )
|
||||
- Nov 2024: Bark.cpp backend added ( https://github.com/mudler/LocalAI/pull/4287 )
|
||||
@@ -105,19 +164,6 @@ local-ai run oci://localai/phi-2:latest
|
||||
|
||||
Roadmap items: [List of issues](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
|
||||
|
||||
## 🔥🔥 Hot topics (looking for help):
|
||||
|
||||
- Multimodal with vLLM and Video understanding: https://github.com/mudler/LocalAI/pull/3729
|
||||
- Realtime API https://github.com/mudler/LocalAI/issues/3714
|
||||
- WebUI improvements: https://github.com/mudler/LocalAI/issues/2156
|
||||
- Backends v2: https://github.com/mudler/LocalAI/issues/1126
|
||||
- Improving UX v2: https://github.com/mudler/LocalAI/issues/1373
|
||||
- Assistant API: https://github.com/mudler/LocalAI/issues/1273
|
||||
- Vulkan: https://github.com/mudler/LocalAI/issues/1647
|
||||
- Anthropic API: https://github.com/mudler/LocalAI/issues/1808
|
||||
|
||||
If you want to help and contribute, issues up for grabs: https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3A%22up+for+grabs%22
|
||||
|
||||
## 🚀 [Features](https://localai.io/features/)
|
||||
|
||||
- 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `transformers`, `vllm` ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table))
|
||||
@@ -131,12 +177,10 @@ If you want to help and contribute, issues up for grabs: https://github.com/mudl
|
||||
- 🥽 [Vision API](https://localai.io/features/gpt-vision/)
|
||||
- 📈 [Reranker API](https://localai.io/features/reranker/)
|
||||
- 🆕🖧 [P2P Inferencing](https://localai.io/features/distribute/)
|
||||
- [Agentic capabilities](https://github.com/mudler/LocalAGI)
|
||||
- 🔊 Voice activity detection (Silero-VAD support)
|
||||
- 🌍 Integrated WebUI!
|
||||
|
||||
## 💻 Usage
|
||||
|
||||
Check out the [Getting started](https://localai.io/basics/getting_started/index.html) section in our documentation.
|
||||
|
||||
### 🔗 Community and integrations
|
||||
|
||||
@@ -212,7 +256,7 @@ A huge thank you to our generous sponsors who support this project covering CI e
|
||||
|
||||
<p align="center">
|
||||
<a href="https://www.spectrocloud.com/" target="blank">
|
||||
<img height="200" src="https://github.com/go-skynet/LocalAI/assets/2420543/68a6f3cb-8a65-4a4d-99b5-6417a8905512">
|
||||
<img height="200" src="https://github.com/user-attachments/assets/72eab1dd-8b93-4fc0-9ade-84db49f24962">
|
||||
</a>
|
||||
<a href="https://www.premai.io/" target="blank">
|
||||
<img height="200" src="https://github.com/mudler/LocalAI/assets/2420543/42e4ca83-661e-4f79-8e46-ae43689683d6"> <br>
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
name: text-embedding-ada-002
|
||||
embeddings: true
|
||||
name: text-embedding-ada-002
|
||||
parameters:
|
||||
model: huggingface://hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF/llama-3.2-1b-instruct-q4_k_m.gguf
|
||||
model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf
|
||||
|
||||
usage: |
|
||||
You can test this model with curl like this:
|
||||
|
||||
@@ -1,101 +1,57 @@
|
||||
name: gpt-4
|
||||
mmap: true
|
||||
parameters:
|
||||
model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
|
||||
context_size: 8192
|
||||
|
||||
stopwords:
|
||||
- "<|im_end|>"
|
||||
- "<dummy32000>"
|
||||
- "</tool_call>"
|
||||
- "<|eot_id|>"
|
||||
- "<|end_of_text|>"
|
||||
|
||||
f16: true
|
||||
function:
|
||||
# disable injecting the "answer" tool
|
||||
disable_no_action: true
|
||||
|
||||
grammar:
|
||||
# This allows the grammar to also return messages
|
||||
mixed_mode: true
|
||||
# Suffix to add to the grammar
|
||||
#prefix: '<tool_call>\n'
|
||||
# Force parallel calls in the grammar
|
||||
# parallel_calls: true
|
||||
|
||||
return_name_in_function_response: true
|
||||
# Without grammar uncomment the lines below
|
||||
# Warning: this is relying only on the capability of the
|
||||
# LLM model to generate the correct function call.
|
||||
json_regex_match:
|
||||
- "(?s)<tool_call>(.*?)</tool_call>"
|
||||
- "(?s)<tool_call>(.*?)"
|
||||
replace_llm_results:
|
||||
# Drop the scratchpad content from responses
|
||||
- key: "(?s)<scratchpad>.*</scratchpad>"
|
||||
value: ""
|
||||
replace_function_results:
|
||||
# Replace everything that is not JSON array or object
|
||||
#
|
||||
- key: '(?s)^[^{\[]*'
|
||||
value: ""
|
||||
- key: '(?s)[^}\]]*$'
|
||||
value: ""
|
||||
- key: "'([^']*?)'"
|
||||
value: "_DQUOTE_${1}_DQUOTE_"
|
||||
- key: '\\"'
|
||||
value: "__TEMP_QUOTE__"
|
||||
- key: "\'"
|
||||
value: "'"
|
||||
- key: "_DQUOTE_"
|
||||
value: '"'
|
||||
- key: "__TEMP_QUOTE__"
|
||||
value: '"'
|
||||
# Drop the scratchpad content from responses
|
||||
- key: "(?s)<scratchpad>.*</scratchpad>"
|
||||
value: ""
|
||||
|
||||
no_mixed_free_string: true
|
||||
schema_type: llama3.1 # or JSON is supported too (json)
|
||||
response_regex:
|
||||
- <function=(?P<name>\w+)>(?P<arguments>.*)</function>
|
||||
mmap: true
|
||||
name: gpt-4
|
||||
parameters:
|
||||
model: Hermes-3-Llama-3.2-3B-Q4_K_M.gguf
|
||||
stopwords:
|
||||
- <|im_end|>
|
||||
- <dummy32000>
|
||||
- <|eot_id|>
|
||||
- <|end_of_text|>
|
||||
template:
|
||||
chat: |
|
||||
{{.Input -}}
|
||||
<|im_start|>assistant
|
||||
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
|
||||
You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>
|
||||
{{.Input }}
|
||||
<|start_header_id|>assistant<|end_header_id|>
|
||||
chat_message: |
|
||||
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
|
||||
{{- if .FunctionCall }}
|
||||
<tool_call>
|
||||
{{- else if eq .RoleName "tool" }}
|
||||
<tool_response>
|
||||
{{- end }}
|
||||
{{- if .Content}}
|
||||
{{.Content }}
|
||||
{{- end }}
|
||||
{{- if .FunctionCall}}
|
||||
{{toJson .FunctionCall}}
|
||||
{{- end }}
|
||||
{{- if .FunctionCall }}
|
||||
</tool_call>
|
||||
{{- else if eq .RoleName "tool" }}
|
||||
</tool_response>
|
||||
{{- end }}<|im_end|>
|
||||
<|start_header_id|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}<|end_header_id|>
|
||||
{{ if .FunctionCall -}}
|
||||
{{ else if eq .RoleName "tool" -}}
|
||||
The Function was executed and the response was:
|
||||
{{ end -}}
|
||||
{{ if .Content -}}
|
||||
{{.Content -}}
|
||||
{{ else if .FunctionCall -}}
|
||||
{{ range .FunctionCall }}
|
||||
[{{.FunctionCall.Name}}({{.FunctionCall.Arguments}})]
|
||||
{{ end }}
|
||||
{{ end -}}
|
||||
<|eot_id|>
|
||||
completion: |
|
||||
{{.Input}}
|
||||
function: |-
|
||||
<|im_start|>system
|
||||
You are a function calling AI model.
|
||||
Here are the available tools:
|
||||
<tools>
|
||||
{{range .Functions}}
|
||||
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
|
||||
{{end}}
|
||||
</tools>
|
||||
You should call the tools provided to you sequentially
|
||||
Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
|
||||
<scratchpad>
|
||||
{step-by-step reasoning and plan in bullet points}
|
||||
</scratchpad>
|
||||
For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
|
||||
<tool_call>
|
||||
{"arguments": <args-dict>, "name": <function-name>}
|
||||
</tool_call><|im_end|>
|
||||
{{.Input -}}
|
||||
<|im_start|>assistant
|
||||
function: |
|
||||
<|start_header_id|>system<|end_header_id|>
|
||||
You are an expert in composing functions. You are given a question and a set of possible functions.
|
||||
Based on the question, you will need to make one or more function/tool calls to achieve the purpose.
|
||||
If none of the functions can be used, point it out. If the given question lacks the parameters required by the function, also point it out. You should only return the function call in tools call sections.
|
||||
If you decide to invoke any of the function(s), you MUST put it in the format as follows:
|
||||
[func_name1(params_name1=params_value1,params_name2=params_value2,...),func_name2(params_name1=params_value1,params_name2=params_value2,...)]
|
||||
You SHOULD NOT include any other text in the response.
|
||||
Here is a list of functions in JSON format that you can invoke.
|
||||
{{toJson .Functions}}
|
||||
<|eot_id|><|start_header_id|>user<|end_header_id|>
|
||||
{{.Input}}
|
||||
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
|
||||
|
||||
download_files:
|
||||
- filename: Hermes-3-Llama-3.2-3B-Q4_K_M.gguf
|
||||
sha256: 2e220a14ba4328fee38cf36c2c068261560f999fadb5725ce5c6d977cb5126b5
|
||||
uri: huggingface://bartowski/Hermes-3-Llama-3.2-3B-GGUF/Hermes-3-Llama-3.2-3B-Q4_K_M.gguf
|
||||
8
aio/cpu/vad.yaml
Normal file
8
aio/cpu/vad.yaml
Normal file
@@ -0,0 +1,8 @@
|
||||
backend: silero-vad
|
||||
name: silero-vad
|
||||
parameters:
|
||||
model: silero-vad.onnx
|
||||
download_files:
|
||||
- filename: silero-vad.onnx
|
||||
uri: https://huggingface.co/onnx-community/silero-vad/resolve/main/onnx/model.onnx
|
||||
sha256: a4a068cd6cf1ea8355b84327595838ca748ec29a25bc91fc82e6c299ccdc5808
|
||||
@@ -1,31 +1,49 @@
|
||||
backend: llama-cpp
|
||||
context_size: 4096
|
||||
f16: true
|
||||
mmap: true
|
||||
mmproj: minicpm-v-2_6-mmproj-f16.gguf
|
||||
name: gpt-4o
|
||||
|
||||
roles:
|
||||
user: "USER:"
|
||||
assistant: "ASSISTANT:"
|
||||
system: "SYSTEM:"
|
||||
|
||||
mmproj: bakllava-mmproj.gguf
|
||||
parameters:
|
||||
model: bakllava.gguf
|
||||
|
||||
model: minicpm-v-2_6-Q4_K_M.gguf
|
||||
stopwords:
|
||||
- <|im_end|>
|
||||
- <dummy32000>
|
||||
- </s>
|
||||
- <|endoftext|>
|
||||
template:
|
||||
chat: |
|
||||
A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
|
||||
{{.Input -}}
|
||||
<|im_start|>assistant
|
||||
chat_message: |
|
||||
<|im_start|>{{ .RoleName }}
|
||||
{{ if .FunctionCall -}}
|
||||
Function call:
|
||||
{{ else if eq .RoleName "tool" -}}
|
||||
Function response:
|
||||
{{ end -}}
|
||||
{{ if .Content -}}
|
||||
{{.Content }}
|
||||
{{ end -}}
|
||||
{{ if .FunctionCall -}}
|
||||
{{toJson .FunctionCall}}
|
||||
{{ end -}}<|im_end|>
|
||||
completion: |
|
||||
{{.Input}}
|
||||
ASSISTANT:
|
||||
function: |
|
||||
<|im_start|>system
|
||||
You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
|
||||
{{range .Functions}}
|
||||
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
|
||||
{{end}}
|
||||
For each function call return a json object with function name and arguments
|
||||
<|im_end|>
|
||||
{{.Input -}}
|
||||
<|im_start|>assistant
|
||||
|
||||
download_files:
|
||||
- filename: bakllava.gguf
|
||||
uri: huggingface://mys/ggml_bakllava-1/ggml-model-q4_k.gguf
|
||||
- filename: bakllava-mmproj.gguf
|
||||
uri: huggingface://mys/ggml_bakllava-1/mmproj-model-f16.gguf
|
||||
|
||||
usage: |
|
||||
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
|
||||
"model": "gpt-4-vision-preview",
|
||||
"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'
|
||||
- filename: minicpm-v-2_6-Q4_K_M.gguf
|
||||
sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1
|
||||
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf
|
||||
- filename: minicpm-v-2_6-mmproj-f16.gguf
|
||||
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
|
||||
sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd
|
||||
@@ -129,7 +129,7 @@ detect_gpu
|
||||
detect_gpu_size
|
||||
|
||||
PROFILE="${PROFILE:-$GPU_SIZE}" # default to cpu
|
||||
export MODELS="${MODELS:-/aio/${PROFILE}/embeddings.yaml,/aio/${PROFILE}/rerank.yaml,/aio/${PROFILE}/text-to-speech.yaml,/aio/${PROFILE}/image-gen.yaml,/aio/${PROFILE}/text-to-text.yaml,/aio/${PROFILE}/speech-to-text.yaml,/aio/${PROFILE}/vision.yaml}"
|
||||
export MODELS="${MODELS:-/aio/${PROFILE}/embeddings.yaml,/aio/${PROFILE}/rerank.yaml,/aio/${PROFILE}/text-to-speech.yaml,/aio/${PROFILE}/image-gen.yaml,/aio/${PROFILE}/text-to-text.yaml,/aio/${PROFILE}/speech-to-text.yaml,/aio/${PROFILE}/vad.yaml,/aio/${PROFILE}/vision.yaml}"
|
||||
|
||||
check_vars
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
embeddings: true
|
||||
name: text-embedding-ada-002
|
||||
backend: sentencetransformers
|
||||
parameters:
|
||||
model: all-MiniLM-L6-v2
|
||||
model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf
|
||||
|
||||
usage: |
|
||||
You can test this model with curl like this:
|
||||
|
||||
@@ -1,101 +1,53 @@
|
||||
name: gpt-4
|
||||
mmap: true
|
||||
parameters:
|
||||
model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
|
||||
context_size: 8192
|
||||
|
||||
stopwords:
|
||||
- "<|im_end|>"
|
||||
- "<dummy32000>"
|
||||
- "</tool_call>"
|
||||
- "<|eot_id|>"
|
||||
- "<|end_of_text|>"
|
||||
|
||||
context_size: 4096
|
||||
f16: true
|
||||
function:
|
||||
# disable injecting the "answer" tool
|
||||
disable_no_action: true
|
||||
|
||||
capture_llm_results:
|
||||
- (?s)<Thought>(.*?)</Thought>
|
||||
grammar:
|
||||
# This allows the grammar to also return messages
|
||||
mixed_mode: true
|
||||
# Suffix to add to the grammar
|
||||
#prefix: '<tool_call>\n'
|
||||
# Force parallel calls in the grammar
|
||||
# parallel_calls: true
|
||||
|
||||
return_name_in_function_response: true
|
||||
# Without grammar uncomment the lines below
|
||||
# Warning: this is relying only on the capability of the
|
||||
# LLM model to generate the correct function call.
|
||||
json_regex_match:
|
||||
- "(?s)<tool_call>(.*?)</tool_call>"
|
||||
- "(?s)<tool_call>(.*?)"
|
||||
properties_order: name,arguments
|
||||
json_regex_match:
|
||||
- (?s)<Output>(.*?)</Output>
|
||||
replace_llm_results:
|
||||
# Drop the scratchpad content from responses
|
||||
- key: "(?s)<scratchpad>.*</scratchpad>"
|
||||
- key: (?s)<Thought>(.*?)</Thought>
|
||||
value: ""
|
||||
replace_function_results:
|
||||
# Replace everything that is not JSON array or object
|
||||
#
|
||||
- key: '(?s)^[^{\[]*'
|
||||
value: ""
|
||||
- key: '(?s)[^}\]]*$'
|
||||
value: ""
|
||||
- key: "'([^']*?)'"
|
||||
value: "_DQUOTE_${1}_DQUOTE_"
|
||||
- key: '\\"'
|
||||
value: "__TEMP_QUOTE__"
|
||||
- key: "\'"
|
||||
value: "'"
|
||||
- key: "_DQUOTE_"
|
||||
value: '"'
|
||||
- key: "__TEMP_QUOTE__"
|
||||
value: '"'
|
||||
# Drop the scratchpad content from responses
|
||||
- key: "(?s)<scratchpad>.*</scratchpad>"
|
||||
value: ""
|
||||
|
||||
mmap: true
|
||||
name: gpt-4
|
||||
parameters:
|
||||
model: localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf
|
||||
stopwords:
|
||||
- <|im_end|>
|
||||
- <dummy32000>
|
||||
- </s>
|
||||
template:
|
||||
chat: |
|
||||
{{.Input -}}
|
||||
<|im_start|>assistant
|
||||
chat_message: |
|
||||
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
|
||||
{{- if .FunctionCall }}
|
||||
<tool_call>
|
||||
{{- else if eq .RoleName "tool" }}
|
||||
<tool_response>
|
||||
{{- end }}
|
||||
{{- if .Content}}
|
||||
<|im_start|>{{ .RoleName }}
|
||||
{{ if .FunctionCall -}}
|
||||
Function call:
|
||||
{{ else if eq .RoleName "tool" -}}
|
||||
Function response:
|
||||
{{ end -}}
|
||||
{{ if .Content -}}
|
||||
{{.Content }}
|
||||
{{- end }}
|
||||
{{- if .FunctionCall}}
|
||||
{{ end -}}
|
||||
{{ if .FunctionCall -}}
|
||||
{{toJson .FunctionCall}}
|
||||
{{- end }}
|
||||
{{- if .FunctionCall }}
|
||||
</tool_call>
|
||||
{{- else if eq .RoleName "tool" }}
|
||||
</tool_response>
|
||||
{{- end }}<|im_end|>
|
||||
{{ end -}}<|im_end|>
|
||||
completion: |
|
||||
{{.Input}}
|
||||
function: |-
|
||||
function: |
|
||||
<|im_start|>system
|
||||
You are a function calling AI model.
|
||||
Here are the available tools:
|
||||
<tools>
|
||||
You are an AI assistant that executes function calls, and these are the tools at your disposal:
|
||||
{{range .Functions}}
|
||||
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
|
||||
{{end}}
|
||||
</tools>
|
||||
You should call the tools provided to you sequentially
|
||||
Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
|
||||
<scratchpad>
|
||||
{step-by-step reasoning and plan in bullet points}
|
||||
</scratchpad>
|
||||
For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
|
||||
<tool_call>
|
||||
{"arguments": <args-dict>, "name": <function-name>}
|
||||
</tool_call><|im_end|>
|
||||
<|im_end|>
|
||||
{{.Input -}}
|
||||
<|im_start|>assistant
|
||||
<|im_start|>assistant
|
||||
|
||||
download_files:
|
||||
- filename: localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf
|
||||
sha256: 4e7b7fe1d54b881f1ef90799219dc6cc285d29db24f559c8998d1addb35713d4
|
||||
uri: huggingface://mudler/LocalAI-functioncall-qwen2.5-7b-v0.5-Q4_K_M-GGUF/localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf
|
||||
|
||||
8
aio/gpu-8g/vad.yaml
Normal file
8
aio/gpu-8g/vad.yaml
Normal file
@@ -0,0 +1,8 @@
|
||||
backend: silero-vad
|
||||
name: silero-vad
|
||||
parameters:
|
||||
model: silero-vad.onnx
|
||||
download_files:
|
||||
- filename: silero-vad.onnx
|
||||
uri: https://huggingface.co/onnx-community/silero-vad/resolve/main/onnx/model.onnx
|
||||
sha256: a4a068cd6cf1ea8355b84327595838ca748ec29a25bc91fc82e6c299ccdc5808
|
||||
@@ -1,35 +1,49 @@
|
||||
backend: llama-cpp
|
||||
context_size: 4096
|
||||
f16: true
|
||||
mmap: true
|
||||
mmproj: minicpm-v-2_6-mmproj-f16.gguf
|
||||
name: gpt-4o
|
||||
|
||||
roles:
|
||||
user: "USER:"
|
||||
assistant: "ASSISTANT:"
|
||||
system: "SYSTEM:"
|
||||
|
||||
mmproj: llava-v1.6-7b-mmproj-f16.gguf
|
||||
parameters:
|
||||
model: llava-v1.6-mistral-7b.Q5_K_M.gguf
|
||||
temperature: 0.2
|
||||
top_k: 40
|
||||
top_p: 0.95
|
||||
seed: -1
|
||||
|
||||
model: minicpm-v-2_6-Q4_K_M.gguf
|
||||
stopwords:
|
||||
- <|im_end|>
|
||||
- <dummy32000>
|
||||
- </s>
|
||||
- <|endoftext|>
|
||||
template:
|
||||
chat: |
|
||||
A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
|
||||
{{.Input -}}
|
||||
<|im_start|>assistant
|
||||
chat_message: |
|
||||
<|im_start|>{{ .RoleName }}
|
||||
{{ if .FunctionCall -}}
|
||||
Function call:
|
||||
{{ else if eq .RoleName "tool" -}}
|
||||
Function response:
|
||||
{{ end -}}
|
||||
{{ if .Content -}}
|
||||
{{.Content }}
|
||||
{{ end -}}
|
||||
{{ if .FunctionCall -}}
|
||||
{{toJson .FunctionCall}}
|
||||
{{ end -}}<|im_end|>
|
||||
completion: |
|
||||
{{.Input}}
|
||||
ASSISTANT:
|
||||
function: |
|
||||
<|im_start|>system
|
||||
You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
|
||||
{{range .Functions}}
|
||||
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
|
||||
{{end}}
|
||||
For each function call return a json object with function name and arguments
|
||||
<|im_end|>
|
||||
{{.Input -}}
|
||||
<|im_start|>assistant
|
||||
|
||||
download_files:
|
||||
- filename: llava-v1.6-mistral-7b.Q5_K_M.gguf
|
||||
uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/llava-v1.6-mistral-7b.Q5_K_M.gguf
|
||||
- filename: llava-v1.6-7b-mmproj-f16.gguf
|
||||
uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/mmproj-model-f16.gguf
|
||||
|
||||
usage: |
|
||||
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
|
||||
"model": "gpt-4-vision-preview",
|
||||
"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'
|
||||
- filename: minicpm-v-2_6-Q4_K_M.gguf
|
||||
sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1
|
||||
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf
|
||||
- filename: minicpm-v-2_6-mmproj-f16.gguf
|
||||
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
|
||||
sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd
|
||||
@@ -1,7 +1,7 @@
|
||||
embeddings: true
|
||||
name: text-embedding-ada-002
|
||||
backend: sentencetransformers
|
||||
parameters:
|
||||
model: all-MiniLM-L6-v2
|
||||
model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf
|
||||
|
||||
usage: |
|
||||
You can test this model with curl like this:
|
||||
|
||||
@@ -1,103 +1,53 @@
|
||||
name: gpt-4
|
||||
mmap: false
|
||||
context_size: 8192
|
||||
|
||||
f16: false
|
||||
parameters:
|
||||
model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
|
||||
|
||||
stopwords:
|
||||
- "<|im_end|>"
|
||||
- "<dummy32000>"
|
||||
- "</tool_call>"
|
||||
- "<|eot_id|>"
|
||||
- "<|end_of_text|>"
|
||||
|
||||
context_size: 4096
|
||||
f16: true
|
||||
function:
|
||||
# disable injecting the "answer" tool
|
||||
disable_no_action: true
|
||||
|
||||
capture_llm_results:
|
||||
- (?s)<Thought>(.*?)</Thought>
|
||||
grammar:
|
||||
# This allows the grammar to also return messages
|
||||
mixed_mode: true
|
||||
# Suffix to add to the grammar
|
||||
#prefix: '<tool_call>\n'
|
||||
# Force parallel calls in the grammar
|
||||
# parallel_calls: true
|
||||
|
||||
return_name_in_function_response: true
|
||||
# Without grammar uncomment the lines below
|
||||
# Warning: this is relying only on the capability of the
|
||||
# LLM model to generate the correct function call.
|
||||
json_regex_match:
|
||||
- "(?s)<tool_call>(.*?)</tool_call>"
|
||||
- "(?s)<tool_call>(.*?)"
|
||||
properties_order: name,arguments
|
||||
json_regex_match:
|
||||
- (?s)<Output>(.*?)</Output>
|
||||
replace_llm_results:
|
||||
# Drop the scratchpad content from responses
|
||||
- key: "(?s)<scratchpad>.*</scratchpad>"
|
||||
- key: (?s)<Thought>(.*?)</Thought>
|
||||
value: ""
|
||||
replace_function_results:
|
||||
# Replace everything that is not JSON array or object
|
||||
#
|
||||
- key: '(?s)^[^{\[]*'
|
||||
value: ""
|
||||
- key: '(?s)[^}\]]*$'
|
||||
value: ""
|
||||
- key: "'([^']*?)'"
|
||||
value: "_DQUOTE_${1}_DQUOTE_"
|
||||
- key: '\\"'
|
||||
value: "__TEMP_QUOTE__"
|
||||
- key: "\'"
|
||||
value: "'"
|
||||
- key: "_DQUOTE_"
|
||||
value: '"'
|
||||
- key: "__TEMP_QUOTE__"
|
||||
value: '"'
|
||||
# Drop the scratchpad content from responses
|
||||
- key: "(?s)<scratchpad>.*</scratchpad>"
|
||||
value: ""
|
||||
|
||||
mmap: true
|
||||
name: gpt-4
|
||||
parameters:
|
||||
model: localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf
|
||||
stopwords:
|
||||
- <|im_end|>
|
||||
- <dummy32000>
|
||||
- </s>
|
||||
template:
|
||||
chat: |
|
||||
{{.Input -}}
|
||||
<|im_start|>assistant
|
||||
chat_message: |
|
||||
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
|
||||
{{- if .FunctionCall }}
|
||||
<tool_call>
|
||||
{{- else if eq .RoleName "tool" }}
|
||||
<tool_response>
|
||||
{{- end }}
|
||||
{{- if .Content}}
|
||||
<|im_start|>{{ .RoleName }}
|
||||
{{ if .FunctionCall -}}
|
||||
Function call:
|
||||
{{ else if eq .RoleName "tool" -}}
|
||||
Function response:
|
||||
{{ end -}}
|
||||
{{ if .Content -}}
|
||||
{{.Content }}
|
||||
{{- end }}
|
||||
{{- if .FunctionCall}}
|
||||
{{ end -}}
|
||||
{{ if .FunctionCall -}}
|
||||
{{toJson .FunctionCall}}
|
||||
{{- end }}
|
||||
{{- if .FunctionCall }}
|
||||
</tool_call>
|
||||
{{- else if eq .RoleName "tool" }}
|
||||
</tool_response>
|
||||
{{- end }}<|im_end|>
|
||||
{{ end -}}<|im_end|>
|
||||
completion: |
|
||||
{{.Input}}
|
||||
function: |-
|
||||
function: |
|
||||
<|im_start|>system
|
||||
You are a function calling AI model.
|
||||
Here are the available tools:
|
||||
<tools>
|
||||
You are an AI assistant that executes function calls, and these are the tools at your disposal:
|
||||
{{range .Functions}}
|
||||
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
|
||||
{{end}}
|
||||
</tools>
|
||||
You should call the tools provided to you sequentially
|
||||
Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
|
||||
<scratchpad>
|
||||
{step-by-step reasoning and plan in bullet points}
|
||||
</scratchpad>
|
||||
For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
|
||||
<tool_call>
|
||||
{"arguments": <args-dict>, "name": <function-name>}
|
||||
</tool_call><|im_end|>
|
||||
<|im_end|>
|
||||
{{.Input -}}
|
||||
<|im_start|>assistant
|
||||
|
||||
download_files:
|
||||
- filename: localai-functioncall-phi-4-v0.3-q4_k_m.gguf
|
||||
sha256: 23fee048ded2a6e2e1a7b6bbefa6cbf83068f194caa9552aecbaa00fec8a16d5
|
||||
uri: huggingface://mudler/LocalAI-functioncall-phi-4-v0.3-Q4_K_M-GGUF/localai-functioncall-phi-4-v0.3-q4_k_m.gguf
|
||||
8
aio/intel/vad.yaml
Normal file
8
aio/intel/vad.yaml
Normal file
@@ -0,0 +1,8 @@
|
||||
backend: silero-vad
|
||||
name: silero-vad
|
||||
parameters:
|
||||
model: silero-vad.onnx
|
||||
download_files:
|
||||
- filename: silero-vad.onnx
|
||||
uri: https://huggingface.co/onnx-community/silero-vad/resolve/main/onnx/model.onnx
|
||||
sha256: a4a068cd6cf1ea8355b84327595838ca748ec29a25bc91fc82e6c299ccdc5808
|
||||
@@ -1,35 +1,50 @@
|
||||
backend: llama-cpp
|
||||
context_size: 4096
|
||||
mmap: false
|
||||
f16: false
|
||||
f16: true
|
||||
mmap: true
|
||||
mmproj: minicpm-v-2_6-mmproj-f16.gguf
|
||||
name: gpt-4o
|
||||
|
||||
roles:
|
||||
user: "USER:"
|
||||
assistant: "ASSISTANT:"
|
||||
system: "SYSTEM:"
|
||||
|
||||
mmproj: llava-v1.6-7b-mmproj-f16.gguf
|
||||
parameters:
|
||||
model: llava-v1.6-mistral-7b.Q5_K_M.gguf
|
||||
temperature: 0.2
|
||||
top_k: 40
|
||||
top_p: 0.95
|
||||
seed: -1
|
||||
|
||||
model: minicpm-v-2_6-Q4_K_M.gguf
|
||||
stopwords:
|
||||
- <|im_end|>
|
||||
- <dummy32000>
|
||||
- </s>
|
||||
- <|endoftext|>
|
||||
template:
|
||||
chat: |
|
||||
A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
|
||||
{{.Input -}}
|
||||
<|im_start|>assistant
|
||||
chat_message: |
|
||||
<|im_start|>{{ .RoleName }}
|
||||
{{ if .FunctionCall -}}
|
||||
Function call:
|
||||
{{ else if eq .RoleName "tool" -}}
|
||||
Function response:
|
||||
{{ end -}}
|
||||
{{ if .Content -}}
|
||||
{{.Content }}
|
||||
{{ end -}}
|
||||
{{ if .FunctionCall -}}
|
||||
{{toJson .FunctionCall}}
|
||||
{{ end -}}<|im_end|>
|
||||
completion: |
|
||||
{{.Input}}
|
||||
ASSISTANT:
|
||||
function: |
|
||||
<|im_start|>system
|
||||
You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
|
||||
{{range .Functions}}
|
||||
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
|
||||
{{end}}
|
||||
For each function call return a json object with function name and arguments
|
||||
<|im_end|>
|
||||
{{.Input -}}
|
||||
<|im_start|>assistant
|
||||
|
||||
|
||||
download_files:
|
||||
- filename: llava-v1.6-mistral-7b.Q5_K_M.gguf
|
||||
uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/llava-v1.6-mistral-7b.Q5_K_M.gguf
|
||||
- filename: llava-v1.6-7b-mmproj-f16.gguf
|
||||
uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/mmproj-model-f16.gguf
|
||||
|
||||
usage: |
|
||||
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
|
||||
"model": "gpt-4-vision-preview",
|
||||
"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'
|
||||
- filename: minicpm-v-2_6-Q4_K_M.gguf
|
||||
sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1
|
||||
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf
|
||||
- filename: minicpm-v-2_6-mmproj-f16.gguf
|
||||
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
|
||||
sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd
|
||||
15
assets.go
15
assets.go
@@ -1,6 +1,15 @@
|
||||
package main
|
||||
|
||||
import "embed"
|
||||
import (
|
||||
rice "github.com/GeertJohan/go.rice"
|
||||
)
|
||||
|
||||
//go:embed backend-assets/*
|
||||
var backendAssets embed.FS
|
||||
var backendAssets *rice.Box
|
||||
|
||||
func init() {
|
||||
var err error
|
||||
backendAssets, err = rice.FindBox("backend-assets")
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -14,6 +14,7 @@ service Backend {
|
||||
rpc PredictStream(PredictOptions) returns (stream Reply) {}
|
||||
rpc Embedding(PredictOptions) returns (EmbeddingResult) {}
|
||||
rpc GenerateImage(GenerateImageRequest) returns (Result) {}
|
||||
rpc GenerateVideo(GenerateVideoRequest) returns (Result) {}
|
||||
rpc AudioTranscription(TranscriptRequest) returns (TranscriptResult) {}
|
||||
rpc TTS(TTSRequest) returns (Result) {}
|
||||
rpc SoundGeneration(SoundGenerationRequest) returns (Result) {}
|
||||
@@ -165,7 +166,6 @@ message Reply {
|
||||
|
||||
message GrammarTrigger {
|
||||
string word = 1;
|
||||
bool at_start = 2;
|
||||
}
|
||||
|
||||
message ModelOptions {
|
||||
@@ -191,11 +191,7 @@ message ModelOptions {
|
||||
int32 NGQA = 20;
|
||||
string ModelFile = 21;
|
||||
|
||||
// AutoGPTQ
|
||||
string Device = 22;
|
||||
bool UseTriton = 23;
|
||||
string ModelBaseName = 24;
|
||||
bool UseFastTokenizer = 25;
|
||||
|
||||
|
||||
// Diffusers
|
||||
string PipelineType = 26;
|
||||
@@ -229,6 +225,11 @@ message ModelOptions {
|
||||
int32 MaxModelLen = 54;
|
||||
int32 TensorParallelSize = 55;
|
||||
string LoadFormat = 58;
|
||||
bool DisableLogStatus = 66;
|
||||
string DType = 67;
|
||||
int32 LimitImagePerPrompt = 68;
|
||||
int32 LimitVideoPerPrompt = 69;
|
||||
int32 LimitAudioPerPrompt = 70;
|
||||
|
||||
string MMProj = 41;
|
||||
|
||||
@@ -301,6 +302,19 @@ message GenerateImageRequest {
|
||||
int32 CLIPSkip = 11;
|
||||
}
|
||||
|
||||
message GenerateVideoRequest {
|
||||
string prompt = 1;
|
||||
string start_image = 2; // Path or base64 encoded image for the start frame
|
||||
string end_image = 3; // Path or base64 encoded image for the end frame
|
||||
int32 width = 4;
|
||||
int32 height = 5;
|
||||
int32 num_frames = 6; // Number of frames to generate
|
||||
int32 fps = 7; // Frames per second
|
||||
int32 seed = 8;
|
||||
float cfg_scale = 9; // Classifier-free guidance scale
|
||||
string dst = 10; // Output path for the generated video
|
||||
}
|
||||
|
||||
message TTSRequest {
|
||||
string text = 1;
|
||||
string model = 2;
|
||||
|
||||
@@ -1,17 +1,17 @@
|
||||
|
||||
## XXX: In some versions of CMake clip wasn't being built before llama.
|
||||
## This is an hack for now, but it should be fixed in the future.
|
||||
set(TARGET myclip)
|
||||
add_library(${TARGET} clip.cpp clip.h llava.cpp llava.h)
|
||||
install(TARGETS ${TARGET} LIBRARY)
|
||||
target_include_directories(myclip PUBLIC .)
|
||||
target_include_directories(myclip PUBLIC ../..)
|
||||
target_include_directories(myclip PUBLIC ../../common)
|
||||
target_link_libraries(${TARGET} PRIVATE common ggml llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if (NOT MSVC)
|
||||
target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
|
||||
endif()
|
||||
# set(TARGET myclip)
|
||||
# add_library(${TARGET} clip.cpp clip.h clip-impl.h llava.cpp llava.h)
|
||||
# install(TARGETS ${TARGET} LIBRARY)
|
||||
# target_include_directories(myclip PUBLIC .)
|
||||
# target_include_directories(myclip PUBLIC ../..)
|
||||
# target_include_directories(myclip PUBLIC ../../common)
|
||||
# target_link_libraries(${TARGET} PRIVATE common ggml llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
# target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
# if (NOT MSVC)
|
||||
# target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
|
||||
# endif()
|
||||
# END CLIP hack
|
||||
|
||||
|
||||
@@ -75,7 +75,11 @@ add_library(hw_grpc_proto
|
||||
${hw_proto_hdrs} )
|
||||
|
||||
add_executable(${TARGET} grpc-server.cpp utils.hpp json.hpp)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama myclip ${CMAKE_THREAD_LIBS_INIT} absl::flags hw_grpc_proto
|
||||
|
||||
target_include_directories(${TARGET} PRIVATE ../llava)
|
||||
target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
|
||||
|
||||
target_link_libraries(${TARGET} PRIVATE common llama mtmd ${CMAKE_THREAD_LIBS_INIT} absl::flags hw_grpc_proto
|
||||
absl::flags_parse
|
||||
gRPC::${_REFLECTION}
|
||||
gRPC::${_GRPC_GRPCPP}
|
||||
|
||||
@@ -8,7 +8,7 @@ ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
|
||||
TARGET?=--target grpc-server
|
||||
|
||||
# Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
|
||||
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
|
||||
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF -DLLAMA_CURL=OFF
|
||||
|
||||
# If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
|
||||
ifeq ($(BUILD_TYPE),cublas)
|
||||
@@ -36,11 +36,18 @@ else ifeq ($(OS),Darwin)
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_TYPE),sycl_f16)
|
||||
CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
|
||||
CMAKE_ARGS+=-DGGML_SYCL=ON \
|
||||
-DCMAKE_C_COMPILER=icx \
|
||||
-DCMAKE_CXX_COMPILER=icpx \
|
||||
-DCMAKE_CXX_FLAGS="-fsycl" \
|
||||
-DGGML_SYCL_F16=ON
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_TYPE),sycl_f32)
|
||||
CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||
CMAKE_ARGS+=-DGGML_SYCL=ON \
|
||||
-DCMAKE_C_COMPILER=icx \
|
||||
-DCMAKE_CXX_COMPILER=icpx \
|
||||
-DCMAKE_CXX_FLAGS="-fsycl"
|
||||
endif
|
||||
|
||||
llama.cpp:
|
||||
@@ -52,8 +59,8 @@ llama.cpp:
|
||||
git checkout -b build $(LLAMA_VERSION) && \
|
||||
git submodule update --init --recursive --depth 1 --single-branch
|
||||
|
||||
llama.cpp/examples/grpc-server: llama.cpp
|
||||
mkdir -p llama.cpp/examples/grpc-server
|
||||
llama.cpp/tools/grpc-server: llama.cpp
|
||||
mkdir -p llama.cpp/tools/grpc-server
|
||||
bash prepare.sh
|
||||
|
||||
rebuild:
|
||||
@@ -63,13 +70,13 @@ rebuild:
|
||||
|
||||
purge:
|
||||
rm -rf llama.cpp/build
|
||||
rm -rf llama.cpp/examples/grpc-server
|
||||
rm -rf llama.cpp/tools/grpc-server
|
||||
rm -rf grpc-server
|
||||
|
||||
clean: purge
|
||||
rm -rf llama.cpp
|
||||
|
||||
grpc-server: llama.cpp llama.cpp/examples/grpc-server
|
||||
grpc-server: llama.cpp llama.cpp/tools/grpc-server
|
||||
@echo "Building grpc-server with $(BUILD_TYPE) build type and $(CMAKE_ARGS)"
|
||||
ifneq (,$(findstring sycl,$(BUILD_TYPE)))
|
||||
+bash -c "source $(ONEAPI_VARS); \
|
||||
@@ -77,4 +84,4 @@ ifneq (,$(findstring sycl,$(BUILD_TYPE)))
|
||||
else
|
||||
+cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release $(TARGET)
|
||||
endif
|
||||
cp llama.cpp/build/bin/grpc-server .
|
||||
cp llama.cpp/build/bin/grpc-server .
|
||||
|
||||
@@ -11,8 +11,7 @@
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <getopt.h>
|
||||
#include "clip.h"
|
||||
#include "llava.h"
|
||||
#include "mtmd.h"
|
||||
#include "log.h"
|
||||
#include "stb_image.h"
|
||||
#include "common.h"
|
||||
@@ -52,7 +51,7 @@ struct server_params
|
||||
{
|
||||
std::string hostname = "127.0.0.1";
|
||||
std::vector<std::string> api_keys;
|
||||
std::string public_path = "examples/server/public";
|
||||
std::string public_path = "tools/server/public";
|
||||
std::string chat_template = "";
|
||||
int32_t port = 8080;
|
||||
int32_t read_timeout = 600;
|
||||
@@ -210,6 +209,8 @@ struct llama_client_slot
|
||||
int32_t num_prompt_tokens_processed = 0;
|
||||
|
||||
json prompt;
|
||||
json data;
|
||||
|
||||
std::string generated_text;
|
||||
llama_token sampled;
|
||||
std::vector<llama_token> cache_tokens;
|
||||
@@ -239,7 +240,7 @@ struct llama_client_slot
|
||||
int32_t n_past_se = 0; // self-extend
|
||||
|
||||
// multimodal
|
||||
std::vector<slot_image> images;
|
||||
mtmd_context * mctx = nullptr;
|
||||
|
||||
// stats
|
||||
size_t sent_count = 0;
|
||||
@@ -270,17 +271,6 @@ struct llama_client_slot
|
||||
n_past_se = 0;
|
||||
|
||||
generated_token_probs.clear();
|
||||
|
||||
for (slot_image & img : images)
|
||||
{
|
||||
free(img.image_embedding);
|
||||
if (img.img_data) {
|
||||
clip_image_u8_free(img.img_data);
|
||||
}
|
||||
img.prefix_prompt = "";
|
||||
}
|
||||
|
||||
images.clear();
|
||||
}
|
||||
|
||||
bool has_budget(common_params &global_params) {
|
||||
@@ -456,6 +446,9 @@ struct llama_server_context
|
||||
llama_context *ctx = nullptr;
|
||||
const llama_vocab * vocab = nullptr;
|
||||
|
||||
// multimodal
|
||||
mtmd_context * mctx = nullptr;
|
||||
|
||||
clip_ctx *clp_ctx = nullptr;
|
||||
|
||||
common_params params;
|
||||
@@ -467,9 +460,10 @@ struct llama_server_context
|
||||
bool all_slots_are_idle = false;
|
||||
bool add_bos_token = true;
|
||||
bool has_eos_token = true;
|
||||
bool has_gpu = false;
|
||||
|
||||
bool grammar_lazy = false;
|
||||
std::vector<common_grammar_trigger> grammar_trigger_words;
|
||||
std::vector<common_grammar_trigger> grammar_triggers;
|
||||
|
||||
int32_t n_ctx; // total context for all clients / slots
|
||||
|
||||
@@ -493,6 +487,10 @@ struct llama_server_context
|
||||
|
||||
~llama_server_context()
|
||||
{
|
||||
if (mctx) {
|
||||
mtmd_free(mctx);
|
||||
mctx = nullptr;
|
||||
}
|
||||
if (ctx)
|
||||
{
|
||||
llama_free(ctx);
|
||||
@@ -508,12 +506,17 @@ struct llama_server_context
|
||||
bool load_model(const common_params ¶ms_)
|
||||
{
|
||||
params = params_;
|
||||
if (!params.mmproj.empty()) {
|
||||
if (!params.mmproj.path.empty()) {
|
||||
multimodal = true;
|
||||
LOG_INFO("Multi Modal Mode Enabled", {});
|
||||
clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
|
||||
if(clp_ctx == nullptr) {
|
||||
LOG_ERR("unable to load clip model: %s", params.mmproj.c_str());
|
||||
mtmd_context_params mparams = mtmd_context_params_default();
|
||||
mparams.use_gpu = has_gpu;
|
||||
mparams.print_timings = false;
|
||||
mparams.n_threads = params.cpuparams.n_threads;
|
||||
mparams.verbosity = GGML_LOG_LEVEL_INFO;
|
||||
mctx = mtmd_init_from_file(params.mmproj.path.c_str(), model, mparams);
|
||||
if (mctx == nullptr) {
|
||||
LOG_ERR("failed to load multimodal model, '%s'\n", params.mmproj.path.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -527,7 +530,7 @@ struct llama_server_context
|
||||
ctx = common_init.context.release();
|
||||
if (model == nullptr)
|
||||
{
|
||||
LOG_ERR("unable to load model: %s", params.model.c_str());
|
||||
LOG_ERR("unable to load model: %s", params.model.path.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -575,6 +578,8 @@ struct llama_server_context
|
||||
slot.id = i;
|
||||
slot.n_ctx = n_ctx_slot;
|
||||
slot.n_predict = params.n_predict;
|
||||
slot.mctx = mctx;
|
||||
//slot.cache_tokens.has_mtmd = mctx != nullptr;
|
||||
|
||||
LOG_INFO("new slot", {
|
||||
{"slot_id", slot.id},
|
||||
@@ -612,54 +617,61 @@ struct llama_server_context
|
||||
batch = llama_batch_init(n_ctx, 0, params.n_parallel);
|
||||
}
|
||||
|
||||
std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const
|
||||
std::vector<server_tokens> tokenize(json &data, const json & json_prompt, bool add_bos) const
|
||||
{
|
||||
// TODO: currently, we tokenize using special tokens by default
|
||||
// this is not always correct (see https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216)
|
||||
// but it's better compared to completely ignoring ChatML and other chat templates
|
||||
const bool TMP_FORCE_SPECIAL = true;
|
||||
mtmd::bitmaps bitmaps;
|
||||
std::vector<server_tokens> inputs;
|
||||
|
||||
// If `add_bos` is true, we only add BOS, when json_prompt is a string,
|
||||
// or the first element of the json_prompt array is a string.
|
||||
std::vector<llama_token> prompt_tokens;
|
||||
|
||||
if (json_prompt.is_array())
|
||||
if (mctx != nullptr)
|
||||
{
|
||||
bool first = true;
|
||||
for (const auto& p : json_prompt)
|
||||
const auto &images_data = data.find("image_data");
|
||||
if (images_data != data.end() && images_data->is_array())
|
||||
{
|
||||
if (p.is_string())
|
||||
for (const auto &img : *images_data)
|
||||
{
|
||||
auto s = p.template get<std::string>();
|
||||
std::vector<llama_token> p;
|
||||
if (first)
|
||||
{
|
||||
p = common_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
|
||||
first = false;
|
||||
const std::vector<uint8_t> image_buffer = base64_decode(img["data"].get<std::string>());
|
||||
|
||||
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(image_buffer.data(), image_buffer.size()));
|
||||
if (!bmp.ptr) {
|
||||
throw std::runtime_error("Failed to load image");
|
||||
}
|
||||
else
|
||||
{
|
||||
p = common_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
|
||||
}
|
||||
prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
|
||||
}
|
||||
else
|
||||
{
|
||||
if (first)
|
||||
{
|
||||
first = false;
|
||||
}
|
||||
prompt_tokens.push_back(p.template get<llama_token>());
|
||||
// calculate bitmap hash (for KV caching)
|
||||
std::string hash = fnv_hash(bmp.data(), bmp.nx()*bmp.ny()*3);
|
||||
bmp.set_id(hash.c_str());
|
||||
bitmaps.entries.push_back(std::move(bmp));
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
auto s = json_prompt.template get<std::string>();
|
||||
prompt_tokens = common_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
|
||||
|
||||
// multimodal
|
||||
std::string prompt_str = json_prompt.template get<std::string>();
|
||||
mtmd_input_text inp_txt = {
|
||||
prompt_str.c_str(),
|
||||
/* add_special */ true,
|
||||
/* parse_special */ true,
|
||||
};
|
||||
mtmd::input_chunks chunks(mtmd_input_chunks_init());
|
||||
auto bitmaps_c_ptr = bitmaps.c_ptr();
|
||||
int32_t tokenized = mtmd_tokenize(mctx,
|
||||
chunks.ptr.get(),
|
||||
&inp_txt,
|
||||
bitmaps_c_ptr.data(),
|
||||
bitmaps_c_ptr.size());
|
||||
if (tokenized != 0) {
|
||||
throw std::runtime_error("Failed to tokenize prompt");
|
||||
}
|
||||
|
||||
server_tokens tmp(chunks, true);
|
||||
inputs.push_back(std::move(tmp));
|
||||
} else {
|
||||
// non-multimodal version
|
||||
auto tokenized_prompts = tokenize_input_prompts(vocab, json_prompt, true, true);
|
||||
for (auto & p : tokenized_prompts) {
|
||||
auto tmp = server_tokens(p, mctx != nullptr);
|
||||
inputs.push_back(std::move(tmp));
|
||||
}
|
||||
}
|
||||
|
||||
return prompt_tokens;
|
||||
return inputs;
|
||||
}
|
||||
|
||||
llama_client_slot* get_slot(int id) {
|
||||
@@ -709,9 +721,11 @@ struct llama_server_context
|
||||
slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
|
||||
slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
|
||||
slot->sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
|
||||
slot->sparams.grammar_trigger_words = grammar_trigger_words;
|
||||
slot->sparams.grammar_triggers = grammar_triggers;
|
||||
slot->sparams.grammar_lazy = grammar_lazy;
|
||||
|
||||
slot->data = data;
|
||||
|
||||
if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) {
|
||||
// Might be better to reject the request with a 400 ?
|
||||
LOG_WARNING("Max tokens to predict exceeds server configuration", {
|
||||
@@ -753,43 +767,7 @@ struct llama_server_context
|
||||
if (json_value(data, "ignore_eos", false) && has_eos_token) {
|
||||
slot->sparams.logit_bias.push_back({llama_vocab_eos(vocab), -INFINITY});
|
||||
}
|
||||
/*
|
||||
slot->sparams.penalty_prompt_tokens.clear();
|
||||
slot->sparams.use_penalty_prompt_tokens = false;
|
||||
const auto &penalty_prompt = data.find("penalty_prompt");
|
||||
if (penalty_prompt != data.end())
|
||||
{
|
||||
if (penalty_prompt->is_string())
|
||||
{
|
||||
const auto penalty_prompt_string = penalty_prompt->get<std::string>();
|
||||
auto penalty_tokens = llama_tokenize(model, penalty_prompt_string, false);
|
||||
slot->sparams.penalty_prompt_tokens.swap(penalty_tokens);
|
||||
if (slot->params.n_predict > 0)
|
||||
{
|
||||
slot->sparams.penalty_prompt_tokens.reserve(slot->sparams.penalty_prompt_tokens.size() + slot->params.n_predict);
|
||||
}
|
||||
slot->sparams.use_penalty_prompt_tokens = true;
|
||||
}
|
||||
else if (penalty_prompt->is_array())
|
||||
{
|
||||
const auto n_tokens = penalty_prompt->size();
|
||||
slot->sparams.penalty_prompt_tokens.reserve(n_tokens + std::max(0, slot->params.n_predict));
|
||||
const int n_vocab = llama_n_vocab(model);
|
||||
for (const auto &penalty_token : *penalty_prompt)
|
||||
{
|
||||
if (penalty_token.is_number_integer())
|
||||
{
|
||||
const auto tok = penalty_token.get<llama_token>();
|
||||
if (tok >= 0 && tok < n_vocab)
|
||||
{
|
||||
slot->sparams.penalty_prompt_tokens.push_back(tok);
|
||||
}
|
||||
}
|
||||
}
|
||||
slot->sparams.use_penalty_prompt_tokens = true;
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
slot->sparams.logit_bias.clear();
|
||||
|
||||
const auto &logit_bias = data.find("logit_bias");
|
||||
@@ -865,79 +843,6 @@ struct llama_server_context
|
||||
}
|
||||
|
||||
|
||||
if (multimodal)
|
||||
{
|
||||
const auto &images_data = data.find("image_data");
|
||||
if (images_data != data.end() && images_data->is_array())
|
||||
{
|
||||
for (const auto &img : *images_data)
|
||||
{
|
||||
const std::vector<uint8_t> image_buffer = base64_decode(img["data"].get<std::string>());
|
||||
|
||||
slot_image img_sl;
|
||||
img_sl.id = img.count("id") != 0 ? img["id"].get<int>() : slot->images.size();
|
||||
img_sl.img_data = clip_image_u8_init();
|
||||
if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
|
||||
{
|
||||
LOG_ERR("%s: failed to load image, slot_id: %d, img_sl_id: %d",
|
||||
__func__,
|
||||
slot->id,
|
||||
img_sl.id
|
||||
);
|
||||
return false;
|
||||
}
|
||||
LOG_VERBOSE("image loaded", {
|
||||
{"slot_id", slot->id},
|
||||
{"img_sl_id", img_sl.id}
|
||||
});
|
||||
img_sl.request_encode_image = true;
|
||||
slot->images.push_back(img_sl);
|
||||
}
|
||||
// process prompt
|
||||
// example: system prompt [img-102] user [img-103] describe [img-134] -> [{id: 102, prefix: 'system prompt '}, {id: 103, prefix: ' user '}, {id: 134, prefix: ' describe '}]}
|
||||
if (slot->images.size() > 0 && !slot->prompt.is_array())
|
||||
{
|
||||
std::string prompt = slot->prompt.get<std::string>();
|
||||
size_t pos = 0, begin_prefix = 0;
|
||||
std::string pattern = "[img-";
|
||||
while ((pos = prompt.find(pattern, pos)) != std::string::npos) {
|
||||
size_t end_prefix = pos;
|
||||
pos += pattern.length();
|
||||
size_t end_pos = prompt.find(']', pos);
|
||||
if (end_pos != std::string::npos)
|
||||
{
|
||||
std::string image_id = prompt.substr(pos, end_pos - pos);
|
||||
try
|
||||
{
|
||||
int img_id = std::stoi(image_id);
|
||||
bool found = false;
|
||||
for (slot_image &img : slot->images)
|
||||
{
|
||||
if (img.id == img_id) {
|
||||
found = true;
|
||||
img.prefix_prompt = prompt.substr(begin_prefix, end_prefix - begin_prefix);
|
||||
begin_prefix = end_pos + 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!found) {
|
||||
LOG("ERROR: Image with id: %i, not found.\n", img_id);
|
||||
slot->images.clear();
|
||||
return false;
|
||||
}
|
||||
} catch (const std::invalid_argument& e) {
|
||||
LOG("Invalid image number id in prompt\n");
|
||||
slot->images.clear();
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
slot->prompt = "";
|
||||
slot->params.input_suffix = prompt.substr(begin_prefix);
|
||||
slot->params.cache_prompt = false; // multimodal doesn't support cache prompt
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (slot->ctx_sampling != nullptr)
|
||||
{
|
||||
@@ -1155,6 +1060,14 @@ struct llama_server_context
|
||||
slot.has_next_token = false;
|
||||
}
|
||||
|
||||
if (slot.n_past >= slot.n_ctx) {
|
||||
slot.truncated = true;
|
||||
slot.stopped_limit = true;
|
||||
slot.has_next_token = false;
|
||||
|
||||
LOG_VERBOSE("stopped due to running out of context capacity", {});
|
||||
}
|
||||
|
||||
if (result.tok == llama_vocab_eos(vocab) || llama_vocab_is_eog(vocab, result.tok))
|
||||
{
|
||||
slot.stopped_eos = true;
|
||||
@@ -1177,26 +1090,6 @@ struct llama_server_context
|
||||
return slot.has_next_token; // continue
|
||||
}
|
||||
|
||||
bool process_images(llama_client_slot &slot) const
|
||||
{
|
||||
for (slot_image &img : slot.images)
|
||||
{
|
||||
if (!img.request_encode_image)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!llava_image_embed_make_with_clip_img(clp_ctx, params.cpuparams.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
|
||||
LOG("Error processing the given image");
|
||||
return false;
|
||||
}
|
||||
|
||||
img.request_encode_image = false;
|
||||
}
|
||||
|
||||
return slot.images.size() > 0;
|
||||
}
|
||||
|
||||
void send_error(task_server& task, const std::string &error)
|
||||
{
|
||||
LOG("task %i - error: %s\n", task.id, error.c_str());
|
||||
@@ -1342,7 +1235,7 @@ struct llama_server_context
|
||||
queue_results.send(res);
|
||||
}
|
||||
|
||||
void send_embedding(llama_client_slot &slot)
|
||||
void send_embedding(llama_client_slot &slot, const llama_batch & batch)
|
||||
{
|
||||
task_result res;
|
||||
res.id = slot.task_id;
|
||||
@@ -1364,10 +1257,38 @@ struct llama_server_context
|
||||
else
|
||||
{
|
||||
const float *data = llama_get_embeddings(ctx);
|
||||
std::vector<float> embedding(data, data + n_embd);
|
||||
std::vector<float> embd_res(n_embd, 0.0f);
|
||||
std::vector<std::vector<float>> embedding;
|
||||
for (int i = 0; i < batch.n_tokens; ++i) {
|
||||
if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
|
||||
if (embd == NULL) {
|
||||
embd = llama_get_embeddings_ith(ctx, i);
|
||||
}
|
||||
|
||||
if (embd == NULL) {
|
||||
LOG("failed to get embeddings");
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// normalize only when there is pooling
|
||||
// TODO: configurable
|
||||
if (llama_pooling_type(ctx) != LLAMA_POOLING_TYPE_NONE) {
|
||||
common_embd_normalize(embd, embd_res.data(), n_embd, 2);
|
||||
embedding.push_back(embd_res);
|
||||
} else {
|
||||
embedding.push_back({ embd, embd + n_embd });
|
||||
}
|
||||
}
|
||||
|
||||
// OAI compat
|
||||
res.result_json = json
|
||||
{
|
||||
{"embedding", embedding },
|
||||
{"embedding", embedding[0] },
|
||||
};
|
||||
}
|
||||
queue_results.send(res);
|
||||
@@ -1411,74 +1332,6 @@ struct llama_server_context
|
||||
}
|
||||
}
|
||||
|
||||
// for multiple images processing
|
||||
bool ingest_images(llama_client_slot &slot, int n_batch)
|
||||
{
|
||||
int image_idx = 0;
|
||||
|
||||
while (image_idx < (int) slot.images.size())
|
||||
{
|
||||
slot_image &img = slot.images[image_idx];
|
||||
|
||||
// process prefix prompt
|
||||
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch)
|
||||
{
|
||||
const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
|
||||
llama_batch batch_view = {
|
||||
n_tokens,
|
||||
batch.token + i,
|
||||
nullptr,
|
||||
batch.pos + i,
|
||||
batch.n_seq_id + i,
|
||||
batch.seq_id + i,
|
||||
batch.logits + i,
|
||||
};
|
||||
if (llama_decode(ctx, batch_view))
|
||||
{
|
||||
LOG("%s : failed to eval\n", __func__);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// process image with llm
|
||||
for (int i = 0; i < img.image_tokens; i += n_batch)
|
||||
{
|
||||
int n_eval = img.image_tokens - i;
|
||||
if (n_eval > n_batch)
|
||||
{
|
||||
n_eval = n_batch;
|
||||
}
|
||||
|
||||
const int n_embd = llama_model_n_embd(model);
|
||||
float * embd = img.image_embedding + i * n_embd;
|
||||
llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, slot.n_past, 0);
|
||||
if (llama_decode(ctx, llava_batch.batch))
|
||||
{
|
||||
LOG("%s : failed to eval image\n", __func__);
|
||||
return false;
|
||||
}
|
||||
slot.n_past += n_eval;
|
||||
}
|
||||
image_idx++;
|
||||
|
||||
common_batch_clear(batch);
|
||||
|
||||
// append prefix of next image
|
||||
const auto json_prompt = (image_idx >= (int) slot.images.size()) ?
|
||||
slot.params.input_suffix : // no more images, then process suffix prompt
|
||||
(json)(slot.images[image_idx].prefix_prompt);
|
||||
|
||||
std::vector<llama_token> append_tokens = tokenize(json_prompt, false); // has next image
|
||||
for (int i = 0; i < (int) append_tokens.size(); ++i)
|
||||
{
|
||||
common_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true);
|
||||
slot.n_past += 1;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void request_cancel(int task_id)
|
||||
{
|
||||
task_server task;
|
||||
@@ -1627,17 +1480,17 @@ struct llama_server_context
|
||||
{
|
||||
if (slot.is_processing() && system_tokens.size() + slot.cache_tokens.size() >= (size_t) slot.n_ctx)
|
||||
{
|
||||
// this check is redundant (for good)
|
||||
// we should never get here, because generation should already stopped in process_token()
|
||||
|
||||
// START LOCALAI changes
|
||||
// Temporary disable context-shifting as it can lead to infinite loops (issue: https://github.com/ggerganov/llama.cpp/issues/3969)
|
||||
// See: https://github.com/mudler/LocalAI/issues/1333
|
||||
// Context is exhausted, release the slot
|
||||
slot.release();
|
||||
send_final_response(slot);
|
||||
slot.cache_tokens.clear();
|
||||
slot.n_past = 0;
|
||||
slot.truncated = false;
|
||||
slot.has_next_token = true;
|
||||
LOG("Context exhausted. Slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
|
||||
slot.has_next_token = false;
|
||||
LOG_ERROR("context is exhausted, release the slot", {});
|
||||
|
||||
continue;
|
||||
// END LOCALAI changes
|
||||
@@ -1693,7 +1546,7 @@ struct llama_server_context
|
||||
{
|
||||
for (auto & slot : slots)
|
||||
{
|
||||
const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get<std::string>().empty()) || !slot.images.empty();
|
||||
const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get<std::string>().empty());
|
||||
|
||||
// empty prompt passed -> release the slot and send empty response
|
||||
// note: infill mode allows empty prompt
|
||||
@@ -1710,7 +1563,7 @@ struct llama_server_context
|
||||
{
|
||||
slot.state = PROCESSING;
|
||||
slot.command = NONE;
|
||||
std::vector<llama_token> prompt_tokens;
|
||||
std::vector<server_tokens> prompt_tokens;
|
||||
slot.t_start_process_prompt = ggml_time_us();
|
||||
slot.t_start_genereration = 0;
|
||||
|
||||
@@ -1722,24 +1575,41 @@ struct llama_server_context
|
||||
params.input_suffix.erase(0, 1);
|
||||
suff_rm_leading_spc = false;
|
||||
}
|
||||
auto prefix_tokens = tokenize(slot.params.input_prefix, false);
|
||||
auto suffix_tokens = tokenize(slot.params.input_suffix, false);
|
||||
auto prefix_tokens = tokenize(slot.data, slot.params.input_prefix, false);
|
||||
auto suffix_tokens = tokenize(slot.data, slot.params.input_suffix, false);
|
||||
|
||||
const int space_token = 29871; // TODO: this should not be hardcoded
|
||||
if (suff_rm_leading_spc && !suffix_tokens.empty() && suffix_tokens[0] == space_token) {
|
||||
if (suff_rm_leading_spc && !suffix_tokens.empty() && suffix_tokens[0][0] == space_token) {
|
||||
suffix_tokens.erase(suffix_tokens.begin());
|
||||
}
|
||||
|
||||
prefix_tokens.insert(prefix_tokens.begin(), llama_vocab_fim_pre(vocab));
|
||||
prefix_tokens.insert(prefix_tokens.begin(), llama_vocab_bos(vocab)); // always add BOS
|
||||
prefix_tokens.insert(prefix_tokens.end(), llama_vocab_fim_suf(vocab));
|
||||
prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
|
||||
prefix_tokens.push_back(llama_vocab_fim_mid(vocab));
|
||||
// Create llama_tokens vectors for the special tokens
|
||||
llama_tokens fim_pre_tokens;
|
||||
fim_pre_tokens.push_back(llama_vocab_fim_pre(vocab));
|
||||
llama_tokens bos_tokens;
|
||||
bos_tokens.push_back(llama_vocab_bos(vocab));
|
||||
llama_tokens fim_suf_tokens;
|
||||
fim_suf_tokens.push_back(llama_vocab_fim_suf(vocab));
|
||||
llama_tokens fim_mid_tokens;
|
||||
fim_mid_tokens.push_back(llama_vocab_fim_mid(vocab));
|
||||
|
||||
// Create server_tokens objects
|
||||
server_tokens fim_pre_token(fim_pre_tokens, mctx != nullptr);
|
||||
server_tokens bos_token(bos_tokens, mctx != nullptr);
|
||||
server_tokens fim_suf_token(fim_suf_tokens, mctx != nullptr);
|
||||
server_tokens fim_mid_token(fim_mid_tokens, mctx != nullptr);
|
||||
|
||||
// Insert tokens in the correct order
|
||||
prefix_tokens.insert(prefix_tokens.begin(), fim_pre_token);
|
||||
prefix_tokens.insert(prefix_tokens.begin(), bos_token); // always add BOS
|
||||
prefix_tokens.insert(prefix_tokens.end(), fim_suf_token);
|
||||
prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
|
||||
prefix_tokens.push_back(fim_mid_token);
|
||||
prompt_tokens = prefix_tokens;
|
||||
}
|
||||
else
|
||||
{
|
||||
prompt_tokens = tokenize(slot.prompt, system_prompt.empty() && add_bos_token); // add BOS if there isn't system prompt
|
||||
prompt_tokens = tokenize(slot.data, slot.prompt, system_prompt.empty() && add_bos_token); // add BOS if there isn't system prompt
|
||||
}
|
||||
|
||||
slot.num_prompt_tokens = prompt_tokens.size();
|
||||
@@ -1767,7 +1637,12 @@ struct llama_server_context
|
||||
{"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())},
|
||||
});
|
||||
slot.truncated = true;
|
||||
prompt_tokens = new_tokens;
|
||||
|
||||
// Convert new_tokens to server_tokens
|
||||
std::vector<server_tokens> new_prompt_tokens;
|
||||
server_tokens new_server_tokens(new_tokens, mctx != nullptr);
|
||||
new_prompt_tokens.push_back(std::move(new_server_tokens));
|
||||
prompt_tokens = std::move(new_prompt_tokens);
|
||||
|
||||
slot.num_prompt_tokens = prompt_tokens.size();
|
||||
GGML_ASSERT(slot.num_prompt_tokens < slot.n_ctx);
|
||||
@@ -1787,10 +1662,17 @@ struct llama_server_context
|
||||
// push the prompt into the sampling context (do not apply grammar)
|
||||
for (auto &token : prompt_tokens)
|
||||
{
|
||||
common_sampler_accept(slot.ctx_sampling, token, false);
|
||||
// Convert server_tokens to llama_token for sampling
|
||||
llama_token tok = token[0]; // Get first token
|
||||
common_sampler_accept(slot.ctx_sampling, tok, false);
|
||||
}
|
||||
|
||||
slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
|
||||
// Convert server_tokens to llama_tokens for comparison
|
||||
std::vector<llama_token> prompt_llama_tokens;
|
||||
for (const auto &token : prompt_tokens) {
|
||||
prompt_llama_tokens.push_back(token[0]);
|
||||
}
|
||||
slot.n_past = common_part(slot.cache_tokens, prompt_llama_tokens);
|
||||
|
||||
// the last token of the cache is not in the KV cache until the next call to llama_decode
|
||||
// (it was sampled, pushed into the "cache_tokens", but not yet put in the context)
|
||||
@@ -1828,7 +1710,12 @@ struct llama_server_context
|
||||
});
|
||||
}
|
||||
|
||||
slot.cache_tokens = prompt_tokens;
|
||||
// Convert server_tokens to llama_tokens for cache
|
||||
std::vector<llama_token> cache_llama_tokens;
|
||||
for (const auto &token : prompt_tokens) {
|
||||
cache_llama_tokens.push_back(token[0]);
|
||||
}
|
||||
slot.cache_tokens = cache_llama_tokens;
|
||||
|
||||
if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0)
|
||||
{
|
||||
@@ -1852,18 +1739,36 @@ struct llama_server_context
|
||||
});
|
||||
llama_kv_cache_seq_rm(ctx, slot.id, p0, -1);
|
||||
|
||||
|
||||
// process the prefix of first image
|
||||
std::vector<server_tokens> prefix_tokens = prompt_tokens;
|
||||
|
||||
int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;
|
||||
|
||||
// check if we should process the image
|
||||
if (slot.n_past < slot.n_prompt_tokens
|
||||
&& slot.prompt_tokens[slot.n_past] == LLAMA_TOKEN_NULL) {
|
||||
// process the image
|
||||
int32_t new_n_past;
|
||||
int32_t res = prompt_tokens.process_chunk(ctx, mctx, slot.n_past, slot.id, new_n_past);
|
||||
int32_t n_pos = new_n_past - slot.n_past;
|
||||
if (res != 0) {
|
||||
slot.release();
|
||||
LOG_ERR("failed to process image, res = %d\n", res);
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
slot.n_past += n_pos;
|
||||
// slot.n_prompt_tokens_processed += n_pos;
|
||||
}
|
||||
|
||||
LOG_VERBOSE("prompt ingested", {
|
||||
{"n_past", slot.n_past},
|
||||
{"cached", tokens_to_str(ctx, slot.cache_tokens.cbegin(), slot.cache_tokens.cbegin() + slot.n_past)},
|
||||
{"to_eval", tokens_to_str(ctx, slot.cache_tokens.cbegin() + slot.n_past, slot.cache_tokens.cend())},
|
||||
});
|
||||
|
||||
const bool has_images = process_images(slot);
|
||||
|
||||
// process the prefix of first image
|
||||
std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, add_bos_token) : prompt_tokens;
|
||||
|
||||
int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;
|
||||
|
||||
int32_t ga_i = slot.ga_i;
|
||||
int32_t ga_n = slot.ga_n;
|
||||
@@ -1883,19 +1788,6 @@ struct llama_server_context
|
||||
slot_npast++;
|
||||
}
|
||||
|
||||
if (has_images && !ingest_images(slot, n_batch))
|
||||
{
|
||||
LOG_ERR("%s: failed processing images Slot id : %d, Task id: %d",
|
||||
__func__,
|
||||
slot.id,
|
||||
slot.task_id
|
||||
);
|
||||
// FIXME @phymbert: to be properly tested
|
||||
// early returning without changing the slot state will block the slot for ever
|
||||
// no one at the moment is checking the return value
|
||||
return false;
|
||||
}
|
||||
|
||||
// extract the logits only for the last token
|
||||
if (batch.n_tokens > 0)
|
||||
{
|
||||
@@ -1988,7 +1880,7 @@ struct llama_server_context
|
||||
// prompt evaluated for embedding
|
||||
if (slot.embedding)
|
||||
{
|
||||
send_embedding(slot);
|
||||
send_embedding(slot, batch_view);
|
||||
slot.release();
|
||||
slot.i_batch = -1;
|
||||
continue;
|
||||
@@ -2082,7 +1974,11 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
|
||||
}
|
||||
|
||||
std::function<void(int)> shutdown_handler;
|
||||
inline void signal_handler(int signal) { shutdown_handler(signal); }
|
||||
|
||||
inline void signal_handler(int signal) {
|
||||
exit(1);
|
||||
}
|
||||
|
||||
|
||||
/////////////////////////////////
|
||||
////////////////////////////////
|
||||
@@ -2120,26 +2016,6 @@ static void start_llama_server() {
|
||||
json parse_options(bool streaming, const backend::PredictOptions* predict, llama_server_context &llama)
|
||||
{
|
||||
|
||||
// This is for example a slot data from the json data
|
||||
// slot->params.stream = json_value(data, "stream", false);
|
||||
// slot->params.cache_prompt = json_value(data, "cache_prompt", false);
|
||||
// slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict);
|
||||
// slot->sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
|
||||
// slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
|
||||
// slot->sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p);
|
||||
// slot->sparams.temp = json_value(data, "temperature", default_sparams.temp);
|
||||
// slot->sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n);
|
||||
// slot->sparams.penalty_repeat = json_value(data, "repeat_penalty", default_sparams.penalty_repeat);
|
||||
// slot->sparams.penalty_freq = json_value(data, "frequency_penalty", default_sparams.penalty_freq);
|
||||
// slot->sparams.penalty_present = json_value(data, "presence_penalty", default_sparams.penalty_present);
|
||||
// slot->sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat);
|
||||
// slot->sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau);
|
||||
// slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta);
|
||||
// slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep);
|
||||
// slot->params.seed = json_value(data, "seed", default_params.seed);
|
||||
// slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
|
||||
// slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
|
||||
|
||||
// Create now a json data from the prediction options instead
|
||||
//
|
||||
json data;
|
||||
@@ -2184,69 +2060,6 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
|
||||
return data;
|
||||
}
|
||||
|
||||
// static void parse_options_completion(bool streaming,const backend::PredictOptions* predict, llama_server_context &llama)
|
||||
// {
|
||||
// // https://github.com/ggerganov/llama.cpp/blob/d9b33fe95bd257b36c84ee5769cc048230067d6f/examples/server/server.cpp#L673
|
||||
// gpt_params default_params;
|
||||
|
||||
// llama.stream = streaming;
|
||||
// llama.params.n_predict = predict->tokens() == 0 ? -1 : predict->tokens();
|
||||
// llama.params.sparams.top_k = predict->topk();
|
||||
// llama.params.sparams.top_p = predict->topp();
|
||||
// llama.params.sparams.typical_p = predict->typicalp();
|
||||
// llama.params.sparams.penalty_last_n = predict->repeat();
|
||||
// llama.params.sparams.temp = predict->temperature();
|
||||
// llama.params.sparams.penalty_repeat = predict->penalty();
|
||||
// llama.params.sparams.penalty_present = predict->presencepenalty();
|
||||
// llama.params.sparams.penalty_freq = predict->frequencypenalty();
|
||||
// llama.params.sparams.mirostat = predict->mirostat();
|
||||
// llama.params.sparams.mirostat_tau = predict->mirostattau();
|
||||
// llama.params.sparams.mirostat_eta = predict->mirostateta();
|
||||
// llama.params.n_keep = predict->nkeep();
|
||||
// llama.params.seed = predict->seed();
|
||||
// llama.params.sparams.grammar = predict->grammar();
|
||||
// // llama.params.n_probs = predict->
|
||||
// llama.params.prompt = predict->prompt();
|
||||
|
||||
// llama.params.sparams.logit_bias.clear();
|
||||
|
||||
// if (predict->ignoreeos())
|
||||
// {
|
||||
// llama.params.sparams.logit_bias[llama_token_eos(llama.model)] = -INFINITY;
|
||||
// }
|
||||
|
||||
// // const auto &logit_bias = body.find("logit_bias");
|
||||
// // if (logit_bias != body.end() && logit_bias->is_array())
|
||||
// // {
|
||||
// // const int n_vocab = llama_n_vocab(llama.model);
|
||||
// // for (const auto &el : *logit_bias)
|
||||
// // {
|
||||
// // if (el.is_array() && el.size() == 2 && el[0].is_number_integer())
|
||||
// // {
|
||||
// // llama_token tok = el[0].get<llama_token>();
|
||||
// // if (tok >= 0 && tok < n_vocab)
|
||||
// // {
|
||||
// // if (el[1].is_number())
|
||||
// // {
|
||||
// // llama.params.logit_bias[tok] = el[1].get<float>();
|
||||
// // }
|
||||
// // else if (el[1].is_boolean() && !el[1].get<bool>())
|
||||
// // {
|
||||
// // llama.params.logit_bias[tok] = -INFINITY;
|
||||
// // }
|
||||
// // }
|
||||
// // }
|
||||
// // }
|
||||
// // }
|
||||
|
||||
// llama.params.antiprompt.clear();
|
||||
// for (const std::string& stopPrompt : predict->stopprompts()) {
|
||||
// if (!stopPrompt.empty())
|
||||
// {
|
||||
// llama.params.antiprompt.push_back(stopPrompt);
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
const std::vector<ggml_type> kv_cache_types = {
|
||||
GGML_TYPE_F32,
|
||||
@@ -2278,15 +2091,15 @@ static std::string get_all_kv_cache_types() {
|
||||
}
|
||||
|
||||
static void params_parse(const backend::ModelOptions* request,
|
||||
common_params & params) {
|
||||
common_params & params, llama_server_context &llama) {
|
||||
|
||||
// this is comparable to: https://github.com/ggerganov/llama.cpp/blob/d9b33fe95bd257b36c84ee5769cc048230067d6f/examples/server/server.cpp#L1809
|
||||
|
||||
params.model = request->modelfile();
|
||||
params.model.path = request->modelfile();
|
||||
if (!request->mmproj().empty()) {
|
||||
// get the directory of modelfile
|
||||
std::string model_dir = params.model.substr(0, params.model.find_last_of("/\\"));
|
||||
params.mmproj = model_dir + "/"+ request->mmproj();
|
||||
std::string model_dir = params.model.path.substr(0, params.model.path.find_last_of("/\\"));
|
||||
params.mmproj.path = model_dir + "/"+ request->mmproj();
|
||||
}
|
||||
// params.model_alias ??
|
||||
params.model_alias = request->modelfile();
|
||||
@@ -2316,6 +2129,20 @@ static void params_parse(const backend::ModelOptions* request,
|
||||
add_rpc_devices(std::string(llama_grpc_servers));
|
||||
}
|
||||
|
||||
// decode options. Options are in form optname:optvale, or if booleans only optname.
|
||||
for (int i = 0; i < request->options_size(); i++) {
|
||||
std::string opt = request->options(i);
|
||||
char *optname = strtok(&opt[0], ":");
|
||||
char *optval = strtok(NULL, ":");
|
||||
if (optval == NULL) {
|
||||
optval = "true";
|
||||
}
|
||||
|
||||
if (!strcmp(optname, "gpu")) {
|
||||
llama.has_gpu = true;
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: Add yarn
|
||||
|
||||
if (!request->tensorsplit().empty()) {
|
||||
@@ -2347,7 +2174,7 @@ static void params_parse(const backend::ModelOptions* request,
|
||||
scale_factor = request->lorascale();
|
||||
}
|
||||
// get the directory of modelfile
|
||||
std::string model_dir = params.model.substr(0, params.model.find_last_of("/\\"));
|
||||
std::string model_dir = params.model.path.substr(0, params.model.path.find_last_of("/\\"));
|
||||
params.lora_adapters.push_back({ model_dir + "/"+request->loraadapter(), scale_factor });
|
||||
}
|
||||
params.use_mlock = request->mlock();
|
||||
@@ -2385,12 +2212,12 @@ static void params_parse(const backend::ModelOptions* request,
|
||||
llama.grammar_lazy = true;
|
||||
for (int i = 0; i < request->grammartriggers_size(); i++) {
|
||||
common_grammar_trigger trigger;
|
||||
trigger.word = request->grammartriggers(i).word();
|
||||
trigger.at_start = request->grammartriggers(i).at_start();
|
||||
llama.grammar_trigger_words.push_back(trigger);
|
||||
trigger.type = COMMON_GRAMMAR_TRIGGER_TYPE_WORD;
|
||||
trigger.value = request->grammartriggers(i).word();
|
||||
// trigger.at_start = request->grammartriggers(i).at_start();
|
||||
llama.grammar_triggers.push_back(trigger);
|
||||
LOG_INFO("grammar trigger", {
|
||||
{ "word", trigger.word },
|
||||
{ "at_start", trigger.at_start }
|
||||
{ "word", trigger.value },
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -2409,7 +2236,7 @@ public:
|
||||
grpc::Status LoadModel(ServerContext* context, const backend::ModelOptions* request, backend::Result* result) {
|
||||
// Implement LoadModel RPC
|
||||
common_params params;
|
||||
params_parse(request, params);
|
||||
params_parse(request, params, llama);
|
||||
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
@@ -2545,10 +2372,10 @@ public:
|
||||
grpc::Status TokenizeString(ServerContext* context, const backend::PredictOptions* request, backend::TokenizationResponse* response){
|
||||
json data = parse_options(false, request, llama);
|
||||
|
||||
std::vector<llama_token> tokens = llama.tokenize(data["prompt"],false);
|
||||
std::vector<server_tokens> tokens = llama.tokenize(data, data["prompt"],false);
|
||||
|
||||
for (int i=0 ; i< tokens.size(); i++){
|
||||
response->add_tokens(tokens[i]);
|
||||
response->add_tokens(tokens[i].llama_token);
|
||||
}
|
||||
|
||||
return grpc::Status::OK;
|
||||
@@ -2586,7 +2413,9 @@ void RunServer(const std::string& server_address) {
|
||||
ServerBuilder builder;
|
||||
builder.AddListeningPort(server_address, grpc::InsecureServerCredentials());
|
||||
builder.RegisterService(&service);
|
||||
|
||||
builder.SetMaxMessageSize(50 * 1024 * 1024); // 50MB
|
||||
builder.SetMaxSendMessageSize(50 * 1024 * 1024); // 50MB
|
||||
builder.SetMaxReceiveMessageSize(50 * 1024 * 1024); // 50MB
|
||||
std::unique_ptr<Server> server(builder.BuildAndStart());
|
||||
std::cout << "Server listening on " << server_address << std::endl;
|
||||
server->Wait();
|
||||
@@ -2595,6 +2424,20 @@ void RunServer(const std::string& server_address) {
|
||||
int main(int argc, char** argv) {
|
||||
std::string server_address("localhost:50051");
|
||||
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
||||
struct sigaction sigint_action;
|
||||
sigint_action.sa_handler = signal_handler;
|
||||
sigemptyset (&sigint_action.sa_mask);
|
||||
sigint_action.sa_flags = 0;
|
||||
sigaction(SIGINT, &sigint_action, NULL);
|
||||
sigaction(SIGTERM, &sigint_action, NULL);
|
||||
#elif defined (_WIN32)
|
||||
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
|
||||
return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
|
||||
};
|
||||
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
|
||||
#endif
|
||||
|
||||
// Define long and short options
|
||||
struct option long_options[] = {
|
||||
{"addr", required_argument, nullptr, 'a'},
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
|
||||
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
|
||||
index 3cd0d2fa..6c5e811a 100644
|
||||
--- a/examples/llava/clip.cpp
|
||||
+++ b/examples/llava/clip.cpp
|
||||
--- a/tools/mtmd/clip.cpp
|
||||
+++ b/tools/mtmd/clip.cpp
|
||||
@@ -2608,7 +2608,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
|
||||
int* patches_data = (int*)malloc(ggml_nbytes(patches));
|
||||
|
||||
@@ -7,21 +7,22 @@ for patch in $(ls patches); do
|
||||
patch -d llama.cpp/ -p1 < patches/$patch
|
||||
done
|
||||
|
||||
cp -r CMakeLists.txt llama.cpp/examples/grpc-server/
|
||||
cp -r grpc-server.cpp llama.cpp/examples/grpc-server/
|
||||
cp -rfv json.hpp llama.cpp/examples/grpc-server/
|
||||
cp -rfv utils.hpp llama.cpp/examples/grpc-server/
|
||||
cp -r CMakeLists.txt llama.cpp/tools/grpc-server/
|
||||
cp -r grpc-server.cpp llama.cpp/tools/grpc-server/
|
||||
cp -rfv json.hpp llama.cpp/tools/grpc-server/
|
||||
cp -rfv utils.hpp llama.cpp/tools/grpc-server/
|
||||
|
||||
if grep -q "grpc-server" llama.cpp/examples/CMakeLists.txt; then
|
||||
if grep -q "grpc-server" llama.cpp/tools/CMakeLists.txt; then
|
||||
echo "grpc-server already added"
|
||||
else
|
||||
echo "add_subdirectory(grpc-server)" >> llama.cpp/examples/CMakeLists.txt
|
||||
echo "add_subdirectory(grpc-server)" >> llama.cpp/tools/CMakeLists.txt
|
||||
fi
|
||||
|
||||
## XXX: In some versions of CMake clip wasn't being built before llama.
|
||||
## This is an hack for now, but it should be fixed in the future.
|
||||
cp -rfv llama.cpp/examples/llava/clip.h llama.cpp/examples/grpc-server/clip.h
|
||||
cp -rfv llama.cpp/examples/llava/llava.cpp llama.cpp/examples/grpc-server/llava.cpp
|
||||
echo '#include "llama.h"' > llama.cpp/examples/grpc-server/llava.h
|
||||
cat llama.cpp/examples/llava/llava.h >> llama.cpp/examples/grpc-server/llava.h
|
||||
cp -rfv llama.cpp/examples/llava/clip.cpp llama.cpp/examples/grpc-server/clip.cpp
|
||||
# cp -rfv llama.cpp/tools/mtmd/clip.h llama.cpp/tools/grpc-server/clip.h
|
||||
# cp -rfv llama.cpp/tools/mtmd/clip-impl.h llama.cpp/tools/grpc-server/clip-impl.h
|
||||
# cp -rfv llama.cpp/tools/mtmd/llava.cpp llama.cpp/tools/grpc-server/llava.cpp
|
||||
# echo '#include "llama.h"' > llama.cpp/tools/grpc-server/llava.h
|
||||
# cat llama.cpp/tools/mtmd/llava.h >> llama.cpp/tools/grpc-server/llava.h
|
||||
# cp -rfv llama.cpp/tools/mtmd/clip.cpp llama.cpp/tools/grpc-server/clip.cpp
|
||||
431
backend/cpp/llama/utils.hpp
vendored
431
backend/cpp/llama/utils.hpp
vendored
@@ -1,4 +1,4 @@
|
||||
// https://github.com/ggerganov/llama.cpp/blob/master/examples/server/utils.hpp
|
||||
// https://github.com/ggerganov/llama.cpp/blob/master/tools/server/utils.hpp
|
||||
|
||||
#pragma once
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
|
||||
#include "json.hpp"
|
||||
|
||||
#include "../llava/clip.h"
|
||||
#include "../mtmd/clip.h"
|
||||
|
||||
using json = nlohmann::json;
|
||||
|
||||
@@ -480,4 +480,431 @@ static inline std::vector<uint8_t> base64_decode(const std::string & encoded_str
|
||||
}
|
||||
|
||||
return ret;
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
//
|
||||
// tokenizer and input processing utils
|
||||
//
|
||||
|
||||
static bool json_is_array_of_numbers(const json & data) {
|
||||
if (data.is_array()) {
|
||||
for (const auto & e : data) {
|
||||
if (!e.is_number_integer()) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// is array having BOTH numbers & strings?
|
||||
static bool json_is_array_of_mixed_numbers_strings(const json & data) {
|
||||
bool seen_string = false;
|
||||
bool seen_number = false;
|
||||
if (data.is_array()) {
|
||||
for (const auto & e : data) {
|
||||
seen_string |= e.is_string();
|
||||
seen_number |= e.is_number_integer();
|
||||
if (seen_number && seen_string) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// get value by path(key1 / key2)
|
||||
static json json_get_nested_values(const std::vector<std::string> & paths, const json & js) {
|
||||
json result = json::object();
|
||||
|
||||
for (const std::string & path : paths) {
|
||||
json current = js;
|
||||
const auto keys = string_split<std::string>(path, /*separator*/ '/');
|
||||
bool valid_path = true;
|
||||
for (const std::string & k : keys) {
|
||||
if (valid_path && current.is_object() && current.contains(k)) {
|
||||
current = current[k];
|
||||
} else {
|
||||
valid_path = false;
|
||||
}
|
||||
}
|
||||
if (valid_path) {
|
||||
result[path] = current;
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* this handles 2 cases:
|
||||
* - only string, example: "string"
|
||||
* - mixed string and tokens, example: [12, 34, "string", 56, 78]
|
||||
*/
|
||||
static llama_tokens tokenize_mixed(const llama_vocab * vocab, const json & json_prompt, bool add_special, bool parse_special) {
|
||||
// If `add_bos` is true, we only add BOS, when json_prompt is a string,
|
||||
// or the first element of the json_prompt array is a string.
|
||||
llama_tokens prompt_tokens;
|
||||
|
||||
if (json_prompt.is_array()) {
|
||||
bool first = true;
|
||||
for (const auto & p : json_prompt) {
|
||||
if (p.is_string()) {
|
||||
auto s = p.template get<std::string>();
|
||||
|
||||
llama_tokens p;
|
||||
if (first) {
|
||||
p = common_tokenize(vocab, s, add_special, parse_special);
|
||||
first = false;
|
||||
} else {
|
||||
p = common_tokenize(vocab, s, false, parse_special);
|
||||
}
|
||||
|
||||
prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
|
||||
} else {
|
||||
if (first) {
|
||||
first = false;
|
||||
}
|
||||
|
||||
prompt_tokens.push_back(p.template get<llama_token>());
|
||||
}
|
||||
}
|
||||
} else {
|
||||
auto s = json_prompt.template get<std::string>();
|
||||
prompt_tokens = common_tokenize(vocab, s, add_special, parse_special);
|
||||
}
|
||||
|
||||
return prompt_tokens;
|
||||
}
|
||||
|
||||
/**
|
||||
* break the input "prompt" object into multiple prompt if needed, then tokenize them
|
||||
* this supports these cases:
|
||||
* - "prompt": "string"
|
||||
* - "prompt": [12, 34, 56]
|
||||
* - "prompt": [12, 34, "string", 56, 78]
|
||||
* and multiple prompts (multi-tasks):
|
||||
* - "prompt": ["string1", "string2"]
|
||||
* - "prompt": ["string1", [12, 34, 56]]
|
||||
* - "prompt": [[12, 34, 56], [78, 90, 12]]
|
||||
* - "prompt": [[12, 34, "string", 56, 78], [12, 34, 56]]
|
||||
*/
|
||||
static std::vector<llama_tokens> tokenize_input_prompts(const llama_vocab * vocab, const json & json_prompt, bool add_special, bool parse_special) {
|
||||
std::vector<llama_tokens> result;
|
||||
if (json_prompt.is_string() || json_is_array_of_mixed_numbers_strings(json_prompt)) {
|
||||
// string or mixed
|
||||
result.push_back(tokenize_mixed(vocab, json_prompt, add_special, parse_special));
|
||||
} else if (json_is_array_of_numbers(json_prompt)) {
|
||||
// array of tokens
|
||||
result.push_back(json_prompt.get<llama_tokens>());
|
||||
} else if (json_prompt.is_array()) {
|
||||
// array of prompts
|
||||
result.reserve(json_prompt.size());
|
||||
for (const auto & p : json_prompt) {
|
||||
if (p.is_string() || json_is_array_of_mixed_numbers_strings(p)) {
|
||||
result.push_back(tokenize_mixed(vocab, p, add_special, parse_special));
|
||||
} else if (json_is_array_of_numbers(p)) {
|
||||
// array of tokens
|
||||
result.push_back(p.get<llama_tokens>());
|
||||
} else {
|
||||
throw std::runtime_error("element of \"prompt\" must be a string, an list of tokens, or a list of mixed strings & tokens");
|
||||
}
|
||||
}
|
||||
} else {
|
||||
throw std::runtime_error("\"prompt\" must be a string, an list of tokens, a list of mixed strings & tokens, or a list of prompts");
|
||||
}
|
||||
if (result.empty()) {
|
||||
throw std::runtime_error("\"prompt\" must not be empty");
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
//
|
||||
// utils for interacting with libmtmd
|
||||
// (may need to refactor in near future)
|
||||
//
|
||||
|
||||
/**
|
||||
* server_tokens is a helper to manage the input tokens and image for the server.
|
||||
* it is made this way to simplify the logic of KV cache management.
|
||||
*/
|
||||
struct server_tokens {
|
||||
bool has_mtmd = false;
|
||||
|
||||
private: // disallow accessing these members directly, risking out-of-sync
|
||||
|
||||
// map a **start** position in tokens to the image chunk
|
||||
std::unordered_map<llama_pos, mtmd::input_chunk_ptr> map_pos_to_image;
|
||||
|
||||
// list of tokens
|
||||
// it can include LLAMA_TOKEN_NULL, which is used to indicate a token that is not a text token
|
||||
// a mtmd_input_chunk can occupy multiple tokens, one llama_token per **position**
|
||||
// important: for models using mrope, an image can contain multiple tokens but will use only one **position**
|
||||
llama_tokens tokens;
|
||||
|
||||
// for ex. with input of 5 text tokens and 2 images:
|
||||
// [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1]
|
||||
// pos 0 1 2 3 4 5 6 7 8 9
|
||||
// map_pos_to_image will contain: {5, img0}, {8, img1}
|
||||
|
||||
public:
|
||||
server_tokens() = default;
|
||||
~server_tokens() = default;
|
||||
|
||||
// Prevent copying
|
||||
server_tokens(const server_tokens&) = delete;
|
||||
server_tokens& operator=(const server_tokens&) = delete;
|
||||
|
||||
// Allow moving (usually implicitly generated if members are movable)
|
||||
server_tokens(server_tokens&&) = default;
|
||||
server_tokens& operator=(server_tokens&&) = default;
|
||||
|
||||
// Allow accessing elements using [] operator
|
||||
llama_token operator[](size_t index) { return tokens[index]; }
|
||||
const llama_token& operator[](size_t index) const { return tokens[index]; }
|
||||
|
||||
server_tokens(mtmd::input_chunks & mtmd_chunks, bool has_mtmd) : has_mtmd(has_mtmd) {
|
||||
for (size_t i = 0; i < mtmd_chunks.size(); ++i) {
|
||||
push_back(mtmd_chunks[i]);
|
||||
}
|
||||
}
|
||||
|
||||
server_tokens(llama_tokens & tokens, bool has_mtmd) : has_mtmd(has_mtmd), tokens(tokens) {}
|
||||
|
||||
// for debugging
|
||||
std::string str() const {
|
||||
std::ostringstream oss;
|
||||
oss << "tokens: ";
|
||||
for (const auto & t : tokens) {
|
||||
if (t == LLAMA_TOKEN_NULL) {
|
||||
oss << "<embd> ";
|
||||
} else {
|
||||
oss << t << " ";
|
||||
}
|
||||
}
|
||||
oss << "\n";
|
||||
oss << "image pos: ";
|
||||
for (const auto & it : map_pos_to_image) {
|
||||
oss << it.first << ", ";
|
||||
}
|
||||
return oss.str();
|
||||
}
|
||||
|
||||
const mtmd::input_chunk_ptr & find_chunk(llama_pos pos) const {
|
||||
auto it = map_pos_to_image.find(pos);
|
||||
if (it != map_pos_to_image.end()) {
|
||||
return it->second;
|
||||
} else {
|
||||
throw std::runtime_error("Chunk not found");
|
||||
}
|
||||
}
|
||||
|
||||
void push_back(llama_token tok) {
|
||||
if (tok == LLAMA_TOKEN_NULL) {
|
||||
throw std::runtime_error("Invalid token");
|
||||
}
|
||||
tokens.emplace_back(tok);
|
||||
}
|
||||
|
||||
// will create a copy of the chunk if it contains non-text data
|
||||
void push_back(const mtmd_input_chunk * chunk) {
|
||||
auto type = mtmd_input_chunk_get_type(chunk);
|
||||
if (type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
|
||||
GGML_ASSERT(has_mtmd);
|
||||
auto img_tokens = mtmd_input_chunk_get_tokens_image(chunk);
|
||||
const int n_pos = mtmd_image_tokens_get_n_pos(img_tokens);
|
||||
llama_pos start_pos = tokens.size();
|
||||
for (int i = 0; i < n_pos; ++i) {
|
||||
tokens.emplace_back(LLAMA_TOKEN_NULL);
|
||||
}
|
||||
mtmd::input_chunk_ptr new_chunk(mtmd_input_chunk_copy(chunk));
|
||||
map_pos_to_image[start_pos] = std::move(new_chunk);
|
||||
} else if (type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
|
||||
size_t n_tokens;
|
||||
auto text_tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
|
||||
for (size_t i = 0; i < n_tokens; ++i) {
|
||||
push_back(text_tokens[i]);
|
||||
}
|
||||
} else {
|
||||
GGML_ABORT("Invalid chunk type");
|
||||
}
|
||||
}
|
||||
|
||||
// for compatibility with context shift and prompt truncation
|
||||
void insert(const llama_tokens & inp_tokens) {
|
||||
GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
|
||||
tokens.insert(tokens.end(), inp_tokens.begin(), inp_tokens.end());
|
||||
}
|
||||
|
||||
// for compatibility with speculative decoding, ctx shift, slot save/load
|
||||
const llama_tokens & get_text_tokens() const {
|
||||
GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
|
||||
return tokens;
|
||||
}
|
||||
|
||||
// for compatibility with speculative decoding
|
||||
void set_token(llama_pos pos, llama_token id) {
|
||||
GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
|
||||
tokens[pos] = id;
|
||||
}
|
||||
|
||||
size_t size() const {
|
||||
return tokens.size();
|
||||
}
|
||||
|
||||
bool empty() const {
|
||||
return tokens.empty();
|
||||
}
|
||||
|
||||
void clear() {
|
||||
tokens.clear();
|
||||
}
|
||||
|
||||
void resize(size_t n) {
|
||||
GGML_ASSERT(n <= tokens.size());
|
||||
if (has_mtmd) {
|
||||
// we throw an error if we try to remove a token in the middle of an image
|
||||
// for ex. with input of 5 text tokens and 2 images:
|
||||
// [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1]
|
||||
// n 1 2 3 4 5 6 7 8 9 10
|
||||
// allowed to resize ^ ^
|
||||
// disallowed to resize ^ ^ ^
|
||||
if (n > 0) {
|
||||
llama_token last_token = tokens[n - 1];
|
||||
// make sure we never remove tokens in the middle of an image
|
||||
if (last_token == LLAMA_TOKEN_NULL) {
|
||||
find_chunk(n - 1); // will throw an error if the token is not begin-of-chunk
|
||||
}
|
||||
}
|
||||
// remove all image chunks that are not used anymore
|
||||
for (auto it = map_pos_to_image.begin(); it != map_pos_to_image.end(); ) {
|
||||
llama_pos pos = it->first;
|
||||
if (pos >= (llama_pos)n) {
|
||||
it = map_pos_to_image.erase(it);
|
||||
} else {
|
||||
++it;
|
||||
}
|
||||
}
|
||||
}
|
||||
tokens.resize(n);
|
||||
}
|
||||
|
||||
std::string detokenize(const llama_context * ctx, bool special) const {
|
||||
llama_tokens text_tokens;
|
||||
text_tokens.reserve(tokens.size());
|
||||
for (const auto & t : tokens) {
|
||||
if (t != LLAMA_TOKEN_NULL) {
|
||||
text_tokens.push_back(t);
|
||||
}
|
||||
}
|
||||
return common_detokenize(ctx, text_tokens, special);
|
||||
}
|
||||
|
||||
size_t get_common_prefix(const server_tokens & b) const {
|
||||
size_t max_idx = std::min(tokens.size(), b.tokens.size());
|
||||
for (size_t i = 0; i < max_idx; ++i) {
|
||||
auto & ai = tokens[i];
|
||||
auto & bi = b.tokens[i];
|
||||
|
||||
if (ai == LLAMA_TOKEN_NULL && bi == LLAMA_TOKEN_NULL) {
|
||||
GGML_ASSERT(has_mtmd);
|
||||
const auto & a_chunk = find_chunk(i);
|
||||
const auto & b_chunk = b.find_chunk(i);
|
||||
GGML_ASSERT(a_chunk && b_chunk);
|
||||
const auto * a_img = mtmd_input_chunk_get_tokens_image(a_chunk.get());
|
||||
const auto * b_img = mtmd_input_chunk_get_tokens_image(b_chunk.get());
|
||||
std::string ai_id = mtmd_image_tokens_get_id(a_img);
|
||||
std::string bi_id = mtmd_image_tokens_get_id(b_img);
|
||||
size_t a_pos = mtmd_image_tokens_get_n_pos(a_img);
|
||||
size_t b_pos = mtmd_image_tokens_get_n_pos(b_img);
|
||||
if (ai_id == bi_id && a_pos == b_pos) {
|
||||
GGML_ASSERT(a_pos > 0 && "Invalid image token"); // should never happen
|
||||
i += a_pos - 1; // will be +1 by the for loop
|
||||
continue;
|
||||
} else {
|
||||
return i;
|
||||
}
|
||||
} else if (ai == bi) {
|
||||
continue;
|
||||
} else {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
return max_idx; // all tokens are equal
|
||||
}
|
||||
|
||||
// make sure all text tokens are within the vocab range
|
||||
bool validate(const struct llama_context * ctx) const {
|
||||
const llama_model * model = llama_get_model(ctx);
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
const int32_t n_vocab = llama_vocab_n_tokens(vocab);
|
||||
|
||||
for (size_t i = 0; i < tokens.size(); ++i) {
|
||||
auto & t = tokens[i];
|
||||
if (t == LLAMA_TOKEN_NULL) {
|
||||
try {
|
||||
const auto & chunk = find_chunk(i);
|
||||
const auto * img_tokens = mtmd_input_chunk_get_tokens_image(chunk.get());
|
||||
size_t n_pos = mtmd_image_tokens_get_n_pos(img_tokens);
|
||||
i += n_pos - 1; // will be +1 by the for loop
|
||||
} catch (const std::exception & e) {
|
||||
return false;
|
||||
}
|
||||
} else if (t < 0 || t >= n_vocab) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// encode and decode the image chunk
|
||||
int32_t process_chunk(
|
||||
llama_context * ctx,
|
||||
mtmd_context * mctx,
|
||||
llama_pos n_past,
|
||||
int32_t seq_id,
|
||||
llama_pos & n_pos_out) {
|
||||
auto it = map_pos_to_image.find(n_past);
|
||||
if (it == map_pos_to_image.end()) {
|
||||
throw std::runtime_error("Chunk not found");
|
||||
}
|
||||
// SRV_INF("%s\n", "processing image...");
|
||||
int32_t n_batch = llama_n_batch(ctx);
|
||||
int64_t t0 = ggml_time_ms();
|
||||
llama_pos new_n_past = n_past;
|
||||
int32_t result = mtmd_helper_eval_chunk_single(mctx, ctx,
|
||||
it->second.get(), // chunk
|
||||
n_past,
|
||||
seq_id,
|
||||
n_batch,
|
||||
true, // logits last
|
||||
&new_n_past);
|
||||
//SRV_INF("image processed in %" PRId64 " ms\n", ggml_time_ms() - t0);
|
||||
if (result != 0) {
|
||||
LOG_ERR("mtmd_helper_eval failed with status %d", result);
|
||||
n_pos_out = n_past;
|
||||
return result;
|
||||
}
|
||||
n_pos_out = new_n_past;
|
||||
return 0;
|
||||
}
|
||||
};
|
||||
|
||||
// Computes FNV-1a hash of the data
|
||||
static std::string fnv_hash(const uint8_t * data, size_t len) {
|
||||
const uint64_t fnv_prime = 0x100000001b3ULL;
|
||||
uint64_t hash = 0xcbf29ce484222325ULL;
|
||||
|
||||
for (size_t i = 0; i < len; ++i) {
|
||||
hash ^= data[i];
|
||||
hash *= fnv_prime;
|
||||
}
|
||||
return std::to_string(hash);
|
||||
}
|
||||
@@ -8,12 +8,19 @@ ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
|
||||
# keep standard at C11 and C++11
|
||||
CXXFLAGS = -I. -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/thirdparty -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/ggml/include -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp -O3 -DNDEBUG -std=c++17 -fPIC
|
||||
|
||||
GOCMD?=go
|
||||
CGO_LDFLAGS?=
|
||||
# Avoid parent make file overwriting CGO_LDFLAGS which is needed for hipblas
|
||||
CGO_LDFLAGS_SYCL=
|
||||
GO_TAGS?=
|
||||
LD_FLAGS?=
|
||||
|
||||
# Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
|
||||
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
|
||||
|
||||
# If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
|
||||
ifeq ($(BUILD_TYPE),cublas)
|
||||
CMAKE_ARGS+=-DGGML_CUDA=ON
|
||||
CMAKE_ARGS+=-DSD_CUDA=ON
|
||||
# If build type is openblas then we set -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
|
||||
# to CMAKE_ARGS automatically
|
||||
else ifeq ($(BUILD_TYPE),openblas)
|
||||
@@ -21,31 +28,50 @@ else ifeq ($(BUILD_TYPE),openblas)
|
||||
# If build type is clblas (openCL) we set -DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
|
||||
else ifeq ($(BUILD_TYPE),clblas)
|
||||
CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
|
||||
# If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++
|
||||
# If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++
|
||||
else ifeq ($(BUILD_TYPE),hipblas)
|
||||
CMAKE_ARGS+=-DGGML_HIP=ON
|
||||
CMAKE_ARGS+=-DSD_HIPBLAS=ON
|
||||
# If it's OSX, DO NOT embed the metal library - -DGGML_METAL_EMBED_LIBRARY=ON requires further investigation
|
||||
# But if it's OSX without metal, disable it here
|
||||
else ifeq ($(OS),Darwin)
|
||||
ifneq ($(BUILD_TYPE),metal)
|
||||
CMAKE_ARGS+=-DGGML_METAL=OFF
|
||||
CMAKE_ARGS+=-DSD_METAL=OFF
|
||||
else
|
||||
CMAKE_ARGS+=-DGGML_METAL=ON
|
||||
CMAKE_ARGS+=-DSD_METAL=ON
|
||||
CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
|
||||
TARGET+=--target ggml-metal
|
||||
endif
|
||||
endif
|
||||
|
||||
# ifeq ($(BUILD_TYPE),sycl_f16)
|
||||
# CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DSD_SYCL=ON -DGGML_SYCL_F16=ON
|
||||
# endif
|
||||
ifeq ($(BUILD_TYPE),sycl_f16)
|
||||
CMAKE_ARGS+=-DGGML_SYCL=ON \
|
||||
-DCMAKE_C_COMPILER=icx \
|
||||
-DCMAKE_CXX_COMPILER=icpx \
|
||||
-DSD_SYCL=ON \
|
||||
-DGGML_SYCL_F16=ON
|
||||
CC=icx
|
||||
CXX=icpx
|
||||
CGO_LDFLAGS_SYCL += -fsycl -L${DNNLROOT}/lib -ldnnl ${MKLROOT}/lib/intel64/libmkl_sycl.a -fiopenmp -fopenmp-targets=spir64 -lOpenCL
|
||||
CGO_LDFLAGS_SYCL += $(shell pkg-config --libs mkl-static-lp64-gomp)
|
||||
CGO_CXXFLAGS += -fiopenmp -fopenmp-targets=spir64
|
||||
CGO_CXXFLAGS += $(shell pkg-config --cflags mkl-static-lp64-gomp )
|
||||
endif
|
||||
|
||||
# ifeq ($(BUILD_TYPE),sycl_f32)
|
||||
# CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DSD_SYCL=ON
|
||||
# endif
|
||||
ifeq ($(BUILD_TYPE),sycl_f32)
|
||||
CMAKE_ARGS+=-DGGML_SYCL=ON \
|
||||
-DCMAKE_C_COMPILER=icx \
|
||||
-DCMAKE_CXX_COMPILER=icpx \
|
||||
-DSD_SYCL=ON
|
||||
CC=icx
|
||||
CXX=icpx
|
||||
CGO_LDFLAGS_SYCL += -fsycl -L${DNNLROOT}/lib -ldnnl ${MKLROOT}/lib/intel64/libmkl_sycl.a -fiopenmp -fopenmp-targets=spir64 -lOpenCL
|
||||
CGO_LDFLAGS_SYCL += $(shell pkg-config --libs mkl-static-lp64-gomp)
|
||||
CGO_CXXFLAGS += -fiopenmp -fopenmp-targets=spir64
|
||||
CGO_CXXFLAGS += $(shell pkg-config --cflags mkl-static-lp64-gomp )
|
||||
endif
|
||||
|
||||
# warnings
|
||||
CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
|
||||
# CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
|
||||
|
||||
# Find all .a archives in ARCHIVE_DIR
|
||||
# (ggml can have different backends cpu, cuda, etc., each backend generates a .a archive)
|
||||
@@ -86,11 +112,24 @@ endif
|
||||
$(MAKE) $(COMBINED_LIB)
|
||||
|
||||
gosd.o:
|
||||
ifneq (,$(findstring sycl,$(BUILD_TYPE)))
|
||||
+bash -c "source $(ONEAPI_VARS); \
|
||||
$(CXX) $(CXXFLAGS) gosd.cpp -o gosd.o -c"
|
||||
else
|
||||
$(CXX) $(CXXFLAGS) gosd.cpp -o gosd.o -c
|
||||
endif
|
||||
|
||||
libsd.a: gosd.o
|
||||
cp $(INCLUDE_PATH)/build/libstable-diffusion.a ./libsd.a
|
||||
$(AR) rcs libsd.a gosd.o
|
||||
|
||||
stablediffusion-ggml:
|
||||
CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_SYCL)" C_INCLUDE_PATH="$(INCLUDE_PATH)" LIBRARY_PATH="$(LIBRARY_PATH)" \
|
||||
CC="$(CC)" CXX="$(CXX)" CGO_CXXFLAGS="$(CGO_CXXFLAGS)" \
|
||||
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o ../../../../backend-assets/grpc/stablediffusion-ggml ./
|
||||
ifneq ($(UPX),)
|
||||
$(UPX) ../../../../backend-assets/grpc/stablediffusion-ggml
|
||||
endif
|
||||
|
||||
clean:
|
||||
rm -rf gosd.o libsd.a build $(COMBINED_LIB)
|
||||
rm -rf gosd.o libsd.a build $(COMBINED_LIB)
|
||||
|
||||
@@ -35,6 +35,8 @@ const char* sample_method_str[] = {
|
||||
"ipndm",
|
||||
"ipndm_v",
|
||||
"lcm",
|
||||
"ddim_trailing",
|
||||
"tcd",
|
||||
};
|
||||
|
||||
// Names of the sigma schedule overrides, same order as sample_schedule in stable-diffusion.h
|
||||
@@ -173,6 +175,7 @@ int gen_image(char *text, char *negativeText, int width, int height, int steps,
|
||||
-1, //clip_skip
|
||||
cfg_scale, // sfg_scale
|
||||
3.5f,
|
||||
0, // eta
|
||||
width,
|
||||
height,
|
||||
sample_method,
|
||||
|
||||
@@ -1,204 +0,0 @@
|
||||
package main
|
||||
|
||||
// This is a wrapper to statisfy the GRPC service interface
|
||||
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/go-skynet/go-llama.cpp"
|
||||
"github.com/mudler/LocalAI/pkg/grpc/base"
|
||||
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||
)
|
||||
|
||||
type LLM struct {
|
||||
base.SingleThread
|
||||
|
||||
llama *llama.LLama
|
||||
}
|
||||
|
||||
func (llm *LLM) Load(opts *pb.ModelOptions) error {
|
||||
ropeFreqBase := float32(10000)
|
||||
ropeFreqScale := float32(1)
|
||||
|
||||
if opts.RopeFreqBase != 0 {
|
||||
ropeFreqBase = opts.RopeFreqBase
|
||||
}
|
||||
if opts.RopeFreqScale != 0 {
|
||||
ropeFreqScale = opts.RopeFreqScale
|
||||
}
|
||||
|
||||
llamaOpts := []llama.ModelOption{
|
||||
llama.WithRopeFreqBase(ropeFreqBase),
|
||||
llama.WithRopeFreqScale(ropeFreqScale),
|
||||
}
|
||||
|
||||
if opts.NGQA != 0 {
|
||||
llamaOpts = append(llamaOpts, llama.WithGQA(int(opts.NGQA)))
|
||||
}
|
||||
|
||||
if opts.RMSNormEps != 0 {
|
||||
llamaOpts = append(llamaOpts, llama.WithRMSNormEPS(opts.RMSNormEps))
|
||||
}
|
||||
|
||||
if opts.ContextSize != 0 {
|
||||
llamaOpts = append(llamaOpts, llama.SetContext(int(opts.ContextSize)))
|
||||
}
|
||||
if opts.F16Memory {
|
||||
llamaOpts = append(llamaOpts, llama.EnableF16Memory)
|
||||
}
|
||||
if opts.Embeddings {
|
||||
llamaOpts = append(llamaOpts, llama.EnableEmbeddings)
|
||||
}
|
||||
if opts.NGPULayers != 0 {
|
||||
llamaOpts = append(llamaOpts, llama.SetGPULayers(int(opts.NGPULayers)))
|
||||
}
|
||||
|
||||
llamaOpts = append(llamaOpts, llama.SetMMap(opts.MMap))
|
||||
llamaOpts = append(llamaOpts, llama.SetMainGPU(opts.MainGPU))
|
||||
llamaOpts = append(llamaOpts, llama.SetTensorSplit(opts.TensorSplit))
|
||||
if opts.NBatch != 0 {
|
||||
llamaOpts = append(llamaOpts, llama.SetNBatch(int(opts.NBatch)))
|
||||
} else {
|
||||
llamaOpts = append(llamaOpts, llama.SetNBatch(512))
|
||||
}
|
||||
|
||||
if opts.NUMA {
|
||||
llamaOpts = append(llamaOpts, llama.EnableNUMA)
|
||||
}
|
||||
|
||||
if opts.LowVRAM {
|
||||
llamaOpts = append(llamaOpts, llama.EnabelLowVRAM)
|
||||
}
|
||||
|
||||
model, err := llama.New(opts.ModelFile, llamaOpts...)
|
||||
llm.llama = model
|
||||
|
||||
return err
|
||||
}
|
||||
|
||||
func buildPredictOptions(opts *pb.PredictOptions) []llama.PredictOption {
|
||||
ropeFreqBase := float32(10000)
|
||||
ropeFreqScale := float32(1)
|
||||
|
||||
if opts.RopeFreqBase != 0 {
|
||||
ropeFreqBase = opts.RopeFreqBase
|
||||
}
|
||||
if opts.RopeFreqScale != 0 {
|
||||
ropeFreqScale = opts.RopeFreqScale
|
||||
}
|
||||
predictOptions := []llama.PredictOption{
|
||||
llama.SetTemperature(opts.Temperature),
|
||||
llama.SetTopP(opts.TopP),
|
||||
llama.SetTopK(int(opts.TopK)),
|
||||
llama.SetTokens(int(opts.Tokens)),
|
||||
llama.SetThreads(int(opts.Threads)),
|
||||
llama.WithGrammar(opts.Grammar),
|
||||
llama.SetRopeFreqBase(ropeFreqBase),
|
||||
llama.SetRopeFreqScale(ropeFreqScale),
|
||||
llama.SetNegativePromptScale(opts.NegativePromptScale),
|
||||
llama.SetNegativePrompt(opts.NegativePrompt),
|
||||
}
|
||||
|
||||
if opts.PromptCacheAll {
|
||||
predictOptions = append(predictOptions, llama.EnablePromptCacheAll)
|
||||
}
|
||||
|
||||
if opts.PromptCacheRO {
|
||||
predictOptions = append(predictOptions, llama.EnablePromptCacheRO)
|
||||
}
|
||||
|
||||
// Expected absolute path
|
||||
if opts.PromptCachePath != "" {
|
||||
predictOptions = append(predictOptions, llama.SetPathPromptCache(opts.PromptCachePath))
|
||||
}
|
||||
|
||||
if opts.Mirostat != 0 {
|
||||
predictOptions = append(predictOptions, llama.SetMirostat(int(opts.Mirostat)))
|
||||
}
|
||||
|
||||
if opts.MirostatETA != 0 {
|
||||
predictOptions = append(predictOptions, llama.SetMirostatETA(opts.MirostatETA))
|
||||
}
|
||||
|
||||
if opts.MirostatTAU != 0 {
|
||||
predictOptions = append(predictOptions, llama.SetMirostatTAU(opts.MirostatTAU))
|
||||
}
|
||||
|
||||
if opts.Debug {
|
||||
predictOptions = append(predictOptions, llama.Debug)
|
||||
}
|
||||
|
||||
predictOptions = append(predictOptions, llama.SetStopWords(opts.StopPrompts...))
|
||||
|
||||
if opts.PresencePenalty != 0 {
|
||||
predictOptions = append(predictOptions, llama.SetPenalty(opts.PresencePenalty))
|
||||
}
|
||||
|
||||
if opts.NKeep != 0 {
|
||||
predictOptions = append(predictOptions, llama.SetNKeep(int(opts.NKeep)))
|
||||
}
|
||||
|
||||
if opts.Batch != 0 {
|
||||
predictOptions = append(predictOptions, llama.SetBatch(int(opts.Batch)))
|
||||
}
|
||||
|
||||
if opts.F16KV {
|
||||
predictOptions = append(predictOptions, llama.EnableF16KV)
|
||||
}
|
||||
|
||||
if opts.IgnoreEOS {
|
||||
predictOptions = append(predictOptions, llama.IgnoreEOS)
|
||||
}
|
||||
|
||||
if opts.Seed != 0 {
|
||||
predictOptions = append(predictOptions, llama.SetSeed(int(opts.Seed)))
|
||||
}
|
||||
|
||||
//predictOptions = append(predictOptions, llama.SetLogitBias(c.Seed))
|
||||
|
||||
predictOptions = append(predictOptions, llama.SetFrequencyPenalty(opts.FrequencyPenalty))
|
||||
predictOptions = append(predictOptions, llama.SetMlock(opts.MLock))
|
||||
predictOptions = append(predictOptions, llama.SetMemoryMap(opts.MMap))
|
||||
predictOptions = append(predictOptions, llama.SetPredictionMainGPU(opts.MainGPU))
|
||||
predictOptions = append(predictOptions, llama.SetPredictionTensorSplit(opts.TensorSplit))
|
||||
predictOptions = append(predictOptions, llama.SetTailFreeSamplingZ(opts.TailFreeSamplingZ))
|
||||
predictOptions = append(predictOptions, llama.SetTypicalP(opts.TypicalP))
|
||||
return predictOptions
|
||||
}
|
||||
|
||||
func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
|
||||
return llm.llama.Predict(opts.Prompt, buildPredictOptions(opts)...)
|
||||
}
|
||||
|
||||
func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
|
||||
predictOptions := buildPredictOptions(opts)
|
||||
|
||||
predictOptions = append(predictOptions, llama.SetTokenCallback(func(token string) bool {
|
||||
results <- token
|
||||
return true
|
||||
}))
|
||||
|
||||
go func() {
|
||||
_, err := llm.llama.Predict(opts.Prompt, predictOptions...)
|
||||
if err != nil {
|
||||
fmt.Println("err: ", err)
|
||||
}
|
||||
close(results)
|
||||
}()
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (llm *LLM) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
|
||||
predictOptions := buildPredictOptions(opts)
|
||||
|
||||
if len(opts.EmbeddingTokens) > 0 {
|
||||
tokens := []int{}
|
||||
for _, t := range opts.EmbeddingTokens {
|
||||
tokens = append(tokens, int(t))
|
||||
}
|
||||
return llm.llama.TokenEmbeddings(tokens, predictOptions...)
|
||||
}
|
||||
|
||||
return llm.llama.Embeddings(opts.Embeddings, predictOptions...)
|
||||
}
|
||||
@@ -1,19 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"flag"
|
||||
|
||||
grpc "github.com/mudler/LocalAI/pkg/grpc"
|
||||
)
|
||||
|
||||
var (
|
||||
addr = flag.String("addr", "localhost:50051", "the address to connect to")
|
||||
)
|
||||
|
||||
func main() {
|
||||
flag.Parse()
|
||||
|
||||
if err := grpc.StartServer(*addr, &LLM{}); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
@@ -74,7 +74,7 @@ func (sd *Whisper) AudioTranscription(opts *pb.TranscriptRequest) (pb.Transcript
|
||||
context.SetTranslate(true)
|
||||
}
|
||||
|
||||
if err := context.Process(data, nil, nil); err != nil {
|
||||
if err := context.Process(data, nil, nil, nil); err != nil {
|
||||
return pb.TranscriptResult{}, err
|
||||
}
|
||||
|
||||
|
||||
@@ -1,17 +0,0 @@
|
||||
.PHONY: autogptq
|
||||
autogptq: protogen
|
||||
bash install.sh
|
||||
|
||||
.PHONY: protogen
|
||||
protogen: backend_pb2_grpc.py backend_pb2.py
|
||||
|
||||
.PHONY: protogen-clean
|
||||
protogen-clean:
|
||||
$(RM) backend_pb2_grpc.py backend_pb2.py
|
||||
|
||||
backend_pb2_grpc.py backend_pb2.py:
|
||||
python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
|
||||
|
||||
.PHONY: clean
|
||||
clean: protogen-clean
|
||||
rm -rf venv __pycache__
|
||||
@@ -1,5 +0,0 @@
|
||||
# Creating a separate environment for the autogptq project
|
||||
|
||||
```
|
||||
make autogptq
|
||||
```
|
||||
@@ -1,153 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
from concurrent import futures
|
||||
import argparse
|
||||
import signal
|
||||
import sys
|
||||
import os
|
||||
import time
|
||||
import base64
|
||||
|
||||
import grpc
|
||||
import backend_pb2
|
||||
import backend_pb2_grpc
|
||||
|
||||
from auto_gptq import AutoGPTQForCausalLM
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM
|
||||
from transformers import TextGenerationPipeline
|
||||
|
||||
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
|
||||
|
||||
# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
|
||||
MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
|
||||
|
||||
# Implement the BackendServicer class with the service methods
|
||||
class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
def Health(self, request, context):
|
||||
return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
|
||||
def LoadModel(self, request, context):
|
||||
try:
|
||||
device = "cuda:0"
|
||||
if request.Device != "":
|
||||
device = request.Device
|
||||
|
||||
# support loading local model files
|
||||
model_path = os.path.join(os.environ.get('MODELS_PATH', './'), request.Model)
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, trust_remote_code=request.TrustRemoteCode)
|
||||
|
||||
# support model `Qwen/Qwen-VL-Chat-Int4`
|
||||
if "qwen-vl" in request.Model.lower():
|
||||
self.model_name = "Qwen-VL-Chat"
|
||||
model = AutoModelForCausalLM.from_pretrained(model_path,
|
||||
trust_remote_code=request.TrustRemoteCode,
|
||||
device_map="auto").eval()
|
||||
else:
|
||||
model = AutoGPTQForCausalLM.from_quantized(model_path,
|
||||
model_basename=request.ModelBaseName,
|
||||
use_safetensors=True,
|
||||
trust_remote_code=request.TrustRemoteCode,
|
||||
device=device,
|
||||
use_triton=request.UseTriton,
|
||||
quantize_config=None)
|
||||
|
||||
self.model = model
|
||||
self.tokenizer = tokenizer
|
||||
except Exception as err:
|
||||
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
|
||||
return backend_pb2.Result(message="Model loaded successfully", success=True)
|
||||
|
||||
def Predict(self, request, context):
|
||||
penalty = 1.0
|
||||
if request.Penalty != 0.0:
|
||||
penalty = request.Penalty
|
||||
tokens = 512
|
||||
if request.Tokens != 0:
|
||||
tokens = request.Tokens
|
||||
top_p = 0.95
|
||||
if request.TopP != 0.0:
|
||||
top_p = request.TopP
|
||||
|
||||
|
||||
prompt_images = self.recompile_vl_prompt(request)
|
||||
compiled_prompt = prompt_images[0]
|
||||
print(f"Prompt: {compiled_prompt}", file=sys.stderr)
|
||||
|
||||
# Implement Predict RPC
|
||||
pipeline = TextGenerationPipeline(
|
||||
model=self.model,
|
||||
tokenizer=self.tokenizer,
|
||||
max_new_tokens=tokens,
|
||||
temperature=request.Temperature,
|
||||
top_p=top_p,
|
||||
repetition_penalty=penalty,
|
||||
)
|
||||
t = pipeline(compiled_prompt)[0]["generated_text"]
|
||||
print(f"generated_text: {t}", file=sys.stderr)
|
||||
|
||||
if compiled_prompt in t:
|
||||
t = t.replace(compiled_prompt, "")
|
||||
# house keeping. Remove the image files from /tmp folder
|
||||
for img_path in prompt_images[1]:
|
||||
try:
|
||||
os.remove(img_path)
|
||||
except Exception as e:
|
||||
print(f"Error removing image file: {img_path}, {e}", file=sys.stderr)
|
||||
|
||||
return backend_pb2.Result(message=bytes(t, encoding='utf-8'))
|
||||
|
||||
def PredictStream(self, request, context):
|
||||
# Implement PredictStream RPC
|
||||
#for reply in some_data_generator():
|
||||
# yield reply
|
||||
# Not implemented yet
|
||||
return self.Predict(request, context)
|
||||
|
||||
def recompile_vl_prompt(self, request):
|
||||
prompt = request.Prompt
|
||||
image_paths = []
|
||||
|
||||
if "qwen-vl" in self.model_name.lower():
|
||||
# request.Images is an array which contains base64 encoded images. Iterate the request.Images array, decode and save each image to /tmp folder with a random filename.
|
||||
# Then, save the image file paths to an array "image_paths".
|
||||
# read "request.Prompt", replace "[img-%d]" with the image file paths in the order they appear in "image_paths". Save the new prompt to "prompt".
|
||||
for i, img in enumerate(request.Images):
|
||||
timestamp = str(int(time.time() * 1000)) # Generate timestamp
|
||||
img_path = f"/tmp/vl-{timestamp}.jpg" # Use timestamp in filename
|
||||
with open(img_path, "wb") as f:
|
||||
f.write(base64.b64decode(img))
|
||||
image_paths.append(img_path)
|
||||
prompt = prompt.replace(f"[img-{i}]", "<img>" + img_path + "</img>,")
|
||||
else:
|
||||
prompt = request.Prompt
|
||||
return (prompt, image_paths)
|
||||
|
||||
def serve(address):
|
||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
|
||||
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
|
||||
server.add_insecure_port(address)
|
||||
server.start()
|
||||
print("Server started. Listening on: " + address, file=sys.stderr)
|
||||
|
||||
# Define the signal handler function
|
||||
def signal_handler(sig, frame):
|
||||
print("Received termination signal. Shutting down...")
|
||||
server.stop(0)
|
||||
sys.exit(0)
|
||||
|
||||
# Set the signal handlers for SIGINT and SIGTERM
|
||||
signal.signal(signal.SIGINT, signal_handler)
|
||||
signal.signal(signal.SIGTERM, signal_handler)
|
||||
|
||||
try:
|
||||
while True:
|
||||
time.sleep(_ONE_DAY_IN_SECONDS)
|
||||
except KeyboardInterrupt:
|
||||
server.stop(0)
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Run the gRPC server.")
|
||||
parser.add_argument(
|
||||
"--addr", default="localhost:50051", help="The address to bind the server to."
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
serve(args.addr)
|
||||
@@ -1,14 +0,0 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
source $(dirname $0)/../common/libbackend.sh
|
||||
|
||||
# This is here because the Intel pip index is broken and returns 200 status codes for every package name, it just doesn't return any package links.
|
||||
# This makes uv think that the package exists in the Intel pip index, and by default it stops looking at other pip indexes once it finds a match.
|
||||
# We need uv to continue falling through to the pypi default index to find optimum[openvino] in the pypi index
|
||||
# the --upgrade actually allows us to *downgrade* torch to the version provided in the Intel pip index
|
||||
if [ "x${BUILD_PROFILE}" == "xintel" ]; then
|
||||
EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
|
||||
fi
|
||||
|
||||
installRequirements
|
||||
@@ -1,2 +0,0 @@
|
||||
--extra-index-url https://download.pytorch.org/whl/cu118
|
||||
torch==2.4.1+cu118
|
||||
@@ -1 +0,0 @@
|
||||
torch==2.4.1
|
||||
@@ -1,2 +0,0 @@
|
||||
--extra-index-url https://download.pytorch.org/whl/rocm6.0
|
||||
torch==2.4.1+rocm6.0
|
||||
@@ -1,6 +0,0 @@
|
||||
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
|
||||
intel-extension-for-pytorch==2.3.110+xpu
|
||||
torch==2.3.1+cxx11.abi
|
||||
oneccl_bind_pt==2.3.100+xpu
|
||||
optimum[openvino]
|
||||
setuptools
|
||||
@@ -1,2 +0,0 @@
|
||||
--index-url https://pypi.jetson-ai-lab.dev/jp6/cu126/
|
||||
torch
|
||||
@@ -1,6 +0,0 @@
|
||||
accelerate
|
||||
auto-gptq==0.7.1
|
||||
grpcio==1.70.0
|
||||
protobuf
|
||||
certifi
|
||||
transformers
|
||||
@@ -1,4 +0,0 @@
|
||||
#!/bin/bash
|
||||
source $(dirname $0)/../common/libbackend.sh
|
||||
|
||||
startBackend $@
|
||||
@@ -1,6 +0,0 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
source $(dirname $0)/../common/libbackend.sh
|
||||
|
||||
runUnittests
|
||||
@@ -61,7 +61,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
return backend_pb2.Result(success=True)
|
||||
|
||||
def serve(address):
|
||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
|
||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
|
||||
options=[
|
||||
('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
|
||||
('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
|
||||
('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
|
||||
])
|
||||
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
|
||||
server.add_insecure_port(address)
|
||||
server.start()
|
||||
|
||||
@@ -1,5 +0,0 @@
|
||||
--index-url https://pypi.jetson-ai-lab.dev/jp6/cu126/
|
||||
torch
|
||||
torchaudio
|
||||
transformers
|
||||
accelerate
|
||||
@@ -1,4 +1,4 @@
|
||||
bark==0.1.5
|
||||
grpcio==1.70.0
|
||||
grpcio==1.72.0
|
||||
protobuf
|
||||
certifi
|
||||
@@ -132,16 +132,11 @@ function installRequirements() {
|
||||
declare -a requirementFiles=(
|
||||
"${EDIR}/requirements-install.txt"
|
||||
"${EDIR}/requirements.txt"
|
||||
"${EDIR}/requirements-${BUILD_TYPE}.txt"
|
||||
)
|
||||
|
||||
if [ -n "${BUILD_PLATFORM}" ]; then
|
||||
requirementFiles+=("${EDIR}/requirements-${BUILD_PLATFORM}.txt")
|
||||
else
|
||||
requirementFiles+=("${EDIR}/requirements-${BUILD_TYPE}.txt")
|
||||
|
||||
if [ "x${BUILD_TYPE}" != "x${BUILD_PROFILE}" ]; then
|
||||
requirementFiles+=("${EDIR}/requirements-${BUILD_PROFILE}.txt")
|
||||
fi
|
||||
if [ "x${BUILD_TYPE}" != "x${BUILD_PROFILE}" ]; then
|
||||
requirementFiles+=("${EDIR}/requirements-${BUILD_PROFILE}.txt")
|
||||
fi
|
||||
|
||||
# if BUILD_TYPE is empty, we are a CPU build, so we should try to install the CPU requirements
|
||||
@@ -151,14 +146,8 @@ function installRequirements() {
|
||||
|
||||
requirementFiles+=("${EDIR}/requirements-after.txt")
|
||||
|
||||
if [ -n "${BUILD_PLATFORM}" ]; then
|
||||
requirementFiles+=("${EDIR}/requirements-${BUILD_PLATFORM}-after.txt")
|
||||
else
|
||||
if [ "x${BUILD_TYPE}" != "x${BUILD_PROFILE}" ]; then
|
||||
requirementFiles+=("${EDIR}/requirements-${BUILD_PROFILE}-after.txt")
|
||||
else
|
||||
requirementFiles+=("${EDIR}/requirements-${BUILD_TYPE}-after.txt")
|
||||
fi
|
||||
if [ "x${BUILD_TYPE}" != "x${BUILD_PROFILE}" ]; then
|
||||
requirementFiles+=("${EDIR}/requirements-${BUILD_PROFILE}-after.txt")
|
||||
fi
|
||||
|
||||
for reqFile in ${requirementFiles[@]}; do
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
grpcio==1.70.0
|
||||
grpcio==1.72.0
|
||||
protobuf
|
||||
grpcio-tools
|
||||
@@ -86,7 +86,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
return backend_pb2.Result(success=True)
|
||||
|
||||
def serve(address):
|
||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
|
||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
|
||||
options=[
|
||||
('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
|
||||
('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
|
||||
('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
|
||||
])
|
||||
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
|
||||
server.add_insecure_port(address)
|
||||
server.start()
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
transformers
|
||||
transformers==4.48.3
|
||||
accelerate
|
||||
torch==2.4.1
|
||||
coqui-tts
|
||||
@@ -1,6 +1,6 @@
|
||||
--extra-index-url https://download.pytorch.org/whl/cu118
|
||||
torch==2.4.1+cu118
|
||||
torchaudio==2.4.1+cu118
|
||||
transformers
|
||||
transformers==4.48.3
|
||||
accelerate
|
||||
coqui-tts
|
||||
@@ -1,5 +1,5 @@
|
||||
torch==2.4.1
|
||||
torchaudio==2.4.1
|
||||
transformers
|
||||
transformers==4.48.3
|
||||
accelerate
|
||||
coqui-tts
|
||||
@@ -1,6 +1,6 @@
|
||||
--extra-index-url https://download.pytorch.org/whl/rocm6.0
|
||||
torch==2.4.1+rocm6.0
|
||||
torchaudio==2.4.1+rocm6.0
|
||||
transformers
|
||||
transformers==4.48.3
|
||||
accelerate
|
||||
coqui-tts
|
||||
@@ -5,6 +5,6 @@ torchaudio==2.3.1+cxx11.abi
|
||||
oneccl_bind_pt==2.3.100+xpu
|
||||
optimum[openvino]
|
||||
setuptools
|
||||
transformers
|
||||
transformers==4.48.3
|
||||
accelerate
|
||||
coqui-tts
|
||||
@@ -1,6 +0,0 @@
|
||||
--index-url https://pypi.jetson-ai-lab.dev/jp6/cu126/
|
||||
torch
|
||||
torchaudio
|
||||
transformers
|
||||
accelerate
|
||||
coqui-tts
|
||||
@@ -1,4 +1,4 @@
|
||||
grpcio==1.70.0
|
||||
grpcio==1.72.0
|
||||
protobuf
|
||||
certifi
|
||||
packaging==24.1
|
||||
@@ -19,7 +19,7 @@ import grpc
|
||||
|
||||
from diffusers import SanaPipeline, StableDiffusion3Pipeline, StableDiffusionXLPipeline, StableDiffusionDepth2ImgPipeline, DPMSolverMultistepScheduler, StableDiffusionPipeline, DiffusionPipeline, \
|
||||
EulerAncestralDiscreteScheduler, FluxPipeline, FluxTransformer2DModel
|
||||
from diffusers import StableDiffusionImg2ImgPipeline, AutoPipelineForText2Image, ControlNetModel, StableVideoDiffusionPipeline
|
||||
from diffusers import StableDiffusionImg2ImgPipeline, AutoPipelineForText2Image, ControlNetModel, StableVideoDiffusionPipeline, Lumina2Text2ImgPipeline
|
||||
from diffusers.pipelines.stable_diffusion import safety_checker
|
||||
from diffusers.utils import load_image, export_to_video
|
||||
from compel import Compel, ReturnedEmbeddingsType
|
||||
@@ -159,6 +159,22 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
torchType = torch.float16
|
||||
variant = "fp16"
|
||||
|
||||
options = request.Options
|
||||
|
||||
# empty dict
|
||||
self.options = {}
|
||||
|
||||
# The options are a list of strings in this form optname:optvalue
|
||||
# We are storing all the options in a dict so we can use it later when
|
||||
# generating the images
|
||||
for opt in options:
|
||||
if ":" not in opt:
|
||||
continue
|
||||
key, value = opt.split(":")
|
||||
self.options[key] = value
|
||||
|
||||
print(f"Options: {self.options}", file=sys.stderr)
|
||||
|
||||
local = False
|
||||
modelFile = request.Model
|
||||
|
||||
@@ -275,6 +291,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
|
||||
if request.LowVRAM:
|
||||
self.pipe.enable_model_cpu_offload()
|
||||
elif request.PipelineType == "Lumina2Text2ImgPipeline":
|
||||
self.pipe = Lumina2Text2ImgPipeline.from_pretrained(
|
||||
request.Model,
|
||||
torch_dtype=torch.bfloat16)
|
||||
if request.LowVRAM:
|
||||
self.pipe.enable_model_cpu_offload()
|
||||
elif request.PipelineType == "SanaPipeline":
|
||||
self.pipe = SanaPipeline.from_pretrained(
|
||||
request.Model,
|
||||
@@ -441,6 +463,9 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
# create a dictionary of parameters by using the keys from EnableParameters and the values from defaults
|
||||
kwargs = {key: options.get(key) for key in keys if key in options}
|
||||
|
||||
# populate kwargs from self.options.
|
||||
kwargs.update(self.options)
|
||||
|
||||
# Set seed
|
||||
if request.seed > 0:
|
||||
kwargs["generator"] = torch.Generator(device=self.device).manual_seed(
|
||||
@@ -501,7 +526,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
|
||||
|
||||
def serve(address):
|
||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
|
||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
|
||||
options=[
|
||||
('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
|
||||
('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
|
||||
('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
|
||||
])
|
||||
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
|
||||
server.add_insecure_port(address)
|
||||
server.start()
|
||||
|
||||
@@ -1,10 +0,0 @@
|
||||
--index-url https://pypi.jetson-ai-lab.dev/jp6/cu126/
|
||||
torch
|
||||
diffusers
|
||||
opencv-python
|
||||
transformers
|
||||
accelerate
|
||||
compel
|
||||
peft
|
||||
sentencepiece
|
||||
optimum-quanto
|
||||
@@ -1,5 +1,5 @@
|
||||
setuptools
|
||||
grpcio==1.70.0
|
||||
grpcio==1.72.0
|
||||
pillow
|
||||
protobuf
|
||||
certifi
|
||||
|
||||
@@ -105,7 +105,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
|
||||
|
||||
def serve(address):
|
||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
|
||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
|
||||
options=[
|
||||
('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
|
||||
('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
|
||||
('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
|
||||
])
|
||||
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
|
||||
server.add_insecure_port(address)
|
||||
server.start()
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user